@pleri/olam-cli - npm Package Compare versions

+1

-1

dist/index.js.map

		@@ -1,1 +0,1 @@
		{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;GAKG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AACrD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,yBAAyB,EAAE,MAAM,mCAAmC,CAAC;AAC9E,OAAO,EAAE,0BAA0B,EAAE,MAAM,oCAAoC,CAAC;AAChF
,OAAO,EAAE,8BAA8B,EAAE,MAAM,yCAAyC,CAAC;AACzF,OAAO,EAAE,2BAA2B,EAAE,MAAM,qCAAqC,CAAC;AAClF,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,uBAAuB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,yBAAyB,EAAE,MAAM,sBAAsB,CAAC;AAEjE,MAAM,UAAU,GAAG,cAAc,EAAE,CAAC;AACpC,iFAAiF;AACjF,OAAO,CAAC,GAAG,CAAC,kBAAkB,CAAC,GAAG,UAAU,CAAC;AAE7C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,MAAM,CAAC;KACZ,WAAW,CAAC,+DAA+D,CAAC;IAC7E,4EAA4E;IAC5E,0EAA0E;KACzE,MAAM,CAAC,OAAO,EAAE,6DAA6D,CAAC;KAC9E,OAAO,CAAC,UAAU,CAAC;KACnB,aAAa,CAAC,yBAAyB,EAAE,CAAC,CAAC;AAE9C,gEAAgE;AAChE,6EAA6E;AAC7E,4EAA4E;AAC5E,yEAAyE;AACzE,MAAM,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AACnD,IAAI,YAAY,KAAK,CAAC,CAAC,EAAE,CAAC;IACxB,qEAAqE;IACrE,+EAA+E;IAC/E,iDAAiD;IACjD,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IACzF,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,0DAA0D,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,KAAK;YACtF,qEAAqE,CACxE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AACvD,CAAC;AAED,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,gBAAgB,CAAC
,OAAO,CAAC,CAAC;AAC1B,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,sEAAsE;AACtE,yEAAyE;AACzE,0EAA0E;AAC1E,wEAAwE;AACxE,mBAAmB,CAAC,OAAO,EAAE,EAAE,MAAM,EAAE,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAC/D,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,iBAAiB,CAAC,OAAO,CAAC,CAAC;AAC3B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,mBAAmB,CAAC,OAAO,CAAC,CAAC;AAC7B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,yBAAyB,CAAC,OAAO,CAAC,CAAC;AACnC,0BAA0B,CAAC,OAAO,CAAC,CAAC;AACpC,8BAA8B,CAAC,OAAO,CAAC,CAAC;AACxC,2BAA2B,CAAC,OAAO,CAAC,CAAC;AACrC,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,uBAAuB,CAAC,OAAO,CAAC,CAAC;AACjC,cAAc,CAAC,OAAO,CAAC,CAAC;AAExB,6EAA6E;AAC7E,4EAA4E;AAC5E,uEAAuE;AACvE,iEAAiE;AACjE,KAAK,oBAAoB,CAAC,UAAU,CAAC,CAAC;AAEtC,sEAAsE;AACtE,yEAAyE;AACzE,yEAAyE;AACzE,4EAA4E;AAC5E,kDAAkD;AAClD,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC,EAAE,CAAC;IACrF,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,C
AAC,CAAC;IACpD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;AACtD,CAAC;AAED,wEAAwE;AACxE,0EAA0E;AAC1E,uEAAuE;AACvE,wEAAwE;AACxE,qEAAqE;AACrE,wEAAwE;AACxE,2EAA2E;AAC3E,+DAA+D;AAC/D,EAAE;AACF,wEAAwE;AACxE,0EAA0E;AAC1E,yEAAyE;AACzE,mEAAmE;AACnE,+DAA+D;AAC/D,sBAAsB;AACtB,IAAI,CAAC;IACH,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAC3B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;AACtC,CAAC;AAAC,OAAO,GAAY,EAAE,CAAC;IACtB,UAAU,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,IAAI,GAAG,YAAY,KAAK,IAAI,GAAG,CAAC,KAAK,EAAE,CAAC;QACnE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC;IACzC,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,CAAC;IAC9B,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC;AACnF,CAAC"}
		{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;GAKG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AACrD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,yBAAyB,EAAE,MAAM,mCAAmC,CAAC;AAC9E,OAAO,EAAE,0BAA0B,EAAE,MAAM,oCAAoC,CAAC;AAChF,OAAO,EAAE,8BAA8B,EAAE,MAAM,yCAAyC,CAAC;AACzF,OAAO,EAAE,2BAA2B,EAAE,MAAM,qCAAqC,CAAC;A
AClF,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,uBAAuB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,yBAAyB,EAAE,MAAM,sBAAsB,CAAC;AAEjE,MAAM,UAAU,GAAG,cAAc,EAAE,CAAC;AACpC,iFAAiF;AACjF,OAAO,CAAC,GAAG,CAAC,kBAAkB,CAAC,GAAG,UAAU,CAAC;AAE7C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,MAAM,CAAC;KACZ,WAAW,CAAC,+DAA+D,CAAC;IAC7E,4EAA4E;IAC5E,0EAA0E;KACzE,MAAM,CAAC,OAAO,EAAE,6DAA6D,CAAC;KAC9E,OAAO,CAAC,UAAU,CAAC;KACnB,aAAa,CAAC,yBAAyB,EAAE,CAAC,CAAC;AAE9C,gEAAgE;AAChE,6EAA6E;AAC7E,4EAA4E;AAC5E,yEAAyE;AACzE,MAAM,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AACnD,IAAI,YAAY,KAAK,CAAC,CAAC,EAAE,CAAC;IACxB,qEAAqE;IACrE,+EAA+E;IAC/E,iDAAiD;IACjD,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IACzF,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,0DAA0D,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,KAAK;YACtF,qEAAqE,CACxE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AACvD,CAAC;AAED,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,eA
Ae,CAAC,OAAO,CAAC,CAAC;AACzB,sEAAsE;AACtE,yEAAyE;AACzE,0EAA0E;AAC1E,wEAAwE;AACxE,mBAAmB,CAAC,OAAO,EAAE,EAAE,MAAM,EAAE,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAC/D,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,iBAAiB,CAAC,OAAO,CAAC,CAAC;AAC3B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,mBAAmB,CAAC,OAAO,CAAC,CAAC;AAC7B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,yBAAyB,CAAC,OAAO,CAAC,CAAC;AACnC,0BAA0B,CAAC,OAAO,CAAC,CAAC;AACpC,8BAA8B,CAAC,OAAO,CAAC,CAAC;AACxC,2BAA2B,CAAC,OAAO,CAAC,CAAC;AACrC,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,uBAAuB,CAAC,OAAO,CAAC,CAAC;AACjC,cAAc,CAAC,OAAO,CAAC,CAAC;AAExB,6EAA6E;AAC7E,4EAA4E;AAC5E,uEAAuE;AACvE,iEAAiE;AACjE,KAAK,oBAAoB,CAAC,UAAU,CAAC,CAAC;AAEtC,sEAAsE;AACtE,yEAAyE;AACzE,yEAAyE;AACzE,4EAA4E;AAC5E,kDAAkD;AAClD,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC,EAAE,CAAC;IACrF,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;IACpD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;AACtD,CAAC;AAED,wEAAwE;AACxE,0EAA0E;AAC1E,uEAAuE;AACvE,wEAAwE;AACxE,qEAAqE;AACrE,wEAAw
E;AACxE,2EAA2E;AAC3E,+DAA+D;AAC/D,EAAE;AACF,wEAAwE;AACxE,0EAA0E;AAC1E,yEAAyE;AACzE,mEAAmE;AACnE,+DAA+D;AAC/D,sBAAsB;AACtB,IAAI,CAAC;IACH,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAC3B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;AACtC,CAAC;AAAC,OAAO,GAAY,EAAE,CAAC;IACtB,UAAU,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,IAAI,GAAG,YAAY,KAAK,IAAI,GAAG,CAAC,KAAK,EAAE,CAAC;QACnE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC;IACzC,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,CAAC;IAC9B,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC;AACnF,CAAC"}

+1

-1

hermes-bundle/version.json

		{
		"bundledAt": "2026-06-15T08:36:13.799Z",
		"bundledAt": "2026-06-18T05:40:50.182Z",
		"kgFirstSha": "29a9ccce1b115d049e375c4a90eb5cf7c123e610e2d0590270a4db2cdbc64a28"
		}

+1

-4

package.json

		{
		"name": "@pleri/olam-cli",
		"version": "0.1.218",
		"version": "0.1.219",
		"type": "module",
		@@ -13,7 +13,5 @@ "bin": {
		"dist/mcp-server.js",
		"dist/image-digests.json",
		"dist/agent-stream",
		"hermes-bundle",
		"hooks",
		"host-cp",
		"memory-hooks",
		@@ -43,3 +41,2 @@ "README.md"
		"audit:publish-deps": "node scripts/audit-publish-deps.mjs",
		"audit:cli-bundle-k8s": "node scripts/audit-cli-bundle-k8s.mjs",
		"audit:cli-package-contents": "node scripts/audit-cli-package-contents.mjs",
		@@ -46,0 +43,0 @@ "audit:cli-test-coverage": "node ../../scripts/audit-cli-test-coverage.mjs"

-10

dist/image-digests.json

		{
		"auth": "sha256:770ee97ee4d06d2c1b6512ba99421a5fe312393d592df1684fd0d03b3476ff10",
		"host-cp": "sha256:328baca8b9b28ccef1d858aa20e0ab27855604a630132dcadd423990cb376f60",
		"kg-service": "sha256:f97ee90fe1bd5b12cb56d5fbf0d3085c301bb7abeef0dd28d2b2a5c90ab6efbb",
		"memory-service": "sha256:923bff54d2ba3da162a35d3e8ebc6bd440bed6d290a5cff7bae2888281a4e003",
		"mcp-auth": "sha256:eaac2164349e388a70dae0d86c34132f97aa74177a2376cdfa10732e8eadb507",
		"$schema_version": 1,
		"$published_version": "0.1.218",
		"$registry": "ghcr.io/pleri"
		}

-217

host-cp/compose.yaml

		# Phase F-2-B (B2): olam-host-cp compose stack.
		#
		# Two services on a private internal network:
		#
		# 1. host-cp — the SPA proxy server (B3+ implementation). Exposes
		# port 19000 to the operator's host. Talks to the
		# docker-socket-proxy via `tcp://docker-socket-proxy:2375`
		# (NOT the raw /var/run/docker.sock).
		#
		# 2. docker-socket-proxy
		# — tecnativa/docker-socket-proxy sidecar. Mounts the
		# real /var/run/docker.sock read-only and exposes a
		# whitelisted subset of the Docker API. Whitelist:
		# CONTAINERS=1 — list/inspect (find world IDs)
		# EVENTS=1 — stream restart/stop events
		# (cache invalidation; B3 / T2)
		# EXEC=1 — exec inside containers
		# (read /tmp/olam-container-secret)
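		# IMAGES=1 — read-only image inspect
		# (GET /images/<ref>/json)
		# POST=1 — permit POST verbs on the
		# whitelisted endpoints (exec creation)
		# Everything else is denied (volumes,
		# networks, swarm, build, push, etc.). T6 + T8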
		# mitigation: blast-radius reduction vs raw socket.
		#
		# Bring up: `docker compose -f packages/host-cp/compose.yaml up --build -d`
		# Tear down: `docker compose -f packages/host-cp/compose.yaml down`

		services:
		olam-host-cp:
		container_name: olam-host-cp
		# Image-only — operator's `olam bootstrap` pulls the digest-pinned
		# `ghcr.io/pleri/olam-host-cp:latest` (digest from image-digests.json)
		# and tags it as the local `:latest` BEFORE compose up. No `build:`
		# in this file — fresh-install operators don't have the source tree
		# so a `build:` block crashes them with "Dockerfile not found".
		#
		# Local-dev contributors who want to test host-cp source changes
		# use the sibling compose.dev.yaml as an override:
		#
		# docker compose \
		# -f packages/host-cp/compose.yaml \
		# -f packages/host-cp/compose.dev.yaml \
		# up --build -d
		#
		# The CLI's `olam host-cp start` always uses ONLY compose.yaml, so
		# operator boots are never blocked on a missing Dockerfile / build
		# context.
		image: ghcr.io/pleri/olam-host-cp:latest
		ports:
		# Bind to 127.0.0.1 only — single-user-per-host assumption (T4).
		# Multi-user / TLS / remote access lands in Phase G+.
		- "127.0.0.1:19000:19000"
		environment:
		# Connection string for docker-socket-proxy. The proxy listens on
		# tcp://0.0.0.0:2375 inside the internal network. host-cp uses
		# this to enumerate worlds (containers list) + read secrets
		# (containers exec) + subscribe to restart events.
		DOCKER_HOST: "tcp://docker-socket-proxy:2375"
		# Phase F-2-B M2 ship gate: secret cache TTL (5min, demoted from
		# 1h per D2). B3 reads this; B10's m2-cache-invalidate.sh tests
		# the docker-events invalidation path.
		OLAM_SECRET_CACHE_TTL_SEC: "300"
		# Bind operator-facing UI port. Always 19000 in compose.
		OLAM_HOST_CP_PORT: "19000"
		# Token + workspace + world registry mount points. Bind-mounted
		# below; host CP reads these at boot.
		OLAM_HOST_CP_TOKEN_PATH: "/data/host-cp.token"
		OLAM_WORKSPACES_DIR: "/data/workspaces"
		OLAM_WORLDS_DB: "/data/worlds.db"
		OLAM_PR_POLL_INTERVAL_MS: "300000"
		OLAM_MERGE_GRACE_MS: "600000"
		# NOTE: OLAM_REPO_PATH is intentionally NOT passed into the
		# container env. The HOST-side variable names a bind-mount source
		# (a host path like /Users/.../olam — see the volumes block below).
		# Inside the container, the bind-mount target is always
		# `/operator-repo`. Pre-fix the env was passed through, server.mjs
		# consumers (version-status.mjs, /api/prs handler) read it
		# expecting a container-side path, then `cwd:` to a host path that
		# doesn't exist inside the container — `gh pr list` failed with
		# "not a git repository", `gh` itself failed with `spawn ENOENT`.
		# Server-side consumers default to `/operator-repo` which is
		# always correct.
		# Auth-service inter-service auth. The secret is shared with the
		# long-lived olam-auth container (generated on first `olam auth
		# up` at ~/.olam/auth-secret). Without it, X-Olam-Secret is never
		# sent and auth-service 401s every host-cp → /credentials/* call,
		# which surfaces in the dashboard as a failed Connect Claude flow.
		OLAM_AUTH_SERVICE_URL: "http://host.docker.internal:9999"
		OLAM_AUTH_SECRET: "${OLAM_AUTH_SECRET:-}"
		# Operator's CLI version, propagated by `olam host-cp start` via
		# buildComposeEnv. Surfaces in /api/version/status so the
		# dashboard's TopNav can render "the version we're working on."
		# Empty when older CLI versions render this compose; the server
		# falls back to host-cp's own package.json.
		OLAM_CLI_VERSION: "${OLAM_CLI_VERSION:-}"
		# Upgrade-trigger feature: host-cp uses these to construct bind
		# mounts on the spawned upgrader container. The upgrader runs
		# `olam upgrade -y` and needs (a) the operator's ~/.olam state,
		# (b) the docker socket so the CLI can talk to the daemon. Both
		# are HOST-side paths because docker resolves bind sources on
		# the daemon, not inside the requesting container.
		OLAM_HOME_HOST_PATH: "${HOME}/.olam"
		OLAM_DOCKER_SOCK_HOST_PATH: "/var/run/docker.sock"
		# Operator's olam repo path on the host. The upgrader needs this
		# bind-mounted so the CLI's cwd-relative `packages/host-cp/compose.yaml`
		# lookup resolves. Defaults to the `OLAM_REPO_PATH` already used by
		# host-cp for version-detection (mounted at /operator-repo:ro).
		OLAM_REPO_HOST_PATH: "${OLAM_REPO_PATH:-${HOME}/Projects/ein-sof/olam}"
		# Operator's $HOME on the host. Forwarded to the upgrader as HOME
		# so `${HOME}` interpolation in compose.yaml's bind sources
		# resolves to a path the docker daemon can find. Inside the
		# upgrader container HOME defaults to /root, which the daemon
		# rejects when used as a bind source ("path not shared from
		# the host"). Without this the recreate step fails right at the
		# last hop of the upgrade pipeline.
		OLAM_OPERATOR_HOME_HOST_PATH: "${HOME}"
		# GitHub CLI config bind for the upgrader. The CLI runs
		# `gh auth token | docker login ghcr.io ...` before `olam upgrade`
		# so the spawned container can pull GHCR images even though the
		# host's ~/.docker/config.json uses a Keychain credsStore that
		# doesn't follow into a Linux container. The gh config is also
		# mounted into host-cp itself (see the `volumes:` block below) for `gh pr list` —
		# this is the same path, mounted again for the upgrader.
		OLAM_GH_CONFIG_HOST_PATH: "${HOME}/.config/gh"
		# GitHub token used by the upgrader to `docker login ghcr.io` so it can
		# pull the host-cp / auth / devbox images by digest. Resolved from
		# the operator's `gh auth token` BEFORE compose up (or set explicitly
		# via the GH_TOKEN env var). If unset, the upgrader falls back to
		# `gh auth token` against the mounted ~/.config/gh — which works
		# only on Linux operators (macOS keeps the token in Keychain, not in
		# ~/.config/gh).
		GH_TOKEN: "${GH_TOKEN:-}"
		# Optional override for the upgrader image. Defaults to the
		# currently-running host-cp image (which has the olam CLI +
		# docker CLI + gh CLI baked in by Dockerfile).
		OLAM_UPGRADER_IMAGE: "${OLAM_UPGRADER_IMAGE:-ghcr.io/pleri/olam-host-cp:latest}"
		# Plan DB persistence fix (Bug 1): os.homedir() inside the container is
		# /root, but ~/.olam is bind-mounted to /data — not /root/.olam. Without
		# these overrides, plan.db lands in the container's ephemeral layer and is
		# destroyed by every `docker compose up --force-recreate` (i.e. olam upgrade).
		# Pointing to /data/ routes all writes through the bind-mount to the host.
		OLAM_PLAN_DB_PATH: "/data/plan.db"
		OLAM_PLAN_DIR: "/data/plan"
		# Same /root vs /data bind-mount issue applies to the plan-chat bearer
		# gateway. Without this override, plan-chat-secret.mjs reads from
		# /root/.olam/plan-chat-secret (container ephemeral, missing) and
		# /agent-runtime/trigger answers HTTP 500. Routing through /data
		# surfaces the on-disk bearer created by ensureSecret() on host FS.
		OLAM_PLAN_CHAT_SECRET_PATH: "/data/plan-chat-secret"
		volumes:
		# ~/.olam/ from operator's home → /data/ inside container. B4
		# writes the startup token here (chmod 600). B6 reads workspaces
		# + worlds.db from here. ~/.olam/ is the canonical operator-state
		# directory established by the Olam CLI; consistent with the
		# devbox container's mount layout.
		- ${HOME}/.olam:/data
		- ${HOME}/.config/gh:/gh-config:ro
		# Operator's olam repo mounted read-only so host-cp can poll
		# .git/refs/heads/main to detect when a new version is available.
		# The path inside the container is always /operator-repo.
		# On the host: OLAM_REPO_PATH env var, or defaults to
		# $HOME/Projects/ein-sof/olam. If the path doesn't exist, the
		# mount is a no-op and version detection reports "operator-repo not mounted".
		- ${OLAM_REPO_PATH:-${HOME}/Projects/ein-sof/olam}:/operator-repo:ro
		depends_on:
		docker-socket-proxy:
		condition: service_started
		networks:
		- olam-host-cp-internal
		restart: unless-stopped

		docker-socket-proxy:
		container_name: olam-docker-socket-proxy
		# Pin to a specific tag, not :latest. Update via Renovate / dependabot.
		# tecnativa/docker-socket-proxy:0.3.0 (2024-10-22) — last tagged
		# release as of plan-pass-2 emit. T8 mitigation: pinning prevents
		# supply-chain drift on the sidecar.
		image: tecnativa/docker-socket-proxy:0.3.0
		environment:
		# Whitelist matches plan D5 + T6/T8: host CP needs exactly the
		# permissions enabled below. EVERYTHING else stays at the proxy default
		# (deny). Audit periodically; widen with explicit justification.
		CONTAINERS: "1"
		EVENTS: "1"
		EXEC: "1"
		# Allows GET /images/<ref>/json. Needed by version-status.mjs to
		# resolve the baked OLAM_BUILD_SHA of locally-pulled images
		# (host-cp + auth-service + devbox `:latest` tags) so the
		# upgrade comparator can answer "is there a newer image I'd
		# actually swap to?" — see PR #459 for the comparator rewrite
		# and `fetchLatestImageSha`. Without this, both the new
		# comparator AND the pre-existing fetchDevboxImageSha fall back
		# to 'unknown', producing the over-reporting "Upgrade available"
		# banner regression. Socket is mounted :ro so this remains
		# read-only inspect; no container mutation surface.
		IMAGES: "1"
		# tecnativa/docker-socket-proxy 0.3.0 requires POST=1 to allow
		# POST verbs on whitelisted endpoints (exec creation requires
		# POST /containers/<id>/exec + POST /exec/<id>/start). Phase
		# F-2-D dogfood revealed the missing perm.
		POST: "1"
		# Optional: lower log verbosity. Default is INFO; DEBUG floods
		# logs in dev. Comment out for troubleshooting.
		LOG_LEVEL: "warning"
		volumes:
		# Mount the host's docker socket READ-ONLY. The proxy is the only
		# consumer of the raw socket. host-cp talks to the proxy over
		# TCP (port 2375 on the internal network).
		- /var/run/docker.sock:/var/run/docker.sock:ro
		networks:
		- olam-host-cp-internal
		restart: unless-stopped

		networks:
		olam-host-cp-internal:
		name: olam-host-cp-internal
		driver: bridge
		# Internal-only: no host port published; host-cp <-> proxy traffic
		# never leaves the docker network.

-58

host-cp/k8s/host-side/docker-socket-proxy.compose.yaml

		# Host-side docker-socket-proxy for the olam kubernetes substrate.
		#
		# Background — round-4 wave-2 R4-W2-F (kuro-bear retest 2026-05-21):
		# on macOS + colima + virtiofs, containerd's OCI runtime spec generator
		# calls stat() on docker.sock hostPath bind mounts; virtiofs returns
		# ENOTSUP for stat/statx on socket files; pod creation fails. The R3-A
		# two-volume hostPath approach is unrecoverable on virtiofs.
		#
		# This compose file provisions the docker-socket-proxy AS A HOST-SIDE
		# CONTAINER (sibling to k3d on the operator's docker daemon), NOT as a
		# pod inside the k3d cluster. The in-cluster Service in
		# packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml is
		# `type: ExternalName` aliasing `host.k3d.internal` — cluster pods reach
		# THIS container via that DNS handle.
		#
		# Architecture mirrors the compose substrate's pattern (see
		# packages/host-cp/compose.yaml:170-210). Same image, same allowlist,
		# same restart policy. The only difference: this proxy publishes to
		# the operator host on 127.0.0.1:2375 so k3d nodes can reach it via
		# host.k3d.internal — the compose-substrate sibling stays internal-only.
		#
		# Operator UX: `olam upgrade -y` Step 0.7 auto-starts this on macOS via
		# `docker compose -f <this-file> up -d`. Linux operators get a no-op
		# (Step 0.7 is platform-gated). See docs/operator/kubernetes-substrate-beta.md.

		services:
		docker-socket-proxy:
		container_name: olam-host-side-docker-socket-proxy
		# tecnativa/docker-socket-proxy:0.3.0 — matches the compose substrate's
		# pin verbatim. T8 supply-chain: pinning prevents drift. Update via
		# Renovate / dependabot.
		image: tecnativa/docker-socket-proxy:0.3.0
		environment:
		# Whitelist matches packages/host-cp/compose.yaml:181-202 verbatim.
		# Anything outside this list stays at proxy default (deny).
		CONTAINERS: "1"
		EVENTS: "1"
		EXEC: "1"
		# IMAGES=1 needed for GET /images/<ref>/json (version-status.mjs
		# fetchLatestImageSha). Socket is :ro so this is read-only inspect.
		IMAGES: "1"
		# POST=1 required since tecnativa 0.3.0 for exec creation
		# (POST /containers/<id>/exec + POST /exec/<id>/start). See
		# packages/host-cp/compose.yaml:195-199 for the F-2-D dogfood
		# finding that surfaced this.
		POST: "1"
		LOG_LEVEL: "warning"
		ports:
		# Publish to operator host on 127.0.0.1:2375 ONLY. k3d nodes reach
		# this via host.k3d.internal:2375. Binding to 127.0.0.1 (not
		# 0.0.0.0) is T1 mitigation: docker API surface stays loopback-only
		# on a single-tenant operator machine.
		- "127.0.0.1:2375:2375"
		volumes:
		# Read-only mount of the host's docker socket. The proxy is the
		# only consumer of the raw socket on the operator's mac.
		- /var/run/docker.sock:/var/run/docker.sock:ro
		restart: unless-stopped

-7

host-cp/k8s/manifests/00-namespace.yaml

		apiVersion: v1
		kind: Namespace
		metadata:
		name: olam
		labels:
		name: olam
		olam.io/component: host-stack

-8

host-cp/k8s/manifests/10-serviceaccount.yaml

		apiVersion: v1
		kind: ServiceAccount
		metadata:
		name: olam-host-cp
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack

-34

host-cp/k8s/manifests/20-rbac.yaml

		# Phase 1b Decision 19: Role scoped to resourceNames: ["olam-host-cp"] on
		# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
		# could patch ANY Deployment in the namespace. This is the load-bearing
		# security guardrail — preserve verbatim.
		apiVersion: rbac.authorization.k8s.io/v1
		kind: Role
		metadata:
		name: olam-host-cp
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack
		rules:
		- apiGroups: ["apps"]
		resources: ["deployments"]
		resourceNames: ["olam-host-cp"]
		verbs: ["get", "patch", "watch"]
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: RoleBinding
		metadata:
		name: olam-host-cp
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack
		subjects:
		- kind: ServiceAccount
		name: olam-host-cp
		namespace: olam
		roleRef:
		kind: Role
		name: olam-host-cp
		apiGroup: rbac.authorization.k8s.io

-57

host-cp/k8s/manifests/30-configmap.yaml

		# ConfigMap for olam-host-cp environment. Sensitive values (OLAM_AUTH_SECRET,
		# GH_TOKEN) are NOT here — they live in the Secret (see templates/40-secret-template.yaml).
		# Operators apply the Secret separately before applying the manifests.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-host-cp-env
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack
		data:
		# Auth service URL. Default targets host.docker.internal for Colima/Docker
		# Desktop k3d setups. Override when auth-service runs elsewhere (e.g. via
		# an ExternalName Service pointing at the host gateway).
		#
		# Port :9999 matches the published port in AuthContainerController.start()
		# (packages/core/src/auth/container.ts) — the value was historically :8000,
		# which never matched any running auth-service version and surfaced as
		# {"error":"auth_service_unavailable","message":"fetch failed"}
		# on /api/auth/* calls. Verified during the K3d-HTTPS PR live bring-up;
		# see docs/runbooks/k3d-https-setup.md.
		OLAM_AUTH_SERVICE_URL: "http://host.docker.internal:9999"
		# Docker socket proxy — ClusterIP Service DNS inside the namespace.
		DOCKER_HOST: "tcp://docker-socket-proxy:2375"
		# Host-cp server port — must match the Service targetPort in 60-service.yaml.
		OLAM_HOST_CP_PORT: "19000"
		# Operator state paths (resolved inside the K3s node via hostPath volumes).
		OLAM_HOST_CP_TOKEN_PATH: "/data/host-cp.token"
		OLAM_WORKSPACES_DIR: "/data/workspaces"
		OLAM_WORLDS_DB: "/data/worlds.db"
		OLAM_PLAN_DB_PATH: "/data/plan.db"
		OLAM_PLAN_DIR: "/data/plan"
		# Phase B Model B: bearer file is now sourced from the shared
		# olam-plan-chat-secret Kubernetes Secret (mounted at /etc/olam-plan-chat/).
		# Two readers, one source-of-truth — replaces the per-pod /data/plan-chat-secret
		# file that couldn't be shared across pods on RWO PVCs. The plan-chat-service
		# pod also mounts the SAME Secret at the SAME path so bearer comparisons
		# work both ways.
		OLAM_PLAN_CHAT_SECRET_PATH: "/etc/olam-plan-chat/secret"
		# In-cluster plan-chat-service URL. Rewritten by upgrade-kubernetes.ts step 2.5
		# (buildK8sDnsUrl) — the default below is a sane fallback for raw
		# `kubectl apply -f` operators who skip the CLI wrapper.
		PLAN_CHAT_SERVICE_URL: "http://olam-plan-chat-service.olam.svc.cluster.local:3200"
		# NDJSON span sink + recovery ledger — route to the writable PVC mount at
		# /data rather than the default ~/.olam/logs (which resolves to
		# /home/node/.olam/logs and is not writable with readOnlyRootFilesystem: true).
		OLAM_TRACE_LOG_PATH: "/data/logs/host.trace.ndjson"
		OLAM_RECOVERY_LEDGER_PATH: "/data/logs/recovery-ledger.ndjson"
		# Tunable defaults.
		OLAM_SECRET_CACHE_TTL_SEC: "300"
		OLAM_PR_POLL_INTERVAL_MS: "300000"
		OLAM_MERGE_GRACE_MS: "600000"
		# World watchdog — periodic probe of each active world's claude PID for the
		# three wedge signals (wchan + CLOSE_WAIT + CPU). Detection-only in Phase A.
		# Set OLAM_WORLD_WATCHDOG_DISABLED=1 in the deployment env to kill-switch.
		OLAM_WORLD_WATCHDOG_TICK_MS: "30000"

-31

host-cp/k8s/manifests/45-pvc.yaml

		# PersistentVolumeClaim for olam-host-cp /data volume — k3d substrate default.
		#
		# Why PVC instead of hostPath:
		# hostPath volumes on k3d nodes resolve to paths INSIDE the k3d node
		# container — not the operator's host filesystem. A bare k3d cluster has
		# an empty node filesystem, so a hostPath at /host/.olam is always empty.
		# Additionally, fsGroup does NOT relabel hostPath volumes (only PVCs /
		# emptyDir / projected volumes), so UID-1000 pods cannot write to
		# root-owned hostPath mounts even when fsGroup: 1000 is set.
		#
		# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
		# On non-k3d clusters, substitute with the appropriate StorageClass name (D24,
		# operator-editable). For managed clusters (GKE, EKS, AKS) use the GKE-variant
		# manifest instead: packages/host-cp/k8s/manifests/gke/45-pvc.yaml (storageClassName:
		# standard-rwo). See docs/architecture/peripheral-services-on-k3s.md Decision #3
		# for the full per-cluster storageclass table.
		apiVersion: v1
		kind: PersistentVolumeClaim
		metadata:
		name: olam-host-cp-data
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack
		spec:
		accessModes:
		- ReadWriteOnce
		storageClassName: local-path
		resources:
		requests:
		storage: 5Gi

-229

host-cp/k8s/manifests/50-deployment.yaml

		# Deployment for olam-host-cp.
		#
		# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
		# Digest resolves to ghcr.io/pleri/olam-host-cp:0.1.168 (multi-arch index).
		# Pinned to the last image built before PRs #915/#919/#920/#921 introduced
		# lifecycle/, observability/, and recovery/ module directories — those PRs
		# updated server.mjs imports but the Dockerfile was not updated to COPY
		# the new directories, so all images from 0.1.169+ crash with
		# ERR_MODULE_NOT_FOUND. The Dockerfile fix (COPY lifecycle/ / observability/
		# / recovery/) lands in PR #940; the next release will ship a working image.
		# At that point, refresh this digest via the instructions below.
		# To update: resolve the new tag's digest via:
		# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-host-cp:pull&service=ghcr.io" | jq -r .token)
		# curl -sI -H "Authorization: Bearer $TOKEN" \
		# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
		# https://ghcr.io/v2/pleri/olam-host-cp/manifests/<tag> | grep docker-content-digest
		#
		# securityContext: conservative defaults per T6/T7 threat model.
		# Operators who need to relax these (e.g. for debugging) must pass
		# --accept-security-regression (Phase C, Decision D14) — out of scope here.
		#
		# Volume requirements for k3d:
		# olam-home (/data): backed by a PersistentVolumeClaim (45-pvc.yaml).
		# An init container (chown-data) runs `chown -R 1000:1000 /data` as root
		# before the main container starts, granting UID-1000 write access on the
		# freshly-provisioned PV. fsGroup alone is insufficient for local-path PVs.
		#
		# docker access — NO LONGER VIA hostPath (changed in olam-k3d-on-mac-
		# substrate-decision Phase B B2, 2026-05-21). The previous R3-A two-volume
		# hostPath pattern is retracted: round-4 R4-W2-F showed virtiofs returns
		# ENOTSUP on stat/statx of socket files, and that failure is unrecoverable
		# at the containerd OCI runtime layer. host-cp now reaches docker via TCP
		# through the docker-socket-proxy ExternalName Service in the olam
		# namespace (packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml),
		# which kube-dns resolves as a CNAME to host.k3d.internal. The actual
		# proxy container runs on the operator's docker daemon (sibling to k3d),
		# started by `olam upgrade` Step 0.7. See also
		# packages/host-cp/src/lib/docker-request-options.mjs (both substrates now
		# return identical TCP options).
		#
		# The operator's k3d cluster create command is therefore simpler — no
		# `--volume $HOME/.colima/default/:/host-colima/@server:*` flag needed.
		# See docs/operator/kubernetes-substrate-beta.md for the current install
		# command.
		#
		# gh-config (/gh-config) and operator-repo (/operator-repo) remain hostPath
		# volumes that resolve to paths inside the k3d node container.
		# Operators must pass matching `--volume` flags when creating the k3d cluster
		# so those node paths are populated from the host; without the flags the
		# gh-config and operator-repo mounts will be empty.
		# The pod will still start — features that depend on GitHub auth or the
		# operator repo will fail gracefully.
		apiVersion: apps/v1
		kind: Deployment
		metadata:
		name: olam-host-cp
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack
		spec:
		replicas: 1
		strategy:
		type: RollingUpdate
		rollingUpdate:
		maxSurge: 1
		maxUnavailable: 0
		selector:
		matchLabels:
		app: olam-host-cp
		template:
		metadata:
		labels:
		app: olam-host-cp
		spec:
		# B9 (round 2 recovery): disable k8s automatic Service env injection.
		# Without this, k8s injects OLAM_<UPPER-NAME>_SERVICE_HOST/_PORT env vars
		# into all Pods in the namespace. These collide with olam's own config env
		# vars (e.g. OLAM_KG_SERVICE_PORT) causing Python's int() to crash on the
		# auto-injected "tcp://..." string. Decision #4 (no app-code rename; field
		# removes the collision class entirely). GA since k8s 1.13; we target 1.30+.
		enableServiceLinks: false
		# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
		# created by `olam upgrade` step 0.4 when GH_TOKEN is available. Allows
		# pulling from ghcr.io/pleri/* without anonymous rate limits.
		imagePullSecrets:
		- name: ghcr-pull
		serviceAccountName: olam-host-cp
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		runAsGroup: 1000
		fsGroup: 1000
		initContainers:
		- name: chown-data
		# busybox:1.36 — sha256-pinned per T4 threat model.
		# To update: docker pull busybox:1.36 && docker inspect busybox:1.36 --format '{{index .RepoDigests 0}}'
		image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
		imagePullPolicy: IfNotPresent
		# Run as root to chown the freshly-provisioned PV to UID 1000.
		# The pod-level runAsNonRoot: true is overridden here deliberately.
		# The main container still runs as UID 1000 with all security defaults intact.
		securityContext:
		runAsUser: 0
		runAsNonRoot: false
		allowPrivilegeEscalation: false
		command: ["chown", "-R", "1000:1000", "/data"]
		volumeMounts:
		- name: olam-home
		mountPath: /data
		# socket-perm init container REMOVED in olam-k3d-on-mac-substrate-decision
		# Phase B B2 (2026-05-21). The R3-A two-volume hostPath approach for
		# docker.sock has been retracted: round-4 R4-W2-F showed virtiofs
		# ENOTSUP on socket-file stat blocks the mount entirely. host-cp now
		# reaches docker via TCP through the docker-socket-proxy ExternalName
		# Service in the olam namespace (see
		# packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml).
		# The proxy itself runs on the operator's docker daemon (sibling to
		# k3d), started by `olam upgrade` Step 0.7 — not inside this Pod.
		containers:
		- name: olam-host-cp
		image: ghcr.io/pleri/olam-host-cp@sha256:328baca8b9b28ccef1d858aa20e0ab27855604a630132dcadd423990cb376f60
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		readOnlyRootFilesystem: true
		allowPrivilegeEscalation: false
		capabilities:
		drop: ["ALL"]
		ports:
		- name: http
		containerPort: 19000
		protocol: TCP
		env:
		# World watchdog — tick cadence (from ConfigMap default = 30s).
		# Override per-operator to tune probe frequency.
		- name: OLAM_WORLD_WATCHDOG_TICK_MS
		valueFrom:
		configMapKeyRef:
		name: olam-host-cp-env
		key: OLAM_WORLD_WATCHDOG_TICK_MS
		# Set to "1" to disable the world-watchdog entirely (emergency kill switch).
		# Unset by default — watchdog runs in detection-only mode.
		# - name: OLAM_WORLD_WATCHDOG_DISABLED
		# value: "1"
		envFrom:
		- configMapRef:
		name: olam-host-cp-env
		- secretRef:
		name: olam-host-cp-secret
		volumeMounts:
		- name: olam-home
		mountPath: /data
		- name: gh-config
		mountPath: /gh-config
		readOnly: true
		- name: operator-repo
		mountPath: /operator-repo
		readOnly: true
		- name: tmp
		mountPath: /tmp
		# Phase B Model B: shared olam-plan-chat-secret mounted read-only
		# so renderSpaShell can inject window.__OLAM_PLAN_CHAT_BEARER__.
		# Plan-chat-service mounts the SAME Secret at the SAME path so
		# bearer compares match across pods.
		- name: plan-chat-secret
		mountPath: /etc/olam-plan-chat
		readOnly: true
		# docker-socket volumeMount REMOVED in olam-k3d-on-mac-substrate-
		# decision Phase B B2. Docker access now goes via TCP to the
		# docker-socket-proxy ExternalName Service in the olam namespace.
		# host-cp's `getDockerRequestOptions('kubernetes')` returns
		# `{ host: 'docker-socket-proxy', port: 2375 }` (collapsed to the
		# same value as the compose substrate's branch — see
		# packages/host-cp/src/lib/docker-request-options.mjs).
		readinessProbe:
		httpGet:
		path: /health
		port: 19000
		initialDelaySeconds: 5
		periodSeconds: 5
		timeoutSeconds: 3
		failureThreshold: 6
		livenessProbe:
		httpGet:
		path: /health
		port: 19000
		initialDelaySeconds: 30
		periodSeconds: 20
		timeoutSeconds: 5
		failureThreshold: 3
		resources:
		requests:
		cpu: "50m"
		memory: "256Mi"
		limits:
		cpu: "1000m"
		memory: "1Gi"
		volumes:
		- name: olam-home
		persistentVolumeClaim:
		claimName: olam-host-cp-data
		- name: gh-config
		hostPath:
		path: /host/.config/gh
		type: DirectoryOrCreate
		- name: operator-repo
		hostPath:
		path: /host/olam
		type: DirectoryOrCreate
		- name: tmp
		emptyDir: {}
		- name: plan-chat-secret
		secret:
		secretName: olam-plan-chat-secret
		defaultMode: 0400
		items:
		- key: PLAN_CHAT_SECRET
		path: secret
		# host-colima + docker-socket volumes REMOVED in olam-k3d-on-mac-
		# substrate-decision Phase B B2 (2026-05-21). R3-A's two-volume
		# hostPath approach is fully retracted: round-4 R4-W2-F demonstrated
		# virtiofs ENOTSUP on socket-file stat is unrecoverable at the
		# containerd OCI runtime layer (kubelet bypass via R4-W2-E was
		# necessary-but-not-sufficient). host-cp now reaches docker via TCP
		# through the docker-socket-proxy ExternalName Service — see
		# packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml.
		# The proxy itself runs on the operator's docker daemon (sibling to
		# k3d), started by `olam upgrade` Step 0.7 on macOS.

-30

host-cp/k8s/manifests/60-service.yaml

		# ClusterIP Service for olam-host-cp.
		#
		# Two ways to reach the SPA externally:
		# 1. (preferred) Traefik IngressRoute at https://olam.local:<traefik-https-port>
		# Terminates TLS at the cluster edge, unlocks HTTP/2 multiplexing for
		# Electric SQL long-polls. See 70-ingressroute.yaml + 65-tls-secret-template.yaml.tmpl.
		# The pod itself stays HTTP-only — Traefik handles TLS at the edge.
		# 2. (fallback) kubectl port-forward -n olam svc/olam-host-cp 19000:19000
		# Plain HTTP/1.1; hits browser's 6-conn-per-origin cap under Electric load.
		#
		# ClusterIP (not NodePort) preserves the "127.0.0.1-only" single-user-per-host
		# invariant — exposure is via Traefik's LoadBalancer or port-forward, not by
		# binding pod ports on every node interface.
		apiVersion: v1
		kind: Service
		metadata:
		name: olam-host-cp
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack
		spec:
		type: ClusterIP
		selector:
		app: olam-host-cp
		ports:
		- name: http
		port: 19000
		targetPort: 19000
		protocol: TCP

-35

host-cp/k8s/manifests/65-tls-secret-template.yaml.tmpl

		# TLS secret template for olam-host-cp Traefik IngressRoute.
		#
		# DO NOT apply this template directly — the placeholders `__TLS_CRT_BASE64__`
		# and `__TLS_KEY_BASE64__` are substituted at apply time by
		# `olam services tls-install` (packages/cli/src/commands/services-tls.ts),
		# which uses `mkcert` to mint a locally-trusted certificate for the SAN list
		# olam.local 127.0.0.1 ::1
		# and then `kubectl apply -f -` against the rendered manifest.
		#
		# Why a Secret of type kubernetes.io/tls (instead of a plain Opaque secret):
		# Traefik's IngressRoute TLS resolver requires this exact type — it reads
		# tls.crt + tls.key fields by convention. Using Opaque would silently fail
		# the handshake at request time.
		#
		# Why the cert covers SANs (not just CN): modern browsers (Chrome 58+, Brave,
		# Safari, Firefox) ignore the certificate CN entirely and only honour SANs.
		# Without `127.0.0.1` + `::1` in the SAN list, hitting the IP directly fails
		# even though the cert is "valid for olam.local".
		#
		# Renewal: certs minted by mkcert are valid ~2 years and 3 months. The
		# tls-install command checks NotAfter and regenerates when within 30 days
		# of expiry. To force regeneration: `kubectl -n olam delete secret olam-host-cp-tls`
		# and re-run `olam services tls-install`.
		apiVersion: v1
		kind: Secret
		metadata:
		name: olam-host-cp-tls
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack
		type: kubernetes.io/tls
		data:
		tls.crt: __TLS_CRT_BASE64__
		tls.key: __TLS_KEY_BASE64__

-58

host-cp/k8s/manifests/70-ingressroute.yaml

		# Traefik IngressRoute terminating TLS at the cluster edge for olam-host-cp.
		#
		# Topology:
		#   Browser --HTTPS/h2--> Traefik :443 (LoadBalancer / k3d NodePort)
		#                            |
		#                            |  (TLS terminated; cleartext inside cluster)
		#                            v
		#                     olam-host-cp:19000 (ClusterIP, HTTP/1.1 internal)
		#                            |
		#                            v
		#                     plan-chat-service:3200 (and other peripherals)
		#
		# Why terminate TLS at Traefik (NOT at host-cp): host-cp is a Node/Hono
		# server tuned for cleartext HTTP. Pushing TLS into the pod would force a
		# second cert-distribution mechanism (Secret → volumeMount → server.mjs
		# reload) and double the operational surface. Traefik already owns cert
		# lifecycle in production (cert-manager + Let's Encrypt), so dev-mode
		# mkcert at the same boundary keeps prod parity tight.
		#
		# Why HTTP/2 matters: TanStack DB / Electric SQL opens N long-poll
		# connections per browser tab (one per shape subscription). Without h2
		# multiplexing they queue against the browser's 6-connection-per-origin
		# cap, leading to the "25-second pending requests" symptom Electric users
		# hit on HTTP/1.1. Traefik 2.x advertises h2 over TLS via ALPN by default;
		# no extra config needed.
		#
		# Why Host(olam.local) instead of a wildcard: the cert is minted for that
		# exact SAN. Traefik routes based on SNI, so the host-rule must match the
		# cert subject or the TLS handshake completes but the route 404s.
		#
		# Operator MUST add `127.0.0.1 olam.local` to /etc/hosts before this works.
		# `olam services tls-install` prints the line + sudo command — it does NOT
		# auto-edit (touching /etc/hosts behind the operator's back is a foot-gun).
		apiVersion: traefik.io/v1alpha1
		kind: IngressRoute
		metadata:
		# Distinct name avoids collision with packages/peripheral-services'
		# `olam-host-cp` IngressRoute (the legacy `web`-entrypoint + path-based
		# router that 50+ SPA fetch sites still depend on). The `-https` variant
		# adds a SECOND ingress that matches Host(olam.local) on `websecure` and
		# terminates TLS via the operator-minted Secret. Both coexist; the legacy
		# one keeps `http://<lb>/api/...` working, this one unlocks HTTP/2.
		name: olam-host-cp-https
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack
		spec:
		entryPoints:
		- websecure
		routes:
		- match: Host(`olam.local`)
		kind: Rule
		services:
		- name: olam-host-cp
		port: 19000
		tls:
		secretName: olam-host-cp-tls

-8

host-cp/k8s/manifests/auth-service/10-serviceaccount.yaml

		apiVersion: v1
		kind: ServiceAccount
		metadata:
		name: olam-auth-service
		namespace: olam
		labels:
		app: olam-auth-service
		olam.io/component: peripheral

-34

host-cp/k8s/manifests/auth-service/20-rbac.yaml

		# Phase 1a Decision 19: Role scoped to resourceNames: ["olam-auth-service"] on
		# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
		# could patch ANY Deployment in the namespace. This is the load-bearing
		# security guardrail — preserve verbatim.
		apiVersion: rbac.authorization.k8s.io/v1
		kind: Role
		metadata:
		name: olam-auth-service
		namespace: olam
		labels:
		app: olam-auth-service
		olam.io/component: peripheral
		rules:
		- apiGroups: ["apps"]
		resources: ["deployments"]
		resourceNames: ["olam-auth-service"]
		verbs: ["get", "patch", "watch"]
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: RoleBinding
		metadata:
		name: olam-auth-service
		namespace: olam
		labels:
		app: olam-auth-service
		olam.io/component: peripheral
		subjects:
		- kind: ServiceAccount
		name: olam-auth-service
		namespace: olam
		roleRef:
		kind: Role
		name: olam-auth-service
		apiGroup: rbac.authorization.k8s.io

-29

host-cp/k8s/manifests/auth-service/30-configmap.yaml

		# ConfigMap for olam-auth-service environment. Sensitive values (AUTH_DB_SECRET,
		# API keys) are NOT here — they live in the Secret (see templates/auth-service-secret-template.yaml).
		# Operators apply the Secret separately before applying the manifests.
		#
		# Inter-peripheral URL placeholders (e.g. OLAM_MCP_AUTH_SERVICE_URL) are set to
		# cluster-internal DNS names. These are resolved by Phase C substitution;
		# operators running Phase 2 Beta may override them directly.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-auth-service-env
		namespace: olam
		labels:
		app: olam-auth-service
		olam.io/component: peripheral
		data:
		# Port auth-service listens on. Must match 60-service.yaml targetPort.
		OLAM_AUTH_PORT: "9999"
		# Data directory — backed by the PVC mounted at /data.
		OLAM_AUTH_DATA_PATH: "/data/auth"
		# URL of mcp-auth-service (cluster-internal DNS). Override in non-k3d environments.
		OLAM_MCP_AUTH_SERVICE_URL: "http://olam-mcp-auth-service.olam.svc.cluster.local:9998"
		# Credential vault poll interval.
		OLAM_CREDENTIAL_POLL_MS: "60000"
		# R3-B (Decision R3-#2): bind on all interfaces so the k8s readiness probe
		# (hitting the pod IP 10.42.x.x:9999) succeeds. Default in image source was
		# 127.0.0.1 which caused CrashLoopBackOff in k8s. ConfigMap override is the
		# second defense layer; the image source default was also changed to 0.0.0.0.
		AUTH_BIND: "0.0.0.0"

-25

host-cp/k8s/manifests/auth-service/45-pvc.yaml

		# PersistentVolumeClaim for olam-auth-service /data volume.
		#
		# Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/45-pvc.yaml
		# for the full rationale (fsGroup, k3d node filesystem, etc.).
		#
		# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
		# On non-k3d clusters, substitute storageClassName with your cluster's provisioner.
		# D24: storageClassName operator-editable — edit the field below for non-k3d substrates.
		apiVersion: v1
		kind: PersistentVolumeClaim
		metadata:
		name: olam-auth-data
		namespace: olam
		labels:
		app: olam-auth-service
		olam.io/component: peripheral
		spec:
		accessModes:
		- ReadWriteOnce
		# D24: operator-editable. k3d default is local-path. Change for non-k3d substrates.
		storageClassName: local-path
		resources:
		requests:
		# D25: auth-service PVC size 5Gi.
		storage: 5Gi

-124

host-cp/k8s/manifests/auth-service/50-deployment.yaml

		# Deployment for olam-auth-service.
		#
		# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
		# Digest resolves to ghcr.io/pleri/olam-auth:latest (multi-arch index).
		# NOTE (B1): image name is olam-auth (NOT olam-auth-service) — matches the
		# actual GHCR package name published by release.yml publish-auth job.
		# To update: resolve the new tag's digest via:
		# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-auth:pull&service=ghcr.io" | jq -r .token)
		# curl -sI -H "Authorization: Bearer $TOKEN" \
		# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
		# https://ghcr.io/v2/pleri/olam-auth/manifests/<tag> | grep docker-content-digest
		# Or use: node scripts/refresh-manifest-digests.mjs
		#
		# securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot,
		# readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs.
		#
		# D17: auth-service does NOT mount /var/run/docker.sock (Phase 2 k8s pods
		# cannot reach docker.sock — no hostPath socket mount).
		#
		# chown-data init container: grants UID-1000 write access on the freshly-
		# provisioned PV (fsGroup alone is insufficient for local-path PVs).
		apiVersion: apps/v1
		kind: Deployment
		metadata:
		name: olam-auth-service
		namespace: olam
		labels:
		app: olam-auth-service
		olam.io/component: peripheral
		spec:
		replicas: 1
		strategy:
		type: RollingUpdate
		rollingUpdate:
		maxSurge: 1
		maxUnavailable: 0
		selector:
		matchLabels:
		app: olam-auth-service
		template:
		metadata:
		labels:
		app: olam-auth-service
		spec:
		# B9 (round 2 recovery): disable k8s automatic Service env injection.
		# See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale.
		enableServiceLinks: false
		# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
		# created by `olam upgrade` step 0.4 when GH_TOKEN is available.
		imagePullSecrets:
		- name: ghcr-pull
		serviceAccountName: olam-auth-service
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		runAsGroup: 1000
		fsGroup: 1000
		initContainers:
		- name: chown-data
		# busybox:1.36 — sha256-pinned per T4 threat model.
		image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsUser: 0
		runAsNonRoot: false
		allowPrivilegeEscalation: false
		command: ["chown", "-R", "1000:1000", "/data"]
		volumeMounts:
		- name: auth-data
		mountPath: /data
		containers:
		- name: olam-auth-service
		image: ghcr.io/pleri/olam-auth@sha256:770ee97ee4d06d2c1b6512ba99421a5fe312393d592df1684fd0d03b3476ff10
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		readOnlyRootFilesystem: true
		allowPrivilegeEscalation: false
		capabilities:
		drop: ["ALL"]
		ports:
		- name: http
		containerPort: 9999
		protocol: TCP
		envFrom:
		- configMapRef:
		name: olam-auth-service-env
		- secretRef:
		name: olam-auth-service-secret
		volumeMounts:
		- name: auth-data
		mountPath: /data
		- name: tmp
		mountPath: /tmp
		readinessProbe:
		httpGet:
		path: /health
		port: 9999
		initialDelaySeconds: 5
		periodSeconds: 5
		timeoutSeconds: 3
		failureThreshold: 6
		livenessProbe:
		httpGet:
		path: /health
		port: 9999
		initialDelaySeconds: 30
		periodSeconds: 20
		timeoutSeconds: 5
		failureThreshold: 3
		resources:
		requests:
		cpu: "50m"
		memory: "128Mi"
		limits:
		cpu: "500m"
		memory: "512Mi"
		volumes:
		- name: auth-data
		persistentVolumeClaim:
		claimName: olam-auth-data
		- name: tmp
		emptyDir: {}

-21

host-cp/k8s/manifests/auth-service/60-service.yaml

		# ClusterIP Service for olam-auth-service.
		# Port 9999 — consumed by host-cp and other peripherals via cluster-internal DNS.
		# Operator surfaces externally via:
		# kubectl port-forward -n olam svc/olam-auth-service 9999:9999
		apiVersion: v1
		kind: Service
		metadata:
		name: olam-auth-service
		namespace: olam
		labels:
		app: olam-auth-service
		olam.io/component: peripheral
		spec:
		type: ClusterIP
		selector:
		app: olam-auth-service
		ports:
		- name: http
		port: 9999
		targetPort: 9999
		protocol: TCP

-8

host-cp/k8s/manifests/chunks-electric/10-serviceaccount.yaml

		apiVersion: v1
		kind: ServiceAccount
		metadata:
		name: olam-chunks-electric
		namespace: olam
		labels:
		app: olam-chunks-electric
		olam.io/component: substrate

-27

host-cp/k8s/manifests/chunks-electric/20-rbac.yaml

		# Electric does not call the Kubernetes API. Empty Role kept for layout parity.
		apiVersion: rbac.authorization.k8s.io/v1
		kind: Role
		metadata:
		name: olam-chunks-electric
		namespace: olam
		labels:
		app: olam-chunks-electric
		olam.io/component: substrate
		rules: []
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: RoleBinding
		metadata:
		name: olam-chunks-electric
		namespace: olam
		labels:
		app: olam-chunks-electric
		olam.io/component: substrate
		roleRef:
		apiGroup: rbac.authorization.k8s.io
		kind: Role
		name: olam-chunks-electric
		subjects:
		- kind: ServiceAccount
		name: olam-chunks-electric
		namespace: olam

-23

host-cp/k8s/manifests/chunks-electric/30-configmap.yaml

		# ConfigMap for olam-chunks-electric.
		#
		# ELECTRIC_INSECURE=true disables Electric's API-secret-token gate. Acceptable
		# in a single-operator local-dev k3d cluster (the Service is ClusterIP — no
		# external reachability). For multi-tenant deploys, set ELECTRIC_INSECURE=false
		# and provision ELECTRIC_SECRET via a Secret instead.
		#
		# DATABASE_URL is composed in the Deployment's env: block, which interpolates
		# $(POSTGRES_PASSWORD) sourced from the chunks-postgres Secret. It is NOT
		# stored here.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-chunks-electric-env
		namespace: olam
		labels:
		app: olam-chunks-electric
		olam.io/component: substrate
		data:
		ELECTRIC_INSECURE: "true"
		ELECTRIC_PORT: "3000"
		ELECTRIC_HTTP_API_PORT: "3000"
		ELECTRIC_LOG_LEVEL: "info"

-19

host-cp/k8s/manifests/chunks-electric/45-pvc.yaml

		# Electric's HTTP server state lives in memory and the replication slot lives on
		# Postgres, so no durable state is strictly required. A small PVC is still kept,
		# both for parity with other peripherals and because Electric writes its
		# persisted-shape index to /app/persistent by default; the PVC backs that path.
		apiVersion: v1
		kind: PersistentVolumeClaim
		metadata:
		name: olam-chunks-electric-data
		namespace: olam
		labels:
		app: olam-chunks-electric
		olam.io/component: substrate
		spec:
		accessModes:
		- ReadWriteOnce
		storageClassName: local-path
		resources:
		requests:
		storage: 1Gi

-84

host-cp/k8s/manifests/chunks-electric/50-deployment.yaml

		# Deployment for olam-chunks-electric.
		#
		# Electric SQL — Postgres logical-replication → HTTP long-poll shape proxy.
		# Single replica (replication slot is single-writer).
		#
		# Image: electricsql/electric:1.6.8 — sha256-pinned per T4 threat model.
		# Resolves to the same digest as :latest at 2026-05-27; refresh when the
		# upstream cuts a new release that closes a security advisory.
		apiVersion: apps/v1
		kind: Deployment
		metadata:
		name: olam-chunks-electric
		namespace: olam
		labels:
		app: olam-chunks-electric
		olam.io/component: substrate
		spec:
		replicas: 1
		strategy:
		# Recreate (NOT RollingUpdate) — Electric holds a postgres replication
		# slot; two pods running at once would fight for the same slot and one
		# would crashloop.
		type: Recreate
		selector:
		matchLabels:
		app: olam-chunks-electric
		template:
		metadata:
		labels:
		app: olam-chunks-electric
		spec:
		enableServiceLinks: false
		serviceAccountName: olam-chunks-electric
		containers:
		- name: electric
		image: electricsql/electric:1.6.8@sha256:a716f2affde44d5b991bdd1492876d9d6bddbcae5c98411327614575cd8f9eec
		imagePullPolicy: IfNotPresent
		ports:
		- name: http
		containerPort: 3000
		protocol: TCP
		envFrom:
		- configMapRef:
		name: olam-chunks-electric-env
		env:
		# DATABASE_URL composition. POSTGRES_PASSWORD is sourced from the
		# chunks-postgres Secret (rendered by k8s-secret-render.ts).
		- name: POSTGRES_PASSWORD
		valueFrom:
		secretKeyRef:
		name: olam-chunks-postgres-secret
		key: POSTGRES_PASSWORD
		- name: DATABASE_URL
		value: "postgres://postgres:$(POSTGRES_PASSWORD)@olam-chunks-postgres.olam.svc.cluster.local:5432/chunks?sslmode=disable"
		volumeMounts:
		- name: persistent
		mountPath: /app/persistent
		readinessProbe:
		httpGet:
		path: /v1/health
		port: 3000
		initialDelaySeconds: 10
		periodSeconds: 5
		timeoutSeconds: 3
		failureThreshold: 12
		livenessProbe:
		httpGet:
		path: /v1/health
		port: 3000
		initialDelaySeconds: 60
		periodSeconds: 20
		timeoutSeconds: 5
		failureThreshold: 3
		resources:
		requests:
		cpu: "100m"
		memory: "256Mi"
		limits:
		cpu: "1000m"
		memory: "1Gi"
		volumes:
		- name: persistent
		persistentVolumeClaim:
		claimName: olam-chunks-electric-data

-17

host-cp/k8s/manifests/chunks-electric/60-service.yaml

		apiVersion: v1
		kind: Service
		metadata:
		name: olam-chunks-electric
		namespace: olam
		labels:
		app: olam-chunks-electric
		olam.io/component: substrate
		spec:
		type: ClusterIP
		selector:
		app: olam-chunks-electric
		ports:
		- name: http
		port: 3000
		targetPort: 3000
		protocol: TCP

-8

host-cp/k8s/manifests/chunks-postgres/10-serviceaccount.yaml

		apiVersion: v1
		kind: ServiceAccount
		metadata:
		name: olam-chunks-postgres
		namespace: olam
		labels:
		app: olam-chunks-postgres
		olam.io/component: substrate

-29

host-cp/k8s/manifests/chunks-postgres/20-rbac.yaml

		# Minimal-privilege RBAC for chunks-postgres. The pod does not call the
		# Kubernetes API; this Role exists to make the per-service apply order
		# (10/20/30/45/50/60) uniform across peripherals + substrate.
		apiVersion: rbac.authorization.k8s.io/v1
		kind: Role
		metadata:
		name: olam-chunks-postgres
		namespace: olam
		labels:
		app: olam-chunks-postgres
		olam.io/component: substrate
		rules: []
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: RoleBinding
		metadata:
		name: olam-chunks-postgres
		namespace: olam
		labels:
		app: olam-chunks-postgres
		olam.io/component: substrate
		roleRef:
		apiGroup: rbac.authorization.k8s.io
		kind: Role
		name: olam-chunks-postgres
		subjects:
		- kind: ServiceAccount
		name: olam-chunks-postgres
		namespace: olam

-185

host-cp/k8s/manifests/chunks-postgres/30-configmap.yaml

		# ConfigMap for olam-chunks-postgres.
		#
		# Two ConfigMaps in one file:
		#
		# 1. olam-chunks-postgres-env — non-secret env vars (POSTGRES_USER, POSTGRES_DB).
		# POSTGRES_PASSWORD lives in the Secret rendered by
		# packages/cli/src/lib/k8s-secret-render.ts.
		#
		# 2. olam-chunks-postgres-initdb-sql — the chunks schema. Mounted at
		# /docker-entrypoint-initdb.d/01-chunks.sql so
		# the postgres image's entrypoint auto-applies it
		# on FIRST init (empty data dir). Subsequent
		# restarts skip the directory by design.
		#
		# Source-of-truth: packages/chunks/src/schema.ts
		# (SCHEMA_SQL export). The CI gate
		# `audit:chunks-schema-parity` (follow-up) will
		# fail when this ConfigMap drifts from
		# SCHEMA_VERSION-tagged schema.ts.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-chunks-postgres-env
		namespace: olam
		labels:
		app: olam-chunks-postgres
		olam.io/component: substrate
		data:
		POSTGRES_USER: "postgres"
		POSTGRES_DB: "chunks"
		# PGDATA must point at a subdirectory of the PVC mount, not its root —
		# the PVC root may carry the local-path provisioner's lost+found dir,
		# which postgres's initdb rejects ("data directory not empty").
		PGDATA: "/var/lib/postgresql/data/pgdata"
		---
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-chunks-postgres-initdb-sql
		namespace: olam
		labels:
		app: olam-chunks-postgres
		olam.io/component: substrate
		data:
		# MIRRORS packages/chunks/src/schema.ts SCHEMA_VERSION=2.
		# Idempotent: CREATE TABLE IF NOT EXISTS / ADD COLUMN IF NOT EXISTS /
		# DO blocks with EXCEPTION-WHEN-{undefined_object,duplicate_object}.
		01-chunks.sql: |
		CREATE TABLE IF NOT EXISTS chunks (
		world_id TEXT NOT NULL,
		session_id TEXT NOT NULL,
		message_id TEXT NOT NULL,
		seq INTEGER NOT NULL,
		actor_id TEXT NOT NULL,
		actor_type TEXT NOT NULL CHECK (actor_type IN ('agent', 'operator', 'codex', 'system')),
		role TEXT NOT NULL CHECK (role IN ('user', 'assistant', 'tool', 'system')),
		chunk TEXT NOT NULL,
		chunk_type TEXT NOT NULL DEFAULT 'text' CHECK (chunk_type IN ('text', 'tool_use', 'goal_mode_assumption', 'dispatch_overflow')),
		created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
		PRIMARY KEY (message_id, seq)
		);

		ALTER TABLE chunks ADD COLUMN IF NOT EXISTS chunk_type TEXT NOT NULL DEFAULT 'text';

		DO $$ BEGIN
		ALTER TABLE chunks DROP CONSTRAINT IF EXISTS chunks_chunk_type_check;
		EXCEPTION WHEN undefined_object THEN NULL;
		END $$;

		DO $$ BEGIN
		ALTER TABLE chunks ADD CONSTRAINT chunks_chunk_type_check
		CHECK (chunk_type IN ('text', 'tool_use', 'goal_mode_assumption', 'dispatch_overflow'));
		EXCEPTION WHEN duplicate_object THEN NULL;
		END $$;

		CREATE INDEX IF NOT EXISTS chunks_world_session_seq
		ON chunks (world_id, session_id, seq);

		CREATE INDEX IF NOT EXISTS chunks_world_session_created
		ON chunks (world_id, session_id, created_at);

		CREATE INDEX IF NOT EXISTS idx_chunks_planning
		ON chunks (session_id, seq)
		WHERE world_id = '_planning';

		CREATE TABLE IF NOT EXISTS planning_sessions (
		session_id TEXT PRIMARY KEY,
		actor_id TEXT NOT NULL,
		summary TEXT,
		crystallize_status TEXT NOT NULL DEFAULT 'open'
		CHECK (crystallize_status IN ('open', 'in_progress', 'crystallized', 'failed', 'abandoned')),
		crystallized_world_id TEXT,
		created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
		updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
		);

		CREATE INDEX IF NOT EXISTS idx_planning_sessions_created_at
		ON planning_sessions (created_at DESC);

		ALTER TABLE planning_sessions ADD COLUMN IF NOT EXISTS session_source TEXT;

		CREATE OR REPLACE FUNCTION chunks_append_only_trigger()
		RETURNS trigger AS $body$
		BEGIN
		RAISE EXCEPTION 'chunks is append-only; % forbidden', TG_OP;
		END;
		$body$ LANGUAGE plpgsql;

		DROP TRIGGER IF EXISTS chunks_no_update ON chunks;
		CREATE TRIGGER chunks_no_update
		BEFORE UPDATE ON chunks
		FOR EACH ROW EXECUTE FUNCTION chunks_append_only_trigger();

		DROP TRIGGER IF EXISTS chunks_no_delete ON chunks;
		CREATE TRIGGER chunks_no_delete
		BEFORE DELETE ON chunks
		FOR EACH ROW EXECUTE FUNCTION chunks_append_only_trigger();

		CREATE TABLE IF NOT EXISTS message_usage (
		world_id TEXT NOT NULL,
		session_id TEXT NOT NULL,
		message_id TEXT NOT NULL,
		actor_id TEXT NOT NULL,
		model TEXT NOT NULL,
		input_tokens INTEGER NOT NULL DEFAULT 0,
		output_tokens INTEGER NOT NULL DEFAULT 0,
		cache_read_tokens INTEGER NOT NULL DEFAULT 0,
		cache_create_tokens INTEGER NOT NULL DEFAULT 0,
		created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
		PRIMARY KEY (message_id, actor_id)
		);

		CREATE INDEX IF NOT EXISTS message_usage_session_created
		ON message_usage (session_id, created_at);

		CREATE OR REPLACE FUNCTION message_usage_append_only_trigger()
		RETURNS trigger AS $body$
		BEGIN
		RAISE EXCEPTION 'message_usage is append-only; % forbidden', TG_OP;
		END;
		$body$ LANGUAGE plpgsql;

		DROP TRIGGER IF EXISTS message_usage_no_update ON message_usage;
		CREATE TRIGGER message_usage_no_update
		BEFORE UPDATE ON message_usage
		FOR EACH ROW EXECUTE FUNCTION message_usage_append_only_trigger();

		DROP TRIGGER IF EXISTS message_usage_no_delete ON message_usage;
		CREATE TRIGGER message_usage_no_delete
		BEFORE DELETE ON message_usage
		FOR EACH ROW EXECUTE FUNCTION message_usage_append_only_trigger();

		CREATE TABLE IF NOT EXISTS planning_artifacts (
		id TEXT PRIMARY KEY,
		world_id TEXT NOT NULL,
		session_id TEXT NOT NULL,
		type TEXT NOT NULL CHECK (type IN ('commit_plan', 'component_scaffold', 'design_jam')),
		title TEXT NOT NULL,
		body JSONB NOT NULL,
		status TEXT NOT NULL DEFAULT 'open'
		CHECK (status IN ('open', 'crystallized', 'failed', 'archived')),
		linear_issue_url TEXT,
		crystallized_world_id TEXT,
		created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
		updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
		);

		CREATE INDEX IF NOT EXISTS idx_planning_artifacts_session
		ON planning_artifacts (session_id, created_at);

		CREATE INDEX IF NOT EXISTS idx_planning_artifacts_world
		ON planning_artifacts (world_id, status);

		CREATE OR REPLACE FUNCTION planning_artifacts_touch_updated_at()
		RETURNS trigger AS $body$
		BEGIN
		NEW.updated_at = NOW();
		RETURN NEW;
		END;
		$body$ LANGUAGE plpgsql;

		DROP TRIGGER IF EXISTS planning_artifacts_touch ON planning_artifacts;
		CREATE TRIGGER planning_artifacts_touch
		BEFORE UPDATE ON planning_artifacts
		FOR EACH ROW EXECUTE FUNCTION planning_artifacts_touch_updated_at();

-24

host-cp/k8s/manifests/chunks-postgres/45-pvc.yaml

		# PVC for the chunks-postgres data directory.
		#
		# Sized 10Gi for local-dev. Chunks rows are small (~1KB each) so even a
		# busy single-operator world rarely cracks 1Gi; the headroom is for the
		# message_usage + planning_artifacts sidecar tables.
		#
		# accessModes: ReadWriteOnce — postgres is a StatefulSet with replicas=1.
		# k3d's local-path provisioner only supports RWO; the in-cluster postgres
		# pattern is single-writer by design (no operator-managed HA).
		apiVersion: v1
		kind: PersistentVolumeClaim
		metadata:
		name: olam-chunks-postgres-data
		namespace: olam
		labels:
		app: olam-chunks-postgres
		olam.io/component: substrate
		spec:
		accessModes:
		- ReadWriteOnce
		storageClassName: local-path
		resources:
		requests:
		storage: 10Gi

-101

host-cp/k8s/manifests/chunks-postgres/50-deployment.yaml

		# StatefulSet for olam-chunks-postgres.
		#
		# Why StatefulSet vs Deployment: even with replicas=1 the StatefulSet gives
		# stable network identity (olam-chunks-postgres-0 inside the headless service)
		# and ordered termination semantics — both useful when Electric's replication
		# slot survives pod restarts.
		#
		# args override: postgres requires wal_level=logical for Electric SQL's
		# logical-replication subscription. The image's default postgresql.conf
		# ships wal_level=replica; the -c flags passed as container args (the
		# image entrypoint is left untouched) take precedence over it.
		# max_replication_slots / max_wal_senders need raising too —
		# Electric holds one slot per database.
		#
		# securityContext: postgres image runs as uid 999 by default. fsGroup=999
		# on the pod ensures the PVC mount is chowned to 999 so postgres can write
		# its data dir.
		apiVersion: apps/v1
		kind: StatefulSet
		metadata:
		name: olam-chunks-postgres
		namespace: olam
		labels:
		app: olam-chunks-postgres
		olam.io/component: substrate
		spec:
		replicas: 1
		serviceName: olam-chunks-postgres
		selector:
		matchLabels:
		app: olam-chunks-postgres
		template:
		metadata:
		labels:
		app: olam-chunks-postgres
		spec:
		enableServiceLinks: false
		serviceAccountName: olam-chunks-postgres
		securityContext:
		fsGroup: 999
		containers:
		- name: postgres
		# postgres:16-alpine — sha256-pinned per T4 threat model.
		image: postgres:16-alpine@sha256:16bc17c64a573ef34162af9298258d1aec548232985b33ed7b1eac33ba35c229
		imagePullPolicy: IfNotPresent
		args:
		- postgres
		- -c
		- wal_level=logical
		- -c
		- max_replication_slots=10
		- -c
		- max_wal_senders=10
		ports:
		- name: postgres
		containerPort: 5432
		protocol: TCP
		envFrom:
		- configMapRef:
		name: olam-chunks-postgres-env
		- secretRef:
		name: olam-chunks-postgres-secret
		volumeMounts:
		- name: data
		mountPath: /var/lib/postgresql/data
		- name: initdb
		mountPath: /docker-entrypoint-initdb.d
		readOnly: true
		readinessProbe:
		exec:
		command:
		- sh
		- -c
		- pg_isready -U postgres -d chunks -h 127.0.0.1
		initialDelaySeconds: 5
		periodSeconds: 5
		timeoutSeconds: 3
		failureThreshold: 12
		livenessProbe:
		exec:
		command:
		- sh
		- -c
		- pg_isready -U postgres -h 127.0.0.1
		initialDelaySeconds: 30
		periodSeconds: 20
		timeoutSeconds: 5
		failureThreshold: 3
		resources:
		requests:
		cpu: "100m"
		memory: "256Mi"
		limits:
		cpu: "1000m"
		memory: "1Gi"
		volumes:
		- name: data
		persistentVolumeClaim:
		claimName: olam-chunks-postgres-data
		- name: initdb
		configMap:
		name: olam-chunks-postgres-initdb-sql

-24

host-cp/k8s/manifests/chunks-postgres/60-service.yaml

		# Headless Service for olam-chunks-postgres StatefulSet.
		#
		# clusterIP: None gives the StatefulSet's pod stable DNS:
		# olam-chunks-postgres-0.olam-chunks-postgres.olam.svc.cluster.local
		# Callers (plan-chat-service, chunks-electric) connect via the shorter
		# olam-chunks-postgres.olam.svc.cluster.local form. Because the Service is
		# headless, cluster DNS answers that name with the pod IP(s) directly — here
		# the single backing pod — rather than a virtual ClusterIP.
		apiVersion: v1
		kind: Service
		metadata:
		name: olam-chunks-postgres
		namespace: olam
		labels:
		app: olam-chunks-postgres
		olam.io/component: substrate
		spec:
		clusterIP: None
		selector:
		app: olam-chunks-postgres
		ports:
		- name: postgres
		port: 5432
		targetPort: 5432
		protocol: TCP

-37

host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml

		# ExternalName Service for the host-side docker-socket-proxy.
		#
		# Provides in-cluster DNS for pods to reach the host-side proxy
		# container (defined in packages/host-cp/k8s/host-side/docker-socket-proxy.compose.yaml).
		# The Service has NO backing Pod — `type: ExternalName` is a kube-dns
		# CNAME alias to `host.k3d.internal`, the gateway address that k3d
		# auto-provisions inside every node container.
		#
		# Decision #7 (round-4 plan pass 2): Universal across all k8s substrates
		# (macOS+colima+virtiofs, Linux native k3d, WSL2). One codepath; the
		# per-Pod cost of running an in-cluster proxy elsewhere is invisible
		# against the maintenance tax of OS-conditional Service generation.
		#
		# Why ExternalName and not in-cluster Pod with hostPath:
		# the in-cluster Pod would itself need to bind /var/run/docker.sock
		# from the lima VM, hitting the same virtiofs ENOTSUP class that
		# R4-W2-F is. The proxy must live OUTSIDE the k3d cluster, on the
		# operator's colima docker daemon. ExternalName makes that
		# transparent to consumers: host-cp configures
		# { host: 'docker-socket-proxy', port: 2375 } regardless of where
		# the actual proxy container lives.
		apiVersion: v1
		kind: Service
		metadata:
		name: docker-socket-proxy
		namespace: olam
		labels:
		app: docker-socket-proxy
		olam.io/component: host-stack
		spec:
		type: ExternalName
		externalName: host.k3d.internal
		ports:
		- name: tcp-2375
		port: 2375
		targetPort: 2375
		protocol: TCP

-8

host-cp/k8s/manifests/kg-service/10-serviceaccount.yaml

		apiVersion: v1
		kind: ServiceAccount
		metadata:
		name: olam-kg-service
		namespace: olam
		labels:
		app: olam-kg-service
		olam.io/component: peripheral

-34

host-cp/k8s/manifests/kg-service/20-rbac.yaml

		# Phase 1a Decision 19: Role scoped to resourceNames: ["olam-kg-service"] on
		# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
		# could patch ANY Deployment in the namespace. This is the load-bearing
		# security guardrail — preserve verbatim.
		apiVersion: rbac.authorization.k8s.io/v1
		kind: Role
		metadata:
		name: olam-kg-service
		namespace: olam
		labels:
		app: olam-kg-service
		olam.io/component: peripheral
		rules:
		- apiGroups: ["apps"]
		resources: ["deployments"]
		resourceNames: ["olam-kg-service"]
		verbs: ["get", "patch", "watch"]
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: RoleBinding
		metadata:
		name: olam-kg-service
		namespace: olam
		labels:
		app: olam-kg-service
		olam.io/component: peripheral
		subjects:
		- kind: ServiceAccount
		name: olam-kg-service
		namespace: olam
		roleRef:
		kind: Role
		name: olam-kg-service
		apiGroup: rbac.authorization.k8s.io

-23

host-cp/k8s/manifests/kg-service/30-configmap.yaml

		# ConfigMap for olam-kg-service environment. Sensitive values live in
		# the Secret (see templates/kg-service-secret-template.yaml).
		# Operators apply the Secret separately before applying the manifests.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-kg-service-env
		namespace: olam
		labels:
		app: olam-kg-service
		olam.io/component: peripheral
		data:
		# Port kg-service listens on. Must match 60-service.yaml targetPort.
		OLAM_KG_PORT: "9997"
		# Data directory — backed by the PVC mounted at /data.
		OLAM_KG_DATA_PATH: "/data/kg"
		# URL of auth-service (cluster-internal DNS). Override in non-k3d environments.
		OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
		# R3-B (Decision R3-#2): kg-service source (server.py) uses OLAM_KG_SERVICE_BIND
		# and defaults to 127.0.0.1. In k8s the readiness probe hits the pod IP, so
		# 127.0.0.1-only listener causes CrashLoopBackOff. ConfigMap override forces
		# all-interfaces bind without requiring an image rebuild.
		OLAM_KG_SERVICE_BIND: "0.0.0.0"

-25

host-cp/k8s/manifests/kg-service/45-pvc.yaml

		# PersistentVolumeClaim for olam-kg-service /data volume.
		#
		# Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml
		# for the full rationale (fsGroup, k3d node filesystem, etc.).
		#
		# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
		# On non-k3d clusters, substitute storageClassName with your cluster's provisioner.
		# D24: storageClassName operator-editable — edit the field below for non-k3d substrates.
		apiVersion: v1
		kind: PersistentVolumeClaim
		metadata:
		name: olam-kg-data
		namespace: olam
		labels:
		app: olam-kg-service
		olam.io/component: peripheral
		spec:
		accessModes:
		- ReadWriteOnce
		# D24: operator-editable. k3d default is local-path. Change for non-k3d substrates.
		storageClassName: local-path
		resources:
		requests:
		# D25: kg-service PVC size 10Gi (larger: graph index grows with codebase).
		storage: 10Gi

-115

host-cp/k8s/manifests/kg-service/50-deployment.yaml

		# Deployment for olam-kg-service.
		#
		# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
		# Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.0 (multi-arch index).
		# To update: resolve the new tag's digest via:
		# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-kg-service:pull&service=ghcr.io" | jq -r .token)
		# curl -sI -H "Authorization: Bearer $TOKEN" \
		# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
		# https://ghcr.io/v2/pleri/olam-kg-service/manifests/<tag> | grep docker-content-digest
		#
		# securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot,
		# readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs.
		apiVersion: apps/v1
		kind: Deployment
		metadata:
		name: olam-kg-service
		namespace: olam
		labels:
		app: olam-kg-service
		olam.io/component: peripheral
		spec:
		replicas: 1
		strategy:
		type: RollingUpdate
		rollingUpdate:
		maxSurge: 1
		maxUnavailable: 0
		selector:
		matchLabels:
		app: olam-kg-service
		template:
		metadata:
		labels:
		app: olam-kg-service
		spec:
		# B9 (round 2 recovery): disable k8s automatic Service env injection.
		# See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale.
		enableServiceLinks: false
		# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
		# created by `olam upgrade` step 0.4 when GH_TOKEN is available.
		imagePullSecrets:
		- name: ghcr-pull
		serviceAccountName: olam-kg-service
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		runAsGroup: 1000
		fsGroup: 1000
		initContainers:
		- name: chown-data
		# busybox:1.36 — sha256-pinned per T4 threat model.
		image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsUser: 0
		runAsNonRoot: false
		allowPrivilegeEscalation: false
		command: ["chown", "-R", "1000:1000", "/data"]
		volumeMounts:
		- name: kg-data
		mountPath: /data
		containers:
		- name: olam-kg-service
		image: ghcr.io/pleri/olam-kg-service@sha256:f97ee90fe1bd5b12cb56d5fbf0d3085c301bb7abeef0dd28d2b2a5c90ab6efbb
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		readOnlyRootFilesystem: true
		allowPrivilegeEscalation: false
		capabilities:
		drop: ["ALL"]
		ports:
		- name: http
		containerPort: 9997
		protocol: TCP
		envFrom:
		- configMapRef:
		name: olam-kg-service-env
		- secretRef:
		name: olam-kg-service-secret
		volumeMounts:
		- name: kg-data
		mountPath: /data
		- name: tmp
		mountPath: /tmp
		readinessProbe:
		httpGet:
		path: /health
		port: 9997
		initialDelaySeconds: 5
		periodSeconds: 5
		timeoutSeconds: 3
		failureThreshold: 6
		livenessProbe:
		httpGet:
		path: /health
		port: 9997
		initialDelaySeconds: 30
		periodSeconds: 20
		timeoutSeconds: 5
		failureThreshold: 3
		resources:
		requests:
		cpu: "100m"
		memory: "256Mi"
		limits:
		cpu: "1000m"
		memory: "1Gi"
		volumes:
		- name: kg-data
		persistentVolumeClaim:
		claimName: olam-kg-data
		- name: tmp
		emptyDir: {}

-21

host-cp/k8s/manifests/kg-service/60-service.yaml

		# ClusterIP Service for olam-kg-service.
		# Port 9997 — consumed by agents and host-cp via cluster-internal DNS.
		# Operator surfaces externally via:
		# kubectl port-forward -n olam svc/olam-kg-service 9997:9997
		apiVersion: v1
		kind: Service
		metadata:
		name: olam-kg-service
		namespace: olam
		labels:
		app: olam-kg-service
		olam.io/component: peripheral
		spec:
		type: ClusterIP
		selector:
		app: olam-kg-service
		ports:
		- name: http
		port: 9997
		targetPort: 9997
		protocol: TCP

-8

host-cp/k8s/manifests/mcp-auth-service/10-serviceaccount.yaml

		apiVersion: v1
		kind: ServiceAccount
		metadata:
		name: olam-mcp-auth-service
		namespace: olam
		labels:
		app: olam-mcp-auth-service
		olam.io/component: peripheral

-34

host-cp/k8s/manifests/mcp-auth-service/20-rbac.yaml

		# Phase 1a Decision 19: Role scoped to resourceNames: ["olam-mcp-auth-service"] on
		# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
		# could patch ANY Deployment in the namespace. This is the load-bearing
		# security guardrail — preserve verbatim.
		apiVersion: rbac.authorization.k8s.io/v1
		kind: Role
		metadata:
		name: olam-mcp-auth-service
		namespace: olam
		labels:
		app: olam-mcp-auth-service
		olam.io/component: peripheral
		rules:
		- apiGroups: ["apps"]
		resources: ["deployments"]
		resourceNames: ["olam-mcp-auth-service"]
		verbs: ["get", "patch", "watch"]
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: RoleBinding
		metadata:
		name: olam-mcp-auth-service
		namespace: olam
		labels:
		app: olam-mcp-auth-service
		olam.io/component: peripheral
		subjects:
		- kind: ServiceAccount
		name: olam-mcp-auth-service
		namespace: olam
		roleRef:
		kind: Role
		name: olam-mcp-auth-service
		apiGroup: rbac.authorization.k8s.io

-22

host-cp/k8s/manifests/mcp-auth-service/30-configmap.yaml

		# ConfigMap for olam-mcp-auth-service environment. Sensitive values live in
		# the Secret (see templates/mcp-auth-service-secret-template.yaml).
		# Operators apply the Secret separately before applying the manifests.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-mcp-auth-service-env
		namespace: olam
		labels:
		app: olam-mcp-auth-service
		olam.io/component: peripheral
		data:
		# Port mcp-auth-service listens on. Must match 60-service.yaml targetPort.
		OLAM_MCP_AUTH_PORT: "9998"
		# Data directory — backed by the PVC mounted at /data.
		OLAM_MCP_AUTH_DATA_PATH: "/data/mcp-auth"
		# URL of auth-service (cluster-internal DNS). Override in non-k3d environments.
		OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
		# R3-B defensive (Decision R3-#2): mcp-auth-service source already defaults to
		# 0.0.0.0 (MCP_AUTH_BIND env var) but ConfigMap override is explicit defense
		# against a future image regression reverting to 127.0.0.1.
		MCP_AUTH_BIND: "0.0.0.0"

-25

host-cp/k8s/manifests/mcp-auth-service/45-pvc.yaml

		# PersistentVolumeClaim for olam-mcp-auth-service /data volume.
		#
		# Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml
		# for the full rationale (fsGroup, k3d node filesystem, etc.).
		#
		# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
		# On non-k3d clusters, substitute storageClassName with your cluster's provisioner.
		# D24: storageClassName operator-editable — edit the field below for non-k3d substrates.
		apiVersion: v1
		kind: PersistentVolumeClaim
		metadata:
		name: olam-mcp-auth-data
		namespace: olam
		labels:
		app: olam-mcp-auth-service
		olam.io/component: peripheral
		spec:
		accessModes:
		- ReadWriteOnce
		# D24: operator-editable. k3d default is local-path. Change for non-k3d substrates.
		storageClassName: local-path
		resources:
		requests:
		# D25: mcp-auth-service PVC size 5Gi.
		storage: 5Gi

-124

host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml

		# Deployment for olam-mcp-auth-service.
		#
		# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
		# Digest resolves to ghcr.io/pleri/olam-mcp-auth:latest (multi-arch index).
		# NOTE (B1): image name is olam-mcp-auth (NOT olam-mcp-auth-service) — matches the
		# actual GHCR package name published by release.yml publish-mcp-auth job.
		# To update: resolve the new tag's digest via:
		# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-mcp-auth:pull&service=ghcr.io" | jq -r .token)
		# curl -sI -H "Authorization: Bearer $TOKEN" \
		# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
		# https://ghcr.io/v2/pleri/olam-mcp-auth/manifests/<tag> | grep docker-content-digest
		# Or use: node scripts/refresh-manifest-digests.mjs
		#
		# securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot,
		# readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs.
		#
		# D17 (LOAD-BEARING): mcp-auth-service MUST NOT mount /var/run/docker.sock.
		# Phase 2 architecture: k8s pods cannot reach docker.sock. No hostPath socket
		# mount here — mcp-auth-service authenticates MCP clients via JWT, not Docker.
		apiVersion: apps/v1
		kind: Deployment
		metadata:
		name: olam-mcp-auth-service
		namespace: olam
		labels:
		app: olam-mcp-auth-service
		olam.io/component: peripheral
		spec:
		replicas: 1
		strategy:
		type: RollingUpdate
		rollingUpdate:
		maxSurge: 1
		maxUnavailable: 0
		selector:
		matchLabels:
		app: olam-mcp-auth-service
		template:
		metadata:
		labels:
		app: olam-mcp-auth-service
		spec:
		# B9 (round 2 recovery): disable k8s automatic Service env injection.
		# See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale.
		enableServiceLinks: false
		# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
		# created by `olam upgrade` step 0.4 when GH_TOKEN is available.
		imagePullSecrets:
		- name: ghcr-pull
		serviceAccountName: olam-mcp-auth-service
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		runAsGroup: 1000
		fsGroup: 1000
		initContainers:
		- name: chown-data
		# busybox:1.36 — sha256-pinned per T4 threat model.
		image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsUser: 0
		runAsNonRoot: false
		allowPrivilegeEscalation: false
		command: ["chown", "-R", "1000:1000", "/data"]
		volumeMounts:
		- name: mcp-auth-data
		mountPath: /data
		containers:
		- name: olam-mcp-auth-service
		image: ghcr.io/pleri/olam-mcp-auth@sha256:eaac2164349e388a70dae0d86c34132f97aa74177a2376cdfa10732e8eadb507
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		readOnlyRootFilesystem: true
		allowPrivilegeEscalation: false
		capabilities:
		drop: ["ALL"]
		ports:
		- name: http
		containerPort: 9998
		protocol: TCP
		envFrom:
		- configMapRef:
		name: olam-mcp-auth-service-env
		- secretRef:
		name: olam-mcp-auth-service-secret
		volumeMounts:
		- name: mcp-auth-data
		mountPath: /data
		- name: tmp
		mountPath: /tmp
		readinessProbe:
		httpGet:
		path: /health
		port: 9998
		initialDelaySeconds: 5
		periodSeconds: 5
		timeoutSeconds: 3
		failureThreshold: 6
		livenessProbe:
		httpGet:
		path: /health
		port: 9998
		initialDelaySeconds: 30
		periodSeconds: 20
		timeoutSeconds: 5
		failureThreshold: 3
		resources:
		requests:
		cpu: "50m"
		memory: "128Mi"
		limits:
		cpu: "500m"
		memory: "512Mi"
		volumes:
		- name: mcp-auth-data
		persistentVolumeClaim:
		claimName: olam-mcp-auth-data
		- name: tmp
		emptyDir: {}
		# D17 (LOAD-BEARING): NO docker.sock volume or hostPath mount here.
		# mcp-auth-service does not need Docker access in Phase 2 k8s architecture.

-21

host-cp/k8s/manifests/mcp-auth-service/60-service.yaml

		# ClusterIP Service for olam-mcp-auth-service.
		# Port 9998 — consumed by other peripherals and host-cp via cluster-internal DNS.
		# Operator surfaces externally via:
		# kubectl port-forward -n olam svc/olam-mcp-auth-service 9998:9998
		apiVersion: v1
		kind: Service
		metadata:
		name: olam-mcp-auth-service
		namespace: olam
		labels:
		app: olam-mcp-auth-service
		olam.io/component: peripheral
		spec:
		type: ClusterIP
		selector:
		app: olam-mcp-auth-service
		ports:
		- name: http
		port: 9998
		targetPort: 9998
		protocol: TCP

-8

host-cp/k8s/manifests/memory-service/10-serviceaccount.yaml

		apiVersion: v1
		kind: ServiceAccount
		metadata:
		name: olam-memory-service
		namespace: olam
		labels:
		app: olam-memory-service
		olam.io/component: peripheral

-34

host-cp/k8s/manifests/memory-service/20-rbac.yaml

		# Phase 1a Decision 19: Role scoped to resourceNames: ["olam-memory-service"] on
		# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
		# could patch ANY Deployment in the namespace. This is the load-bearing
		# security guardrail — preserve verbatim.
		apiVersion: rbac.authorization.k8s.io/v1
		kind: Role
		metadata:
		name: olam-memory-service
		namespace: olam
		labels:
		app: olam-memory-service
		olam.io/component: peripheral
		rules:
		- apiGroups: ["apps"]
		resources: ["deployments"]
		resourceNames: ["olam-memory-service"]
		verbs: ["get", "patch", "watch"]
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: RoleBinding
		metadata:
		name: olam-memory-service
		namespace: olam
		labels:
		app: olam-memory-service
		olam.io/component: peripheral
		subjects:
		- kind: ServiceAccount
		name: olam-memory-service
		namespace: olam
		roleRef:
		kind: Role
		name: olam-memory-service
		apiGroup: rbac.authorization.k8s.io

-35

host-cp/k8s/manifests/memory-service/30-configmap.yaml

		# ConfigMap for olam-memory-service environment. Sensitive values live in
		# the Secret (see templates/memory-service-secret-template.yaml).
		# Operators apply the Secret separately before applying the manifests.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-memory-service-env
		namespace: olam
		labels:
		app: olam-memory-service
		olam.io/component: peripheral
		data:
		# Port memory-service listens on. Must match 60-service.yaml targetPort.
		OLAM_MEMORY_PORT: "3111"
		# Data directory — backed by the PVC mounted at /data.
		OLAM_MEMORY_DATA_PATH: "/data/memory"
		# URL of auth-service (cluster-internal DNS). Override in non-k3d environments.
		OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
		# Health path exposed at /agentmemory/livez (D15 — do not change).
		OLAM_MEMORY_HEALTH_PATH: "/agentmemory/livez"
		# R3-B defensive (Decision R3-#2): memory-service Dockerfile already sets
		# AGENTMEMORY_HOST=0.0.0.0 but ConfigMap override is explicit defense against
		# a future image regression reverting to 127.0.0.1.
		AGENTMEMORY_HOST: "0.0.0.0"
		# III_REST_PORT is the env var the agentmemory CLI wrapper reads when it
		# polls its iii subprocess for readiness (cli.mjs:155 — `process.env
		# ["III_REST_PORT"] || "3111"`). The iii engine itself binds the port
		# declared in iii-config.yaml's iii-http worker (overridden via the
		# olam-memory-service-iii-config ConfigMap to 3110, so it does not
		# collide with the metrics-proxy on 3111). Without this env var the
		# wrapper polls 3111 forever, prints "iii-engine did not become ready",
		# and exits — entrypoint propagates the exit, container restarts, and
		# the liveness probe returns 502 from the proxy (its backend was never
		# up). Must equal the iii-http port in 35-configmap-iii-config.yaml.
		III_REST_PORT: "3110"

-76

host-cp/k8s/manifests/memory-service/35-configmap-iii-config.yaml

		# Overrides the iii-config.yaml shipped inside the agentmemory image so the
		# iii engine binds the INTERNAL port (3110) instead of the EXTERNAL port
		# (3111). The shipped yaml hardcodes `port: 3111` and the agentmemory CLI
		# reads its bind from yaml (NOT from the AGENTMEMORY_PORT env var), so
		# entrypoint.sh's `AGENTMEMORY_PORT=3110` override has no effect.
		#
		# Without this override, the engine and the metrics-proxy both try to bind
		# 0.0.0.0:3111. The proxy starts first and wins the port; the engine fails
		# silently. Probes to /agentmemory/livez hit the proxy and get forwarded to
		# 127.0.0.1:3110, where nothing is listening — proxy returns 502, readiness
		# fails, container restarts.
		#
		# Mounted at /usr/local/lib/node_modules/@agentmemory/agentmemory/dist/iii-config.yaml
		# via subPath in 50-deployment.yaml.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-memory-service-iii-config
		namespace: olam
		labels:
		app: olam-memory-service
		olam.io/component: peripheral
		data:
		iii-config.yaml: |
		workers:
		- name: iii-http
		config:
		port: 3110
		host: 0.0.0.0
		default_timeout: 180000
		cors:
		allowed_origins: ["http://localhost:3111", "http://localhost:3113", "http://127.0.0.1:3111", "http://127.0.0.1:3113"]
		allowed_methods: [GET, POST, PUT, DELETE, OPTIONS]
		- name: iii-state
		config:
		adapter:
		name: kv
		config:
		store_method: file_based
		file_path: ./data/state_store.db
		- name: iii-queue
		config:
		adapter:
		name: builtin
		- name: iii-pubsub
		config:
		adapter:
		name: local
		- name: iii-cron
		config:
		adapter:
		name: kv
		- name: iii-stream
		config:
		port: 3112
		host: 0.0.0.0
		adapter:
		name: kv
		config:
		store_method: file_based
		file_path: ./data/stream_store
		- name: iii-observability
		config:
		enabled: true
		service_name: agentmemory
		exporter: memory
		sampling_ratio: 1.0
		metrics_enabled: true
		logs_enabled: true
		logs_console_output: true
		- name: iii-exec
		config:
		watch:
		- src/**/*.ts
		exec:
		- node dist/index.mjs

-25

host-cp/k8s/manifests/memory-service/45-pvc.yaml

		# PersistentVolumeClaim for olam-memory-service /data volume.
		#
		# Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml
		# for the full rationale (fsGroup, k3d node filesystem, etc.).
		#
		# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
		# On non-k3d clusters, substitute storageClassName with your cluster's provisioner.
		# D24: storageClassName operator-editable — edit the field below for non-k3d substrates.
		apiVersion: v1
		kind: PersistentVolumeClaim
		metadata:
		name: olam-memory-data
		namespace: olam
		labels:
		app: olam-memory-service
		olam.io/component: peripheral
		spec:
		accessModes:
		- ReadWriteOnce
		# D24: operator-editable. k3d default is local-path. Change for non-k3d substrates.
		storageClassName: local-path
		resources:
		requests:
		# D25: memory-service PVC size 5Gi.
		storage: 5Gi

-138

host-cp/k8s/manifests/memory-service/50-deployment.yaml

		# Deployment for olam-memory-service.
		#
		# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
		# Digest resolves to ghcr.io/pleri/olam-memory-service:0.1.0 (multi-arch index).
		# To update: resolve the new tag's digest via:
		# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-memory-service:pull&service=ghcr.io" | jq -r .token)
		# curl -sI -H "Authorization: Bearer $TOKEN" \
		# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
		# https://ghcr.io/v2/pleri/olam-memory-service/manifests/<tag> | grep docker-content-digest
		#
		# securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot,
		# readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs.
		#
		# D15 (LOAD-BEARING): readinessProbe and livenessProbe path MUST be
		# /agentmemory/livez (not /health). Source: DEFAULT_HEALTH_PATH in
		# packages/core/src/services-status/memory-probe.ts:18.
		apiVersion: apps/v1
		kind: Deployment
		metadata:
		name: olam-memory-service
		namespace: olam
		labels:
		app: olam-memory-service
		olam.io/component: peripheral
		spec:
		replicas: 1
		strategy:
		type: RollingUpdate
		rollingUpdate:
		maxSurge: 1
		maxUnavailable: 0
		selector:
		matchLabels:
		app: olam-memory-service
		template:
		metadata:
		labels:
		app: olam-memory-service
		spec:
		# B9 (round 2 recovery): disable k8s automatic Service env injection.
		# See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale.
		enableServiceLinks: false
		# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
		# created by `olam upgrade` step 0.4 when GH_TOKEN is available.
		imagePullSecrets:
		- name: ghcr-pull
		serviceAccountName: olam-memory-service
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		runAsGroup: 1000
		fsGroup: 1000
		initContainers:
		- name: chown-data
		# busybox:1.36 — sha256-pinned per T4 threat model.
		image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsUser: 0
		runAsNonRoot: false
		allowPrivilegeEscalation: false
		command: ["chown", "-R", "1000:1000", "/data"]
		volumeMounts:
		- name: memory-data
		mountPath: /data
		containers:
		- name: olam-memory-service
		# image first appears on GHCR after Phase B's publish-memory-service
		# job fires on the first release post-merge. Remove the
		# bootstrap-placeholder comment + run `npm run refresh:manifest-digests`
		# once ghcr.io/pleri/olam-memory-service has a real published digest.
		# bootstrap-placeholder: pre-publish; refresh after first release
		image: ghcr.io/pleri/olam-memory-service@sha256:923bff54d2ba3da162a35d3e8ebc6bd440bed6d290a5cff7bae2888281a4e003
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		readOnlyRootFilesystem: true
		allowPrivilegeEscalation: false
		capabilities:
		drop: ["ALL"]
		ports:
		- name: http
		containerPort: 3111
		protocol: TCP
		envFrom:
		- configMapRef:
		name: olam-memory-service-env
		- secretRef:
		name: olam-memory-service-secret
		volumeMounts:
		- name: memory-data
		mountPath: /data
		- name: tmp
		mountPath: /tmp
		# Overrides the shipped iii-config.yaml so the engine binds the
		# internal port (3110) instead of colliding with the metrics-proxy
		# on 3111. See 35-configmap-iii-config.yaml for full rationale.
		- name: iii-config-override
		mountPath: /usr/local/lib/node_modules/@agentmemory/agentmemory/dist/iii-config.yaml
		subPath: iii-config.yaml
		readOnly: true
		readinessProbe:
		httpGet:
		# D15 (LOAD-BEARING): memory-service health path is /agentmemory/livez.
		# Source: DEFAULT_HEALTH_PATH in packages/core/src/services-status/memory-probe.ts:18.
		# Do NOT change to /health — that endpoint does not exist on this service.
		path: /agentmemory/livez
		port: 3111
		initialDelaySeconds: 5
		periodSeconds: 5
		timeoutSeconds: 3
		failureThreshold: 6
		livenessProbe:
		httpGet:
		# D15 (LOAD-BEARING): same path as readinessProbe.
		path: /agentmemory/livez
		port: 3111
		initialDelaySeconds: 30
		periodSeconds: 20
		timeoutSeconds: 5
		failureThreshold: 3
		resources:
		requests:
		cpu: "50m"
		memory: "256Mi"
		limits:
		cpu: "500m"
		memory: "1Gi"
		volumes:
		- name: memory-data
		persistentVolumeClaim:
		claimName: olam-memory-data
		- name: tmp
		emptyDir: {}
		- name: iii-config-override
		configMap:
		name: olam-memory-service-iii-config

-21

host-cp/k8s/manifests/memory-service/60-service.yaml

		# ClusterIP Service for olam-memory-service.
		# Port 3111 — consumed by host-cp and agents via cluster-internal DNS.
		# Operator surfaces externally via:
		# kubectl port-forward -n olam svc/olam-memory-service 3111:3111
		apiVersion: v1
		kind: Service
		metadata:
		name: olam-memory-service
		namespace: olam
		labels:
		app: olam-memory-service
		olam.io/component: peripheral
		spec:
		type: ClusterIP
		selector:
		app: olam-memory-service
		ports:
		- name: http
		port: 3111
		targetPort: 3111
		protocol: TCP

-8

host-cp/k8s/manife...lan-chat-service/10-serviceaccount.yaml

		apiVersion: v1
		kind: ServiceAccount
		metadata:
		name: olam-plan-chat-service
		namespace: olam
		labels:
		app: olam-plan-chat-service
		olam.io/component: peripheral

-29

host-cp/k8s/manifests/plan-chat-service/20-rbac.yaml

		# plan-chat-service does not need to read or write any Kubernetes API objects.
		# A no-op Role + RoleBinding documents the minimal-privilege stance and
		# keeps the file present so audit:cli-bundle-k8s does not skip this peripheral.
		apiVersion: rbac.authorization.k8s.io/v1
		kind: Role
		metadata:
		name: olam-plan-chat-service
		namespace: olam
		labels:
		app: olam-plan-chat-service
		olam.io/component: peripheral
		rules: []
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: RoleBinding
		metadata:
		name: olam-plan-chat-service
		namespace: olam
		labels:
		app: olam-plan-chat-service
		olam.io/component: peripheral
		roleRef:
		apiGroup: rbac.authorization.k8s.io
		kind: Role
		name: olam-plan-chat-service
		subjects:
		- kind: ServiceAccount
		name: olam-plan-chat-service
		namespace: olam

-36

host-cp/k8s/manifests/plan-chat-service/30-configmap.yaml

		# ConfigMap for olam-plan-chat-service.
		#
		# plan-chat-service.mjs (packages/host-cp/src/plan-chat-service.mjs) reads
		# these env vars at startup. See the file header for the canonical names.
		#
		# DATABASE_URL: deliberately NOT set in this ConfigMap. It points at the
		# in-cluster chunks-postgres StatefulSet's Service and embeds the
		# password from olam-chunks-postgres-secret using the env-var
		# substitution syntax `$(POSTGRES_PASSWORD)`.
		#
		# Kubelet only expands `$(VAR)` references declared in the
		# container's env: list, not values inside a ConfigMap key. So
		# OLAM_PLAN_CHAT_DATABASE_URL is assembled in the Deployment's
		# env: section instead, where it CAN reference the Secret-backed
		# POSTGRES_PASSWORD (injected via valueFrom.secretKeyRef).
		# See 50-deployment.yaml.
		#
		# ELECTRIC_URL: chunks-electric ClusterIP. No auth (ELECTRIC_INSECURE=true on
		# that service in local-dev mode).
		#
		# SECRET_PATH: filesystem path where the olam-plan-chat-secret Secret is
		# mounted (see volumeMounts in 50-deployment.yaml). The mount
		# key is "secret" → file `/etc/olam-plan-chat/secret`.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-plan-chat-service-env
		namespace: olam
		labels:
		app: olam-plan-chat-service
		olam.io/component: peripheral
		data:
		OLAM_PLAN_CHAT_PORT: "3200"
		OLAM_PLAN_CHAT_ELECTRIC_URL: "http://olam-chunks-electric.olam.svc.cluster.local:3000"
		OLAM_PLAN_CHAT_SECRET_PATH: "/etc/olam-plan-chat/secret"

-24

host-cp/k8s/manifests/plan-chat-service/45-pvc.yaml

		# PersistentVolumeClaim for olam-plan-chat-service /data volume.
		#
		# plan-chat-service is mostly stateless (DB lives in chunks-postgres, secret
		# lives in olam-plan-chat-secret), but ships a /data PVC for parity with
		# the other peripherals. Used for any transient state the service decides
		# to spool (e.g. planning-session resumption buffers).
		#
		# local-path StorageClass ships with k3d by default. On non-k3d clusters,
		# substitute storageClassName with your cluster's provisioner.
		apiVersion: v1
		kind: PersistentVolumeClaim
		metadata:
		name: olam-plan-chat-service-data
		namespace: olam
		labels:
		app: olam-plan-chat-service
		olam.io/component: peripheral
		spec:
		accessModes:
		- ReadWriteOnce
		storageClassName: local-path
		resources:
		requests:
		storage: 1Gi

-135

host-cp/k8s/manifests/plan-chat-service/50-deployment.yaml

		# Deployment for olam-plan-chat-service.
		#
		# Image strategy: REUSES the olam-host-cp image. Per the package layout,
		# plan-chat-service.mjs is a sibling under packages/host-cp/src/, and the
		# host-cp image's WORKDIR=/app already contains it at /app/src/plan-chat-service.mjs.
		# The single shared image avoids version-drift between the two binaries that
		# share plan-chat-secret.mjs (bearer-auth logic), planning-sessions.mjs,
		# crystallize-planning.mjs, and resolver.mjs.
		#
		# The command override replaces the host-cp default
		# ENTRYPOINT (`node src/server.mjs`) with the plan-chat-service entrypoint.
		#
		# Image: pinned to the SAME digest as host-cp's 50-deployment.yaml. Refresh
		# both in lockstep via scripts/refresh-manifest-digests.mjs on every release.
		apiVersion: apps/v1
		kind: Deployment
		metadata:
		name: olam-plan-chat-service
		namespace: olam
		labels:
		app: olam-plan-chat-service
		olam.io/component: peripheral
		spec:
		replicas: 1
		strategy:
		type: RollingUpdate
		rollingUpdate:
		maxSurge: 1
		maxUnavailable: 0
		selector:
		matchLabels:
		app: olam-plan-chat-service
		template:
		metadata:
		labels:
		app: olam-plan-chat-service
		spec:
		enableServiceLinks: false
		imagePullSecrets:
		- name: ghcr-pull
		serviceAccountName: olam-plan-chat-service
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		runAsGroup: 1000
		fsGroup: 1000
		initContainers:
		# chown-data: identical to memory-service pattern. Postgres-RWO PVC
		# mounts as root-owned on local-path; this brings it to 1000:1000.
		- name: chown-data
		image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsUser: 0
		runAsNonRoot: false
		allowPrivilegeEscalation: false
		command: ["chown", "-R", "1000:1000", "/data"]
		volumeMounts:
		- name: plan-chat-data
		mountPath: /data
		containers:
		- name: olam-plan-chat-service
		# Reuses the host-cp image (same source tree, same node_modules).
		# Digest pinned in lockstep with packages/host-cp/k8s/manifests/50-deployment.yaml.
		image: ghcr.io/pleri/olam-host-cp@sha256:20d84b6d490c633bc5a158b0f7f849152aba3cf1d2d45657360f627d8d41ec3f
		imagePullPolicy: IfNotPresent
		# Override the host-cp ENTRYPOINT. plan-chat-service.mjs exports
		# startService(); we boot it via -e import-and-call.
		command: ["node"]
		args:
		- "-e"
		- "import('/app/src/plan-chat-service.mjs').then(m => m.startService()).catch(e => { console.error('[plan-chat-service]', e); process.exit(1); });"
		workingDir: /app
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		allowPrivilegeEscalation: false
		capabilities:
		drop: ["ALL"]
		ports:
		- name: http
		containerPort: 3200
		protocol: TCP
		envFrom:
		- configMapRef:
		name: olam-plan-chat-service-env
		env:
		# DATABASE_URL composition. Same pattern as chunks-electric.
		- name: POSTGRES_PASSWORD
		valueFrom:
		secretKeyRef:
		name: olam-chunks-postgres-secret
		key: POSTGRES_PASSWORD
		- name: OLAM_PLAN_CHAT_DATABASE_URL
		value: "postgres://postgres:$(POSTGRES_PASSWORD)@olam-chunks-postgres.olam.svc.cluster.local:5432/chunks"
		volumeMounts:
		- name: plan-chat-data
		mountPath: /data
		- name: plan-chat-secret
		mountPath: /etc/olam-plan-chat
		readOnly: true
		readinessProbe:
		httpGet:
		path: /livez
		port: 3200
		initialDelaySeconds: 10
		periodSeconds: 5
		timeoutSeconds: 3
		failureThreshold: 12
		livenessProbe:
		httpGet:
		path: /livez
		port: 3200
		initialDelaySeconds: 60
		periodSeconds: 20
		timeoutSeconds: 5
		failureThreshold: 3
		resources:
		requests:
		cpu: "50m"
		memory: "256Mi"
		limits:
		cpu: "500m"
		memory: "1Gi"
		volumes:
		- name: plan-chat-data
		persistentVolumeClaim:
		claimName: olam-plan-chat-service-data
		- name: plan-chat-secret
		secret:
		secretName: olam-plan-chat-secret
		defaultMode: 0400
		items:
		- key: PLAN_CHAT_SECRET
		path: secret

-17

host-cp/k8s/manifests/plan-chat-service/60-service.yaml

		apiVersion: v1
		kind: Service
		metadata:
		name: olam-plan-chat-service
		namespace: olam
		labels:
		app: olam-plan-chat-service
		olam.io/component: peripheral
		spec:
		type: ClusterIP
		selector:
		app: olam-plan-chat-service
		ports:
		- name: http
		port: 3200
		targetPort: 3200
		protocol: TCP

-32

host-cp/k8s/templates/40-secret-template.yaml

		# Secret TEMPLATE for olam-host-cp.
		#
		# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
		# the placeholder values. The placeholders are intentionally invalid; a raw
		# `kubectl apply` will result in auth-service 401s rather than silently shipping
		# fake credentials.
		#
		# Preferred substitution (keeps secrets out of git):
		# kubectl create secret generic olam-host-cp-secret -n olam \
		# --from-literal=OLAM_AUTH_SECRET=$(cat ~/.olam/auth-secret) \
		# --from-literal=GH_TOKEN=$(gh auth token) \
		# --dry-run=client -o yaml | kubectl apply -f -
		#
		# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
		# so that `kubectl apply -f manifests/` does NOT apply it — operators must
		# explicitly handle Secret provisioning before applying the manifests.
		apiVersion: v1
		kind: Secret
		metadata:
		name: olam-host-cp-secret
		namespace: olam
		labels:
		app: olam-host-cp
		olam.io/component: host-stack
		type: Opaque
		stringData:
		# Shared bearer secret between host-cp and the long-lived olam-auth process.
		# Source: cat ~/.olam/auth-secret
		OLAM_AUTH_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_AUTH_SECRET"
		# GitHub token for GHCR image pulls and the /api/prs endpoint.
		# Source: gh auth token
		GH_TOKEN: "REPLACE_ME_FROM_GH_AUTH_TOKEN"

-28

host-cp/k8s/templates/auth-service-secret-template.yaml

		# Secret TEMPLATE for olam-auth-service.
		#
		# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
		# the placeholder values. The placeholders are intentionally invalid; a raw
		# `kubectl apply` will result in auth failures rather than silently shipping
		# fake credentials.
		#
		# Preferred substitution (keeps secrets out of git):
		# kubectl create secret generic olam-auth-service-secret -n olam \
		# --from-literal=OLAM_AUTH_DB_SECRET=$(cat ~/.olam/auth-db-secret) \
		# --dry-run=client -o yaml | kubectl apply -f -
		#
		# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
		# so that `kubectl apply -f manifests/auth-service/` does NOT apply it —
		# operators must explicitly handle Secret provisioning before applying manifests.
		apiVersion: v1
		kind: Secret
		metadata:
		name: olam-auth-service-secret
		namespace: olam
		labels:
		app: olam-auth-service
		olam.io/component: peripheral
		type: Opaque
		stringData:
		# Shared database encryption secret for the credential vault.
		# Source: cat ~/.olam/auth-db-secret
		OLAM_AUTH_DB_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_AUTH_DB_SECRET"

-24

host-cp/k8s/templates/chunks-postgres-secret-template.yaml

		# Secret TEMPLATE for olam-chunks-postgres.
		#
		# Generates a random 64-char hex POSTGRES_PASSWORD on first apply (via
		# k8s-secret-render.ts generate-if-missing). The Secret is consumed by:
		# - chunks-postgres StatefulSet (envFrom → POSTGRES_PASSWORD)
		# - chunks-electric Deployment (env: valueFrom.secretKeyRef)
		# - plan-chat-service Deployment (env: valueFrom.secretKeyRef)
		#
		# All three resolve the SAME random value because the secret-renderer
		# persists generated values in ~/.olam/k8s-secrets-state.json so reapply
		# is idempotent (no rotation unless --rotate-secrets).
		apiVersion: v1
		kind: Secret
		metadata:
		name: olam-chunks-postgres-secret
		namespace: olam
		labels:
		app: olam-chunks-postgres
		olam.io/component: substrate
		type: Opaque
		stringData:
		# Postgres superuser password. Generated by the CLI's secret-renderer on
		# first apply (no host-side file to read; this is in-cluster-only state).
		POSTGRES_PASSWORD: "REPLACE_ME_GENERATE_RANDOM_HEX"

-28

host-cp/k8s/templates/kg-service-secret-template.yaml

		# Secret TEMPLATE for olam-kg-service.
		#
		# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
		# the placeholder values. The placeholders are intentionally invalid; a raw
		# `kubectl apply` will result in auth failures rather than silently shipping
		# fake credentials.
		#
		# Preferred substitution (keeps secrets out of git):
		# kubectl create secret generic olam-kg-service-secret -n olam \
		# --from-literal=OLAM_KG_BEARER_TOKEN=$(cat ~/.olam/kg-bearer-token) \
		# --dry-run=client -o yaml | kubectl apply -f -
		#
		# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
		# so that `kubectl apply -f manifests/kg-service/` does NOT apply it —
		# operators must explicitly handle Secret provisioning before applying manifests.
		apiVersion: v1
		kind: Secret
		metadata:
		name: olam-kg-service-secret
		namespace: olam
		labels:
		app: olam-kg-service
		olam.io/component: peripheral
		type: Opaque
		stringData:
		# Bearer token for in-cluster KG query authentication.
		# Source: cat ~/.olam/kg-bearer-token
		OLAM_KG_BEARER_TOKEN: "REPLACE_ME_FROM_HOME_DOTOLAM_KG_BEARER_TOKEN"

-28

host-cp/k8s/templates/mcp-auth-service-secret-template.yaml

		# Secret TEMPLATE for olam-mcp-auth-service.
		#
		# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
		# the placeholder values. The placeholders are intentionally invalid; a raw
		# `kubectl apply` will result in auth failures rather than silently shipping
		# fake credentials.
		#
		# Preferred substitution (keeps secrets out of git):
		# kubectl create secret generic olam-mcp-auth-service-secret -n olam \
		# --from-literal=OLAM_MCP_AUTH_JWT_SECRET=$(cat ~/.olam/mcp-auth-jwt-secret) \
		# --dry-run=client -o yaml | kubectl apply -f -
		#
		# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
		# so that `kubectl apply -f manifests/mcp-auth-service/` does NOT apply it —
		# operators must explicitly handle Secret provisioning before applying manifests.
		apiVersion: v1
		kind: Secret
		metadata:
		name: olam-mcp-auth-service-secret
		namespace: olam
		labels:
		app: olam-mcp-auth-service
		olam.io/component: peripheral
		type: Opaque
		stringData:
		# JWT signing secret for MCP client authentication.
		# Source: cat ~/.olam/mcp-auth-jwt-secret
		OLAM_MCP_AUTH_JWT_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_MCP_AUTH_JWT_SECRET"

-29

host-cp/k8s/templates/memory-service-secret-template.yaml

		# Secret TEMPLATE for olam-memory-service.
		#
		# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
		# the placeholder values. The placeholders are intentionally invalid; a raw
		# `kubectl apply` will result in auth failures rather than silently shipping
		# fake credentials.
		#
		# Preferred substitution (keeps secrets out of git):
		# kubectl create secret generic olam-memory-service-secret -n olam \
		# --from-literal=OLAM_MEMORY_BEARER_SECRET=$(cat ~/.olam/memory-bearer-secret) \
		# --dry-run=client -o yaml | kubectl apply -f -
		#
		# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
		# so that `kubectl apply -f manifests/memory-service/` does NOT apply it —
		# operators must explicitly handle Secret provisioning before applying manifests.
		apiVersion: v1
		kind: Secret
		metadata:
		name: olam-memory-service-secret
		namespace: olam
		labels:
		app: olam-memory-service
		olam.io/component: peripheral
		type: Opaque
		stringData:
		# Bearer secret for the memory-service HTTP API (matches OLAM_MEMORY_BEARER_SECRET
		# used by host-cp and agents that call the memory endpoints).
		# Source: cat ~/.olam/memory-bearer-secret
		OLAM_MEMORY_BEARER_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_MEMORY_BEARER_SECRET"

-35

host-cp/k8s/templates/plan-chat-service-secret-template.yaml

		# Secret TEMPLATE for olam-plan-chat-secret.
		#
		# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
		# the placeholder values. The placeholders are intentionally invalid; a raw
		# `kubectl apply` will result in auth failures rather than silently shipping
		# fake credentials.
		#
		# Preferred substitution (keeps secrets out of git):
		# kubectl create secret generic olam-plan-chat-secret -n olam \
		# --from-literal=PLAN_CHAT_SECRET=$(cat ~/.olam/plan-chat-secret) \
		# --dry-run=client -o yaml | kubectl apply -f -
		#
		# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
		# so that `kubectl apply -f manifests/plan-chat-service/` does NOT apply it —
		# operators must explicitly handle Secret provisioning before applying manifests.
		#
		# Architecture: this Secret is mounted by BOTH the host-cp pod (so its
		# renderSpaShell can inject window.__OLAM_PLAN_CHAT_BEARER__) AND the
		# plan-chat-service pod (so its bearer-auth gate timing-safe-compares incoming
		# Authorization: Bearer headers against the same value). One source-of-truth,
		# two readers — replaces the previous "/data/plan-chat-secret in host-cp PVC"
		# pattern that couldn't be shared across pods (RWO PVC).
		apiVersion: v1
		kind: Secret
		metadata:
		name: olam-plan-chat-secret
		namespace: olam
		labels:
		olam.io/component: substrate
		type: Opaque
		stringData:
		# Shared bearer secret for plan-chat-service's POST /v1/chunks and
		# GET /v1/shape endpoints. host-cp injects this into window.__OLAM_PLAN_CHAT_BEARER__.
		# Source: cat ~/.olam/plan-chat-secret
		PLAN_CHAT_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_PLAN_CHAT_SECRET"

-110

host-cp/lifecycle/classify.mjs

		// classifyStartupFailure — pure mapping from evidence shape to bucket.
		//
		// Precedence rules (walked top-down; first match wins):
		//
		// 1. processExitCode !== undefined → ProviderProcessGone
		// The agent process is dead; nothing else matters. This is the
		// highest-confidence signal because it's observable from outside
		// the container (docker exit code, child_process exit).
		//
		// 2. pluginErrors.length > 0 → PluginStartupFailed
		// Boot-time stderr from a plugin/skill source is definitive.
		// Comes before transport/handshake checks because a failed
		// plugin can leave transport+mcp in 'pending' permanently.
		//
		// 3. transportStatus === 'failed' → TransportDead
		// Channel-open never succeeded — agent is alive but unreachable.
		//
		// 4. mcpHandshakeStatus === 'failed' → McpHandshakeStall
		// Channel opened, MCP handshake explicitly failed.
		//
		// 5. mcpHandshakeStatus === 'pending'
		// AND elapsedSecondsSinceCreation > 30 → McpHandshakeStall
		// Time-bounded inference: a never-completed handshake after 30s
		// is the stall signal even without an explicit failure marker.
		//
		// 6. lastPhase === 'TrustRequired'
		// AND elapsedSecondsSinceCreation > 10 → TrustGateUnanswered
		// Agent reached the trust gate; no approval ever came back.
		// 10s is the operator's attention budget — past that, the
		// agent is silently stuck on a human gate.
		//
		// 7. promptSentAt !== undefined
		// AND firstThoughtAt === undefined → PromptMisdelivery
		// Dispatch landed on the host side but the agent never produced
		// a first thought — the prompt didn't reach the agent process.
		//
		// 8. lastPhase === 'TrustRequired' → TrustGateUnanswered (fallback)
		// Stuck at the trust gate even under 10s — still the most likely
		// explanation for a Failed transition from that phase.
		//
		// 9. fallthrough → PromptMisdelivery
		// The classifier is total: every Failed transition gets a bucket.
		// PromptMisdelivery is the most operator-actionable "we don't
		// know why but the dispatch path is the prime suspect" default.
		//
		// Tests in __tests__/classify.test.mjs assert exactly one case per
		// bucket. The function is pure: no I/O, no side effects, deterministic
		// — same evidence in always yields the same bucket out.

		import { WorldStartupFailureKind } from './failure-kinds.mjs';

		// Seconds a 'pending' MCP handshake may last before it is bucketed as a stall.
		const MCP_HANDSHAKE_STALL_THRESHOLD_SECONDS = 30;
		// Seconds a world may sit in 'TrustRequired' before the time-bounded rule fires.
		const TRUST_GATE_UNANSWERED_THRESHOLD_SECONDS = 10;

		/**
		 * Map a WorldStartupEvidence bundle to its WorldStartupFailureKind.
		 *
		 * The checks run top-down and the first match wins, so their order is the
		 * precedence order. The function always returns a bucket for any evidence
		 * object: the final statement is an unconditional fallback.
		 *
		 * A bundle without `pluginErrors` is treated as "no plugin errors" instead
		 * of throwing, so a partially-populated bundle still gets classified.
		 *
		 * @param {import('./evidence.mjs').WorldStartupEvidence} evidence
		 * @returns {import('./failure-kinds.mjs').WorldStartupFailureKind}
		 */
		export function classifyStartupFailure(evidence) {
		  // 1. Process exited — terminal signal, short-circuits all other checks.
		  if (evidence.processExitCode !== undefined) {
		    return WorldStartupFailureKind.ProviderProcessGone;
		  }

		  // 2. Plugin boot errors — definitive boot-time failure.
		  //    `?.` yields undefined when pluginErrors is absent, and
		  //    `undefined > 0` is false, so the rule is simply skipped.
		  if (evidence.pluginErrors?.length > 0) {
		    return WorldStartupFailureKind.PluginStartupFailed;
		  }

		  // 3. Transport explicitly failed — agent alive but unreachable.
		  if (evidence.transportStatus === 'failed') {
		    return WorldStartupFailureKind.TransportDead;
		  }

		  // 4. MCP handshake explicitly failed.
		  if (evidence.mcpHandshakeStatus === 'failed') {
		    return WorldStartupFailureKind.McpHandshakeStall;
		  }

		  // 5. MCP handshake pending past threshold — inferred stall.
		  //    Strictly greater-than: exactly 30s does not count as a stall.
		  if (
		    evidence.mcpHandshakeStatus === 'pending' &&
		    evidence.elapsedSecondsSinceCreation > MCP_HANDSHAKE_STALL_THRESHOLD_SECONDS
		  ) {
		    return WorldStartupFailureKind.McpHandshakeStall;
		  }

		  // 6. Stuck on trust gate past operator-attention threshold.
		  if (
		    evidence.lastPhase === 'TrustRequired' &&
		    evidence.elapsedSecondsSinceCreation > TRUST_GATE_UNANSWERED_THRESHOLD_SECONDS
		  ) {
		    return WorldStartupFailureKind.TrustGateUnanswered;
		  }

		  // 7. Prompt sent but agent never produced a first thought.
		  //    Sits between rules 6 and 8 on purpose: under the trust-gate
		  //    threshold, an undelivered prompt outranks the trust-gate fallback.
		  if (evidence.promptSentAt !== undefined && evidence.firstThoughtAt === undefined) {
		    return WorldStartupFailureKind.PromptMisdelivery;
		  }

		  // 8. Still at trust gate under threshold — bucket as trust-gate.
		  if (evidence.lastPhase === 'TrustRequired') {
		    return WorldStartupFailureKind.TrustGateUnanswered;
		  }

		  // 9. Total-function fallback.
		  return WorldStartupFailureKind.PromptMisdelivery;
		}

-119

host-cp/lifecycle/emit.mjs

		// recordWorldLifecycle — the single broadcast helper every host-cp
		// surface uses to emit a lifecycle transition.
		//
		// Emits TWO event types on the host-stream:
		//
		// 1. event: 'world.lifecycle' → live SSE consumers (SPA, MCP, etc.).
		// Shape: { worldId, phase, at, evidence?, failureKind? }
		//
		// 2. event: 'span' → NDJSON trace sink (PR #915 + follow-ups).
		// Shape: { name: 'world.lifecycle', startedAt: at, endedAt: at,
		// attributes: { worldId, phase, evidence?, failureKind? },
		// exit: { _tag: 'Success' | 'Failure', reason? } }
		//
		// The dual-emit keeps live consumers and trace consumers on the same
		// substrate without either path coupling to the other. The README jq
		// example `select(.name == "world.lifecycle" ...)` matches the span
		// emission; the SPA's `useHostStream().subscribe('world.lifecycle', ...)`
		// matches the live emission.
		//
		// Failed transitions auto-classify via classifyStartupFailure(evidence)
		// when caller passes evidence but omits an explicit failureKind. Callers
		// MAY provide their own failureKind to override the inference (e.g.
		// docker SIGKILL — the caller knows it was ProviderProcessGone before
		// the classifier could trip its time-thresholds).

		import { TERMINAL_PHASES, WorldLifecyclePhase } from './phases.mjs';
		import { classifyStartupFailure } from './classify.mjs';
		import { redactSensitive } from '../observability/redactor.mjs';

		/**
		* @typedef {object} HostStreamLike
		* @property {(eventType: string, payload: unknown) => unknown} broadcast
		*/

		/**
		* @typedef {object} WorldLifecycleEvent
		* @property {string} worldId
		* @property {import('./phases.mjs').WorldLifecyclePhase} phase
		* @property {number} at
		* @property {import('./evidence.mjs').WorldStartupEvidence} [evidence]
		* @property {import('./failure-kinds.mjs').WorldStartupFailureKind} [failureKind]
		*/

		/**
		 * Emit a world lifecycle transition on both the `world.lifecycle` and `span`
		 * host-stream channels.
		 *
		 * The live payload is broadcast first, then a point-in-time span
		 * (`startedAt === endedAt`) carrying the same data. Evidence is passed
		 * through `redactSensitive` before it is attached to either payload.
		 *
		 * When `phase` is Failed, `evidence` is supplied and no `failureKind` is
		 * given, the kind is inferred with `classifyStartupFailure`. An explicit
		 * `failureKind` always wins over the inference.
		 *
		 * @param {HostStreamLike} hostStream
		 * @param {object} args
		 * @param {string} args.worldId non-empty world identifier
		 * @param {import('./phases.mjs').WorldLifecyclePhase} args.phase
		 * @param {number} [args.at] epoch timestamp; defaults to `Date.now()` when
		 *   omitted or not a finite number
		 * @param {import('./evidence.mjs').WorldStartupEvidence} [args.evidence]
		 * @param {import('./failure-kinds.mjs').WorldStartupFailureKind} [args.failureKind]
		 * @returns {WorldLifecycleEvent} the payload that was broadcast (test convenience)
		 * @throws {TypeError} when `hostStream.broadcast` is not a function, when
		 *   `worldId` is missing or empty, or when `phase` is not a string
		 */
		export function recordWorldLifecycle(hostStream, args) {
		  if (!hostStream || typeof hostStream.broadcast !== 'function') {
		    throw new TypeError('recordWorldLifecycle: hostStream.broadcast is required');
		  }
		  if (typeof args?.worldId !== 'string' || args.worldId.length === 0) {
		    throw new TypeError('recordWorldLifecycle: worldId is required');
		  }
		  if (typeof args?.phase !== 'string') {
		    throw new TypeError('recordWorldLifecycle: phase is required');
		  }

		  // Number.isFinite rejects non-numbers as well as NaN/±Infinity, which
		  // JSON.stringify would turn into `null` timestamps downstream.
		  const at = Number.isFinite(args.at) ? args.at : Date.now();

		  // Resolve failureKind: explicit override > classifier inference > undefined.
		  let failureKind = args.failureKind;
		  if (
		    failureKind === undefined &&
		    args.phase === WorldLifecyclePhase.Failed &&
		    args.evidence !== undefined
		  ) {
		    failureKind = classifyStartupFailure(args.evidence);
		  }

		  /** @type {WorldLifecycleEvent} */
		  const livePayload = {
		    worldId: args.worldId,
		    phase: args.phase,
		    at,
		  };
		  // Optional keys are only attached when present, so consumers can test
		  // for their absence.
		  if (args.evidence !== undefined) livePayload.evidence = redactSensitive(args.evidence);
		  if (failureKind !== undefined) livePayload.failureKind = failureKind;

		  hostStream.broadcast('world.lifecycle', livePayload);

		  // Mirror as a span so the NDJSON trace sink (PR #915) records it.
		  // Lifecycle transitions are point-in-time events — startedAt === endedAt.
		  /** @type {Record<string, unknown>} */
		  const spanAttributes = {
		    worldId: args.worldId,
		    phase: args.phase,
		  };
		  // Redacted again rather than reusing livePayload.evidence, so the two
		  // broadcasts never receive the same evidence reference from this function.
		  if (args.evidence !== undefined) spanAttributes.evidence = redactSensitive(args.evidence);
		  if (failureKind !== undefined) spanAttributes.failureKind = failureKind;

		  /** @type {{ _tag: 'Success' | 'Failure', reason?: string }} */
		  const exit =
		    args.phase === WorldLifecyclePhase.Failed
		      ? { _tag: 'Failure', reason: failureKind ?? 'unclassified' }
		      : { _tag: 'Success' };

		  hostStream.broadcast('span', {
		    name: 'world.lifecycle',
		    startedAt: at,
		    endedAt: at,
		    attributes: spanAttributes,
		    exit,
		  });

		  return livePayload;
		}

		/** Re-export so callers don't need to import both modules. */
		export { WorldLifecyclePhase, TERMINAL_PHASES };

-45

host-cp/lifecycle/evidence.mjs

		// WorldStartupEvidence — the typed bundle the classifier consumes.
		//
		// Every Failed lifecycle transition carries one of these. Fields are
		// strict-optional (undefined, not null) so consumers can use the
		// presence/absence as a signal directly (`promptSentAt === undefined`
		// is itself the PromptMisdelivery signal).

		/**
		* @typedef {'pending' | 'ok' | 'failed'} HandshakeStatus
		*/

		/**
		* @typedef {object} WorldStartupEvidence
		* @property {string} worldId
		* @property {import('./phases.mjs').WorldLifecyclePhase} lastPhase
		* @property {number} lastPhaseAt epoch ms
		* @property {number} [promptSentAt] undefined if no dispatch ever sent
		* @property {number} [firstThoughtAt] undefined if no thoughts ever produced
		* @property {HandshakeStatus} mcpHandshakeStatus
		* @property {HandshakeStatus} transportStatus
		* @property {string[]} pluginErrors captured stderr lines from plugin boot
		* @property {number} [processExitCode]
		* @property {number} elapsedSecondsSinceCreation
		*/

		/**
		 * Build the starting evidence record for a world that has only just been
		 * spawned. The optional fields (`promptSentAt`, `firstThoughtAt`,
		 * `processExitCode`) are left off the object entirely rather than set to
		 * null, so their absence can itself be read as a signal. The caller owns
		 * the returned object and is expected to update it as transitions occur.
		 *
		 * @param {string} worldId identifier of the world the evidence describes
		 * @param {number} [now] epoch ms recorded as `lastPhaseAt`; defaults to `Date.now()`
		 * @returns {WorldStartupEvidence} a new bundle with its own `pluginErrors` array
		 */
		export function emptyEvidence(worldId, now = Date.now()) {
			/** @type {WorldStartupEvidence} */
			const evidence = {
				worldId,
				lastPhase: 'Spawning',
				lastPhaseAt: now,
				mcpHandshakeStatus: 'pending',
				transportStatus: 'pending',
				pluginErrors: [],
				elapsedSecondsSinceCreation: 0,
			};
			return evidence;
		}

-56

host-cp/lifecycle/failure-kinds.mjs

		// World startup failure buckets — the six canonical classes the
		// classifier maps every observed Failed transition into.
		//
		// Order is load-bearing: the classifier walks these in declaration
		// order on ambiguous evidence, so higher-confidence buckets
		// (PromptMisdelivery, TransportDead) come before time-bounded
		// inferences (TrustGateUnanswered, McpHandshakeStall). Adding a 7th
		// bucket requires updating the classifier precedence and the
		// `world.lifecycle.Failed` consumers in the SPA + NDJSON trace.

		/**
		* @typedef {| 'PromptMisdelivery'
		* | 'TransportDead'
		* | 'TrustGateUnanswered'
		* | 'McpHandshakeStall'
		* | 'PluginStartupFailed'
		* | 'ProviderProcessGone'} WorldStartupFailureKind
		*/

		/**
		* Frozen enum-style lookup of the startup failure kinds. Every key maps to
		* the identical string, so `WorldStartupFailureKind.TransportDead` is
		* `'TransportDead'`. Object.freeze makes the mapping itself immutable.
		*
		* @type {Readonly<Record<WorldStartupFailureKind, WorldStartupFailureKind>>}
		*/
		export const WorldStartupFailureKind = Object.freeze({
		/** Dispatch sent but agent never received it (transport mismatch). */
		PromptMisdelivery: 'PromptMisdelivery',
		/** stdin/stdout/IPC channel never opened. */
		TransportDead: 'TransportDead',
		/** Agent reached TrustRequired, no approval ever arrived. */
		TrustGateUnanswered: 'TrustGateUnanswered',
		/** MCP server connection initialized but never completed handshake. */
		McpHandshakeStall: 'McpHandshakeStall',
		/** Plugin or skill source failed to load on boot. */
		PluginStartupFailed: 'PluginStartupFailed',
		/** Agent (Claude Code) process exited before responding. */
		ProviderProcessGone: 'ProviderProcessGone',
		});

		/**
		* Failure kinds listed in the same sequence as they are declared on
		* WorldStartupFailureKind. The array is frozen, so it cannot be reordered
		* or extended at runtime. isWorldStartupFailureKind uses it for its
		* membership check.
		*/
		export const WORLD_STARTUP_FAILURE_KIND_ORDER = Object.freeze([
		WorldStartupFailureKind.PromptMisdelivery,
		WorldStartupFailureKind.TransportDead,
		WorldStartupFailureKind.TrustGateUnanswered,
		WorldStartupFailureKind.McpHandshakeStall,
		WorldStartupFailureKind.PluginStartupFailed,
		WorldStartupFailureKind.ProviderProcessGone,
		]);

		/**
		 * Type guard: reports whether `value` is one of the failure-kind strings
		 * listed in WORLD_STARTUP_FAILURE_KIND_ORDER. Non-string input is rejected
		 * before the list is consulted.
		 *
		 * @param {unknown} value candidate to test
		 * @returns {value is WorldStartupFailureKind}
		 */
		export function isWorldStartupFailureKind(value) {
			if (typeof value !== 'string') {
				return false;
			}
			return WORLD_STARTUP_FAILURE_KIND_ORDER.includes(/** @type {any} */ (value));
		}

-22

host-cp/lifecycle/index.mjs

		// Barrel re-export for the lifecycle module. Importers should pull
		// from '@olam/host-cp/lifecycle' (or the relative path equivalent)
		// rather than reaching into individual files.

		export {
		WorldLifecyclePhase,
		WORLD_LIFECYCLE_PHASE_ORDER,
		TERMINAL_PHASES,
		isWorldLifecyclePhase,
		} from './phases.mjs';

		export {
		WorldStartupFailureKind,
		WORLD_STARTUP_FAILURE_KIND_ORDER,
		isWorldStartupFailureKind,
		} from './failure-kinds.mjs';

		export { emptyEvidence } from './evidence.mjs';

		export { classifyStartupFailure } from './classify.mjs';

		export { recordWorldLifecycle } from './emit.mjs';

-52

host-cp/lifecycle/phases.mjs

		// World lifecycle phases — the canonical FSM every Olam world walks
		// through from spawn to terminal state.
		//
		// Order is load-bearing: a world's `lastPhase` is a monotonic high-water
		// mark, and the classifier's precedence rules in classify.mjs assume
		// this ordering when deciding which failure bucket to attribute a stall
		// to. Do NOT reorder without updating the classifier.

		/**
		* @typedef {'Spawning' | 'TrustRequired' | 'ReadyForPrompt' | 'Running' | 'Finished' | 'Failed'} WorldLifecyclePhase
		*/

		/**
		* Frozen enum-style lookup of the lifecycle phases. Every key maps to the
		* identical string, so `WorldLifecyclePhase.Running` is `'Running'`.
		* Object.freeze makes the mapping itself immutable.
		*
		* @type {Readonly<Record<WorldLifecyclePhase, WorldLifecyclePhase>>}
		*/
		export const WorldLifecyclePhase = Object.freeze({
		/** Container or worktree created; before any code runs inside. */
		Spawning: 'Spawning',
		/** Agent process up; awaiting trust-gate approval. */
		TrustRequired: 'TrustRequired',
		/** Trust granted; awaiting initial dispatch. */
		ReadyForPrompt: 'ReadyForPrompt',
		/** Actively processing dispatch. */
		Running: 'Running',
		/** Completed successfully. Terminal. */
		Finished: 'Finished',
		/** Terminal failure. Carries an evidence bundle + classified failure kind. */
		Failed: 'Failed',
		});

		/**
		* Phases in canonical order. Useful for ordinal comparison: a phase's
		* index in this frozen array is its position in the lifecycle. The two
		* terminal phases come last, with Finished listed before Failed.
		* isWorldLifecyclePhase uses this array for its membership check.
		*/
		export const WORLD_LIFECYCLE_PHASE_ORDER = Object.freeze([
		WorldLifecyclePhase.Spawning,
		WorldLifecyclePhase.TrustRequired,
		WorldLifecyclePhase.ReadyForPrompt,
		WorldLifecyclePhase.Running,
		WorldLifecyclePhase.Finished,
		WorldLifecyclePhase.Failed,
		]);

		/**
		 * Terminal phases — no transitions out.
		 *
		 * `Object.freeze` only locks an object's own properties. A Set keeps its
		 * members in internal state, so freezing alone would still allow
		 * `add` / `delete` / `clear` to change the membership. The mutators are
		 * therefore shadowed with throwing stubs before the freeze, which makes
		 * the set read-only in practice. Read access (`has`, `size`, iteration,
		 * copying via `new Set(TERMINAL_PHASES)`) is unaffected.
		 */
		export const TERMINAL_PHASES = (() => {
			const phases = new Set([WorldLifecyclePhase.Finished, WorldLifecyclePhase.Failed]);
			const rejectMutation = () => {
				throw new TypeError('TERMINAL_PHASES is read-only');
			};
			for (const mutator of ['add', 'delete', 'clear']) {
				// Own, non-writable properties take precedence over Set.prototype's.
				Object.defineProperty(phases, mutator, { value: rejectMutation });
			}
			return Object.freeze(phases);
		})();

		/**
		 * Type guard: reports whether `value` is one of the phase strings listed
		 * in WORLD_LIFECYCLE_PHASE_ORDER. Non-string input is rejected before the
		 * list is consulted.
		 *
		 * @param {unknown} value candidate to test
		 * @returns {value is WorldLifecyclePhase}
		 */
		export function isWorldLifecyclePhase(value) {
			if (typeof value !== 'string') {
				return false;
			}
			return WORLD_LIFECYCLE_PHASE_ORDER.includes(/** @type {any} */ (value));
		}

-283

host-cp/observability/grafana-port-forward.sh

		#!/usr/bin/env bash
		# grafana-port-forward.sh — e2e smoke test: Grafana installs via Helm,
		# port-forward is accessible, Loki datasource
		# is pre-wired and reachable.
		#
		# Usage: scripts/e2e/grafana-port-forward.sh
		#
		# Pre-conditions:
		# - kubectl context is set to a live k8s cluster (does NOT spin up k3d)
		# - helm binary available
		# - jq binary available
		# - grafana Helm repo added (helm repo add grafana https://grafana.github.io/helm-charts)
		# - Loki is already installed (scripts/e2e/loki-ingest.sh ran successfully
		# OR `helm status olam-loki -n monitoring` is healthy)
		#
		# Idempotency: `helm upgrade --install` is idempotent; re-runs succeed on an
		# existing cluster. The Secret is applied via --dry-run | kubectl apply;
		# when it already exists, re-runs reuse its current password instead of
		# rotating it (see Step 1), so it stays in sync with Grafana's stored value.
		# The olam-dashboards ConfigMap is applied before helm install so
		# Grafana's volume mount finds the ConfigMap on first boot.
		#
		# Cleanup: port-forward is killed on exit; Helm release is left in place so
		# downstream tasks can reuse the same cluster.
		#
		# Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B2, B3
		# Chart: grafana/grafana 8.5.2 (pinned; latest stable 2026-05-20)

		set -euo pipefail

		NAMESPACE="monitoring"
		GRAFANA_RELEASE="olam-grafana"
		GRAFANA_CHART_VERSION="8.5.2"
		LOCAL_PORT="3000"
		GRAFANA_SVC_PORT="80"
		PF_BIND_SECONDS=5

		# Print a progress message on stderr, prefixed with the script tag.
		# All arguments are joined into one message.
		log() {
			printf '[grafana-port-forward] %s\n' "$*" >&2
		}
		# Print a FAIL-tagged message on stderr and terminate with exit status 1.
		fail() {
			printf '[grafana-port-forward] FAIL: %s\n' "$*" >&2
			exit 1
		}

		# -------------------------------------------------------------------------
		# Cleanup trap — kill port-forward on exit; leave Helm release in place
		# -------------------------------------------------------------------------
		PF_PID=""
		cleanup() {
			# Nothing to do until a port-forward has been started.
			[[ -n "$PF_PID" ]] || return 0
			# Skip the kill when the process has already gone away.
			kill -0 "$PF_PID" 2>/dev/null || return 0
			# Best-effort: the process may exit between the probe and the kill.
			kill "$PF_PID" 2>/dev/null || true
		}
		trap cleanup EXIT

		# -------------------------------------------------------------------------
		# Pre-flight
		# -------------------------------------------------------------------------
		command -v helm >/dev/null 2>&1 \|\| fail "helm not installed"
		command -v kubectl >/dev/null 2>&1 \|\| fail "kubectl not installed"
		command -v curl >/dev/null 2>&1 \|\| fail "curl not installed"
		command -v openssl >/dev/null 2>&1 \|\| fail "openssl not installed"
		command -v jq >/dev/null 2>&1 \|\| fail "jq not installed (required for B3 dashboard assertion)"
		kubectl cluster-info >/dev/null 2>&1 \|\| fail "kubectl: no reachable cluster; set KUBECONFIG"

		log "pre-flight checks passed"

		# -------------------------------------------------------------------------
		# Ensure grafana Helm repo is present (idempotent — safe to re-run)
		# -------------------------------------------------------------------------
		helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null \|\| true
		helm repo update grafana

		# Verify Loki is already installed (B2 depends on B1)
		if ! helm status "olam-loki" -n "$NAMESPACE" >/dev/null 2>&1; then
		fail "olam-loki Helm release not found in namespace $NAMESPACE — run scripts/e2e/loki-ingest.sh first"
		fi
		log "Loki pre-condition satisfied (olam-loki release found)"

		# -------------------------------------------------------------------------
		# Step 1: Resolve admin password (preserve existing on idempotent re-run)
		# -------------------------------------------------------------------------
		# Grafana persists the admin password in its internal SQLite on first
		# deploy. Subsequent helm upgrades do NOT re-read GF_SECURITY_ADMIN_PASSWORD
		# from the env (env value is set once at pod-start and not refreshed). So
		# on a re-run, rotating the Secret leaves the in-Grafana password stale
		# and breaks API auth.
		#
		# Idempotency contract: if the Secret already exists, reuse its current
		# password. The Secret's value matches Grafana's stored value (set in
		# concert on first install). Only generate a new password when the
		# Secret doesn't exist yet — i.e. true first deploy.
		if kubectl get secret olam-grafana-admin -n "$NAMESPACE" >/dev/null 2>&1; then
		log "reusing existing admin password from Secret olam-grafana-admin"
		GRAFANA_ADMIN_PW=$(kubectl get secret olam-grafana-admin -n "$NAMESPACE" \
		-o jsonpath='{.data.admin-password}' \| base64 -d)
		else
		log "generating fresh admin password (first deploy)"
		GRAFANA_ADMIN_PW=$(openssl rand -base64 24)
		fi
		export GRAFANA_ADMIN_PW

		# -------------------------------------------------------------------------
		# Step 2: Create / update the admin Secret idempotently
		# -------------------------------------------------------------------------
		log "applying Secret olam-grafana-admin in namespace $NAMESPACE"
		kubectl create secret generic olam-grafana-admin \
		--from-literal=admin-user=admin \
		--from-literal=admin-password="$GRAFANA_ADMIN_PW" \
		-n "$NAMESPACE" \
		--dry-run=client -o yaml \
		\| kubectl apply -f -

		log "Secret applied"

		# -------------------------------------------------------------------------
		# Step 3a: Apply olam-dashboards ConfigMap BEFORE helm install
		# so Grafana's volume mount finds it on first boot (B3).
		# The ConfigMap is generated from grafana-dashboards/*.json by
		# packages/peripheral-services/scripts/sync-grafana-dashboards.sh.
		# -------------------------------------------------------------------------
		REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null \|\| pwd)"
		# When invoked from a published @pleri/olam-cli install (no monorepo), `olam
		# setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
		# peripheral-services/{helm-values,manifests} directory is reachable.
		# Monorepo callers leave it unset; the script falls back to the source dir
		# under packages/peripheral-services/.
		if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
		PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
		else
		PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
		fi
		CONFIGMAP_MANIFEST="$PERIPHERAL_SERVICES_DIR/manifests/80-grafana-dashboard-configmap.yaml"

		if [[ -f "$CONFIGMAP_MANIFEST" ]]; then
		log "applying olam-dashboards ConfigMap from $CONFIGMAP_MANIFEST"
		kubectl apply -f "$CONFIGMAP_MANIFEST"
		log "ConfigMap applied"
		else
		log "WARN: $CONFIGMAP_MANIFEST not found — Grafana will warn 'ConfigMap not found' until B3 is deployed"
		fi

		# -------------------------------------------------------------------------
		# Step 3: Helm upgrade --install
		# -------------------------------------------------------------------------
		log "installing grafana/grafana ($GRAFANA_RELEASE) in namespace $NAMESPACE"
		helm upgrade --install "$GRAFANA_RELEASE" grafana/grafana \
		--version "$GRAFANA_CHART_VERSION" \
		--namespace "$NAMESPACE" \
		--create-namespace \
		-f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
		--wait \
		--timeout "${OLAM_HELM_TIMEOUT:-600s}"

		log "Grafana Helm install complete"

		# -------------------------------------------------------------------------
		# Step 4: Wait for Grafana pod Ready
		# -------------------------------------------------------------------------
		log "waiting for Grafana pod Ready (120s)"
		kubectl wait \
		--for=condition=ready pod \
		-l "app.kubernetes.io/name=grafana" \
		-n "$NAMESPACE" \
		--timeout=120s

		log "Grafana pod Ready"

		# -------------------------------------------------------------------------
		# Step 5: Start port-forward in background
		# -------------------------------------------------------------------------
		log "port-forwarding svc/$GRAFANA_RELEASE $LOCAL_PORT:$GRAFANA_SVC_PORT in namespace $NAMESPACE"
		kubectl port-forward \
		-n "$NAMESPACE" \
		"svc/$GRAFANA_RELEASE" \
		"${LOCAL_PORT}:${GRAFANA_SVC_PORT}" &
		PF_PID=$!

		log "port-forward PID $PF_PID; waiting ${PF_BIND_SECONDS}s for bind"
		sleep "$PF_BIND_SECONDS"

		# Verify the port-forward process is still alive after sleep
		kill -0 "$PF_PID" 2>/dev/null \|\| fail "port-forward process exited prematurely"

		# -------------------------------------------------------------------------
		# Diagnostic helper — called on assertion failure
		# -------------------------------------------------------------------------
		dump_diagnostics() {
			# Print the tail of the Grafana pod logs to stderr for post-mortem.
			# Best-effort: a kubectl failure here never fails the caller.
			log "DIAGNOSTIC: last 50 lines of Grafana pod logs:"
			# Only stdout needs redirecting; stderr already goes to stderr. The
			# earlier `2>&1 >&2` ordering pointed BOTH streams at stdout, so when
			# this helper ran inside a `$( ... )` capture the logs were swallowed
			# into the captured variable instead of being shown.
			kubectl logs -n "$NAMESPACE" \
				-l "app.kubernetes.io/name=grafana" \
				--tail=50 >&2 || true
		}

		# -------------------------------------------------------------------------
		# Step 6: Assertion 1 — /api/health returns 200 with database: ok
		# -------------------------------------------------------------------------
		log "asserting Grafana health (GET /api/health)"
		HEALTH_RESPONSE=$(
		curl -sf \
		-u "admin:${GRAFANA_ADMIN_PW}" \
		"http://localhost:${LOCAL_PORT}/api/health" \
		\|\| { dump_diagnostics; fail "GET /api/health failed — Grafana not reachable on port $LOCAL_PORT"; }
		)

		if ! echo "$HEALTH_RESPONSE" \| jq -e '.database == "ok"' >/dev/null 2>&1; then
		log "DIAGNOSTIC: /api/health response:"
		echo "$HEALTH_RESPONSE" >&2
		dump_diagnostics
		fail '/api/health returned database != "ok" — Grafana DB layer not healthy'
		fi

		log "PASS: /api/health → database: ok"

		# -------------------------------------------------------------------------
		# Step 7: Assertion 2 — /api/datasources includes Loki entry with cluster URL
		# -------------------------------------------------------------------------
		log "asserting Loki datasource pre-wired (GET /api/datasources)"
		DS_RESPONSE=$(
		curl -sf \
		-u "admin:${GRAFANA_ADMIN_PW}" \
		"http://localhost:${LOCAL_PORT}/api/datasources" \
		\|\| { dump_diagnostics; fail "GET /api/datasources failed"; }
		)

		EXPECTED_URL="olam-loki.monitoring.svc.cluster.local:3100"

		if ! echo "$DS_RESPONSE" \| jq -e 'map(select(.type == "loki")) \| length >= 1' >/dev/null 2>&1; then
		log "DIAGNOSTIC: /api/datasources response:"
		echo "$DS_RESPONSE" >&2
		dump_diagnostics
		fail "datasources response contains no 'loki' type entry — datasource not provisioned"
		fi

		if ! echo "$DS_RESPONSE" \| jq -e --arg url "$EXPECTED_URL" 'map(select(.type == "loki" and (.url \| contains($url)))) \| length >= 1' >/dev/null 2>&1; then
		log "DIAGNOSTIC: /api/datasources response:"
		echo "$DS_RESPONSE" >&2
		dump_diagnostics
		fail "Loki datasource URL does not contain '$EXPECTED_URL' — check grafana-values.yaml datasources block"
		fi

		log "PASS: Loki datasource found with cluster-local URL $EXPECTED_URL"

		# -------------------------------------------------------------------------
		# Step 7b: Assertion 2b — dashboard provider loaded olam-home (catches mount-path bugs)
		# -------------------------------------------------------------------------
		log "asserting olam-home dashboard visible in /api/search (catches ConfigMap mount failures)"
		DASHBOARDS=$(
		curl -sf \
		-u "admin:${GRAFANA_ADMIN_PW}" \
		"http://localhost:${LOCAL_PORT}/api/search?type=dash-db&query=olam" \
		\|\| true
		)

		if ! echo "$DASHBOARDS" \| jq -e 'map(select(.uid == "olam-home")) \| length == 1' >/dev/null 2>&1; then
		log "DIAGNOSTIC: /api/search response:"
		echo "$DASHBOARDS" >&2
		dump_diagnostics
		fail "olam-home dashboard not found in /api/search — check ConfigMap mount path and dashboard provider config"
		fi

		log "PASS: olam-home dashboard found via /api/search"

		# -------------------------------------------------------------------------
		# Step 8: Assertion 3 — olam-home dashboard present (B3)
		# -------------------------------------------------------------------------
		log "asserting olam-home dashboard present (GET /api/dashboards/uid/olam-home)"
		DASHBOARD_RESPONSE=$(
		curl -sf \
		-u "admin:${GRAFANA_ADMIN_PW}" \
		"http://localhost:${LOCAL_PORT}/api/dashboards/uid/olam-home" \
		\|\| { dump_diagnostics; fail "GET /api/dashboards/uid/olam-home failed — dashboard not found or Grafana unreachable"; }
		)

		if ! echo "$DASHBOARD_RESPONSE" \| jq -e '.dashboard.uid == "olam-home"' >/dev/null 2>&1; then
		log "DIAGNOSTIC: /api/dashboards/uid/olam-home response:"
		echo "$DASHBOARD_RESPONSE" >&2
		dump_diagnostics
		fail "olam-home dashboard uid mismatch or missing — check ConfigMap provisioning and Grafana provider config"
		fi

		log "PASS: olam-home dashboard present with uid=olam-home"

		# -------------------------------------------------------------------------
		# Final
		# -------------------------------------------------------------------------
		log "PASS: Grafana port-forward accessible; Loki datasource pre-wired; olam-home dashboard provisioned — Tasks B2+B3 verified"
		exit 0

-462

host-cp/observability/kyverno-cardinality-mutate.sh

		#!/usr/bin/env bash
		# kyverno-cardinality-mutate.sh — Phase C C8 follow-up e2e smoke test.
		#
		# Verifies that the Kyverno ClusterPolicy
		# `enforce-cardinality-labeldrop` mutates incoming ServiceMonitor and
		# PodMonitor objects at admission time, regardless of authorship,
		# closing codex's "policy by convention" gap on PR #783.
		#
		# Test approach:
		# 1. helm-install Kyverno (pinned 3.8.1) into the `kyverno` namespace.
		# 2. Apply the ClusterPolicy.
		# 3. POSITIVE test: apply ServiceMonitor `kyverno-mutate-positive-test`
		# with selector `app: kyverno-mutate-positive-test` (no backing Service)
		# and NO metricRelabelings; assert Kyverno mutated it; delete immediately.
		# 4. IDEMPOTENCY test: apply ServiceMonitor `kyverno-mutate-idempotency-test`
		# with selector `app: kyverno-mutate-idempotency-test` (different non-existent
		# label) and the labeldrop already present; assert count stays at 1; delete.
		# 5. SCRAPE-VERIFICATION test: deploy synthetic `kyverno-emitter` (Service +
		# Deployment + ConfigMap) + dedicated ServiceMonitor `kyverno-emitter-sm`
		# applied WITHOUT metricRelabelings; assert Kyverno mutates the SM at admission;
		# wait for pod Ready; poll Prometheus for http_requests_total; assert
		# world_id label is ABSENT.
		#
		# Key design decision: POSITIVE and IDEMPOTENCY tests use selectors that match
		# no real Service, so they are isolated from each other and from the SCRAPE test.
		# A single dedicated SM (`kyverno-emitter-sm`) owns the emitter endpoint, so
		# prometheus-operator can reliably reconcile exactly one scrape config for it.
		# Root cause of the prior failure (PR #828 CI run 26239574154): two SMs
		# (naive-violator + pre-armoured-violator) competed for the same
		# `app: kyverno-emitter` Endpoints; operator never reconciled either.
		#
		# Pre-conditions:
		# - kube-prometheus-stack installed (cardinality-drop.sh ran).
		# - kubectl context set to a live cluster; helm + jq + curl available.
		#
		# Idempotency: kubectl apply is idempotent; helm upgrade --install is
		# idempotent. Cleanup trap removes synthetic resources on exit. The
		# ClusterPolicy + Kyverno install are LEFT in the cluster (permanent
		# C8 fixtures).
		#
		# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — C8
		# codex review on PR #783 ("policy by convention" finding)
		# PR #828 CI run 26239574154 (competing-SM root cause)

		set -euo pipefail

		KYVERNO_VERSION="3.8.1"
		KYVERNO_NAMESPACE="kyverno"
		TEST_NAMESPACE="monitoring"
		PROM_LOCAL_PORT="9092" # 9090, 9091 may be in use by sibling Phase C scripts
		PF_BIND_SECONDS=5
		TARGET_DISCOVERY_TIMEOUT="${OLAM_PROM_DISCOVERY_TIMEOUT:-240}" # bumped from 180s; one CI attempt observed kyverno-emitter still not scraped at 180s
		SCRAPE_POLL_INTERVAL=10

		# Print a progress message on stderr, prefixed with the script tag.
		# All arguments are joined into one message.
		log() {
			printf '[kyverno-mutate] %s\n' "$*" >&2
		}
		# Print a FAIL-tagged message on stderr and terminate with exit status 1.
		fail() {
			printf '[kyverno-mutate] FAIL: %s\n' "$*" >&2
			exit 1
		}

		REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null \|\| pwd)"
		# When invoked from a published @pleri/olam-cli install (no monorepo), `olam
		# setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
		# peripheral-services/{helm-values,manifests} directory is reachable.
		# Monorepo callers leave it unset; the script falls back to the source dir
		# under packages/peripheral-services/.
		if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
		PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
		else
		PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
		fi

		# -------------------------------------------------------------------------
		# Cleanup trap — kill port-forwards; remove synthetic resources on exit.
		# Kyverno chart + ClusterPolicy stay (permanent C8 fixtures).
		# -------------------------------------------------------------------------
		PROM_PF_PID=""
		cleanup() {
			# Stop the Prometheus port-forward if one was started (best-effort).
			if [[ -n "$PROM_PF_PID" ]]; then
				kill "$PROM_PF_PID" 2>/dev/null || true
			fi
			log "removing synthetic resources (idempotent)"
			# kind/name pairs, deleted in this order: the two mutation-test
			# ServiceMonitors first (normally already deleted inline; the
			# --ignore-not-found flag keeps a second delete harmless), then the
			# scrape-verification resources.
			local target
			for target in \
				servicemonitor/kyverno-mutate-positive-test \
				servicemonitor/kyverno-mutate-idempotency-test \
				servicemonitor/kyverno-emitter-sm \
				deployment/kyverno-emitter \
				service/kyverno-emitter-svc \
				configmap/kyverno-emitter-config
			do
				kubectl delete "${target%%/*}" "${target#*/}" -n "$TEST_NAMESPACE" --ignore-not-found=true 2>/dev/null || true
			done
		}
		trap cleanup EXIT

		# -------------------------------------------------------------------------
		# Pre-flight
		# -------------------------------------------------------------------------
		command -v helm >/dev/null 2>&1 \|\| fail "helm not installed"
		command -v kubectl >/dev/null 2>&1 \|\| fail "kubectl not installed"
		command -v curl >/dev/null 2>&1 \|\| fail "curl not installed"
		command -v jq >/dev/null 2>&1 \|\| fail "jq not installed"
		kubectl cluster-info >/dev/null 2>&1 \|\| fail "kubectl: no reachable cluster; set KUBECONFIG"

		# kube-prom-stack must already be up — we rely on Prometheus + the
		# ServiceMonitor CRD existing.
		kubectl get crd servicemonitors.monitoring.coreos.com >/dev/null 2>&1 \
		\|\| fail "ServiceMonitor CRD not present — run prom-no-double-grafana.sh first"
		# NOTE(review): `kubectl get` with a label selector normally exits 0 even
		# when no object matches ("No resources found"), so this guard likely only
		# trips on API errors. Confirm the selector matches the operator Deployment's
		# real labels, then consider asserting non-empty `-o name` output instead.
		kubectl get deployment -n "$TEST_NAMESPACE" -l "app.kubernetes.io/name=prometheus-operator" \
		>/dev/null 2>&1 \
		\|\| fail "prometheus-operator not found in $TEST_NAMESPACE — run prom-no-double-grafana.sh first"

		log "pre-flight checks passed"

		# -------------------------------------------------------------------------
		# Step 1: helm-install Kyverno
		#
		# Repo add is idempotent; helm upgrade --install handles fresh install + upgrade.
		# `--wait` blocks until pods are Ready; admission webhook needs to be live
		# before we apply the ClusterPolicy or our test ServiceMonitors.
		# -------------------------------------------------------------------------
		log "ensuring kyverno helm repo is configured"
		helm repo add kyverno https://kyverno.github.io/kyverno/ >/dev/null 2>&1 \|\| true
		helm repo update kyverno >/dev/null 2>&1 \|\| true

		log "installing kyverno chart $KYVERNO_VERSION (waits for admission webhook Ready)"
		helm upgrade --install olam-kyverno kyverno/kyverno \
		--version "$KYVERNO_VERSION" \
		--namespace "$KYVERNO_NAMESPACE" \
		--create-namespace \
		-f "$PERIPHERAL_SERVICES_DIR/helm-values/kyverno-values.yaml" \
		--wait --timeout "${OLAM_HELM_TIMEOUT:-600s}" 2>&1 \| tail -8

		# Sanity: a kyverno admission-controller Deployment exists.
		# `kubectl get` with a label selector exits 0 even when nothing matches, so
		# the exit status alone cannot detect a missing Deployment. Require at least
		# one resource name in the output instead.
		[[ -n "$(kubectl get deployment -n "$KYVERNO_NAMESPACE" \
			-l "app.kubernetes.io/component=admission-controller" \
			-o name 2>/dev/null)" ]] \
			|| fail "kyverno admission controller not found in $KYVERNO_NAMESPACE"

		log "waiting for kyverno admission webhook to be registered with apiserver"
		# Webhook registration is the last thing kyverno does after its pods turn
		# Ready. Probe every 5s and give up once 120s have elapsed without seeing
		# the validating webhook configuration.
		elapsed=0
		until kubectl get validatingwebhookconfiguration kyverno-policy-validating-webhook-cfg \
			>/dev/null 2>&1
		do
			sleep 5
			elapsed=$((elapsed + 5))
			if (( elapsed >= 120 )); then
				fail "kyverno webhook registration timed out after 120s"
			fi
		done
		log "kyverno webhooks registered after ${elapsed}s"

		# -------------------------------------------------------------------------
		# Step 2: Apply the ClusterPolicy
		# -------------------------------------------------------------------------
		log "applying ClusterPolicy enforce-cardinality-labeldrop"
		kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/96-kyverno-cardinality-mutate.yaml"

		# Give the Kyverno controller up to 60s to report the policy as ready via
		# status.ready, probing every 3s. A timeout is only a warning here, because
		# the status field can lag behind the policy actually being enforced.
		log "waiting up to 60s for ClusterPolicy to be Ready"
		for (( elapsed = 0; elapsed < 60; elapsed += 3 )); do
			# A failed lookup is treated the same as "not ready yet".
			READY=$(kubectl get clusterpolicy enforce-cardinality-labeldrop \
				-o jsonpath='{.status.ready}' 2>/dev/null || echo "")
			if [[ "$READY" == "true" ]]; then
				log "ClusterPolicy Ready after ${elapsed}s"
				break
			fi
			sleep 3
		done
		if (( elapsed >= 60 )); then
			log "WARN: ClusterPolicy status.ready not observed within 60s; proceeding (status field can lag)"
		fi

		# -------------------------------------------------------------------------
		# Step 3: POSITIVE test — mutation only, no backing Service
		#
		# Uses selector `app: kyverno-mutate-positive-test` — a label that no
		# real Service carries, so this SM never competes with anything for
		# Endpoints. Its sole job is to exercise the Kyverno admission webhook.
		#
		# Deleted immediately after assertion so the SM space is clean when
		# the scrape test runs.
		# -------------------------------------------------------------------------
		log "POSITIVE test: applying naive ServiceMonitor (no metricRelabelings, non-Service-backed selector)"
		kubectl apply -f - <<'EOF'
		---
		apiVersion: monitoring.coreos.com/v1
		kind: ServiceMonitor
		metadata:
		name: kyverno-mutate-positive-test
		namespace: monitoring
		labels:
		release: olam-prom
		spec:
		namespaceSelector:
		matchNames:
		- monitoring
		selector:
		matchLabels:
		app: kyverno-mutate-positive-test
		endpoints:
		- port: metrics
		interval: 15s
		# NOTE: deliberately NO metricRelabelings — Kyverno must inject it.
		EOF

		# Read back and assert.
		ACTUAL=$(kubectl get servicemonitor kyverno-mutate-positive-test -n "$TEST_NAMESPACE" -o json \
		\| jq -r '.spec.endpoints[0].metricRelabelings // [] \| tojson')
		log "kyverno-mutate-positive-test metricRelabelings after admission: $ACTUAL"

		INJECTED_COUNT=$(echo "$ACTUAL" \| jq '[ .[] \| select(.action == "labeldrop" and (.regex \| contains("world_id"))) ] \| length')
		if [ "$INJECTED_COUNT" -lt 1 ]; then
		log "actual policy state:"
		kubectl get clusterpolicy enforce-cardinality-labeldrop -o yaml >&2 \|\| true
		fail "POSITIVE test FAILED: Kyverno did not inject labeldrop into naive ServiceMonitor — third-party bypass gap NOT closed"
		fi
		log "PASS: naive ServiceMonitor was mutated at admission (labeldrop injected)"

		log "deleting kyverno-mutate-positive-test (mutation-only test; SM space clean for scrape test)"
		kubectl delete servicemonitor kyverno-mutate-positive-test -n "$TEST_NAMESPACE" --ignore-not-found=true

		# -------------------------------------------------------------------------
		# Step 4: IDEMPOTENCY test — mutation only, no backing Service
		#
		# Uses selector `app: kyverno-mutate-idempotency-test` — different from
		# the positive test and from the scrape test label. No real Service.
		# Deleted immediately after assertion.
		# -------------------------------------------------------------------------
		log "IDEMPOTENCY test: applying pre-armoured ServiceMonitor (labeldrop already present)"
		kubectl apply -f - <<'EOF'
		---
		apiVersion: monitoring.coreos.com/v1
		kind: ServiceMonitor
		metadata:
		name: kyverno-mutate-idempotency-test
		namespace: monitoring
		labels:
		release: olam-prom
		spec:
		namespaceSelector:
		matchNames:
		- monitoring
		selector:
		matchLabels:
		app: kyverno-mutate-idempotency-test
		endpoints:
		- port: metrics
		interval: 15s
		metricRelabelings:
		- action: labeldrop
		regex: 'world_id\|trace_id\|user_id\|request_id\|operator_id'
		EOF

		DUP_COUNT=$(kubectl get servicemonitor kyverno-mutate-idempotency-test -n "$TEST_NAMESPACE" -o json \
		\| jq '[ .spec.endpoints[0].metricRelabelings[] \| select(.action == "labeldrop" and (.regex \| contains("world_id"))) ] \| length')
		log "kyverno-mutate-idempotency-test labeldrop count: $DUP_COUNT"
		if [ "$DUP_COUNT" -ne 1 ]; then
		kubectl get servicemonitor kyverno-mutate-idempotency-test -n "$TEST_NAMESPACE" -o yaml >&2
		fail "IDEMPOTENCY test FAILED: expected 1 labeldrop entry, got $DUP_COUNT — policy double-adds"
		fi
		log "PASS: pre-armoured ServiceMonitor has exactly 1 labeldrop (no double-add)"

		log "deleting kyverno-mutate-idempotency-test (mutation-only test; SM space clean for scrape test)"
		kubectl delete servicemonitor kyverno-mutate-idempotency-test -n "$TEST_NAMESPACE" --ignore-not-found=true

		# -------------------------------------------------------------------------
		# Step 5: SCRAPE-VERIFICATION test — dedicated SM + Service + Pod
		#
		# One SM (`kyverno-emitter-sm`) selects exactly one Service (`kyverno-emitter-svc`).
		# No other SM in the cluster selects `app: kyverno-emitter`, so prometheus-operator
		# reconciles a single clean scrape config.
		#
		# The SM is applied WITHOUT metricRelabelings so Kyverno's admission webhook
		# fires — this is the load-bearing check that the policy applies during real
		# scrape setup, not just on test fixtures.
		#
		# After admission we verify the spec has the labeldrop, then wait for the pod
		# to be Ready and poll Prometheus for http_requests_total. We assert
		# world_id is absent from all returned series.
		#
		# Mirrors the working pattern from dashboards-have-data.sh (single dedicated
		# SM + co-located Service in `monitoring` namespace).
		# -------------------------------------------------------------------------
		log "SCRAPE-VERIFICATION test: deploying synthetic kyverno-emitter (emits http_requests_total{world_id})"
		kubectl apply -f - <<'EOF'
		---
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: kyverno-emitter-config
		namespace: monitoring
		data:
		metrics: \|
		# HELP http_requests_total Synthetic counter; world_id is the cardinality bomb
		# TYPE http_requests_total counter
		http_requests_total{world_id="kyverno-world",route="/api",method="GET",status_code="200"} 1
		---
		apiVersion: apps/v1
		kind: Deployment
		metadata:
		name: kyverno-emitter
		namespace: monitoring
		labels:
		app: kyverno-emitter
		spec:
		replicas: 1
		selector:
		matchLabels:
		app: kyverno-emitter
		template:
		metadata:
		labels:
		app: kyverno-emitter
		spec:
		containers:
		- name: emitter
		image: python:3.11-alpine
		ports:
		- containerPort: 8080
		command: ["python3", "-c"]
		args:
		- \|
		import http.server
		with open('/config/metrics') as f: METRICS = f.read().encode()
		class H(http.server.BaseHTTPRequestHandler):
		def do_GET(self):
		if self.path != '/metrics':
		self.send_response(404); self.end_headers(); return
		self.send_response(200)
		self.send_header('Content-Type', 'text/plain; version=0.0.4; charset=utf-8')
		self.end_headers()
		self.wfile.write(METRICS)
		def log_message(self, *a): pass
		http.server.HTTPServer(('0.0.0.0', 8080), H).serve_forever()
		volumeMounts:
		- name: config
		mountPath: /config
		volumes:
		- name: config
		configMap:
		name: kyverno-emitter-config
		---
		apiVersion: v1
		kind: Service
		metadata:
		name: kyverno-emitter-svc
		namespace: monitoring
		labels:
		app: kyverno-emitter
		spec:
		selector:
		app: kyverno-emitter
		ports:
		- name: metrics
		port: 8080
		targetPort: 8080
		EOF

		log "waiting for kyverno-emitter deployment Ready"
		kubectl rollout status deployment/kyverno-emitter -n "$TEST_NAMESPACE" --timeout=120s

		# Apply the dedicated ServiceMonitor WITHOUT metricRelabelings so Kyverno
		# mutates it at admission — this proves the policy fires on real SM objects,
		# not just on the POSITIVE test fixture.
		log "applying kyverno-emitter-sm (no metricRelabelings — Kyverno must inject)"
		kubectl apply -f - <<'EOF'
		---
		apiVersion: monitoring.coreos.com/v1
		kind: ServiceMonitor
		metadata:
		name: kyverno-emitter-sm
		namespace: monitoring
		labels:
		release: olam-prom
		spec:
		namespaceSelector:
		matchNames:
		- monitoring
		selector:
		matchLabels:
		app: kyverno-emitter
		endpoints:
		- port: metrics
		interval: 15s
		# NOTE: NO metricRelabelings — Kyverno must inject the labeldrop at admission.
		EOF

		# Verify Kyverno mutated this SM too (belt-and-suspenders: proves the policy
		# applies to the SM that actually drives the scrape, not just the test fixtures).
		SCRAPE_SM_ACTUAL=$(kubectl get servicemonitor kyverno-emitter-sm -n "$TEST_NAMESPACE" -o json \
		\| jq -r '.spec.endpoints[0].metricRelabelings // [] \| tojson')
		log "kyverno-emitter-sm metricRelabelings after admission: $SCRAPE_SM_ACTUAL"

		SCRAPE_SM_INJECTED=$(echo "$SCRAPE_SM_ACTUAL" \| jq '[ .[] \| select(.action == "labeldrop" and (.regex \| contains("world_id"))) ] \| length')
		if [ "$SCRAPE_SM_INJECTED" -lt 1 ]; then
		log "actual policy state:"
		kubectl get clusterpolicy enforce-cardinality-labeldrop -o yaml >&2 \|\| true
		fail "SCRAPE-VERIFICATION test FAILED: Kyverno did not mutate kyverno-emitter-sm at admission"
		fi
		log "PASS: kyverno-emitter-sm was mutated at admission (labeldrop injected)"

		# Port-forward Prometheus and poll for metric samples.
		log "port-forwarding svc/prometheus-operated $PROM_LOCAL_PORT:9090"
		kubectl port-forward \
		-n "$TEST_NAMESPACE" \
		"svc/prometheus-operated" \
		"${PROM_LOCAL_PORT}:9090" &
		PROM_PF_PID=$!
		sleep "$PF_BIND_SECONDS"
		kill -0 "$PROM_PF_PID" 2>/dev/null \
		\|\| fail "Prometheus port-forward exited prematurely"

		PROM_URL="http://localhost:${PROM_LOCAL_PORT}"

		# Direct-metric polling rather than target-discovery polling.
		#
		# Rationale: kube-prometheus-stack's default relabel sets the `job` label
		# from the k8s Service name. Polling by job-name is brittle — operator
		# reconciliation races, dropped-target filtering, and rare CRD revision
		# lag have all surfaced as "target not in activeTargets" flakes during
		# earlier ingress-integration runs. What we ACTUALLY care about is
		# whether the mutated relabel was applied to a real scrape sample. So
		# poll for the metric directly. With a single SM selecting on
		# `app=kyverno-emitter`, any http_requests_total series returned
		# necessarily came through kyverno-emitter-sm.
		log "polling Prometheus for http_requests_total samples (up to ${TARGET_DISCOVERY_TIMEOUT}s)"
		elapsed=0
		RESULT=""
		while [ "$elapsed" -lt "$TARGET_DISCOVERY_TIMEOUT" ]; do
		RESULT=$(curl -sf "${PROM_URL}/api/v1/query?query=http_requests_total" 2>/dev/null \|\| echo "")
		if [ -n "$RESULT" ]; then
		SERIES_COUNT=$(echo "$RESULT" \| jq '.data.result \| length' 2>/dev/null \|\| echo "0")
		if [ "$SERIES_COUNT" -ge 1 ]; then
		log "http_requests_total returned $SERIES_COUNT series after ${elapsed}s"
		break
		fi
		fi
		sleep "$SCRAPE_POLL_INTERVAL"
		elapsed=$((elapsed + SCRAPE_POLL_INTERVAL))
		done

		if [ "$elapsed" -ge "$TARGET_DISCOVERY_TIMEOUT" ]; then
		log "Active targets snapshot for diagnosis:"
		curl -sf "${PROM_URL}/api/v1/targets" \| jq '.data.activeTargets[] \| {job: .labels.job, service: .labels.service, namespace: .labels.namespace, health: .health, lastError: .lastError}' >&2 \|\| true
		log "ServiceMonitor kyverno-emitter-sm status:"
		kubectl get servicemonitor kyverno-emitter-sm -n "$TEST_NAMESPACE" -o yaml >&2 \|\| true
		log "prometheus-operator log tail (last 50 lines):"
		kubectl logs -n "$TEST_NAMESPACE" -l "app.kubernetes.io/name=prometheus-operator" --tail=50 >&2 \|\| true
		fail "Prometheus did not scrape kyverno-emitter within ${TARGET_DISCOVERY_TIMEOUT}s"
		fi

		SERIES_COUNT=$(echo "$RESULT" \| jq '.data.result \| length')

		LEAKED=$(echo "$RESULT" \| jq '[.data.result[] \| .metric \| has("world_id")] \| any')
		if [ "$LEAKED" = "true" ]; then
		echo "$RESULT" \| jq '.data.result[] \| .metric' >&2
		fail "world_id label leaked into Prometheus — Kyverno-mutated relabel did NOT take effect at scrape time"
		fi

		log "PASS: kyverno-emitter scraped via kyverno-emitter-sm; world_id absent at scrape time"
		log "PASS: C8 verified — Kyverno mutates third-party-shaped ServiceMonitors at admission and the mutation takes effect at scrape time"
		exit 0

-253

host-cp/observability/loki-ingest.sh

		#!/usr/bin/env bash
		# loki-ingest.sh — e2e smoke test: Loki single-binary installs, Promtail tails,
		# OAuth query-param scrubbing verified (code=REDACTED, no raw token).
		#
		# Usage: scripts/e2e/loki-ingest.sh
		#
		# Pre-conditions:
		# - kubectl context is set to a live k8s cluster (does NOT spin up k3d)
		# - helm binary available
		# - grafana Helm repo added (helm repo add grafana https://grafana.github.io/helm-charts)
		#
		# This script is invoked by the A12 harness (scripts/test-ingress-integration/)
		# after cluster-up.sh. It can also be run manually against any live cluster.
		#
		# Idempotency: `helm upgrade --install` is idempotent; re-runs succeed on an
		# existing cluster. The synthetic pod is cleaned up regardless of
		# pass/fail via a trap.
		#
		# Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B1
		# Chart: grafana/loki 6.7.4 (pinned; latest stable 2026-05-20)
		# Chart: grafana/promtail 6.16.6 (latest stable 2026-05-20)

		set -euo pipefail

		NAMESPACE="monitoring"
		LOKI_RELEASE="olam-loki"
		PROMTAIL_RELEASE="olam-promtail"
		SYNTHETIC_POD="loki-e2e-synthetic"
		LOKI_PORT="3100"
		LOCAL_PORT="13100" # avoid conflict with any host-level Loki

		# Magic-number commentary: Promtail's tail → ingest cycle involves:
		# - inotify event (near-instant)
		# - Promtail pipeline processing (~1s)
		# - Loki write path (ingester chunk idle period: default 30m, but flush on
		# query pressure; typically <5s in practice)
		# 10s is conservative for a single log line in a lightly loaded cluster.
		INGEST_LAG_SECONDS=10

		# Write one tagged progress line to stderr; all arguments are joined into a
		# single message.
		log() {
		  printf '%s %s\n' '[loki-ingest]' "$*" 1>&2
		}
		# Report a fatal condition on stderr, then terminate the script with status 1.
		fail() {
		  printf '%s FAIL: %s\n' '[loki-ingest]' "$*" 1>&2
		  exit 1
		}

		# -------------------------------------------------------------------------
		# Exit handler — stops the port-forward if it is still running and deletes
		# the synthetic pod. Both steps tolerate failure.
		# -------------------------------------------------------------------------
		PF_PID=""
		cleanup() {
		  if [[ -n "$PF_PID" ]]; then
		    if kill -0 "$PF_PID" 2>/dev/null; then
		      kill "$PF_PID" 2>/dev/null || true
		    fi
		  fi
		  kubectl delete pod "$SYNTHETIC_POD" -n default --ignore-not-found=true 2>/dev/null || true
		}
		trap cleanup EXIT

		# -------------------------------------------------------------------------
		# Pre-flight — every required binary must be on PATH and kubectl must be
		# able to reach a cluster before anything is installed.
		# -------------------------------------------------------------------------
		for required_bin in helm kubectl curl; do
		  if ! command -v "$required_bin" >/dev/null 2>&1; then
		    fail "$required_bin not installed"
		  fi
		done
		if ! kubectl cluster-info >/dev/null 2>&1; then
		  fail "kubectl: no reachable cluster; set KUBECONFIG"
		fi

		log "pre-flight checks passed"

		# -------------------------------------------------------------------------
		# Resolve repo root so helm -f paths work regardless of invocation cwd
		# -------------------------------------------------------------------------
		REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null \|\| pwd)"
		# When invoked from a published @pleri/olam-cli install (no monorepo), `olam
		# setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
		# peripheral-services/{helm-values,manifests} directory is reachable.
		# Monorepo callers leave it unset; the script resolves the source dir under
		# packages/peripheral-services/.
		if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
		PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
		else
		PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
		fi

		# -------------------------------------------------------------------------
		# Ensure grafana Helm repo is present (idempotent — safe to re-run)
		# -------------------------------------------------------------------------
		helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null \|\| true
		helm repo update grafana

		# -------------------------------------------------------------------------
		# Step 1: Install / upgrade Loki (single-binary mode)
		# -------------------------------------------------------------------------
		log "installing grafana/loki ($LOKI_RELEASE) in namespace $NAMESPACE"
		helm upgrade --install "$LOKI_RELEASE" grafana/loki \
		--version 6.7.4 \
		--namespace "$NAMESPACE" \
		--create-namespace \
		-f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
		--wait \
		--timeout "${OLAM_HELM_TIMEOUT:-600s}"

		log "loki helm install complete"

		# -------------------------------------------------------------------------
		# Step 2: Install / upgrade Promtail
		#
		# The wait budget honours OLAM_HELM_TIMEOUT like the Loki install above, so a
		# single override covers both charts. The default stays at 120s.
		# -------------------------------------------------------------------------
		log "installing grafana/promtail ($PROMTAIL_RELEASE) in namespace $NAMESPACE"
		helm upgrade --install "$PROMTAIL_RELEASE" grafana/promtail \
		  --version 6.16.6 \
		  --namespace "$NAMESPACE" \
		  -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
		  --wait \
		  --timeout "${OLAM_HELM_TIMEOUT:-120s}"

		log "promtail helm install complete"

		# -------------------------------------------------------------------------
		# Step 3: Wait for Loki pod Ready
		# -------------------------------------------------------------------------
		log "waiting for Loki pod Ready (120s)"
		kubectl wait \
		--for=condition=ready pod \
		-l app.kubernetes.io/name=loki \
		-n "$NAMESPACE" \
		--timeout=120s

		log "loki pod Ready"

		# -------------------------------------------------------------------------
		# Step 4: Generate synthetic log line with raw OAuth tokens in URL and headers.
		#
		# The pod prints a single log line containing all 4 scrub patterns:
		# ?code=SECRETTOKEN123 → code=REDACTED
		# &access_token=SECRETTOKEN456 → access_token=REDACTED
		# &state=SESSION789 → state=REDACTED
		# Authorization: Bearer SECRETBEARER000 → Authorization: Bearer REDACTED
		#
		# Promtail tails it, runs the scrubbing pipeline, and pushes to Loki with all
		# 4 raw tokens absent and all 4 REDACTED markers present.
		# -------------------------------------------------------------------------
		log "launching synthetic pod (prints all 4 raw token patterns)"
		# A previous run that was killed before its EXIT trap fired leaves the pod
		# behind, and `kubectl run` would then fail with AlreadyExists. Remove any
		# leftover first so re-runs stay idempotent.
		kubectl delete pod "$SYNTHETIC_POD" -n default --ignore-not-found=true >/dev/null \
		  || fail "could not remove stale synthetic pod $SYNTHETIC_POD"
		kubectl run "$SYNTHETIC_POD" \
		  --image=busybox \
		  --restart=Never \
		  -n default \
		  -- sh -c 'echo "GET http://example.com/callback?code=SECRETTOKEN123&access_token=SECRETTOKEN456&state=SESSION789 HTTP/1.1 Authorization: Bearer SECRETBEARER000"'

		# -------------------------------------------------------------------------
		# Step 5: Wait for Promtail tail + ingest lag
		# -------------------------------------------------------------------------
		log "waiting ${INGEST_LAG_SECONDS}s for Promtail to tail and ingest synthetic log"
		sleep "$INGEST_LAG_SECONDS"

		# -------------------------------------------------------------------------
		# Step 6: Port-forward Loki and query
		# -------------------------------------------------------------------------
		log "port-forwarding Loki svc to localhost:${LOCAL_PORT}"
		kubectl port-forward \
		  "svc/${LOKI_RELEASE}" \
		  "${LOCAL_PORT}:${LOKI_PORT}" \
		  -n "$NAMESPACE" &
		PF_PID=$!

		# Give port-forward a moment to establish, then confirm it is still alive so
		# a bind failure is reported as such instead of as a failed query.
		sleep 2
		kill -0 "$PF_PID" 2>/dev/null \
		  || fail "Loki port-forward exited prematurely"

		# Query Loki for log lines from the default namespace within the last 5 minutes.
		# The LogQL line filter keeps only lines containing "REDACTED"; Step 7 then
		# checks those lines for every REDACTED marker and for any raw token that
		# survived alongside them.
		log "querying Loki for scrubbed entries"
		# Plain epoch arithmetic works with both BSD and GNU date.
		QUERY_END_S="$(date -u +%s)"
		QUERY_START_S=$((QUERY_END_S - 300))
		# Under `set -e` a failed curl would otherwise abort here with no message.
		QUERY_RESPONSE=$(
		  curl -s -G \
		    "http://localhost:${LOCAL_PORT}/loki/api/v1/query_range" \
		    --data-urlencode 'query={namespace="default"} |= "REDACTED"' \
		    --data-urlencode "start=${QUERY_START_S}000000000" \
		    --data-urlencode "end=${QUERY_END_S}000000000" \
		    --data-urlencode 'limit=50'
		) || fail "Loki query_range request failed (curl exit status $?)"

		# -------------------------------------------------------------------------
		# Step 7: Assertions — verify all 4 scrub patterns
		#
		# Contract (matches Phase B spec + promtail-values.yaml):
		# ?code=SECRETTOKEN123 → code=REDACTED (absent: SECRETTOKEN123)
		# &access_token=SECRETTOKEN456 → access_token=REDACTED (absent: SECRETTOKEN456)
		# &state=SESSION789 → state=REDACTED (absent: SESSION789)
		# Authorization: Bearer SECRETBEARER000 → Bearer REDACTED (absent: SECRETBEARER000)
		# -------------------------------------------------------------------------
		log "asserting scrubbing correctness (all 4 patterns)"

		# Dump the raw Loki response and the recent Promtail logs to stderr so a
		# failed assertion can be diagnosed from the run output. Best-effort: a
		# failing `kubectl logs` must not pre-empt the `fail` call that follows.
		diag() {
		  log "DIAGNOSTIC: Loki query response:"
		  printf '%s\n' "$QUERY_RESPONSE" >&2
		  log "DIAGNOSTIC: last 50 lines of Promtail logs:"
		  # `>&2` alone is enough: stdout moves to stderr and stderr is already
		  # there. The earlier `2>&1 >&2` ordering pointed BOTH streams at stdout.
		  kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=promtail --tail=50 >&2 || true
		}

		# Assertion helpers. $1 is a fixed string to look for in the Loki response,
		# $2 is the failure message. A here-string feeds grep instead of
		# `echo … | grep -q`: with `set -o pipefail`, grep -q exiting on its first
		# match can SIGPIPE the writer, and the resulting non-zero pipeline status
		# reads as "no match" — for the raw-token checks that is a false pass.
		assert_response_contains() {
		  if ! grep -qF -- "$1" <<<"$QUERY_RESPONSE"; then
		    diag
		    fail "$2"
		  fi
		}

		assert_response_lacks() {
		  if grep -qF -- "$1" <<<"$QUERY_RESPONSE"; then
		    diag
		    fail "$2"
		  fi
		}

		# Assertion 1: Loki returned at least one stream. An empty `"result":[]`
		# still contains the word "result", so it is rejected explicitly; otherwise
		# a not-yet-ingested log would surface as a misleading "scrub stage not
		# working" failure further down.
		if ! grep -q '"result"' <<<"$QUERY_RESPONSE" \
		  || grep -Eq '"result"[[:space:]]*:[[:space:]]*\[[[:space:]]*\]' <<<"$QUERY_RESPONSE"; then
		  diag
		  fail "Loki returned no result block — Promtail may not have ingested the synthetic log yet"
		fi

		# --- Scrubbed markers present ---

		# Assertion 2a: code= is scrubbed
		assert_response_contains 'code=REDACTED' \
		  "'code=REDACTED' not found in Loki response — code= scrub stage not working"

		# Assertion 2b: access_token= is scrubbed
		assert_response_contains 'access_token=REDACTED' \
		  "'access_token=REDACTED' not found in Loki response — access_token= scrub stage not working"

		# Assertion 2c: state= is scrubbed
		assert_response_contains 'state=REDACTED' \
		  "'state=REDACTED' not found in Loki response — state= scrub stage not working"

		# Assertion 2d: Authorization Bearer is scrubbed
		assert_response_contains 'Bearer REDACTED' \
		  "'Bearer REDACTED' not found in Loki response — Authorization Bearer scrub stage not working"

		# --- Raw tokens absent ---

		# Assertion 3a: raw code= token is absent
		assert_response_lacks 'SECRETTOKEN123' \
		  "raw token 'SECRETTOKEN123' (code=) found in Loki response — scrubbing pipeline is NOT working"

		# Assertion 3b: raw access_token= token is absent
		assert_response_lacks 'SECRETTOKEN456' \
		  "raw token 'SECRETTOKEN456' (access_token=) found in Loki response — scrubbing pipeline is NOT working"

		# Assertion 3c: raw state= token is absent
		assert_response_lacks 'SESSION789' \
		  "raw token 'SESSION789' (state=) found in Loki response — scrubbing pipeline is NOT working"

		# Assertion 3d: raw Bearer token is absent
		assert_response_lacks 'SECRETBEARER000' \
		  "raw token 'SECRETBEARER000' (Authorization Bearer) found in Loki response — scrubbing pipeline is NOT working"

		log "PASS: all 4 scrub patterns verified — code=REDACTED, access_token=REDACTED, state=REDACTED, Bearer REDACTED present; all raw tokens absent"
		exit 0

-183

host-cp/observability/ndjson-span-sink.mjs

		// NDJSON span sink — zero-config observability for host-cp.
		//
		// Subscribes to the host-stream broadcaster and writes one JSON line per
		// `span` event to ~/.olam/logs/host.trace.ndjson. Each span carries the
		// minimum surface needed for `jq`-based triage: identity, timing, exit.
		//
		// Wire shape per line:
		// { traceId, spanId, parentSpanId, name, startedAt, durationMs,
		// attributes, events[], exit: { _tag: 'Success'\|'Failure', reason? } }
		//
		// Rotation: single level — at 50MB the file is renamed to `.1` and a
		// fresh file is opened. The previous `.1` (if any) is overwritten. We
		// keep at most one prior generation; deeper retention belongs to the
		// operator's normal disk-management tooling.
		//
		// Override path with OLAM_TRACE_LOG_PATH (set to /dev/null in tests that
		// don't care about file output, or to a temp file to assert on writes).

		import { open, mkdir, rename } from 'node:fs/promises';
		import { join, dirname } from 'node:path';
		import { homedir } from 'node:os';
		import { redactSensitive } from './redactor.mjs';

		// Rotation threshold in bytes (50 MiB).
		const DEFAULT_ROTATE_BYTES = 50 * 1024 * 1024;
		// Resolved once, at module load. `||` rather than `??` so that an
		// exported-but-empty OLAM_TRACE_LOG_PATH falls back to the default location
		// instead of handing '' to open().
		const DEFAULT_LOG_PATH =
		  process.env.OLAM_TRACE_LOG_PATH ||
		  join(homedir(), '.olam', 'logs', 'host.trace.ndjson');

		/**
		 * Create an NDJSON span sink that appends one JSON line per span to `logPath`.
		 *
		 * Writes are serialized through an internal promise chain, so lines land in
		 * the order `recordSpan` was called. Write and rotation failures are
		 * swallowed on purpose: tracing is best-effort and never rejects into the
		 * caller.
		 *
		 * @param {object} [options]
		 * @param {string} [options.logPath] Destination file; its parent directory is
		 *   created when missing.
		 * @param {number} [options.rotateBytes] Once the file reaches this many bytes
		 *   it is renamed to `<logPath>.1` and a fresh file is opened.
		 * @param {object} [options.hostStream] Optional broadcaster. When it exposes
		 *   `addSink`, an SSE adapter is registered that forwards `span` events to
		 *   `recordSpan`.
		 * @returns {Promise<{recordSpan: function(object=): Promise<void>, close: function(): Promise<void>}>}
		 */
		export async function createNdjsonSpanSink({
		  logPath = DEFAULT_LOG_PATH,
		  rotateBytes = DEFAULT_ROTATE_BYTES,
		  hostStream,
		} = {}) {
		  await mkdir(dirname(logPath), { recursive: true });
		  let fh = await open(logPath, 'a');
		  const initialStat = await fh.stat();
		  let bytesWritten = initialStat.size;
		  // Only regular files are rotated. The documented /dev/null override must
		  // never be renamed to /dev/null.1.
		  const rotatable = initialStat.isFile();
		  let closed = false;
		  let chain = Promise.resolve();

		  // Move the current file to `<logPath>.1` and continue in a fresh one.
		  async function rotate() {
		    await fh.close();
		    try {
		      await rename(logPath, `${logPath}.1`);
		    } finally {
		      // Reopen even when the rename failed: without a live handle every
		      // later span would be lost. Resetting the counter either way also
		      // defers the next attempt until another `rotateBytes` have been
		      // written, instead of retrying on every span.
		      fh = await open(logPath, 'a');
		      bytesWritten = 0;
		    }
		  }

		  async function writeLine(line) {
		    if (closed) return;
		    await fh.write(line);
		    bytesWritten += Buffer.byteLength(line);
		    if (rotatable && bytesWritten >= rotateBytes) {
		      await rotate();
		    }
		  }

		  /**
		   * Queue one span for writing.
		   *
		   * `exit` is passed through when it carries a valid `_tag`; otherwise a span
		   * lacking numeric `startedAt`/`endedAt` is recorded as a Failure (with the
		   * top-level `reason` when present) and a timed span as a Success.
		   *
		   * @param {object} [span]
		   * @returns {Promise<void>} Settles once the line has been written or dropped.
		   */
		  function recordSpan(span = {}) {
		    const {
		      name, startedAt, endedAt, attributes, events, exit,
		      traceId, spanId, parentSpanId, reason,
		    } = span;
		    const haveTimes = typeof endedAt === 'number' && typeof startedAt === 'number';
		    const durationMs = haveTimes ? endedAt - startedAt : null;
		    let finalExit;
		    if (exit && typeof exit === 'object' && (exit._tag === 'Success' || exit._tag === 'Failure')) {
		      finalExit = exit._tag === 'Failure' && exit.reason !== undefined
		        ? { _tag: 'Failure', reason: exit.reason }
		        : { _tag: exit._tag };
		    } else if (!haveTimes) {
		      finalExit = reason !== undefined ? { _tag: 'Failure', reason } : { _tag: 'Failure' };
		    } else {
		      finalExit = { _tag: 'Success' };
		    }
		    const record = {
		      traceId: traceId ?? null,
		      spanId: spanId ?? null,
		      parentSpanId: parentSpanId ?? null,
		      name: name ?? null,
		      startedAt: startedAt ?? null,
		      durationMs,
		      attributes: redactSensitive(attributes ?? {}),
		      events: redactSensitive(events ?? []),
		      exit: finalExit,
		    };
		    // Deliberate best-effort: a failed write or rotation drops this line but
		    // keeps the chain usable for the next span.
		    const next = chain
		      .then(() => writeLine(`${JSON.stringify(record)}\n`))
		      .catch(() => {});
		    chain = next;
		    return next;
		  }

		  let detach = null;
		  if (hostStream && typeof hostStream.addSink === 'function') {
		    detach = hostStream.addSink(createSseSpanAdapter((payload) => recordSpan(payload)));
		  }

		  return {
		    recordSpan,
		    async close() {
		      if (closed) return;
		      if (typeof detach === 'function') detach();
		      // Drain queued writes BEFORE flipping the closed flag — `writeLine`
		      // bails on `closed`, so flipping first would silently drop spans
		      // recorded just prior to shutdown.
		      await chain;
		      closed = true;
		      try { await fh.close(); } catch { /* already closed */ }
		    },
		  };
		}

		/**
		 * Subscribe an NDJSON sink to `@olam/auth-client`'s `betaResponseEmitter`.
		 * Each `beta-response` event becomes a `withCredential.beta-response` span
		 * with the beta payload exploded onto `attributes` — downstream `jq`
		 * consumers can query e.g.
		 *
		 *   jq 'select(.name == "withCredential.beta-response")
		 *       | {ts: .startedAt, cred: .attributes.credentialName,
		 *          cache: .attributes.cacheStatus,
		 *          thinking: .attributes.thinkingTokens,
		 *          latencyMs: .durationMs}' ~/.olam/logs/host.trace.ndjson
		 *
		 * Wire is opt-in (call from server boot). Returns a detach function so the
		 * subscription can be removed in tests or on shutdown.
		 *
		 * Pure additive: spans flowing from other sources (docker lifecycle,
		 * plan-orchestrator, etc.) are unaffected.
		 *
		 * @param {object} options
		 * @param {{recordSpan: Function}} options.sink Receives one span per event.
		 * @param {{on: Function}} options.emitter Event source; `off` or
		 *   `removeListener` is used to detach when available.
		 * @returns {Function} Detach function that removes the subscription.
		 * @throws {Error} When `sink.recordSpan` or `emitter.on` is not a function.
		 */
		export function attachBetaResponseEvents({ sink, emitter }) {
		  if (!sink || typeof sink.recordSpan !== 'function') {
		    throw new Error('attachBetaResponseEvents: sink.recordSpan required');
		  }
		  if (!emitter || typeof emitter.on !== 'function') {
		    throw new Error('attachBetaResponseEvents: emitter.on required');
		  }

		  const handler = (info) => {
		    const now = Date.now();
		    // NaN, ±Infinity and negative values all pass a bare `typeof === 'number'`
		    // check and would produce an unserializable or future `startedAt`, so
		    // anything that is not a finite, non-negative number counts as 0.
		    const reported = info?.latencyMs;
		    const latency = Number.isFinite(reported) && reported >= 0 ? reported : 0;
		    sink.recordSpan({
		      name: 'withCredential.beta-response',
		      startedAt: now - latency,
		      endedAt: now,
		      attributes: {
		        credentialName: info?.credentialName ?? null,
		        credId: info?.credId ?? null,
		        betas: Array.isArray(info?.betas) ? [...info.betas] : [],
		        cacheStatus: info?.cacheStatus ?? null,
		        thinkingTokens: info?.tokenCounts?.thinking ?? null,
		        statusCode: typeof info?.statusCode === 'number' ? info.statusCode : null,
		        extraHeaders: info?.extraHeaders && typeof info.extraHeaders === 'object'
		          ? { ...info.extraHeaders }
		          : {},
		      },
		      exit: { _tag: 'Success' },
		    });
		  };

		  emitter.on('beta-response', handler);
		  return () => {
		    // Only `on` was validated above, so do not assume `off` exists: fall back
		    // to `removeListener` and do nothing when the emitter offers neither.
		    const remove = typeof emitter.off === 'function' ? emitter.off : emitter.removeListener;
		    return typeof remove === 'function'
		      ? remove.call(emitter, 'beta-response', handler)
		      : undefined;
		  };
		}

		// Duck-typed ServerResponse for host-stream's `addSink`. Parses SSE frames
		// (`event: <type>\ndata: <json>\n\n`) and dispatches `event: span` payloads
		// to `onSpan`. All other event types are silently ignored — host-stream
		// also replays per-type snapshots on attach; the sink is created at boot
		// before any spans are broadcast, so replay is a no-op in practice.
		//
		// Frames may arrive split across `write` calls; the remainder after the last
		// complete frame is buffered until its terminating blank line shows up.
		function createSseSpanAdapter(onSpan) {
		  let buffer = '';
		  return {
		    writableEnded: false,
		    destroyed: false,
		    write(chunk) {
		      buffer += String(chunk);
		      let frameEnd;
		      while ((frameEnd = buffer.indexOf('\n\n')) !== -1) {
		        const lines = buffer.slice(0, frameEnd).split('\n');
		        buffer = buffer.slice(frameEnd + 2);
		        // Match the event field as a whole line wherever it sits in the
		        // frame. The terminator is stripped above, so a trailing
		        // `event: span` line has no `\n` after it and a substring test
		        // anchored on `\n` would miss it.
		        if (!lines.includes('event: span')) continue;
		        const dataLine = lines.find((l) => l.startsWith('data: '));
		        if (!dataLine) continue;
		        try { onSpan(JSON.parse(dataLine.slice(6))); } catch { /* malformed frame */ }
		      }
		      return true;
		    },
		    once() { /* no drain handling needed — in-memory adapter never backpressures */ },
		    end() { this.writableEnded = true; },
		  };
		}

-311

host-cp/observability/prom-no-double-grafana.sh

		#!/usr/bin/env bash
		# prom-no-double-grafana.sh — Phase C Task C1 e2e smoke test.
		#
		# Verifies:
		# 1. kube-prometheus-stack installs (Prometheus pod becomes Ready).
		# 2. ServiceMonitor CRD is Established before Phase B charts are upgraded.
		# 3. Phase B charts (Loki + Promtail + Grafana) are helm-upgraded to pick up
		# serviceMonitor.enabled: true now that the CRD exists.
		# 4. Exactly one Grafana Deployment is running in the cluster (no double-Grafana).
		# 5. Phase B's Grafana (olam-grafana) has exactly one Prometheus datasource
		# provisioned (from grafana-values.yaml datasources block added in C1).
		# 6. Prometheus is scraping at least one active target.
		#
		# Pre-conditions:
		# - kubectl context is set to a live k8s cluster.
		# - Phase B e2e (loki-ingest.sh + grafana-port-forward.sh + grafana-dashboard-persistence.sh)
		# has already run: olam-loki, olam-promtail, and olam-grafana releases are installed.
		# - The olam-grafana-admin Secret exists (created by grafana-port-forward.sh).
		# - helm, kubectl, curl, jq binaries available.
		#
		# Chart: prometheus-community/kube-prometheus-stack 85.2.0 (pinned; latest stable 2026-05-21).
		#
		# Idempotency: helm upgrade --install is idempotent; re-runs on an existing
		# cluster succeed. Port-forwards are killed on exit via trap.
		#
		# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C1

		set -euo pipefail

		NAMESPACE="monitoring"
		PROM_RELEASE="olam-prom"
		PROM_CHART_VERSION="85.2.0"
		GRAFANA_RELEASE="olam-grafana"
		GRAFANA_LOCAL_PORT="3001" # avoid collision if phase-b-e2e left a port-forward on 3000
		GRAFANA_SVC_PORT="80"
		PROM_LOCAL_PORT="9090"
		PF_BIND_SECONDS=5

		# Write one tagged progress line to stderr; all arguments are joined into a
		# single message.
		log() {
		  printf '%s %s\n' '[prom-no-double-grafana]' "$*" 1>&2
		}
		# Report a fatal condition on stderr, then terminate the script with status 1.
		fail() {
		  printf '%s FAIL: %s\n' '[prom-no-double-grafana]' "$*" 1>&2
		  exit 1
		}

		# -------------------------------------------------------------------------
		# Resolve repo root so helm -f paths work regardless of invocation cwd
		# -------------------------------------------------------------------------
		REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null \|\| pwd)"
		# When invoked from a published @pleri/olam-cli install (no monorepo), `olam
		# setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
		# peripheral-services/{helm-values,manifests} directory is reachable.
		# Monorepo callers leave it unset; the script falls back to the source dir
		# under packages/peripheral-services/.
		if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
		PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
		else
		PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
		fi

		# -------------------------------------------------------------------------
		# Exit handler — signals whichever port-forwards were started; a forwarder
		# that has already exited is ignored. Helm releases are left in place.
		# -------------------------------------------------------------------------
		GRAFANA_PF_PID=""
		PROM_PF_PID=""
		cleanup() {
		  local forwarder_pid
		  for forwarder_pid in "$GRAFANA_PF_PID" "$PROM_PF_PID"; do
		    if [[ -n "$forwarder_pid" ]]; then
		      kill "$forwarder_pid" 2>/dev/null || true
		    fi
		  done
		}
		trap cleanup EXIT

		# -------------------------------------------------------------------------
		# Pre-flight
		# -------------------------------------------------------------------------
		command -v helm >/dev/null 2>&1 \|\| fail "helm not installed"
		command -v kubectl >/dev/null 2>&1 \|\| fail "kubectl not installed"
		command -v curl >/dev/null 2>&1 \|\| fail "curl not installed"
		command -v jq >/dev/null 2>&1 \|\| fail "jq not installed"
		kubectl cluster-info >/dev/null 2>&1 \|\| fail "kubectl: no reachable cluster; set KUBECONFIG"

		log "pre-flight checks passed"

		# Verify Phase B pre-conditions
		for release in olam-loki olam-promtail "$GRAFANA_RELEASE"; do
		helm status "$release" -n "$NAMESPACE" >/dev/null 2>&1 \
		\|\| fail "Phase B release '$release' not found in namespace $NAMESPACE — run phase-b-e2e first"
		done
		log "Phase B pre-conditions satisfied (olam-loki, olam-promtail, olam-grafana releases found)"

		# -------------------------------------------------------------------------
		# Step 1: Add prometheus-community repo and install kube-prometheus-stack
		# -------------------------------------------------------------------------
		helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null \|\| true
		helm repo update prometheus-community

		log "installing prometheus-community/kube-prometheus-stack ($PROM_RELEASE) version $PROM_CHART_VERSION"
		helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stack \
		--version "$PROM_CHART_VERSION" \
		--namespace "$NAMESPACE" \
		--create-namespace \
		-f "$PERIPHERAL_SERVICES_DIR/helm-values/kube-prom-stack-values.yaml" \
		--wait \
		--timeout "${OLAM_HELM_TIMEOUT:-600s}"

		log "kube-prometheus-stack helm install complete"

		# -------------------------------------------------------------------------
		# Step 2: Wait for ServiceMonitor CRD to be Established
		# This is the gate before upgrading Phase B charts — the CRD must exist
		# for serviceMonitor.enabled: true to produce a valid ServiceMonitor object.
		# -------------------------------------------------------------------------
		log "waiting for ServiceMonitor CRD to be Established (60s)"
		kubectl wait \
		--for=condition=established \
		crd/servicemonitors.monitoring.coreos.com \
		--timeout=60s

		log "ServiceMonitor CRD Established"

		# -------------------------------------------------------------------------
		# Step 3: Helm-upgrade Phase B charts to enable ServiceMonitor at RUNTIME
		#
		# The source-of-truth values files keep serviceMonitor.enabled: false so a
		# standalone Phase B install (without kube-prometheus-stack) does not
		# hard-fail with "no matches for kind ServiceMonitor". We flip the toggle
		# at runtime here, AFTER the CRD is Established, via --set overrides. This
		# preserves Phase B's standalone-installability invariant while wiring
		# Prometheus discovery when kube-prom-stack is present.
		#
		# NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor)
		# — chart-version-specific path.
		# -------------------------------------------------------------------------
		# Chart version pins MUST match the ones in phase-b-e2e's loki-ingest.sh +
		# grafana-port-forward.sh. Without --version, helm pulls latest from the repo;
		# the latest charts may reference new template values not present in our
		# values files (e.g., Loki 6.8.x references .Values.loki.ui.enabled which is
		# nil in our 6.7.4-shaped values, producing a nil-pointer template error
		# during upgrade).
		LOKI_CHART_VERSION="6.7.4"
		PROMTAIL_CHART_VERSION="6.16.6"
		GRAFANA_CHART_VERSION="8.5.2"

		# Upgrade one Phase B release at its pinned chart version, layering a single
		# --set override on top of the release's values file and reused values.
		#   $1 release name      $2 chart reference     $3 pinned chart version
		#   $4 values file name under helm-values/      $5 --set expression
		upgrade_with_service_monitor() {
		  helm upgrade "$1" "$2" \
		    --version "$3" \
		    --namespace "$NAMESPACE" \
		    -f "$PERIPHERAL_SERVICES_DIR/helm-values/$4" \
		    --wait \
		    --timeout "${OLAM_HELM_TIMEOUT:-600s}" \
		    --reuse-values \
		    --set "$5"
		}

		log "upgrading Phase B charts with runtime --set serviceMonitor.enabled=true (pinned versions)"

		# Loki keeps its toggle under monitoring.serviceMonitor; the other two charts
		# use the top-level serviceMonitor key.
		upgrade_with_service_monitor olam-loki grafana/loki \
		  "$LOKI_CHART_VERSION" loki-values.yaml monitoring.serviceMonitor.enabled=true

		log "olam-loki upgraded (ServiceMonitor enabled)"

		upgrade_with_service_monitor olam-promtail grafana/promtail \
		  "$PROMTAIL_CHART_VERSION" promtail-values.yaml serviceMonitor.enabled=true

		log "olam-promtail upgraded (ServiceMonitor enabled)"

		upgrade_with_service_monitor "$GRAFANA_RELEASE" grafana/grafana \
		  "$GRAFANA_CHART_VERSION" grafana-values.yaml serviceMonitor.enabled=true

		log "$GRAFANA_RELEASE upgraded (ServiceMonitor enabled; Prometheus datasource provisioned)"

		# -------------------------------------------------------------------------
		# Step 4: Wait for Prometheus pod Ready
		# -------------------------------------------------------------------------
		log "waiting for Prometheus pod Ready (300s)"
		# Block until every pod carrying the Prometheus name label reports Ready,
		# giving up after 300 seconds.
		kubectl wait pod \
		  --namespace "$NAMESPACE" \
		  --selector "app.kubernetes.io/name=prometheus" \
		  --for=condition=ready \
		  --timeout=300s

		log "Prometheus pod Ready"

		# -------------------------------------------------------------------------
		# Step 5: Assertion — exactly one Grafana Deployment in the cluster
		# This catches any regression where kube-prometheus-stack's bundled Grafana
		# sub-chart accidentally gets enabled.
		# -------------------------------------------------------------------------
		log "asserting exactly 1 Grafana Deployment in namespace $NAMESPACE"
		# `-o name` prints one line per matching Deployment, so the line count is the
		# Deployment count; `tr` strips the padding some `wc` implementations emit.
		# The pipes must be real pipes: an escaped `\|` would be handed to kubectl as
		# a literal argument instead of connecting the commands.
		GRAFANA_DEPS=$(kubectl get deployment \
		  -n "$NAMESPACE" \
		  -l "app.kubernetes.io/name=grafana" \
		  -o name \
		  | wc -l \
		  | tr -d ' ')

		if [ "$GRAFANA_DEPS" != "1" ]; then
		  log "FAIL: expected exactly 1 Grafana Deployment, found $GRAFANA_DEPS"
		  # Show the offending Deployments on stderr to aid triage.
		  kubectl get deployment -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" >&2
		  fail "double-Grafana detected — kube-prometheus-stack's grafana.enabled must be false"
		fi

		log "PASS: exactly 1 Grafana Deployment found"

		# -------------------------------------------------------------------------
		# Step 6: Assertion — Grafana has exactly one Prometheus datasource
		# Re-read the admin password from the Secret (grafana-port-forward.sh created it).
		# Use port 3001 to avoid colliding with any live phase-b-e2e port-forward on 3000.
		# -------------------------------------------------------------------------
		log "reading admin password from Secret olam-grafana-admin"
		# Secret data is base64-encoded; decode it to get the plain password.
		GRAFANA_ADMIN_PW=$(kubectl get secret olam-grafana-admin \
		  -n "$NAMESPACE" \
		  -o jsonpath='{.data.admin-password}' \
		  | base64 -d)

		log "port-forwarding svc/$GRAFANA_RELEASE $GRAFANA_LOCAL_PORT:$GRAFANA_SVC_PORT"
		kubectl port-forward \
		  -n "$NAMESPACE" \
		  "svc/$GRAFANA_RELEASE" \
		  "${GRAFANA_LOCAL_PORT}:${GRAFANA_SVC_PORT}" &
		GRAFANA_PF_PID=$!

		log "waiting ${PF_BIND_SECONDS}s for Grafana port-forward to bind"
		sleep "$PF_BIND_SECONDS"
		# kill -0 sends no signal; it only checks the background process still exists.
		kill -0 "$GRAFANA_PF_PID" 2>/dev/null \
		  || fail "Grafana port-forward process exited prematurely"

		log "asserting exactly 1 Prometheus datasource in Grafana (GET /api/datasources)"
		# The failure handler sits OUTSIDE the command substitution: anything run
		# inside $(...) executes in a subshell, so a `fail` there could only end the
		# subshell (assuming `fail` exits — its definition is not in this section),
		# leaving the script to continue with an empty DATASOURCES.
		DATASOURCES=$(curl -sf \
		  -u "admin:${GRAFANA_ADMIN_PW}" \
		  "http://localhost:${GRAFANA_LOCAL_PORT}/api/datasources") \
		  || {
		    kubectl logs -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" --tail=30 >&2 || true
		    fail "GET /api/datasources failed — Grafana not reachable on port $GRAFANA_LOCAL_PORT"
		  }

		# jq -e sets a non-zero exit status when the final expression is false/null.
		if ! echo "$DATASOURCES" | jq -e 'map(select(.type == "prometheus")) | length == 1' >/dev/null 2>&1; then
		  log "FAIL: Grafana does not have exactly 1 Prometheus datasource"
		  echo "$DATASOURCES" | jq . >&2
		  fail "Prometheus datasource not provisioned — check datasources block in grafana-values.yaml"
		fi

		PROM_URL=$(echo "$DATASOURCES" | jq -r 'map(select(.type == "prometheus")) | .[0].url')
		log "PASS: Grafana has exactly 1 Prometheus datasource (url=$PROM_URL)"

		# -------------------------------------------------------------------------
		# Step 7: Assertion — Prometheus is scraping at least one active target
		# -------------------------------------------------------------------------
		log "port-forwarding svc/prometheus-operated $PROM_LOCAL_PORT:9090"
		kubectl port-forward \
		  -n "$NAMESPACE" \
		  "svc/prometheus-operated" \
		  "${PROM_LOCAL_PORT}:9090" &
		PROM_PF_PID=$!

		log "waiting ${PF_BIND_SECONDS}s for Prometheus port-forward to bind"
		sleep "$PF_BIND_SECONDS"
		# kill -0 sends no signal; it only checks the background process still exists.
		kill -0 "$PROM_PF_PID" 2>/dev/null \
		  || fail "Prometheus port-forward process exited prematurely"

		log "querying Prometheus /api/v1/targets for active targets"
		# `fail` is attached to the assignment, not placed inside $(...): a command
		# substitution runs in a subshell, so a failure handled there could not stop
		# this script (assuming `fail` exits — its definition is not in this section).
		TARGETS=$(curl -sf "http://localhost:${PROM_LOCAL_PORT}/api/v1/targets") \
		  || fail "GET /api/v1/targets failed — Prometheus not reachable on port $PROM_LOCAL_PORT"

		# Guard the parse as well: an empty ACTIVE would make the numeric test below
		# error out and fall through to the PASS log.
		ACTIVE=$(echo "$TARGETS" | jq '.data.activeTargets | length') \
		  || fail "could not parse /api/v1/targets response from Prometheus"
		if [ "$ACTIVE" -lt 1 ]; then
		  log "FAIL: Prometheus has 0 active scrape targets"
		  echo "$TARGETS" | jq '.data.activeTargets' >&2
		  fail "Prometheus has no active targets — check ServiceMonitor CRD and scrapeConfig"
		fi

		log "PASS: $ACTIVE active scrape target(s) found in Prometheus"

		# -------------------------------------------------------------------------
		# Assertion C4: Recording rules from 95-prom-recording-rules.yaml are loaded
		#
		# The 9[0-9]-prom-* glob in apply-manifests.sh skips this file (requires
		# kube-prom-stack CRDs to exist). We kubectl apply it here, then poll
		# /api/v1/rules until the olam-http-aggregations group appears.
		# The port-forward on PROM_LOCAL_PORT is already open from Step 7 above.
		# -------------------------------------------------------------------------
		# From here on PROM_URL is the local port-forward address (it previously held
		# the datasource URL reported by Grafana).
		PROM_URL="http://localhost:${PROM_LOCAL_PORT}"

		log "applying 95-prom-recording-rules.yaml (skipped by apply-manifests due to 9[0-9]-prom-* filter)"
		kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/95-prom-recording-rules.yaml"

		# Prometheus operator reconcile + config reload can take ~60-90s (C2 lesson).
		# Poll /api/v1/rules until our group appears (up to 180s).
		RECORDING_RULES_TIMEOUT=180
		log "polling ${PROM_URL}/api/v1/rules for olam-http-aggregations group (up to ${RECORDING_RULES_TIMEOUT}s)"
		elapsed=0
		while [ "$elapsed" -lt "$RECORDING_RULES_TIMEOUT" ]; do
		  # jq -e exits non-zero when the filter yields no output, i.e. the rule
		  # is not loaded yet; curl errors are swallowed so the loop keeps polling.
		  if curl -sf "${PROM_URL}/api/v1/rules" 2>/dev/null \
		    | jq -e '.data.groups[] | select(.name == "olam-http-aggregations") | .rules[] | select(.name == "olam:http_requests:rate5m_by_service")' >/dev/null 2>&1; then
		    log "PASS: olam-http-aggregations rule group loaded after ${elapsed}s"
		    break
		  fi
		  sleep 10
		  elapsed=$((elapsed + 10))
		done
		# A successful poll breaks out with elapsed still below the timeout, so
		# reaching the timeout means the rule never appeared.
		if [ "$elapsed" -ge "$RECORDING_RULES_TIMEOUT" ]; then
		  log "FAIL: olam-http-aggregations rule group not found in /api/v1/rules within ${RECORDING_RULES_TIMEOUT}s"
		  curl -sf "${PROM_URL}/api/v1/rules" | jq '.data.groups[] | .name' >&2 || true
		  fail "PrometheusRule not loaded by operator"
		fi

		# -------------------------------------------------------------------------
		# Final
		# -------------------------------------------------------------------------
		log "PASS: kube-prometheus-stack installed; single Grafana confirmed; Prometheus datasource provisioned; $ACTIVE active target(s); recording rules loaded — Tasks C1+C4 verified"
		exit 0

-72

host-cp/observability/redactor.mjs

		// Privacy Guard — regex-based auto-redactor for trace + recovery ledger.
		//
		// Deep-walks an object, finds string values, applies an ordered list of
		// regex patterns, returns a redacted COPY (immutable; input untouched).
		// Each match is replaced with `<redacted:<kind>>`.
		//
		// Default-ON patterns (7): anthropic, openai, aws, gh-pat, jwt, bearer, slack.
		// Opt-in (env-gated): email PII (OLAM_REDACT_PII=1), high-entropy strings
		// (OLAM_REDACT_HIGH_ENTROPY=1). Hard short-circuit: OLAM_REDACTION_DISABLED=1.
		//
		// Precedence matters: anthropic runs before openai (otherwise the OpenAI
		// `sk-...` regex would steal `sk-ant-...` and emit the wrong tag). Bearer
		// runs after the high-specificity key patterns so a bearer-wrapped key
		// gets the tighter tag.

		// Always-on patterns, applied in array order. Order is load-bearing:
		// anthropic precedes openai (both start with `sk-`), and bearer comes last so
		// a bearer-wrapped key has already been replaced by its more specific tag.
		// An entry may carry its own `replacement`; otherwise `<redacted:<kind>>` is used.
		const DEFAULT_PATTERNS = [
		  { kind: 'anthropic-key', re: /\bsk-ant-(?:api|admin)[A-Za-z0-9_-]{20,}\b/g },
		  { kind: 'openai-key', re: /\bsk-(?:proj-)?[A-Za-z0-9_-]{32,}\b/g },
		  { kind: 'aws-key', re: /\bAKIA[A-Z0-9]{16}\b/g },
		  { kind: 'gh-token', re: /\bgh[poursa]_[A-Za-z0-9_]{36,}\b/g },
		  { kind: 'jwt', re: /\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/g },
		  { kind: 'slack-token', re: /\bxox[abposr]-[A-Za-z0-9-]{10,}\b/g },
		  { kind: 'bearer', re: /Bearer\s+[A-Za-z0-9._~+/-]+=*/gi, replacement: 'Bearer <redacted:bearer>' },
		];

		// Opt-in patterns, gated by OLAM_REDACT_PII / OLAM_REDACT_HIGH_ENTROPY.
		const EMAIL_PATTERN = { kind: 'email', re: /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi };
		// NOTE(review): this pattern has no `i` flag, so it only matches upper-case
		// runs, and every allowlist entry is shorter than 32 characters — confirm
		// both are intended.
		const HIGH_ENTROPY_PATTERN = { kind: 'high-entropy', re: /\b[A-Z0-9_-]{32,}\b/g };
		const HIGH_ENTROPY_ALLOWLIST = new Set(['UUID', 'CHUNK_ID', '__filename', '__dirname']);

		/**
		 * Redact secrets in a single string.
		 *
		 * Returns the input untouched when OLAM_REDACTION_DISABLED=1. Otherwise runs
		 * every default pattern in order, then the email pattern when
		 * OLAM_REDACT_PII=1, then the high-entropy pattern when
		 * OLAM_REDACT_HIGH_ENTROPY=1.
		 *
		 * @param {string} s text to scan
		 * @returns {string} text with each match replaced by its redaction tag
		 */
		function redactString(s) {
		  if (process.env.OLAM_REDACTION_DISABLED === '1') return s;
		  let out = s;
		  for (const { kind, re, replacement } of DEFAULT_PATTERNS) {
		    out = out.replace(re, replacement ?? `<redacted:${kind}>`);
		  }
		  if (process.env.OLAM_REDACT_PII === '1') {
		    out = out.replace(EMAIL_PATTERN.re, `<redacted:${EMAIL_PATTERN.kind}>`);
		  }
		  if (process.env.OLAM_REDACT_HIGH_ENTROPY === '1') {
		    // Allowlisted tokens and already-emitted redaction tags are kept as-is.
		    out = out.replace(HIGH_ENTROPY_PATTERN.re, (m) =>
		      (HIGH_ENTROPY_ALLOWLIST.has(m) || /^<redacted:/.test(m))
		        ? m
		        : `<redacted:${HIGH_ENTROPY_PATTERN.kind}>`,
		    );
		  }
		  return out;
		}

		/**
		 * Produce a redacted copy of `value` by deep-walking it and passing every
		 * string through the redaction patterns. The input itself is left
		 * untouched; non-string primitives and null come back as-is, and a
		 * reference cycle is rendered as `'<cycle>'`.
		 *
		 * Setting OLAM_REDACTION_DISABLED=1 short-circuits the walk and hands back
		 * the very same value that was passed in.
		 *
		 * @template T
		 * @param {T} value
		 * @returns {T}
		 */
		export function redactSensitive(value) {
		  const redactionOff = process.env.OLAM_REDACTION_DISABLED === '1';
		  return redactionOff ? value : walk(value, new WeakSet());
		}

		/**
		 * Recursively copy `value`, redacting every string on the way.
		 *
		 * `seen` holds only the objects on the current path from the root (the
		 * ancestors of the node being visited). An object is removed again once its
		 * subtree is finished, so the same object referenced from two sibling
		 * positions is copied twice rather than being mistaken for a cycle; only a
		 * reference back to an ancestor yields `'<cycle>'`.
		 *
		 * Arrays are mapped element-wise; any other object is copied as a plain
		 * object over its own enumerable string keys.
		 *
		 * @param {unknown} value node to copy
		 * @param {WeakSet<object>} seen objects currently being walked
		 * @returns {unknown} redacted copy (primitives are returned as-is)
		 */
		function walk(value, seen) {
		  if (typeof value === 'string') return redactString(value);
		  if (value === null || typeof value !== 'object') return value;
		  if (seen.has(value)) return '<cycle>';
		  seen.add(value);
		  try {
		    if (Array.isArray(value)) return value.map((v) => walk(v, seen));
		    const out = {};
		    for (const k of Object.keys(value)) {
		      out[k] = walk(value[k], seen);
		    }
		    return out;
		  } finally {
		    // Leaving this subtree: the object is no longer an ancestor.
		    seen.delete(value);
		  }
		}

-267

host-cp/observability/trace-summary.mjs

		// Trace summary — operator triage digest over the NDJSON span trace.
		//
		// The NDJSON span sink (see `ndjson-span-sink.mjs`) writes one JSON line
		// per span to ~/.olam/logs/host.trace.ndjson. Operators triage it today
		// with hand-typed `jq` one-liners (README § Observability): "longest 5
		// spans", "all failed spans", "failure-kind tally". This module codifies
		// those recipes into ONE digest so the common questions get one answer
		// without remembering jq incantations.
		//
		// Design:
		// - `summarizeSpans(spans, opts)` is PURE — no I/O. Given an array of
		// parsed span records (the exact shape the sink writes) it returns a
		// digest object. This is the unit-testable core.
		// - `parseTrace(ndjsonText)` turns raw file bytes into { spans, skipped }.
		// Malformed lines (truncated tail line, partial write mid-rotation)
		// are COUNTED, never thrown — triage tooling must survive a corrupt
		// line, not die on it.
		// - `summarizeTraceFile(path, opts)` is the thin file-reading wrapper.
		// - `formatDigest(digest)` renders a human-readable report for the CLI.
		//
		// Read-only + additive: this module never writes the trace, never changes
		// the line schema. It only READS fields the sink already emits
		// (durationMs, exit._tag, exit.reason, name, attributes.failureKind).

		import { readFile } from 'node:fs/promises';

		// Default `topN` for summarizeSpans: the cap on the `slowest` and
		// `recentFailures` lists when the caller does not pass one.
		const DEFAULT_TOP_N = 5;

		/**
		 * Parse NDJSON trace text into spans, tolerating malformed lines.
		 *
		 * Blank lines are ignored. A line is counted in `skipped` (never thrown)
		 * when it is not valid JSON, or when it is valid JSON but not an object
		 * record (`null`, a number, a string, an array) — span records are
		 * objects, so anything else cannot be a span. Nullish input is treated as
		 * empty text.
		 *
		 * @param {string} text raw file contents
		 * @returns {{ spans: object[], skipped: number }}
		 */
		export function parseTrace(text) {
		  const spans = [];
		  let skipped = 0;
		  if (text == null) return { spans, skipped };
		  for (const line of String(text).split('\n')) {
		    // trim() also drops the `\r` of CRLF line endings.
		    const trimmed = line.trim();
		    if (trimmed === '') continue;
		    let record;
		    try {
		      record = JSON.parse(trimmed);
		    } catch {
		      // Truncated tail line or a partial write straddling rotation — the
		      // append-only log can leave one half-line. Triage must not crash on
		      // it; count and move on.
		      skipped += 1;
		      continue;
		    }
		    if (record === null || typeof record !== 'object' || Array.isArray(record)) {
		      skipped += 1;
		      continue;
		    }
		    spans.push(record);
		  }
		  return { spans, skipped };
		}

		// True only for a span whose exit record is tagged 'Failure'; spans that
		// are nullish or carry no exit record are not failures.
		function isFailure(span) {
		  if (span == null || span.exit == null) return false;
		  return span.exit._tag === 'Failure';
		}

		/**
		 * Compute a triage digest over parsed spans. Pure.
		 *
		 * `topN` caps both the `slowest` and `recentFailures` lists. It is
		 * normalized to a non-negative integer first: fractional values are
		 * floored, negative values and `0` yield empty lists, and a value that is
		 * not a number falls back to the default. (Without this, `topN: 0` produced
		 * an empty `slowest` but ALL failures in `recentFailures`, because
		 * `slice(-0)` is `slice(0)`.)
		 *
		 * A non-array `spans` is treated as an empty list.
		 *
		 * @param {object[]} spans
		 * @param {{ topN?: number }} [opts]
		 * @returns {{
		 *   totalSpans: number,
		 *   failures: number,
		 *   successes: number,
		 *   failureRate: number,
		 *   slowest: object[],
		 *   recentFailures: object[],
		 *   failureReasons: { reason: string, count: number }[],
		 *   failureKinds: { kind: string, count: number }[],
		 *   byName: { name: string, count: number, failures: number, meanMs: number|null, maxMs: number|null }[],
		 * }}
		 */
		export function summarizeSpans(spans, { topN = DEFAULT_TOP_N } = {}) {
		  const list = Array.isArray(spans) ? spans : [];
		  const requested = Number(topN);
		  const limit = Number.isNaN(requested) ? DEFAULT_TOP_N : Math.max(0, Math.floor(requested));

		  const totalSpans = list.length;
		  const failingSpans = list.filter(isFailure);
		  const failures = failingSpans.length;
		  const successes = totalSpans - failures;
		  const failureRate = totalSpans === 0 ? 0 : failures / totalSpans;

		  // Slowest spans by durationMs. Spans with a null duration (in-flight or
		  // missing endedAt) are excluded — they carry no comparable cost signal.
		  const timed = list.filter((s) => typeof s?.durationMs === 'number');
		  const slowest = [...timed]
		    .sort((a, b) => b.durationMs - a.durationMs)
		    .slice(0, limit)
		    .map(projectSpan);

		  // Recent failures — the trace is append-only, so the last failures in
		  // file order are the most recent. Take the tail, newest first. The zero
		  // case is explicit because slice(-0) would return the whole array.
		  const recentFailures = limit === 0
		    ? []
		    : failingSpans.slice(-limit).reverse().map(projectSpan);

		  const failureReasons = tally(
		    failingSpans,
		    (s) => (s?.exit?.reason != null ? String(s.exit.reason) : '(no reason)'),
		    'reason',
		  );

		  // failureKind is the world.lifecycle attribute the README already greps
		  // for; surface it as a first-class tally regardless of span name so
		  // recovery-relevant failures aggregate even when span names differ.
		  const failureKinds = tally(
		    list.filter((s) => s?.attributes?.failureKind != null),
		    (s) => String(s.attributes.failureKind),
		    'kind',
		  );

		  const byName = aggregateByName(list);

		  return {
		    totalSpans,
		    failures,
		    successes,
		    failureRate,
		    slowest,
		    recentFailures,
		    failureReasons,
		    failureKinds,
		    byName,
		  };
		}

		// Reduce a raw span record to the handful of fields the digest reports.
		// Missing fields become null; durationMs/startedAt are kept only when they
		// are numbers, and the exit reason is stringified when present.
		function projectSpan(s) {
		  const numberOrNull = (candidate) => (typeof candidate === 'number' ? candidate : null);
		  const exitReason = s?.exit?.reason;
		  return {
		    name: s?.name ?? null,
		    traceId: s?.traceId ?? null,
		    spanId: s?.spanId ?? null,
		    durationMs: numberOrNull(s?.durationMs),
		    startedAt: numberOrNull(s?.startedAt),
		    reason: exitReason == null ? null : String(exitReason),
		  };
		}

		// Count how many spans map to each key produced by `keyFn`. Every result
		// row is `{ count, [label]: key }`, where `label` names the key field
		// (`reason` for failure reasons, `kind` for failure kinds). Rows are
		// ordered by count, highest first, so the dominant cause leads.
		function tally(spans, keyFn, label) {
		  const frequency = new Map();
		  for (const span of spans) {
		    const key = keyFn(span);
		    const previous = frequency.get(key) ?? 0;
		    frequency.set(key, previous + 1);
		  }
		  const rows = Array.from(frequency, ([key, count]) => ({ count, [label]: key }));
		  rows.sort((left, right) => right.count - left.count);
		  return rows;
		}

		/**
		 * Roll spans up by name. Each row carries the span count, how many of
		 * them failed, and the mean and maximum `durationMs` over the spans that
		 * have a numeric duration (both null when none do). Spans without a name
		 * are grouped under '(unnamed)'. Rows are ordered by count, busiest first.
		 */
		function aggregateByName(spans) {
		  const statsByName = new Map();
		  for (const span of spans) {
		    const rawName = span?.name;
		    const key = rawName != null ? String(rawName) : '(unnamed)';
		    if (!statsByName.has(key)) {
		      statsByName.set(key, { count: 0, failures: 0, totalMs: 0, timedCount: 0, maxMs: null });
		    }
		    const stats = statsByName.get(key);
		    stats.count += 1;
		    if (isFailure(span)) stats.failures += 1;

		    const ms = span?.durationMs;
		    if (typeof ms !== 'number') continue;
		    stats.totalMs += ms;
		    stats.timedCount += 1;
		    if (stats.maxMs === null) {
		      stats.maxMs = ms;
		    } else {
		      stats.maxMs = Math.max(stats.maxMs, ms);
		    }
		  }

		  const rows = Array.from(statsByName, ([name, stats]) => ({
		    name,
		    count: stats.count,
		    failures: stats.failures,
		    meanMs: stats.timedCount === 0 ? null : stats.totalMs / stats.timedCount,
		    maxMs: stats.maxMs,
		  }));
		  rows.sort((left, right) => right.count - left.count);
		  return rows;
		}

		/**
		 * Load a trace file from disk and digest it. A file that does not exist
		 * (ENOENT) is not an error: the result is the digest of zero spans with
		 * `missing: true`, so an operator with no spans yet gets a clean
		 * zero-state. Any other read error is rethrown unchanged.
		 *
		 * The digest is extended with `skipped` (malformed line count) and
		 * `missing`.
		 *
		 * @param {string} path
		 * @param {{ topN?: number }} [opts]
		 */
		export async function summarizeTraceFile(path, opts = {}) {
		  let contents = null;
		  try {
		    contents = await readFile(path, 'utf8');
		  } catch (err) {
		    if (err?.code !== 'ENOENT') throw err;
		  }

		  if (contents === null) {
		    return { ...summarizeSpans([], opts), skipped: 0, missing: true };
		  }

		  const parsed = parseTrace(contents);
		  return { ...summarizeSpans(parsed.spans, opts), skipped: parsed.skipped, missing: false };
		}

		// Render a duration for the report: '—' when absent, whole milliseconds
		// below one second, seconds with two decimals otherwise. The unit is chosen
		// from the ROUNDED millisecond value, so a duration such as 999.6 prints
		// as '1.00s' instead of the out-of-range '1000ms'.
		function fmtMs(ms) {
		  if (ms == null) return '—';
		  const wholeMs = Math.round(ms);
		  if (wholeMs >= 1000) return `${(ms / 1000).toFixed(2)}s`;
		  return `${wholeMs}ms`;
		}

		/**
		 * Turn a digest into the plain-text report printed by the CLI.
		 *
		 * The report opens with a title (plus the file path when `digest.path` is
		 * set). A `missing` digest gets a single "no trace file yet" line and
		 * nothing else. Otherwise a totals line follows, and then one section per
		 * non-empty list — slowest spans, recent failures, failure kinds, failure
		 * reasons, per-name aggregates — each preceded by a blank line.
		 *
		 * @param {ReturnType<typeof summarizeSpans> & { skipped?: number, missing?: boolean, path?: string }} digest
		 * @returns {string}
		 */
		export function formatDigest(digest) {
		  const title = `Trace summary${digest.path ? ` (${digest.path})` : ''}`;
		  if (digest.missing) {
		    return [title, ' no trace file yet — nothing recorded.'].join('\n');
		  }

		  const report = [title];
		  // Append a blank separator, a heading, then one rendered row per item.
		  const addSection = (heading, items, renderRow) => {
		    report.push('');
		    report.push(heading);
		    for (const item of items) report.push(renderRow(item));
		  };
		  const traceTag = (traceId) => (traceId ? ` [${traceId}]` : '');
		  const countCell = (count, width) => String(count).padStart(width);
		  const durationCell = (ms) => fmtMs(ms).padStart(7);

		  const failedPct = (digest.failureRate * 100).toFixed(1);
		  const malformedNote = digest.skipped ? ` · ${digest.skipped} malformed line(s) skipped` : '';
		  report.push(
		    ` ${digest.totalSpans} spans · ${digest.failures} failed (${failedPct}%) · ${digest.successes} ok${malformedNote}`,
		  );

		  if (digest.slowest.length) {
		    addSection(
		      `Top ${digest.slowest.length} slowest:`,
		      digest.slowest,
		      (s) => ` ${durationCell(s.durationMs)} ${s.name ?? '(unnamed)'}${traceTag(s.traceId)}`,
		    );
		  }

		  if (digest.recentFailures.length) {
		    addSection(
		      `Recent failures (${digest.recentFailures.length}):`,
		      digest.recentFailures,
		      (f) => ` ${f.name ?? '(unnamed)'}: ${f.reason ?? '(no reason)'}${traceTag(f.traceId)}`,
		    );
		  }

		  if (digest.failureKinds.length) {
		    addSection('Failure kinds:', digest.failureKinds, (k) => ` ${countCell(k.count, 4)} ${k.kind}`);
		  }

		  if (digest.failureReasons.length) {
		    addSection('Failure reasons:', digest.failureReasons, (r) => ` ${countCell(r.count, 4)} ${r.reason}`);
		  }

		  if (digest.byName.length) {
		    addSection(
		      'By span name (count · failures · mean · max):',
		      digest.byName,
		      (n) =>
		        ` ${countCell(n.count, 5)} · ${countCell(n.failures, 4)}f · ${durationCell(n.meanMs)} · ${durationCell(n.maxMs)} ${n.name}`,
		    );
		  }

		  return report.join('\n');
		}

-159

host-cp/peripheral-services/helm-values/grafana-values.yaml

		# Grafana Helm values — k3s-ingress-observability Phase B Task B2
		#
		# STANDALONE grafana/grafana chart per OQ-p3-4 + Decision 16.
		# - This is NOT the Grafana bundled with kube-prometheus-stack.
		# - Phase C kube-prometheus-stack MUST set `grafana.enabled: false`
		# explicitly to prevent a second Grafana Deployment from landing.
		# - Port-forward only — NEVER expose via Traefik IngressRoute.
		# See T7 in DESIGN.md: secret exfil mitigated by no ingress surface.
		#
		# Chart: grafana/grafana; pinned to 8.5.2 (latest stable as of 2026-05-20).
		# Upgrade discipline: chart version is embedded in the e2e script comment.

		# -------------------------------------------------------------------------
		# Admin credentials — loaded from a pre-existing Secret, NOT from chart
		# values. Secret is created by scripts/e2e/grafana-port-forward.sh before
		# helm install, or by the operator following the procedure in
		# packages/peripheral-services/manifests/README.md (§ "Grafana admin secret").
		# The placeholder manifest (70-grafana-secret.yaml) was removed 2026-05-21
		# (dogfood finding #4) because `kubectl apply` would overwrite the operator's
		# pre-created Secret with the placeholder value.
		# -------------------------------------------------------------------------
		admin:
		existingSecret: olam-grafana-admin
		userKey: admin-user
		passwordKey: admin-password

		# -------------------------------------------------------------------------
		# Service: ClusterIP only.
		# Decision 16: port-forward only; never ingress-routed.
		# Access: `kubectl port-forward -n monitoring svc/olam-grafana 3000:80`
		# -------------------------------------------------------------------------
		service:
		type: ClusterIP
		port: 80

		# -------------------------------------------------------------------------
		# Ingress: disabled.
		# Decision 16 + OQ-p3-4: Grafana is never exposed via Traefik IngressRoute.
		# Port-forward is the sole operator access path. Enabling ingress here would
		# silently violate the access-control intent even if no IngressRoute manifest
		# is committed.
		# -------------------------------------------------------------------------
		ingress:
		enabled: false # Decision 16: port-forward only; never ingress-routed

		# -------------------------------------------------------------------------
		# Datasources: Loki (default) + Prometheus (added in Phase C Task C1).
		#
		# Dual-chart pattern:
		# - kube-prometheus-stack (C1) provides Prometheus. Its bundled Grafana
		# sub-chart is disabled (grafana.enabled: false in kube-prom-stack-values.yaml).
		# - This standalone grafana/grafana chart (Phase B) is the only Grafana.
		# - The Prometheus datasource URL points at `prometheus-operated`, which is
		# the in-cluster Service that kube-prometheus-stack's Prometheus Operator
		# creates for the managed Prometheus StatefulSet.
		# - timeInterval: 15s matches the scrape interval in kube-prom-stack-values.yaml
		# so Grafana's step calculation aligns with actual data granularity.
		# - exemplarTraceIdDestinations.datasourceUid: tempo is harmless until Phase D
		# adds Tempo; Grafana silently ignores unknown datasource UIDs.
		#
		# editable: false prevents accidental operator drift across sessions.
		# -------------------------------------------------------------------------
		datasources:
		datasources.yaml:
		apiVersion: 1
		datasources:
		- name: Loki
		type: loki
		access: proxy
		url: http://olam-loki.monitoring.svc.cluster.local:3100
		isDefault: true
		editable: false
		- name: Prometheus
		type: prometheus
		access: proxy
		url: http://prometheus-operated.monitoring.svc.cluster.local:9090
		isDefault: false
		editable: false
		jsonData:
		timeInterval: 15s # matches scrape interval in kube-prom-stack-values.yaml
		exemplarTraceIdDestinations:
		- name: trace_id
		datasourceUid: tempo # Phase D may add Tempo; harmless until then

		# -------------------------------------------------------------------------
		# Dashboard provisioner: file-based ConfigMap mount.
		# B3 lands the olam-dashboards ConfigMap and the actual JSON files.
		# B2 wires the loader so B3's ConfigMap is picked up automatically.
		# -------------------------------------------------------------------------
		dashboardProviders:
		dashboardproviders.yaml:
		apiVersion: 1
		providers:
		- name: olam-default
		orgId: 1
		folder: 'Olam'
		type: file
		disableDeletion: true
		updateIntervalSeconds: 30
		allowUiUpdates: false
		options:
		path: /var/lib/grafana/dashboards/olam-default

		# Wire the volume mount — B3 creates this ConfigMap with the actual JSON.
		# Grafana will warn "ConfigMap olam-dashboards not found" until B3 lands;
		# this is benign and does not block Grafana startup.
		dashboardsConfigMaps:
		olam-default: olam-dashboards # B3 creates this ConfigMap

		# -------------------------------------------------------------------------
		# Resources: tuned for single-operator k3s (<256Mi idle typical).
		# P2 acceptance criterion: <500MB idle / <1GB typical across full LGTM stack.
		# -------------------------------------------------------------------------
		resources:
		requests:
		cpu: 50m
		memory: 128Mi
		limits:
		cpu: 200m
		memory: 256Mi # P2: keeps Grafana within its share of the LGTM RAM budget

		# -------------------------------------------------------------------------
		# Persistence: disabled for Phase B.
		# Grafana state (dashboards, users) lives in ConfigMaps / values files.
		# Phase C may enable a PV if fine-grained alert state or annotations
		# accumulate. For now, stateless Grafana is simpler and matches S2.
		# -------------------------------------------------------------------------
		persistence:
		enabled: false # S2: ConfigMap-mounted dashboards; no PV needed in Phase B

		# -------------------------------------------------------------------------
		# ServiceMonitor: Phase C Prometheus scrapes Grafana's /metrics endpoint.
		# Disabled in Phase B: the ServiceMonitor CRD (monitoring.coreos.com/v1) is
		# shipped by kube-prometheus-stack in Phase C. The earlier "enable now to
		# avoid a Phase C helm upgrade" rationale was wrong — Phase C will need a
		# helm upgrade anyway to wire Prometheus scrape targets. Flipping this on
		# pre-CRD breaks the install on chart versions that hard-validate.
		# -------------------------------------------------------------------------
		serviceMonitor:
		# Disabled in the source-of-truth values file so a standalone Phase B install
		# (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
		# The C1 e2e script flips this on at RUNTIME via
		# helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
		# AFTER kube-prom-stack has installed the ServiceMonitor CRD.
		enabled: false

		# -------------------------------------------------------------------------
		# Grafana.ini overrides: anonymous access stays disabled (default).
		# Sets the server root_url so port-forward URLs render correctly in
		# email / share links (cosmetic; not a security seam), turns off analytics
		# reporting and update checks, and disallows embedding.
		# -------------------------------------------------------------------------
		grafana.ini:
		server:
		root_url: "%(protocol)s://%(domain)s:%(http_port)s/"
		analytics:
		reporting_enabled: false # no telemetry to grafana.com
		check_for_updates: false
		security:
		allow_embedding: false

-229

host-cp/peripheral...helm-values/kube-prom-stack-values.yaml

		# kube-prometheus-stack Helm values — k3s-ingress-observability Phase C Task C1
		#
		# Chart: prometheus-community/kube-prometheus-stack; pinned to 85.2.0
		# (latest stable as of 2026-05-21).
		# Upgrade discipline: pin in this file + e2e script comment must stay in sync.
		#
		# CRITICAL: grafana.enabled MUST stay false.
		# Phase B ships a standalone grafana/grafana chart (olam-grafana release).
		# kube-prometheus-stack's bundled Grafana sub-chart is disabled to prevent
		# a second Grafana Deployment from landing in the cluster.
		# Decision 16 + OQ-p3-4: Phase B's standalone Grafana is canonical.
		# Enabling the sub-chart here would violate that decision and create two
		# Grafana instances — caught by prom-no-double-grafana.sh's single-Grafana
		# assertion.
		#
		# Resource budget summary (Phase C contribution to P2 target <500MB idle / <1GB typical):
		# prometheus-operator: 128Mi req / 512Mi limit
		# prometheus: 512Mi req / 2Gi limit
		# node-exporter: 64Mi req / 128Mi limit
		# kube-state-metrics: 128Mi req / 256Mi limit
		# Total C1 addition: ~832Mi req / ~3Gi limit (spread across nodes)
		#
		# Retention policy (Decision 14): scrape 15s / retention 15d / size cap 10GiB.
		# The size cap (T10 TSDB corruption mitigation) is the hard guard; retention 15d
		# is advisory — the size cap enforces first.
		#
		# Alertmanager: enabled as of C2. C1 shipped without alertmanager; C2 landed
		# the first alert rule (cardinality 80k) and flipped alertmanager.enabled to
		# true — see the alertmanager block below.

		# -------------------------------------------------------------------------
		# CARDINALITY ENFORCEMENT — Task C2 (T1 cardinality bomb / P4 <100k active series)
		#
		# Goal: strip high-cardinality labels (world_id, trace_id, user_id,
		# request_id, operator_id) from every scraped series BEFORE TSDB ingest.
		#
		# Architecture finding (helm template verified, 2026-05-21):
		# The prometheus-operator Prometheus CR has NO global metricRelabelConfigs
		# field. The Prometheus CR spec exposes only per-ServiceMonitor endpoint
		# metricRelabelings. There is no chart-level "apply to all scrapes" slot.
		#
		# Enforcement strategy (two-layer):
		# Layer 1 — chart-managed ServiceMonitors: set metricRelabelings on every
		# ServiceMonitor the chart controls (coreDns, prometheusOperator,
		# prometheus self-scrape, node-exporter). Belt-and-suspenders; these
		# services don't emit world_id etc. in practice, but the rule is free.
		# Note: kube-state-metrics sub-chart has no metricRelabelings slot in
		# its prometheus.monitor section at chart version 85.2.0 — omitted.
		# Layer 2 — user-deployed ServiceMonitors: the cardinality-drop.sh e2e
		# script's synthetic violator ServiceMonitor carries the same labeldrop
		# rule (release: olam-prom label + metricRelabelings). New services
		# MUST include the same block — enforced by docs + code review.
		#
		# Why labeldrop is the right action:
		# action: labeldrop removes the matched labels from ALL series that carry
		# them, regardless of metric name. This is the same semantic as Promtail's
		# pipeline drop stages (promtail-values.yaml) — both layers stay in sync.
		# world_id surfaces in dashboards via EXEMPLARS (Decision 9), not labels.
		#
		# Regex covers all five taxonomy labels from observability-label-taxonomy:
		# world_id, trace_id, user_id, request_id, operator_id
		# -------------------------------------------------------------------------
		_cardinalityLabeldrop: &cardinality-labeldrop
		- action: labeldrop
		regex: 'world_id|trace_id|user_id|request_id|operator_id'

		# -------------------------------------------------------------------------
		# HARD REQUIREMENT: grafana sub-chart is off.
		# See top-of-file comment for rationale.
		# -------------------------------------------------------------------------
		grafana:
		enabled: false # HARD: Decision 16 + OQ-p3-4 — standalone Grafana (olam-grafana) is canonical

		# -------------------------------------------------------------------------
		# Alertmanager: enabled since C2 landed the first alert rule.
		# History: C1 shipped without alertmanager; C2 enabled it when the first alert rule landed.
		# -------------------------------------------------------------------------
		alertmanager:
		enabled: true # C2: first alert rule (OlamActiveSeriesHigh) lands; alertmanager enabled
		serviceMonitor:
		metricRelabelings: *cardinality-labeldrop

		# -------------------------------------------------------------------------
		# Default kube-controller-manager / scheduler / proxy / etcd monitors.
		# These ServiceMonitors don't work on k3d/k3s because the endpoints are not
		# exposed via the usual ports. Disabling avoids noisy "endpoint not found"
		# warnings and scrape failures on every Prometheus eval cycle.
		# -------------------------------------------------------------------------
		kubeControllerManager:
		enabled: false

		kubeScheduler:
		enabled: false

		kubeProxy:
		enabled: false

		kubeEtcd:
		enabled: false

		# kube-apiserver and kubelet DO work on k3d but generate high-cardinality
		# label combinations. Disable for now; re-evaluate when per-service /metrics
		# (C3) and cardinality enforcement (C2) are in place.
		kubeApiServer:
		enabled: false

		kubelet:
		enabled: false

		# -------------------------------------------------------------------------
		# Default alerting rules: off.
		# The bundled default rules generate Alertmanager receivers and PrometheusRule
		# objects for kubelet, etcd, apiserver, etc. — most don't fire on k3d anyway
		# and add noise before C2's focused cardinality rule lands.
		# C2 will add targeted PrometheusRule objects separately.
		# -------------------------------------------------------------------------
		defaultRules:
		create: false

		# -------------------------------------------------------------------------
		# coreDns — ServiceMonitor with labeldrop (Layer 1 cardinality enforcement)
		# -------------------------------------------------------------------------
		coreDns:
		serviceMonitor:
		metricRelabelings: *cardinality-labeldrop

		# -------------------------------------------------------------------------
		# CRDs: install via chart (default: true, explicit for clarity).
		# These CRDs (ServiceMonitor, PodMonitor, PrometheusRule, etc.) are required
		# before Phase B's loki/promtail/grafana charts can have serviceMonitor.enabled:true.
		# Phase C's e2e script waits for servicemonitors.monitoring.coreos.com to be
		# Established before helm-upgrading the Phase B charts.
		# -------------------------------------------------------------------------
		crds:
		enabled: true

		# -------------------------------------------------------------------------
		# Prometheus Operator
		# -------------------------------------------------------------------------
		prometheusOperator:
		enabled: true
		serviceMonitor:
		metricRelabelings: *cardinality-labeldrop
		resources:
		requests:
		cpu: 100m
		memory: 128Mi
		limits:
		cpu: 500m
		memory: 512Mi

		# -------------------------------------------------------------------------
		# Prometheus core — Decision 14: scrape 15s / retention 15d / 10GiB cap
		# -------------------------------------------------------------------------
		prometheus:
		serviceMonitor:
		metricRelabelings: *cardinality-labeldrop
		prometheusSpec:
		scrapeInterval: 15s # Decision 14
		evaluationInterval: 15s
		retention: 15d # Decision 14 — advisory; size cap enforces first
		retentionSize: 10GiB # Decision 14 — T10 TSDB corruption prevention
		walCompression: true
		enableAdminAPI: false # security: admin API allows snapshot deletion + series deletion
		enableRemoteWriteReceiver: false # not a remote-write target; no inbound writes
		logLevel: warn # info is noisy at 15s scrape cycle

		resources:
		requests:
		cpu: 200m
		memory: 512Mi
		limits:
		cpu: 1000m
		memory: 2Gi

		# PersistentVolume for TSDB. 12Gi = 10GiB retention cap + ~20% headroom.
		# local-path provisioner is used on k3d; cloud providers use their default SC.
		storageSpec:
		volumeClaimTemplate:
		spec:
		accessModes:
		- ReadWriteOnce
		resources:
		requests:
		storage: 12Gi # 10GiB retention + 20% headroom for in-flight segments

		# -------------------------------------------------------------------------
		# Node exporter — keep enabled (host-level metrics: CPU, memory, disk, net).
		# -------------------------------------------------------------------------
		nodeExporter:
		enabled: true

		prometheus-node-exporter:
		prometheus:
		monitor:
		metricRelabelings: *cardinality-labeldrop
		resources:
		requests:
		cpu: 30m
		memory: 64Mi
		limits:
		cpu: 100m
		memory: 128Mi

		# -------------------------------------------------------------------------
		# kube-state-metrics — keep enabled (k8s-level metrics: pod phases, deployments).
		# -------------------------------------------------------------------------
		kubeStateMetrics:
		enabled: true

		kube-state-metrics:
		resources:
		requests:
		cpu: 50m
		memory: 128Mi
		limits:
		cpu: 200m
		memory: 256Mi

		# -------------------------------------------------------------------------
		# Datasource auto-discovery note:
		# kube-prometheus-stack's grafana.sidecar.datasources is N/A (grafana sub-chart
		# is off). Phase B's standalone Grafana (grafana-values.yaml) has been updated
		# in this same C1 PR to include a Prometheus datasource entry pointing at:
		# http://prometheus-operated.monitoring.svc.cluster.local:9090
		# This is the in-cluster Service that kube-prometheus-stack creates for the
		# Prometheus StatefulSet (created by the Prometheus Operator from the
		# Prometheus CR above).
		# -------------------------------------------------------------------------

-85

host-cp/peripheral-services/helm-values/kyverno-values.yaml

		# Kyverno Helm values — k3s-ingress-observability Phase C C8 follow-up.
		#
		# Kyverno is the policy-as-code layer for cluster-wide cardinality
		# enforcement (closes codex's C2 concern on PR #783). The companion
		# ClusterPolicy in
		# `packages/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml`
		# mutates every incoming ServiceMonitor and PodMonitor to inject the
		# labeldrop rule before the object is persisted — so a third-party
		# chart (or hand-rolled object) cannot bypass the layer-2
		# per-ServiceMonitor enforcement landed in C2.
		#
		# Chart: kyverno/kyverno; pinned to 3.8.1 (app v1.18.1, 2026-05-21 latest stable).
		# Upgrade discipline: this pin AND the helm-install line in
		# `scripts/e2e/kyverno-cardinality-mutate.sh` must stay in sync.
		#
		# Footprint posture (single-operator k3s scale):
		# We only run admission-time mutation. The ClusterPolicy uses
		# `spec.background: false`, so the background-scan controller is
		# unused. Cleanup + reports controllers are also dead weight for
		# a single ClusterPolicy with no PolicyExceptions — they're disabled
		# so Kyverno's pod count stays minimal (1 pod, not 4).
		#
		# Footprint (Phase C C8 contribution to P2 target <500MB idle / <1GB typical):
		# admissionController: 256Mi req / 512Mi limit (chart default 128Mi/384Mi)
		# Total addition: ~256Mi req / ~512Mi limit
		#
		# If/when we want policy reports aggregated for observability dashboards,
		# flip `reportsController.enabled: true`; the `features.policyReports`
		# block below is already enabled. Likewise, flip `cleanupController.enabled`
		# if CleanupPolicy objects are ever added.
		#
		# Resource limits — tuned upward from chart default for admission webhook
		# stability under burst churn (kube-prom-stack ships ~10 ServiceMonitors at
		# once during `helm upgrade`, which arrives as a burst of AdmissionReviews).

		# -------------------------------------------------------------------------
		# Disable controllers we don't need
		# -------------------------------------------------------------------------
		backgroundController:
		enabled: false # ClusterPolicy is admission-only (background: false)

		cleanupController:
		enabled: false # no CleanupPolicy objects in this repo

		reportsController:
		enabled: false # no policy-reports surface wired into Grafana yet

		# -------------------------------------------------------------------------
		# Features — admissionReports + policyReports remain ON inside the
		# admission controller itself even when the standalone reports controller
		# is disabled. This keeps `kubectl get clusterpolicyreport` queryable
		# during dogfood; the reports controller would only AGGREGATE them
		# cluster-wide, which we don't need yet.
		# -------------------------------------------------------------------------
		features:
		admissionReports:
		enabled: true
		policyReports:
		enabled: true
		# Background scan is N/A — the policy uses background: false. Explicit
		# off avoids the controller scheduling unnecessary scan workers even
		# when the controller pod is disabled above.
		backgroundScan:
		enabled: false
		# Logging volume defaults are fine; level 2 = info-ish.
		logging:
		format: text
		verbosity: 2

		# -------------------------------------------------------------------------
		# Admission controller — the only pod we run.
		# -------------------------------------------------------------------------
		admissionController:
		replicas: 1 # single-operator k3s scale; HA is N/A for dogfood

		rbac:
		create: true # ClusterPolicy needs cluster-wide watch on ServiceMonitor + PodMonitor

		container:
		resources:
		requests:
		cpu: 100m
		memory: 256Mi
		limits:
		cpu: 500m
		memory: 512Mi

-166

host-cp/peripheral-services/helm-values/loki-values.yaml

		# Loki Helm values — k3s-ingress-observability Phase B Task B1
		#
		# Single-binary mode (Decision-16 + Phase B scope):
		# Distributed mode (microservices) adds 5+ independent Deployments + a Minio
		# or S3 backend for object storage — pure overhead for a single-operator
		# k3s install where Loki's write throughput is bounded by one Promtail
		# DaemonSet and a handful of containers. SingleBinary collapses all roles
		# (ingester, querier, compactor) into one Pod, fits within the <500MB idle
		# LGTM RAM target (P2), and is trivially replaceable if scale demands change.
		#
		# See: docs/plans/k3s-ingress-observability/DESIGN.md (P2, S2)
		#
		# Chart: grafana/loki; pinned to 6.7.4 (latest stable as of 2026-05-20).
		# Upgrade discipline: chart version is embedded in the e2e script comment.

		deploymentMode: SingleBinary

		loki:
		auth_enabled: false # single-tenant; multi-tenancy adds header overhead with no benefit here

		commonConfig:
		replication_factor: 1 # single-binary; no replicas = no cross-replica consistency needed

		# -------------------------------------------------------------------------
		# Storage backend: filesystem (TSDB index per schemaConfig below, schema v13; local PV).
		# Object storage (S3/GCS/MinIO) deferred to fatbox multi-org Phase F+.
		# For single-operator k3s, local PV is simpler and sufficient.
		# -------------------------------------------------------------------------
		storage:
		type: filesystem

		schemaConfig:
		configs:
		- from: "2024-01-01"
		store: tsdb
		object_store: filesystem
		schema: v13
		index:
		prefix: loki_index_
		period: 24h

		# -------------------------------------------------------------------------
		# Retention: 7 days (168h) per Performance budget acceptance criterion #6.
		# compactor.retention_enabled enables deletion; ring config required for
		# single-binary mode.
		# -------------------------------------------------------------------------
		limits_config:
		retention_period: 168h # 7 days
		ingestion_rate_mb: 4 # per-tenant ingestion cap (single tenant)
		ingestion_burst_size_mb: 8
		max_query_series: 5000 # cap log-derived queries from going wide (P3 <3s p95)
		max_entries_limit_per_query: 5000

		compactor:
		retention_enabled: true
		delete_request_store: filesystem
		compaction_interval: 10m
		working_directory: /var/loki/compactor

		ingester:
		chunk_idle_period: 30m # flush to storage; appropriate for low write rate
		chunk_retain_period: 1m
		max_chunk_age: 2h

		# Self-metrics endpoint — Phase C Prometheus scrapes this.
		# Server block exposed on port 3100 (default); /metrics is always available.

		singleBinary:
		replicas: 1

		# -------------------------------------------------------------------------
		# Persistence: 10Gi PV.
		#
		# Rationale: 7-day retention at olam scale (<500 containers, access logs
		# estimated 1–2MB/day compressed) → ~100MB typical stored. 10Gi gives ~100x
		# headroom for burst (failed deploy loops, chatty containers), while typical
		# usage stays well within the <1GB typical acceptance criterion #6. Cloud
		# provider default SC is fine; on bare-metal k3s the local-path provisioner is used.
		# -------------------------------------------------------------------------
		persistence:
		enabled: true
		size: 10Gi # ~100× headroom over 7-day typical (~100MB); <1GB usage target per AC#6

		# -------------------------------------------------------------------------
		# Resources: memory limit 512Mi per task spec.
		# Typical usage at olam scale: <200MB idle (TSDB index + block cache).
		# 512Mi limit prevents compaction spikes from triggering OOM on the node.
		# -------------------------------------------------------------------------
		resources:
		requests:
		cpu: 100m
		memory: 128Mi
		limits:
		cpu: 500m
		memory: 512Mi # P2: <500MB idle / <1GB typical; limit prevents spike OOM

		# -------------------------------------------------------------------------
		# Self-metrics for Phase C Prometheus scrape.
		# ServiceMonitor is created here; Prometheus picks it up in Phase C.
		# -------------------------------------------------------------------------
		monitoring:
		selfMonitoring:
		enabled: false # disables the bundled GrafanaAgent sub-chart dependency
		grafanaAgent:
		installOperator: false
		serviceMonitor:
		# Disabled in the source-of-truth values file so a standalone Phase B install
		# (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
		# The C1 e2e script flips this on at RUNTIME via
		# helm upgrade ... --reuse-values --set monitoring.serviceMonitor.enabled=true
		# AFTER kube-prom-stack has installed the ServiceMonitor CRD.
		# NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor).
		enabled: false

		# -------------------------------------------------------------------------
		# Backend and read/write gateway: disabled for SingleBinary mode.
		# These are microservices-mode components and must be off or the chart
		# emits validation errors when deploymentMode=SingleBinary.
		# -------------------------------------------------------------------------
		backend:
		replicas: 0
		read:
		replicas: 0
		write:
		replicas: 0

		# Grafana agent / canary: not needed; disable to keep resource footprint minimal.
		lokiCanary:
		enabled: false

		test:
		enabled: false

		# -------------------------------------------------------------------------
		# Sub-component slimming — chart 6.7.4 defaults include nginx gateway +
		# two Memcached clusters + minio + sidecar watchers that single-binary
		# mode doesn't need. Each adds image-pull and Ready-wait time. Disabling
		# all of them brings the install Ready-time within the harness budget.
		# If a future scenario needs query-result caching, re-evaluate
		# resultsCache specifically.
		# -------------------------------------------------------------------------

		# nginx routing front; Promtail writes direct to single-binary :3100
		gateway:
		enabled: false

		# Memcached cluster — overhead for single-binary
		chunksCache:
		enabled: false

		# second Memcached cluster — overhead for single-binary
		resultsCache:
		enabled: false

		# minio is off because storage.type=filesystem, but be explicit
		minio:
		enabled: false

		# Sidecar that watches ConfigMaps for runtime config reloads — we don't ship one.
		sidecar:
		rules:
		enabled: false
		datasources:
		enabled: false
		configs:
		enabled: false

-92

host-cp/peripheral...vices/helm-values/promtail-staging.yaml

		# Promtail Helm values — Phase A Task A5 staging (Phase B consumes)
		#
		# Tails every container's stdout; ships to Loki single-binary (Phase B installs Loki).
		# Per OQ-p3-6: Traefik native config can redact HEADERS but NOT URL query params —
		# query-param scrubbing for `?token=`, `?code=`, `?access_token=`, `?state=` happens
		# HERE at Promtail ingest via pipeline_stages.replace regex.
		#
		# Resource limits per OQ-p3-37 (Promtail OOM risk under chatty container-cp 100ms cadence):
		# - memory limit 256Mi
		# - pipeline_stages.limit rate 100 lines/sec/stream
		#
		# Scrape config matches every pod log; namespace-scope labels are added so Loki LogQL queries
		# can filter by service / namespace / pod.
		#
		# SECURITY NOTE — replace stage regex semantics (load-bearing):
		# Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
		# The `replace` field is a Go text/template string; `${1}` is NOT valid Go
		# template syntax and silently becomes a literal. The correct pattern is:
		# expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
		# replace: 'REDACTED' — replace captured secret with literal
		# See promtail-values.yaml header comment for full details.

		deploymentMode: DaemonSet

		resources:
		requests:
		cpu: 50m
		memory: 64Mi
		limits:
		cpu: 200m
		memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory

		config:
		clients:
		- url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push

		snippets:
		pipelineStages:
		# 1. Parse JSON access logs from Traefik (key field present in JSON line)
		- match:
		selector: '{container="traefik"}'
		stages:
		- json:
		expressions:
		request_method: RequestMethod
		request_path: RequestPath
		status: DownstreamStatus
		request_id: requestId
		service: ServiceName
		router: RouterName

		# 2. Scrub OAuth/token values from URL query params and Authorization headers.
		#
		# IMPORTANT — capture group semantics:
		# The replace stage replaces each CAPTURE GROUP with the `replace` template
		# value. Capture groups must wrap ONLY the secret value, not the surrounding
		# context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
		# it is preserved in the output while only the secret is replaced.
		- replace:
		# OAuth code= callback values — capture only the token value after `code=`
		expression: '(?:\?|&)code=([^&\s]+)'
		replace: 'REDACTED'
		- replace:
		# Bearer / access tokens in query strings — capture only the value
		expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
		replace: 'REDACTED'
		- replace:
		# OAuth state param (may carry session info) — capture only the value
		expression: '(?:\?|&)state=([^&\s]+)'
		replace: 'REDACTED'
		- replace:
		# Authorization header Bearer value — capture only the token after `Bearer `
		expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
		replace: 'REDACTED'

		# 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
		- limit:
		rate: 100 # max log lines/sec per stream
		burst: 200
		drop: true # drop excess lines; do NOT block tail

		# 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
		- labels:
		service: # from Traefik JSON access log; matches taxonomy `service` label
		router: # Traefik router name
		status: # HTTP status code (within taxonomy)

		# Retention is configured on Loki side (Phase B), not Promtail.
		# Sample retention target: 7 days per Performance budget Row.

		serviceMonitor:
		enabled: true # Prometheus (Phase C) scrapes Promtail's own /metrics for self-observability

-102

host-cp/peripheral-services/helm-values/promtail-values.yaml

		# Promtail Helm values — k3s-ingress-observability Phase B Task B1 (production)
		#
		# Production Promtail values. Staging copy at promtail-staging.yaml has the
		# same scrubbing pipeline shape; this file sets the Loki client URL +
		# production resource limits.
		#
		# Scrubbing pipeline:
		# - 4 `replace` stages: code=, token/access_token/api_key/secret=, state=, Authorization
		# - `limit` stage: rate=100/burst=200/drop=true (OQ-p3-37: Promtail OOM under chatty containers)
		# Client URL: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
		# Service name `olam-loki` is the Helm release name used in scripts/e2e/loki-ingest.sh
		# (`helm upgrade --install olam-loki grafana/loki ...`); the chart's Service
		# is named after the release, so `olam-loki` is the in-cluster DNS hostname.
		#
		# SECURITY NOTE — replace stage regex semantics (load-bearing):
		# Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
		# The `replace` field is a Go text/template string; `${1}` is NOT valid Go
		# template syntax and silently becomes a literal. The correct pattern is:
		# expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
		# replace: 'REDACTED' — replace captured secret with literal
		# This leaves the surrounding context (e.g. `?code=`) intact and redacts only
		# the value. The broken pattern `(\?|&)code=[^&\s]+` with `replace: '${1}code=REDACTED'`
		# was the root cause of the Phase B scrubbing regression (PR #776).
		#
		# See: docs/plans/k3s-ingress-observability/DESIGN.md (T8, T9)

		deploymentMode: DaemonSet

		resources:
		requests:
		cpu: 50m
		memory: 64Mi
		limits:
		cpu: 200m
		memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory

		config:
		clients:
		- url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push

		snippets:
		pipelineStages:
		# 1. Parse JSON access logs from Traefik (key field present in JSON line)
		- match:
		selector: '{container="traefik"}'
		stages:
		- json:
		expressions:
		request_method: RequestMethod
		request_path: RequestPath
		status: DownstreamStatus
		request_id: requestId
		service: ServiceName
		router: RouterName

		# 2. Scrub OAuth/token values from URL query params and Authorization headers.
		#
		# IMPORTANT — capture group semantics:
		# The replace stage replaces each CAPTURE GROUP with the `replace` template
		# value. Capture groups must wrap ONLY the secret value, not the surrounding
		# context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
		# it is preserved in the output while only the secret is replaced.
		- replace:
		# OAuth code= callback values — capture only the token value after `code=`
		expression: '(?:\?|&)code=([^&\s]+)'
		replace: 'REDACTED'
		- replace:
		# Bearer / access tokens in query strings — capture only the value
		expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
		replace: 'REDACTED'
		- replace:
		# OAuth state param (may carry session info) — capture only the value
		expression: '(?:\?|&)state=([^&\s]+)'
		replace: 'REDACTED'
		- replace:
		# Authorization header Bearer value — capture only the token after `Bearer `
		expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
		replace: 'REDACTED'

		# 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
		- limit:
		rate: 100 # max log lines/sec per stream
		burst: 200
		drop: true # drop excess lines; do NOT block tail

		# 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
		- labels:
		service: # from Traefik JSON access log; matches taxonomy `service` label
		router: # Traefik router name
		status: # HTTP status code (within taxonomy)

		# Retention is configured on Loki side (loki-values.yaml: 7 days / 168h).

		serviceMonitor:
		# Disabled in the source-of-truth values file so a standalone Phase B install
		# (without kube-prometheus-stack) does not hard-fail with
		# "no matches for kind ServiceMonitor in version monitoring.coreos.com/v1".
		# The C1 e2e script flips this on at RUNTIME via
		# helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
		# AFTER kube-prom-stack has installed the ServiceMonitor CRD. Source-of-truth
		# stays standalone-friendly; runtime override wires Prometheus discovery.
		enabled: false

-73

host-cp/peripheral-services/helm-values/traefik-values.yaml

		# Traefik Helm values — k3s-ingress-observability Phase A Task A3
		# Pinned NodePort 30080 per OQ-p3-7 (world hooks bake this URL).
		# Structured JSON access logs ready for Phase A Task A5 + Phase B Promtail pickup.

		deployment:
		replicas: 1 # SPOF mitigation = host systemd watchdog (Phase A Task A11), not HA replicas

		ports:
		web:
		port: 8000
		expose:
		default: true
		exposedPort: 80
		nodePort: 30080 # PIN (OQ-p3-7); world hooks reach via host.docker.internal:30080
		protocol: TCP
		websecure:
		port: 8443
		expose:
		default: true
		exposedPort: 443
		nodePort: 30443
		protocol: TCP
		# v1: HTTPS deferred to fatbox multi-org (Out-of-scope of this plan); TLS not configured.

		service:
		type: NodePort

		# Structured access logs to stdout — Promtail picks up in Phase B.
		# Authorization header redaction here; URL query-param scrubbing happens
		# at Promtail pipeline_stages.replace per OQ-p3-6 (Traefik can't scrub query params natively).
		logs:
		general:
		level: INFO
		format: json
		access:
		enabled: true
		format: json
		fields:
		headers:
		defaultMode: keep
		names:
		Authorization: redact
		Cookie: redact

		# Built-in /metrics for Phase C Prometheus scrape
		metrics:
		prometheus:
		enabled: true
		addEntryPointsLabels: true
		addRoutersLabels: true
		addServicesLabels: true

		# Dashboard disabled in cluster — operator uses Grafana (Phase B)
		ingressRoute:
		dashboard:
		enabled: false

		# IngressRoute CRD enabled
		providers:
		kubernetesCRD:
		enabled: true
		allowCrossNamespace: false # explicit; matches namespace-isolation strategy from A1
		kubernetesIngress:
		enabled: false # CRD-only; vanilla Ingress not supported in this stack

		# Resource bounds — observability stack target <500MB RAM idle (P2)
		resources:
		requests:
		cpu: 100m
		memory: 64Mi
		limits:
		cpu: 500m
		memory: 256Mi

-6

host-cp/peripheral-services/manifests/20-namespace.yaml

		# Namespace for k3s-ingress-observability peripheral services
		# (Traefik installs to kube-system; observability stack to monitoring; this is for IngressRoute CRDs targeting olam services)
		apiVersion: v1
		kind: Namespace
		metadata:
		name: olam

-245

host-cp/peripheral...ces/manifests/24-deploy-kg-service.yaml

		# 24-deploy-kg-service.yaml — kg-service Service + Deployment for local k3s dogfood.
		#
		# Bridges the gap between Phase C's ServiceMonitor (92-servicemonitor-kg-service.yaml)
		# and a running service. The ServiceMonitor targets namespace `olam`,
		# label `app: olam-kg-service`, port name `http` — this manifest satisfies that
		# contract so Prometheus can scrape kg-service's /metrics endpoint.
		#
		# Canonical per-service manifest tree: packages/host-cp/k8s/manifests/kg-service/
		# This file is the "peripheral-services entry point" view — it folds Service +
		# Deployment into a single file for `kubectl apply -f manifests/` convenience.
		#
		# Secrets prerequisite: operator MUST create `olam-kg-service-secret` in the
		# `olam` namespace BEFORE applying this manifest. See README.md § Secrets.
		#
		# Image: pinned to sha256 digest (not :latest) per T4 threat model.
		# Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158.
		# To update:
		# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-kg-service:pull&service=ghcr.io" | jq -r .token)
		# curl -sI -H "Authorization: Bearer $TOKEN" \
		# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
		# https://ghcr.io/v2/pleri/olam-kg-service/manifests/<tag> | grep docker-content-digest
		#
		# Memory: bge-small-en-v1.5 ONNX model is pre-cached in the image (~90 MB).
		# Container needs ≥512Mi to load the model + serve requests. Limit set to 1Gi.
		#
		# Apply-manifests.sh: this file is SKIPPED by the phase-a-e2e harness
		# (apply-manifests.sh skip-list includes 2[3-4]-deploy-*) because the
		# harness cluster has no operator secrets or kg-data PVC.
		# Operator-side `kubectl apply -f manifests/` applies it.
		---
		apiVersion: v1
		kind: ServiceAccount
		metadata:
		name: olam-kg-service
		namespace: olam
		labels:
		app: olam-kg-service
		app.kubernetes.io/managed-by: olam
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: Role
		metadata:
		name: olam-kg-service
		namespace: olam
		labels:
		app: olam-kg-service
		app.kubernetes.io/managed-by: olam
		rules:
		- apiGroups: ["apps"]
		resources: ["deployments"]
		resourceNames: ["olam-kg-service"]
		verbs: ["get", "patch", "watch"]
		---
		apiVersion: rbac.authorization.k8s.io/v1
		kind: RoleBinding
		metadata:
		name: olam-kg-service
		namespace: olam
		labels:
		app: olam-kg-service
		app.kubernetes.io/managed-by: olam
		subjects:
		- kind: ServiceAccount
		name: olam-kg-service
		namespace: olam
		roleRef:
		kind: Role
		name: olam-kg-service
		apiGroup: rbac.authorization.k8s.io
		---
		# ConfigMap — non-sensitive env vars.
		# Sensitive values (OLAM_KG_BEARER_TOKEN) live in `olam-kg-service-secret`.
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-kg-service-env
		namespace: olam
		labels:
		app: olam-kg-service
		app.kubernetes.io/managed-by: olam
		data:
		# Port kg-service listens on — must match Service targetPort below.
		OLAM_KG_SERVICE_PORT: "9997"
		# CRITICAL: kg-service defaults to 127.0.0.1 bind. In k8s the readiness
		# probe hits the pod IP, so 127.0.0.1-only listener causes probe failures.
		# Force all-interfaces bind without requiring an image rebuild.
		OLAM_KG_SERVICE_BIND: "0.0.0.0"
		# Data directory — backed by the PVC mounted at /data.
		OLAM_KG_DATA_PATH: "/data/kg"
		# Auth-service URL — cluster-internal DNS (olam namespace).
		OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
		---
		# PersistentVolumeClaim — backs /data (KG index + savings telemetry).
		# 10Gi: graph index grows with codebase size. See kg-service/45-pvc.yaml rationale.
		# local-path StorageClass ships with k3d. Substitute for non-k3d clusters.
		apiVersion: v1
		kind: PersistentVolumeClaim
		metadata:
		name: olam-kg-data
		namespace: olam
		labels:
		app: olam-kg-service
		app.kubernetes.io/managed-by: olam
		spec:
		accessModes:
		- ReadWriteOnce
		storageClassName: local-path
		resources:
		requests:
		storage: 10Gi
		---
		apiVersion: apps/v1
		kind: Deployment
		metadata:
		name: olam-kg-service
		namespace: olam
		labels:
		app: olam-kg-service
		app.kubernetes.io/managed-by: olam
		spec:
		replicas: 1
		strategy:
		type: RollingUpdate
		rollingUpdate:
		maxSurge: 1
		maxUnavailable: 0
		selector:
		matchLabels:
		app: olam-kg-service
		template:
		metadata:
		labels:
		app: olam-kg-service
		spec:
		# Disable k8s automatic Service env injection.
		# Without this, k8s injects OLAM_KG_SERVICE_PORT as "tcp://..." which
		# breaks Python's int() parse of the port env var.
		enableServiceLinks: false
		imagePullSecrets:
		- name: ghcr-pull
		serviceAccountName: olam-kg-service
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		runAsGroup: 1000
		fsGroup: 1000
		initContainers:
		- name: chown-data
		# busybox:1.36 — sha256-pinned per T4 threat model.
		image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsUser: 0
		runAsNonRoot: false
		allowPrivilegeEscalation: false
		command: ["chown", "-R", "1000:1000", "/data"]
		volumeMounts:
		- name: kg-data
		mountPath: /data
		containers:
		- name: olam-kg-service
		# Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158
		# Run `npm run refresh:manifest-digests` to update.
		image: ghcr.io/pleri/olam-kg-service@sha256:72030f3054315e7ebf575f6dcb9b4965e1ddee13ea7bfdeb0bde32253beeb1c7
		imagePullPolicy: IfNotPresent
		securityContext:
		runAsNonRoot: true
		runAsUser: 1000
		readOnlyRootFilesystem: true
		allowPrivilegeEscalation: false
		capabilities:
		drop: ["ALL"]
		ports:
		# CRITICAL: port name `http` must match ServiceMonitor
		# 92-servicemonitor-kg-service.yaml endpoints[0].port.
		- name: http
		containerPort: 9997
		protocol: TCP
		envFrom:
		- configMapRef:
		name: olam-kg-service-env
		- secretRef:
		name: olam-kg-service-secret
		volumeMounts:
		- name: kg-data
		mountPath: /data
		- name: tmp
		mountPath: /tmp
		readinessProbe:
		# kg-service returns {"ok":true,"ready":true} once bge-small model loads.
		# initialDelaySeconds 30 gives the model warmup thread time to complete.
		httpGet:
		path: /health
		port: 9997
		initialDelaySeconds: 30
		periodSeconds: 5
		timeoutSeconds: 3
		failureThreshold: 12
		livenessProbe:
		httpGet:
		path: /health
		port: 9997
		initialDelaySeconds: 60
		periodSeconds: 20
		timeoutSeconds: 5
		failureThreshold: 3
		resources:
		requests:
		cpu: "100m"
		# bge-small ONNX model requires ~400Mi at runtime; 512Mi is the
		# minimum viable request. Set higher if OOM-killed on first classify.
		memory: "512Mi"
		limits:
		cpu: "1000m"
		# 1Gi: bge-small model (~90Mi) + index cache + request headroom.
		memory: "1Gi"
		volumes:
		- name: kg-data
		persistentVolumeClaim:
		claimName: olam-kg-data
		- name: tmp
		emptyDir: {}
		---
		# Service: cluster-internal (ClusterIP) endpoint for kg-service.
		# CRITICAL: the port name `http` and the `olam` namespace are both referenced by
		# 92-servicemonitor-kg-service.yaml (endpoints[0].port and
		# namespaceSelector.matchNames respectively); keep them in sync.
		apiVersion: v1
		kind: Service
		metadata:
		  name: olam-kg-service
		  namespace: olam
		  labels:
		    # CRITICAL: must stay equal to spec.selector.matchLabels in
		    # 92-servicemonitor-kg-service.yaml.
		    app: olam-kg-service
		    app.kubernetes.io/managed-by: olam
		spec:
		  type: ClusterIP
		  selector:
		    app: olam-kg-service
		  ports:
		    # CRITICAL: the ServiceMonitor scrapes the port named `http`
		    # (endpoints[0].port); do not rename it.
		    - name: http
		      protocol: TCP
		      port: 9997
		      targetPort: 9997

-22

host-cp/peripheral...ts/30-traefik-ingressroute-host-cp.yaml

		# IngressRoute — host-cp (bare /api/* per Decision 3 hybrid routing)
		# host-cp preserves 50+ existing SPA fetch sites at /api/* (no strip-prefix).
		apiVersion: traefik.io/v1alpha1
		kind: IngressRoute
		metadata:
		  name: olam-host-cp
		  namespace: olam
		spec:
		  entryPoints:
		    - web
		  routes:
		    # host-cp is the catch-all (per Decision 3 hybrid routing), so it gets an
		    # explicit LOW priority: service-prefix routes (kg, agent-memory, etc.) must
		    # win whenever their longer prefix matches.
		    # Why explicit: Traefik's default priority is the rule-string length, and the
		    # OR'd rule below is long enough to outrank more-specific PathPrefix routes,
		    # which made /api/kg/* land on host-cp. Setting the priority by hand avoids
		    # that silent precedence bug (caught in PR #736 live-validation).
		    # `||` is Traefik's OR operator: any one of the four matchers selects this route.
		    - match: PathPrefix(`/api/`) || PathPrefix(`/session/`) || PathPrefix(`/v1/`) || Path(`/health`)
		      kind: Rule
		      priority: 10
		      services:
		        - name: olam-host-cp
		          port: 19000

-29

host-cp/peripheral...nifests/40-traefik-ingressroute-kg.yaml

		# kg-service routing: exposed under /api/kg/* with the prefix stripped
		# (Decision 3 new-services pattern). This Middleware performs the strip; the
		# IngressRoute that follows in this file references it by name.
		apiVersion: traefik.io/v1alpha1
		kind: Middleware
		metadata:
		  name: strip-api-kg
		  namespace: olam
		spec:
		  stripPrefix:
		    prefixes:
		      - /api/kg
		---
		apiVersion: traefik.io/v1alpha1
		kind: IngressRoute
		metadata:
		  name: olam-kg-service
		  namespace: olam
		spec:
		  entryPoints:
		    - web
		  routes:
		    # Explicit priority 100 outranks host-cp's catch-all /api/* route
		    # (priority 10), so requests under /api/kg/ are served by kg-service.
		    - kind: Rule
		      match: PathPrefix(`/api/kg/`)
		      priority: 100
		      middlewares:
		        - name: strip-api-kg
		      services:
		        - name: olam-kg-service
		          port: 9997

-29

host-cp/peripheral...-traefik-ingressroute-agent-memory.yaml

		# agent-memory routing: exposed under /api/agent-memory/* with the prefix
		# stripped (Decision 3 new-services pattern). This Middleware performs the
		# strip; the IngressRoute that follows in this file references it by name.
		apiVersion: traefik.io/v1alpha1
		kind: Middleware
		metadata:
		  name: strip-api-agent-memory
		  namespace: olam
		spec:
		  stripPrefix:
		    prefixes:
		      - /api/agent-memory
		---
		apiVersion: traefik.io/v1alpha1
		kind: IngressRoute
		metadata:
		  name: olam-agent-memory
		  namespace: olam
		spec:
		  entryPoints:
		    - web
		  routes:
		    # Explicit priority 100 outranks host-cp's catch-all /api/* route
		    # (priority 10), so requests under /api/agent-memory/ reach memory-service.
		    - kind: Rule
		      match: PathPrefix(`/api/agent-memory/`)
		      priority: 100
		      middlewares:
		        - name: strip-api-agent-memory
		      services:
		        - name: olam-memory-service
		          # 3111 is the real memory-service listen port (per
		          # packages/memory-service/src/worker.ts:206 and
		          # AGENTMEMORY_HOST_INTERNAL_URL in container.ts:101). The Pass-1 plan
		          # said 3112, which was incorrect; A6 corrects it to 3111.
		          port: 3111

-80

host-cp/peripheral...manifests/60-networkpolicy-ingress.yaml

		# NetworkPolicy — olam namespace ingress fence (Phase A Task A9)
		#
		# Defense-in-depth: even if a world agent escapes its container or steals a
		# bearer token, NetworkPolicy ensures it can only reach olam services via the
		# Traefik ingress path (which enforces bearer auth on world-originated calls
		# per A6 — see packages/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml).
		# Direct pod-to-pod access bypassing ingress is denied.
		#
		# Enforcement matrix — two separate enforcement paths exist; an earlier
		# version of this comment conflated them (corrected 2026-05-21, see dogfood
		# incident finding #2):
		#
		# 1. k3d/k3s WITHOUT --disable-network-policy (the k3s default):
		#    k3s ships a built-in NetworkPolicy controller that enforces NetworkPolicies
		#    via iptables rules, INDEPENDENT of the CNI. Flannel itself does not enforce,
		#    but the k3s controller does. Result: NetworkPolicies ARE enforced even on
		#    default Flannel k3s/k3d clusters — this is what the operator's colima+k3d
		#    dogfood cluster experienced (the fence was live despite using Flannel).
		#
		# 2. k3d/k3s WITH --disable-network-policy (this harness — cluster-up.sh):
		#    The harness explicitly passes --k3s-arg '--disable-network-policy@server:*'
		#    to disable the k3s built-in controller. With the controller off, enforcement
		#    depends entirely on the CNI: Flannel = no enforcement; Calico = enforced.
		#    The harness uses Calico precisely so tests exercise real enforcement.
		#
		# 3. Production k3s (default, no --disable-network-policy) — same as case 1:
		#    Controller-enforced via iptables unless the operator explicitly disables it.
		#
		# See docs/architecture/networkpolicy-fence.md for the full environment matrix
		# and docs/incidents/2026-05-21-phase-c-dogfood.md (finding #2) for the live
		# evidence that k3s' bundled controller enforces on Flannel clusters.
		#
		# Threat mitigated: T6 (world→host SSRF via unauthenticated ingress route).
		# Companion mitigations (do not remove A6 + A9 together): bearer auth (A6),
		# 127.0.0.1 bind on host-cp + kube-apiserver (OS-level, separate from k8s).
		apiVersion: networking.k8s.io/v1
		kind: NetworkPolicy
		metadata:
		  name: olam-ingress-fence
		  namespace: olam
		  labels:
		    app.kubernetes.io/part-of: olam
		    app.kubernetes.io/component: security-fence
		    olam.io/phase: a
		    olam.io/task: a9
		spec:
		  # Selects every pod in the olam namespace. Three ingress sources are allowed
		  # below: Traefik (first rule), pods in the same namespace (second rule, so
		  # olam services can call each other) and the monitoring namespace (third
		  # rule, for metrics scraping). All other cross-namespace and external traffic
		  # must traverse Traefik.
		  podSelector: {}
		  policyTypes:
		    - Ingress
		  ingress:
		    # Rule 1 — allow inbound from Traefik (canonical ingress path). The label
		    # selector matches the standard Helm-chart label that k3s' bundled Traefik
		    # install sets (`app.kubernetes.io/name: traefik`); it is also matched by
		    # the upstream `traefik/traefik` chart used by Phase A Task A3.
		    # namespaceSelector and podSelector sit in the SAME `from` entry, so both
		    # must match: only Traefik-labelled pods in kube-system are admitted.
		    - from:
		        - namespaceSelector:
		            matchLabels:
		              kubernetes.io/metadata.name: kube-system
		          podSelector:
		            matchLabels:
		              app.kubernetes.io/name: traefik
		    # Rule 2 — allow intra-namespace pod-to-pod traffic. olam services may call
		    # each other directly (host-cp → kg-service, etc.) without round-tripping
		    # through Traefik. Audit log on world-originated calls still fires at the
		    # bearer-auth layer (A6), so this allowance does not weaken T6 mitigation.
		    - from:
		        - podSelector: {}
		    # Rule 3 — allow inbound from the monitoring namespace. Phase C's Prometheus
		    # (kube-prometheus-stack) scrapes pod IPs directly for /metrics
		    # collection. Without this rule, ServiceMonitor targets in `olam` ns
		    # appear "up" but yield 0 samples (the scrape connection silently fails
		    # at CNI level on enforcing CNIs). Surfaced during 2026-05-21 operator
		    # dogfood — see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #2.
		    # Scope: monitoring → olam ingress only (not the reverse direction). There
		    # is no podSelector here, so any pod in the monitoring namespace is admitted.
		    - from:
		        - namespaceSelector:
		            matchLabels:
		              kubernetes.io/metadata.name: monitoring

-67

host-cp/peripheral...ts/65-networkpolicy-loki-prom-deny.yaml

		# NetworkPolicy — monitoring namespace default-deny + same-namespace allow
		# (Phase A Task A9; companion to 60-networkpolicy-ingress.yaml)
		#
		# Loki + Prometheus + Grafana accept inbound ONLY from pods in the same
		# `monitoring` namespace (intra-stack: Promtail → Loki, Grafana → Loki + Prom,
		# kube-prometheus-stack scrape targets within the stack). Cross-namespace
		# traffic — including from `olam` (host-cp, kg-service, agent-memory) and
		# kube-system (Traefik) — is denied.
		#
		# Operator access pattern is `kubectl port-forward -n monitoring svc/grafana
		# 3000` (Decision 16). port-forward uses the kube-apiserver's exec channel,
		# NOT pod-to-pod networking, so it bypasses NetworkPolicy by design.
		#
		# Decision 17 forbids any IngressRoute / Ingress that exposes Loki / Prom /
		# Grafana from outside the cluster; audit:no-ingress-route enforces that at
		# commit time, and this NetworkPolicy is the runtime defense-in-depth layer
		# (caught even if the audit is bypassed or a Helm chart renders a route).
		#
		# Forward-declaration note: Loki + Prometheus land in Phase B/C. Until those
		# manifests add pods to the `monitoring` namespace, this policy applies to an
		# empty pod set and is a no-op. Phase B/C pods need no extra labels to stay
		# covered: the policy's podSelector is empty (it matches every pod in the
		# namespace) and its namespaceSelector keys on the namespace-level label
		# `kubernetes.io/metadata.name: monitoring`, so kube-prometheus-stack pods
		# only have to be deployed into the `monitoring` namespace.
		#
		# Enforcement requires NetworkPolicy-capable CNI (see 60-* doc block).
		# Threat mitigated: T7 (Grafana admin secret exfil) + secondary T6 mitigation.
		---
		# Seeds the `monitoring` namespace up front so the NetworkPolicy that follows
		# in this file has a valid target. The Phase B/C kube-prometheus-stack install
		# lands in this namespace and may add labels, but it MUST NOT delete the
		# namespace; once this manifest has seeded it, Helm uses
		# `--create-namespace=false`.
		apiVersion: v1
		kind: Namespace
		metadata:
		  name: monitoring
		  labels:
		    kubernetes.io/metadata.name: monitoring
		    app.kubernetes.io/part-of: olam-observability
		    olam.io/phase: a
		    olam.io/task: a9
		---
		apiVersion: networking.k8s.io/v1
		kind: NetworkPolicy
		metadata:
		  name: monitoring-default-deny
		  namespace: monitoring
		  labels:
		    app.kubernetes.io/part-of: olam-observability
		    app.kubernetes.io/component: security-fence
		    olam.io/phase: a
		    olam.io/task: a9
		spec:
		  # The empty selector covers every pod in the monitoring namespace, so
		  # whatever Phase B/C renders (loki, prometheus, grafana, promtail,
		  # alertmanager, ...) is matched automatically.
		  podSelector: {}
		  policyTypes:
		    - Ingress
		  ingress:
		    # Only same-namespace pods may connect. Cross-namespace traffic (olam
		    # services, kube-system Traefik, default ns) is denied; the file header
		    # explains why this posture is correct (operators reach the stack with
		    # kubectl port-forward, which bypasses NetworkPolicy via the
		    # kube-apiserver exec channel).
		    - from:
		        - namespaceSelector:
		            matchLabels:
		              kubernetes.io/metadata.name: monitoring

-1349

host-cp/peripheral...sts/80-grafana-dashboard-configmap.yaml

		# ----------------------------------------------------------------------------
		# GENERATED FILE — DO NOT EDIT DIRECTLY
		#
		# Source: packages/peripheral-services/grafana-dashboards/*.json
		# Regenerate: packages/peripheral-services/scripts/sync-grafana-dashboards.sh
		#
		# This ConfigMap is consumed by the grafana/grafana Helm chart via
		# dashboardsConfigMaps.olam-default: olam-dashboards
		# as wired in packages/peripheral-services/helm-values/grafana-values.yaml.
		#
		# Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B3
		# ----------------------------------------------------------------------------
		apiVersion: v1
		kind: ConfigMap
		metadata:
		name: olam-dashboards
		namespace: monitoring
		labels:
		app.kubernetes.io/name: grafana
		app.kubernetes.io/managed-by: olam
		grafana_dashboard: "1"
		data:
		host-cp.json: \|
		{
		"uid": "host-cp",
		"title": "Host-CP — Service Drill-in",
		"description": "Per-route SLIs for host-cp. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. Use the route dropdown to scope a single route or view all. The world_id variable is forwarded from olam-home for context.",
		"tags": ["olam", "drill-in", "phase-c", "host-cp"],
		"timezone": "browser",
		"refresh": "30s",
		"schemaVersion": 39,
		"version": 1,
		"time": {
		"from": "now-1h",
		"to": "now"
		},
		"timepicker": {},
		"templating": {
		"list": [
		{
		"name": "world_id",
		"label": "World",
		"type": "query",
		"datasource": { "type": "loki", "uid": "loki" },
		"query": {
		"qryType": 2,
		"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} \| json \| __error__ = \"\" \| world_id != \"\" [5m]))",
		"step": ""
		},
		"refresh": 2,
		"sort": 1,
		"multi": false,
		"includeAll": true,
		"allValue": ".+",
		"current": { "selected": false, "text": "All", "value": "$__all" }
		},
		{
		"name": "route",
		"label": "Route",
		"type": "query",
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"host-cp\"}, route)",
		"refresh": 2,
		"sort": 1,
		"multi": true,
		"includeAll": true,
		"allValue": ".+",
		"current": { "selected": false, "text": "All", "value": "$__all" }
		}
		]
		},
		"annotations": {
		"list": []
		},
		"panels": [
		{
		"id": 1,
		"type": "timeseries",
		"title": "Request rate by route",
		"description": "Requests per second for each host-cp route over the last 5 minutes (pre-computed by C4 recording rule). Spikes indicate traffic surges; a route going to zero indicates it stopped receiving traffic.",
		"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_requests:rate5m_by_service_route{service=\"host-cp\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 2,
		"type": "timeseries",
		"title": "5xx error rate by route",
		"description": "5xx responses per second per host-cp route (C4 recording rule). A non-zero value on a route warrants investigation. Correlate with the error ratio panel below to understand severity relative to total traffic.",
		"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_errors:rate5m_by_service_route{service=\"host-cp\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 3,
		"type": "timeseries",
		"title": "Latency p50 by route",
		"description": "Median (p50) request duration per host-cp route in seconds (C4 recording rule). Represents typical user-perceived latency. Sustained increases above baseline indicate a regression or upstream dependency slowdown.",
		"gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "s",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"host-cp\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 4,
		"type": "timeseries",
		"title": "Latency p95 by route",
		"description": "95th-percentile request duration per host-cp route in seconds (C4 recording rule). Captures the tail latency experienced by the slowest 5% of requests. The primary SLI for detecting latency regressions before they affect most users.",
		"gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "s",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"host-cp\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 5,
		"type": "timeseries",
		"title": "Latency p99 by route",
		"description": "99th-percentile request duration per host-cp route in seconds (C4 recording rule). Worst-case latency tail. High p99 with stable p50/p95 often indicates a specific slow code path or resource contention under load.",
		"gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "s",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"host-cp\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 6,
		"type": "stat",
		"title": "Error ratio (5xx / total) by route",
		"description": "Fraction of requests returning 5xx per host-cp route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. A route showing red means roughly 1-in-20 (or more) requests are failing — investigate immediately.",
		"gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "percentunit",
		"thresholds": {
		"mode": "absolute",
		"steps": [
		{ "color": "green", "value": null },
		{ "color": "yellow", "value": 0.01 },
		{ "color": "red", "value": 0.05 }
		]
		},
		"color": { "mode": "thresholds" }
		}
		},
		"options": {
		"reduceOptions": { "calcs": ["lastNotNull"] },
		"orientation": "auto",
		"textMode": "auto",
		"colorMode": "background",
		"graphMode": "none",
		"justifyMode": "center"
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_errors:ratio_by_service_route{service=\"host-cp\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": true,
		"range": false
		}
		]
		}
		]
		}

		kg-service.json: \|
		{
		"uid": "kg-service",
		"title": "KG-Service — Service Drill-in",
		"description": "Per-route SLIs for kg-service. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. kg-service exposes 4 routes: /health, /classify, /build, /status. Use the route dropdown to scope a single route. The world_id variable is forwarded from olam-home for context.",
		"tags": ["olam", "drill-in", "phase-c", "kg-service"],
		"timezone": "browser",
		"refresh": "30s",
		"schemaVersion": 39,
		"version": 1,
		"time": {
		"from": "now-1h",
		"to": "now"
		},
		"timepicker": {},
		"templating": {
		"list": [
		{
		"name": "world_id",
		"label": "World",
		"type": "query",
		"datasource": { "type": "loki", "uid": "loki" },
		"query": {
		"qryType": 2,
		"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} \| json \| __error__ = \"\" \| world_id != \"\" [5m]))",
		"step": ""
		},
		"refresh": 2,
		"sort": 1,
		"multi": false,
		"includeAll": true,
		"allValue": ".+",
		"current": { "selected": false, "text": "All", "value": "$__all" }
		},
		{
		"name": "route",
		"label": "Route",
		"type": "query",
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"kg-service\"}, route)",
		"refresh": 2,
		"sort": 1,
		"multi": true,
		"includeAll": true,
		"allValue": ".+",
		"current": { "selected": false, "text": "All", "value": "$__all" }
		}
		]
		},
		"annotations": {
		"list": []
		},
		"panels": [
		{
		"id": 1,
		"type": "timeseries",
		"title": "Request rate by route",
		"description": "Requests per second for each kg-service route over the last 5 minutes (pre-computed by C4 recording rule). /classify is the hot path; /build is infrequent; /health should be near-constant. A drop in /classify with stable /health suggests the classifier is being bypassed or the caller is down.",
		"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_requests:rate5m_by_service_route{service=\"kg-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 2,
		"type": "timeseries",
		"title": "5xx error rate by route",
		"description": "5xx responses per second per kg-service route (C4 recording rule). Errors on /classify indicate the graph classifier is failing; errors on /build indicate a KG rebuild failure. Either warrants immediate investigation as they affect agent search quality.",
		"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_errors:rate5m_by_service_route{service=\"kg-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 3,
		"type": "timeseries",
		"title": "Latency p50 by route",
		"description": "Median (p50) request duration per kg-service route in seconds (C4 recording rule). /classify latency drives agent dispatch latency directly; a rising p50 on /classify means agents wait longer for graph routing decisions.",
		"gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "s",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"kg-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 4,
		"type": "timeseries",
		"title": "Latency p95 by route",
		"description": "95th-percentile request duration per kg-service route in seconds (C4 recording rule). kg-service is a synchronous dependency for in-world search; a high p95 on /classify directly contributes to the >6min diagnosis-time problem this observability stack is solving.",
		"gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "s",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"kg-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 5,
		"type": "timeseries",
		"title": "Latency p99 by route",
		"description": "99th-percentile request duration per kg-service route in seconds (C4 recording rule). Worst-case latency tail. A high p99 on /build (graph rebuild) with stable /classify p99 is expected; the inverse (stable /build, high /classify p99) indicates classifier graph complexity growth.",
		"gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "s",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"kg-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 6,
		"type": "stat",
		"title": "Error ratio (5xx / total) by route",
		"description": "Fraction of requests returning 5xx per kg-service route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. kg-service is fail-open for /classify (returns empty result on error); a high error ratio here means callers are silently getting degraded graph routing with no local error signal.",
		"gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "percentunit",
		"thresholds": {
		"mode": "absolute",
		"steps": [
		{ "color": "green", "value": null },
		{ "color": "yellow", "value": 0.01 },
		{ "color": "red", "value": 0.05 }
		]
		},
		"color": { "mode": "thresholds" }
		}
		},
		"options": {
		"reduceOptions": { "calcs": ["lastNotNull"] },
		"orientation": "auto",
		"textMode": "auto",
		"colorMode": "background",
		"graphMode": "none",
		"justifyMode": "center"
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_errors:ratio_by_service_route{service=\"kg-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": true,
		"range": false
		}
		]
		}
		]
		}

		memory-service.json: \|
		{
		"uid": "memory-service",
		"title": "Memory-Service — Service Drill-in",
		"description": "Per-route SLIs for memory-service. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. memory-service's traffic flows through the in-container Node front-door (packages/memory-service/src/metrics-proxy.mjs) which short-circuits /metrics and instruments every agentmemory engine route ({service,route,method,status_code} taxonomy). Use the route dropdown to scope a single agentmemory endpoint. The world_id variable is forwarded from olam-home for context.",
		"tags": ["olam", "drill-in", "phase-c", "memory-service"],
		"timezone": "browser",
		"refresh": "30s",
		"schemaVersion": 39,
		"version": 1,
		"time": {
		"from": "now-1h",
		"to": "now"
		},
		"timepicker": {},
		"templating": {
		"list": [
		{
		"name": "world_id",
		"label": "World",
		"type": "query",
		"datasource": { "type": "loki", "uid": "loki" },
		"query": {
		"qryType": 2,
		"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} \| json \| __error__ = \"\" \| world_id != \"\" [5m]))",
		"step": ""
		},
		"refresh": 2,
		"sort": 1,
		"multi": false,
		"includeAll": true,
		"allValue": ".+",
		"current": { "selected": false, "text": "All", "value": "$__all" }
		},
		{
		"name": "route",
		"label": "Route",
		"type": "query",
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"memory-service\"}, route)",
		"refresh": 2,
		"sort": 1,
		"multi": true,
		"includeAll": true,
		"allValue": ".+",
		"current": { "selected": false, "text": "All", "value": "$__all" }
		}
		]
		},
		"annotations": {
		"list": []
		},
		"panels": [
		{
		"id": 1,
		"type": "timeseries",
		"title": "Request rate by route",
		"description": "Requests per second for each memory-service route over the last 5 minutes (pre-computed by C4 recording rule). /agentmemory/mcp/call is the hot path that agents drive — every memory_save / memory_recall lands there. /agentmemory/livez is the readiness probe (near-constant ~0.2 rps from k8s). /agentmemory/export is bridge-debounced (~1 per ~10s burst). A drop in mcp/call with stable livez indicates the agentmemory engine is up but receiving no traffic — caller-side issue.",
		"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_requests:rate5m_by_service_route{service=\"memory-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 2,
		"type": "timeseries",
		"title": "5xx error rate by route",
		"description": "5xx responses per second per memory-service route (C4 recording rule). Errors on /agentmemory/mcp/call indicate the iii engine is rejecting MCP tool calls — typical causes are bearer-auth failures or the engine entering a degraded state. Errors on /agentmemory/import indicate restore failures; the bridge's snapshot will retry on the next mutator-write.",
		"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_errors:rate5m_by_service_route{service=\"memory-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 3,
		"type": "timeseries",
		"title": "Latency p50 by route",
		"description": "Median (p50) request duration per memory-service route in seconds (C4 recording rule). /agentmemory/mcp/call p50 is a direct driver of agent-memory recall+save latency in the agent loop. Sustained rise on mcp/call p50 points to engine index size growth or iii-config tuning regressions.",
		"gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "s",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"memory-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 4,
		"type": "timeseries",
		"title": "Latency p95 by route",
		"description": "95th-percentile request duration per memory-service route in seconds (C4 recording rule). memory-service is a synchronous dependency for agent recall paths — high p95 on /agentmemory/mcp/call directly contributes to the >6min diagnosis-time problem this observability stack is solving. /agentmemory/export p95 spikes are expected at snapshot boundaries but should fall back inside 1s.",
		"gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "s",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"memory-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 5,
		"type": "timeseries",
		"title": "Latency p99 by route",
		"description": "99th-percentile request duration per memory-service route in seconds (C4 recording rule). Worst-case tail. /agentmemory/import is intentionally heavy (~1s+ for a full corpus restore on cold-start) so a high p99 there with stable mcp/call p99 is expected. The inverse — stable import, rising mcp/call p99 — is the leading indicator for engine-side index degradation.",
		"gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "s",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"memory-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 6,
		"type": "stat",
		"title": "Error ratio (5xx / total) by route",
		"description": "Fraction of requests returning 5xx per memory-service route (C4 recording rule). Green < 1%; yellow 1-5%; red >= 5%. /agentmemory/mcp/call errors silently degrade agent memory recall quality (callers fall through to no-context paths). /agentmemory/livez errors here indicate the proxy is healthy but the engine is unreachable — check container logs.",
		"gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"fieldConfig": {
		"defaults": {
		"unit": "percentunit",
		"thresholds": {
		"mode": "absolute",
		"steps": [
		{ "color": "green", "value": null },
		{ "color": "yellow", "value": 0.01 },
		{ "color": "red", "value": 0.05 }
		]
		},
		"color": { "mode": "thresholds" }
		}
		},
		"options": {
		"reduceOptions": { "calcs": ["lastNotNull"] },
		"orientation": "auto",
		"textMode": "auto",
		"colorMode": "background",
		"graphMode": "none",
		"justifyMode": "center"
		},
		"targets": [
		{
		"datasource": { "type": "prometheus", "uid": "prometheus" },
		"expr": "olam:http_errors:ratio_by_service_route{service=\"memory-service\",route=~\"$route\"}",
		"legendFormat": "{{route}}",
		"instant": true,
		"range": false
		}
		]
		}
		]
		}

		olam-home.json: \|
		{
		"uid": "olam-home",
		"title": "Olam Home",
		"description": "Operator's at-a-glance view. Top row: are the 5 olam peripheral services up? Middle row: how loaded are they? Bottom row: which worlds are doing dispatch work right now? Use the world_id dropdown to scope the bottom row (and host-cp/world-cp middle panels) to a specific world. Pinned 3-row IA per Phase B acceptance criteria #8. Click the host-cp, kg-service, or memory-service health panel to drill into the per-service dashboard.",
		"tags": ["olam", "home", "phase-b"],
		"timezone": "browser",
		"refresh": "30s",
		"schemaVersion": 39,
		"version": 2,
		"time": {
		"from": "now-1h",
		"to": "now"
		},
		"timepicker": {},
		"templating": {
		"list": [
		{
		"name": "world_id",
		"label": "World",
		"type": "query",
		"datasource": { "type": "loki", "uid": "loki" },
		"query": {
		"qryType": 2,
		"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} \| json \| __error__ = \"\" \| world_id != \"\" [5m]))",
		"step": ""
		},
		"refresh": 2,
		"sort": 1,
		"multi": false,
		"includeAll": true,
		"allValue": ".+",
		"current": { "selected": false, "text": "All", "value": "$__all" }
		}
		]
		},
		"annotations": {
		"list": []
		},
		"panels": [
		{
		"id": 1,
		"type": "stat",
		"title": "host-cp",
		"description": "Green if host-cp logged at least 1 line in the last 60s; red = silent / crashed.",
		"gridPos": { "x": 0, "y": 0, "w": 5, "h": 4 },
		"links": [
		{
		"title": "Drill into host-cp",
		"url": "/d/host-cp/host-cp-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}",
		"targetBlank": false
		}
		],
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"thresholds": {
		"mode": "absolute",
		"steps": [
		{ "color": "red", "value": null },
		{ "color": "green", "value": 1 }
		]
		},
		"mappings": [
		{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
		{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
		],
		"unit": "short",
		"color": { "mode": "thresholds" }
		}
		},
		"options": {
		"reduceOptions": { "calcs": ["lastNotNull"] },
		"orientation": "auto",
		"textMode": "auto",
		"colorMode": "background",
		"graphMode": "none",
		"justifyMode": "center"
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "sum(count_over_time({service=\"host-cp\"}[1m]))",
		"legendFormat": "host-cp",
		"instant": true,
		"range": false
		}
		]
		},
		{
		"id": 2,
		"type": "stat",
		"title": "kg-service",
		"description": "Green if kg-service logged at least 1 line in the last 60s; red = silent / crashed.",
		"gridPos": { "x": 5, "y": 0, "w": 5, "h": 4 },
		"links": [
		{
		"title": "Drill into kg-service",
		"url": "/d/kg-service/kg-service-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}",
		"targetBlank": false
		}
		],
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"thresholds": {
		"mode": "absolute",
		"steps": [
		{ "color": "red", "value": null },
		{ "color": "green", "value": 1 }
		]
		},
		"mappings": [
		{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
		{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
		],
		"unit": "short",
		"color": { "mode": "thresholds" }
		}
		},
		"options": {
		"reduceOptions": { "calcs": ["lastNotNull"] },
		"orientation": "auto",
		"textMode": "auto",
		"colorMode": "background",
		"graphMode": "none",
		"justifyMode": "center"
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "sum(count_over_time({service=\"kg-service\"}[1m]))",
		"legendFormat": "kg-service",
		"instant": true,
		"range": false
		}
		]
		},
		{
		"id": 3,
		"type": "stat",
		"title": "agent-memory",
		"description": "Green if agent-memory logged at least 1 line in the last 60s; red = silent / crashed.",
		"gridPos": { "x": 10, "y": 0, "w": 4, "h": 4 },
		"links": [
		{
		"title": "Drill into memory-service",
		"url": "/d/memory-service/memory-service-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}",
		"targetBlank": false
		}
		],
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"thresholds": {
		"mode": "absolute",
		"steps": [
		{ "color": "red", "value": null },
		{ "color": "green", "value": 1 }
		]
		},
		"mappings": [
		{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
		{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
		],
		"unit": "short",
		"color": { "mode": "thresholds" }
		}
		},
		"options": {
		"reduceOptions": { "calcs": ["lastNotNull"] },
		"orientation": "auto",
		"textMode": "auto",
		"colorMode": "background",
		"graphMode": "none",
		"justifyMode": "center"
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "sum(count_over_time({service=\"agent-memory\"}[1m]))",
		"legendFormat": "agent-memory",
		"instant": true,
		"range": false
		}
		]
		},
		{
		"id": 4,
		"type": "stat",
		"title": "traefik",
		"description": "Green if traefik logged at least 1 line in the last 60s; red = silent / crashed.",
		"gridPos": { "x": 14, "y": 0, "w": 5, "h": 4 },
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"thresholds": {
		"mode": "absolute",
		"steps": [
		{ "color": "red", "value": null },
		{ "color": "green", "value": 1 }
		]
		},
		"mappings": [
		{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
		{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
		],
		"unit": "short",
		"color": { "mode": "thresholds" }
		}
		},
		"options": {
		"reduceOptions": { "calcs": ["lastNotNull"] },
		"orientation": "auto",
		"textMode": "auto",
		"colorMode": "background",
		"graphMode": "none",
		"justifyMode": "center"
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "sum(count_over_time({service=\"traefik\"}[1m]))",
		"legendFormat": "traefik",
		"instant": true,
		"range": false
		}
		]
		},
		{
		"id": 5,
		"type": "stat",
		"title": "world-cp",
		"description": "Green if any world-cp instance logged at least 1 line in the last 60s. Aggregated across world_id labels per Promtail drop-rules.",
		"gridPos": { "x": 19, "y": 0, "w": 5, "h": 4 },
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"thresholds": {
		"mode": "absolute",
		"steps": [
		{ "color": "red", "value": null },
		{ "color": "green", "value": 1 }
		]
		},
		"mappings": [
		{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
		{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
		],
		"unit": "short",
		"color": { "mode": "thresholds" }
		}
		},
		"options": {
		"reduceOptions": { "calcs": ["lastNotNull"] },
		"orientation": "auto",
		"textMode": "auto",
		"colorMode": "background",
		"graphMode": "none",
		"justifyMode": "center"
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "sum(count_over_time({service=\"world-cp\"}[1m]))",
		"legendFormat": "world-cp",
		"instant": true,
		"range": false
		}
		]
		},
		{
		"id": 6,
		"type": "timeseries",
		"title": "Aggregate success rate",
		"description": "Total 2xx/3xx log lines per second across all services. Proxy for overall throughput.",
		"gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 },
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "none" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "sum(rate({job=~\".+\"} \|~ \"(?:200\|201\|204\|301\|302)\" [1m]))",
		"legendFormat": "2xx/3xx rate",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 7,
		"type": "timeseries",
		"title": "Aggregate error rate",
		"description": "Total error/panic/fatal log lines per second across all services. Spikes indicate incidents.",
		"gridPos": { "x": 12, "y": 4, "w": 12, "h": 8 },
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": {
		"mode": "fixed",
		"fixedColor": "red"
		},
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "none" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "sum(rate({job=~\".+\"} \|~ \"(?i)error\|panic\|fatal\" [1m]))",
		"legendFormat": "error/panic/fatal rate",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 8,
		"type": "timeseries",
		"title": "World-dispatch activity (top 10 worlds)",
		"description": "Dispatch log lines per 5m per world, filtered by the world_id dropdown. world_id is a JSON field (not a Loki label); extracted via json parser. Select 'All' to see all worlds; select a specific world_id to drill down.",
		"gridPos": { "x": 0, "y": 12, "w": 24, "h": 8 },
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"unit": "short",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "topk(10, sum by (world_id) (\n count_over_time(\n {service=\"host-cp\"}\n \|~ \"dispatch\"\n \| json\n \| __error__ = \"\"\n \| world_id =~ \"${world_id}\"\n [5m]\n )\n))",
		"legendFormat": "world {{world_id}}",
		"instant": false,
		"range": true
		}
		]
		}
		]
		}

		request-rate.json: \|
		{
		"uid": "request-rate",
		"title": "Request Rate / Error Rate (Log-Derived)",
		"description": "Per-service request rate + error rate derived from Loki logs. Phase B-only — kube-prometheus-stack will replace these with native HTTP metrics in Phase C.",
		"tags": ["olam", "rate", "phase-b"],
		"timezone": "browser",
		"refresh": "30s",
		"schemaVersion": 39,
		"version": 1,
		"time": {
		"from": "now-1h",
		"to": "now"
		},
		"timepicker": {},
		"templating": {
		"list": [
		{
		"name": "world_id",
		"label": "World",
		"type": "query",
		"datasource": { "type": "loki", "uid": "loki" },
		"query": {
		"qryType": 2,
		"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} \| json \| __error__ = \"\" \| world_id != \"\" [5m]))",
		"step": ""
		},
		"refresh": 2,
		"sort": 1,
		"multi": false,
		"includeAll": true,
		"allValue": ".+",
		"current": { "selected": false, "text": "All", "value": "$__all" }
		},
		{
		"name": "service",
		"label": "Service",
		"type": "query",
		"datasource": { "type": "loki", "uid": "loki" },
		"query": { "qryType": 1, "label": "service", "stream": "{job=~\".+\"}" },
		"refresh": 2,
		"sort": 1,
		"multi": true,
		"includeAll": true,
		"allValue": ".+",
		"current": { "selected": false, "text": "All", "value": "$__all" }
		}
		]
		},
		"annotations": {
		"list": []
		},
		"panels": [
		{
		"id": 1,
		"type": "timeseries",
		"title": "Request rate by service",
		"description": "Log line rate per second per service. Uses log volume as a proxy for request rate — appropriate for Phase B before Prometheus HTTP metrics land in Phase C.",
		"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "sum by (service) (rate({service=~\"${service:regex}\"}[1m]))",
		"legendFormat": "{{service}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 2,
		"type": "timeseries",
		"title": "Error rate by service",
		"description": "Log lines matching error\|panic\|fatal per second per service. Spikes here warrant drill-down in the Ad-hoc LogQL panel below.",
		"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"unit": "reqps",
		"color": { "mode": "palette-classic" },
		"custom": {
		"lineWidth": 2,
		"fillOpacity": 10,
		"showPoints": "never"
		}
		}
		},
		"options": {
		"tooltip": { "mode": "multi", "sort": "desc" },
		"legend": { "displayMode": "list", "placement": "bottom" }
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "sum by (service) (rate({service=~\"${service:regex}\"} \|~ \"(?i)error\|panic\|fatal\" [1m]))",
		"legendFormat": "{{service}}",
		"instant": false,
		"range": true
		}
		]
		},
		{
		"id": 3,
		"type": "table",
		"title": "Top-5 endpoints (last 5m)",
		"description": "Top 5 request paths by volume, derived from Traefik JSON access logs. Only Traefik has access-log-grade request_path (per B1 Promtail JSON stage); other services don't extract this field.",
		"gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {
		"unit": "short",
		"color": { "mode": "palette-classic" }
		},
		"overrides": [
		{
		"matcher": { "id": "byName", "options": "Value" },
		"properties": [
		{ "id": "displayName", "value": "requests" }
		]
		}
		]
		},
		"options": {
		"showHeader": true,
		"footer": { "show": false }
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "topk(5, sum by (request_path) (count_over_time({service=\"traefik\"} \| json \| __error__ = \"\" \| request_path != \"\" [5m])))",
		"legendFormat": "",
		"instant": true,
		"range": false
		}
		],
		"transformations": [
		{ "id": "reduce", "options": { "reducers": ["sum"] } }
		]
		},
		{
		"id": 4,
		"type": "logs",
		"title": "Ad-hoc LogQL (edit me)",
		"description": "Operator escape hatch. Edit the query inline; use LogQL syntax. world_id filter via JSON pipeline because Loki doesn't promote world_id as a stream label.",
		"gridPos": { "x": 0, "y": 16, "w": 24, "h": 10 },
		"datasource": { "type": "loki", "uid": "loki" },
		"fieldConfig": {
		"defaults": {},
		"overrides": []
		},
		"options": {
		"showTime": true,
		"wrapLogMessage": false,
		"dedupStrategy": "exact",
		"showLabels": false,
		"showCommonLabels": false,
		"sortOrder": "Descending",
		"prettifyLogMessage": false,
		"enableLogDetails": true
		},
		"targets": [
		{
		"datasource": { "type": "loki", "uid": "loki" },
		"expr": "{service=~\"${service:regex}\"} \| json \| __error__ = \"\" \| world_id =~ \"${world_id}\"",
		"legendFormat": "",
		"instant": false,
		"range": true
		}
		]
		}
		]
		}

-50

host-cp/peripheral...anifests/90-prom-alert-cardinality.yaml

		# 90-prom-alert-cardinality.yaml — Phase C Task C2 cardinality alert rule.
		#
		# PrometheusRule CR: fires OlamActiveSeriesHigh when prometheus_tsdb_head_series
		# exceeds 80k (80% of the 100k active-series cap defined by P4).
		#
		# ruleSelector match: the Prometheus CR rendered by kube-prom-stack 85.2.0 uses
		# ruleSelector: matchLabels: release: "olam-prom"
		# (verified via `helm template ... \| grep -A3 ruleSelector`).
		# The label below MUST match or this rule is silently ignored by Prometheus.
		#
		# Alertmanager: enabled in kube-prom-stack-values.yaml from C2 onwards.
		# Receivers: not yet configured (C2 scope = rule landing; receiver config is C4+).
		# Alertmanager will fire the alert to its default null receiver until receivers
		# are wired — this is intentional. The alert is visible in the Prometheus UI
		# at /alerts regardless of receiver config.
		#
		# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C2
		# T1 (cardinality bomb) + P4 (<100k active series)
		---
		apiVersion: monitoring.coreos.com/v1
		kind: PrometheusRule
		metadata:
		name: olam-cardinality
		namespace: monitoring
		labels:
		app.kubernetes.io/name: olam-prometheus-rules
		app.kubernetes.io/managed-by: olam
		# REQUIRED: matches Prometheus CR's ruleSelector (release: "olam-prom").
		# Verified via helm template output, 2026-05-21.
		release: olam-prom
		spec:
		groups:
		- name: olam-cardinality
		interval: 30s
		rules:
		- alert: OlamActiveSeriesHigh
		expr: \|
		prometheus_tsdb_head_series > 80000
		for: 5m
		labels:
		severity: warning
		scope: cardinality
		annotations:
		summary: "Active series above 80k threshold (80% of 100k cap)"
		description: \|
		prometheus_tsdb_head_series is {{ $value \| humanize }} — within 20%
		of the 100k cardinality budget (P4). Investigate which service is
		emitting a new high-cardinality label, OR add a DROP rule to
		kube-prom-stack-values.yaml metricRelabelings for that ServiceMonitor.
		Runbook: docs/architecture/observability-cardinality.md (TBD — C4+)

-70

host-cp/peripheral...anifests/91-servicemonitor-host-cp.yaml

		# 91-servicemonitor-host-cp.yaml — Phase C Task C3 ServiceMonitor for host-cp.
		#
		# Registers host-cp's /metrics endpoint with Prometheus for scraping.
		#
		# NOTE: This manifest requires the ServiceMonitor CRD installed by
		# kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
		# apply-manifests.sh (which targets the Phase A ingress harness) and is
		# applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
		#
		# Namespace placement (CRITICAL — C2 dogfood lesson):
		# ServiceMonitors MUST live in the `monitoring` namespace to be discovered
		# by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
		# in any other namespace is silently ignored by default RBAC.
		#
		# Label compliance:
		# `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector
		# (verified via `helm template ... \| grep -A3 serviceMonitorSelector`).
		#
		# Target selector:
		# Matches the host-cp Service by its `app: olam-host-cp` label. Adjust if
		# the Service label differs in the target cluster (check
		# `kubectl get svc -n olam -l app=olam-host-cp`).
		#
		# metricRelabelings (layer-2 cardinality enforcement):
		# Mirrors the `*cardinality-labeldrop` YAML anchor from
		# kube-prom-stack-values.yaml. host-cp's /metrics is taxonomy-compliant
		# (only {service,route,method,status_code} labels), but the labeldrop rule
		# is present as defense-in-depth: if a future code change accidentally
		# emits a banned label (world_id etc.), this ServiceMonitor drops it before
		# ingest so the cardinality cap is never breached.
		#
		# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
		# T1 (cardinality bomb) + P4 (<100k active series)
		---
		apiVersion: monitoring.coreos.com/v1
		kind: ServiceMonitor
		metadata:
		name: olam-host-cp
		namespace: monitoring
		labels:
		app.kubernetes.io/name: olam-host-cp-monitor
		app.kubernetes.io/managed-by: olam
		# REQUIRED: matches Prometheus CR's serviceMonitorSelector.
		release: olam-prom
		spec:
		# Discover the host-cp Service in the olam namespace.
		namespaceSelector:
		matchNames:
		- olam
		selector:
		matchLabels:
		app: olam-host-cp
		endpoints:
		- port: http
		path: /metrics
		interval: 15s
		# Preserve the application-emitted `service` label. Without honorLabels,
		# Prometheus's target-label injection (where `service` = the k8s Service
		# name `olam-host-cp`) overrides the application's own `service=host-cp`
		# value, moving the app's value into `exported_service`. The C5 drill-in
		# dashboards filter on `service=host-cp`, so without honorLabels their
		# panels show empty data. Surfaced during 2026-05-21 operator dogfood —
		# see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3.
		honorLabels: true
		# Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
		# in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
		# even if the service accidentally emits them.
		metricRelabelings:
		- action: labeldrop
		regex: 'world_id\|trace_id\|user_id\|request_id\|operator_id'

-70

host-cp/peripheral...fests/92-servicemonitor-kg-service.yaml

		# 92-servicemonitor-kg-service.yaml — Phase C Task C3 ServiceMonitor for kg-service.
		#
		# Registers kg-service's /metrics endpoint with Prometheus for scraping.
		#
		# NOTE: This manifest requires the ServiceMonitor CRD installed by
		# kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
		# apply-manifests.sh (which targets the Phase A ingress harness) and is
		# applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
		#
		# Namespace placement (CRITICAL — C2 dogfood lesson):
		# ServiceMonitors MUST live in the `monitoring` namespace to be discovered
		# by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
		# in any other namespace is silently ignored by default RBAC.
		#
		# Label compliance:
		# `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector
		# (verified via `helm template ... \| grep -A3 serviceMonitorSelector`).
		#
		# Target selector:
		# Matches the kg-service Service by its `app: olam-kg-service` label. Adjust
		# if the Service label differs in the target cluster (check
		# `kubectl get svc -n olam -l app=olam-kg-service`).
		#
		# metricRelabelings (layer-2 cardinality enforcement):
		# Mirrors the `*cardinality-labeldrop` YAML anchor from
		# kube-prom-stack-values.yaml. kg-service's /metrics is taxonomy-compliant
		# (only {service,route,method,status_code} labels), but the labeldrop rule
		# is present as defense-in-depth: if a future code change accidentally
		# emits a banned label (world_id etc.), this ServiceMonitor drops it before
		# ingest so the cardinality cap is never breached.
		#
		# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
		# T1 (cardinality bomb) + P4 (<100k active series)
		---
		apiVersion: monitoring.coreos.com/v1
		kind: ServiceMonitor
		metadata:
		name: olam-kg-service
		namespace: monitoring
		labels:
		app.kubernetes.io/name: olam-kg-service-monitor
		app.kubernetes.io/managed-by: olam
		# REQUIRED: matches Prometheus CR's serviceMonitorSelector.
		release: olam-prom
		spec:
		# Discover the kg-service Service in the olam namespace.
		namespaceSelector:
		matchNames:
		- olam
		selector:
		matchLabels:
		app: olam-kg-service
		endpoints:
		- port: http
		path: /metrics
		interval: 15s
		# Preserve the application-emitted `service` label. Without honorLabels,
		# Prometheus's target-label injection (where `service` = the k8s Service
		# name `olam-kg-service`) overrides the application's own `service=kg-service`
		# value, moving the app's value into `exported_service`. The C5 drill-in
		# dashboards filter on `service=kg-service`, so without honorLabels their
		# panels show empty data. Surfaced during 2026-05-21 operator dogfood —
		# see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3.
		honorLabels: true
		# Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
		# in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
		# even if the service accidentally emits them.
		metricRelabelings:
		- action: labeldrop
		regex: 'world_id\|trace_id\|user_id\|request_id\|operator_id'

-87

host-cp/peripheral...s/93-servicemonitor-memory-service.yaml

		# 93-servicemonitor-memory-service.yaml — Phase C Task C3 closure ServiceMonitor.
		#
		# Registers memory-service's /metrics endpoint with Prometheus for scraping.
		# C3 originally shipped instrumentation for host-cp + kg-service (PR #787) but
		# DEFERRED memory-service because the third-party `agentmemory` Node CLI that
		# runs in k3s exposes no /metrics endpoint. This PR closes that deferral by
		# shipping a small Node HTTP front-door (packages/memory-service/src/metrics-proxy.mjs)
		# inside the container image: external traffic hits the proxy on :3111, the
		# proxy short-circuits /metrics + forwards everything else to agentmemory on
		# loopback :3110. End-state matches the host-cp/kg-service shape so the ServiceMonitor
		# pattern below is a near-clone of 91-servicemonitor-host-cp.yaml.
		#
		# NOTE: This manifest requires the ServiceMonitor CRD installed by
		# kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
		# apply-manifests.sh (which targets the Phase A ingress harness) and is
		# applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
		#
		# Namespace placement (CRITICAL — C2 dogfood lesson):
		# ServiceMonitors MUST live in the `monitoring` namespace to be discovered
		# by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
		# in any other namespace is silently ignored by default RBAC.
		#
		# Label compliance:
		# `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector.
		#
		# Target selector:
		# Matches the memory-service Service by its `app: olam-memory-service` label.
		# The Service is defined in packages/host-cp/k8s/manifests/memory-service/60-service.yaml
		# (port `http` -> targetPort 3111). The 50-traefik-ingressroute-agent-memory.yaml
		# IngressRoute references the same Service for /api/agent-memory/* traffic.
		#
		# Image rollout dependency:
		# The proxy lives inside the container image. Until the next release pipeline
		# refreshes ghcr.io/pleri/olam-memory-service with the post-C3-closure
		# Dockerfile (npm run refresh:manifest-digests), this ServiceMonitor will scrape
		# a target that responds 404 to /metrics. Prometheus tolerates that: a non-200
		# scrape is recorded as failed (the target shows DOWN, up=0, no samples are
		# ingested) and is retried every interval. When the new image lands, scraping
		# begins producing real samples without any cluster-side change.
		#
		# metricRelabelings (layer-2 cardinality enforcement):
		# Mirrors the `*cardinality-labeldrop` YAML anchor from
		# kube-prom-stack-values.yaml. memory-service's /metrics is taxonomy-compliant
		# (only {service,route,method,status_code} labels), but the labeldrop rule
		# is present as defense-in-depth: if a future code change accidentally
		# emits a banned label (world_id etc.), this ServiceMonitor drops it before
		# ingest so the cardinality cap is never breached.
		#
		# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
		# T1 (cardinality bomb) + P4 (<100k active series).
		---
		apiVersion: monitoring.coreos.com/v1
		kind: ServiceMonitor
		metadata:
		name: olam-memory-service
		namespace: monitoring
		labels:
		app.kubernetes.io/name: olam-memory-service-monitor
		app.kubernetes.io/managed-by: olam
		# REQUIRED: matches Prometheus CR's serviceMonitorSelector.
		release: olam-prom
		spec:
		# Discover the memory-service Service in the olam namespace.
		namespaceSelector:
		matchNames:
		- olam
		selector:
		matchLabels:
		app: olam-memory-service
		endpoints:
		- port: http
		path: /metrics
		interval: 15s
		# Preserve the application-emitted `service` label. Without honorLabels,
		# Prometheus's target-label injection (where `service` = the k8s Service
		# name `olam-memory-service`) overrides the application's own
		# `service=memory-service` value, moving the app's value into
		# `exported_service`. The C5 drill-in dashboards filter on
		# `service=memory-service`, so without honorLabels their panels show
		# empty data. Same lesson as the host-cp/kg-service ServiceMonitors —
		# see docs/incidents/2026-05-21-phase-c-dogfood.md finding #3.
		honorLabels: true
		# Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
		# in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
		# even if the service accidentally emits them.
		metricRelabelings:
		- action: labeldrop
		regex: 'world_id\|trace_id\|user_id\|request_id\|operator_id'

-108

host-cp/peripheral.../manifests/95-prom-recording-rules.yaml

		# 95-prom-recording-rules.yaml — Phase C Task C4
		#
		# Naming convention: olam:<metric>:<aggregation>
		#
		# olam — project namespace prefix (all project recording rules share this)
		# <metric> — the base Prometheus metric being aggregated (without _bucket/_total suffix
		# when the aggregation already implies the source type)
		# <aggregation> — describes what was computed + the grouping dimensions, e.g.
		# p95_by_service_route, rate5m_by_service, ratio_by_service_route
		#
		# Modeled on the community convention from
		# https://prometheus.io/docs/practices/rules/#naming — <level>:<metric>:<ops>.
		# The <aggregation> suffix encodes BOTH the operation (p95, rate5m, ratio) and
		# the grouping dimensions (_by_service, _by_service_route) so dashboard panels
		# can select the pre-computed series without further aggregation.
		#
		# Source metrics (provided by C3 — host-cp + kg-service ServiceMonitors):
		# http_request_duration_seconds_bucket{service, route, method, status_code, le}
		# http_requests_total{service, route, method, status_code}
		#
		# rule group interval: 30s — twice the 15s scrape interval (15s × 2). Balances
		# freshness vs evaluation CPU; at 30s each window is re-evaluated twice per
		# minute, keeping percentiles and rates responsive without hammering the TSDB.
		#
		# NOTE: recording rules intentionally reference NO banned labels
		# (world_id, trace_id, user_id, request_id, operator_id). C2's labeldrop at
		# scrape time strips them before ingest; even if a metric slipped through,
		# referencing them here would suppress results. Defense-in-depth: don't type
		# them at all.
		#
		# Applied by: scripts/e2e/prom-no-double-grafana.sh (C4 assertion block)
		# Skipped by: scripts/test-ingress-integration/apply-manifests.sh
		# (9[0-9]-prom-* glob) — requires kube-prom-stack CRDs to be present.
		# PrometheusRule: pre-computes latency percentiles, request rates and 5xx
		# error rates/ratios from the HTTP source metrics, grouped by service and
		# (where named) route.
		apiVersion: monitoring.coreos.com/v1
		kind: PrometheusRule
		metadata:
		  name: olam-recording-rules
		  namespace: monitoring
		  labels:
		    app.kubernetes.io/name: olam-prometheus-rules
		    app.kubernetes.io/managed-by: olam
		    release: olam-prom # must match kube-prom-stack ruleSelector (verified C2)
		spec:
		  groups:
		    - name: olam-http-aggregations
		      interval: 30s
		      rules:
		        # ============================================================
		        # Latency percentiles per service+route — Phase C Task C4
		        # Source: http_request_duration_seconds_bucket (C3)
		        # ============================================================
		        - record: olam:http_request_duration_seconds:p50_by_service_route
		          expr: |
		            histogram_quantile(0.50, sum by (service, route, le) (
		              rate(http_request_duration_seconds_bucket[5m])
		            ))

		        - record: olam:http_request_duration_seconds:p95_by_service_route
		          expr: |
		            histogram_quantile(0.95, sum by (service, route, le) (
		              rate(http_request_duration_seconds_bucket[5m])
		            ))

		        - record: olam:http_request_duration_seconds:p99_by_service_route
		          expr: |
		            histogram_quantile(0.99, sum by (service, route, le) (
		              rate(http_request_duration_seconds_bucket[5m])
		            ))

		        # Aggregate p95 across all routes (per-service summary)
		        - record: olam:http_request_duration_seconds:p95_by_service
		          expr: |
		            histogram_quantile(0.95, sum by (service, le) (
		              rate(http_request_duration_seconds_bucket[5m])
		            ))

		        # ============================================================
		        # Request rate per service+route
		        # Source: http_requests_total (C3)
		        # ============================================================
		        - record: olam:http_requests:rate5m_by_service_route
		          expr: |
		            sum by (service, route) (rate(http_requests_total[5m]))

		        # Aggregate request rate per service
		        - record: olam:http_requests:rate5m_by_service
		          expr: |
		            sum by (service) (rate(http_requests_total[5m]))

		        # ============================================================
		        # Error rate (status_code >= 500) per service+route
		        # 4xx are client errors and are intentionally excluded from
		        # the error ratio — only server-side failures count.
		        # ============================================================
		        - record: olam:http_errors:rate5m_by_service_route
		          expr: |
		            sum by (service, route) (
		              rate(http_requests_total{status_code=~"5.."}[5m])
		            )

		        # Error ratio (errors / total) per service+route.
		        # Returns NaN when total rate is 0 (no traffic) — dashboards
		        # should handle NaN as "no data" rather than "0% error rate".
		        - record: olam:http_errors:ratio_by_service_route
		          expr: |
		            sum by (service, route) (rate(http_requests_total{status_code=~"5.."}[5m]))
		            /
		            sum by (service, route) (rate(http_requests_total[5m]))

-195

host-cp/peripheral...ests/96-kyverno-cardinality-mutate.yaml

		# 96-kyverno-cardinality-mutate.yaml — Phase C C8 follow-up.
		#
		# Closes codex's C2 concern: per-ServiceMonitor metricRelabelings is
		# "policy by convention". A third-party ServiceMonitor or PodMonitor that
		# olam doesn't author can bypass the labeldrop and reintroduce the
		# cardinality bomb (T1). YAML anchors in kube-prom-stack-values.yaml keep
		# Olam-owned manifests DRY but don't make the cluster safe.
		#
		# This ClusterPolicy mutates EVERY incoming ServiceMonitor and PodMonitor
		# at admission time — regardless of who created it (chart, kubectl, operator,
		# CI, GitOps) — to ensure the cardinality labeldrop rule is present on
		# every endpoint. Once persisted, the prometheus-operator renders the
		# relabel into Prometheus's scrape config.
		#
		# Why mutate-only (not validate):
		# Validate would block a chart install or operator action mid-stride
		# if a third-party ServiceMonitor lacks the rule. Mutate is the better
		# posture: silently ensure the rule is present without breaking
		# legitimate installs. Defense-in-depth still lives in TWO layers:
		# (a) admission-time mutation (this policy)
		# (b) per-ServiceMonitor metricRelabelings in
		# kube-prom-stack-values.yaml + 9x-servicemonitor-*.yaml.
		#
		# Idempotency contract:
		# Mutation must NOT add a duplicate labeldrop entry. Achieved by
		# two-rule split per kind, each with a precondition that the labeldrop
		# is currently ABSENT. Once present, neither rule fires:
		# - Rule A (handle absent/empty case): preconditions:
		# metricRelabelings is null/missing OR empty array.
		# JSON patch: `add /spec/endpoints/{i}/metricRelabelings` with
		# a single-element array containing our rule.
		# - Rule B (handle existing-but-no-labeldrop case): preconditions:
		# metricRelabelings is a non-empty array AND no entry has
		# `action: labeldrop` with `regex` mentioning `world_id`.
		# JSON patch: `add /spec/endpoints/{i}/metricRelabelings/-`
		# appending our rule.
		#
		# Verified behavior (kyverno-cardinality-mutate.sh asserts):
		# - Bare ServiceMonitor (no metricRelabelings) → Rule A injects
		# - ServiceMonitor with metricRelabelings: [] → Rule A injects (replaces empty)
		# - ServiceMonitor with unrelated metricRelabelings entries → Rule B appends
		# - ServiceMonitor with matching labeldrop already present → NEITHER rule fires (idempotent)
		# - Mixed: some endpoints lack it, others have it → only the lacking endpoints are mutated
		#
		# Background scan: OFF (background: false). Existing ServiceMonitors at
		# install time are NOT auto-mutated. Re-apply them to trigger admission,
		# or rely on the C2 per-ServiceMonitor metricRelabelings as the failsafe.
		#
		# failurePolicy: Ignore. Kyverno webhook timeout / pod outage MUST NOT
		# block ServiceMonitor admission — the C2 layer-2 rules still protect
		# Olam-owned monitors. Trade-off accepted: during Kyverno downtime, a
		# brand-new third-party ServiceMonitor could land without the labeldrop.
		# The 80k active-series PrometheusRule alert (Phase C C2,
		# 90-prom-alert-cardinality.yaml) is the runtime detector that fires
		# if this gap is exploited.
		#
		# Refs:
		# - docs/plans/k3s-ingress-observability/phase-c-tasks.md — C8
		# - codex review on PR #783 ("policy by convention" finding)
		# - https://kyverno.io/docs/writing-policies/mutate/
		# - https://kyverno.io/docs/writing-policies/mutate/#foreach
		---
		# ClusterPolicy: at admission time, adds the banned-label `labeldrop`
		# relabeling to every ServiceMonitor endpoint / PodMonitor podMetricsEndpoint
		# that does not already carry one. Four rules: {ServiceMonitor, PodMonitor} ×
		# {metricRelabelings absent-or-empty, metricRelabelings present without it}.
		apiVersion: kyverno.io/v1
		kind: ClusterPolicy
		metadata:
		  name: enforce-cardinality-labeldrop
		  labels:
		    app.kubernetes.io/part-of: olam
		    olam.io/phase: c-followup
		  annotations:
		    policies.kyverno.io/title: "Cluster-wide cardinality labeldrop enforcement"
		    policies.kyverno.io/category: "Observability"
		    policies.kyverno.io/severity: high
		    policies.kyverno.io/subject: "ServiceMonitor, PodMonitor"
		    policies.kyverno.io/description: >-
		      Ensures every ServiceMonitor and PodMonitor carries a metricRelabelings
		      labeldrop rule for high-cardinality labels (world_id, trace_id, user_id,
		      request_id, operator_id) on every endpoint. Closes the "third-party chart
		      bypasses C2 labeldrop" gap surfaced during PR #783 review.
		spec:
		  background: false
		  failurePolicy: Ignore
		  mutateExistingOnPolicyUpdate: false

		  rules:
		    # ---------------------------------------------------------------------
		    # ServiceMonitor — Rule A: metricRelabelings absent or empty
		    # ---------------------------------------------------------------------
		    - name: inject-labeldrop-sm-absent
		      match:
		        any:
		          - resources:
		              kinds:
		                - monitoring.coreos.com/v1/ServiceMonitor
		      mutate:
		        foreach:
		          - list: "request.object.spec.endpoints"
		            preconditions:
		              all:
		                # not_null() substitutes `[]` when the field is null/missing,
		                # so length() yields 0 both when the field is absent AND when
		                # it is an empty array. (length() is never handed a null.)
		                - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
		                  operator: Equals
		                  value: 0
		            # `add` on the member itself creates it, or replaces an empty array.
		            patchesJson6902: |-
		              - op: add
		                path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings"
		                value:
		                  - action: labeldrop
		                    regex: "world_id|trace_id|user_id|request_id|operator_id"

		    # ---------------------------------------------------------------------
		    # ServiceMonitor — Rule B: metricRelabelings has entries, but no
		    # matching labeldrop for our banned-label regex.
		    #
		    # We test `contains(regex, 'world_id')` rather than equality so that
		    # operators who include additional banned labels in their own regex
		    # don't trigger duplicate injection. This is the idempotency hinge.
		    # ---------------------------------------------------------------------
		    - name: inject-labeldrop-sm-append
		      match:
		        any:
		          - resources:
		              kinds:
		                - monitoring.coreos.com/v1/ServiceMonitor
		      mutate:
		        foreach:
		          - list: "request.object.spec.endpoints"
		            preconditions:
		              all:
		                - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
		                  operator: GreaterThan
		                  value: 0
		                - key: >-
		                    {{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }}
		                  operator: Equals
		                  value: 0
		            # Path ending in `/-` appends one element to the existing array.
		            patchesJson6902: |-
		              - op: add
		                path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings/-"
		                value:
		                  action: labeldrop
		                  regex: "world_id|trace_id|user_id|request_id|operator_id"

		    # ---------------------------------------------------------------------
		    # PodMonitor — Rule A: podMetricsEndpoints[*].metricRelabelings absent
		    # or empty (same precondition as the ServiceMonitor Rule A).
		    # ---------------------------------------------------------------------
		    - name: inject-labeldrop-pm-absent
		      match:
		        any:
		          - resources:
		              kinds:
		                - monitoring.coreos.com/v1/PodMonitor
		      mutate:
		        foreach:
		          - list: "request.object.spec.podMetricsEndpoints"
		            preconditions:
		              all:
		                - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
		                  operator: Equals
		                  value: 0
		            patchesJson6902: |-
		              - op: add
		                path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings"
		                value:
		                  - action: labeldrop
		                    regex: "world_id|trace_id|user_id|request_id|operator_id"

		    # ---------------------------------------------------------------------
		    # PodMonitor — Rule B: metricRelabelings exists, no labeldrop
		    # ---------------------------------------------------------------------
		    - name: inject-labeldrop-pm-append
		      match:
		        any:
		          - resources:
		              kinds:
		                - monitoring.coreos.com/v1/PodMonitor
		      mutate:
		        foreach:
		          - list: "request.object.spec.podMetricsEndpoints"
		            preconditions:
		              all:
		                - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
		                  operator: GreaterThan
		                  value: 0
		                - key: >-
		                    {{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }}
		                  operator: Equals
		                  value: 0
		            patchesJson6902: |-
		              - op: add
		                path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings/-"
		                value:
		                  action: labeldrop
		                  regex: "world_id|trace_id|user_id|request_id|operator_id"

-148

host-cp/recovery/engine.mjs

		// Recovery engine — the single entry point for bounded auto-attempts.
		//
		// Key invariants:
		// 1. ONE attempt per (worldId, failureKind) pair. The ledger enforces
		// idempotency: a second call with the same key runs no steps; it appends
		// and returns a NEW entry with outcome='escalated' that carries the
		// prior entry's scenario name.
		// 2. Concurrent calls for the same (worldId, failureKind) key fire only
		// ONE attempt. An in-flight Map holds the running Promise; concurrent
		// callers await the same Promise.
		// 3. Steps execute in order. First failing step short-circuits to
		// outcome='failed'; subsequent steps are NOT executed.
		// 4. All attempts (success, failed, escalated) are written to the ledger.
		//
		// The engine is async and pure-functional with respect to the host-stream:
		// callers (server.mjs) are responsible for emitting the recovery.* events
		// AFTER receiving the returned RecoveryLedgerEntry. The engine does not
		// broadcast directly, keeping it testable without a hostStream fixture.

		import { findScenarioForKind } from './scenarios.mjs';
		import { appendLedgerEntry, findPriorEntry } from './ledger.mjs';
		import { runStep } from './step-runners.mjs';
		import { DEFAULT_LEDGER_PATH } from './ledger.mjs';

		/**
		* @typedef {import('./ledger.mjs').RecoveryLedgerEntry} RecoveryLedgerEntry
		* @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
		*/

		// Recoveries that are currently running. Key: `${worldId}::${failureKind ?? 'null'}`;
		// value: the Promise every concurrent caller for that key shares.
		/** @type {Map<string, Promise<RecoveryLedgerEntry>>} */
		const _inFlight = new Map();

		/**
		 * Run (or join) the single bounded recovery attempt for a world + failure kind.
		 *
		 * If an attempt for the same (worldId, failureKind) key is already running,
		 * its Promise is returned as-is and no second attempt is started. In that
		 * case the `evidence` and `opts` of the later caller are not used.
		 *
		 * @param {string} worldId
		 * @param {object} [evidence] — WorldStartupEvidence, optional
		 * @param {FailureKindOrNull} [failureKind] — classified bucket, or null for non-FSM triggers
		 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} [opts]
		 * @returns {Promise<RecoveryLedgerEntry>}
		 */
		export function attemptRecovery(worldId, evidence, failureKind = null, opts = {}) {
		  const key = `${worldId}::${failureKind ?? 'null'}`;

		  let tracked = _inFlight.get(key);
		  if (!tracked) {
		    // Register the promise that already includes the cleanup, so the map
		    // entry is removed once the attempt settles (fulfilled or rejected).
		    tracked = _attempt(worldId, evidence, failureKind, opts).finally(() => {
		      _inFlight.delete(key);
		    });
		    _inFlight.set(key, tracked);
		  }
		  return tracked;
		}

		/**
		 * Internal: run one recovery attempt and resolve with its ledger entry.
		 *
		 * Flow: (1) idempotency check against the ledger, (2) scenario lookup,
		 * (3) run the scenario's steps in order, stopping at the first step that
		 * throws, (4) append the resulting entry to the ledger.
		 *
		 * Does not reject on step errors or ledger I/O errors:
		 *   - a throwing step yields outcome='failed' with `errorReason`;
		 *   - a failed ledger READ means "first attempt" cannot be proven, so no
		 *     steps are run and the entry is outcome='failed';
		 *   - a failed ledger WRITE is reported through `log` and the entry is
		 *     still returned, so the caller keeps the outcome of steps that ran.
		 *
		 * @param {string} worldId
		 * @param {object} [evidence]
		 * @param {FailureKindOrNull} failureKind
		 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} opts
		 * @returns {Promise<RecoveryLedgerEntry>}
		 */
		async function _attempt(worldId, evidence, failureKind, opts) {
		  const { ledgerPath = DEFAULT_LEDGER_PATH, log = (msg) => console.warn(`[recovery] ${msg}`) } = opts;
		  const startedAt = Date.now();

		  // Best-effort ledger append: a write failure must not turn an attempt
		  // that already ran into a rejected promise.
		  /** @param {RecoveryLedgerEntry} entry */
		  const persist = async (entry) => {
		    try {
		      await appendLedgerEntry(entry, ledgerPath);
		    } catch (err) {
		      log(`recovery: ledger write failed — ${err?.message ?? String(err)} (worldId=${worldId}, kind=${failureKind})`);
		    }
		    return entry;
		  };

		  // Idempotency check: if a prior entry exists for this key, do not run any
		  // steps; record and return a new entry with outcome='escalated'.
		  /** @type {RecoveryLedgerEntry | undefined} */
		  let prior;
		  try {
		    prior = await findPriorEntry(worldId, failureKind, ledgerPath);
		  } catch (err) {
		    // Without a readable ledger the one-attempt guarantee cannot be
		    // checked, so refuse to run steps rather than risk a repeat attempt.
		    const readFailure = `ledger read failed: ${err?.message ?? String(err)}`;
		    log(`recovery: ${readFailure} — no steps run (worldId=${worldId}, kind=${failureKind})`);
		    return persist(/** @type {RecoveryLedgerEntry} */ ({
		      worldId,
		      failureKind: failureKind ?? null,
		      scenario: findScenarioForKind(failureKind)?.name ?? 'unmatched',
		      stepsRun: [],
		      startedAt,
		      endedAt: Date.now(),
		      outcome: 'failed',
		      errorReason: readFailure,
		    }));
		  }

		  if (prior !== undefined) {
		    const escalated = /** @type {RecoveryLedgerEntry} */ ({
		      worldId,
		      failureKind: failureKind ?? null,
		      scenario: prior.scenario,
		      stepsRun: [],
		      startedAt,
		      endedAt: Date.now(),
		      outcome: 'escalated',
		      errorReason: `prior attempt already recorded (outcome=${prior.outcome})`,
		    });
		    await persist(escalated);
		    log(`recovery idempotency: escalated (worldId=${worldId}, kind=${failureKind})`);
		    return escalated;
		  }

		  // Find the scenario.
		  const scenario = findScenarioForKind(failureKind);
		  if (!scenario) {
		    const unmatched = /** @type {RecoveryLedgerEntry} */ ({
		      worldId,
		      failureKind: failureKind ?? null,
		      scenario: 'unmatched',
		      stepsRun: [],
		      startedAt,
		      endedAt: Date.now(),
		      outcome: 'failed',
		      errorReason: 'no scenario matched',
		    });
		    await persist(unmatched);
		    log(`recovery: no scenario for kind=${failureKind} (worldId=${worldId})`);
		    return unmatched;
		  }

		  log(`recovery: starting scenario="${scenario.name}" for worldId=${worldId}`);

		  // Execute steps in order, short-circuit on first failure. A step is
		  // recorded in stepsRun BEFORE it runs, so the failing step is included.
		  /** @type {import('./recipes.mjs').RecoveryStep[]} */
		  const stepsRun = [];
		  /** @type {string | undefined} */
		  let errorReason;
		  /** @type {'success' | 'failed'} */
		  let outcome = 'success';

		  for (const step of scenario.recipe.steps) {
		    stepsRun.push(step);
		    try {
		      await runStep(step, { worldId, evidence, log });
		    } catch (err) {
		      outcome = 'failed';
		      errorReason = `step "${step.kind}" threw: ${err?.message ?? String(err)}`;
		      log(`recovery: step failed — ${errorReason}`);
		      break;
		    }
		  }

		  const entry = /** @type {RecoveryLedgerEntry} */ ({
		    worldId,
		    failureKind: failureKind ?? null,
		    scenario: scenario.name,
		    stepsRun,
		    startedAt,
		    endedAt: Date.now(),
		    outcome,
		    // errorReason is only present on failure, never as `undefined`.
		    ...(errorReason !== undefined ? { errorReason } : {}),
		  });

		  await persist(entry);
		  log(`recovery: scenario="${scenario.name}" outcome=${outcome} (worldId=${worldId})`);
		  return entry;
		}

		/**
		 * Test-only helper: forget every in-flight recovery so one test's pending
		 * attempts cannot be joined by the next test. It only empties the map; the
		 * promises that were stored in it are not cancelled.
		 *
		 * @returns {void}
		 */
		export function _clearInFlight() {
		  _inFlight.clear();
		}

-16

host-cp/recovery/index.mjs

		// Recovery module barrel export.
		//
		// Public surface:
		// - attemptRecovery — the engine entry point for callers (server.mjs)
		// - FAILURE_SCENARIOS — the 7 named scenarios
		// - findScenarioForKind — scenario lookup by failureKind
		// - appendLedgerEntry / readAllLedgerEntries / findPriorEntry — ledger I/O
		// - DEFAULT_LEDGER_PATH — ledger file path used when callers pass none
		// - runStep — executes a single recovery step
		// - setStepRunnerSeams — test seam injection for step runners
		//
		// Internal:
		// - _clearInFlight — test helper; not intended for production use

		export { attemptRecovery, _clearInFlight } from './engine.mjs';
		export { FAILURE_SCENARIOS, findScenarioForKind } from './scenarios.mjs';
		export { appendLedgerEntry, readAllLedgerEntries, findPriorEntry, DEFAULT_LEDGER_PATH } from './ledger.mjs';
		export { runStep, setStepRunnerSeams } from './step-runners.mjs';

-105

host-cp/recovery/ledger.mjs

		// RecoveryLedger — append-only NDJSON persistence for recovery attempts.
		//
		// Each attempt writes one JSON line to the ledger file. The file grows
		// monotonically; entries are never updated in-place. This keeps the
		// ledger auditable and safe to tail/parse with `jq` while the process
		// is running.
		//
		// Default path: ~/.olam/logs/recovery-ledger.ndjson
		// Override: set OLAM_RECOVERY_LEDGER_PATH (useful in tests — point at a
		// tmp file to isolate test runs from the real operator ledger).

		import { open, mkdir, access } from 'node:fs/promises';
		import { join, dirname } from 'node:path';
		import { homedir } from 'node:os';
		import { createReadStream } from 'node:fs';
		import { createInterface } from 'node:readline';
		import { redactSensitive } from '../observability/redactor.mjs';

		// Resolved once, at module load. The environment override wins whenever the
		// variable is set at all (`??` only falls back on null/undefined, so an
		// empty-string value is used as-is); otherwise the per-user default applies.
		const envLedgerPath = process.env.OLAM_RECOVERY_LEDGER_PATH;
		export const DEFAULT_LEDGER_PATH =
		  envLedgerPath ?? join(homedir(), '.olam', 'logs', 'recovery-ledger.ndjson');

		/**
		* @typedef {object} RecoveryLedgerEntry
		* @property {string} worldId
		* @property {string | null} failureKind — WorldStartupFailureKind or null (non-FSM trigger)
		* @property {string} scenario — kebab-case scenario name, or 'unmatched'
		* @property {import('./recipes.mjs').RecoveryStep[]} stepsRun — steps actually executed (may be partial on failure)
		* @property {number} startedAt — epoch ms
		* @property {number} endedAt — epoch ms
		* @property {'success' | 'failed' | 'escalated'} outcome
		* @property {string} [errorReason] — set on failed/escalated outcomes
		*/

		/**
		 * Write one entry to the end of the ledger as a single NDJSON line.
		 *
		 * The ledger's parent directory is created if missing. The entry is passed
		 * through `redactSensitive` before being serialized, and the file handle is
		 * closed whether or not the write succeeds.
		 *
		 * @param {RecoveryLedgerEntry} entry
		 * @param {string} [ledgerPath]
		 * @returns {Promise<void>}
		 */
		export async function appendLedgerEntry(entry, ledgerPath = DEFAULT_LEDGER_PATH) {
		  const parentDir = dirname(ledgerPath);
		  await mkdir(parentDir, { recursive: true });

		  // 'a' = append mode: existing lines are never rewritten.
		  const handle = await open(ledgerPath, 'a');
		  try {
		    const line = `${JSON.stringify(redactSensitive(entry))}\n`;
		    await handle.write(line);
		  } finally {
		    await handle.close();
		  }
		}

		/**
		 * Read all entries from the ledger (in append order).
		 *
		 * Lines that cannot be used as an entry are skipped rather than failing the
		 * whole read: blank lines, lines that are not valid JSON, and lines whose
		 * JSON value is not a plain object (`null`, numbers, strings, booleans,
		 * arrays). Returning only objects means callers can read `entry.worldId`
		 * without a null check.
		 *
		 * Resolves to an empty array when the ledger file is not accessible
		 * (e.g. it has not been written yet).
		 *
		 * @param {string} [ledgerPath]
		 * @returns {Promise<RecoveryLedgerEntry[]>}
		 */
		export async function readAllLedgerEntries(ledgerPath = DEFAULT_LEDGER_PATH) {
		  /** @type {RecoveryLedgerEntry[]} */
		  const entries = [];

		  // Check existence before streaming — createReadStream emits ENOENT as an
		  // error event (not a synchronous throw), which propagates through the
		  // `for await` loop and would reject the caller. An explicit access check
		  // keeps the "not yet written" path simple.
		  try {
		    await access(ledgerPath);
		  } catch {
		    return entries; // File does not exist yet (or is not accessible).
		  }

		  const stream = createReadStream(ledgerPath, { encoding: 'utf8' });
		  // crlfDelay: Infinity makes '\r\n' count as a single line break.
		  const rl = createInterface({ input: stream, crlfDelay: Infinity });
		  for await (const line of rl) {
		    const trimmed = line.trim();
		    if (!trimmed) continue;

		    let parsed;
		    try {
		      parsed = JSON.parse(trimmed);
		    } catch {
		      continue; // Malformed line — skip and continue.
		    }

		    // `null`, `42`, `"x"` and `[...]` are valid JSON but not ledger entries.
		    const isEntryObject =
		      parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
		    if (isEntryObject) entries.push(parsed);
		  }
		  return entries;
		}

		/**
		 * Look up the newest ledger entry recorded for a (worldId, failureKind) pair.
		 *
		 * An undefined `failureKind` is treated as null, so it matches entries that
		 * were stored with `failureKind: null`. The whole ledger is read on each call.
		 *
		 * @param {string} worldId
		 * @param {string|null} failureKind
		 * @param {string} [ledgerPath]
		 * @returns {Promise<RecoveryLedgerEntry | undefined>} the most recent match,
		 *   or undefined when the pair has no entry
		 */
		export async function findPriorEntry(worldId, failureKind, ledgerPath = DEFAULT_LEDGER_PATH) {
		  const wantedKind = failureKind ?? null;
		  const entries = await readAllLedgerEntries(ledgerPath);

		  // Entries arrive oldest-first; search a reversed copy so the first hit is
		  // the most recently appended one.
		  return entries
		    .slice()
		    .reverse()
		    .find((entry) => entry.worldId === worldId && entry.failureKind === wantedKind);
		}

-46

host-cp/recovery/recipes.mjs

		// Recovery step types and recipe interface — the discriminated union of
		// all named steps that can appear in a RecoveryRecipe.
		//
		// Step runners for each kind live in step-runners.mjs. The engine in
		// engine.mjs iterates a recipe's steps array and dispatches each to the
		// appropriate runner.
		//
		// A RecoveryRecipe is an ordered list of steps. Steps execute in order;
		// the first failing step short-circuits to a 'failed' outcome.

		/**
		* @typedef {{ kind: 'NotifyOperator', message?: string }} NotifyOperatorStep
		* @typedef {{ kind: 'ResendTrustPrompt' }} ResendTrustPromptStep
		* @typedef {{ kind: 'WaitFor', durationMs: number }} WaitForStep
		* @typedef {{ kind: 'RestartTransport' }} RestartTransportStep
		* @typedef {{ kind: 'ResendDispatch' }} ResendDispatchStep
		* @typedef {{ kind: 'RestartWorker' }} RestartWorkerStep
		* @typedef {{ kind: 'RestartMcpServer', serverName: string }} RestartMcpServerStep
		* @typedef {{ kind: 'RetryHandshake', timeoutMs: number }} RetryHandshakeStep
		* @typedef {{ kind: 'ReadPluginErrors' }} ReadPluginErrorsStep
		* @typedef {{ kind: 'RestartPlugin', pluginName: string }} RestartPluginStep
		* @typedef {{ kind: 'RebaseBranch' }} RebaseBranchStep
		* @typedef {{ kind: 'CleanBuild' }} CleanBuildStep
		*
		* @typedef {| NotifyOperatorStep
		* | ResendTrustPromptStep
		* | WaitForStep
		* | RestartTransportStep
		* | ResendDispatchStep
		* | RestartWorkerStep
		* | RestartMcpServerStep
		* | RetryHandshakeStep
		* | ReadPluginErrorsStep
		* | RestartPluginStep
		* | RebaseBranchStep
		* | CleanBuildStep
		* } RecoveryStep
		*/

		/**
		* @typedef {object} RecoveryRecipe
		* @property {string} scenarioName — human-readable name of the scenario
		* @property {RecoveryStep[]} steps — ordered list of steps to execute
		*/

		export {};

-124

host-cp/recovery/scenarios.mjs

		// Recovery scenarios — named mappings from WorldStartupFailureKind (or a
		// special non-FSM signal) to a deterministic RecoveryRecipe.
		//
		// Order within each recipe is load-bearing: steps execute in sequence,
		// first failure short-circuits. Designed for ONE bounded auto-attempt;
		// callers MUST NOT retry a scenario — the engine's idempotency guard
		// enforces this at the (worldId, failureKind) level.
		//
		// The 'stale-branch' scenario has no failureKind (null) — it is triggered
		// by a non-FSM signal (e.g. CI indicating the branch is stale). The engine
		// accepts null as a valid key but treats it as a distinct bucket.

		/**
		* @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep
		* @typedef {import('./recipes.mjs').RecoveryRecipe} RecoveryRecipe
		* @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
		*/

		/**
		* @typedef {object} FailureScenario
		* @property {string} name — kebab-case identifier
		* @property {FailureKindOrNull} failureKind — the FSM bucket this scenario handles (null = non-FSM trigger)
		* @property {string} description — one-line human summary
		* @property {RecoveryRecipe} recipe
		*/

		/**
		 * The recovery scenarios, one per handled failure kind (plus one
		 * non-FSM scenario whose kind is null).
		 *
		 * Declared as [name, failureKind, description, steps] rows and expanded
		 * below; each scenario's `recipe.scenarioName` is always its own `name`.
		 * Step order inside a row is significant: steps run first-to-last.
		 * Only the outer array is frozen.
		 *
		 * @type {readonly FailureScenario[]}
		 */
		export const FAILURE_SCENARIOS = Object.freeze(
		  [
		    [
		      'trust-gate-stuck',
		      'TrustGateUnanswered',
		      'Agent reached TrustRequired but no trust approval arrived within the timeout.',
		      [
		        { kind: 'NotifyOperator', message: 'Trust gate unanswered — re-sending trust prompt.' },
		        { kind: 'ResendTrustPrompt' },
		        { kind: 'WaitFor', durationMs: 30_000 },
		      ],
		    ],
		    [
		      'prompt-misdelivery',
		      'PromptMisdelivery',
		      'Dispatch was sent but the agent never received it (transport mismatch).',
		      [{ kind: 'RestartTransport' }, { kind: 'ResendDispatch' }],
		    ],
		    [
		      'transport-dead',
		      'TransportDead',
		      'stdin/stdout/IPC channel never opened.',
		      [{ kind: 'RestartTransport' }, { kind: 'RestartWorker' }],
		    ],
		    [
		      'mcp-handshake-stall',
		      'McpHandshakeStall',
		      'MCP server connection initialized but never completed handshake.',
		      [
		        { kind: 'RestartMcpServer', serverName: 'default' },
		        { kind: 'RetryHandshake', timeoutMs: 15_000 },
		      ],
		    ],
		    [
		      'plugin-startup-failed',
		      'PluginStartupFailed',
		      'Plugin or skill source failed to load on boot.',
		      [
		        { kind: 'ReadPluginErrors' },
		        { kind: 'RestartPlugin', pluginName: 'default' },
		        { kind: 'ResendDispatch' },
		      ],
		    ],
		    [
		      'provider-process-gone',
		      'ProviderProcessGone',
		      'Agent (Claude Code) process exited before responding.',
		      [{ kind: 'RestartWorker' }],
		    ],
		    [
		      'stale-branch',
		      null, // non-FSM trigger: no failure kind
		      'Branch is stale relative to base — rebase + clean build required.',
		      [{ kind: 'RebaseBranch' }, { kind: 'CleanBuild' }],
		    ],
		  ].map(([name, failureKind, description, steps]) => ({
		    name,
		    failureKind,
		    description,
		    recipe: { scenarioName: name, steps },
		  })),
		);

		/**
		 * Return the first scenario whose `failureKind` is strictly equal to the
		 * given one. Pass null to look up the non-FSM scenario; `undefined` is not
		 * converted to null and therefore matches nothing.
		 *
		 * @param {FailureKindOrNull} failureKind
		 * @returns {FailureScenario | undefined} undefined when no scenario handles it
		 */
		export function findScenarioForKind(failureKind) {
		  for (const scenario of FAILURE_SCENARIOS) {
		    if (scenario.failureKind === failureKind) return scenario;
		  }
		  return undefined;
		}

-263

host-cp/recovery/step-runners.mjs

		// Step runners — one async function per RecoveryStep kind.
		//
		// FULLY IMPLEMENTED:
		// RestartMcpServer — kills the named MCP server process and waits for it to
		// restart by polling the health endpoint.
		// RetryHandshake — re-initiates the MCP handshake sequence with a timeout
		// derived from the step's timeoutMs field.
		//
		// STUB (TODO killshot-3-follow-up):
		// All other step kinds log intent and return success. The stubs are
		// intentionally not no-ops — they emit a console.warn so operators can see
		// which steps fired without actually changing system state.

		import { setTimeout as sleep } from 'node:timers/promises';

		/**
		* @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep
		*
		* @typedef {object} StepContext
		* @property {string} worldId
		* @property {object} [evidence] — WorldStartupEvidence bundle, may be undefined for non-FSM triggers
		* @property {(msg: string) => void} [log] — optional logger; defaults to console.warn
		*/

		/**
		 * Execute one recovery step.
		 *
		 * `RestartMcpServer` and `RetryHandshake` are delegated to their runners and
		 * this function settles with whatever they settle with. Every other known
		 * kind is a stub: it writes one `[stub] …` line through the logger and
		 * resolves without doing anything else (the `WaitFor` stub does not wait).
		 *
		 * @param {RecoveryStep} step
		 * @param {StepContext} ctx — `ctx.log` is used when present, else console.warn
		 * @returns {Promise<void>}
		 * @throws {Error} when `step.kind` is not a known step kind; the engine
		 *   catches this and short-circuits the recipe
		 */
		export async function runStep(step, ctx) {
		  const log = ctx.log ?? ((msg) => console.warn(`[recovery] ${msg}`));

		  // Set by each stub case; logged once after the switch.
		  /** @type {string} */
		  let stubNote;

		  switch (step.kind) {
		    // --- implemented runners ---
		    case 'RestartMcpServer':
		      return restartMcpServer(step.serverName, ctx, log);
		    case 'RetryHandshake':
		      return retryHandshake(step.timeoutMs, ctx, log);

		    // --- STUBS (TODO killshot-3-follow-up) ---
		    case 'NotifyOperator':
		      stubNote = `[stub] NotifyOperator: ${step.message ?? '(no message)'} — worldId=${ctx.worldId}`;
		      break;
		    case 'ResendTrustPrompt':
		      stubNote = `[stub] ResendTrustPrompt — worldId=${ctx.worldId}`;
		      break;
		    case 'WaitFor':
		      // The stub only reports the duration; it returns immediately.
		      stubNote = `[stub] WaitFor ${step.durationMs}ms — worldId=${ctx.worldId} (short-circuiting to 0ms in stub)`;
		      break;
		    case 'RestartTransport':
		      stubNote = `[stub] RestartTransport — worldId=${ctx.worldId}`;
		      break;
		    case 'ResendDispatch':
		      stubNote = `[stub] ResendDispatch — worldId=${ctx.worldId}`;
		      break;
		    case 'RestartWorker':
		      stubNote = `[stub] RestartWorker — worldId=${ctx.worldId}`;
		      break;
		    case 'ReadPluginErrors':
		      stubNote = `[stub] ReadPluginErrors — worldId=${ctx.worldId}`;
		      break;
		    case 'RestartPlugin':
		      stubNote = `[stub] RestartPlugin: ${step.pluginName} — worldId=${ctx.worldId}`;
		      break;
		    case 'RebaseBranch':
		      stubNote = `[stub] RebaseBranch — worldId=${ctx.worldId}`;
		      break;
		    case 'CleanBuild':
		      stubNote = `[stub] CleanBuild — worldId=${ctx.worldId}`;
		      break;

		    default: {
		      // Type-level exhaustiveness guard: a type checker flags this
		      // assignment if a new step kind is added without a case above.
		      /** @type {never} */
		      const unhandled = step;
		      void unhandled;
		      throw new Error(`runStep: unknown step kind "${/** @type {any} */ (step).kind}"`);
		    }
		  }

		  log(stubNote);
		}

		// ─── RestartMcpServer — fully implemented ────────────────────────────────────

		// Polling settings for the post-restart liveness check: the pause between
		// probes and the total time budget, both in ms. Declared with `let` because
		// they are overridable via setStepRunnerSeams for testing.
		let _mcpRestartPollMs = 500;
		let _mcpRestartTimeoutMs = 10_000;

		/**
		 * Restart the named MCP server and wait until it is reachable again.
		 *
		 * What this function itself does:
		 *   1. Awaits `_execRestartSignal(serverName, worldId, log)` once.
		 *   2. Awaits `_probeMcpHandshake(serverName, worldId, log)` repeatedly,
		 *      sleeping `_mcpRestartPollMs` after each falsy result, until a probe
		 *      is truthy or `_mcpRestartTimeoutMs` has elapsed.
		 *
		 * Background carried over from the original notes: MCP servers are child
		 * processes of the in-world container-cp, so host-cp cannot SIGTERM them
		 * directly. For the Killshot #3 scope the restart is simulated through the
		 * world's Docker exec channel and success is observed as the lifecycle
		 * `mcpHandshakeStatus` moving from 'pending' to 'ok'; real exec wiring is a
		 * tracked follow-up. (Both helpers are defined outside this block — verify
		 * the details there.)
		 *
		 * @param {string} serverName
		 * @param {StepContext} ctx
		 * @param {(msg: string) => void} log
		 * @returns {Promise<void>} resolves once a probe succeeds
		 * @throws {Error} when no probe succeeds before the timeout elapses
		 */
		async function restartMcpServer(serverName, ctx, log) {
		  log(`RestartMcpServer: restarting "${serverName}" for worldId=${ctx.worldId}`);

		  // Ask for the restart, then confirm it by probing.
		  await _execRestartSignal(serverName, ctx.worldId, log);

		  // The deadline is fixed after the restart signal returns. It is checked
		  // before each probe, so the last sleep may run slightly past it.
		  const deadline = Date.now() + _mcpRestartTimeoutMs;
		  for (let attempt = 1; Date.now() < deadline; attempt++) {
		    if (await _probeMcpHandshake(serverName, ctx.worldId, log)) {
		      log(`RestartMcpServer: "${serverName}" came back after ${attempt} probe(s)`);
		      return;
		    }
		    await sleep(_mcpRestartPollMs);
		  }

		  throw new Error(
		    `RestartMcpServer: "${serverName}" did not come back within ${_mcpRestartTimeoutMs}ms`,
		  );
		}

		// ─── RetryHandshake — fully implemented ──────────────────────────────────────

		/**
		 * Re-run the MCP handshake for a world and wait for it to complete.
		 *
		 * The MCP handshake is the JSON-RPC initialize → initialized exchange.
		 * host-cp only asks the in-world coordinator to redo it (via the
		 * `_sendHandshakeInitialize` seam) and then polls the
		 * `_probeHandshakeComplete` seam for the outcome.
		 *
		 * The poll interval is a tenth of the budget, capped at 500ms.
		 *
		 * @param {number} timeoutMs total time budget for the handshake
		 * @param {StepContext} ctx
		 * @param {(msg: string) => void} log
		 * @returns {Promise<void>} resolves once the probe reports success
		 * @throws {Error} when the handshake is not complete within timeoutMs
		 */
		async function retryHandshake(timeoutMs, ctx, log) {
			log(`RetryHandshake: initiating handshake for worldId=${ctx.worldId} timeout=${timeoutMs}ms`);

			await _sendHandshakeInitialize(ctx.worldId, log);

			const giveUpAt = Date.now() + timeoutMs;
			const tenthOfBudget = Math.floor(timeoutMs / 10);
			const pollMs = Math.min(500, tenthOfBudget);

			while (Date.now() < giveUpAt) {
				if (await _probeHandshakeComplete(ctx.worldId, log)) {
					log(`RetryHandshake: handshake succeeded for worldId=${ctx.worldId}`);
					return;
				}
				await sleep(pollMs);
			}

			const failure = `RetryHandshake: handshake did not complete within ${timeoutMs}ms for worldId=${ctx.worldId}`;
			throw new Error(failure);
		}

		// ─── Seam functions (injectable for testing) ─────────────────────────────────
		//
		// These are the actual I/O boundaries. In tests, override via the
		// setStepRunnerSeams() below to inject stubs that resolve deterministically.

		/**
		 * Seam: deliver the restart signal for an MCP server.
		 *
		 * Intended production behaviour: Docker-exec into the world's devbox
		 * container (`olam-<worldId>-devbox`) and SIGTERM the mcp-server process
		 * by name. Until that exec channel is wired (killshot-3-follow-up) this
		 * default only logs what it would do.
		 *
		 * @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<void>}
		 */
		let _execRestartSignal = async (serverName, worldId, log) => {
			const processName = `mcp-${serverName}`;
			const containerName = `olam-${worldId}-devbox`;
			log(`[seam] execRestartSignal: would exec SIGTERM ${processName} in ${containerName}`);
		};

		/**
		 * Seam: report whether the named MCP server is alive again.
		 *
		 * The default is an optimistic stub that logs the probe and always
		 * answers `true`. A real implementation would query the in-world MCP
		 * registry or ping a health endpoint.
		 *
		 * @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<boolean>}
		 */
		let _probeMcpHandshake = async (serverName, worldId, log) => {
			const processName = `mcp-${serverName}`;
			const containerName = `olam-${worldId}-devbox`;
			log(`[seam] probeMcpHandshake: would probe ${processName} alive in ${containerName}`);
			return true;
		};

		/**
		 * Seam: ask the world to start the MCP initialize exchange.
		 * The default only logs the intent.
		 *
		 * @type {(worldId: string, log: (m: string) => void) => Promise<void>}
		 */
		let _sendHandshakeInitialize = async (worldId, log) => {
			const message = `[seam] sendHandshakeInitialize: would send MCP initialize for worldId=${worldId}`;
			log(message);
		};

		/**
		 * Seam: report whether the MCP handshake has completed for a world.
		 * The default is an optimistic stub: it logs the probe and answers `true`.
		 *
		 * @type {(worldId: string, log: (m: string) => void) => Promise<boolean>}
		 */
		let _probeHandshakeComplete = async (worldId, log) => {
			const message = `[seam] probeHandshakeComplete: would probe handshake complete for worldId=${worldId}`;
			log(message);
			return true;
		};

		/**
		 * Swap in test doubles for the seam functions and/or the restart timing
		 * values. Only the overrides that are supplied take effect: seam overrides
		 * must be truthy, timing overrides must be numbers.
		 *
		 * @param {{
		 *   execRestartSignal?: typeof _execRestartSignal,
		 *   probeMcpHandshake?: typeof _probeMcpHandshake,
		 *   sendHandshakeInitialize?: typeof _sendHandshakeInitialize,
		 *   probeHandshakeComplete?: typeof _probeHandshakeComplete,
		 *   mcpRestartTimeoutMs?: number,
		 *   mcpRestartPollMs?: number,
		 * }} overrides
		 * @returns {() => void} cleanup — restores every seam and timing value to
		 *   what it was when this call was made
		 */
		export function setStepRunnerSeams(overrides = {}) {
			// Snapshot first so the returned cleanup undoes exactly this call.
			const priorExecRestartSignal = _execRestartSignal;
			const priorProbeMcpHandshake = _probeMcpHandshake;
			const priorSendHandshakeInitialize = _sendHandshakeInitialize;
			const priorProbeHandshakeComplete = _probeHandshakeComplete;
			const priorTimeoutMs = _mcpRestartTimeoutMs;
			const priorPollMs = _mcpRestartPollMs;

			const {
				execRestartSignal,
				probeMcpHandshake,
				sendHandshakeInitialize,
				probeHandshakeComplete,
				mcpRestartTimeoutMs,
				mcpRestartPollMs,
			} = overrides;

			if (execRestartSignal) {
				_execRestartSignal = execRestartSignal;
			}
			if (probeMcpHandshake) {
				_probeMcpHandshake = probeMcpHandshake;
			}
			if (sendHandshakeInitialize) {
				_sendHandshakeInitialize = sendHandshakeInitialize;
			}
			if (probeHandshakeComplete) {
				_probeHandshakeComplete = probeHandshakeComplete;
			}
			// typeof check (not truthiness) so that 0 is accepted as an override.
			if (typeof mcpRestartTimeoutMs === 'number') {
				_mcpRestartTimeoutMs = mcpRestartTimeoutMs;
			}
			if (typeof mcpRestartPollMs === 'number') {
				_mcpRestartPollMs = mcpRestartPollMs;
			}

			return () => {
				_execRestartSignal = priorExecRestartSignal;
				_probeMcpHandshake = priorProbeMcpHandshake;
				_sendHandshakeInitialize = priorSendHandshakeInitialize;
				_probeHandshakeComplete = priorProbeHandshakeComplete;
				_mcpRestartTimeoutMs = priorTimeoutMs;
				_mcpRestartPollMs = priorPollMs;
			};
		}

-264

host-cp/src/agent-runtime-trigger.mjs

		// agent-runtime-trigger — Phase B B7 (minimum-demo cut) host-side launch hook.
		//
		// When the SPA opens the plan-tab for a (worldId, sessionId), it POSTs
		// here; host-cp idempotently spawns the agent-stream-launch supervisor
		// inside the world's devbox container via `docker exec`. The supervisor
		// (PID 1 within the spawned exec session) then fork-spawns driver +
		// codex runners that long-poll host-cp's /v1/shape.
		//
		// Demo-cut simplifications (per minimum-demo decision; full B7 in follow-up):
		// - In-memory idempotency map keyed by `(worldId, sessionId)`. Restart of
		// host-cp loses state; second call after restart re-issues docker exec,
		// which the supervisor's idempotency check (B6-full's flock + PID-file)
		// would catch. B6-minimum has no such check → restart of host-cp +
		// re-trigger may spawn two supervisors. Acceptable for single-operator
		// local demo; full B7 + B6-full close this.
		// - Uses shared-secret bearer (from `~/.olam/plan-chat-secret` per the
		// existing plan-chat-service contract). JWT scope-claim migration is B9.
		// - No conversation_id ↔ (worldId, sessionId) join-table (A1.4
		// §migration-schema open question). For demo, the supervisor is
		// keyed by (worldId, sessionId) directly; codex's APPROVE chunks
		// write under (worldId, sessionId) — `conversation_id` plumbing
		// deferred until lookouts (B3) need it.
		// - No host-cp restart cleanup of dead supervisor entries (the in-memory
		// map only tracks live spawns; container crash + re-trigger DOES
		// re-spawn).
		//
		// Source: docs/design/olam-plan-chat-agent-runtime.md `lifecycle` +
		// `bake-in-seam` sections, minimum-demo cut.

		import { spawnSync, spawn } from 'node:child_process';

		const SPAWN_TIMEOUT_MS = 10_000;

		// Default container-side path for the supervisor binary.
		// In source-mode (OLAM_DEV=1): the operator's built host dist is bind-mounted
		// read-only at /opt/olam/agent-stream/dist (Phase B1, olam-world-bundle-freshness).
		// The mount overlays the image-baked dist, so this path always resolves to the
		// freshest available binary — no docker cp required.
		// In install-mode / cloud: the image-baked dist (devbox.runtime.glibc.Dockerfile
		// lines 263-287 bake step) is the fallback; the path is the same.
		const DEFAULT_SUPERVISOR_PATH = '/opt/olam/agent-stream/dist/agent-stream-launch.js';

		/**
		* @typedef {object} TriggerArgs
		* @property {string} worldId
		* @property {string} sessionId
		* @property {string} hostCpUrl — URL the container reaches host-cp at
		* (e.g. `http://host.docker.internal:3112`)
		* @property {string} bearer — shared-secret token (read from
		* `~/.olam/plan-chat-secret` server-side; never passed in from SPA)
		* @property {string} [dockerHost='docker-cli'] — `'docker-cli'` for bare-node
		* mode; `tcp://...` for container mode (docker-socket-proxy)
		* @property {string} [supervisorPath] — override for tests
		* @property {(cmd: string, args: string[], opts?: object) => any} [spawnSyncImpl]
		* — injectable for tests; defaults to node:child_process spawnSync
		* @property {(cmd: string, args: string[], opts?: object) => any} [spawnImpl]
		* — injectable for tests; defaults to node:child_process spawn (detached)
		*/

		/**
		* Internal state: which `(worldId, sessionId)` pairs we've already
		* spawned. Survives only within a single host-cp process instance.
		*
		* Keys are built by key(). Each value records when the spawn happened,
		* plus the docker CLI child's `pid` (docker-cli mode) or the Docker API
		* exec instance id `execId` (tcp:// container mode).
		*
		* @type {Map<string, {spawnedAt: number, pid?: number, execId?: string}>}
		*/
		const liveSpawns = new Map();

		/**
		 * Build the liveSpawns map key for a (worldId, sessionId) pair.
		 *
		 * @param {string} worldId
		 * @param {string} sessionId
		 * @returns {string} `<worldId>::<sessionId>`
		 */
		function key(worldId, sessionId) {
			return ''.concat(worldId, '::', sessionId);
		}

		/**
		 * Idempotently spawn the agent-stream supervisor inside the world's container.
		 *
		 * Resolves to `{status, container, pid?, execId?, spawnedAt?}`:
		 *   - `status` is `'spawned'` for a fresh launch, `'already-running'` when
		 *     this process already launched (or is launching) a supervisor for the
		 *     same `(worldId, sessionId)`.
		 *   - `pid` is the pid of the local `docker exec` CLI child (docker-cli mode).
		 *   - `execId` is the Docker API exec instance id (tcp:// mode).
		 *   - `spawnedAt` is only present on `'already-running'` results.
		 *
		 * Rejects when a required argument is missing, when the docker CLI / Docker
		 * API call fails, when the container is not running, or when `dockerHost`
		 * is neither `'docker-cli'` nor a `tcp://` URL.
		 *
		 * @param {TriggerArgs} args
		 * @returns {Promise<{status: 'spawned' | 'already-running', container: string, pid?: number, execId?: string, spawnedAt?: number}>}
		 * @throws {Error}
		 */
		export async function triggerAgentRuntime(args) {
			const {
				worldId,
				sessionId,
				hostCpUrl,
				bearer,
				dockerHost = 'docker-cli',
				supervisorPath = DEFAULT_SUPERVISOR_PATH,
				spawnSyncImpl = spawnSync,
				spawnImpl = spawn,
			} = args;

			if (!worldId || !sessionId || !hostCpUrl || !bearer) {
				throw new Error(
					'triggerAgentRuntime: worldId, sessionId, hostCpUrl, bearer all required',
				);
			}

			const k = key(worldId, sessionId);
			const containerName = `olam-${worldId}-devbox`;

			const existing = liveSpawns.get(k);
			if (existing) {
				return {
					status: 'already-running',
					container: containerName,
					spawnedAt: existing.spawnedAt,
					pid: existing.pid,
				};
			}

			// Bare-node mode: shell out to `docker exec --detach`. Detached so the
			// SPA's HTTP request returns promptly; the supervisor lives until SIGTERM.
			// Nothing in this branch awaits, so the has-check above and the
			// liveSpawns.set() below cannot interleave with another trigger.
			if (dockerHost === 'docker-cli') {
				// First, verify the container exists and is running. `docker inspect`
				// returns exit 1 if the container is not found; exit 0 with stdout
				// containing the state if found.
				const inspect = spawnSyncImpl(
					'docker',
					['inspect', '--format', '{{.State.Running}}', containerName],
					{ encoding: 'utf-8', timeout: SPAWN_TIMEOUT_MS },
				);
				if (inspect.error) {
					throw new Error(
						`docker inspect ${containerName} failed: ${inspect.error.message}`,
					);
				}
				if (inspect.status !== 0) {
					throw new Error(
						`docker inspect ${containerName} exit ${inspect.status}: ${(inspect.stderr || '').trim()}`,
					);
				}
				const runningState = (inspect.stdout || '').trim();
				if (runningState !== 'true') {
					throw new Error(
						`container ${containerName} is not running (state: ${runningState})`,
					);
				}

				// -e flags inject the runtime env; the supervisor binary path is the
				// last positional argument.
				// NOTE(review): this puts the bearer on the docker CLI's argv, which
				// other local processes can typically read from the process list.
				// Consider a bare `-e HOST_CP_BEARER` with the value supplied through
				// the child's environment — confirm `docker exec` supports that first.
				const env = {
					HOST_CP_URL: hostCpUrl,
					HOST_CP_BEARER: bearer,
					WORLD_ID: worldId,
					SESSION_ID: sessionId,
				};
				const execArgs = ['exec', '--detach'];
				for (const [name, value] of Object.entries(env)) {
					execArgs.push('-e', `${name}=${value}`);
				}
				execArgs.push(containerName, 'node', supervisorPath);

				const detached = spawnImpl('docker', execArgs, {
					stdio: 'ignore',
					detached: true,
				});
				const entry = { spawnedAt: Date.now(), pid: detached.pid };
				// A ChildProcess reports a failed spawn through an asynchronous
				// 'error' event; with no listener that event is thrown and would take
				// host-cp down. Listen, and drop the bookkeeping entry so a later
				// trigger can retry instead of seeing a phantom 'already-running'.
				detached.on?.('error', (err) => {
					if (liveSpawns.get(k) === entry) liveSpawns.delete(k);
					console.error(
						`triggerAgentRuntime: docker exec into ${containerName} failed to spawn: ${err?.message ?? err}`,
					);
				});
				detached.unref?.();

				liveSpawns.set(k, entry);

				return {
					status: 'spawned',
					container: containerName,
					pid: detached.pid,
				};
			}

			// Container mode (docker-socket-proxy on tcp://<host>:<port>).
			// Two-step Docker API exec: POST /containers/<name>/exec creates an
			// exec instance, then POST /exec/<id>/start with Detach=true runs it
			// in the background. Matches the pattern in container-secret-fetcher.mjs.
			if (dockerHost.startsWith('tcp://')) {
				const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');

				// Reserve the key BEFORE the first await. Otherwise two triggers for
				// the same pair arriving back-to-back would both pass the has-check
				// above and each spawn a supervisor. The reservation is released if
				// any step below fails.
				const reservation = { spawnedAt: Date.now() };
				liveSpawns.set(k, reservation);

				try {
					// Step 0: verify the container is running.
					const inspectRes = await fetch(
						`${apiBase}/containers/${encodeURIComponent(containerName)}/json`,
					);
					if (!inspectRes.ok) {
						throw new Error(
							`socket-proxy GET /containers/${containerName}/json: ${inspectRes.status} ${inspectRes.statusText}`,
						);
					}
					const inspect = await inspectRes.json();
					if (!inspect?.State?.Running) {
						throw new Error(
							`container ${containerName} is not running (state: ${JSON.stringify(inspect?.State)})`,
						);
					}

					// Step 1: create exec instance with env injection.
					const createRes = await fetch(
						`${apiBase}/containers/${encodeURIComponent(containerName)}/exec`,
						{
							method: 'POST',
							headers: { 'Content-Type': 'application/json' },
							body: JSON.stringify({
								Cmd: ['node', supervisorPath],
								Env: [
									`HOST_CP_URL=${hostCpUrl}`,
									`HOST_CP_BEARER=${bearer}`,
									`WORLD_ID=${worldId}`,
									`SESSION_ID=${sessionId}`,
								],
								AttachStdout: false,
								AttachStderr: false,
								Tty: false,
							}),
						},
					);
					if (!createRes.ok) {
						const errBody = await createRes.text().catch(() => '<no body>');
						throw new Error(
							`socket-proxy POST /containers/${containerName}/exec: ${createRes.status} — ${errBody}`,
						);
					}
					const { Id: execId } = await createRes.json();
					if (!execId) {
						// Without this guard the next request would go to /exec/undefined/start.
						throw new Error(
							`socket-proxy POST /containers/${containerName}/exec: response did not include an exec Id`,
						);
					}

					// Step 2: start exec in detached mode.
					const startRes = await fetch(`${apiBase}/exec/${execId}/start`, {
						method: 'POST',
						headers: { 'Content-Type': 'application/json' },
						body: JSON.stringify({ Detach: true, Tty: false }),
					});
					if (!startRes.ok) {
						const errBody = await startRes.text().catch(() => '<no body>');
						throw new Error(
							`socket-proxy POST /exec/${execId}/start: ${startRes.status} — ${errBody}`,
						);
					}

					liveSpawns.set(k, { spawnedAt: Date.now(), execId });

					return {
						status: 'spawned',
						container: containerName,
						execId,
					};
				} catch (err) {
					// Release the reservation so a later trigger can try again.
					if (liveSpawns.get(k) === reservation) liveSpawns.delete(k);
					throw err;
				}
			}

			throw new Error(
				`triggerAgentRuntime: unsupported dockerHost mode '${dockerHost}'`,
			);
		}

		/**
		* Test-only: clear the in-memory live-spawns map.
		* Production code should NEVER call this — it would let a duplicate
		* supervisor spawn.
		*
		* After this call triggerAgentRuntime() no longer finds any
		* (worldId, sessionId) pair in the map, so the next trigger spawns again.
		*
		* @returns {void}
		*/
		export function _clearLiveSpawnsForTests() {
		liveSpawns.clear();
		}

		/**
		* Inspect-only: read the current live-spawns map (for observability).
		*
		* Returns a shallow copy: adding or removing keys on the result does not
		* touch the internal map, but the entry objects themselves are shared.
		*
		* @returns {ReadonlyMap<string, {spawnedAt: number, pid?: number, execId?: string}>}
		*/
		export function getLiveSpawns() {
		return new Map(liveSpawns);
		}

-45

host-cp/src/auth-secret-hint.mjs

		/**
		* Operator-facing diagnostic for auth-service authentication failures.
		*
		* Pre-fix, an empty OLAM_AUTH_SECRET (compose.yaml's
		* `${OLAM_AUTH_SECRET:-}` interpolation when the operator's shell
		* didn't export it) silently 401'd every host-cp → auth-service
		* call. The SPA showed "0 credentials" with no log line explaining
		* why. Logging a clear hint — both at boot when the env var is empty
		* AND on the first runtime 401 — turns a silent footgun into a
		* grep-able warning.
		*
		* Lives in its own file (not server.mjs) so unit tests can import it
		* without triggering server.mjs's top-level mkdir + http.listen side
		* effects.
		*/

		/**
		 * Build the operator-facing hint for an auth-service 401.
		 *
		 * @param {object} ctx
		 * @param {string} ctx.authServiceUrl
		 *   Configured auth-service base URL, echoed back so the operator can
		 *   cross-reference it with their compose env.
		 * @param {boolean} ctx.hasSecret
		 *   True when host-cp's OLAM_AUTH_SECRET is set (so a 401 means the value
		 *   does not match); false when it is empty (the silent-fail mode).
		 * @returns {string}
		 *   A single-line message safe for `console.warn` / docker-compose-logs.
		 */
		export function authSecretHint({ authServiceUrl, hasSecret }) {
			const subject = `[auth] auth-service at ${authServiceUrl}`;

			// Each entry is one fragment of the sentence; fragments are joined
			// with single spaces.
			const fragments = hasSecret
				? [
					'returned 401 even though',
					'OLAM_AUTH_SECRET is set — the secret does NOT match the value the',
					'auth-service container is using. Check that both containers were',
					'started from the same ~/.olam/auth-secret file and recreate them',
					'together if the file changed.',
				]
				: [
					'is configured but',
					'OLAM_AUTH_SECRET is empty — every credentials/* call will 401.',
					'Set the env var to the contents of ~/.olam/auth-secret (or run',
					"'olam host-cp start' so the CLI loads it for you).",
				];

			return [subject, ...fragments].join(' ');
		}

-155

host-cp/src/auth.mjs

		// Phase F-2-B (B4): startup-token authentication for host CP.
		//
		// On boot: generate a 32-byte hex token (or reuse the file if it
		// exists), write to `~/.olam/host-cp.token` with mode 0600, cache in
		// memory. Middleware on all non-static, non-bootstrap routes validates
		// the request via:
		// - Cookie `olam_host_cp_token=<value>`
		// - OR Authorization: Bearer <value>
		// Reject 401 if neither matches.
		//
		// Threat model (T4 mitigation):
		// - Bound to 127.0.0.1:19000 only (compose.yaml). No public exposure.
		// - Single-user-per-host assumption; multi-user is Phase G+.
		// - Token file is chmod 600 owned by the operator. Browser tabs on
		// the same machine that try to hit :19000 are blocked unless they
		// have the token (cookie or header).
		// - /api/bootstrap returns the token unauthenticated. Rationale:
		// anything local that can hit 127.0.0.1:19000 can also read
		// ~/.olam/host-cp.token (same OS-level privilege boundary). This
		// just removes a UX friction step. NOT acceptable in multi-user
		// mode (Phase G+ uses cookie-with-Secure+HttpOnly via real auth).

		import crypto from 'node:crypto';
		import fs from 'node:fs';
		import path from 'node:path';

		/**
		 * Startup-token store and request authorizer for host CP.
		 *
		 * `ensure()` loads the token from `tokenPath` (or generates and persists a
		 * new one); `isAuthorized(req)` then accepts a request that carries the
		 * token either as `Authorization: Bearer <token>` or as the
		 * `olam_host_cp_token` cookie.
		 */
		export class StartupToken {
			/**
			 * @param {object} opts
			 * @param {string} opts.tokenPath absolute path to the token file
			 * @param {() => string} [opts.generate] defaults to 32-byte hex via crypto.randomBytes
			 * @param {(message: string) => void} [opts.log]
			 * @param {typeof fs} [opts.fs] injectable for tests
			 * @throws {Error} when tokenPath is missing or not absolute
			 */
			constructor({ tokenPath, generate, log = console.log, fs: fsImpl = fs }) {
				if (!tokenPath || !path.isAbsolute(tokenPath)) {
					throw new Error('StartupToken: tokenPath must be an absolute path');
				}
				this.tokenPath = tokenPath;
				this.generate = generate ?? (() => crypto.randomBytes(32).toString('hex'));
				this.log = log;
				this.fs = fsImpl;
				/** @type {string | null} */
				this.token = null;
			}

			/**
			 * Ensure the token exists in memory + on disk. Call once at server
			 * boot before listen(). Idempotent: subsequent calls return the
			 * cached value.
			 *
			 * Behavior:
			 *   - If tokenPath exists and holds at least 16 characters: read it,
			 *     cache it, return it. (Lifecycle CLI's `olam host-cp start` may
			 *     have written the token before the container starts; we must
			 *     reuse the operator-visible value, not regenerate it.)
			 *   - If tokenPath exists but is shorter than that: treat it as
			 *     corrupted and regenerate.
			 *   - Else: generate a new token, write file with mode 0600, return.
			 *
			 * @returns {string}
			 */
			ensure() {
				if (this.token) return this.token;

				const dir = path.dirname(this.tokenPath);
				if (!this.fs.existsSync(dir)) {
					this.fs.mkdirSync(dir, { recursive: true });
				}

				if (!this.fs.existsSync(this.tokenPath)) {
					this.token = this._writeNew();
					return this.token;
				}

				const raw = this.fs.readFileSync(this.tokenPath, 'utf-8').trim();
				if (raw.length < 16) {
					// Defensive: a too-short token is almost certainly a corrupted
					// file. Regenerate rather than accept.
					this.log(`auth: existing token at ${this.tokenPath} too short (${raw.length}); regenerating`);
					this.token = this._writeNew();
				} else {
					this.token = raw;
					this.log(`auth: reused existing token at ${this.tokenPath}`);
				}
				return this.token;
			}

			/**
			 * Generate a token and persist it with owner-only permissions.
			 *
			 * @private
			 * @returns {string} the new token
			 */
			_writeNew() {
				const t = this.generate();
				this.fs.writeFileSync(this.tokenPath, t, { mode: 0o600 });
				// writeFileSync's `mode` only applies when the file is created. On
				// the regenerate path the file already exists and would keep
				// whatever permissions it had, so tighten it explicitly.
				// Best-effort: an injected fs may lack chmodSync, and a failed chmod
				// is logged rather than allowed to abort boot.
				try {
					this.fs.chmodSync?.(this.tokenPath, 0o600);
				} catch (err) {
					this.log(`auth: could not restrict permissions on ${this.tokenPath}: ${err?.message ?? err}`);
				}
				this.log(`auth: generated new token at ${this.tokenPath} (${t.length} chars)`);
				return t;
			}

			/**
			 * Check request authorization. Returns false until ensure() has run.
			 * The Bearer header is tried first, then the `olam_host_cp_token`
			 * cookie; either one matching is enough.
			 *
			 * @param {import('node:http').IncomingMessage} req
			 * @returns {boolean}
			 */
			isAuthorized(req) {
				if (!this.token) return false;

				// Bearer header
				const authHeader = req.headers['authorization'];
				if (typeof authHeader === 'string' && authHeader.startsWith('Bearer ')) {
					const got = authHeader.slice('Bearer '.length).trim();
					if (this._compare(got)) return true;
				}

				// Cookie
				const cookieHeader = req.headers['cookie'];
				if (typeof cookieHeader === 'string') {
					const cookies = parseCookies(cookieHeader);
					const got = cookies['olam_host_cp_token'];
					if (got && this._compare(got)) return true;
				}

				return false;
			}

			/**
			 * Compare a candidate against the cached token using
			 * crypto.timingSafeEqual, so a mismatch does not reveal how many
			 * leading bytes were correct.
			 *
			 * @private
			 * @param {string} got
			 * @returns {boolean}
			 */
			_compare(got) {
				if (!this.token) return false;
				try {
					const gotBytes = Buffer.from(got);
					const tokenBytes = Buffer.from(this.token);
					// timingSafeEqual requires equal BYTE lengths; string lengths can
					// agree while the encoded lengths differ (multibyte characters).
					if (gotBytes.length !== tokenBytes.length) return false;
					return crypto.timingSafeEqual(gotBytes, tokenBytes);
				} catch {
					// Buffer.from throws on input it cannot encode; treat as a mismatch.
					return false;
				}
			}
		}

		/**
		 * Turn a Cookie request header into a name → value object.
		 *
		 * Pairs are split on `;` and trimmed. Only the first `=` separates name
		 * from value, so values may themselves contain `=` (e.g. base64). A pair
		 * without `=` maps to the empty string; blank segments are skipped. When a
		 * name repeats, the last occurrence wins.
		 *
		 * @param {string} header
		 * @returns {Record<string, string>}
		 */
		export function parseCookies(header) {
			/** @type {Record<string, string>} */
			const jar = {};
			const segments = header
				.split(';')
				.map((segment) => segment.trim())
				.filter((segment) => segment !== '');

			for (const segment of segments) {
				const sep = segment.indexOf('=');
				const hasValue = sep !== -1;
				const name = hasValue ? segment.slice(0, sep).trim() : segment;
				const value = hasValue ? segment.slice(sep + 1).trim() : '';
				jar[name] = value;
			}
			return jar;
		}

-238

host-cp/src/boot-reconciler.mjs

		/**
		* Boot-time reconciler — sync worlds.db with live docker state.
		*
		* Problem (issue #963): after Colima / userspace restart, host-cp can
		* start with worlds.db rows that no longer reflect docker reality. The
		* existing `worlds-db-source.mjs` reconciler runs DB→registry (reads
		* 'running' rows and adds them to in-memory WORLDS). It does NOT heal
		* the inverse case: a container is alive on docker but worlds.db has
		* no row (Hazel coral-sky-2478 scenario), or worlds.db says a world is
		* running but the container is gone (orphaned row).
		*
		* This module fills both gaps with a one-shot pass at boot:
		*
		* 1. List `olam-*-devbox` containers via the docker API.
		* 2. For each container, derive the worldId (strip prefix + suffix).
		* 3. Cross-check against worlds.db rows:
		* - container alive, row exists → no-op
		* - container alive, row missing → INSERT (status=reconciled)
		* - row says running/active, container missing → UPDATE status=orphaned
		*
		* Fail-soft: if the docker daemon is unreachable OR better-sqlite3 is
		* not available, the function logs a warning and returns without
		* throwing. Server boot continues.
		*
		* Idempotent: a second invocation against the same docker + DB state
		* produces no further changes (existing rows are skipped at step 3a,
		* already-orphaned rows are skipped at step 3c).
		*
		* Coordination with issue #962: the dedup logic in `olam create` handles
		* per-call deduplication; this reconciler handles boot-time cleanup.
		* They don't conflict — both operate on the worlds.db source-of-truth.
		*/

		import { createRequire } from 'node:module';

		const require = createRequire(import.meta.url);

		// Matches `olam-<worldId>-devbox`, optionally preceded by a single `/`.
		// Capture group 1 is the bare container name; group 2 is the worldId.
		const CONTAINER_NAME_PATTERN = /^\/?(olam-(.+)-devbox)$/;

		/**
		* @typedef {object} ReconcileDeps
		* @property {string} dbPath Path to worlds.db
		* @property {() => Promise<string[] | null>} listContainerNames Returns null when docker is unreachable
		* @property {(msg: string) => void} [log] Defaults to console.log
		* @property {() => string} [now] ISO timestamp generator (overridable for tests)
		* @property {(path: string) => unknown | null} [openDb] Overridable DB opener (tests inject fakes)
		*/

		/**
		* @typedef {object} ReconcileSummary
		* @property {number} inserted Number of new rows inserted (reconciled containers)
		* @property {number} orphaned Number of rows transitioned to status='orphaned'
		* @property {number} skipped Containers/rows where no change was needed
		* @property {boolean} dockerUnreachable
		* @property {boolean} dbUnavailable
		*/

		/**
		 * Extract a worldId from a docker container name.
		 * Accepts either `olam-foo-bar-1234-devbox` or `/olam-foo-bar-1234-devbox`
		 * (the docker API prefixes container names with a slash).
		 *
		 * @param {string} name
		 * @returns {string | null} the worldId, or null when `name` is not a
		 *   string or does not match CONTAINER_NAME_PATTERN
		 */
		export function extractWorldIdFromContainerName(name) {
			if (typeof name !== 'string') return null;
			const match = CONTAINER_NAME_PATTERN.exec(name);
			// Capture group 2 is the worldId. `|| null` keeps the
			// "empty worldId is not a worldId" guarantee.
			return match?.[2] || null;
		}

		/**
		 * Default docker container lister. Hits the Docker Engine API
		 * (`GET /containers/json` with a `name` filter of `olam-`) and returns
		 * every string found in the containers' `Names` arrays.
		 *
		 * Fail-soft: returns null (after logging) when no API base is configured,
		 * when the API answers with a non-2xx status, or when the request throws.
		 * Returns an empty array when the response body is not an array.
		 *
		 * @param {string} dockerApiBase e.g. 'http://docker-socket-proxy:2375'
		 * @param {(msg: string) => void} log
		 * @returns {Promise<string[] | null>}
		 */
		export async function defaultListContainerNames(dockerApiBase, log) {
			if (!dockerApiBase || dockerApiBase === 'http://localhost:2375') {
				// 'docker-cli' sentinel; no API available in this deployment mode.
				log('[boot-reconciler] docker API unavailable (bare-node mode); skipping');
				return null;
			}
			try {
				const filters = encodeURIComponent(JSON.stringify({ name: ['olam-'] }));
				const url = `${dockerApiBase}/containers/json?filters=${filters}`;
				// Bounded wait so an unresponsive daemon cannot stall boot.
				const res = await fetch(url, { signal: AbortSignal.timeout(5000) });
				if (!res.ok) {
					log(`[boot-reconciler] docker /containers/json returned ${res.status}; skipping`);
					return null;
				}
				const data = await res.json();
				if (!Array.isArray(data)) return [];
				// Tolerate malformed entries: skip containers without a Names array
				// and drop any non-string names.
				return data.flatMap((container) => {
					const list = container?.Names;
					return Array.isArray(list) ? list.filter((n) => typeof n === 'string') : [];
				});
			} catch (err) {
				// `err` is not guaranteed to be an Error instance.
				log(`[boot-reconciler] docker query failed: ${err?.message ?? err}; skipping`);
				return null;
			}
		}

		/**
		 * Default DB opener. better-sqlite3 is required lazily, inside the try,
		 * so a missing native build degrades to a logged warning instead of
		 * crashing host-cp boot. The database is opened with `fileMustExist`.
		 *
		 * @param {string} dbPath
		 * @param {(msg: string) => void} log
		 * @returns {unknown | null} the opened database, or null when the module
		 *   or the file could not be opened (the reason is logged)
		 */
		export function defaultOpenDb(dbPath, log) {
			try {
				const Database = require('better-sqlite3');
				return new Database(dbPath, { fileMustExist: true });
			} catch (err) {
				// Choose the log line by error code; anything unrecognised is
				// reported with its message.
				switch (err ? err.code : undefined) {
					case 'MODULE_NOT_FOUND':
						log('[boot-reconciler] better-sqlite3 not available; skipping');
						break;
					case 'SQLITE_CANTOPEN':
						log(`[boot-reconciler] ${dbPath} not found; nothing to reconcile`);
						break;
					default:
						log(`[boot-reconciler] failed to open ${dbPath}: ${err.message}`);
				}
				return null;
			}
		}

		/**
		* Run a single boot-time reconciliation pass. Pure and dep-injected
		* for testability.
		*
		* Never opens the DB when docker is unreachable, and always closes the DB
		* it opened. Per-row insert/update failures are logged and skipped; they
		* do not abort the pass and are not counted in the summary.
		*
		* @param {ReconcileDeps} deps
		* @returns {Promise<ReconcileSummary>}
		*   `dockerUnreachable` is set when listContainerNames() resolves to null;
		*   `dbUnavailable` is set when the DB cannot be opened or the initial
		*   SELECT fails. In both cases the counters stay at 0.
		*/
		export async function reconcileWorldsWithDocker(deps) {
		const log = deps.log ?? console.log;
		const now = deps.now ?? (() => new Date().toISOString());
		const openDb = deps.openDb ?? ((p) => defaultOpenDb(p, log));

		const summary = {
		inserted: 0,
		orphaned: 0,
		skipped: 0,
		dockerUnreachable: false,
		dbUnavailable: false,
		};

		// null (as opposed to an empty array) means "docker could not be queried".
		const containerNames = await deps.listContainerNames();
		if (containerNames === null) {
		summary.dockerUnreachable = true;
		return summary;
		}

		// Names that do not look like `olam-<worldId>-devbox` are ignored.
		const liveWorldIds = new Set();
		for (const name of containerNames) {
		const worldId = extractWorldIdFromContainerName(name);
		if (worldId) liveWorldIds.add(worldId);
		}

		const db = openDb(deps.dbPath);
		if (!db) {
		summary.dbUnavailable = true;
		return summary;
		}

		try {
		/** @type {Array<{ id: string, status: string }>} */
		let rows;
		try {
		rows = db.prepare('SELECT id, status FROM worlds').all();
		} catch (err) {
		log(`[boot-reconciler] query failed: ${err.message}; skipping`);
		summary.dbUnavailable = true;
		return summary;
		}

		// Snapshot of the DB taken before any writes: id → status.
		const dbWorlds = new Map(rows.map((r) => [r.id, r.status]));

		// Pass 1: containers alive but missing from DB → insert.
		const insertStmt = db.prepare(
		`INSERT INTO worlds
		(id, name, status, repos, branch, port_offset, workspace_path,
		compute_provider, total_cost_usd, thought_count, created_at, updated_at)
		VALUES (?, ?, 'reconciled', '[]', 'main', 0, ?, 'docker', 0, 0, ?, ?)`,
		);
		for (const worldId of liveWorldIds) {
		if (dbWorlds.has(worldId)) {
		summary.skipped += 1;
		continue;
		}
		const ts = now();
		// Stored with a literal `~`; the path is not expanded here.
		const workspacePath = `~/.olam/worlds/${worldId}`;
		try {
		// The worldId doubles as the row's name.
		insertStmt.run(worldId, worldId, workspacePath, ts, ts);
		summary.inserted += 1;
		log(`[boot-reconciler] inserted reconciled row for ${worldId} (container alive, no DB row)`);
		} catch (err) {
		log(`[boot-reconciler] failed to insert ${worldId}: ${err.message}`);
		}
		}

		// Pass 2: DB says alive but container missing → mark orphaned.
		const orphanStmt = db.prepare(
		`UPDATE worlds SET status = 'orphaned', updated_at = ? WHERE id = ?`,
		);
		// NOTE(review): 'reconciled' (the status Pass 1 inserts) is not in this
		// set, so a reconciled row whose container later disappears is never
		// marked orphaned — confirm that is intended.
		const aliveStatuses = new Set(['running', 'active', 'creating']);
		for (const [worldId, status] of dbWorlds) {
		if (liveWorldIds.has(worldId)) continue;
		if (!aliveStatuses.has(status)) continue;
		try {
		orphanStmt.run(now(), worldId);
		summary.orphaned += 1;
		log(`[boot-reconciler] marked ${worldId} as orphaned (was '${status}', container missing)`);
		} catch (err) {
		log(`[boot-reconciler] failed to mark ${worldId} orphaned: ${err.message}`);
		}
		}

		log(
		`[boot-reconciler] complete: inserted=${summary.inserted} orphaned=${summary.orphaned} ` +
		`skipped=${summary.skipped} live-containers=${liveWorldIds.size}`,
		);
		} finally {
		// Best-effort close; `close` is optional so injected fakes may omit it.
		try { db.close?.(); } catch { /* ignore */ }
		}

		return summary;
		}

-58

host-cp/src/bootstrap-selective.mjs

		// bootstrap-selective.mjs — Phase D1 helper, collapsed to a wildcard in
		// Phase E5 (ATOMIC SERVING CUTOVER).
		//
		// Determines whether a SPA shell render path should SKIP the host-cp
		// BOOTSTRAP_SCRIPT injection (cookie-bootstrap + fetch/EventSource
		// rewrite shim) and instead let the served SPA's own auth resolver +
		// world-fetch shim handle auth.
		//
		// Phase E5: plan-chat-spa is now host-cp's SOLE served SPA. Its bundle
		// re-homes the cookie-bootstrap + world-fetch-rewrite + 401-recover shim
		// (packages/plan-chat-spa/src/lib/worldFetch.ts, installed at the top of
		// src/main.tsx — Phase C). Therefore host-cp NEVER needs to inject
		// BOOTSTRAP_SCRIPT anymore: every path is a "planning" (== SPA-owned)
		// path. isPlanningPath() is collapsed to a wildcard accordingly.
		//
		// Reversal: set isPlanningPath to consult BOOTSTRAP_NOOP_PLANNING_PATHS
		// again (restore the prefix-match body below) to re-narrow the no-op to
		// the explicit planning prefixes; or, for full pre-D behaviour, also set
		// BOOTSTRAP_NOOP_PLANNING_PATHS to []. The const is retained as the
		// documented revert seam.
		//
		// Per K1 SCP-3 + phase-d-tasks.md D1 + phase-e-tasks.md E2.

		/**
		 * Revert seam for the Phase D selective no-op: the path prefixes that
		 * plan-chat-spa owned before isPlanningPath() was collapsed to a wildcard.
		 * To re-narrow the bootstrap no-op to the planning surfaces only, make
		 * isPlanningPath() prefix-match against this list again.
		 *
		 * Each surface is listed twice: the bare segment ("/plan") for an exact
		 * hit, and the trailing-slash form ("/plan/") which serves as the prefix
		 * for "/plan/<rest>".
		 *
		 * @type {readonly string[]}
		 */
		export const BOOTSTRAP_NOOP_PLANNING_PATHS = Object.freeze(['/plan', '/plan/']);

		/**
		 * Phase E5 wildcard predicate: every string pathname counts as an
		 * SPA-owned ("planning") path, so the caller skips BOOTSTRAP_SCRIPT
		 * injection for all of them.
		 *
		 * The only input that yields `false` is a non-string value, which can
		 * never be a real served path.
		 *
		 * @param {unknown} pathname Candidate request pathname.
		 * @returns {boolean} `true` iff `pathname` is a string.
		 */
		export function isPlanningPath(pathname) {
		  const isString = typeof pathname === 'string';
		  return isString;
		}

-170

host-cp/src/compose-worlds-sources.mjs

		/**
		* Phase E4 (olam-dogfood-vision): WorldsSource composition + dedup.
		*
		* Runs every configured WorldsSource (E1) in parallel and dedupes by
		* `id`. Source-array order expresses precedence: the LAST source to
		* claim an id wins on collision. server.mjs (E4 wiring via
		* `buildWorldsSources`) orders sources `[localSource, pylonSource]`
		* so cloud-side metadata overrides local stubs when the Pylon SDK
		* eventually returns real data for a world that's also docker-
		* resident locally.
		*
		* The function is intentionally pure + dep-free (no env reads, no
		* http, no module-level state) so vitest can drive it with two mock
		* sources to assert dedup direction without spinning up the server.
		*
		* ## Failure-mode contract (CP3 audit follow-up — closes CRIT/HIGH-1+2)
		*
		* Robustness goals:
		* 1. One bad source must NOT take down the union. Pylon SDK
		* transient outages, auth errors, network blips — these MUST
		* degrade to "cloud worlds missing this poll" rather than
		* "/api/worlds endpoint hangs". Achieved via `Promise.allSettled`
		* + per-source try/log/treat-as-empty.
		* 2. **Slow sources MUST NOT extend wall time past the SPA poll
		* cadence.** The SPA polls every 4s (Worlds.tsx:124); a Pylon
		* `client.worlds.list()` that takes 8s would block, queue
		* sockets, and pile up overlapping fetches. Achieved via
		* per-source `Promise.race` with `timeoutMs` (default 2000ms,
		* matching the existing docker-inspect timeout in
		* fetchWorldServices). A timed-out source is treated as `[]` for
		* this poll.
		* 3. **A failing source must produce a log line, not a silent
		* empty.** Operators need to see "[worlds-source] pylon-cloud
		* list() failed: <err>" in the host-cp boot log so the
		* degradation is observable.
		*
		* ## Dedup semantics on collision (CP3 audit follow-up — closes HIGH-4)
		*
		* Whole-record replacement (the pre-audit behavior) blanks fields the
		* later source doesn't populate. Concrete example: Pylon returns
		* `{services: undefined}` (or omits the field entirely) for a freshly-
		* claimed world while Local has `{services: [4 entries]}`. Whole-
		* record replacement would drop the local services array; the SPA
		* would render the world with no clickable links until Pylon
		* back-fills.
		*
		* Field-merge (the post-audit behavior): later source's defined
		* fields override earlier; earlier source's fields are preserved
		* where the later source omits them. `id` and `source` always come
		* from the later source (the precedence contract). Implementation:
		* `{ ...byId.get(id), ...world }` — object spread copies every own
		* enumerable property of the later record, including keys explicitly
		* set to `undefined`, so an earlier value survives only when the later
		* source ELIDES the key entirely. Therefore source authors should
		* OMIT fields they don't manage rather than setting them to
		* `undefined` / `[]`.
		*
		* @typedef {import('./worlds-source.mjs').WorldsSource} WorldsSource
		* @typedef {import('./worlds-source.mjs').WorldSummary} WorldSummary
		*/

		/**
		* @typedef {object} ComposeWorldsSourcesOptions
		* @property {number} [timeoutMs=8000]
		* Per-source timeout cap in milliseconds (default: `DEFAULT_TIMEOUT_MS`).
		* A source whose `list()` doesn't settle within this budget is reported
		* through `onSourceError` and falls back to its last-known-good result,
		* or is treated as `[]` for this composition pass when nothing has been
		* cached yet. The cap bounds the /api/worlds path's worst-case wall time.
		* @property {(sourceName: string, err: unknown) => void} [onSourceError]
		* Invoked when a source rejects or times out. Defaults to
		* `console.error('[worlds-source] <name> list() failed:', err)`.
		* Tests inject a spy to assert log behavior without polluting
		* stderr.
		*/

		// Per-source timeout (ms) applied by composeWorldsSources() when the
		// caller does not pass `options.timeoutMs`.
		// NOTE(review): the failure-mode contract in this file's header says a
		// slow source must not extend wall time past the 4s SPA poll cadence and
		// quotes a 2000ms default; 8000ms exceeds both — confirm which is intended.
		const DEFAULT_TIMEOUT_MS = 8000;

		/**
		* Per-source last-known-good cache. Keyed by source.name → WorldSummary[].
		* When a source resolves successfully, its output is stored here. When a
		* source rejects or times out, we fall back to the cached value so the
		* dashboard shows stale data rather than blanking. Stale data self-heals
		* on the next successful poll.
		*
		* Process-local, no TTL — the running server is authoritative. Tests that
		* need a clean slate should call _resetLastKnownGoodCache().
		*
		* Because the key is the source's `name`, two sources sharing a name
		* would overwrite each other's entry. The array is stored by reference
		* (no copy is taken when it is cached).
		*
		* @type {Map<string, import('./worlds-source.mjs').WorldSummary[]>}
		*/
		const _lastKnownGood = new Map();

		/**
		 * Race `promise` against a deadline of `ms` milliseconds.
		 *
		 * Settles exactly like `promise` when it finishes first; otherwise rejects
		 * with an Error naming `sourceName`, so `onSourceError` can report which
		 * source was slow. The timer is always cleared once the race settles.
		 *
		 * @template T
		 * @param {Promise<T>} promise Work to bound.
		 * @param {number} ms Deadline in milliseconds.
		 * @param {string} sourceName Name embedded in the timeout error message.
		 * @returns {Promise<T>}
		 */
		function withTimeout(promise, ms, sourceName) {
		  /** @type {ReturnType<typeof setTimeout> | null} */
		  let handle = null;

		  const deadline = new Promise((_resolve, reject) => {
		    const onExpire = () => {
		      reject(new Error(`source "${sourceName}" timed out after ${ms}ms`));
		    };
		    handle = setTimeout(onExpire, ms);
		  });

		  // Runs whether the race fulfilled or rejected, so the timer never
		  // outlives the call.
		  const disarm = () => {
		    if (handle !== null) clearTimeout(handle);
		  };

		  return Promise.race([promise, deadline]).finally(disarm);
		}

		/**
		 * Test-only hook: empties the last-known-good cache so a test starts
		 * without fallback data left behind by an earlier composition pass.
		 *
		 * @returns {void}
		 */
		export function _resetLastKnownGoodCache() {
		  // Clears the existing Map in place; the binding itself is not replaced.
		  _lastKnownGood.clear();
		}

		/**
		 * Run every source's `list()` in parallel (each bounded by a timeout) and
		 * return the union of their worlds, deduped by `id`.
		 *
		 * Failure handling — a source counts as failed when `list()` throws
		 * synchronously, rejects, times out, or resolves to something that is not
		 * an array. A failed source is reported through `onSourceError` and
		 * contributes its last-known-good list if one is cached, else nothing.
		 * A failure in one source never rejects the composition as a whole.
		 *
		 * @param {WorldsSource[]} sources
		 * Sources to compose. Order expresses precedence: later wins.
		 * @param {ComposeWorldsSourcesOptions} [options]
		 * @returns {Promise<WorldSummary[]>}
		 * Deduped union of every source's `list()` output, keyed by `id`.
		 * On collision: fields from later source override earlier where
		 * present; earlier fields preserved where later source omits them.
		 */
		export async function composeWorldsSources(sources, options = {}) {
		  if (sources.length === 0) return [];

		  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
		  const onSourceError =
		    options.onSourceError ??
		    ((name, err) => {
		      console.error(`[worlds-source] ${name} list() failed:`, err);
		    });

		  // Invoking list() inside an async function turns a synchronous throw into
		  // a rejected promise, so it is isolated by allSettled like any other
		  // failure instead of escaping from sources.map() and rejecting the union.
		  const startList = async (source) => source.list();

		  const settled = await Promise.allSettled(
		    sources.map((source) => withTimeout(startList(source), timeoutMs, source.name)),
		  );

		  /** @type {Map<string, WorldSummary>} */
		  const byId = new Map();
		  for (const [index, result] of settled.entries()) {
		    const source = sources[index];
		    let worlds;

		    if (result.status === 'fulfilled' && Array.isArray(result.value)) {
		      worlds = result.value;
		      _lastKnownGood.set(source.name, worlds);
		    } else {
		      // A fulfilled-but-not-an-array result would crash the merge loop
		      // below, so it is reported and handled exactly like a rejection.
		      const reason =
		        result.status === 'rejected'
		          ? result.reason
		          : new TypeError(`source "${source.name}" list() resolved to a non-array value`);
		      onSourceError(source.name, reason);
		      worlds = _lastKnownGood.get(source.name);
		      if (!worlds) continue;
		    }

		    for (const world of worlds) {
		      // Field-merge on collision: later source overrides earlier
		      // where defined; earlier preserved where later omits. Keeps
		      // local service-strip + host_port intact when Pylon claims a
		      // world but hasn't populated those fields yet.
		      const prior = byId.get(world.id);
		      byId.set(world.id, prior ? { ...prior, ...world } : world);
		    }
		  }
		  return [...byId.values()];
		}

-116

host-cp/src/config-reader.mjs

		// config-reader.mjs — Phase D (olam-config-store-unification): a host-cp-local,
		// DEPENDENCY-FREE reader for a single dotted value out of `config.json`.
		//
		// # Why a copy lives here (not an `@olam/core` import)
		//
		// host-cp is a pure `.mjs` package with NO `@olam/core` dependency — it cannot
		// import the TypeScript cloud-state resolver, and a relative reach into
		// `packages/core/src/...` would (a) couple host-cp to core's source layout and
		// (b) fail to resolve in the published/container build where core is not a
		// sibling on disk. The canonical zero-dep reader is
		// `packages/core/src/cloud-state/read-config-value.mjs`; this module INLINES the
		// same logic (Phase D tracker explicitly permits copy-inlining the tiny reader)
		// and adds host-cp's container-aware `config.json` directory resolution.
		//
		// # Container path resolution
		//
		// host-cp reads operator state from a bind-mount, NOT from `~/.olam` directly:
		// compose.yaml mounts `${HOME}/.olam → /data`, so inside the container the
		// canonical config lives at `/data/config.json` (os.homedir() → /root, which is
		// the ephemeral container layer — the WRONG place, the same bug fixed for
		// plan.db / plan-chat-secret). `olamConfigDir()` resolves the directory holding
		// `config.json` honouring, in order:
		// 1. process.env.OLAM_HOME (explicit override — D2 requirement)
		// 2. '/data' when HOST_CP_MODE==='container' (the compose bind-mount target)
		// 3. path.join(os.homedir(), '.olam') (bare-node install — no behaviour change)
		//
		// Returns the resolved value or `null` (file absent, bad JSON, or path miss) —
		// NEVER throws, so a fail-open caller degrades gracefully to its legacy legs.

		import { readFileSync, existsSync } from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';

		/**
		 * Detect the deployment mode. An OLAM_HOST_CP_MODE environment value, when
		 * set, is returned verbatim; otherwise the mode is 'container' if
		 * `/.dockerenv` exists and 'bare' if it does not.
		 *
		 * Derived locally rather than imported from server.mjs so this module has
		 * no server.mjs dependency — server.mjs imports THIS, not the reverse.
		 *
		 * @returns {'container' | 'bare'}
		 */
		function hostCpMode() {
		  const explicit = process.env.OLAM_HOST_CP_MODE;
		  if (explicit !== undefined && explicit !== null) {
		    return explicit;
		  }
		  if (existsSync('/.dockerenv')) {
		    return 'container';
		  }
		  return 'bare';
		}

		/**
		 * Locate the directory holding `config.json`. Precedence:
		 *   1. a non-empty OLAM_HOME environment value,
		 *   2. '/data' when hostCpMode() reports 'container',
		 *   3. `<home directory>/.olam` otherwise.
		 *
		 * The environment is consulted on every call (nothing is captured at
		 * module load), so a late OLAM_HOME export or a direnv org-switch is
		 * picked up.
		 *
		 * @returns {string}
		 */
		export function olamConfigDir() {
		  const { OLAM_HOME: override } = process.env;
		  if (override) {
		    return override;
		  }
		  return hostCpMode() === 'container'
		    ? '/data'
		    : path.join(os.homedir(), '.olam');
		}

		/**
		 * Path of the `config.json` host-cp reads: the file of that name inside
		 * the directory returned by olamConfigDir().
		 *
		 * @returns {string}
		 */
		export function configJsonPath() {
		  const configDir = olamConfigDir();
		  return path.join(configDir, 'config.json');
		}

		/**
		 * Read a dotted path (e.g. `cloud.urls.anthropic-base-url`) out of the
		 * `config.json` located by configJsonPath().
		 *
		 * Only `.` splits segments, so dash-containing keys like `kg-proxy-url`
		 * are fine. Every failure mode — non-string path, file absent or
		 * unreadable, corrupt JSON, or a segment that is missing — yields `null`
		 * (fail-open); this function never throws.
		 *
		 * Only OWN properties are followed. Inherited members such as
		 * `constructor` or `toString` are treated as misses, so a path can never
		 * resolve to something that is not actually written in the file.
		 *
		 * @param {string} dotpath
		 * @returns {string | number | boolean | object | null}
		 * The leaf value, the sub-object for an interior path, or `null`.
		 */
		export function readConfigValue(dotpath) {
		  // `.split` below would throw on a non-string; honour the never-throws contract.
		  if (typeof dotpath !== 'string') return null;

		  let raw;
		  try {
		    raw = readFileSync(configJsonPath(), 'utf8');
		  } catch {
		    return null; // file absent / unreadable → not set
		  }

		  let parsed;
		  try {
		    parsed = JSON.parse(raw);
		  } catch {
		    return null; // corrupt JSON → not set (fail-open)
		  }

		  let cur = parsed;
		  for (const seg of dotpath.split('.')) {
		    // Object.hasOwn (not `in`) so the walk cannot step onto the prototype chain.
		    if (cur === null || typeof cur !== 'object' || !Object.hasOwn(cur, seg)) return null;
		    cur = cur[seg];
		  }
		  return cur === undefined ? null : cur;
		}

		/**
		 * String-typed convenience over readConfigValue(): yields the trimmed
		 * value when the leaf is a string with non-whitespace content, and `null`
		 * for everything else (non-string leaf, miss, empty or blank string).
		 * Trimming matches the legacy file-read helpers, which `.trim()` file
		 * contents.
		 *
		 * @param {string} dotpath
		 * @returns {string | null}
		 */
		export function readConfigString(dotpath) {
		  const value = readConfigValue(dotpath);
		  const trimmed = typeof value === 'string' ? value.trim() : '';
		  return trimmed === '' ? null : trimmed;
		}

-163

host-cp/src/container-secret-fetcher.mjs

		// Phase F-2-B (B3): fetch a per-world container's X-Olam-Secret via the
		// docker-socket-proxy sidecar (container mode) OR via `docker exec` (bare-
		// node mode — host-cp running as a plain Node process on the host).
		//
		// The secret lives at `/tmp/olam-container-secret` inside the world's
		// devbox container. Phase E init wrote it (`chmod 400` owned by root —
		// world-app user has no write permission, T9 mitigation) and the
		// per-world CP's `requireAuth` middleware compares against it. Host CP
		// reads the secret server-side and injects `X-Olam-Secret` on proxied
		// requests, so the browser never sees the secret directly.
		//
		// Container mode (`dockerHost = 'tcp://docker-socket-proxy:2375'`):
		// 1. POST /containers/<name>/exec
		// body: { Cmd: ['cat', '/tmp/olam-container-secret'], AttachStdout: true, AttachStderr: true }
		// → { Id: '<exec-id>' }
		// 2. POST /exec/<exec-id>/start
		// body: { Detach: false, Tty: false }
		// → response stream containing the file bytes (raw multiplexed
		// stdout/stderr per Docker exec protocol)
		//
		// The exec endpoint is whitelisted in the socket-proxy (EXEC=1).
		//
		// Bare-node mode (`dockerHost = 'docker-cli'`):
		// Spawn `docker exec <containerName> cat /tmp/olam-container-secret` via
		// child_process. No socket-proxy on the host; the docker CLI on the
		// operator's $PATH is the canonical access path. Same `olam-<id>-devbox`
		// naming convention applies. ~10 ms of process-spawn overhead per miss
		// is fine because the secret is cached for OLAM_SECRET_CACHE_TTL_SEC
		// (default 300 s).

		import { spawnSync } from 'node:child_process';

		/**
		 * Read /tmp/olam-container-secret from a world's devbox container
		 * (`olam-<worldId>-devbox`).
		 *
		 * Two access paths, selected by `dockerHost`:
		 *   - `'docker-cli'`: runs `docker exec <container> cat <file>` through
		 *     spawnSync (synchronous — the call blocks until docker exits).
		 *   - anything else: treated as a `tcp://host:port` Docker API endpoint;
		 *     an exec instance is created and started over HTTP and stdout is
		 *     extracted from the multiplexed response stream.
		 *
		 * @param {object} args
		 * @param {string} args.worldId
		 * @param {string} args.dockerHost Either `tcp://...` for socket-proxy
		 * mode or the sentinel `'docker-cli'` for bare-node mode.
		 * @param {(host: string, init: RequestInit) => Promise<Response>} [args.fetchImpl]
		 * injectable for tests; defaults to global fetch
		 * @returns {Promise<string>} the secret, trimmed of surrounding whitespace
		 * @throws {Error} when docker cannot be spawned or exits non-zero, when
		 * either HTTP call returns a non-2xx status, when the exec-create response
		 * carries no `Id`, or when the secret file is empty.
		 */
		export async function fetchContainerSecret({ worldId, dockerHost, fetchImpl = globalThis.fetch }) {
		  // Container naming convention: docker provider creates containers as
		  // `olam-${worldId}-devbox` (see packages/adapters/src/docker/container.ts).
		  const containerName = `olam-${worldId}-devbox`;

		  // Bare-node mode: shell out to docker exec directly. Arguments are passed
		  // as an array (no shell), so the container name is never shell-interpreted.
		  if (dockerHost === 'docker-cli') {
		    const result = spawnSync(
		      'docker',
		      ['exec', containerName, 'cat', '/tmp/olam-container-secret'],
		      { encoding: 'utf-8' },
		    );
		    if (result.error) {
		      throw new Error(
		        `docker exec ${containerName} cat ... failed: ${result.error.message}`,
		        { cause: result.error },
		      );
		    }
		    if (result.status !== 0) {
		      throw new Error(
		        `docker exec ${containerName} cat ... exit ${result.status}: ${(result.stderr || '').trim()}`,
		      );
		    }
		    const cliSecret = (result.stdout || '').trim();
		    if (!cliSecret) {
		      throw new Error(`/tmp/olam-container-secret empty in container ${containerName}`);
		    }
		    return cliSecret;
		  }

		  // Container mode: HTTP via the docker-socket-proxy sidecar.
		  // Docker API: tcp://host:port → http://host:port
		  const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');

		  // Step 1: create exec instance
		  const createUrl = `${apiBase}/containers/${encodeURIComponent(containerName)}/exec`;
		  const createRes = await fetchImpl(createUrl, {
		    method: 'POST',
		    headers: { 'Content-Type': 'application/json' },
		    body: JSON.stringify({
		      Cmd: ['cat', '/tmp/olam-container-secret'],
		      AttachStdout: true,
		      AttachStderr: true,
		      Tty: false,
		    }),
		  });
		  if (!createRes.ok) {
		    throw new Error(
		      `socket-proxy POST /containers/${containerName}/exec failed: ${createRes.status} ${createRes.statusText}`,
		    );
		  }
		  const createBody = await createRes.json();
		  const execId = createBody.Id;
		  if (!execId) {
		    throw new Error(`socket-proxy /exec did not return Id: ${JSON.stringify(createBody)}`);
		  }

		  // Step 2: start exec, read stdout. The response is Docker's
		  // multiplexed exec stream: 8-byte header per frame + payload bytes.
		  const startUrl = `${apiBase}/exec/${execId}/start`;
		  const startRes = await fetchImpl(startUrl, {
		    method: 'POST',
		    headers: { 'Content-Type': 'application/json' },
		    body: JSON.stringify({ Detach: false, Tty: false }),
		  });
		  if (!startRes.ok) {
		    throw new Error(
		      `socket-proxy POST /exec/${execId}/start failed: ${startRes.status} ${startRes.statusText}`,
		    );
		  }
		  const buf = new Uint8Array(await startRes.arrayBuffer());

		  // Keep only the stdout frames; an empty result means the file was empty
		  // (or only stderr was produced), which is reported as an error.
		  const stdoutBytes = decodeDockerExecStream(buf);
		  const secret = new TextDecoder('utf-8').decode(stdoutBytes).trim();
		  if (!secret) {
		    throw new Error(`/tmp/olam-container-secret empty in container ${containerName}`);
		  }
		  return secret;
		}

		/**
		 * Decode Docker's multiplexed exec stream — keep only stdout (stream id 1).
		 * Stream format: each frame is 8-byte header + payload. Header byte 0
		 * is the stream id (0=stdin, 1=stdout, 2=stderr); bytes 4-7 are the
		 * payload length as big-endian uint32. Bytes 1-3 are reserved (zero).
		 *
		 * Malformed input is tolerated rather than rejected: a trailing fragment
		 * shorter than a header is ignored, and a frame whose declared length
		 * runs past the end of the buffer contributes only the bytes present.
		 *
		 * @param {Uint8Array} buf
		 * @returns {Uint8Array} concatenated stdout payloads (empty if none)
		 */
		export function decodeDockerExecStream(buf) {
		  const HEADER_BYTES = 8;
		  const stdoutFrames = [];
		  let stdoutTotal = 0;
		  let offset = 0;

		  while (offset + HEADER_BYTES <= buf.byteLength) {
		    const streamId = buf[offset];
		    // `>>> 0` reinterprets the 32-bit result as UNSIGNED. Without it a
		    // length with the top bit set reads as negative, and a value of -8
		    // would leave the cursor in place forever.
		    const len =
		      ((buf[offset + 4] << 24) |
		        (buf[offset + 5] << 16) |
		        (buf[offset + 6] << 8) |
		        buf[offset + 7]) >>> 0;
		    const payloadStart = offset + HEADER_BYTES;

		    if (streamId === 1) {
		      // subarray clamps to the buffer end, so a truncated frame is safe.
		      const payload = buf.subarray(payloadStart, payloadStart + len);
		      stdoutFrames.push(payload);
		      stdoutTotal += payload.byteLength;
		    }
		    offset = payloadStart + len;
		  }

		  const merged = new Uint8Array(stdoutTotal);
		  let writeAt = 0;
		  for (const frame of stdoutFrames) {
		    merged.set(frame, writeAt);
		    writeAt += frame.byteLength;
		  }
		  return merged;
		}

-261

host-cp/src/crystallize-planning.mjs

		// crystallize-planning — atomic-or-compensating chunk-copy from a planning
		// session (_planning world) into a freshly provisioned real world.
		//
		// APPEND-ONLY CONSTRAINT: The chunks table has a NO_DELETE + NO_UPDATE
		// trigger (chunks_append_only_trigger). If chunk-copy fails mid-batch,
		// any chunks already INSERTed under the new worldId STAY in the database.
		// Compensating cleanup only calls destroyWorld (world container teardown) —
		// it CANNOT delete the orphaned chunks. Those orphan chunks are harmless:
		// • idx_chunks_planning only covers world_id='_planning' rows.
		// • The destroyed world container no longer exists, so no subscriber
		// will ever observe those orphans through the normal shape proxy.
		// • Any future re-crystallize creates a fresh worldId, fresh session_id.
		//
		// IDEMPOTENCY:
		// • If crystallize_status is 'crystallized' (with a stored worldId),
		// return immediately — the work is already done.
		// • If crystallize_status is 'in_progress', we cannot safely resume
		// (we don't know how far the previous copy got, and the chunk INSERT
		// is not idempotent by worldId+sessionId alone — the PRIMARY KEY is
		// (message_id, seq), so the same chunk could be re-inserted into a
		// different new session without collision). Safe behavior: return
		// the current status so the UI can display "in progress" and the
		// operator can force-retry after manual inspection.
		//
		// SLUG RULE: lowercased, non-alphanum → hyphens, max 40 chars.
		// Matches the dev-substrate stub in plan-chat-spa/src/server/index.ts
		// (confirmed by reading that file's crystallize stub, around line 983).

		import { randomUUID } from 'node:crypto';
		import { PLANNING_WORLD_ID } from '@olam/chunks/schema';
		import { setCrystallizeStatus } from './planning-sessions.mjs';

		/**
		 * Slug a plan title into a world-name-safe string.
		 *
		 * Steps: lowercase → collapse each run of characters outside `[a-z0-9]`
		 * into one hyphen → strip leading/trailing hyphens → cap at 40 chars →
		 * strip a trailing hyphen that the cap may have exposed. Falls back to
		 * 'plan' when nothing is left (including a null/undefined title).
		 *
		 * @param {string} title
		 * @returns {string} non-empty slug of at most 40 characters
		 */
		function slugTitle(title) {
		  const slug = String(title ?? '')
		    .toLowerCase()
		    .replace(/[^a-z0-9]+/g, '-')
		    .replace(/^-+|-+$/g, '')
		    .slice(0, 40)
		    // Cutting at 40 can land right after a separator ("…word-"); drop it
		    // so the documented "no trailing hyphen" rule holds for long titles.
		    .replace(/-+$/, '');
		  return slug || 'plan';
		}

		/**
		 * Look up a planning session's crystallize bookkeeping.
		 *
		 * A session with no row in `planning_sessions` is reported as
		 * `{ crystallize_status: 'open', crystallized_world_id: null }`.
		 *
		 * @param {object} pool Object exposing `.query(sql, params)`.
		 * @param {string} sessionId
		 * @returns {Promise<{crystallize_status: string, crystallized_world_id: string | null}>}
		 */
		async function readCrystallizeState(pool, sessionId) {
		  const sql = `SELECT crystallize_status, crystallized_world_id
		FROM planning_sessions
		WHERE session_id = $1`;
		  const { rows } = await pool.query(sql, [sessionId]);

		  if (rows.length === 0) {
		    return { crystallize_status: 'open', crystallized_world_id: null };
		  }

		  const { crystallize_status, crystallized_world_id } = rows[0];
		  return {
		    crystallize_status,
		    // Normalise a missing/undefined column to null.
		    crystallized_world_id: crystallized_world_id ?? null,
		  };
		}

		/**
		 * Fetch every chunk stored under the planning world for one session,
		 * sorted by ascending `seq`.
		 *
		 * @param {object} pool Object exposing `.query(sql, params)`.
		 * @param {string} sessionId
		 * @returns {Promise<Array<{world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type}>>}
		 */
		async function selectPlanningChunks(pool, sessionId) {
		  const sql = `SELECT world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type
		FROM chunks
		WHERE world_id = $1 AND session_id = $2
		ORDER BY seq ASC`;
		  const params = [PLANNING_WORLD_ID, sessionId];
		  const { rows } = await pool.query(sql, params);
		  return rows;
		}

		/**
		 * Copy one planning chunk into the new world's session.
		 * `message_id`, `seq` and the remaining payload columns are carried over
		 * verbatim; only `world_id` and `session_id` are replaced.
		 *
		 * @param {object} pool Object exposing `.query(sql, params)`.
		 * @param {object} chunk — row from the planning session
		 * @param {string} newWorldId
		 * @param {string} newSessionId
		 * @returns {Promise<void>}
		 */
		async function insertChunkIntoNewWorld(pool, chunk, newWorldId, newSessionId) {
		  const carriedColumns = [
		    chunk.message_id,
		    chunk.seq,
		    chunk.actor_id,
		    chunk.actor_type,
		    chunk.role,
		    chunk.chunk,
		    chunk.chunk_type,
		  ];
		  const sql = `INSERT INTO chunks
		(world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`;
		  await pool.query(sql, [newWorldId, newSessionId, ...carriedColumns]);
		}

		/**
		 * Append a system "crystallized" marker to the ORIGINAL planning session
		 * as an audit trail. The row is written under the planning world id and
		 * the original `sessionId`, with a fresh random `message_id` and a `seq`
		 * one above the session's current maximum (0 when the session is empty).
		 *
		 * The max-seq lookup and the insert are two separate queries.
		 *
		 * @param {object} pool Object exposing `.query(sql, params)`.
		 * @param {string} sessionId — original planning session id
		 * @param {string} worldId — newly created world id
		 * @param {number} phaseCount — number of phases in the plan
		 * @returns {Promise<void>}
		 */
		async function insertMarkerChunk(pool, sessionId, worldId, phaseCount) {
		  const markerMessageId = randomUUID();

		  // COALESCE(..., -1) makes the first marker in an empty session land on seq 0.
		  const { rows: maxRows } = await pool.query(
		    `SELECT COALESCE(MAX(seq), -1) AS max_seq
		FROM chunks
		WHERE world_id = $1 AND session_id = $2`,
		    [PLANNING_WORLD_ID, sessionId],
		  );
		  const markerSeq = Number(maxRows[0].max_seq) + 1;

		  const plural = phaseCount === 1 ? '' : 's';
		  const markerText = `Plan crystallized into world "${worldId}" (${phaseCount} phase${plural}).`;

		  await pool.query(
		    `INSERT INTO chunks
		(world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
		    [
		      PLANNING_WORLD_ID,
		      sessionId,
		      markerMessageId,
		      markerSeq,
		      'system',
		      'system',
		      'system',
		      markerText,
		      'text',
		    ],
		  );
		}

		/**
		 * crystallizePlanningSession
		 *
		 * 4-phase atomic-or-compensating process:
		 * 1. Set crystallize_status='in_progress'
		 * 2. Call createWorld({ name: slugged-title }) → { id: worldId }
		 * 3. SELECT all chunks in _planning/sessionId; INSERT each into new world
		 * 4. Set crystallize_status='crystallized' (with worldId); INSERT marker chunk
		 *
		 * Compensating pattern on partial failure:
		 * - If createWorld throws (or returns no id): mark 'failed', rethrow.
		 *   destroyWorld is NOT called.
		 * - If chunk-copy throws mid-batch: mark 'failed', call destroyWorld(worldId),
		 *   rethrow. Chunks already INSERTed stay (append-only; see file header).
		 * - Marking 'failed' is best-effort: if that write itself fails it is
		 *   logged, destroyWorld still runs, and the ORIGINAL error is what the
		 *   caller receives.
		 * - If the final 'crystallized' status write throws, the error propagates
		 *   unchanged and no compensation is attempted.
		 * - The audit marker is best-effort: once the session is recorded as
		 *   crystallized, a marker failure is logged and the call still succeeds.
		 *
		 * Idempotency:
		 * - Already 'crystallized': return immediately without re-running.
		 * - Already 'in_progress': return current status (safe short-circuit).
		 * - The status read and the 'in_progress' write are separate calls, so
		 *   two concurrent invocations for one session can both pass the guard.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — pg.Pool-compatible with .query()
		 * @param {string} opts.sessionId — planning session to crystallize
		 * @param {string} opts.planTitle — plan title (used for world name slug)
		 * @param {Array} opts.planPhases — array of phase objects; only its length
		 * is used (a non-array counts as 0 phases)
		 * @param {Function} opts.createWorld — async ({ name }) => { id: string, ... }
		 * @param {Function} opts.destroyWorld — async (worldId) => void
		 *
		 * @returns {Promise<{worldId: string, status: string, new_session_id: string | null}>}
		 * `new_session_id` is null on the two idempotent short-circuits.
		 * @throws the original createWorld / chunk-copy error, or the error from
		 * the 'in_progress' / 'crystallized' status writes.
		 */
		export async function crystallizePlanningSession({
		  pool,
		  sessionId,
		  planTitle,
		  planPhases,
		  createWorld,
		  destroyWorld,
		}) {
		  // ── Idempotency guard ────────────────────────────────────────────────────
		  const currentState = await readCrystallizeState(pool, sessionId);

		  if (currentState.crystallize_status === 'crystallized') {
		    return {
		      worldId: currentState.crystallized_world_id,
		      status: `crystallized:${currentState.crystallized_world_id}`,
		      new_session_id: null,
		    };
		  }

		  if (currentState.crystallize_status === 'in_progress') {
		    // Cannot safely resume without knowing how far the copy got.
		    // Return current status so the UI shows 'in_progress'.
		    return {
		      worldId: currentState.crystallized_world_id,
		      status: 'in_progress',
		      new_session_id: null,
		    };
		  }

		  // Best-effort 'failed' marker. It must never throw: a throw here would
		  // replace the real failure and (in phase 3) skip the world teardown.
		  const markFailed = async () => {
		    try {
		      await setCrystallizeStatus({ pool, sessionId, status: 'failed', worldId: null });
		    } catch (statusErr) {
		      console.error(
		        `[crystallize-planning] could not mark session ${sessionId} as failed: ${statusErr?.message ?? statusErr}`,
		      );
		    }
		  };

		  // ── Phase 1: mark in_progress ────────────────────────────────────────────
		  await setCrystallizeStatus({ pool, sessionId, status: 'in_progress', worldId: null });

		  // ── Phase 2: create world ────────────────────────────────────────────────
		  let worldId;
		  try {
		    const world = await createWorld({ name: slugTitle(planTitle) });
		    worldId = world?.id;
		    // Without an id, phase 3 would copy chunks under an undefined world id
		    // and a later compensating destroy would have nothing to target.
		    if (worldId === undefined || worldId === null || worldId === '') {
		      throw new Error('createWorld did not return a world id');
		    }
		  } catch (err) {
		    await markFailed();
		    throw err;
		  }

		  // ── Phase 3: copy chunks into new world ──────────────────────────────────
		  const newSessionId = randomUUID();
		  try {
		    const chunks = await selectPlanningChunks(pool, sessionId);
		    for (const chunk of chunks) {
		      await insertChunkIntoNewWorld(pool, chunk, worldId, newSessionId);
		    }
		  } catch (err) {
		    await markFailed();
		    try {
		      await destroyWorld(worldId);
		    } catch (destroyErr) {
		      // Compensating destroy failure is non-fatal, but the caller never
		      // sees it, so record it here.
		      console.error(
		        `[crystallize-planning] compensating destroyWorld(${worldId}) failed: ${destroyErr?.message ?? destroyErr}`,
		      );
		    }
		    throw err;
		  }

		  // ── Phase 4: mark crystallized + insert marker ───────────────────────────
		  await setCrystallizeStatus({ pool, sessionId, status: 'crystallized', worldId });

		  const phaseCount = Array.isArray(planPhases) ? planPhases.length : 0;
		  try {
		    await insertMarkerChunk(pool, sessionId, worldId, phaseCount);
		  } catch (markerErr) {
		    // The session is already recorded as crystallized; throwing now would
		    // report failure for work that is done and cannot be re-run.
		    console.warn(
		      `[crystallize-planning] marker chunk for session ${sessionId} not written (non-fatal): ${markerErr?.message ?? markerErr}`,
		    );
		  }

		  return {
		    worldId,
		    status: `crystallized:${worldId}`,
		    new_session_id: newSessionId,
		  };
		}

-157

host-cp/src/dispatch-persister.mjs

		/**
		* dispatch-persister.mjs — persist the last dispatch for each world.
		*
		* The world watchdog's recovery hook reads this to replay the last
		* unanswered prompt when it auto-recovers a wedged claude process.
		*
		* Contract:
		* persist({ worldId, messageId, prompt, source, statePath?, now? })
		* Atomically writes ~/.olam/worlds/<worldId>/state/last-dispatch.json.
		* Overwrites any previous file — only the LATEST dispatch matters for
		* replay. Atomic write (tmp + fs.rename) prevents partial-write residue
		* from corrupting recovery reads.
		*
		* read({ worldId, statePath? })
		* Returns { messageId, prompt, dispatchedAt, source } or null.
		* null on ENOENT (no dispatch persisted yet) — never throws.
		* null on JSON parse error (logs + skips) — never throws on corrupt file.
		*
		* Multiple worlds are independent: world A and world B have separate files.
		* Multiple concurrent persist() calls for the SAME world are safe — each
		* write is a rename of a tmp file so the worst case is one write winning.
		*
		* @see docs/architecture/world-watchdog.md
		*/

		import fs from 'node:fs/promises';
		import path from 'node:path';
		import os from 'node:os';

		// Default base path under which per-world state directories live:
		// `<home directory>/.olam/worlds`. Computed once, at module load.
		// NOTE(review): this always resolves via os.homedir() and does not
		// consult OLAM_HOME or a container bind-mount — confirm that is intended
		// when host-cp runs inside a container.
		const DEFAULT_STATE_BASE = path.join(os.homedir(), '.olam', 'worlds');

		/**
		 * Build the location of a world's `last-dispatch.json`:
		 * `<stateBase>/<worldId>/state/last-dispatch.json`.
		 *
		 * @param {string} worldId
		 * @param {string} [stateBase] Base directory; defaults to
		 * DEFAULT_STATE_BASE (tests pass their own).
		 * @returns {string}
		 */
		export function lastDispatchPath(worldId, stateBase = DEFAULT_STATE_BASE) {
		  const segments = [stateBase, worldId, 'state', 'last-dispatch.json'];
		  return path.join(...segments);
		}

		/**
		 * Persist the last dispatch for a world.
		 *
		 * Writes `{ messageId, prompt, dispatchedAt, source }` as pretty-printed
		 * JSON to `statePath` (or, when omitted, the path derived from `worldId`),
		 * creating parent directories as needed and replacing any previous file.
		 *
		 * The write is staged in a temp file that is UNIQUE PER CALL and then
		 * renamed over the target. A shared temp name would let two concurrent
		 * calls for the same world interleave their writes, and the second
		 * rename would fail with ENOENT once the first had moved the file away.
		 *
		 * @param {{
		 * worldId: string,
		 * messageId: string,
		 * prompt: string,
		 * source: string,
		 * statePath?: string,
		 * now?: () => number,
		 * }} opts `now` returns epoch milliseconds and supplies `dispatchedAt`.
		 * @returns {Promise<void>}
		 * @throws the underlying fs error when mkdir, writeFile or rename fails;
		 * the staged temp file is removed (best-effort) before rethrowing.
		 */
		export async function persist({
		  worldId,
		  messageId,
		  prompt,
		  source,
		  statePath,
		  now = () => Date.now(),
		}) {
		  const filePath = statePath ?? lastDispatchPath(worldId);
		  const dir = path.dirname(filePath);

		  // pid + clock + random suffix: distinct across processes and across
		  // overlapping calls in one process. Uniqueness only, not security.
		  const uniqueSuffix = `${process.pid}.${Date.now().toString(36)}.${Math.random().toString(36).slice(2)}`;
		  const tmpPath = `${filePath}.${uniqueSuffix}.tmp`;

		  const record = {
		    messageId,
		    prompt,
		    dispatchedAt: new Date(now()).toISOString(),
		    source,
		  };

		  // Ensure the directory exists.
		  await fs.mkdir(dir, { recursive: true });

		  try {
		    // Atomic write: write to the temp file, then rename over the target.
		    await fs.writeFile(tmpPath, JSON.stringify(record, null, 2) + '\n', 'utf8');
		    await fs.rename(tmpPath, filePath);
		  } catch (err) {
		    // Don't leave a stray temp file behind; a cleanup failure must not
		    // hide the real error, so it is deliberately ignored.
		    await fs.rm(tmpPath, { force: true }).catch(() => {});
		    throw err;
		  }
		}

		/**
		 * Fire-and-forget wrapper around persist() for the dispatch call-sites.
		 *
		 * Starts the write without awaiting it and returns immediately. A
		 * rejection from persist() is reported with console.warn, tagged with
		 * `logSource` (which defaults to `source`), and goes no further.
		 * `logSource` is stripped from the options before they reach persist().
		 *
		 * @param {{
		 * worldId: string,
		 * messageId: string,
		 * prompt: string,
		 * source: string,
		 * logSource?: string,
		 * statePath?: string,
		 * now?: () => number,
		 * }} opts
		 * @returns {void}
		 */
		export function safePersistLastDispatch(opts) {
		  const { logSource: tag = opts.source, ...forwarded } = opts;

		  const reportFailure = (err) => {
		    console.warn(
		      `[${tag}] persistLastDispatch failed (non-fatal): ${err?.message ?? err}`,
		    );
		  };

		  // Intentionally not awaited: the caller must not block on the write.
		  persist(forwarded).catch(reportFailure);
		}

		/**
		 * Read the last persisted dispatch for a world.
		 *
		 * Fail-soft: every failure mode resolves to `null` instead of throwing.
		 * A missing file is silent; other read errors, invalid JSON and records
		 * with an unexpected shape are reported via console.error.
		 *
		 * @param {{
		 *   worldId: string,
		 *   statePath?: string,
		 * }} opts `statePath` overrides the derived file location.
		 * @returns {Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null>}
		 */
		export async function read({ worldId, statePath }) {
		  const filePath = statePath ?? lastDispatchPath(worldId);

		  let raw;
		  try {
		    raw = await fs.readFile(filePath, 'utf8');
		  } catch (err) {
		    // ENOENT just means nothing was persisted yet. Anything else
		    // (e.g. permissions) is worth a log line, but still fail-soft.
		    if (err?.code !== 'ENOENT') {
		      console.error(`[dispatch-persister] readFile ${filePath}: ${err?.message ?? err}`);
		    }
		    return null;
		  }

		  let parsed;
		  try {
		    parsed = JSON.parse(raw);
		  } catch (err) {
		    console.error(`[dispatch-persister] ${filePath}: JSON parse error: ${err?.message ?? err}`);
		    return null;
		  }

		  // Basic shape validation — don't throw on a corrupt file.
		  const hasExpectedShape =
		    typeof parsed === 'object' &&
		    parsed !== null &&
		    typeof parsed.messageId === 'string' &&
		    typeof parsed.prompt === 'string' &&
		    typeof parsed.dispatchedAt === 'string' &&
		    typeof parsed.source === 'string';
		  if (!hasExpectedShape) {
		    console.error(`[dispatch-persister] ${filePath}: unexpected shape, skipping`);
		    return null;
		  }

		  // Return only the known fields so stray keys in the file don't leak out.
		  const { messageId, prompt, dispatchedAt, source } = parsed;
		  return { messageId, prompt, dispatchedAt, source };
		}

-256

host-cp/src/docker-events.mjs

		// Phase F-2-B (B3): subscribe to docker events stream and invalidate
		// the secret cache on lifecycle events for known worlds.
		//
		// M2 ship gate: `docker restart <world>; within 10s, proxy call returns
		// 200 not 401`. The 10s budget is dominated by docker-events latency
		// (events fire ~1s after the docker daemon emits them) + JSON parse +
		// cache invalidate (<100ms). 10s is conservative.
		//
		// Stream format: Docker sends NDJSON — newline-delimited JSON events.
		// Each event has shape:
		// {"Type":"container","Action":"start","Actor":{"Attributes":{"name":"<container-name>"}},...}
		// We filter `Type === 'container'` && `Action ∈ INVALIDATING_ACTIONS` and
		// extract the worldId from the container name to invalidate the secret cache.
		//
		// Dogfood incident (2026-05-08): host-cp returned `secret_fetch_failed`
		// 502 / `unauthorized 401` after operators ran `docker start <devbox>`
		// on previously-exited world containers. Two bugs combined:
		// 1. The action filter excluded `start`. After SIGKILL → exit, the
		// operator's `docker start` emits a `start` event (NOT `restart`),
		// which the filter dropped — so the stale cached secret survived.
		// 2. The container-name regex was `/^(.+)-devbox$/`, predating the
		// `olam-` prefix added in Phase F-2-D. Even when the filter did
		// fire, it invalidated the wrong cache key (`olam-foo` instead of
		// `foo`), so the actual cache entry stayed.
		// Both are fixed below; tests use production naming to prevent drift.

		import http from 'node:http';
		import { spawn } from 'node:child_process';
		import { getDockerRequestOptions } from './lib/docker-request-options.mjs';

		/**
		 * Docker container actions after which a world's cached secret must be
		 * dropped.
		 *
		 * `pause` / `unpause` are deliberately absent: they don't change the
		 * secret, and invalidating would force an unnecessary docker-exec on
		 * resume.
		 */
		const INVALIDATING_ACTIONS = [
		  // Fresh boot of a previously-exited container; the secret is
		  // regenerated by Phase E init, so the old cached value MUST go.
		  'start',
		  // Implicit stop+start; same secret-regeneration semantics.
		  'restart',
		  // Secret is no longer reachable. Invalidating here prevents host-cp
		  // from handing out a stale value the moment `docker start` brings
		  // the container back.
		  'stop',
		  'die',
		  'kill',
		];

		/**
		 * Subscribe to docker events. Returns a stop function. Auto-reconnects
		 * (fixed 2s delay) whenever the stream ends, errors, answers with a
		 * non-200 status, or the CLI child exits — the events stream is
		 * long-lived and a daemon restart breaks the connection.
		 *
		 * @param {object} args
		 * @param {string} args.dockerHost The sentinel `'docker-cli'` selects
		 *   bare-node mode (spawns `docker events` via child_process). Any other
		 *   value selects the HTTP path: a `unix:` prefix is treated as the
		 *   kubernetes substrate, everything else (e.g. `tcp://...`) as compose.
		 * @param {(worldId: string) => void} args.onWorldRestart
		 *   called when a known world emits one of INVALIDATING_ACTIONS
		 * @param {(info: { worldId: string, action: string, exitCode?: number }) => void} [args.onWorldLifecycleEvent]
		 *   Additive observer (Killshot #2): fires alongside onWorldRestart with
		 *   the raw docker action + exitCode when present. Optional — existing
		 *   callers (tests, etc.) don't supply it.
		 * @param {(message: string) => void} [args.log]
		 * @returns {() => void} stop function; cancels any pending reconnect and
		 *   tears down the active request / child process.
		 */
		export function subscribeDockerEvents({ dockerHost, onWorldRestart, onWorldLifecycleEvent, log = console.log }) {
		  let stopped = false;
		  let activeReq = null;
		  let activeProc = null;
		  let reconnectTimer = null;

		  // Build a `data` handler that reassembles NDJSON lines across chunk
		  // boundaries and dispatches each parsed event to handleEvent. The
		  // optional `normalize` hook lets a transport patch the event shape
		  // first. Anything thrown while parsing or dispatching a line is logged
		  // and does not abort the stream.
		  function createNdjsonHandler(normalize) {
		    let buf = '';
		    return (chunk) => {
		      buf += chunk;
		      // Split on newlines; the last fragment may be partial and stays
		      // in `buf` until the next chunk completes it.
		      let nl;
		      while ((nl = buf.indexOf('\n')) !== -1) {
		        const line = buf.slice(0, nl);
		        buf = buf.slice(nl + 1);
		        if (!line.trim()) continue;
		        try {
		          const event = JSON.parse(line);
		          if (normalize) normalize(event);
		          handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log });
		        } catch (err) {
		          log(`docker-events: parse error on line: ${line.slice(0, 120)} (${err.message})`);
		        }
		      }
		    };
		  }

		  // CLI shape uses `status` instead of the HTTP API's `Action`, and the
		  // stream is already filtered to containers, so fill in both when absent.
		  function normalizeCliEvent(event) {
		    if (event.status && !event.Action) event.Action = event.status;
		    if (event.Type === undefined) event.Type = 'container';
		  }

		  // Bare-node mode: shell out to `docker events --format json` and parse
		  // its stdout as NDJSON. Same semantic as the HTTP path; different
		  // transport. Eliminates the `tcp://docker-cli` URL-construction crash.
		  function connectViaCli() {
		    if (stopped) return;
		    const filters = ['--filter', 'type=container'];
		    log('docker-events: spawning `docker events --format json`');
		    const child = spawn(
		      'docker',
		      ['events', '--format', '{{json .}}', ...filters],
		      { stdio: ['ignore', 'pipe', 'pipe'] },
		    );
		    activeProc = child;
		    child.stdout.setEncoding('utf-8');
		    child.stdout.on('data', createNdjsonHandler(normalizeCliEvent));
		    child.stderr.on('data', (chunk) => {
		      const text = String(chunk).trim();
		      if (text) log(`docker-events: stderr: ${text}`);
		    });
		    child.on('exit', (code, signal) => {
		      // Only forget the handle if it still refers to this child; a late
		      // exit must not clobber a replacement spawned by a reconnect.
		      if (activeProc === child) activeProc = null;
		      log(`docker-events: child exited code=${code} signal=${signal}; reconnecting`);
		      scheduleReconnect();
		    });
		    child.on('error', (err) => {
		      // A failed spawn may never emit `exit`, so drop the dead handle here.
		      if (activeProc === child) activeProc = null;
		      log(`docker-events: spawn error: ${err.message}; reconnecting`);
		      scheduleReconnect();
		    });
		  }

		  function connect() {
		    if (stopped) return;
		    if (dockerHost === 'docker-cli') {
		      return connectViaCli();
		    }
		    // Docker Engine API: GET /events?filters=...
		    // Filter: type=container AND event ∈ INVALIDATING_ACTIONS
		    // (Note: `filters` is a JSON-stringified object of arrays.)
		    //
		    // B8 fix (Phase 2 recovery round-2): use getDockerRequestOptions(substrate)
		    // instead of constructing a URL from dockerHost. The old code did:
		    // new URL('/events', dockerHost.replace(/^tcp:\/\//, 'http://'))
		    // On kubernetes, dockerHost = 'unix:///var/run/docker.sock' — the replace
		    // is a no-op, `unix:` is not a valid http URL base, and Node throws
		    // ERR_INVALID_URL. The options-spread form routes through socketPath
		    // (kubernetes) or host+port (compose), which Node's http module
		    // understands natively. No URL construction needed.
		    const substrate = dockerHost.startsWith('unix:') ? 'kubernetes' : 'compose';
		    const filters = JSON.stringify({
		      type: ['container'],
		      event: INVALIDATING_ACTIONS,
		    });
		    const requestPath = `/events?filters=${encodeURIComponent(filters)}`;
		    const dockerOpts = getDockerRequestOptions(substrate);
		    const connLabel = substrate === 'kubernetes'
		      ? `unix:${dockerOpts.socketPath}/events`
		      : `http://${dockerOpts.host}:${dockerOpts.port}/events`;

		    log(`docker-events: connecting to ${connLabel}`);
		    activeReq = http.get({ ...dockerOpts, path: requestPath }, (res) => {
		      if (res.statusCode !== 200) {
		        log(`docker-events: unexpected status ${res.statusCode}; will retry`);
		        // Drain the unwanted body. An unconsumed response keeps its
		        // socket (and buffered data) alive, which would leak one
		        // connection per retry.
		        res.resume();
		        scheduleReconnect();
		        return;
		      }
		      res.setEncoding('utf-8');
		      res.on('data', createNdjsonHandler());
		      res.on('end', () => {
		        log('docker-events: stream closed; reconnecting');
		        scheduleReconnect();
		      });
		      res.on('error', (err) => {
		        log(`docker-events: stream error: ${err.message}; reconnecting`);
		        scheduleReconnect();
		      });
		    });
		    activeReq.on('error', (err) => {
		      log(`docker-events: connect error: ${err.message}; reconnecting`);
		      scheduleReconnect();
		    });
		  }

		  // Arm a single reconnect attempt. No-op once stopped or when one is
		  // already pending, so overlapping end/error/exit signals collapse into
		  // one reconnect.
		  function scheduleReconnect() {
		    if (stopped) return;
		    if (reconnectTimer) return;
		    reconnectTimer = setTimeout(() => {
		      reconnectTimer = null;
		      connect();
		    }, 2000); // 2s backoff
		  }

		  connect();

		  return function stop() {
		    stopped = true;
		    if (reconnectTimer) {
		      clearTimeout(reconnectTimer);
		      reconnectTimer = null;
		    }
		    if (activeReq) {
		      activeReq.destroy();
		      activeReq = null;
		    }
		    if (activeProc) {
		      try { activeProc.kill('SIGTERM'); } catch { /* ignore */ }
		      activeProc = null;
		    }
		  };
		}

		/**
		 * Examine one docker event and, when it is an invalidating action on a
		 * world container, notify the callbacks. Container naming convention:
		 * `olam-<worldId>-devbox` (per packages/adapters/src/docker/container.ts:67).
		 *
		 * Exported for unit testing.
		 *
		 * @param {{ Type?: string, Action?: string, Actor?: { Attributes?: Record<string, string> } }} event
		 * @param {{ onWorldRestart: (worldId: string) => void, onWorldLifecycleEvent?: (info: { worldId: string, action: string, exitCode?: number }) => void, log: (m: string) => void }} ctx
		 */
		export function handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log }) {
		  if (event?.Type !== 'container') return;

		  const action = event.Action ?? '';
		  if (!INVALIDATING_ACTIONS.includes(action)) return;

		  const attributes = event.Actor?.Attributes;
		  const rawName = attributes?.name;
		  if (!rawName) return;

		  // Docker sometimes prepends a slash to container names; drop it.
		  const containerName = rawName.startsWith('/') ? rawName.slice(1) : rawName;

		  // Only `olam-<worldId>-devbox` counts as a world container. Requiring
		  // the `-devbox` suffix keeps host-cp's own container (`olam-host-cp`)
		  // and the docker-socket-proxy out of the cache-invalidate path even
		  // though they may share the `olam-` prefix.
		  const match = /^olam-(.+)-devbox$/.exec(containerName);
		  if (!match) return;
		  const [, worldId] = match;

		  log(`docker-events: ${action} on ${containerName} → invalidating ${worldId}`);
		  onWorldRestart(worldId);

		  // Killshot #2 (additive): the lifecycle observer is optional.
		  if (!onWorldLifecycleEvent) return;

		  // The exit code is read from Actor.Attributes.exitCode and coerced
		  // best-effort. A missing or non-finite value is forwarded as
		  // undefined so the observer sees an unambiguous "no exit code
		  // observed" signal.
		  const rawExit = attributes?.exitCode;
		  const numericExit = rawExit === undefined ? NaN : Number(rawExit);
		  const exitCode = Number.isFinite(numericExit) ? numericExit : undefined;

		  try {
		    onWorldLifecycleEvent({ worldId, action, exitCode });
		  } catch (err) {
		    // Best-effort instrumentation: an observer failure must not break
		    // the cache-invalidate hot path.
		    log(`docker-events: onWorldLifecycleEvent threw for ${worldId}: ${err.message}`);
		  }
		}

-32

host-cp/src/engine-identity.mjs

		// Container-engine identity for host-cp.
		//
		// Phase 1a / A1: defaults to "docker"; switches to "kubernetes" when running
		// inside a K8s pod (autodetected via KUBERNETES_SERVICE_HOST). Operators can
		// override either way via OLAM_HOST_CP_ENGINE.
		//
		// This module exists separately from server.mjs to keep the engine-resolution
		// logic pure (no I/O, no mkdir, no global side-effects) so unit tests can
		// import it without triggering server startup. server.mjs imports
		// resolveHostCpEngine from here and computes its module-level HOST_CP_ENGINE
		// constant.
		//
		// KubernetesEngine adapter (Phase B / PR3) consumes the same env variables
		// when constructing the engine; the context-allowlist guard (T6 / Decision 10)
		// lives inside that adapter, not here. This module is "what name to surface
		// in the X-Olam-Engine response header" — nothing more.

		/**
		 * Resolve the active container-engine identity for host-cp.
		 *
		 * Precedence (matches HOST_CP_MODE convention at server.mjs:85-87):
		 * 1. Explicit env override: OLAM_HOST_CP_ENGINE=docker|kubernetes
		 *    (returned verbatim; a blank value is treated as unset)
		 * 2. Autodetect: KUBERNETES_SERVICE_HOST set → "kubernetes"
		 * 3. Default: "docker"
		 *
		 * @param {NodeJS.ProcessEnv} [env=process.env] - environment to inspect.
		 * @returns {string} - engine identity surfaced via X-Olam-Engine header.
		 */
		export function resolveHostCpEngine(env = process.env) {
		  const override = env.OLAM_HOST_CP_ENGINE;
		  // A variable exported as "" (e.g. `OLAM_HOST_CP_ENGINE=` in an env file)
		  // is not nullish, so `??` alone would surface an empty engine name.
		  // Blank means "no override": fall through to autodetection.
		  if (override != null && String(override).trim() !== '') {
		    return override;
		  }
		  return env.KUBERNETES_SERVICE_HOST ? 'kubernetes' : 'docker';
		}

-71

host-cp/src/global-config-source.mjs

		// E1 (Phase E — olam-repos-and-runbooks): read ~/.olam/config.json and
		// expose it to the host-CP API endpoints (/api/repos, /api/runbooks).
		//
		// Never crashes: missing file → empty arrays, corrupt JSON → { error }.
		// Mirrors the workspace-catalog.mjs pattern: pure function, env-driven
		// path, no side effects at module load time.

		import fs from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';

		// Config file location: OLAM_CONFIG_PATH when it is set, otherwise
		// ~/.olam/config.json.
		const DEFAULT_CONFIG_PATH = process.env.OLAM_CONFIG_PATH == null
		  ? path.join(os.homedir(), '.olam', 'config.json')
		  : process.env.OLAM_CONFIG_PATH;

		/**
		* @typedef {object} RepoEntry
		* @property {string} name
		* @property {string} path
		* @property {string} [description]
		* @property {number} [addedAt]
		* @property {number} [updatedAt]
		*/

		/**
		* @typedef {object} Runbook
		* @property {string} name
		* @property {string[]} repos
		* @property {number} [updatedAt]
		* @property {Record<string, Record<string, number>>} [portMap]
		* @property {Record<string, Record<string, string>>} [env]
		*/

		/**
		* @typedef {{ repos: RepoEntry[], runbooks: Runbook[] }} GlobalConfig
		* @typedef {{ error: string }} ConfigError
		*/

		/**
		 * Load the global olam config from disk. Never throws.
		 * - Missing file → `{ repos: [], runbooks: [] }`
		 * - Unreadable file → `{ error: string }`
		 * - Corrupt JSON, or JSON that is not an object → `{ error: string }`
		 * - Success → `{ repos: RepoEntry[], runbooks: Runbook[] }`; a `repos` or
		 *   `runbooks` value that is not an array is replaced by `[]`.
		 *
		 * @param {string} [configPath]
		 * @returns {GlobalConfig | ConfigError}
		 */
		export function loadGlobalConfig(configPath = DEFAULT_CONFIG_PATH) {
		  const emptyConfig = () => ({ repos: [], runbooks: [] });

		  if (!fs.existsSync(configPath)) {
		    return emptyConfig();
		  }

		  let raw;
		  try {
		    raw = fs.readFileSync(configPath, 'utf-8');
		  } catch (err) {
		    // The file can vanish between the existence check and the read;
		    // that is still "missing file", not a read failure.
		    if (err?.code === 'ENOENT') return emptyConfig();
		    return { error: `Failed to read ${configPath}: ${err.message}` };
		  }

		  let parsed;
		  try {
		    parsed = JSON.parse(raw);
		  } catch (err) {
		    return { error: `Invalid JSON in ${configPath}: ${err.message}` };
		  }

		  // Valid JSON can still be null, a primitive or an array — none of
		  // which can carry the `repos` / `runbooks` keys.
		  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
		    return { error: `${configPath} does not contain a JSON object` };
		  }

		  return {
		    repos: Array.isArray(parsed.repos) ? parsed.repos : [],
		    runbooks: Array.isArray(parsed.runbooks) ? parsed.runbooks : [],
		  };
		}

-43

host-cp/src/halt-detect.mjs

		// W4 — Halt-shape detection for the host-cp chunk-write proxy.
		//
		// When plan-DO's dispatchPlanningAgent (W1) trips a guardrail, it
		// emits a chunk with chunk_type='goal_mode_assumption' and content
		// matching: `[assumption: <cap>-tripped — spent $X.XXXX of $Y]` (or
		// similar shape per GuardrailState.haltChunkText()).
		//
		// host-cp's /api/plan-chat proxy passes the chunk through to the
		// chunks substrate AND, if it detects a halt-shaped chunk, broadcasts
		// a typed `plan.halted` event on host-stream so the SPA's
		// PlanHaltBanner subscriber fires.
		//
		// Extracted as a pure fn so it can be unit-tested without booting
		// the host-cp server.

		// Matches the start of a halt chunk:
		//   `[assumption: <cap>-tripped` optionally followed by ` — spent $<amount>`.
		// Group 1 is the tripped cap, group 2 the spent amount when present.
		const HALT_RE =
		  /^\[assumption:\s(usd|turns|tool_calls|wall_clock)-tripped(?:\s—\sspent\s\$([0-9.]+))?/;

		/**
		 * Detect a halt-shaped chunk + extract its components.
		 *
		 * Returns null when:
		 * - chunk is null/undefined or not an object
		 * - chunk_type isn't 'goal_mode_assumption'
		 * - the `chunk` text isn't a string or doesn't match the halt regex
		 *
		 * Returns the parsed payload otherwise. Caller broadcasts this as
		 * the `plan.halted` event payload.
		 *
		 * @param {{ chunk_type?: string, chunk?: string, session_id?: string, operator_id?: string } | null | undefined} chunk
		 * @returns {{ plan_id: string, operator_id: string, halt_reason: string, usd_spent_so_far: number | undefined, halted_at: number } | null}
		 */
		export function detectHaltChunk(chunk) {
		  if (!chunk || typeof chunk !== 'object') return null;
		  if (chunk.chunk_type !== 'goal_mode_assumption') return null;
		  if (typeof chunk.chunk !== 'string') return null;

		  const m = chunk.chunk.match(HALT_RE);
		  if (!m) return null;

		  // `[0-9.]+` also accepts text like "." that is not a number; report
		  // that as "unknown" (undefined) rather than leaking NaN downstream.
		  const spent = m[2] ? Number.parseFloat(m[2]) : undefined;

		  return {
		    plan_id: chunk.session_id ?? 'unknown',
		    operator_id: chunk.operator_id ?? 'unknown',
		    halt_reason: m[1],
		    usd_spent_so_far: Number.isFinite(spent) ? spent : undefined,
		    halted_at: Date.now(),
		  };
		}

-504

host-cp/src/host-stream.mjs

		// Phase A → E (sse-consolidation): server-side multiplexed-SSE broadcaster.
		//
		// Single endpoint /api/host-stream replaces ~20 SPA polling loops. Hooks
		// subscribe to typed events on one connection instead of opening one
		// setInterval-loop per resource.
		//
		// Mirrors planOrchestrator.addEventSink fanout pattern verbatim — same
		// per-sink ServerResponse Set, same `event: <name>\ndata: <json>\n\n`
		// wire format, same cleanup-on-disconnect contract. Differences:
		//
		// - Keyed by event TYPE rather than conversationId (the broadcaster is
		// global to the host-cp, not per-conversation).
		// - Caches last-known payload per event type so reconnecting clients
		// receive an immediate snapshot replay before live updates resume.
		// - No turn-buffering — snapshots are idempotent so reconnect == latest.
		//
		// Phase E adds operational polish:
		// - E1: per-event-type trailing-edge debounce (default 100ms).
		// Coalesces broadcast storms during world boot.
		// - E2: per-sink 25s heartbeat (`:\n\n` comment) to keep idle SSE
		// connections alive across most proxy 60s timeouts.
		// - E3: backpressure-aware writes — slow sinks queue up to a bounded
		// in-memory buffer; overflow drops oldest events with an
		// `:overflow` comment so consumers know they missed updates.
		// - E4: per-event-type broadcast counter + sink count metric line.
		// - E5: the metrics tick ALSO broadcasts a `stream.health` typed event
		// carrying the same counters it logs, so any SPA tab can observe
		// live stream health (sink count, per-event broadcast rates,
		// overflow drops) without polling. Snapshot-cached like every
		// other state event — reconnecting clients replay the last
		// health payload immediately (first-paint parity). Opt out via
		// `deps.healthEvents = false`.
		//
		// Pure module: no docker, no DB, no global clock except `setInterval`
		// for the heartbeat/metrics timers (injectable in tests). Wiring those
		// sources to broadcast(...) lives in server.mjs (A4 + A5).
		//
		// References:
		// - packages/host-cp/src/server.mjs:1531 SSE writer template
		// - packages/host-cp/src/plan-orchestrator.mjs:967 addEventSink shape
		// - docs/plans/sse-consolidation/plan-source.md full design
		// - docs/plans/sse-consolidation/phase-e-tasks.md E1-E4 acceptance

		import crypto from 'node:crypto';

		/**
		* @typedef {object} HostStreamDeps
		* @property {(message: string) => void} [log] defaults to no-op
		* @property {object} [debounceMs] per-event-type debounce override
		* @property {number} [debounceMs.default] default trailing-edge ms (Phase E1)
		* @property {number} [heartbeatMs] per-sink heartbeat interval (Phase E2)
		* @property {number} [metricsMs] per-broadcaster metrics tick (Phase E4)
		* @property {boolean} [healthEvents] broadcast `stream.health` on each metrics tick (Phase E5; default true)
		* @property {number} [maxQueuedPerSink] bounded queue size (Phase E3)
		* @property {() => number} [now] injectable clock for `stream.health.at` (tests)
		* @property {(cb: () => void, ms: number) => any} [setTimer] injectable setInterval (tests)
		* @property {(handle: any) => void} [clearTimer] injectable clearInterval (tests)
		*/

		/**
		* @typedef {object} HostStream
		* @property {(res: import('node:http').ServerResponse) => () => void} addSink
		* @property {(eventType: string, payload: unknown) => number} broadcast
		* @property {() => Record<string, unknown>} snapshot
		* @property {() => void} close
		* @property {() => number} sinkCount
		* @property {() => HostStreamMetrics} metrics
		* @property {() => void} flushDebounced test-only — fire all pending coalesced broadcasts immediately
		*/

		/**
		* @typedef {object} HostStreamMetrics
		* @property {Record<string, number>} events per-event-type broadcasts since last reset
		* @property {number} sinks current active-sink count
		* @property {number} overflows total `:overflow` drops since last reset
		*/

		/**
		* Payload wire-shape for the `stream.health` event (Phase E5). A
		* point-in-time projection of the broadcaster's own observability
		* counters, emitted on each metrics tick. `events` carries the
		* per-event-type broadcast counts accrued during the just-elapsed
		* interval (reset afterward), so consumers see a per-interval RATE
		* rather than a monotonic total. `at` is the wall-clock emit time so a
		* reconnecting client can tell how stale the replayed snapshot is.
		*
		* @typedef {object} StreamHealthPayload
		* @property {Record<string, number>} events per-event broadcasts during the interval
		* @property {number} sinks active-sink count at emit time
		* @property {number} overflows `:overflow` drops during the interval
		* @property {number} intervalMs the metrics-tick cadence that produced this payload
		* @property {number} at Date.now() at emit time
		*/

		/** Name of the typed event the metrics tick broadcasts (Phase E5). */
		export const STREAM_HEALTH_EVENT = 'stream.health';

		/**
		 * Skill Forge promote progress (spa-harness-forge Phase C / C32). The
		 * host-side promote job runner broadcasts this as it advances a
		 * promote_jobs row, so the SPA's /forge editor reflects status live
		 * (push-not-poll). Payload shape:
		 * { jobId, artifactId, status: 'promoting'|'published'|'failed', pr_url?, error? }
		 * `broadcast()` is generic, so emitting is just `broadcast(FORGE_PROMOTE_EVENT, …)`.
		 */
		export const FORGE_PROMOTE_EVENT = 'forge.promote';

		// Fallbacks used when the corresponding `deps` option is not supplied.
		const DEFAULT_DEBOUNCE_MS = 100;
		const DEFAULT_HEARTBEAT_MS = 25 * 1000;
		const DEFAULT_METRICS_MS = 60 * 1000;
		const DEFAULT_MAX_QUEUED = 64;

		/**
		 * Event types that are debounced (trailing edge, Phase E1) unless a
		 * per-type override says otherwise. All four are idempotent
		 * state-replay snapshots, where "last writer wins" is correct and a
		 * 100ms cap on update propagation is acceptable. Latency-sensitive
		 * events (`question.pending`) and connect-only events (`ready`) stay
		 * immediate simply by not being listed here.
		 *
		 * `deps.debounceMs[type] = 0` forces any event off the debounce path; a
		 * non-zero override turns debouncing on with a custom window. Callers
		 * should not need to opt anything new into debouncing — adding a new
		 * snapshot event implies adding it to this set.
		 */
		const DEFAULT_DEBOUNCED_EVENTS = new Set()
		  .add('world.snapshot')
		  .add('tunnels.snapshot')
		  .add('servers.snapshot')
		  .add('listening.snapshot');

		/**
		* Create a host-stream broadcaster. Stateless w.r.t. the request — all
		* source-of-truth wiring (docker events, worlds.db, etc.) is done by
		* the caller via repeated `broadcast()` invocations.
		*
		* @param {HostStreamDeps} [deps]
		* @returns {HostStream}
		*/
		export function createHostStream(deps = {}) {
		const log = deps.log ?? (() => {});
		const defaultDebounceMs = deps.debounceMs?.default ?? DEFAULT_DEBOUNCE_MS;
		const heartbeatMs = deps.heartbeatMs ?? DEFAULT_HEARTBEAT_MS;
		const metricsMs = deps.metricsMs ?? DEFAULT_METRICS_MS;
		const healthEvents = deps.healthEvents ?? true;
		const now = deps.now ?? (() => Date.now());
		const maxQueuedPerSink = deps.maxQueuedPerSink ?? DEFAULT_MAX_QUEUED;
		const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
		const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));

		/**
		* @typedef {object} SinkState
		* @property {import('node:http').ServerResponse} res
		* @property {string[]} queue
		* @property {boolean} paused true while waiting for a `drain` event
		* @property {boolean} draining true while flushQueue is iterating
		* @property {boolean} drainListenerAttached
		* @property {any \| null} heartbeatHandle
		* @property {number} overflows
		*/

		/** @type {Map<import('node:http').ServerResponse, SinkState>} */
		const sinks = new Map();

		/** @type {Map<string, unknown>} last-known payload per event type */
		const snapshots = new Map();

		/** @type {Map<string, any>} pending debounce timers per event type */
		const debounceTimers = new Map();

		/** Per-event-type broadcast counters since last metrics flush. */
		const eventCounters = new Map();
		let overflowCounter = 0;

		let closed = false;
		let metricsHandle = null;

		function formatEvent(eventType, payload) {
		return `event: ${eventType}\ndata: ${JSON.stringify(payload)}\n\n`;
		}

		/**
		* Queue-aware write. If the underlying socket's `res.write` returns
		* `false` we buffer the chunk in the per-sink queue and register a
		* one-shot `drain` listener to flush it when the kernel reports the
		* socket is writable again. On overflow we emit `:overflow` so
		* consumers know they missed updates and drop oldest.
		*
		* @returns {boolean} true if the chunk was accepted (synchronously or
		* queued) — false only when the sink is dead and was removed.
		*/
		function writeSafe(state, chunk) {
		const { res } = state;
		if (res.writableEnded \|\| res.destroyed) return false;

		// If a previous write reported backpressure (returned false), queue
		// unconditionally — preserves event ordering. The drain handler
		// flushes the queue in FIFO order.
		if (state.paused) {
		enqueue(state, chunk);
		return true;
		}

		try {
		const ok = res.write(chunk);
		if (ok) return true;
		// Returned false — kernel buffer is full. Switch to queue mode so
		// subsequent writes don't race past this one.
		state.paused = true;
		attachDrain(state);
		return true;
		} catch {
		// Sink already closed — drop it; further writes would throw.
		teardownSink(res);
		return false;
		}
		}

		function enqueue(state, chunk) {
		if (state.queue.length >= maxQueuedPerSink) {
		// Drop oldest, emit :overflow comment when the drain eventually
		// flushes. The overflow comment is enqueued (not written directly)
		// so consumers see it inline with surrounding events.
		state.queue.shift();
		state.overflows += 1;
		overflowCounter += 1;
		if (!state.queue.some((s) => s === ':overflow\n\n')) {
		state.queue.unshift(':overflow\n\n');
		}
		}
		state.queue.push(chunk);
		attachDrain(state);
		}

		function attachDrain(state) {
		if (state.drainListenerAttached) return;
		const { res } = state;
		if (typeof res.once !== 'function') return; // testing-sink fallback
		state.drainListenerAttached = true;
		res.once('drain', () => {
		state.drainListenerAttached = false;
		flushQueue(state);
		});
		}

		function flushQueue(state) {
		const { res } = state;
		if (state.draining) return;
		state.draining = true;
		state.paused = false;
		try {
		while (state.queue.length > 0) {
		if (res.writableEnded \|\| res.destroyed) {
		state.queue.length = 0;
		break;
		}
		const next = state.queue[0];
		let ok = false;
		try {
		ok = res.write(next);
		} catch {
		teardownSink(res);
		return;
		}
		state.queue.shift();
		if (!ok) {
		state.paused = true;
		attachDrain(state);
		break;
		}
		}
		} finally {
		state.draining = false;
		}
		}

		function teardownSink(res) {
		const state = sinks.get(res);
		if (!state) return;
		if (state.heartbeatHandle) {
		try { clearTimer(state.heartbeatHandle); } catch { /* ignore */ }
		state.heartbeatHandle = null;
		}
		state.queue.length = 0;
		sinks.delete(res);
		}

		function doBroadcast(eventType, payload) {
		if (closed) return 0;
		snapshots.set(eventType, payload);
		eventCounters.set(eventType, (eventCounters.get(eventType) ?? 0) + 1);
		const chunk = formatEvent(eventType, payload);
		let reached = 0;
		// Snapshot the iteration order so concurrent sink removal during
		// a write doesn't skip a sibling sink.
		for (const state of [...sinks.values()]) {
		if (writeSafe(state, chunk)) reached += 1;
		}
		return reached;
		}

		function flushDebounced() {
		for (const [type, info] of debounceTimers) {
		clearTimeout(info.handle);
		debounceTimers.delete(type);
		doBroadcast(type, info.payload);
		}
		}

		function logMetrics() {
		if (eventCounters.size === 0 && sinks.size === 0 && overflowCounter === 0) return;
		/** @type {Record<string, number>} */
		const events = {};
		for (const [type, count] of eventCounters) events[type] = count;
		log(`events=${JSON.stringify(events)} sinks=${sinks.size}${overflowCounter > 0 ? ` overflows=${overflowCounter}` : ''}`);

		// Phase E5: broadcast the same counters as a typed `stream.health`
		// event so SPA tabs can observe live stream health without polling.
		// Built from the interval's counters BEFORE the reset below, so the
		// payload is a per-interval rate. The broadcast itself bumps the
		// `stream.health` counter, but the immediately-following reset wipes
		// it — the next interval never double-counts this tick's own emit.
		// Bypasses debounce (immediate path) since each tick is already
		// rate-limited to the metrics cadence.
		if (healthEvents) {
		/** @type {StreamHealthPayload} */
		const payload = {
		events,
		sinks: sinks.size,
		overflows: overflowCounter,
		intervalMs: metricsMs,
		at: now(),
		};
		doBroadcast(STREAM_HEALTH_EVENT, payload);
		}

		eventCounters.clear();
		overflowCounter = 0;
		}

		// Start the metrics tick eagerly — operators want visibility from
		// boot, not just after the first event lands.
		if (metricsMs > 0) {
		metricsHandle = setTimer(logMetrics, metricsMs);
		// Don't pin the event loop just for metrics in tests / shutdown paths.
		if (metricsHandle && typeof metricsHandle.unref === 'function') metricsHandle.unref();
		}

		return {
		/**
		* Register `res` as a sink of this stream.
		*
		* If the stream is already closed, `res` is ended (best-effort) and a
		* no-op unsubscribe function is returned. Otherwise a fresh per-sink
		* state record is stored in `sinks` under `res`, every stored snapshot
		* is replayed to the new sink in sorted event-type order, and — when
		* `heartbeatMs > 0` — a per-sink heartbeat timer is started.
		*
		* @param res Response-like object; this method calls `res.end()` and
		* hands `res` to `writeSafe` via the state record.
		* @returns {() => void} Unsubscribe function that calls `teardownSink(res)`.
		*/
		addSink(res) {
		if (closed) {
		// Best-effort: end the response so the client sees the channel
		// closing instead of hanging on an empty stream.
		try { res.end(); } catch { /* ignore */ }
		return () => {};
		}

		// Fresh bookkeeping for this sink; all counters/flags start at their
		// idle values and the heartbeat handle is filled in below if enabled.
		const state = /** @type {SinkState} */ ({
		res,
		queue: [],
		paused: false,
		draining: false,
		drainListenerAttached: false,
		heartbeatHandle: null,
		overflows: 0,
		});
		sinks.set(res, state);

		// Replay last-known snapshot for every event type so the new
		// subscriber gets current state without waiting for the next change.
		// Sorting keeps test assertions deterministic.
		const types = [...snapshots.keys()].sort();
		for (const type of types) {
		writeSafe(state, formatEvent(type, snapshots.get(type)));
		}

		// Phase E2: per-sink heartbeat. Write a comment line every
		// `heartbeatMs` so the SSE channel survives idle proxies. The
		// comment is invisible to client EventSource listeners (the
		// browser passes only `event:`/`data:` lines through), so this
		// does NOT trigger any handler — it's pure connection-keepalive.
		if (heartbeatMs > 0) {
		state.heartbeatHandle = setTimer(() => {
		// Use writeSafe so backpressure / overflow handling applies
		// uniformly. Heartbeats that fail to flush are uninteresting
		// — the regular broadcast loop will discover the dead sink.
		writeSafe(state, ':\n\n');
		}, heartbeatMs);
		// Same pattern as the metrics timer: unref when the handle supports
		// it so the heartbeat alone does not keep the process alive.
		if (state.heartbeatHandle && typeof state.heartbeatHandle.unref === 'function') {
		state.heartbeatHandle.unref();
		}
		}

		// The caller gets a disposer rather than the state record itself.
		return () => {
		teardownSink(res);
		};
		},

		broadcast(eventType, payload) {
		if (closed) return 0;
		if (typeof eventType !== 'string' \|\| eventType.length === 0) {
		throw new TypeError('broadcast: eventType must be a non-empty string');
		}

		// Phase E1: opt-in trailing-edge debounce.
		// - DEFAULT_DEBOUNCED_EVENTS opts the canonical snapshot events
		// into trailing-edge coalescing. Last writer wins because those
		// events are idempotent state replays.
		// - Every other event type bypasses the timer and writes
		// immediately — preserves the Phase A synchronous broadcast
		// contract that existing tests / consumers depend on.
		// - Per-event-type overrides via `deps.debounceMs[eventType]`
		// win in both directions (set to 0 to disable, or specify a
		// custom window).
		// - `flushDebounced()` is exposed for tests that want to assert
		// immediate effects without waiting for the timer.
		let debounceFor;
		const override = deps.debounceMs?.[eventType];
		if (override !== undefined) {
		debounceFor = override;
		} else if (DEFAULT_DEBOUNCED_EVENTS.has(eventType)) {
		debounceFor = defaultDebounceMs;
		} else {
		debounceFor = 0;
		}

		if (debounceFor <= 0) {
		// Take the immediate path; flush any pending coalesce for this
		// type first so order is preserved.
		const pending = debounceTimers.get(eventType);
		if (pending) {
		clearTimeout(pending.handle);
		debounceTimers.delete(eventType);
		}
		return doBroadcast(eventType, payload);
		}

		// Coalesce: keep the latest payload, restart the trailing timer.
		const pending = debounceTimers.get(eventType);
		if (pending) clearTimeout(pending.handle);
		const handle = setTimeout(() => {
		debounceTimers.delete(eventType);
		doBroadcast(eventType, payload);
		}, debounceFor);
		if (typeof handle.unref === 'function') handle.unref();
		debounceTimers.set(eventType, { handle, payload });
		// Returns sinks.size as an approximation; the actual broadcast
		// will happen after the trailing-edge delay. Tests assert via the
		// sink writes anyway.
		return sinks.size;
		},

		snapshot() {
		/** @type {Record<string, unknown>} */
		const out = {};
		for (const [type, payload] of snapshots) out[type] = payload;
		return out;
		},

		close() {
		if (closed) return;
		closed = true;
		// Cancel pending debounce timers — anything still queued is
		// discarded; we don't write to sinks during shutdown.
		for (const [, info] of debounceTimers) clearTimeout(info.handle);
		debounceTimers.clear();
		if (metricsHandle) {
		try { clearTimer(metricsHandle); } catch { /* ignore */ }
		metricsHandle = null;
		}
		for (const [res, state] of [...sinks.entries()]) {
		if (state.heartbeatHandle) {
		try { clearTimer(state.heartbeatHandle); } catch { /* ignore */ }
		}
		try { res.end(); } catch { /* ignore */ }
		sinks.delete(res);
		}
		log('closed');
		},

		sinkCount() {
		return sinks.size;
		},

		metrics() {
		/** @type {Record<string, number>} */
		const events = {};
		for (const [type, count] of eventCounters) events[type] = count;
		return { events, sinks: sinks.size, overflows: overflowCounter };
		},

		flushDebounced,
		};
		}

		/**
		* Generate a fresh streamId for the `ready` event payload. Exposed so
		* route handlers can attach the same id to log lines and the wire.
		*
		* @returns {string}
		*/
		export function newStreamId() {
			// 8 random bytes rendered as 16 lowercase hex characters.
			const idBytes = crypto.randomBytes(8);
			return idBytes.toString('hex');
		}

-141

host-cp/src/listening-server-poller.mjs

		/**
		* listening-server-poller.mjs
		* Discovers listening TCP ports inside a world's devbox container.
		* Dual-mode: Docker HTTP API (container) vs docker exec CLI (bare-node).
		* Cache TTL: 10s per world.
		*/
		import { spawnSync } from 'node:child_process';

		const DOCKER_HOST = process.env.DOCKER_HOST ?? 'docker-cli';
		// Skip well-known infra ports — these are always running and not user servers
		const INFRA_PORTS = new Set([8080, 7681, 7682]);

		// Per-world cache: worldId → { ts, servers, error? }
		const cache = new Map();
		const CACHE_TTL_MS = 10_000;

		function worldContainerName(worldId) {
			// Container name is `olam-<worldId>-devbox`.
			const containerName = `olam-${worldId}-devbox`;
			return containerName;
		}

		/**
		 * Parse `ss -tlnp` output into server rows.
		 *
		 * The first line is treated as the header and skipped. Two row layouts
		 * are accepted:
		 *
		 *   State  Recv-Q Send-Q Local Address:Port Peer Address:Port Process
		 *   LISTEN 0      128    0.0.0.0:5173       0.0.0.0:*         users:(("vite",pid=42,fd=8))
		 *
		 * and the same row with a leading Netid column (`tcp LISTEN 0 128 ...`),
		 * recognised by `LISTEN` sitting in the second column.
		 *
		 * Rows whose port is missing, non-numeric, non-positive, or listed in
		 * INFRA_PORTS are dropped. `pid` and `cmd` are empty strings when the
		 * process column is absent.
		 *
		 * @param {string} stdout
		 * @returns {Array<{port: number, pid: string, cmd: string}>}
		 */
		export function parseSsOutput(stdout) {
			const rows = stdout.trim().split('\n').slice(1); // skip header
			const results = [];
			for (const row of rows) {
				const parts = row.trim().split(/\s+/);
				if (parts.length < 5) continue;

				// Local Address:Port is the 4th column, or the 5th when a Netid
				// column precedes the state (e.g. "0.0.0.0:5173", "*:5173", "[::]:5173").
				const localIdx = parts[1] === 'LISTEN' ? 4 : 3;
				const localAddr = parts[localIdx];
				const colonIdx = localAddr.lastIndexOf(':');
				if (colonIdx === -1) continue;

				const port = Number.parseInt(localAddr.slice(colonIdx + 1), 10);
				if (!Number.isFinite(port) || port <= 0) continue;
				if (INFRA_PORTS.has(port)) continue;

				// Everything after the local address may hold the process column:
				// users:(("vite",pid=42,fd=8))
				const processCol = parts.slice(localIdx + 1).join(' ');
				const pid = /pid=(\d+)/.exec(processCol)?.[1] ?? '';
				const cmd = /"([^"]+)"/.exec(processCol)?.[1] ?? '';

				results.push({ port, pid, cmd });
			}
			return results;
		}

		/**
		* Fetch listening servers for a world. Returns cached result if <10s old.
		* @param {string} worldId
		* @returns {Promise<{ts: number, servers: Array<{port: number, pid: string, cmd: string}>, error?: string}>}
		*/
		export async function getListeningServers(worldId) {
			// Serve from cache while the entry (success or failure) is fresh.
			const cached = cache.get(worldId);
			if (cached && Date.now() - cached.ts < CACHE_TTL_MS) return cached;

			const containerName = worldContainerName(worldId);
			// Every failure mode is reported to callers with the same message.
			const unavailable = () => storeServersEntry(worldId, [], 'container not running');

			try {
				let stdout;
				if (DOCKER_HOST === 'docker-cli') {
					// Bare-node mode: shell out to the docker CLI.
					const result = spawnSync(
						'docker', ['exec', containerName, 'ss', '-tlnp'],
						{ encoding: 'utf-8', timeout: 3000 },
					);
					if (result.status !== 0 || result.error) return unavailable();
					stdout = result.stdout ?? '';
				} else {
					// Container mode: two-step exec through the Docker HTTP API.
					const apiBase = DOCKER_HOST.replace(/^tcp:\/\//, 'http://');
					const execCreate = await fetch(
						`${apiBase}/containers/${encodeURIComponent(containerName)}/exec`,
						{
							method: 'POST',
							headers: { 'Content-Type': 'application/json' },
							body: JSON.stringify({
								AttachStdout: true,
								AttachStderr: false,
								Cmd: ['ss', '-tlnp'],
							}),
							signal: AbortSignal.timeout(3000),
						},
					);
					if (!execCreate.ok) return unavailable();

					const { Id: execId } = await execCreate.json();
					const execStart = await fetch(`${apiBase}/exec/${execId}/start`, {
						method: 'POST',
						headers: { 'Content-Type': 'application/json' },
						body: JSON.stringify({ Detach: false, Tty: false }),
						signal: AbortSignal.timeout(3000),
					});
					// A failed start returns an error body, not a multiplexed stream;
					// do not feed it to the demuxer.
					if (!execStart.ok) return unavailable();

					// Docker exec start streams multiplexed output (8-byte header per frame)
					const buf = await execStart.arrayBuffer();
					stdout = demuxDockerStream(Buffer.from(buf));
				}
				return storeServersEntry(worldId, parseSsOutput(stdout));
			} catch {
				// Deliberate best-effort: network errors, timeouts and malformed
				// responses all degrade to an empty, cached "not running" entry.
				return unavailable();
			}
		}

		/**
		 * Build a cache entry stamped with the current time, store it under
		 * `worldId`, and return it. `error` is only present on the entry when given.
		 *
		 * @param {string} worldId
		 * @param {Array<{port: number, pid: string, cmd: string}>} servers
		 * @param {string} [error]
		 */
		function storeServersEntry(worldId, servers, error) {
			const entry = error === undefined
				? { ts: Date.now(), servers }
				: { ts: Date.now(), servers, error };
			cache.set(worldId, entry);
			return entry;
		}

		/**
		* Strip Docker stream multiplexing headers (8 bytes per frame: [stream, 0, 0, 0, size32be]).
		* @param {Buffer} buf
		* @returns {string}
		*/
		function demuxDockerStream(buf) {
			// Collect raw frame payloads and decode once at the end: decoding
			// frame-by-frame would corrupt a multi-byte UTF-8 character that
			// happens to be split across two frames.
			const payloads = [];
			let offset = 0;
			while (offset + 8 <= buf.length) {
				// Bytes 4..7 of the header carry the payload size (big-endian).
				const size = buf.readUInt32BE(offset + 4);
				const start = offset + 8;
				// subarray clamps to the buffer end, so a truncated final frame
				// contributes whatever bytes are present.
				payloads.push(buf.subarray(start, start + size));
				offset = start + size;
			}
			return Buffer.concat(payloads).toString('utf-8');
		}

		export { parseSsOutput as _parseSsOutputForTests };

-83

host-cp/src/local-worlds-source.mjs

		/**
		* Phase E2 (olam-dogfood-vision): LocalWorldsSource implementation.
		*
		* Wraps host-cp's existing dockerode-driven world enumeration in a
		* WorldsSource-shaped object so E4's composition layer can fan out
		* across multiple sources (local + future Pylon cloud) and merge.
		*
		* The class deliberately takes its dependencies via factory function
		* injection rather than reaching into server.mjs's module-level state
		* directly. Two reasons:
		* 1. Testability — vitest can pass mocked getWorldsRegistry +
		* fetchWorldServices without spinning up the full host-cp
		* server.mjs.
		* 2. Module-cycle avoidance — server.mjs imports this module, so
		* this module CANNOT import server.mjs back without a cycle.
		*
		* Returns the same shape as the pre-E2 GET /api/worlds response with
		* a single addition: `source: 'local'` on every entry.
		*
		* @typedef {import('./worlds-source.mjs').WorldsSource} WorldsSource
		* @typedef {import('./worlds-source.mjs').WorldSummary} WorldSummary
		* @typedef {import('./worlds-source.mjs').ServiceInfo} ServiceInfo
		*/

		/**
		* @typedef {object} LocalWorldsSourceDeps
		* @property {() => Record<string, number>} getWorldsRegistry
		* Returns current WORLDS map (worldId → host_port). Called fresh
		* per list() so post-list registry mutations are visible immediately.
		* @property {(worldId: string) => string | null} getWorldName
		* Returns the operator-set friendly name OR null if absent.
		* @property {(worldId: string) => Promise<ServiceInfo[]>} fetchWorldServices
		* Probes per-world services (atlas-core, diner-app, ttyd, per-world CP).
		* Same function the pre-E2 handler called inline.
		*/

		/**
		* @param {LocalWorldsSourceDeps} deps
		* @returns {WorldsSource}
		*/
		export function createLocalWorldsSource(deps) {
			/**
			 * Turn one registry entry into a world summary.
			 *
			 * Status rules:
			 *  - unknown:  the probe returned no services at all
			 *  - running:  at least one service reports `live`
			 *  - starting: services exist but none is live
			 *
			 * `host_port` is carried through in addition to the WorldSummary
			 * fields so existing consumers of that field keep working.
			 *
			 * @param {[string, number]} entry `[worldId, host_port]`
			 * @returns {Promise<WorldSummary & {host_port: number}>}
			 */
			const describeWorld = async ([id, host_port]) => {
				const services = await deps.fetchWorldServices(id);

				/** @type {'running' | 'starting' | 'unknown'} */
				let status = 'unknown';
				if (services.length > 0) {
					status = services.some((svc) => svc.live) ? 'running' : 'starting';
				}

				return /** @type {WorldSummary & {host_port: number}} */ ({
					id,
					name: deps.getWorldName(id),
					status,
					services,
					source: 'local',
					host_port,
				});
			};

			return {
				name: 'local',
				// The registry is read on every call and all worlds are probed
				// concurrently.
				async list() {
					const registryEntries = Object.entries(deps.getWorldsRegistry());
					return Promise.all(registryEntries.map((entry) => describeWorld(entry)));
				},
			};
		}

-281

host-cp/src/metrics.mjs

		// Phase C Task C3 — hand-rolled Prometheus metrics registry for host-cp.
		//
		// Emits exactly two metric families:
		// http_requests_total{service,route,method,status_code} counter
		// http_request_duration_seconds{service,route,method} histogram
		//
		// TAXONOMY COMPLIANCE (NON-NEGOTIABLE):
		// ONLY {service, route, method, status_code} labels allowed.
		// BANNED: world_id, trace_id, user_id, request_id, operator_id.
		// world_id surfaces via Prometheus exemplars in Phase D — NOT labels.
		//
		// No external npm deps — Prometheus text exposition is simple enough to
		// produce with template literals. Avoids the prom-client footprint on a
		// host-side service that has no other dependency on metrics tooling.

		// ─── Route mapping ────────────────────────────────────────────────────────
		//
		// Raw req.url is a cardinality bomb: every unique URL is a new time series.
		// We normalize dynamic path segments to stable patterns before labelling.
		//
		// RULES (exact paths are tested before prefixes, so this table is not in evaluation order):
		// /health → /health
		// /api/bootstrap → /api/bootstrap
		// /metrics → /metrics
		// /api/host-stream → /api/host-stream
		// /api/worlds/{id}/credentials/... → /api/worlds/:id/credentials/:action
		// /api/worlds/{id}/tunnels/... → /api/worlds/:id/tunnels
		// /api/worlds/{id}/pr → /api/worlds/:id/pr
		// /api/worlds/{id}/progress → /api/worlds/:id/progress
		// /api/worlds (no id) → /api/worlds
		// /api/world/{id}/** → /api/world/:id/* (proxy routes)
		// /api/admin/registry/... → /api/admin/registry
		// /api/admin/upgrade → /api/admin/upgrade
		// /api/admin/world-pr → /api/admin/world-pr
		// /api/admin/world-pr/{id} → /api/admin/world-pr/:id
		// /api/auth/credentials/... → /api/auth/credentials
		// /api/auth/... → /api/auth
		// /api/plan/conversations/{id}/... → /api/plan/conversations/:id
		// /api/plan/conversations → /api/plan/conversations
		// /api/plan/** → /api/plan
		// /api/auth/events → /api/auth/events
		// /api/version/status → /api/version/status
		// /api/repos → /api/repos
		// /api/runbooks → /api/runbooks
		// /api/workspaces/match → /api/workspaces/match
		// /api/workspaces → /api/workspaces
		// /api/projects → /api/projects
		// /api/processes/** → /api/processes
		// /v1/chunks/** → /v1/chunks
		// /v1/worlds/** → /v1/worlds
		// /assets/** → /assets (SPA static assets)
		// (other GET to static paths) → /static
		// (unknown) → /unknown

		/** Paths that are their own route label (compared after trailing-slash trim). */
		const EXACT_ROUTES = new Set([
			'/health',
			'/api/bootstrap',
			'/metrics',
			'/api/host-stream',
			'/api/auth/events',
			'/api/version/status',
			'/api/repos',
			'/api/runbooks',
			'/api/workspaces/match',
			'/api/workspaces',
			'/api/projects',
			'/api/worlds',
			'/api/plan/conversations',
			'/api/plan/personas',
			'/api/admin/upgrade',
			'/api/admin/world-pr',
			'/api/admin/registry',
		]);

		/** Ordered `[prefix, route]` pairs, tried after the exact matches; first hit wins. */
		const PREFIX_ROUTES = [
			['/api/world/', '/api/world/:id/*'],
			['/api/admin/registry/', '/api/admin/registry'],
			['/api/admin/world-pr/', '/api/admin/world-pr/:id'],
			['/api/auth/credentials', '/api/auth/credentials'],
			['/api/auth/', '/api/auth'],
			['/api/plan/conversations/', '/api/plan/conversations/:id'],
			['/api/plan/', '/api/plan'],
			['/api/processes', '/api/processes'],
			['/api/servers', '/api/processes'],
			['/v1/chunks', '/v1/chunks'],
			['/v1/worlds', '/v1/worlds'],
			['/assets/', '/assets'],
			// SPA HTML fallback routes (SPA sub-routes like /worlds, /plan/...)
			['/worlds', '/static'],
			['/plan', '/static'],
			['/workspaces', '/static'],
		];

		/**
		 * Map a request pathname to a low-cardinality route label.
		 *
		 * Order of evaluation: exact paths, then the `/api/worlds/{id}/...`
		 * family, then the prefix table, then `/` (SPA root). Anything else is
		 * `/unknown`.
		 *
		 * @param {string} pathname
		 * @returns {string}
		 */
		export function pathToRoute(pathname) {
			// Normalize trailing slash for matching (keep bare / as /)
			const p = pathname.length > 1 ? pathname.replace(/\/$/, '') : pathname;

			if (EXACT_ROUTES.has(p)) return p;

			if (p.startsWith('/api/worlds/')) {
				if (p.includes('/credentials/')) return '/api/worlds/:id/credentials/:action';
				if (p.includes('/tunnels')) return '/api/worlds/:id/tunnels';
				if (p.endsWith('/pr')) return '/api/worlds/:id/pr';
				if (p.endsWith('/progress')) return '/api/worlds/:id/progress';
				return '/api/worlds/:id';
			}

			for (const [prefix, route] of PREFIX_ROUTES) {
				if (p.startsWith(prefix)) return route;
			}

			return p === '/' ? '/static' : '/unknown';
		}

		// ─── In-memory registry ───────────────────────────────────────────────────

		const HISTOGRAM_BUCKETS = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5];

		/** @type {Map<string, number>} labelSet → count */
		const _counters = new Map();

		/**
		* Per label-set histogram state.
		* @type {Map<string, {buckets: number[], sum: number, count: number}>}
		*/
		const _histograms = new Map();

		/** @param {string[]} parts label values in canonical order */
		function _labelKey(parts) {
			// Label values are glued with a NUL byte; renderMetrics splits the
			// key on the same character to recover them.
			const SEPARATOR = '\x00';
			return parts.join(SEPARATOR);
		}

		/**
		* Reset all metrics. FOR TESTS ONLY — never call in production code.
		* Exported as a separate name so it's invisible to consumers that only
		* import the named exports they need.
		*/
		export function _resetForTest() {
			// Empty both in-memory stores: request counters first, then histograms.
			for (const store of [_counters, _histograms]) {
				store.clear();
			}
		}

		/**
		* Increment http_requests_total counter.
		*
		* @param {string} service
		* @param {string} route — MUST be a normalized route pattern
		* @param {string} method
		* @param {string} statusCode
		*/
		export function incRequest(service, route, method, statusCode) {
			// One counter per (service, route, method, status) combination;
			// a series that has not been seen yet starts from zero.
			const seriesKey = _labelKey([service, route, method, statusCode]);
			const seenSoFar = _counters.get(seriesKey) ?? 0;
			_counters.set(seriesKey, seenSoFar + 1);
		}

		/**
		* Observe http_request_duration_seconds.
		*
		* @param {string} service
		* @param {string} route
		* @param {string} method
		* @param {number} seconds
		*/
		export function observeDuration(service, route, method, seconds) {
			// A NaN or infinite observation would permanently poison `sum` for
			// this series, so it is dropped instead of recorded.
			if (!Number.isFinite(seconds)) return;

			const key = _labelKey([service, route, method]);
			let h = _histograms.get(key);
			if (!h) {
				// buckets[i] holds the count that fell in the range
				// (HISTOGRAM_BUCKETS[i-1], HISTOGRAM_BUCKETS[i]] — stored
				// per-range; the cumulative form is produced at render time.
				h = { buckets: new Array(HISTOGRAM_BUCKETS.length).fill(0), sum: 0, count: 0 };
				_histograms.set(key, h);
			}

			// Increment only the first bucket whose upper bound accommodates
			// the observation. Observations beyond the last bound touch no
			// bucket; they are represented by `count`, which the exposition
			// uses for the +Inf bucket.
			const bucketIdx = HISTOGRAM_BUCKETS.findIndex((upperBound) => seconds <= upperBound);
			if (bucketIdx !== -1) h.buckets[bucketIdx]++;

			h.sum += seconds;
			h.count++;
		}

		// ─── Prometheus text exposition ───────────────────────────────────────────

		/** Escape label value per Prometheus text format (backslash, newline, quote). */
		function escapeLabelValue(v) {
			// Single pass over the stringified value: each backslash, newline
			// or double quote is swapped for its escaped form.
			const escapes = { '\\': '\\\\', '\n': '\\n', '"': '\\"' };
			return String(v).replace(/[\\\n"]/g, (ch) => escapes[ch]);
		}

		/**
		* Build the `{k1="v1",k2="v2",...}` label-set string.
		* @param {Record<string, string>} labels
		*/
		function labelSet(labels) {
			// Emit `name="escaped value"` pairs in the object's own key order,
			// comma-separated and wrapped in braces.
			const pairs = [];
			for (const [labelName, labelValue] of Object.entries(labels)) {
				pairs.push(`${labelName}="${escapeLabelValue(labelValue)}"`);
			}
			return `{${pairs.join(',')}}`;
		}

		/**
		* Render the complete Prometheus text exposition.
		* @returns {string}
		*/
		export function renderMetrics() {
			// ── http_requests_total ─────────────────────────────────────────────
			const out = [
				'# HELP http_requests_total Total number of HTTP requests handled.',
				'# TYPE http_requests_total counter',
			];
			_counters.forEach((count, key) => {
				const [service, route, method, status_code] = key.split('\x00');
				const labels = labelSet({ service, route, method, status_code });
				out.push(`http_requests_total${labels} ${count}`);
			});

			// ── http_request_duration_seconds ───────────────────────────────────
			out.push(
				'# HELP http_request_duration_seconds HTTP request duration in seconds (histogram).',
				'# TYPE http_request_duration_seconds histogram',
			);
			_histograms.forEach((h, key) => {
				const [service, route, method] = key.split('\x00');
				const base = { service, route, method };

				// Stored counts are per-range; the exposition wants running totals.
				let runningTotal = 0;
				HISTOGRAM_BUCKETS.forEach((upperBound, i) => {
					runningTotal += h.buckets[i];
					const labels = labelSet({ ...base, le: String(upperBound) });
					out.push(`http_request_duration_seconds_bucket${labels} ${runningTotal}`);
				});

				// +Inf always equals the total observation count.
				const infLabels = labelSet({ ...base, le: '+Inf' });
				out.push(`http_request_duration_seconds_bucket${infLabels} ${h.count}`);
				out.push(`http_request_duration_seconds_sum${labelSet(base)} ${h.sum}`);
				out.push(`http_request_duration_seconds_count${labelSet(base)} ${h.count}`);
			});

			out.push(''); // trailing newline
			return out.join('\n');
		}

		// ─── Request instrumentation wrapper ─────────────────────────────────────

		/**
		 * Wrap an async request handler so every request is instrumented.
		 *
		 * The wrapper:
		 * 1. Starts a high-resolution timer.
		 * 2. Calls the original handler.
		 * 3. Derives a stable route pattern from req.url.
		 * 4. Records counter + histogram using the response's status code.
		 *
		 * Status code capture: res.writeHead is wrapped so an explicitly written
		 * status is remembered; when the handler never calls writeHead the
		 * wrapper falls back to res.statusCode, then to 200.
		 *
		 * Recording happens in a `finally`, so it also runs when the handler
		 * rejects. A request target that cannot be parsed as a URL is recorded
		 * under the `/unknown` route instead of throwing from the `finally`
		 * (which would mask the handler's own outcome).
		 *
		 * @param {string} serviceName — emitted as the `service` label
		 * @param {(req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse) => Promise<void>} handler
		 * @returns {(req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse) => Promise<void>}
		 */
		export function instrumentHandler(serviceName, handler) {
			return async (req, res) => {
				const start = performance.now();

				// Intercept status code by wrapping writeHead.
				let capturedStatus = null;
				const origWriteHead = res.writeHead.bind(res);
				res.writeHead = (status, ...rest) => {
					capturedStatus = status;
					return origWriteHead(status, ...rest);
				};

				try {
					await handler(req, res);
				} finally {
					const durationSec = (performance.now() - start) / 1000;

					let route = '/unknown';
					try {
						const { pathname } = new URL(req.url ?? '/', 'http://localhost');
						route = pathToRoute(pathname);
					} catch {
						// Malformed request target (e.g. "//"): keep '/unknown' so
						// the request is still counted.
					}

					const method = (req.method ?? 'GET').toUpperCase();
					const statusCode = String(capturedStatus ?? res.statusCode ?? 200);

					incRequest(serviceName, route, method, statusCode);
					observeDuration(serviceName, route, method, durationSec);
				}
			};
		}

-212

host-cp/src/op-side-longpoll.mjs

		/**
		* op-side-longpoll.mjs — Operator-side long-poll loop.
		*
		* Maintains a persistent outbound HTTPS connection from host-cp to
		* plan-DO's /v1/op-poll endpoint, waiting for local-Docker dispatch
		* work to appear. Feature-flag-gated behind OLAM_OPSIDE_LONGPOLL=1
		* (default OFF — no behavior change when unset).
		*
		* ELI5: like installing a phone line to the cloud planner.
		* Your local machine stays connected, ready to receive coding tasks.
		* Nothing calls yet — the phone just sits there waiting (v1).
		*
		* Phase D ships the plumbing and tests it behind a flag. Future cells
		* (#3/#6/#7) wire actual producers on the plan-DO side.
		*
		* Circuit-breaker: 10 consecutive errors → 60 s pause →
		* counter resets and polling resumes. Prevents hammering the server
		* during outages.
		*
		* Reconnect delay: 1000ms base + uniform 0-500ms jitter.
		* All structured log events emitted to console via JSON objects.
		*
		* @module op-side-longpoll
		*/

		// Reconnect delay constants.
		const RECONNECT_BASE_MS = 1000;
		const RECONNECT_JITTER_MS = 500;

		// Circuit-breaker constants.
		const CIRCUIT_BREAKER_THRESHOLD = 10;
		const CIRCUIT_BREAKER_PAUSE_MS = 60_000;

		// Long-poll timeout — plan-DO blocks up to 25 s; we give a 5 s margin.
		const POLL_TIMEOUT_MS = 30_000;

		/**
		* Compute reconnect delay: 1000 + uniform 0-500 ms jitter.
		* Exported so tests can mock Math.random and assert the formula.
		*
		* @param {() => number} [randFn] - Optional RNG override for testing.
		* @returns {number} Delay in milliseconds.
		*/
		export function reconnectDelay(randFn = Math.random) {
			// Whole-millisecond jitter drawn from randFn and scaled by
			// RECONNECT_JITTER_MS, added on top of the fixed base delay.
			const jitterMs = Math.floor(randFn() * RECONNECT_JITTER_MS);
			return RECONNECT_BASE_MS + jitterMs;
		}

		/** @type {ReturnType<typeof setTimeout> | null} */
		let pollTimer = null;

		/** @type {boolean} */
		let running = false;

		/** @type {number} */
		let consecutiveErrors = 0;

		/** @type {string | null} */
		let activeCloudUrl = null;

		/** @type {string | null} */
		let activeAuth = null;

		/**
		* Emit a structured log event. Uses console.log to be consistent with
		* host-cp's existing logging style. All events include `event` + `ts`.
		*
		* @param {string} event
		* @param {Record<string, unknown>} [extra]
		*/
		function emit(event, extra = {}) {
			// `extra` is spread last, so a caller-supplied `event` or `ts` key
			// overrides the defaults.
			const record = { event, ts: new Date().toISOString(), ...extra };
			const line = JSON.stringify(record);
			console.log(line);
		}

		/**
		 * Sleep for `ms` milliseconds.
		 *
		 * The timer handle is published in the module-level `pollTimer` so that
		 * stopPoll() can cancel it. Note that cancelling the timer means the
		 * returned promise never settles — the only thing that resolves it is
		 * the timeout firing.
		 *
		 * @param {number} ms
		 * @returns {Promise<void>}
		 */
		function sleep(ms) {
			return new Promise((wake) => {
				const handle = setTimeout(wake, ms);
				pollTimer = handle;
			});
		}

		/**
		* Single poll iteration: open a GET /v1/op-poll request, wait for
		* a response, parse the JSON body.
		*
		* @param {string} cloudUrl Base URL (e.g. https://plan-do.example.com)
		* @param {string} auth Authorization header value
		* @returns {Promise<{ work: null | { worldId: string, dispatchSpec: unknown } }>}
		*/
		async function pollOnce(cloudUrl, auth) {
			// Abort the request ourselves if nothing comes back within
			// POLL_TIMEOUT_MS.
			const aborter = new AbortController();
			const deadline = setTimeout(() => aborter.abort(), POLL_TIMEOUT_MS);

			try {
				// Drop a single trailing slash so the joined URL has exactly one.
				const endpoint = `${cloudUrl.replace(/\/$/, '')}/v1/op-poll`;
				const response = await fetch(endpoint, {
					method: 'GET',
					headers: { Authorization: auth },
					signal: aborter.signal,
				});

				if (!response.ok) {
					throw new Error(`op-poll returned ${response.status}`);
				}

				return await response.json();
			} finally {
				// Always disarm the abort timer, on success and on failure.
				clearTimeout(deadline);
			}
		}

		/**
		* The main poll loop. Runs until stopPoll() is called.
		*
		* State transitions:
		* idle
		* → connecting (emit op-poll-connect)
		* → got { work: null } timeout response (emit op-poll-timeout)
		* → wait reconnect delay
		* → connecting again
		*
		* On error:
		* → consecutiveErrors++
		* → emit op-poll-error
		* → if consecutiveErrors >= threshold: circuit-breaker open
		* emit op-poll-circuit-open, wait 60 s, reset counter
		* → else: wait reconnect delay
		*
		* @returns {Promise<void>}
		*/
		async function pollLoop() {
		// Loops for as long as the module-level `running` flag stays true; the
		// flag is re-checked before every sleep so a stop request is honoured
		// without waiting out the delay.
		while (running) {
		emit('op-poll-connect', { cloud_url: activeCloudUrl });

		try {
		const result = await pollOnce(activeCloudUrl, activeAuth);

		// On a successful { work: null } response, reset the error counter.
		consecutiveErrors = 0;

		// Every resolved response is logged as 'op-poll-timeout'; `result.work`
		// is only included in the log line — nothing here dispatches it.
		// A body without a readable `work` property (e.g. null) throws on the
		// access below and is handled by the catch as an error.
		const delay = reconnectDelay();
		emit('op-poll-timeout', { work: result.work, reconnect_in_ms: delay });

		if (!running) break;
		await sleep(delay);
		} catch (err) {
		// Any failure from pollOnce (or from the success path above) counts
		// toward the circuit breaker.
		consecutiveErrors++;
		const message = err instanceof Error ? err.message : String(err);

		emit('op-poll-error', {
		error: message,
		consecutive_errors: consecutiveErrors,
		});

		if (consecutiveErrors >= CIRCUIT_BREAKER_THRESHOLD) {
		// Circuit open: log, reset the counter up front, then pause for the
		// long interval before polling resumes.
		emit('op-poll-circuit-open', {
		consecutive_errors: consecutiveErrors,
		pause_ms: CIRCUIT_BREAKER_PAUSE_MS,
		});
		consecutiveErrors = 0;
		if (!running) break;
		await sleep(CIRCUIT_BREAKER_PAUSE_MS);
		} else {
		// Below the threshold: ordinary jittered reconnect delay.
		if (!running) break;
		const delay = reconnectDelay();
		await sleep(delay);
		}
		}
		}
		}

		/**
		* Start the operator-side long-poll loop.
		*
		* No-op if already running. Otherwise marks the loop as running, resets
		* the consecutive-error counter, records the URL and auth value for the
		* loop to use, and kicks off pollLoop() without awaiting it.
		*
		* NOTE(review): this function does not itself read OLAM_OPSIDE_LONGPOLL;
		* the feature flag is presumably checked by the caller — confirm. Call
		* this AFTER server.listen() to avoid blocking the process startup path.
		*
		* @param {string} cloudUrl Base URL of the plan-DO deployment.
		* @param {string} auth Authorization header value for Basic auth.
		*/
		export function startPoll(cloudUrl, auth) {
		if (running) return;
		running = true;
		consecutiveErrors = 0;
		activeCloudUrl = cloudUrl;
		activeAuth = auth;
		// Fire-and-forget; errors are caught inside pollLoop.
		void pollLoop();
		}

		/**
		* Stop the operator-side long-poll loop.
		*
		* Cancels any in-progress sleep timer; the loop condition will
		* exit on its next iteration. Idempotent.
		*/
		export function stopPoll() {
			// Lower the flag first so the loop sees it at its next check.
			running = false;

			// Nothing scheduled — nothing more to do.
			if (pollTimer === null) return;

			clearTimeout(pollTimer);
			pollTimer = null;
		}

-92

host-cp/src/panic-counter.mjs

		// C4 — macOS panic-log counter.
		//
		// Note: phase-c-tasks.md originally listed a browser SPA path for this.
		// SPAs can't shell out — `child_process` is Node-only. Correct home
		// is host-cp (Node) which brokers operator-machine state through
		// host-stream. host-cp exposes a typed event consumers can subscribe to.
		//
		// Implementation:
		// `log show --predicate 'eventMessage CONTAINS "panic"' --last <N>d`
		// pipes to stdout; we count the lines that begin with a YYYY-MM-DD timestamp (each panic event = 1 such line).
		//
		// Platform guard:
		// On non-darwin platforms, getPanicCount returns null + emits a
		// `[panic-counter]` warning to stderr. Callers branch on null →
		// skip the delta + don't emit the Slack message.
		//
		// Sampling cadence:
		// Baseline: at olam-cli startup OR on first /plan/new visit
		// Per-session: at plan completion (cloud-mode only)
		//
		// Cost note:
		// `log show` is expensive (~200ms-2s depending on system log size).
		// Cache the baseline + only re-sample on demand. Don't poll.

		import { execFile } from 'node:child_process';
		import { promisify } from 'node:util';
		import { platform } from 'node:os';

		const execFileP = promisify(execFile);

		const PANIC_PREDICATE = 'eventMessage CONTAINS "panic"';
		const DEFAULT_TIMEOUT_MS = 30_000;

		/**
		* Return the count of `panic`-containing log entries over the last N
		* days. Returns null on non-darwin platforms OR on `log` command
		* failure (caller treats null as "no signal; skip the delta").
		*/
		export async function getPanicCount(last_n_days = 7, opts = {}) {
			// Platform guard: the `log` tool is only invoked on darwin.
			const currentPlatform = platform();
			if (currentPlatform !== 'darwin') {
				if (!opts.silent) {
					process.stderr.write(
						`[panic-counter] platform=${platform()} is not darwin; returning null\n`,
					);
				}
				return null;
			}

			// Tests may inject their own runner via opts.execFileFn.
			const runLog = opts.execFileFn ?? execFileP;
			try {
				const { stdout } = await runLog(
					'log',
					['show', '--predicate', PANIC_PREDICATE, '--last', `${last_n_days}d`],
					{ timeout: opts.timeoutMs ?? DEFAULT_TIMEOUT_MS, maxBuffer: 10 * 1024 * 1024 },
				);
				// Header and sentinel lines are ignored: only lines that start
				// with a YYYY-MM-DD timestamp are counted as entries.
				let entryCount = 0;
				for (const line of stdout.split('\n')) {
					if (/^\d{4}-\d{2}-\d{2}/.test(line)) entryCount += 1;
				}
				return entryCount;
			} catch (err) {
				// Any failure of the command is reported as "no signal".
				if (!opts.silent) {
					const msg = err instanceof Error ? err.message : String(err);
					process.stderr.write(`[panic-counter] log command failed: ${msg}\n`);
				}
				return null;
			}
		}

		/**
		 * Pure delta math: `after - before`.
		 *
		 * Returns null when either input is not a number — this covers the
		 * null "no signal" value. A positive delta means the panic count went
		 * up, a negative delta means it went down; both are valid results and
		 * the caller frames the Slack message accordingly.
		 *
		 * @param {number | null} before
		 * @param {number | null} after
		 * @returns {number | null}
		 */
		export function computePanicDelta(before, after) {
			// `typeof null` is 'object', so this single check also rejects null.
			if (typeof before !== 'number' || typeof after !== 'number') return null;
			return after - before;
		}

		/** Format the delta for a Slack message body. Plain English; no jargon. */
		export function formatDeltaSummary(before, after) {
			const delta = computePanicDelta(before, after);

			// No signal for this session.
			if (delta === null) {
				return 'Panic delta: n/a (counter unavailable this session).';
			}

			// All remaining messages show the same "before → after" pair.
			const range = `${before} → ${after}`;
			if (delta === 0) {
				return `Panic count steady: ${range} (no change this session).`;
			}
			return delta < 0
				? `Panic count down ${Math.abs(delta)}: ${range}.`
				: `Panic count up ${delta}: ${range}.`;
		}

-53

host-cp/src/plan-chat-proxy-headers.mjs

		// plan-chat-proxy-headers.mjs — header handling for host-cp's /api/plan-chat/*
		// passthrough proxy (server.mjs). Extracted as pure helpers so the F3 (T9)
		// operator-chunk broker-secret contract is unit-testable without booting the
		// whole host-cp server.
		//
		// F3 (T9) boundary: host-cp's /api/plan-chat/* proxy is the TRUSTED operator
		// surface (SPA browser → host-cp → plan-chat-service). A WORLD process never
		// routes through this proxy — it talks to plan-chat-service directly via
		// host.docker.internal. So:
		// - Client-supplied `x-olam-broker-secret` is ALWAYS stripped (a client must
		// not be able to smuggle the operator-chunk authority secret through).
		// - The real secret is injected by the proxy itself (injectBrokerSecret),
		// only when configured, so the operator's own SPA interject is authorised
		// while a world process — which can't present the secret — is rejected by
		// plan-chat-service's gate.

		// Request header names that buildPlanChatProxyHeaders never copies upstream.
		const HOP_BY_HOP = new Set(['host', 'connection', 'content-length']);
		// Header that carries the operator-chunk broker secret: stripped from client
		// requests, set by injectBrokerSecret.
		const BROKER_SECRET_HEADER = 'x-olam-broker-secret';

		/**
		 * Build the upstream header map for a /api/plan-chat/* proxy request.
		 * Drops the HOP_BY_HOP headers AND any client-supplied broker secret (F3).
		 *
		 * Header names are matched case-insensitively, so a map that was not
		 * lowercased by the HTTP parser (e.g. `X-Olam-Broker-Secret`) cannot smuggle
		 * the secret past the strip. Kept headers retain their original key.
		 *
		 * @param {Record<string, unknown>} reqHeaders - incoming header map; values may
		 *   be a string, an array of strings, or undefined. A nullish map yields `{}`.
		 * @returns {Record<string, string>} new object; array values are joined with
		 *   ", ", values that are neither string nor array are omitted
		 */
		export function buildPlanChatProxyHeaders(reqHeaders) {
		  const headers = {};
		  for (const [name, value] of Object.entries(reqHeaders ?? {})) {
		    const lowered = name.toLowerCase();
		    if (HOP_BY_HOP.has(lowered)) continue;
		    // F3 — never forward a CLIENT-supplied broker secret, whatever its casing.
		    if (lowered === BROKER_SECRET_HEADER) continue;
		    if (Array.isArray(value)) headers[name] = value.join(', ');
		    else if (typeof value === 'string') headers[name] = value;
		  }
		  return headers;
		}

		/**
		 * Add the operator-chunk broker secret to the upstream headers, but only when
		 * a non-empty string secret is configured; otherwise the map is left untouched.
		 * The passed-in `headers` object is mutated and returned.
		 *
		 * @param {Record<string, string>} headers - upstream header map to mutate
		 * @param {string=} operatorChunkSecret - configured secret, if any
		 * @returns {Record<string, string>} the same `headers` object
		 */
		export function injectBrokerSecret(headers, operatorChunkSecret) {
		  const secretConfigured =
		    typeof operatorChunkSecret === 'string' && operatorChunkSecret.length > 0;
		  if (!secretConfigured) return headers;
		  headers[BROKER_SECRET_HEADER] = operatorChunkSecret;
		  return headers;
		}

-147

host-cp/src/plan-chat-secret.mjs

		// Bearer-secret management for plan-chat-service.mjs.
		//
		// Mirrors the agent-memory-service pattern from the sibling olam-agent-memory
		// repo: a single 0600 file at ~/.olam/plan-chat-secret holds the bearer
		// hex string. Helpers generate, read, and rotate atomically. Rotation
		// writes to a tmpfile and renames; mid-rotation reads see either the old
		// or new value, never a partial write.
		//
		// Inside the Docker container, os.homedir() → /root, but compose.yaml mounts
		// ${HOME}/.olam → /data. Without an env override, the bearer would be written
		// to /root/.olam/plan-chat-secret (container ephemeral layer) and lost on
		// every `docker compose up --force-recreate` (i.e. every `olam upgrade`).
		// OLAM_PLAN_CHAT_SECRET_PATH is set to /data/plan-chat-secret in compose.yaml
		// and k8s/manifests/30-configmap.yaml so all reads/writes land in the
		// bind-mounted host directory. On bare-host installs (no container) the env
		// var is unset and the path falls back to ~/.olam/plan-chat-secret — no
		// behaviour change. Mirrors precedent commit 5b21d1f2 (PR #440) for plan.db.

		import fs from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';
		import crypto from 'node:crypto';
		// Phase D (olam-config-store-unification): consult config.json's
		// `cloud.secrets.plan-chat-secret` value before the legacy secret FILES.
		// Dep-free reader (host-cp has no @olam/core dep) with container-aware /data
		// path resolution — see config-reader.mjs header.
		import { readConfigString, olamConfigDir } from './config-reader.mjs';

		/**
		 * Pick the plan-chat-secret file path under the config dir returned by
		 * olamConfigDir(): the canonical `<configDir>/secrets/plan-chat-secret` if it
		 * exists, else the legacy `<configDir>/plan-chat-secret` if that exists, else
		 * the canonical path again so that a first write lands in the new location.
		 *
		 * @returns {string} absolute or relative path, as produced by path.join
		 */
		function resolvePlanChatSecretPath() {
		  const configDir = olamConfigDir();
		  const canonical = path.join(configDir, 'secrets', 'plan-chat-secret');
		  const legacy = path.join(configDir, 'plan-chat-secret');
		  // First existing candidate wins; canonical is also the fallback.
		  const existing = [canonical, legacy].find((candidate) => fs.existsSync(candidate));
		  return existing ?? canonical;
		}

		// Secret file location, fixed once at module load: the
		// OLAM_PLAN_CHAT_SECRET_PATH env override wins, else the resolved file path.
		export const SECRET_PATH =
		process.env.OLAM_PLAN_CHAT_SECRET_PATH ?? resolvePlanChatSecretPath();
		// Directory that contains SECRET_PATH.
		export const SECRET_DIR = path.dirname(SECRET_PATH);
		const SECRET_BYTES = 32; // 64 hex chars
		// Permission bits applied to the secret file (owner read/write only).
		const SECRET_MODE = 0o600;

		/**
		 * Create a new random bearer: SECRET_BYTES bytes from crypto.randomBytes,
		 * hex-encoded (two hex characters per byte).
		 *
		 * @returns {string} lowercase hex string
		 */
		export function generateSecret() {
		  const raw = crypto.randomBytes(SECRET_BYTES);
		  return raw.toString('hex');
		}

		/**
		 * Read one secret FILE and return its trimmed contents.
		 *
		 * @param {string} secretPath - file to read
		 * @returns {?string} the bearer, or null when the file does not exist (ENOENT)
		 *   or contains only whitespace
		 * @throws any other read error (e.g. permission problems) is rethrown as-is
		 */
		function readSecretFile(secretPath) {
		  let raw;
		  try {
		    raw = fs.readFileSync(secretPath, 'utf8');
		  } catch (err) {
		    const isMissingFile =
		      err && typeof err === 'object' && 'code' in err && err.code === 'ENOENT';
		    if (isMissingFile) return null;
		    throw err;
		  }
		  const bearer = raw.trim();
		  return bearer === '' ? null : bearer;
		}

		/**
		 * Read the plan-chat bearer.
		 *
		 * Default path (no argument, or SECRET_PATH passed explicitly): the secret
		 * FILE wins when it holds a value; otherwise the config.json key
		 * `cloud.secrets.plan-chat-secret` is consulted; otherwise null. Keeping the
		 * file leg first means an existing on-disk bearer is returned unchanged and
		 * config.json only fills in when no file value exists.
		 *
		 * Any other `secretPath` (write/rotate read-backs, tests): file-only — the
		 * config.json leg is skipped so the caller gets deterministic file semantics.
		 *
		 * @param {string} [secretPath=SECRET_PATH] - secret file to read
		 * @returns {?string} the bearer, or null when none is found
		 * @throws file read errors other than "file missing" propagate from readSecretFile
		 */
		export function readSecret(secretPath = SECRET_PATH) {
		  const bearerFromFile = readSecretFile(secretPath);
		  if (bearerFromFile !== null) return bearerFromFile;
		  // Caller owns a specific path: no config.json fallback.
		  if (secretPath !== SECRET_PATH) return null;
		  return readConfigString('cloud.secrets.plan-chat-secret'); // string or null
		}

		/**
		 * Write the bearer to disk atomically: the value (plus a trailing newline) goes
		 * to a uniquely named tmpfile next to the destination, which is then renamed
		 * over `secretPath`.
		 *
		 * Creates the parent directory of `secretPath` if missing (mode 0700 applies
		 * only to directories created here). The tmpfile is created with SECRET_MODE
		 * and chmod-ed again before the rename, because the creation mode is subject
		 * to the process umask.
		 *
		 * @param {string} value - non-empty bearer to persist
		 * @param {string} [secretPath=SECRET_PATH] - destination file
		 * @throws {Error} when `value` is not a non-empty string; filesystem errors are
		 *   rethrown after a best-effort removal of the tmpfile
		 */
		export function writeSecret(value, secretPath = SECRET_PATH) {
		  const isNonEmptyString = typeof value === 'string' && value.length > 0;
		  if (!isNonEmptyString) {
		    throw new Error('plan-chat-secret: refusing to write empty bearer');
		  }
		  fs.mkdirSync(path.dirname(secretPath), { recursive: true, mode: 0o700 });
		  // pid + timestamp can repeat within one millisecond; the random suffix keeps
		  // two writes in the same process from sharing a tmpfile.
		  const nonce = crypto.randomBytes(4).toString('hex');
		  const tmp = `${secretPath}.tmp-${process.pid}-${Date.now()}-${nonce}`;
		  try {
		    // The write sits inside the try so a partial write is cleaned up too.
		    fs.writeFileSync(tmp, `${value}\n`, { mode: SECRET_MODE });
		    fs.chmodSync(tmp, SECRET_MODE);
		    fs.renameSync(tmp, secretPath);
		  } catch (err) {
		    try { fs.unlinkSync(tmp); } catch { /* best-effort cleanup; original error wins */ }
		    throw err;
		  }
		}

		/**
		 * Return the existing bearer, or generate, persist and return a new one when
		 * readSecret finds none.
		 *
		 * The read and the write are separate steps, so two processes that both find
		 * no bearer will each write one.
		 *
		 * @param {string} [secretPath=SECRET_PATH] - secret file to read / create
		 * @returns {string} the bearer now in effect for this caller
		 */
		export function ensureSecret(secretPath = SECRET_PATH) {
		  const current = readSecret(secretPath);
		  if (!current) {
		    const minted = generateSecret();
		    writeSecret(minted, secretPath);
		    return minted;
		  }
		  return current;
		}

		/**
		 * Replace the stored bearer with a newly generated one (written via
		 * writeSecret) and hand the new value back. A running plan-chat-service must
		 * be restarted by the caller to pick up the new bearer.
		 *
		 * @param {string} [secretPath=SECRET_PATH] - secret file to overwrite
		 * @returns {string} the new bearer
		 */
		export function rotateSecret(secretPath = SECRET_PATH) {
		  const replacement = generateSecret();
		  writeSecret(replacement, secretPath);
		  return replacement;
		}

		/**
		 * Constant-time string compare for bearer checks. Returns true iff both
		 * arguments are non-empty strings whose UTF-8 encodings are byte-equal.
		 *
		 * Lengths are compared on the encoded BYTES, not on string length:
		 * crypto.timingSafeEqual throws when its inputs differ in byte length, and two
		 * strings with the same `.length` can encode to different byte counts
		 * (e.g. 'é' vs 'a'). Such pairs return false instead of throwing.
		 *
		 * @param {unknown} a
		 * @param {unknown} b
		 * @returns {boolean}
		 */
		export function timingSafeEqual(a, b) {
		  if (typeof a !== 'string') return false;
		  if (typeof b !== 'string') return false;
		  if (a.length === 0) return false;
		  if (b.length === 0) return false;
		  const left = Buffer.from(a, 'utf8');
		  const right = Buffer.from(b, 'utf8');
		  if (left.length !== right.length) return false;
		  return crypto.timingSafeEqual(left, right);
		}

host-cp/src/plan-chat-service.mjs

Sorry, the diff of this file is too big to display

-1112

host-cp/src/plan-orchestrator.mjs

		// plan-orchestrator.mjs — Phase 2: multi-persona conversation coordinator.
		//
		// Architecture:
		// - AgentRegistry holds one pi AgentSession per (conversationId, personaId).
		// - HandoffEngine forks the session tree when the active persona changes.
		// - All persona turns share one session.jsonl per conversation.
		// - SSE sinks are an in-process Set<ServerResponse> per conversationId.
		//
		// Credentials:
		// - Uses the Olam auth-service vault (same as the rest of host-cp).
		// - No ANTHROPIC_API_KEY required; tokens fetched on demand via auth-service.

		import path from 'node:path';
		import os from 'node:os';
		import fs from 'node:fs';
		import { randomUUID } from 'node:crypto';
		import Database from 'better-sqlite3';
		import { SessionManager } from '@mariozechner/pi-coding-agent';
		import { PERSONAS, DEFAULT_PERSONA_ID, getPersona } from './plan/personas.mjs';
		import { AgentRegistry } from './plan/agent-registry.mjs';
		import { HandoffEngine } from './plan/handoff-engine.mjs';
		import { RopeEngine } from './plan/rope-engine.mjs';
		import { loadAuthorityConfig } from './plan/authority-config.mjs';
		import { isPathVaultUrl, ensurePathVaultProxy } from './plan/path-vault-proxy.mjs';
		// Phase D (olam-config-store-unification): config.json reader (dep-free, copied
		// from packages/core/src/cloud-state/read-config-value.mjs — host-cp has no
		// @olam/core dep). Container-aware /data path resolution lives in config-reader.mjs.
		import { readConfigString, olamConfigDir } from './config-reader.mjs';

		// ── Cloud path-vault fallback ───────────────────────────────────────────────
		//
		// When the local auth-service vault has no Claude credential, the plan agent can
		// instead reach Claude through the operator's cloud path-vault URL. Resolution
		// mirrors server.mjs readAnthropicBaseUrl() (kept independent so this module has
		// no server.mjs dependency):
		// 1. OLAM_ANTHROPIC_BASE_URL env var
		// 2. config.json `cloud.urls.anthropic-base-url`
		// 3. <config dir>/anthropic-base-url file (legacy; config dir is /data or ~/.olam)
		// 4. ANTHROPIC_BASE_URL env var
		//
		// Only PATH-FORMAT vault URLs (https://host/auth/<sub>/<secret>) are usable as a
		// fallback — they self-authenticate, so no live token is required.

		/** Placeholder api-key handed to the agent runtime in path-vault mode. The
		* path prefix is the real credential; the proxy strips this header. */
		const PATH_VAULT_PLACEHOLDER_KEY = 'path-vault-proxy';

		/**
		 * Resolve the configured Anthropic base URL for the path-vault fallback.
		 *
		 * Sources, first non-empty wins:
		 *   1. OLAM_ANTHROPIC_BASE_URL env var
		 *   2. config.json `cloud.urls.anthropic-base-url`
		 *   3. legacy `<config dir>/anthropic-base-url` file
		 *   4. ANTHROPIC_BASE_URL env var
		 *
		 * Env values are trimmed BEFORE the emptiness check, so a whitespace-only
		 * variable counts as unset and does not mask the later sources.
		 *
		 * @returns {string} the configured Anthropic base URL, or '' if none.
		 */
		function readAnthropicBaseUrlForFallback() {
		  const fromOlamEnv = (process.env['OLAM_ANTHROPIC_BASE_URL'] ?? '').trim();
		  if (fromOlamEnv.length > 0) return fromOlamEnv;
		  // Phase D: config.json leg sits BETWEEN the two env legs (mirrors
		  // server.mjs readAnthropicBaseUrl + resolver.ts getAnthropicBaseUrl).
		  const fromConfig = readConfigString('cloud.urls.anthropic-base-url');
		  if (fromConfig !== null) return fromConfig;
		  try {
		    // Legacy fallback under the container-aware config dir (/data or ~/.olam).
		    const file = path.join(olamConfigDir(), 'anthropic-base-url');
		    const content = fs.readFileSync(file, 'utf-8').trim();
		    if (content.length > 0) return content;
		  } catch {
		    // file absent or unreadable — fall through to the shell env leg
		  }
		  // Last leg: yields '' when the variable is unset or blank.
		  return (process.env['ANTHROPIC_BASE_URL'] ?? '').trim();
		}

		// ── Paths ─────────────────────────────────────────────────────────────────────
		//
		// Inside the Docker container, os.homedir() → /root, but compose.yaml mounts
		// ${HOME}/.olam → /data. Without env overrides, plan.db would be written to
		// /root/.olam/plan.db (container ephemeral layer) and lost on every
		// `docker compose up --force-recreate` (i.e. every `olam upgrade`).
		//
		// OLAM_PLAN_DB_PATH and OLAM_PLAN_DIR are set to /data/plan.db and /data/plan
		// in compose.yaml so all writes land in the bind-mounted host directory.
		// On bare-host installs (no container) neither env var is set and the paths
		// fall back to the original ~/.olam locations — no behaviour change.
		//
		// Paths are resolved at construction time (not module load) so tests can pass
		// explicit paths via constructor opts without any module re-import tricks.

		/**
		 * Default plan DB location: the OLAM_PLAN_DB_PATH env var when it is set (even
		 * to an empty string), otherwise `<home>/.olam/plan.db`.
		 * @returns {string}
		 */
		function defaultPlanDbPath() {
		  const { OLAM_PLAN_DB_PATH: override } = process.env;
		  if (override == null) return path.join(os.homedir(), '.olam', 'plan.db');
		  return override;
		}

		/**
		 * Default plan directory: the OLAM_PLAN_DIR env var when it is set (even to an
		 * empty string), otherwise `<home>/.olam/plan`.
		 * @returns {string}
		 */
		function defaultPlanDir() {
		  const { OLAM_PLAN_DIR: override } = process.env;
		  if (override == null) return path.join(os.homedir(), '.olam', 'plan');
		  return override;
		}

		// ── Helpers ───────────────────────────────────────────────────────────────────

		/**
		 * Create (or overwrite) a session file holding a single JSON header line:
		 * type 'session', version 3, the given id, the current ISO timestamp, and the
		 * user's home directory as `cwd`.
		 *
		 * @param {string} sessionFile - path of the file to write
		 * @param {string} sessionId - value stored as the header's `id`
		 */
		function initSessionFile(sessionFile, sessionId) {
		  const headerLine = JSON.stringify({
		    type: 'session',
		    version: 3,
		    id: sessionId,
		    timestamp: new Date().toISOString(),
		    cwd: os.homedir(),
		  });
		  fs.writeFileSync(sessionFile, `${headerLine}\n`);
		}

		/**
		 * Build a short conversation title from message text.
		 *
		 * Whitespace runs are collapsed to single spaces and the ends trimmed. Text
		 * that fits in `maxLen` is returned as-is; longer text is cut at `maxLen`,
		 * backed up to the last space inside the cut (when there is one past index 0),
		 * and suffixed with '…'. Blank input yields '(empty)'.
		 *
		 * @param {string} content
		 * @param {number} [maxLen=40]
		 * @returns {string}
		 */
		export function deriveTitle(content, maxLen = 40) {
		  const normalized = content.trim().replace(/\s+/g, ' ');
		  if (normalized.length === 0) return '(empty)';
		  if (normalized.length <= maxLen) return normalized;
		  const head = normalized.slice(0, maxLen);
		  const boundary = head.lastIndexOf(' ');
		  const kept = boundary > 0 ? head.slice(0, boundary) : head;
		  return `${kept}…`;
		}

		// ── PlanOrchestrator ──────────────────────────────────────────────────────────

		export class PlanOrchestrator {
		// better-sqlite3 Database handle, opened in the constructor.
		#db;
		// Directory under which each conversation gets its own `<id>/session.jsonl`.
		#planDir;
		// Auth-service URL and secret as passed to the constructor.
		#authServiceUrl;
		#authServiceSecret;
		// AgentRegistry built from the auth-service settings; also used for token fetches.
		#registry;
		// HandoffEngine constructed over #registry.
		#handoffEngine;
		// RopeEngine wired to #registry, #db and this instance's #broadcast.
		#ropeEngine;

		/** Tracks the active persona per conversationId: Map<conversationId, personaId> */
		#activePersona = new Map();

		/** @type {Map<string, Set<import('node:http').ServerResponse>>} */
		#sinks = new Map();

		/**
		* Ring buffer of in-flight SSE events per conversationId.
		* Populated while a turn is active; cleared after all persona turn_complete events.
		* Used by drainReplayBuffer to replay missed events on reconnect.
		* @type {Map<string, Array<{event: string, data: object}>>}
		*/
		#activeTurns = new Map();

		/**
		* Number of persona turn_complete events still pending per conversationId.
		* Replay buffer is only cleared when this reaches 0.
		* @type {Map<string, number>}
		*/
		#pendingPersonaCount = new Map();

		/**
		* Mutable current-chunk refs per conversationId.
		* ChunkEmitter updates these; read_sidebar tool reads them.
		* @type {Map<string, { current: string\|null }>}
		*/
		#currentChunkRefs = new Map();

		/**
		* Open (creating if needed) the plan database, ensure its schema exists, and
		* wire up the agent registry, handoff engine and rope engine.
		*
		* @param {{
		* authServiceUrl: string,
		* authServiceSecret: string,
		* planDbPath?: string,
		* planDirPath?: string,
		* }} opts
		*
		* planDbPath / planDirPath default to OLAM_PLAN_DB_PATH / OLAM_PLAN_DIR env vars,
		* falling back to ~/.olam/plan.db and ~/.olam/plan. Pass explicitly in tests to
		* avoid touching real home-dir paths.
		*/
		constructor({ authServiceUrl, authServiceSecret, planDbPath, planDirPath } = {}) {
		this.#authServiceUrl = authServiceUrl;
		this.#authServiceSecret = authServiceSecret;

		// Home-dir location used before the env overrides existed; it is the source
		// of the one-time migration copy below.
		const legacyDbPath = path.join(os.homedir(), '.olam', 'plan.db');
		// Track whether the caller injected an explicit DB path (used to skip the
		// legacy-path migration below — tests inject tmpDir paths and must not
		// inherit the operator's real plan.db).
		const planDbPathInjected = planDbPath !== undefined;
		const resolvedDbPath = planDbPath ?? defaultPlanDbPath();
		this.#planDir = planDirPath ?? defaultPlanDir();

		this.#registry = new AgentRegistry({ authServiceUrl, authServiceSecret });
		this.#handoffEngine = new HandoffEngine(this.#registry);

		// The DB's parent directory must exist before the copy/open below.
		fs.mkdirSync(path.dirname(resolvedDbPath), { recursive: true });

		// One-time migration: if the resolved DB path differs from the legacy default and
		// the target doesn't exist yet, copy any existing DB from the old location.
		// This preserves conversations on a hot-restart after deploying the compose.yaml fix.
		// On full container recreate the legacy path is already gone — this is a no-op.
		//
		// Skip when the caller injected an explicit planDbPath — that's the unit-
		// test shape (each test owns a tmpDir db). Pre-fix history: tests on a host
		// with a populated `~/.olam/plan.db` got every `listConversations()` query
		// polluted by real operator data because the migration eagerly copied the
		// legacy file into the test's tmpDir.
		if (
		!planDbPathInjected &&
		resolvedDbPath !== legacyDbPath &&
		!fs.existsSync(resolvedDbPath) &&
		fs.existsSync(legacyDbPath)
		) {
		try {
		fs.copyFileSync(legacyDbPath, resolvedDbPath);
		console.info('[plan] Migrated plan.db from legacy path to', resolvedDbPath);
		} catch (err) {
		// Best-effort: a failed copy is only logged; the constructor carries on and
		// opens whatever exists at resolvedDbPath.
		console.warn('[plan] plan.db migration failed (non-fatal):', err.message);
		}
		}

		// Open the SQLite file, then create any tables / indexes that are missing
		// (every statement is IF NOT EXISTS, so re-running is harmless).
		this.#db = new Database(resolvedDbPath);
		this.#db.exec(`
		CREATE TABLE IF NOT EXISTS plan_conversations (
		id TEXT PRIMARY KEY,
		title TEXT,
		persona TEXT NOT NULL DEFAULT 'brainstorm',
		created_at INTEGER NOT NULL,
		last_turn_at INTEGER
		);
		CREATE TABLE IF NOT EXISTS plan_turns (
		id TEXT PRIMARY KEY,
		conversation_id TEXT NOT NULL REFERENCES plan_conversations(id),
		role TEXT NOT NULL,
		content TEXT NOT NULL DEFAULT '',
		persona TEXT,
		from_persona TEXT,
		to_persona TEXT,
		mode TEXT,
		fork_node_id TEXT,
		created_at INTEGER NOT NULL
		);
		CREATE INDEX IF NOT EXISTS plan_turns_conv_idx
		ON plan_turns(conversation_id, created_at);

		-- Phase 4B: lookout agent registry per conversation
		CREATE TABLE IF NOT EXISTS plan_lookout_agents (
		conversation_id TEXT NOT NULL,
		persona_id TEXT NOT NULL,
		muted INTEGER NOT NULL DEFAULT 0,
		mode TEXT NOT NULL DEFAULT 'observe',
		created_at INTEGER NOT NULL,
		PRIMARY KEY (conversation_id, persona_id)
		);

		-- Phase 4B: sidebar signals from lookout agents
		CREATE TABLE IF NOT EXISTS plan_sidebar_signals (
		id TEXT PRIMARY KEY,
		conversation_id TEXT NOT NULL,
		agent_id TEXT NOT NULL,
		urgency TEXT NOT NULL DEFAULT 'p2',
		reason TEXT NOT NULL DEFAULT '',
		content TEXT NOT NULL DEFAULT '',
		chunk_id TEXT NOT NULL,
		created_at INTEGER NOT NULL,
		status TEXT NOT NULL DEFAULT 'active',
		tension_subject TEXT,
		parent_signal_id TEXT
		);
		CREATE INDEX IF NOT EXISTS plan_sidebar_conv_idx
		ON plan_sidebar_signals(conversation_id, created_at);
		CREATE INDEX IF NOT EXISTS plan_sidebar_chunk_idx
		ON plan_sidebar_signals(chunk_id);
		`);

		// Migration guard: add pinned column if the table predates this feature.
		const planConvCols = this.#db.prepare(`PRAGMA table_info(plan_conversations)`).all();
		if (!planConvCols.some(c => c.name === 'pinned')) {
		this.#db.exec(`ALTER TABLE plan_conversations ADD COLUMN pinned INTEGER NOT NULL DEFAULT 0`);
		}

		// The rope engine shares this instance's registry and DB handle and emits
		// SSE events through #broadcast.
		const authorityConfig = loadAuthorityConfig();
		this.#ropeEngine = new RopeEngine({
		registry: this.#registry,
		db: this.#db,
		broadcast: (cId, evt, data) => this.#broadcast(cId, evt, data),
		authorityConfig,
		});
		}

		// ── Auth-service credential fetching ──────────────────────────────────────

		/**
		* Fetch a Claude credential token for an about-to-run turn.
		*
		* Returns a real vault token when the local vault has one, OR a placeholder
		* token in cloud path-vault fallback mode. As a SIDE EFFECT it points the
		* AgentRegistry at the right Anthropic base URL (localhost proxy in path-vault
		* mode, cleared otherwise) BEFORE any runtime is built — every persona /
		* rope / handoff runtime resolves its model base URL from the registry.
		*
		* Used by all turn-dispatch call sites (dispatch, rope-engine, handoff-engine)
		* via `fetchToken: () => this.#fetchToken()`, so the fallback applies uniformly
		* without changing those call sites.
		*
		* @returns {Promise<string>}
		*/
		async #fetchToken() {
		const cred = await this.#resolveCredential();
		// setAnthropicBaseUrl points runtimes at the localhost path-vault proxy in
		// fallback mode (else clears it). Guard for registries that predate the
		// method or are test doubles — the path-vault override is best-effort.
		if (typeof this.#registry.setAnthropicBaseUrl === 'function') {
		this.#registry.setAnthropicBaseUrl(cred.mode === 'path-vault' ? cred.baseUrl : null);
		}
		return cred.token;
		}

		/**
		* Resolve a credential for the plan agent, preferring the local auth-service
		* vault and falling back to the operator's cloud path-vault URL when the local
		* vault is empty.
		*
		* @typedef {{ mode: 'vault', token: string }
		* \| { mode: 'path-vault', token: string, baseUrl: string }} CredentialResolution
		*
		* @returns {Promise<CredentialResolution>}
		*/
		async #resolveCredential() {
		// 1. Prefer the local vault. When it has a credential, behavior is unchanged.
		// Call the registry directly (NOT #fetchToken) — #fetchToken delegates
		// back here, so going through it would recurse.
		try {
		const token = await this.#registry.fetchToken('claude');
		return { mode: 'vault', token };
		} catch (err) {
		// Only fall back on a missing credential — surface real auth-service errors
		// (timeouts, 5xx) so they don't get masked by the path-vault path.
		if (err?.code && err.code !== 'NO_CREDENTIAL') throw err;
		}

		// 2. Fall back to the cloud path-vault URL, if configured + path-format.
		const baseUrl = readAnthropicBaseUrlForFallback();
		if (!isPathVaultUrl(baseUrl)) {
		// No usable fallback — re-raise the original NO_CREDENTIAL shape so callers
		// (hasCredential / dispatch) behave exactly as before.
		const e = new Error('no active claude credential in vault');
		e.code = 'NO_CREDENTIAL';
		throw e;
		}

		const localBaseUrl = await ensurePathVaultProxy(baseUrl);
		return { mode: 'path-vault', token: PATH_VAULT_PLACEHOLDER_KEY, baseUrl: localBaseUrl };
		}

		/**
		* Lightweight check — returns true when a credential is reachable, either from
		* the local vault OR the cloud path-vault fallback.
		* @returns {Promise<boolean>}
		*/
		async hasCredential() {
		try {
		await this.#resolveCredential();
		return true;
		} catch {
		return false;
		}
		}

		// ── Conversation management ───────────────────────────────────────────────

		/**
		* @param {{ title?: string }} [opts]
		* @returns {{ id: string, title: string\|null, persona: string, created_at: number }}
		*/
		createConversation({ title } = {}) {
		const id = randomUUID();
		const created_at = Date.now();

		const sessionDir = path.join(this.#planDir, id);
		fs.mkdirSync(sessionDir, { recursive: true });
		initSessionFile(path.join(sessionDir, 'session.jsonl'), id);

		this.#db
		.prepare(
		`INSERT INTO plan_conversations (id, title, persona, created_at)
		VALUES (?, ?, ?, ?)`,
		)
		.run(id, title ?? null, DEFAULT_PERSONA_ID, created_at);

		this.#activePersona.set(id, DEFAULT_PERSONA_ID);

		return { id, title: title ?? null, persona: DEFAULT_PERSONA_ID, created_at };
		}

		/** @returns {Array<{id, title, pinned, created_at, last_turn_at, persona, snippet}>} */
		listConversations() {
		return this.#db
		.prepare(
		`SELECT
		c.id, c.title, c.pinned, c.created_at, c.last_turn_at, c.persona,
		(SELECT pt.content FROM plan_turns pt
		WHERE pt.conversation_id = c.id
		ORDER BY pt.created_at DESC LIMIT 1) AS snippet
		FROM plan_conversations c
		ORDER BY c.pinned DESC, COALESCE(c.last_turn_at, c.created_at) DESC, c.rowid DESC`,
		)
		.all();
		}

		/**
		* Patch a conversation's title and/or pinned state.
		* @param {string} id
		* @param {{ title?: string, pinned?: boolean }} updates
		* @returns {object\|null} Updated row, or null if not found.
		*/
		patchConversation(id, updates) {
		const parts = [];
		const values = [];
		if (updates.title !== undefined) {
		parts.push('title = ?');
		values.push(updates.title \|\| null);
		}
		if (updates.pinned !== undefined) {
		parts.push('pinned = ?');
		values.push(updates.pinned ? 1 : 0);
		}
		if (parts.length === 0) return null;
		values.push(id);
		const changed = this.#db
		.prepare(`UPDATE plan_conversations SET ${parts.join(', ')} WHERE id = ?`)
		.run(...values);
		if (changed.changes === 0) return null;
		return this.#db
		.prepare(`SELECT id, title, pinned, created_at, last_turn_at, persona FROM plan_conversations WHERE id = ?`)
		.get(id) ?? null;
		}

		/**
		* Delete a conversation and all its associated data.
		* @param {string} id
		* @returns {boolean} true if deleted, false if not found.
		*/
		deleteConversation(id) {
		const exists = this.#db
		.prepare(`SELECT 1 FROM plan_conversations WHERE id = ?`)
		.get(id);
		if (!exists) return false;

		this.#db.prepare(`DELETE FROM plan_turns WHERE conversation_id = ?`).run(id);
		this.#db.prepare(`DELETE FROM plan_lookout_agents WHERE conversation_id = ?`).run(id);
		this.#db.prepare(`DELETE FROM plan_sidebar_signals WHERE conversation_id = ?`).run(id);
		this.#db.prepare(`DELETE FROM plan_conversations WHERE id = ?`).run(id);

		this.#activePersona.delete(id);
		this.#sinks.delete(id);
		this.#activeTurns.delete(id);
		this.#currentChunkRefs.delete(id);

		const sessionDir = path.join(this.#planDir, id);
		try { fs.rmSync(sessionDir, { recursive: true }); } catch { /* ok if missing */ }

		return true;
		}

		/**
		* @param {string} id
		* @returns {{ id, title, persona, created_at, last_turn_at, tree } \| null}
		*/
		getConversation(id) {
		const row = this.#db
		.prepare(
		`SELECT id, title, persona, created_at, last_turn_at
		FROM plan_conversations WHERE id = ?`,
		)
		.get(id);

		if (!row) return null;

		const sessionFile = path.join(this.#planDir, id, 'session.jsonl');
		let tree = [];
		try {
		const mgr = SessionManager.open(sessionFile, path.join(this.#planDir, id));
		tree = mgr.getTree();
		} catch {
		// Session file missing or corrupt — return empty tree.
		}

		return { ...row, tree };
		}

		// ── Active persona management ─────────────────────────────────────────────

		/**
		* @param {string} conversationId
		* @returns {string} Active persona ID.
		*/
		getActivePersona(conversationId) {
		if (this.#activePersona.has(conversationId)) {
		return this.#activePersona.get(conversationId);
		}
		const row = this.#db
		.prepare(`SELECT persona FROM plan_conversations WHERE id = ?`)
		.get(conversationId);
		const personaId = row?.persona ?? DEFAULT_PERSONA_ID;
		this.#activePersona.set(conversationId, personaId);
		return personaId;
		}

		/**
		* Set the active default persona for a conversation (does NOT trigger a handoff).
		* @param {string} conversationId
		* @param {string} personaId
		*/
		setActivePersona(conversationId, personaId) {
		this.#activePersona.set(conversationId, personaId);
		this.#db
		.prepare(`UPDATE plan_conversations SET persona = ? WHERE id = ?`)
		.run(personaId, conversationId);
		}

		// ── SSE broadcast ─────────────────────────────────────────────────────────

		#broadcast(conversationId, eventName, data) {
		// Buffer event while a turn is active for reconnect replay.
		const buf = this.#activeTurns.get(conversationId);
		if (buf) {
		buf.push({ event: eventName, data });
		}

		const sinks = this.#sinks.get(conversationId);
		if (!sinks \|\| sinks.size === 0) return;
		const chunk = `event: ${eventName}\ndata: ${JSON.stringify(data)}\n\n`;
		for (const res of sinks) {
		try { res.write(chunk); } catch { /* client disconnected */ }
		}

		// Clear buffer only when all pending personas have completed.
		if (eventName === 'turn_complete') {
		const pending = (this.#pendingPersonaCount.get(conversationId) ?? 1) - 1;
		if (pending <= 0) {
		this.#activeTurns.delete(conversationId);
		this.#pendingPersonaCount.delete(conversationId);
		} else {
		this.#pendingPersonaCount.set(conversationId, pending);
		}
		}
		}

		// ── Lookout agent management ──────────────────────────────────────────────

		/**
		* Invite a persona as a lookout for a conversation.
		* @param {string} conversationId
		* @param {string} personaId
		* @returns {{ persona_id: string, state: string, muted: boolean, mode: string }}
		*/
		inviteLookout(conversationId, personaId) {
		const now = Date.now();
		this.#db
		.prepare(`INSERT OR IGNORE INTO plan_lookout_agents (conversation_id, persona_id, muted, mode, created_at) VALUES (?, ?, 0, 'observe', ?)`)
		.run(conversationId, personaId, now);
		const agent = { persona_id: personaId, state: 'listening', muted: false, mode: 'observe' };
		this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: 'listening' });
		return agent;
		}

		/**
		* Update muted status (or mode) for a lookout agent.
		* @param {string} conversationId
		* @param {string} personaId
		* @param {{ muted?: boolean, mode?: string }} updates
		* @returns {{ persona_id: string, state: string, muted: boolean, mode: string } \| null}
		*/
		updateLookout(conversationId, personaId, { muted, mode } = {}) {
		const row = this.#db
		.prepare(`SELECT * FROM plan_lookout_agents WHERE conversation_id = ? AND persona_id = ?`)
		.get(conversationId, personaId);
		if (!row) return null;

		const newMuted = muted !== undefined ? (muted ? 1 : 0) : row.muted;
		const newMode = mode ?? row.mode;
		this.#db
		.prepare(`UPDATE plan_lookout_agents SET muted = ?, mode = ? WHERE conversation_id = ? AND persona_id = ?`)
		.run(newMuted, newMode, conversationId, personaId);

		const newState = newMuted ? 'idle' : 'listening';
		this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: newState });
		return { persona_id: personaId, state: newState, muted: !!newMuted, mode: newMode };
		}

		/**
		* Remove a lookout agent.
		* @param {string} conversationId
		* @param {string} personaId
		*/
		uninviteLookout(conversationId, personaId) {
		this.#db
		.prepare(`DELETE FROM plan_lookout_agents WHERE conversation_id = ? AND persona_id = ?`)
		.run(conversationId, personaId);
		}

		/**
		* List active lookout agents for a conversation.
		* @param {string} conversationId
		* @returns {Array<{ persona_id: string, state: string, muted: boolean, mode: string }>}
		*/
		listLookoutAgents(conversationId) {
		const rows = this.#db
		.prepare(`SELECT persona_id, muted, mode FROM plan_lookout_agents WHERE conversation_id = ?`)
		.all(conversationId);
		return rows.map((r) => ({
		persona_id: r.persona_id,
		state: r.muted ? 'idle' : 'listening',
		muted: !!r.muted,
		mode: r.mode,
		}));
		}

		// ── Sidebar signal management ─────────────────────────────────────────────

		/**
		* Dismiss a sidebar signal.
		* @param {string} conversationId
		* @param {string} signalId
		* @returns {boolean}
		*/
		dismissSignal(conversationId, signalId) {
		const info = this.#db
		.prepare(`UPDATE plan_sidebar_signals SET status = 'dismissed' WHERE id = ? AND conversation_id = ?`)
		.run(signalId, conversationId);
		return info.changes > 0;
		}

		/**
		* Mark a sidebar signal as used (for next turn context).
		* @param {string} conversationId
		* @param {string} signalId
		* @returns {boolean}
		*/
		useSignal(conversationId, signalId) {
		const info = this.#db
		.prepare(`UPDATE plan_sidebar_signals SET status = 'used' WHERE id = ? AND conversation_id = ?`)
		.run(signalId, conversationId);
		return info.changes > 0;
		}

		/**
		* List sidebar signals for a conversation (optionally filtered by chunk_id).
		* @param {string} conversationId
		* @param {string} [chunkId]
		* @returns {Array<object>}
		*/
		listSignals(conversationId, chunkId) {
		if (chunkId) {
		return this.#db
		.prepare(`SELECT * FROM plan_sidebar_signals WHERE conversation_id = ? AND chunk_id = ? ORDER BY created_at ASC`)
		.all(conversationId, chunkId);
		}
		return this.#db
		.prepare(`SELECT * FROM plan_sidebar_signals WHERE conversation_id = ? ORDER BY created_at ASC`)
		.all(conversationId);
		}

		// ── Lookout analysis ──────────────────────────────────────────────────────

		/**
		* Persona-specific heuristics for lookout analysis.
		* Returns { urgency, content, reason, tension_subject? } when a comment is
		* warranted, or null when it is not.
		*
		* @param {string} personaId
		* @param {string} content — chunk content to analyze
		* @returns {{ urgency: string, content: string, reason: string, tension_subject?: string } | null}
		*/
		#analyzeChunkHeuristic(personaId, content) {
		const lower = content.toLowerCase();

		if (personaId === 'scout') {
		// Scout: flag unsubstantiated claims and factual assertions
		const claimPatterns = [
		/\b(research shows\|studies (show\|indicate\|suggest)\|data (shows\|indicates\|suggests))\b/i,
		/\b\d+(\.\d+)?\s*%\b/,
		/\b(always\|never\|all\|every\|none\|no one)\b/i,
		/\b(proven\|definitive\|certain\|guaranteed\|undeniable)\b/i,
		/\b(industry standard\|best practice\|widely accepted)\b/i,
		];
		const matched = claimPatterns.find((p) => p.test(content));
		if (matched) {
		return {
		urgency: 'p2',
		reason: 'Factual claim without cited source',
		content: 'This response contains claims that should be verified with evidence. What data or sources back this up?',
		};
		}
		// Scout spark: look for unexplored data angles
		if (lower.includes('option') \|\| lower.includes('approach') \|\| lower.includes('strategy')) {
		if (Math.random() < 0.3) {
		return {
		urgency: 'spark',
		reason: 'Potential evidence gap',
		content: '_What metrics or signals would tell us which option is actually better here?_',
		};
		}
		}
		}

		if (personaId === 'pm') {
		// PM: flag scope ambiguity and missing requirements
		const scopePatterns = [
		/\b(could\|might\|maybe\|perhaps\|possibly\|potentially)\b/i,
		/\b(later\|eventually\|someday\|future)\b/i,
		/\b(depends on\|unclear\|tbd\|to be determined)\b/i,
		];
		const matched = scopePatterns.find((p) => p.test(content));
		if (matched) {
		return {
		urgency: 'p1',
		reason: 'Scope ambiguity detected',
		content: 'Scope boundary needs clarification. What specifically is in vs. out for this iteration?',
		};
		}
		// PM: flag missing success criteria
		if ((lower.includes('implement') \|\| lower.includes('build') \|\| lower.includes('create')) && !lower.includes('success') && !lower.includes('metric') && !lower.includes('goal')) {
		if (Math.random() < 0.4) {
		return {
		urgency: 'p2',
		reason: 'Missing acceptance criteria',
		content: 'What does done look like here? Define the measurable success criteria before building.',
		};
		}
		}
		}

		if (personaId === 'brainstorm') {
		// Brainstorm: flag premature convergence on a single option
		const convergencePatterns = [
		/\b(the (best\|right\|correct\|only) (way\|approach\|solution\|option))\b/i,
		/\b(we should\|we must\|we need to\|the answer is)\b/i,
		/\b(obviously\|clearly\|simply\|just)\b/i,
		];
		const matched = convergencePatterns.find((p) => p.test(content));
		if (matched) {
		return {
		urgency: 'spark',
		reason: 'Early convergence on one path',
		content: '_Before narrowing: what\'s the alternative that explicitly rejects this approach? What would it look like?_',
		};
		}
		}

		return null;
		}

		/**
		* Run lookout analysis for all active lookout agents after a turn completes.
		* Emits sidebar_entry SSE events for any signals generated.
		*
		* @param {string} conversationId
		* @param {string} chunkId — the turn ID used as chunk reference
		* @param {string} chunkContent — the assistant's response text
		* @param {string} chunkPersona — which persona produced the chunk
		*/
		async #runLookoutAnalysis(conversationId, chunkId, chunkContent, chunkPersona) {
		const lookouts = this.#db
		.prepare(`SELECT persona_id, muted FROM plan_lookout_agents WHERE conversation_id = ? AND muted = 0`)
		.all(conversationId);

		for (const lookout of lookouts) {
		const { persona_id: personaId } = lookout;

		// Skip if this is the persona that produced the chunk
		if (personaId === chunkPersona) continue;

		// Emit thinking state
		this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: 'thinking' });

		// Small async gap to let the SSE event reach the client before analysis
		await new Promise((resolve) => setTimeout(resolve, 300 + Math.random() * 700));

		try {
		const analysis = this.#analyzeChunkHeuristic(personaId, chunkContent);

		if (analysis) {
		const signalId = randomUUID();
		const now = Date.now();
		this.#db
		.prepare(
		`INSERT INTO plan_sidebar_signals (id, conversation_id, agent_id, urgency, reason, content, chunk_id, created_at, status, tension_subject)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'active', ?)`,
		)
		.run(signalId, conversationId, personaId, analysis.urgency, analysis.reason, analysis.content, chunkId, now, analysis.tension_subject ?? null);

		const signal = {
		id: signalId,
		agent_id: personaId,
		urgency: analysis.urgency,
		reason: analysis.reason,
		content: analysis.content,
		chunk_id: chunkId,
		created_at: now,
		status: 'active',
		tension_subject: analysis.tension_subject ?? null,
		parent_signal_id: null,
		};

		this.#broadcast(
		conversationId,
		analysis.urgency === 'p0' ? 'interrupt' : 'sidebar_entry',
		{ signal },
		);
		}
		} catch (err) {
		console.error(`[plan] lookout analysis error ${conversationId}/${personaId}:`, err.message);
		}

		// Return to listening state
		this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: 'listening' });
		}
		}

		// ── Persona subscription setup ────────────────────────────────────────────

		/**
		* Wire pi event listeners for a session so tokens + turn_complete events are
		* forwarded to SSE clients.
		*
		* @param {string} conversationId
		* @param {string} personaId
		* @param {import('@mariozechner/pi-coding-agent').AgentSession} session
		*/
		#wireSessionEvents(conversationId, personaId, session) {
		// Registers one listener on the session. The closure captures conversationId
		// and personaId, so every forwarded event is attributed to this persona.
		// NOTE(review): each call passes a brand-new closure to session.subscribe;
		// whether repeated calls for the same session are de-duplicated depends on
		// the session implementation — confirm.
		session.subscribe((event) => {
		if (event.type === 'message_update') {
		// Streaming path: only text deltas are forwarded, as `token` events.
		const ae = event.assistantMessageEvent;
		if (ae.type === 'text_delta') {
		this.#broadcast(conversationId, 'token', { delta: ae.delta, persona: personaId });
		}
		} else if (event.type === 'agent_end') {
		// Completion path: the last message of the run is treated as the assistant turn.
		const msgs = event.messages;
		const last = msgs[msgs.length - 1];

		let persistedText = '';
		// Reuse the message's own id when it has one; otherwise mint a fresh UUID.
		let turnId = last?.id ?? randomUUID();

		// Persist the assistant turn so history loads correctly.
		if (last) {
		// Concatenate only the text parts of the message content.
		const text = (last.content ?? [])
		.filter((c) => c.type === 'text')
		.map((c) => c.text ?? '')
		.join('');
		if (text) {
		persistedText = text;
		const now = Date.now();
		// INSERT OR IGNORE: a row that already exists with this id is left as is.
		this.#db
		.prepare(
		`INSERT OR IGNORE INTO plan_turns
		(id, conversation_id, role, content, persona, created_at)
		VALUES (?, ?, 'assistant', ?, ?, ?)`,
		)
		.run(turnId, conversationId, text, personaId, now);
		}
		}

		// turn_complete is broadcast even when no text was persisted.
		this.#broadcast(conversationId, 'turn_complete', {
		turnId,
		persona: personaId,
		finishReason: last?.stopReason ?? 'end_turn',
		});
		this.#db
		.prepare(`UPDATE plan_conversations SET last_turn_at = ? WHERE id = ?`)
		.run(Date.now(), conversationId);

		// Trigger lookout analysis asynchronously — does not block the turn.
		// Skipped when there is no persisted text to analyze.
		if (persistedText) {
		this.#runLookoutAnalysis(conversationId, turnId, persistedText, personaId)
		.catch((err) => console.error('[plan] lookout run error:', err.message));
		}
		}
		});
		}

		// ── Public API ────────────────────────────────────────────────────────────

		/**
		* Submit a user turn to one or more personas in parallel.
		* When mentionedPersonas contains 2+ IDs, each receives its own AgentSession
		* and streams tokens with per-persona attribution via SSE `persona` field.
		* Returns immediately; tokens stream over SSE.
		*
		* @param {{
		* conversationId: string,
		* content: string,
		* personaOverride?: string,
		* mentionedPersonas?: string[],
		* }} params
		* @returns {Promise<{ turnId: string, persona: string }>}
		*/
		async submitTurn({ conversationId, content, personaOverride, mentionedPersonas }) {
		const row = this.#db
		.prepare(`SELECT id, title FROM plan_conversations WHERE id = ?`)
		.get(conversationId);

		if (!row) {
		const err = new Error('conversation not found');
		err.code = 'NOT_FOUND';
		throw err;
		}

		const now = Date.now();

		// Determine which personas will receive this turn.
		// Multi-persona: user @-mentioned 2+ personas explicitly.
		// Single-persona: use explicit override or the conversation's active persona.
		const personasToDispatch = (mentionedPersonas?.length ?? 0) > 1
		? mentionedPersonas
		: [personaOverride ?? this.getActivePersona(conversationId)];

		// Open (or reset) the replay buffer; track how many turn_complete events are expected.
		this.#activeTurns.set(conversationId, []);
		this.#pendingPersonaCount.set(conversationId, personasToDispatch.length);

		// Set title from first user message if still null.
		if (row.title === null) {
		this.#db
		.prepare(`UPDATE plan_conversations SET title = ? WHERE id = ?`)
		.run(deriveTitle(content), conversationId);
		}

		// Persist the user turn once (regardless of how many personas respond).
		this.#db
		.prepare(
		`INSERT INTO plan_turns (id, conversation_id, role, content, created_at)
		VALUES (?, ?, 'user', ?, ?)`,
		)
		.run(randomUUID(), conversationId, content, now);

		const isSinglePersona = personasToDispatch.length === 1;

		// Dispatch to each persona. For multi-persona turns, skip rope enrichment —
		// the user explicitly chose all participants, so no auto-delegation is needed.
		await Promise.all(personasToDispatch.map(async (pId) => {
		const onStubCall = (event) => {
		this.#broadcast(conversationId, 'tool_stub_call', { persona: pId, ...event });
		};

		// Refresh credential before each turn. MUST precede getAgent(): in cloud
		// path-vault fallback mode #fetchToken points the registry at the localhost
		// proxy base URL, and getAgent() bakes that base URL into the runtime's
		// model when it first builds the (cached) runtime.
		const token = await this.#fetchToken();

		const { session, authStorage } = await this.#registry.getAgent(conversationId, pId, { onStubCall });

		// Wire events on first use (idempotent because pi de-duplicates subscribers).
		this.#wireSessionEvents(conversationId, pId, session);

		authStorage.setRuntimeApiKey('anthropic', token);

		let promptContent = content;
		if (isSinglePersona) {
		// Pre-turn autoRope enrichment (Phase D): run any persona's autoRope rules
		// before the caller's session sees the content. Keeps pm_gathering_context
		// backward-compat; rope_start/rope_complete are emitted by RopeEngine.
		promptContent = await this.#ropeEngine.autoDelegateIfNeeded({
		conversationId,
		callerPersonaId: pId,
		content,
		fetchToken: () => this.#fetchToken(),
		});
		}

		session.prompt(promptContent).catch((err) => {
		console.error(`[plan] prompt error ${conversationId}/${pId}:`, err.message);
		this.#broadcast(conversationId, 'error', {
		message: err.message,
		code: err.code ?? 'PROMPT_ERROR',
		});
		});
		}));

		const turnId = randomUUID();
		return { turnId, persona: personasToDispatch[0] };
		}

		/**
		* Execute a handoff, switching the default active persona.
		*
		* @param {{
		* conversationId: string,
		* toPersona: string,
		* mode?: 'full' | 'distilled' | 'quoted',
		* selectedTurnIds?: string[],
		* }} params
		* @returns {Promise<{ handoffId: string, forkNodeId: string | null, seededTurnCount: number }>}
		*/
		async handoff({ conversationId, toPersona, mode = 'full', selectedTurnIds = [] }) {
		const row = this.#db
		.prepare(`SELECT id FROM plan_conversations WHERE id = ?`)
		.get(conversationId);
		if (!row) {
		const err = new Error('conversation not found');
		err.code = 'NOT_FOUND';
		throw err;
		}

		const fromPersona = this.getActivePersona(conversationId);
		const onStubCall = (event) => {
		this.#broadcast(conversationId, 'tool_stub_call', { persona: toPersona, ...event });
		};

		const result = await this.#handoffEngine.handoff({
		conversationId,
		fromPersona,
		toPersona,
		mode,
		selectedTurnIds,
		fetchToken: () => this.#fetchToken(),
		onStubCall,
		});

		// Update the active persona for this conversation.
		this.setActivePersona(conversationId, toPersona);

		// Persist handoff marker so history replay can reconstruct it.
		this.#db
		.prepare(
		`INSERT OR IGNORE INTO plan_turns
		(id, conversation_id, role, content, from_persona, to_persona, mode, fork_node_id, created_at)
		VALUES (?, ?, 'handoff', '', ?, ?, ?, ?, ?)`,
		)
		.run(result.handoffId, conversationId, fromPersona, toPersona, mode, result.forkNodeId ?? null, Date.now());

		// Broadcast the handoff event to SSE clients.
		this.#broadcast(conversationId, 'handoff', {
		handoffId: result.handoffId,
		fromPersona,
		toPersona,
		mode,
		forkNodeId: result.forkNodeId,
		});

		// Wire events for the new persona's session.
		try {
		const { session } = await this.#registry.getAgent(conversationId, toPersona, { onStubCall });
		this.#wireSessionEvents(conversationId, toPersona, session);
		} catch {
		// Best-effort — events will be wired on first turn if this fails.
		}

		return result;
		}

		/**
		* Replay buffered in-flight SSE events to a reconnecting client.
		* Call this before addEventSink so the client gets events it missed.
		* No-op if no turn is active.
		*
		* @param {string} conversationId
		* @param {import('node:http').ServerResponse} res
		*/
		drainReplayBuffer(conversationId, res) {
		const buf = this.#activeTurns.get(conversationId);
		if (!buf \|\| buf.length === 0) return;
		for (const { event, data } of buf) {
		try {
		res.write(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`);
		} catch { /* client closed before drain completed */ }
		}
		}

		/**
		* Register an SSE sink for a conversation. Returns a cleanup function.
		* @param {string} conversationId
		* @param {import('node:http').ServerResponse} res
		* @returns {() => void}
		*/
		addEventSink(conversationId, res) {
		if (!this.#sinks.has(conversationId)) {
		this.#sinks.set(conversationId, new Set());
		}
		this.#sinks.get(conversationId).add(res);
		return () => {
		const s = this.#sinks.get(conversationId);
		if (s) s.delete(res);
		};
		}

		/**
		* Return the ordered turn list for a conversation (for history replay).
		* Each turn is one of:
		* { role:'user'\|'assistant', content, persona?, created_at }
		* { role:'handoff', from_persona, to_persona, mode, fork_node_id, created_at }
		* @param {string} conversationId
		* @returns {Array<object>}
		*/
		getTurns(conversationId) {
		return this.#db
		.prepare(
		`SELECT id, role, content, persona, from_persona, to_persona, mode, fork_node_id, created_at
		FROM plan_turns
		WHERE conversation_id = ?
		ORDER BY created_at ASC`,
		)
		.all(conversationId);
		}

		/** Expose persona list for the /api/plan/personas endpoint. */
		listPersonas() {
		return PERSONAS.map((p) => ({
		id: p.id,
		displayName: p.displayName,
		model: p.model,
		toolNames: p.toolNames,
		systemPromptPreview: p.systemPrompt.length > 120
		? p.systemPrompt.slice(0, 117) + '...'
		: p.systemPrompt,
		}));
		}
		}

-282

host-cp/src/plan-progress.mjs

		/**
		* Plan progress parser — reads phase-*-tasks.md trackers to derive
		* phase/task state for the inbox progress bar.
		*
		* @module plan-progress
		*/

		import { readdirSync, readFileSync, statSync } from 'node:fs';
		import path from 'node:path';

		// Activity within this window counts as "recent": readPlanProgress uses it
		// to decide whether the first pending task is reported as working.
		const WORKING_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes

		/**
		 * Parse simple `key: value` pairs from a leading YAML frontmatter block (---…---).
		 * Handles single-line scalar values only — enough for feature/phase keys.
		 * Both LF and CRLF line endings are supported.
		 *
		 * @param {string} content
		 * @returns {Record<string, string>} empty object when there is no frontmatter
		 */
		function parseFrontmatter(content) {
			const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---/);
			if (!match) return {};

			const result = {};
			// Split on LF or CRLF. Splitting on "\n" alone leaves a trailing "\r" on
			// every interior line of a CRLF file, and because "." does not match
			// "\r", `(.+)$` would then reject those lines.
			for (const line of match[1].split(/\r?\n/)) {
				const m = line.match(/^([\w-]+):\s*(.+)$/);
				if (m) result[m[1]] = m[2].trim();
			}
			return result;
		}

		/**
		 * Extract task definitions from the "## Task list" section.
		 * Matches headings like:
		 *   ### A0 — name
		 *   ### B1 step 5 — multi-part name
		 *
		 * Everything after the "## Task list" heading is scanned for `###` task
		 * headings. A heading with no text after the id uses the id as its name.
		 *
		 * @param {string} content
		 * @returns {Array<{id: string, name: string}>}
		 */
		function extractTaskDefs(content) {
			// `*` quantifiers are required: without them the capture group holds a
			// single character and no task heading can ever be found.
			const sectionMatch = content.match(/^## Task list\s*\n([\s\S]*)/m);
			if (!sectionMatch) return [];

			const taskSection = sectionMatch[1];
			const tasks = [];
			for (const m of taskSection.matchAll(/^###\s+([A-Z]\d+)\b([^\n]*)/gm)) {
				const id = m[1];
				const rest = m[2].trim();
				// Strip leading em-dash, double-hyphen, or plain hyphen separator
				const name = rest.replace(/^\s*[—-]{1,2}\s*/, '').trim() || id;
				tasks.push({ id, name });
			}
			return tasks;
		}

		/**
		 * Extract completed task IDs from the CP0 log comment block.
		 * Matches lines like:
		 *   A0 (2026-05-05): ...
		 *   A2 (2026-05-05, rebase): ...
		 *
		 * Only the first `<!-- CP0 log … -->` HTML comment is inspected; matching
		 * lines outside it are ignored.
		 *
		 * @param {string} content
		 * @returns {Set<string>}
		 */
		function extractCp0Completed(content) {
			const completed = new Set();
			// Lazy `[\s\S]*?` spans the multi-line log body and stops at the first "-->".
			const logMatch = content.match(/<!--\s*CP0 log[\s\S]*?-->/);
			if (!logMatch) return completed;

			// A completed entry is a task id at the start of a line followed by "(".
			for (const m of logMatch[0].matchAll(/^([A-Z]\d+)\s*\(/gm)) {
				completed.add(m[1]);
			}
			return completed;
		}

		/**
		 * Extract completed task IDs from an item-format Status table.
		 * Matches rows like: | A1 | Tool loader index | done |
		 *
		 * Only rows inside the "## Status" section (up to the next "## " heading or
		 * the end of the document) are considered.
		 *
		 * @param {string} content
		 * @returns {Set<string>}
		 */
		function extractItemTableCompleted(content) {
			const completed = new Set();
			// No `m` flag — `$` must mean end-of-string so the lazy quantifier captures
			// the whole table, not just the first line.
			const statusMatch = content.match(/## Status\s*\n([\s\S]*?)(?=\n##\s|$)/);
			if (!statusMatch) return completed;

			// Row shape: | <id> | <anything without a pipe> | done |
			// The `i` flag makes both the id letter and "done" case-insensitive.
			for (const m of statusMatch[1].matchAll(/^\|\s*([A-Z]\d+)\s*\|[^|]+\|\s*done\s*\|/gim)) {
				completed.add(m[1]);
			}
			return completed;
		}

		/**
		 * Extract the authoritative done count from a count-format Status table.
		 * Matches rows like: | done | 3 |
		 *
		 * @param {string} content
		 * @returns {number|null} the count from the first matching row, or null when
		 *   no such row exists
		 */
		function extractDoneCount(content) {
			// Pipes are escaped because "|" is alternation in a regex; whitespace
			// around the cell values is optional.
			const m = content.match(/\|\s*done\s*\|\s*(\d+)\s*\|/i);
			return m ? Number.parseInt(m[1], 10) : null;
		}

		/**
		 * Resolve the feature slug from a branch name or by scanning docs/plans/.
		 *
		 * Strategy:
		 *   1. Strip "feat/" prefix, any nested path, and a trailing "-phase-X" suffix.
		 *   2. Exact match against plans subdirectory names.
		 *   3. Prefix match (branch slug starts with a plan dir name); when several
		 *      directories qualify, the longest (most specific) name wins.
		 *   4. Only when no branch is given: most-recently-modified plans dir that
		 *      has phase tracker files.
		 *
		 * @param {string} repoPath - path to the git checkout
		 * @param {string|null} branch
		 * @returns {string|null} null when docs/plans is unreadable or nothing matches
		 */
		function resolveFeatureSlug(repoPath, branch) {
			const plansDir = path.join(repoPath, 'docs', 'plans');
			let entries;
			try {
				entries = readdirSync(plansDir, { withFileTypes: true })
					.filter((d) => d.isDirectory())
					.map((d) => d.name);
			} catch {
				return null;
			}

			if (branch) {
				// Strip feat/ prefix, any nested path, and trailing -phase-X suffix
				const slug = branch
					.replace(/^feat\//, '')
					.replace(/\/.*$/, '')
					.replace(/-phase-[a-z]$/, '');

				// Exact match
				if (entries.includes(slug)) return slug;

				// Prefix match (slug starts with a plan dir name). Prefer the longest
				// directory name so "foo-bar" beats "foo" for slug "foo-bar-baz",
				// independent of directory listing order.
				let bestPrefix = null;
				for (const dir of entries) {
					if (slug.startsWith(dir) && (bestPrefix === null || dir.length > bestPrefix.length)) {
						bestPrefix = dir;
					}
				}

				// Branch provided but no name match — don't guess (bestPrefix is null).
				return bestPrefix;
			}

			// No branch: fallback to most-recently-modified dir with phase tracker files
			let newest = null;
			let newestMtime = 0;
			for (const dir of entries) {
				const dirPath = path.join(plansDir, dir);
				try {
					const files = readdirSync(dirPath);
					if (!files.some((f) => /^phase-[a-z]-tasks\.md$/.test(f))) continue;
					const mtime = statSync(dirPath).mtimeMs;
					if (mtime > newestMtime) {
						newestMtime = mtime;
						newest = dir;
					}
				} catch {
					// skip unreadable entries
				}
			}

			return newest;
		}

		/**
		 * Parse a single phase tracker file into phase/task state.
		 *
		 * Completion is taken from the count-format Status table when one exists
		 * (the first N tasks are complete); otherwise from the union of the CP0 log
		 * and the item-format Status table.
		 *
		 * @param {string} filePath
		 * @param {boolean} isRecentlyActive - whether the world had recent activity
		 * @param {{ workingMarked: boolean }} state - mutable flag shared across phases;
		 *   set to true once the first incomplete task has been seen
		 * @returns {{ id: string, name: string, status: string, tasks: Array }|null}
		 *   null when the file cannot be read or defines no tasks
		 */
		function parseTrackerFile(filePath, isRecentlyActive, state) {
			let content;
			try {
				content = readFileSync(filePath, 'utf8');
			} catch {
				return null;
			}

			const fm = parseFrontmatter(content);

			// Phase ID: frontmatter "phase" field or filename "phase-X-tasks.md"
			const phaseId =
				fm.phase ||
				path.basename(filePath).match(/^phase-([a-z])-tasks\.md$/)?.[1] ||
				'?';

			const phaseName = `Phase ${phaseId.toUpperCase()}`;

			const taskDefs = extractTaskDefs(content);
			if (taskDefs.length === 0) return null;

			// Collect completions from all sources
			const cp0Completed = extractCp0Completed(content);
			const itemTableCompleted = extractItemTableCompleted(content);
			const doneCount = extractDoneCount(content);

			// Merge CP0 log + item-table; count-format overrides if present
			const mergedCompleted = new Set([...cp0Completed, ...itemTableCompleted]);

			const tasks = taskDefs.map((t, i) => {
				const isComplete =
					doneCount !== null
						? i < doneCount // count format is authoritative
						: mergedCompleted.has(t.id);

				if (isComplete) return { id: t.id, name: t.name, status: 'complete' };

				// First pending task across all phases = candidate for "working".
				// The flag is set even when the world is not recently active, so at
				// most one task is ever considered.
				if (!state.workingMarked) {
					state.workingMarked = true;
					return {
						id: t.id,
						name: t.name,
						status: isRecentlyActive ? 'working' : 'pending',
					};
				}

				return { id: t.id, name: t.name, status: 'pending' };
			});

			const allComplete = tasks.every((t) => t.status === 'complete');
			const anyWorking = tasks.some((t) => t.status === 'working');
			const phaseStatus = allComplete ? 'complete' : anyWorking ? 'working' : 'pending';

			return { id: phaseId, name: phaseName, status: phaseStatus, tasks };
		}

		/**
		 * Build the plan-progress summary for a world's git checkout.
		 *
		 * Resolves the feature directory under docs/plans/, parses every
		 * phase-<letter>-tasks.md tracker in it (sorted by file name) and returns
		 * the phases that produced a result.
		 *
		 * @param {string} repoPath - absolute path to the git checkout
		 * @param {string|null} branch - current branch name (e.g. "feat/foo-phase-a")
		 * @param {{ lastActivityAtMs?: number|null }} [opts]
		 * @returns {{ feature: string, phases: Array }|null}
		 *   null when no plan tracker is found (caller falls back to legacy bar)
		 */
		export function readPlanProgress(repoPath, branch, { lastActivityAtMs = null } = {}) {
			const feature = resolveFeatureSlug(repoPath, branch);
			if (!feature) {
				return null;
			}

			const featureDir = path.join(repoPath, 'docs', 'plans', feature);
			const trackerPattern = /^phase-[a-z]-tasks\.md$/;

			let trackerFiles;
			try {
				const names = readdirSync(featureDir);
				trackerFiles = names.filter((name) => trackerPattern.test(name)).sort();
			} catch {
				return null;
			}
			if (trackerFiles.length === 0) {
				return null;
			}

			// "Recently active" requires a known activity timestamp inside the window.
			const isRecentlyActive =
				lastActivityAtMs != null && Date.now() - lastActivityAtMs <= WORKING_THRESHOLD_MS;

			// Shared across phases so only the first incomplete task overall is a
			// candidate for "working".
			const shared = { workingMarked: false };

			const phases = [];
			for (const name of trackerFiles) {
				const phase = parseTrackerFile(path.join(featureDir, name), isRecentlyActive, shared);
				if (phase) {
					phases.push(phase);
				}
			}

			return phases.length === 0 ? null : { feature, phases };
		}

-573

host-cp/src/planning-sessions.mjs

		// planning-sessions — host-cp surface for creating and inspecting in-flight
		// planning sessions stored under world_id = PLANNING_WORLD_ID ('_planning').
		//
		// Formalises what the plan-chat-spa dev substrate does ad hoc:
		//
		// createPlanningSession({ actorId, pool })
		// Seeds a session with one 'system' chunk so the Electric shape subscriber
		// gets a non-empty initial response on its first long-poll cycle. Also
		// INSERTs a row into the planning_sessions sidecar table inside the same
		// transaction so no partial state can exist (chunk written, no metadata row).
		// Returns the allocated world_id, session_id, and the inserted seed chunk.
		//
		// loadPlanningSession({ pool, sessionId })
		// Lightweight metadata read: chunk count, first/last timestamps, first
		// operator content (for title derivation). SPA still streams live chunks
		// via the existing /v1/shape proxy — this is metadata-only.
		//
		// recordPlanningSession({ pool, sessionId, actorId, summary })
		// UPSERT into planning_sessions. Used by createPlanningSession (wrapped in
		// a transaction) and later to update the summary as the session evolves.
		//
		// setCrystallizeStatus({ pool, sessionId, status, worldId })
		// UPDATE planning_sessions.crystallize_status + crystallized_world_id.
		// Throws if status is not in PLANNING_SESSION_STATUSES.
		//
		// listPlanningSessions({ pool, actorId, limit })
		// SELECT rows for actorId, ordered created_at DESC. Returns array.
		//
		// None of these functions call validateChunkInput — that's for the public POST
		// surface. INSERTs here are built directly against the chunks column list.
		//
		// Pool errors surface loudly (never swallowed) so the caller sees the full
		// pg error message and can diagnose connectivity or constraint failures.

		import { randomUUID } from 'node:crypto';
		import { PLANNING_WORLD_ID, PLANNING_SESSION_STATUSES } from '@olam/chunks/schema';

		/**
		 * Insert or update the planning_sessions row for a session.
		 *
		 * A new session_id inserts a row. An existing one has its summary replaced
		 * and updated_at refreshed; no other columns are written except
		 * linear_issue_id, which keeps any value already stored (COALESCE) and is
		 * only filled in when the stored value is NULL.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — anything exposing a pg-style `.query(sql, params)`
		 * @param {string} opts.sessionId
		 * @param {string} opts.actorId
		 * @param {string | null} [opts.summary]
		 * @param {string | null} [opts.linearIssueId] — Linear issue driving this
		 *   session; a later call passing null never clears a recorded id.
		 * @returns {Promise<void>}
		 */
		export async function recordPlanningSession({ pool, sessionId, actorId, summary = null, linearIssueId = null }) {
			const upsertSql = `INSERT INTO planning_sessions (session_id, actor_id, summary, linear_issue_id)
		VALUES ($1, $2, $3, $4)
		ON CONFLICT (session_id) DO UPDATE
		SET summary = EXCLUDED.summary,
		updated_at = NOW(),
		linear_issue_id = COALESCE(planning_sessions.linear_issue_id, EXCLUDED.linear_issue_id)`;
			const params = [sessionId, actorId, summary, linearIssueId];
			await pool.query(upsertSql, params);
		}

		/**
		 * Look up the active planning session for a Linear issue.
		 *
		 * "Active" means `archived_at IS NULL`; when several rows qualify, the most
		 * recently created one is returned. No query is issued when the issue id is
		 * missing.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool
		 * @param {string | null | undefined} opts.linearIssueId
		 * @returns {Promise<string | null>} the session_id, or null when none is active
		 */
		export async function findActiveLinearSession({ pool, linearIssueId }) {
			if (!linearIssueId) {
				return null;
			}

			const lookupSql = `SELECT session_id FROM planning_sessions
		WHERE linear_issue_id = $1 AND archived_at IS NULL
		ORDER BY created_at DESC
		LIMIT 1`;
			const result = await pool.query(lookupSql, [linearIssueId]);

			// Tolerate a result without rows, or with an empty row list.
			const newest = result.rows?.[0];
			return newest?.session_id ?? null;
		}

		/**
		 * Archive every still-active planning session of a Linear issue.
		 *
		 * Stamps `archived_at` (and `updated_at`) on rows whose `archived_at` is
		 * NULL, so already-archived rows are left alone and a repeated call changes
		 * nothing. No query is issued when the issue id is missing.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool
		 * @param {string | null | undefined} opts.linearIssueId
		 * @returns {Promise<number>} number of sessions archived (0 when the result
		 *   carries no rowCount)
		 */
		export async function archiveLinearSession({ pool, linearIssueId }) {
			if (!linearIssueId) {
				return 0;
			}

			const archiveSql = `UPDATE planning_sessions
		SET archived_at = NOW(), updated_at = NOW()
		WHERE linear_issue_id = $1 AND archived_at IS NULL`;
			const outcome = await pool.query(archiveSql, [linearIssueId]);
			return outcome?.rowCount ?? 0;
		}

		/**
		 * Create a new in-flight planning session under world_id='_planning'.
		 *
		 * Allocates a fresh session_id (UUID v4) and, inside a single transaction
		 * when the pool supports `.connect()`:
		 *   1. INSERTs a seed chunk (actor_type='system', seq=0) so the Electric shape
		 *      subscriber receives a non-empty initial long-poll response.
		 *   2. INSERTs a planning_sessions sidecar row (via recordPlanningSession) so
		 *      listPlanningSessions can return it immediately.
		 *
		 * Transaction guarantee: both INSERTs succeed or both roll back. A chunk
		 * written without a corresponding planning_sessions row is the partial-state
		 * bug this transaction prevents.
		 *
		 * Error contract: the error thrown is always the one that caused the
		 * failure. If the ROLLBACK itself also fails, that secondary failure is
		 * logged and the original error is still the one thrown.
		 *
		 * @param {object} opts
		 * @param {string} opts.actorId — actor_id to attribute the seed chunk to
		 *   (typically 'system' or the host-cp service id)
		 * @param {object} opts.pool — pg.Pool-compatible object with .query() and
		 *   optionally .connect() for transactional clients. Pools without
		 *   .connect() (test stubs) run both statements sequentially on .query().
		 * @returns {Promise<{
		 *   world_id: string,
		 *   session_id: string,
		 *   seed_chunk: {
		 *     world_id: string, session_id: string, message_id: string, seq: number,
		 *     actor_id: string, actor_type: string, role: string, chunk: string,
		 *     chunk_type: string,
		 *   },
		 * }>}
		 */
		export async function createPlanningSession({ actorId, pool }) {
			const sessionId = randomUUID();
			const messageId = randomUUID();
			const seq = 0;
			const actorType = 'system';
			const role = 'system';
			const chunk = 'Planning session created.';
			const chunkType = 'text';

			// Shared by the transactional and the flat path so the two cannot drift.
			const insertChunkSql = `INSERT INTO chunks
		(world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`;
			const insertChunkParams = [PLANNING_WORLD_ID, sessionId, messageId, seq, actorId, actorType, role, chunk, chunkType];

			// Use a transactional client when pool.connect() is available (real pg.Pool).
			// Test stubs that only implement .query() fall through to the flat path.
			if (typeof pool.connect === 'function') {
				const client = await pool.connect();
				try {
					await client.query('BEGIN');
					await client.query(insertChunkSql, insertChunkParams);
					await recordPlanningSession({ pool: client, sessionId, actorId, summary: null });
					await client.query('COMMIT');
				} catch (err) {
					// A failing ROLLBACK (e.g. on a broken connection) must not replace
					// the error that actually caused the failure.
					try {
						await client.query('ROLLBACK');
					} catch (rollbackErr) {
						console.error('[planning-sessions] ROLLBACK failed:', rollbackErr?.message ?? rollbackErr);
					}
					throw err;
				} finally {
					client.release();
				}
			} else {
				// Flat path for test stubs: queries run sequentially on the stub pool.
				await pool.query(insertChunkSql, insertChunkParams);
				await recordPlanningSession({ pool, sessionId, actorId, summary: null });
			}

			return {
				world_id: PLANNING_WORLD_ID,
				session_id: sessionId,
				seed_chunk: {
					world_id: PLANNING_WORLD_ID,
					session_id: sessionId,
					message_id: messageId,
					seq,
					actor_id: actorId,
					actor_type: actorType,
					role,
					chunk,
					chunk_type: chunkType,
				},
			};
		}

		/**
		 * Persist a new crystallize_status for one planning session. The same
		 * UPDATE also writes crystallized_world_id and bumps updated_at.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — object exposing .query(sql, params)
		 * @param {string} opts.sessionId — session_id of the row to update
		 * @param {string} opts.status — must be a member of PLANNING_SESSION_STATUSES
		 * @param {string | null} [opts.worldId=null] — written to crystallized_world_id;
		 * documented as required when status='crystallized' (not enforced here)
		 * @returns {Promise<void>}
		 * @throws {Error} when status is not in PLANNING_SESSION_STATUSES; no query
		 * is issued in that case
		 */
		export async function setCrystallizeStatus({ pool, sessionId, status, worldId = null }) {
		  const statusIsKnown = PLANNING_SESSION_STATUSES.includes(status);
		  if (!statusIsKnown) {
		    const allowed = PLANNING_SESSION_STATUSES.join(', ');
		    throw new Error(`setCrystallizeStatus: invalid status "${status}"; must be one of ${allowed}`);
		  }

		  const updateSql = `UPDATE planning_sessions
		SET crystallize_status = $2,
		crystallized_world_id = $3,
		updated_at = NOW()
		WHERE session_id = $1`;
		  const updateParams = [sessionId, status, worldId];
		  await pool.query(updateSql, updateParams);
		}

		/**
		 * Create a new multi-turn DISPATCH session (multi-turn-cloud-sandbox-dispatch
		 * Phase A2 — distinct from createPlanningSession, which is for planning-flow
		 * crystallization sessions under world_id='_planning').
		 *
		 * INSERTs a planning_sessions row with session_type='dispatch', the
		 * caller-supplied world_id and the supplied budget_usd_cap /
		 * allow_unpriced_models values, then returns the session_id.
		 *
		 * No seed chunk is written: dispatch sessions accumulate chunks from the
		 * agent runtime (via /v1/chunks), so subscribers can wait for the first
		 * real agent chunk.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — object exposing .query(sql, params)
		 * @param {string} opts.actorId — non-empty string
		 * @param {string} opts.worldId — non-empty string; operator-supplied dispatch
		 * target world (NOT the '_planning' sentinel used by createPlanningSession)
		 * @param {number | null} [opts.budgetUsdCap=null] — per-session budget cap;
		 * null = uncapped. The row is recorded exactly as supplied; any env-level
		 * default is applied elsewhere, at check time.
		 * @param {boolean} [opts.allowUnpricedModels=false] — opt the session into
		 * running models that have no pricing entry
		 * @param {string | null} [opts.sessionId=null] — optional caller-chosen
		 * session_id; a UUID is generated when null or undefined. If a row with
		 * this id already exists the INSERT is a no-op (ON CONFLICT DO NOTHING)
		 * and the id is still returned.
		 * @returns {Promise<{ session_id: string }>}
		 * @throws {Error} when actorId or worldId is missing or not a string
		 */
		export async function createDispatchSession({
		  pool,
		  actorId,
		  worldId,
		  budgetUsdCap = null,
		  allowUnpricedModels = false,
		  sessionId: providedSessionId = null,
		}) {
		  if (!actorId || typeof actorId !== 'string') {
		    throw new Error('createDispatchSession: actorId required');
		  }
		  if (!worldId || typeof worldId !== 'string') {
		    throw new Error('createDispatchSession: worldId required');
		  }
		  // A6 (Decision 9 always-on threading): callers MAY supply session_id to
		  // upsert an existing planning_sessions row (e.g. /api/cloud-dispatch
		  // pre-creating the thread before forwarding to plan-DO). When omitted,
		  // we generate a UUID. ON CONFLICT DO NOTHING handles the race where
		  // the SPA called /v1/sessions/create concurrently AND server-side
		  // cloud-dispatch tried to pre-create the same row.
		  const sessionId = providedSessionId ?? randomUUID();
		  await pool.query(
		    `INSERT INTO planning_sessions
		(session_id, actor_id, session_type, world_id, budget_usd_cap, allow_unpriced_models)
		VALUES ($1, $2, 'dispatch', $3, $4, $5)
		ON CONFLICT (session_id) DO NOTHING`,
		    [sessionId, actorId, worldId, budgetUsdCap, allowUnpricedModels],
		  );
		  return { session_id: sessionId };
		}

		/**
		 * Try to claim the in-flight turn lock of a dispatch session
		 * (multi-turn-cloud-sandbox-dispatch Phase A3 — Decision 4 + T5 mitigation).
		 *
		 * The claim is a single UPDATE guarded by `in_flight_turn_id IS NULL`, so
		 * the test and the set happen in one statement. On success the statement
		 * also stamps in_flight_turn_started_at and last_turn_at with NOW(). The
		 * lock counts as claimed when RETURNING yields at least one row; callers
		 * are expected to treat `false` as "already held" (409).
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — object exposing .query(sql, params)
		 * @param {string} opts.sessionId — dispatch session to lock
		 * @param {string} opts.turnId — operator-or-server-generated turn UUID
		 * @returns {Promise<boolean>} true if lock claimed, false otherwise
		 */
		export async function claimDispatchTurnLock({ pool, sessionId, turnId }) {
		  const claimSql = `UPDATE planning_sessions
		SET in_flight_turn_id = $1,
		in_flight_turn_started_at = NOW(),
		last_turn_at = NOW()
		WHERE session_id = $2
		AND session_type = 'dispatch'
		AND in_flight_turn_id IS NULL
		RETURNING session_id`;
		  const outcome = await pool.query(claimSql, [turnId, sessionId]);
		  // A result without a rows array is treated as "nothing claimed".
		  const claimedRowCount = outcome.rows?.length ?? 0;
		  return claimedRowCount > 0;
		}

		/**
		 * Release the in-flight turn lock of a dispatch session, intended to run
		 * after a dispatch finishes (success OR failure).
		 *
		 * Resets in_flight_turn_id and in_flight_turn_started_at to NULL. The UPDATE
		 * is not conditioned on which turn currently holds the lock.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — object exposing .query(sql, params)
		 * @param {string} opts.sessionId — dispatch session to unlock
		 * @returns {Promise<void>}
		 */
		export async function clearDispatchTurnLock({ pool, sessionId }) {
		  const releaseSql = `UPDATE planning_sessions
		SET in_flight_turn_id = NULL,
		in_flight_turn_started_at = NULL
		WHERE session_id = $1
		AND session_type = 'dispatch'`;
		  const releaseParams = [sessionId];
		  await pool.query(releaseSql, releaseParams);
		}

		/**
		 * Halt a dispatch session — the operator-driven "block next turn" state (T13).
		 *
		 * One UPDATE sets halted_at to NOW() and clears in_flight_turn_id and
		 * in_flight_turn_started_at. This function only touches the database row;
		 * it does not stop a running container, which is why the UX is "Block next
		 * turn" rather than "Stop" (Plan A Phase C C6). reactivateDispatchSession
		 * clears halted_at again.
		 *
		 * The UPDATE is scoped by actor_id for ownership isolation.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — object exposing .query(sql, params)
		 * @param {string} opts.sessionId
		 * @param {string} opts.actorId — must match the row's actor_id
		 * @returns {Promise<boolean>} true if a session row was updated; false if
		 * the session_id was not found / not owned by actorId.
		 */
		export async function haltDispatchSession({ pool, sessionId, actorId }) {
		  const haltSql = `UPDATE planning_sessions
		SET halted_at = NOW(),
		in_flight_turn_id = NULL,
		in_flight_turn_started_at = NULL
		WHERE session_id = $1
		AND session_type = 'dispatch'
		AND actor_id = $2
		RETURNING session_id`;
		  const outcome = await pool.query(haltSql, [sessionId, actorId]);
		  const haltedRowCount = outcome.rows?.length ?? 0;
		  return haltedRowCount > 0;
		}

		/**
		 * Reactivate a halted dispatch session by resetting halted_at to NULL, so
		 * that later /v1/dispatch-turn calls are no longer blocked by the halt.
		 * Clearing an already-NULL halted_at leaves the row unchanged but still
		 * counts as an update (the row is returned).
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — object exposing .query(sql, params)
		 * @param {string} opts.sessionId
		 * @param {string} opts.actorId — must match the row's actor_id
		 * @returns {Promise<boolean>} true if a session row was updated; false if
		 * the session_id was not found / not owned by actorId.
		 */
		export async function reactivateDispatchSession({ pool, sessionId, actorId }) {
		  const reactivateSql = `UPDATE planning_sessions
		SET halted_at = NULL
		WHERE session_id = $1
		AND session_type = 'dispatch'
		AND actor_id = $2
		RETURNING session_id`;
		  const outcome = await pool.query(reactivateSql, [sessionId, actorId]);
		  const updatedRowCount = outcome.rows?.length ?? 0;
		  return updatedRowCount > 0;
		}

		/**
		 * Read a dispatch session by session_id, scoped to the caller's actor_id
		 * (ownership check). Returns the session metadata needed for the budget
		 * check and plan-DO forward, or null when no matching row exists.
		 *
		 * Numeric columns are normalised with Number(): total_usd defaults to 0
		 * when absent, and budget_usd_cap stays null when the column is NULL so
		 * "uncapped" is distinguishable from a cap of 0.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — object exposing .query(sql, params)
		 * @param {string} opts.sessionId
		 * @param {string} opts.actorId
		 * @returns {Promise<null | {
		 *   session_id: string,
		 *   world_id: string | null,
		 *   actor_id: string,
		 *   total_usd: number,
		 *   budget_usd_cap: number | null,
		 *   allow_unpriced_models: boolean,
		 *   halted_at: string | null,
		 * }>}
		 */
		export async function getDispatchSession({ pool, sessionId, actorId }) {
		  const result = await pool.query(
		    `SELECT session_id, world_id, actor_id,
		total_usd, budget_usd_cap, allow_unpriced_models,
		halted_at
		FROM planning_sessions
		WHERE session_id = $1
		AND session_type = 'dispatch'
		AND actor_id = $2`,
		    [sessionId, actorId],
		  );
		  const row = result.rows?.[0];
		  if (!row) return null;

		  const cap = row.budget_usd_cap;
		  return {
		    session_id: row.session_id,
		    world_id: row.world_id ?? null,
		    actor_id: row.actor_id,
		    total_usd: Number(row.total_usd ?? 0),
		    // Keep NULL as null rather than letting Number(null) collapse it to 0.
		    budget_usd_cap: cap === null || cap === undefined ? null : Number(cap),
		    allow_unpriced_models: Boolean(row.allow_unpriced_models),
		    halted_at: row.halted_at ?? null,
		  };
		}

		/**
		 * Return the planning_sessions rows owned by an actor, newest first
		 * (created_at DESC), capped at `limit` rows.
		 *
		 * The query filters on actor_id only; it applies no session_type filter.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — object exposing .query(sql, params)
		 * @param {string} opts.actorId
		 * @param {number} [opts.limit=50] — passed to LIMIT
		 * @returns {Promise<Array<{
		 *   session_id: string,
		 *   summary: string | null,
		 *   crystallize_status: string,
		 *   crystallized_world_id: string | null,
		 *   created_at: string,
		 *   updated_at: string,
		 * }>>} the rows array of the query result, unmodified
		 */
		export async function listPlanningSessions({ pool, actorId, limit = 50 }) {
		  const listSql = `SELECT session_id, summary, crystallize_status, crystallized_world_id,
		created_at, updated_at
		FROM planning_sessions
		WHERE actor_id = $1
		ORDER BY created_at DESC
		LIMIT $2`;
		  const queryResult = await pool.query(listSql, [actorId, limit]);
		  return queryResult.rows;
		}

		/**
		 * Return an actor's multi-turn DISPATCH sessions, most recently active
		 * first: last_turn_at DESC with NULLs last, then created_at DESC. Archived
		 * sessions (archived_at set) are excluded.
		 *
		 * Unlike listPlanningSessions, this restricts rows to
		 * `session_type='dispatch'` and projects the multi-turn columns
		 * (total_usd, in_flight_turn_id, halted_at, …) that the SPA's
		 * SessionsListView (Phase C C3) renders.
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — object exposing .query(sql, params)
		 * @param {string} opts.actorId
		 * @param {number} [opts.limit=50] — passed to LIMIT
		 * @returns {Promise<Array<{
		 *   session_id: string,
		 *   world_id: string | null,
		 *   total_usd: string,
		 *   budget_usd_cap: string | null,
		 *   in_flight_turn_id: string | null,
		 *   halted_at: string | null,
		 *   last_turn_at: string | null,
		 *   created_at: string,
		 *   summary: string | null,
		 * }>>} the rows array of the query result, unmodified
		 */
		export async function listDispatchSessions({ pool, actorId, limit = 50 }) {
		  const listSql = `SELECT session_id, world_id,
		total_usd, budget_usd_cap,
		in_flight_turn_id, halted_at,
		last_turn_at, created_at,
		summary
		FROM planning_sessions
		WHERE actor_id = $1
		AND session_type = 'dispatch'
		AND archived_at IS NULL
		ORDER BY last_turn_at DESC NULLS LAST, created_at DESC
		LIMIT $2`;
		  const queryResult = await pool.query(listSql, [actorId, limit]);
		  return queryResult.rows;
		}

		/**
		 * Load lightweight metadata for an existing planning session.
		 *
		 * Runs up to two queries against `chunks`, both scoped to
		 * world_id=PLANNING_WORLD_ID AND session_id=<sessionId>:
		 * 1. Aggregate: chunk_count, first_chunk_at, last_chunk_at.
		 * 2. Only when at least one chunk exists: the earliest chunk (by
		 *    created_at) with actor_type='operator', used by the SPA for session
		 *    title derivation.
		 *
		 * This function is metadata-only; it does not subscribe to any Electric
		 * shape (the SPA streams live chunks via the /v1/shape proxy).
		 *
		 * @param {object} opts
		 * @param {object} opts.pool — pg.Pool-compatible object with a .query(sql, params) method
		 * @param {string} opts.sessionId — UUID of the planning session to inspect
		 * @returns {Promise<{
		 *   world_id: string,
		 *   session_id: string,
		 *   exists: boolean,
		 *   chunk_count: number,
		 *   first_chunk_at: string | null,
		 *   last_chunk_at: string | null,
		 *   first_operator_content: string | null,
		 * }>} `exists` is true when chunk_count > 0; the timestamp fields and
		 * first_operator_content are null when it is false
		 */
		export async function loadPlanningSession({ pool, sessionId }) {
		  // Each query gets its own params array.
		  const scopeParams = () => [PLANNING_WORLD_ID, sessionId];

		  const aggregateSql = `SELECT COUNT(*) AS chunk_count,
		MIN(created_at) AS first_chunk_at,
		MAX(created_at) AS last_chunk_at
		FROM chunks
		WHERE world_id = $1 AND session_id = $2`;
		  const aggregateResult = await pool.query(aggregateSql, scopeParams());
		  const aggregate = aggregateResult.rows[0];
		  // COUNT(*) may arrive as a string; normalise before comparing.
		  const chunkCount = Number(aggregate.chunk_count);

		  const summary = {
		    world_id: PLANNING_WORLD_ID,
		    session_id: sessionId,
		    exists: chunkCount > 0,
		    chunk_count: chunkCount,
		    first_chunk_at: null,
		    last_chunk_at: null,
		    first_operator_content: null,
		  };
		  if (!summary.exists) return summary;

		  const firstOperatorSql = `SELECT chunk
		FROM chunks
		WHERE world_id = $1 AND session_id = $2 AND actor_type = 'operator'
		ORDER BY created_at ASC
		LIMIT 1`;
		  const operatorResult = await pool.query(firstOperatorSql, scopeParams());
		  if (operatorResult.rows.length > 0) {
		    summary.first_operator_content = operatorResult.rows[0].chunk;
		  }
		  summary.first_chunk_at = aggregate.first_chunk_at;
		  summary.last_chunk_at = aggregate.last_chunk_at;
		  return summary;
		}

-396

host-cp/src/port-bridge-manager.mjs

		/**
		* port-bridge-manager.mjs
		* Manages socat sidecar containers that bridge host port → world devbox port.
		* Dual-mode: Docker HTTP API (container) vs docker CLI (bare-node).
		*/
		import { spawnSync } from 'node:child_process';
		import fs from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';

		// Docker endpoint. The 'docker-cli' sentinel (used when DOCKER_HOST is
		// unset) makes dockerApiBase() return null, which selects the docker CLI
		// code paths instead of the HTTP API.
		const DOCKER_HOST = process.env.DOCKER_HOST ?? 'docker-cli';
		// Image used for bridge containers (untagged form for run/create).
		const SOCAT_IMAGE = 'alpine/socat';
		// Explicitly tagged form passed to `docker pull` by pullSocatViaCli().
		const SOCAT_IMAGE_TAGGED = 'alpine/socat:latest';
		// Inclusive host-port range scanned by allocateHostPort().
		const HOST_PORT_MIN = 25000;
		const HOST_PORT_MAX = 25999;
		// Container ports exposePort() refuses to bridge.
		const INFRA_PORTS = new Set([8080, 7681, 7682]);

		// Path of the JSON file the registry is persisted to; replaceable at
		// runtime through configure({ bridgesPath }).
		let BRIDGES_PATH =
		process.env.OLAM_PORT_BRIDGES_PATH ??
		path.join(os.homedir(), '.olam', 'port-bridges.json');
		// Host address used for port bindings and for the URLs returned to callers;
		// replaceable through configure({ hostIp }).
		let HOST_IP = '127.0.0.1';

		// key: `${worldId}:${containerPort}` → { worldId, containerPort, hostPort, containerId, containerName }
		const registry = new Map();

		/**
		 * Override the module's runtime settings.
		 *
		 * @param {object} opts
		 * @param {string} [opts.bridgesPath] — new state-file path. When truthy and
		 * different from the current one, it replaces BRIDGES_PATH and loadState()
		 * is run against the new file.
		 * @param {string} [opts.hostIp] — new host address; ignored when falsy
		 */
		export function configure({ bridgesPath, hostIp }) {
		  const pathChanged = Boolean(bridgesPath) && bridgesPath !== BRIDGES_PATH;
		  if (pathChanged) {
		    BRIDGES_PATH = bridgesPath;
		    loadState();
		  }

		  if (hostIp) {
		    HOST_IP = hostIp;
		  }
		}

		/**
		 * Build the registry key for a bridge: "<worldId>:<containerPort>".
		 *
		 * @param {string} worldId
		 * @param {number} containerPort
		 * @returns {string}
		 */
		function bridgeKey(worldId, containerPort) {
		  const registryKey = `${worldId}:${containerPort}`;
		  return registryKey;
		}

		/**
		 * Build the Docker container name of a bridge:
		 * "olam-<worldId>-bridge-<containerPort>".
		 *
		 * @param {string} worldId
		 * @param {number} containerPort
		 * @returns {string}
		 */
		function bridgeContainerName(worldId, containerPort) {
		  const containerName = `olam-${worldId}-bridge-${containerPort}`;
		  return containerName;
		}

		/**
		 * Merge the bridges persisted at BRIDGES_PATH into the in-memory registry.
		 *
		 * Best-effort: a missing file is a silent no-op, and any read/parse failure
		 * is logged with console.error and otherwise ignored. Existing registry
		 * entries are kept; entries from the file overwrite same-key entries.
		 *
		 * Only a JSON object of object-valued entries is accepted. Arrays and
		 * null / primitive entries are skipped, because the rest of the module
		 * reads properties such as `hostPort` and `worldId` off every registry
		 * value and would throw on a null entry.
		 */
		function loadState() {
		  try {
		    if (!fs.existsSync(BRIDGES_PATH)) return;
		    const raw = fs.readFileSync(BRIDGES_PATH, 'utf-8');
		    const data = JSON.parse(raw);
		    if (!data || typeof data !== 'object' || Array.isArray(data)) return;
		    for (const [key, entry] of Object.entries(data)) {
		      if (!entry || typeof entry !== 'object') continue;
		      registry.set(key, entry);
		    }
		  } catch (err) {
		    console.error(`port-bridge-manager: loadState failed: ${err.message}`);
		  }
		}

		/**
		 * Persist the registry to BRIDGES_PATH as pretty-printed JSON.
		 *
		 * The parent directory is created if needed. The data is written to a
		 * sibling temp file (suffixed with the pid and a timestamp) and then
		 * renamed over BRIDGES_PATH. Failures are logged with console.error and
		 * not rethrown.
		 */
		function saveState() {
		  try {
		    fs.mkdirSync(path.dirname(BRIDGES_PATH), { recursive: true });

		    const snapshot = {};
		    registry.forEach((entry, key) => {
		      snapshot[key] = entry;
		    });

		    const scratchFile = `${BRIDGES_PATH}.tmp-${process.pid}-${Date.now()}`;
		    const serialized = JSON.stringify(snapshot, null, 2);
		    fs.writeFileSync(scratchFile, serialized, 'utf-8');
		    fs.renameSync(scratchFile, BRIDGES_PATH);
		  } catch (failure) {
		    console.error(`port-bridge-manager: saveState failed: ${failure.message}`);
		  }
		}

		/**
		 * Pick the lowest host port in [HOST_PORT_MIN, HOST_PORT_MAX] that no
		 * registry entry currently uses.
		 *
		 * @returns {number | null} the free port, or null when the range is exhausted
		 */
		function allocateHostPort() {
		  const taken = new Set();
		  for (const bridge of registry.values()) {
		    taken.add(bridge.hostPort);
		  }

		  let candidate = HOST_PORT_MIN;
		  while (candidate <= HOST_PORT_MAX) {
		    if (!taken.has(candidate)) return candidate;
		    candidate += 1;
		  }
		  return null;
		}

		/**
		 * Resolve the Docker HTTP API base URL from DOCKER_HOST.
		 *
		 * @returns {Promise<string | null>} null in bare-node mode (DOCKER_HOST is
		 * the 'docker-cli' sentinel, so there is no socket-proxy HTTP API);
		 * otherwise DOCKER_HOST with a leading `tcp://` rewritten to `http://`
		 */
		async function dockerApiBase() {
		  if (DOCKER_HOST === 'docker-cli') {
		    return null;
		  }
		  return DOCKER_HOST.replace(/^tcp:\/\//, 'http://');
		}

		/**
		 * Detect whether a docker error message indicates the image is missing
		 * (and therefore a `docker pull` retry would help). Docker uses a handful
		 * of phrasings across CLI + HTTP API surfaces; the match is
		 * case-insensitive and looks for any one of them anywhere in the message.
		 *
		 * @param {string | null | undefined} message — stderr text or HTTP error body
		 * @returns {boolean} false for empty / missing messages
		 */
		function isImageMissingError(message) {
		  if (!message) return false;
		  // The alternatives must be separated by bare `|`; an escaped `\|` would
		  // match a literal pipe character and the pattern would never fire.
		  return /Unable to find image|pull access denied|manifest unknown|No such image|not found in (the )?(repository|registry)/i.test(
		    message,
		  );
		}

		/**
		 * Pull alpine/socat:latest via docker CLI. Used by the bare-node bridge
		 * create path's fallback retry. The pull is given a 60s timeout.
		 *
		 * @returns {{ok: boolean, stderr: string}} `ok` is true only when docker
		 * exited with status 0. `stderr` carries the first non-empty of: trimmed
		 * stderr, trimmed stdout, or the spawn error message (set when the docker
		 * binary could not be started or the timeout fired, in which case both
		 * output streams can be empty).
		 */
		function pullSocatViaCli() {
		  const r = spawnSync('docker', ['pull', SOCAT_IMAGE_TAGGED], {
		    encoding: 'utf-8',
		    timeout: 60_000,
		  });
		  const detail =
		    (r.stderr ?? '').trim() ||
		    (r.stdout ?? '').trim() ||
		    (r.error?.message ?? '');
		  return {
		    ok: r.status === 0,
		    stderr: detail,
		  };
		}

		/**
		 * Pull alpine/socat:latest via Docker HTTP API. Used by the container-mode
		 * bridge create path's fallback retry.
		 *
		 * The endpoint streams its progress as a sequence of JSON messages. The
		 * body is read to the end so the pull runs to completion, and then
		 * inspected: once streaming has begun, Docker reports a failed pull as an
		 * `error` message inside the body while the HTTP status stays 200, so the
		 * status code alone cannot be trusted.
		 *
		 * @param {string} apiBase — Docker HTTP API base URL
		 * @returns {Promise<{ok: boolean, stderr: string}>} never rejects; network
		 * errors and timeouts (60s) are reported as `{ ok: false }`
		 */
		async function pullSocatViaHttpApi(apiBase) {
		  try {
		    const resp = await fetch(
		      `${apiBase}/images/create?fromImage=${encodeURIComponent(SOCAT_IMAGE)}&tag=latest`,
		      { method: 'POST', signal: AbortSignal.timeout(60_000) },
		    );
		    if (!resp.ok) {
		      const body = await resp.text().catch(() => '');
		      return { ok: false, stderr: `pull failed: ${resp.status} ${body}` };
		    }
		    // Drain the streaming progress body — Docker only completes the pull
		    // when the response is consumed.
		    const progress = await resp.text();
		    const streamError = findPullStreamError(progress);
		    if (streamError !== null) {
		      return { ok: false, stderr: `pull failed: ${streamError}` };
		    }
		    return { ok: true, stderr: '' };
		  } catch (err) {
		    return { ok: false, stderr: err?.message ?? String(err) };
		  }
		}

		/**
		 * Scan a Docker pull progress stream (one JSON message per line) for an
		 * error message. Lines that are blank or not valid JSON are ignored.
		 *
		 * @param {string} progress — the fully drained response body
		 * @returns {string | null} the first error text found, or null when the
		 * stream reports no error
		 */
		function findPullStreamError(progress) {
		  for (const line of progress.split('\n')) {
		    const text = line.trim();
		    if (!text) continue;
		    let message;
		    try {
		      message = JSON.parse(text);
		    } catch {
		      continue;
		    }
		    if (message && typeof message === 'object' && (message.error || message.errorDetail)) {
		      return String(message.error ?? message.errorDetail?.message ?? 'unknown error');
		    }
		  }
		  return null;
		}

		/**
		 * Create and start a socat bridge container that listens on
		 * `containerPort` inside the world's network (`olam-<worldId>`) and
		 * forwards to the same port on the world's devbox (`olam-<worldId>-devbox`).
		 * The listener is published on `HOST_IP:hostPort`.
		 *
		 * Two modes, chosen by dockerApiBase():
		 * - bare-node (no API base): a single `docker run -d` via the docker CLI.
		 * - container mode: create + start through the Docker HTTP API. A 409 on
		 *   create (name already in use) is resolved by inspecting the existing
		 *   container and returning its id.
		 *
		 * In both modes a "missing image" failure triggers one pull of the socat
		 * image followed by one retry (issue #964 — the preflight in
		 * `olam services up` should normally have pulled it already).
		 *
		 * @param {string} worldId
		 * @param {number} containerPort
		 * @param {number} hostPort
		 * @returns {Promise<{containerId: string, pulledImage: boolean}>}
		 * `pulledImage` is true when the fallback pull was needed
		 * @throws {Error} when the pull fails, or when create / run / start fails
		 */
		async function createBridgeContainer(worldId, containerPort, hostPort) {
		  const name = bridgeContainerName(worldId, containerPort);
		  const networkName = `olam-${worldId}`;
		  const devboxName = `olam-${worldId}-devbox`;
		  // socat argv, shared by both modes so they cannot drift apart.
		  const socatArgs = [
		    `TCP-LISTEN:${containerPort},fork,reuseaddr`,
		    `TCP:${devboxName}:${containerPort}`,
		  ];

		  const apiBase = await dockerApiBase();

		  if (!apiBase) {
		    // bare-node: use docker CLI
		    const args = [
		      'run', '-d',
		      '--name', name,
		      '--network', networkName,
		      '-p', `${HOST_IP}:${hostPort}:${containerPort}`,
		      '--label', `olam.world.id=${worldId}`,
		      '--label', 'olam.role=server-bridge',
		      '--restart', 'unless-stopped',
		      SOCAT_IMAGE,
		      ...socatArgs,
		    ];
		    const runDocker = () => spawnSync('docker', args, { encoding: 'utf-8', timeout: 10000 });
		    let result = runDocker();
		    let pulledImage = false;

		    // Issue #964 fallback: if docker run failed because the image is missing,
		    // pull it and retry once. This covers hosts where `olam services up`
		    // didn't run the preflight (e.g. fresh install, docker restart pruned
		    // the image, etc.).
		    if (result.status !== 0 && isImageMissingError(result.stderr ?? '')) {
		      const pull = pullSocatViaCli();
		      if (!pull.ok) {
		        throw new Error(
		          `alpine/socat image missing and pull failed: ${pull.stderr || 'unknown error'}`,
		        );
		      }
		      pulledImage = true;
		      result = runDocker();
		    }

		    if (result.status !== 0) {
		      throw new Error(result.stderr?.trim() || 'docker run failed');
		    }
		    // `docker run -d` prints the new container id on stdout.
		    return { containerId: result.stdout.trim(), pulledImage };
		  }

		  // container mode: Docker HTTP API
		  const createBody = {
		    Image: SOCAT_IMAGE,
		    Cmd: socatArgs,
		    Labels: {
		      'olam.world.id': worldId,
		      'olam.role': 'server-bridge',
		    },
		    HostConfig: {
		      NetworkMode: networkName,
		      PortBindings: {
		        [`${containerPort}/tcp`]: [{ HostIp: HOST_IP, HostPort: String(hostPort) }],
		      },
		      RestartPolicy: { Name: 'unless-stopped' },
		    },
		  };

		  const doCreate = () => fetch(
		    `${apiBase}/containers/create?name=${encodeURIComponent(name)}`,
		    {
		      method: 'POST',
		      headers: { 'Content-Type': 'application/json' },
		      body: JSON.stringify(createBody),
		      signal: AbortSignal.timeout(10000),
		    },
		  );

		  let createResp = await doCreate();
		  let pulledImage = false;

		  // Issue #964 fallback for HTTP API path. Docker returns 404 with a body
		  // like {"message":"No such image: alpine/socat:latest"} when the image
		  // is missing.
		  if (!createResp.ok && createResp.status === 404) {
		    const body = await createResp.text().catch(() => '');
		    if (isImageMissingError(body)) {
		      const pull = await pullSocatViaHttpApi(apiBase);
		      if (!pull.ok) {
		        throw new Error(
		          `alpine/socat image missing and pull failed: ${pull.stderr || 'unknown error'}`,
		        );
		      }
		      pulledImage = true;
		      createResp = await doCreate();
		    } else {
		      // Some other 404 (the body was already consumed above, so report it here).
		      throw new Error(`container create failed: 404 ${body}`);
		    }
		  }

		  if (!createResp.ok) {
		    const body = await createResp.text().catch(() => '');
		    // If container already exists (409), try to get its ID
		    if (createResp.status === 409) {
		      const inspectResp = await fetch(
		        `${apiBase}/containers/${encodeURIComponent(name)}/json`,
		        { signal: AbortSignal.timeout(3000) },
		      );
		      if (inspectResp.ok) {
		        const info = await inspectResp.json();
		        return { containerId: info.Id, pulledImage };
		      }
		    }
		    throw new Error(`container create failed: ${createResp.status} ${body}`);
		  }

		  const { Id: containerId } = await createResp.json();

		  const startResp = await fetch(`${apiBase}/containers/${encodeURIComponent(containerId)}/start`, {
		    method: 'POST',
		    signal: AbortSignal.timeout(5000),
		  });
		  // 304 is accepted alongside 2xx rather than treated as a start failure.
		  if (!startResp.ok && startResp.status !== 304) {
		    throw new Error(`container start failed: ${startResp.status}`);
		  }

		  return { containerId, pulledImage };
		}

		/**
		 * Force-remove a bridge container, addressing it by id when one is known
		 * and by name otherwise.
		 *
		 * Removal is best-effort in both modes: the docker CLI exit status is not
		 * inspected, and a rejected HTTP request is swallowed.
		 *
		 * @param {string} containerName — fallback identifier
		 * @param {string} [containerId] — preferred identifier; may be empty
		 * @returns {Promise<void>}
		 */
		async function removeBridgeContainer(containerName, containerId) {
		  const id = containerId || containerName;
		  const apiBase = await dockerApiBase();

		  if (!apiBase) {
		    // bare-node: docker CLI
		    spawnSync('docker', ['rm', '-f', id], { encoding: 'utf-8', timeout: 5000 });
		    return;
		  }

		  // Force remove (stop + delete in one call)
		  await fetch(`${apiBase}/containers/${encodeURIComponent(id)}?force=true`, {
		    method: 'DELETE',
		    signal: AbortSignal.timeout(5000),
		  }).catch(() => { /* best-effort */ });
		}

		/**
		 * Expose a world's container port on the host through a socat bridge.
		 *
		 * Idempotent per (worldId, containerPort): when the registry already holds
		 * a bridge for the pair, its details are returned and nothing is created.
		 * Otherwise a free host port is allocated, the bridge container is
		 * created, and the new entry is registered and persisted.
		 *
		 * @param {string} worldId
		 * @param {number} containerPort
		 * @returns {Promise<{hostPort: number, containerPort: number, url: string, containerId: string, pulledImage?: boolean}>}
		 * `pulledImage` is present (and true) only when the socat image had to be
		 * pulled while creating the container
		 * @throws {Error} when containerPort is in INFRA_PORTS, when no host port is
		 * free, or when container creation fails
		 */
		export async function exposePort(worldId, containerPort) {
		  if (INFRA_PORTS.has(containerPort)) {
		    throw new Error(`port ${containerPort} is reserved for infrastructure`);
		  }

		  const key = bridgeKey(worldId, containerPort);
		  const known = registry.get(key);
		  if (known) {
		    return {
		      hostPort: known.hostPort,
		      containerPort: known.containerPort,
		      url: `http://${HOST_IP}:${known.hostPort}`,
		      containerId: known.containerId,
		    };
		  }

		  const hostPort = allocateHostPort();
		  if (hostPort === null) {
		    throw new Error('no host ports available in range 25000–25999');
		  }

		  const containerName = bridgeContainerName(worldId, containerPort);
		  const created = await createBridgeContainer(worldId, containerPort, hostPort);
		  const containerId = created.containerId;

		  registry.set(key, { worldId, containerPort, hostPort, containerId, containerName });
		  saveState();

		  const bridge = { hostPort, containerPort, url: `http://${HOST_IP}:${hostPort}`, containerId };
		  // Only attach pulledImage when true so existing callers/tests don't see
		  // an unexpected key when the preflight succeeded.
		  if (created.pulledImage) {
		    bridge.pulledImage = true;
		  }
		  return bridge;
		}

		/**
		 * Tear down the port bridge registered for (worldId, containerPort).
		 *
		 * The registry entry is dropped and persisted first; the container is
		 * removed afterwards. Does nothing when no bridge is registered.
		 *
		 * @param {string} worldId
		 * @param {number} containerPort
		 * @returns {Promise<void>}
		 */
		export async function removePort(worldId, containerPort) {
		  const key = bridgeKey(worldId, containerPort);
		  const bridge = registry.get(key);
		  if (!bridge) {
		    return;
		  }

		  registry.delete(key);
		  saveState();

		  const { containerName, containerId } = bridge;
		  await removeBridgeContainer(containerName, containerId);
		}

		/**
		 * Remove every bridge that belongs to a world. Called on world destroy.
		 *
		 * Bridges are removed one after another; a failure to remove one container
		 * is ignored so the remaining bridges are still processed. The registry is
		 * persisted once at the end, and only if at least one bridge was removed.
		 *
		 * @param {string} worldId
		 * @returns {Promise<void>}
		 */
		export async function killWorld(worldId) {
		  // Snapshot the matches first so the registry is not mutated mid-iteration.
		  const doomed = [...registry].filter(([, bridge]) => bridge.worldId === worldId);

		  for (const [key, bridge] of doomed) {
		    registry.delete(key);
		    await removeBridgeContainer(bridge.containerName, bridge.containerId).catch(() => {});
		  }

		  if (doomed.length > 0) {
		    saveState();
		  }
		}

		/**
		 * List the bridges currently registered for a world, in registry order.
		 * The URL of each bridge is built from the current HOST_IP.
		 *
		 * @param {string} worldId
		 * @returns {Array<{containerPort: number, hostPort: number, url: string}>}
		 * an empty array when the world has no bridges
		 */
		export function getWorldBridges(worldId) {
		  return Array.from(registry.values())
		    .filter((bridge) => bridge.worldId === worldId)
		    .map((bridge) => ({
		      containerPort: bridge.containerPort,
		      hostPort: bridge.hostPort,
		      url: `http://${HOST_IP}:${bridge.hostPort}`,
		    }));
		}

		// Runs when the module is evaluated: seed the registry from the state file
		// at the default BRIDGES_PATH (configure() may load another file later).
		loadState();

-210

host-cp/src/pr-cache.mjs

		/**
		* In-memory cache for GitHub PR data with TTL and concurrent-fetch coalescing.
		*
		* @module pr-cache
		*/

		// Base URL for every GitHub REST request made by this module.
		const GH_API_BASE = 'https://api.github.com';
		// How long a fetched PR entry counts as fresh, in milliseconds (30 s).
		const TTL_MS = 30_000;

		/**
		 * Extract the owner, repository and PR number from a GitHub pull-request
		 * URL. The pattern is searched for anywhere in the input, so trailing path
		 * segments (e.g. `/files`) are tolerated.
		 *
		 * @param {string} prUrl e.g. https://github.com/owner/repo/pull/123
		 * @returns {{ owner: string, repo: string, number: number } | null} null
		 * when the input does not contain a `github.com/<owner>/<repo>/pull/<n>` path
		 */
		function parsePrUrl(prUrl) {
		  const prPathPattern = /github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/;
		  const found = prPathPattern.exec(prUrl);
		  if (found === null) {
		    return null;
		  }
		  const [, owner, repo, prDigits] = found;
		  return { owner, repo, number: Number.parseInt(prDigits, 10) };
		}

		/**
		 * Reduce an array of check runs into a single status string.
		 *
		 * Precedence: any failing run wins, then any pending run, otherwise the
		 * set is passing.
		 * - failing: conclusion is 'failure', 'timed_out' or 'action_required'
		 * - pending: status is 'queued' or 'in_progress', or conclusion is null
		 * - every other run is neutral and counts toward passing
		 *
		 * @param {Array<{conclusion: string|null, status: string}>} checkRuns
		 * @returns {'pending'|'passing'|'failing'|null} null when there are no
		 * check runs (missing or empty array)
		 */
		function reduceCheckRuns(checkRuns) {
		  if (!checkRuns || checkRuns.length === 0) return null;

		  let hasFailure = false;
		  let hasPending = false;

		  for (const run of checkRuns) {
		    const conclusion = run.conclusion;
		    const status = run.status;

		    if (
		      conclusion === 'failure' ||
		      conclusion === 'timed_out' ||
		      conclusion === 'action_required'
		    ) {
		      hasFailure = true;
		    } else if (
		      status === 'queued' ||
		      status === 'in_progress' ||
		      conclusion === null
		    ) {
		      hasPending = true;
		    }
		  }

		  if (hasFailure) return 'failing';
		  if (hasPending) return 'pending';
		  return 'passing';
		}

		/**
		* @typedef {object} PrCacheEntry
		* @property {number} fetchedAt
		* @property {'open'|'merged'|'closed'|null} prState
		* @property {number|null} prNumber
		* @property {'pending'|'passing'|'failing'|null} prChecks
		* @property {Promise<PrData>|null} promise
		*/

		/**
		* @typedef {object} PrData
		* @property {'open'|'merged'|'closed'|null} state
		* @property {number|null} number
		* @property {'pending'|'passing'|'failing'|null} checks
		*/

		/**
		 * Fetch PR data from the GitHub API: the PR's state plus a single reduced
		 * status for the check runs of its head commit.
		 *
		 * Degrades instead of failing where it can:
		 * - unparseable URL → all fields null
		 * - PR request not ok → state and checks null (number still reported)
		 * - no head SHA, or check-runs request fails / throws → checks null
		 * A network error on the PR request itself does reject.
		 *
		 * @param {string} prUrl
		 * @param {() => Promise<string|null>} getToken — resolves to a GitHub
		 * token, or null to make unauthenticated requests
		 * @returns {Promise<PrData>}
		 */
		async function fetchPrData(prUrl, getToken) {
		  const parsed = parsePrUrl(prUrl);
		  if (!parsed) return { state: null, number: null, checks: null };

		  const token = await getToken();
		  /** @type {HeadersInit} */
		  const headers = { Accept: 'application/vnd.github+json' };
		  if (token) headers['Authorization'] = `token ${token}`;

		  // Fetch PR metadata
		  const prResp = await fetch(
		    `${GH_API_BASE}/repos/${parsed.owner}/${parsed.repo}/pulls/${parsed.number}`,
		    { headers, signal: AbortSignal.timeout(10_000) },
		  );
		  if (!prResp.ok) {
		    return { state: null, number: parsed.number, checks: null };
		  }
		  const prData = await prResp.json();

		  // GitHub reports merged PRs as state 'closed' with merged_at set.
		  let state = prData.state ?? null;
		  if (state === 'closed' && prData.merged_at) state = 'merged';

		  const sha = prData.head?.sha ?? null;
		  if (!sha) {
		    return { state, number: parsed.number, checks: null };
		  }

		  // Fetch check runs for the head SHA. The endpoint is paginated with a
		  // default page size of 30; ask for the maximum (100) so PRs with many
		  // checks are not reduced from a truncated list.
		  let checks = null;
		  try {
		    const checksResp = await fetch(
		      `${GH_API_BASE}/repos/${parsed.owner}/${parsed.repo}/commits/${sha}/check-runs?per_page=100`,
		      { headers, signal: AbortSignal.timeout(10_000) },
		    );
		    if (checksResp.ok) {
		      const checksData = await checksResp.json();
		      const runs = Array.isArray(checksData.check_runs) ? checksData.check_runs : [];
		      checks = reduceCheckRuns(runs);
		    }
		  } catch {
		    // Non-fatal — return null checks
		  }

		  return { state, number: parsed.number, checks };
		}

		/**
		 * Build an in-memory PR data cache. Entries stay fresh for TTL_MS, and
		 * concurrent lookups of the same URL share a single fetch.
		 *
		 * @returns {{ getPr: (prUrl: string, getToken: () => Promise<string|null>) => Promise<PrData|null>, deletePr: (prUrl: string) => void }}
		 */
		export function createPrCache() {
		  /** @type {Map<string, PrCacheEntry>} */
		  const cache = new Map();

		  // Field values of an entry that has never completed a fetch.
		  const NEVER_FETCHED = { fetchedAt: 0, prState: null, prNumber: null, prChecks: null };

		  /**
		   * Resolve PR data for a URL: serve a fresh cached entry, join a fetch
		   * that is already in flight, or start a new fetch.
		   *
		   * @param {string} prUrl
		   * @param {() => Promise<string|null>} getToken
		   * @returns {Promise<PrData|null>} null for a falsy URL or when the fetch
		   * rejects; this function itself does not reject on fetch errors
		   */
		  async function getPr(prUrl, getToken) {
		    if (!prUrl) return null;

		    const now = Date.now();
		    const cached = cache.get(prUrl);

		    if (cached && cached.promise) {
		      // Another caller is already fetching this URL — wait for its result.
		      try {
		        return await cached.promise;
		      } catch {
		        return null;
		      }
		    }

		    if (cached && now - cached.fetchedAt < TTL_MS) {
		      // Settled entry that is still within its TTL.
		      return { state: cached.prState, number: cached.prNumber, checks: cached.prChecks };
		    }

		    // Stale or missing — start a new fetch.
		    const storeResult = (data) => {
		      cache.set(prUrl, {
		        fetchedAt: Date.now(),
		        prState: data.state,
		        prNumber: data.number,
		        prChecks: data.checks,
		        promise: null,
		      });
		      return data;
		    };
		    const dropPromise = (err) => {
		      // Forget the failed promise so the next call retries instead of
		      // joining a fetch that has already rejected.
		      const current = cache.get(prUrl);
		      if (current && current.promise) {
		        cache.set(prUrl, { ...current, promise: null });
		      }
		      throw err;
		    };
		    const inFlight = fetchPrData(prUrl, getToken).then(storeResult, dropPromise);

		    // Keep the previous values (if any) while marking the entry in-flight.
		    cache.set(prUrl, { ...(cached ? cached : NEVER_FETCHED), promise: inFlight });

		    try {
		      return await inFlight;
		    } catch {
		      return null;
		    }
		  }

		  /**
		   * Evict a PR entry from the cache (call on world destroy).
		   *
		   * @param {string} prUrl
		   */
		  function deletePr(prUrl) {
		    cache.delete(prUrl);
		  }

		  return { getPr, deletePr };
		}

-154

host-cp/src/pr-merge-poller.mjs

		/**
		* PR merge poller for auto-destroying worlds whose PR has merged.
		*
		* State machine per world:
		* open -> merged (on GitHub reports merged)
		* merged -> merged_destroyed (after grace period, if auto_destroy_on_merge)
		*/

		// Base URL for the GitHub REST requests made by the poller.
		const GH_API_BASE = 'https://api.github.com';

		/**
		 * Extract the owner, repository and PR number from a GitHub pull-request
		 * URL. The pattern is searched for anywhere in the input.
		 *
		 * @param {string} prUrl e.g. https://github.com/org/repo/pull/123
		 * @returns {{ owner: string, repo: string, number: number } | null} null
		 * when the input does not contain a `github.com/<owner>/<repo>/pull/<n>` path
		 */
		function parsePrUrl(prUrl) {
		  const hit = /github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/.exec(prUrl);
		  if (hit === null) {
		    return null;
		  }
		  const owner = hit[1];
		  const repo = hit[2];
		  const number = Number.parseInt(hit[3], 10);
		  return { owner, repo, number };
		}

		/**
		 * Create the PR merge poller.
		 *
		 * Every `pollIntervalMs` the poller walks `prStateStore.getWorldsToWatch()`:
		 * - `open` entries are checked against the GitHub pulls API and flipped to
		 *   `merged` (recording `pr_merged_at`) once GitHub reports the PR closed
		 *   with a merge timestamp;
		 * - `merged` entries get a grace timer of `gracePeriodMs`, after which the
		 *   world is destroyed and marked `merged_destroyed` — unless the entry
		 *   carries `auto_destroy_on_merge === false`.
		 *
		 * When `getGhToken()` yields no token the poller warns once, disables
		 * itself and stops; `stop()` also cancels any pending grace timers.
		 *
		 * @param {{
		 *   prStateStore: ReturnType<typeof import('./world-pr-state.mjs').createWorldPrStateStore>,
		 *   getGhToken: () => Promise<string|null>,
		 *   destroyWorld: (worldId: string) => Promise<void>,
		 *   pollIntervalMs?: number,
		 *   gracePeriodMs?: number,
		 * }} opts
		 * @returns {{ start: () => void, stop: () => void }}
		 */
		export function createPrMergePoller({
		  prStateStore,
		  getGhToken,
		  destroyWorld,
		  pollIntervalMs = 300_000,
		  gracePeriodMs = 600_000,
		}) {
		  let intervalId = null;
		  let disabled = false;
		  let warnedOnce = false;
		  // Track in-flight grace timers so stop() can clear them
		  const graceTimers = new Map();

		  /**
		   * Destroy the world, then mark it `merged_destroyed`.
		   * The state is advanced even when destroyWorld rejects (the failure is
		   * logged), so a broken world is not re-scheduled on every poll.
		   */
		  async function destroyAndMark(worldId) {
		    const entry = prStateStore.get(worldId);
		    const prUrl = entry?.pr_url ?? '(unknown)';
		    const mergedAt = entry?.pr_merged_at ?? '(unknown)';
		    try {
		      await destroyWorld(worldId);
		      // Logged only after the destroy succeeded, so the line is never
		      // emitted for a world that is in fact still alive.
		      console.log(
		        `[pr-merge-poller] auto-destroyed world ${worldId}: PR ${prUrl} merged at ${mergedAt}, destroyed at ${new Date().toISOString()}`,
		      );
		    } catch (err) {
		      console.error(`[pr-merge-poller] destroyWorld failed for ${worldId}:`, err.message);
		    }
		    prStateStore.set(worldId, { pr_state: 'merged_destroyed' });
		    graceTimers.delete(worldId);
		  }

		  /** Arm the grace timer for a world unless one is already pending. */
		  function scheduleGrace(worldId) {
		    if (graceTimers.has(worldId)) return; // already scheduled
		    const id = setTimeout(() => {
		      destroyAndMark(worldId).catch((err) => {
		        console.error(`[pr-merge-poller] destroyAndMark error for ${worldId}:`, err.message);
		      });
		    }, gracePeriodMs);
		    graceTimers.set(worldId, id);
		  }

		  /**
		   * Ask GitHub whether the world's PR has merged; on merge, record it and
		   * (unless opted out) start the grace timer. API and network failures are
		   * logged and leave the entry untouched for the next poll.
		   */
		  async function checkPr(worldId, entry, ghToken) {
		    const parsed = parsePrUrl(entry.pr_url);
		    if (!parsed) {
		      console.warn(`[pr-merge-poller] cannot parse PR URL for ${worldId}: ${entry.pr_url}`);
		      return;
		    }
		    const apiUrl = `${GH_API_BASE}/repos/${parsed.owner}/${parsed.repo}/pulls/${parsed.number}`;
		    let data;
		    try {
		      const resp = await fetch(apiUrl, {
		        headers: { Authorization: `token ${ghToken}`, Accept: 'application/vnd.github+json' },
		      });
		      if (!resp.ok) {
		        console.warn(`[pr-merge-poller] GH API ${resp.status} for ${worldId}`);
		        return;
		      }
		      data = await resp.json();
		    } catch (err) {
		      console.warn(`[pr-merge-poller] fetch failed for ${worldId}:`, err.message);
		      return;
		    }

		    const isMerged = data.state === 'closed' && data.merged_at != null;
		    if (!isMerged) return;

		    prStateStore.set(worldId, {
		      pr_state: 'merged',
		      pr_merged_at: data.merged_at,
		    });

		    if (entry.auto_destroy_on_merge === false) return;

		    scheduleGrace(worldId);
		  }

		  /** One pass over every watched world. */
		  async function pollOnce() {
		    const ghToken = await getGhToken();
		    if (!ghToken) {
		      if (!warnedOnce) {
		        console.warn(
		          'pr-merge-poller: no GH token found (GH_TOKEN/GITHUB_TOKEN env or /gh-config/hosts.yml); PR polling disabled',
		        );
		        warnedOnce = true;
		      }
		      disabled = true;
		      stop();
		      return;
		    }

		    const worlds = prStateStore.getWorldsToWatch();
		    for (const entry of worlds) {
		      const { worldId, ...rest } = entry;
		      if (rest.pr_state === 'open') {
		        await checkPr(worldId, rest, ghToken);
		      } else if (rest.pr_state === 'merged') {
		        // Resume grace timer for merged entries that survived a restart
		        if (rest.auto_destroy_on_merge !== false) {
		          scheduleGrace(worldId);
		        }
		      }
		    }
		  }

		  /** Begin polling; a no-op when already running or permanently disabled. */
		  function start() {
		    if (intervalId !== null || disabled) return;
		    intervalId = setInterval(() => {
		      pollOnce().catch((err) => {
		        console.error('[pr-merge-poller] pollOnce error:', err.message);
		      });
		    }, pollIntervalMs);
		  }

		  /** Stop polling and cancel every pending grace timer. */
		  function stop() {
		    if (intervalId !== null) {
		      clearInterval(intervalId);
		      intervalId = null;
		    }
		    for (const id of graceTimers.values()) {
		      clearTimeout(id);
		    }
		    graceTimers.clear();
		  }

		  return { start, stop };
		}

-343

host-cp/src/pr-nanny.mjs

		/**
		* PR Nanny — host-side daemon that watches all worlds' open PRs and
		* dispatches fixes via `olam dispatch` when CI/reviews block them.
		*
		* Extends the pr-merge-poller loop pattern. Runs at 60s cadence.
		* Opt-out: OLAM_PR_NANNY=0 (default: enabled).
		*
		* State machine per PR (stored in world-pr-state.json nanny_* fields):
		* watching → dispatching → (paused | escalated | halted)
		*
		* Halt conditions (stop dispatching but keep watching):
		* 1. dispatch_count >= MAX_DISPATCHES (configurable, default 5)
		* 2. wall-clock since first dispatch >= MAX_WALL_CLOCK_MIN (default 60)
		* 3. same-root-cause loop detected (last 2 dispatch summaries identical)
		* 4. operator manual pause
		*
		* Tier escalation (PR #N tier-escalation):
		* On each retry, the nanny advances to the next tier in `escalationTiers`
		* (stored per-world in nanny_current_tier) instead of repeating the same
		* model. When the chain is exhausted, emits `dispatch.tier-exhausted` on
		* the host-stream and falls back to existing operator escalation.
		*/

		import { execFile } from 'node:child_process';
		import { promisify } from 'node:util';
		import { pickNextTier } from './dispatch/tier-escalator.mjs';
		import { safePersistLastDispatch } from './dispatch-persister.mjs';

		const execFileAsync = promisify(execFile);

		const GH_API_BASE = 'https://api.github.com';

		// Known external-blocker CI check name patterns.
		// When ALL failing checks match these patterns, the PR is not actionable
		// (the root cause is infrastructure/release-pipeline, not the world's code).
		const EXTERNAL_BLOCKER_PATTERNS = [
		  /detect-image-scopes/i,
		  /publish-mcp-auth/i,
		  /retag-mcp-auth/i,
		  /bootstrap.*publish/i,
		  /release.*pipeline/i,
		  /ghcr.*push/i,
		];

		/**
		 * Whether a single CI check name matches one of the known
		 * external-blocker patterns (case-insensitive, anywhere in the name).
		 *
		 * @param {string} checkName
		 * @returns {boolean}
		 */
		function isExternalBlockerCheck(checkName) {
		  return EXTERNAL_BLOCKER_PATTERNS.some((re) => re.test(checkName));
		}

		/**
		 * Returns true when ALL failing CI checks are external-blocker patterns.
		 * A check counts as failing when its conclusion is `failure` or
		 * `action_required`; with no failing checks at all the result is false.
		 *
		 * @param {Array<{name: string, conclusion: string|null}>} checks
		 * @returns {boolean}
		 */
		export function isExternalBlocker(checks) {
		  const failing = checks.filter(
		    (c) => c.conclusion === 'failure' || c.conclusion === 'action_required',
		  );
		  if (failing.length === 0) return false;
		  return failing.every((c) => isExternalBlockerCheck(c.name));
		}

		/**
		 * Split a GitHub pull-request URL into owner, repository and PR number.
		 *
		 * @param {string} prUrl e.g. https://github.com/org/repo/pull/123
		 * @returns {{ owner: string, repo: string, number: number } | null}
		 *   null when the input has no `github.com/<owner>/<repo>/pull/<digits>` part
		 */
		function parsePrUrl(prUrl) {
		  const found = /github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/.exec(prUrl);
		  return found
		    ? { owner: found[1], repo: found[2], number: parseInt(found[3], 10) }
		    : null;
		}

		/**
		 * Create the PR nanny: a poller that inspects the CI checks of every
		 * watched world's open PR and dispatches a fix prompt to the world when
		 * checks are failing for reasons the world can act on.
		 *
		 * Returns no-op `start`/`stop` when `OLAM_PR_NANNY=0`.
		 *
		 * @param {{
		 *   prStateStore: ReturnType<typeof import('./world-pr-state.mjs').createWorldPrStateStore>,
		 *   getGhToken: () => Promise<string|null>,
		 *   dispatchToWorld: (worldId: string, prompt: string, opts?: { tier?: string }) => Promise<void>,
		 *   consultCodex: (ctx: string) => Promise<string>,
		 *   broadcastTierEvent?: (eventType: string, payload: unknown) => void,
		 *   pollIntervalMs?: number,
		 *   maxDispatches?: number,
		 *   maxWallClockMin?: number,
		 * }} opts
		 * @returns {{ start: () => void, stop: () => void }}
		 */
		export function createPrNanny({
		  prStateStore,
		  getGhToken,
		  dispatchToWorld,
		  consultCodex,
		  broadcastTierEvent = () => {},
		  pollIntervalMs = 60_000,
		  maxDispatches = parseInt(process.env.OLAM_PR_NANNY_MAX_DISPATCHES ?? '5', 10),
		  maxWallClockMin = parseInt(process.env.OLAM_PR_NANNY_MAX_WALL_CLOCK_MIN ?? '60', 10),
		}) {
		  const enabled = (process.env.OLAM_PR_NANNY ?? '1') !== '0';
		  if (!enabled) return { start() {}, stop() {} };

		  // parseInt yields NaN for a malformed env value, and `x >= NaN` is always
		  // false — which would silently switch the halt condition off. Fall back
		  // to the documented defaults instead.
		  const dispatchCap = Number.isNaN(maxDispatches) ? 5 : maxDispatches;
		  const wallClockCapMin = Number.isNaN(maxWallClockMin) ? 60 : maxWallClockMin;

		  // Upper bound for each GitHub request so one hung socket cannot stall a poll.
		  const GH_FETCH_TIMEOUT_MS = 15_000;

		  let intervalId = null;
		  let warnedOnce = false;
		  // True while a poll is running; ticks that arrive meanwhile are skipped.
		  let pollInFlight = false;

		  /** A check run the nanny treats as a CI failure. */
		  const isFailing = (check) =>
		    check.conclusion === 'failure' || check.conclusion === 'action_required';

		  /**
		   * Fetch CI check runs for the PR's head SHA.
		   * Any HTTP or network failure yields an empty list.
		   *
		   * @param {string} owner
		   * @param {string} repo
		   * @param {number} prNumber
		   * @param {string} ghToken
		   * @returns {Promise<Array<{name: string, conclusion: string|null, status: string}>>}
		   */
		  async function fetchChecks(owner, repo, prNumber, ghToken) {
		    const headers = { Authorization: `token ${ghToken}`, Accept: 'application/vnd.github+json' };
		    try {
		      // First get the PR head SHA
		      const prRes = await fetch(
		        `${GH_API_BASE}/repos/${owner}/${repo}/pulls/${prNumber}`,
		        { headers, signal: AbortSignal.timeout(GH_FETCH_TIMEOUT_MS) },
		      );
		      if (!prRes.ok) return [];
		      const prData = await prRes.json();
		      const sha = prData.head?.sha;
		      if (!sha) return [];

		      const checkRes = await fetch(
		        `${GH_API_BASE}/repos/${owner}/${repo}/commits/${sha}/check-runs?per_page=100`,
		        { headers, signal: AbortSignal.timeout(GH_FETCH_TIMEOUT_MS) },
		      );
		      if (!checkRes.ok) return [];
		      const checkData = await checkRes.json();
		      return (checkData.check_runs ?? []).map((r) => ({
		        name: r.name,
		        conclusion: r.conclusion,
		        status: r.status,
		      }));
		    } catch {
		      return [];
		    }
		  }

		  /**
		   * Evaluate one world and, if warranted, dispatch a fix.
		   *
		   * @param {string} worldId
		   * @param {object} entry current pr-state entry
		   * @param {string} ghToken
		   */
		  async function processWorld(worldId, entry, ghToken) {
		    if (entry.nanny_paused || entry.nanny_escalated) return;
		    if (entry.pr_state !== 'open') return;

		    const parsed = parsePrUrl(entry.pr_url);
		    if (!parsed) return;

		    // Halt: dispatch cap
		    const dispatchCount = entry.nanny_dispatch_count ?? 0;
		    if (dispatchCount >= dispatchCap) return;

		    // Halt: wall-clock ceiling
		    if (entry.nanny_first_dispatch_at) {
		      const elapsedMin = (Date.now() - new Date(entry.nanny_first_dispatch_at).getTime()) / 60_000;
		      if (elapsedMin >= wallClockCapMin) return;
		    }

		    const checks = await fetchChecks(parsed.owner, parsed.repo, parsed.number, ghToken);
		    const failing = checks.filter(isFailing);
		    // No failed check means nothing to fix: everything passed, checks are
		    // still running, or no checks could be fetched.
		    if (failing.length === 0) return;

		    // External blocker — do not dispatch
		    if (isExternalBlocker(checks)) {
		      prStateStore.set(worldId, { nanny_external_blocker: true });
		      return;
		    }

		    prStateStore.set(worldId, { nanny_external_blocker: false });

		    const failingNames = failing.map((c) => c.name).join(', ');

		    const prompt = `CI is failing on PR ${entry.pr_url}. Failing checks: ${failingNames}. Investigate the root cause, fix the code, and push.`;

		    // Halt: same-root-cause loop detection
		    if (entry.nanny_last_dispatch_prompt && entry.nanny_last_dispatch_prompt === prompt) {
		      console.log(`[pr-nanny] loop detected for ${worldId} — same prompt as last dispatch, halting`);
		      prStateStore.set(worldId, { nanny_loop_halted: true });
		      return;
		    }

		    // Consult Codex before dispatching
		    const codexCtx = `World ${worldId} has a failing PR: ${entry.pr_url}. Failing CI checks: ${failingNames}. Should we dispatch a fix? Answer: agree, push-back, or rethink.`;
		    let verdict = 'agree';
		    try {
		      verdict = await consultCodex(codexCtx);
		    } catch (err) {
		      console.warn(`[pr-nanny] codex consult failed for ${worldId}: ${err.message} — defaulting to agree`);
		    }

		    if (verdict === 'push-back') {
		      prStateStore.set(worldId, { nanny_paused: true, nanny_pause_reason: 'codex_pushback' });
		      console.log(`[pr-nanny] Codex push-back for ${worldId} — pausing nanny`);
		      return;
		    }
		    if (verdict === 'rethink') {
		      prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'codex_rethink' });
		      console.log(`[pr-nanny] Codex rethink for ${worldId} — escalating`);
		      return;
		    }

		    // ── Tier escalation (PR #938) ───────────────────────────────────────────
		    //
		    // `nanny_escalation_tiers` is set by the olam_dispatch caller via the
		    // escalationTiers schema field and persisted here by server.mjs when the
		    // world is registered for nanny tracking. Defaults to ['sonnet'] when
		    // absent (no escalation, no cost surprise).
		    //
		    // `nanny_current_tier` tracks the model tier used by the LAST dispatch for
		    // this PR. On first dispatch (dispatchCount === 0) it is undefined, and we
		    // use escalationTiers[0] as the starting tier. On retries we advance the
		    // chain via pickNextTier. This is the pr-state store (option c from the
		    // design doc) — it persists across polls and matches the nanny_* field
		    // pattern already established by nanny_dispatch_count et al.
		    const escalationTiers = entry.nanny_escalation_tiers ?? ['sonnet'];
		    const currentTier = entry.nanny_current_tier ?? escalationTiers[0] ?? 'sonnet';
		    let tierForThisDispatch = currentTier;

		    if (dispatchCount > 0) {
		      // This is a retry — try to escalate the tier.
		      const nextTier = pickNextTier(currentTier, escalationTiers);
		      if (nextTier !== null) {
		        tierForThisDispatch = nextTier;
		        broadcastTierEvent('dispatch.escalated', {
		          worldId,
		          fromTier: currentTier,
		          toTier: nextTier,
		          reason: 'retry-after-failure',
		        });
		        console.log(`[pr-nanny] tier escalated for ${worldId}: ${currentTier} → ${nextTier}`);
		      } else {
		        // Chain exhausted — emit tier-exhausted and fall back to operator escalation.
		        broadcastTierEvent('dispatch.tier-exhausted', {
		          worldId,
		          exhaustedTier: currentTier,
		          escalationTiers,
		        });
		        console.log(`[pr-nanny] tier chain exhausted for ${worldId} (last tier: ${currentTier}) — escalating to operator`);
		        prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'tier_exhausted' });
		        return;
		      }
		    }

		    // Dispatch fix
		    try {
		      safePersistLastDispatch({
		        worldId,
		        messageId: `nanny-${worldId}-${Date.now()}`,
		        prompt,
		        source: 'pr-nanny',
		      });
		      await dispatchToWorld(worldId, prompt, { tier: tierForThisDispatch });
		      const now = new Date().toISOString();
		      prStateStore.set(worldId, {
		        nanny_dispatch_count: dispatchCount + 1,
		        nanny_first_dispatch_at: entry.nanny_first_dispatch_at ?? now,
		        nanny_last_dispatch_at: now,
		        nanny_last_dispatch_prompt: prompt,
		        nanny_current_tier: tierForThisDispatch,
		      });
		      console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${dispatchCap}, tier: ${tierForThisDispatch})`);
		    } catch (err) {
		      console.error(`[pr-nanny] dispatch failed for ${worldId}: ${err.message}`);
		    }
		  }

		  /** One pass over every watched world; a failure in one world does not stop the rest. */
		  async function pollOnce() {
		    const ghToken = await getGhToken();
		    if (!ghToken) {
		      if (!warnedOnce) {
		        console.warn('[pr-nanny] no GH token — CI polling disabled');
		        warnedOnce = true;
		      }
		      return;
		    }

		    const worlds = prStateStore.getWorldsToWatch();
		    for (const { worldId, ...entry } of worlds) {
		      try {
		        await processWorld(worldId, entry, ghToken);
		      } catch (err) {
		        console.error(`[pr-nanny] processWorld error for ${worldId}: ${err.message}`);
		      }
		    }
		  }

		  /**
		   * Run one poll unless the previous one is still going. Codex consults and
		   * dispatches can push a poll past the interval; overlapping polls would
		   * read the same not-yet-updated entry and dispatch to a world twice.
		   */
		  function runPoll() {
		    if (pollInFlight) return;
		    pollInFlight = true;
		    pollOnce()
		      .catch((err) => console.error('[pr-nanny] pollOnce error:', err.message))
		      .finally(() => {
		        pollInFlight = false;
		      });
		  }

		  /** Poll immediately, then on every interval tick. No-op when already started. */
		  function start() {
		    if (intervalId !== null) return;
		    // Immediate first poll
		    runPoll();
		    intervalId = setInterval(runPoll, pollIntervalMs);
		  }

		  /** Stop scheduling further polls (a poll already in flight runs to completion). */
		  function stop() {
		    if (intervalId !== null) {
		      clearInterval(intervalId);
		      intervalId = null;
		    }
		  }

		  return { start, stop };
		}

		/**
		 * Default Codex consultation: run the host-side `codex` CLI with an
		 * adversarial-review prompt and map the start of its reply to a verdict.
		 *
		 * Fail-open by design: if the CLI cannot be run, times out (30s) or
		 * produces unusable output, the verdict is `agree`.
		 *
		 * @param {string} ctx situation description embedded in the prompt
		 * @returns {Promise<'agree'|'push-back'|'rethink'>}
		 */
		export async function defaultConsultCodex(ctx) {
		  let reply;
		  try {
		    const cliArgs = [
		      '--quiet',
		      '--model', 'codex-mini-latest',
		      `Adversarial review — is this a good idea? ${ctx} Reply with exactly one word: agree, push-back, or rethink.`,
		    ];
		    const result = await execFileAsync('codex', cliArgs, { timeout: 30_000 });
		    reply = result.stdout.trim().toLowerCase();
		  } catch {
		    return 'agree'; // fail-open: if codex unavailable, dispatch anyway
		  }
		  if (reply.startsWith('push')) return 'push-back';
		  return reply.startsWith('rethink') ? 'rethink' : 'agree';
		}

		/**
		 * Default dispatch: shell out to `olam dispatch <worldId> <prompt>`.
		 * The child process is given 60 seconds; a non-zero exit or timeout
		 * rejects the returned promise.
		 *
		 * @param {string} worldId
		 * @param {string} prompt
		 * @returns {Promise<void>}
		 */
		export async function defaultDispatchToWorld(worldId, prompt) {
		  const cliArgs = ['dispatch', worldId, prompt];
		  const execOptions = { timeout: 60_000 };
		  await execFileAsync('olam', cliArgs, execOptions);
		}

-250

host-cp/src/process-poller.mjs

		/**
		* process-poller.mjs — per-world docker top SSE fanout.
		*
		* Dual-mode: HTTP API when DOCKER_HOST != 'docker-cli'; spawnSync otherwise.
		*
		* NOTE: process argv may contain secrets (--api-key=, --token=). Post-v1 audit needed. (S1)
		*/

		import { spawnSync } from 'node:child_process';

		const DOCKER_HOST = process.env.DOCKER_HOST ?? 'docker-cli';

		/**
		* @typedef {{ pid: string, user: string, cpu: string, mem: string, started: string, state: string, command: string }} ProcessRow
		*/

		/**
		 * Name of the devbox container that belongs to a world.
		 *
		 * @param {string} worldId
		 * @returns {string} `olam-<worldId>-devbox`
		 */
		function worldContainerName(worldId) {
		  const stem = `olam-${worldId}`;
		  return `${stem}-devbox`;
		}

		/**
		 * Parse docker top JSON (Titles + Processes arrays) into normalized rows.
		 * Falls back gracefully if the response is not JSON, or lacks either
		 * array: the result is then an empty list.
		 * The start-time column is stored as a raw string — no Date parse (T1).
		 *
		 * Columns are located by title, case-insensitively. An exact title match
		 * wins over a substring match so that e.g. `PPID` can never be taken for
		 * `PID`; the substring fallback is what maps `%CPU` → cpu and `%MEM` → mem.
		 * A column that is absent yields '' (or '0' for cpu/mem).
		 *
		 * @param {string} stdout
		 * @returns {ProcessRow[]}
		 */
		function parseDockerTop(stdout) {
		  let parsed;
		  try {
		    parsed = JSON.parse(stdout);
		  } catch {
		    return [];
		  }

		  const titles = parsed?.Titles;
		  const processes = parsed?.Processes;
		  if (!Array.isArray(titles) || !Array.isArray(processes)) return [];

		  // Lower-case once; non-string titles can never match.
		  const lowered = titles.map((t) => (typeof t === 'string' ? t.toLowerCase() : null));

		  // Column index for a (lower-case) name: exact title first, then substring.
		  function idx(name) {
		    const exact = lowered.indexOf(name);
		    if (exact !== -1) return exact;
		    return lowered.findIndex((t) => t !== null && t.includes(name));
		  }

		  // Index of the first candidate name that matches any title, or -1.
		  function firstIdx(...names) {
		    for (const name of names) {
		      const i = idx(name);
		      if (i !== -1) return i;
		    }
		    return -1;
		  }

		  const pidIdx = idx('pid');
		  const userIdx = idx('user');
		  const cpuIdx = idx('cpu');
		  const memIdx = idx('mem');
		  // Accept LSTART, STARTED, STIME, or START_TIME (T1: store as raw string)
		  const startIdx = firstIdx('lstart', 'stime', 'start_time', 'start');
		  const stateIdx = idx('stat');
		  // CMD may be titled "CMD", "COMMAND", or "cmd"
		  const cmdIdx = firstIdx('command', 'cmd');

		  // Trimmed string value of one cell; `missing` when the column is absent.
		  // `row?.` keeps a malformed (non-array) row from throwing.
		  const cell = (row, i, missing) => (i !== -1 ? String(row?.[i] ?? '').trim() : missing);

		  return processes.map((row) => ({
		    pid: cell(row, pidIdx, ''),
		    user: cell(row, userIdx, ''),
		    cpu: cell(row, cpuIdx, '0'),
		    mem: cell(row, memIdx, '0'),
		    started: cell(row, startIdx, ''),
		    state: cell(row, stateIdx, ''),
		    command: cell(row, cmdIdx, ''),
		  }));
		}

		/**
		 * Fetch processes for a world container.
		 * Returns {ts, processes, error?}.
		 * Non-running containers return an empty array + error field (T3); so
		 * does every other failure (spawn error, non-2xx response, timeout).
		 *
		 * Two transports, selected by DOCKER_HOST:
		 * - 'docker-cli': `docker top` via spawnSync, tabular text output;
		 * - anything else: Docker HTTP API `/containers/<name>/top`, JSON output.
		 *
		 * @param {string} worldId
		 * @returns {Promise<{ts: number, processes: ProcessRow[], error?: string}>}
		 */
		async function fetchProcesses(worldId) {
		  const containerName = worldContainerName(worldId);
		  // Docker hands the ps arguments verbatim to ps(1) inside the container.
		  // A bare column list (`pid,user,...` or `pid user ...`) is read by ps as
		  // a process-ID list, not a column selector — yielding "ps: error: process
		  // ID list syntax error" and a misleading "container not running" chip in
		  // the SPA. The correct invocation is `ps -eo <cols>`: select all
		  // processes (`-e`) and project specific columns (`-o`). Both transports
		  // below use this same form.
		  //
		  // `stime` (single-word start time) is used rather than `lstart`: the CLI
		  // path splits rows on whitespace and would break on a multi-word value
		  // such as "Mon May 4 14:00:00 2026". parseDockerTop accepts either via
		  // title match.
		  const psColumns = ['pid', 'user', 'pcpu', 'pmem', 'stime', 'stat', 'cmd'];
		  const ps_args = `-eo ${psColumns.join(',')}`;

		  const notRunning = () => ({ ts: Date.now(), processes: [], error: 'container not running' });

		  try {
		    if (DOCKER_HOST === 'docker-cli') {
		      // Bare-node mode: spawnSync blocks ~50ms at 5s cadence (P2 — acceptable).
		      const result = spawnSync(
		        'docker',
		        ['top', containerName, '-eo', psColumns.join(',')],
		        { encoding: 'utf-8', timeout: 3000 },
		      );
		      if (result.status !== 0 || result.error) return notRunning();

		      // docker top bare CLI outputs tabular text, not JSON. Wrap it for parseDockerTop.
		      const text = (result.stdout ?? '').trim();
		      if (text === '') return { ts: Date.now(), processes: [] };
		      const lines = text.split('\n');
		      // First line is the header row; remaining are process rows.
		      // stime is always a single word (e.g. "10:00" or "Feb11"), so splitting
		      // on 1+ whitespace is safe for every column except the last.
		      const titleFields = lines[0].trim().split(/\s+/);
		      const leading = psColumns.length - 1;
		      const dataRows = lines.slice(1).map((line) => {
		        const parts = line.trim().split(/\s+/);
		        // CMD may contain spaces — rejoin everything from the last column on.
		        return parts.length > psColumns.length
		          ? [...parts.slice(0, leading), parts.slice(leading).join(' ')]
		          : parts;
		      });
		      const wrapped = JSON.stringify({ Titles: titleFields, Processes: dataRows });
		      return { ts: Date.now(), processes: parseDockerTop(wrapped) };
		    }

		    // Container mode: Docker HTTP API.
		    const apiBase = DOCKER_HOST.replace(/^tcp:\/\//, 'http://');
		    const url = `${apiBase}/containers/${encodeURIComponent(containerName)}/top?ps_args=${encodeURIComponent(ps_args)}`;
		    const resp = await fetch(url, { signal: AbortSignal.timeout(3000) });
		    if (!resp.ok) return notRunning();
		    const body = await resp.text();
		    return { ts: Date.now(), processes: parseDockerTop(body) };
		  } catch {
		    return notRunning();
		  }
		}

		/**
		 * One-shot process listing for a world — delegates to fetchProcesses and
		 * returns its result unchanged.
		 *
		 * @param {string} worldId
		 */
		export async function getProcessSnapshot(worldId) {
		  const snapshot = await fetchProcesses(worldId);
		  return snapshot;
		}

		// ── SSE fanout state ─────────────────────────────────────────────────

		/**
		* Per-world subscriber registry.
		* @type {Map<string, {pollTimer: ReturnType<typeof setInterval>, heartbeatTimer: ReturnType<typeof setInterval>, subscribers: Set<import('node:http').ServerResponse>}>}
		*/
		const worldPollers = new Map();

		/**
		 * Send one `processes` SSE event to every subscriber of a world.
		 * Does nothing when the world has no registry entry. A subscriber whose
		 * write throws is skipped; the remaining subscribers still get the event.
		 *
		 * @param {string} worldId
		 * @param {{ts: number, processes: ProcessRow[], error?: string}} data
		 */
		function broadcast(worldId, data) {
		  const registration = worldPollers.get(worldId);
		  if (!registration) return;
		  const frame = `event: processes\ndata: ${JSON.stringify(data)}\n\n`;
		  registration.subscribers.forEach((subscriber) => {
		    try {
		      subscriber.write(frame);
		    } catch {
		      /* subscriber gone; cleanup fires on close */
		    }
		  });
		}

		/**
		 * Attach an SSE response to a world's process stream.
		 *
		 * The first subscriber of a world starts two timers: a 5s poll that
		 * broadcasts fresh process snapshots and a 25s heartbeat comment. Every
		 * subscriber also receives one snapshot immediately. When the last
		 * subscriber disconnects, both timers are cleared and the world's
		 * registry entry is removed.
		 *
		 * SSE headers are written BEFORE the response joins the subscriber Set
		 * (T2: prevents a leak if close fires before headers are flushed — the
		 * cleanup handler is safe to run even against an empty Set).
		 *
		 * @param {string} worldId
		 * @param {import('node:http').ServerResponse} res
		 */
		export function subscribeToProcesses(worldId, res) {
		  // Headers first, synchronously, before touching the subscriber Set (T2).
		  res.writeHead(200, {
		    'Content-Type': 'text/event-stream',
		    'Cache-Control': 'no-cache',
		    'Connection': 'keep-alive',
		    'X-Accel-Buffering': 'no',
		  });

		  if (!worldPollers.get(worldId)) {
		    // First subscriber for this world — bring up poll + heartbeat timers.
		    const pollTimer = setInterval(async () => {
		      const snapshot = await fetchProcesses(worldId);
		      broadcast(worldId, snapshot);
		    }, 5000);

		    const heartbeatTimer = setInterval(() => {
		      const live = worldPollers.get(worldId);
		      if (!live) return;
		      live.subscribers.forEach((subscriber) => {
		        try {
		          subscriber.write(': heartbeat\n\n');
		        } catch {
		          /* ignore */
		        }
		      });
		    }, 25000);

		    worldPollers.set(worldId, { pollTimer, heartbeatTimer, subscribers: new Set() });
		  }

		  const registration = worldPollers.get(worldId);
		  registration.subscribers.add(res);

		  // Push a snapshot right away so the client is not left waiting for the
		  // first 5s tick.
		  fetchProcesses(worldId).then((snapshot) => {
		    try {
		      res.write(`event: processes\ndata: ${JSON.stringify(snapshot)}\n\n`);
		    } catch {
		      /* gone */
		    }
		  });

		  // Disconnect handling — runs at most once per response, whichever of
		  // 'close' / 'finish' arrives first.
		  let released = false;
		  const release = () => {
		    if (released) return;
		    released = true;
		    const live = worldPollers.get(worldId);
		    if (!live) return;
		    live.subscribers.delete(res);
		    if (live.subscribers.size > 0) return;
		    clearInterval(live.pollTimer);
		    clearInterval(live.heartbeatTimer);
		    worldPollers.delete(worldId);
		  };

		  for (const eventName of ['close', 'finish']) {
		    res.on(eventName, release);
		  }
		}

		// Export parseDockerTop for unit tests.
		export { parseDockerTop };

-245

host-cp/src/proxy.mjs

		// Phase F-2-B (B3): host CP HTTP proxy.
		//
		// Rewrites incoming requests under `/api/world/<id>/<route...>` to the
		// per-world CP at `<perWorldBase>/<route...>` with `X-Olam-Secret`
		// injected server-side.
		//
		// Pattern lifted from `packages/cloudflare-worker/src/index.ts:462-551`
		// (`proxyContainer`). CF Worker uses Workers' `fetch()`; host CP uses
		// Node's `http.request` so SSE streams flow byte-for-byte without
		// buffering. Verbatim passthrough on /hooks/* and /api/auth/* (D8) is
		// implemented in B4 (this module is JSON-API-only — B4 wraps).

		import http from 'node:http';

		/**
		* Default upstream-request timeout for proxied per-world CP calls. SSE
		* streams (`/api/stream`, `/hooks/*` long-poll) MUST opt out — they
		* intentionally hold the socket open. Everything else should respond
		* within a few seconds; if the per-world CP wedges (slow sqlite,
		* tmux command stuck, long docker exec), this prevents the host-cp
		* connection from hanging until the OS RSTs it. The browser sees a
		* clean 504 instead of Safari's TypeError "Load failed", and useLanes /
		* useReadiness can retry on a known status code.
		*
		* 10s matches the longest legitimate handler we've measured (cold
		* sqlite open + readiness query) with headroom.
		*
		* @internal exported for test override
		*/
		export const DEFAULT_PROXY_TIMEOUT_MS = 10_000;

		/**
		 * Parse `/api/world/<id>/<route...>` from a request path. Returns
		 * `{ worldId, subPath }` or null if the path doesn't match.
		 *
		 * Anchored at `^/api/world/` to prevent prefix-matching from /api/worlds
		 * (the worlds-list endpoint, plural). Empty world IDs do not match.
		 *
		 * `subPath` is everything after the world ID, starting at the first `/`,
		 * `?` or `#`; when nothing follows the ID it is `'/'`.
		 *
		 * @param {string} path
		 * @returns {{ worldId: string, subPath: string } | null}
		 */
		export function parseProxyPath(path) {
		  // Group 1: world ID (no '/', '?' or '#'). Group 2 (optional): the rest,
		  // which must begin with one of those three delimiters.
		  const m = /^\/api\/world\/([^/?#]+)(\/.*|\?.*|#.*)?$/.exec(path);
		  if (!m) return null;
		  return {
		    worldId: m[1],
		    subPath: m[2] ?? '/',
		  };
		}

		/**
		 * Build the base URL of a per-world CP from its host port.
		 *
		 * The caller (server.mjs) resolves worldId → port and passes the port in;
		 * this function only formats `http://<host>:<port>`.
		 *
		 * In Docker Compose mode, host-cp is in its own network and reaches
		 * world CPs via `host.docker.internal:<port>` (compose.yaml's
		 * extra_hosts: host-gateway). On Docker Desktop this is automatic;
		 * on Linux it requires the `host-gateway` extra-host directive.
		 *
		 * @param {number} port per-world CP host port (e.g., 20780)
		 * @param {string} [host] optional hostname override (default 'host.docker.internal')
		 * @returns {string}
		 */
		export function perWorldBase(port, host = 'host.docker.internal') { // bare-node-allow: container-mode default; bare callers pass WORLD_HOST explicitly (server.mjs)
		  const authority = `${host}:${port}`;
		  return `http://${authority}`;
		}

		/**
		 * SSE / long-poll paths whose handlers intentionally hold the socket
		 * open. These MUST be exempt from the upstream timeout — applying it
		 * would kill the stream every 10s. Caller can override per-request via
		 * `streaming: true`.
		 *
		 * Matches, after the query string is removed: `/api/stream` (also as a
		 * path suffix), `/hooks` and anything under `/hooks/`, and
		 * `/api/auth/events` with or without a trailing path.
		 *
		 * @param {string} subPath
		 * @returns {boolean}
		 */
		function isStreamingPath(subPath) {
		  // Strip query string before matching.
		  const p = subPath.split('?')[0];
		  return (
		    p === '/api/stream' ||
		    p.endsWith('/api/stream') ||
		    p.startsWith('/hooks/') ||
		    p === '/hooks' ||
		    /^\/api\/auth\/events(\/|$)/.test(p)
		  );
		}

		/**
		 * Proxy an incoming request to a per-world CP, injecting X-Olam-Secret.
		 *
		 * Forwards: method, path (subPath), body bytes, ALL request headers
		 * EXCEPT `host` (rewritten) and `x-olam-secret` (overwritten with the
		 * injected secret to prevent client spoofing).
		 *
		 * Returns: status code, ALL response headers (verbatim — D8 contract
		 * forwards Set-Cookie, Location, etc. unchanged), body bytes streamed
		 * via Node's http.IncomingMessage→ServerResponse pipe (no buffering).
		 *
		 * Origin pinning: `subPath` is resolved against `targetBase`, and a
		 * network-path reference such as `//other-host/x` would resolve to a
		 * different host. Because the secret is attached to whatever URL is
		 * requested, any sub-path that leaves `targetBase`'s origin is refused
		 * with 400 and no upstream request is made.
		 *
		 * Upstream timeout: short-request handlers (≠ SSE) get an upstream
		 * socket timeout of `timeoutMs` (defaults to DEFAULT_PROXY_TIMEOUT_MS).
		 * On expiry we abort the upstream socket and respond 504 — this
		 * converts a wedged per-world CP into a deterministic status code
		 * instead of a TCP RST that Safari surfaces as `TypeError: Load
		 * failed`. Pass `streaming: true` (or hit a path matching
		 * `isStreamingPath`) to opt out.
		 *
		 * @param {object} args
		 * @param {import('node:http').IncomingMessage} args.req
		 * @param {import('node:http').ServerResponse} args.res
		 * @param {string} args.subPath e.g., '/api/world' or '/api/stream'
		 * @param {string} args.targetBase e.g., 'http://host.docker.internal:20780'
		 * @param {string} args.secret the X-Olam-Secret value
		 * @param {(message: string) => void} [args.log]
		 * @param {number} [args.timeoutMs] per-request upstream timeout; ignored for streams
		 * @param {boolean} [args.streaming] force SSE/long-poll mode (skip timeout)
		 */
		export function proxyToWorld({
		  req,
		  res,
		  subPath,
		  targetBase,
		  secret,
		  log = console.log,
		  timeoutMs = DEFAULT_PROXY_TIMEOUT_MS,
		  streaming = false,
		}) {
		  const base = new URL(targetBase);
		  const target = new URL(subPath, base);

		  // Never let a client-controlled sub-path steer the request (and the
		  // injected secret) to a host other than the per-world CP.
		  if (target.origin !== base.origin) {
		    log(`proxy: refusing sub-path that resolves outside ${base.origin}`);
		    req.resume(); // discard any request body
		    if (!res.headersSent) {
		      res.writeHead(400, { 'Content-Type': 'application/json; charset=utf-8' });
		      res.end(JSON.stringify({
		        error: 'invalid_sub_path',
		        message: 'sub-path must stay on the per-world CP origin',
		      }));
		    } else {
		      res.end();
		    }
		    return;
		  }

		  const isStream = streaming || isStreamingPath(subPath);

		  // Build outbound headers. Filter `host` (Node will set from URL) +
		  // overwrite `x-olam-secret` (defense against client spoofing).
		  /** @type {Record<string, string | string[]>} */
		  const outHeaders = {};
		  for (const [k, v] of Object.entries(req.headers)) {
		    if (v === undefined) continue;
		    const lower = k.toLowerCase();
		    if (lower === 'host' || lower === 'x-olam-secret') continue;
		    outHeaders[k] = v;
		  }
		  outHeaders['x-olam-secret'] = secret;
		  outHeaders['x-forwarded-by'] = 'olam-host-cp';

		  // Declared ahead of the request so every handler below closes over an
		  // initialized binding.
		  /** @type {ReturnType<typeof setTimeout> | null} */
		  let timer = null;

		  const upstreamReq = http.request(
		    target,
		    {
		      method: req.method ?? 'GET',
		      headers: outHeaders,
		    },
		    (upstreamRes) => {
		      // Once headers come back from upstream, the request is no longer
		      // "stuck" — clear the timeout so a slow stream-of-body doesn't
		      // get killed mid-flight. Streaming handlers that intentionally
		      // delay between writes still rely on the no-timeout path.
		      if (timer !== null) {
		        clearTimeout(timer);
		        timer = null;
		      }
		      // Verbatim passthrough: status + ALL headers + body bytes.
		      // Use res.writeHead so the headers go out atomically with the
		      // status line (response.statusCode + setHeader split would race
		      // on early body write). statusMessage may be undefined on some
		      // upstream paths — fall back to the default.
		      res.writeHead(
		        upstreamRes.statusCode ?? 502,
		        upstreamRes.statusMessage,
		        upstreamRes.headers,
		      );
		      upstreamRes.pipe(res);
		    },
		  );

		  if (!isStream && timeoutMs > 0) {
		    timer = setTimeout(() => {
		      timer = null;
		      log(`proxy: upstream timeout (${timeoutMs}ms) for ${target}`);
		      // Destroying the upstream req triggers the 'error' handler with
		      // a generic socket error; we pre-empt it with an explicit 504
		      // first so the client sees a clean status instead of the generic
		      // 502 the error handler would emit.
		      if (!res.headersSent) {
		        res.writeHead(504, { 'Content-Type': 'application/json; charset=utf-8' });
		        res.end(JSON.stringify({
		          error: 'upstream_timeout',
		          message: `per-world CP did not respond within ${timeoutMs}ms`,
		          worldUrl: target.origin,
		        }));
		      } else {
		        res.end();
		      }
		      try {
		        upstreamReq.destroy(new Error('proxy upstream timeout'));
		      } catch {
		        // already destroyed
		      }
		    }, timeoutMs);
		  }

		  // Upstream connection error — don't leak internals to the client.
		  upstreamReq.on('error', (err) => {
		    if (timer !== null) {
		      clearTimeout(timer);
		      timer = null;
		    }
		    log(`proxy: upstream error for ${target}: ${err.message}`);
		    if (!res.headersSent) {
		      res.writeHead(502, { 'Content-Type': 'application/json; charset=utf-8' });
		      res.end(JSON.stringify({
		        error: 'upstream_unreachable',
		        message: 'per-world CP did not respond',
		        worldUrl: target.origin,
		      }));
		    } else {
		      // Response already started (likely SSE); just close.
		      res.end();
		    }
		  });

		  // Client closed early (browser navigated away, Safari unloaded the
		  // EventSource, etc.). Tear down the upstream so we don't keep an
		  // open socket to the per-world CP for an answer the caller no longer
		  // wants. Without this, host-cp leaks sockets per cancelled poll.
		  res.on('close', () => {
		    if (timer !== null) {
		      clearTimeout(timer);
		      timer = null;
		    }
		    if (!upstreamReq.destroyed) {
		      try {
		        upstreamReq.destroy();
		      } catch {
		        // already gone
		      }
		    }
		  });

		  // Pipe request body. For GET/HEAD this is a no-op (no body bytes);
		  // for POST/PUT/PATCH this streams the body upstream.
		  req.pipe(upstreamReq);
		}

-68

host-cp/src/pylon-worlds-source.mjs

		/**
		* Phase E3 (olam-dogfood-vision): PylonWorldsSource skeleton.
		*
		* Stub implementation of the WorldsSource contract (E1) for Pylon-
		* managed cloud worlds. Returns `[]` for now — the actual @pleri/pylon
		* SDK integration is intentionally deferred (T5 mitigation: design the
		* contract before the SDK lands so consumers don't churn when it does).
		*
		* The class proves the interface composes: E4 wires this alongside
		* LocalWorldsSource into the GET /api/worlds handler so a Pylon-enabled
		* deployment fans out across both sources, dedupes by id, and returns
		* the union. With this stub returning `[]`, an enabled-but-empty Pylon
		* source is a strict no-op over local-only behavior.
		*
		* Activation: gated by `OLAM_HOST_CP_PYLON_ENABLED=1`. When the env
		* var is unset/0/false, server.mjs (E4) does NOT instantiate this
		* source — the local-only path is preserved verbatim. When enabled,
		* the empty source layers additively on top of local; behavior is
		* still observably identical until the SDK ships.
		*
		* Why a no-op stub instead of waiting for the SDK:
		* - Consumers (SPA badge logic in E5, regression tests, CLI lookup)
		* can be wired against the contract without blocking on the SDK.
		* - Forces E4's composition logic to actually fan out, dedupe, and
		* merge — exercising the multi-source path in CI before any cloud
		* traffic touches it.
		* - Surface-area lock-in: anything missing here surfaces as a
		* contract gap NOW, not after the SDK is wired.
		*
		* @typedef {import('./worlds-source.mjs').WorldsSource} WorldsSource
		* @typedef {import('./worlds-source.mjs').WorldSummary} WorldSummary
		*/

		/**
		* @typedef {object} PylonWorldsSourceDeps
		* @property {boolean} enabled
		* When false, list() short-circuits to `[]` without any Pylon
		* interaction. Kept on the deps object (rather than read from
		* process.env at construction time) so tests can flip it without
		* mutating module-level env state.
		*/

		/**
		 * Create the Pylon-cloud worlds source (stub).
		 *
		 * `list()` currently resolves to a fresh empty array whether or not
		 * `deps.enabled` is set; the flag only marks where the SDK call will go.
		 *
		 * @param {PylonWorldsSourceDeps} [deps] Defaults to `{ enabled: false }`.
		 * @returns {WorldsSource}
		 */
		export function createPylonWorldsSource(deps = { enabled: false }) {
		  // `deps.enabled` is read inside every call (not captured here), so a
		  // caller holding the deps object can flip the flag after creation.
		  const list = async () => {
		    /** @type {WorldSummary[]} */
		    const worlds = [];
		    if (deps.enabled) {
		      // TODO(pylon): wire @pleri/pylon SDK. Expected shape:
		      //   const client = new PylonClient({ token: scopedToken });
		      //   const cloudWorlds = await client.worlds.list();
		      //   return cloudWorlds.map((w) => ({
		      //     id: w.id,
		      //     name: w.displayName ?? null,
		      //     status: mapPylonStatus(w.state), // 'running' | 'starting' | ...
		      //     services: mapPylonServices(w.services),
		      //     source: 'pylon-cloud',
		      //   }));
		      // Until the SDK lands the enabled branch adds nothing, so the
		      // mapping shape is not committed prematurely.
		    }
		    return worlds;
		  };

		  return { name: 'pylon-cloud', list };
		}

-67

host-cp/src/redact.mjs

		// Phase F-2-B (B6): redact sensitive keys from workspace YAML before
		// exposing via /api/workspaces.
		//
		// T11 mitigation. Workspace YAMLs may contain operator-set environment
		// variables that include OAuth client secrets, API keys, deployment
		// tokens, database passwords. These should NEVER cross the host-cp ↔
		// browser boundary.
		//
		// Strategy: pattern-based recursive redaction. Any object key matching
		// SENSITIVE_KEY_PATTERN replaces its value with `[redacted]`. Catches
		// the standard naming conventions while remaining permissive on
		// non-sensitive keys (we don't false-positive on legitimate config).
		//
		// The pattern is intentionally broad — it's defensive. If an operator
		// names a non-sensitive var with a `_KEY`/`_SECRET`/`_TOKEN`/`_PASSWORD`/
		// `_CREDENTIALS` suffix, it gets redacted. Operators get a clear signal
		// (the value becomes `[redacted]`) and can rename the var if needed.
		//
		// We deliberately do NOT use the `PROTECTED_ENV_KEYS` set from
		// packages/core/src/world/env-setup.ts — that set is for service-
		// discovery host/port/URL keys (POSTGRES_HOST, REDIS_URL, etc.), not
		// org secrets. The two filters address different surfaces:
		// - PROTECTED_ENV_KEYS in core: prevents manifest from overriding
		// service-discovery state on the world's runtime env
		// - SENSITIVE_KEY_PATTERN here: prevents the host CP API from leaking
		// org secrets to the browser
		// Both are needed.

		// Case-insensitive SUFFIX match on a key name. The alternatives are joined
		// with regex alternation (`|`); an escaped pipe would instead demand a
		// literal `|` character in the key, so nothing would ever be redacted.
		// `._KEY`-style alternatives require at least one character before the
		// underscore, so a bare `KEY` or `AUTH` is not treated as sensitive.
		export const SENSITIVE_KEY_PATTERN = /(._KEY|._SECRET|._TOKEN|._PASSWORD|._CREDENTIALS|._AUTH|API_KEY|PASSWORD|SECRET|TOKEN)$/i;

		/**
		 * Recursively redact sensitive values in any JSON-like structure
		 * (objects, arrays, primitives). Returns a new value; does not mutate
		 * the input.
		 *
		 * - Arrays are mapped element by element.
		 * - For objects, every own enumerable key is copied. When the key matches
		 *   SENSITIVE_KEY_PATTERN the whole value (even a nested object) becomes
		 *   the string `[redacted]`; otherwise the value is redacted recursively.
		 * - Primitives and `null` are returned unchanged.
		 *
		 * @param {unknown} value
		 * @returns {unknown}
		 */
		export function redactSensitive(value) {
		  if (Array.isArray(value)) {
		    return value.map((item) => redactSensitive(item));
		  }
		  if (value === null || typeof value !== 'object') {
		    return value;
		  }
		  // Object.fromEntries defines each key as an own data property. A plain
		  // `out[key] = …` assignment would route an own `__proto__` key (which
		  // JSON.parse can produce) through the prototype setter: the key would
		  // vanish from the copy and the copy's prototype would be replaced.
		  return Object.fromEntries(
		    Object.entries(value).map(([key, inner]) => [
		      key,
		      SENSITIVE_KEY_PATTERN.test(key) ? '[redacted]' : redactSensitive(inner),
		    ]),
		  );
		}

		/**
		 * Predicate form of the redaction rule: reports whether a key name
		 * matches SENSITIVE_KEY_PATTERN. Handy for pre-screening while iterating
		 * large maps.
		 *
		 * @param {string} key
		 * @returns {boolean}
		 */
		export function isSensitiveKey(key) {
		  const looksSensitive = SENSITIVE_KEY_PATTERN.test(key);
		  return looksSensitive;
		}

-159

host-cp/src/redirect.mjs

		// redirect.mjs — Phase B3 (plan-chat-spa-supersedes-control-plane).
		//
		// 301 redirect layer that fronts host-cp's HTTP handler. Maps legacy
		// control-plane routes that get deleted in Phase B4 onto their canonical
		// successors so live URLs in operator history / bookmarks / Slack do not
		// 404 after the deletion lands.
		//
		// Redirect rules (allow-listed; closed set):
		//
		// /plan/:id → no-op (falls through to SPA shell;
		// plan-chat-spa-side router handles the
		// resolver dispatch via useResolveId).
		// Implemented as a sentinel so callers
		// can short-circuit but the request
		// continues to static-serve.
		// /world/:id → 301 /worlds?highlight=:id
		// /sandbox/:id → 301 /worlds?highlight=:id
		// /session/:worldId/plan → 301 /plan/:worldId
		//
		// EXPLICITLY NOT REDIRECTED (more-specific routes still owned by
		// control-plane until Phase E):
		// /world/:id/editor /world/:id/events
		// /sandbox/:id/editor /sandbox/:id/events
		// /inbox/* /workspaces/*
		// /repos /runbooks /design
		//
		// Security (per plan-chat-spa-supersedes-control-plane.md K1 SEC-2):
		// - Redirect targets are HARDCODED prefixes (`/plan/`, `/worlds`). No
		// caller-supplied target is ever reflected into Location.
		// - `:id` segment is validated against RESOLVE_ID_RE before any
		// reflection into the Location header; invalid shapes → 400, not
		// 301. This kills open-redirect / response-splitting / header-
		// injection vectors at the door.
		// - `highlight=<id>` query param uses the SAME shape regex. We do not
		// trust the inbound URL beyond the regex match (no decoding, no
		// surrogate pair handling).
		//
		// Returns one of:
		// { kind: 'redirect', status: 301, location: '<target>' }
		// { kind: 'bad-request', status: 400, message: '<reason>' }
		// { kind: 'passthrough' } — caller continues normal request flow

		import { RESOLVE_ID_RE } from './resolver.mjs';

		/**
		 * Compute the redirect verdict for a given pathname. Pure function:
		 * it only inspects its argument and never touches a request or response.
		 *
		 * Rules are checked in this order; the first that applies wins:
		 *   1. `/session/:worldId/plan[/]` → 301 `/plan/:worldId`
		 *   2. `/design[/]`               → 301 `/`
		 *   3. `/(world|sandbox)/:id[/...]` → 301 `/worlds?highlight=:id`, except
		 *      `/editor` and `/events` sub-routes, which pass through.
		 * Ids that fail RESOLVE_ID_RE yield a 400 verdict instead of a redirect,
		 * so nothing unvalidated is ever reflected into a Location value.
		 *
		 * @param {string} pathname - URL.pathname (no querystring, no hash)
		 * @returns {{ kind: 'redirect', status: 301, location: string }
		 *   | { kind: 'bad-request', status: 400, message: string }
		 *   | { kind: 'passthrough' }}
		 */
		export function evaluateRedirect(pathname) {
		  if (typeof pathname !== 'string' || pathname.length === 0) {
		    return { kind: 'passthrough' };
		  }

		  // /session/:worldId/plan → /plan/:worldId
		  // Checked BEFORE the world/sandbox catch-all. The trailing `/plan` is
		  // fixed; only the worldId varies.
		  const sessionMatch = /^\/session\/([^/]+)\/plan\/?$/.exec(pathname);
		  if (sessionMatch) {
		    const worldId = sessionMatch[1];
		    if (!RESOLVE_ID_RE.test(worldId)) {
		      return {
		        kind: 'bad-request',
		        status: 400,
		        message: 'invalid worldId shape on /session/:worldId/plan',
		      };
		    }
		    return {
		      kind: 'redirect',
		      status: 301,
		      location: `/plan/${worldId}`,
		    };
		  }

		  // /design → / (Phase E2: the DesignSurface alpha placeholder is retired.)
		  // Hardcoded target — no caller reflection. Exact match only, so
		  // /designfoo or /design/sub do not over-match into the redirect.
		  if (pathname === '/design' || pathname === '/design/') {
		    return { kind: 'redirect', status: 301, location: '/' };
		  }

		  // /world/:id and /sandbox/:id (catch-all, EXCLUDING the /editor and
		  // /events sub-routes). The group must be a real alternation: with an
		  // escaped pipe it would only match the literal text "world|sandbox".
		  const worldMatch = /^\/(world|sandbox)\/([^/]+)(\/.*)?$/.exec(pathname);
		  if (worldMatch) {
		    const [, , id, rest] = worldMatch;
		    // KEEP these — control-plane still owns them until Phase E.
		    if (rest === '/editor' || rest === '/events' ||
		        rest?.startsWith('/editor/') || rest?.startsWith('/events/')) {
		      return { kind: 'passthrough' };
		    }
		    if (!RESOLVE_ID_RE.test(id)) {
		      return {
		        kind: 'bad-request',
		        status: 400,
		        message: 'invalid id shape on /(world|sandbox)/:id',
		      };
		    }
		    return {
		      kind: 'redirect',
		      status: 301,
		      location: `/worlds?highlight=${encodeURIComponent(id)}`,
		    };
		  }

		  // /plan/:id is intentionally passthrough — the SPA shell serves it and
		  // the SPA-side router decides what to mount. Emitting a 301 here would
		  // be a self-loop, so no rule exists for it.
		  return { kind: 'passthrough' };
		}

		/**
		 * Write a redirect verdict onto a node:http ServerResponse.
		 *
		 * The return value tells the caller whether the response is finished:
		 * `true` means headers and body were written (stop processing), `false`
		 * means nothing was written and the normal request flow should continue.
		 *
		 * @param {import('node:http').ServerResponse} res
		 * @param {ReturnType<typeof evaluateRedirect>} verdict
		 * @returns {boolean} true if response was sent, false to passthrough.
		 */
		export function applyRedirect(res, verdict) {
		  switch (verdict.kind) {
		    case 'redirect': {
		      const { location } = verdict;
		      res.writeHead(301, {
		        Location: location,
		        // Short cache so bookmarks update once but operator-local mistakes
		        // (typo'd URL) don't pin to a stale redirect forever.
		        'Cache-Control': 'public, max-age=300',
		        'Content-Type': 'text/plain; charset=utf-8',
		      });
		      res.end(`Moved permanently: ${location}\n`);
		      return true;
		    }
		    case 'bad-request': {
		      const payload = { error: 'bad-request', message: verdict.message };
		      res.writeHead(400, {
		        'Content-Type': 'application/json; charset=utf-8',
		        'Cache-Control': 'no-store',
		      });
		      res.end(JSON.stringify(payload));
		      return true;
		    }
		    default:
		      // 'passthrough' and any unknown verdict shape: write nothing and let
		      // the caller carry on.
		      return false;
		  }
		}

-121

host-cp/src/resolver.mjs

		// resolver.mjs — Phase A A1 (plan-chat-spa-supersedes-control-plane).
		//
		// Disambiguates a single opaque :id supplied on /plan/:id between
		//
		// - a planning session (planning_sessions.session_id), or
		// - a crystallized world (planning_artifacts.crystallized_world_id), or
		// - unresolvable (returns {kind:'unresolved', canonical_id:null}).
		//
		// Used by plan-chat-spa's useResolveId hook (Phase A A2) so the SPA's
		// cold-open path can mount the correct surface without trusting the
		// id-shape (sentinel `sess_*` prefix is a hint, not authority — see
		// plan-chat-spa-supersedes-control-plane.md K1 SEC-1).
		//
		// Single SQL query (UNION ALL) so resolution costs one round-trip even
		// when the id misses both tables. Bearer auth + rate-limit live in the
		// HTTP handler in plan-chat-service.mjs; this helper is pool-pure for
		// unit testability.

		/**
		* Validate the resolver :id shape. Mirrors plan-chat-service.mjs's
		* SCOPE_ID_RE; tightened to 6-80 chars so an enumeration attacker can't
		* grind through 1-5 char shapes.
		*
		* Accepts only ASCII letters, digits, `.`, `_` and `-`. The pattern is
		* anchored at both ends, so the WHOLE string must match, and it carries
		* no `g`/`y` flag, so repeated `.test()` calls are stateless.
		*/
		export const RESOLVE_ID_RE = /^[A-Za-z0-9._-]{6,80}$/;

		/**
		 * Resolve an opaque id against the chunks substrate.
		 *
		 * Ids that are not strings or fail RESOLVE_ID_RE resolve to `unresolved`
		 * WITHOUT touching the pool. Otherwise exactly one query is issued; its
		 * first row (if any) decides the verdict. Rows with an unexpected `kind`
		 * or an empty / non-string `canonical_id` are also reported as unresolved.
		 *
		 * @param {{ query: (sql: string, params: unknown[]) => Promise<{ rows: unknown[] }> }} pool
		 *   A pg-shaped pool. Tests pass a stub; production passes pg.Pool.
		 * @param {string} id The candidate id.
		 * @returns {Promise<{ kind: 'session' | 'world' | 'unresolved', canonical_id: string | null }>}
		 *   Rejects with whatever `pool.query` rejects with.
		 */
		export async function resolveId(pool, id) {
		  if (typeof id !== 'string' || !RESOLVE_ID_RE.test(id)) {
		    return { kind: 'unresolved', canonical_id: null };
		  }

		  // Single round-trip. Both branches return the same shape
		  // (kind, canonical_id) so PG can UNION ALL them without coercion.
		  //
		  // Session branch wins on tie (rank 1 sorts first, LIMIT 1 keeps it),
		  // which makes the outcome deterministic if a session id ever collides
		  // with a world id.
		  const sql = `
		SELECT kind, canonical_id FROM (
		SELECT 'session' AS kind, session_id AS canonical_id, 1 AS rank
		FROM planning_sessions
		WHERE session_id = $1
		UNION ALL
		SELECT 'world' AS kind, crystallized_world_id AS canonical_id, 2 AS rank
		FROM planning_artifacts
		WHERE crystallized_world_id = $1
		) AS resolved
		ORDER BY rank
		LIMIT 1
		`;

		  const result = await pool.query(sql, [id]);
		  const row = result.rows && result.rows[0];
		  if (!row) return { kind: 'unresolved', canonical_id: null };

		  // Pool stub-friendly: accept upper-case column names as a fallback.
		  const kind = row.kind ?? row.KIND;
		  const canonical_id = row.canonical_id ?? row.CANONICAL_ID;
		  if (kind !== 'session' && kind !== 'world') {
		    return { kind: 'unresolved', canonical_id: null };
		  }
		  if (typeof canonical_id !== 'string' || canonical_id.length === 0) {
		    return { kind: 'unresolved', canonical_id: null };
		  }
		  return { kind, canonical_id };
		}

		/**
		 * Build a token-bucket rate limiter keyed by caller (the bearer
		 * principal, per the original design notes). Closes the enumeration
		 * vector that bearer auth alone leaves open: an authenticated caller
		 * cannot grind through ids at line-rate.
		 *
		 * Every key gets its own bucket that starts full (`capacity` tokens) and
		 * refills continuously at `capacity` tokens per `windowMs`. Defaults give
		 * 60 requests per minute. State is a single in-process Map; a
		 * multi-instance deployment would need a shared store.
		 *
		 * @param {object} [options]
		 * @param {number} [options.capacity] bucket size / burst limit (default 60)
		 * @param {number} [options.windowMs] time to refill a whole bucket (default 60000)
		 * @param {() => number} [options.now] clock in ms, injectable for tests
		 * @returns {{ take: (key: unknown) => { allowed: boolean, retryAfterMs: number }, reset: () => void }}
		 */
		export function createRateLimiter({
		  capacity = 60,
		  windowMs = 60_000,
		  now = () => Date.now(),
		} = {}) {
		  // key -> { tokens, lastRefill }
		  const bucketsByKey = new Map();

		  // Spend one token for `key`, or report how long until one is available.
		  const take = (key) => {
		    const timestamp = now();

		    let state = bucketsByKey.get(key);
		    if (!state) {
		      state = { tokens: capacity, lastRefill: timestamp };
		      bucketsByKey.set(key, state);
		    }

		    // Top up in proportion to the time since the last refill; a clock
		    // that has not advanced leaves the bucket untouched.
		    const sinceRefill = timestamp - state.lastRefill;
		    if (sinceRefill > 0) {
		      const earned = (sinceRefill / windowMs) * capacity;
		      state.tokens = Math.min(capacity, state.tokens + earned);
		      state.lastRefill = timestamp;
		    }

		    if (state.tokens < 1) {
		      const msPerToken = windowMs / capacity;
		      return {
		        allowed: false,
		        retryAfterMs: Math.ceil((1 - state.tokens) * msPerToken),
		      };
		    }

		    state.tokens -= 1;
		    return { allowed: true, retryAfterMs: 0 };
		  };

		  // Forget every bucket; each key starts full again on its next take().
		  const reset = () => {
		    bucketsByKey.clear();
		  };

		  return { take, reset };
		}

-168

host-cp/src/router.mjs

		// host-cp request router.
		//
		// Replaces the long linear `if (url.pathname === ...)` dispatch chain in
		// server.mjs with an ordered route table. The table is walked in
		// registration order, so route PRECEDENCE is preserved exactly as it was
		// in the original if-ladder: the first matching route wins, later routes
		// are never consulted once a match handles the request.
		//
		// Why a table and not a framework:
		// - host-cp ships with no external HTTP framework (no express/fastify);
		// this matches the existing zero-dep style.
		// - The table is a plain data structure, so it is importable + unit
		// testable WITHOUT booting server.mjs (which spawns docker-events,
		// the auth poller, and the worlds.db reconciler at import time).
		// - A route is now a table entry instead of a `return` buried in a
		// 1700-line ladder. That kills the silent route-shadowing class: a
		// misplaced `return` can no longer swallow a later route, and the
		// full set of routes is enumerable (see `router.list()`).
		//
		// Behavior-preservation contract (load-bearing — see
		// __tests__/router.test.mjs):
		// 1. Walk order == registration order == original source order.
		// 2. A route MATCHES when its matcher returns a truthy match value AND
		// (no method filter OR the method matches). The matcher receives
		// ({ pathname, method, url }) and returns either a boolean or, for
		// regex routes, the RegExpMatchArray (truthy) so the handler can read
		// capture groups.
		// 3. The FIRST matching route is invoked and dispatch STOPS — identical
		// to `if (cond) { ...; return; }`. The handler owns the response.
		// 4. A route whose path matches but whose METHOD does not is SKIPPED,
		// and the walk continues — identical to the original
		// `if (pathMatch && req.method === 'X')` blocks, where a path hit
		// with the wrong method fell through to the next `if`.
		// 5. If no route matches, dispatch returns `false` so the caller runs
		// its terminal 404 — identical to the original fall-through.
		//
		// The router does NOT add auth, body parsing, or any middleware semantics.
		// Those stay exactly where they were in server.mjs (pre-auth routes, the
		// auth gate, the plan-chat bypass) — the router only models the part of
		// the chain that was a flat sequence of `if` blocks.

		/**
		* @typedef {object} RouteContext
		* @property {string} pathname url.pathname
		* @property {string} method req.method (already normalized by node to uppercase)
		* @property {URL} url parsed request URL
		*/

		/**
		* A matcher decides whether a route applies to a request, ignoring method.
		* Returning a non-boolean truthy value (e.g. a RegExpMatchArray) is
		* forwarded to the handler as `ctx.match` so regex routes can read groups.
		*
		* @typedef {(ctx: RouteContext) => (boolean \| RegExpMatchArray \| null \| undefined)} RouteMatcher
		*/

		/**
		* A handler receives the node req/res plus the parsed url, the matched
		* value (for regex routes), and is responsible for writing the response.
		* It mirrors the body of an original `if` block. Return value is ignored;
		* matching alone terminates dispatch (preserving the `if ... return`
		* semantics where reaching the block always handled the request).
		*
		* @typedef {(req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse, ctx: RouteContext & { match: any }) => unknown \| Promise<unknown>} RouteHandler
		*/

		/**
		* @typedef {object} Route
		* @property {string} name human label for diagnostics / tests
		* @property {string[] \| null} methods allowed methods, or null for "any method"
		* @property {RouteMatcher} match
		* @property {RouteHandler} handler
		*/

		/**
		 * Create an ordered router. Routes are matched in the order they are
		 * registered — register in the SAME order the original if-ladder ran.
		 *
		 * @returns {{
		 *   register: (spec: object) => object,
		 *   dispatch: (req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse, url: URL) => Promise<boolean>,
		 *   list: () => Array<{ name: string, methods: string[] | null }>,
		 *   readonly size: number,
		 * }}
		 */
		export function createRouter() {
		  /** @type {Route[]} */
		  const routes = [];

		  /**
		   * Register a route. Returns the router for chaining.
		   *
		   * Matcher precedence when several are supplied: `match`, then `path`,
		   * then `prefix`, then `pattern`.
		   *
		   * @param {object} spec
		   * @param {string} spec.name
		   * @param {string | string[] | null} [spec.method] single method, list, or null/omitted for any
		   * @param {string} [spec.path] exact pathname match
		   * @param {string} [spec.prefix] pathname.startsWith(prefix) match
		   * @param {RegExp} [spec.pattern] pathname.match(pattern) — match value passed to handler
		   * @param {RouteMatcher} [spec.match] custom matcher (overrides path/prefix/pattern)
		   * @param {RouteHandler} spec.handler
		   * @throws {TypeError} when the handler is not a function or no matcher is given
		   */
		  function register(spec) {
		    const { name, method, path, prefix, pattern } = spec;
		    const handler = spec.handler;
		    if (typeof handler !== 'function') {
		      throw new TypeError(`route "${name}" requires a handler function`);
		    }

		    /** @type {string[] | null} */
		    let methods = null;
		    // Copy a caller-supplied list so later edits to it cannot change the route.
		    if (Array.isArray(method)) methods = method.slice();
		    else if (typeof method === 'string') methods = [method];
		    // method omitted or null → any method

		    /** @type {RouteMatcher} */
		    let match;
		    if (typeof spec.match === 'function') {
		      match = spec.match;
		    } else if (typeof path === 'string') {
		      match = (ctx) => ctx.pathname === path;
		    } else if (typeof prefix === 'string') {
		      match = (ctx) => ctx.pathname.startsWith(prefix);
		    } else if (pattern instanceof RegExp) {
		      match = (ctx) => ctx.pathname.match(pattern);
		    } else {
		      throw new TypeError(
		        `route "${name}" requires one of: path, prefix, pattern, or match`,
		      );
		    }

		    routes.push({ name, methods, match, handler });
		    return api;
		  }

		  /**
		   * Walk the table in registration order. Invokes the first route whose
		   * matcher is truthy AND whose method filter admits the request, then
		   * stops. A path-match with a non-admitted method is skipped (the walk
		   * continues), preserving the original `if (pathMatch && method===X)`
		   * fall-through. Errors thrown or rejected by the handler propagate to
		   * the caller.
		   *
		   * @param {import('node:http').IncomingMessage} req
		   * @param {import('node:http').ServerResponse} res
		   * @param {URL} url
		   * @returns {Promise<boolean>} true if a route handled the request, false to fall through to 404
		   */
		  async function dispatch(req, res, url) {
		    const ctx = { pathname: url.pathname, method: req.method ?? 'GET', url };
		    for (const route of routes) {
		      const matched = route.match(ctx);
		      if (!matched) continue;
		      // Path matched. Now gate on method — a mismatch is a SKIP, not a
		      // 405, exactly mirroring the original if-ladder fall-through.
		      if (route.methods !== null && !route.methods.includes(ctx.method)) {
		        continue;
		      }
		      await route.handler(req, res, { ...ctx, match: matched });
		      return true;
		    }
		    return false;
		  }

		  /**
		   * Enumerate registered routes (name + allowed methods) for diagnostics,
		   * audits, and tests. Pure read of the table: each `methods` array is a
		   * copy, so mutating the result cannot alter a route's method filter.
		   *
		   * @returns {Array<{ name: string, methods: string[] | null }>}
		   */
		  function list() {
		    return routes.map((r) => ({
		      name: r.name,
		      methods: r.methods === null ? null : [...r.methods],
		    }));
		  }

		  const api = { register, dispatch, list, get size() { return routes.length; } };
		  return api;
		}

-104

host-cp/src/secret-cache.mjs

		// Phase F-2-B (B3): per-world secret cache.
		//
		// Pattern lifted from `packages/cloudflare-worker/src/index.ts:428-446`
		// (`getContainerSecret`). CF Worker uses Durable Object storage with a
		// 1h TTL; host CP uses an in-memory Map with a 5min TTL (D2 — demoted
		// from 1h after the security review pass).
		//
		// The cache invalidates on two paths:
		// 1. TTL expiry (lazy: checked on each `get(worldId)` call)
		// 2. Docker events stream (eager: docker-events.mjs subscribes to
		// `restart` / `stop` events and calls `invalidate(worldId)` —
		// M2 ship gate is "docker restart <world>; within 10s, proxy
		// call returns 200 not 401").

		/**
		* @typedef {object} CacheEntry
		* @property {string} secret
		* @property {number} expiresAt epoch ms
		*/

		export class SecretCache {
		  /**
		   * In-memory per-world secret cache with a fixed TTL.
		   *
		   * @param {object} opts
		   * @param {number} [opts.ttlSec] cache TTL in seconds (default 300 = 5min)
		   * @param {() => number} [opts.now] clock injectable for tests
		   * @param {(message: string) => void} [opts.log] logger injectable for tests
		   */
		  constructor({ ttlSec = 300, now = Date.now, log = console.log } = {}) {
		    this.ttlMs = ttlSec * 1000;
		    /** @type {Map<string, CacheEntry>} */
		    this.entries = new Map();
		    this.now = now;
		    this.log = log;
		  }

		  /**
		   * Look up a cached secret. Returns null if absent OR expired (TTL check
		   * is lazy — caller must re-fetch and call set()). An expired entry is
		   * removed from the map on the way out. Nothing is logged on this path;
		   * per-call logging would be noisy under load.
		   *
		   * @param {string} worldId
		   * @returns {string | null}
		   */
		  get(worldId) {
		    const entry = this.entries.get(worldId);
		    if (!entry) return null;
		    // Fail closed: the entry is live only while `now < expiresAt` is
		    // positively true. Written as a negated `<` so that a non-finite
		    // expiresAt (e.g. a NaN TTL from an unparsed setting) counts as
		    // expired instead of keeping the secret cached forever.
		    if (!(this.now() < entry.expiresAt)) {
		      this.entries.delete(worldId);
		      return null;
		    }
		    return entry.secret;
		  }

		  /**
		   * Cache a freshly-fetched secret. Overrides any prior entry. The
		   * `set` path is the only place TTL is reset — ensures a cache hit
		   * never extends beyond ttlMs from the most recent fetch.
		   *
		   * @param {string} worldId
		   * @param {string} secret
		   */
		  set(worldId, secret) {
		    this.entries.set(worldId, {
		      secret,
		      expiresAt: this.now() + this.ttlMs,
		    });
		  }

		  /**
		   * Eager invalidation (intended for docker `restart` / `stop` events).
		   * Logs only when an entry was actually dropped. Returns true if an
		   * entry was present (test-observable).
		   *
		   * @param {string} worldId
		   * @returns {boolean}
		   */
		  invalidate(worldId) {
		    const had = this.entries.has(worldId);
		    if (had) {
		      this.entries.delete(worldId);
		      this.log(`secret-cache: invalidated ${worldId}`);
		    }
		    return had;
		  }

		  /**
		   * Drop everything. Used at shutdown for clean teardown; also useful
		   * in tests.
		   */
		  clear() {
		    this.entries.clear();
		  }

		  /**
		   * Snapshot of cached worldIds (for /health diagnostics + tests).
		   * Returns just the keys — never the secrets themselves. Entries that
		   * have expired but have not been read since may still be listed,
		   * because expiry is only applied lazily in get().
		   *
		   * @returns {string[]}
		   */
		  worldIds() {
		    return [...this.entries.keys()];
		  }
		}

-85

host-cp/src/serve-only-config.mjs

		// serve-only-config.mjs — host-cp SERVE-ONLY mode gate (Phase A of
		// host-cp-gke-serve-only-mode).
		//
		// host-cp normally runs as a local operator sidecar coupled to the host's
		// docker daemon + operator-repo + gh-config. On a managed GKE cluster those
		// host-couplings are absent: host-cp only serves plan-chat-spa + the
		// host-native `/api/*` surface; world orchestration runs elsewhere.
		//
		// `OLAM_HOST_CP_SERVE_ONLY=true` switches host-cp into that degraded shape:
		// - no docker transport connect, no world discovery
		// - no PlanOrchestrator docker wiring, no pr-merge-poller docker/repo deps
		// - world-orchestration routes (`/api/world/*`) return a structured 503
		// - version-status degrades to 'unknown' (no operator-repo)
		//
		// The flag defaults OFF — the local docker/k3d FULL mode is byte-for-byte
		// unchanged. This module is a tiny pure seam so the gate decision can be
		// unit-tested WITHOUT booting server.mjs (which connects docker + binds a
		// port at module load and therefore can't be imported in a test).
		//
		// ONE coarse flag — no granular per-subsystem toggles (plan S1 / YAGNI).

		/**
		 * Report whether host-cp should run in SERVE-ONLY mode.
		 *
		 * The parse is deliberately strict: only the exact string `'true'` in
		 * `OLAM_HOST_CP_SERVE_ONLY` turns the mode on. Every other value —
		 * unset, `'1'`, `'false'`, `''`, `'TRUE'` — keeps FULL mode, so the
		 * default stays OFF and the flag cannot be half-enabled by accident.
		 *
		 * @param {NodeJS.ProcessEnv | Record<string, string | undefined>} [env]
		 *   Environment to read the flag from. Defaults to `process.env`; a
		 *   `null` env is tolerated and reads as "not set".
		 * @returns {boolean} `true` when serve-only mode is active.
		 */
		export function isServeOnly(env = process.env) {
		  const flag = env?.OLAM_HOST_CP_SERVE_ONLY;
		  return flag === 'true';
		}

		/**
		* Structured 503 body for world-orchestration routes that are unavailable
		* in serve-only mode. Reuses the host-cp `/api/*` JSON-error shape
		* (`{ error, message }`) so SPA error handling treats it uniformly.
		*
		* The object is frozen, so importers share one instance that cannot be
		* mutated; serialize it (e.g. with JSON.stringify) when writing a response.
		*
		* @type {{ error: 'orchestration_unavailable', message: string }}
		*/
		export const ORCHESTRATION_UNAVAILABLE = Object.freeze({
		error: 'orchestration_unavailable',
		message:
		'host-cp is in serve-only mode (managed cluster); world orchestration runs elsewhere',
		});

		/**
		 * Decide whether `pathname` (+ `method`) is a world-ORCHESTRATION route
		 * that must degrade to a structured 503 in serve-only mode.
		 *
		 * Blocked (returns true):
		 *   - anything under the singular `/api/world/` prefix (per-world proxy)
		 *   - anything under the CLI `/v1/worlds/` prefix
		 *   - `/api/worlds` itself for every method other than GET/HEAD (creation)
		 *   - `/api/worlds/<id>...` per-world subpaths, any method (tunnels,
		 *     destroy, ...), so a serve-only host-cp never executes mutations
		 *
		 * Deliberately NOT blocked: `GET`/`HEAD /api/worlds` (the bare LIST
		 * endpoint) — an empty list is an honest answer in serve-only mode.
		 * Non-string pathnames are never orchestration routes.
		 *
		 * @param {unknown} pathname URL.pathname (no querystring).
		 * @param {string} [method] HTTP method (defaults 'GET').
		 * @returns {boolean}
		 */
		export function isOrchestrationRoute(pathname, method = 'GET') {
		  if (typeof pathname !== 'string') return false;

		  // Prefixes that are orchestration regardless of method: the singular
		  // per-world CP proxy and the CLI per-world routes.
		  const alwaysBlockedPrefixes = ['/api/world/', '/v1/worlds/'];
		  if (alwaysBlockedPrefixes.some((blocked) => pathname.startsWith(blocked))) {
		    return true;
		  }

		  // Bare plural collection: reads are allowed, everything else is a
		  // create/mutate and therefore blocked.
		  if (pathname === '/api/worlds') {
		    const isRead = method === 'GET' || method === 'HEAD';
		    return !isRead;
		  }

		  // Any per-world subpath under the plural collection.
		  return /^\/api\/worlds\/[^/?#]+/.test(pathname);
		}

host-cp/src/server.mjs

Sorry, the diff of this file is too big to display

-117

host-cp/src/sse-gate.mjs

		// Phase F-2-B (B5): SSE concurrent-connection gate + path detection.
		//
		// Background. Each open SSE proxy holds:
		// - A Node http.ClientRequest to the per-world CP (one fd)
		// - The browser's incoming socket (one fd)
		// Plus the Node event loop wakes on every chunk. With N worlds × M tabs
		// × Sse-per-tab, the FD budget grows linearly. P3 budgets ≤100 concurrent
		// SSE proxies; P4 caps at 50 + returns 503 with Retry-After: 30 above
		// that. Below the cap there's no impact.
		//
		// Cap semantics:
		// - acquire(res) returns a `{ release }` handle if we're allowed to
		// open; at the cap it writes the 503 itself and returns null → reject.
		// - release() is idempotent + fire-once (guarded by a `released` flag)
		// because Node emits both 'close' and 'finish' on a normal stream
		// end. Without idempotency the counter would underflow.
		//
		// SSE detection is path-based (cheap; runs before opening upstream).
		// Three patterns are SSE today (see SSE_PATH_PATTERNS below):
		// /api/stream — per-world CP's existing SSE feed
		// /api/world/<id>/bootstrap-progress — placeholder for B7's UI strip
		// (per-world CP route lands later)
		// /api/logs — also matched as an SSE path

		// Route fragments treated as SSE streams. Each fragment must be followed
		// by `/`, end-of-string, or `?`, so `/api/stream` matches but
		// `/api/streams` does not. The trailing group is a real alternation; with
		// escaped pipes it would demand a literal `|` and never match a path.
		const SSE_PATH_PATTERNS = [
		  /\/api\/stream(?:\/|$|\?)/,
		  /\/bootstrap-progress(?:\/|$|\?)/,
		  /\/api\/logs(?:\/|$|\?)/,
		];

		/**
		 * Tell whether an upstream subPath is one of the known SSE streams, i.e.
		 * whether any regex in SSE_PATH_PATTERNS matches it.
		 *
		 * Per the original notes, the subPath is the value emitted by
		 * `parseProxyPath()` — everything AFTER `/api/world/<id>` — so matching
		 * is done on the inner route, not on the `/api/world/<id>` prefix.
		 *
		 * @param {string} subPath
		 * @returns {boolean}
		 */
		export function isSsePath(subPath) {
		  for (const pattern of SSE_PATH_PATTERNS) {
		    if (pattern.test(subPath)) return true;
		  }
		  return false;
		}

		/**
		 * Counting gate for concurrent SSE proxies. `acquire()` hands out a slot
		 * (or answers 503 itself when the cap is reached) and the returned
		 * `release()` gives the slot back exactly once.
		 */
		export class SseGate {
			/**
			 * @param {object} [opts]
			 * @param {number} [opts.maxConcurrent] default 50 (P4 cap)
			 * @param {(message: string) => void} [opts.log]
			 * @throws {Error} when maxConcurrent is NaN or below 1
			 */
			constructor({ maxConcurrent = 50, log = console.log } = {}) {
				// `!(x >= 1)` rather than `x < 1`: NaN fails every comparison, so
				// `NaN < 1` would accept NaN and `active >= NaN` would then never
				// be true — silently disabling the cap.
				if (!(maxConcurrent >= 1)) {
					throw new Error('SseGate: maxConcurrent must be >= 1');
				}
				this.maxConcurrent = maxConcurrent;
				this.active = 0;
				this.log = log;
			}

			/**
			 * Try to acquire a slot. If at cap, returns null + writes a 503 to
			 * res. Caller MUST check the return value.
			 *
			 * @param {import('node:http').ServerResponse} res
			 * @returns {{ release: () => void } | null}
			 */
			acquire(res) {
				if (this.active >= this.maxConcurrent) {
					res.writeHead(503, {
						'Content-Type': 'application/json; charset=utf-8',
						'Retry-After': '30',
					});
					res.end(JSON.stringify({
						error: 'sse_capacity_reached',
						active: this.active,
						cap: this.maxConcurrent,
						retry_after_sec: 30,
						message: 'host CP has reached the SSE concurrent-connection cap. Retry after the indicated delay or close idle SPA tabs.',
					}));
					this.log(`sse-gate: 503 — cap reached (active=${this.active}, cap=${this.maxConcurrent})`);
					return null;
				}
				this.active++;
				// Per-acquire flag: repeated release() calls must not decrement
				// the counter more than once.
				let released = false;
				const release = () => {
					if (released) return;
					released = true;
					this.active--;
				};
				return { release };
			}

			/** Diagnostics for /health. */
			stats() {
				return {
					active: this.active,
					cap: this.maxConcurrent,
				};
			}
		}

		/**
		 * Hook an SSE-gate `release` callback onto a ServerResponse so the slot
		 * is returned whichever way the response terminates.
		 *
		 * The callback is registered for both 'close' and 'finish'. Either or
		 * both may fire for a single response, so `release` is expected to be
		 * idempotent (the closure returned by `SseGate#acquire` is) — this
		 * function itself does no de-duplication.
		 *
		 * @param {import('node:http').ServerResponse} res
		 * @param {() => void} release
		 */
		export function wireRelease(res, release) {
			for (const lifecycleEvent of ['close', 'finish']) {
				res.on(lifecycleEvent, release);
			}
		}

-191

host-cp/src/tasks-route.mjs

		// packages/host-cp/src/tasks-route.mjs
		//
		// B2.2: mount @olam/tasks-write-api's framework-neutral handlers under
		// /api/tasks/*. host-cp owns the pg.Pool (per D-B-19 olam-local-PG-primary);
		// wraps it via pgPoolExecutor (B2.1.1 adapter) and passes as HandlerDeps.pglite
		// (duck-typed; PgExecutor's query/exec/transaction match PGlite's shape).
		//
		// Auth model: leverages host-cp's existing StartupToken bearer gate (Authorization:
		// Bearer <token>). Per-request scopes + olamNodeId come from headers:
		// X-Olam-Node-Id: UUID of the caller's olam node (sets RLS scope per D-B-23)
		// X-Olam-Session-Id: UUID of the caller's session row (FK for task_claims)
		// X-Olam-Tasks-Scopes: comma-separated scope list (tasks-create,tasks-claim,
		// tasks-state-update,tasks-query). Trust model: bearer
		// token gates access; scope header lets the caller declare
		// narrower intent.
		//
		// Deviation from B2.2 plan spec: spec called for JWT + auth-service integration;
		// host-cp uses opaque tokens (StartupToken) and talks to auth-service over HTTP.
		// JWT scope encoding deferred to Phase D++ when multi-user auth lands; for v1,
		// the existing bearer + per-request header model is sufficient (single-operator;
		// 127.0.0.1:19000 only per host-cp threat model).

		import pg from 'pg';

		// BIGINT columns (type OID 20) are parsed into JS numbers instead of pg's
		// default string representation. The tasks schema's `version` column is
		// BIGINT but stays well within Number-safe range, and the task-store types
		// declare it as `number`. Leaving the default in place let a stray BigInt
		// propagate (PGlite returns BigInt by default), and JSON.stringify throws
		// on BigInt — that surfaced as /api/tasks 500s with "Do not know how to
		// serialize a BigInt" during the CLI E2E proof.
		const parseInt8AsNumber = (raw) => (raw == null ? null : Number.parseInt(raw, 10));
		pg.types.setTypeParser(20, parseInt8AsNumber);

		// Lazily-initialised module state (reset by _resetForTests).
		// writeApi: the dynamically imported tasks-write-api module; loaded on
		// first use to keep the cold path light.
		let writeApi = null;
		// executor: adapter built over `pool` and handed to the handlers.
		let executor = null;
		// pool: the pg.Pool owned by this module.
		let pool = null;

		// Scope names accepted in the X-Olam-Tasks-Scopes header; anything else
		// is dropped during header parsing.
		const VALID_SCOPES = new Set([
			'tasks-create',
			'tasks-claim',
			'tasks-state-update',
			'tasks-query',
		]);
		// Canonical 8-4-4-4-12 hex UUID, case-insensitive.
		const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

		/**
		 * Load `@olam/tasks-write-api` on first use and cache the module in
		 * `writeApi`.
		 *
		 * The package is built TS (ESM dist), hence the dynamic import. If it has
		 * not been built the import rejects — the operator must run
		 * `npm run build --workspace=@olam/tasks-write-api` before host-cp starts.
		 *
		 * @returns {Promise<object>} the imported module namespace
		 */
		async function ensureWriteApi() {
			if (!writeApi) {
				writeApi = await import('@olam/tasks-write-api');
			}
			return writeApi;
		}

		/**
		 * Build (once) and return the executor that wraps this module's pg.Pool.
		 *
		 * The result is cached in the module-level `executor`; without that
		 * assignment the early-return guard never fires and every request would
		 * construct a fresh pg.Pool (up to 8 connections each) that is never
		 * closed.
		 *
		 * `writeApi` must already be loaded (ensureWriteApi must have run first;
		 * tasks-route's dispatch order guarantees it).
		 *
		 * @returns {object} the value returned by `writeApi.pgPoolExecutor(pool)`
		 * @throws {Error} when OLAM_LOCAL_PG_URL is not set
		 */
		function ensureExecutor() {
			if (executor) return executor;
			const connectionString = process.env.OLAM_LOCAL_PG_URL;
			if (!connectionString) {
				throw new Error(
					'tasks-route: OLAM_LOCAL_PG_URL not set. Bring up Docker PG: docker compose -f packages/infra/docker-compose.local-electric.yml up -d, then export OLAM_LOCAL_PG_URL=postgres://postgres:olam@localhost:54331/olam_tasks',
				);
			}
			pool = new pg.Pool({ connectionString, max: 8 });
			// pgPoolExecutor comes from the same dynamically imported module.
			executor = writeApi.pgPoolExecutor(pool);
			return executor;
		}

		/**
		 * Extract the per-request auth context from the X-Olam-* headers.
		 *
		 * Scope entries are trimmed and anything not in VALID_SCOPES is dropped.
		 * The request is rejected (null) when the node id or session id is
		 * missing or not a UUID, or when no valid scope remains.
		 *
		 * @param {{ headers: Record<string, string | string[] | undefined> }} req
		 * @returns {{ olamNodeId: string, sessionId: string, scopes: string[] } | null}
		 */
		function parseAuth(req) {
			const olamNodeId = req.headers['x-olam-node-id'];
			const sessionId = req.headers['x-olam-session-id'];
			const scopesHeader = req.headers['x-olam-tasks-scopes'] ?? '';
			const scopes = String(scopesHeader)
				.split(',')
				.map((s) => s.trim())
				.filter((s) => VALID_SCOPES.has(s));
			if (!olamNodeId || !UUID_RE.test(String(olamNodeId))) return null;
			if (!sessionId || !UUID_RE.test(String(sessionId))) return null;
			if (scopes.length === 0) return null;
			return { olamNodeId: String(olamNodeId), sessionId: String(sessionId), scopes };
		}

		/**
		 * Read and JSON-parse a request body.
		 *
		 * GET/HEAD requests and empty bodies yield `{}`. A body that is not valid
		 * JSON yields the sentinel `{ __invalid: true }` so the caller can answer
		 * 400 instead of catching.
		 *
		 * @param {import('node:http').IncomingMessage} req
		 * @returns {Promise<any>} parsed JSON value, `{}`, or `{ __invalid: true }`
		 */
		async function readBody(req) {
			if (req.method === 'GET' || req.method === 'HEAD') return {};
			return new Promise((resolve, reject) => {
				// Collect raw chunks and decode once at the end. Decoding each
				// chunk on its own (`raw += chunk`) corrupts any multi-byte
				// UTF-8 character that straddles a chunk boundary.
				const chunks = [];
				req.on('data', (chunk) => {
					chunks.push(typeof chunk === 'string' ? Buffer.from(chunk) : chunk);
				});
				req.on('end', () => {
					const raw = Buffer.concat(chunks).toString('utf8');
					if (!raw) {
						resolve({});
						return;
					}
					try {
						resolve(JSON.parse(raw));
					} catch {
						resolve({ __invalid: true });
					}
				});
				req.on('error', reject);
			});
		}

		/**
		 * Write a JSON envelope as the complete HTTP response.
		 *
		 * Serialization is BigInt-safe: @olam/tasks's task-store wraps `version`
		 * (and any future BIGINT fields) in BigInt() during row→Task mapping, and
		 * plain JSON.stringify throws on BigInt. Those values stay within Number
		 * range (version starts at 0 and increments per mutation), so they are
		 * emitted as ordinary JSON numbers.
		 *
		 * @param {import('node:http').ServerResponse} res
		 * @param {number} status HTTP status code
		 * @param {object} envelope payload to serialize
		 */
		function sendEnvelope(res, status, envelope) {
			const bigintAsNumber = (_key, value) => {
				if (typeof value === 'bigint') return Number(value);
				return value;
			};
			res.statusCode = status;
			res.setHeader('Content-Type', 'application/json');
			res.end(JSON.stringify(envelope, bigintAsNumber));
		}

		/**
		 * Dispatch a /api/tasks/* request. Returns true if handled; false if route
		 * doesn't match (caller continues to next dispatcher in server.mjs).
		 *
		 * Order of checks: path prefix → lazy-load write API (500 on failure) →
		 * executor (503 on failure) → auth headers (401) → body (400) → route
		 * ladder (404 when nothing matches). Handler exceptions become 500.
		 *
		 * @param {import('node:http').IncomingMessage} req
		 * @param {import('node:http').ServerResponse} res
		 * @param {URL} url parsed request URL
		 * @returns {Promise<boolean>}
		 */
		export async function dispatchTasksRoute(req, res, url) {
			const pathname = url.pathname;
			// Claim only `/api/tasks` itself and paths beneath it. A bare
			// startsWith('/api/tasks') would also capture siblings such as
			// `/api/tasksfoo` and route them as if they were the collection.
			if (pathname !== '/api/tasks' && !pathname.startsWith('/api/tasks/')) return false;

			// Lazy initialise on first request (avoids boot-time crash when PG not up).
			let api;
			try {
				api = await ensureWriteApi();
			} catch (e) {
				sendEnvelope(res, 500, { success: false, data: null, error: `tasks-write-api unbuilt: ${e.message}` });
				return true;
			}
			let exec;
			try {
				exec = ensureExecutor();
			} catch (e) {
				sendEnvelope(res, 503, { success: false, data: null, error: e.message });
				return true;
			}

			const auth = parseAuth(req);
			if (!auth) {
				sendEnvelope(res, 401, {
					success: false,
					data: null,
					error: 'Missing or malformed X-Olam-Node-Id / X-Olam-Session-Id / X-Olam-Tasks-Scopes headers',
				});
				return true;
			}

			// A request-stream error rejects readBody; answer here instead of
			// letting the rejection escape the dispatcher.
			let body;
			try {
				body = await readBody(req);
			} catch (e) {
				sendEnvelope(res, 400, { success: false, data: null, error: `Failed to read request body: ${e?.message ?? 'stream error'}` });
				return true;
			}
			if (body && body.__invalid) {
				sendEnvelope(res, 400, { success: false, data: null, error: 'Invalid JSON body' });
				return true;
			}

			// Route matching — minimal pattern (host-cp's existing if-ladder style).
			const segments = pathname.split('/').filter(Boolean); // ['api','tasks',...]
			const ctx = { auth, params: {}, query: Object.fromEntries(url.searchParams) };
			const deps = { pglite: exec };

			try {
				let response;
				if (segments.length === 2 && req.method === 'POST') {
					response = await api.createHandler({ ...ctx, body }, deps);
				} else if (segments.length === 2 && req.method === 'GET') {
					response = await api.queryHandler({ ...ctx, body }, deps);
				} else if (segments.length === 3 && segments[2] === 'claim' && req.method === 'POST') {
					response = await api.claimHandler({ ...ctx, body }, deps);
				} else if (segments.length === 3 && segments[2] === 'distill' && req.method === 'GET') {
					response = await api.distillHandler({ ...ctx, body }, deps);
				} else if (segments.length === 4 && segments[3] === 'heartbeat' && req.method === 'POST') {
					// /api/tasks/<id>/heartbeat
					ctx.params.id = segments[2];
					response = await api.heartbeatHandler({ ...ctx, body }, deps);
				} else if (segments.length === 4 && segments[3] === 'complete' && req.method === 'POST') {
					// /api/tasks/<id>/complete
					ctx.params.id = segments[2];
					response = await api.completeHandler({ ...ctx, body }, deps);
				} else if (segments.length === 4 && segments[3] === 'update' && req.method === 'POST') {
					// /api/tasks/<id>/update
					ctx.params.id = segments[2];
					response = await api.updateHandler({ ...ctx, body }, deps);
				} else {
					sendEnvelope(res, 404, { success: false, data: null, error: `Unknown /api/tasks route: ${req.method} ${pathname}` });
					return true;
				}

				sendEnvelope(res, response.status, response.envelope);
				return true;
			} catch (e) {
				console.error('[tasks-route] handler error:', e);
				sendEnvelope(res, 500, { success: false, data: null, error: e?.message ?? 'internal error' });
				return true;
			}
		}

		/**
		 * Test surface — lets unit tests reset module state between cases.
		 * Clears the cached write-API module and executor, and ends the current
		 * pool (if any) without waiting for it; a rejection from `end()` is
		 * ignored.
		 */
		export function _resetForTests() {
			writeApi = null;
			executor = null;
			pool?.end().catch(() => undefined);
			pool = null;
		}

-280

host-cp/src/upgrade-spawner.mjs

		// Upgrade-trigger: spawn an ephemeral `olam upgrade` runner container.
		//
		// The user clicks "Run upgrade" in the dashboard → host-cp's
		// /api/admin/upgrade endpoint asks the docker daemon (via the
		// socket-proxy sidecar) to create + start a one-off container that
		// runs the olam CLI's full upgrade pipeline against the local stack.
		//
		// Why a separate container (and not a child process inside host-cp)?
		// `olam upgrade` recreates host-cp itself as part of the atomic
		// tag-swap. If the orchestrator lived inside host-cp, the moment it
		// asked docker to stop the old host-cp container the orchestrator
		// would die with it — leaving no one to start the new container.
		// A sibling container survives host-cp's recreate.
		//
		// Why this same image (not a purpose-built `olam-upgrader`)?
		// The host-cp image already has Node, the olam CLI, the docker CLI,
		// and the docker compose plugin baked in by Dockerfile. Reusing it
		// means there's nothing extra to publish, and the upgrader is
		// guaranteed to ship from the same source SHA as the host-cp it
		// replaces. The upgrader's `Cmd` overrides host-cp's default CMD
		// so it runs the CLI instead of starting the server.
		//
		// Security note (single-user-trusted-local-dev assumption):
		// POST /api/admin/upgrade requires the host-cp auth token. Anyone
		// with that token can already spawn arbitrary commands inside
		// running devboxes via the existing exec path; spawning an upgrader
		// container does not meaningfully widen the blast radius for the
		// single-user model. Multi-user / hosted deployments will need a
		// tighter policy (capability bit, user-scoped tokens).

		import http from 'node:http';

		/**
		 * Shell steps of the default upgrader entrypoint, in execution order.
		 * They are joined with ` && ` into a single `sh -c` script, so each step
		 * runs only if the previous one succeeded.
		 *
		 * Token resolution:
		 * 1. `$GH_TOKEN` env var (set on host-cp via compose; operator typically
		 *    resolves it from `gh auth token` before `olam host-cp start`).
		 *    Required path on macOS — the host's `gh` keeps the token in
		 *    Keychain, which doesn't follow into a Linux container.
		 * 2. `gh auth token` against the mounted ~/.config/gh. Works for Linux
		 *    operators whose gh keeps the token in the config dir directly.
		 * 3. No token: a warning is printed to stderr.
		 *
		 * NOTE(review): with the `&&` chain, an empty TOKEN makes the
		 * `[ -n "$TOKEN" ]` guard fail, which stops the chain — `cd` and
		 * `olam upgrade` never run and the shell exits non-zero right after the
		 * warning. Earlier documentation said the pull-by-digest step itself
		 * would fail with `unauthorized`; confirm which behaviour is intended.
		 *
		 * Running auth + upgrade in one `sh -c` invocation lets the `docker
		 * login` stage feed credentials directly into the docker daemon without
		 * leaking the token through shared volumes.
		 */
		const UPGRADER_SCRIPT_STEPS = [
			'set -e',
			// Prefer the env var (works on every OS); otherwise try `gh auth
			// token`, ignoring its failure so the warning below can fire.
			'TOKEN="${GH_TOKEN:-$(gh auth token 2>/dev/null || true)}"',
			'if [ -z "$TOKEN" ]; then echo "[upgrader] no GH_TOKEN; ghcr pulls will fail" >&2; fi',
			// Authenticate against ghcr.io. `oauth2` is GitHub's canonical
			// username placeholder for PAT-style tokens.
			'[ -n "$TOKEN" ] && echo "$TOKEN" | docker login ghcr.io -u oauth2 --password-stdin',
			// The CLI resolves `packages/host-cp/compose.yaml` relative to its
			// current working directory (see upgrade.ts:1008-1009). The
			// operator's olam repo is bind-mounted at /workspace, so cd there
			// first. Without this the recreate step fails with `open
			// <cwd>/packages/host-cp/compose.yaml: no such file or directory`,
			// because the npm-installed @pleri/olam-cli package does NOT bundle
			// the compose file (it's repo-source only).
			'cd /workspace',
			// The CLI handles pull-by-digest, atomic swap, recreate, and the
			// post-recreate /api/version/status round-trip itself.
			'olam upgrade -y',
		];

		/** Default container Cmd for the upgrader: `sh -c "<steps joined by &&>"`. */
		const DEFAULT_UPGRADER_CMD = ['sh', '-c', UPGRADER_SCRIPT_STEPS.join(' && ')];

		/**
		 * Spawn the upgrader. Resolves with the container ID on a successful
		 * `/start`; throws on any failure path so the caller can surface a
		 * clean 500 with the daemon's reason.
		 *
		 * Flow: validate required arguments → POST /containers/create with the
		 * bind mounts and env the CLI needs → POST /containers/<id>/start.
		 *
		 * @param {object} args
		 * @param {string} args.dockerHost tcp://docker-socket-proxy:2375 or 'docker-cli'
		 * @param {string} args.olamHomeHostPath e.g. /Users/ernie/.olam
		 * @param {string} args.dockerSockHostPath e.g. /var/run/docker.sock
		 * @param {string} args.image upgrader image (defaults to host-cp's own image)
		 * @param {string} [args.ghConfigHostPath] operator's ~/.config/gh; bind-mounted ro for
		 *   `gh auth token` to work inside the upgrader (Linux fallback only; macOS uses
		 *   GH_TOKEN env)
		 * @param {string} [args.ghToken] pre-resolved GH token (typically read from host-cp's
		 *   GH_TOKEN env via compose). Passed to the upgrader as $GH_TOKEN so `docker login
		 *   ghcr.io` works on macOS hosts whose Keychain-backed gh config can't be read
		 *   inside a Linux container.
		 * @param {string} [args.repoHostPath] operator's olam repo path on the host. Bind-
		 *   mounted into the upgrader at /workspace so the CLI's cwd-relative compose-file
		 *   lookup resolves (the npm package doesn't bundle packages/host-cp/compose.yaml).
		 *   Despite the optional marker, a missing value throws.
		 * @param {string} [args.operatorHomeHostPath] operator's $HOME on the host. Passed as
		 *   the upgrader's HOME env so docker-compose's `${HOME}` interpolation in bind
		 *   sources resolves to a daemon-visible path.
		 * @param {ReadonlyArray<string>} [args.cmd] override the upgrade command for tests
		 * @param {(host: string, init: object) => Promise<Response>} [args.fetchImpl]
		 * @param {(message: string) => void} [args.log]
		 * @returns {Promise<{ id: string, name: string }>}
		 * @throws {Error} on a missing required argument, on `dockerHost === 'docker-cli'`,
		 *   or when the daemon rejects the create/start call
		 */
		export async function spawnUpgraderContainer({
			dockerHost,
			olamHomeHostPath,
			dockerSockHostPath,
			image,
			ghConfigHostPath,
			ghToken,
			repoHostPath,
			operatorHomeHostPath,
			cmd = DEFAULT_UPGRADER_CMD,
			fetchImpl = globalThis.fetch,
			log = console.log,
		}) {
			if (!olamHomeHostPath) {
				throw new Error('OLAM_HOME_HOST_PATH not set; cannot bind-mount operator state');
			}
			if (!dockerSockHostPath) {
				throw new Error('OLAM_DOCKER_SOCK_HOST_PATH not set; upgrader cannot reach docker daemon');
			}
			if (!image) {
				throw new Error('upgrader image not configured (OLAM_UPGRADER_IMAGE)');
			}
			if (!repoHostPath) {
				throw new Error(
					'OLAM_REPO_HOST_PATH not set; upgrader cannot find packages/host-cp/compose.yaml',
				);
			}
			// Fail with a readable message instead of the TypeError that
			// `dockerHost.replace(...)` below would raise on undefined.
			if (!dockerHost) {
				throw new Error('dockerHost not set; upgrader cannot reach the docker daemon API');
			}

			// Bare-node (operator's host docker CLI on PATH) is documented but
			// out of scope for the trigger feature — container + unix-socket paths
			// are supported (compose stack and k8s hostPath socket mount).
			if (dockerHost === 'docker-cli') {
				// The literals below (`unix:///var/run/docker.sock` and `tcp://docker-socket-proxy:2375`)
				// are diagnostic text naming the deployment shapes that ARE supported,
				// not hostnames being used as transport — error-message-only.
				throw new Error(
					'upgrade-trigger requires a docker socket (unix:///var/run/docker.sock via k8s hostPath mount, ' + // bare-node-allow: diagnostic-text
					'or tcp://docker-socket-proxy:2375 via compose); bare-node not yet supported. ' + // bare-node-allow: diagnostic-text
					'For k8s: ensure the cluster was created with ' +
					'--volume /var/run/docker.sock:/var/run/docker.sock@server:* ' +
					'and olam doctor reports probeDockerSocketBindMount [PASS].',
				);
			}

			// Only the tcp:// scheme is rewritten; any other value is passed to
			// fetchImpl unchanged.
			const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
			const containerName = `olam-upgrader-${Date.now()}`;

			// Container create body. AutoRemove cleans up on exit so we don't
			// accumulate stopped upgrader carcasses; HostConfig.Binds gives the
			// CLI everything it needs (operator state + docker socket).
			const createBody = {
				Image: image,
				Cmd: [...cmd],
				Entrypoint: [], // override host-cp's tini entrypoint; olam CLI is self-contained
				Env: [
					// HOME serves two roles:
					// - The CLI uses HOME to resolve ~/.olam (we bind-mount the
					//   operator's ~/.olam into the upgrader so the CLI sees its
					//   state).
					// - docker-compose interpolates `${HOME}` in bind sources of
					//   compose.yaml. The daemon resolves those bind sources on
					//   the HOST filesystem, so HOME must be a path the daemon
					//   can find (typically the operator's host $HOME).
					//
					// Default to /root for back-compat with tests that don't pass
					// the operator host path; production callers (server.mjs) pass
					// operatorHomeHostPath through.
					`HOME=${operatorHomeHostPath ?? '/root'}`,
					// Non-interactive mode + auto-yes are belt-and-braces: -y flag is
					// also passed in Cmd, but env is the canonical way to opt out of
					// tty prompts when stdin is closed.
					'OLAM_NON_INTERACTIVE=1',
					'CI=1',
					// GH token forwarded for the docker-login-to-ghcr step. Only
					// included when host-cp received it (compose set GH_TOKEN); the
					// wrapper's auth fallback handles the unset case explicitly.
					...(ghToken ? [`GH_TOKEN=${ghToken}`] : []),
				],
				HostConfig: {
					AutoRemove: true,
					// Bind sources are resolved by the docker daemon on the HOST
					// filesystem, so target paths must match the host's view too —
					// when the upgrader's `docker compose up` recreates host-cp,
					// compose's ${HOME} interpolation pulls operatorHomeHostPath
					// (set as HOME in Env above). We keep the source==target
					// convention for ~/.olam so the path is identical inside and out.
					Binds: [
						`${olamHomeHostPath}:${operatorHomeHostPath ? `${operatorHomeHostPath}/.olam` : '/root/.olam'}`,
						`${dockerSockHostPath}:/var/run/docker.sock`,
						// Operator's repo bind-mounted read-only at /workspace. The
						// wrapper cds here so the CLI's relative compose-file lookup
						// resolves to `<repo>/packages/host-cp/compose.yaml`.
						`${repoHostPath}:/workspace:ro`,
						// Optional read-only gh config bind, placed under the same
						// HOME the container runs with so `gh auth token` can find
						// it when no GH_TOKEN was forwarded.
						...(ghConfigHostPath
							? [`${ghConfigHostPath}:${operatorHomeHostPath ? `${operatorHomeHostPath}/.config/gh` : '/root/.config/gh'}:ro`]
							: []),
					],
					// Same network as host-cp so the upgrader can reach the
					// docker-socket-proxy + auth-service if it needs to during the
					// verification phase. Falls through to docker.sock for daemon
					// operations.
					NetworkMode: 'olam-host-cp-internal',
				},
				Labels: {
					'olam.role': 'upgrader',
					'olam.spawned-by': 'host-cp',
					'olam.spawned-at': new Date().toISOString(),
				},
			};

			log(`[upgrade] creating upgrader container ${containerName} from ${image}`);
			const createUrl = `${apiBase}/containers/create?name=${encodeURIComponent(containerName)}`;
			const createRes = await fetchImpl(createUrl, {
				method: 'POST',
				headers: { 'Content-Type': 'application/json' },
				body: JSON.stringify(createBody),
			});
			if (!createRes.ok) {
				const detail = await safeReadBody(createRes);
				throw new Error(
					`daemon rejected POST /containers/create: ${createRes.status} ${createRes.statusText} ${detail}`.trim(),
				);
			}
			const created = await createRes.json();
			const containerId = created.Id;
			if (!containerId) {
				throw new Error(`POST /containers/create returned no Id: ${JSON.stringify(created)}`);
			}

			log(`[upgrade] starting upgrader ${containerId.slice(0, 12)}`);
			const startUrl = `${apiBase}/containers/${encodeURIComponent(containerId)}/start`;
			const startRes = await fetchImpl(startUrl, { method: 'POST' });
			if (!startRes.ok && startRes.status !== 304) {
				// 304 Not Modified = already started; treat as success.
				const detail = await safeReadBody(startRes);
				throw new Error(
					`daemon rejected POST /containers/${containerId}/start: ${startRes.status} ${detail}`.trim(),
				);
			}

			return { id: containerId, name: containerName };
		}

		/**
		 * Best-effort read of a response body for human-readable error text
		 * (used in the 500 messages above). The text is trimmed and capped at
		 * 512 characters; any failure while reading yields an empty string
		 * instead of throwing.
		 *
		 * @param {Response} res
		 * @returns {Promise<string>}
		 */
		async function safeReadBody(res) {
			const maxChars = 512;
			try {
				const raw = await res.text();
				const trimmed = raw.trim();
				return trimmed.slice(0, maxChars);
			} catch {
				return '';
			}
		}

-318

host-cp/src/version-status.mjs

		// Version detection for Phase 1 of self-upgrade.
		//
		// Compares each component's baked OLAM_BUILD_SHA against the operator's
		// local repo HEAD (mounted read-only at /operator-repo). Reports upgrade
		// availability without triggering any automatic action — Phase 1 is
		// detection only.

		import fs from 'node:fs';
		import path from 'node:path';

		/** @typedef {'ok' | 'behind' | 'unknown'} VersionState */

		/**
		* @typedef {Object} ComponentVersion
		* @property {string} running - SHA baked into the running image
		* @property {string} latest - SHA of operator's local HEAD (or 'unknown')
		* @property {boolean} upgradeAvailable
		*/

		/**
		* @typedef {Object} VersionSnapshot
		* @property {ComponentVersion} hostCp
		* @property {ComponentVersion} authService
		* @property {ComponentVersion} devbox
		* @property {string} operatorHead - resolved HEAD or 'unknown'
		* @property {string} checkedAt - ISO timestamp
		* @property {string} cliVersion - operator's CLI semver (e.g. "0.1.69") or 'unknown'
		*/

		/**
		 * Read the operator's local repo HEAD straight from the `.git` directory.
		 *
		 * Candidate repo roots, in order: the OLAM_REPO_PATH env var (when set),
		 * then /operator-repo (the compose-mounted path). For each root:
		 * - a symbolic HEAD (`ref: <name>`) is resolved via the loose ref file,
		 *   then via `packed-refs`;
		 * - a detached HEAD is accepted only when it is a 40-hex-digit SHA.
		 * A root that is missing, unresolvable, or throws on read is skipped.
		 *
		 * @returns {string} the resolved SHA, or 'unknown' when no root yields one
		 */
		export function readOperatorHead() {
			const repoRoots = [process.env.OLAM_REPO_PATH, '/operator-repo'].filter(Boolean);

			for (const root of repoRoots) {
				try {
					const gitDir = path.join(root, '.git');
					const headPath = path.join(gitDir, 'HEAD');
					if (!fs.existsSync(headPath)) continue;

					const head = fs.readFileSync(headPath, 'utf-8').trim();

					if (!head.startsWith('ref: ')) {
						// Detached HEAD — the file holds the SHA itself.
						if (/^[0-9a-f]{40}$/i.test(head)) return head;
						continue;
					}

					// Symbolic ref (e.g. "ref: refs/heads/main").
					const refName = head.slice('ref: '.length);
					const loosePath = path.join(gitDir, refName);
					if (fs.existsSync(loosePath)) {
						return fs.readFileSync(loosePath, 'utf-8').trim();
					}

					// No loose ref file: look the name up in packed-refs.
					const packedPath = path.join(gitDir, 'packed-refs');
					if (!fs.existsSync(packedPath)) continue;
					const match = fs
						.readFileSync(packedPath, 'utf-8')
						.split('\n')
						.filter((line) => !line.startsWith('#'))
						.map((line) => line.trim().split(' '))
						.find(([, ref]) => ref === refName);
					if (match) return match[0];
				} catch {
					// silently try next candidate
				}
			}
			return 'unknown';
		}

		/**
		 * Compare two SHAs. Returns true when they differ and both are known.
		 * If either is 'unknown' we cannot assert an upgrade is available.
		 *
		 * SHAs may be full (40 hex chars) or short (7+ hex chars from --short),
		 * so two values count as equal when one is a case-insensitive prefix of
		 * the other.
		 *
		 * @param {string} running
		 * @param {string} latest
		 * @returns {boolean}
		 */
		export function isUpgradeAvailable(running, latest) {
			if (running === 'unknown' || latest === 'unknown') return false;
			const a = running.toLowerCase();
			const b = latest.toLowerCase();
			return !a.startsWith(b) && !b.startsWith(a);
		}

		/**
		 * Ask the auth-service which build it is running: GET `<url>/health`
		 * (5 s timeout) and return the `buildSha` string from the JSON body.
		 *
		 * Every failure mode — network error, timeout, non-2xx status, non-JSON
		 * body, missing or non-string `buildSha` — collapses to 'unknown'.
		 *
		 * @param {string} authServiceUrl
		 * @returns {Promise<string>}
		 */
		export async function fetchAuthServiceSha(authServiceUrl) {
			try {
				const healthUrl = `${authServiceUrl}/health`;
				const res = await fetch(healthUrl, { signal: AbortSignal.timeout(5000) });
				if (!res.ok) return 'unknown';

				const payload = /** @type {unknown} */ (await res.json());
				const isObject = Boolean(payload) && typeof payload === 'object';
				if (!isObject || !('buildSha' in payload)) return 'unknown';

				const sha = /** @type {Record<string, unknown>} */ (payload)['buildSha'];
				if (typeof sha !== 'string') return 'unknown';
				return sha;
			} catch {
				return 'unknown';
			}
		}

		/**
		 * Inspect a locally-tagged docker image (by reference such as
		 * `ghcr.io/pleri/olam-host-cp:latest`) and extract its baked
		 * OLAM_BUILD_SHA env. Returns 'unknown' if the inspect call fails or is
		 * non-2xx (e.g. image not pulled, docker API unreachable), the response
		 * has no `Config.Env` array, or the env is missing.
		 *
		 * Used as the "what's the latest published image we'd swap to?"
		 * signal for the upgrade comparator — replaces the prior
		 * `operatorHead` (operator's local git HEAD) which over-reports
		 * upgradeAvailable whenever an SPA-only PR merges between releases.
		 *
		 * @param {string} dockerApiBase
		 * @param {string} imageRef e.g. "ghcr.io/pleri/olam-host-cp:latest"
		 * @returns {Promise<string>}
		 */
		export async function fetchLatestImageSha(dockerApiBase, imageRef) {
			try {
				const res = await fetch(
					`${dockerApiBase}/images/${encodeURIComponent(imageRef)}/json`,
					{ signal: AbortSignal.timeout(5000) },
				);
				if (!res.ok) return 'unknown';
				const image = /** @type {unknown} */ (await res.json());
				if (!image || typeof image !== 'object') return 'unknown';
				const config = /** @type {Record<string, unknown>} */ (image)['Config'];
				if (!config || typeof config !== 'object') return 'unknown';
				const env = /** @type {Record<string, unknown>} */ (config)['Env'];
				if (!Array.isArray(env)) return 'unknown';
				// Env entries are "KEY=value" strings; return the first match.
				for (const e of env) {
					if (typeof e === 'string' && e.startsWith('OLAM_BUILD_SHA=')) {
						return e.slice('OLAM_BUILD_SHA='.length);
					}
				}
				return 'unknown';
			} catch {
				return 'unknown';
			}
		}

		/**
		 * Fetch the devbox image SHA. Lists containers through the docker API
		 * using a `name` filter of `olam-devbox`, takes the first result's
		 * ImageID, inspects that image, and returns its OLAM_BUILD_SHA env value.
		 * Returns 'unknown' if any step fails.
		 *
		 * @param {string} dockerApiBase e.g. "http://docker-socket-proxy:2375" or "http://localhost:2375"
		 * @returns {Promise<string>}
		 */
		export async function fetchDevboxImageSha(dockerApiBase) {
			try {
				// NOTE(review): the filter value is the literal `olam-devbox`;
				// whether that also matches names shaped like `olam-<x>-devbox`
				// depends on the daemon's name-filter matching — confirm.
				const listRes = await fetch(
					`${dockerApiBase}/containers/json?filters=${encodeURIComponent(JSON.stringify({ name: ['olam-devbox'] }))}`,
					{ signal: AbortSignal.timeout(5000) },
				);
				if (!listRes.ok) return 'unknown';
				const containers = /** @type {unknown} */ (await listRes.json());
				if (!Array.isArray(containers) || containers.length === 0) return 'unknown';

				// Use the first listed container's image ID (presumably the most
				// recently created one — relies on the daemon's list order) and
				// inspect that image for the OLAM_BUILD_SHA env.
				const container = /** @type {Record<string, unknown>} */ (containers[0]);
				const imageId = typeof container['ImageID'] === 'string' ? container['ImageID'] : null;
				if (!imageId) return 'unknown';

				const inspectRes = await fetch(
					`${dockerApiBase}/images/${encodeURIComponent(imageId)}/json`,
					{ signal: AbortSignal.timeout(5000) },
				);
				if (!inspectRes.ok) return 'unknown';
				const image = /** @type {unknown} */ (await inspectRes.json());
				if (!image || typeof image !== 'object') return 'unknown';

				const config = /** @type {Record<string, unknown>} */ (image)['Config'];
				if (!config || typeof config !== 'object') return 'unknown';
				const env = /** @type {Record<string, unknown>} */ (config)['Env'];
				if (!Array.isArray(env)) return 'unknown';

				for (const e of env) {
					if (typeof e === 'string' && e.startsWith('OLAM_BUILD_SHA=')) {
						return e.slice('OLAM_BUILD_SHA='.length);
					}
				}
				return 'unknown';
			} catch {
				return 'unknown';
			}
		}

		/**
		 * Build a full VersionSnapshot from all available sources.
		 *
		 * "running" SHAs come from: this process's OLAM_BUILD_SHA env (host-cp),
		 * the auth-service /health endpoint, and the devbox container's image.
		 * "latest" SHAs come from the locally-pulled `:latest` images, falling
		 * back to the operator's repo HEAD per component.
		 *
		 * @param {{
		 *   authServiceUrl: string;
		 *   dockerApiBase: string;
		 * }} opts
		 * @returns {Promise<VersionSnapshot>}
		 */
		export async function buildVersionSnapshot({ authServiceUrl, dockerApiBase }) {
			const operatorHead = readOperatorHead();

			// Inspect locally-pulled `:latest` image tags to get the actual
			// published baked SHA — what `olam upgrade` would swap us to next.
			// Fall back to operatorHead when the image isn't pulled (first-run
			// or stack never upgraded) so the banner still surfaces SOMETHING.
			const [authSha, devboxSha, hostCpLatestPublished, authLatestPublished, devboxLatestPublished] =
				await Promise.all([
					fetchAuthServiceSha(authServiceUrl),
					fetchDevboxImageSha(dockerApiBase),
					fetchLatestImageSha(dockerApiBase, 'ghcr.io/pleri/olam-host-cp:latest'),
					// NOTE: docker tag is `olam-auth` (no `-service` suffix); npm
					// workspace is `auth-service`. The two diverged historically.
					fetchLatestImageSha(dockerApiBase, 'ghcr.io/pleri/olam-auth:latest'),
					fetchLatestImageSha(dockerApiBase, 'ghcr.io/pleri/olam-devbox:latest'),
				]);

			const hostCpRunning = process.env.OLAM_BUILD_SHA ?? 'unknown';

			// Pick "latest" per component: use the published image SHA when we
			// can read it (truthful — that's what would swap in), else fall back
			// to operatorHead (legacy behaviour, may over-report between SPA-only
			// PR merges and the next image rebuild — but still informative when
			// the operator hasn't yet pulled `:latest`).
			const hostCpLatest = pickLatest(hostCpLatestPublished, operatorHead);
			const authLatest = pickLatest(authLatestPublished, operatorHead);
			const devboxLatest = pickLatest(devboxLatestPublished, operatorHead);

			// CLI version is propagated by `olam host-cp start` via the
			// OLAM_CLI_VERSION env (see packages/cli/src/commands/host-cp.ts
			// buildComposeEnv). Falls back to host-cp's own package.json when
			// an older CLI started this container without setting the env.
			// `||` (not `??`) on purpose: an empty-string env also falls through.
			const cliVersion = process.env.OLAM_CLI_VERSION
				|| readHostCpPackageVersion()
				|| 'unknown';

			return {
				hostCp: {
					running: hostCpRunning,
					latest: hostCpLatest,
					upgradeAvailable: isUpgradeAvailable(hostCpRunning, hostCpLatest),
				},
				authService: {
					running: authSha,
					latest: authLatest,
					upgradeAvailable: isUpgradeAvailable(authSha, authLatest),
				},
				devbox: {
					running: devboxSha,
					latest: devboxLatest,
					upgradeAvailable: isUpgradeAvailable(devboxSha, devboxLatest),
				},
				operatorHead,
				checkedAt: new Date().toISOString(),
				cliVersion,
			};
		}

		/**
		 * Choose the SHA reported as a component's "latest".
		 *
		 * The published-image SHA wins whenever it is usable, because that is
		 * the build an upgrade would actually swap in. A missing value or the
		 * literal sentinel 'unknown' (image not pulled yet, e.g. cold-start
		 * before the first `olam upgrade`) makes the operator's local git HEAD
		 * the stand-in, even though that can over-report between image rebuilds.
		 *
		 * @param {string} publishedImageSha
		 * @param {string} operatorHead
		 * @returns {string}
		 */
		export function pickLatest(publishedImageSha, operatorHead) {
		  const imageShaIsUsable = Boolean(publishedImageSha) && publishedImageSha !== 'unknown';
		  return imageShaIsUsable ? publishedImageSha : operatorHead;
		}

		/**
		 * Read host-cp's bundled package.json version as the CLI-version
		 * fallback when OLAM_CLI_VERSION isn't propagated. The container
		 * Dockerfile copies the manifest into /app, so the lookup walks up
		 * from this module's location (one level, then two).
		 *
		 * Candidates are resolved as `file:` URLs relative to `import.meta.url`
		 * and handed to `fs` as URL objects. Using the URL's raw `.pathname`
		 * as a path is wrong whenever the location contains percent-encoded
		 * characters (e.g. spaces) or a Windows drive letter.
		 *
		 * Best-effort: never throws.
		 *
		 * @returns {string | null} the first non-empty `version` string found,
		 *   or null when no candidate yields one.
		 */
		function readHostCpPackageVersion() {
		  for (const relative of ['../package.json', '../../package.json']) {
		    // Per-candidate try: an unreadable or malformed manifest at one
		    // level must not prevent the next level from being consulted.
		    try {
		      const candidate = new URL(relative, import.meta.url);
		      if (!fs.existsSync(candidate)) continue;
		      const pkg = JSON.parse(fs.readFileSync(candidate, 'utf-8'));
		      if (typeof pkg?.version === 'string' && pkg.version.length > 0) return pkg.version;
		    } catch {
		      // best-effort
		    }
		  }
		  return null;
		}

-149

host-cp/src/workspace-catalog.mjs

		// Phase F-2-B (B6): workspace + project catalog for host CP.
		//
		// Reads workspace YAML files from `~/.olam/workspaces/*.yaml` (mounted
		// at `/data/workspaces` inside the host-cp container per compose.yaml).
		// Provides three endpoints' worth of data:
		//
		// 1. /api/workspaces — list all workspaces (redacted)
		// 2. /api/projects — deduplicated project union
		// 3. POST /api/workspaces/match — exact set-equality matching
		// for D13's project-first
		// create-world flow

		import fs from 'node:fs';
		import path from 'node:path';
		import YAML from 'yaml';
		import { redactSensitive } from './redact.mjs';

		/**
		* @typedef {object} Project
		* @property {string} name
		* @property {string} [url]
		* @property {string} [path]
		* @property {string} [branch]
		*/

		/**
		* @typedef {object} Workspace
		* @property {string} name
		* @property {Project[]} repos project list (called `repos` in YAML)
		* @property {Record<string, unknown>} [defaults]
		* @property {Record<string, unknown>} [services]
		* @property {Record<string, unknown>} [image]
		* @property {Record<string, unknown>} [host_ui]
		* @property {number} [updatedAt]
		*/

		/**
		 * Load all workspace YAMLs from a directory. Returns an array, sorted
		 * by name. Invalid YAMLs are logged + skipped (don't bring down the
		 * whole list because one file is malformed).
		 *
		 * A file is accepted only when it parses to an object whose `name` is a
		 * non-empty string: the final sort calls `name.localeCompare`, so a
		 * numeric or otherwise non-string name would throw and lose every
		 * workspace. `repos` is normalized to an array so callers can iterate
		 * it without re-checking.
		 *
		 * @param {string} dir
		 * @param {(message: string) => void} [log]
		 * @returns {Workspace[]}
		 */
		export function loadWorkspaces(dir, log = console.log) {
		  if (!fs.existsSync(dir)) {
		    log(`workspace-catalog: directory ${dir} does not exist`);
		    return [];
		  }
		  /** @type {Workspace[]} */
		  const out = [];
		  for (const entry of fs.readdirSync(dir)) {
		    if (!entry.endsWith('.yaml') && !entry.endsWith('.yml')) continue;
		    const filePath = path.join(dir, entry);
		    try {
		      const parsed = YAML.parse(fs.readFileSync(filePath, 'utf-8'));
		      const hasUsableName =
		        parsed !== null &&
		        typeof parsed === 'object' &&
		        typeof parsed.name === 'string' &&
		        parsed.name.length > 0;
		      if (!hasUsableName) {
		        log(`workspace-catalog: skipping ${entry} (no .name field)`);
		        continue;
		      }
		      const reposIsList = Array.isArray(parsed.repos);
		      if (!reposIsList && parsed.repos !== undefined && parsed.repos !== null) {
		        log(`workspace-catalog: ${entry} has a non-list repos field; treating it as empty`);
		      }
		      // Normalize: ensure `repos` is always an array.
		      out.push({ ...parsed, repos: reposIsList ? parsed.repos : [] });
		    } catch (err) {
		      log(`workspace-catalog: failed to parse ${entry}: ${err.message}`);
		    }
		  }
		  return out.sort((a, b) => a.name.localeCompare(b.name));
		}

		/**
		 * Shape the workspace list for the /api/workspaces response by passing
		 * it through `redactSensitive`.
		 *
		 * @param {Workspace[]} workspaces
		 * @returns {Workspace[]}
		 */
		export function workspacesForApi(workspaces) {
		  const redacted = redactSensitive(workspaces);
		  return /** @type {Workspace[]} */ (redacted);
		}

		/**
		 * /api/projects response: the union of every workspace's projects,
		 * deduplicated by project name and sorted by name.
		 *
		 * The dedup key is the exact (case-sensitive) name, so `Atlas Core` and
		 * `atlas-core` stay distinct — consistent with the kebab-case
		 * convention used throughout the workspace YAMLs.
		 *
		 * When a name appears more than once, the FIRST occurrence supplies
		 * url/path/branch and later ones are dropped. Entries without a name
		 * are skipped. Each returned project is a shallow copy.
		 *
		 * @param {Workspace[]} workspaces
		 * @returns {Project[]}
		 */
		export function projectsFromWorkspaces(workspaces) {
		  /** @type {Map<string, Project>} */
		  const firstSeen = new Map();
		  for (const { repos } of workspaces) {
		    for (const project of repos ?? []) {
		      const key = project?.name;
		      if (!key || firstSeen.has(key)) continue;
		      firstSeen.set(key, { ...project });
		    }
		  }
		  const projects = Array.from(firstSeen.values());
		  projects.sort((left, right) => left.name.localeCompare(right.name));
		  return projects;
		}

		/**
		 * POST /api/workspaces/match request body: { projects: string[] }.
		 * Returns workspaces whose project-name set EXACTLY equals the input
		 * set (no subset, no superset). Sorted by name for response stability.
		 *
		 * Repo entries that are null/undefined or have no name are ignored when
		 * building a workspace's name set — the same tolerance
		 * `projectsFromWorkspaces` applies — so a stray empty list item in a
		 * YAML cannot make the endpoint throw.
		 *
		 * Algorithm: O(W × P) where W = #workspaces, P = average projects per
		 * workspace. Workspaces are small (<10 projects each); fine for direct
		 * iteration.
		 *
		 * @param {Workspace[]} workspaces
		 * @param {string[]} projectNames
		 * @returns {Workspace[]}
		 */
		export function matchWorkspacesByProjects(workspaces, projectNames) {
		  const target = new Set(projectNames);
		  const matches = workspaces.filter((ws) => {
		    const wsNames = new Set((ws.repos ?? []).map((r) => r?.name).filter(Boolean));
		    return setsEqual(target, wsNames);
		  });
		  // `filter` returned a fresh array, so the in-place sort does not
		  // reorder the caller's list.
		  return matches.sort((a, b) => a.name.localeCompare(b.name));
		}

		/**
		 * Set equality: true when both sets have the same size and every
		 * member of `a` is also in `b`.
		 *
		 * @param {Set<string>} a
		 * @param {Set<string>} b
		 * @returns {boolean}
		 */
		function setsEqual(a, b) {
		  return a.size === b.size && [...a].every((member) => b.has(member));
		}

-392

host-cp/src/world-activity-tracker.mjs

		/**
		* WorldActivityTracker — periodic scanner that turns each active world's
		* Claude session JSONL into `thought_count` + `total_cost_usd` updates on
		* the `worlds` table (~/.olam/worlds.db), plus a `world.activity.tick`
		* event on the host-stream broadcaster.
		*
		* Closes #965. Pre-fix, `olam_status <world>` always reported
		* `Cost $0.0000 / Thoughts 0` because nothing wrote those columns after
		* world creation. Rico (the orchestrator) reads those fields to decide
		* whether a world is progressing or stalled, so as far as it was
		* concerned every world was frozen.
		*
		* Design notes:
		* - JSONL path is operator-configurable. Default contract per #965
		* is `~/.olam/worlds/<id>/state/claude-main.jsonl`; override the
		* template via `OLAM_WORLD_JSONL_PATH_TEMPLATE`. On this host the
		* producer for the default path is not yet shipped (Claude Code
		* writes to `~/.claude/projects/<sanitized>/<uuid>.jsonl` by
		* default), so values stay at 0 until either the producer lands or
		* the env override repoints the scanner.
		* - Dedupe by `message.id`. Claude SDK JSONL emits multiple lines
		* per assistant API turn (one per content block), each carrying the
		* SAME `message.id` + the SAME `usage` block. Naive sum-by-line
		* double-counts. We dedupe by `message.id` for usage totals and
		* count unique-message-id as `thoughtCount`.
		* - Idempotent. Re-scanning the same JSONL produces the same
		* numbers; safe to run at any cadence.
		* - Fail-soft per world. A bad JSONL line, missing file, or
		* unreadable handle never crashes the loop — the failing world is
		* skipped with a debug log and the next world proceeds.
		*
		* Cadence: `OLAM_WORLD_ACTIVITY_TICK_MS` (default 60_000).
		*
		* Wire-in: `server.mjs` constructs once with `{ db, broadcaster }` after
		* both are ready and calls `.stop()` from the SIGTERM/SIGINT handler.
		*
		* @see ../host-stream.mjs broadcaster API
		* @see ../worlds-db-source.mjs read-only DB open pattern (model for
		* `tryOpenDb` here, though tracker WRITES not reads).
		*/

		import fs from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';
		import readline from 'node:readline';
		import { createRequire } from 'node:module';

		const require = createRequire(import.meta.url);

		// TODO(rates): source live model rates from auth-service or a config
		// file. For now we anchor on Claude Opus per-million baseline ($3 input
		// / $15 output) — the issue surface is "value advances post-creation",
		// not "is dollar-accurate to 4 decimals". When per-model rates land,
		// pluck the model id from the assistant message and dispatch.
		// NOTE(review): $3 / $15 per million tokens looks like Sonnet-tier list
		// pricing rather than Opus — confirm which model this baseline is meant
		// to approximate before anyone relies on the dollar figure.
		// Flat USD rates per one million tokens, applied to every message
		// regardless of model.
		const INPUT_USD_PER_M_TOKENS = 3.0;
		const OUTPUT_USD_PER_M_TOKENS = 15.0;

		// Fallback scan cadence in milliseconds (one minute).
		const DEFAULT_TICK_MS = 60_000;

		/**
		 * Resolve a per-world JSONL path from an operator-supplied template
		 * string. Every `{worldId}` placeholder is replaced with the world id,
		 * and a leading `~/` is expanded to `os.homedir()`.
		 *
		 * @param {string} template
		 * @param {string} worldId
		 * @returns {string}
		 */
		export function resolveJsonlPath(template, worldId) {
		  // Replacer function instead of a replacement string: a string would
		  // have `$&`, `$1`, `$$`, … inside worldId interpreted as substitution
		  // patterns; a function's return value is inserted literally.
		  const swapped = template.replace(/\{worldId\}/g, () => worldId);
		  if (swapped.startsWith('~/')) {
		    return path.join(os.homedir(), swapped.slice(2));
		  }
		  return swapped;
		}

		/**
		 * Scan a single JSONL file and return aggregate counts.
		 *
		 * Only rows with `type === 'assistant'` and a string `message.id` are
		 * counted. Each distinct `message.id` contributes once — its first row
		 * supplies the usage and timestamp; later rows with the same id are
		 * skipped. Unparseable lines are ignored. Any stream error (including a
		 * missing file) yields the all-zero result instead of a rejection.
		 *
		 * @param {string} jsonlPath
		 * @returns {Promise<{thoughtCount:number, inputTokens:number, outputTokens:number, costUsd:number, lastActivityAt:string|null}>}
		 */
		export async function scanWorldJsonl(jsonlPath) {
		  const seenMessageIds = new Set();
		  let inputTokens = 0;
		  let outputTokens = 0;
		  let lastTimestamp = null;

		  let stream;
		  try {
		    stream = fs.createReadStream(jsonlPath, { encoding: 'utf8' });
		  } catch {
		    // Synchronous failure creating the stream — return zeros.
		    return zeroStats();
		  }

		  // createReadStream defers ENOENT to the 'error' event; convert to a
		  // rejected promise so the race below sees it uniformly.
		  const errorPromise = new Promise((_, reject) => {
		    stream.on('error', reject);
		  });

		  const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });

		  const linesPromise = (async () => {
		    for await (const line of rl) {
		      if (!line) continue;
		      let row;
		      try {
		        row = JSON.parse(line);
		      } catch {
		        // Skip malformed lines silently — the JSONL has been observed
		        // to contain partial writes during active sessions.
		        continue;
		      }
		      if (!row || row.type !== 'assistant') continue;
		      const msg = row.message;
		      if (!msg || typeof msg !== 'object') continue;

		      const messageId = typeof msg.id === 'string' ? msg.id : null;
		      if (messageId === null) continue;
		      if (seenMessageIds.has(messageId)) continue;
		      seenMessageIds.add(messageId);

		      const usage = msg.usage;
		      if (usage && typeof usage === 'object') {
		        // Number.isFinite is false for non-numbers, so string or
		        // missing token counts are simply not added.
		        if (Number.isFinite(usage.input_tokens)) {
		          inputTokens += Number(usage.input_tokens);
		        }
		        if (Number.isFinite(usage.output_tokens)) {
		          outputTokens += Number(usage.output_tokens);
		        }
		      }

		      if (typeof row.timestamp === 'string') {
		        // Lexicographic comparison is correct on ISO-8601 with consistent zone.
		        if (lastTimestamp === null || row.timestamp > lastTimestamp) {
		          lastTimestamp = row.timestamp;
		        }
		      }
		    }
		  })();

		  try {
		    await Promise.race([linesPromise, errorPromise]);
		  } catch {
		    return zeroStats();
		  } finally {
		    // Release both the readline interface and the underlying stream on
		    // every path, including the error path where the line loop may not
		    // have finished.
		    try { rl.close(); } catch { /* ignore */ }
		    try { stream.destroy(); } catch { /* ignore */ }
		  }

		  const costUsd =
		    (inputTokens / 1_000_000) * INPUT_USD_PER_M_TOKENS +
		    (outputTokens / 1_000_000) * OUTPUT_USD_PER_M_TOKENS;

		  return {
		    thoughtCount: seenMessageIds.size,
		    inputTokens,
		    outputTokens,
		    costUsd,
		    lastActivityAt: lastTimestamp,
		  };
		}

		/**
		 * Build a fresh all-zero stats record (no activity timestamp).
		 *
		 * @returns {{thoughtCount:number, inputTokens:number, outputTokens:number, costUsd:number, lastActivityAt:null}}
		 */
		function zeroStats() {
		  const counters = { thoughtCount: 0, inputTokens: 0, outputTokens: 0, costUsd: 0 };
		  return { ...counters, lastActivityAt: null };
		}

		/**
		* @typedef {object} WorldActivityTrackerDeps
		* @property {string} [dbPath] Path to worlds.db; defaults to
		* `OLAM_WORLDS_DB` env var or `~/.olam/worlds.db`.
		* @property {object} [broadcaster] Object with `.broadcast(type, payload)`
		* (e.g. the return of `createHostStream`). Optional — when absent
		* events are skipped but DB writes still happen.
		* @property {number} [intervalMs] Tick cadence. Defaults to
		* `OLAM_WORLD_ACTIVITY_TICK_MS` env or 60000.
		* @property {string} [jsonlPathTemplate] JSONL path template.
		* `{worldId}` is replaced per world. Defaults to
		* `OLAM_WORLD_JSONL_PATH_TEMPLATE` env or
		* `~/.olam/worlds/{worldId}/state/claude-main.jsonl`.
		* @property {(msg: string) => void} [log] Defaults to `console.log`.
		* @property {(msg: string) => void} [debug] Optional verbose log; defaults
		* to no-op (debug-level skips on missing JSONL would be noisy).
		* @property {(cb: () => void, ms: number) => any} [setTimer] Injectable
		* `setInterval` for tests.
		* @property {(handle: any) => void} [clearTimer] Injectable
		* `clearInterval` for tests.
		* @property {() => Date} [now] Clock injection for tests.
		*/

		/**
		* @typedef {object} WorldActivityTrackerHandle
		* @property {() => void} stop
		* @property {() => Promise<number>} tickNow Run one tick immediately
		* (resolves to the count of worlds processed). Exposed for tests.
		*/

		/**
		 * Start the world activity tracker. Returns a `{ stop, tickNow }`
		 * handle. Safe to call before the worlds.db file exists — each tick is
		 * skipped (with a debug log) until the file appears.
		 *
		 * Scheduling: one tick is queued with `setImmediate`, then `setTimer`
		 * fires further ticks every `intervalMs`. Ticks never overlap; a tick
		 * that starts while another is in flight returns 0 immediately.
		 *
		 * The interval comes from `deps.intervalMs` when given, otherwise from
		 * `OLAM_WORLD_ACTIVITY_TICK_MS`. The env value must parse to a positive
		 * integer; anything else falls back to `DEFAULT_TICK_MS`, because
		 * handing NaN or 0 to `setInterval` would run the scanner in a ~1 ms
		 * loop.
		 *
		 * @param {WorldActivityTrackerDeps} [deps]
		 * @returns {WorldActivityTrackerHandle}
		 */
		export function startWorldActivityTracker(deps = {}) {
		  const log = deps.log ?? ((m) => console.log(`[world-activity] ${m}`));
		  const debug = deps.debug ?? (() => {});
		  const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
		  const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
		  const now = deps.now ?? (() => new Date());

		  /**
		   * Tick cadence from the environment, validated.
		   * @returns {number}
		   */
		  function intervalFromEnv() {
		    const raw = process.env.OLAM_WORLD_ACTIVITY_TICK_MS;
		    const parsed = Number.parseInt(raw ?? '', 10);
		    if (Number.isFinite(parsed) && parsed > 0) return parsed;
		    if (raw !== undefined && raw !== '') {
		      log(`ignoring invalid OLAM_WORLD_ACTIVITY_TICK_MS=${raw}; using ${DEFAULT_TICK_MS}ms`);
		    }
		    return DEFAULT_TICK_MS;
		  }

		  // An explicit deps.intervalMs is the caller's choice and is used as-is.
		  const intervalMs = deps.intervalMs ?? intervalFromEnv();

		  const dbPath =
		    deps.dbPath ??
		    process.env.OLAM_WORLDS_DB ??
		    path.join(os.homedir(), '.olam', 'worlds.db');

		  const jsonlPathTemplate =
		    deps.jsonlPathTemplate ??
		    process.env.OLAM_WORLD_JSONL_PATH_TEMPLATE ??
		    '~/.olam/worlds/{worldId}/state/claude-main.jsonl';

		  const broadcaster = deps.broadcaster ?? null;

		  let stopped = false;
		  let inFlight = false;
		  let intervalHandle = null;

		  /**
		   * One tick: open DB, read active worlds, scan each JSONL, write back,
		   * emit event. Returns the count of worlds processed.
		   *
		   * @returns {Promise<number>}
		   */
		  async function tick() {
		    if (stopped) return 0;
		    if (inFlight) {
		      // Skip overlap — slow filesystem must not pile up ticks.
		      debug('tick skipped: previous tick still in flight');
		      return 0;
		    }
		    inFlight = true;

		    let db = null;
		    let processed = 0;
		    try {
		      let Database;
		      try {
		        Database = require('better-sqlite3');
		      } catch (err) {
		        // better-sqlite3 unavailable (e.g. container without native
		        // build) — log and skip this tick.
		        log(`better-sqlite3 unavailable; skipping tick: ${err.message}`);
		        return 0;
		      }

		      try {
		        db = new Database(dbPath, { fileMustExist: true });
		      } catch (err) {
		        // SQLITE_CANTOPEN (file absent) is the expected first-boot
		        // case; everything else is worth surfacing.
		        if (err.code !== 'SQLITE_CANTOPEN') {
		          log(`open ${dbPath} failed: ${err.message}`);
		        } else {
		          debug(`${dbPath} not present yet; skipping tick`);
		        }
		        return 0;
		      }

		      let activeWorlds;
		      try {
		        activeWorlds = db
		          .prepare(
		            "SELECT id FROM worlds WHERE status NOT IN ('destroyed', 'failed')",
		          )
		          .all();
		      } catch (err) {
		        log(`query active worlds failed: ${err.message}`);
		        return 0;
		      }

		      const updateStmt = db.prepare(
		        `UPDATE worlds
		SET thought_count = ?,
		total_cost_usd = ?,
		updated_at = ?
		WHERE id = ?`,
		      );

		      for (const row of activeWorlds) {
		        // stop() may be called while a scan is awaited; bail out
		        // between worlds rather than finishing the whole list.
		        if (stopped) break;
		        const worldId = row.id;
		        if (typeof worldId !== 'string') continue;
		        const jsonlPath = resolveJsonlPath(jsonlPathTemplate, worldId);

		        let stats;
		        try {
		          stats = await scanWorldJsonl(jsonlPath);
		        } catch (err) {
		          // Defence in depth — scanWorldJsonl is already fail-soft, but
		          // this catches anything unforeseen at the call seam.
		          debug(`scan ${worldId} failed: ${err.message}`);
		          continue;
		        }

		        const updatedAt = now().toISOString();
		        try {
		          updateStmt.run(
		            stats.thoughtCount,
		            stats.costUsd,
		            updatedAt,
		            worldId,
		          );
		        } catch (err) {
		          log(`update ${worldId} failed: ${err.message}`);
		          continue;
		        }

		        if (broadcaster && typeof broadcaster.broadcast === 'function') {
		          try {
		            broadcaster.broadcast('world.activity.tick', {
		              worldId,
		              thoughtCount: stats.thoughtCount,
		              costUsd: stats.costUsd,
		              inputTokens: stats.inputTokens,
		              outputTokens: stats.outputTokens,
		              lastActivityAt: stats.lastActivityAt,
		              updatedAt,
		            });
		          } catch (err) {
		            // A failing subscriber must not undo the DB write or stop
		            // the remaining worlds.
		            log(`broadcast ${worldId} failed: ${err.message}`);
		          }
		        }

		        processed += 1;
		      }
		    } finally {
		      if (db) {
		        try { db.close(); } catch { /* ignore */ }
		      }
		      inFlight = false;
		    }

		    return processed;
		  }

		  // Kick off an initial tick on next event-loop turn so callers can
		  // attach test spies before any DB work happens.
		  setImmediate(() => {
		    if (stopped) return;
		    void tick().catch((err) => {
		      log(`initial tick crashed: ${err?.message ?? err}`);
		    });
		  });

		  intervalHandle = setTimer(() => {
		    void tick().catch((err) => {
		      log(`tick crashed: ${err?.message ?? err}`);
		    });
		  }, intervalMs);
		  // Don't pin the event loop on shutdown.
		  if (intervalHandle && typeof intervalHandle.unref === 'function') {
		    intervalHandle.unref();
		  }

		  log(
		    `started: db=${dbPath} template=${jsonlPathTemplate} interval=${intervalMs}ms`,
		  );

		  return {
		    stop() {
		      if (stopped) return;
		      stopped = true;
		      if (intervalHandle !== null) {
		        try { clearTimer(intervalHandle); } catch { /* ignore */ }
		        intervalHandle = null;
		      }
		    },
		    tickNow: tick,
		  };
		}

-176

host-cp/src/world-names-store.mjs

		// Phase F-2-D follow-up: persistent world-name store.
		//
		// Background: world.id is the docker container suffix (e.g. `gold-arc-1454`)
		// and is immutable. Operators want a separate human-friendly `name`
		// (e.g. "Refactor the auth module") so the worlds list reads like a
		// task board instead of a string of CSS-color-words.
		//
		// Storage: a single JSON file at /data/world-names.json (mounted from
		// ~/.olam/world-names.json on the host). Atomic write via tmp+rename so
		// concurrent PATCHes can't half-write the file. Read-on-demand with a
		// tiny in-process cache keyed off mtime so steady-state GET /api/worlds
		// doesn't reread the file every poll.
		//
		// Schema:
		// { "<worldId>": "<name>", ... }
		//
		// Names are arbitrary UTF-8 strings, capped at NAME_MAX_LEN to keep
		// the file small + the UI sane.

		import fs from 'node:fs';
		import path from 'node:path';

		const NAME_MAX_LEN = 120;

		/**
		* @typedef {object} WorldNamesStore
		* @property {() => Record<string, string>} all
		* @property {(id: string) => string \| null} get
		* @property {(id: string, name: string) => string} set
		* @property {(id: string) => void} remove
		*/

		/**
		 * Create a JSON-backed world-names store rooted at `filePath`.
		 * Resilient to a missing file (treats as empty); resilient to a
		 * malformed file (logs + treats as empty).
		 *
		 * Reads go through an in-memory cache that is refreshed only when the
		 * file's mtime differs from the one recorded at the last read/write.
		 * Writes go to a temp file that is then renamed over `filePath`; the
		 * cache is replaced only after the rename succeeds, so a failed write
		 * leaves memory and disk in agreement.
		 *
		 * @param {string} filePath
		 * @returns {WorldNamesStore}
		 */
		export function createWorldNamesStore(filePath) {
		  /** @type {Record<string, string>} */
		  let cache = {};
		  let cacheMtimeMs = -1;

		  // Own-property test: world ids such as "toString" or "constructor"
		  // must not resolve to members inherited from Object.prototype.
		  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

		  function readFromDisk() {
		    if (!fs.existsSync(filePath)) {
		      cache = {};
		      cacheMtimeMs = 0;
		      return;
		    }
		    try {
		      const stat = fs.statSync(filePath);
		      if (stat.mtimeMs === cacheMtimeMs) return; // cache hit
		      const raw = fs.readFileSync(filePath, 'utf-8');
		      const parsed = JSON.parse(raw);
		      if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
		        const next = {};
		        // Keep only string values; anything else in the file is dropped.
		        for (const [k, v] of Object.entries(parsed)) {
		          if (typeof v === 'string') next[k] = v;
		        }
		        cache = next;
		      } else {
		        cache = {};
		      }
		      cacheMtimeMs = stat.mtimeMs;
		    } catch (err) {
		      console.error(`world-names-store: failed to read ${filePath}: ${err.message}`);
		      cache = {};
		      cacheMtimeMs = 0;
		    }
		  }

		  /**
		   * Persist `next` via tmp+rename, then adopt it as the cache.
		   * @param {Record<string, string>} next
		   */
		  function writeToDisk(next) {
		    const dir = path.dirname(filePath);
		    fs.mkdirSync(dir, { recursive: true });
		    const tmp = `${filePath}.tmp-${process.pid}-${Date.now()}`;
		    try {
		      fs.writeFileSync(tmp, JSON.stringify(next, null, 2), 'utf-8');
		      fs.renameSync(tmp, filePath);
		    } catch (err) {
		      // Don't leave a stray temp file behind; the original error is
		      // what the caller needs to see.
		      try { fs.unlinkSync(tmp); } catch { /* ignore */ }
		      throw err;
		    }
		    cache = next;
		    try {
		      cacheMtimeMs = fs.statSync(filePath).mtimeMs;
		    } catch {
		      cacheMtimeMs = 0;
		    }
		  }

		  /** @returns {Record<string, string>} a shallow copy of every stored name */
		  function all() {
		    readFromDisk();
		    return { ...cache };
		  }

		  /**
		   * @param {string} id
		   * @returns {string | null}
		   */
		  function get(id) {
		    readFromDisk();
		    return hasOwn(cache, id) ? (cache[id] ?? null) : null;
		  }

		  /**
		   * @param {string} id
		   * @param {string} name
		   * @returns {string} the normalized name actually stored
		   * @throws {Error} when `id` is not a non-empty string or `name`
		   *   normalizes to nothing
		   */
		  function set(id, name) {
		    if (typeof id !== 'string' || id.length === 0) {
		      throw new Error('worldId must be a non-empty string');
		    }
		    const normalized = normalizeName(name);
		    if (normalized === null) {
		      throw new Error('name must be a non-empty string (after trim)');
		    }
		    readFromDisk();
		    writeToDisk({ ...cache, [id]: normalized });
		    return normalized;
		  }

		  /**
		   * Delete the name for `id`; a no-op (no write) when none is stored.
		   * @param {string} id
		   */
		  function remove(id) {
		    readFromDisk();
		    if (!hasOwn(cache, id)) return;
		    const next = { ...cache };
		    delete next[id];
		    writeToDisk(next);
		  }

		  return { all, get, set, remove };
		}

		/**
		 * Normalize a name input. Collapses every whitespace run to a single
		 * space, trims, and caps the result at NAME_MAX_LEN UTF-16 code units.
		 * Returns null for non-string or empty/whitespace-only input.
		 *
		 * The cap never leaves half of a surrogate pair behind: if the cut
		 * lands between the two code units of an astral character (e.g. an
		 * emoji), that character is dropped entirely.
		 *
		 * @param {unknown} input
		 * @returns {string | null}
		 */
		export function normalizeName(input) {
		  if (typeof input !== 'string') return null;
		  const trimmed = input.replace(/\s+/g, ' ').trim();
		  if (trimmed.length === 0) return null;
		  if (trimmed.length <= NAME_MAX_LEN) return trimmed;

		  let capped = trimmed.slice(0, NAME_MAX_LEN);
		  const lastUnit = capped.charCodeAt(capped.length - 1);
		  // 0xD800–0xDBFF is the high-surrogate range; one at the very end
		  // means its low-surrogate partner was sliced off.
		  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
		    capped = capped.slice(0, -1);
		  }
		  // The cut may have exposed a space that used to sit mid-string.
		  return capped.trimEnd();
		}

		/**
		 * Derive a human-friendly name from an initial task / dispatch text.
		 * Takes the first sentence (split on `.`/`?`/`!`/newline), collapses
		 * whitespace, and caps at ~60 chars at a word boundary so the UI
		 * doesn't truncate mid-word.
		 * Returns null for empty input — caller falls back to id.
		 *
		 * The split happens BEFORE whitespace is collapsed. Collapsing first
		 * would turn every newline into a space and the newline terminator
		 * could never match, so a multi-line task would be read as one long
		 * sentence.
		 *
		 * @param {unknown} taskText
		 * @returns {string | null}
		 */
		export function inferNameFromTask(taskText) {
		  if (typeof taskText !== 'string') return null;
		  const body = taskText.trim();
		  if (body.length === 0) return null;
		  // First sentence terminator wins; otherwise the whole string.
		  const firstSentence = body.split(/[.!?\n]/)[0].replace(/\s+/g, ' ').trim();
		  const SOFT_CAP = 60;
		  if (firstSentence.length <= SOFT_CAP) return firstSentence || null;
		  // Cap at a word boundary close to SOFT_CAP so we don't dangle
		  // half a word. A boundary in the first half is ignored — cutting
		  // there would throw away too much of the sentence.
		  const head = firstSentence.slice(0, SOFT_CAP);
		  const lastSpace = head.lastIndexOf(' ');
		  const truncated = lastSpace > 30 ? head.slice(0, lastSpace) : head;
		  // Strip separators left dangling by the cut; null if nothing remains.
		  return truncated.replace(/[\s,;:—–-]+$/u, '') || null;
		}

-97

host-cp/src/world-pr-state.mjs

		import fs from 'node:fs';
		import path from 'node:path';

		/**
		* @typedef {object} PrStateEntry
		* @property {string} pr_url
		* @property {number\|null} pr_number
		* @property {string\|null} pr_repo
		* @property {string\|null} pr_created_at
		* @property {'open'\|'merged'\|'merged_destroyed'} pr_state
		* @property {string\|null} pr_merged_at
		* @property {boolean} auto_destroy_on_merge
		*/

		/**
		 * Create a JSON-backed store of per-world PR state rooted at
		 * `filePath`. The file holds one object keyed by world id.
		 *
		 * A missing file reads as empty; an unreadable or malformed file is
		 * logged and reads as empty. Reads are cached and refreshed only when
		 * the file's mtime changes. Writes go through a temp file + rename,
		 * and the cache is replaced only after the rename succeeds.
		 *
		 * @param {string} filePath
		 * @returns {{
		 *   getAll: () => Record<string, PrStateEntry>,
		 *   get: (worldId: string) => PrStateEntry | null,
		 *   set: (worldId: string, data: Partial<PrStateEntry>) => void,
		 *   remove: (worldId: string) => void,
		 *   getWorldsToWatch: () => Array<{worldId: string} & PrStateEntry>,
		 * }}
		 */
		export function createWorldPrStateStore(filePath) {
		  /** @type {Record<string, PrStateEntry>} */
		  let cache = {};
		  let cacheMtimeMs = -1;

		  // Own-property test so ids like "constructor" don't resolve to
		  // members inherited from Object.prototype.
		  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
		  const isRecord = (value) => value !== null && typeof value === 'object';

		  function readFromDisk() {
		    if (!fs.existsSync(filePath)) {
		      cache = {};
		      cacheMtimeMs = 0;
		      return;
		    }
		    try {
		      const stat = fs.statSync(filePath);
		      if (stat.mtimeMs === cacheMtimeMs) return; // cache hit
		      const raw = fs.readFileSync(filePath, 'utf-8');
		      const parsed = JSON.parse(raw);
		      cache = parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? parsed : {};
		      cacheMtimeMs = stat.mtimeMs;
		    } catch (err) {
		      console.error(`world-pr-state: failed to read ${filePath}: ${err.message}`);
		      cache = {};
		      cacheMtimeMs = 0;
		    }
		  }

		  /**
		   * Persist `next` via tmp+rename, then adopt it as the cache.
		   * @param {Record<string, PrStateEntry>} next
		   */
		  function writeToDisk(next) {
		    const dir = path.dirname(filePath);
		    fs.mkdirSync(dir, { recursive: true });
		    const tmp = `${filePath}.tmp-${process.pid}-${Date.now()}`;
		    try {
		      fs.writeFileSync(tmp, JSON.stringify(next, null, 2), 'utf-8');
		      fs.renameSync(tmp, filePath);
		    } catch (err) {
		      try { fs.unlinkSync(tmp); } catch { /* ignore */ }
		      throw err;
		    }
		    cache = next;
		    try {
		      cacheMtimeMs = fs.statSync(filePath).mtimeMs;
		    } catch {
		      cacheMtimeMs = 0;
		    }
		  }

		  /** @returns {Record<string, PrStateEntry>} shallow copy of the whole map */
		  function getAll() {
		    readFromDisk();
		    return { ...cache };
		  }

		  /**
		   * @param {string} worldId
		   * @returns {PrStateEntry | null}
		   */
		  function get(worldId) {
		    readFromDisk();
		    return hasOwn(cache, worldId) ? (cache[worldId] ?? null) : null;
		  }

		  /**
		   * Upsert — merges data with the existing entry.
		   * @param {string} worldId
		   * @param {Partial<PrStateEntry>} data
		   */
		  function set(worldId, data) {
		    readFromDisk();
		    const current = hasOwn(cache, worldId) ? cache[worldId] : null;
		    // A non-object value already in the file is replaced, not merged.
		    const existing = isRecord(current) ? current : {};
		    writeToDisk({ ...cache, [worldId]: { ...existing, ...data } });
		  }

		  /**
		   * Delete the entry for `worldId`; a no-op (no write) when absent.
		   * @param {string} worldId
		   */
		  function remove(worldId) {
		    readFromDisk();
		    if (!hasOwn(cache, worldId)) return;
		    const next = { ...cache };
		    delete next[worldId];
		    writeToDisk(next);
		  }

		  /**
		   * Entries that have a PR url and are not yet `merged_destroyed`,
		   * each flattened together with its `worldId`. Non-object values in
		   * the file are skipped rather than dereferenced.
		   */
		  function getWorldsToWatch() {
		    readFromDisk();
		    return Object.entries(cache)
		      .filter(([, entry]) => isRecord(entry) && entry.pr_url && entry.pr_state !== 'merged_destroyed')
		      .map(([worldId, entry]) => ({ worldId, ...entry }));
		  }

		  return { getAll, get, set, remove, getWorldsToWatch };
		}

-322

host-cp/src/world-progress.mjs

		/**
		* World progress computation — maps world state onto the 8-phase ladder
		* shown in the inbox row progress bar.
		*
		* @module world-progress
		*/

		import path from 'node:path';
		import { homedir } from 'node:os';
		import { createRequire } from 'node:module';
		import { execFile } from 'node:child_process';
		import { promisify } from 'node:util';
		import { readPlanProgress } from './plan-progress.mjs';

		const execFileAsync = promisify(execFile);

		// Local copy of the helper in @olam/core/src/world-paths.mjs. It is
		// duplicated on purpose: the slim host-cp Docker image does NOT bundle
		// @olam/core (see server.mjs ~L560 for the architectural decision).
		// Until the image build learns to vendor workspace deps, the two
		// definitions have to be kept in sync by hand.
		const WORLD_DB_FILENAME = 'world.db';

		/**
		 * Location of the per-world database inside a workspace directory.
		 *
		 * @param {string} workspacePath
		 * @returns {string}
		 */
		function getWorldDbPath(workspacePath) {
		  const dbFile = path.join(workspacePath, WORLD_DB_FILENAME);
		  return dbFile;
		}

		/**
		 * Phase ladder definition: phase names in ladder order, each paired
		 * with its 1-based position.
		 * @type {Array<{name: string, index: number}>}
		 */
		const PHASES = [
		  'starting',
		  'implementing',
		  'committing',
		  'pushing',
		  'in_review',
		  'ci_failed',
		  'ready',
		  'merged',
		].map((name, position) => ({ name, index: position + 1 }));

		// Number of rungs on the ladder.
		const PHASE_TOTAL = PHASES.length;
		const IDLE_THRESHOLD_MS = 300_000; // 5 minutes

		/**
		 * Determine the current phase from observable state.
		 *
		 * Precedence, highest first: a merged PR; then, when a PR url exists,
		 * the check status (failing → `ci_failed`, passing on an open PR →
		 * `ready`, anything else → `in_review`); then, with no PR, the git
		 * state (pushed → `pushing`, at least one commit ahead → `committing`);
		 * finally the thought count (30 or more → `implementing`, otherwise
		 * `starting`).
		 *
		 * @param {{
		 *   thoughts: number,
		 *   commitsAhead: number,
		 *   pushed: boolean,
		 *   prUrl: string|null,
		 *   prChecks: 'pending'|'passing'|'failing'|null,
		 *   prState: 'open'|'merged'|'closed'|null,
		 * }} state
		 * @returns {string} phase name
		 */
		export function determinePhase({ thoughts, commitsAhead, pushed, prUrl, prChecks, prState }) {
		  if (prState === 'merged') return 'merged';

		  // Before a PR exists, local git progress and thought volume decide.
		  if (!prUrl) {
		    if (pushed) return 'pushing';
		    if (commitsAhead >= 1) return 'committing';
		    return thoughts >= 30 ? 'implementing' : 'starting';
		  }

		  // A PR exists: its checks decide.
		  switch (prChecks) {
		    case 'failing':
		      return 'ci_failed';
		    case 'passing':
		      return prState === 'open' ? 'ready' : 'in_review';
		    default:
		      // null or pending
		      return 'in_review';
		  }
		}

		/**
		 * Build the safe/default progress payload for a world: first phase,
		 * zeroed counters, and no PR or plan information.
		 *
		 * @param {string} worldId
		 * @returns {object}
		 */
		export function makeSafeResponse(worldId) {
		  const phaseFields = {
		    phase: 'starting',
		    phaseIndex: 1,
		    phaseTotal: PHASE_TOTAL,
		    isIdle: false,
		  };
		  const activityFields = {
		    thoughts: 0,
		    lastActivityAt: null,
		    runtimeMs: 0,
		  };
		  const gitFields = {
		    commitsAhead: 0,
		    pushed: false,
		  };
		  const prFields = {
		    prUrl: null,
		    prNumber: null,
		    prChecks: null,
		    prState: null,
		  };
		  return { worldId, ...phaseFields, ...activityFields, ...gitFields, ...prFields, plan: null };
		}

		/**
		 * Read a world row from worlds.db.
		 *
		 * Best-effort: any failure (driver not installed, file missing, query
		 * error) yields null. The connection is closed in `finally`, so a
		 * throwing `prepare`/`get` cannot leak the handle.
		 *
		 * `repos` is stored either as a JSON string or as a value; whatever it
		 * decodes to, only an array is passed on — anything else becomes `[]`.
		 *
		 * @param {string} dbPath
		 * @param {string} worldId
		 * @returns {{ branch: string, repos: string[], workspacePath: string, createdAt: string | null } | null}
		 */
		function defaultReadWorldRow(dbPath, worldId) {
		  let db = null;
		  try {
		    const Database = createRequire(import.meta.url)('better-sqlite3');
		    db = new Database(dbPath, { readonly: true });
		    db.pragma('journal_mode = WAL');
		    const row = db.prepare(
		      'SELECT branch, repos, workspace_path, created_at FROM worlds WHERE id = ?',
		    ).get(worldId);
		    if (!row) return null;
		    let repos = [];
		    try {
		      const decoded = typeof row.repos === 'string' ? JSON.parse(row.repos) : row.repos;
		      if (Array.isArray(decoded)) repos = decoded;
		    } catch {
		      repos = [];
		    }
		    return {
		      branch: row.branch ?? 'main',
		      repos,
		      workspacePath: row.workspace_path ?? '',
		      createdAt: row.created_at ?? null,
		    };
		  } catch {
		    return null;
		  } finally {
		    if (db) {
		      try { db.close(); } catch { /* ignore */ }
		    }
		  }
		}

		/**
		 * Read thought count and last activity from a world.db.
		 *
		 * Best-effort: any failure (driver not installed, file missing, table
		 * absent) yields `{ count: 0, lastAt: null }`. The connection is closed
		 * in `finally`, so a throwing query cannot leak the handle.
		 *
		 * @param {string} dbPath
		 * @returns {{ count: number, lastAt: string|null }}
		 */
		function defaultReadThoughts(dbPath) {
		  let db = null;
		  try {
		    const Database = createRequire(import.meta.url)('better-sqlite3');
		    db = new Database(dbPath, { readonly: true });
		    db.pragma('journal_mode = WAL');
		    const row = db
		      .prepare('SELECT COUNT(*) AS cnt, MAX(created_at) AS last_at FROM thought_nodes')
		      .get();
		    return {
		      count: Number(row?.cnt ?? 0),
		      lastAt: row?.last_at ?? null,
		    };
		  } catch {
		    return { count: 0, lastAt: null };
		  } finally {
		    if (db) {
		      try { db.close(); } catch { /* ignore */ }
		    }
		  }
		}

		/**
		 * Count the commits on HEAD that are not on `origin/main` for a git
		 * worktree, by running `git rev-list origin/main..HEAD --count` with a
		 * 5 second timeout. Any failure, or output that is not a number,
		 * counts as 0.
		 *
		 * @param {string} worktreePath
		 * @returns {Promise<number>}
		 */
		async function defaultGitCommitsAhead(worktreePath) {
		  const gitArgs = ['-C', worktreePath, 'rev-list', 'origin/main..HEAD', '--count'];
		  try {
		    const result = await execFileAsync('git', gitArgs, { timeout: 5000 });
		    const ahead = Number.parseInt(result.stdout.trim(), 10);
		    if (Number.isFinite(ahead)) return ahead;
		  } catch {
		    // fall through to the 0 default
		  }
		  return 0;
		}

		/**
		 * Check whether the branch has been pushed to origin, by asking git to
		 * verify the ref `origin/<branch>` in the given worktree (5 second
		 * timeout). A successful command means true; any failure means false.
		 *
		 * @param {string} worktreePath
		 * @param {string} branch
		 * @returns {Promise<boolean>}
		 */
		async function defaultGitIsPushed(worktreePath, branch) {
		  const gitArgs = ['-C', worktreePath, 'rev-parse', '--quiet', '--verify', `origin/${branch}`];
		  let refExists = true;
		  try {
		    await execFileAsync('git', gitArgs, { timeout: 5000 });
		  } catch {
		    refExists = false;
		  }
		  return refExists;
		}

		/**
		 * Compute the current progress state for a world.
		 *
		 * Combines the world row, the thought count, git state and (optionally) PR
		 * state into one snapshot. If the world row is missing, or anything inside
		 * the main try block throws, the fallback built by makeSafeResponse(worldId)
		 * is returned instead.
		 *
		 * @param {string} worldId
		 * @param {{
		 *   worldsDbPath?: string,
		 *   prCache?: { getPr: (prUrl: string, getToken: () => Promise<string|null>) => Promise<{state:string|null,number:number|null,checks:string|null}|null> },
		 *   prStateStore?: { get: (worldId: string) => object|null },
		 *   getGhToken?: () => Promise<string|null>,
		 *   _readWorldRow?: (dbPath: string, worldId: string) => object|null,
		 *   _readThoughts?: (dbPath: string) => { count: number, lastAt: string|null },
		 *   _gitCommitsAhead?: (worktreePath: string) => Promise<number>,
		 *   _gitIsPushed?: (worktreePath: string, branch: string) => Promise<boolean>,
		 * }} [deps] Injectable collaborators; the underscore-prefixed ones default
		 *   to the module's own readers.
		 * @returns {Promise<object>}
		 */
		export async function computeProgress(worldId, deps = {}) {
		  const safe = makeSafeResponse(worldId);

		  try {
		    const {
		      worldsDbPath = process.env.OLAM_WORLDS_DB ?? path.join(homedir(), '.olam/worlds.db'),
		      prCache = null,
		      prStateStore = null,
		      getGhToken = async () => null,
		      _readWorldRow = defaultReadWorldRow,
		      _readThoughts = defaultReadThoughts,
		      _gitCommitsAhead = defaultGitCommitsAhead,
		      _gitIsPushed = defaultGitIsPushed,
		    } = deps;

		    // Read world row
		    const worldRow = _readWorldRow(worldsDbPath, worldId);
		    if (!worldRow) return safe;

		    const { branch, repos, workspacePath, createdAt } = worldRow;
		    // Git commands run in the first repo when the world lists any,
		    // otherwise in the workspace root itself.
		    const worktreePath = repos.length > 0 ? path.join(workspacePath, repos[0]) : workspacePath;

		    // Runtime since creation. An absent or unparseable createdAt yields 0
		    // rather than NaN (Math.max(0, NaN) is NaN), and clock skew never
		    // produces a negative value.
		    const createdAtMs = createdAt ? new Date(createdAt).getTime() : Number.NaN;
		    const runtimeMs = Number.isNaN(createdAtMs) ? 0 : Math.max(0, Date.now() - createdAtMs);

		    // Read thoughts
		    const thoughtsDbPath = getWorldDbPath(workspacePath);
		    const { count: thoughts, lastAt: thoughtsLastAt } = _readThoughts(thoughtsDbPath);

		    // Git state
		    const [commitsAhead, pushed] = await Promise.all([
		      _gitCommitsAhead(worktreePath),
		      _gitIsPushed(worktreePath, branch),
		    ]);

		    // PR state — check prStateStore first
		    let prUrl = null;
		    let prNumber = null;
		    let prState = null;
		    let prChecks = null;

		    const prEntry = prStateStore ? prStateStore.get(worldId) : null;
		    if (prEntry) {
		      prUrl = prEntry.pr_url ?? null;
		      prNumber = prEntry.pr_number ?? null;
		      // Normalize merged_destroyed → merged, and 'none' → null
		      const rawState = prEntry.pr_state ?? null;
		      if (rawState === 'merged_destroyed') {
		        prState = 'merged';
		      } else {
		        prState = rawState === 'none' ? null : rawState;
		      }
		    }

		    // Live PR data from cache
		    if (prUrl && prCache) {
		      try {
		        const livePr = await prCache.getPr(prUrl, getGhToken);
		        if (livePr) {
		          prChecks = livePr.checks;
		          // Update state if live data shows merged
		          if (livePr.state === 'merged') prState = 'merged';
		          if (livePr.number != null) prNumber = livePr.number;
		        }
		      } catch {
		        // Non-fatal: keep whatever the state store provided.
		      }
		    }

		    // Determine phase
		    const phase = determinePhase({ thoughts, commitsAhead, pushed, prUrl, prChecks, prState });
		    const phaseEntry = PHASES.find((p) => p.name === phase) ?? PHASES[0];

		    // Parsed once; used by both the idle overlay and the plan reader.
		    const lastActivityAtMs = thoughtsLastAt ? new Date(thoughtsLastAt).getTime() : null;

		    // Idle overlay — only for implementing or committing phases
		    const idleEligible = phase === 'implementing' || phase === 'committing';
		    const isIdle =
		      idleEligible &&
		      lastActivityAtMs !== null &&
		      !Number.isNaN(lastActivityAtMs) &&
		      Date.now() - lastActivityAtMs > IDLE_THRESHOLD_MS;

		    // Plan progress — additive; null when no tracker found
		    const plan = readPlanProgress(worktreePath, branch, { lastActivityAtMs });

		    return {
		      worldId,
		      phase,
		      phaseIndex: phaseEntry.index,
		      phaseTotal: PHASE_TOTAL,
		      isIdle,
		      thoughts,
		      lastActivityAt: thoughtsLastAt ?? null,
		      runtimeMs,
		      commitsAhead,
		      pushed,
		      prUrl,
		      prNumber,
		      prChecks,
		      prState,
		      plan,
		    };
		  } catch {
		    return safe;
		  }
		}

-136

host-cp/src/world-services.mjs

		// Service enrichment (Phase F-2-D dogfood fix) — extracted from server.mjs.
		//
		// Fetch port bindings for a world's container via docker-socket-proxy
		// inspect, map each to a clickable URL tagged with well-known internal
		// ports, and probe each for actual reachability.
		//
		// Extracted as a standalone module so the probe + enrichment logic can be
		// unit-tested in isolation (server.mjs has module-level side effects that
		// make direct import impractical). The two host-specific values that the
		// inline version read from server.mjs module constants — HOST_FOR_WORLD and
		// DOCKER_HOST — are injected as a `deps` object so the functions stay pure
		// and deterministically testable.

		/**
		 * Display names for container-internal ports, keyed by port number.
		 * Ports not listed here get a generic `App (port N)` label from
		 * fetchWorldServices().
		 */
		export const WELL_KNOWN_PORTS = {
		  '3000': 'atlas-core (Rails)',
		  '5175': 'diner-app (Vite)',
		  '7681': 'Terminal (ttyd)',
		  '7682': 'Terminal Shell (ttyd)',
		  '8080': 'Per-world CP',
		};

		/**
		 * Quick liveness probe against a service URL. Resolves true when the
		 * service answers with ANY HTTP response (1xx-5xx) — status codes are
		 * ignored because each app has its own conventions (Vite 200s on /, ttyd
		 * may 401, Rails may 500 on /). What matters is that something is listening.
		 *
		 * The probe targets `http://<hostForWorld>:<hostPort>/` because it runs on
		 * the host-cp side, where the SPA's own 127.0.0.1:<port> URL may not be
		 * reachable.
		 *
		 * A HEAD request is tried first; if it fails for any reason (connection
		 * refused, timeout, server closing on HEAD) a GET is tried as a fallback so
		 * picky servers do not produce false negatives. Each attempt has its own
		 * 800 ms timeout, so a dead port that swallows packets can take up to
		 * ~1.6 s to resolve false.
		 *
		 * @param {number} hostPort
		 * @param {{ hostForWorld: string }} deps
		 * @returns {Promise<boolean>}
		 */
		export async function probeServiceLive(hostPort, { hostForWorld }) {
		  const probeUrl = `http://${hostForWorld}:${hostPort}/`;

		  // One request with the given method; rejects when no response arrives.
		  const attempt = async (method) => {
		    const res = await fetch(probeUrl, {
		      method,
		      signal: AbortSignal.timeout(800),
		      redirect: 'manual',
		    });
		    // Only the status line matters. Cancel the unread body so the
		    // connection is released instead of lingering until garbage collection.
		    res.body?.cancel().catch(() => {});
		    return res.status > 0;
		  };

		  try {
		    return await attempt('HEAD');
		  } catch {
		    try {
		      return await attempt('GET');
		    } catch {
		      return false;
		    }
		  }
		}

		/**
		 * List the published ports of a world's devbox container as clickable
		 * services, each probed for reachability.
		 *
		 * The container (`olam-<worldId>-devbox`) is inspected either through the
		 * Docker HTTP API at `dockerHost`, or — when `dockerHost` is the literal
		 * 'docker-cli' — by running `docker inspect`. The port mapping only says
		 * what is CONFIGURED; probeServiceLive() confirms what is LISTENING and its
		 * result is exposed as `live`.
		 *
		 * Resolves to [] on any failure (container missing, docker unreachable,
		 * malformed JSON) so callers can still return a valid worlds list.
		 *
		 * @param {string} worldId
		 * @param {{ hostForWorld: string, dockerHost: string }} deps
		 * @returns {Promise<Array<{name: string, host_port: number, internal_port: number, url: string, live: boolean}>>}
		 *   Sorted by ascending internal port.
		 */
		export async function fetchWorldServices(worldId, { hostForWorld, dockerHost }) {
		  const containerName = `olam-${worldId}-devbox`;
		  try {
		    let data;
		    if (dockerHost === 'docker-cli') {
		      // Bare-node mode: shell out to `docker inspect` instead of HTTP.
		      // Without this the services array is always empty in bare-node and
		      // the SPA can't find the ttyd host port. The call is asynchronous so
		      // a slow docker CLI cannot stall the event loop for the 2 s timeout;
		      // a non-zero exit, timeout or missing binary rejects and lands in
		      // the catch below.
		      const [{ execFile }, { promisify }] = await Promise.all([
		        import('node:child_process'),
		        import('node:util'),
		      ]);
		      const { stdout } = await promisify(execFile)(
		        'docker',
		        ['inspect', containerName],
		        { encoding: 'utf-8', timeout: 2000 },
		      );
		      const arr = JSON.parse(stdout || '[]');
		      data = Array.isArray(arr) && arr.length > 0 ? arr[0] : null;
		      if (!data) return [];
		    } else {
		      const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
		      const res = await fetch(`${apiBase}/containers/${encodeURIComponent(containerName)}/json`, {
		        signal: AbortSignal.timeout(2000),
		      });
		      if (!res.ok) return [];
		      data = await res.json();
		    }

		    const ports = data?.NetworkSettings?.Ports ?? {};
		    const draft = [];
		    for (const [internal, bindings] of Object.entries(ports)) {
		      // Ports without a host binding carry no usable URL.
		      if (!Array.isArray(bindings) || bindings.length === 0) continue;
		      // Keys look like "3000/tcp"; only the first binding is used.
		      const internalPort = Number.parseInt(internal.split('/')[0], 10);
		      const hostPort = Number.parseInt(bindings[0].HostPort, 10);
		      if (!Number.isFinite(internalPort) || !Number.isFinite(hostPort)) continue;
		      draft.push({
		        name: WELL_KNOWN_PORTS[internalPort] ?? `App (port ${internalPort})`,
		        host_port: hostPort,
		        internal_port: internalPort,
		        url: `http://127.0.0.1:${hostPort}`,
		      });
		    }

		    // Probe each service in parallel for actual reachability. Adds a
		    // `live: boolean` field so the UI can tell configured-but-down from
		    // configured-and-up.
		    const liveResults = await Promise.all(
		      draft.map((s) => probeServiceLive(s.host_port, { hostForWorld })),
		    );
		    const services = draft.map((s, i) => ({ ...s, live: liveResults[i] }));

		    // Deterministic order: ascending internal port number.
		    services.sort((a, b) => a.internal_port - b.internal_port);
		    return services;
		  } catch {
		    return [];
		  }
		}

-311

host-cp/src/world-tunnel-manager.mjs

		import { spawn } from 'node:child_process';
		import fs from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';

		// Deployment-mode values injected by server.mjs via configure().
		// Defaults are bare-node-safe so the module is usable in tests without configure().
		// HOST_FOR_WORLD: host part of the `http://<host>:<port>` target that
		// startTunnel() passes to cloudflared; overridable via OLAM_HOST_FOR_WORLD.
		let HOST_FOR_WORLD = process.env.OLAM_HOST_FOR_WORLD ?? '127.0.0.1';
		// TUNNELS_PATH: JSON file read by loadState() and written by saveState();
		// overridable via OLAM_WORLD_TUNNELS_PATH.
		let TUNNELS_PATH =
		process.env.OLAM_WORLD_TUNNELS_PATH ??
		path.join(os.homedir(), '.olam', 'world-tunnels.json');

		/**
		 * Inject the deployment-specific host and persistence path.
		 *
		 * Intended to be called by server.mjs once it has resolved both values, so
		 * this module does not have to re-derive container-specific literals.
		 * The host is always replaced. The persisted state is re-read via
		 * loadState() only when `tunnelsPath` differs from the path currently in
		 * use.
		 *
		 * @param {{ hostForWorld: string, tunnelsPath: string }} settings
		 */
		export function configure({ hostForWorld, tunnelsPath }) {
		  HOST_FOR_WORLD = hostForWorld;

		  const pathUnchanged = tunnelsPath === TUNNELS_PATH;
		  if (pathUnchanged) return;

		  TUNNELS_PATH = tunnelsPath;
		  loadState();
		}

		// How long startTunnel() waits for cloudflared to print a URL before giving up.
		const TUNNEL_TIMEOUT_MS = 30_000;
		// Per-request timeout for the reachability check in probeAllOnStartup().
		const PROBE_TIMEOUT_MS = 3_000;
		// Matches a `https://<name>.trycloudflare.com` URL in cloudflared's output.
		const URL_PATTERN = /https:\/\/[a-z0-9-]+\.trycloudflare\.com/;

		/**
		 * Lifecycle states a tunnel entry can be in. The string values are what
		 * gets persisted and reported to callers.
		 */
		export const STATUS = {
		  'IDLE': 'idle',
		  'STARTING': 'starting',
		  'RUNNING': 'running',
		  'ERROR': 'error',
		  'STALE': 'stale',
		};

		/**
		 * Raised by startTunnel() when the requested service already has a tunnel
		 * in the starting or running state. Carries the offending identifiers.
		 */
		export class AlreadyStartingError extends Error {
		  constructor(worldId, serviceName) {
		    const message = `tunnel for ${serviceName} in world ${worldId} is already starting`;
		    super(message);
		    Object.assign(this, { name: 'AlreadyStartingError', worldId, serviceName });
		  }
		}

		/**
		 * Raised by startTunnel() when no tunnel URL was obtained. Carries the
		 * identifiers of the service that failed to publish.
		 */
		export class TunnelTimeoutError extends Error {
		  constructor(worldId, serviceName) {
		    const message = `tunnel for ${serviceName} in world ${worldId} timed out after 30s with no URL`;
		    super(message);
		    Object.assign(this, { name: 'TunnelTimeoutError', worldId, serviceName });
		  }
		}

		// In-memory tunnel registry, mirrored to disk by saveState().
		// Key: `${worldId}:${serviceName}` → {worldId, serviceName, port, status, url, process?}
		// `process` holds the live cloudflared child (or null) and is never persisted.
		const registry = new Map();

		/**
		 * Build the registry key for one service of one world.
		 *
		 * @param {string} worldId
		 * @param {string} serviceName
		 * @returns {string} `<worldId>:<serviceName>`
		 */
		function tunnelKey(worldId, serviceName) {
		  return ''.concat(worldId, ':', serviceName);
		}

		/**
		 * Merge the persisted tunnel state at TUNNELS_PATH into the registry.
		 *
		 * Does nothing when the file is missing or does not contain a plain JSON
		 * object. Read and parse errors are logged, never thrown.
		 *
		 * Loaded entries never have a live child process, so:
		 *  - `process` is always reset to null;
		 *  - an entry persisted as "starting" is demoted to "error". Nothing is
		 *    left to finish that start, and leaving it as "starting" would make
		 *    startTunnel() reject the service with AlreadyStartingError forever.
		 */
		function loadState() {
		  try {
		    if (!fs.existsSync(TUNNELS_PATH)) return;
		    const data = JSON.parse(fs.readFileSync(TUNNELS_PATH, 'utf-8'));
		    if (!data || typeof data !== 'object' || Array.isArray(data)) return;

		    for (const [key, entry] of Object.entries(data)) {
		      // Skip malformed records instead of spreading them into junk entries.
		      if (!entry || typeof entry !== 'object' || Array.isArray(entry)) continue;
		      const status = entry.status === STATUS.STARTING ? STATUS.ERROR : entry.status;
		      registry.set(key, { ...entry, status, process: null });
		    }
		  } catch (err) {
		    console.error(`world-tunnel-manager: loadState failed: ${err.message}`);
		  }
		}

		/**
		 * Persist the registry to TUNNELS_PATH, omitting each entry's `process`.
		 *
		 * The JSON is written to a uniquely named temp file next to the target and
		 * then renamed over it. Failures are logged, never thrown.
		 */
		function saveState() {
		  try {
		    fs.mkdirSync(path.dirname(TUNNELS_PATH), { recursive: true });

		    const snapshot = {};
		    registry.forEach((entry, key) => {
		      const persisted = { ...entry };
		      delete persisted.process;
		      snapshot[key] = persisted;
		    });

		    const tmp = `${TUNNELS_PATH}.tmp-${process.pid}-${Date.now()}`;
		    const serialized = JSON.stringify(snapshot, null, 2);
		    fs.writeFileSync(tmp, serialized, 'utf-8');
		    fs.renameSync(tmp, TUNNELS_PATH);
		  } catch (err) {
		    console.error(`world-tunnel-manager: saveState failed: ${err.message}`);
		  }
		}

		/**
		 * Start a cloudflared quick-tunnel for a world service.
		 *
		 * Registers the service as "starting", spawns
		 * `cloudflared tunnel --url http://<HOST_FOR_WORLD>:<port>`, and scans its
		 * stdout/stderr for a trycloudflare.com URL.
		 *
		 * Resolves with that URL and marks the entry "running".
		 * Rejects with AlreadyStartingError if the service is already starting or
		 * running.
		 * Rejects with TunnelTimeoutError if no URL is obtained — because the
		 * timeout elapsed, cloudflared failed to spawn, or it exited first. In all
		 * of these cases the child is terminated so no untracked cloudflared
		 * process is left behind.
		 *
		 * State written by stopTunnel()/killWorld() wins over late events from the
		 * child: once they have detached the process from the entry, this function
		 * no longer touches the entry's status.
		 *
		 * @param {string} worldId
		 * @param {string} serviceName
		 * @param {number} port host-side port (i.e. the published port on this machine)
		 * @returns {Promise<string>} the public tunnel URL
		 */
		export async function startTunnel(worldId, serviceName, port) {
		  const key = tunnelKey(worldId, serviceName);
		  const existing = registry.get(key);
		  if (existing && (existing.status === STATUS.STARTING || existing.status === STATUS.RUNNING)) {
		    throw new AlreadyStartingError(worldId, serviceName);
		  }

		  const entry = {
		    worldId,
		    serviceName,
		    port,
		    status: STATUS.STARTING,
		    url: null,
		    process: null,
		  };
		  registry.set(key, entry);
		  saveState();

		  const target = `http://${HOST_FOR_WORLD}:${port}`;
		  const child = spawn('cloudflared', ['tunnel', '--url', target], {
		    stdio: ['ignore', 'pipe', 'pipe'],
		    detached: false,
		  });
		  entry.process = child;

		  // True while this child is still the process recorded on the entry.
		  // stopTunnel()/killWorld() null out entry.process before the child's
		  // 'exit' event arrives; after that the entry belongs to them.
		  const ownsEntry = () => entry.process === child;

		  return new Promise((resolve, reject) => {
		    let settled = false;

		    function settle(resolvedUrl) {
		      if (settled) return;
		      settled = true;
		      clearTimeout(timer);

		      if (resolvedUrl && ownsEntry()) {
		        entry.status = STATUS.RUNNING;
		        entry.url = resolvedUrl;
		        saveState();
		        resolve(resolvedUrl);
		        return;
		      }

		      if (ownsEntry()) {
		        entry.status = STATUS.ERROR;
		        entry.process = null;
		        saveState();
		      }
		      // On timeout the child is still alive; terminate it so it does not
		      // keep running with nothing tracking it. Harmless if already dead.
		      try { child.kill('SIGTERM'); } catch { /* already dead */ }
		      reject(new TunnelTimeoutError(worldId, serviceName));
		    }

		    const timer = setTimeout(() => settle(null), TUNNEL_TIMEOUT_MS);

		    function scanChunk(chunk) {
		      const lines = chunk.toString().split('\n');
		      for (const line of lines) {
		        const match = URL_PATTERN.exec(line);
		        if (match) { settle(match[0]); return; }
		      }
		    }

		    child.stdout.on('data', scanChunk);
		    child.stderr.on('data', scanChunk);

		    child.on('error', (err) => {
		      console.error(`world-tunnel-manager: cloudflared spawn error: ${err.message}`);
		      settle(null);
		    });

		    child.on('exit', (code) => {
		      if (!settled) {
		        console.error(`world-tunnel-manager: cloudflared exited (code ${code}) before URL`);
		        settle(null);
		        return;
		      }
		      // Process died after the promise settled. Only report a dropped
		      // tunnel if nobody stopped it on purpose in the meantime.
		      if (ownsEntry()) {
		        entry.status = STATUS.ERROR;
		        entry.process = null;
		        saveState();
		      }
		    });
		  });
		}

		/**
		 * Stop the tunnel of one service and mark it idle.
		 *
		 * Sends SIGTERM to the tracked child (if any), clears the stored URL and
		 * persists the change. Does nothing when the service has no registry entry.
		 *
		 * @param {string} worldId
		 * @param {string} serviceName
		 */
		export function stopTunnel(worldId, serviceName) {
		  const entry = registry.get(tunnelKey(worldId, serviceName));
		  if (!entry) return;

		  const child = entry.process;
		  if (child) {
		    try {
		      child.kill('SIGTERM');
		    } catch {
		      /* already dead */
		    }
		    entry.process = null;
		  }

		  Object.assign(entry, { status: STATUS.IDLE, url: null });
		  saveState();
		}

		/**
		 * Snapshot the tunnel state of every world, grouped by worldId.
		 *
		 * Per the original note, this feeds the host-stream broadcaster's
		 * `tunnels.snapshot` push, replacing the SPA's per-row polling.
		 *
		 * @returns {{ [worldId: string]: Array<{name: string, port: number, url: string|null, status: string}> }}
		 */
		export function getAllTunnels() {
		  /** @type {Record<string, Array<{name: string, port: number, url: string|null, status: string}>>} */
		  const grouped = {};
		  registry.forEach((entry) => {
		    const bucket = grouped[entry.worldId] || (grouped[entry.worldId] = []);
		    bucket.push({
		      name: entry.serviceName,
		      port: entry.port,
		      url: entry.url,
		      status: entry.status,
		    });
		  });
		  return grouped;
		}

		/**
		 * List the tunnel state of every service registered for one world.
		 *
		 * @param {string} worldId
		 * @returns {Array<{name: string, port: number, url: string|null, status: string}>}
		 *   Empty when the world has no registered tunnels.
		 */
		export function getWorldTunnels(worldId) {
		  return [...registry.values()]
		    .filter((entry) => entry.worldId === worldId)
		    .map((entry) => ({
		      name: entry.serviceName,
		      port: entry.port,
		      url: entry.url,
		      status: entry.status,
		    }));
		}

		/**
		 * Terminate and forget every tunnel of a world (used on world destroy).
		 *
		 * Tracked children receive SIGTERM, the entries are removed from the
		 * registry, and the state file is rewritten only if something was removed.
		 * Safe to call for a world without tunnels.
		 *
		 * @param {string} worldId
		 */
		export function killWorld(worldId) {
		  const doomed = [...registry].filter(([, entry]) => entry.worldId === worldId);

		  for (const [key, entry] of doomed) {
		    if (entry.process) {
		      try {
		        entry.process.kill('SIGTERM');
		      } catch {
		        /* already dead */
		      }
		      entry.process = null;
		    }
		    registry.delete(key);
		  }

		  if (doomed.length > 0) saveState();
		}

		/**
		 * Re-validate persisted tunnels after a restart.
		 *
		 * Every entry recorded as "running" with a URL is fetched (in parallel,
		 * with PROBE_TIMEOUT_MS each). If the request fails or the response is not
		 * `ok`, the entry is marked stale and the state is saved, so the UI can
		 * offer a Re-publish affordance.
		 *
		 * @returns {Promise<void>}
		 */
		export async function probeAllOnStartup() {
		  const candidates = [...registry]
		    .filter(([, entry]) => entry.status === STATUS.RUNNING && entry.url)
		    .map(([key, entry]) => ({ key, url: entry.url }));

		  const markStale = (key) => {
		    const entry = registry.get(key);
		    if (!entry) return;
		    entry.status = STATUS.STALE;
		    saveState();
		  };

		  const probeOne = async ({ key, url }) => {
		    let reachable;
		    try {
		      const res = await fetch(url, {
		        signal: AbortSignal.timeout(PROBE_TIMEOUT_MS),
		      });
		      reachable = res.ok;
		    } catch {
		      reachable = false;
		    }
		    if (!reachable) markStale(key);
		  };

		  await Promise.all(candidates.map(probeOne));
		}

		/**
		 * Send SIGTERM to every tracked cloudflared child and drop the handles.
		 * Statuses and the state file are left untouched.
		 */
		function killAll() {
		  registry.forEach((entry) => {
		    if (!entry.process) return;
		    try {
		      entry.process.kill('SIGTERM');
		    } catch {
		      /* already dead */
		    }
		    entry.process = null;
		  });
		}

		// Terminate tracked cloudflared children when this process is asked to
		// stop or is exiting.
		// NOTE(review): installing a 'SIGTERM' listener replaces Node's default
		// exit-on-SIGTERM behaviour; presumably server.mjs installs its own handler
		// that exits the process — confirm.
		process.on('SIGTERM', killAll);
		process.on('exit', killAll);

		// Initialise on module load using env-var or bare-node default path.
		// configure() re-runs loadState() when server.mjs provides a different path
		// (container mode: /data/world-tunnels.json vs the ~/.olam default above).
		loadState();

-119

host-cp/src/world-watchdog-pid-lookup.mjs

		/**
		* world-watchdog-pid-lookup.mjs — host-visible PID lookup for the world watchdog.
		*
		* Uses `docker top <containerId>` to enumerate processes inside a world's
		* container and returns the host-visible PID of the claude process.
		*
		* `docker top` output format (Linux Docker / Colima):
		* UID PID PPID C STIME TTY TIME CMD
		* root 1234 1 0 10:00 ? 00:00:00 node /usr/local/bin/claude ...
		*
		* The PID column (index 1 in default ps output) is already the host-visible
		* PID. On Mac/Colima the container runs inside a Linux VM so `docker top`
		* returns PIDs within the VM's PID namespace — these are NOT the macOS host
		* PIDs, but they ARE the PIDs visible from within the Linux layer (where
		* /proc reads happen). This is the same namespace the watchdog probes use
		* when reading /proc/<pid>/wchan etc., so the PIDs are correct for probe use.
		*
		* Inject `docker` for tests (avoids spawning real docker processes).
		*
		* @see docs/architecture/world-watchdog.md
		*/

		import { execFile } from 'node:child_process';
		import { promisify } from 'node:util';

		// Promise-returning form of execFile; defaultDockerTop() reads `stdout` from its result.
		const execFileAsync = promisify(execFile);

		/**
		 * Production `dockerTop` implementation: runs the real `docker top` CLI
		 * with a 5 s timeout. Rejects when the command fails.
		 *
		 * @param {string} containerId
		 * @returns {Promise<string>} stdout from `docker top <containerId>`
		 */
		async function defaultDockerTop(containerId) {
		  const cliArgs = ['top', containerId];
		  const result = await execFileAsync('docker', cliArgs, { timeout: 5_000 });
		  return result.stdout;
		}

		/**
		 * Parse the stdout of `docker top` and extract the PIDs of rows whose CMD
		 * column looks like a claude process.
		 *
		 * Expected columns (ps -ef layout):
		 *   UID PID PPID C STIME TTY TIME CMD
		 * Index 1 is the PID; everything from index 7 onward is the command line.
		 * The first non-empty line is treated as the header and skipped. Rows with
		 * fewer than 8 columns or a non-positive / non-numeric PID are ignored.
		 *
		 * A row matches when its command line contains `claude` as a bare command
		 * or as the last path segment of an executable/script (followed by
		 * whitespace or end of line), e.g. `claude --print` or
		 * `node /usr/local/bin/claude ...`. Names that merely start with "claude"
		 * (such as `claude-helper.js`) do not match.
		 *
		 * @param {string} stdout Raw output from `docker top <id>`
		 * @returns {number[]} PIDs of matching claude processes, sorted ascending.
		 */
		export function parseDockerTopOutput(stdout) {
		  // `claude` at the start of the command or right after a '/'.
		  const standaloneClaude = /(?:^|\/)claude(?:\s|$)/;
		  // `node <args…> <path ending in /claude or \claude>`.
		  const nodeRunningClaude = /node[^\s]*\s+.*[/\\]claude(?:\s|$)/;

		  const lines = stdout.split('\n').filter((l) => l.trim().length > 0);
		  if (lines.length < 2) return []; // header only or empty

		  const pids = [];
		  // Skip the header line (first line contains column names).
		  for (const line of lines.slice(1)) {
		    // Columns are whitespace-separated; CMD may itself contain spaces, so
		    // it is rebuilt from every part after the seventh column.
		    const parts = line.trim().split(/\s+/);
		    if (parts.length < 8) continue;

		    const pid = Number.parseInt(parts[1], 10);
		    if (!Number.isFinite(pid) || pid <= 0) continue;

		    const cmd = parts.slice(7).join(' ');
		    if (standaloneClaude.test(cmd) || nodeRunningClaude.test(cmd)) {
		      pids.push(pid);
		    }
		  }

		  return pids.sort((a, b) => a - b);
		}

		/**
		 * Look up the PID of the claude process running inside a container.
		 *
		 * When several processes match, the lowest PID is returned (heuristic: the
		 * supervising claude process was started before its workers).
		 *
		 * Fail-soft:
		 *  - empty containerId → null
		 *  - `dockerTop` rejects or throws → null, and the failure is logged
		 *  - no matching process → null (silent)
		 *
		 * @param {{
		 *   containerId: string,
		 *   dockerTop?: (containerId: string) => Promise<string>,
		 *   log?: (msg: string) => void,
		 * }} opts `dockerTop` and `log` are injectable for tests.
		 * @returns {Promise<number | null>}
		 */
		export async function findClaudePid({
		  containerId,
		  dockerTop = defaultDockerTop,
		  log = (m) => console.log(`[world-watchdog-pid-lookup] ${m}`),
		}) {
		  if (!containerId) return null;

		  let listing;
		  try {
		    listing = await dockerTop(containerId);
		  } catch (err) {
		    log(`docker top ${containerId} failed: ${err?.message ?? err}`);
		    return null;
		  }

		  // The parser returns PIDs in ascending order, so the first is the lowest.
		  const [lowestPid = null] = parseDockerTopOutput(listing);
		  return lowestPid;
		}

-271

host-cp/src/world-watchdog-probes.mjs

		/**
		* world-watchdog-probes.mjs — pure probe functions for the world watchdog.
		*
		* Three readers extract raw signals from the Linux /proc filesystem:
		* - readWchan(pid, opts) → string \| null
		* - readCloseWaitSockets(pid, opts) → Array<{remoteIp, remotePort}>
		* - readCpuPercent(pid, windowMs, opts) → number \| null
		*
		* One pure classifier turns those signals into a verdict:
		* - classify({ wchan, closeWaitCount, cpuPercent }) → 'healthy'\|'suspect'\|'wedged'
		*
		* All readers are fail-soft: any I/O error or parse error returns
		* null / [] / 0 rather than throwing. The classifier treats null inputs as
		* the signal not firing (conservative — only promotes to 'wedged' when all
		* three signals are conclusive).
		*
		* Test injection: pass `opts.procRoot` to redirect /proc reads to a fixture
		* directory (e.g. src/__tests__/fixtures/proc-gold-elk-5574/).
		*
		* CLOSE_WAIT threshold note (deviation from D2): Decision D2 specifies
		* filtering CLOSE_WAIT by peer hostname (.anthropic.com \| auth-worker.).
		* DNS resolution at every tick is unreliable under network stress (exactly
		* when the watchdog must be most accurate). The gold-elk-5574 forensic data
		* shows ≥3 CLOSE_WAIT to ANY peer is already diagnostic — a healthy claude
		* process has 0-1 CLOSE_WAIT sockets under normal operation. The classifier
		* therefore uses count ≥ 3 without hostname filtering. This deviation is
		* documented in docs/architecture/world-watchdog.md Signal 2.
		*
		* @see docs/architecture/world-watchdog.md
		* @see packages/host-cp/src/__tests__/world-watchdog-probes.test.mjs
		*/

		import fs from 'node:fs/promises';
		import path from 'node:path';

		// Clock ticks per second used to convert the utime/stime counters read
		// from /proc/<pid>/stat into CPU-seconds (ticks / LINUX_HZ), which
		// readCpuPercent() then compares against the measurement window.
		// NOTE(review): those counters are reported in userspace clock ticks
		// (sysconf(_SC_CLK_TCK)), which is normally 100 on Linux regardless of the
		// kernel's internal tick rate — confirm for any unusual target platform.
		const LINUX_HZ = 100;

		// Value of the `st` column in /proc/net/tcp that denotes CLOSE_WAIT.
		const CLOSE_WAIT_STATE = '08';

		/**
		 * Read the wchan (wait channel) of a process's main thread from
		 * `<procRoot>/<pid>/wchan`.
		 *
		 * @param {number|string} pid Process ID.
		 * @param {{ procRoot?: string }} [opts]
		 *   `procRoot` defaults to '/proc'; override for tests.
		 * @returns {Promise<string|null>}
		 *   The trimmed wchan string (e.g. 'futex_wait_queue', 'epoll_wait'), or
		 *   null when the file is empty or cannot be read.
		 */
		export async function readWchan(pid, opts = {}) {
		  // `opts?.` keeps the reader fail-soft even if a caller passes null.
		  const procRoot = opts?.procRoot ?? '/proc';
		  const wchanPath = path.join(procRoot, String(pid), 'wchan');
		  try {
		    const content = await fs.readFile(wchanPath, 'utf8');
		    return content.trim() || null;
		  } catch {
		    return null;
		  }
		}

		/**
		 * Collect CLOSE_WAIT sockets listed in `<procRoot>/<pid>/net/tcp` and
		 * `.../net/tcp6`.
		 *
		 * Each table is whitespace-separated with a header row; column 2 is the
		 * remote `hexIP:hexPort` and column 3 the hex state. Rows whose state
		 * equals CLOSE_WAIT_STATE are returned regardless of the remote peer.
		 * A table that cannot be read is skipped.
		 *
		 * NOTE(review): these tables may describe the whole network namespace
		 * rather than only sockets owned by `pid` — confirm against proc(5).
		 *
		 * @param {number|string} pid Process ID.
		 * @param {{ procRoot?: string }} [opts]
		 * @returns {Promise<Array<{remoteIp: string, remotePort: number}>>}
		 *   CLOSE_WAIT socket descriptors; empty on error or when none match.
		 */
		export async function readCloseWaitSockets(pid, opts = {}) {
		  const procRoot = opts.procRoot ?? '/proc';
		  const sockets = [];

		  for (const proto of ['tcp', 'tcp6']) {
		    const tablePath = path.join(procRoot, String(pid), 'net', proto);
		    let table;
		    try {
		      table = await fs.readFile(tablePath, 'utf8');
		    } catch {
		      // pid gone or protocol table unavailable — not an error.
		      continue;
		    }

		    // Drop the header row, then examine each entry.
		    for (const rawRow of table.split('\n').slice(1)) {
		      const row = rawRow.trim();
		      if (row === '') continue;

		      // Columns (0-based): 0 sl, 1 local_address, 2 rem_address, 3 st.
		      const columns = row.split(/\s+/);
		      if (columns.length < 4 || columns[3] !== CLOSE_WAIT_STATE) continue;

		      const remote = columns[2];
		      const sep = remote.lastIndexOf(':');
		      if (sep === -1) continue;

		      const remoteIp = parseHexIp(remote.slice(0, sep));
		      const remotePort = parseInt(remote.slice(sep + 1), 16);
		      if (remoteIp === null || !Number.isFinite(remotePort)) continue;

		      sockets.push({ remoteIp, remotePort });
		    }
		  }

		  return sockets;
		}

		/**
		 * Measure CPU utilisation for a process over a time window.
		 *
		 * Reads `<procRoot>/<pid>/stat` twice, `windowMs` apart, and computes:
		 *   cpuPercent = (utime+stime delta) / LINUX_HZ / windowSeconds * 100
		 *
		 * The window length is the nominal `windowMs`, unless an `opts.now` clock
		 * is supplied: then the time actually elapsed between the two reads is
		 * used, so timer drift does not inflate the result. If that clock did not
		 * advance, the nominal `windowMs` is used instead.
		 *
		 * @param {number|string} pid Process ID.
		 * @param {number} windowMs Measurement window in milliseconds.
		 * @param {{ procRoot?: string, sleep?: (ms: number) => Promise<void>, now?: () => number }} [opts]
		 *   `sleep` — injectable delay function (default: real setTimeout).
		 *   `now` — injectable millisecond clock used to measure the real window.
		 *   `procRoot` — injectable proc root for tests.
		 * @returns {Promise<number|null>}
		 *   CPU percent (0–100+), or null when a read/parse fails, the counters
		 *   went backwards, or the window is not positive.
		 */
		export async function readCpuPercent(pid, windowMs, opts = {}) {
		  const procRoot = opts.procRoot ?? '/proc';
		  const sleep = opts.sleep ?? ((ms) => new Promise((r) => setTimeout(r, ms)));
		  const clock = typeof opts.now === 'function' ? opts.now : null;
		  const statPath = path.join(procRoot, String(pid), 'stat');

		  const before = await readStatTimes(statPath);
		  if (before === null) return null;
		  const startedAt = clock ? clock() : null;

		  await sleep(windowMs);

		  const after = await readStatTimes(statPath);
		  if (after === null) return null;
		  const finishedAt = clock ? clock() : null;

		  const deltaTicks = (after.utime + after.stime) - (before.utime + before.stime);
		  if (deltaTicks < 0) return null;

		  // Prefer the measured window when a clock was injected and it advanced.
		  let effectiveWindowMs = windowMs;
		  if (clock) {
		    const measuredMs = finishedAt - startedAt;
		    if (Number.isFinite(measuredMs) && measuredMs > 0) effectiveWindowMs = measuredMs;
		  }

		  // deltaTicks / LINUX_HZ = CPU-seconds consumed during the window.
		  const windowSec = effectiveWindowMs / 1000;
		  if (windowSec <= 0) return null;

		  return (deltaTicks / LINUX_HZ / windowSec) * 100;
		}

		// ── Internal helpers ──────────────────────────────────────────────────────────

		/**
		 * Read the utime and stime counters from a /proc/<pid>/stat file.
		 *
		 * The stat line looks like `pid (comm) state ppid pgrp ... utime stime ...`.
		 * The command name can itself contain spaces and parentheses, so parsing
		 * starts after the LAST ')' in the file.
		 *
		 * @param {string} statPath Path of the stat file to read.
		 * @returns {Promise<{utime: number, stime: number}|null>}
		 *   The two counters, or null when the file cannot be read, has no ')',
		 *   has too few fields, or the counters are not numeric.
		 */
		async function readStatTimes(statPath) {
		  let content;
		  try {
		    content = await fs.readFile(statPath, 'utf8');
		  } catch {
		    return null;
		  }

		  const parenClose = content.lastIndexOf(')');
		  if (parenClose === -1) return null;

		  // Fields after ')' (0-indexed):
		  //   0: state, 1: ppid, 2: pgrp, 3: session, 4: tty_nr, 5: tpgid,
		  //   6: flags, 7: minflt, 8: cminflt, 9: majflt, 10: cmajflt,
		  //   11: utime, 12: stime
		  const fields = content.slice(parenClose + 1).trim().split(/\s+/);
		  if (fields.length < 13) return null;

		  const utime = Number.parseInt(fields[11], 10);
		  const stime = Number.parseInt(fields[12], 10);

		  if (!Number.isFinite(utime) || !Number.isFinite(stime)) return null;
		  return { utime, stime };
		}

		/**
		 * Decode a hex-encoded IP address as found in /proc/net/tcp.
		 *
		 * IPv4: 8 hex chars, bytes stored in reverse order
		 *       (e.g. "0100007F" → "127.0.0.1").
		 * IPv6: 32 hex chars, handled as four 8-char words whose bytes are each
		 *       reversed, then emitted as eight colon-separated 4-digit groups
		 *       (no zero compression).
		 *
		 * @param {string} hexIp
		 * @returns {string|null} The decoded address, or null for any other
		 *   length or an IPv4 byte that does not parse as hex.
		 */
		function parseHexIp(hexIp) {
		  // Split one 8-char word into its four byte pairs, last pair first.
		  const reversedPairs = (word) => [6, 4, 2, 0].map((at) => word.slice(at, at + 2));

		  switch (hexIp.length) {
		    case 8: {
		      const octets = reversedPairs(hexIp).map((pair) => parseInt(pair, 16));
		      return octets.every((octet) => Number.isFinite(octet)) ? octets.join('.') : null;
		    }
		    case 32: {
		      const hextets = [];
		      for (let offset = 0; offset < 32; offset += 8) {
		        const [b0, b1, b2, b3] = reversedPairs(hexIp.slice(offset, offset + 8));
		        // Two bytes form one 16-bit group of the textual address.
		        hextets.push(b0 + b1, b2 + b3);
		      }
		      return hextets.join(':');
		    }
		    default:
		      return null;
		  }
		}

		// ── Classifier ───────────────────────────────────────────────────────────────

		/**
		* @typedef {'healthy'\|'suspect'\|'wedged'} WatchdogVerdict
		*/

		/**
		 * Turn the three probe signals into a watchdog verdict.
		 *
		 * A signal "fires" when:
		 *  - wchan is exactly 'futex_wait_queue'
		 *  - closeWaitCount is a number ≥ 3
		 *  - cpuPercent is a number < 1
		 * All three firing → 'wedged'; one or two → 'suspect'; none → 'healthy'.
		 * Null (or otherwise non-numeric) inputs never fire.
		 *
		 * @param {{ wchan: string|null, closeWaitCount: number|null, cpuPercent: number|null }} signals
		 * @returns {'healthy'|'suspect'|'wedged'}
		 */
		export function classify({ wchan, closeWaitCount, cpuPercent }) {
		  const fired = [
		    wchan === 'futex_wait_queue',
		    typeof closeWaitCount === 'number' && closeWaitCount >= 3,
		    typeof cpuPercent === 'number' && cpuPercent < 1,
		  ].filter(Boolean).length;

		  switch (fired) {
		    case 3:
		      return 'wedged';
		    case 0:
		      return 'healthy';
		    default:
		      return 'suspect';
		  }
		}

-192

host-cp/src/world-watchdog-recovery.mjs

		/**
		* world-watchdog-recovery.mjs — recovery hook for wedged claude processes.
		*
		* Isolated from world-watchdog.mjs so kill + replay logic is independently
		* mockable in tests without touching the watchdog's ticker.
		*
		* API:
		* createRecovery({ autoRecoverMode, leakyBucket, broadcaster, persister,
		* replay, processKill, log })
		* → { onWedgedVerdict({ worldId, pid }): Promise<void> }
		*
		* Three modes (from compute.autoRecover in .olam/config.yaml):
		* false — no-op; recovery never fires even on wedged verdict (DEFAULT)
		* 'dry-run' — emits all breadcrumbs, never calls processKill or replay
		* true — SIGKILL pid + read last-dispatch + replay; rate-limited
		*
		* Rate-limit: B2 leaky-bucket (3/hour/world). 4th wedge in window emits
		* world.watchdog.recovery.budget_exhausted and skips all action.
		*
		* Replay stub: the `replay` dep is accepted as an injected function. In
		* server.mjs it is wired to a console.warn stub + breadcrumb until the
		* operator runs the B3 idempotence probe and signs off. See TODO below.
		*
		* @see docs/architecture/world-watchdog.md Recovery section
		* @see packages/host-cp/src/lib/leaky-bucket.mjs
		* @see packages/host-cp/src/dispatch-persister.mjs
		*/

		/**
		* @typedef {false|true|'dry-run'} AutoRecoverMode
		*/

		/**
		* @typedef {object} RecoveryDeps
		* @property {false\|true\|'dry-run'} autoRecoverMode
		* Passed from server.mjs which reads config.compute.autoRecover.
		* Default false if config unavailable.
		* @property {{ tryConsume(key: string): { allowed: boolean, retryAfterMs?: number, totalInWindow: number } }} leakyBucket
		* B2 leaky-bucket instance. Keyed by worldId.
		* @property {{ broadcast(type: string, payload: object): void }} [broadcaster]
		* Host-stream broadcaster. Optional — when absent, breadcrumbs are skipped.
		* @property {{ read({ worldId: string }): Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } \| null> }} persister
		* B4 dispatch-persister read function.
		* @property {(opts: { worldId: string, messageId: string, prompt: string }) => Promise<void>} replay
		* Opaque dispatch helper. Injected dep — DO NOT implement dispatch here.
		* In server.mjs this is wired to a stub until operator signs off on B3 probe.
		* @property {(pid: number) => void} [processKill]
		* process.kill indirection so tests can spy without actually killing.
		* Defaults to process.kill.
		* @property {(msg: string) => void} [log]
		* Logger. Defaults to console.log with [world-watchdog-recovery] prefix.
		*/

		/**
		* @typedef {object} RecoveryHandle
		* @property {(opts: { worldId: string, pid: number\|null }) => Promise<void>} onWedgedVerdict
		*/

		/**
		 * Create a recovery handle.
		 *
		 * The returned `onWedgedVerdict` acts only when `autoRecoverMode` is exactly
		 * `true` (kill + replay) or exactly `'dry-run'` (breadcrumbs only). Every
		 * other value — `false`, `null`, or a mis-typed string such as `'false'` or
		 * `'true'` — is treated as "recovery disabled", so a config parsing slip can
		 * never escalate into a SIGKILL.
		 *
		 * @param {RecoveryDeps} deps
		 * @returns {RecoveryHandle}
		 */
		export function createRecovery({
		  autoRecoverMode = false,
		  leakyBucket,
		  broadcaster = null,
		  persister,
		  replay,
		  processKill = (pid) => process.kill(pid, 'SIGKILL'),
		  log = (m) => console.log(`[world-watchdog-recovery] ${m}`),
		} = {}) {
		  const armed = autoRecoverMode === true;
		  const dryRun = autoRecoverMode === 'dry-run';

		  // Surface a misconfiguration once, at wiring time, instead of silently
		  // ignoring every wedged verdict later.
		  if (!armed && !dryRun && autoRecoverMode !== false) {
		    log(`unrecognised autoRecoverMode (${typeof autoRecoverMode}) ${String(autoRecoverMode)}; recovery disabled`);
		  }

		  /**
		   * Emit a breadcrumb via broadcaster (fail-soft): a missing broadcaster is a
		   * no-op and a throwing broadcaster is logged, never propagated.
		   *
		   * @param {string} type
		   * @param {object} payload
		   */
		  function broadcast(type, payload) {
		    if (!broadcaster || typeof broadcaster.broadcast !== 'function') return;
		    try {
		      broadcaster.broadcast(type, payload);
		    } catch (err) {
		      log(`broadcast ${type} failed: ${err?.message ?? err}`);
		    }
		  }

		  /**
		   * Handle a 2-tick-confirmed wedged verdict for a world.
		   *
		   * Called by world-watchdog.mjs on verdict-transition only (suspect → wedged),
		   * NOT on steady-state re-wedge.
		   *
		   * @param {{ worldId: string, pid: number|null }} opts
		   * @returns {Promise<void>}
		   */
		  async function onWedgedVerdict({ worldId, pid }) {
		    // Anything other than true / 'dry-run' → detection-only; never act.
		    if (!armed && !dryRun) return;

		    // PID null/undefined → watchdog hasn't resolved a real PID yet (Phase A
		    // stub case); skip silently — there is nothing to kill.
		    if (pid == null) return;

		    // Only a positive integer names a single process. process.kill() gives
		    // 0 and negative values process-group meaning, so refuse them outright.
		    if (!Number.isInteger(pid) || pid <= 0) {
		      log(`worldId=${worldId}: refusing to act on invalid pid=${String(pid)}`);
		      return;
		    }

		    // Rate-limit gate.
		    const bucket = leakyBucket.tryConsume(worldId);
		    if (!bucket.allowed) {
		      broadcast('world.watchdog.recovery.budget_exhausted', {
		        worldId,
		        retryAfterMs: bucket.retryAfterMs,
		        totalInWindow: bucket.totalInWindow,
		      });
		      log(`worldId=${worldId}: budget exhausted (${bucket.totalInWindow} in window); skipping recovery`);
		      return;
		    }

		    // Read last persisted dispatch for replay. A read failure is logged and
		    // treated the same as "nothing to replay".
		    let lastDispatch = null;
		    try {
		      lastDispatch = await persister.read({ worldId });
		    } catch (err) {
		      log(`worldId=${worldId}: persister.read failed: ${err?.message ?? err}`);
		    }

		    broadcast('world.watchdog.recovery.start', {
		      worldId,
		      pid,
		      mode: autoRecoverMode,
		      lastDispatchMessageId: lastDispatch?.messageId ?? null,
		    });

		    // dry-run — log planned action but do NOT kill.
		    if (dryRun) {
		      log(`worldId=${worldId}: dry-run — would SIGKILL pid=${pid}${lastDispatch ? ` + replay messageId=${lastDispatch.messageId}` : ' (no last-dispatch)'}`);
		      broadcast('world.watchdog.recovery.complete', {
		        worldId,
		        pid,
		        mode: 'dry-run',
		        replayed: false,
		      });
		      return;
		    }

		    // mode=true — act.
		    try {
		      // 1. SIGKILL the wedged process.
		      processKill(pid);
		      log(`worldId=${worldId}: SIGKILL sent to pid=${pid}`);

		      // 2. Replay or note absence of last-dispatch.
		      if (!lastDispatch) {
		        broadcast('world.watchdog.recovery.restart_without_replay', {
		          worldId,
		          pid,
		        });
		        log(`worldId=${worldId}: no last-dispatch; killed without replay`);
		      } else {
		        // TODO: wire real replay once operator has run the B3 idempotence probe
		        // and confirmed dispatch is idempotent for the substrates in use.
		        // Until then this stub logs and emits a breadcrumb so the stub path
		        // is visible in production logs. See B3 probe + operator review gate B6.
		        broadcast('world.watchdog.recovery.replay_stub', {
		          worldId,
		          prompt: lastDispatch.prompt,
		        });
		        log(`worldId=${worldId}: replay stub hit — real replay deferred pending B3 sign-off`);
		        await replay({
		          worldId,
		          messageId: lastDispatch.messageId,
		          prompt: lastDispatch.prompt,
		        });
		      }

		      broadcast('world.watchdog.recovery.complete', {
		        worldId,
		        pid,
		        mode: true,
		        replayed: !!lastDispatch,
		      });
		    } catch (err) {
		      // Kill or replay threw: report it, but never reject — the watchdog
		      // calls this fire-and-forget.
		      log(`worldId=${worldId}: recovery failed: ${err?.message ?? err}`);
		      broadcast('world.watchdog.recovery.failed', {
		        worldId,
		        pid,
		        error: err?.message ?? String(err),
		      });
		    }
		  }

		  return { onWedgedVerdict };
		}

-313

host-cp/src/world-watchdog.mjs

		/**
		* world-watchdog.mjs — periodic watchdog that probes each active world's
		* `claude` PID for the three wedge signals (wchan + CLOSE_WAIT + CPU) and
		* emits `world.watchdog.tick` events on the host-stream broadcaster.
		*
		* Design:
		* - Mirrors the `world-activity-tracker.mjs` shape: `startWorldWatchdog(deps)`
		* returns `{ stop, tickNow }`, plus `getVerdict` for reading the latest per-world state.
		* - Per-world 2-tick confirm: a `'wedged'` classification is only emitted
		* after TWO consecutive ticks with the wedge signature. A single-tick
		* wedge emits `'suspect'`. A healthy tick resets the streak.
		* - Per-world fail-soft: a probe error for one world never skips other worlds.
		* - `OLAM_WORLD_WATCHDOG_DISABLED=1` → `start()` is a no-op (returns stub).
		* - Cadence: `OLAM_WORLD_WATCHDOG_TICK_MS` env or `intervalMs` dep (default 30_000).
		*
		* v1 stub: `getClaudePidForWorld(worldId)` returns null for all worlds in
		* Phase A. When null, the tick still fires but all probe signals are null,
		* producing `verdict: 'unknown'`. Real PID lookup (docker inspect →
		* /proc/<hostPid>/status NSpid field) is wired in a follow-up.
		* This is documented here and in docs/architecture/world-watchdog.md.
		*
		* Wire-in: `server.mjs` constructs once after broadcaster is ready and calls
		* `.stop()` from the SIGTERM/SIGINT handler. Gated on `!SERVE_ONLY`.
		*
		* @see docs/architecture/world-watchdog.md
		* @see packages/host-cp/src/world-watchdog-probes.mjs
		* @see packages/host-cp/src/world-activity-tracker.mjs (shape reference)
		*/

		import {
		readWchan,
		readCloseWaitSockets,
		readCpuPercent,
		classify,
		} from './world-watchdog-probes.mjs';
		// Recovery hook (B5). Optional dep — when absent (recovery is null/undefined),
		// the watchdog behaves exactly as Phase A: detection-only, no kill, no replay.
		// Wire via startWorldWatchdog({ recovery: createRecovery({...}) }) in server.mjs.

		// Fallback tick cadence (ms), used when neither the `intervalMs` dep nor the
		// OLAM_WORLD_WATCHDOG_TICK_MS env var supplies one.
		const DEFAULT_TICK_MS = 30 * 1000;
		// Sampling window (ms) handed to the CPU probe; kept well below the tick
		// cadence so consecutive measurements never overlap.
		const CPU_WINDOW_MS = 500;

		/**
		* @typedef {object} WorldWatchdogDeps
		* @property {object} [broadcaster] Object with `.broadcast(type, payload)`.
		* Optional — when absent events are skipped but state tracking still works.
		* @property {number} [intervalMs] Tick cadence in ms. Defaults to
		* `OLAM_WORLD_WATCHDOG_TICK_MS` env or 30_000.
		* @property {() => Promise<string[]>} [listActiveWorlds]
		* Returns an array of active world IDs to probe each tick.
		* Defaults to returning [].
		* @property {(worldId: string) => Promise<number\|null>} [getClaudePidForWorld]
		* Returns the host-side PID of the claude process for a world, or null.
		* v1 default: always returns null (all worlds → verdict 'unknown').
		* @property {{ procRoot?: string }} [probes]
		* Injectable probe options (procRoot for tests).
		* @property {{ onWedgedVerdict(opts: { worldId: string, pid: number\|null }): Promise<void> }} [recovery]
		* Optional recovery handle (from world-watchdog-recovery.mjs). When present,
		* called once on verdict-transition to 'wedged' (suspect → wedged), NOT on
		* steady-state re-wedge. When absent, detection-only (Phase A behaviour).
		* @property {(msg: string) => void} [log] Defaults to `console.log`.
		* @property {(msg: string) => void} [debug] Defaults to no-op.
		* @property {(cb: () => void, ms: number) => any} [setTimer]
		* Injectable `setInterval` for tests.
		* @property {(handle: any) => void} [clearTimer]
		* Injectable `clearInterval` for tests.
		* @property {() => Date} [now] Clock injection for tests.
		*/

		/**
		* @typedef {object} WorldWatchdogHandle
		* @property {() => void} stop
		* @property {() => Promise<number>} tickNow Run one tick immediately (returns
		* the count of worlds processed). Exposed for tests.
		* @property {(worldId: string) => object\|null} getVerdict
		* Returns the latest in-memory verdict entry for a world, or null if no tick
		* has fired yet. Used by the HTTP endpoint (A5).
		*/

		/**
		* Per-world state tracked between ticks for the 2-tick confirm.
		*
		* @typedef {object} WorldWatchdogState
		* @property {'healthy'\|'suspect'\|'wedged'\|'unknown'} lastClassification
		* The raw classification from the previous tick (before 2-tick confirm).
		* @property {'healthy'\|'suspect'\|'wedged'\|'unknown'} lastVerdict
		* The emitted verdict (post-confirm).
		* @property {string} lastTickAt ISO-8601 timestamp of last tick.
		* @property {object\|null} lastSignals The signals from the last tick.
		* @property {number\|null} lastPid The PID probed last tick.
		*/

		/**
		 * Start the world watchdog. Returns a `{ stop, tickNow, getVerdict }` handle.
		 *
		 * Honoring `OLAM_WORLD_WATCHDOG_DISABLED=1`: if the env var is set, returns
		 * a no-op stub immediately without starting the interval or making any probe
		 * calls.
		 *
		 * Tick cadence: `deps.intervalMs` when provided, else the
		 * `OLAM_WORLD_WATCHDOG_TICK_MS` env var when it parses to a positive integer,
		 * else `DEFAULT_TICK_MS`. An unparseable or non-positive env value is logged
		 * and ignored — passing NaN/0 to `setInterval` would be coerced to a 1 ms loop.
		 *
		 * @param {WorldWatchdogDeps} [deps]
		 * @returns {WorldWatchdogHandle}
		 */
		export function startWorldWatchdog(deps = {}) {
		  // Honour kill switch — return a no-op stub.
		  if (process.env.OLAM_WORLD_WATCHDOG_DISABLED === '1') {
		    return {
		      stop() {},
		      tickNow: async () => 0,
		      getVerdict: () => null,
		    };
		  }

		  const log = deps.log ?? ((m) => console.log(`[world-watchdog] ${m}`));
		  const debug = deps.debug ?? (() => {});
		  const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
		  const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
		  const now = deps.now ?? (() => new Date());

		  /**
		   * Resolve the tick cadence. An explicit dep is trusted as-is; the env var
		   * is validated because it is free-form operator input.
		   *
		   * @returns {number}
		   */
		  function resolveIntervalMs() {
		    if (deps.intervalMs != null) return deps.intervalMs;
		    const raw = process.env.OLAM_WORLD_WATCHDOG_TICK_MS;
		    if (raw === undefined) return DEFAULT_TICK_MS;
		    const parsed = Number.parseInt(raw, 10);
		    if (Number.isFinite(parsed) && parsed > 0) return parsed;
		    log(`ignoring invalid OLAM_WORLD_WATCHDOG_TICK_MS=${JSON.stringify(raw)}; using ${DEFAULT_TICK_MS}ms`);
		    return DEFAULT_TICK_MS;
		  }

		  const intervalMs = resolveIntervalMs();

		  const broadcaster = deps.broadcaster ?? null;
		  const listActiveWorlds = deps.listActiveWorlds ?? (async () => []);
		  const getClaudePidForWorld = deps.getClaudePidForWorld ?? (async (_id) => null);
		  const probeOpts = deps.probes ?? {};
		  // Recovery hook — null when not configured (Phase A / default-off behaviour).
		  const recovery = deps.recovery ?? null;

		  // Per-world state map: worldId → WorldWatchdogState.
		  /** @type {Map<string, WorldWatchdogState>} */
		  const worldState = new Map();

		  let stopped = false;
		  let inFlight = false;
		  let intervalHandle = null;

		  /**
		   * Probe a single world and update its state. Returns the verdict emitted.
		   *
		   * @param {string} worldId
		   * @returns {Promise<'healthy'|'suspect'|'wedged'|'unknown'>}
		   */
		  async function probeWorld(worldId) {
		    // Normalise undefined → null so "no PID" has a single representation.
		    const pid = (await getClaudePidForWorld(worldId)) ?? null;
		    const hasPid = pid !== null;

		    let wchan = null;
		    let closeWaitSockets = [];
		    let cpuPercent = null;

		    if (hasPid) {
		      // Probes run concurrently; each is expected to be fail-soft.
		      [wchan, closeWaitSockets, cpuPercent] = await Promise.all([
		        readWchan(pid, probeOpts),
		        readCloseWaitSockets(pid, probeOpts),
		        readCpuPercent(pid, CPU_WINDOW_MS, probeOpts),
		      ]);
		    }

		    // A probe that reports failure with a non-array (e.g. null) becomes a
		    // null count — "signal unavailable" — rather than a TypeError that
		    // would drop this world from the tick.
		    const closeWaitCount = Array.isArray(closeWaitSockets) ? closeWaitSockets.length : null;
		    const signals = hasPid ? { wchan, closeWaitCount, cpuPercent } : null;

		    // Classify raw signals; without a PID there is nothing to classify.
		    const rawClassification = hasPid
		      ? classify({ wchan, closeWaitCount, cpuPercent })
		      : 'unknown';

		    // 2-tick confirm: only emit 'wedged' if BOTH this tick AND the previous tick
		    // classified as 'wedged'. A first 'wedged' tick is reported as 'suspect'.
		    const prev = worldState.get(worldId);
		    let verdict = rawClassification;
		    if (rawClassification === 'wedged' && prev?.lastClassification !== 'wedged') {
		      verdict = 'suspect';
		    }

		    const tickAt = now().toISOString();

		    // Update per-world state.
		    worldState.set(worldId, {
		      lastClassification: rawClassification,
		      lastVerdict: verdict,
		      lastTickAt: tickAt,
		      lastSignals: signals,
		      lastPid: pid,
		    });

		    // Recovery hook — fire ONCE on verdict-transition to 'wedged' (not on
		    // steady-state re-wedge). Guard: prev?.lastVerdict !== 'wedged' ensures
		    // only the suspect→wedged transition triggers, not wedged→wedged.
		    if (verdict === 'wedged' && recovery !== null && prev?.lastVerdict !== 'wedged') {
		      const reportRecoveryFailure = (err) => {
		        log(`recovery.onWedgedVerdict ${worldId} failed: ${err?.message ?? err}`);
		      };
		      // Fire-and-forget. Promise.resolve() tolerates a hook that returns a
		      // non-promise; the try/catch tolerates one that throws synchronously.
		      // Either way the tick broadcast below still happens.
		      try {
		        void Promise.resolve(recovery.onWedgedVerdict({ worldId, pid })).catch(reportRecoveryFailure);
		      } catch (err) {
		        reportRecoveryFailure(err);
		      }
		    }

		    // Emit broadcaster event.
		    if (broadcaster && typeof broadcaster.broadcast === 'function') {
		      try {
		        broadcaster.broadcast('world.watchdog.tick', {
		          worldId,
		          verdict,
		          signals,
		          pid,
		          lastTickAt: tickAt,
		        });
		      } catch (err) {
		        log(`broadcast ${worldId} failed: ${err?.message ?? err}`);
		      }
		    }

		    return verdict;
		  }

		  /**
		   * One tick: get active worlds, probe each, return count processed.
		   * Returns 0 without probing when stopped or when a previous tick is still
		   * in flight.
		   *
		   * @returns {Promise<number>}
		   */
		  async function tick() {
		    if (stopped) return 0;
		    if (inFlight) {
		      debug('tick skipped: previous tick still in flight');
		      return 0;
		    }
		    inFlight = true;

		    let processed = 0;
		    try {
		      let worlds;
		      try {
		        worlds = await listActiveWorlds();
		      } catch (err) {
		        log(`listActiveWorlds failed: ${err?.message ?? err}`);
		        return 0;
		      }

		      for (const worldId of worlds) {
		        if (stopped) break;
		        if (typeof worldId !== 'string') continue;

		        try {
		          await probeWorld(worldId);
		          processed += 1;
		        } catch (err) {
		          // Per-world fail-soft: one bad world doesn't crash the loop.
		          debug(`probe ${worldId} failed: ${err?.message ?? err}`);
		        }
		      }
		    } finally {
		      inFlight = false;
		    }

		    return processed;
		  }

		  // Kick off an initial tick on next event-loop turn so callers can
		  // attach test spies before any probe work happens.
		  setImmediate(() => {
		    if (stopped) return;
		    void tick().catch((err) => {
		      log(`initial tick crashed: ${err?.message ?? err}`);
		    });
		  });

		  intervalHandle = setTimer(() => {
		    void tick().catch((err) => {
		      log(`tick crashed: ${err?.message ?? err}`);
		    });
		  }, intervalMs);
		  // Don't pin the event loop on shutdown.
		  if (intervalHandle && typeof intervalHandle.unref === 'function') {
		    intervalHandle.unref();
		  }

		  log(`started: interval=${intervalMs}ms`);

		  return {
		    /** Stop ticking. Idempotent; a tick already in flight stops at its next world. */
		    stop() {
		      if (stopped) return;
		      stopped = true;
		      if (intervalHandle !== null) {
		        try { clearTimer(intervalHandle); } catch { /* ignore */ }
		        intervalHandle = null;
		      }
		    },

		    tickNow: tick,

		    /**
		     * Return the latest in-memory verdict entry for a world.
		     * Returns null if no tick has fired for this world yet.
		     *
		     * @param {string} worldId
		     * @returns {WorldWatchdogState|null}
		     */
		    getVerdict(worldId) {
		      return worldState.get(worldId) ?? null;
		    },
		  };
		}

-191

host-cp/src/worlds-db-source.mjs

		/**
		* WorldsDbSource — reconcile loop that reads ~/.olam/worlds.db and
		* auto-registers running worlds into host-cp's in-memory registry.
		*
		* Two triggers (belt-and-suspenders):
		* 1. fs.watch on the worlds.db file — fires within ~100ms of a write
		* 2. 30s setInterval backstop — catches cases where fs.watch silently
		* misses events (network filesystems, some Linux kernels)
		*
		* Uses better-sqlite3 for synchronous, lightweight reads. If the module
		* is not installed (e.g., no native build in the container), the module
		* logs a warning and exits without crashing the server.
		*
		* DB handle: deliberately NOT cached across reconcile calls. A long-lived
		* readonly connection with the DB bind-mounted across the docker boundary
		* does not reliably pick up writes committed on the host side — the host
		* writer appends to the WAL, but the container reader's snapshot is stuck
		* at the point the handle was first opened. Closing and reopening on every
		* reconcile forces a new read transaction that sees all committed WAL
		* frames. Cost: ~1 ms per call at a 30 s interval — negligible. This
		* eliminates the entire class of "olam create world vanishes within 30 s"
		* bugs (regression confirmed: ember-elk-9191 removed by reconciler despite
		* being present in worlds.db with status=running).
		*
		* Interface: thin wrapper so a future "remote" source (cloud orchestrator)
		* can drop in via the same WorldsSource interface in worlds-source.mjs.
		*/

		import fs from 'node:fs';
		import { createRequire } from 'node:module';

		// ES modules have no built-in `require`; build one anchored at this module's
		// URL so better-sqlite3 can be loaded synchronously (and optionally) below.
		const require = createRequire(import.meta.url);

		/**
		* @typedef {object} WorldsDbSourceDeps
		* @property {string} dbPath Path to worlds.db (OLAM_WORLDS_DB or ~/.olam/worlds.db)
		* @property {string} dockerHost Docker API base URL (tcp://host:port)
		* @property {string} worldHost Host used to reach world CPs (127.0.0.1 or host.docker.internal)
		* @property {() => Record<string, number>} getRegistry Current WORLDS map
		* @property {(id: string, port: number) => void} onWorldAdded Called when a new running world is found
		* @property {(id: string) => void} onWorldRemoved Called when a running world disappears
		* @property {(msg: string) => void} [log]
		*/

		/**
		 * Derive the per-world CP host port from docker inspect.
		 *
		 * Queries `GET {dockerHost}/containers/olam-{worldId}-devbox/json` (with the
		 * `tcp://` scheme rewritten to `http://`, 3 s timeout) and returns the first
		 * usable published host port for container port `8080/tcp`.
		 *
		 * Fail-soft: any network error, timeout, non-2xx response, malformed body,
		 * or absence of a usable binding yields `null`.
		 *
		 * @param {string} worldId
		 * @param {string} dockerHost e.g. 'tcp://docker-socket-proxy:2375'
		 * @returns {Promise<number | null>}
		 */
		async function getWorldPortFromDocker(worldId, dockerHost) {
		  const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
		  const containerName = `olam-${worldId}-devbox`;
		  try {
		    const res = await fetch(`${apiBase}/containers/${encodeURIComponent(containerName)}/json`, {
		      signal: AbortSignal.timeout(3000),
		    });
		    if (!res.ok) return null;
		    const data = await res.json();
		    // Per-world CP runs on internal port 8080; host port is the published binding.
		    const bindings = data?.NetworkSettings?.Ports?.['8080/tcp'];
		    if (!Array.isArray(bindings) || bindings.length === 0) return null;
		    // A port may be published more than once; take the first entry that
		    // carries a real TCP port (1–65535) instead of trusting index 0.
		    for (const entry of bindings) {
		      const hostPort = Number.parseInt(entry?.HostPort, 10);
		      if (Number.isInteger(hostPort) && hostPort > 0 && hostPort <= 65535) {
		        return hostPort;
		      }
		    }
		    return null;
		  } catch {
		    // Deliberate best-effort: the caller treats null as "no port found".
		    return null;
		  }
		}

		/**
		* Start the worlds-db reconcile loop. Returns a stop function.
		*
		* @param {WorldsDbSourceDeps} deps
		* @returns {{ stop: () => void }}
		*/
		export function startWorldsDbReconciler(deps) {
		const { dbPath, dockerHost, getRegistry, onWorldAdded, onWorldRemoved, log = console.log } = deps;

		let db = null;
		let stopped = false;
		let watcher = null;

		function tryOpenDb() {
		if (db) return db;
		try {
		// Dynamic require — gracefully degrade if better-sqlite3 is not installed.
		// better-sqlite3 is CommonJS-only; createRequire enables sync dynamic loading in ESM.
		const Database = require('better-sqlite3');
		db = new Database(dbPath, { readonly: true, fileMustExist: true });
		log(`[worlds-db] opened ${dbPath}`);
		return db;
		} catch (err) {
		if (err.code === 'MODULE_NOT_FOUND') {
		log('[worlds-db] better-sqlite3 not available; skipping DB reconciler');
		} else if (err.code !== 'SQLITE_CANTOPEN') {
		log(`[worlds-db] failed to open ${dbPath}: ${err.message}`);
		}
		return null;
		}
		}

		async function reconcile() {
		if (stopped) return;

		// Close any cached handle so tryOpenDb() opens a fresh connection below.
		// A long-lived readonly handle under cross-bind-mount WAL mode has its
		// read snapshot frozen at open time; closing and reopening starts a new
		// read transaction that includes all WAL frames committed by the host.
		if (db) {
		try { db.close(); } catch { /* ignore */ }
		db = null;
		}

		const database = tryOpenDb();
		if (!database) return;

		try {
		let runningIds;
		try {
		const rows = database.prepare("SELECT id FROM worlds WHERE status = 'running'").all();
		runningIds = new Set(rows.map((r) => r.id));
		} catch (err) {
		log(`[worlds-db] query failed: ${err.message}`);
		return;
		}

		const registry = getRegistry();

		// Add worlds that are running in DB but missing from registry.
		for (const id of runningIds) {
		if (id in registry) continue;
		const port = await getWorldPortFromDocker(id, dockerHost);
		if (port === null) {
		log(`[worlds-db] world ${id} running in DB but no docker port found; skipping`);
		continue;
		}
		log(`[worlds-db] reconcile: adding ${id} → :${port}`);
		onWorldAdded(id, port);
		}

		// Remove worlds that are registered but no longer 'running' in DB.
		for (const id of Object.keys(registry)) {
		if (runningIds.has(id)) continue;
		log(`[worlds-db] reconcile: removing ${id} (not running in DB)`);
		onWorldRemoved(id);
		}
		} finally {
		// Always close — no need to hold the handle between reconciles.
		try { db.close(); } catch { /* ignore */ }
		db = null;
		}
		}

		// Watch the DB file for changes (fast path).
		if (fs.existsSync(dbPath)) {
		try {
		watcher = fs.watch(dbPath, { persistent: false }, () => {
		void reconcile();
		});
		} catch (err) {
		log(`[worlds-db] fs.watch failed: ${err.message}; relying on 30s poll`);
		}
		// Initial reconcile on startup.
		void reconcile();
		} else {
		log(`[worlds-db] ${dbPath} not found; will poll every 30s`);
		}

		// 30s backstop poll. Also watches for the file to appear.
		const interval = setInterval(async () => {
		if (!watcher && fs.existsSync(dbPath)) {
		// File appeared since startup — set up watcher now.
		try {
		watcher = fs.watch(dbPath, { persistent: false }, () => { void reconcile(); });
		log(`[worlds-db] ${dbPath} appeared; watcher started`);
		} catch { /* fs.watch failure is non-fatal */ }
		}
		await reconcile();
		}, 30_000);

		return {
		stop() {
		stopped = true;
		clearInterval(interval);
		if (watcher) { try { watcher.close(); } catch { /* ignore */ } }
		if (db) { try { db.close(); } catch { /* ignore */ } }
		},
		};
		}

-59

host-cp/src/worlds-source.mjs

		/**
		* Phase E1 (olam-dogfood-vision): WorldsSource interface.
		*
		* Single narrow boundary that both LocalWorldsSource (today's
		* dockerode-driven enumeration) and PylonWorldsSource (future cloud
		* worlds) implement. The interface is the entire contract — there is
		* no shared abstract class, no shared base, no shared utility module.
		*
		* Per Phase E plan (S1 contract carried through C-phase): the wire
		* shape IS the abstraction. Sources implementing this interface are
		* free to pick any backend (dockerode, Pylon SDK, mock, sqlite cache
		* — anything) as long as `list()` returns the WorldSummary shape.
		*
		* Deliberately narrow:
		* - `name` — discriminator for the source. SPA uses this to render
		* the per-world `source` chip (E5).
		* - `list()` — read-only enumeration. NO mutations. Mutations stay
		* on host-cp's existing endpoints (POST /api/worlds delegation,
		* DELETE via per-world CP, etc.). T5 mitigation: keeping the
		* surface narrow lets the future Pylon SDK integration extend
		* `list()`'s implementation without forcing a contract change
		* across consumers.
		*
		* This is a `.mjs` file (matches host-cp's existing module style).
		* Type information is conveyed via JSDoc; consumers reading via
		* TypeScript get the shape via `// @ts-check` + JSDoc inference.
		*
		* @typedef {object} ServiceInfo
		* @property {string} name
		* @property {number} host_port
		* @property {number} internal_port
		* @property {string} url
		* @property {boolean} live
		*
		* @typedef {object} WorldSummary
		* @property {string} id
		* @property {string \| null} name
		* @property {'running' \| 'starting' \| 'unknown' \| 'failed'} status
		* @property {ServiceInfo[]} services
		* @property {'local' \| 'pylon-cloud'} source
		*
		* @typedef {object} WorldsSource
		* @property {'local' \| 'pylon-cloud'} name
		* @property {() => Promise<WorldSummary[]>} list
		*/

		// Canonical list of source-name discriminators, exported so consumers
		// don't repeat the literal strings. Both implementations + E4's
		// composition layer + E5's SPA badge logic reference this.
		export const SOURCE_NAMES = /** @type {const} */ ([
		  'local',
		  'pylon-cloud',
		]);

		// `WorldsSource` is a TYPE export — no runtime symbol. Consumers
		// import it via JSDoc references:
		// /** @type {import('./worlds-source.mjs').WorldsSource} */
		// or in TypeScript:
		// import type { WorldsSource } from './worlds-source.mjs';
		//
		// Test files exercising the interface treat it as duck-typed: any
		// object with the right shape passes structural compatibility.

dist/index.js

Sorry, the diff of this file is too big to display

dist/mcp-server.js

Sorry, the diff of this file is too big to display

@pleri/olam-cli - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics