🚀 Socket Launch Week Day 5:Introducing Repository Access Permissions and Custom Roles.Learn more
Sign In

@pleri/olam-cli

Package Overview
Dependencies
Maintainers
1
Versions
170
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@pleri/olam-cli - npm Package Compare versions

Comparing version
0.1.218
to
0.1.219
+1
-1
dist/index.js.map

@@ -1,1 +0,1 @@

{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;GAKG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AACrD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,yBAAyB,EAAE,MAAM,mCAAmC,CAAC;AAC9E,OAAO,EAAE,0BAA0B,EAAE,MAAM,oCAAoC,CAAC;AAChF,O
AAO,EAAE,8BAA8B,EAAE,MAAM,yCAAyC,CAAC;AACzF,OAAO,EAAE,2BAA2B,EAAE,MAAM,qCAAqC,CAAC;AAClF,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,uBAAuB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,yBAAyB,EAAE,MAAM,sBAAsB,CAAC;AAEjE,MAAM,UAAU,GAAG,cAAc,EAAE,CAAC;AACpC,iFAAiF;AACjF,OAAO,CAAC,GAAG,CAAC,kBAAkB,CAAC,GAAG,UAAU,CAAC;AAE7C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,MAAM,CAAC;KACZ,WAAW,CAAC,+DAA+D,CAAC;IAC7E,4EAA4E;IAC5E,0EAA0E;KACzE,MAAM,CAAC,OAAO,EAAE,6DAA6D,CAAC;KAC9E,OAAO,CAAC,UAAU,CAAC;KACnB,aAAa,CAAC,yBAAyB,EAAE,CAAC,CAAC;AAE9C,gEAAgE;AAChE,6EAA6E;AAC7E,4EAA4E;AAC5E,yEAAyE;AACzE,MAAM,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AACnD,IAAI,YAAY,KAAK,CAAC,CAAC,EAAE,CAAC;IACxB,qEAAqE;IACrE,+EAA+E;IAC/E,iDAAiD;IACjD,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IACzF,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,0DAA0D,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,KAAK;YACtF,qEAAqE,CACxE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AACvD,CAAC;AAED,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,gBAAgB,CAAC,O
AAO,CAAC,CAAC;AAC1B,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,sEAAsE;AACtE,yEAAyE;AACzE,0EAA0E;AAC1E,wEAAwE;AACxE,mBAAmB,CAAC,OAAO,EAAE,EAAE,MAAM,EAAE,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAC/D,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,iBAAiB,CAAC,OAAO,CAAC,CAAC;AAC3B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,mBAAmB,CAAC,OAAO,CAAC,CAAC;AAC7B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,yBAAyB,CAAC,OAAO,CAAC,CAAC;AACnC,0BAA0B,CAAC,OAAO,CAAC,CAAC;AACpC,8BAA8B,CAAC,OAAO,CAAC,CAAC;AACxC,2BAA2B,CAAC,OAAO,CAAC,CAAC;AACrC,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,uBAAuB,CAAC,OAAO,CAAC,CAAC;AACjC,cAAc,CAAC,OAAO,CAAC,CAAC;AAExB,6EAA6E;AAC7E,4EAA4E;AAC5E,uEAAuE;AACvE,iEAAiE;AACjE,KAAK,oBAAoB,CAAC,UAAU,CAAC,CAAC;AAEtC,sEAAsE;AACtE,yEAAyE;AACzE,yEAAyE;AACzE,4EAA4E;AAC5E,kDAAkD;AAClD,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC,EAAE,CAAC;IACrF,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAA
C,CAAC;IACpD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;AACtD,CAAC;AAED,wEAAwE;AACxE,0EAA0E;AAC1E,uEAAuE;AACvE,wEAAwE;AACxE,qEAAqE;AACrE,wEAAwE;AACxE,2EAA2E;AAC3E,+DAA+D;AAC/D,EAAE;AACF,wEAAwE;AACxE,0EAA0E;AAC1E,yEAAyE;AACzE,mEAAmE;AACnE,+DAA+D;AAC/D,sBAAsB;AACtB,IAAI,CAAC;IACH,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAC3B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;AACtC,CAAC;AAAC,OAAO,GAAY,EAAE,CAAC;IACtB,UAAU,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,IAAI,GAAG,YAAY,KAAK,IAAI,GAAG,CAAC,KAAK,EAAE,CAAC;QACnE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC;IACzC,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,CAAC;IAC9B,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC;AACnF,CAAC"}
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;GAKG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AACrD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,yBAAyB,EAAE,MAAM,mCAAmC,CAAC;AAC9E,OAAO,EAAE,0BAA0B,EAAE,MAAM,oCAAoC,CAAC;AAChF,OAAO,EAAE,8BAA8B,EAAE,MAAM,yCAAyC,CAAC;AACzF,OAAO,EAAE,2BAA2B,EAAE,MAAM,qCAAqC,CAAC;AAC
lF,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,uBAAuB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,yBAAyB,EAAE,MAAM,sBAAsB,CAAC;AAEjE,MAAM,UAAU,GAAG,cAAc,EAAE,CAAC;AACpC,iFAAiF;AACjF,OAAO,CAAC,GAAG,CAAC,kBAAkB,CAAC,GAAG,UAAU,CAAC;AAE7C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,MAAM,CAAC;KACZ,WAAW,CAAC,+DAA+D,CAAC;IAC7E,4EAA4E;IAC5E,0EAA0E;KACzE,MAAM,CAAC,OAAO,EAAE,6DAA6D,CAAC;KAC9E,OAAO,CAAC,UAAU,CAAC;KACnB,aAAa,CAAC,yBAAyB,EAAE,CAAC,CAAC;AAE9C,gEAAgE;AAChE,6EAA6E;AAC7E,4EAA4E;AAC5E,yEAAyE;AACzE,MAAM,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AACnD,IAAI,YAAY,KAAK,CAAC,CAAC,EAAE,CAAC;IACxB,qEAAqE;IACrE,+EAA+E;IAC/E,iDAAiD;IACjD,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IACzF,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,0DAA0D,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,KAAK;YACtF,qEAAqE,CACxE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AACvD,CAAC;AAED,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,eAAe
,CAAC,OAAO,CAAC,CAAC;AACzB,sEAAsE;AACtE,yEAAyE;AACzE,0EAA0E;AAC1E,wEAAwE;AACxE,mBAAmB,CAAC,OAAO,EAAE,EAAE,MAAM,EAAE,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAC/D,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,iBAAiB,CAAC,OAAO,CAAC,CAAC;AAC3B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,mBAAmB,CAAC,OAAO,CAAC,CAAC;AAC7B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,yBAAyB,CAAC,OAAO,CAAC,CAAC;AACnC,0BAA0B,CAAC,OAAO,CAAC,CAAC;AACpC,8BAA8B,CAAC,OAAO,CAAC,CAAC;AACxC,2BAA2B,CAAC,OAAO,CAAC,CAAC;AACrC,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,uBAAuB,CAAC,OAAO,CAAC,CAAC;AACjC,cAAc,CAAC,OAAO,CAAC,CAAC;AAExB,6EAA6E;AAC7E,4EAA4E;AAC5E,uEAAuE;AACvE,iEAAiE;AACjE,KAAK,oBAAoB,CAAC,UAAU,CAAC,CAAC;AAEtC,sEAAsE;AACtE,yEAAyE;AACzE,yEAAyE;AACzE,4EAA4E;AAC5E,kDAAkD;AAClD,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC,EAAE,CAAC;IACrF,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;IACpD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;AACtD,CAAC;AAED,wEAAwE;AACxE,0EAA0E;AAC1E,uEAAuE;AACvE,wEAAwE;AACxE,qEAAqE;AACrE,wEAAwE;
AACxE,2EAA2E;AAC3E,+DAA+D;AAC/D,EAAE;AACF,wEAAwE;AACxE,0EAA0E;AAC1E,yEAAyE;AACzE,mEAAmE;AACnE,+DAA+D;AAC/D,sBAAsB;AACtB,IAAI,CAAC;IACH,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAC3B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;AACtC,CAAC;AAAC,OAAO,GAAY,EAAE,CAAC;IACtB,UAAU,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,IAAI,GAAG,YAAY,KAAK,IAAI,GAAG,CAAC,KAAK,EAAE,CAAC;QACnE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC;IACzC,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,CAAC;IAC9B,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC;AACnF,CAAC"}
{
"bundledAt": "2026-06-15T08:36:13.799Z",
"bundledAt": "2026-06-18T05:40:50.182Z",
"kgFirstSha": "29a9ccce1b115d049e375c4a90eb5cf7c123e610e2d0590270a4db2cdbc64a28"
}
{
"name": "@pleri/olam-cli",
"version": "0.1.218",
"version": "0.1.219",
"type": "module",

@@ -13,7 +13,5 @@ "bin": {

"dist/mcp-server.js",
"dist/image-digests.json",
"dist/agent-stream",
"hermes-bundle",
"hooks",
"host-cp",
"memory-hooks",

@@ -43,3 +41,2 @@ "README.md"

"audit:publish-deps": "node scripts/audit-publish-deps.mjs",
"audit:cli-bundle-k8s": "node scripts/audit-cli-bundle-k8s.mjs",
"audit:cli-package-contents": "node scripts/audit-cli-package-contents.mjs",

@@ -46,0 +43,0 @@ "audit:cli-test-coverage": "node ../../scripts/audit-cli-test-coverage.mjs"

{
"auth": "sha256:770ee97ee4d06d2c1b6512ba99421a5fe312393d592df1684fd0d03b3476ff10",
"host-cp": "sha256:328baca8b9b28ccef1d858aa20e0ab27855604a630132dcadd423990cb376f60",
"kg-service": "sha256:f97ee90fe1bd5b12cb56d5fbf0d3085c301bb7abeef0dd28d2b2a5c90ab6efbb",
"memory-service": "sha256:923bff54d2ba3da162a35d3e8ebc6bd440bed6d290a5cff7bae2888281a4e003",
"mcp-auth": "sha256:eaac2164349e388a70dae0d86c34132f97aa74177a2376cdfa10732e8eadb507",
"$schema_version": 1,
"$published_version": "0.1.218",
"$registry": "ghcr.io/pleri"
}
# Phase F-2-B (B2): olam-host-cp compose stack.
#
# Two services on a private internal network:
#
# 1. host-cp — the SPA proxy server (B3+ implementation). Exposes
# port 19000 to the operator's host. Talks to the
# docker-socket-proxy via `tcp://docker-socket-proxy:2375`
# (NOT the raw /var/run/docker.sock).
#
# 2. docker-socket-proxy
# — tecnativa/docker-socket-proxy sidecar. Mounts the
# real /var/run/docker.sock read-only and exposes a
# whitelisted subset of the Docker API. Whitelist:
# CONTAINERS=1 — list/inspect (find world IDs)
# EVENTS=1 — stream restart/stop events
# (cache invalidation; B3 / T2)
# EXEC=1 — exec inside containers
# (read /tmp/olam-container-secret)
# IMAGES=1 — inspect locally-pulled images
# (version-status.mjs image SHA lookup)
# POST=1 — allow POST verbs on whitelisted
# endpoints (exec creation)
# Everything else is denied (volumes, networks,
# swarm, build, etc.). T6 + T8
# mitigation: blast-radius reduction vs raw socket.
#
# Bring up: `docker compose -f packages/host-cp/compose.yaml up -d`
# (image-only; add `-f packages/host-cp/compose.dev.yaml` and `--build` for local source builds)
# Tear down: `docker compose -f packages/host-cp/compose.yaml down`
services:
olam-host-cp:
container_name: olam-host-cp
# Image-only — operator's `olam bootstrap` pulls the digest-pinned
# `ghcr.io/pleri/olam-host-cp:latest` (digest from image-digests.json)
# and tags it as the local `:latest` BEFORE compose up. No `build:`
# in this file — fresh-install operators don't have the source tree
# so a `build:` block crashes them with "Dockerfile not found".
#
# Local-dev contributors who want to test host-cp source changes
# use the sibling compose.dev.yaml as an override:
#
# docker compose \
# -f packages/host-cp/compose.yaml \
# -f packages/host-cp/compose.dev.yaml \
# up --build -d
#
# The CLI's `olam host-cp start` always uses ONLY compose.yaml, so
# operator boots are never blocked on a missing Dockerfile / build
# context.
image: ghcr.io/pleri/olam-host-cp:latest
ports:
# Bind to 127.0.0.1 only — single-user-per-host assumption (T4).
# Multi-user / TLS / remote access lands in Phase G+.
- "127.0.0.1:19000:19000"
environment:
# Connection string for docker-socket-proxy. The proxy listens on
# tcp://0.0.0.0:2375 inside the internal network. host-cp uses
# this to enumerate worlds (containers list) + read secrets
# (containers exec) + subscribe to restart events.
DOCKER_HOST: "tcp://docker-socket-proxy:2375"
# Phase F-2-B M2 ship gate: secret cache TTL (5min, demoted from
# 1h per D2). B3 reads this; B10's m2-cache-invalidate.sh tests
# the docker-events invalidation path.
OLAM_SECRET_CACHE_TTL_SEC: "300"
# Bind operator-facing UI port. Always 19000 in compose.
OLAM_HOST_CP_PORT: "19000"
# Token + workspace + world registry mount points. Bind-mounted
# below; host CP reads these at boot.
OLAM_HOST_CP_TOKEN_PATH: "/data/host-cp.token"
OLAM_WORKSPACES_DIR: "/data/workspaces"
OLAM_WORLDS_DB: "/data/worlds.db"
OLAM_PR_POLL_INTERVAL_MS: "300000"
OLAM_MERGE_GRACE_MS: "600000"
# NOTE: OLAM_REPO_PATH is intentionally NOT passed into the
# container env. The HOST-side variable names a bind-mount source
# (a host path like /Users/.../olam — see the volumes block below).
# Inside the container, the bind-mount target is always
# `/operator-repo`. Pre-fix the env was passed through, server.mjs
# consumers (version-status.mjs, /api/prs handler) read it
# expecting a container-side path, then `cwd:` to a host path that
# doesn't exist inside the container — `gh pr list` failed with
# "not a git repository", `gh` itself failed with `spawn ENOENT`.
# Server-side consumers default to `/operator-repo` which is
# always correct.
# Auth-service inter-service auth. The secret is shared with the
# long-lived olam-auth container (generated on first `olam auth
# up` at ~/.olam/auth-secret). Without it, X-Olam-Secret is never
# sent and auth-service 401s every host-cp → /credentials/* call,
# which surfaces in the dashboard as a failed Connect Claude flow.
OLAM_AUTH_SERVICE_URL: "http://host.docker.internal:9999"
OLAM_AUTH_SECRET: "${OLAM_AUTH_SECRET:-}"
# Operator's CLI version, propagated by `olam host-cp start` via
# buildComposeEnv. Surfaces in /api/version/status so the
# dashboard's TopNav can render "the version we're working on."
# Empty when older CLI versions render this compose; the server
# falls back to host-cp's own package.json.
OLAM_CLI_VERSION: "${OLAM_CLI_VERSION:-}"
# Upgrade-trigger feature: host-cp uses these to construct bind
# mounts on the spawned upgrader container. The upgrader runs
# `olam upgrade -y` and needs (a) the operator's ~/.olam state,
# (b) the docker socket so the CLI can talk to the daemon. Both
# are HOST-side paths because docker resolves bind sources on
# the daemon, not inside the requesting container.
OLAM_HOME_HOST_PATH: "${HOME}/.olam"
OLAM_DOCKER_SOCK_HOST_PATH: "/var/run/docker.sock"
# Operator's olam repo path on the host. The upgrader needs this
# bind-mounted so the CLI's cwd-relative `packages/host-cp/compose.yaml`
# lookup resolves. Defaults to the `OLAM_REPO_PATH` already used by
# host-cp for version-detection (mounted at /operator-repo:ro).
OLAM_REPO_HOST_PATH: "${OLAM_REPO_PATH:-${HOME}/Projects/ein-sof/olam}"
# Operator's $HOME on the host. Forwarded to the upgrader as HOME
# so `${HOME}` interpolation in compose.yaml's bind sources
# resolves to a path the docker daemon can find. Inside the
# upgrader container HOME defaults to /root, which the daemon
# rejects when used as a bind source ("path not shared from
# the host"). Without this the recreate step fails right at the
# last hop of the upgrade pipeline.
OLAM_OPERATOR_HOME_HOST_PATH: "${HOME}"
# GitHub CLI config bind for the upgrader. The CLI runs
# `gh auth token | docker login ghcr.io ...` before `olam upgrade`
# so the spawned container can pull GHCR images even though the
# host's ~/.docker/config.json uses a Keychain credsStore that
# doesn't follow into a Linux container. The gh config is also
# mounted into host-cp itself (see the volumes block below) for `gh pr list` —
# this is the same path, mounted again for the upgrader.
OLAM_GH_CONFIG_HOST_PATH: "${HOME}/.config/gh"
# GitHub token used by the upgrader to `docker login ghcr.io` so it can
# pull the host-cp / auth / devbox images by digest. Resolved from
# the operator's `gh auth token` BEFORE compose up (or set explicitly
# via the GH_TOKEN env var). If unset, the upgrader falls back to
# `gh auth token` against the mounted ~/.config/gh — which works
# only on Linux operators (macOS keeps the token in Keychain, not in
# ~/.config/gh).
GH_TOKEN: "${GH_TOKEN:-}"
# Optional override for the upgrader image. Defaults to the
# currently-running host-cp image (which has the olam CLI +
# docker CLI + gh CLI baked in by Dockerfile).
OLAM_UPGRADER_IMAGE: "${OLAM_UPGRADER_IMAGE:-ghcr.io/pleri/olam-host-cp:latest}"
# Plan DB persistence fix (Bug 1): os.homedir() inside the container is
# /root, but ~/.olam is bind-mounted to /data — not /root/.olam. Without
# these overrides, plan.db lands in the container's ephemeral layer and is
# destroyed by every `docker compose up --force-recreate` (i.e. olam upgrade).
# Pointing to /data/ routes all writes through the bind-mount to the host.
OLAM_PLAN_DB_PATH: "/data/plan.db"
OLAM_PLAN_DIR: "/data/plan"
# Same /root vs /data bind-mount issue applies to the plan-chat bearer
# gateway. Without this override, plan-chat-secret.mjs reads from
# /root/.olam/plan-chat-secret (container ephemeral, missing) and
# /agent-runtime/trigger answers HTTP 500. Routing through /data
# surfaces the on-disk bearer created by ensureSecret() on host FS.
OLAM_PLAN_CHAT_SECRET_PATH: "/data/plan-chat-secret"
volumes:
# ~/.olam/ from operator's home → /data/ inside container. B4
# writes the startup token here (chmod 600). B6 reads workspaces
# + worlds.db from here. ~/.olam/ is the canonical operator-state
# directory established by the Olam CLI; consistent with the
# devbox container's mount layout.
- ${HOME}/.olam:/data
- ${HOME}/.config/gh:/gh-config:ro
# Operator's olam repo mounted read-only so host-cp can poll
# .git/refs/heads/main to detect when a new version is available.
# The path inside the container is always /operator-repo.
# On the host: OLAM_REPO_PATH env var, or defaults to
# $HOME/Projects/ein-sof/olam. If the path doesn't exist, the
# mount is a no-op and version detection reports "operator-repo not mounted".
- ${OLAM_REPO_PATH:-${HOME}/Projects/ein-sof/olam}:/operator-repo:ro
depends_on:
docker-socket-proxy:
condition: service_started
networks:
- olam-host-cp-internal
restart: unless-stopped
docker-socket-proxy:
container_name: olam-docker-socket-proxy
# Pin to a specific tag, not :latest. Update via Renovate / dependabot.
# tecnativa/docker-socket-proxy:0.3.0 (2024-10-22) — last tagged
# release as of plan-pass-2 emit. T8 mitigation: pinning prevents
# supply-chain drift on the sidecar.
image: tecnativa/docker-socket-proxy:0.3.0
environment:
# Whitelist matches plan D5 + T6/T8: host CP needs exactly the
# flags enabled below (CONTAINERS, EVENTS, EXEC, IMAGES, POST).
# EVERYTHING else stays at the proxy default
# (deny). Audit periodically; widen with explicit justification.
CONTAINERS: "1"
EVENTS: "1"
EXEC: "1"
# Allows GET /images/<ref>/json. Needed by version-status.mjs to
# resolve the baked OLAM_BUILD_SHA of locally-pulled images
# (host-cp + auth-service + devbox `:latest` tags) so the
# upgrade comparator can answer "is there a newer image I'd
# actually swap to?" — see PR #459 for the comparator rewrite
# and `fetchLatestImageSha`. Without this, both the new
# comparator AND the pre-existing fetchDevboxImageSha fall back
# to 'unknown', producing the over-reporting "Upgrade available"
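# banner regression. Intended use is read-only inspect; no container
# mutation is performed through this endpoint.
# NOTE(review): the `:ro` socket mount only protects the socket file
# itself — it does not make the Docker API read-only. With POST=1 also
# set below, non-GET requests to /images/* are presumably allowed by
# the proxy; confirm against the docker-socket-proxy documentation.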
IMAGES: "1"
# tecnativa/docker-socket-proxy 0.3.0 requires POST=1 to allow
# POST verbs on whitelisted endpoints (exec creation requires
# POST /containers/<id>/exec + POST /exec/<id>/start). Phase
# F-2-D dogfood revealed the missing perm.
POST: "1"
# Optional: lower log verbosity. Default is INFO; DEBUG floods
# logs in dev. Comment out for troubleshooting.
LOG_LEVEL: "warning"
volumes:
# Mount the host's docker socket READ-ONLY. The proxy is the only
# consumer of the raw socket. host-cp talks to the proxy over
# TCP (port 2375 on the internal network).
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- olam-host-cp-internal
restart: unless-stopped
networks:
olam-host-cp-internal:
name: olam-host-cp-internal
driver: bridge
# Proxy is internal-only: docker-socket-proxy publishes no host port, so
# host-cp <-> proxy traffic never leaves the docker network. (host-cp
# itself publishes 127.0.0.1:19000 for the operator UI.)
# Host-side docker-socket-proxy for the olam kubernetes substrate.
#
# Background — round-4 wave-2 R4-W2-F (kuro-bear retest 2026-05-21):
# on macOS + colima + virtiofs, containerd's OCI runtime spec generator
# calls stat() on docker.sock hostPath bind mounts; virtiofs returns
# ENOTSUP for stat/statx on socket files; pod creation fails. The R3-A
# two-volume hostPath approach is unrecoverable on virtiofs.
#
# This compose file provisions the docker-socket-proxy AS A HOST-SIDE
# CONTAINER (sibling to k3d on the operator's docker daemon), NOT as a
# pod inside the k3d cluster. The in-cluster Service in
# packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml is
# `type: ExternalName` aliasing `host.k3d.internal` — cluster pods reach
# THIS container via that DNS handle.
#
# Architecture mirrors the compose substrate's pattern (see
# packages/host-cp/compose.yaml:170-210). Same image, same allowlist,
# same restart policy. The only difference: this proxy publishes to
# the operator host on 127.0.0.1:2375 so k3d nodes can reach it via
# host.k3d.internal — the compose-substrate sibling stays internal-only.
#
# Operator UX: `olam upgrade -y` Step 0.7 auto-starts this on macOS via
# `docker compose -f <this-file> up -d`. Linux operators get a no-op
# (Step 0.7 is platform-gated). See docs/operator/kubernetes-substrate-beta.md.
services:
docker-socket-proxy:
container_name: olam-host-side-docker-socket-proxy
# tecnativa/docker-socket-proxy:0.3.0 — matches the compose substrate's
# pin verbatim. T8 supply-chain: pinning prevents drift. Update via
# Renovate / dependabot.
image: tecnativa/docker-socket-proxy:0.3.0
environment:
# Whitelist matches packages/host-cp/compose.yaml:181-202 verbatim.
# Anything outside this list stays at proxy default (deny).
CONTAINERS: "1"
EVENTS: "1"
EXEC: "1"
# IMAGES=1 needed for GET /images/<ref>/json (version-status.mjs
# fetchLatestImageSha). Socket is :ro so this is read-only inspect.
IMAGES: "1"
# POST=1 required since tecnativa 0.3.0 for exec creation
# (POST /containers/<id>/exec + POST /exec/<id>/start). See
# packages/host-cp/compose.yaml:195-199 for the F-2-D dogfood
# finding that surfaced this.
POST: "1"
LOG_LEVEL: "warning"
ports:
# Publish to operator host on 127.0.0.1:2375 ONLY. k3d nodes reach
# this via host.k3d.internal:2375. Binding to 127.0.0.1 (not
# 0.0.0.0) is T1 mitigation: docker API surface stays loopback-only
# on a single-tenant operator machine.
- "127.0.0.1:2375:2375"
volumes:
# Read-only mount of the host's docker socket. The proxy is the
# only consumer of the raw socket on the operator's mac.
- /var/run/docker.sock:/var/run/docker.sock:ro
restart: unless-stopped
apiVersion: v1
kind: Namespace
metadata:
name: olam
labels:
name: olam
olam.io/component: host-stack
apiVersion: v1
kind: ServiceAccount
metadata:
name: olam-host-cp
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
# Phase 1b Decision 19: Role scoped to resourceNames: ["olam-host-cp"] on
# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
# could patch ANY Deployment in the namespace. This is the load-bearing
# security guardrail — preserve verbatim.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: olam-host-cp
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
resourceNames: ["olam-host-cp"]
verbs: ["get", "patch", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: olam-host-cp
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
subjects:
- kind: ServiceAccount
name: olam-host-cp
namespace: olam
roleRef:
kind: Role
name: olam-host-cp
apiGroup: rbac.authorization.k8s.io
# ConfigMap for olam-host-cp environment. Sensitive values (OLAM_AUTH_SECRET,
# GH_TOKEN) are NOT here — they live in the Secret (see templates/40-secret-template.yaml).
# Operators apply the Secret separately before applying the manifests.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-host-cp-env
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
data:
# Auth service URL. Default targets host.docker.internal for Colima/Docker
# Desktop k3d setups. Override when auth-service runs elsewhere (e.g. via
# an ExternalName Service pointing at the host gateway).
#
# Port :9999 matches the published port in AuthContainerController.start()
# (packages/core/src/auth/container.ts) — the value was historically :8000,
# which never matched any running auth-service version and surfaced as
# {"error":"auth_service_unavailable","message":"fetch failed"}
# on /api/auth/* calls. Verified during the K3d-HTTPS PR live bring-up;
# see docs/runbooks/k3d-https-setup.md.
OLAM_AUTH_SERVICE_URL: "http://host.docker.internal:9999"
# Docker socket proxy — ClusterIP Service DNS inside the namespace.
DOCKER_HOST: "tcp://docker-socket-proxy:2375"
# Host-cp server port — must match the Service targetPort in 60-service.yaml.
OLAM_HOST_CP_PORT: "19000"
# Operator state paths under /data (backed by the olam-host-cp-data
# PersistentVolumeClaim rather than a hostPath volume).
OLAM_HOST_CP_TOKEN_PATH: "/data/host-cp.token"
OLAM_WORKSPACES_DIR: "/data/workspaces"
OLAM_WORLDS_DB: "/data/worlds.db"
OLAM_PLAN_DB_PATH: "/data/plan.db"
OLAM_PLAN_DIR: "/data/plan"
# Phase B Model B: bearer file is now sourced from the shared
# olam-plan-chat-secret Kubernetes Secret (mounted at /etc/olam-plan-chat/).
# Two readers, one source-of-truth — replaces the per-pod /data/plan-chat-secret
# file that couldn't be shared across pods on RWO PVCs. The plan-chat-service
# pod also mounts the SAME Secret at the SAME path so bearer comparisons
# work both ways.
OLAM_PLAN_CHAT_SECRET_PATH: "/etc/olam-plan-chat/secret"
# In-cluster plan-chat-service URL. Rewritten by upgrade-kubernetes.ts step 2.5
# (buildK8sDnsUrl) — the default below is a sane fallback for raw
# `kubectl apply -f` operators who skip the CLI wrapper.
PLAN_CHAT_SERVICE_URL: "http://olam-plan-chat-service.olam.svc.cluster.local:3200"
# NDJSON span sink + recovery ledger — route to the writable PVC mount at
# /data rather than the default ~/.olam/logs (which resolves to
# /home/node/.olam/logs and is not writable with readOnlyRootFilesystem: true).
OLAM_TRACE_LOG_PATH: "/data/logs/host.trace.ndjson"
OLAM_RECOVERY_LEDGER_PATH: "/data/logs/recovery-ledger.ndjson"
# Tunable defaults.
OLAM_SECRET_CACHE_TTL_SEC: "300"
OLAM_PR_POLL_INTERVAL_MS: "300000"
OLAM_MERGE_GRACE_MS: "600000"
# World watchdog — periodic probe of each active world's claude PID for the
# three wedge signals (wchan + CLOSE_WAIT + CPU). Detection-only in Phase A.
# Set OLAM_WORLD_WATCHDOG_DISABLED=1 in the deployment env to kill-switch.
OLAM_WORLD_WATCHDOG_TICK_MS: "30000"
# PersistentVolumeClaim for olam-host-cp /data volume — k3d substrate default.
#
# Why PVC instead of hostPath:
# hostPath volumes on k3d nodes resolve to paths INSIDE the k3d node
# container — not the operator's host filesystem. A bare k3d cluster has
# an empty node filesystem, so a hostPath at /host/.olam is always empty.
# Additionally, fsGroup does NOT relabel hostPath volumes (only PVCs /
# emptyDir / projected volumes), so UID-1000 pods cannot write to
# root-owned hostPath mounts even when fsGroup: 1000 is set.
#
# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
# On non-k3d clusters, substitute with the appropriate StorageClass name (D24,
# operator-editable). For managed clusters (GKE, EKS, AKS) use the GKE-variant
# manifest instead: packages/host-cp/k8s/manifests/gke/45-pvc.yaml (storageClassName:
# standard-rwo). See docs/architecture/peripheral-services-on-k3s.md Decision #3
# for the full per-cluster storageclass table.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: olam-host-cp-data
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-path
resources:
requests:
storage: 5Gi
# Deployment for olam-host-cp.
#
# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
# Digest resolves to ghcr.io/pleri/olam-host-cp:0.1.168 (multi-arch index).
# Pinned to the last image built before PRs #915/#919/#920/#921 introduced
# lifecycle/, observability/, and recovery/ module directories — those PRs
# updated server.mjs imports but the Dockerfile was not updated to COPY
# the new directories, so all images from 0.1.169+ crash with
# ERR_MODULE_NOT_FOUND. The Dockerfile fix (COPY lifecycle/ / observability/
# / recovery/) lands in PR #940; the next release will ship a working image.
# At that point, refresh this digest via the instructions below.
# To update: resolve the new tag's digest via:
# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-host-cp:pull&service=ghcr.io" | jq -r .token)
# curl -sI -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
# https://ghcr.io/v2/pleri/olam-host-cp/manifests/<tag> | grep docker-content-digest
#
# securityContext: conservative defaults per T6/T7 threat model.
# Operators who need to relax these (e.g. for debugging) must pass
# --accept-security-regression (Phase C, Decision D14) — out of scope here.
#
# Volume requirements for k3d:
# olam-home (/data): backed by a PersistentVolumeClaim (45-pvc.yaml).
# An init container (chown-data) runs `chown -R 1000:1000 /data` as root
# before the main container starts, granting UID-1000 write access on the
# freshly-provisioned PV. fsGroup alone is insufficient for local-path PVs
# (local-path provisions hostPath-backed directories, which fsGroup does not
# relabel).
#
# docker access — NO LONGER VIA hostPath (changed in olam-k3d-on-mac-
# substrate-decision Phase B B2, 2026-05-21). The previous R3-A two-volume
# hostPath pattern is retracted: round-4 R4-W2-F showed virtiofs returns
# ENOTSUP on stat/statx of socket files, and that failure is unrecoverable
# at the containerd OCI runtime layer. host-cp now reaches docker via TCP
# through the docker-socket-proxy ExternalName Service in the olam
# namespace (packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml),
# which kube-dns resolves as a CNAME to host.k3d.internal. The actual
# proxy container runs on the operator's docker daemon (sibling to k3d),
# started by `olam upgrade` Step 0.7. See also
# packages/host-cp/src/lib/docker-request-options.mjs (both substrates now
# return identical TCP options).
#
# The operator's k3d cluster create command is therefore simpler — no
# `--volume $HOME/.colima/default/:/host-colima/@server:*` flag needed.
# See docs/operator/kubernetes-substrate-beta.md for the current install
# command.
#
# gh-config (/gh-config) and operator-repo (/operator-repo) remain hostPath
# volumes that resolve to paths inside the k3d node container
# (/host/.config/gh and /host/olam respectively).
# OPERATORS MUST map host directories onto those node paths with `--volume`
# flags when creating the k3d cluster. Without those flags the gh-config and
# operator-repo mounts will be empty (DirectoryOrCreate creates empty
# directories). The pod will still start — features that depend on GitHub
# auth or the operator repo will fail gracefully.
apiVersion: apps/v1
kind: Deployment
metadata:
name: olam-host-cp
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: olam-host-cp
template:
metadata:
labels:
app: olam-host-cp
spec:
# B9 (round 2 recovery): disable k8s automatic Service env injection.
# Without this, k8s injects OLAM_<UPPER-NAME>_SERVICE_HOST/_PORT env vars
# into all Pods in the namespace. These collide with olam's own config env
# vars (e.g. OLAM_KG_SERVICE_PORT) causing Python's int() to crash on the
# auto-injected "tcp://..." string. Decision #4 (no app-code rename; field
# removes the collision class entirely). GA since k8s 1.13; we target 1.30+.
enableServiceLinks: false
# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
# created by `olam upgrade` step 0.4 when GH_TOKEN is available. Allows
# pulling from ghcr.io/pleri/* without anonymous rate limits.
imagePullSecrets:
- name: ghcr-pull
serviceAccountName: olam-host-cp
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
initContainers:
- name: chown-data
# busybox:1.36 — sha256-pinned per T4 threat model.
# To update: docker pull busybox:1.36 && docker inspect busybox:1.36 --format '{{index .RepoDigests 0}}'
image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
imagePullPolicy: IfNotPresent
# Run as root to chown the freshly-provisioned PV to UID 1000.
# The pod-level runAsNonRoot: true is overridden here deliberately.
# The main container still runs as UID 1000 with all security defaults intact.
securityContext:
runAsUser: 0
runAsNonRoot: false
allowPrivilegeEscalation: false
command: ["chown", "-R", "1000:1000", "/data"]
volumeMounts:
- name: olam-home
mountPath: /data
# socket-perm init container REMOVED in olam-k3d-on-mac-substrate-decision
# Phase B B2 (2026-05-21). The R3-A two-volume hostPath approach for
# docker.sock has been retracted: round-4 R4-W2-F showed virtiofs
# ENOTSUP on socket-file stat blocks the mount entirely. host-cp now
# reaches docker via TCP through the docker-socket-proxy ExternalName
# Service in the olam namespace (see
# packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml).
# The proxy itself runs on the operator's docker daemon (sibling to
# k3d), started by `olam upgrade` Step 0.7 — not inside this Pod.
containers:
- name: olam-host-cp
image: ghcr.io/pleri/olam-host-cp@sha256:328baca8b9b28ccef1d858aa20e0ab27855604a630132dcadd423990cb376f60
imagePullPolicy: IfNotPresent
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
ports:
- name: http
containerPort: 19000
protocol: TCP
env:
# World watchdog — tick cadence (from ConfigMap default = 30s).
# Override per-operator to tune probe frequency.
- name: OLAM_WORLD_WATCHDOG_TICK_MS
valueFrom:
configMapKeyRef:
name: olam-host-cp-env
key: OLAM_WORLD_WATCHDOG_TICK_MS
# Set to "1" to disable the world-watchdog entirely (emergency kill switch).
# Unset by default — watchdog runs in detection-only mode.
# - name: OLAM_WORLD_WATCHDOG_DISABLED
# value: "1"
envFrom:
- configMapRef:
name: olam-host-cp-env
- secretRef:
name: olam-host-cp-secret
volumeMounts:
- name: olam-home
mountPath: /data
- name: gh-config
mountPath: /gh-config
readOnly: true
- name: operator-repo
mountPath: /operator-repo
readOnly: true
- name: tmp
mountPath: /tmp
# Phase B Model B: shared olam-plan-chat-secret mounted read-only
# so renderSpaShell can inject window.__OLAM_PLAN_CHAT_BEARER__.
# Plan-chat-service mounts the SAME Secret at the SAME path so
# bearer compares match across pods.
- name: plan-chat-secret
mountPath: /etc/olam-plan-chat
readOnly: true
# docker-socket volumeMount REMOVED in olam-k3d-on-mac-substrate-
# decision Phase B B2. Docker access now goes via TCP to the
# docker-socket-proxy ExternalName Service in the olam namespace.
# host-cp's `getDockerRequestOptions('kubernetes')` returns
# `{ host: 'docker-socket-proxy', port: 2375 }` (collapsed to the
# same value as the compose substrate's branch — see
# packages/host-cp/src/lib/docker-request-options.mjs).
readinessProbe:
httpGet:
path: /health
port: 19000
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 6
livenessProbe:
httpGet:
path: /health
port: 19000
initialDelaySeconds: 30
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
requests:
cpu: "50m"
memory: "256Mi"
limits:
cpu: "1000m"
memory: "1Gi"
volumes:
- name: olam-home
persistentVolumeClaim:
claimName: olam-host-cp-data
- name: gh-config
hostPath:
path: /host/.config/gh
type: DirectoryOrCreate
- name: operator-repo
hostPath:
path: /host/olam
type: DirectoryOrCreate
- name: tmp
emptyDir: {}
- name: plan-chat-secret
secret:
secretName: olam-plan-chat-secret
defaultMode: 0400
items:
- key: PLAN_CHAT_SECRET
path: secret
# host-colima + docker-socket volumes REMOVED in olam-k3d-on-mac-
# substrate-decision Phase B B2 (2026-05-21). R3-A's two-volume
# hostPath approach is fully retracted: round-4 R4-W2-F demonstrated
# virtiofs ENOTSUP on socket-file stat is unrecoverable at the
# containerd OCI runtime layer (kubelet bypass via R4-W2-E was
# necessary-but-not-sufficient). host-cp now reaches docker via TCP
# through the docker-socket-proxy ExternalName Service — see
# packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml.
# The proxy itself runs on the operator's docker daemon (sibling to
# k3d), started by `olam upgrade` Step 0.7 on macOS.
# ClusterIP Service for olam-host-cp.
#
# Two ways to reach the SPA externally:
# 1. (preferred) Traefik IngressRoute at https://olam.local:<traefik-https-port>
# Terminates TLS at the cluster edge, unlocks HTTP/2 multiplexing for
# Electric SQL long-polls. See 70-ingressroute.yaml + 65-tls-secret-template.yaml.tmpl.
# The pod itself stays HTTP-only — Traefik handles TLS at the edge.
# 2. (fallback) kubectl port-forward -n olam svc/olam-host-cp 19000:19000
# Plain HTTP/1.1; hits browser's 6-conn-per-origin cap under Electric load.
#
# ClusterIP (not NodePort) preserves the "127.0.0.1-only" single-user-per-host
# invariant — exposure is via Traefik's LoadBalancer or port-forward, not by
# binding pod ports on every node interface.
apiVersion: v1
kind: Service
metadata:
name: olam-host-cp
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
spec:
type: ClusterIP
selector:
app: olam-host-cp
ports:
- name: http
port: 19000
targetPort: 19000
protocol: TCP
# TLS secret template for olam-host-cp Traefik IngressRoute.
#
# DO NOT apply this template directly — the placeholders `__TLS_CRT_BASE64__`
# and `__TLS_KEY_BASE64__` are substituted at apply time by
# `olam services tls-install` (packages/cli/src/commands/services-tls.ts),
# which uses `mkcert` to mint a locally-trusted certificate for the SAN list
# olam.local 127.0.0.1 ::1
# and then `kubectl apply -f -` against the rendered manifest.
#
# Why a Secret of type kubernetes.io/tls (instead of a plain Opaque secret):
# Traefik's IngressRoute TLS resolver requires this exact type — it reads
# tls.crt + tls.key fields by convention. Using Opaque would silently fail
# the handshake at request time.
#
# Why the cert covers SANs (not just CN): modern browsers (Chrome 58+, Brave,
# Safari, Firefox) ignore the certificate CN entirely and only honour SANs.
# Without `127.0.0.1` + `::1` in the SAN list, hitting the IP directly fails
# even though the cert is "valid for olam.local".
#
# Renewal: certs minted by mkcert are valid ~2 years and 3 months. The
# tls-install command checks NotAfter and regenerates when within 30 days
# of expiry. To force regeneration: `kubectl -n olam delete secret olam-host-cp-tls`
# and re-run `olam services tls-install`.
apiVersion: v1
kind: Secret
metadata:
name: olam-host-cp-tls
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
type: kubernetes.io/tls
data:
tls.crt: __TLS_CRT_BASE64__
tls.key: __TLS_KEY_BASE64__
# Traefik IngressRoute terminating TLS at the cluster edge for olam-host-cp.
#
# Topology:
# Browser --HTTPS/h2--> Traefik :443 (LoadBalancer / k3d NodePort)
# |
# | (TLS terminated; cleartext inside cluster)
# v
# olam-host-cp:19000 (ClusterIP, HTTP/1.1 internal)
# |
# v
# plan-chat-service:3200 (and other peripherals)
#
# Why terminate TLS at Traefik (NOT at host-cp): host-cp is a Node/Hono
# server tuned for cleartext HTTP. Pushing TLS into the pod would force a
# second cert-distribution mechanism (Secret → volumeMount → server.mjs
# reload) and double the operational surface. Traefik already owns cert
# lifecycle in production (cert-manager + Let's Encrypt), so dev-mode
# mkcert at the same boundary keeps prod parity tight.
#
# Why HTTP/2 matters: TanStack DB / Electric SQL opens N long-poll
# connections per browser tab (one per shape subscription). Without h2
# multiplexing they queue against the browser's 6-connection-per-origin
# cap, leading to the "25-second pending requests" symptom Electric users
# hit on HTTP/1.1. Traefik 2.x advertises h2 over TLS via ALPN by default;
# no extra config needed.
#
# Why Host(olam.local) instead of a wildcard: the cert is minted for that
# exact SAN. Traefik routes based on SNI, so the host-rule must match the
# cert subject or the TLS handshake completes but the route 404s.
#
# Operator MUST add `127.0.0.1 olam.local` to /etc/hosts before this works.
# `olam services tls-install` prints the line + sudo command — it does NOT
# auto-edit (touching /etc/hosts behind the operator's back is a foot-gun).
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
# Distinct name avoids collision with packages/peripheral-services'
# `olam-host-cp` IngressRoute (the legacy `web`-entrypoint + path-based
# router that 50+ SPA fetch sites still depend on). The `-https` variant
# adds a SECOND ingress that matches Host(olam.local) on `websecure` and
# terminates TLS via the operator-minted Secret. Both coexist; the legacy
# one keeps `http://<lb>/api/...` working, this one unlocks HTTP/2.
name: olam-host-cp-https
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
spec:
entryPoints:
- websecure
routes:
- match: Host(`olam.local`)
kind: Rule
services:
- name: olam-host-cp
port: 19000
tls:
secretName: olam-host-cp-tls
# ServiceAccount for olam-auth-service.
#
# Referenced by the olam-auth-service Deployment (serviceAccountName) and
# bound to the olam-auth-service Role by the RoleBinding of the same name.
# Carries no permissions of its own — everything it may do comes from that Role.
apiVersion: v1
kind: ServiceAccount
metadata:
name: olam-auth-service
namespace: olam
labels:
app: olam-auth-service
olam.io/component: peripheral
# Phase 1a Decision 19: Role scoped to resourceNames: ["olam-auth-service"] on
# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
# could patch ANY Deployment in the namespace. This is the load-bearing
# security guardrail — preserve verbatim.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: olam-auth-service
namespace: olam
labels:
app: olam-auth-service
olam.io/component: peripheral
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
resourceNames: ["olam-auth-service"]
verbs: ["get", "patch", "watch"]
---
# RoleBinding: grants the namespaced olam-auth-service Role to the
# olam-auth-service ServiceAccount in the olam namespace. The Role it points
# at is restricted by resourceNames to the olam-auth-service Deployment, so
# this binding does not widen access to other Deployments.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: olam-auth-service
namespace: olam
labels:
app: olam-auth-service
olam.io/component: peripheral
subjects:
- kind: ServiceAccount
name: olam-auth-service
namespace: olam
roleRef:
kind: Role
name: olam-auth-service
apiGroup: rbac.authorization.k8s.io
# ConfigMap for olam-auth-service environment. Sensitive values (AUTH_DB_SECRET,
# API keys) are NOT here — they live in the Secret (see templates/auth-service-secret-template.yaml).
# Operators apply the Secret separately before applying the manifests.
#
# Inter-peripheral URL placeholders (e.g. OLAM_MCP_AUTH_SERVICE_URL) are set to
# cluster-internal DNS names. These are resolved by Phase C substitution;
# operators running Phase 2 Beta may override them directly.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-auth-service-env
namespace: olam
labels:
app: olam-auth-service
olam.io/component: peripheral
data:
# Port auth-service listens on. Must match 60-service.yaml targetPort.
OLAM_AUTH_PORT: "9999"
# Data directory — backed by the PVC mounted at /data.
OLAM_AUTH_DATA_PATH: "/data/auth"
# URL of mcp-auth-service (cluster-internal DNS). Override in non-k3d environments.
OLAM_MCP_AUTH_SERVICE_URL: "http://olam-mcp-auth-service.olam.svc.cluster.local:9998"
# Credential vault poll interval.
OLAM_CREDENTIAL_POLL_MS: "60000"
# R3-B (Decision R3-#2): bind on all interfaces so the k8s readiness probe
# (hitting the pod IP 10.42.x.x:9999) succeeds. Default in image source was
# 127.0.0.1 which caused CrashLoopBackOff in k8s. ConfigMap override is the
# second defense layer; the image source default was also changed to 0.0.0.0.
AUTH_BIND: "0.0.0.0"
# PersistentVolumeClaim for olam-auth-service /data volume.
#
# Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml
# for the full rationale (fsGroup, k3d node filesystem, etc.).
#
# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
# On non-k3d clusters, substitute storageClassName with your cluster's provisioner.
# D24: storageClassName operator-editable — edit the field below for non-k3d substrates.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: olam-auth-data
namespace: olam
labels:
app: olam-auth-service
olam.io/component: peripheral
spec:
accessModes:
- ReadWriteOnce
# D24: operator-editable. k3d default is local-path. Change for non-k3d substrates.
storageClassName: local-path
resources:
requests:
# D25: auth-service PVC size 5Gi.
storage: 5Gi
# Deployment for olam-auth-service.
#
# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
# Digest resolves to ghcr.io/pleri/olam-auth:latest (multi-arch index).
# NOTE (B1): image name is olam-auth (NOT olam-auth-service) — matches the
# actual GHCR package name published by release.yml publish-auth job.
# To update: resolve the new tag's digest via:
# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-auth:pull&service=ghcr.io" | jq -r .token)
# curl -sI -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
# https://ghcr.io/v2/pleri/olam-auth/manifests/<tag> | grep docker-content-digest
# Or use: node scripts/refresh-manifest-digests.mjs
#
# securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot,
# readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs.
#
# D17: auth-service does NOT mount /var/run/docker.sock (Phase 2 k8s pods
# cannot reach docker.sock — no hostPath socket mount).
#
# chown-data init container: grants UID-1000 write access on the freshly-
# provisioned PV (fsGroup alone is insufficient for local-path PVs).
apiVersion: apps/v1
kind: Deployment
metadata:
name: olam-auth-service
namespace: olam
labels:
app: olam-auth-service
olam.io/component: peripheral
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: olam-auth-service
template:
metadata:
labels:
app: olam-auth-service
spec:
# B9 (round 2 recovery): disable k8s automatic Service env injection.
# See packages/host-cp/k8s/manifests/host-cp/50-deployment.yaml for rationale.
enableServiceLinks: false
# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
# created by `olam upgrade` step 0.4 when GH_TOKEN is available.
imagePullSecrets:
- name: ghcr-pull
serviceAccountName: olam-auth-service
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
initContainers:
- name: chown-data
# busybox:1.36 — sha256-pinned per T4 threat model.
image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
imagePullPolicy: IfNotPresent
securityContext:
runAsUser: 0
runAsNonRoot: false
allowPrivilegeEscalation: false
command: ["chown", "-R", "1000:1000", "/data"]
volumeMounts:
- name: auth-data
mountPath: /data
containers:
- name: olam-auth-service
image: ghcr.io/pleri/olam-auth@sha256:770ee97ee4d06d2c1b6512ba99421a5fe312393d592df1684fd0d03b3476ff10
imagePullPolicy: IfNotPresent
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
ports:
- name: http
containerPort: 9999
protocol: TCP
envFrom:
- configMapRef:
name: olam-auth-service-env
- secretRef:
name: olam-auth-service-secret
volumeMounts:
- name: auth-data
mountPath: /data
- name: tmp
mountPath: /tmp
readinessProbe:
httpGet:
path: /health
port: 9999
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 6
livenessProbe:
httpGet:
path: /health
port: 9999
initialDelaySeconds: 30
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
requests:
cpu: "50m"
memory: "128Mi"
limits:
cpu: "500m"
memory: "512Mi"
volumes:
- name: auth-data
persistentVolumeClaim:
claimName: olam-auth-data
- name: tmp
emptyDir: {}
# ClusterIP Service for olam-auth-service.
# Port 9999 — consumed by host-cp and other peripherals via cluster-internal DNS.
# Operator surfaces externally via:
# kubectl port-forward -n olam svc/olam-auth-service 9999:9999
apiVersion: v1
kind: Service
metadata:
name: olam-auth-service
namespace: olam
labels:
app: olam-auth-service
olam.io/component: peripheral
spec:
type: ClusterIP
selector:
app: olam-auth-service
ports:
- name: http
port: 9999
targetPort: 9999
protocol: TCP
# ServiceAccount for olam-chunks-electric.
#
# Referenced by the olam-chunks-electric Deployment (serviceAccountName) and
# by the RoleBinding of the same name. The Role it is bound to has an empty
# rules list, so this identity carries no Kubernetes API permissions.
apiVersion: v1
kind: ServiceAccount
metadata:
name: olam-chunks-electric
namespace: olam
labels:
app: olam-chunks-electric
olam.io/component: substrate
# Electric does not call the Kubernetes API. Empty Role kept for layout parity.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: olam-chunks-electric
namespace: olam
labels:
app: olam-chunks-electric
olam.io/component: substrate
rules: []
---
# RoleBinding: binds the olam-chunks-electric Role (rules: []) to the
# olam-chunks-electric ServiceAccount in the olam namespace. Because the Role
# is empty, the binding grants nothing; it exists alongside the Role for
# layout parity with the other services.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: olam-chunks-electric
namespace: olam
labels:
app: olam-chunks-electric
olam.io/component: substrate
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: olam-chunks-electric
subjects:
- kind: ServiceAccount
name: olam-chunks-electric
namespace: olam
# ConfigMap for olam-chunks-electric.
#
# ELECTRIC_INSECURE=true disables Electric's API-secret-token gate. Acceptable
# in a single-operator local-dev k3d cluster (the Service is ClusterIP — no
# external reachability). For multi-tenant deploys, set ELECTRIC_INSECURE=false
# and provision ELECTRIC_SECRET via a Secret instead.
#
# DATABASE_URL is composed at runtime in the Deployment via env: composition
# referencing the chunks-postgres Secret (POSTGRES_PASSWORD). It is NOT
# stored here.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-chunks-electric-env
namespace: olam
labels:
app: olam-chunks-electric
olam.io/component: substrate
data:
ELECTRIC_INSECURE: "true"
ELECTRIC_PORT: "3000"
ELECTRIC_HTTP_API_PORT: "3000"
ELECTRIC_LOG_LEVEL: "info"
# Electric's HTTP server state lives in-memory + the replication slot lives on
# Postgres. No persistent state required, but a small PVC is kept for parity
# with other peripherals — Electric writes its persisted-shape index to
# /app/persistent by default; PVC backs that path.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: olam-chunks-electric-data
namespace: olam
labels:
app: olam-chunks-electric
olam.io/component: substrate
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-path
resources:
requests:
storage: 1Gi
# Deployment for olam-chunks-electric.
#
# Electric SQL — Postgres logical-replication → HTTP long-poll shape proxy.
# Single replica (replication slot is single-writer).
#
# Image: electricsql/electric:1.6.8 — sha256-pinned per T4 threat model.
# Resolves to the same digest as :latest at 2026-05-27; refresh when the
# upstream cuts a new release that closes a security advisory.
apiVersion: apps/v1
kind: Deployment
metadata:
name: olam-chunks-electric
namespace: olam
labels:
app: olam-chunks-electric
olam.io/component: substrate
spec:
replicas: 1
strategy:
# Recreate (NOT RollingUpdate) — Electric holds a postgres replication
# slot; two pods running at once would fight for the same slot and one
# would crashloop.
type: Recreate
selector:
matchLabels:
app: olam-chunks-electric
template:
metadata:
labels:
app: olam-chunks-electric
spec:
enableServiceLinks: false
serviceAccountName: olam-chunks-electric
containers:
- name: electric
image: electricsql/electric:1.6.8@sha256:a716f2affde44d5b991bdd1492876d9d6bddbcae5c98411327614575cd8f9eec
imagePullPolicy: IfNotPresent
ports:
- name: http
containerPort: 3000
protocol: TCP
envFrom:
- configMapRef:
name: olam-chunks-electric-env
env:
# DATABASE_URL composition. POSTGRES_PASSWORD is sourced from the
# chunks-postgres Secret (rendered by k8s-secret-render.ts).
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: olam-chunks-postgres-secret
key: POSTGRES_PASSWORD
- name: DATABASE_URL
value: "postgres://postgres:$(POSTGRES_PASSWORD)@olam-chunks-postgres.olam.svc.cluster.local:5432/chunks?sslmode=disable"
volumeMounts:
- name: persistent
mountPath: /app/persistent
readinessProbe:
httpGet:
path: /v1/health
port: 3000
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 12
livenessProbe:
httpGet:
path: /v1/health
port: 3000
initialDelaySeconds: 60
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
requests:
cpu: "100m"
memory: "256Mi"
limits:
cpu: "1000m"
memory: "1Gi"
volumes:
- name: persistent
persistentVolumeClaim:
claimName: olam-chunks-electric-data
# ClusterIP Service for olam-chunks-electric.
#
# Exposes port 3000 (named http) and forwards to targetPort 3000 on pods
# labelled app: olam-chunks-electric — the containerPort declared by the
# olam-chunks-electric Deployment. ClusterIP means the Service is reachable
# only from inside the cluster.
apiVersion: v1
kind: Service
metadata:
name: olam-chunks-electric
namespace: olam
labels:
app: olam-chunks-electric
olam.io/component: substrate
spec:
type: ClusterIP
selector:
app: olam-chunks-electric
ports:
- name: http
port: 3000
targetPort: 3000
protocol: TCP
# ServiceAccount for olam-chunks-postgres.
#
# Subject of the olam-chunks-postgres RoleBinding, whose Role has an empty
# rules list (no Kubernetes API permissions).
# NOTE(review): presumably referenced by the olam-chunks-postgres StatefulSet
# via serviceAccountName — confirm against the StatefulSet manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
name: olam-chunks-postgres
namespace: olam
labels:
app: olam-chunks-postgres
olam.io/component: substrate
# Minimal-privilege RBAC for chunks-postgres. The pod does not call the
# Kubernetes API; this Role exists to make the per-service apply order
# (10/20/30/45/50/60) uniform across peripherals + substrate.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: olam-chunks-postgres
namespace: olam
labels:
app: olam-chunks-postgres
olam.io/component: substrate
rules: []
---
# RoleBinding: binds the olam-chunks-postgres Role (rules: []) to the
# olam-chunks-postgres ServiceAccount in the olam namespace. The Role is
# empty, so the binding grants no permissions; it keeps the per-service
# manifest layout uniform.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: olam-chunks-postgres
namespace: olam
labels:
app: olam-chunks-postgres
olam.io/component: substrate
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: olam-chunks-postgres
subjects:
- kind: ServiceAccount
name: olam-chunks-postgres
namespace: olam
# ConfigMap for olam-chunks-postgres.
#
# Two ConfigMaps in one file:
#
# 1. olam-chunks-postgres-env — non-secret env vars (POSTGRES_USER, POSTGRES_DB).
# POSTGRES_PASSWORD lives in the Secret rendered by
# packages/cli/src/lib/k8s-secret-render.ts.
#
# 2. olam-chunks-postgres-initdb-sql — the chunks schema. Mounted at
# /docker-entrypoint-initdb.d/01-chunks.sql so
# the postgres image's entrypoint auto-applies it
# on FIRST init (empty data dir). Subsequent
# restarts skip the directory by design.
#
# Source-of-truth: packages/chunks/src/schema.ts
# (SCHEMA_SQL export). The CI gate
# `audit:chunks-schema-parity` (follow-up) will
# fail when this ConfigMap drifts from
# SCHEMA_VERSION-tagged schema.ts.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-chunks-postgres-env
namespace: olam
labels:
app: olam-chunks-postgres
olam.io/component: substrate
data:
POSTGRES_USER: "postgres"
POSTGRES_DB: "chunks"
# PGDATA must point at a subdirectory of the PVC mount, not its root —
# the PVC root may carry the local-path provisioner's lost+found dir,
# which postgres's initdb rejects ("data directory not empty").
PGDATA: "/var/lib/postgresql/data/pgdata"
---
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-chunks-postgres-initdb-sql
namespace: olam
labels:
app: olam-chunks-postgres
olam.io/component: substrate
data:
# MIRRORS packages/chunks/src/schema.ts SCHEMA_VERSION=2.
# Idempotent: CREATE TABLE IF NOT EXISTS / ADD COLUMN IF NOT EXISTS /
# DO blocks with EXCEPTION-WHEN-{undefined_object,duplicate_object}.
01-chunks.sql: |
CREATE TABLE IF NOT EXISTS chunks (
world_id TEXT NOT NULL,
session_id TEXT NOT NULL,
message_id TEXT NOT NULL,
seq INTEGER NOT NULL,
actor_id TEXT NOT NULL,
actor_type TEXT NOT NULL CHECK (actor_type IN ('agent', 'operator', 'codex', 'system')),
role TEXT NOT NULL CHECK (role IN ('user', 'assistant', 'tool', 'system')),
chunk TEXT NOT NULL,
chunk_type TEXT NOT NULL DEFAULT 'text' CHECK (chunk_type IN ('text', 'tool_use', 'goal_mode_assumption', 'dispatch_overflow')),
created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
PRIMARY KEY (message_id, seq)
);
ALTER TABLE chunks ADD COLUMN IF NOT EXISTS chunk_type TEXT NOT NULL DEFAULT 'text';
DO $$ BEGIN
ALTER TABLE chunks DROP CONSTRAINT IF EXISTS chunks_chunk_type_check;
EXCEPTION WHEN undefined_object THEN NULL;
END $$;
DO $$ BEGIN
ALTER TABLE chunks ADD CONSTRAINT chunks_chunk_type_check
CHECK (chunk_type IN ('text', 'tool_use', 'goal_mode_assumption', 'dispatch_overflow'));
EXCEPTION WHEN duplicate_object THEN NULL;
END $$;
CREATE INDEX IF NOT EXISTS chunks_world_session_seq
ON chunks (world_id, session_id, seq);
CREATE INDEX IF NOT EXISTS chunks_world_session_created
ON chunks (world_id, session_id, created_at);
CREATE INDEX IF NOT EXISTS idx_chunks_planning
ON chunks (session_id, seq)
WHERE world_id = '_planning';
CREATE TABLE IF NOT EXISTS planning_sessions (
session_id TEXT PRIMARY KEY,
actor_id TEXT NOT NULL,
summary TEXT,
crystallize_status TEXT NOT NULL DEFAULT 'open'
CHECK (crystallize_status IN ('open', 'in_progress', 'crystallized', 'failed', 'abandoned')),
crystallized_world_id TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_planning_sessions_created_at
ON planning_sessions (created_at DESC);
ALTER TABLE planning_sessions ADD COLUMN IF NOT EXISTS session_source TEXT;
CREATE OR REPLACE FUNCTION chunks_append_only_trigger()
RETURNS trigger AS $body$
BEGIN
RAISE EXCEPTION 'chunks is append-only; % forbidden', TG_OP;
END;
$body$ LANGUAGE plpgsql;
DROP TRIGGER IF EXISTS chunks_no_update ON chunks;
CREATE TRIGGER chunks_no_update
BEFORE UPDATE ON chunks
FOR EACH ROW EXECUTE FUNCTION chunks_append_only_trigger();
DROP TRIGGER IF EXISTS chunks_no_delete ON chunks;
CREATE TRIGGER chunks_no_delete
BEFORE DELETE ON chunks
FOR EACH ROW EXECUTE FUNCTION chunks_append_only_trigger();
CREATE TABLE IF NOT EXISTS message_usage (
world_id TEXT NOT NULL,
session_id TEXT NOT NULL,
message_id TEXT NOT NULL,
actor_id TEXT NOT NULL,
model TEXT NOT NULL,
input_tokens INTEGER NOT NULL DEFAULT 0,
output_tokens INTEGER NOT NULL DEFAULT 0,
cache_read_tokens INTEGER NOT NULL DEFAULT 0,
cache_create_tokens INTEGER NOT NULL DEFAULT 0,
created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
PRIMARY KEY (message_id, actor_id)
);
CREATE INDEX IF NOT EXISTS message_usage_session_created
ON message_usage (session_id, created_at);
CREATE OR REPLACE FUNCTION message_usage_append_only_trigger()
RETURNS trigger AS $body$
BEGIN
RAISE EXCEPTION 'message_usage is append-only; % forbidden', TG_OP;
END;
$body$ LANGUAGE plpgsql;
DROP TRIGGER IF EXISTS message_usage_no_update ON message_usage;
CREATE TRIGGER message_usage_no_update
BEFORE UPDATE ON message_usage
FOR EACH ROW EXECUTE FUNCTION message_usage_append_only_trigger();
DROP TRIGGER IF EXISTS message_usage_no_delete ON message_usage;
CREATE TRIGGER message_usage_no_delete
BEFORE DELETE ON message_usage
FOR EACH ROW EXECUTE FUNCTION message_usage_append_only_trigger();
-- planning_artifacts: artifacts attached to a planning session, keyed by id.
-- Unlike chunks and message_usage this table is mutable: the
-- planning_artifacts_touch trigger in this script refreshes updated_at on UPDATE.
CREATE TABLE IF NOT EXISTS planning_artifacts (
id TEXT PRIMARY KEY,
world_id TEXT NOT NULL,
session_id TEXT NOT NULL,
-- Closed set of artifact kinds, and any other value is rejected by the CHECK.
type TEXT NOT NULL CHECK (type IN ('commit_plan', 'component_scaffold', 'design_jam')),
title TEXT NOT NULL,
body JSONB NOT NULL,
-- Lifecycle state. New rows start as 'open' unless a status is supplied.
status TEXT NOT NULL DEFAULT 'open'
CHECK (status IN ('open', 'crystallized', 'failed', 'archived')),
-- Both columns below are nullable.
-- NOTE(review): presumably populated once an artifact is crystallized — confirm against the writer.
linear_issue_url TEXT,
crystallized_world_id TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Composite index on (session_id, created_at), usable for per-session scans in creation order.
CREATE INDEX IF NOT EXISTS idx_planning_artifacts_session
ON planning_artifacts (session_id, created_at);
-- Composite index on (world_id, status), usable for per-world lookups filtered by status.
CREATE INDEX IF NOT EXISTS idx_planning_artifacts_world
ON planning_artifacts (world_id, status);
-- Keeps planning_artifacts.updated_at current.
--
-- Runs BEFORE each row UPDATE: overwrites updated_at on the incoming row
-- with NOW() and returns that row so the update proceeds with the new
-- timestamp, regardless of what the caller supplied for the column.
CREATE OR REPLACE FUNCTION planning_artifacts_touch_updated_at()
    RETURNS trigger
    LANGUAGE plpgsql
AS $body$
BEGIN
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$body$;

-- Dropped before it is created so the script stays re-runnable.
DROP TRIGGER IF EXISTS planning_artifacts_touch ON planning_artifacts;
CREATE TRIGGER planning_artifacts_touch
    BEFORE UPDATE
    ON planning_artifacts
    FOR EACH ROW
    EXECUTE FUNCTION planning_artifacts_touch_updated_at();
# PVC for the chunks-postgres data directory.
#
# Sized 10Gi for local-dev. Chunks rows are small (~1KB each) so even a
# busy single-operator world rarely cracks 1Gi; the headroom is for the
# message_usage + planning_artifacts sidecar tables.
#
# accessModes: ReadWriteOnce — postgres is a StatefulSet with replicas=1.
# k3d's local-path provisioner only supports RWO; the in-cluster postgres
# pattern is single-writer by design (no operator-managed HA).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: olam-chunks-postgres-data
namespace: olam
labels:
app: olam-chunks-postgres
olam.io/component: substrate
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-path
resources:
requests:
storage: 10Gi
# StatefulSet for olam-chunks-postgres.
#
# Why StatefulSet vs Deployment: even with replicas=1 the StatefulSet gives
# stable network identity (olam-chunks-postgres-0 inside the headless service)
# and ordered termination semantics — both useful when Electric's replication
# slot survives pod restarts.
#
# args override: postgres requires wal_level=logical for Electric SQL's
# logical-replication subscription. The image's default postgresql.conf
# ships wal_level=replica; the -c flags passed via the container args take
# precedence. max_replication_slots / max_wal_senders are pinned explicitly
# to 10 as well — Electric holds one slot per database.
#
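# securityContext: the Debian-based postgres image runs as uid 999, and
# fsGroup=999 on the pod asks kubelet to make the PVC mount group-owned by
# 999 so postgres can write its data dir.
# NOTE(review): the image pinned below is the alpine variant, whose postgres
# user is uid/gid 70 rather than 999 — confirm whether fsGroup should be 70.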
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: olam-chunks-postgres
namespace: olam
labels:
app: olam-chunks-postgres
olam.io/component: substrate
spec:
replicas: 1
serviceName: olam-chunks-postgres
selector:
matchLabels:
app: olam-chunks-postgres
template:
metadata:
labels:
app: olam-chunks-postgres
spec:
enableServiceLinks: false
serviceAccountName: olam-chunks-postgres
securityContext:
fsGroup: 999
containers:
- name: postgres
# postgres:16-alpine — sha256-pinned per T4 threat model.
image: postgres:16-alpine@sha256:16bc17c64a573ef34162af9298258d1aec548232985b33ed7b1eac33ba35c229
imagePullPolicy: IfNotPresent
args:
- postgres
- -c
- wal_level=logical
- -c
- max_replication_slots=10
- -c
- max_wal_senders=10
ports:
- name: postgres
containerPort: 5432
protocol: TCP
envFrom:
- configMapRef:
name: olam-chunks-postgres-env
- secretRef:
name: olam-chunks-postgres-secret
volumeMounts:
- name: data
mountPath: /var/lib/postgresql/data
- name: initdb
mountPath: /docker-entrypoint-initdb.d
readOnly: true
readinessProbe:
exec:
command:
- sh
- -c
- pg_isready -U postgres -d chunks -h 127.0.0.1
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 12
livenessProbe:
exec:
command:
- sh
- -c
- pg_isready -U postgres -h 127.0.0.1
initialDelaySeconds: 30
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
requests:
cpu: "100m"
memory: "256Mi"
limits:
cpu: "1000m"
memory: "1Gi"
volumes:
- name: data
persistentVolumeClaim:
claimName: olam-chunks-postgres-data
- name: initdb
configMap:
name: olam-chunks-postgres-initdb-sql
# Headless Service for olam-chunks-postgres StatefulSet.
#
# clusterIP: None gives the StatefulSet's pod stable DNS:
# olam-chunks-postgres-0.olam-chunks-postgres.olam.svc.cluster.local
# Callers (plan-chat-service, chunks-electric) connect via the shorter
# olam-chunks-postgres.olam.svc.cluster.local form, which cluster DNS
# resolves directly to the IP of the single backing pod (a headless Service
# has no virtual IP and does no load-balancing).
apiVersion: v1
kind: Service
metadata:
name: olam-chunks-postgres
namespace: olam
labels:
app: olam-chunks-postgres
olam.io/component: substrate
spec:
clusterIP: None
selector:
app: olam-chunks-postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
protocol: TCP
# ExternalName Service for the host-side docker-socket-proxy.
#
# Provides in-cluster DNS for pods to reach the host-side proxy
# container (defined in packages/host-cp/k8s/host-side/docker-socket-proxy.compose.yaml).
# The Service has NO backing Pod — `type: ExternalName` is a kube-dns
# CNAME alias to `host.k3d.internal`, the gateway address that k3d
# auto-provisions inside every node container.
#
# Decision #7 (round-4 plan pass 2): Universal across all k8s substrates
# (macOS+colima+virtiofs, Linux native k3d, WSL2). One codepath; the
# per-Pod cost of running an in-cluster proxy elsewhere is invisible
# against the maintenance tax of OS-conditional Service generation.
#
# Why ExternalName and not in-cluster Pod with hostPath:
# the in-cluster Pod would itself need to bind /var/run/docker.sock
# from the lima VM, hitting the same virtiofs ENOTSUP class that
# R4-W2-F is. The proxy must live OUTSIDE the k3d cluster, on the
# operator's colima docker daemon. ExternalName makes that
# transparent to consumers: host-cp configures
# { host: 'docker-socket-proxy', port: 2375 } regardless of where
# the actual proxy container lives.
apiVersion: v1
kind: Service
metadata:
name: docker-socket-proxy
namespace: olam
labels:
app: docker-socket-proxy
olam.io/component: host-stack
spec:
type: ExternalName
externalName: host.k3d.internal
ports:
- name: tcp-2375
port: 2375
targetPort: 2375
protocol: TCP
apiVersion: v1
kind: ServiceAccount
metadata:
name: olam-kg-service
namespace: olam
labels:
app: olam-kg-service
olam.io/component: peripheral
# Phase 1a Decision 19: Role scoped to resourceNames: ["olam-kg-service"] on
# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
# could patch ANY Deployment in the namespace. This is the load-bearing
# security guardrail — preserve verbatim.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: olam-kg-service
namespace: olam
labels:
app: olam-kg-service
olam.io/component: peripheral
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
resourceNames: ["olam-kg-service"]
verbs: ["get", "patch", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: olam-kg-service
namespace: olam
labels:
app: olam-kg-service
olam.io/component: peripheral
subjects:
- kind: ServiceAccount
name: olam-kg-service
namespace: olam
roleRef:
kind: Role
name: olam-kg-service
apiGroup: rbac.authorization.k8s.io
# ConfigMap for olam-kg-service environment. Sensitive values live in
# the Secret (see templates/kg-service-secret-template.yaml).
# Operators apply the Secret separately before applying the manifests.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-kg-service-env
namespace: olam
labels:
app: olam-kg-service
olam.io/component: peripheral
data:
# Port kg-service listens on. Must match 60-service.yaml targetPort.
OLAM_KG_PORT: "9997"
# Data directory — backed by the PVC mounted at /data.
OLAM_KG_DATA_PATH: "/data/kg"
# URL of auth-service (cluster-internal DNS). Override in non-k3d environments.
OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
# R3-B (Decision R3-#2): kg-service source (server.py) uses OLAM_KG_SERVICE_BIND
# and defaults to 127.0.0.1. In k8s the readiness and liveness probes hit the
# pod IP, so a 127.0.0.1-only listener never becomes Ready and is restarted
# by the failing liveness probe (CrashLoopBackOff). This ConfigMap override
# forces an all-interfaces bind without requiring an image rebuild.
OLAM_KG_SERVICE_BIND: "0.0.0.0"
# PersistentVolumeClaim for olam-kg-service /data volume.
#
# Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml
# for the full rationale (fsGroup, k3d node filesystem, etc.).
#
# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
# On non-k3d clusters, substitute storageClassName with your cluster's provisioner.
# D24: storageClassName operator-editable — edit the field below for non-k3d substrates.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: olam-kg-data
namespace: olam
labels:
app: olam-kg-service
olam.io/component: peripheral
spec:
accessModes:
- ReadWriteOnce
# D24: operator-editable. k3d default is local-path. Change for non-k3d substrates.
storageClassName: local-path
resources:
requests:
# D25: kg-service PVC size 10Gi (larger: graph index grows with codebase).
storage: 10Gi
# Deployment for olam-kg-service.
#
# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
# Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.0 (multi-arch index).
# To update: resolve the new tag's digest via:
# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-kg-service:pull&service=ghcr.io" | jq -r .token)
# curl -sI -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
# https://ghcr.io/v2/pleri/olam-kg-service/manifests/<tag> | grep docker-content-digest
#
# securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot,
# readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs.
apiVersion: apps/v1
kind: Deployment
metadata:
name: olam-kg-service
namespace: olam
labels:
app: olam-kg-service
olam.io/component: peripheral
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: olam-kg-service
template:
metadata:
labels:
app: olam-kg-service
spec:
# B9 (round 2 recovery): disable k8s automatic Service env injection.
# See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale.
enableServiceLinks: false
# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
# created by `olam upgrade` step 0.4 when GH_TOKEN is available.
imagePullSecrets:
- name: ghcr-pull
serviceAccountName: olam-kg-service
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
initContainers:
- name: chown-data
# busybox:1.36 — sha256-pinned per T4 threat model.
image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
imagePullPolicy: IfNotPresent
securityContext:
runAsUser: 0
runAsNonRoot: false
allowPrivilegeEscalation: false
command: ["chown", "-R", "1000:1000", "/data"]
volumeMounts:
- name: kg-data
mountPath: /data
containers:
- name: olam-kg-service
image: ghcr.io/pleri/olam-kg-service@sha256:f97ee90fe1bd5b12cb56d5fbf0d3085c301bb7abeef0dd28d2b2a5c90ab6efbb
imagePullPolicy: IfNotPresent
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
ports:
- name: http
containerPort: 9997
protocol: TCP
envFrom:
- configMapRef:
name: olam-kg-service-env
- secretRef:
name: olam-kg-service-secret
volumeMounts:
- name: kg-data
mountPath: /data
- name: tmp
mountPath: /tmp
readinessProbe:
httpGet:
path: /health
port: 9997
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 6
livenessProbe:
httpGet:
path: /health
port: 9997
initialDelaySeconds: 30
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
requests:
cpu: "100m"
memory: "256Mi"
limits:
cpu: "1000m"
memory: "1Gi"
volumes:
- name: kg-data
persistentVolumeClaim:
claimName: olam-kg-data
- name: tmp
emptyDir: {}
# ClusterIP Service for olam-kg-service.
# Port 9997 — consumed by agents and host-cp via cluster-internal DNS.
# Operator surfaces externally via:
# kubectl port-forward -n olam svc/olam-kg-service 9997:9997
apiVersion: v1
kind: Service
metadata:
name: olam-kg-service
namespace: olam
labels:
app: olam-kg-service
olam.io/component: peripheral
spec:
type: ClusterIP
selector:
app: olam-kg-service
ports:
- name: http
port: 9997
targetPort: 9997
protocol: TCP
apiVersion: v1
kind: ServiceAccount
metadata:
name: olam-mcp-auth-service
namespace: olam
labels:
app: olam-mcp-auth-service
olam.io/component: peripheral
# Phase 1a Decision 19: Role scoped to resourceNames: ["olam-mcp-auth-service"] on
# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
# could patch ANY Deployment in the namespace. This is the load-bearing
# security guardrail — preserve verbatim.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: olam-mcp-auth-service
namespace: olam
labels:
app: olam-mcp-auth-service
olam.io/component: peripheral
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
resourceNames: ["olam-mcp-auth-service"]
verbs: ["get", "patch", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: olam-mcp-auth-service
namespace: olam
labels:
app: olam-mcp-auth-service
olam.io/component: peripheral
subjects:
- kind: ServiceAccount
name: olam-mcp-auth-service
namespace: olam
roleRef:
kind: Role
name: olam-mcp-auth-service
apiGroup: rbac.authorization.k8s.io
# ConfigMap for olam-mcp-auth-service environment. Sensitive values live in
# the Secret (see templates/mcp-auth-service-secret-template.yaml).
# Operators apply the Secret separately before applying the manifests.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-mcp-auth-service-env
namespace: olam
labels:
app: olam-mcp-auth-service
olam.io/component: peripheral
data:
# Port mcp-auth-service listens on. Must match 60-service.yaml targetPort.
OLAM_MCP_AUTH_PORT: "9998"
# Data directory — backed by the PVC mounted at /data.
OLAM_MCP_AUTH_DATA_PATH: "/data/mcp-auth"
# URL of auth-service (cluster-internal DNS). Override in non-k3d environments.
OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
# R3-B defensive (Decision R3-#2): mcp-auth-service source already defaults to
# 0.0.0.0 (MCP_AUTH_BIND env var) but ConfigMap override is explicit defense
# against a future image regression reverting to 127.0.0.1.
MCP_AUTH_BIND: "0.0.0.0"
# PersistentVolumeClaim for olam-mcp-auth-service /data volume.
#
# Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml
# for the full rationale (fsGroup, k3d node filesystem, etc.).
#
# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
# On non-k3d clusters, substitute storageClassName with your cluster's provisioner.
# D24: storageClassName operator-editable — edit the field below for non-k3d substrates.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: olam-mcp-auth-data
namespace: olam
labels:
app: olam-mcp-auth-service
olam.io/component: peripheral
spec:
accessModes:
- ReadWriteOnce
# D24: operator-editable. k3d default is local-path. Change for non-k3d substrates.
storageClassName: local-path
resources:
requests:
# D25: mcp-auth-service PVC size 5Gi.
storage: 5Gi
# Deployment for olam-mcp-auth-service.
#
# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
# Digest resolves to ghcr.io/pleri/olam-mcp-auth:latest (multi-arch index).
# NOTE (B1): image name is olam-mcp-auth (NOT olam-mcp-auth-service) — matches the
# actual GHCR package name published by release.yml publish-mcp-auth job.
# To update: resolve the new tag's digest via:
# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-mcp-auth:pull&service=ghcr.io" | jq -r .token)
# curl -sI -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
# https://ghcr.io/v2/pleri/olam-mcp-auth/manifests/<tag> | grep docker-content-digest
# Or use: node scripts/refresh-manifest-digests.mjs
#
# securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot,
# readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs.
#
# D17 (LOAD-BEARING): mcp-auth-service MUST NOT mount /var/run/docker.sock.
# Phase 2 architecture: k8s pods cannot reach docker.sock. No hostPath socket
# mount here — mcp-auth-service authenticates MCP clients via JWT, not Docker.
apiVersion: apps/v1
kind: Deployment
metadata:
name: olam-mcp-auth-service
namespace: olam
labels:
app: olam-mcp-auth-service
olam.io/component: peripheral
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: olam-mcp-auth-service
template:
metadata:
labels:
app: olam-mcp-auth-service
spec:
# B9 (round 2 recovery): disable k8s automatic Service env injection.
# See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale.
enableServiceLinks: false
# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
# created by `olam upgrade` step 0.4 when GH_TOKEN is available.
imagePullSecrets:
- name: ghcr-pull
serviceAccountName: olam-mcp-auth-service
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
initContainers:
- name: chown-data
# busybox:1.36 — sha256-pinned per T4 threat model.
image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
imagePullPolicy: IfNotPresent
securityContext:
runAsUser: 0
runAsNonRoot: false
allowPrivilegeEscalation: false
command: ["chown", "-R", "1000:1000", "/data"]
volumeMounts:
- name: mcp-auth-data
mountPath: /data
containers:
- name: olam-mcp-auth-service
image: ghcr.io/pleri/olam-mcp-auth@sha256:eaac2164349e388a70dae0d86c34132f97aa74177a2376cdfa10732e8eadb507
imagePullPolicy: IfNotPresent
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
ports:
- name: http
containerPort: 9998
protocol: TCP
envFrom:
- configMapRef:
name: olam-mcp-auth-service-env
- secretRef:
name: olam-mcp-auth-service-secret
volumeMounts:
- name: mcp-auth-data
mountPath: /data
- name: tmp
mountPath: /tmp
readinessProbe:
httpGet:
path: /health
port: 9998
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 6
livenessProbe:
httpGet:
path: /health
port: 9998
initialDelaySeconds: 30
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
requests:
cpu: "50m"
memory: "128Mi"
limits:
cpu: "500m"
memory: "512Mi"
volumes:
- name: mcp-auth-data
persistentVolumeClaim:
claimName: olam-mcp-auth-data
- name: tmp
emptyDir: {}
# D17 (LOAD-BEARING): NO docker.sock volume or hostPath mount here.
# mcp-auth-service does not need Docker access in Phase 2 k8s architecture.
# ClusterIP Service for olam-mcp-auth-service.
# Port 9998 — consumed by other peripherals and host-cp via cluster-internal DNS.
# Operator surfaces externally via:
# kubectl port-forward -n olam svc/olam-mcp-auth-service 9998:9998
apiVersion: v1
kind: Service
metadata:
name: olam-mcp-auth-service
namespace: olam
labels:
app: olam-mcp-auth-service
olam.io/component: peripheral
spec:
type: ClusterIP
selector:
app: olam-mcp-auth-service
ports:
- name: http
port: 9998
targetPort: 9998
protocol: TCP
apiVersion: v1
kind: ServiceAccount
metadata:
name: olam-memory-service
namespace: olam
labels:
app: olam-memory-service
olam.io/component: peripheral
# Phase 1a Decision 19: Role scoped to resourceNames: ["olam-memory-service"] on
# apps/v1 deployments. Without this scope, the in-cluster ServiceAccount
# could patch ANY Deployment in the namespace. This is the load-bearing
# security guardrail — preserve verbatim.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: olam-memory-service
namespace: olam
labels:
app: olam-memory-service
olam.io/component: peripheral
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
resourceNames: ["olam-memory-service"]
verbs: ["get", "patch", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: olam-memory-service
namespace: olam
labels:
app: olam-memory-service
olam.io/component: peripheral
subjects:
- kind: ServiceAccount
name: olam-memory-service
namespace: olam
roleRef:
kind: Role
name: olam-memory-service
apiGroup: rbac.authorization.k8s.io
# ConfigMap for olam-memory-service environment. Sensitive values live in
# the Secret (see templates/memory-service-secret-template.yaml).
# Operators apply the Secret separately before applying the manifests.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-memory-service-env
namespace: olam
labels:
app: olam-memory-service
olam.io/component: peripheral
data:
# Port memory-service listens on. Must match 60-service.yaml targetPort.
OLAM_MEMORY_PORT: "3111"
# Data directory — backed by the PVC mounted at /data.
OLAM_MEMORY_DATA_PATH: "/data/memory"
# URL of auth-service (cluster-internal DNS). Override in non-k3d environments.
OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
# Health path exposed at /agentmemory/livez (D15 — do not change).
OLAM_MEMORY_HEALTH_PATH: "/agentmemory/livez"
# R3-B defensive (Decision R3-#2): memory-service Dockerfile already sets
# AGENTMEMORY_HOST=0.0.0.0 but ConfigMap override is explicit defense against
# a future image regression reverting to 127.0.0.1.
AGENTMEMORY_HOST: "0.0.0.0"
# III_REST_PORT is the env var the agentmemory CLI wrapper reads when it
# polls its iii subprocess for readiness (cli.mjs:155 — `process.env
# ["III_REST_PORT"] || "3111"`). The iii engine itself binds the port
# declared in iii-config.yaml's iii-http worker (overridden via the
# olam-memory-service-iii-config ConfigMap to 3110, so it does not
# collide with the metrics-proxy on 3111). Without this env var the
# wrapper polls 3111 forever, prints "iii-engine did not become ready",
# and exits — entrypoint propagates the exit, container restarts, and
# the liveness probe returns 502 from the proxy (its backend was never
# up). Must equal the iii-http port in 35-configmap-iii-config.yaml.
III_REST_PORT: "3110"
# Overrides the iii-config.yaml shipped inside the agentmemory image so the
# iii engine binds the INTERNAL port (3110) instead of the EXTERNAL port
# (3111). The shipped yaml hardcodes `port: 3111` and the agentmemory CLI
# reads its bind from yaml (NOT from the AGENTMEMORY_PORT env var), so
# entrypoint.sh's `AGENTMEMORY_PORT=3110` override has no effect.
#
# Without this override, the engine and the metrics-proxy both try to bind
# 0.0.0.0:3111. The proxy starts first and wins the port; the engine fails
# silently. Probes to /agentmemory/livez hit the proxy and get forwarded to
# 127.0.0.1:3110, where nothing is listening — proxy returns 502, readiness
# fails, container restarts.
#
# Mounted at /usr/local/lib/node_modules/@agentmemory/agentmemory/dist/iii-config.yaml
# via subPath in 50-deployment.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-memory-service-iii-config
namespace: olam
labels:
app: olam-memory-service
olam.io/component: peripheral
data:
iii-config.yaml: |
workers:
- name: iii-http
config:
port: 3110
host: 0.0.0.0
default_timeout: 180000
cors:
allowed_origins: ["http://localhost:3111", "http://localhost:3113", "http://127.0.0.1:3111", "http://127.0.0.1:3113"]
allowed_methods: [GET, POST, PUT, DELETE, OPTIONS]
- name: iii-state
config:
adapter:
name: kv
config:
store_method: file_based
file_path: ./data/state_store.db
- name: iii-queue
config:
adapter:
name: builtin
- name: iii-pubsub
config:
adapter:
name: local
- name: iii-cron
config:
adapter:
name: kv
- name: iii-stream
config:
port: 3112
host: 0.0.0.0
adapter:
name: kv
config:
store_method: file_based
file_path: ./data/stream_store
- name: iii-observability
config:
enabled: true
service_name: agentmemory
exporter: memory
sampling_ratio: 1.0
metrics_enabled: true
logs_enabled: true
logs_console_output: true
- name: iii-exec
config:
watch:
- src/**/*.ts
exec:
- node dist/index.mjs
# PersistentVolumeClaim for olam-memory-service /data volume.
#
# Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml
# for the full rationale (fsGroup, k3d node filesystem, etc.).
#
# local-path StorageClass ships with k3d by default (rancher/local-path-provisioner).
# On non-k3d clusters, substitute storageClassName with your cluster's provisioner.
# D24: storageClassName operator-editable — edit the field below for non-k3d substrates.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: olam-memory-data
namespace: olam
labels:
app: olam-memory-service
olam.io/component: peripheral
spec:
accessModes:
- ReadWriteOnce
# D24: operator-editable. k3d default is local-path. Change for non-k3d substrates.
storageClassName: local-path
resources:
requests:
# D25: memory-service PVC size 5Gi.
storage: 5Gi
# Deployment for olam-memory-service.
#
# Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
# Digest resolves to ghcr.io/pleri/olam-memory-service:0.1.0 (multi-arch index).
# To update: resolve the new tag's digest via:
# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-memory-service:pull&service=ghcr.io" | jq -r .token)
# curl -sI -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
# https://ghcr.io/v2/pleri/olam-memory-service/manifests/<tag> | grep docker-content-digest
#
# securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot,
# readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs.
#
# D15 (LOAD-BEARING): readinessProbe and livenessProbe path MUST be
# /agentmemory/livez (not /health). Source: DEFAULT_HEALTH_PATH in
# packages/core/src/services-status/memory-probe.ts:18.
apiVersion: apps/v1
kind: Deployment
metadata:
name: olam-memory-service
namespace: olam
labels:
app: olam-memory-service
olam.io/component: peripheral
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: olam-memory-service
template:
metadata:
labels:
app: olam-memory-service
spec:
# B9 (round 2 recovery): disable k8s automatic Service env injection.
# See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale.
enableServiceLinks: false
# R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret
# created by `olam upgrade` step 0.4 when GH_TOKEN is available.
imagePullSecrets:
- name: ghcr-pull
serviceAccountName: olam-memory-service
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
initContainers:
- name: chown-data
# busybox:1.36 — sha256-pinned per T4 threat model.
image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
imagePullPolicy: IfNotPresent
securityContext:
runAsUser: 0
runAsNonRoot: false
allowPrivilegeEscalation: false
command: ["chown", "-R", "1000:1000", "/data"]
volumeMounts:
- name: memory-data
mountPath: /data
containers:
- name: olam-memory-service
# image first appears on GHCR after Phase B's publish-memory-service
# job fires on the first release post-merge. Remove the
# bootstrap-placeholder comment + run `npm run refresh:manifest-digests`
# once ghcr.io/pleri/olam-memory-service has a real published digest.
# bootstrap-placeholder: pre-publish; refresh after first release
image: ghcr.io/pleri/olam-memory-service@sha256:923bff54d2ba3da162a35d3e8ebc6bd440bed6d290a5cff7bae2888281a4e003
imagePullPolicy: IfNotPresent
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
ports:
- name: http
containerPort: 3111
protocol: TCP
envFrom:
- configMapRef:
name: olam-memory-service-env
- secretRef:
name: olam-memory-service-secret
volumeMounts:
- name: memory-data
mountPath: /data
- name: tmp
mountPath: /tmp
# Overrides the shipped iii-config.yaml so the engine binds the
# internal port (3110) instead of colliding with the metrics-proxy
# on 3111. See 35-configmap-iii-config.yaml for full rationale.
- name: iii-config-override
mountPath: /usr/local/lib/node_modules/@agentmemory/agentmemory/dist/iii-config.yaml
subPath: iii-config.yaml
readOnly: true
readinessProbe:
httpGet:
# D15 (LOAD-BEARING): memory-service health path is /agentmemory/livez.
# Source: DEFAULT_HEALTH_PATH in packages/core/src/services-status/memory-probe.ts:18.
# Do NOT change to /health — that endpoint does not exist on this service.
path: /agentmemory/livez
port: 3111
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 6
livenessProbe:
httpGet:
# D15 (LOAD-BEARING): same path as readinessProbe.
path: /agentmemory/livez
port: 3111
initialDelaySeconds: 30
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
requests:
cpu: "50m"
memory: "256Mi"
limits:
cpu: "500m"
memory: "1Gi"
volumes:
- name: memory-data
persistentVolumeClaim:
claimName: olam-memory-data
- name: tmp
emptyDir: {}
- name: iii-config-override
configMap:
name: olam-memory-service-iii-config
# ClusterIP Service for olam-memory-service.
# Port 3111 — consumed by host-cp and agents via cluster-internal DNS.
# Operator surfaces externally via:
# kubectl port-forward -n olam svc/olam-memory-service 3111:3111
apiVersion: v1
kind: Service
metadata:
name: olam-memory-service
namespace: olam
labels:
app: olam-memory-service
olam.io/component: peripheral
spec:
type: ClusterIP
selector:
app: olam-memory-service
ports:
- name: http
port: 3111
targetPort: 3111
protocol: TCP
apiVersion: v1
kind: ServiceAccount
metadata:
name: olam-plan-chat-service
namespace: olam
labels:
app: olam-plan-chat-service
olam.io/component: peripheral
# plan-chat-service does not need to read or write any Kubernetes API objects.
# A no-op Role + RoleBinding documents the minimal-privilege stance and
# keeps the file present so audit:cli-bundle-k8s does not skip this peripheral.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: olam-plan-chat-service
namespace: olam
labels:
app: olam-plan-chat-service
olam.io/component: peripheral
rules: []
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: olam-plan-chat-service
namespace: olam
labels:
app: olam-plan-chat-service
olam.io/component: peripheral
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: olam-plan-chat-service
subjects:
- kind: ServiceAccount
name: olam-plan-chat-service
namespace: olam
# ConfigMap for olam-plan-chat-service.
#
# plan-chat-service.mjs (packages/host-cp/src/plan-chat-service.mjs) reads
# these env vars at startup. See the file header for the canonical names.
#
# DATABASE_URL: points at the in-cluster chunks-postgres StatefulSet's Service
# and embeds the password from olam-chunks-postgres-secret. It is
# deliberately NOT defined in this ConfigMap: the URL needs the `$(VAR)`
# substitution syntax, and kubelet only expands `$(VAR)` references in
# a container's own env: entries, never inside values loaded from a
# ConfigMap key.
#
# Instead, 50-deployment.yaml injects POSTGRES_PASSWORD via
# env: valueFrom.secretKeyRef and then assembles
# OLAM_PLAN_CHAT_DATABASE_URL in the same env: section, where
# $(POSTGRES_PASSWORD) can reference the previously declared,
# Secret-backed variable.
#
# ELECTRIC_URL: chunks-electric ClusterIP. No auth (ELECTRIC_INSECURE=true on
# that service in local-dev mode).
#
# SECRET_PATH: filesystem path where the olam-plan-chat-secret Secret is
# mounted (see volumeMounts in 50-deployment.yaml). The mount
# key is "secret" → file `/etc/olam-plan-chat/secret`.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-plan-chat-service-env
namespace: olam
labels:
app: olam-plan-chat-service
olam.io/component: peripheral
data:
OLAM_PLAN_CHAT_PORT: "3200"
OLAM_PLAN_CHAT_ELECTRIC_URL: "http://olam-chunks-electric.olam.svc.cluster.local:3000"
OLAM_PLAN_CHAT_SECRET_PATH: "/etc/olam-plan-chat/secret"
# PersistentVolumeClaim for olam-plan-chat-service /data volume.
#
# plan-chat-service is mostly stateless (DB lives in chunks-postgres, secret
# lives in olam-plan-chat-secret), but ships a /data PVC for parity with
# the other peripherals. Used for any transient state the service decides
# to spool (e.g. planning-session resumption buffers).
#
# local-path StorageClass ships with k3d by default. On non-k3d clusters,
# substitute storageClassName with your cluster's provisioner.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: olam-plan-chat-service-data
namespace: olam
labels:
app: olam-plan-chat-service
olam.io/component: peripheral
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-path
resources:
requests:
storage: 1Gi
# Deployment for olam-plan-chat-service.
#
# Image strategy: REUSES the olam-host-cp image. Per the package layout,
# plan-chat-service.mjs is a sibling under packages/host-cp/src/, and the
# host-cp image's WORKDIR=/app already contains it at /app/src/plan-chat-service.mjs.
# The single shared image avoids version-drift between the two binaries that
# share plan-chat-secret.mjs (bearer-auth logic), planning-sessions.mjs,
# crystallize-planning.mjs, and resolver.mjs.
#
# The command override replaces the host-cp default
# ENTRYPOINT (`node src/server.mjs`) with the plan-chat-service entrypoint.
#
# Image: pinned to the SAME digest as host-cp's 50-deployment.yaml. Refresh
# both in lockstep via scripts/refresh-manifest-digests.mjs on every release.
apiVersion: apps/v1
kind: Deployment
metadata:
name: olam-plan-chat-service
namespace: olam
labels:
app: olam-plan-chat-service
olam.io/component: peripheral
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: olam-plan-chat-service
template:
metadata:
labels:
app: olam-plan-chat-service
spec:
enableServiceLinks: false
imagePullSecrets:
- name: ghcr-pull
serviceAccountName: olam-plan-chat-service
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
initContainers:
# chown-data: identical to memory-service pattern. The RWO data PVC
# mounts as root-owned on local-path; this brings it to 1000:1000.
- name: chown-data
image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
imagePullPolicy: IfNotPresent
securityContext:
runAsUser: 0
runAsNonRoot: false
allowPrivilegeEscalation: false
command: ["chown", "-R", "1000:1000", "/data"]
volumeMounts:
- name: plan-chat-data
mountPath: /data
containers:
- name: olam-plan-chat-service
# Reuses the host-cp image (same source tree, same node_modules).
# Digest pinned in lockstep with packages/host-cp/k8s/manifests/50-deployment.yaml.
image: ghcr.io/pleri/olam-host-cp@sha256:20d84b6d490c633bc5a158b0f7f849152aba3cf1d2d45657360f627d8d41ec3f
imagePullPolicy: IfNotPresent
# Override the host-cp ENTRYPOINT. plan-chat-service.mjs exports
# startService(); we boot it via -e import-and-call.
command: ["node"]
args:
- "-e"
- "import('/app/src/plan-chat-service.mjs').then(m => m.startService()).catch(e => { console.error('[plan-chat-service]', e); process.exit(1); });"
workingDir: /app
securityContext:
runAsNonRoot: true
runAsUser: 1000
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
ports:
- name: http
containerPort: 3200
protocol: TCP
envFrom:
- configMapRef:
name: olam-plan-chat-service-env
env:
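# DATABASE_URL composition. Same pattern as chunks-electric. POSTGRES_PASSWORD
# must be declared before OLAM_PLAN_CHAT_DATABASE_URL: a $(VAR) reference only
# expands variables defined earlier in the same env list.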
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: olam-chunks-postgres-secret
key: POSTGRES_PASSWORD
- name: OLAM_PLAN_CHAT_DATABASE_URL
value: "postgres://postgres:$(POSTGRES_PASSWORD)@olam-chunks-postgres.olam.svc.cluster.local:5432/chunks"
volumeMounts:
- name: plan-chat-data
mountPath: /data
- name: plan-chat-secret
mountPath: /etc/olam-plan-chat
readOnly: true
readinessProbe:
httpGet:
path: /livez
port: 3200
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 12
livenessProbe:
httpGet:
path: /livez
port: 3200
initialDelaySeconds: 60
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
requests:
cpu: "50m"
memory: "256Mi"
limits:
cpu: "500m"
memory: "1Gi"
volumes:
- name: plan-chat-data
persistentVolumeClaim:
claimName: olam-plan-chat-service-data
- name: plan-chat-secret
secret:
secretName: olam-plan-chat-secret
defaultMode: 0400
items:
- key: PLAN_CHAT_SECRET
path: secret
apiVersion: v1
kind: Service
metadata:
name: olam-plan-chat-service
namespace: olam
labels:
app: olam-plan-chat-service
olam.io/component: peripheral
spec:
type: ClusterIP
selector:
app: olam-plan-chat-service
ports:
- name: http
port: 3200
targetPort: 3200
protocol: TCP
# Secret TEMPLATE for olam-host-cp.
#
# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
# the placeholder values. The placeholders are intentionally invalid; a raw
# `kubectl apply` will result in auth-service 401s rather than silently shipping
# fake credentials.
#
# Preferred substitution (keeps secrets out of git):
# kubectl create secret generic olam-host-cp-secret -n olam \
# --from-literal=OLAM_AUTH_SECRET=$(cat ~/.olam/auth-secret) \
# --from-literal=GH_TOKEN=$(gh auth token) \
# --dry-run=client -o yaml | kubectl apply -f -
#
# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
# so that `kubectl apply -f manifests/` does NOT apply it — operators must
# explicitly handle Secret provisioning before applying the manifests.
apiVersion: v1
kind: Secret
metadata:
name: olam-host-cp-secret
namespace: olam
labels:
app: olam-host-cp
olam.io/component: host-stack
type: Opaque
stringData:
# Shared bearer secret between host-cp and the long-lived olam-auth process.
# Source: cat ~/.olam/auth-secret
OLAM_AUTH_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_AUTH_SECRET"
# GitHub token for GHCR image pulls and the /api/prs endpoint.
# Source: gh auth token
GH_TOKEN: "REPLACE_ME_FROM_GH_AUTH_TOKEN"
# Secret TEMPLATE for olam-auth-service.
#
# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
# the placeholder values. The placeholders are intentionally invalid; a raw
# `kubectl apply` will result in auth failures rather than silently shipping
# fake credentials.
#
# Preferred substitution (keeps secrets out of git):
# kubectl create secret generic olam-auth-service-secret -n olam \
# --from-literal=OLAM_AUTH_DB_SECRET=$(cat ~/.olam/auth-db-secret) \
# --dry-run=client -o yaml | kubectl apply -f -
#
# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
# so that `kubectl apply -f manifests/auth-service/` does NOT apply it —
# operators must explicitly handle Secret provisioning before applying manifests.
apiVersion: v1
kind: Secret
metadata:
name: olam-auth-service-secret
namespace: olam
labels:
app: olam-auth-service
olam.io/component: peripheral
type: Opaque
stringData:
# Shared database encryption secret for the credential vault.
# Source: cat ~/.olam/auth-db-secret
OLAM_AUTH_DB_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_AUTH_DB_SECRET"
# Secret TEMPLATE for olam-chunks-postgres.
#
# Generates a random 64-char hex POSTGRES_PASSWORD on first apply (via
# k8s-secret-render.ts generate-if-missing). The Secret is consumed by:
# - chunks-postgres StatefulSet (envFrom → POSTGRES_PASSWORD)
# - chunks-electric Deployment (env: valueFrom.secretKeyRef)
# - plan-chat-service Deployment (env: valueFrom.secretKeyRef)
#
# All three resolve the SAME random value because the secret-renderer
# persists generated values in ~/.olam/k8s-secrets-state.json so reapply
# is idempotent (no rotation unless --rotate-secrets).
apiVersion: v1
kind: Secret
metadata:
name: olam-chunks-postgres-secret
namespace: olam
labels:
app: olam-chunks-postgres
olam.io/component: substrate
type: Opaque
stringData:
# Postgres superuser password. Generated by the CLI's secret-renderer on
# first apply (no host-side file to read; this is in-cluster-only state).
POSTGRES_PASSWORD: "REPLACE_ME_GENERATE_RANDOM_HEX"
# Secret TEMPLATE for olam-kg-service.
#
# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
# the placeholder values. The placeholders are intentionally invalid; a raw
# `kubectl apply` will result in auth failures rather than silently shipping
# fake credentials.
#
# Preferred substitution (keeps secrets out of git):
# kubectl create secret generic olam-kg-service-secret -n olam \
# --from-literal=OLAM_KG_BEARER_TOKEN=$(cat ~/.olam/kg-bearer-token) \
# --dry-run=client -o yaml | kubectl apply -f -
#
# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
# so that `kubectl apply -f manifests/kg-service/` does NOT apply it —
# operators must explicitly handle Secret provisioning before applying manifests.
apiVersion: v1
kind: Secret
metadata:
name: olam-kg-service-secret
namespace: olam
labels:
app: olam-kg-service
olam.io/component: peripheral
type: Opaque
stringData:
# Bearer token for in-cluster KG query authentication.
# Source: cat ~/.olam/kg-bearer-token
OLAM_KG_BEARER_TOKEN: "REPLACE_ME_FROM_HOME_DOTOLAM_KG_BEARER_TOKEN"
# Secret TEMPLATE for olam-mcp-auth-service.
#
# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
# the placeholder values. The placeholders are intentionally invalid; a raw
# `kubectl apply` will result in auth failures rather than silently shipping
# fake credentials.
#
# Preferred substitution (keeps secrets out of git):
# kubectl create secret generic olam-mcp-auth-service-secret -n olam \
# --from-literal=OLAM_MCP_AUTH_JWT_SECRET=$(cat ~/.olam/mcp-auth-jwt-secret) \
# --dry-run=client -o yaml | kubectl apply -f -
#
# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
# so that `kubectl apply -f manifests/mcp-auth-service/` does NOT apply it —
# operators must explicitly handle Secret provisioning before applying manifests.
apiVersion: v1
kind: Secret
metadata:
name: olam-mcp-auth-service-secret
namespace: olam
labels:
app: olam-mcp-auth-service
olam.io/component: peripheral
type: Opaque
stringData:
# JWT signing secret for MCP client authentication.
# Source: cat ~/.olam/mcp-auth-jwt-secret
OLAM_MCP_AUTH_JWT_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_MCP_AUTH_JWT_SECRET"
# Secret TEMPLATE for olam-memory-service.
#
# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
# the placeholder values. The placeholders are intentionally invalid; a raw
# `kubectl apply` will result in auth failures rather than silently shipping
# fake credentials.
#
# Preferred substitution (keeps secrets out of git):
# kubectl create secret generic olam-memory-service-secret -n olam \
# --from-literal=OLAM_MEMORY_BEARER_SECRET=$(cat ~/.olam/memory-bearer-secret) \
# --dry-run=client -o yaml | kubectl apply -f -
#
# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
# so that `kubectl apply -f manifests/memory-service/` does NOT apply it —
# operators must explicitly handle Secret provisioning before applying manifests.
apiVersion: v1
kind: Secret
metadata:
name: olam-memory-service-secret
namespace: olam
labels:
app: olam-memory-service
olam.io/component: peripheral
type: Opaque
stringData:
# Bearer secret for the memory-service HTTP API (matches OLAM_MEMORY_BEARER_SECRET
# used by host-cp and agents that call the memory endpoints).
# Source: cat ~/.olam/memory-bearer-secret
OLAM_MEMORY_BEARER_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_MEMORY_BEARER_SECRET"
# Secret TEMPLATE for olam-plan-chat-secret.
#
# This file is a TEMPLATE — it MUST NOT be applied directly without substituting
# the placeholder values. The placeholders are intentionally invalid; a raw
# `kubectl apply` will result in auth failures rather than silently shipping
# fake credentials.
#
# Preferred substitution (keeps secrets out of git):
# kubectl create secret generic olam-plan-chat-secret -n olam \
# --from-literal=PLAN_CHAT_SECRET=$(cat ~/.olam/plan-chat-secret) \
# --dry-run=client -o yaml | kubectl apply -f -
#
# This template lives in packages/host-cp/k8s/templates/ (NOT manifests/)
# so that `kubectl apply -f manifests/plan-chat-service/` does NOT apply it —
# operators must explicitly handle Secret provisioning before applying manifests.
#
# Architecture: this Secret is mounted by BOTH the host-cp pod (so its
# renderSpaShell can inject window.__OLAM_PLAN_CHAT_BEARER__) AND the
# plan-chat-service pod (so its bearer-auth gate timing-safe-compares incoming
# Authorization: Bearer headers against the same value). One source-of-truth,
# two readers — replaces the previous "/data/plan-chat-secret in host-cp PVC"
# pattern that couldn't be shared across pods (RWO PVC).
apiVersion: v1
kind: Secret
metadata:
name: olam-plan-chat-secret
namespace: olam
labels:
olam.io/component: substrate
type: Opaque
stringData:
# Shared bearer secret for plan-chat-service's POST /v1/chunks and
# GET /v1/shape endpoints. host-cp injects this into window.__OLAM_PLAN_CHAT_BEARER__.
# Source: cat ~/.olam/plan-chat-secret
PLAN_CHAT_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_PLAN_CHAT_SECRET"
// classifyStartupFailure — pure mapping from evidence shape to bucket.
//
// Precedence rules (walked top-down; first match wins):
//
// 1. processExitCode !== undefined → ProviderProcessGone
// The agent process is dead; nothing else matters. This is the
// highest-confidence signal because it's observable from outside
// the container (docker exit code, child_process exit).
//
// 2. pluginErrors.length > 0 → PluginStartupFailed
// Boot-time stderr from a plugin/skill source is definitive.
// Comes before transport/handshake checks because a failed
// plugin can leave transport+mcp in 'pending' permanently.
//
// 3. transportStatus === 'failed' → TransportDead
// Channel-open never succeeded — agent is alive but unreachable.
//
// 4. mcpHandshakeStatus === 'failed' → McpHandshakeStall
// Channel opened, MCP handshake explicitly failed.
//
// 5. mcpHandshakeStatus === 'pending'
// AND elapsedSecondsSinceCreation > 30 → McpHandshakeStall
// Time-bounded inference: a never-completed handshake after 30s
// is the stall signal even without an explicit failure marker.
//
// 6. lastPhase === 'TrustRequired'
// AND elapsedSecondsSinceCreation > 10 → TrustGateUnanswered
// Agent reached the trust gate; no approval ever came back.
// 10s is the operator's attention budget — past that, the
// agent is silently stuck on a human gate.
//
// 7. promptSentAt !== undefined
// AND firstThoughtAt === undefined → PromptMisdelivery
// Dispatch landed on the host side but the agent never produced
// a first thought — the prompt didn't reach the agent process.
//
// 8. lastPhase === 'TrustRequired' → TrustGateUnanswered (fallback)
// Stuck at the trust gate even under 10s — still the most likely
// explanation for a Failed transition from that phase.
//
// 9. fallthrough → PromptMisdelivery
// The classifier is total: every Failed transition gets a bucket.
// PromptMisdelivery is the most operator-actionable "we don't
// know why but the dispatch path is the prime suspect" default.
//
// Tests in __tests__/classify.test.mjs assert exactly one case per
// bucket. The function is pure: no I/O, no side effects, deterministic
// — same evidence in always yields the same bucket out.
import { WorldStartupFailureKind } from './failure-kinds.mjs';
/** Seconds a 'pending' MCP handshake may last before it is treated as a stall. */
const MCP_HANDSHAKE_STALL_THRESHOLD_SECONDS = 30;
/** Seconds a world may sit in 'TrustRequired' before the gate counts as unanswered. */
const TRUST_GATE_UNANSWERED_THRESHOLD_SECONDS = 10;

/**
 * Map a WorldStartupEvidence bundle to its WorldStartupFailureKind.
 *
 * Rules are evaluated top-down and the first match wins. The function is
 * pure and total for any evidence object: a missing or null `pluginErrors`
 * is treated as "no plugin errors" instead of throwing, so a partially
 * populated bundle still receives a bucket.
 *
 * @param {import('./evidence.mjs').WorldStartupEvidence} evidence
 * @returns {import('./failure-kinds.mjs').WorldStartupFailureKind}
 * @throws {TypeError} if `evidence` itself is null or undefined
 */
export function classifyStartupFailure(evidence) {
  const {
    processExitCode,
    pluginErrors,
    transportStatus,
    mcpHandshakeStatus,
    lastPhase,
    elapsedSecondsSinceCreation,
    promptSentAt,
    firstThoughtAt,
  } = evidence;

  // 1. Process exited — terminal signal, short-circuits all other checks.
  if (processExitCode !== undefined) {
    return WorldStartupFailureKind.ProviderProcessGone;
  }

  // 2. Plugin boot errors — definitive boot-time failure.
  //    `?.`/`??` keep the check from throwing when the field is absent.
  if ((pluginErrors?.length ?? 0) > 0) {
    return WorldStartupFailureKind.PluginStartupFailed;
  }

  // 3. Transport explicitly failed — agent alive but unreachable.
  if (transportStatus === 'failed') {
    return WorldStartupFailureKind.TransportDead;
  }

  // 4. MCP handshake explicitly failed.
  if (mcpHandshakeStatus === 'failed') {
    return WorldStartupFailureKind.McpHandshakeStall;
  }

  // 5. MCP handshake pending past threshold — inferred stall.
  if (
    mcpHandshakeStatus === 'pending' &&
    elapsedSecondsSinceCreation > MCP_HANDSHAKE_STALL_THRESHOLD_SECONDS
  ) {
    return WorldStartupFailureKind.McpHandshakeStall;
  }

  // 6. Stuck on trust gate past operator-attention threshold.
  if (
    lastPhase === 'TrustRequired' &&
    elapsedSecondsSinceCreation > TRUST_GATE_UNANSWERED_THRESHOLD_SECONDS
  ) {
    return WorldStartupFailureKind.TrustGateUnanswered;
  }

  // 7. Prompt sent but agent never produced a first thought.
  if (promptSentAt !== undefined && firstThoughtAt === undefined) {
    return WorldStartupFailureKind.PromptMisdelivery;
  }

  // 8. Still at trust gate under threshold — bucket as trust-gate.
  if (lastPhase === 'TrustRequired') {
    return WorldStartupFailureKind.TrustGateUnanswered;
  }

  // 9. Total-function fallback.
  return WorldStartupFailureKind.PromptMisdelivery;
}
// recordWorldLifecycle — the single broadcast helper every host-cp
// surface uses to emit a lifecycle transition.
//
// Emits TWO event types on the host-stream:
//
// 1. event: 'world.lifecycle' → live SSE consumers (SPA, MCP, etc.).
// Shape: { worldId, phase, at, evidence?, failureKind? }
//
// 2. event: 'span' → NDJSON trace sink (PR #915 + follow-ups).
// Shape: { name: 'world.lifecycle', startedAt: at, endedAt: at,
// attributes: { worldId, phase, evidence?, failureKind? },
// exit: { _tag: 'Success' | 'Failure', reason? } }
//
// The dual-emit keeps live consumers and trace consumers on the same
// substrate without either path coupling to the other. The README jq
// example `select(.name == "world.lifecycle" ...)` matches the span
// emission; the SPA's `useHostStream().subscribe('world.lifecycle', ...)`
// matches the live emission.
//
// Failed transitions auto-classify via classifyStartupFailure(evidence)
// when caller passes evidence but omits an explicit failureKind. Callers
// MAY provide their own failureKind to override the inference (e.g.
// docker SIGKILL — the caller knows it was ProviderProcessGone before
// the classifier could trip its time-thresholds).
import { TERMINAL_PHASES, WorldLifecyclePhase } from './phases.mjs';
import { classifyStartupFailure } from './classify.mjs';
import { redactSensitive } from '../observability/redactor.mjs';
/**
* @typedef {object} HostStreamLike
* @property {(eventType: string, payload: unknown) => unknown} broadcast
*/
/**
* @typedef {object} WorldLifecycleEvent
* @property {string} worldId
* @property {import('./phases.mjs').WorldLifecyclePhase} phase
* @property {number} at
* @property {import('./evidence.mjs').WorldStartupEvidence} [evidence]
* @property {import('./failure-kinds.mjs').WorldStartupFailureKind} [failureKind]
*/
/**
 * Emit a world lifecycle transition on both `world.lifecycle` and `span`
 * host-stream channels.
 *
 * The live payload is broadcast first, then a point-in-time span
 * (`startedAt === endedAt`) carrying the same data. For a Failed phase with
 * evidence and no explicit `failureKind`, the kind is inferred with
 * classifyStartupFailure(evidence); an explicit `failureKind` always wins.
 *
 * @param {HostStreamLike} hostStream
 * @param {object} args
 * @param {string} args.worldId non-empty world identifier
 * @param {import('./phases.mjs').WorldLifecyclePhase} args.phase
 * @param {number} [args.at] timestamp for the transition; any value that is
 *   not a finite number (missing, NaN, ±Infinity) falls back to Date.now()
 * @param {import('./evidence.mjs').WorldStartupEvidence} [args.evidence]
 * @param {import('./failure-kinds.mjs').WorldStartupFailureKind} [args.failureKind]
 * @returns {WorldLifecycleEvent} the payload that was broadcast (test convenience)
 * @throws {TypeError} when hostStream.broadcast, worldId, or phase is missing/invalid
 */
export function recordWorldLifecycle(hostStream, args) {
  if (!hostStream || typeof hostStream.broadcast !== 'function') {
    throw new TypeError('recordWorldLifecycle: hostStream.broadcast is required');
  }
  if (typeof args?.worldId !== 'string' || args.worldId.length === 0) {
    throw new TypeError('recordWorldLifecycle: worldId is required');
  }
  if (typeof args?.phase !== 'string') {
    throw new TypeError('recordWorldLifecycle: phase is required');
  }

  // Number.isFinite rejects NaN/±Infinity as well as non-numbers. Those
  // values would serialize to `null` in JSON and corrupt the span timestamps.
  const at = Number.isFinite(args.at) ? args.at : Date.now();
  const isFailed = args.phase === WorldLifecyclePhase.Failed;

  // Resolve failureKind: explicit override > classifier inference > undefined.
  let failureKind = args.failureKind;
  if (failureKind === undefined && isFailed && args.evidence !== undefined) {
    failureKind = classifyStartupFailure(args.evidence);
  }

  /** @type {WorldLifecycleEvent} */
  const livePayload = {
    worldId: args.worldId,
    phase: args.phase,
    at,
  };
  if (args.evidence !== undefined) livePayload.evidence = redactSensitive(args.evidence);
  if (failureKind !== undefined) livePayload.failureKind = failureKind;
  hostStream.broadcast('world.lifecycle', livePayload);

  // Mirror as a span so the NDJSON trace sink records it.
  // Evidence is redacted a second time so the span does not share the
  // live payload's evidence object.
  /** @type {Record<string, unknown>} */
  const spanAttributes = {
    worldId: args.worldId,
    phase: args.phase,
  };
  if (args.evidence !== undefined) spanAttributes.evidence = redactSensitive(args.evidence);
  if (failureKind !== undefined) spanAttributes.failureKind = failureKind;

  /** @type {{ _tag: 'Success' | 'Failure', reason?: string }} */
  const exit = isFailed
    ? { _tag: 'Failure', reason: failureKind ?? 'unclassified' }
    : { _tag: 'Success' };

  hostStream.broadcast('span', {
    name: 'world.lifecycle',
    startedAt: at,
    endedAt: at,
    attributes: spanAttributes,
    exit,
  });

  return livePayload;
}
/** Re-export so callers don't need to import both modules. */
export { WorldLifecyclePhase, TERMINAL_PHASES };
// WorldStartupEvidence — the typed bundle the classifier consumes.
//
// Every Failed lifecycle transition carries one of these. Fields are
// strict-optional (undefined, not null) so consumers can use the
// presence/absence as a signal directly (`promptSentAt === undefined`
// is itself the PromptMisdelivery signal).
/**
* @typedef {'pending' | 'ok' | 'failed'} HandshakeStatus
*/
/**
* @typedef {object} WorldStartupEvidence
* @property {string} worldId
* @property {import('./phases.mjs').WorldLifecyclePhase} lastPhase
* @property {number} lastPhaseAt epoch ms
* @property {number} [promptSentAt] undefined if no dispatch ever sent
* @property {number} [firstThoughtAt] undefined if no thoughts ever produced
* @property {HandshakeStatus} mcpHandshakeStatus
* @property {HandshakeStatus} transportStatus
* @property {string[]} pluginErrors captured stderr lines from plugin boot
* @property {number} [processExitCode]
* @property {number} elapsedSecondsSinceCreation
*/
/**
 * Build the starting evidence bundle for a world that has just been spawned.
 *
 * Both status fields start as 'pending', the plugin error list starts as a
 * fresh empty array, and the optional fields (promptSentAt, firstThoughtAt,
 * processExitCode) are left absent. Callers fill the bundle in as
 * transitions happen and hand it to the classifier on Failed.
 *
 * @param {string} worldId
 * @param {number} [now] value stored as `lastPhaseAt`; defaults to Date.now()
 * @returns {WorldStartupEvidence}
 */
export function emptyEvidence(worldId, now = Date.now()) {
  /** @type {WorldStartupEvidence} */
  const bundle = {
    worldId,
    lastPhase: 'Spawning',
    lastPhaseAt: now,
    mcpHandshakeStatus: 'pending',
    transportStatus: 'pending',
    // A new array per call so bundles never share a plugin error list.
    pluginErrors: [],
    elapsedSecondsSinceCreation: 0,
  };
  return bundle;
}
// World startup failure buckets — the six canonical classes the
// classifier maps every observed Failed transition into.
//
// Keep the declaration order stable: it is exported verbatim as
// WORLD_STARTUP_FAILURE_KIND_ORDER. Note that this is NOT the
// classifier's precedence — classifyStartupFailure (classify.mjs)
// applies its own first-match-wins rules, which begin with
// ProviderProcessGone and PluginStartupFailed. Adding a 7th
// bucket requires updating the classifier precedence and the
// `world.lifecycle.Failed` consumers in the SPA + NDJSON trace.
/**
* @typedef {| 'PromptMisdelivery'
* | 'TransportDead'
* | 'TrustGateUnanswered'
* | 'McpHandshakeStall'
* | 'PluginStartupFailed'
* | 'ProviderProcessGone'} WorldStartupFailureKind
*/
/**
 * Enum-like frozen lookup of the failure bucket names. Every key maps to
 * the identical string, so `WorldStartupFailureKind.X === 'X'`.
 *
 * @type {Readonly<Record<WorldStartupFailureKind, WorldStartupFailureKind>>}
 */
export const WorldStartupFailureKind = Object.freeze({
  /** The dispatch was sent, but the agent never received it (transport mismatch). */
  PromptMisdelivery: 'PromptMisdelivery',
  /** The stdin/stdout/IPC channel never opened. */
  TransportDead: 'TransportDead',
  /** The agent reached TrustRequired and no approval ever arrived. */
  TrustGateUnanswered: 'TrustGateUnanswered',
  /** The MCP server connection was initialized but the handshake never completed. */
  McpHandshakeStall: 'McpHandshakeStall',
  /** A plugin or skill source failed to load on boot. */
  PluginStartupFailed: 'PluginStartupFailed',
  /** The agent (Claude Code) process exited before responding. */
  ProviderProcessGone: 'ProviderProcessGone',
});

/**
 * The bucket names as a frozen array, in the declaration order above.
 * Object.values() yields string-keyed properties in insertion order, so the
 * list cannot drift from the lookup object.
 */
export const WORLD_STARTUP_FAILURE_KIND_ORDER = Object.freeze(
  Object.values(WorldStartupFailureKind),
);
/**
 * Type guard: true only when `value` is a string naming one of the
 * buckets listed in WORLD_STARTUP_FAILURE_KIND_ORDER.
 *
 * @param {unknown} value
 * @returns {value is WorldStartupFailureKind}
 */
export function isWorldStartupFailureKind(value) {
  // Non-strings can never be a bucket name; bail out before the lookup.
  if (typeof value !== 'string') {
    return false;
  }
  return WORLD_STARTUP_FAILURE_KIND_ORDER.includes(/** @type {any} */ (value));
}
// Barrel re-export for the lifecycle module. Importers should pull
// from '@olam/host-cp/lifecycle' (or the relative path equivalent)
// rather than reaching into individual files.
export {
WorldLifecyclePhase,
WORLD_LIFECYCLE_PHASE_ORDER,
TERMINAL_PHASES,
isWorldLifecyclePhase,
} from './phases.mjs';
export {
WorldStartupFailureKind,
WORLD_STARTUP_FAILURE_KIND_ORDER,
isWorldStartupFailureKind,
} from './failure-kinds.mjs';
export { emptyEvidence } from './evidence.mjs';
export { classifyStartupFailure } from './classify.mjs';
export { recordWorldLifecycle } from './emit.mjs';
// World lifecycle phases — the canonical FSM every Olam world walks
// through from spawn to terminal state.
//
// Order is load-bearing: a world's `lastPhase` is a monotonic high-water
// mark, and the classifier's precedence rules in classify.mjs assume
// this ordering when deciding which failure bucket to attribute a stall
// to. Do NOT reorder without updating the classifier.
/**
* @typedef {'Spawning' | 'TrustRequired' | 'ReadyForPrompt' | 'Running' | 'Finished' | 'Failed'} WorldLifecyclePhase
*/
/**
 * Enum-like frozen lookup of the lifecycle phase names. Every key maps to
 * the identical string, so `WorldLifecyclePhase.X === 'X'`.
 *
 * @type {Readonly<Record<WorldLifecyclePhase, WorldLifecyclePhase>>}
 */
export const WorldLifecyclePhase = Object.freeze({
  /** Container or worktree created; before any code runs inside. */
  Spawning: 'Spawning',
  /** Agent process up; awaiting trust-gate approval. */
  TrustRequired: 'TrustRequired',
  /** Trust granted; awaiting initial dispatch. */
  ReadyForPrompt: 'ReadyForPrompt',
  /** Actively processing dispatch. */
  Running: 'Running',
  /** Completed successfully. Terminal. */
  Finished: 'Finished',
  /** Terminal failure. Carries an evidence bundle + classified failure kind. */
  Failed: 'Failed',
});

/** Phases in canonical order. Useful for ordinal comparison. */
export const WORLD_LIFECYCLE_PHASE_ORDER = Object.freeze([
  WorldLifecyclePhase.Spawning,
  WorldLifecyclePhase.TrustRequired,
  WorldLifecyclePhase.ReadyForPrompt,
  WorldLifecyclePhase.Running,
  WorldLifecyclePhase.Finished,
  WorldLifecyclePhase.Failed,
]);

/**
 * Build a real Set whose `add` / `delete` / `clear` throw.
 *
 * Object.freeze() alone only locks an object's own properties; a Set keeps
 * its members in an internal slot, so a merely frozen Set still accepts
 * `.add()`. Shadowing the mutators with throwing own properties closes that
 * gap for ordinary method calls. (`Set.prototype.add.call(set, x)` can still
 * bypass it; this guards against accidental mutation, not hostile code.)
 *
 * @template T
 * @param {Iterable<T>} values initial members
 * @param {string} label name used in the error message
 * @returns {ReadonlySet<T>}
 */
function readOnlySet(values, label) {
  const set = new Set(values);
  const reject = () => {
    throw new TypeError(`${label} is read-only`);
  };
  for (const mutator of ['add', 'delete', 'clear']) {
    // Non-enumerable by default, so deep-equality checks against a plain Set
    // do not see these shadows as extra own properties.
    Object.defineProperty(set, mutator, { value: reject });
  }
  return Object.freeze(set);
}

/** Terminal phases — no transitions out. Membership cannot be changed. */
export const TERMINAL_PHASES = readOnlySet(
  [WorldLifecyclePhase.Finished, WorldLifecyclePhase.Failed],
  'TERMINAL_PHASES',
);
/**
 * Type guard: true only when `value` is a string naming one of the phases
 * listed in WORLD_LIFECYCLE_PHASE_ORDER.
 *
 * @param {unknown} value
 * @returns {value is WorldLifecyclePhase}
 */
export function isWorldLifecyclePhase(value) {
  // Non-strings can never be a phase name; bail out before the lookup.
  if (typeof value !== 'string') {
    return false;
  }
  return WORLD_LIFECYCLE_PHASE_ORDER.includes(/** @type {any} */ (value));
}
#!/usr/bin/env bash
# grafana-port-forward.sh — e2e smoke test: Grafana installs via Helm,
# port-forward is accessible, Loki datasource
# is pre-wired and reachable.
#
# Usage: scripts/e2e/grafana-port-forward.sh
#
# Pre-conditions:
# - kubectl context is set to a live k8s cluster (does NOT spin up k3d)
# - helm binary available
# - jq binary available
# - grafana Helm repo added (helm repo add grafana https://grafana.github.io/helm-charts)
# - Loki is already installed (scripts/e2e/loki-ingest.sh ran successfully
# OR `helm status olam-loki -n monitoring` is healthy)
#
# Idempotency: `helm upgrade --install` is idempotent; re-runs succeed on an
# existing cluster. The Secret is applied via --dry-run | kubectl apply.
# On a re-run the existing admin password is read back from the
# Secret and re-applied unchanged (see Step 1): Grafana stores the
# password on first deploy and does not pick up a rotated value.
# The olam-dashboards ConfigMap is applied before helm install so
# Grafana's volume mount finds the ConfigMap on first boot.
#
# Cleanup: port-forward is killed on exit; Helm release is left in place so
# downstream tasks can reuse the same cluster.
#
# Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B2, B3
# Chart: grafana/grafana 8.5.2 (pinned; latest stable 2026-05-20)
set -euo pipefail
# Namespace used for the Loki release check and the Grafana admin Secret below.
NAMESPACE="monitoring"
# Helm release name for Grafana.
GRAFANA_RELEASE="olam-grafana"
# Pinned grafana/grafana chart version (matches the "Chart:" line in the header).
GRAFANA_CHART_VERSION="8.5.2"
# NOTE(review): the next three are not referenced before Step 3; presumably
# they are the port-forward settings (local port, Service port, seconds to
# wait for the forward to bind) — confirm against the rest of the script.
LOCAL_PORT="3000"
GRAFANA_SVC_PORT="80"
PF_BIND_SECONDS=5
# log MESSAGE... — write a prefixed progress line to stderr, so stdout stays
# free for command output.
log() {
  printf '[grafana-port-forward] %s\n' "$*" >&2
}
# fail MESSAGE... — write a prefixed FAIL line to stderr and abort the script
# with exit status 1.
fail() {
  printf '[grafana-port-forward] FAIL: %s\n' "$*" >&2
  exit 1
}
# -------------------------------------------------------------------------
# Cleanup trap — kill port-forward on exit; leave Helm release in place
# -------------------------------------------------------------------------
PF_PID=""
# cleanup — stop the background port-forward if one was recorded in PF_PID
# and it is still running. Always returns 0 so it is safe under `set -e`.
cleanup() {
  # No port-forward recorded: nothing to stop.
  [[ -n "$PF_PID" ]] || return 0
  # `kill -0` only probes whether the process still exists.
  if kill -0 "$PF_PID" 2>/dev/null; then
    kill "$PF_PID" 2>/dev/null || true
  fi
}
trap cleanup EXIT
# -------------------------------------------------------------------------
# Pre-flight
# -------------------------------------------------------------------------
# Binaries that share the generic "<tool> not installed" message, checked in
# the same order as before.
for required_tool in helm kubectl curl openssl; do
  command -v "$required_tool" >/dev/null 2>&1 || fail "$required_tool not installed"
done
# jq carries its own message because it is only needed for the B3 assertion.
command -v jq >/dev/null 2>&1 || fail "jq not installed (required for B3 dashboard assertion)"
# Finally, make sure the current kubectl context can reach a cluster.
kubectl cluster-info >/dev/null 2>&1 || fail "kubectl: no reachable cluster; set KUBECONFIG"
log "pre-flight checks passed"
# -------------------------------------------------------------------------
# Ensure grafana Helm repo is present (idempotent — safe to re-run)
# -------------------------------------------------------------------------
# `helm repo add` fails when the repo is already registered; that is fine here.
helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
helm repo update grafana
# Verify Loki is already installed (B2 depends on B1); abort otherwise.
helm status "olam-loki" -n "$NAMESPACE" >/dev/null 2>&1 \
  || fail "olam-loki Helm release not found in namespace $NAMESPACE — run scripts/e2e/loki-ingest.sh first"
log "Loki pre-condition satisfied (olam-loki release found)"
# -------------------------------------------------------------------------
# Step 1: Resolve admin password (preserve existing on idempotent re-run)
# -------------------------------------------------------------------------
# Grafana persists the admin password in its internal SQLite on first
# deploy. Subsequent helm upgrades do NOT re-read GF_SECURITY_ADMIN_PASSWORD
# from the env (env value is set once at pod-start and not refreshed). So
# on a re-run, rotating the Secret leaves the in-Grafana password stale
# and breaks API auth.
#
# Idempotency contract: if the Secret already exists, reuse its current
# password. The Secret's value matches Grafana's stored value (set in
# concert on first install). Only generate a new password when the
# Secret doesn't exist yet — i.e. true first deploy.
# Reuse the stored password when the Secret exists AND actually carries one.
# A Secret that exists but has an empty or missing admin-password key would
# otherwise yield an empty GRAFANA_ADMIN_PW, which Step 2 would then re-apply
# as the admin password.
if kubectl get secret olam-grafana-admin -n "$NAMESPACE" >/dev/null 2>&1; then
  GRAFANA_ADMIN_PW=$(kubectl get secret olam-grafana-admin -n "$NAMESPACE" \
    -o jsonpath='{.data.admin-password}' | base64 -d)
  if [[ -n "$GRAFANA_ADMIN_PW" ]]; then
    log "reusing existing admin password from Secret olam-grafana-admin"
  else
    log "WARN: Secret olam-grafana-admin has no admin-password value; generating a new one"
    GRAFANA_ADMIN_PW=$(openssl rand -base64 24)
  fi
else
  log "generating fresh admin password (first deploy)"
  GRAFANA_ADMIN_PW=$(openssl rand -base64 24)
fi
export GRAFANA_ADMIN_PW
# -------------------------------------------------------------------------
# Step 2: Create / update the admin Secret idempotently
# -------------------------------------------------------------------------
log "applying Secret olam-grafana-admin in namespace $NAMESPACE"
# Render the Secret manifest client-side (nothing is created by the first
# kubectl), then pipe it into `kubectl apply` so create and update share one
# code path.
secret_flags=(
  --from-literal=admin-user=admin
  --from-literal=admin-password="$GRAFANA_ADMIN_PW"
  -n "$NAMESPACE"
  --dry-run=client
  -o yaml
)
kubectl create secret generic olam-grafana-admin "${secret_flags[@]}" | kubectl apply -f -
log "Secret applied"
# -------------------------------------------------------------------------
# Step 3a: Apply olam-dashboards ConfigMap BEFORE helm install
# so Grafana's volume mount finds it on first boot (B3).
# The ConfigMap is generated from grafana-dashboards/*.json by
# packages/peripheral-services/scripts/sync-grafana-dashboards.sh.
# -------------------------------------------------------------------------
# Git top-level of the directory holding this script; falls back to the
# current working directory when git cannot resolve one.
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"

# Locate the peripheral-services assets (helm-values/, manifests/):
#   - OLAM_BUNDLE_ROOT set and non-empty (published @pleri/olam-cli install,
#     exported by `olam setup`): use the bundled copy beneath it.
#   - otherwise (monorepo callers): use the source dir under packages/.
case "${OLAM_BUNDLE_ROOT:-}" in
  "") PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services" ;;
  *)  PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services" ;;
esac

CONFIGMAP_MANIFEST="$PERIPHERAL_SERVICES_DIR/manifests/80-grafana-dashboard-configmap.yaml"

# A missing manifest is only a warning: the install proceeds without it.
if [[ ! -f "$CONFIGMAP_MANIFEST" ]]; then
  log "WARN: $CONFIGMAP_MANIFEST not found — Grafana will warn 'ConfigMap not found' until B3 is deployed"
else
  log "applying olam-dashboards ConfigMap from $CONFIGMAP_MANIFEST"
  kubectl apply -f "$CONFIGMAP_MANIFEST"
  log "ConfigMap applied"
fi
# -------------------------------------------------------------------------
# Step 3: Helm upgrade --install
# -------------------------------------------------------------------------
log "installing grafana/grafana ($GRAFANA_RELEASE) in namespace $NAMESPACE"
# Flags are collected in an array so each one sits on its own line without
# backslash continuations. The timeout defaults to 600s and can be overridden
# through OLAM_HELM_TIMEOUT.
grafana_helm_flags=(
  --version "$GRAFANA_CHART_VERSION"
  --namespace "$NAMESPACE"
  --create-namespace
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml"
  --wait
  --timeout "${OLAM_HELM_TIMEOUT:-600s}"
)
helm upgrade --install "$GRAFANA_RELEASE" grafana/grafana "${grafana_helm_flags[@]}"
log "Grafana Helm install complete"

# -------------------------------------------------------------------------
# Step 4: Wait for Grafana pod Ready
# -------------------------------------------------------------------------
log "waiting for Grafana pod Ready (120s)"
kubectl wait --for=condition=ready pod \
  -l "app.kubernetes.io/name=grafana" \
  -n "$NAMESPACE" \
  --timeout=120s
log "Grafana pod Ready"
# -------------------------------------------------------------------------
# Step 5: Start port-forward in background
# -------------------------------------------------------------------------
log "port-forwarding svc/$GRAFANA_RELEASE $LOCAL_PORT:$GRAFANA_SVC_PORT in namespace $NAMESPACE"
# Run the forwarder as a background job and remember its PID.
kubectl port-forward -n "$NAMESPACE" "svc/$GRAFANA_RELEASE" "${LOCAL_PORT}:${GRAFANA_SVC_PORT}" &
PF_PID=$!
log "port-forward PID $PF_PID; waiting ${PF_BIND_SECONDS}s for bind"
sleep "$PF_BIND_SECONDS"
# `kill -0` sends no signal; it only reports whether the process still exists.
if ! kill -0 "$PF_PID" 2>/dev/null; then
  fail "port-forward process exited prematurely"
fi
# -------------------------------------------------------------------------
# Diagnostic helper — called on assertion failure
# -------------------------------------------------------------------------
# dump_diagnostics — print the last 50 log lines of the Grafana pod(s) to
# stderr. Best-effort: a kubectl failure is ignored so the caller can still
# report the original assertion failure.
#
# Output must go to stderr because callers invoke this from inside `$(...)`
# command substitutions, where anything on stdout is captured into a variable
# instead of being shown. Redirections are applied left to right, so the
# previous `2>&1 >&2` pointed both streams at stdout; a plain `>&2` sends
# stdout to stderr and leaves stderr where it already was.
dump_diagnostics() {
  log "DIAGNOSTIC: last 50 lines of Grafana pod logs:"
  kubectl logs -n "$NAMESPACE" \
    -l "app.kubernetes.io/name=grafana" \
    --tail=50 >&2 || true
}
# -------------------------------------------------------------------------
# Step 6: Assertion 1 — /api/health returns 200 with database: ok
# -------------------------------------------------------------------------
log "asserting Grafana health (GET /api/health)"
# Only curl runs inside the command substitution. The failure handling lives
# in the parent shell so that `fail` terminates the script itself (an `exit`
# inside `$(...)` only leaves the subshell) and so that diagnostics are not
# captured into HEALTH_RESPONSE.
if ! HEALTH_RESPONSE=$(
  curl -sf \
    -u "admin:${GRAFANA_ADMIN_PW}" \
    "http://localhost:${LOCAL_PORT}/api/health"
); then
  dump_diagnostics
  fail "GET /api/health failed — Grafana not reachable on port $LOCAL_PORT"
fi
# `jq -e` sets a non-zero exit status when the expression yields false/null
# (or the input cannot be parsed), which drives the assertion.
if ! echo "$HEALTH_RESPONSE" | jq -e '.database == "ok"' >/dev/null 2>&1; then
  log "DIAGNOSTIC: /api/health response:"
  echo "$HEALTH_RESPONSE" >&2
  dump_diagnostics
  fail '/api/health returned database != "ok" — Grafana DB layer not healthy'
fi
log "PASS: /api/health → database: ok"
# -------------------------------------------------------------------------
# Step 7: Assertion 2 — /api/datasources includes Loki entry with cluster URL
# -------------------------------------------------------------------------
log "asserting Loki datasource pre-wired (GET /api/datasources)"
# Failure handling is kept in the parent shell: inside `$(...)` the `exit` in
# `fail` would only end the subshell, and diagnostic output written to stdout
# would be captured into DS_RESPONSE instead of being displayed.
if ! DS_RESPONSE=$(
  curl -sf \
    -u "admin:${GRAFANA_ADMIN_PW}" \
    "http://localhost:${LOCAL_PORT}/api/datasources"
); then
  dump_diagnostics
  fail "GET /api/datasources failed"
fi
EXPECTED_URL="olam-loki.monitoring.svc.cluster.local:3100"
# Check 1: at least one datasource of type "loki" exists.
if ! echo "$DS_RESPONSE" | jq -e 'map(select(.type == "loki")) | length >= 1' >/dev/null 2>&1; then
  log "DIAGNOSTIC: /api/datasources response:"
  echo "$DS_RESPONSE" >&2
  dump_diagnostics
  fail "datasources response contains no 'loki' type entry — datasource not provisioned"
fi
# Check 2: a loki datasource's URL contains the expected cluster-local host:port
# (substring match, so scheme and path are not constrained).
if ! echo "$DS_RESPONSE" | jq -e --arg url "$EXPECTED_URL" 'map(select(.type == "loki" and (.url | contains($url)))) | length >= 1' >/dev/null 2>&1; then
  log "DIAGNOSTIC: /api/datasources response:"
  echo "$DS_RESPONSE" >&2
  dump_diagnostics
  fail "Loki datasource URL does not contain '$EXPECTED_URL' — check grafana-values.yaml datasources block"
fi
log "PASS: Loki datasource found with cluster-local URL $EXPECTED_URL"
# -------------------------------------------------------------------------
# Step 7b: Assertion 2b — dashboard provider loaded olam-home (catches mount-path bugs)
# -------------------------------------------------------------------------
log "asserting olam-home dashboard visible in /api/search (catches ConfigMap mount failures)"
# A failed request is tolerated here (`|| true`): it leaves DASHBOARDS empty,
# and the jq check below then reports the failure with diagnostics.
search_url="http://localhost:${LOCAL_PORT}/api/search?type=dash-db&query=olam"
DASHBOARDS=$(curl -sf -u "admin:${GRAFANA_ADMIN_PW}" "$search_url" || true)
# Exactly one search hit must carry uid "olam-home".
home_uid_filter='map(select(.uid == "olam-home")) | length == 1'
if ! jq -e "$home_uid_filter" <<<"$DASHBOARDS" >/dev/null 2>&1; then
  log "DIAGNOSTIC: /api/search response:"
  echo "$DASHBOARDS" >&2
  dump_diagnostics
  fail "olam-home dashboard not found in /api/search — check ConfigMap mount path and dashboard provider config"
fi
log "PASS: olam-home dashboard found via /api/search"
# -------------------------------------------------------------------------
# Step 8: Assertion 3 — olam-home dashboard present (B3)
# -------------------------------------------------------------------------
log "asserting olam-home dashboard present (GET /api/dashboards/uid/olam-home)"
# Failure handling stays in the parent shell: an `exit` inside `$(...)` only
# ends the subshell, and diagnostics written to stdout there would be captured
# into DASHBOARD_RESPONSE rather than shown.
if ! DASHBOARD_RESPONSE=$(
  curl -sf \
    -u "admin:${GRAFANA_ADMIN_PW}" \
    "http://localhost:${LOCAL_PORT}/api/dashboards/uid/olam-home"
); then
  dump_diagnostics
  fail "GET /api/dashboards/uid/olam-home failed — dashboard not found or Grafana unreachable"
fi
# The returned document must identify itself with the expected uid.
if ! echo "$DASHBOARD_RESPONSE" | jq -e '.dashboard.uid == "olam-home"' >/dev/null 2>&1; then
  log "DIAGNOSTIC: /api/dashboards/uid/olam-home response:"
  echo "$DASHBOARD_RESPONSE" >&2
  dump_diagnostics
  fail "olam-home dashboard uid mismatch or missing — check ConfigMap provisioning and Grafana provider config"
fi
log "PASS: olam-home dashboard present with uid=olam-home"
# -------------------------------------------------------------------------
# Final
# -------------------------------------------------------------------------
log "PASS: Grafana port-forward accessible; Loki datasource pre-wired; olam-home dashboard provisioned — Tasks B2+B3 verified"
exit 0
#!/usr/bin/env bash
# kyverno-cardinality-mutate.sh — Phase C C8 follow-up e2e smoke test.
#
# Verifies that the Kyverno ClusterPolicy
# `enforce-cardinality-labeldrop` mutates incoming ServiceMonitor and
# PodMonitor objects at admission time, regardless of authorship,
# closing codex's "policy by convention" gap on PR #783.
#
# Test approach:
# 1. helm-install Kyverno (pinned 3.8.1) into the `kyverno` namespace.
# 2. Apply the ClusterPolicy.
# 3. POSITIVE test: apply ServiceMonitor `kyverno-mutate-positive-test`
# with selector `app: kyverno-mutate-positive-test` (no backing Service)
# and NO metricRelabelings; assert Kyverno mutated it; delete immediately.
# 4. IDEMPOTENCY test: apply ServiceMonitor `kyverno-mutate-idempotency-test`
# with selector `app: kyverno-mutate-idempotency-test` (different non-existent
# label) and the labeldrop already present; assert count stays at 1; delete.
# 5. SCRAPE-VERIFICATION test: deploy synthetic `kyverno-emitter` (Service +
# Deployment + ConfigMap) + dedicated ServiceMonitor `kyverno-emitter-sm`
# applied WITHOUT metricRelabelings; assert Kyverno mutates the SM at admission;
# wait for pod Ready; poll Prometheus for http_requests_total; assert
# world_id label is ABSENT.
#
# Key design decision: POSITIVE and IDEMPOTENCY tests use selectors that match
# no real Service, so they are isolated from each other and from the SCRAPE test.
# A single dedicated SM (`kyverno-emitter-sm`) owns the emitter endpoint, so
# prometheus-operator can reliably reconcile exactly one scrape config for it.
# Root cause of the prior failure (PR #828 CI run 26239574154): two SMs
# (naive-violator + pre-armoured-violator) competed for the same
# `app: kyverno-emitter` Endpoints; operator never reconciled either.
#
# Pre-conditions:
# - kube-prometheus-stack installed (cardinality-drop.sh ran).
# - kubectl context set to a live cluster; helm + jq + curl available.
#
# Idempotency: kubectl apply is idempotent; helm upgrade --install is
# idempotent. Cleanup trap removes synthetic resources on exit. The
# ClusterPolicy + Kyverno install are LEFT in the cluster (permanent
# C8 fixtures).
#
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — C8
# codex review on PR #783 ("policy by convention" finding)
# PR #828 CI run 26239574154 (competing-SM root cause)
# Abort on command failure, on use of an unset variable, and on a failure
# anywhere in a pipeline.
set -euo pipefail

# --- Tunables -------------------------------------------------------------
KYVERNO_VERSION=3.8.1
KYVERNO_NAMESPACE=kyverno
TEST_NAMESPACE=monitoring
PROM_LOCAL_PORT=9092    # 9090, 9091 may be in use by sibling Phase C scripts
PF_BIND_SECONDS=5
# Scrape-poll budget in seconds; override with OLAM_PROM_DISCOVERY_TIMEOUT.
# Bumped from 180s: one CI attempt observed kyverno-emitter still not scraped at 180s.
TARGET_DISCOVERY_TIMEOUT="${OLAM_PROM_DISCOVERY_TIMEOUT:-240}"
SCRAPE_POLL_INTERVAL=10

# log MESSAGE... — write a prefixed line to stderr.
log() {
  printf '[kyverno-mutate] %s\n' "$*" >&2
}

# fail MESSAGE... — write a prefixed FAIL line to stderr and exit with status 1.
fail() {
  log "FAIL: $*"
  exit 1
}

# Git top-level of the directory holding this script; falls back to the
# current working directory when git cannot resolve one.
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"

# Locate the peripheral-services assets (helm-values/, manifests/):
#   - OLAM_BUNDLE_ROOT set and non-empty (published @pleri/olam-cli install,
#     exported by `olam setup`): use the bundled copy beneath it.
#   - otherwise (monorepo callers): use the source dir under packages/.
case "${OLAM_BUNDLE_ROOT:-}" in
  "") PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services" ;;
  *)  PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services" ;;
esac
# -------------------------------------------------------------------------
# Cleanup trap — kill port-forwards; remove synthetic resources on exit.
# Kyverno chart + ClusterPolicy stay (permanent C8 fixtures).
# -------------------------------------------------------------------------
# PID of the Prometheus port-forward; stays empty until it has been started.
PROM_PF_PID=""

# cleanup — EXIT handler. Stops the port-forward (if one was started) and
# deletes the synthetic test resources. Every step is best-effort: errors are
# silenced and ignored so cleanup never masks the script's real outcome.
cleanup() {
  if [[ -n "$PROM_PF_PID" ]]; then
    kill "$PROM_PF_PID" 2>/dev/null || true
  fi
  log "removing synthetic resources (idempotent)"
  # "<kind> <name>" pairs, deleted in this order. The first two are the
  # mutation-test ServiceMonitors (normally already deleted inline); the rest
  # belong to the scrape-verification test.
  local synthetic_resources=(
    "servicemonitor kyverno-mutate-positive-test"
    "servicemonitor kyverno-mutate-idempotency-test"
    "servicemonitor kyverno-emitter-sm"
    "deployment kyverno-emitter"
    "service kyverno-emitter-svc"
    "configmap kyverno-emitter-config"
  )
  local entry
  for entry in "${synthetic_resources[@]}"; do
    kubectl delete "${entry%% *}" "${entry#* }" -n "$TEST_NAMESPACE" --ignore-not-found=true 2>/dev/null || true
  done
}
trap cleanup EXIT
# -------------------------------------------------------------------------
# Pre-flight
# -------------------------------------------------------------------------
# Every external tool this script shells out to must be on PATH.
for tool in helm kubectl curl jq; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    fail "$tool not installed"
  fi
done
if ! kubectl cluster-info >/dev/null 2>&1; then
  fail "kubectl: no reachable cluster; set KUBECONFIG"
fi
# kube-prom-stack must already be up — we rely on Prometheus + the
# ServiceMonitor CRD existing.
if ! kubectl get crd servicemonitors.monitoring.coreos.com >/dev/null 2>&1; then
  fail "ServiceMonitor CRD not present — run prom-no-double-grafana.sh first"
fi
# NOTE(review): `kubectl get` with a label selector appears to exit 0 even
# when nothing matches, so this check may pass with no operator present —
# confirm, and confirm the selector matches the installed chart's labels
# before tightening it.
if ! kubectl get deployment -n "$TEST_NAMESPACE" -l "app.kubernetes.io/name=prometheus-operator" >/dev/null 2>&1; then
  fail "prometheus-operator not found in $TEST_NAMESPACE — run prom-no-double-grafana.sh first"
fi
log "pre-flight checks passed"
# -------------------------------------------------------------------------
# Step 1: helm-install Kyverno
#
# Repo add is idempotent; helm upgrade --install handles fresh install + upgrade.
# `--wait` blocks until pods are Ready; admission webhook needs to be live
# before we apply the ClusterPolicy or our test ServiceMonitors.
# -------------------------------------------------------------------------
log "ensuring kyverno helm repo is configured"
# Repo add/update failures are deliberately ignored; a genuinely missing
# chart makes the install below fail.
helm repo add kyverno https://kyverno.github.io/kyverno/ >/dev/null 2>&1 || true
helm repo update kyverno >/dev/null 2>&1 || true
log "installing kyverno chart $KYVERNO_VERSION (waits for admission webhook Ready)"
# Only the last 8 lines of helm output are shown; `pipefail` (set at the top
# of the script) keeps a helm failure fatal despite the trailing `tail`.
helm upgrade --install olam-kyverno kyverno/kyverno \
  --version "$KYVERNO_VERSION" \
  --namespace "$KYVERNO_NAMESPACE" \
  --create-namespace \
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/kyverno-values.yaml" \
  --wait --timeout "${OLAM_HELM_TIMEOUT:-600s}" 2>&1 | tail -8
# Sanity: an admission-controller Deployment exists. `kubectl get` with a
# label selector exits 0 even when nothing matches, so the exit status alone
# proves nothing — require at least one object name in the output instead.
admission_deployments=$(kubectl get deployment -n "$KYVERNO_NAMESPACE" \
  -l "app.kubernetes.io/component=admission-controller" -o name 2>/dev/null || true)
if [[ -z "$admission_deployments" ]]; then
  fail "kyverno admission controller not found in $KYVERNO_NAMESPACE"
fi
log "waiting for kyverno admission webhook to be registered with apiserver"
# Poll every 5s, for up to 120s, until the policy-validating webhook
# configuration object exists. `elapsed` stays below 120 only when the loop
# left via `break`, which is what the timeout check after the loop relies on.
elapsed=0
while [ "$elapsed" -lt 120 ]; do
  if kubectl get validatingwebhookconfiguration kyverno-policy-validating-webhook-cfg \
    >/dev/null 2>&1; then
    log "kyverno webhooks registered after ${elapsed}s"
    break
  fi
  sleep 5
  elapsed=$((elapsed + 5))
done
if [ "$elapsed" -ge 120 ]; then
  fail "kyverno webhook registration timed out after 120s"
fi
# -------------------------------------------------------------------------
# Step 2: Apply the ClusterPolicy
# -------------------------------------------------------------------------
log "applying ClusterPolicy enforce-cardinality-labeldrop"
kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/96-kyverno-cardinality-mutate.yaml"

# Poll .status.ready every 3s for up to 60s. Not seeing "true" in time is only
# a warning — the script carries on, because the status field can lag.
log "waiting up to 60s for ClusterPolicy to be Ready"
elapsed=0
until [ "$elapsed" -ge 60 ]; do
  # A failing kubectl call is treated the same as "not ready yet".
  READY=$(kubectl get clusterpolicy enforce-cardinality-labeldrop \
    -o jsonpath='{.status.ready}' 2>/dev/null || echo "")
  if [ "$READY" = "true" ]; then
    log "ClusterPolicy Ready after ${elapsed}s"
    break
  fi
  sleep 3
  elapsed=$((elapsed + 3))
done
# `elapsed` reaches 60 only when the loop ran out without a `break`.
[ "$elapsed" -lt 60 ] \
  || log "WARN: ClusterPolicy status.ready not observed within 60s; proceeding (status field can lag)"
# -------------------------------------------------------------------------
# Step 3: POSITIVE test — mutation only, no backing Service
#
# Uses selector `app: kyverno-mutate-positive-test` — a label that no
# real Service carries, so this SM never competes with anything for
# Endpoints. Its sole job is to exercise the Kyverno admission webhook.
#
# Deleted immediately after assertion so the SM space is clean when
# the scrape test runs.
# -------------------------------------------------------------------------
# POSITIVE test: submit a ServiceMonitor that has no metricRelabelings, read
# it back from the API server, and require that a labeldrop rule mentioning
# world_id is now present on its first endpoint.
log "POSITIVE test: applying naive ServiceMonitor (no metricRelabelings, non-Service-backed selector)"
kubectl apply -f - <<'EOF'
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kyverno-mutate-positive-test
namespace: monitoring
labels:
release: olam-prom
spec:
namespaceSelector:
matchNames:
- monitoring
selector:
matchLabels:
app: kyverno-mutate-positive-test
endpoints:
- port: metrics
interval: 15s
# NOTE: deliberately NO metricRelabelings — Kyverno must inject it.
EOF
# Read back and assert.
# ACTUAL holds the first endpoint's metricRelabelings as compact JSON; the
# `// []` alternative turns a missing/null field into an empty array so the
# count below is 0 rather than a jq error.
ACTUAL=$(kubectl get servicemonitor kyverno-mutate-positive-test -n "$TEST_NAMESPACE" -o json \
| jq -r '.spec.endpoints[0].metricRelabelings // [] | tojson')
log "kyverno-mutate-positive-test metricRelabelings after admission: $ACTUAL"
# Count entries whose action is "labeldrop" and whose regex contains "world_id".
INJECTED_COUNT=$(echo "$ACTUAL" | jq '[ .[] | select(.action == "labeldrop" and (.regex | contains("world_id"))) ] | length')
if [ "$INJECTED_COUNT" -lt 1 ]; then
# Dump the policy as stored in the cluster to help explain why no mutation happened.
log "actual policy state:"
kubectl get clusterpolicy enforce-cardinality-labeldrop -o yaml >&2 || true
fail "POSITIVE test FAILED: Kyverno did not inject labeldrop into naive ServiceMonitor — third-party bypass gap NOT closed"
fi
log "PASS: naive ServiceMonitor was mutated at admission (labeldrop injected)"
log "deleting kyverno-mutate-positive-test (mutation-only test; SM space clean for scrape test)"
kubectl delete servicemonitor kyverno-mutate-positive-test -n "$TEST_NAMESPACE" --ignore-not-found=true
# -------------------------------------------------------------------------
# Step 4: IDEMPOTENCY test — mutation only, no backing Service
#
# Uses selector `app: kyverno-mutate-idempotency-test` — different from
# the positive test and from the scrape test label. No real Service.
# Deleted immediately after assertion.
# -------------------------------------------------------------------------
# IDEMPOTENCY test: submit a ServiceMonitor that already carries the
# labeldrop rule and require that exactly one such rule exists afterwards.
log "IDEMPOTENCY test: applying pre-armoured ServiceMonitor (labeldrop already present)"
kubectl apply -f - <<'EOF'
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kyverno-mutate-idempotency-test
namespace: monitoring
labels:
release: olam-prom
spec:
namespaceSelector:
matchNames:
- monitoring
selector:
matchLabels:
app: kyverno-mutate-idempotency-test
endpoints:
- port: metrics
interval: 15s
metricRelabelings:
- action: labeldrop
regex: 'world_id|trace_id|user_id|request_id|operator_id'
EOF
# Count labeldrop entries on the first endpoint whose regex contains
# "world_id". Unlike the POSITIVE test there is no `// []` fallback here.
# NOTE(review): a missing metricRelabelings field would presumably make jq
# (and, via pipefail, the script) fail without a FAIL message — confirm.
DUP_COUNT=$(kubectl get servicemonitor kyverno-mutate-idempotency-test -n "$TEST_NAMESPACE" -o json \
| jq '[ .spec.endpoints[0].metricRelabelings[] | select(.action == "labeldrop" and (.regex | contains("world_id"))) ] | length')
log "kyverno-mutate-idempotency-test labeldrop count: $DUP_COUNT"
# Anything other than exactly 1 fails: 0 means the rule was lost, >1 means it
# was appended again.
if [ "$DUP_COUNT" -ne 1 ]; then
kubectl get servicemonitor kyverno-mutate-idempotency-test -n "$TEST_NAMESPACE" -o yaml >&2
fail "IDEMPOTENCY test FAILED: expected 1 labeldrop entry, got $DUP_COUNT — policy double-adds"
fi
log "PASS: pre-armoured ServiceMonitor has exactly 1 labeldrop (no double-add)"
log "deleting kyverno-mutate-idempotency-test (mutation-only test; SM space clean for scrape test)"
kubectl delete servicemonitor kyverno-mutate-idempotency-test -n "$TEST_NAMESPACE" --ignore-not-found=true
# -------------------------------------------------------------------------
# Step 5: SCRAPE-VERIFICATION test — dedicated SM + Service + Pod
#
# One SM (`kyverno-emitter-sm`) selects exactly one Service (`kyverno-emitter-svc`).
# No other SM in the cluster selects `app: kyverno-emitter`, so prometheus-operator
# reconciles a single clean scrape config.
#
# The SM is applied WITHOUT metricRelabelings so Kyverno's admission webhook
# fires — this is the load-bearing check that the policy applies during real
# scrape setup, not just on test fixtures.
#
# After admission we verify the spec has the labeldrop, then wait for the pod
# to be Ready and poll Prometheus for http_requests_total. We assert
# world_id is absent from all returned series.
#
# Mirrors the working pattern from dashboards-have-data.sh (single dedicated
# SM + co-located Service in `monitoring` namespace).
# -------------------------------------------------------------------------
# Deploy the synthetic metrics emitter with a single `kubectl apply`:
#   - ConfigMap kyverno-emitter-config: static exposition text with one
#     http_requests_total sample that carries a world_id label.
#   - Deployment kyverno-emitter: one python:3.11-alpine replica running an
#     inline HTTP server that returns the ConfigMap text on GET /metrics
#     (port 8080) and 404 on any other path.
#   - Service kyverno-emitter-svc: exposes port 8080 under the port name
#     "metrics" and is labelled app=kyverno-emitter.
log "SCRAPE-VERIFICATION test: deploying synthetic kyverno-emitter (emits http_requests_total{world_id})"
kubectl apply -f - <<'EOF'
---
apiVersion: v1
kind: ConfigMap
metadata:
name: kyverno-emitter-config
namespace: monitoring
data:
metrics: |
# HELP http_requests_total Synthetic counter; world_id is the cardinality bomb
# TYPE http_requests_total counter
http_requests_total{world_id="kyverno-world",route="/api",method="GET",status_code="200"} 1
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: kyverno-emitter
namespace: monitoring
labels:
app: kyverno-emitter
spec:
replicas: 1
selector:
matchLabels:
app: kyverno-emitter
template:
metadata:
labels:
app: kyverno-emitter
spec:
containers:
- name: emitter
image: python:3.11-alpine
ports:
- containerPort: 8080
command: ["python3", "-c"]
args:
- |
import http.server
with open('/config/metrics') as f: METRICS = f.read().encode()
class H(http.server.BaseHTTPRequestHandler):
def do_GET(self):
if self.path != '/metrics':
self.send_response(404); self.end_headers(); return
self.send_response(200)
self.send_header('Content-Type', 'text/plain; version=0.0.4; charset=utf-8')
self.end_headers()
self.wfile.write(METRICS)
def log_message(self, *a): pass
http.server.HTTPServer(('0.0.0.0', 8080), H).serve_forever()
volumeMounts:
- name: config
mountPath: /config
volumes:
- name: config
configMap:
name: kyverno-emitter-config
---
apiVersion: v1
kind: Service
metadata:
name: kyverno-emitter-svc
namespace: monitoring
labels:
app: kyverno-emitter
spec:
selector:
app: kyverno-emitter
ports:
- name: metrics
port: 8080
targetPort: 8080
EOF
# Block (up to 120s) until the Deployment rollout finishes, so the emitter is
# running before the ServiceMonitor that targets it is applied.
log "waiting for kyverno-emitter deployment Ready"
kubectl rollout status deployment/kyverno-emitter -n "$TEST_NAMESPACE" --timeout=120s
# Apply the dedicated ServiceMonitor WITHOUT metricRelabelings so Kyverno
# mutates it at admission — this proves the policy fires on real SM objects,
# not just on the POSITIVE test fixture.
# The selector (app=kyverno-emitter) and port name (metrics) match the
# Service created by the emitter deployment step.
log "applying kyverno-emitter-sm (no metricRelabelings — Kyverno must inject)"
kubectl apply -f - <<'EOF'
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kyverno-emitter-sm
namespace: monitoring
labels:
release: olam-prom
spec:
namespaceSelector:
matchNames:
- monitoring
selector:
matchLabels:
app: kyverno-emitter
endpoints:
- port: metrics
interval: 15s
# NOTE: NO metricRelabelings — Kyverno must inject the labeldrop at admission.
EOF
# Verify Kyverno mutated this SM too (belt-and-suspenders: proves the policy
# applies to the SM that actually drives the scrape, not just the test fixtures).
# `// []` turns a missing/null metricRelabelings field into an empty array so
# the count below is 0 rather than a jq error.
SCRAPE_SM_ACTUAL=$(kubectl get servicemonitor kyverno-emitter-sm -n "$TEST_NAMESPACE" -o json \
| jq -r '.spec.endpoints[0].metricRelabelings // [] | tojson')
log "kyverno-emitter-sm metricRelabelings after admission: $SCRAPE_SM_ACTUAL"
# Count entries whose action is "labeldrop" and whose regex contains "world_id".
SCRAPE_SM_INJECTED=$(echo "$SCRAPE_SM_ACTUAL" | jq '[ .[] | select(.action == "labeldrop" and (.regex | contains("world_id"))) ] | length')
if [ "$SCRAPE_SM_INJECTED" -lt 1 ]; then
# Dump the policy as stored in the cluster to help explain why no mutation happened.
log "actual policy state:"
kubectl get clusterpolicy enforce-cardinality-labeldrop -o yaml >&2 || true
fail "SCRAPE-VERIFICATION test FAILED: Kyverno did not mutate kyverno-emitter-sm at admission"
fi
log "PASS: kyverno-emitter-sm was mutated at admission (labeldrop injected)"
# Port-forward Prometheus and poll for metric samples.
log "port-forwarding svc/prometheus-operated $PROM_LOCAL_PORT:9090"
# Background job; the PID is recorded in PROM_PF_PID so the EXIT trap can
# stop it.
kubectl port-forward -n "$TEST_NAMESPACE" "svc/prometheus-operated" "${PROM_LOCAL_PORT}:9090" &
PROM_PF_PID=$!
# Give the forwarder time to bind, then make sure it has not already died
# (`kill -0` sends no signal; it only tests that the process exists).
sleep "$PF_BIND_SECONDS"
if ! kill -0 "$PROM_PF_PID" 2>/dev/null; then
  fail "Prometheus port-forward exited prematurely"
fi
PROM_URL="http://localhost:${PROM_LOCAL_PORT}"
# Direct-metric polling rather than target-discovery polling.
#
# Rationale: kube-prometheus-stack's default relabel sets the `job` label
# from the k8s Service name. Polling by job-name is brittle — operator
# reconciliation races, dropped-target filtering, and rare CRD revision
# lag have all surfaced as "target not in activeTargets" flakes during
# earlier ingress-integration runs. What we ACTUALLY care about is
# whether the mutated relabel was applied to a real scrape sample. So
# poll for the metric directly.
#
# The query is scoped to the emitter's Service via the `service` target label
# that prometheus-operator attaches to ServiceMonitor targets. An unfiltered
# `http_requests_total` query could be satisfied by a series scraped from any
# other target in the cluster, ending the poll early and letting the world_id
# check below pass without ever looking at the emitter's samples.
PROM_QUERY='http_requests_total{service="kyverno-emitter-svc"}'
log "polling Prometheus for http_requests_total samples (up to ${TARGET_DISCOVERY_TIMEOUT}s)"
elapsed=0
RESULT=""
while [ "$elapsed" -lt "$TARGET_DISCOVERY_TIMEOUT" ]; do
  # -G with --data-urlencode sends the selector as a properly encoded query
  # string; request failures are treated as "no data yet".
  RESULT=$(curl -sf -G "${PROM_URL}/api/v1/query" \
    --data-urlencode "query=${PROM_QUERY}" 2>/dev/null || echo "")
  if [ -n "$RESULT" ]; then
    # Unparseable responses count as zero series.
    SERIES_COUNT=$(echo "$RESULT" | jq '.data.result | length' 2>/dev/null || echo "0")
    if [ "$SERIES_COUNT" -ge 1 ]; then
      log "http_requests_total returned $SERIES_COUNT series after ${elapsed}s"
      break
    fi
  fi
  sleep "$SCRAPE_POLL_INTERVAL"
  elapsed=$((elapsed + SCRAPE_POLL_INTERVAL))
done
# `elapsed` reaches the timeout only when the loop ran out without a `break`.
if [ "$elapsed" -ge "$TARGET_DISCOVERY_TIMEOUT" ]; then
  # Best-effort diagnostics; none of these may mask the FAIL below.
  log "Active targets snapshot for diagnosis:"
  curl -sf "${PROM_URL}/api/v1/targets" | jq '.data.activeTargets[] | {job: .labels.job, service: .labels.service, namespace: .labels.namespace, health: .health, lastError: .lastError}' >&2 || true
  log "ServiceMonitor kyverno-emitter-sm status:"
  kubectl get servicemonitor kyverno-emitter-sm -n "$TEST_NAMESPACE" -o yaml >&2 || true
  log "prometheus-operator log tail (last 50 lines):"
  kubectl logs -n "$TEST_NAMESPACE" -l "app.kubernetes.io/name=prometheus-operator" --tail=50 >&2 || true
  fail "Prometheus did not scrape kyverno-emitter within ${TARGET_DISCOVERY_TIMEOUT}s"
fi
# LEAKED is "true" when any returned series still carries a world_id label.
LEAKED=$(echo "$RESULT" | jq '[.data.result[] | .metric | has("world_id")] | any')
if [ "$LEAKED" = "true" ]; then
  echo "$RESULT" | jq '.data.result[] | .metric' >&2
  fail "world_id label leaked into Prometheus — Kyverno-mutated relabel did NOT take effect at scrape time"
fi
log "PASS: kyverno-emitter scraped via kyverno-emitter-sm; world_id absent at scrape time"
log "PASS: C8 verified — Kyverno mutates third-party-shaped ServiceMonitors at admission and the mutation takes effect at scrape time"
exit 0
#!/usr/bin/env bash
# loki-ingest.sh — e2e smoke test: Loki single-binary installs, Promtail tails,
# OAuth query-param scrubbing verified (code=REDACTED, no raw token).
#
# Usage: scripts/e2e/loki-ingest.sh
#
# Pre-conditions:
# - kubectl context is set to a live k8s cluster (does NOT spin up k3d)
# - helm binary available
# - grafana Helm repo reachable (the script runs `helm repo add grafana
#   https://grafana.github.io/helm-charts` itself, so a prior add is optional)
#
# This script is invoked by the A12 harness (scripts/test-ingress-integration/)
# after cluster-up.sh. It can also be run manually against any live cluster.
#
# Idempotency: `helm upgrade --install` is idempotent; re-runs succeed on an
# existing cluster. The synthetic pod is cleaned up regardless of
# pass/fail via a trap.
#
# Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B1
# Chart: grafana/loki 6.7.4 (pinned; latest stable 2026-05-20)
# Chart: grafana/promtail 6.16.6 (latest stable 2026-05-20)
# Abort on command failure, on use of an unset variable, and on a failure
# anywhere in a pipeline.
set -euo pipefail

# --- Tunables -------------------------------------------------------------
NAMESPACE=monitoring
LOKI_RELEASE=olam-loki
PROMTAIL_RELEASE=olam-promtail
SYNTHETIC_POD=loki-e2e-synthetic
LOKI_PORT=3100
LOCAL_PORT=13100    # avoid conflict with any host-level Loki

# Magic-number commentary: Promtail's tail → ingest cycle involves:
#   - inotify event (near-instant)
#   - Promtail pipeline processing (~1s)
#   - Loki write path (ingester chunk idle period: default 30m, but flush on
#     query pressure; typically <5s in practice)
# 10s is conservative for a single log line in a lightly loaded cluster.
INGEST_LAG_SECONDS=10

# log MESSAGE... — write a prefixed line to stderr.
log() {
  printf '[loki-ingest] %s\n' "$*" >&2
}

# fail MESSAGE... — write a prefixed FAIL line to stderr and exit with status 1.
fail() {
  log "FAIL: $*"
  exit 1
}
# -------------------------------------------------------------------------
# Cleanup trap — remove synthetic pod and port-forward on exit
# -------------------------------------------------------------------------
# PID of the Loki port-forward; stays empty until it has been started.
PF_PID=""

# cleanup — EXIT handler. Stops the port-forward if it was started and is
# still running, then deletes the synthetic pod from the default namespace.
# Both steps are best-effort and never fail the script.
cleanup() {
  if [[ -n "$PF_PID" ]]; then
    # `kill -0` sends no signal; it only tests that the process exists.
    if kill -0 "$PF_PID" 2>/dev/null; then
      kill "$PF_PID" 2>/dev/null || true
    fi
  fi
  kubectl delete pod "$SYNTHETIC_POD" -n default --ignore-not-found=true 2>/dev/null || true
}
trap cleanup EXIT
# -------------------------------------------------------------------------
# Pre-flight
# -------------------------------------------------------------------------
# Every external tool checked here must be on PATH, and the current kubectl
# context must answer, before anything is installed.
for tool in helm kubectl curl; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    fail "$tool not installed"
  fi
done
if ! kubectl cluster-info >/dev/null 2>&1; then
  fail "kubectl: no reachable cluster; set KUBECONFIG"
fi
log "pre-flight checks passed"
# -------------------------------------------------------------------------
# Resolve repo root so helm -f paths work regardless of invocation cwd
# -------------------------------------------------------------------------
# Git top-level of the directory holding this script; falls back to the
# current working directory when git cannot resolve one.
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"

# Locate the peripheral-services assets (helm-values/, manifests/):
#   - OLAM_BUNDLE_ROOT set and non-empty (published @pleri/olam-cli install,
#     exported by `olam setup`): use the bundled copy beneath it.
#   - otherwise (monorepo callers): use the source dir under packages/.
case "${OLAM_BUNDLE_ROOT:-}" in
  "") PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services" ;;
  *)  PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services" ;;
esac

# -------------------------------------------------------------------------
# Ensure grafana Helm repo is present (idempotent — safe to re-run)
# -------------------------------------------------------------------------
# `repo add` errors are deliberately ignored (stderr discarded, `|| true`);
# `repo update` is not guarded, so a failed index refresh still aborts.
helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
helm repo update grafana
# -------------------------------------------------------------------------
# Step 1: Install / upgrade Loki (single-binary mode)
# -------------------------------------------------------------------------
log "installing grafana/loki ($LOKI_RELEASE) in namespace $NAMESPACE"
# `upgrade --install` covers both first install and re-run. The timeout
# defaults to 600s and can be overridden through OLAM_HELM_TIMEOUT.
helm upgrade --install "$LOKI_RELEASE" grafana/loki \
  --version 6.7.4 \
  --namespace "$NAMESPACE" \
  --create-namespace \
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
  --wait \
  --timeout "${OLAM_HELM_TIMEOUT:-600s}"
log "loki helm install complete"
# -------------------------------------------------------------------------
# Step 2: Install / upgrade Promtail
# -------------------------------------------------------------------------
log "installing grafana/promtail ($PROMTAIL_RELEASE) in namespace $NAMESPACE"
# No --create-namespace: the Loki install above already ensured the namespace.
# The timeout honours OLAM_HELM_TIMEOUT like the Loki install does, so an
# operator who raised it for a slow cluster is not cut off here; when the
# variable is unset the previous 120s limit still applies.
helm upgrade --install "$PROMTAIL_RELEASE" grafana/promtail \
  --version 6.16.6 \
  --namespace "$NAMESPACE" \
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
  --wait \
  --timeout "${OLAM_HELM_TIMEOUT:-120s}"
log "promtail helm install complete"
# -------------------------------------------------------------------------
# Step 3: Wait for Loki pod Ready
# -------------------------------------------------------------------------
log "waiting for Loki pod Ready (120s)"
kubectl wait \
  --for=condition=ready pod \
  -l app.kubernetes.io/name=loki \
  -n "$NAMESPACE" \
  --timeout=120s
log "loki pod Ready"
# -------------------------------------------------------------------------
# Step 4: Generate synthetic log line with raw OAuth tokens in URL and headers.
#
# The pod prints a single log line containing all 4 scrub patterns:
# ?code=SECRETTOKEN123 → code=REDACTED
# &access_token=SECRETTOKEN456 → access_token=REDACTED
# &state=SESSION789 → state=REDACTED
# Authorization: Bearer SECRETBEARER000 → Authorization: Bearer REDACTED
#
# Promtail tails it, runs the scrubbing pipeline, and pushes to Loki with all
# 4 raw tokens absent and all 4 REDACTED markers present.
# -------------------------------------------------------------------------
log "launching synthetic pod (prints all 4 raw token patterns)"
kubectl run "$SYNTHETIC_POD" \
--image=busybox \
--restart=Never \
-n default \
-- sh -c 'echo "GET http://example.com/callback?code=SECRETTOKEN123&access_token=SECRETTOKEN456&state=SESSION789 HTTP/1.1 Authorization: Bearer SECRETBEARER000"'
# -------------------------------------------------------------------------
# Step 5: Wait for Promtail tail + ingest lag
# -------------------------------------------------------------------------
log "waiting ${INGEST_LAG_SECONDS}s for Promtail to tail and ingest synthetic log"
sleep "$INGEST_LAG_SECONDS"
# -------------------------------------------------------------------------
# Step 6: Port-forward Loki and query
# -------------------------------------------------------------------------
# Start a background port-forward to the Loki service so the query below can
# reach it on localhost:${LOCAL_PORT}. The PID is kept in PF_PID.
log "port-forwarding Loki svc to localhost:${LOCAL_PORT}"
kubectl port-forward \
  "svc/${LOKI_RELEASE}" \
  "${LOCAL_PORT}:${LOKI_PORT}" \
  -n "$NAMESPACE" &
PF_PID=$!
# Give port-forward a moment to establish
sleep 2
# Fail fast if the port-forward already died (e.g. the local port is taken).
# Without this check the curl below returns an empty body and the run fails
# later with a misleading "Promtail may not have ingested" message.
kill -0 "$PF_PID" 2>/dev/null \
  || fail "Loki port-forward process exited prematurely — is localhost:${LOCAL_PORT} already in use?"
# Query Loki for log lines from the default namespace within the last 5 minutes.
# The LogQL line filter keeps only lines that contain "REDACTED"; Step 7 then
# greps the response for each of the 4 REDACTED markers and for any raw token
# that leaked through alongside them.
log "querying Loki for scrubbed entries"
QUERY_RESPONSE=$(
curl -s -G \
"http://localhost:${LOCAL_PORT}/loki/api/v1/query_range" \
--data-urlencode 'query={namespace="default"} |= "REDACTED"' \
--data-urlencode "start=$(date -u -v-5M +%s 2>/dev/null || date -u -d '5 minutes ago' +%s)000000000" \
--data-urlencode "end=$(date -u +%s)000000000" \
--data-urlencode 'limit=50'
)
# -------------------------------------------------------------------------
# Step 7: Assertions — verify all 4 scrub patterns
#
# Contract (matches Phase B spec + promtail-values.yaml):
# ?code=SECRETTOKEN123 → code=REDACTED (absent: SECRETTOKEN123)
# &access_token=SECRETTOKEN456 → access_token=REDACTED (absent: SECRETTOKEN456)
# &state=SESSION789 → state=REDACTED (absent: SESSION789)
# Authorization: Bearer SECRETBEARER000 → Bearer REDACTED (absent: SECRETBEARER000)
# -------------------------------------------------------------------------
log "asserting scrubbing correctness (all 4 patterns)"
# Dump troubleshooting context before a failing assertion: the raw Loki query
# response followed by the last 50 lines of Promtail logs. The response and
# the kubectl output are written to stderr so stdout stays clean.
diag() {
  log "DIAGNOSTIC: Loki query response:"
  echo "$QUERY_RESPONSE" >&2
  log "DIAGNOSTIC: last 50 lines of Promtail logs:"
  # Redirect stdout to stderr; kubectl's own stderr already goes there.
  # (The previous `2>&1 >&2` ordering sent BOTH streams to stdout.)
  # Best-effort: a failing `kubectl logs` must not mask the real assertion failure.
  kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=promtail --tail=50 >&2 || true
}
# Assertion 1: Loki answered with a JSON body that has a "result" field (this also matches an empty result array; the marker checks below catch that case)
if ! echo "$QUERY_RESPONSE" | grep -q '"result"'; then
diag
fail "Loki returned no result block — Promtail may not have ingested the synthetic log yet"
fi
# --- Scrubbed markers present ---
# Assertion 2a: code= is scrubbed
if ! echo "$QUERY_RESPONSE" | grep -q 'code=REDACTED'; then
diag
fail "'code=REDACTED' not found in Loki response — code= scrub stage not working"
fi
# Assertion 2b: access_token= is scrubbed
if ! echo "$QUERY_RESPONSE" | grep -q 'access_token=REDACTED'; then
diag
fail "'access_token=REDACTED' not found in Loki response — access_token= scrub stage not working"
fi
# Assertion 2c: state= is scrubbed
if ! echo "$QUERY_RESPONSE" | grep -q 'state=REDACTED'; then
diag
fail "'state=REDACTED' not found in Loki response — state= scrub stage not working"
fi
# Assertion 2d: Authorization Bearer is scrubbed
if ! echo "$QUERY_RESPONSE" | grep -q 'Bearer REDACTED'; then
diag
fail "'Bearer REDACTED' not found in Loki response — Authorization Bearer scrub stage not working"
fi
# --- Raw tokens absent ---
# Assertion 3a: raw code= token is absent
if echo "$QUERY_RESPONSE" | grep -q 'SECRETTOKEN123'; then
diag
fail "raw token 'SECRETTOKEN123' (code=) found in Loki response — scrubbing pipeline is NOT working"
fi
# Assertion 3b: raw access_token= token is absent
if echo "$QUERY_RESPONSE" | grep -q 'SECRETTOKEN456'; then
diag
fail "raw token 'SECRETTOKEN456' (access_token=) found in Loki response — scrubbing pipeline is NOT working"
fi
# Assertion 3c: raw state= token is absent
if echo "$QUERY_RESPONSE" | grep -q 'SESSION789'; then
diag
fail "raw token 'SESSION789' (state=) found in Loki response — scrubbing pipeline is NOT working"
fi
# Assertion 3d: raw Bearer token is absent
if echo "$QUERY_RESPONSE" | grep -q 'SECRETBEARER000'; then
diag
fail "raw token 'SECRETBEARER000' (Authorization Bearer) found in Loki response — scrubbing pipeline is NOT working"
fi
log "PASS: all 4 scrub patterns verified — code=REDACTED, access_token=REDACTED, state=REDACTED, Bearer REDACTED present; all raw tokens absent"
exit 0
// NDJSON span sink — zero-config observability for host-cp.
//
// Subscribes to the host-stream broadcaster and writes one JSON line per
// `span` event to ~/.olam/logs/host.trace.ndjson. Each span carries the
// minimum surface needed for `jq`-based triage: identity, timing, exit.
//
// Wire shape per line:
// { traceId, spanId, parentSpanId, name, startedAt, durationMs,
// attributes, events[], exit: { _tag: 'Success'|'Failure', reason? } }
//
// Rotation: single level — at 50MB the file is renamed to `.1` and a
// fresh file is opened. The previous `.1` (if any) is overwritten. We
// keep at most one prior generation; deeper retention belongs to the
// operator's normal disk-management tooling.
//
// Override path with OLAM_TRACE_LOG_PATH (set to /dev/null in tests that
// don't care about file output, or to a temp file to assert on writes).
import { open, mkdir, rename } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { homedir } from 'node:os';
import { redactSensitive } from './redactor.mjs';
const DEFAULT_ROTATE_BYTES = 50 * 1024 * 1024;
const DEFAULT_LOG_PATH =
  process.env.OLAM_TRACE_LOG_PATH ??
  join(homedir(), '.olam', 'logs', 'host.trace.ndjson');

/**
 * Create an NDJSON span sink that appends one JSON line per span to `logPath`.
 *
 * Writes are serialized through an internal promise chain so lines never
 * interleave. Once the running byte count reaches `rotateBytes`, the file is
 * renamed to `${logPath}.1` (replacing any previous generation) and a fresh
 * file is opened.
 *
 * @param {object} [options]
 * @param {string} [options.logPath] Target file; missing parent directories are created.
 * @param {number} [options.rotateBytes] Size threshold (bytes) that triggers rotation.
 * @param {{ addSink: Function }} [options.hostStream] Optional broadcaster. When it
 *   exposes `addSink`, an SSE adapter is attached and every `span` event is recorded.
 * @returns {Promise<{
 *   recordSpan: (span?: object) => Promise<void>,
 *   close: () => Promise<void>,
 * }>} `recordSpan` resolves once the line has been written (it never rejects);
 *   `close` drains pending writes and releases the file handle.
 */
export async function createNdjsonSpanSink({
  logPath = DEFAULT_LOG_PATH,
  rotateBytes = DEFAULT_ROTATE_BYTES,
  hostStream,
} = {}) {
  await mkdir(dirname(logPath), { recursive: true });
  let fh = await open(logPath, 'a');
  let bytesWritten = (await fh.stat()).size;
  let closed = false;
  let chain = Promise.resolve();

  // Move the current file to `.1` and continue on a fresh handle.
  // The reopen runs in `finally`: if the rename fails (permissions, `.1` is a
  // directory, ...) the sink must keep a usable handle. Otherwise every later
  // write would hit the closed handle and be dropped by the catch in
  // `recordSpan`. The counter is re-read from disk so a failed rotation is
  // retried on the next write.
  async function rotate() {
    await fh.close();
    try {
      await rename(logPath, `${logPath}.1`);
    } finally {
      fh = await open(logPath, 'a');
      bytesWritten = (await fh.stat()).size;
    }
  }

  async function writeLine(line) {
    if (closed) return;
    await fh.write(line);
    bytesWritten += Buffer.byteLength(line);
    if (bytesWritten >= rotateBytes) {
      await rotate();
    }
  }

  function recordSpan(span = {}) {
    const {
      name, startedAt, endedAt, attributes, events, exit,
      traceId, spanId, parentSpanId, reason,
    } = span;
    const haveTimes = typeof endedAt === 'number' && typeof startedAt === 'number';
    const durationMs = haveTimes ? endedAt - startedAt : null;

    // Exit resolution: an explicit Success/Failure tag wins; otherwise a span
    // without both timestamps is treated as a Failure, and a timed span as a
    // Success.
    let finalExit;
    if (exit && typeof exit === 'object' && (exit._tag === 'Success' || exit._tag === 'Failure')) {
      finalExit = exit._tag === 'Failure' && exit.reason !== undefined
        ? { _tag: 'Failure', reason: exit.reason }
        : { _tag: exit._tag };
    } else if (!haveTimes) {
      finalExit = reason !== undefined ? { _tag: 'Failure', reason } : { _tag: 'Failure' };
    } else {
      finalExit = { _tag: 'Success' };
    }

    const record = {
      traceId: traceId ?? null,
      spanId: spanId ?? null,
      parentSpanId: parentSpanId ?? null,
      name: name ?? null,
      startedAt: startedAt ?? null,
      durationMs,
      attributes: redactSensitive(attributes ?? {}),
      events: redactSensitive(events ?? []),
      exit: finalExit,
    };

    // Deliberately best-effort: a failed write must never break the caller
    // that emitted the span, so rejections are absorbed here.
    const next = chain.then(() => writeLine(`${JSON.stringify(record)}\n`)).catch(() => {});
    chain = next;
    return next;
  }

  let detach = null;
  if (hostStream && typeof hostStream.addSink === 'function') {
    detach = hostStream.addSink(createSseSpanAdapter((payload) => recordSpan(payload)));
  }

  return {
    recordSpan,
    async close() {
      if (closed) return;
      if (typeof detach === 'function') detach();
      // Drain queued writes BEFORE flipping the closed flag — `writeLine`
      // bails on `closed`, so flipping first would silently drop spans
      // recorded just prior to shutdown.
      await chain;
      closed = true;
      try { await fh.close(); } catch { /* already closed */ }
    },
  };
}
/**
 * Subscribe an NDJSON sink to `@olam/auth-client`'s `betaResponseEmitter`.
 * Each `beta-response` event becomes a `withCredential.beta-response` span
 * with the beta payload exploded onto `attributes` — downstream `jq`
 * consumers can query e.g.
 *
 *   jq 'select(.name == "withCredential.beta-response")
 *       | {ts: .startedAt, cred: .attributes.credentialName,
 *          cache: .attributes.cacheStatus,
 *          thinking: .attributes.thinkingTokens,
 *          latencyMs: .durationMs}' ~/.olam/logs/host.trace.ndjson
 *
 * Wire is opt-in (call from server boot). Returns a detach function so the
 * subscription can be removed in tests or on shutdown.
 *
 * Pure additive: spans flowing from other sources (docker lifecycle,
 * plan-orchestrator, etc.) are unaffected.
 *
 * @param {object} options
 * @param {{ recordSpan: Function }} options.sink Sink that receives the span.
 * @param {{ on: Function, off?: Function, removeListener?: Function }} options.emitter
 *   Event source; must expose `on`.
 * @returns {() => unknown} Detach function that removes the listener.
 * @throws {Error} When `sink.recordSpan` or `emitter.on` is missing.
 */
export function attachBetaResponseEvents({ sink, emitter }) {
  if (!sink || typeof sink.recordSpan !== 'function') {
    throw new Error('attachBetaResponseEvents: sink.recordSpan required');
  }
  if (!emitter || typeof emitter.on !== 'function') {
    throw new Error('attachBetaResponseEvents: emitter.on required');
  }

  const handler = (info) => {
    const now = Date.now();
    // `typeof NaN === 'number'`, so a type check alone would let NaN/Infinity
    // (or a negative value) through and yield a NaN or future `startedAt`.
    // Only a finite, non-negative latency is trusted; anything else counts as 0.
    const reported = info?.latencyMs;
    const latency = Number.isFinite(reported) && reported >= 0 ? reported : 0;
    sink.recordSpan({
      name: 'withCredential.beta-response',
      startedAt: now - latency,
      endedAt: now,
      attributes: {
        credentialName: info?.credentialName ?? null,
        credId: info?.credId ?? null,
        // Copy so later mutation of the event payload cannot alter the span.
        betas: Array.isArray(info?.betas) ? [...info.betas] : [],
        cacheStatus: info?.cacheStatus ?? null,
        thinkingTokens: info?.tokenCounts?.thinking ?? null,
        statusCode: typeof info?.statusCode === 'number' ? info.statusCode : null,
        extraHeaders: info?.extraHeaders && typeof info.extraHeaders === 'object'
          ? { ...info.extraHeaders }
          : {},
      },
      exit: { _tag: 'Success' },
    });
  };

  emitter.on('beta-response', handler);

  // Only `on` is validated above, so do not assume `off` exists: fall back to
  // `removeListener` for emitters that only expose the older name.
  return () => {
    if (typeof emitter.off === 'function') return emitter.off('beta-response', handler);
    if (typeof emitter.removeListener === 'function') {
      return emitter.removeListener('beta-response', handler);
    }
    return undefined;
  };
}
// Minimal stand-in for a ServerResponse that host-stream's `addSink` can write
// to. Incoming chunks are buffered and cut into SSE frames at each blank line
// (`event: <type>\ndata: <json>\n\n`). Frames carrying `event: span` have the
// JSON of their first `data: ` line parsed and handed to `onSpan`; every other
// frame type is dropped without comment. Parse errors, and errors thrown by
// `onSpan` itself, are swallowed so one bad frame cannot break the stream.
function createSseSpanAdapter(onSpan) {
  const FRAME_END = '\n\n';
  const DATA_PREFIX = 'data: ';
  let pending = '';

  const isSpanFrame = (frame) =>
    frame.startsWith('event: span\n') || frame.includes('\nevent: span\n');

  const dispatch = (frame) => {
    const dataLine = frame.split('\n').find((line) => line.startsWith(DATA_PREFIX));
    if (dataLine === undefined) return;
    try {
      onSpan(JSON.parse(dataLine.slice(DATA_PREFIX.length)));
    } catch {
      /* malformed frame */
    }
  };

  return {
    writableEnded: false,
    destroyed: false,
    write(chunk) {
      pending += String(chunk);
      for (
        let cut = pending.indexOf(FRAME_END);
        cut !== -1;
        cut = pending.indexOf(FRAME_END)
      ) {
        const frame = pending.slice(0, cut);
        pending = pending.slice(cut + FRAME_END.length);
        if (isSpanFrame(frame)) dispatch(frame);
      }
      // Always report "not backpressured".
      return true;
    },
    once() {
      /* no drain handling needed — in-memory adapter never backpressures */
    },
    end() {
      this.writableEnded = true;
    },
  };
}
#!/usr/bin/env bash
# prom-no-double-grafana.sh — Phase C Task C1 e2e smoke test.
#
# Verifies:
# 1. kube-prometheus-stack installs (Prometheus pod becomes Ready).
# 2. ServiceMonitor CRD is Established before Phase B charts are upgraded.
# 3. Phase B charts (Loki + Promtail + Grafana) are helm-upgraded to pick up
# serviceMonitor.enabled: true now that the CRD exists.
# 4. Exactly one Grafana Deployment is running in the cluster (no double-Grafana).
# 5. Phase B's Grafana (olam-grafana) has exactly one Prometheus datasource
# provisioned (from grafana-values.yaml datasources block added in C1).
# 6. Prometheus is scraping at least one active target.
#
# Pre-conditions:
# - kubectl context is set to a live k8s cluster.
# - Phase B e2e (loki-ingest.sh + grafana-port-forward.sh + grafana-dashboard-persistence.sh)
# has already run: olam-loki, olam-promtail, and olam-grafana releases are installed.
# - The olam-grafana-admin Secret exists (created by grafana-port-forward.sh).
# - helm, kubectl, curl, jq binaries available.
#
# Chart: prometheus-community/kube-prometheus-stack 85.2.0 (pinned; latest stable 2026-05-21).
#
# Idempotency: helm upgrade --install is idempotent; re-runs on an existing
# cluster succeed. Port-forwards are killed on exit via trap.
#
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C1
set -euo pipefail
NAMESPACE="monitoring"
PROM_RELEASE="olam-prom"
PROM_CHART_VERSION="85.2.0"
GRAFANA_RELEASE="olam-grafana"
GRAFANA_LOCAL_PORT="3001" # avoid collision if phase-b-e2e left a port-forward on 3000
GRAFANA_SVC_PORT="80"
PROM_LOCAL_PORT="9090"
PF_BIND_SECONDS=5
# Print a progress message on stderr, prefixed with the script tag.
log() {
  printf '[prom-no-double-grafana] %s\n' "$*" >&2
}

# Print a FAIL message on stderr, then abort with exit status 1.
fail() {
  printf '[prom-no-double-grafana] FAIL: %s\n' "$*" >&2
  exit 1
}
# -------------------------------------------------------------------------
# Resolve repo root so helm -f paths work regardless of invocation cwd
# -------------------------------------------------------------------------
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
# When invoked from a published @pleri/olam-cli install (no monorepo), `olam
# setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
# peripheral-services/{helm-values,manifests} directory is reachable.
# Monorepo callers leave it unset; the script falls back to the source dir
# under packages/peripheral-services/.
# Pick the peripheral-services directory: the bundled copy when
# OLAM_BUNDLE_ROOT is set to a non-empty value, the monorepo source otherwise.
case "${OLAM_BUNDLE_ROOT:-}" in
  "")
    PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
    ;;
  *)
    PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
    ;;
esac
# -------------------------------------------------------------------------
# Cleanup trap — kill port-forwards on exit; leave Helm releases in place
# -------------------------------------------------------------------------
# PIDs of the background port-forwards; empty until the matching step starts one.
GRAFANA_PF_PID=""
PROM_PF_PID=""

# EXIT handler: stop whichever port-forwards were started. A PID that is
# empty or already gone is ignored, so the handler always succeeds.
cleanup() {
  local pid
  for pid in "$GRAFANA_PF_PID" "$PROM_PF_PID"; do
    if [[ -n "$pid" ]]; then
      kill "$pid" 2>/dev/null || true
    fi
  done
}
trap cleanup EXIT
# -------------------------------------------------------------------------
# Pre-flight
# -------------------------------------------------------------------------
command -v helm >/dev/null 2>&1 || fail "helm not installed"
command -v kubectl >/dev/null 2>&1 || fail "kubectl not installed"
command -v curl >/dev/null 2>&1 || fail "curl not installed"
command -v jq >/dev/null 2>&1 || fail "jq not installed"
kubectl cluster-info >/dev/null 2>&1 || fail "kubectl: no reachable cluster; set KUBECONFIG"
log "pre-flight checks passed"
# Verify Phase B pre-conditions
for release in olam-loki olam-promtail "$GRAFANA_RELEASE"; do
helm status "$release" -n "$NAMESPACE" >/dev/null 2>&1 \
|| fail "Phase B release '$release' not found in namespace $NAMESPACE — run phase-b-e2e first"
done
log "Phase B pre-conditions satisfied (olam-loki, olam-promtail, olam-grafana releases found)"
# -------------------------------------------------------------------------
# Step 1: Add prometheus-community repo and install kube-prometheus-stack
# -------------------------------------------------------------------------
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
helm repo update prometheus-community
log "installing prometheus-community/kube-prometheus-stack ($PROM_RELEASE) version $PROM_CHART_VERSION"
helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stack \
--version "$PROM_CHART_VERSION" \
--namespace "$NAMESPACE" \
--create-namespace \
-f "$PERIPHERAL_SERVICES_DIR/helm-values/kube-prom-stack-values.yaml" \
--wait \
--timeout "${OLAM_HELM_TIMEOUT:-600s}"
log "kube-prometheus-stack helm install complete"
# -------------------------------------------------------------------------
# Step 2: Wait for ServiceMonitor CRD to be Established
# This is the gate before upgrading Phase B charts — the CRD must exist
# for serviceMonitor.enabled: true to produce a valid ServiceMonitor object.
# -------------------------------------------------------------------------
log "waiting for ServiceMonitor CRD to be Established (60s)"
kubectl wait \
--for=condition=established \
crd/servicemonitors.monitoring.coreos.com \
--timeout=60s
log "ServiceMonitor CRD Established"
# -------------------------------------------------------------------------
# Step 3: Helm-upgrade Phase B charts to enable ServiceMonitor at RUNTIME
#
# The source-of-truth values files keep serviceMonitor.enabled: false so a
# standalone Phase B install (without kube-prometheus-stack) does not
# hard-fail with "no matches for kind ServiceMonitor". We flip the toggle
# at runtime here, AFTER the CRD is Established, via --set overrides. This
# preserves Phase B's standalone-installability invariant while wiring
# Prometheus discovery when kube-prom-stack is present.
#
# NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor)
# — chart-version-specific path.
# -------------------------------------------------------------------------
# Chart version pins MUST match the ones in phase-b-e2e's loki-ingest.sh +
# grafana-port-forward.sh. Without --version, helm pulls latest from the repo;
# the latest charts may reference new template values not present in our
# values files (e.g., Loki 6.8.x references .Values.loki.ui.enabled which is
# nil in our 6.7.4-shaped values, producing a nil-pointer template error
# during upgrade).
LOKI_CHART_VERSION="6.7.4"
PROMTAIL_CHART_VERSION="6.16.6"
GRAFANA_CHART_VERSION="8.5.2"
log "upgrading Phase B charts with runtime --set serviceMonitor.enabled=true (pinned versions)"
helm upgrade olam-loki grafana/loki \
--version "$LOKI_CHART_VERSION" \
--namespace "$NAMESPACE" \
-f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
--wait \
--timeout "${OLAM_HELM_TIMEOUT:-600s}" \
--reuse-values \
--set monitoring.serviceMonitor.enabled=true
log "olam-loki upgraded (ServiceMonitor enabled)"
helm upgrade olam-promtail grafana/promtail \
--version "$PROMTAIL_CHART_VERSION" \
--namespace "$NAMESPACE" \
-f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
--wait \
--timeout "${OLAM_HELM_TIMEOUT:-600s}" \
--reuse-values \
--set serviceMonitor.enabled=true
log "olam-promtail upgraded (ServiceMonitor enabled)"
helm upgrade "$GRAFANA_RELEASE" grafana/grafana \
--version "$GRAFANA_CHART_VERSION" \
--namespace "$NAMESPACE" \
-f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
--wait \
--timeout "${OLAM_HELM_TIMEOUT:-600s}" \
--reuse-values \
--set serviceMonitor.enabled=true
log "$GRAFANA_RELEASE upgraded (ServiceMonitor enabled; Prometheus datasource provisioned)"
# -------------------------------------------------------------------------
# Step 4: Wait for Prometheus pod Ready
# -------------------------------------------------------------------------
log "waiting for Prometheus pod Ready (300s)"
kubectl wait \
--for=condition=ready pod \
-l "app.kubernetes.io/name=prometheus" \
-n "$NAMESPACE" \
--timeout=300s
log "Prometheus pod Ready"
# -------------------------------------------------------------------------
# Step 5: Assertion — exactly one Grafana Deployment in the cluster
# This catches any regression where kube-prometheus-stack's bundled Grafana
# sub-chart accidentally gets enabled.
# -------------------------------------------------------------------------
log "asserting exactly 1 Grafana Deployment in namespace $NAMESPACE"
GRAFANA_DEPS=$(kubectl get deployment \
-n "$NAMESPACE" \
-l "app.kubernetes.io/name=grafana" \
-o name \
| wc -l \
| tr -d ' ')
if [ "$GRAFANA_DEPS" != "1" ]; then
log "FAIL: expected exactly 1 Grafana Deployment, found $GRAFANA_DEPS"
kubectl get deployment -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" >&2
fail "double-Grafana detected — kube-prometheus-stack's grafana.enabled must be false"
fi
log "PASS: exactly 1 Grafana Deployment found"
# -------------------------------------------------------------------------
# Step 6: Assertion — Grafana has exactly one Prometheus datasource
# Re-read the admin password from the Secret (grafana-port-forward.sh created it).
# Use port 3001 to avoid colliding with any live phase-b-e2e port-forward on 3000.
# -------------------------------------------------------------------------
log "reading admin password from Secret olam-grafana-admin"
GRAFANA_ADMIN_PW=$(kubectl get secret olam-grafana-admin \
-n "$NAMESPACE" \
-o jsonpath='{.data.admin-password}' \
| base64 -d)
log "port-forwarding svc/$GRAFANA_RELEASE $GRAFANA_LOCAL_PORT:$GRAFANA_SVC_PORT"
kubectl port-forward \
-n "$NAMESPACE" \
"svc/$GRAFANA_RELEASE" \
"${GRAFANA_LOCAL_PORT}:${GRAFANA_SVC_PORT}" &
GRAFANA_PF_PID=$!
log "waiting ${PF_BIND_SECONDS}s for Grafana port-forward to bind"
sleep "$PF_BIND_SECONDS"
kill -0 "$GRAFANA_PF_PID" 2>/dev/null \
|| fail "Grafana port-forward process exited prematurely"
# Fetch Grafana's datasource list through the port-forward and require exactly
# one entry of type "prometheus".
log "asserting exactly 1 Prometheus datasource in Grafana (GET /api/datasources)"
# NOTE: `fail` runs inside the $(...) subshell here, so its `exit 1` only ends
# the command substitution. The script itself stops because the assignment
# then has a non-zero status and `set -e` is active. On failure the last 30
# Grafana log lines are written to stderr first (best-effort).
DATASOURCES=$(curl -sf \
-u "admin:${GRAFANA_ADMIN_PW}" \
"http://localhost:${GRAFANA_LOCAL_PORT}/api/datasources" \
|| { kubectl logs -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" --tail=30 >&2 || true
fail "GET /api/datasources failed — Grafana not reachable on port $GRAFANA_LOCAL_PORT"; })
# `jq -e` exits non-zero when the filter result is false, i.e. when the number
# of datasources whose type is "prometheus" is not exactly 1.
if ! echo "$DATASOURCES" | jq -e 'map(select(.type == "prometheus")) | length == 1' >/dev/null 2>&1; then
log "FAIL: Grafana does not have exactly 1 Prometheus datasource"
echo "$DATASOURCES" | jq . >&2
fail "Prometheus datasource not provisioned — check datasources block in grafana-values.yaml"
fi
# The datasource URL is extracted only for the PASS log line below. PROM_URL
# is reassigned later in this script to the local Prometheus port-forward URL.
PROM_URL=$(echo "$DATASOURCES" | jq -r 'map(select(.type == "prometheus")) | .[0].url')
log "PASS: Grafana has exactly 1 Prometheus datasource (url=$PROM_URL)"
# -------------------------------------------------------------------------
# Step 7: Assertion — Prometheus is scraping at least one active target
# -------------------------------------------------------------------------
log "port-forwarding svc/prometheus-operated $PROM_LOCAL_PORT:9090"
kubectl port-forward \
-n "$NAMESPACE" \
"svc/prometheus-operated" \
"${PROM_LOCAL_PORT}:9090" &
PROM_PF_PID=$!
log "waiting ${PF_BIND_SECONDS}s for Prometheus port-forward to bind"
sleep "$PF_BIND_SECONDS"
kill -0 "$PROM_PF_PID" 2>/dev/null \
|| fail "Prometheus port-forward process exited prematurely"
log "querying Prometheus /api/v1/targets for active targets"
TARGETS=$(curl -sf "http://localhost:${PROM_LOCAL_PORT}/api/v1/targets" \
|| fail "GET /api/v1/targets failed — Prometheus not reachable on port $PROM_LOCAL_PORT")
ACTIVE=$(echo "$TARGETS" | jq '.data.activeTargets | length')
if [ "$ACTIVE" -lt 1 ]; then
log "FAIL: Prometheus has 0 active scrape targets"
echo "$TARGETS" | jq '.data.activeTargets' >&2
fail "Prometheus has no active targets — check ServiceMonitor CRD and scrapeConfig"
fi
log "PASS: $ACTIVE active scrape target(s) found in Prometheus"
# -------------------------------------------------------------------------
# Assertion C4: Recording rules from 95-prom-recording-rules.yaml are loaded
#
# The 9[0-9]-prom-* glob in apply-manifests.sh skips this file (requires
# kube-prom-stack CRDs to exist). We kubectl apply it here, then poll
# /api/v1/rules until the olam-http-aggregations group appears.
# The port-forward on PROM_LOCAL_PORT is already open from Step 7 above.
# -------------------------------------------------------------------------
PROM_URL="http://localhost:${PROM_LOCAL_PORT}"
log "applying 95-prom-recording-rules.yaml (skipped by apply-manifests due to 9[0-9]-prom-* filter)"
kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/95-prom-recording-rules.yaml"
# Prometheus operator reconcile + config reload can take ~60-90s (C2 lesson).
# Poll /api/v1/rules until our group appears (up to 180s).
RECORDING_RULES_TIMEOUT=180
log "polling ${PROM_URL}/api/v1/rules for olam-http-aggregations group (up to ${RECORDING_RULES_TIMEOUT}s)"
elapsed=0
while [ "$elapsed" -lt "$RECORDING_RULES_TIMEOUT" ]; do
if curl -sf "${PROM_URL}/api/v1/rules" 2>/dev/null \
| jq -e '.data.groups[] | select(.name == "olam-http-aggregations") | .rules[] | select(.name == "olam:http_requests:rate5m_by_service")' >/dev/null 2>&1; then
log "PASS: olam-http-aggregations rule group loaded after ${elapsed}s"
break
fi
sleep 10
elapsed=$((elapsed + 10))
done
if [ "$elapsed" -ge "$RECORDING_RULES_TIMEOUT" ]; then
log "FAIL: olam-http-aggregations rule group not found in /api/v1/rules within ${RECORDING_RULES_TIMEOUT}s"
curl -sf "${PROM_URL}/api/v1/rules" | jq '.data.groups[] | .name' >&2 || true
fail "PrometheusRule not loaded by operator"
fi
# -------------------------------------------------------------------------
# Final
# -------------------------------------------------------------------------
log "PASS: kube-prometheus-stack installed; single Grafana confirmed; Prometheus datasource provisioned; $ACTIVE active target(s); recording rules loaded — Tasks C1+C4 verified"
exit 0
// Privacy Guard — regex-based auto-redactor for trace + recovery ledger.
//
// Deep-walks an object, finds string values, applies an ordered list of
// regex patterns, returns a redacted COPY (immutable; input untouched).
// Each match is replaced with `<redacted:<kind>>`.
//
// Default-ON patterns (7): anthropic, openai, aws, gh-pat, jwt, bearer, slack.
// Opt-in (env-gated): email PII (OLAM_REDACT_PII=1), high-entropy strings
// (OLAM_REDACT_HIGH_ENTROPY=1). Hard short-circuit: OLAM_REDACTION_DISABLED=1.
//
// Precedence matters: anthropic runs before openai (otherwise the OpenAI
// `sk-...` regex would steal `sk-ant-...` and emit the wrong tag). Bearer
// runs after the high-specificity key patterns so a bearer-wrapped key
// gets the tighter tag.
// Always-on patterns, applied top to bottom. An entry may carry its own
// `replacement`; otherwise the match becomes `<redacted:<kind>>`.
const DEFAULT_PATTERNS = [
  { kind: 'anthropic-key', re: /\bsk-ant-(?:api|admin)[A-Za-z0-9_-]{20,}\b/g },
  { kind: 'openai-key', re: /\bsk-(?:proj-)?[A-Za-z0-9_-]{32,}\b/g },
  { kind: 'aws-key', re: /\bAKIA[A-Z0-9]{16}\b/g },
  { kind: 'gh-token', re: /\bgh[poursa]_[A-Za-z0-9_]{36,}\b/g },
  { kind: 'jwt', re: /\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/g },
  { kind: 'slack-token', re: /\bxox[abposr]-[A-Za-z0-9-]{10,}\b/g },
  { kind: 'bearer', re: /Bearer\s+[A-Za-z0-9._~+/-]+=*/gi, replacement: 'Bearer <redacted:bearer>' },
];

// Opt-in patterns, enabled through environment variables (see redactString).
const EMAIL_PATTERN = { kind: 'email', re: /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi };
const HIGH_ENTROPY_PATTERN = { kind: 'high-entropy', re: /\b[A-Z0-9_-]{32,}\b/g };
const HIGH_ENTROPY_ALLOWLIST = new Set(['UUID', 'CHUNK_ID', '__filename', '__dirname']);

/**
 * Redact secrets inside a single string.
 *
 * Returns the input untouched when OLAM_REDACTION_DISABLED=1. Otherwise runs
 * every default pattern in order, then the email pattern when
 * OLAM_REDACT_PII=1, then the high-entropy pattern when
 * OLAM_REDACT_HIGH_ENTROPY=1 (allowlisted matches and matches that already
 * start with `<redacted:` are kept as they are).
 *
 * @param {string} s
 * @returns {string}
 */
function redactString(s) {
  const { env } = process;
  if (env.OLAM_REDACTION_DISABLED === '1') return s;

  let redacted = DEFAULT_PATTERNS.reduce(
    (text, { kind, re, replacement }) => text.replace(re, replacement ?? `<redacted:${kind}>`),
    s,
  );

  if (env.OLAM_REDACT_PII === '1') {
    redacted = redacted.replace(EMAIL_PATTERN.re, `<redacted:${EMAIL_PATTERN.kind}>`);
  }

  if (env.OLAM_REDACT_HIGH_ENTROPY === '1') {
    redacted = redacted.replace(HIGH_ENTROPY_PATTERN.re, (match) => {
      const keep = HIGH_ENTROPY_ALLOWLIST.has(match) || /^<redacted:/.test(match);
      return keep ? match : `<redacted:${HIGH_ENTROPY_PATTERN.kind}>`;
    });
  }

  return redacted;
}
/**
 * Deep-walk `value`, redacting strings. Returns a new value; input is
 * never mutated. Primitives + null pass through unchanged (except strings,
 * which are run through `redactString`). Cycles produce `'<cycle>'`.
 *
 * When OLAM_REDACTION_DISABLED=1 the input is returned as-is (same
 * reference, no copy).
 *
 * @template T
 * @param {T} value
 * @returns {T}
 */
export function redactSensitive(value) {
  if (process.env.OLAM_REDACTION_DISABLED === '1') return value;
  return walk(value, new WeakSet());
}

/**
 * Recursive worker for `redactSensitive`.
 *
 * `ancestors` holds only the objects on the current path from the root. An
 * object is removed again once its subtree is finished, so the same object
 * referenced from two sibling properties is copied twice instead of being
 * mislabelled as a cycle. Only a true back-reference to an ancestor yields
 * `'<cycle>'`.
 *
 * Arrays are mapped element by element; any other object is rebuilt as a
 * plain object from its own enumerable string keys.
 *
 * @param {unknown} value
 * @param {WeakSet<object>} ancestors objects currently being walked
 * @returns {unknown} redacted copy
 */
function walk(value, ancestors) {
  if (typeof value === 'string') return redactString(value);
  if (value === null || typeof value !== 'object') return value;
  if (ancestors.has(value)) return '<cycle>';

  ancestors.add(value);
  try {
    if (Array.isArray(value)) return value.map((v) => walk(v, ancestors));
    const out = {};
    for (const k of Object.keys(value)) {
      out[k] = walk(value[k], ancestors);
    }
    return out;
  } finally {
    // Leaving this subtree: the object is no longer an ancestor.
    ancestors.delete(value);
  }
}
// Trace summary — operator triage digest over the NDJSON span trace.
//
// The NDJSON span sink (see `ndjson-span-sink.mjs`) writes one JSON line
// per span to ~/.olam/logs/host.trace.ndjson. Operators triage it today
// with hand-typed `jq` one-liners (README § Observability): "longest 5
// spans", "all failed spans", "failure-kind tally". This module codifies
// those recipes into ONE digest so the common questions get one answer
// without remembering jq incantations.
//
// Design:
// - `summarizeSpans(spans, opts)` is PURE — no I/O. Given an array of
// parsed span records (the exact shape the sink writes) it returns a
// digest object. This is the unit-testable core.
// - `parseTrace(ndjsonText)` turns raw file bytes into { spans, skipped }.
// Malformed lines (truncated tail line, partial write mid-rotation)
// are COUNTED, never thrown — triage tooling must survive a corrupt
// line, not die on it.
// - `summarizeTraceFile(path, opts)` is the thin file-reading wrapper.
// - `formatDigest(digest)` renders a human-readable report for the CLI.
//
// Read-only + additive: this module never writes the trace, never changes
// the line schema. It only READS fields the sink already emits
// (durationMs, exit._tag, exit.reason, name, attributes.failureKind).
import { readFile } from 'node:fs/promises';
const DEFAULT_TOP_N = 5;
/**
 * Turn raw NDJSON trace text into parsed records, one per non-blank line.
 * Lines that are not valid JSON are counted in `skipped` rather than thrown,
 * so a half-written line (truncated tail, write straddling a rotation) never
 * aborts triage.
 *
 * @param {string} text raw file contents
 * @returns {{ spans: object[], skipped: number }}
 */
export function parseTrace(text) {
  const outcome = { spans: [], skipped: 0 };
  String(text)
    .split('\n')
    .forEach((rawLine) => {
      const candidate = rawLine.trim();
      if (candidate.length === 0) return;
      try {
        outcome.spans.push(JSON.parse(candidate));
      } catch {
        // Unparseable line: tally it and keep going.
        outcome.skipped += 1;
      }
    });
  return outcome;
}
// True only when the span's exit is tagged 'Failure'; a missing span or a
// missing exit counts as not failed.
function isFailure(span) {
  const tag = span?.exit?._tag;
  return tag === 'Failure';
}
/**
 * Compute a triage digest over parsed spans. Pure.
 *
 * `topN` caps both `slowest` and `recentFailures`. It is coerced to a
 * whole number; zero, negative, or non-numeric values yield empty lists for
 * both (previously `topN: 0` returned every failure in `recentFailures`
 * because `slice(-0)` is `slice(0)`).
 *
 * @param {object[]} spans parsed span records; a non-array is treated as empty
 * @param {{ topN?: number }} [opts]
 * @returns {{
 *   totalSpans: number,
 *   failures: number,
 *   successes: number,
 *   failureRate: number,
 *   slowest: object[],
 *   recentFailures: object[],
 *   failureReasons: { reason: string, count: number }[],
 *   failureKinds: { kind: string, count: number }[],
 *   byName: { name: string, count: number, failures: number, meanMs: number|null, maxMs: number|null }[],
 * }}
 */
export function summarizeSpans(spans, { topN = DEFAULT_TOP_N } = {}) {
  const list = Array.isArray(spans) ? spans : [];

  // Normalize once so both top-N lists use the same limit. `Number()` keeps
  // numeric strings working; NaN fails the `> 0` test and becomes 0.
  const requested = Math.trunc(Number(topN));
  const limit = requested > 0 ? requested : 0;

  const totalSpans = list.length;
  const failingSpans = list.filter(isFailure);
  const failures = failingSpans.length;
  const successes = totalSpans - failures;
  const failureRate = totalSpans === 0 ? 0 : failures / totalSpans;

  // Slowest spans by durationMs. Spans with a null duration (in-flight or
  // missing endedAt) are excluded — they carry no comparable cost signal.
  const timed = list.filter((s) => typeof s?.durationMs === 'number');
  const slowest = [...timed]
    .sort((a, b) => b.durationMs - a.durationMs)
    .slice(0, limit)
    .map(projectSpan);

  // Recent failures — the trace is append-only, so the last failures in
  // file order are the most recent. Take the tail, newest first. The explicit
  // zero guard is required because `slice(-0)` would return the whole array.
  const recentFailures = limit === 0
    ? []
    : failingSpans.slice(-limit).reverse().map(projectSpan);

  const failureReasons = tally(
    failingSpans,
    (s) => (s?.exit?.reason != null ? String(s.exit.reason) : '(no reason)'),
    'reason',
  );

  // failureKind is the world.lifecycle attribute the README already greps
  // for; surface it as a first-class tally regardless of span name so
  // recovery-relevant failures aggregate even when span names differ.
  const failureKinds = tally(
    list.filter((s) => s?.attributes?.failureKind != null),
    (s) => String(s.attributes.failureKind),
    'kind',
  );

  const byName = aggregateByName(list);

  return {
    totalSpans,
    failures,
    successes,
    failureRate,
    slowest,
    recentFailures,
    failureReasons,
    failureKinds,
    byName,
  };
}
// Reduce a span to the fixed set of fields the digest reports. Missing
// identifiers become null, non-numeric timings become null, and a present
// exit reason is stringified.
function projectSpan(s) {
  const numberOrNull = (value) => (typeof value === 'number' ? value : null);
  const rawReason = s?.exit?.reason;

  return {
    name: s?.name ?? null,
    traceId: s?.traceId ?? null,
    spanId: s?.spanId ?? null,
    durationMs: numberOrNull(s?.durationMs),
    startedAt: numberOrNull(s?.startedAt),
    reason: rawReason != null ? String(rawReason) : null,
  };
}
// Count how many spans map to each key produced by `keyFn`. Each result row
// is `{ count, [label]: key }`, where `label` names the key field (callers in
// this file pass `reason` or `kind`). Rows are ordered by count, highest
// first; ties keep first-seen order.
function tally(spans, keyFn, label) {
  const occurrences = new Map();
  for (const span of spans) {
    const key = keyFn(span);
    const seen = occurrences.get(key) ?? 0;
    occurrences.set(key, seen + 1);
  }

  const rows = Array.from(occurrences, ([key, count]) => ({ count, [label]: key }));
  const byCountDescending = (left, right) => right.count - left.count;
  return rows.sort(byCountDescending);
}
/**
 * Roll spans up by name. Each row reports how many spans carried the name,
 * how many of those failed, and the mean and max `durationMs` over the spans
 * that have a numeric duration (both null when none do). Spans without a
 * name are grouped under `(unnamed)`. Rows are ordered by count, busiest
 * first.
 */
function aggregateByName(spans) {
  const buckets = new Map();

  for (const span of spans) {
    const rawName = span?.name;
    const name = rawName != null ? String(rawName) : '(unnamed)';

    if (!buckets.has(name)) {
      buckets.set(name, { name, count: 0, failures: 0, durSum: 0, durCount: 0, maxMs: null });
    }
    const bucket = buckets.get(name);

    bucket.count += 1;
    if (isFailure(span)) bucket.failures += 1;

    // Only numeric durations feed the mean/max; everything else is skipped.
    const ms = span?.durationMs;
    if (typeof ms !== 'number') continue;
    bucket.durSum += ms;
    bucket.durCount += 1;
    bucket.maxMs = bucket.maxMs === null ? ms : Math.max(bucket.maxMs, ms);
  }

  const rows = Array.from(buckets.values(), (bucket) => ({
    name: bucket.name,
    count: bucket.count,
    failures: bucket.failures,
    meanMs: bucket.durCount === 0 ? null : bucket.durSum / bucket.durCount,
    maxMs: bucket.maxMs,
  }));
  rows.sort((left, right) => right.count - left.count);
  return rows;
}
/**
 * Load a trace file from disk and summarize it. When the file does not exist
 * the result is an empty digest flagged `missing: true`, so an operator with
 * no recorded spans sees a zero-state instead of an error. Any other read
 * error is rethrown unchanged.
 *
 * @param {string} path
 * @param {{ topN?: number }} [opts]
 */
export async function summarizeTraceFile(path, opts = {}) {
  let raw;
  try {
    raw = await readFile(path, 'utf8');
  } catch (err) {
    // Only "file not found" is an expected condition here.
    if (err?.code !== 'ENOENT') throw err;
    const emptyDigest = summarizeSpans([], opts);
    return { ...emptyDigest, skipped: 0, missing: true };
  }

  const parsed = parseTrace(raw);
  const digest = summarizeSpans(parsed.spans, opts);
  return { ...digest, skipped: parsed.skipped, missing: false };
}
/**
 * Format a millisecond duration for the report.
 *
 * - null / undefined / NaN → '—' (no comparable duration)
 * - rounds to under 1000 ms → whole milliseconds, e.g. `12ms`
 * - otherwise → seconds with two decimals, e.g. `1.23s`
 *
 * @param {number|null|undefined} ms
 * @returns {string}
 */
function fmtMs(ms) {
  if (ms == null || Number.isNaN(ms)) return '—';
  // Decide the unit from the rounded value so 999.5–999.99 renders as
  // `1.00s` rather than the out-of-range `1000ms`.
  const wholeMs = Math.round(ms);
  if (wholeMs >= 1000) return `${(ms / 1000).toFixed(2)}s`;
  return `${wholeMs}ms`;
}
/**
 * Turn a digest into the plain-text report printed by the CLI.
 *
 * The report is a header line, a one-line totals summary, and then one
 * section per non-empty list (slowest, recent failures, failure kinds,
 * failure reasons, per-name aggregate), each preceded by a blank line.
 * A digest flagged `missing` produces only the header and a short notice.
 *
 * @param {ReturnType<typeof summarizeSpans> & { skipped?: number, missing?: boolean, path?: string }} digest
 * @returns {string}
 */
export function formatDigest(digest) {
  const header = `Trace summary${digest.path ? ` (${digest.path})` : ''}`;
  if (digest.missing) {
    return [header, ' no trace file yet — nothing recorded.'].join('\n');
  }

  const out = [header];

  // Emit a blank separator, the heading, then one rendered line per row;
  // empty lists produce no output at all.
  const section = (rows, heading, render) => {
    if (!rows.length) return;
    out.push('');
    out.push(heading);
    for (const row of rows) out.push(render(row));
  };
  const traceSuffix = (item) => (item.traceId ? ` [${item.traceId}]` : '');

  const pct = (digest.failureRate * 100).toFixed(1);
  const skippedNote = digest.skipped ? ` · ${digest.skipped} malformed line(s) skipped` : '';
  out.push(
    ` ${digest.totalSpans} spans · ${digest.failures} failed (${pct}%) · ${digest.successes} ok${skippedNote}`,
  );

  section(
    digest.slowest,
    `Top ${digest.slowest.length} slowest:`,
    (s) => ` ${fmtMs(s.durationMs).padStart(7)} ${s.name ?? '(unnamed)'}${traceSuffix(s)}`,
  );
  section(
    digest.recentFailures,
    `Recent failures (${digest.recentFailures.length}):`,
    (f) => ` ${f.name ?? '(unnamed)'}: ${f.reason ?? '(no reason)'}${traceSuffix(f)}`,
  );
  section(
    digest.failureKinds,
    'Failure kinds:',
    (k) => ` ${String(k.count).padStart(4)} ${k.kind}`,
  );
  section(
    digest.failureReasons,
    'Failure reasons:',
    (r) => ` ${String(r.count).padStart(4)} ${r.reason}`,
  );
  section(
    digest.byName,
    'By span name (count · failures · mean · max):',
    (n) =>
      ` ${String(n.count).padStart(5)} · ${String(n.failures).padStart(4)}f · ${fmtMs(n.meanMs).padStart(7)} · ${fmtMs(n.maxMs).padStart(7)} ${n.name}`,
  );

  return out.join('\n');
}
# Grafana Helm values — k3s-ingress-observability Phase B Task B2
#
# STANDALONE grafana/grafana chart per OQ-p3-4 + Decision 16.
# - This is NOT the Grafana bundled with kube-prometheus-stack.
# - Phase C kube-prometheus-stack MUST set `grafana.enabled: false`
# explicitly to prevent a second Grafana Deployment from landing.
# - Port-forward only — NEVER expose via Traefik IngressRoute.
# See T7 in DESIGN.md: secret exfil mitigated by no ingress surface.
#
# Chart: grafana/grafana; pinned to 8.5.2 (latest stable as of 2026-05-20).
# Upgrade discipline: chart version is embedded in the e2e script comment.
# -------------------------------------------------------------------------
# Admin credentials — loaded from a pre-existing Secret, NOT from chart
# values. Secret is created by scripts/e2e/grafana-port-forward.sh before
# helm install, or by the operator following the procedure in
# packages/peripheral-services/manifests/README.md (§ "Grafana admin secret").
# The placeholder manifest (70-grafana-secret.yaml) was removed 2026-05-21
# (dogfood finding #4) because `kubectl apply` would overwrite the operator's
# pre-created Secret with the placeholder value.
# -------------------------------------------------------------------------
admin:
existingSecret: olam-grafana-admin
userKey: admin-user
passwordKey: admin-password
# -------------------------------------------------------------------------
# Service: ClusterIP only.
# Decision 16: port-forward only; never ingress-routed.
# Access: `kubectl port-forward -n monitoring svc/olam-grafana 3000:80`
# -------------------------------------------------------------------------
service:
type: ClusterIP
port: 80
# -------------------------------------------------------------------------
# Ingress: disabled.
# Decision 16 + OQ-p3-4: Grafana is never exposed via Traefik IngressRoute.
# Port-forward is the sole operator access path. Enabling ingress here would
# silently violate the access-control intent even if no IngressRoute manifest
# is committed.
# -------------------------------------------------------------------------
ingress:
enabled: false # Decision 16: port-forward only; never ingress-routed
# -------------------------------------------------------------------------
# Datasources: Loki (default) + Prometheus (added in Phase C Task C1).
#
# Dual-chart pattern:
# - kube-prometheus-stack (C1) provides Prometheus. Its bundled Grafana
# sub-chart is disabled (grafana.enabled: false in kube-prom-stack-values.yaml).
# - This standalone grafana/grafana chart (Phase B) is the only Grafana.
# - The Prometheus datasource URL points at `prometheus-operated`, which is
# the in-cluster Service that kube-prometheus-stack's Prometheus Operator
# creates for the managed Prometheus StatefulSet.
# - timeInterval: 15s matches the scrape interval in kube-prom-stack-values.yaml
# so Grafana's step calculation aligns with actual data granularity.
# - exemplarTraceIdDestinations.datasourceUid: tempo is harmless until Phase D
# adds Tempo; Grafana silently ignores unknown datasource UIDs.
#
# editable: false prevents accidental operator drift across sessions.
# -------------------------------------------------------------------------
datasources:
datasources.yaml:
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: http://olam-loki.monitoring.svc.cluster.local:3100
isDefault: true
editable: false
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus-operated.monitoring.svc.cluster.local:9090
isDefault: false
editable: false
jsonData:
timeInterval: 15s # matches scrape interval in kube-prom-stack-values.yaml
exemplarTraceIdDestinations:
- name: trace_id
datasourceUid: tempo # Phase D may add Tempo; harmless until then
# -------------------------------------------------------------------------
# Dashboard provisioner: file-based ConfigMap mount.
# B3 lands the olam-dashboards ConfigMap and the actual JSON files.
# B2 wires the loader so B3's ConfigMap is picked up automatically.
# -------------------------------------------------------------------------
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1
providers:
- name: olam-default
orgId: 1
folder: 'Olam'
type: file
disableDeletion: true
updateIntervalSeconds: 30
allowUiUpdates: false
options:
path: /var/lib/grafana/dashboards/olam-default
# Wire the volume mount — B3 creates this ConfigMap with the actual JSON.
# Grafana will warn "ConfigMap olam-dashboards not found" until B3 lands;
# this is benign and does not block Grafana startup.
dashboardsConfigMaps:
olam-default: olam-dashboards # B3 creates this ConfigMap
# -------------------------------------------------------------------------
# Resources: tuned for single-operator k3s (<256Mi idle typical).
# P2 acceptance criterion: <500MB idle / <1GB typical across full LGTM stack.
# -------------------------------------------------------------------------
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi # P2: keeps Grafana within its share of the LGTM RAM budget
# -------------------------------------------------------------------------
# Persistence: disabled for Phase B.
# Grafana state (dashboards, users) lives in ConfigMaps / values files.
# Phase C may enable a PV if fine-grained alert state or annotations
# accumulate. For now, stateless Grafana is simpler and matches S2.
# -------------------------------------------------------------------------
persistence:
enabled: false # S2: ConfigMap-mounted dashboards; no PV needed in Phase B
# -------------------------------------------------------------------------
# ServiceMonitor: Phase C Prometheus scrapes Grafana's /metrics endpoint.
# Disabled in Phase B: the ServiceMonitor CRD (monitoring.coreos.com/v1) is
# shipped by kube-prometheus-stack in Phase C. The earlier "enable now to
# avoid a Phase C helm upgrade" rationale was wrong — Phase C will need a
# helm upgrade anyway to wire Prometheus scrape targets. Flipping this on
# pre-CRD breaks the install on chart versions that hard-validate.
# -------------------------------------------------------------------------
serviceMonitor:
# Disabled in the source-of-truth values file so a standalone Phase B install
# (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
# The C1 e2e script flips this on at RUNTIME via
# helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
# AFTER kube-prom-stack has installed the ServiceMonitor CRD.
enabled: false
# -------------------------------------------------------------------------
# Grafana.ini overrides: anonymous access stays disabled (default).
# - server.root_url: set so port-forward URLs render correctly in email /
#   share links (cosmetic; not a security seam).
# - analytics: usage reporting and update checks are turned off.
# - security.allow_embedding: false.
# -------------------------------------------------------------------------
grafana.ini:
server:
root_url: "%(protocol)s://%(domain)s:%(http_port)s/"
analytics:
reporting_enabled: false # no telemetry to grafana.com
check_for_updates: false
security:
allow_embedding: false
# kube-prometheus-stack Helm values — k3s-ingress-observability Phase C Task C1
#
# Chart: prometheus-community/kube-prometheus-stack; pinned to 85.2.0
# (latest stable as of 2026-05-21).
# Upgrade discipline: pin in this file + e2e script comment must stay in sync.
#
# CRITICAL: grafana.enabled MUST stay false.
# Phase B ships a standalone grafana/grafana chart (olam-grafana release).
# kube-prometheus-stack's bundled Grafana sub-chart is disabled to prevent
# a second Grafana Deployment from landing in the cluster.
# Decision 16 + OQ-p3-4: Phase B's standalone Grafana is canonical.
# Enabling the sub-chart here would violate that decision and create two
# Grafana instances — caught by prom-no-double-grafana.sh's single-Grafana
# assertion.
#
# Resource budget summary (Phase C contribution to P2 target <500MB idle / <1GB typical):
# prometheus-operator: 128Mi req / 512Mi limit
# prometheus: 512Mi req / 2Gi limit
# node-exporter: 64Mi req / 128Mi limit
# kube-state-metrics: 128Mi req / 256Mi limit
# Total C1 addition: ~832Mi req / ~3Gi limit (spread across nodes)
#
# Retention policy (Decision 14): scrape 15s / retention 15d / size cap 10GiB.
# The size cap (T10 TSDB corruption mitigation) is the hard guard; retention 15d
# is advisory — the size cap enforces first.
#
# Alertmanager: enabled as of C2. C1 originally shipped without it; C2 landed
# the first alert rule (OlamActiveSeriesHigh, cardinality 80k) and flipped
# alertmanager.enabled to true. No receivers are configured in this file.
# -------------------------------------------------------------------------
# CARDINALITY ENFORCEMENT — Task C2 (T1 cardinality bomb / P4 <100k active series)
#
# Goal: strip high-cardinality labels (world_id, trace_id, user_id,
# request_id, operator_id) from every scraped series BEFORE TSDB ingest.
#
# Architecture finding (helm template verified, 2026-05-21):
# The prometheus-operator Prometheus CR has NO global metricRelabelConfigs
# field. The Prometheus CR spec exposes only per-ServiceMonitor endpoint
# metricRelabelings. There is no chart-level "apply to all scrapes" slot.
#
# Enforcement strategy (two-layer):
# Layer 1 — chart-managed ServiceMonitors: set metricRelabelings on every
# ServiceMonitor the chart controls (alertmanager, coreDns, prometheusOperator,
# prometheus self-scrape, node-exporter). Belt-and-suspenders; these
# services don't emit world_id etc. in practice, but the rule is free.
# Note: kube-state-metrics sub-chart has no metricRelabelings slot in
# its prometheus.monitor section at chart version 85.2.0 — omitted.
# Layer 2 — user-deployed ServiceMonitors: the cardinality-drop.sh e2e
# script's synthetic violator ServiceMonitor carries the same labeldrop
# rule (release: olam-prom label + metricRelabelings). New services
# MUST include the same block — enforced by docs + code review.
#
# Why labeldrop is the right action:
# action: labeldrop removes the matched labels from ALL series that carry
# them, regardless of metric name. This is the same semantic as Promtail's
# pipeline drop stages (promtail-values.yaml) — both layers stay in sync.
# world_id surfaces in dashboards via EXEMPLARS (Decision 9), not labels.
#
# Regex covers all five taxonomy labels from observability-label-taxonomy:
# world_id, trace_id, user_id, request_id, operator_id
# -------------------------------------------------------------------------
_cardinalityLabeldrop: &cardinality-labeldrop
- action: labeldrop
regex: 'world_id|trace_id|user_id|request_id|operator_id'
# -------------------------------------------------------------------------
# HARD REQUIREMENT: grafana sub-chart is off.
# See top-of-file comment for rationale.
# -------------------------------------------------------------------------
grafana:
enabled: false # HARD: Decision 16 + OQ-p3-4 — standalone Grafana (olam-grafana) is canonical
# -------------------------------------------------------------------------
# Alertmanager: enabled since C2 landed the first alert rule.
# History: C1 shipped without alertmanager; C2 enabled it alongside that rule.
# -------------------------------------------------------------------------
alertmanager:
enabled: true # C2: first alert rule (OlamActiveSeriesHigh) lands; alertmanager enabled
serviceMonitor:
metricRelabelings: *cardinality-labeldrop
# -------------------------------------------------------------------------
# Default kube-controller-manager / scheduler / proxy / etcd monitors.
# These ServiceMonitors don't work on k3d/k3s because the endpoints are not
# exposed via the usual ports. Disabling avoids noisy "endpoint not found"
# warnings and scrape failures on every Prometheus eval cycle.
# -------------------------------------------------------------------------
kubeControllerManager:
enabled: false
kubeScheduler:
enabled: false
kubeProxy:
enabled: false
kubeEtcd:
enabled: false
# kube-apiserver and kubelet DO work on k3d but generate high-cardinality
# label combinations. Disable for now; re-evaluate when per-service /metrics
# (C3) and cardinality enforcement (C2) are in place.
kubeApiServer:
enabled: false
kubelet:
enabled: false
# -------------------------------------------------------------------------
# Default alerting rules: off.
# The bundled default rules generate Alertmanager receivers and PrometheusRule
# objects for kubelet, etcd, apiserver, etc. — most don't fire on k3d anyway
# and add noise before C2's focused cardinality rule lands.
# C2 will add targeted PrometheusRule objects separately.
# -------------------------------------------------------------------------
defaultRules:
create: false
# -------------------------------------------------------------------------
# coreDns — ServiceMonitor with labeldrop (Layer 1 cardinality enforcement)
# -------------------------------------------------------------------------
coreDns:
serviceMonitor:
metricRelabelings: *cardinality-labeldrop
# -------------------------------------------------------------------------
# CRDs: install via chart (default: true, explicit for clarity).
# These CRDs (ServiceMonitor, PodMonitor, PrometheusRule, etc.) are required
# before Phase B's loki/promtail/grafana charts can have serviceMonitor.enabled:true.
# Phase C's e2e script waits for servicemonitors.monitoring.coreos.com to be
# Established before helm-upgrading the Phase B charts.
# -------------------------------------------------------------------------
crds:
enabled: true
# -------------------------------------------------------------------------
# Prometheus Operator
# -------------------------------------------------------------------------
prometheusOperator:
enabled: true
serviceMonitor:
metricRelabelings: *cardinality-labeldrop
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
# -------------------------------------------------------------------------
# Prometheus core — Decision 14: scrape 15s / retention 15d / 10GiB cap
# -------------------------------------------------------------------------
prometheus:
serviceMonitor:
metricRelabelings: *cardinality-labeldrop
prometheusSpec:
scrapeInterval: 15s # Decision 14
evaluationInterval: 15s
retention: 15d # Decision 14 — advisory; size cap enforces first
retentionSize: 10GiB # Decision 14 — T10 TSDB corruption prevention
walCompression: true
enableAdminAPI: false # security: admin API allows snapshot deletion + series deletion
enableRemoteWriteReceiver: false # not a remote-write target; no inbound writes
logLevel: warn # info is noisy at 15s scrape cycle
resources:
requests:
cpu: 200m
memory: 512Mi
limits:
cpu: 1000m
memory: 2Gi
# PersistentVolume for TSDB. 12Gi = 10GiB retention cap + ~20% headroom.
# local-path provisioner is used on k3d; cloud providers use their default SC.
storageSpec:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 12Gi # 10GiB retention + 20% headroom for in-flight segments
# -------------------------------------------------------------------------
# Node exporter — keep enabled (host-level metrics: CPU, memory, disk, net).
# -------------------------------------------------------------------------
nodeExporter:
enabled: true
prometheus-node-exporter:
prometheus:
monitor:
metricRelabelings: *cardinality-labeldrop
resources:
requests:
cpu: 30m
memory: 64Mi
limits:
cpu: 100m
memory: 128Mi
# -------------------------------------------------------------------------
# kube-state-metrics — keep enabled (k8s-level metrics: pod phases, deployments).
# -------------------------------------------------------------------------
kubeStateMetrics:
enabled: true
kube-state-metrics:
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
# -------------------------------------------------------------------------
# Datasource auto-discovery note:
# kube-prometheus-stack's grafana.sidecar.datasources is N/A (grafana sub-chart
# is off). Phase B's standalone Grafana (grafana-values.yaml) has been updated
# in this same C1 PR to include a Prometheus datasource entry pointing at:
# http://prometheus-operated.monitoring.svc.cluster.local:9090
# This is the in-cluster Service that kube-prometheus-stack creates for the
# Prometheus StatefulSet (created by the Prometheus Operator from the
# Prometheus CR above).
# -------------------------------------------------------------------------
# Kyverno Helm values — k3s-ingress-observability Phase C C8 follow-up.
#
# Kyverno is the policy-as-code layer for cluster-wide cardinality
# enforcement (closes codex's C2 concern on PR #783). The companion
# ClusterPolicy in
# `packages/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml`
# mutates every incoming ServiceMonitor and PodMonitor to inject the
# labeldrop rule before the object is persisted — so a third-party
# chart (or hand-rolled object) cannot bypass the layer-2
# per-ServiceMonitor enforcement landed in C2.
#
# Chart: kyverno/kyverno; pinned to 3.8.1 (app v1.18.1, 2026-05-21 latest stable).
# Upgrade discipline: this pin AND the helm-install line in
# `scripts/e2e/kyverno-cardinality-mutate.sh` must stay in sync.
#
# Footprint posture (single-operator k3s scale):
# We only run admission-time mutation. The ClusterPolicy uses
# `spec.background: false`, so the background-scan controller is
# unused. Cleanup + reports controllers are also dead weight for
# a single ClusterPolicy with no PolicyExceptions — they're disabled
# so Kyverno's pod count stays minimal (1 pod, not 4).
#
# Footprint (Phase C C8 contribution to P2 target <500MB idle / <1GB typical):
# admissionController: 256Mi req / 512Mi limit (chart default 128Mi/384Mi)
# Total addition: ~256Mi req / ~512Mi limit
#
# If/when we want policy reports populated for observability dashboards,
# flip `reportsController.enabled: true` and the `features.policyReports`
# block below. Same for cleanup.
#
# Resource limits — tuned upward from chart default for admission webhook
# stability under burst churn (kube-prom-stack ships ~10 ServiceMonitors at
# once during `helm upgrade`, which arrives as a burst of AdmissionReviews).
# -------------------------------------------------------------------------
# Disable controllers we don't need
# -------------------------------------------------------------------------
backgroundController:
enabled: false # ClusterPolicy is admission-only (background: false)
cleanupController:
enabled: false # no CleanupPolicy objects in this repo
reportsController:
enabled: false # no policy-reports surface wired into Grafana yet
# -------------------------------------------------------------------------
# Features — admissionReports + policyReports remain ON inside the
# admission controller itself even when the standalone reports controller
# is disabled. This keeps `kubectl get clusterpolicyreport` queryable
# during dogfood; the reports controller would only AGGREGATE them
# cluster-wide, which we don't need yet.
# -------------------------------------------------------------------------
features:
admissionReports:
enabled: true
policyReports:
enabled: true
# Background scan is N/A — the policy uses background: false. Explicit
# off avoids the controller scheduling unnecessary scan workers even
# when the controller pod is disabled above.
backgroundScan:
enabled: false
# Logging volume defaults are fine; level 2 = info-ish.
logging:
format: text
verbosity: 2
# -------------------------------------------------------------------------
# Admission controller — the only pod we run.
# -------------------------------------------------------------------------
admissionController:
replicas: 1 # single-operator k3s scale; HA is N/A for dogfood
rbac:
create: true # ClusterPolicy needs cluster-wide watch on ServiceMonitor + PodMonitor
container:
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
# Loki Helm values — k3s-ingress-observability Phase B Task B1
#
# Single-binary mode (Decision-16 + Phase B scope):
# Distributed mode (microservices) adds 5+ independent Deployments + a Minio
# or S3 backend for object storage — pure overhead for a single-operator
# k3s install where Loki's write throughput is bounded by one Promtail
# DaemonSet and a handful of containers. SingleBinary collapses all roles
# (ingester, querier, compactor) into one Pod, fits within the <500MB idle
# LGTM RAM target (P2), and is trivially replaceable if scale demands change.
#
# See: docs/plans/k3s-ingress-observability/DESIGN.md (P2, S2)
#
# Chart: grafana/loki; pinned to 6.7.4 (latest stable as of 2026-05-20).
# Upgrade discipline: chart version is embedded in the e2e script comment.
deploymentMode: SingleBinary
loki:
auth_enabled: false # single-tenant; multi-tenancy adds header overhead with no benefit here
commonConfig:
replication_factor: 1 # single-binary; no replicas = no cross-replica consistency needed
# -------------------------------------------------------------------------
# Storage backend: filesystem (tsdb index, per schemaConfig below; local PV).
# Object storage (S3/GCS/MinIO) deferred to fatbox multi-org Phase F+.
# For single-operator k3s, local PV is simpler and sufficient.
# -------------------------------------------------------------------------
storage:
type: filesystem
schemaConfig:
configs:
- from: "2024-01-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
# -------------------------------------------------------------------------
# Retention: 7 days (168h) per Performance budget acceptance criterion #6.
# compactor.retention_enabled enables deletion; ring config required for
# single-binary mode.
# -------------------------------------------------------------------------
limits_config:
retention_period: 168h # 7 days
ingestion_rate_mb: 4 # per-tenant ingestion cap (single tenant)
ingestion_burst_size_mb: 8
max_query_series: 5000 # cap log-derived queries from going wide (P3 <3s p95)
max_entries_limit_per_query: 5000
compactor:
retention_enabled: true
delete_request_store: filesystem
compaction_interval: 10m
working_directory: /var/loki/compactor
ingester:
chunk_idle_period: 30m # flush to storage; appropriate for low write rate
chunk_retain_period: 1m
max_chunk_age: 2h
# Self-metrics endpoint — Phase C Prometheus scrapes this.
# Server block exposed on port 3100 (default); /metrics is always available.
singleBinary:
replicas: 1
# -------------------------------------------------------------------------
# Persistence: 10Gi PV.
#
# Rationale: 7-day retention at olam scale (<500 containers, access logs
# estimated 1–2MB/day compressed) → ~100MB typical stored. 10Gi gives ~100x
# headroom for burst (failed deploy loops, chatty containers), while typical
# usage stays well within the <1GB acceptance criterion #6. Cloud provider
# default SC is fine; on bare-metal k3s the local-path provisioner is used.
# -------------------------------------------------------------------------
persistence:
enabled: true
size: 10Gi # ~100× headroom over 7-day typical (~100MB); <1GB usage target per AC#6
# -------------------------------------------------------------------------
# Resources: memory limit 512Mi per task spec.
# Typical usage at olam scale: <200MB idle (boltdb index + block cache).
# 512Mi limit prevents compaction spikes from triggering OOM on the node.
# -------------------------------------------------------------------------
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi # P2: <500MB idle / <1GB typical; limit prevents spike OOM
# -------------------------------------------------------------------------
# Self-metrics for Phase C Prometheus scrape.
# The ServiceMonitor is declared here but left disabled; it is switched on in
# Phase C once the ServiceMonitor CRD exists (see the note under serviceMonitor).
monitoring:
selfMonitoring:
enabled: false # disables the bundled GrafanaAgent sub-chart dependency
grafanaAgent:
installOperator: false
serviceMonitor:
# Disabled in the source-of-truth values file so a standalone Phase B install
# (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
# The C1 e2e script flips this on at RUNTIME via
# helm upgrade ... --reuse-values --set monitoring.serviceMonitor.enabled=true
# AFTER kube-prom-stack has installed the ServiceMonitor CRD.
# NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor).
enabled: false
# -------------------------------------------------------------------------
# Backend and read/write gateway: disabled for SingleBinary mode.
# These are microservices-mode components and must be off or the chart
# emits validation errors when deploymentMode=SingleBinary.
# -------------------------------------------------------------------------
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
# Grafana agent / canary: not needed; disable to keep resource footprint minimal.
lokiCanary:
enabled: false
test:
enabled: false
# -------------------------------------------------------------------------
# Sub-component slimming — chart 6.7.4 defaults include nginx gateway +
# two Memcached clusters + minio + sidecar watchers that single-binary
# mode doesn't need. Each adds image-pull and Ready-wait time. Disabling
# all of them brings the install Ready-time within the harness budget.
# If a future scenario needs query-result caching, re-evaluate
# resultsCache specifically.
# -------------------------------------------------------------------------
# nginx routing front; Promtail writes direct to single-binary :3100
gateway:
enabled: false
# Memcached cluster — overhead for single-binary
chunksCache:
enabled: false
# second Memcached cluster — overhead for single-binary
resultsCache:
enabled: false
# minio is off because storage.type=filesystem, but be explicit
minio:
enabled: false
# Sidecar that watches ConfigMaps for runtime config reloads — we don't ship one.
sidecar:
rules:
enabled: false
datasources:
enabled: false
configs:
enabled: false
# Promtail Helm values — Phase A Task A5 staging (Phase B consumes)
#
# Tails every container's stdout; ships to Loki single-binary (Phase B installs Loki).
# Per OQ-p3-6: Traefik native config can redact HEADERS but NOT URL query params —
# query-param scrubbing for `?token=`, `?code=`, `?access_token=`, `?state=` happens
# HERE at Promtail ingest via pipeline_stages.replace regex.
#
# Resource limits per OQ-p3-37 (Promtail OOM risk under chatty container-cp 100ms cadence):
# - memory limit 256Mi
# - pipeline_stages.limit rate 100 lines/sec/stream
#
# Scrape config matches every pod log; namespace-scope labels are added so Loki LogQL queries
# can filter by service / namespace / pod.
#
# SECURITY NOTE — replace stage regex semantics (load-bearing):
# Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
# The `replace` field is a Go text/template string; `${1}` is NOT valid Go
# template syntax and silently becomes a literal. The correct pattern is:
# expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
# replace: 'REDACTED' — replace captured secret with literal
# See promtail-values.yaml header comment for full details.
deploymentMode: DaemonSet
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
config:
clients:
- url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
snippets:
pipelineStages:
# 1. Parse JSON access logs from Traefik (key field present in JSON line)
- match:
selector: '{container="traefik"}'
stages:
- json:
expressions:
request_method: RequestMethod
request_path: RequestPath
status: DownstreamStatus
request_id: requestId
service: ServiceName
router: RouterName
# 2. Scrub OAuth/token values from URL query params and Authorization headers.
#
# IMPORTANT — capture group semantics:
# The replace stage replaces each CAPTURE GROUP with the `replace` template
# value. Capture groups must wrap ONLY the secret value, not the surrounding
# context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
# it is preserved in the output while only the secret is replaced.
- replace:
# OAuth code= callback values — capture only the token value after `code=`
expression: '(?:\?|&)code=([^&\s]+)'
replace: 'REDACTED'
- replace:
# Bearer / access tokens in query strings — capture only the value
expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
replace: 'REDACTED'
- replace:
# OAuth state param (may carry session info) — capture only the value
expression: '(?:\?|&)state=([^&\s]+)'
replace: 'REDACTED'
- replace:
# Authorization header Bearer value — capture only the token after `Bearer `
expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
replace: 'REDACTED'
# 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
- limit:
rate: 100 # max log lines/sec per stream
burst: 200
drop: true # drop excess lines; do NOT block tail
# 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
- labels:
service: # from Traefik JSON access log; matches taxonomy `service` label
router: # Traefik router name
status: # HTTP status code (within taxonomy)
# Retention is configured on the Loki side (Phase B), not in Promtail.
# Target retention: 7 days, per the Performance budget row.
serviceMonitor:
enabled: true # Prometheus (Phase C) scrapes Promtail's own /metrics for self-observability
# Promtail Helm values — k3s-ingress-observability Phase B Task B1 (production)
#
# Production Promtail values. Staging copy at promtail-staging.yaml has the
# same scrubbing pipeline shape; this file sets the Loki client URL +
# production resource limits.
#
# Scrubbing pipeline:
# - 4 `replace` stages: code=, token/access_token/api_key/secret=, state=, Authorization
# - `limit` stage: rate=100/burst=200/drop=true (OQ-p3-37: Promtail OOM under chatty containers)
# Client URL: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
# Service name `olam-loki` is the Helm release name used in scripts/e2e/loki-ingest.sh
# (`helm upgrade --install olam-loki grafana/loki ...`); the chart's Service
# is named after the release, so `olam-loki` is the in-cluster DNS hostname.
#
# SECURITY NOTE — replace stage regex semantics (load-bearing):
# Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
# The `replace` field is a Go text/template string; `${1}` is NOT valid Go
# template syntax and silently becomes a literal. The correct pattern is:
# expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
# replace: 'REDACTED' — replace captured secret with literal
# This leaves the surrounding context (e.g. `?code=`) intact and redacts only
# the value. The broken pattern `(\?|&)code=[^&\s]+` with `replace: '${1}code=REDACTED'`
# was the root cause of the Phase B scrubbing regression (PR #776).
#
# See: docs/plans/k3s-ingress-observability/DESIGN.md (T8, T9)
deploymentMode: DaemonSet
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
config:
clients:
- url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
snippets:
pipelineStages:
# 1. Parse JSON access logs from Traefik (key field present in JSON line)
- match:
selector: '{container="traefik"}'
stages:
- json:
expressions:
request_method: RequestMethod
request_path: RequestPath
status: DownstreamStatus
request_id: requestId
service: ServiceName
router: RouterName
# 2. Scrub OAuth/token values from URL query params and Authorization headers.
#
# IMPORTANT — capture group semantics:
# The replace stage replaces each CAPTURE GROUP with the `replace` template
# value. Capture groups must wrap ONLY the secret value, not the surrounding
# context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
# it is preserved in the output while only the secret is replaced.
- replace:
# OAuth code= callback values — capture only the token value after `code=`
expression: '(?:\?|&)code=([^&\s]+)'
replace: 'REDACTED'
- replace:
# Bearer / access tokens in query strings — capture only the value
expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
replace: 'REDACTED'
- replace:
# OAuth state param (may carry session info) — capture only the value
expression: '(?:\?|&)state=([^&\s]+)'
replace: 'REDACTED'
- replace:
# Authorization header Bearer value — capture only the token after `Bearer `
expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
replace: 'REDACTED'
# 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
- limit:
rate: 100 # max log lines/sec per stream
burst: 200
drop: true # drop excess lines; do NOT block tail
# 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
- labels:
service: # from Traefik JSON access log; matches taxonomy `service` label
router: # Traefik router name
status: # HTTP status code (within taxonomy)
# Retention is configured on Loki side (loki-values.yaml: 7 days / 168h).
serviceMonitor:
# Disabled in the source-of-truth values file so a standalone Phase B install
# (without kube-prometheus-stack) does not hard-fail with
# "no matches for kind ServiceMonitor in version monitoring.coreos.com/v1".
# The C1 e2e script flips this on at RUNTIME via
# helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
# AFTER kube-prom-stack has installed the ServiceMonitor CRD. Source-of-truth
# stays standalone-friendly; runtime override wires Prometheus discovery.
enabled: false
# Traefik Helm values — k3s-ingress-observability Phase A Task A3
# Pinned NodePort 30080 per OQ-p3-7 (world hooks bake this URL).
# Structured JSON access logs ready for Phase A Task A5 + Phase B Promtail pickup.
deployment:
replicas: 1 # SPOF mitigation = host systemd watchdog (Phase A Task A11), not HA replicas
ports:
web:
port: 8000
expose:
default: true
exposedPort: 80
nodePort: 30080 # PIN (OQ-p3-7); world hooks reach via host.docker.internal:30080
protocol: TCP
websecure:
port: 8443
expose:
default: true
exposedPort: 443
nodePort: 30443
protocol: TCP
# v1: HTTPS deferred to fatbox multi-org (Out-of-scope of this plan); TLS not configured.
service:
type: NodePort
# Structured access logs to stdout — Promtail picks up in Phase B.
# Authorization header redaction here; URL query-param scrubbing happens
# at Promtail pipeline_stages.replace per OQ-p3-6 (Traefik can't scrub query params natively).
logs:
general:
level: INFO
format: json
access:
enabled: true
format: json
fields:
headers:
defaultMode: keep
names:
Authorization: redact
Cookie: redact
# Built-in /metrics for Phase C Prometheus scrape
metrics:
prometheus:
enabled: true
addEntryPointsLabels: true
addRoutersLabels: true
addServicesLabels: true
# Dashboard disabled in cluster — operator uses Grafana (Phase B)
ingressRoute:
dashboard:
enabled: false
# IngressRoute CRD enabled
providers:
kubernetesCRD:
enabled: true
allowCrossNamespace: false # explicit; matches namespace-isolation strategy from A1
kubernetesIngress:
enabled: false # CRD-only; vanilla Ingress not supported in this stack
# Resource bounds — observability stack target <500MB RAM idle (P2)
resources:
requests:
cpu: 100m
memory: 64Mi
limits:
cpu: 500m
memory: 256Mi
# Namespace for k3s-ingress-observability peripheral services
# (Traefik installs to kube-system; observability stack to monitoring; this is for IngressRoute CRDs targeting olam services)
apiVersion: v1
kind: Namespace
metadata:
name: olam
# 24-deploy-kg-service.yaml — kg-service Service + Deployment for local k3s dogfood.
#
# Bridges the gap between Phase C's ServiceMonitor (92-servicemonitor-kg-service.yaml)
# and a running service. The ServiceMonitor targets namespace `olam`,
# label `app: olam-kg-service`, port name `http` — this manifest satisfies that
# contract so Prometheus can scrape kg-service's /metrics endpoint.
#
# Canonical per-service manifest tree: packages/host-cp/k8s/manifests/kg-service/
# This file is the "peripheral-services entry point" view — it folds Service +
# Deployment into a single file for `kubectl apply -f manifests/` convenience.
#
# Secrets prerequisite: operator MUST create `olam-kg-service-secret` in the
# `olam` namespace BEFORE applying this manifest. See README.md § Secrets.
#
# Image: pinned to sha256 digest (not :latest) per T4 threat model.
# Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158.
# To update:
# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-kg-service:pull&service=ghcr.io" | jq -r .token)
# curl -sI -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
# https://ghcr.io/v2/pleri/olam-kg-service/manifests/<tag> | grep docker-content-digest
#
# Memory: bge-small-en-v1.5 ONNX model is pre-cached in the image (~90 MB).
# Container needs ≥512Mi to load the model + serve requests. Limit set to 1Gi.
#
# Apply-manifests.sh: this file is SKIPPED by the phase-a-e2e harness
# (apply-manifests.sh skip-list includes 2[3-4]-deploy-*) because the
# harness cluster has no operator secrets or kg-data PVC.
# Operator-side `kubectl apply -f manifests/` applies it.
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: olam-kg-service
namespace: olam
labels:
app: olam-kg-service
app.kubernetes.io/managed-by: olam
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: olam-kg-service
namespace: olam
labels:
app: olam-kg-service
app.kubernetes.io/managed-by: olam
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
resourceNames: ["olam-kg-service"]
verbs: ["get", "patch", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: olam-kg-service
namespace: olam
labels:
app: olam-kg-service
app.kubernetes.io/managed-by: olam
subjects:
- kind: ServiceAccount
name: olam-kg-service
namespace: olam
roleRef:
kind: Role
name: olam-kg-service
apiGroup: rbac.authorization.k8s.io
---
# ConfigMap — non-sensitive env vars.
# Sensitive values (OLAM_KG_BEARER_TOKEN) live in `olam-kg-service-secret`.
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-kg-service-env
namespace: olam
labels:
app: olam-kg-service
app.kubernetes.io/managed-by: olam
data:
# Port kg-service listens on — must match Service targetPort below.
OLAM_KG_SERVICE_PORT: "9997"
# CRITICAL: kg-service defaults to 127.0.0.1 bind. In k8s the readiness
# probe hits the pod IP, so 127.0.0.1-only listener causes probe failures.
# Force all-interfaces bind without requiring an image rebuild.
OLAM_KG_SERVICE_BIND: "0.0.0.0"
# Data directory — backed by the PVC mounted at /data.
OLAM_KG_DATA_PATH: "/data/kg"
# Auth-service URL — cluster-internal DNS (olam namespace).
OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
---
# PersistentVolumeClaim — backs /data (KG index + savings telemetry).
# 10Gi: graph index grows with codebase size. See kg-service/45-pvc.yaml rationale.
# local-path StorageClass ships with k3d. Substitute for non-k3d clusters.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: olam-kg-data
namespace: olam
labels:
app: olam-kg-service
app.kubernetes.io/managed-by: olam
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-path
resources:
requests:
storage: 10Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: olam-kg-service
namespace: olam
labels:
app: olam-kg-service
app.kubernetes.io/managed-by: olam
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: olam-kg-service
template:
metadata:
labels:
app: olam-kg-service
spec:
# Disable k8s automatic Service env injection.
# Without this, k8s injects OLAM_KG_SERVICE_PORT as "tcp://..." which
# breaks Python's int() parse of the port env var.
enableServiceLinks: false
imagePullSecrets:
- name: ghcr-pull
serviceAccountName: olam-kg-service
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
initContainers:
- name: chown-data
# busybox:1.36 — sha256-pinned per T4 threat model.
image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
imagePullPolicy: IfNotPresent
securityContext:
runAsUser: 0
runAsNonRoot: false
allowPrivilegeEscalation: false
command: ["chown", "-R", "1000:1000", "/data"]
volumeMounts:
- name: kg-data
mountPath: /data
containers:
- name: olam-kg-service
# Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158
# Run `npm run refresh:manifest-digests` to update.
image: ghcr.io/pleri/olam-kg-service@sha256:72030f3054315e7ebf575f6dcb9b4965e1ddee13ea7bfdeb0bde32253beeb1c7
imagePullPolicy: IfNotPresent
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
ports:
# CRITICAL: port name `http` must match ServiceMonitor
# 92-servicemonitor-kg-service.yaml endpoints[0].port.
- name: http
containerPort: 9997
protocol: TCP
envFrom:
- configMapRef:
name: olam-kg-service-env
- secretRef:
name: olam-kg-service-secret
volumeMounts:
- name: kg-data
mountPath: /data
- name: tmp
mountPath: /tmp
readinessProbe:
# kg-service returns {"ok":true,"ready":true} once bge-small model loads.
# initialDelaySeconds 30 gives the model warmup thread time to complete.
httpGet:
path: /health
port: 9997
initialDelaySeconds: 30
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 12
livenessProbe:
httpGet:
path: /health
port: 9997
initialDelaySeconds: 60
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
requests:
cpu: "100m"
# bge-small ONNX model requires ~400Mi at runtime; 512Mi is the
# minimum viable request. Set higher if OOM-killed on first classify.
memory: "512Mi"
limits:
cpu: "1000m"
# 1Gi: bge-small model (~90Mi on disk, ~400Mi once loaded) + index cache + request headroom.
memory: "1Gi"
volumes:
- name: kg-data
persistentVolumeClaim:
claimName: olam-kg-data
- name: tmp
emptyDir: {}
---
# Service — exposes kg-service to the cluster.
# CRITICAL: `name: http` matches 92-servicemonitor-kg-service.yaml endpoints[0].port.
# Namespace `olam` matches ServiceMonitor's namespaceSelector.matchNames.
apiVersion: v1
kind: Service
metadata:
name: olam-kg-service
namespace: olam
labels:
# CRITICAL: matches 92-servicemonitor-kg-service.yaml spec.selector.matchLabels.
app: olam-kg-service
app.kubernetes.io/managed-by: olam
spec:
type: ClusterIP
selector:
app: olam-kg-service
ports:
# CRITICAL: name `http` matches ServiceMonitor endpoints[0].port.
- name: http
port: 9997
targetPort: 9997
protocol: TCP
# IngressRoute — host-cp (bare /api/* per Decision 3 hybrid routing)
# host-cp preserves 50+ existing SPA fetch sites at /api/* (no strip-prefix).
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: olam-host-cp
namespace: olam
spec:
entryPoints:
- web
routes:
# host-cp is the catch-all (per Decision 3 hybrid routing); explicit low priority
# so service-prefix routes (kg, agent-memory, etc.) win when their longer prefix matches.
# Default Traefik priority is rule-string length; OR'd rules inflate the host-cp aggregate
# ABOVE more-specific PathPrefix matches, causing /api/kg/* to land on host-cp incorrectly.
# Explicit priority avoids the silent precedence bug (caught in PR #736 live-validation).
- match: PathPrefix(`/api/`) || PathPrefix(`/session/`) || PathPrefix(`/v1/`) || Path(`/health`)
kind: Rule
priority: 10
services:
- name: olam-host-cp
port: 19000
# IngressRoute — kg-service via /api/kg/* strip-prefix (Decision 3 new-services pattern)
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: strip-api-kg
namespace: olam
spec:
stripPrefix:
prefixes:
- /api/kg
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: olam-kg-service
namespace: olam
spec:
entryPoints:
- web
routes:
# Priority 100 > host-cp's 10 so /api/kg/* wins over host-cp's catch-all /api/*.
- match: PathPrefix(`/api/kg/`)
kind: Rule
priority: 100
services:
- name: olam-kg-service
port: 9997
middlewares:
- name: strip-api-kg
# IngressRoute — agent-memory via /api/agent-memory/* strip-prefix (Decision 3 new-services pattern)
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: strip-api-agent-memory
namespace: olam
spec:
stripPrefix:
prefixes:
- /api/agent-memory
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: olam-agent-memory
namespace: olam
spec:
entryPoints:
- web
routes:
# Priority 100 > host-cp's 10 so /api/agent-memory/* wins over host-cp's catch-all /api/*.
- match: PathPrefix(`/api/agent-memory/`)
kind: Rule
priority: 100
services:
- name: olam-memory-service
port: 3111 # Real memory-service listen port (per packages/memory-service/src/worker.ts:206 + AGENTMEMORY_HOST_INTERNAL_URL in container.ts:101). Pass-1 plan said 3112 (incorrect); A6 corrects to 3111.
middlewares:
- name: strip-api-agent-memory
# NetworkPolicy — olam namespace ingress fence (Phase A Task A9)
#
# Defense-in-depth: even if a world agent escapes its container or steals a
# bearer token, NetworkPolicy ensures it can only reach olam services via the
# Traefik ingress path (which enforces bearer auth on world-originated calls
# per A6 — see packages/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml).
# Direct pod-to-pod access bypassing ingress is denied.
#
# Enforcement matrix — two separate enforcement paths exist (the k3s built-in
# NetworkPolicy controller and the CNI); an earlier version of this comment
# conflated them (corrected 2026-05-21, see dogfood incident finding #2):
#
# k3d/k3s with --disable-network-policy=false (production k3s default):
# k3s ships a built-in NetworkPolicy controller that enforces NetworkPolicies
# via iptables rules, INDEPENDENT of the CNI. Flannel itself does not enforce,
# but the k3s controller does. Result: NetworkPolicies ARE enforced even on
# default Flannel k3s/k3d clusters — this is what the operator's colima+k3d
# dogfood cluster experienced (the fence was live despite using Flannel).
#
# k3d/k3s with --disable-network-policy=true (this harness — cluster-up.sh):
# The harness explicitly passes --k3s-arg '--disable-network-policy@server:*'
# to disable the k3s built-in controller. With the controller off, enforcement
# depends entirely on the CNI: Flannel = no enforcement; Calico = enforced.
# The harness uses Calico precisely so tests exercise real enforcement.
#
# Production k3s (default, no --disable-network-policy):
# Controller-enforced via iptables unless the operator explicitly disables it.
#
# See docs/architecture/networkpolicy-fence.md for the full environment matrix
# and docs/incidents/2026-05-21-phase-c-dogfood.md (finding #2) for the live
# evidence that k3s' bundled controller enforces on Flannel clusters.
#
# Threat mitigated: T6 (world→host SSRF via unauthenticated ingress route).
# Companion mitigations (do not remove A6 + A9 together): bearer auth (A6),
# 127.0.0.1 bind on host-cp + kube-apiserver (OS-level, separate from k8s).
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: olam-ingress-fence
namespace: olam
labels:
app.kubernetes.io/part-of: olam
app.kubernetes.io/component: security-fence
olam.io/phase: a
olam.io/task: a9
spec:
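# Selects every pod in the olam namespace. Intra-namespace traffic is allowed
# explicitly below (second rule) so olam services can call each other; other
# cross-namespace and external traffic must traverse Traefik (first rule). The
# only other exception is the monitoring namespace (third rule), which is
# allowed in so Prometheus can scrape /metrics.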
podSelector: {}
policyTypes:
- Ingress
ingress:
# Allow inbound from Traefik (canonical ingress path). The label selector
# matches the standard Helm-chart label that k3s' bundled Traefik install
# sets (`app.kubernetes.io/name: traefik`); also matched by the upstream
# `traefik/traefik` chart used by Phase A Task A3.
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
podSelector:
matchLabels:
app.kubernetes.io/name: traefik
# Allow intra-namespace pod-to-pod traffic — olam services may call each
# other directly (host-cp → kg-service, etc.) without round-tripping
# through Traefik. Audit log on world-originated calls still fires at the
# bearer-auth layer (A6), so this allowance does not weaken T6 mitigation.
- from:
- podSelector: {}
# Allow inbound from the monitoring namespace — Phase C's Prometheus
# (kube-prometheus-stack) scrapes pod IPs directly for /metrics
# collection. Without this rule, ServiceMonitor targets in `olam` ns
# appear "up" but yield 0 samples (the scrape connection silently fails
# at CNI level on enforcing CNIs). Surfaced during 2026-05-21 operator
# dogfood — see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #2.
# Scope: monitoring → olam ingress only (not the reverse direction).
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: monitoring
# NetworkPolicy — monitoring namespace default-deny + same-namespace allow
# (Phase A Task A9; companion to 60-networkpolicy-ingress.yaml)
#
# Loki + Prometheus + Grafana accept inbound ONLY from pods in the same
# `monitoring` namespace (intra-stack: Promtail → Loki, Grafana → Loki + Prom,
# kube-prometheus-stack scrape targets within the stack). Cross-namespace
# traffic — including from `olam` (host-cp, kg-service, agent-memory) and
# kube-system (Traefik) — is denied.
#
# Operator access pattern is `kubectl port-forward -n monitoring svc/grafana
# 3000` (Decision 16). port-forward is tunnelled through the kube-apiserver
# (the pod `portforward` streaming subresource), NOT pod-to-pod networking,
# so it bypasses NetworkPolicy by design.
#
# Decision 17 forbids any IngressRoute / Ingress that exposes Loki / Prom /
# Grafana from outside the cluster; audit:no-ingress-route enforces that at
# commit time, and this NetworkPolicy is the runtime defense-in-depth layer
# (caught even if the audit is bypassed or a Helm chart renders a route).
#
# Forward-declaration note: Loki + Prometheus land in Phase B/C. Until those
# manifests add pods to the `monitoring` namespace, this policy applies to an
# empty pod set and is a no-op. Phase B/C pods need no extra labels: the empty
# podSelector matches every pod in the namespace, and the allow rule matches
# on the NAMESPACE label `kubernetes.io/metadata.name: monitoring`. Phase B/C
# installs must therefore keep that namespace label intact.
#
# Enforcement requires NetworkPolicy-capable CNI (see 60-* doc block).
# Threat mitigated: T7 (Grafana admin secret exfil) + secondary T6 mitigation.
---
# Forward-declare the monitoring namespace so the NetworkPolicy below has a
# valid target. Phase B/C kube-prometheus-stack installs into this namespace
# and may add labels — its install MUST NOT delete the namespace; Helm uses
# `--create-namespace=false` once this manifest seeds it.
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
labels:
kubernetes.io/metadata.name: monitoring
app.kubernetes.io/part-of: olam-observability
olam.io/phase: a
olam.io/task: a9
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: monitoring-default-deny
namespace: monitoring
labels:
app.kubernetes.io/part-of: olam-observability
app.kubernetes.io/component: security-fence
olam.io/phase: a
olam.io/task: a9
spec:
# Selects every pod in the monitoring namespace. Phase B/C pods (loki,
# prometheus, grafana, promtail, alertmanager — whatever the chart renders)
# all match this empty selector automatically.
podSelector: {}
policyTypes:
- Ingress
ingress:
# Allow inbound only from same-namespace pods. Cross-namespace traffic
# (olam services, kube-system Traefik, default ns) is denied — see header
# for why this is the correct posture (operator uses kubectl port-forward,
# which bypasses NetworkPolicy via the kube-apiserver exec channel).
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: monitoring
# ----------------------------------------------------------------------------
# GENERATED FILE — DO NOT EDIT DIRECTLY
#
# Source: packages/peripheral-services/grafana-dashboards/*.json
# Regenerate: packages/peripheral-services/scripts/sync-grafana-dashboards.sh
#
# This ConfigMap is consumed by the grafana/grafana Helm chart via
# dashboardsConfigMaps.olam-default: olam-dashboards
# as wired in packages/peripheral-services/helm-values/grafana-values.yaml.
#
# Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B3
# ----------------------------------------------------------------------------
apiVersion: v1
kind: ConfigMap
metadata:
name: olam-dashboards
namespace: monitoring
labels:
app.kubernetes.io/name: grafana
app.kubernetes.io/managed-by: olam
grafana_dashboard: "1"
data:
host-cp.json: |
{
"uid": "host-cp",
"title": "Host-CP — Service Drill-in",
"description": "Per-route SLIs for host-cp. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. Use the route dropdown to scope a single route or view all. The world_id variable is forwarded from olam-home for context.",
"tags": ["olam", "drill-in", "phase-c", "host-cp"],
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 39,
"version": 1,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"templating": {
"list": [
{
"name": "world_id",
"label": "World",
"type": "query",
"datasource": { "type": "loki", "uid": "loki" },
"query": {
"qryType": 2,
"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
"step": ""
},
"refresh": 2,
"sort": 1,
"multi": false,
"includeAll": true,
"allValue": ".+",
"current": { "selected": false, "text": "All", "value": "$__all" }
},
{
"name": "route",
"label": "Route",
"type": "query",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"host-cp\"}, route)",
"refresh": 2,
"sort": 1,
"multi": true,
"includeAll": true,
"allValue": ".+",
"current": { "selected": false, "text": "All", "value": "$__all" }
}
]
},
"annotations": {
"list": []
},
"panels": [
{
"id": 1,
"type": "timeseries",
"title": "Request rate by route",
"description": "Requests per second for each host-cp route over the last 5 minutes (pre-computed by C4 recording rule). Spikes indicate traffic surges; a route going to zero indicates it stopped receiving traffic.",
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_requests:rate5m_by_service_route{service=\"host-cp\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 2,
"type": "timeseries",
"title": "5xx error rate by route",
"description": "5xx responses per second per host-cp route (C4 recording rule). A non-zero value on a route warrants investigation. Correlate with the error ratio panel below to understand severity relative to total traffic.",
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_errors:rate5m_by_service_route{service=\"host-cp\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 3,
"type": "timeseries",
"title": "Latency p50 by route",
"description": "Median (p50) request duration per host-cp route in seconds (C4 recording rule). Represents typical user-perceived latency. Sustained increases above baseline indicate a regression or upstream dependency slowdown.",
"gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"host-cp\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 4,
"type": "timeseries",
"title": "Latency p95 by route",
"description": "95th-percentile request duration per host-cp route in seconds (C4 recording rule). Captures the tail latency experienced by the slowest 5% of requests. The primary SLI for detecting latency regressions before they affect most users.",
"gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"host-cp\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 5,
"type": "timeseries",
"title": "Latency p99 by route",
"description": "99th-percentile request duration per host-cp route in seconds (C4 recording rule). Worst-case latency tail. High p99 with stable p50/p95 often indicates a specific slow code path or resource contention under load.",
"gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"host-cp\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 6,
"type": "stat",
"title": "Error ratio (5xx / total) by route",
"description": "Fraction of requests returning 5xx per host-cp route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. A route showing red means roughly 1-in-20 (or more) requests are failing — investigate immediately.",
"gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.01 },
{ "color": "red", "value": 0.05 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_errors:ratio_by_service_route{service=\"host-cp\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": true,
"range": false
}
]
}
]
}
kg-service.json: |
{
"uid": "kg-service",
"title": "KG-Service — Service Drill-in",
"description": "Per-route SLIs for kg-service. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. kg-service exposes 4 routes: /health, /classify, /build, /status. Use the route dropdown to scope a single route. The world_id variable is forwarded from olam-home for context.",
"tags": ["olam", "drill-in", "phase-c", "kg-service"],
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 39,
"version": 1,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"templating": {
"list": [
{
"name": "world_id",
"label": "World",
"type": "query",
"datasource": { "type": "loki", "uid": "loki" },
"query": {
"qryType": 2,
"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
"step": ""
},
"refresh": 2,
"sort": 1,
"multi": false,
"includeAll": true,
"allValue": ".+",
"current": { "selected": false, "text": "All", "value": "$__all" }
},
{
"name": "route",
"label": "Route",
"type": "query",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"kg-service\"}, route)",
"refresh": 2,
"sort": 1,
"multi": true,
"includeAll": true,
"allValue": ".+",
"current": { "selected": false, "text": "All", "value": "$__all" }
}
]
},
"annotations": {
"list": []
},
"panels": [
{
"id": 1,
"type": "timeseries",
"title": "Request rate by route",
"description": "Requests per second for each kg-service route over the last 5 minutes (pre-computed by C4 recording rule). /classify is the hot path; /build is infrequent; /health should be near-constant. A drop in /classify with stable /health suggests the classifier is being bypassed or the caller is down.",
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_requests:rate5m_by_service_route{service=\"kg-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 2,
"type": "timeseries",
"title": "5xx error rate by route",
"description": "5xx responses per second per kg-service route (C4 recording rule). Errors on /classify indicate the graph classifier is failing; errors on /build indicate a KG rebuild failure. Either warrants immediate investigation as they affect agent search quality.",
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_errors:rate5m_by_service_route{service=\"kg-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 3,
"type": "timeseries",
"title": "Latency p50 by route",
"description": "Median (p50) request duration per kg-service route in seconds (C4 recording rule). /classify latency drives agent dispatch latency directly; a rising p50 on /classify means agents wait longer for graph routing decisions.",
"gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"kg-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 4,
"type": "timeseries",
"title": "Latency p95 by route",
"description": "95th-percentile request duration per kg-service route in seconds (C4 recording rule). kg-service is a synchronous dependency for in-world search; a high p95 on /classify directly contributes to the >6min diagnosis-time problem this observability stack is solving.",
"gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"kg-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 5,
"type": "timeseries",
"title": "Latency p99 by route",
"description": "99th-percentile request duration per kg-service route in seconds (C4 recording rule). Worst-case latency tail. A high p99 on /build (graph rebuild) with stable /classify p99 is expected; the inverse (stable /build, high /classify p99) indicates classifier graph complexity growth.",
"gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"kg-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 6,
"type": "stat",
"title": "Error ratio (5xx / total) by route",
"description": "Fraction of requests returning 5xx per kg-service route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. kg-service is fail-open for /classify (returns empty result on error); a high error ratio here means callers are silently getting degraded graph routing with no local error signal.",
"gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.01 },
{ "color": "red", "value": 0.05 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_errors:ratio_by_service_route{service=\"kg-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": true,
"range": false
}
]
}
]
}
memory-service.json: |
{
"uid": "memory-service",
"title": "Memory-Service — Service Drill-in",
"description": "Per-route SLIs for memory-service. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. memory-service's traffic flows through the in-container Node front-door (packages/memory-service/src/metrics-proxy.mjs) which short-circuits /metrics and instruments every agentmemory engine route ({service,route,method,status_code} taxonomy). Use the route dropdown to scope a single agentmemory endpoint. The world_id variable is forwarded from olam-home for context.",
"tags": ["olam", "drill-in", "phase-c", "memory-service"],
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 39,
"version": 1,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"templating": {
"list": [
{
"name": "world_id",
"label": "World",
"type": "query",
"datasource": { "type": "loki", "uid": "loki" },
"query": {
"qryType": 2,
"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
"step": ""
},
"refresh": 2,
"sort": 1,
"multi": false,
"includeAll": true,
"allValue": ".+",
"current": { "selected": false, "text": "All", "value": "$__all" }
},
{
"name": "route",
"label": "Route",
"type": "query",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"memory-service\"}, route)",
"refresh": 2,
"sort": 1,
"multi": true,
"includeAll": true,
"allValue": ".+",
"current": { "selected": false, "text": "All", "value": "$__all" }
}
]
},
"annotations": {
"list": []
},
"panels": [
{
"id": 1,
"type": "timeseries",
"title": "Request rate by route",
"description": "Requests per second for each memory-service route over the last 5 minutes (pre-computed by C4 recording rule). /agentmemory/mcp/call is the hot path that agents drive — every memory_save / memory_recall lands there. /agentmemory/livez is the readiness probe (near-constant ~0.2 rps from k8s). /agentmemory/export is bridge-debounced (~1 per ~10s burst). A drop in mcp/call with stable livez indicates the agentmemory engine is up but receiving no traffic — caller-side issue.",
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_requests:rate5m_by_service_route{service=\"memory-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 2,
"type": "timeseries",
"title": "5xx error rate by route",
"description": "5xx responses per second per memory-service route (C4 recording rule). Errors on /agentmemory/mcp/call indicate the iii engine is rejecting MCP tool calls — typical causes are bearer-auth failures or the engine entering a degraded state. Errors on /agentmemory/import indicate restore failures; the bridge's snapshot will retry on the next mutator-write.",
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_errors:rate5m_by_service_route{service=\"memory-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 3,
"type": "timeseries",
"title": "Latency p50 by route",
"description": "Median (p50) request duration per memory-service route in seconds (C4 recording rule). /agentmemory/mcp/call p50 is a direct driver of agent-memory recall+save latency in the agent loop. Sustained rise on mcp/call p50 points to engine index size growth or iii-config tuning regressions.",
"gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"memory-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 4,
"type": "timeseries",
"title": "Latency p95 by route",
"description": "95th-percentile request duration per memory-service route in seconds (C4 recording rule). memory-service is a synchronous dependency for agent recall paths — high p95 on /agentmemory/mcp/call directly contributes to the >6min diagnosis-time problem this observability stack is solving. /agentmemory/export p95 spikes are expected at snapshot boundaries but should fall back inside 1s.",
"gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"memory-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 5,
"type": "timeseries",
"title": "Latency p99 by route",
"description": "99th-percentile request duration per memory-service route in seconds (C4 recording rule). Worst-case tail. /agentmemory/import is intentionally heavy (~1s+ for a full corpus restore on cold-start) so a high p99 there with stable mcp/call p99 is expected. The inverse — stable import, rising mcp/call p99 — is the leading indicator for engine-side index degradation.",
"gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"memory-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": false,
"range": true
}
]
},
{
"id": 6,
"type": "stat",
"title": "Error ratio (5xx / total) by route",
"description": "Fraction of requests returning 5xx per memory-service route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. /agentmemory/mcp/call errors silently degrade agent memory recall quality (callers fall through to no-context paths). /agentmemory/livez errors here indicate the proxy is healthy but the engine is unreachable — check container logs.",
"gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.01 },
{ "color": "red", "value": 0.05 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "olam:http_errors:ratio_by_service_route{service=\"memory-service\",route=~\"$route\"}",
"legendFormat": "{{route}}",
"instant": true,
"range": false
}
]
}
]
}
olam-home.json: |
{
"uid": "olam-home",
"title": "Olam Home",
"description": "Operator's at-a-glance view. Top row: are the 5 olam peripheral services up? Middle row: how loaded are they? Bottom row: which worlds are doing dispatch work right now? Use the world_id dropdown to scope the bottom row (and host-cp/world-cp middle panels) to a specific world. Pinned 3-row IA per Phase B acceptance criteria #8. Click the host-cp, kg-service, or agent-memory health panel to drill into the corresponding per-service dashboard (agent-memory opens the memory-service drill-in).",
"tags": ["olam", "home", "phase-b"],
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 39,
"version": 2,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"templating": {
"list": [
{
"name": "world_id",
"label": "World",
"type": "query",
"datasource": { "type": "loki", "uid": "loki" },
"query": {
"qryType": 2,
"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
"step": ""
},
"refresh": 2,
"sort": 1,
"multi": false,
"includeAll": true,
"allValue": ".+",
"current": { "selected": false, "text": "All", "value": "$__all" }
}
]
},
"annotations": {
"list": []
},
"panels": [
{
"id": 1,
"type": "stat",
"title": "host-cp",
"description": "Green if host-cp logged at least 1 line in the last 60s; red = silent / crashed.",
"gridPos": { "x": 0, "y": 0, "w": 5, "h": 4 },
"links": [
{
"title": "Drill into host-cp",
"url": "/d/host-cp/host-cp-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}",
"targetBlank": false
}
],
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"mappings": [
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
],
"unit": "short",
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center"
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum(count_over_time({service=\"host-cp\"}[1m]))",
"legendFormat": "host-cp",
"instant": true,
"range": false
}
]
},
{
"id": 2,
"type": "stat",
"title": "kg-service",
"description": "Green if kg-service logged at least 1 line in the last 60s; red = silent / crashed.",
"gridPos": { "x": 5, "y": 0, "w": 5, "h": 4 },
"links": [
{
"title": "Drill into kg-service",
"url": "/d/kg-service/kg-service-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}",
"targetBlank": false
}
],
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"mappings": [
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
],
"unit": "short",
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center"
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum(count_over_time({service=\"kg-service\"}[1m]))",
"legendFormat": "kg-service",
"instant": true,
"range": false
}
]
},
{
"id": 3,
"type": "stat",
"title": "agent-memory",
"description": "Green if agent-memory logged at least 1 line in the last 60s; red = silent / crashed.",
"gridPos": { "x": 10, "y": 0, "w": 4, "h": 4 },
"links": [
{
"title": "Drill into memory-service",
"url": "/d/memory-service/memory-service-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}",
"targetBlank": false
}
],
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"mappings": [
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
],
"unit": "short",
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center"
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum(count_over_time({service=\"agent-memory\"}[1m]))",
"legendFormat": "agent-memory",
"instant": true,
"range": false
}
]
},
{
"id": 4,
"type": "stat",
"title": "traefik",
"description": "Green if traefik logged at least 1 line in the last 60s; red = silent / crashed.",
"gridPos": { "x": 14, "y": 0, "w": 5, "h": 4 },
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"mappings": [
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
],
"unit": "short",
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center"
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum(count_over_time({service=\"traefik\"}[1m]))",
"legendFormat": "traefik",
"instant": true,
"range": false
}
]
},
{
"id": 5,
"type": "stat",
"title": "world-cp",
"description": "Green if any world-cp instance logged at least 1 line in the last 60s. Aggregated across world_id labels per Promtail drop-rules.",
"gridPos": { "x": 19, "y": 0, "w": 5, "h": 4 },
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"mappings": [
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
{ "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
],
"unit": "short",
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center"
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum(count_over_time({service=\"world-cp\"}[1m]))",
"legendFormat": "world-cp",
"instant": true,
"range": false
}
]
},
{
"id": 6,
"type": "timeseries",
"title": "Aggregate success rate",
"description": "Total 2xx/3xx log lines per second across all services. Proxy for overall throughput.",
"gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 },
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "none" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum(rate({job=~\".+\"} |~ \"(?:200|201|204|301|302)\" [1m]))",
"legendFormat": "2xx/3xx rate",
"instant": false,
"range": true
}
]
},
{
"id": 7,
"type": "timeseries",
"title": "Aggregate error rate",
"description": "Total error/panic/fatal log lines per second across all services. Spikes indicate incidents.",
"gridPos": { "x": 12, "y": 4, "w": 12, "h": 8 },
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": {
"mode": "fixed",
"fixedColor": "red"
},
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "none" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum(rate({job=~\".+\"} |~ \"(?i)error|panic|fatal\" [1m]))",
"legendFormat": "error/panic/fatal rate",
"instant": false,
"range": true
}
]
},
{
"id": 8,
"type": "timeseries",
"title": "World-dispatch activity (top 10 worlds)",
"description": "Dispatch log lines per 5m per world, filtered by the world_id dropdown. world_id is a JSON field (not a Loki label); extracted via json parser. Select 'All' to see all worlds; select a specific world_id to drill down.",
"gridPos": { "x": 0, "y": 12, "w": 24, "h": 8 },
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"unit": "short",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "topk(10, sum by (world_id) (\n count_over_time(\n {service=\"host-cp\"}\n |~ \"dispatch\"\n | json\n | __error__ = \"\"\n | world_id =~ \"${world_id}\"\n [5m]\n )\n))",
"legendFormat": "world {{world_id}}",
"instant": false,
"range": true
}
]
}
]
}
request-rate.json: |
{
"uid": "request-rate",
"title": "Request Rate / Error Rate (Log-Derived)",
"description": "Per-service request rate + error rate derived from Loki logs. Phase B-only — kube-prometheus-stack will replace these with native HTTP metrics in Phase C.",
"tags": ["olam", "rate", "phase-b"],
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 39,
"version": 1,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"templating": {
"list": [
{
"name": "world_id",
"label": "World",
"type": "query",
"datasource": { "type": "loki", "uid": "loki" },
"query": {
"qryType": 2,
"expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
"step": ""
},
"refresh": 2,
"sort": 1,
"multi": false,
"includeAll": true,
"allValue": ".+",
"current": { "selected": false, "text": "All", "value": "$__all" }
},
{
"name": "service",
"label": "Service",
"type": "query",
"datasource": { "type": "loki", "uid": "loki" },
"query": { "qryType": 1, "label": "service", "stream": "{job=~\".+\"}" },
"refresh": 2,
"sort": 1,
"multi": true,
"includeAll": true,
"allValue": ".+",
"current": { "selected": false, "text": "All", "value": "$__all" }
}
]
},
"annotations": {
"list": []
},
"panels": [
{
"id": 1,
"type": "timeseries",
"title": "Request rate by service",
"description": "Log line rate per second per service. Uses log volume as a proxy for request rate — appropriate for Phase B before Prometheus HTTP metrics land in Phase C.",
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum by (service) (rate({service=~\"${service:regex}\"}[1m]))",
"legendFormat": "{{service}}",
"instant": false,
"range": true
}
]
},
{
"id": 2,
"type": "timeseries",
"title": "Error rate by service",
"description": "Log lines matching error|panic|fatal per second per service. Spikes here warrant drill-down in the Ad-hoc LogQL panel below.",
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"unit": "reqps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum by (service) (rate({service=~\"${service:regex}\"} |~ \"(?i)error|panic|fatal\" [1m]))",
"legendFormat": "{{service}}",
"instant": false,
"range": true
}
]
},
{
"id": 3,
"type": "table",
"title": "Top-5 endpoints (last 5m)",
"description": "Top 5 request paths by volume, derived from Traefik JSON access logs. Only Traefik has access-log-grade request_path (per B1 Promtail JSON stage); other services don't extract this field.",
"gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"unit": "short",
"color": { "mode": "palette-classic" }
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Value" },
"properties": [
{ "id": "displayName", "value": "requests" }
]
}
]
},
"options": {
"showHeader": true,
"footer": { "show": false }
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "topk(5, sum by (request_path) (count_over_time({service=\"traefik\"} | json | __error__ = \"\" | request_path != \"\" [5m])))",
"legendFormat": "",
"instant": true,
"range": false
}
],
"transformations": [
{ "id": "reduce", "options": { "reducers": ["sum"] } }
]
},
{
"id": 4,
"type": "logs",
"title": "Ad-hoc LogQL (edit me)",
"description": "Operator escape hatch. Edit the query inline; use LogQL syntax. world_id filter via JSON pipeline because Loki doesn't promote world_id as a stream label.",
"gridPos": { "x": 0, "y": 16, "w": 24, "h": 10 },
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {},
"overrides": []
},
"options": {
"showTime": true,
"wrapLogMessage": false,
"dedupStrategy": "exact",
"showLabels": false,
"showCommonLabels": false,
"sortOrder": "Descending",
"prettifyLogMessage": false,
"enableLogDetails": true
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "{service=~\"${service:regex}\"} | json | __error__ = \"\" | world_id =~ \"${world_id}\"",
"legendFormat": "",
"instant": false,
"range": true
}
]
}
]
}
# 90-prom-alert-cardinality.yaml — Phase C Task C2 cardinality alert rule.
#
# PrometheusRule CR: fires OlamActiveSeriesHigh when prometheus_tsdb_head_series
# exceeds 80k (80% of the 100k active-series cap defined by P4).
#
# ruleSelector match: the Prometheus CR rendered by kube-prom-stack 85.2.0 uses
# ruleSelector: matchLabels: release: "olam-prom"
# (verified via `helm template ... | grep -A3 ruleSelector`).
# The label below MUST match or this rule is silently ignored by Prometheus.
#
# Alertmanager: enabled in kube-prom-stack-values.yaml from C2 onwards.
# Receivers: not yet configured (C2 scope = rule landing; receiver config is C4+).
# Until receivers are wired, Alertmanager routes the alert to its default
# "null" receiver, so no notification is sent — this is intentional. The alert
# is still visible in the Prometheus UI at /alerts regardless of receiver config.
#
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C2
# T1 (cardinality bomb) + P4 (<100k active series)
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: olam-cardinality
namespace: monitoring
labels:
app.kubernetes.io/name: olam-prometheus-rules
app.kubernetes.io/managed-by: olam
# REQUIRED: matches Prometheus CR's ruleSelector (release: "olam-prom").
# Verified via helm template output, 2026-05-21.
release: olam-prom
spec:
groups:
# Single rule group; its rules are evaluated on the 30s interval below.
- name: olam-cardinality
interval: 30s
rules:
# Warning-level alert: prometheus_tsdb_head_series must stay above 80000
# (80% of the 100k cap) for 5 minutes (`for: 5m`) before the alert fires.
# The `scope: cardinality` label tags it as a cardinality-budget alert.
- alert: OlamActiveSeriesHigh
expr: |
prometheus_tsdb_head_series > 80000
for: 5m
labels:
severity: warning
scope: cardinality
annotations:
summary: "Active series above 80k threshold (80% of 100k cap)"
description: |
prometheus_tsdb_head_series is {{ $value | humanize }} — within 20%
of the 100k cardinality budget (P4). Investigate which service is
emitting a new high-cardinality label, OR add a DROP rule to
kube-prom-stack-values.yaml metricRelabelings for that ServiceMonitor.
Runbook: docs/architecture/observability-cardinality.md (TBD — C4+)
# 91-servicemonitor-host-cp.yaml — Phase C Task C3 ServiceMonitor for host-cp.
#
# Registers host-cp's /metrics endpoint with Prometheus for scraping.
#
# NOTE: This manifest requires the ServiceMonitor CRD installed by
# kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
# apply-manifests.sh (which targets the Phase A ingress harness) and is
# applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
#
# Namespace placement (CRITICAL — C2 dogfood lesson):
# ServiceMonitors MUST live in the `monitoring` namespace to be discovered
# by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
# in any other namespace is silently ignored by default RBAC.
#
# Label compliance:
# `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector
# (verified via `helm template ... | grep -A3 serviceMonitorSelector`).
#
# Target selector:
# Matches the host-cp Service by its `app: olam-host-cp` label. Adjust if
# the Service label differs in the target cluster (check
# `kubectl get svc -n olam -l app=olam-host-cp`).
#
# metricRelabelings (layer-2 cardinality enforcement):
# Mirrors the `*cardinality-labeldrop` YAML anchor from
# kube-prom-stack-values.yaml. host-cp's /metrics is taxonomy-compliant
# (only {service,route,method,status_code} labels), but the labeldrop rule
# is present as defense-in-depth: if a future code change accidentally
# emits a banned label (world_id etc.), this ServiceMonitor drops it before
# ingest so the cardinality cap is never breached.
#
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
# T1 (cardinality bomb) + P4 (<100k active series)
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: olam-host-cp
namespace: monitoring
labels:
app.kubernetes.io/name: olam-host-cp-monitor
app.kubernetes.io/managed-by: olam
# REQUIRED: matches Prometheus CR's serviceMonitorSelector.
release: olam-prom
spec:
# Discover the host-cp Service in the olam namespace.
namespaceSelector:
matchNames:
- olam
# Select Services carrying the label app=olam-host-cp.
selector:
matchLabels:
app: olam-host-cp
endpoints:
# Scrape the Service port named `http` at /metrics every 15s.
- port: http
path: /metrics
interval: 15s
# Preserve the application-emitted `service` label. Without honorLabels,
# Prometheus's target-label injection (where `service` = the k8s Service
# name `olam-host-cp`) overrides the application's own `service=host-cp`
# value, moving the app's value into `exported_service`. The C5 drill-in
# dashboards filter on `service=host-cp`, so without honorLabels their
# panels show empty data. Surfaced during 2026-05-21 operator dogfood —
# see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3.
honorLabels: true
# Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
# in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
# even if the service accidentally emits them.
# The regex is matched against label NAMES; matching labels are removed
# from every scraped sample before ingest.
metricRelabelings:
- action: labeldrop
regex: 'world_id|trace_id|user_id|request_id|operator_id'
# 92-servicemonitor-kg-service.yaml — Phase C Task C3 ServiceMonitor for kg-service.
#
# Registers kg-service's /metrics endpoint with Prometheus for scraping.
#
# NOTE: This manifest requires the ServiceMonitor CRD installed by
# kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
# apply-manifests.sh (which targets the Phase A ingress harness) and is
# applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
#
# Namespace placement (CRITICAL — C2 dogfood lesson):
# ServiceMonitors MUST live in the `monitoring` namespace to be discovered
# by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
# in any other namespace is silently ignored by default RBAC.
#
# Label compliance:
# `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector
# (verified via `helm template ... | grep -A3 serviceMonitorSelector`).
#
# Target selector:
# Matches the kg-service Service by its `app: olam-kg-service` label. Adjust
# if the Service label differs in the target cluster (check
# `kubectl get svc -n olam -l app=olam-kg-service`).
#
# metricRelabelings (layer-2 cardinality enforcement):
# Mirrors the `*cardinality-labeldrop` YAML anchor from
# kube-prom-stack-values.yaml. kg-service's /metrics is taxonomy-compliant
# (only {service,route,method,status_code} labels), but the labeldrop rule
# is present as defense-in-depth: if a future code change accidentally
# emits a banned label (world_id etc.), this ServiceMonitor drops it before
# ingest so the cardinality cap is never breached.
#
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
# T1 (cardinality bomb) + P4 (<100k active series)
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: olam-kg-service
namespace: monitoring
labels:
app.kubernetes.io/name: olam-kg-service-monitor
app.kubernetes.io/managed-by: olam
# REQUIRED: matches Prometheus CR's serviceMonitorSelector.
release: olam-prom
spec:
# Discover the kg-service Service in the olam namespace.
namespaceSelector:
matchNames:
- olam
# Select Services carrying the label app=olam-kg-service.
selector:
matchLabels:
app: olam-kg-service
endpoints:
# Scrape the Service port named `http` at /metrics every 15s.
- port: http
path: /metrics
interval: 15s
# Preserve the application-emitted `service` label. Without honorLabels,
# Prometheus's target-label injection (where `service` = the k8s Service
# name `olam-kg-service`) overrides the application's own `service=kg-service`
# value, moving the app's value into `exported_service`. The C5 drill-in
# dashboards filter on `service=kg-service`, so without honorLabels their
# panels show empty data. Surfaced during 2026-05-21 operator dogfood —
# see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3.
honorLabels: true
# Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
# in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
# even if the service accidentally emits them.
# The regex is matched against label NAMES; matching labels are removed
# from every scraped sample before ingest.
metricRelabelings:
- action: labeldrop
regex: 'world_id|trace_id|user_id|request_id|operator_id'
# 93-servicemonitor-memory-service.yaml — Phase C Task C3 closure ServiceMonitor.
#
# Registers memory-service's /metrics endpoint with Prometheus for scraping.
# C3 originally shipped instrumentation for host-cp + kg-service (PR #787) but
# DEFERRED memory-service because the third-party `agentmemory` Node CLI that
# runs in k3s exposes no /metrics endpoint. This PR closes that deferral by
# shipping a small Node HTTP front-door (packages/memory-service/src/metrics-proxy.mjs)
# inside the container image: external traffic hits the proxy on :3111, the
# proxy short-circuits /metrics + forwards everything else to agentmemory on
# loopback :3110. End-state matches the host-cp/kg-service shape so the ServiceMonitor
# pattern below is a near-clone of 91-servicemonitor-host-cp.yaml.
#
# NOTE: This manifest requires the ServiceMonitor CRD installed by
# kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
# apply-manifests.sh (which targets the Phase A ingress harness) and is
# applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
#
# Namespace placement (CRITICAL — C2 dogfood lesson):
# ServiceMonitors MUST live in the `monitoring` namespace to be discovered
# by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
# in any other namespace is silently ignored by default RBAC.
#
# Label compliance:
# `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector.
#
# Target selector:
# Matches the memory-service Service by its `app: olam-memory-service` label.
# The Service is defined in packages/host-cp/k8s/manifests/memory-service/60-service.yaml
# (port `http` -> targetPort 3111). The 50-traefik-ingressroute-agent-memory.yaml
# IngressRoute references the same Service for /api/agent-memory/* traffic.
#
# Image rollout dependency:
# The proxy lives inside the container image. Until the next release pipeline
# refreshes ghcr.io/pleri/olam-memory-service with the post-C3-closure
# Dockerfile (npm run refresh:manifest-digests), this ServiceMonitor will scrape
# a target that responds 404 to /metrics. Prometheus tolerates that (the target
# stays UP, scrape_samples_scraped=0). When the new image lands, scraping
# begins producing real samples without any cluster-side change.
#
# metricRelabelings (layer-2 cardinality enforcement):
# Mirrors the `*cardinality-labeldrop` YAML anchor from
# kube-prom-stack-values.yaml. memory-service's /metrics is taxonomy-compliant
# (only {service,route,method,status_code} labels), but the labeldrop rule
# is present as defense-in-depth: if a future code change accidentally
# emits a banned label (world_id etc.), this ServiceMonitor drops it before
# ingest so the cardinality cap is never breached.
#
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
# T1 (cardinality bomb) + P4 (<100k active series).
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: olam-memory-service
namespace: monitoring
labels:
app.kubernetes.io/name: olam-memory-service-monitor
app.kubernetes.io/managed-by: olam
# REQUIRED: matches Prometheus CR's serviceMonitorSelector.
release: olam-prom
spec:
# Discover the memory-service Service in the olam namespace.
namespaceSelector:
matchNames:
- olam
# Select Services carrying the label app=olam-memory-service.
selector:
matchLabels:
app: olam-memory-service
endpoints:
# Scrape the Service port named `http` at /metrics every 15s.
- port: http
path: /metrics
interval: 15s
# Preserve the application-emitted `service` label. Without honorLabels,
# Prometheus's target-label injection (where `service` = the k8s Service
# name `olam-memory-service`) overrides the application's own
# `service=memory-service` value, moving the app's value into
# `exported_service`. The C5 drill-in dashboards filter on
# `service=memory-service`, so without honorLabels their panels show
# empty data. Same lesson as the host-cp/kg-service ServiceMonitors —
# see docs/incidents/2026-05-21-phase-c-dogfood.md finding #3.
honorLabels: true
# Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
# in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
# even if the service accidentally emits them.
# The regex is matched against label NAMES; matching labels are removed
# from every scraped sample before ingest.
metricRelabelings:
- action: labeldrop
regex: 'world_id|trace_id|user_id|request_id|operator_id'
# 95-prom-recording-rules.yaml — Phase C Task C4
#
# Naming convention: olam:<metric>:<aggregation>
#
# olam — project namespace prefix (all project recording rules share this)
# <metric> — the base Prometheus metric being aggregated (without _bucket/_total suffix
# when the aggregation already implies the source type)
# <aggregation> — describes what was computed + the grouping dimensions, e.g.
# p95_by_service_route, rate5m_by_service, ratio_by_service_route
#
# Modeled on the community convention from
# https://prometheus.io/docs/practices/rules/#naming — <level>:<metric>:<ops>.
# The <aggregation> suffix encodes BOTH the operation (p95, rate5m, ratio) and
# the grouping dimensions (_by_service, _by_service_route) so dashboard panels
# can select the pre-computed series without further aggregation.
#
# Source metrics (provided by C3 — host-cp + kg-service ServiceMonitors):
# http_request_duration_seconds_bucket{service, route, method, status_code, le}
# http_requests_total{service, route, method, status_code}
#
# rule group interval: 30s — twice the 15s scrape interval (15s × 2). Balances
# freshness vs evaluation CPU; at 30s each window is re-evaluated twice per
# minute, keeping percentiles and rates responsive without hammering the TSDB.
#
# NOTE: recording rules intentionally reference NO banned labels
# (world_id, trace_id, user_id, request_id, operator_id). C2's labeldrop at
# scrape time strips them before ingest; even if a metric slipped through,
# referencing them here would suppress results. Defense-in-depth: don't type
# them at all.
#
# Applied by: scripts/e2e/prom-no-double-grafana.sh (C4 assertion block)
# Skipped by: scripts/test-ingress-integration/apply-manifests.sh
# (9[0-9]-prom-* glob) — requires kube-prom-stack CRDs to be present.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: olam-recording-rules
namespace: monitoring
labels:
app.kubernetes.io/name: olam-prometheus-rules
app.kubernetes.io/managed-by: olam
release: olam-prom # must match kube-prom-stack ruleSelector (verified C2)
spec:
groups:
# One group; every recording rule below is re-evaluated on the 30s interval.
- name: olam-http-aggregations
interval: 30s
rules:
# ============================================================
# Latency percentiles per service+route — Phase C Task C4
# Source: http_request_duration_seconds_bucket (C3)
# ============================================================
# All quantile rules share one shape: per-bucket rate() over a 5m window,
# summed by the grouping labels plus `le`, then fed to histogram_quantile().
- record: olam:http_request_duration_seconds:p50_by_service_route
expr: |
histogram_quantile(0.50, sum by (service, route, le) (
rate(http_request_duration_seconds_bucket[5m])
))
- record: olam:http_request_duration_seconds:p95_by_service_route
expr: |
histogram_quantile(0.95, sum by (service, route, le) (
rate(http_request_duration_seconds_bucket[5m])
))
- record: olam:http_request_duration_seconds:p99_by_service_route
expr: |
histogram_quantile(0.99, sum by (service, route, le) (
rate(http_request_duration_seconds_bucket[5m])
))
# Aggregate p95 across all routes (per-service summary)
- record: olam:http_request_duration_seconds:p95_by_service
expr: |
histogram_quantile(0.95, sum by (service, le) (
rate(http_request_duration_seconds_bucket[5m])
))
# ============================================================
# Request rate per service+route
# Source: http_requests_total (C3)
# ============================================================
- record: olam:http_requests:rate5m_by_service_route
expr: |
sum by (service, route) (rate(http_requests_total[5m]))
# Aggregate request rate per service
- record: olam:http_requests:rate5m_by_service
expr: |
sum by (service) (rate(http_requests_total[5m]))
# ============================================================
# Error rate (status_code >= 500) per service+route
# 4xx are client errors and are intentionally excluded from
# the error ratio — only server-side failures count.
# ============================================================
# The matcher status_code=~"5.." selects three-character codes that
# begin with 5 (500-599).
- record: olam:http_errors:rate5m_by_service_route
expr: |
sum by (service, route) (
rate(http_requests_total{status_code=~"5.."}[5m])
)
# Error ratio (errors / total) per service+route.
# Returns NaN when total rate is 0 (no traffic) — dashboards
# should handle NaN as "no data" rather than "0% error rate".
# NOTE(review): a (service, route) pair that has no 5xx series at all has
# no numerator sample, so this division presumably yields no series for
# it (rather than 0) — confirm dashboards treat an absent ratio correctly.
- record: olam:http_errors:ratio_by_service_route
expr: |
sum by (service, route) (rate(http_requests_total{status_code=~"5.."}[5m]))
/
sum by (service, route) (rate(http_requests_total[5m]))
# 96-kyverno-cardinality-mutate.yaml — Phase C C8 follow-up.
#
# Closes codex's C2 concern: per-ServiceMonitor metricRelabelings is
# "policy by convention". A third-party ServiceMonitor or PodMonitor that
# olam doesn't author can bypass the labeldrop and reintroduce the
# cardinality bomb (T1). YAML anchors in kube-prom-stack-values.yaml keep
# Olam-owned manifests DRY but don't make the cluster safe.
#
# This ClusterPolicy mutates EVERY incoming ServiceMonitor and PodMonitor
# at admission time — regardless of who created it (chart, kubectl, operator,
# CI, GitOps) — to ensure the cardinality labeldrop rule is present on
# every endpoint. Once persisted, the prometheus-operator renders the
# relabel into Prometheus's scrape config.
#
# Why mutate-only (not validate):
# Validate would block a chart install or operator action mid-stride
# if a third-party ServiceMonitor lacks the rule. Mutate is the better
# posture: silently ensure the rule is present without breaking
# legitimate installs. Defense-in-depth still lives in TWO layers:
# (a) admission-time mutation (this policy)
# (b) per-ServiceMonitor metricRelabelings in
# kube-prom-stack-values.yaml + 9x-servicemonitor-*.yaml.
#
# Idempotency contract:
# Mutation must NOT add a duplicate labeldrop entry. Achieved by
# two-rule split per kind, each with a precondition that the labeldrop
# is currently ABSENT. Once present, neither rule fires:
# - Rule A (handle absent/empty case): preconditions:
# metricRelabelings is null/missing OR empty array.
# JSON patch: `add /spec/endpoints/{i}/metricRelabelings` with
# a single-element array containing our rule.
# - Rule B (handle existing-but-no-labeldrop case): preconditions:
# metricRelabelings is a non-empty array AND no entry has
# `action: labeldrop` with `regex` mentioning `world_id`.
# JSON patch: `add /spec/endpoints/{i}/metricRelabelings/-`
# appending our rule.
#
# Verified behavior (kyverno-cardinality-mutate.sh asserts):
# - Bare ServiceMonitor (no metricRelabelings) → Rule A injects
# - ServiceMonitor with metricRelabelings: [] → Rule A injects (replaces empty)
# - ServiceMonitor with unrelated metricRelabelings entries → Rule B appends
# - ServiceMonitor with matching labeldrop already present → NEITHER rule fires (idempotent)
# - Mixed: some endpoints lack it, others have it → only the lacking endpoints are mutated
#
# Background scan: OFF (background: false). Existing ServiceMonitors at
# install time are NOT auto-mutated. Re-apply them to trigger admission,
# or rely on the C2 per-ServiceMonitor metricRelabelings as the failsafe.
#
# failurePolicy: Ignore. Kyverno webhook timeout / pod outage MUST NOT
# block ServiceMonitor admission — the C2 layer-2 rules still protect
# Olam-owned monitors. Trade-off accepted: during Kyverno downtime, a
# brand-new third-party ServiceMonitor could land without the labeldrop.
# The 80k active-series PrometheusRule alert (Phase C C2,
# 90-prom-alert-cardinality.yaml) is the runtime detector that fires
# if this gap is exploited.
#
# Refs:
# - docs/plans/k3s-ingress-observability/phase-c-tasks.md — C8
# - codex review on PR #783 ("policy by convention" finding)
# - https://kyverno.io/docs/writing-policies/mutate/
# - https://kyverno.io/docs/writing-policies/mutate/#foreach
---
apiVersion: kyverno.io/v1
kind: ClusterPolicy
metadata:
name: enforce-cardinality-labeldrop
labels:
app.kubernetes.io/part-of: olam
olam.io/phase: c-followup
annotations:
policies.kyverno.io/title: "Cluster-wide cardinality labeldrop enforcement"
policies.kyverno.io/category: "Observability"
policies.kyverno.io/severity: high
policies.kyverno.io/subject: "ServiceMonitor, PodMonitor"
policies.kyverno.io/description: >-
Ensures every ServiceMonitor and PodMonitor carries a metricRelabelings
labeldrop rule for high-cardinality labels (world_id, trace_id, user_id,
request_id, operator_id) on every endpoint. Closes the "third-party chart
bypasses C2 labeldrop" gap surfaced during PR #783 review.
spec:
# Admission-time only: monitors that already exist are not scanned or
# mutated in the background; they must be re-applied to pick up the rule.
background: false
# Fail open: a Kyverno webhook timeout or outage must not block
# ServiceMonitor/PodMonitor admission.
failurePolicy: Ignore
# Presumably keeps policy updates from retroactively mutating existing
# monitors — confirm against the Kyverno version in use.
mutateExistingOnPolicyUpdate: false
rules:
# ---------------------------------------------------------------------
# ServiceMonitor — Rule A: metricRelabelings absent or empty
# ---------------------------------------------------------------------
- name: inject-labeldrop-sm-absent
match:
any:
- resources:
kinds:
- monitoring.coreos.com/v1/ServiceMonitor
mutate:
# Evaluated once per entry of spec.endpoints; `element` is the current
# endpoint and `elementIndex` its position in the list.
foreach:
- list: "request.object.spec.endpoints"
preconditions:
all:
# not_null() substitutes `[]` when the field is null/missing, so
# length() is 0 when the field is absent OR an empty array.
- key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
operator: Equals
value: 0
# JSON Patch `add` on an object member creates the member, or replaces it
# if it already exists (the empty-array case), with a one-element list
# holding the labeldrop rule.
patchesJson6902: |-
- op: add
path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings"
value:
- action: labeldrop
regex: "world_id|trace_id|user_id|request_id|operator_id"
# ---------------------------------------------------------------------
# ServiceMonitor — Rule B: metricRelabelings has entries, but no
# matching labeldrop for our banned-label regex.
#
# We test `contains(regex, 'world_id')` rather than equality so that
# operators who include additional banned labels in their own regex
# don't trigger duplicate injection. This is the idempotency hinge.
# ---------------------------------------------------------------------
- name: inject-labeldrop-sm-append
match:
any:
- resources:
kinds:
- monitoring.coreos.com/v1/ServiceMonitor
mutate:
# Evaluated once per entry of spec.endpoints; `element` is the current
# endpoint and `elementIndex` its position in the list.
foreach:
- list: "request.object.spec.endpoints"
preconditions:
all:
# Condition 1: the endpoint already has at least one metricRelabelings
# entry (the complement of Rule A's precondition).
- key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
operator: GreaterThan
value: 0
# Condition 2: none of those entries is a labeldrop whose regex mentions
# world_id. A missing regex is treated as '' so contains() is always
# handed a string.
- key: >-
{{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }}
operator: Equals
value: 0
# The trailing `/-` in the JSON Patch path appends the rule to the end of
# the existing array, leaving the operator's own entries in place.
patchesJson6902: |-
- op: add
path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings/-"
value:
action: labeldrop
regex: "world_id|trace_id|user_id|request_id|operator_id"
# ---------------------------------------------------------------------
# PodMonitor — Rule A: podMetricsEndpoints[*].metricRelabelings absent
# or empty (same precondition as the ServiceMonitor Rule A).
# ---------------------------------------------------------------------
- name: inject-labeldrop-pm-absent
match:
any:
- resources:
kinds:
- monitoring.coreos.com/v1/PodMonitor
mutate:
# Evaluated once per entry of spec.podMetricsEndpoints; `element` is the
# current endpoint and `elementIndex` its position in the list.
foreach:
- list: "request.object.spec.podMetricsEndpoints"
preconditions:
all:
# not_null() substitutes `[]` when the field is null/missing, so
# length() is 0 when the field is absent OR an empty array.
- key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
operator: Equals
value: 0
# JSON Patch `add` on an object member creates the member, or replaces it
# if it already exists (the empty-array case), with a one-element list
# holding the labeldrop rule.
patchesJson6902: |-
- op: add
path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings"
value:
- action: labeldrop
regex: "world_id|trace_id|user_id|request_id|operator_id"
# ---------------------------------------------------------------------
# PodMonitor — Rule B: metricRelabelings exists, no labeldrop
# ---------------------------------------------------------------------
- name: inject-labeldrop-pm-append
match:
any:
- resources:
kinds:
- monitoring.coreos.com/v1/PodMonitor
mutate:
# Evaluated once per entry of spec.podMetricsEndpoints; `element` is the
# current endpoint and `elementIndex` its position in the list.
foreach:
- list: "request.object.spec.podMetricsEndpoints"
preconditions:
all:
# Condition 1: the endpoint already has at least one metricRelabelings
# entry (the complement of the PodMonitor Rule A precondition).
- key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
operator: GreaterThan
value: 0
# Condition 2: none of those entries is a labeldrop whose regex mentions
# world_id. A missing regex is treated as '' so contains() is always
# handed a string.
- key: >-
{{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }}
operator: Equals
value: 0
# The trailing `/-` in the JSON Patch path appends the rule to the end of
# the existing array, leaving the existing entries in place.
patchesJson6902: |-
- op: add
path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings/-"
value:
action: labeldrop
regex: "world_id|trace_id|user_id|request_id|operator_id"
// Recovery engine — the single entry point for bounded auto-attempts.
//
// Key invariants:
// 1. ONE attempt per (worldId, failureKind) pair. The ledger enforces
// idempotency: a second call with the same key runs no steps; it records
// and returns a NEW entry with outcome='escalated' that reuses the prior
// entry's scenario name.
// 2. Concurrent calls for the same (worldId, failureKind) key fire only
// ONE attempt. An in-flight Map holds the running Promise; concurrent
// callers await the same Promise.
// 3. Steps execute in order. First failing step short-circuits to
// outcome='failed'; subsequent steps are NOT executed.
// 4. All attempts (success, failed, escalated) are written to the ledger.
//
// The engine is async and pure-functional with respect to the host-stream:
// callers (server.mjs) are responsible for emitting the recovery.* events
// AFTER receiving the returned RecoveryLedgerEntry. The engine does not
// broadcast directly, keeping it testable without a hostStream fixture.
import { findScenarioForKind } from './scenarios.mjs';
import { appendLedgerEntry, findPriorEntry } from './ledger.mjs';
import { runStep } from './step-runners.mjs';
import { DEFAULT_LEDGER_PATH } from './ledger.mjs';
/**
* @typedef {import('./ledger.mjs').RecoveryLedgerEntry} RecoveryLedgerEntry
* @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
*/
// Pending attempts, keyed by `${worldId}::${failureKind ?? 'null'}`. A key is
// present only while its attempt is running; it is removed once the attempt
// settles (fulfilled or rejected).
/** @type {Map<string, Promise<RecoveryLedgerEntry>>} */
const _inFlight = new Map();

/**
 * Run — or join — the bounded recovery attempt for a world + failure kind.
 *
 * While an attempt for the same (worldId, failureKind) key is still pending,
 * every caller gets that same Promise back instead of starting a second one.
 *
 * @param {string} worldId
 * @param {object} [evidence] — WorldStartupEvidence, optional
 * @param {FailureKindOrNull} [failureKind] — classified bucket, or null for non-FSM triggers
 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} [opts]
 * @returns {Promise<RecoveryLedgerEntry>}
 */
export function attemptRecovery(worldId, evidence, failureKind = null, opts = {}) {
  const dedupeKey = `${worldId}::${failureKind ?? 'null'}`;

  // Join the running attempt, if any.
  if (_inFlight.has(dedupeKey)) {
    return _inFlight.get(dedupeKey);
  }

  // Start a new attempt and drop the key once it settles, whatever the result.
  const tracked = _attempt(worldId, evidence, failureKind, opts).finally(() => {
    _inFlight.delete(dedupeKey);
  });
  _inFlight.set(dedupeKey, tracked);
  return tracked;
}
/**
 * Internal: perform one recovery attempt and record it in the ledger.
 *
 * A throwing step is captured into the returned entry (outcome='failed').
 * Ledger access is awaited without a guard, so an error from findPriorEntry
 * or appendLedgerEntry rejects the returned Promise.
 *
 * @param {string} worldId
 * @param {object} [evidence]
 * @param {FailureKindOrNull} failureKind
 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} opts
 * @returns {Promise<RecoveryLedgerEntry>}
 */
async function _attempt(worldId, evidence, failureKind, opts) {
  const { ledgerPath = DEFAULT_LEDGER_PATH, log = (msg) => console.warn(`[recovery] ${msg}`) } = opts;
  const startedAt = Date.now();
  const kindOrNull = failureKind ?? null;

  // Build the ledger entry (stamping endedAt now), persist it, hand it back.
  // errorReason is only attached when one was supplied.
  const record = async (scenarioName, stepsRun, outcome, errorReason) => {
    const entry = /** @type {RecoveryLedgerEntry} */ ({
      worldId,
      failureKind: kindOrNull,
      scenario: scenarioName,
      stepsRun,
      startedAt,
      endedAt: Date.now(),
      outcome,
    });
    if (errorReason !== undefined) {
      entry.errorReason = errorReason;
    }
    await appendLedgerEntry(entry, ledgerPath);
    return entry;
  };

  // Idempotency guard: a previously recorded attempt for this key means no
  // steps run; an 'escalated' entry is written instead.
  const prior = await findPriorEntry(worldId, failureKind, ledgerPath);
  if (prior !== undefined) {
    const escalated = await record(
      prior.scenario,
      [],
      'escalated',
      `prior attempt already recorded (outcome=${prior.outcome})`,
    );
    log(`recovery idempotency: escalated (worldId=${worldId}, kind=${failureKind})`);
    return escalated;
  }

  // Look up the recipe for this failure kind.
  const scenario = findScenarioForKind(failureKind);
  if (!scenario) {
    const unmatched = await record('unmatched', [], 'failed', 'no scenario matched');
    log(`recovery: no scenario for kind=${failureKind} (worldId=${worldId})`);
    return unmatched;
  }

  log(`recovery: starting scenario="${scenario.name}" for worldId=${worldId}`);

  // Run the steps in order; the first one that throws ends the run.
  /** @type {import('./recipes.mjs').RecoveryStep[]} */
  const executed = [];
  /** @type {string | undefined} */
  let failure;
  for (const step of scenario.recipe.steps) {
    executed.push(step); // recorded before running, so a failing step is listed too
    try {
      await runStep(step, { worldId, evidence, log });
    } catch (err) {
      failure = `step "${step.kind}" threw: ${err?.message ?? String(err)}`;
      log(`recovery: step failed — ${failure}`);
      break;
    }
  }

  /** @type {'success' | 'failed'} */
  const outcome = failure === undefined ? 'success' : 'failed';
  const entry = await record(scenario.name, executed, outcome, failure);
  log(`recovery: scenario="${scenario.name}" outcome=${outcome} (worldId=${worldId})`);
  return entry;
}
/**
 * Test-only helper: empty the in-flight map so pending-attempt state cannot
 * leak from one test into the next. It only clears the map; Promises that
 * were stored in it are left untouched.
 */
export function _clearInFlight() {
  _inFlight.clear();
}
// Recovery module barrel export.
//
// Public surface:
// - attemptRecovery — the engine entry point for callers (server.mjs)
// - FAILURE_SCENARIOS — the 7 named scenarios
// - findScenarioForKind — scenario lookup by failureKind
// - appendLedgerEntry / readAllLedgerEntries / findPriorEntry — ledger I/O
// - setStepRunnerSeams — test seam injection for step runners
//
// Internal:
// - _clearInFlight — test helper; not intended for production use
export { attemptRecovery, _clearInFlight } from './engine.mjs';
export { FAILURE_SCENARIOS, findScenarioForKind } from './scenarios.mjs';
export { appendLedgerEntry, readAllLedgerEntries, findPriorEntry, DEFAULT_LEDGER_PATH } from './ledger.mjs';
export { runStep, setStepRunnerSeams } from './step-runners.mjs';
// RecoveryLedger — append-only NDJSON persistence for recovery attempts.
//
// Each attempt writes one JSON line to the ledger file. The file grows
// monotonically; entries are never updated in-place. This keeps the
// ledger auditable and safe to tail/parse with `jq` while the process
// is running.
//
// Default path: ~/.olam/logs/recovery-ledger.ndjson
// Override: set OLAM_RECOVERY_LEDGER_PATH (useful in tests — point at a
// tmp file to isolate test runs from the real operator ledger).
import { open, mkdir, access } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { homedir } from 'node:os';
import { createReadStream } from 'node:fs';
import { createInterface } from 'node:readline';
import { redactSensitive } from '../observability/redactor.mjs';
// Ledger location, resolved once at module load.
// OLAM_RECOVERY_LEDGER_PATH wins when set to a non-empty value. An empty value
// (e.g. `export OLAM_RECOVERY_LEDGER_PATH=`) is treated as unset: `||` is used
// instead of `??` so that '' falls back to the default rather than becoming
// the ledger path and making every open() fail.
export const DEFAULT_LEDGER_PATH =
  process.env.OLAM_RECOVERY_LEDGER_PATH ||
  join(homedir(), '.olam', 'logs', 'recovery-ledger.ndjson');
/**
* @typedef {object} RecoveryLedgerEntry
* @property {string} worldId
* @property {string | null} failureKind — WorldStartupFailureKind or null (non-FSM trigger)
* @property {string} scenario — kebab-case scenario name, or 'unmatched'
* @property {import('./recipes.mjs').RecoveryStep[]} stepsRun — steps actually executed (may be partial on failure)
* @property {number} startedAt — epoch ms
* @property {number} endedAt — epoch ms
* @property {'success' | 'failed' | 'escalated'} outcome
* @property {string} [errorReason] — set on failed/escalated outcomes
*/
/**
 * Persist one RecoveryLedgerEntry as a single NDJSON line at the end of the
 * ledger file, creating the parent directory first if needed.
 *
 * The entry is passed through redactSensitive() before being serialized.
 *
 * @param {RecoveryLedgerEntry} entry
 * @param {string} [ledgerPath]
 * @returns {Promise<void>}
 */
export async function appendLedgerEntry(entry, ledgerPath = DEFAULT_LEDGER_PATH) {
  const parentDir = dirname(ledgerPath);
  await mkdir(parentDir, { recursive: true });

  // 'a' = append mode: existing lines are never rewritten.
  const handle = await open(ledgerPath, 'a');
  try {
    const redacted = redactSensitive(entry);
    const line = `${JSON.stringify(redacted)}\n`;
    await handle.write(line);
  } finally {
    // Always release the file handle, even if redaction or the write throws.
    await handle.close();
  }
}
/**
 * Read all entries from the ledger (in append order).
 *
 * Blank lines, lines that are not valid JSON, and lines whose JSON value is
 * not a plain object (e.g. `null`, a number, an array) are skipped, so callers
 * can safely read properties off every returned entry.
 *
 * @param {string} [ledgerPath]
 * @returns {Promise<RecoveryLedgerEntry[]>} empty array when the ledger file does not exist yet
 * @throws rejects with the underlying fs error when the existence check fails
 *   for any reason other than ENOENT (e.g. EACCES, ENOTDIR)
 */
export async function readAllLedgerEntries(ledgerPath = DEFAULT_LEDGER_PATH) {
  /** @type {RecoveryLedgerEntry[]} */
  const entries = [];

  // Check existence before streaming — createReadStream reports ENOENT as an
  // 'error' event rather than a synchronous throw, so an explicit access
  // check keeps the "not yet written" path simple.
  try {
    await access(ledgerPath);
  } catch (err) {
    // Only "file does not exist yet" means an empty ledger. Any other failure
    // is surfaced: treating an unreadable ledger as empty would make the
    // idempotency check believe no prior attempt was ever recorded.
    if (err?.code === 'ENOENT') return entries;
    throw err;
  }

  const stream = createReadStream(ledgerPath, { encoding: 'utf8' });
  const rl = createInterface({ input: stream, crlfDelay: Infinity });
  for await (const line of rl) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Malformed line — skip and continue (deliberate best-effort read).
      continue;
    }

    // Valid JSON that is not an object is not a ledger entry; a bare `null`
    // here would make consumers throw on property access.
    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      entries.push(parsed);
    }
  }
  return entries;
}
/**
 * Look up the newest ledger entry recorded for a (worldId, failureKind) pair.
 *
 * A nullish failureKind is matched against entries whose failureKind is null.
 *
 * @param {string} worldId
 * @param {string|null} failureKind
 * @param {string} [ledgerPath]
 * @returns {Promise<RecoveryLedgerEntry | undefined>} undefined when nothing matches
 */
export async function findPriorEntry(worldId, failureKind, ledgerPath = DEFAULT_LEDGER_PATH) {
  const wantedKind = failureKind ?? null;
  const entries = await readAllLedgerEntries(ledgerPath);

  // The ledger is in append order, so scan a reversed copy: the first hit is
  // the most recent match.
  const newestFirst = [...entries].reverse();
  return newestFirst.find(
    (candidate) => candidate.worldId === worldId && candidate.failureKind === wantedKind,
  );
}
// Recovery step types and recipe interface — the discriminated union of
// all named steps that can appear in a RecoveryRecipe.
//
// Step runners for each kind live in step-runners.mjs. The engine in
// engine.mjs iterates a recipe's steps array and dispatches each to the
// appropriate runner.
//
// A RecoveryRecipe is an ordered list of steps. Steps execute in order;
// the first failing step short-circuits to a 'failed' outcome.
/**
* @typedef {{ kind: 'NotifyOperator', message?: string }} NotifyOperatorStep
* @typedef {{ kind: 'ResendTrustPrompt' }} ResendTrustPromptStep
* @typedef {{ kind: 'WaitFor', durationMs: number }} WaitForStep
* @typedef {{ kind: 'RestartTransport' }} RestartTransportStep
* @typedef {{ kind: 'ResendDispatch' }} ResendDispatchStep
* @typedef {{ kind: 'RestartWorker' }} RestartWorkerStep
* @typedef {{ kind: 'RestartMcpServer', serverName: string }} RestartMcpServerStep
* @typedef {{ kind: 'RetryHandshake', timeoutMs: number }} RetryHandshakeStep
* @typedef {{ kind: 'ReadPluginErrors' }} ReadPluginErrorsStep
* @typedef {{ kind: 'RestartPlugin', pluginName: string }} RestartPluginStep
* @typedef {{ kind: 'RebaseBranch' }} RebaseBranchStep
* @typedef {{ kind: 'CleanBuild' }} CleanBuildStep
*
* @typedef {| NotifyOperatorStep
* | ResendTrustPromptStep
* | WaitForStep
* | RestartTransportStep
* | ResendDispatchStep
* | RestartWorkerStep
* | RestartMcpServerStep
* | RetryHandshakeStep
* | ReadPluginErrorsStep
* | RestartPluginStep
* | RebaseBranchStep
* | CleanBuildStep
* } RecoveryStep
*/
/**
* @typedef {object} RecoveryRecipe
* @property {string} scenarioName — human-readable name of the scenario
* @property {RecoveryStep[]} steps — ordered list of steps to execute
*/
export {};
// Recovery scenarios — named mappings from WorldStartupFailureKind (or a
// special non-FSM signal) to a deterministic RecoveryRecipe.
//
// Order within each recipe is load-bearing: steps execute in sequence,
// first failure short-circuits. Designed for ONE bounded auto-attempt;
// callers MUST NOT retry a scenario — the engine's idempotency guard
// enforces this at the (worldId, failureKind) level.
//
// The 'stale-branch' scenario has no failureKind (null) — it is triggered
// by a non-FSM signal (e.g. CI indicating the branch is stale). The engine
// accepts null as a valid key but treats it as a distinct bucket.
/**
* @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep
* @typedef {import('./recipes.mjs').RecoveryRecipe} RecoveryRecipe
* @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
*/
/**
* @typedef {object} FailureScenario
* @property {string} name — kebab-case identifier
* @property {FailureKindOrNull} failureKind — the FSM bucket this scenario handles (null = non-FSM trigger)
* @property {string} description — one-line human summary
* @property {RecoveryRecipe} recipe
*/
/**
 * The catalogue of recovery scenarios, in declaration order.
 *
 * Each row below is expanded into a FailureScenario whose recipe carries the
 * same name as the scenario itself (`recipe.scenarioName === name`). Only the
 * outer array is frozen.
 *
 * @type {readonly FailureScenario[]}
 */
export const FAILURE_SCENARIOS = Object.freeze(
  [
    {
      name: 'trust-gate-stuck',
      failureKind: 'TrustGateUnanswered',
      description: 'Agent reached TrustRequired but no trust approval arrived within the timeout.',
      steps: [
        { kind: 'NotifyOperator', message: 'Trust gate unanswered — re-sending trust prompt.' },
        { kind: 'ResendTrustPrompt' },
        { kind: 'WaitFor', durationMs: 30_000 },
      ],
    },
    {
      name: 'prompt-misdelivery',
      failureKind: 'PromptMisdelivery',
      description: 'Dispatch was sent but the agent never received it (transport mismatch).',
      steps: [{ kind: 'RestartTransport' }, { kind: 'ResendDispatch' }],
    },
    {
      name: 'transport-dead',
      failureKind: 'TransportDead',
      description: 'stdin/stdout/IPC channel never opened.',
      steps: [{ kind: 'RestartTransport' }, { kind: 'RestartWorker' }],
    },
    {
      name: 'mcp-handshake-stall',
      failureKind: 'McpHandshakeStall',
      description: 'MCP server connection initialized but never completed handshake.',
      steps: [
        { kind: 'RestartMcpServer', serverName: 'default' },
        { kind: 'RetryHandshake', timeoutMs: 15_000 },
      ],
    },
    {
      name: 'plugin-startup-failed',
      failureKind: 'PluginStartupFailed',
      description: 'Plugin or skill source failed to load on boot.',
      steps: [
        { kind: 'ReadPluginErrors' },
        { kind: 'RestartPlugin', pluginName: 'default' },
        { kind: 'ResendDispatch' },
      ],
    },
    {
      name: 'provider-process-gone',
      failureKind: 'ProviderProcessGone',
      description: 'Agent (Claude Code) process exited before responding.',
      steps: [{ kind: 'RestartWorker' }],
    },
    {
      // Non-FSM trigger: there is no failure kind for a stale branch.
      name: 'stale-branch',
      failureKind: null,
      description: 'Branch is stale relative to base — rebase + clean build required.',
      steps: [{ kind: 'RebaseBranch' }, { kind: 'CleanBuild' }],
    },
  ].map(({ name, failureKind, description, steps }) => ({
    name,
    failureKind,
    description,
    recipe: { scenarioName: name, steps },
  })),
);
/**
 * Resolve the scenario registered for a failure kind.
 *
 * Matching is by strict equality, so `null` selects the scenario declared
 * with `failureKind: null` (the non-FSM trigger). The first match in
 * FAILURE_SCENARIOS order wins.
 *
 * @param {FailureKindOrNull} failureKind
 * @returns {FailureScenario | undefined} undefined when no scenario handles the kind
 */
export function findScenarioForKind(failureKind) {
  for (const scenario of FAILURE_SCENARIOS) {
    if (scenario.failureKind === failureKind) {
      return scenario;
    }
  }
  return undefined;
}
// Step runners — one async function per RecoveryStep kind.
//
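// IMPLEMENTED (control flow only — every I/O call goes through the injectable
// seam functions defined further down, whose defaults are still log-only
// stubs that report success):
// RestartMcpServer — signals a restart of the named MCP server, then polls
// the handshake probe until it reports alive or the restart timeout elapses.
// RetryHandshake — re-initiates the MCP handshake sequence and polls for
// completion until the step's timeoutMs elapses.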
// STUB (TODO killshot-3-follow-up):
// All other step kinds log intent and return success. The stubs are
// intentionally not no-ops — they emit a console.warn so operators can see
// which steps fired without actually changing system state.
import { setTimeout as sleep } from 'node:timers/promises';
/**
* @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep
*
* @typedef {object} StepContext
* @property {string} worldId
* @property {object} [evidence] — WorldStartupEvidence bundle, may be undefined for non-FSM triggers
* @property {(msg: string) => void} [log] — optional logger; defaults to console.warn
*/
/**
 * Execute one recovery step against a world.
 *
 * `RestartMcpServer` and `RetryHandshake` are delegated to their dedicated
 * runners. Every other known kind is a stub: it only writes a `[stub] …` line
 * to the logger and resolves (note that `WaitFor` does not actually wait).
 * An unrecognised kind rejects.
 *
 * Failure is signalled by rejection — the engine catches and short-circuits.
 *
 * @param {RecoveryStep} step
 * @param {StepContext} ctx
 * @returns {Promise<void>}
 * @throws {Error} when the step kind is unknown or a delegated runner fails
 */
export async function runStep(step, ctx) {
  const emit = ctx.log ?? ((msg) => console.warn(`[recovery] ${msg}`));

  // The two step kinds that have real runners.
  if (step.kind === 'RestartMcpServer') {
    return restartMcpServer(step.serverName, ctx, emit);
  }
  if (step.kind === 'RetryHandshake') {
    return retryHandshake(step.timeoutMs, ctx, emit);
  }

  // --- STUBS (TODO killshot-3-follow-up) ---
  // Shared formatter for the stub log line; `suffix` is appended verbatim.
  const stub = (detail, suffix = '') => emit(`[stub] ${detail} — worldId=${ctx.worldId}${suffix}`);

  switch (step.kind) {
    case 'NotifyOperator':
      stub(`NotifyOperator: ${step.message ?? '(no message)'}`);
      return;
    case 'WaitFor':
      // Stub doesn't actually wait the full duration — real implementation
      // would integrate with the world's state machine timeout.
      stub(`WaitFor ${step.durationMs}ms`, ' (short-circuiting to 0ms in stub)');
      return;
    case 'RestartPlugin':
      stub(`RestartPlugin: ${step.pluginName}`);
      return;
    // Stubs with no parameters log just their own kind.
    case 'ResendTrustPrompt':
    case 'RestartTransport':
    case 'ResendDispatch':
    case 'RestartWorker':
    case 'ReadPluginErrors':
    case 'RebaseBranch':
    case 'CleanBuild':
      stub(step.kind);
      return;
    default: {
      // Exhaustive check — helps catch mismatches if new step kinds are added.
      /** @type {never} */
      const _exhaustive = step;
      void _exhaustive;
      throw new Error(`runStep: unknown step kind "${/** @type {any} */ (step).kind}"`);
    }
  }
}
// ─── RestartMcpServer — fully implemented ────────────────────────────────────
// Timing for the post-restart probe loop in restartMcpServer:
//   _mcpRestartPollMs    — delay (ms) between consecutive handshake probes.
//   _mcpRestartTimeoutMs — total budget (ms) before the step is failed.
// Both are `let` bindings so they can be overridden via setStepRunnerSeams
// for testing.
let _mcpRestartPollMs = 500;
let _mcpRestartTimeoutMs = 10_000;
/**
 * Restart the named MCP server and verify it comes back.
 *
 * What this runner does:
 *   1. Calls the `_execRestartSignal` seam to request the restart.
 *   2. Calls the `_probeMcpHandshake` seam repeatedly, sleeping
 *      `_mcpRestartPollMs` between attempts, until a probe reports the server
 *      alive or `_mcpRestartTimeoutMs` has elapsed since the signal was sent.
 *
 * Background: in the current host-cp architecture, MCP servers are child
 * processes spawned by the in-world container-cp, NOT by host-cp directly, so
 * host-cp cannot signal them itself. The intended production path is to exec
 * into the world's devbox container; until that is wired (tracked as a
 * Killshot #3 follow-up) the default seams only log what they would do.
 *
 * @param {string} serverName
 * @param {StepContext} ctx
 * @param {(msg: string) => void} log
 * @returns {Promise<void>} resolves once a probe reports the server alive
 * @throws {Error} when no probe succeeds before the timeout
 */
async function restartMcpServer(serverName, ctx, log) {
  const { worldId } = ctx;
  log(`RestartMcpServer: restarting "${serverName}" for worldId=${worldId}`);

  // Ask for the restart first; verification happens in the probe loop below.
  await _execRestartSignal(serverName, worldId, log);

  const giveUpAt = Date.now() + _mcpRestartTimeoutMs;
  for (let probes = 1; Date.now() < giveUpAt; probes += 1) {
    if (await _probeMcpHandshake(serverName, worldId, log)) {
      log(`RestartMcpServer: "${serverName}" came back after ${probes} probe(s)`);
      return;
    }
    await sleep(_mcpRestartPollMs);
  }

  throw new Error(
    `RestartMcpServer: "${serverName}" did not come back within ${_mcpRestartTimeoutMs}ms`,
  );
}
// ─── RetryHandshake — fully implemented ──────────────────────────────────────
/**
 * Kick off the MCP handshake again and wait for it to complete.
 *
 * The handshake follows the MCP JSON-RPC initialize → initialized pattern.
 * This runner asks for the re-run through the `_sendHandshakeInitialize`
 * seam, then polls the `_probeHandshakeComplete` seam until it reports
 * success or `timeoutMs` has elapsed. The pause between probes is a tenth of
 * the timeout, capped at 500ms.
 *
 * @param {number} timeoutMs total time budget for the handshake
 * @param {StepContext} ctx
 * @param {(msg: string) => void} log
 * @returns {Promise<void>} resolves once a probe reports completion
 * @throws {Error} when the handshake has not completed before the timeout
 */
async function retryHandshake(timeoutMs, ctx, log) {
  const { worldId } = ctx;
  log(`RetryHandshake: initiating handshake for worldId=${worldId} timeout=${timeoutMs}ms`);
  await _sendHandshakeInitialize(worldId, log);

  const giveUpAt = Date.now() + timeoutMs;
  const pauseMs = Math.min(500, Math.floor(timeoutMs / 10));
  while (Date.now() < giveUpAt) {
    const completed = await _probeHandshakeComplete(worldId, log);
    if (!completed) {
      await sleep(pauseMs);
      continue;
    }
    log(`RetryHandshake: handshake succeeded for worldId=${worldId}`);
    return;
  }

  throw new Error(
    `RetryHandshake: handshake did not complete within ${timeoutMs}ms for worldId=${worldId}`,
  );
}
// ─── Seam functions (injectable for testing) ─────────────────────────────────
//
// These are the actual I/O boundaries. In tests, override via the
// setStepRunnerSeams() below to inject stubs that resolve deterministically.
/**
 * Seam: request a restart of the named MCP server for a world.
 * The default only logs the intended action and resolves; it changes nothing.
 *
 * @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<void>}
 */
let _execRestartSignal = async (serverName, worldId, log) => {
// Production: Docker exec into the devbox container for this world, then
// send SIGTERM to the mcp-server process by name. The container naming
// convention is `olam-<worldId>-devbox`.
//
// Stub path used until the Docker exec channel is wired (killshot-3-follow-up):
log(`[seam] execRestartSignal: would exec SIGTERM mcp-${serverName} in olam-${worldId}-devbox`);
};
/**
 * Seam: report whether the named MCP server is alive again.
 * The default logs and always resolves `true`, so restartMcpServer succeeds
 * on its first probe unless this seam is overridden.
 *
 * @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<boolean>}
 */
let _probeMcpHandshake = async (serverName, worldId, log) => {
log(`[seam] probeMcpHandshake: would probe mcp-${serverName} alive in olam-${worldId}-devbox`);
// Default stub: optimistic — assumes server came back. Real implementation
// queries the in-world MCP registry or pings a health endpoint.
return true;
};
/**
 * Seam: ask for the MCP handshake to be re-run for a world.
 * The default only logs the intended action and resolves.
 *
 * @type {(worldId: string, log: (m: string) => void) => Promise<void>}
 */
let _sendHandshakeInitialize = async (worldId, log) => {
log(`[seam] sendHandshakeInitialize: would send MCP initialize for worldId=${worldId}`);
};
/**
 * Seam: report whether the MCP handshake has completed for a world.
 * The default logs and always resolves `true`, so retryHandshake succeeds on
 * its first probe unless this seam is overridden.
 *
 * @type {(worldId: string, log: (m: string) => void) => Promise<boolean>}
 */
let _probeHandshakeComplete = async (worldId, log) => {
log(`[seam] probeHandshakeComplete: would probe handshake complete for worldId=${worldId}`);
// Default stub: optimistic.
return true;
};
/**
 * Swap in replacement seam functions and/or timing values (intended for tests).
 *
 * Only the keys that are supplied take effect: seam functions are replaced
 * when the override is truthy, timing values when the override is a number
 * (so `0` is honoured). The values active at call time are captured first and
 * are what the returned function puts back.
 *
 * @param {{
 *   execRestartSignal?: typeof _execRestartSignal,
 *   probeMcpHandshake?: typeof _probeMcpHandshake,
 *   sendHandshakeInitialize?: typeof _sendHandshakeInitialize,
 *   probeHandshakeComplete?: typeof _probeHandshakeComplete,
 *   mcpRestartTimeoutMs?: number,
 *   mcpRestartPollMs?: number,
 * }} overrides
 * @returns {() => void} cleanup — call to restore prior seams
 */
export function setStepRunnerSeams(overrides = {}) {
  // Capture the current bindings before touching anything.
  const savedExecRestartSignal = _execRestartSignal;
  const savedProbeMcpHandshake = _probeMcpHandshake;
  const savedSendHandshakeInitialize = _sendHandshakeInitialize;
  const savedProbeHandshakeComplete = _probeHandshakeComplete;
  const savedTimeoutMs = _mcpRestartTimeoutMs;
  const savedPollMs = _mcpRestartPollMs;

  const {
    execRestartSignal,
    probeMcpHandshake,
    sendHandshakeInitialize,
    probeHandshakeComplete,
    mcpRestartTimeoutMs,
    mcpRestartPollMs,
  } = overrides;

  if (execRestartSignal) {
    _execRestartSignal = execRestartSignal;
  }
  if (probeMcpHandshake) {
    _probeMcpHandshake = probeMcpHandshake;
  }
  if (sendHandshakeInitialize) {
    _sendHandshakeInitialize = sendHandshakeInitialize;
  }
  if (probeHandshakeComplete) {
    _probeHandshakeComplete = probeHandshakeComplete;
  }
  if (typeof mcpRestartTimeoutMs === 'number') {
    _mcpRestartTimeoutMs = mcpRestartTimeoutMs;
  }
  if (typeof mcpRestartPollMs === 'number') {
    _mcpRestartPollMs = mcpRestartPollMs;
  }

  return function restoreStepRunnerSeams() {
    _execRestartSignal = savedExecRestartSignal;
    _probeMcpHandshake = savedProbeMcpHandshake;
    _sendHandshakeInitialize = savedSendHandshakeInitialize;
    _probeHandshakeComplete = savedProbeHandshakeComplete;
    _mcpRestartTimeoutMs = savedTimeoutMs;
    _mcpRestartPollMs = savedPollMs;
  };
}
// agent-runtime-trigger — Phase B B7 (minimum-demo cut) host-side launch hook.
//
// When the SPA opens the plan-tab for a (worldId, sessionId), it POSTs
// here; host-cp idempotently spawns the agent-stream-launch supervisor
// inside the world's devbox container via `docker exec`. The supervisor
// (PID 1 within the spawned exec session) then fork-spawns driver +
// codex runners that long-poll host-cp's /v1/shape.
//
// Demo-cut simplifications (per minimum-demo decision; full B7 in follow-up):
// - In-memory idempotency map keyed by `(worldId, sessionId)`. Restart of
// host-cp loses state; second call after restart re-issues docker exec,
// which the supervisor's idempotency check (B6-full's flock + PID-file)
// would catch. B6-minimum has no such check → restart of host-cp +
// re-trigger may spawn two supervisors. Acceptable for single-operator
// local demo; full B7 + B6-full close this.
// - Uses shared-secret bearer (from `~/.olam/plan-chat-secret` per the
// existing plan-chat-service contract). JWT scope-claim migration is B9.
// - No conversation_id ↔ (worldId, sessionId) join-table (A1.4
// §migration-schema open question). For demo, the supervisor is
// keyed by (worldId, sessionId) directly; codex's APPROVE chunks
// write under (worldId, sessionId) — `conversation_id` plumbing
// deferred until lookouts (B3) need it.
// - No host-cp restart cleanup of dead supervisor entries (the in-memory
// map only tracks live spawns; container crash + re-trigger DOES
// re-spawn).
//
// Source: docs/design/olam-plan-chat-agent-runtime.md `lifecycle` +
// `bake-in-seam` sections, minimum-demo cut.
import { spawnSync, spawn } from 'node:child_process';
// Upper bound (ms) on the docker pre-flight check made by triggerAgentRuntime;
// passed as the `timeout` option of the `docker inspect` spawnSync call.
const SPAWN_TIMEOUT_MS = 10_000;
// Default container-side path for the supervisor binary.
// In source-mode (OLAM_DEV=1): the operator's built host dist is bind-mounted
// read-only at /opt/olam/agent-stream/dist (Phase B1, olam-world-bundle-freshness).
// The mount overlays the image-baked dist, so this path always resolves to the
// freshest available binary — no docker cp required.
// In install-mode / cloud: the image-baked dist (devbox.runtime.glibc.Dockerfile
// lines 263-287 bake step) is the fallback; the path is the same.
// Callers can override it per call via TriggerArgs.supervisorPath.
const DEFAULT_SUPERVISOR_PATH = '/opt/olam/agent-stream/dist/agent-stream-launch.js';
/**
* @typedef {object} TriggerArgs
* @property {string} worldId
* @property {string} sessionId
* @property {string} hostCpUrl — URL the container reaches host-cp at
* (e.g. `http://host.docker.internal:3112`)
* @property {string} bearer — shared-secret token (read from
* `~/.olam/plan-chat-secret` server-side; never passed in from SPA)
* @property {string} [dockerHost='docker-cli'] — `'docker-cli'` for bare-node
* mode; `tcp://...` for container mode (docker-socket-proxy)
* @property {string} [supervisorPath] — override for tests
* @property {(cmd: string, args: string[], opts?: object) => any} [spawnSyncImpl]
* — injectable for tests; defaults to node:child_process spawnSync
* @property {(cmd: string, args: string[], opts?: object) => any} [spawnImpl]
* — injectable for tests; defaults to node:child_process spawn (detached)
*/
/**
 * Internal state: which `(worldId, sessionId)` pairs we've already
 * spawned. Survives only within a single host-cp process instance.
 *
 * Keys are produced by `key(worldId, sessionId)`. Entries recorded by the
 * docker-cli path carry the spawned CLI process's `pid`; entries recorded by
 * the socket-proxy (tcp://) path carry the Docker exec instance's `execId`
 * instead.
 *
 * @type {Map<string, {spawnedAt: number, pid?: number, execId?: string}>}
 */
const liveSpawns = new Map();
/**
 * Build the idempotency-map key for a (worldId, sessionId) pair.
 *
 * @param {string} worldId
 * @param {string} sessionId
 * @returns {string} `<worldId>::<sessionId>`
 */
function key(worldId, sessionId) {
  const separator = '::';
  return `${worldId}${separator}${sessionId}`;
}
/**
 * Bare-node path: verify via the docker CLI that the container is running,
 * then launch the supervisor with `docker exec --detach`.
 *
 * @param {object} p
 * @param {string} p.containerName
 * @param {Record<string, string>} p.env runtime env injected with `-e`
 * @param {string} p.supervisorPath
 * @param {(cmd: string, args: string[], opts?: object) => any} p.spawnSyncImpl
 * @param {(cmd: string, args: string[], opts?: object) => any} p.spawnImpl
 * @returns {any} the child-process handle returned by `spawnImpl`
 * @throws {Error} when `docker inspect` fails or the container is not running
 */
function launchSupervisorViaCli({ containerName, env, supervisorPath, spawnSyncImpl, spawnImpl }) {
  // `docker inspect` returns exit 1 if the container is not found; exit 0
  // with stdout containing the running state if found.
  const inspect = spawnSyncImpl(
    'docker',
    ['inspect', '--format', '{{.State.Running}}', containerName],
    { encoding: 'utf-8', timeout: SPAWN_TIMEOUT_MS },
  );
  if (inspect.error) {
    throw new Error(`docker inspect ${containerName} failed: ${inspect.error.message}`);
  }
  if (inspect.status !== 0) {
    throw new Error(
      `docker inspect ${containerName} exit ${inspect.status}: ${(inspect.stderr || '').trim()}`,
    );
  }
  const runningState = (inspect.stdout || '').trim();
  if (runningState !== 'true') {
    throw new Error(`container ${containerName} is not running (state: ${runningState})`);
  }

  // -e flags inject the runtime env; the supervisor binary path is the last
  // positional argument.
  // NOTE(review): `-e NAME=value` puts the bearer on the docker CLI's argv —
  // confirm that is acceptable outside the single-operator demo setup.
  const execArgs = ['exec', '--detach'];
  for (const [name, value] of Object.entries(env)) {
    execArgs.push('-e', `${name}=${value}`);
  }
  execArgs.push(containerName, 'node', supervisorPath);

  // Detached so the caller's HTTP request returns promptly.
  const child = spawnImpl('docker', execArgs, { stdio: 'ignore', detached: true });
  child.unref?.();
  return child;
}

/**
 * Container path (docker-socket-proxy on tcp://<host>:<port>): verify the
 * container is running, create an exec instance, and start it detached.
 * Every request is bounded by SPAWN_TIMEOUT_MS so a wedged proxy cannot hang
 * the caller indefinitely.
 *
 * @param {object} p
 * @param {string} p.dockerHost `tcp://…` address of the socket proxy
 * @param {string} p.containerName
 * @param {Record<string, string>} p.env runtime env for the exec instance
 * @param {string} p.supervisorPath
 * @returns {Promise<string>} the Docker exec instance id
 * @throws {Error} on any non-OK response, a non-running container, or a
 *   create-exec response without an `Id`
 */
async function launchSupervisorViaSocketProxy({ dockerHost, containerName, env, supervisorPath }) {
  const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
  const containerUrl = `${apiBase}/containers/${encodeURIComponent(containerName)}`;
  const jsonHeaders = { 'Content-Type': 'application/json' };

  // Step 0: verify the container is running.
  const inspectRes = await fetch(`${containerUrl}/json`, {
    signal: AbortSignal.timeout(SPAWN_TIMEOUT_MS),
  });
  if (!inspectRes.ok) {
    throw new Error(
      `socket-proxy GET /containers/${containerName}/json: ${inspectRes.status} ${inspectRes.statusText}`,
    );
  }
  const inspect = await inspectRes.json();
  if (!inspect?.State?.Running) {
    throw new Error(
      `container ${containerName} is not running (state: ${JSON.stringify(inspect?.State)})`,
    );
  }

  // Step 1: create exec instance with env injection.
  const createRes = await fetch(`${containerUrl}/exec`, {
    method: 'POST',
    headers: jsonHeaders,
    body: JSON.stringify({
      Cmd: ['node', supervisorPath],
      Env: Object.entries(env).map(([name, value]) => `${name}=${value}`),
      AttachStdout: false,
      AttachStderr: false,
      Tty: false,
    }),
    signal: AbortSignal.timeout(SPAWN_TIMEOUT_MS),
  });
  if (!createRes.ok) {
    const errBody = await createRes.text().catch(() => '<no body>');
    throw new Error(
      `socket-proxy POST /containers/${containerName}/exec: ${createRes.status} — ${errBody}`,
    );
  }
  const created = await createRes.json();
  const execId = created?.Id;
  // Without an id the start request below would target `/exec/undefined/start`.
  if (typeof execId !== 'string' || execId.length === 0) {
    throw new Error(
      `socket-proxy POST /containers/${containerName}/exec: response did not include an exec Id`,
    );
  }

  // Step 2: start exec in detached mode.
  const startRes = await fetch(`${apiBase}/exec/${execId}/start`, {
    method: 'POST',
    headers: jsonHeaders,
    body: JSON.stringify({ Detach: true, Tty: false }),
    signal: AbortSignal.timeout(SPAWN_TIMEOUT_MS),
  });
  if (!startRes.ok) {
    const errBody = await startRes.text().catch(() => '<no body>');
    throw new Error(`socket-proxy POST /exec/${execId}/start: ${startRes.status} — ${errBody}`);
  }
  return execId;
}

/**
 * Idempotently spawn the agent-stream supervisor inside the world's container.
 *
 * Resolves to one of:
 *   - `{status: 'spawned', container, pid}`     (docker-cli mode)
 *   - `{status: 'spawned', container, execId}`  (tcp:// socket-proxy mode)
 *   - `{status: 'already-running', container, spawnedAt, pid}` when this
 *     process has already spawned (or is in the middle of spawning) a
 *     supervisor for the same (worldId, sessionId).
 *
 * Idempotency is tracked in the in-memory `liveSpawns` map. In tcp:// mode the
 * key is reserved BEFORE the first await, so two overlapping calls for the
 * same pair cannot both reach the Docker API; the reservation is released if
 * the spawn fails, so a later call can retry.
 *
 * @param {TriggerArgs} args
 * @returns {Promise<{status: 'spawned' | 'already-running', container: string,
 *   pid?: number, execId?: string, spawnedAt?: number}>}
 * @throws {Error} on missing arguments, docker/socket-proxy failure, a
 *   container that is not running, or an unsupported `dockerHost`
 */
export async function triggerAgentRuntime(args) {
  const {
    worldId,
    sessionId,
    hostCpUrl,
    bearer,
    dockerHost = 'docker-cli',
    supervisorPath = DEFAULT_SUPERVISOR_PATH,
    spawnSyncImpl = spawnSync,
    spawnImpl = spawn,
  } = args;
  if (!worldId || !sessionId || !hostCpUrl || !bearer) {
    throw new Error(
      'triggerAgentRuntime: worldId, sessionId, hostCpUrl, bearer all required',
    );
  }

  const k = key(worldId, sessionId);
  const containerName = `olam-${worldId}-devbox`;

  const existing = liveSpawns.get(k);
  if (existing) {
    return {
      status: 'already-running',
      container: containerName,
      spawnedAt: existing.spawnedAt,
      pid: existing.pid,
    };
  }

  // Runtime env handed to the supervisor (same order in both modes).
  const env = {
    HOST_CP_URL: hostCpUrl,
    HOST_CP_BEARER: bearer,
    WORLD_ID: worldId,
    SESSION_ID: sessionId,
  };

  // Bare-node mode: everything up to the map write is synchronous, so no
  // other call can interleave between the check above and the set below.
  if (dockerHost === 'docker-cli') {
    const child = launchSupervisorViaCli({
      containerName,
      env,
      supervisorPath,
      spawnSyncImpl,
      spawnImpl,
    });
    const entry = { spawnedAt: Date.now(), pid: child.pid };
    liveSpawns.set(k, entry);
    // A ChildProcess 'error' event with no listener is thrown as an uncaught
    // exception. Listen for it, and forget the entry so a re-trigger can
    // spawn again instead of reporting 'already-running' forever.
    child.on?.('error', (err) => {
      if (liveSpawns.get(k) === entry) liveSpawns.delete(k);
      console.warn(
        `[agent-runtime-trigger] docker exec for ${containerName} failed to start: ${err?.message ?? err}`,
      );
    });
    return {
      status: 'spawned',
      container: containerName,
      pid: child.pid,
    };
  }

  // Container mode: two-step Docker API exec through the socket proxy.
  if (dockerHost.startsWith('tcp://')) {
    // Reserve the key before awaiting anything — otherwise a second call
    // arriving while the requests are in flight would pass the check above
    // and spawn a duplicate supervisor.
    const reservation = { spawnedAt: Date.now() };
    liveSpawns.set(k, reservation);
    let execId;
    try {
      execId = await launchSupervisorViaSocketProxy({
        dockerHost,
        containerName,
        env,
        supervisorPath,
      });
    } catch (err) {
      // Release only our own reservation; the map may have been reset meanwhile.
      if (liveSpawns.get(k) === reservation) liveSpawns.delete(k);
      throw err;
    }
    liveSpawns.set(k, { spawnedAt: Date.now(), execId });
    return {
      status: 'spawned',
      container: containerName,
      execId,
    };
  }

  throw new Error(
    `triggerAgentRuntime: unsupported dockerHost mode '${dockerHost}'`,
  );
}
/**
 * Test-only: clear the in-memory live-spawns map.
 * Production code should NEVER call this — it would let a duplicate
 * supervisor spawn.
 *
 * After this call triggerAgentRuntime no longer reports any pair as
 * 'already-running' until it spawns again.
 *
 * @returns {void}
 */
export function _clearLiveSpawnsForTests() {
liveSpawns.clear();
}
/**
 * Observability helper: snapshot of the live-spawns map.
 *
 * A new Map is returned on every call, so adding or removing keys on the
 * result does not affect the internal state. The entry objects themselves are
 * shared, not cloned.
 *
 * @returns {ReadonlyMap<string, {spawnedAt: number, pid?: number, execId?: string}>}
 */
export function getLiveSpawns() {
  const snapshot = new Map();
  for (const [spawnKey, entry] of liveSpawns) {
    snapshot.set(spawnKey, entry);
  }
  return snapshot;
}
/**
* Operator-facing diagnostic for auth-service authentication failures.
*
* Pre-fix, an empty OLAM_AUTH_SECRET (compose.yaml's
* `${OLAM_AUTH_SECRET:-}` interpolation when the operator's shell
* didn't export it) silently 401'd every host-cp → auth-service
* call. The SPA showed "0 credentials" with no log line explaining
* why. Logging a clear hint — both at boot when the env var is empty
* AND on the first runtime 401 — turns a silent footgun into a
* grep-able warning.
*
* Lives in its own file (not server.mjs) so unit tests can import it
* without triggering server.mjs's top-level mkdir + http.listen side
* effects.
*/
/**
 * Build the operator-facing hint for an auth-service authentication problem.
 *
 * Two situations are distinguished:
 *   - no secret configured: every credentials call is going to 401;
 *   - secret configured but rejected: host-cp and auth-service disagree on
 *     the secret value.
 *
 * @param {object} ctx
 * @param {string} ctx.authServiceUrl
 *   The configured auth-service base URL — quoted back to the operator
 *   so they can cross-reference with their compose env.
 * @param {boolean} ctx.hasSecret
 *   True when host-cp's OLAM_AUTH_SECRET is set (and the 401 means a
 *   value mismatch); false when it's empty (the original silent-fail
 *   regression mode).
 * @returns {string}
 *   A single-line message safe for `console.warn` / docker-compose-logs.
 */
export function authSecretHint({ authServiceUrl, hasSecret }) {
  const subject = `[auth] auth-service at ${authServiceUrl}`;
  const explanation = hasSecret
    ? [
        ' returned 401 even though ',
        'OLAM_AUTH_SECRET is set — the secret does NOT match the value the ',
        'auth-service container is using. Check that both containers were ',
        'started from the same ~/.olam/auth-secret file and recreate them ',
        'together if the file changed.',
      ]
    : [
        ' is configured but ',
        'OLAM_AUTH_SECRET is empty — every credentials/* call will 401. ',
        'Set the env var to the contents of ~/.olam/auth-secret (or run ',
        "'olam host-cp start' so the CLI loads it for you).",
      ];
  return `${subject}${explanation.join('')}`;
}
// Phase F-2-B (B4): startup-token authentication for host CP.
//
// On boot: generate a 32-byte hex token (or reuse the file if it
// exists), write to `~/.olam/host-cp.token` with mode 0600, cache in
// memory. Middleware on all non-static, non-bootstrap routes validates
// the request via:
// - Cookie `olam_host_cp_token=<value>`
// - OR Authorization: Bearer <value>
// Reject 401 if neither matches.
//
// Threat model (T4 mitigation):
// - Bound to 127.0.0.1:19000 only (compose.yaml). No public exposure.
// - Single-user-per-host assumption; multi-user is Phase G+.
// - Token file is chmod 600 owned by the operator. Browser tabs on
// the same machine that try to hit :19000 are blocked unless they
// have the token (cookie or header).
// - /api/bootstrap returns the token unauthenticated. Rationale:
// anything local that can hit 127.0.0.1:19000 can also read
// ~/.olam/host-cp.token (same OS-level privilege boundary). This
// just removes a UX friction step. NOT acceptable in multi-user
// mode (Phase G+ uses cookie-with-Secure+HttpOnly via real auth).
import crypto from 'node:crypto';
import fs from 'node:fs';
import path from 'node:path';
/**
 * Startup token for host CP: created (or reused) once at boot, cached in
 * memory, and checked on each request via `Authorization: Bearer <token>` or
 * the `olam_host_cp_token` cookie.
 */
export class StartupToken {
  /**
   * @param {object} opts
   * @param {string} opts.tokenPath absolute path to the token file
   * @param {() => string} [opts.generate] defaults to 32-byte hex via crypto.randomBytes
   * @param {(message: string) => void} [opts.log]
   * @param {typeof fs} [opts.fs] injectable for tests
   * @throws {Error} when tokenPath is missing or not absolute
   */
  constructor({ tokenPath, generate, log = console.log, fs: fsImpl = fs }) {
    if (!tokenPath || !path.isAbsolute(tokenPath)) {
      throw new Error('StartupToken: tokenPath must be an absolute path');
    }
    this.tokenPath = tokenPath;
    this.generate = generate ?? (() => crypto.randomBytes(32).toString('hex'));
    this.log = log;
    this.fs = fsImpl;
    /** @type {string | null} */
    this.token = null;
  }

  /**
   * Ensure the token exists in memory + on disk. Call once at server
   * boot before listen(). Idempotent: subsequent calls return the
   * cached value.
   *
   * Behavior:
   *   - If tokenPath exists: read it, cache, return it. (Lifecycle
   *     CLI's `olam host-cp start` may have written the token before
   *     the container starts; we must reuse the operator-visible
   *     value, not regenerate it.) A stored value shorter than 16
   *     characters is treated as corrupt and replaced.
   *   - Else: generate a new token, write file with mode 0600, return.
   *
   * @returns {string}
   */
  ensure() {
    if (this.token) return this.token;
    const dir = path.dirname(this.tokenPath);
    if (!this.fs.existsSync(dir)) {
      this.fs.mkdirSync(dir, { recursive: true });
    }
    if (this.fs.existsSync(this.tokenPath)) {
      const raw = this.fs.readFileSync(this.tokenPath, 'utf-8').trim();
      if (raw.length < 16) {
        // Defensive: a too-short token is almost certainly a corrupted
        // file. Regenerate rather than accept.
        this.log(`auth: existing token at ${this.tokenPath} too short (${raw.length}); regenerating`);
        this.token = this._writeNew();
      } else {
        this.token = raw;
        this.log(`auth: reused existing token at ${this.tokenPath}`);
      }
    } else {
      this.token = this._writeNew();
    }
    return this.token;
  }

  /**
   * Generate a token and persist it with owner-only permissions.
   *
   * @private
   * @returns {string} the freshly generated token
   */
  _writeNew() {
    const t = this.generate();
    this.fs.writeFileSync(this.tokenPath, t, { mode: 0o600 });
    // `mode` is only applied when writeFileSync creates the file. When we are
    // overwriting an existing (e.g. corrupt) token file, its old permission
    // bits survive, so tighten them explicitly. Optional call: injected test
    // doubles may not implement chmodSync.
    try {
      this.fs.chmodSync?.(this.tokenPath, 0o600);
    } catch (err) {
      // The token itself was written; don't fail boot over the permission fix.
      this.log(`auth: could not restrict permissions on ${this.tokenPath}: ${err?.message ?? err}`);
    }
    this.log(`auth: generated new token at ${this.tokenPath} (${t.length} chars)`);
    return t;
  }

  /**
   * Check request authorization. Constant-time comparison via
   * crypto.timingSafeEqual prevents timing-side-channel leaks of the
   * token's first-byte mismatches.
   *
   * Accepts either `Authorization: Bearer <token>` (the scheme name is
   * matched case-insensitively) or a `olam_host_cp_token=<token>` cookie.
   * Always false until ensure() has populated the token.
   *
   * @param {import('node:http').IncomingMessage} req
   * @returns {boolean}
   */
  isAuthorized(req) {
    if (!this.token) return false;
    // Bearer header
    const authHeader = req.headers['authorization'];
    const scheme = 'bearer ';
    if (
      typeof authHeader === 'string' &&
      authHeader.slice(0, scheme.length).toLowerCase() === scheme
    ) {
      const got = authHeader.slice(scheme.length).trim();
      if (this._compare(got)) return true;
    }
    // Cookie
    const cookieHeader = req.headers['cookie'];
    if (typeof cookieHeader === 'string') {
      const cookies = parseCookies(cookieHeader);
      const got = cookies['olam_host_cp_token'];
      if (got && this._compare(got)) return true;
    }
    return false;
  }

  /**
   * Compare a presented value against the cached token.
   *
   * @private
   * @param {string} got
   * @returns {boolean}
   */
  _compare(got) {
    if (!this.token) return false;
    // timingSafeEqual requires equal-length inputs; bail out early otherwise.
    if (got.length !== this.token.length) return false;
    try {
      return crypto.timingSafeEqual(Buffer.from(got), Buffer.from(this.token));
    } catch {
      // Byte lengths can still differ for non-ASCII input; treat as mismatch.
      return false;
    }
  }
}
/**
 * Turn a Cookie request header into a name → value object.
 *
 * Pairs are separated by `;`. The name is everything before the first `=`,
 * the value everything after it (so values may themselves contain `=`, e.g.
 * base64). Both are trimmed. A pair without `=` maps to the empty string,
 * empty pairs are ignored, and a repeated name keeps the last value.
 *
 * @param {string} header
 * @returns {Record<string, string>}
 */
export function parseCookies(header) {
  /** @type {Record<string, string>} */
  const jar = {};
  const segments = header
    .split(';')
    .map((piece) => piece.trim())
    .filter((piece) => piece.length > 0);
  for (const segment of segments) {
    // Only the first `=` separates name from value; re-join the rest.
    const [name, ...valueParts] = segment.split('=');
    jar[name.trim()] = valueParts.join('=').trim();
  }
  return jar;
}
/**
* Boot-time reconciler — sync worlds.db with live docker state.
*
* Problem (issue #963): after Colima / userspace restart, host-cp can
* start with worlds.db rows that no longer reflect docker reality. The
* existing `worlds-db-source.mjs` reconciler runs DB→registry (reads
* 'running' rows and adds them to in-memory WORLDS). It does NOT heal
* the inverse case: a container is alive on docker but worlds.db has
* no row (Hazel coral-sky-2478 scenario), or worlds.db says a world is
* running but the container is gone (orphaned row).
*
* This module fills both gaps with a one-shot pass at boot:
*
* 1. List `olam-*-devbox` containers via the docker API.
* 2. For each container, derive the worldId (strip prefix + suffix).
* 3. Cross-check against worlds.db rows:
* - container alive, row exists → no-op
* - container alive, row missing → INSERT (status=reconciled)
* - row says running/active, container missing → UPDATE status=orphaned
*
* Fail-soft: if the docker daemon is unreachable OR better-sqlite3 is
* not available, the function logs a warning and returns without
* throwing. Server boot continues.
*
* Idempotent: a second invocation against the same docker + DB state
* produces no further changes (existing rows are skipped at step 3a,
* already-orphaned rows are skipped at step 3c).
*
* Coordination with issue #962: the dedup logic in `olam create` handles
* per-call deduplication; this reconciler handles boot-time cleanup.
* They don't conflict — both operate on the worlds.db source-of-truth.
*/
import { createRequire } from 'node:module';
// ES modules have no built-in `require`; create one so better-sqlite3 can be
// loaded synchronously at call time and its absence caught.
const require = createRequire(import.meta.url);
// Matches devbox container names, with or without the leading slash the docker
// API adds. Capture group 1 is the bare container name, group 2 the worldId.
const CONTAINER_NAME_PATTERN = /^\/?(olam-(.+)-devbox)$/;
/**
* @typedef {object} ReconcileDeps
* @property {string} dbPath Path to worlds.db
* @property {() => Promise<string[] | null>} listContainerNames Returns null when docker is unreachable
* @property {(msg: string) => void} [log] Defaults to console.log
* @property {() => string} [now] ISO timestamp generator (overridable for tests)
* @property {(path: string) => unknown | null} [openDb] Overridable DB opener (tests inject fakes)
*/
/**
* @typedef {object} ReconcileSummary
* @property {number} inserted Number of new rows inserted (reconciled containers)
* @property {number} orphaned Number of rows transitioned to status='orphaned'
* @property {number} skipped Containers/rows where no change was needed
* @property {boolean} dockerUnreachable
* @property {boolean} dbUnavailable
*/
/**
 * Derive the worldId from a devbox container name.
 *
 * Both `olam-foo-bar-1234-devbox` and `/olam-foo-bar-1234-devbox` are
 * accepted (the docker API prefixes container names with a slash); the
 * worldId is whatever sits between the `olam-` prefix and the `-devbox`
 * suffix.
 *
 * @param {string} name
 * @returns {string | null} the worldId, or null when `name` is not a string
 *   or does not follow the devbox naming convention
 */
export function extractWorldIdFromContainerName(name) {
  if (typeof name !== 'string') {
    return null;
  }
  // Group 2 of the pattern is the worldId; absent when the name doesn't match.
  const worldId = CONTAINER_NAME_PATTERN.exec(name)?.[2];
  return worldId ? worldId : null;
}
/**
 * Default docker container lister. Issues `GET /containers/json` against the
 * Docker Engine API with a `name: ['olam-']` filter and flattens every
 * container's `Names` array into one list of strings.
 *
 * Fail-soft: every failure mode is logged and reported as `null` — no usable
 * API base, a non-2xx response, a network error or the 5s timeout, and a
 * payload that is not a JSON array. A malformed payload must NOT be reported
 * as an empty list: the reconciler would read `[]` as "no containers alive"
 * and mark every running world as orphaned.
 *
 * @param {string} dockerApiBase e.g. 'http://docker-socket-proxy:2375'
 * @param {(msg: string) => void} log
 * @returns {Promise<string[] | null>} Container names, or null when docker
 *   could not be queried reliably.
 */
export async function defaultListContainerNames(dockerApiBase, log) {
  if (!dockerApiBase || dockerApiBase === 'http://localhost:2375') {
    // 'docker-cli' sentinel; no API available in this deployment mode.
    log('[boot-reconciler] docker API unavailable (bare-node mode); skipping');
    return null;
  }
  try {
    const filters = encodeURIComponent(JSON.stringify({ name: ['olam-'] }));
    const url = `${dockerApiBase}/containers/json?filters=${filters}`;
    const res = await fetch(url, { signal: AbortSignal.timeout(5000) });
    if (!res.ok) {
      log(`[boot-reconciler] docker /containers/json returned ${res.status}; skipping`);
      return null;
    }
    const data = await res.json();
    if (!Array.isArray(data)) {
      // An error object or other unexpected body is a failed query, not "zero containers".
      log('[boot-reconciler] docker /containers/json returned a non-array payload; skipping');
      return null;
    }
    // Tolerate malformed entries: skip containers without a Names array and
    // drop any non-string name.
    return data.flatMap((container) => {
      const list = container?.Names;
      return Array.isArray(list) ? list.filter((n) => typeof n === 'string') : [];
    });
  } catch (err) {
    // Thrown values are not guaranteed to be Error instances.
    log(`[boot-reconciler] docker query failed: ${err?.message ?? err}; skipping`);
    return null;
  }
}
/**
* Default DB opener. Loads better-sqlite3 dynamically so a missing
* native build degrades gracefully instead of crashing host-cp boot.
*
* @param {string} dbPath
* @param {(msg: string) => void} log
* @returns {unknown | null}
*/
export function defaultOpenDb(dbPath, log) {
try {
const Database = require('better-sqlite3');
return new Database(dbPath, { fileMustExist: true });
} catch (err) {
if (err && err.code === 'MODULE_NOT_FOUND') {
log('[boot-reconciler] better-sqlite3 not available; skipping');
} else if (err && err.code === 'SQLITE_CANTOPEN') {
log(`[boot-reconciler] ${dbPath} not found; nothing to reconcile`);
} else {
log(`[boot-reconciler] failed to open ${dbPath}: ${err.message}`);
}
return null;
}
}
/**
 * Run a single boot-time reconciliation pass between live docker containers
 * and the rows of worlds.db. Every collaborator is injected through `deps`
 * so tests can drive it with fakes.
 *
 * Passes:
 *   1. Live container without a DB row → INSERT a row with status 'reconciled'.
 *      Live containers that already have a row are counted as `skipped`.
 *   2. DB row whose status is 'running' | 'active' | 'creating' but whose
 *      container is not live → UPDATE status to 'orphaned'.
 *
 * Fail-soft: an unreachable docker (`listContainerNames()` → null), an
 * unopenable DB, a failing SELECT, a statement that cannot be prepared
 * (e.g. schema mismatch) and per-row write errors are all logged and never
 * thrown. The first two set the matching flag on the summary; a failed
 * prepare only skips the affected pass. The DB handle is always closed.
 *
 * @param {ReconcileDeps} deps
 * @returns {Promise<ReconcileSummary>}
 */
export async function reconcileWorldsWithDocker(deps) {
  const log = deps.log ?? console.log;
  const now = deps.now ?? (() => new Date().toISOString());
  const openDb = deps.openDb ?? ((p) => defaultOpenDb(p, log));
  // Thrown values are not guaranteed to be Error instances.
  const describe = (err) => err?.message ?? String(err);
  const summary = {
    inserted: 0,
    orphaned: 0,
    skipped: 0,
    dockerUnreachable: false,
    dbUnavailable: false,
  };

  const containerNames = await deps.listContainerNames();
  if (containerNames === null) {
    summary.dockerUnreachable = true;
    return summary;
  }
  const liveWorldIds = new Set();
  for (const name of containerNames) {
    const worldId = extractWorldIdFromContainerName(name);
    if (worldId) liveWorldIds.add(worldId);
  }

  const db = openDb(deps.dbPath);
  if (!db) {
    summary.dbUnavailable = true;
    return summary;
  }

  // A prepare() failure must not escape: log it and let the caller skip the
  // pass that needed the statement.
  const prepareOrNull = (sql, label) => {
    try {
      return db.prepare(sql);
    } catch (err) {
      log(`[boot-reconciler] failed to prepare ${label} statement: ${describe(err)}; skipping`);
      return null;
    }
  };

  try {
    /** @type {Array<{ id: string, status: string }>} */
    let rows;
    try {
      rows = db.prepare('SELECT id, status FROM worlds').all();
    } catch (err) {
      log(`[boot-reconciler] query failed: ${describe(err)}; skipping`);
      summary.dbUnavailable = true;
      return summary;
    }
    const dbWorlds = new Map(rows.map((r) => [r.id, r.status]));

    // Pass 1: containers alive but missing from DB → insert.
    const insertStmt = prepareOrNull(
      `INSERT INTO worlds
(id, name, status, repos, branch, port_offset, workspace_path,
compute_provider, total_cost_usd, thought_count, created_at, updated_at)
VALUES (?, ?, 'reconciled', '[]', 'main', 0, ?, 'docker', 0, 0, ?, ?)`,
      'insert',
    );
    for (const worldId of liveWorldIds) {
      if (dbWorlds.has(worldId)) {
        summary.skipped += 1;
        continue;
      }
      if (!insertStmt) continue;
      const ts = now();
      const workspacePath = `~/.olam/worlds/${worldId}`;
      try {
        insertStmt.run(worldId, worldId, workspacePath, ts, ts);
        summary.inserted += 1;
        log(`[boot-reconciler] inserted reconciled row for ${worldId} (container alive, no DB row)`);
      } catch (err) {
        log(`[boot-reconciler] failed to insert ${worldId}: ${describe(err)}`);
      }
    }

    // Pass 2: DB says alive but container missing → mark orphaned.
    const orphanStmt = prepareOrNull(
      `UPDATE worlds SET status = 'orphaned', updated_at = ? WHERE id = ?`,
      'orphan',
    );
    const aliveStatuses = new Set(['running', 'active', 'creating']);
    if (orphanStmt) {
      for (const [worldId, status] of dbWorlds) {
        if (liveWorldIds.has(worldId)) continue;
        if (!aliveStatuses.has(status)) continue;
        try {
          orphanStmt.run(now(), worldId);
          summary.orphaned += 1;
          log(`[boot-reconciler] marked ${worldId} as orphaned (was '${status}', container missing)`);
        } catch (err) {
          log(`[boot-reconciler] failed to mark ${worldId} orphaned: ${describe(err)}`);
        }
      }
    }

    log(
      `[boot-reconciler] complete: inserted=${summary.inserted} orphaned=${summary.orphaned} ` +
        `skipped=${summary.skipped} live-containers=${liveWorldIds.size}`,
    );
  } finally {
    // Best-effort close; a failing close must not mask the summary.
    try { db.close?.(); } catch { /* ignore */ }
  }
  return summary;
}
// bootstrap-selective.mjs — Phase D1 helper, collapsed to a wildcard in
// Phase E5 (ATOMIC SERVING CUTOVER).
//
// Determines whether a SPA shell render path should SKIP the host-cp
// BOOTSTRAP_SCRIPT injection (cookie-bootstrap + fetch/EventSource
// rewrite shim) and instead let the served SPA's own auth resolver +
// world-fetch shim handle auth.
//
// Phase E5: plan-chat-spa is now host-cp's SOLE served SPA. Its bundle
// re-homes the cookie-bootstrap + world-fetch-rewrite + 401-recover shim
// (packages/plan-chat-spa/src/lib/worldFetch.ts, installed at the top of
// src/main.tsx — Phase C). Therefore host-cp NEVER needs to inject
// BOOTSTRAP_SCRIPT anymore: every path is a "planning" (== SPA-owned)
// path. isPlanningPath() is collapsed to a wildcard accordingly.
//
// Reversal: set isPlanningPath to consult BOOTSTRAP_NOOP_PLANNING_PATHS
// again (restore the prefix-match body from this file's git history) to re-narrow the no-op to
// the explicit planning prefixes; or, for full pre-D behaviour, also set
// BOOTSTRAP_NOOP_PLANNING_PATHS to []. The const is retained as the
// documented revert seam.
//
// Per K1 SCP-3 + phase-d-tasks.md D1 + phase-e-tasks.md E2.
/**
* Path prefixes that WERE owned by plan-chat-spa under the Phase D
* selective no-op. Retained as the documented single-line revert seam:
* to re-narrow the bootstrap no-op back to only the planning surfaces,
* restore the prefix-match body in isPlanningPath() (see git history of
* this file at the Phase E5 commit) so it consults this array again.
*
* Format: include both the bare segment ("/plan") and the trailing-slash
* variant ("/plan/"). The trailing-slash form is the prefix-match
* generator for "/plan/<rest>".
*
* @type {readonly string[]}
*/
// Bare segment first, then its trailing-slash prefix form; frozen so importers cannot mutate the seam.
export const BOOTSTRAP_NOOP_PLANNING_PATHS = Object.freeze(['/plan', '/plan/']);
/**
* Phase E5 wildcard: TRUE for every string path.
*
* host-cp now serves plan-chat-spa exclusively, whose bundle re-homes the
* cookie-bootstrap + world-fetch-rewrite shim (worldFetch.ts). No served
* path needs host-cp's BOOTSTRAP_SCRIPT injection anymore, so every path
* is treated as an SPA-owned ("planning") path and skips bootstrap.
*
* Returns false only for non-string input (defensive — a non-string
* pathname is never a real served path).
*
* @param {unknown} pathname
* @returns {boolean}
*/
export function isPlanningPath(pathname) {
  // Wildcard: any primitive string counts as SPA-owned; everything else is rejected.
  const kind = typeof pathname;
  return kind === 'string';
}
/**
* Phase E4 (olam-dogfood-vision): WorldsSource composition + dedup.
*
* Runs every configured WorldsSource (E1) in parallel and dedupes by
* `id`. Source-array order expresses precedence: the LAST source to
* claim an id wins on collision. server.mjs (E4 wiring via
* `buildWorldsSources`) orders sources `[localSource, pylonSource]`
* so cloud-side metadata overrides local stubs when the Pylon SDK
* eventually returns real data for a world that's also docker-
* resident locally.
*
* The function is intentionally pure + dep-free (no env reads, no
* http, no module-level state) so vitest can drive it with two mock
* sources to assert dedup direction without spinning up the server.
*
* ## Failure-mode contract (CP3 audit follow-up — closes CRIT/HIGH-1+2)
*
* Robustness goals:
* 1. **One bad source must NOT take down the union.** Pylon SDK
* transient outages, auth errors, network blips — these MUST
* degrade to "cloud worlds missing this poll" rather than
* "/api/worlds endpoint hangs". Achieved via `Promise.allSettled`
* + per-source try/log/treat-as-empty.
* 2. **Slow sources MUST NOT extend wall time past the SPA poll
* cadence.** The SPA polls every 4s (Worlds.tsx:124); a Pylon
* `client.worlds.list()` that takes 8s would block, queue
* sockets, and pile up overlapping fetches. Achieved via
* per-source `Promise.race` with `timeoutMs` (default 2000ms,
* matching the existing docker-inspect timeout in
* fetchWorldServices). A timed-out source falls back to its
* last-known-good list for this poll, or contributes nothing when
* no earlier poll succeeded.
* 3. **A failing source must produce a log line, not a silent
* empty.** Operators need to see "[worlds-source] pylon-cloud
* list() failed: <err>" in the host-cp boot log so the
* degradation is observable.
*
* ## Dedup semantics on collision (CP3 audit follow-up — closes HIGH-4)
*
* Whole-record replacement (the pre-audit behavior) blanks fields the
* later source doesn't populate. Concrete example: Pylon returns
* `{services: undefined}` (or omits the field entirely) for a freshly-
* claimed world while Local has `{services: [4 entries]}`. Whole-
* record replacement would drop the local services array; the SPA
* would render the world with no clickable links until Pylon
* back-fills.
*
* Field-merge (the post-audit behavior): later source's defined
* fields override earlier; earlier source's fields are preserved
* where the later source omits them. `id` and `source` always come
* from the later source (the precedence contract). Implementation:
* `{ ...byId.get(id), ...world }` — ES spread skips own properties
* with value `undefined` only if the producer ELIDES them; explicit
* `field: undefined` does override. Therefore source authors should
* OMIT fields they don't manage rather than setting them to
* `undefined` / `[]`.
*
* @typedef {import('./worlds-source.mjs').WorldsSource} WorldsSource
* @typedef {import('./worlds-source.mjs').WorldSummary} WorldSummary
*/
/**
* @typedef {object} ComposeWorldsSourcesOptions
* @property {number} [timeoutMs=2000]
* Per-source timeout cap. A source whose `list()` doesn't resolve
* within this budget falls back to its last-known-good result (or
* contributes nothing when none is cached) for this composition pass
* (reported via `onSourceError`). Default matches the docker-inspect
* timeout used elsewhere in host-cp so the /api/worlds path's worst-
* case wall time stays bounded by it.
* @property {(sourceName: string, err: unknown) => void} [onSourceError]
* Invoked when a source rejects or times out. Defaults to
* `console.error('[worlds-source] <name> list() failed:', err)`.
* Tests inject a spy to assert log behavior without polluting
* stderr.
*/
// Per-source timeout (ms) used by composeWorldsSources when the caller does
// not pass options.timeoutMs.
// NOTE(review): the ComposeWorldsSourcesOptions typedef and the failure-mode
// contract in this file's header both document a 2000ms default (chosen to
// stay under the 4s SPA poll cadence), but this constant is 8000 — confirm
// which value is intended and align the other.
const DEFAULT_TIMEOUT_MS = 8000;
/**
* Per-source last-known-good cache. Keyed by source.name → WorldSummary[].
* When a source resolves successfully, its output is stored here. When a
* source rejects or times out, we fall back to the cached value so the
* dashboard shows stale data rather than blanking. Stale data self-heals
* on the next successful poll.
*
* Process-local, no TTL — the running server is authoritative. Tests that
* need a clean slate should call _resetLastKnownGoodCache().
*
* @type {Map<string, import('./worlds-source.mjs').WorldSummary[]>}
*/
const _lastKnownGood = new Map();
/**
 * Race a promise against a deadline.
 *
 * Settles exactly like `promise` when it finishes within `ms`; otherwise
 * rejects with an Error naming the source and the budget. The timer is
 * cleared as soon as the race settles, whichever side wins.
 *
 * @template T
 * @param {Promise<T>} promise Work to bound.
 * @param {number} ms Deadline in milliseconds.
 * @param {string} sourceName Name embedded in the timeout error message.
 * @returns {Promise<T>}
 */
function withTimeout(promise, ms, sourceName) {
  /** @type {ReturnType<typeof setTimeout> | null} */
  let handle = null;
  const cancel = () => {
    if (handle !== null) clearTimeout(handle);
  };
  const deadline = new Promise((_resolve, reject) => {
    const expire = () => {
      reject(new Error(`source "${sourceName}" timed out after ${ms}ms`));
    };
    handle = setTimeout(expire, ms);
  });
  return Promise.race([promise, deadline]).finally(cancel);
}
/**
* Empty the per-source last-known-good cache so the next composition pass
* has no stale fallback data. Exposed for tests only — call before each
* test that needs a clean slate.
*
* @returns {void}
*/
export function _resetLastKnownGoodCache() {
_lastKnownGood.clear();
}
/**
 * Run every source's `list()` in parallel (each bounded by a timeout) and
 * return the union of their worlds, deduped by `id`.
 *
 * A source counts as failed for this pass when its `list()` throws
 * synchronously, rejects, times out, or resolves to something that cannot
 * be iterated. A failed source is reported through `onSourceError` and
 * replaced by its last-known-good output when one is cached; otherwise it
 * contributes nothing. Only successful, iterable results refresh the cache.
 *
 * @param {WorldsSource[]} sources
 *   Sources to compose. Order expresses precedence: later wins.
 * @param {ComposeWorldsSourcesOptions} [options]
 * @returns {Promise<WorldSummary[]>}
 *   Deduped union of every source's `list()` output, keyed by `id`.
 *   On collision: fields from later source override earlier where
 *   defined; earlier fields preserved where later source omits them.
 */
export async function composeWorldsSources(sources, options = {}) {
  if (sources.length === 0) return [];
  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const onSourceError =
    options.onSourceError ??
    ((name, err) => {
      console.error(`[worlds-source] ${name} list() failed:`, err);
    });
  const isIterable = (value) => typeof value?.[Symbol.iterator] === 'function';

  const settled = await Promise.allSettled(
    sources.map((s) =>
      withTimeout(
        // Call list() inside a promise executor so a synchronous throw becomes
        // a rejection for this one source instead of escaping sources.map()
        // and failing the whole composition.
        new Promise((resolve) => resolve(s.list())),
        timeoutMs,
        s.name,
      ),
    ),
  );

  /** @type {Map<string, WorldSummary>} */
  const byId = new Map();
  for (let i = 0; i < settled.length; i++) {
    const result = settled[i];
    const source = sources[i];
    let resolved;
    if (result.status === 'fulfilled' && isIterable(result.value)) {
      resolved = result.value;
      _lastKnownGood.set(source.name, resolved);
    } else {
      // Rejected/timed out, or fulfilled with a value the merge loop cannot
      // iterate. Either way: report it and fall back to stale data if any.
      const reason =
        result.status === 'rejected'
          ? result.reason
          : new TypeError(`source "${source.name}" list() resolved to a non-iterable value`);
      onSourceError(source.name, reason);
      const lkg = _lastKnownGood.get(source.name);
      if (!lkg) continue;
      resolved = lkg;
    }
    for (const world of resolved) {
      // Field-merge on collision: later source overrides earlier
      // where defined; earlier preserved where later omits. Keeps
      // local service-strip + host_port intact when Pylon claims a
      // world but hasn't populated those fields yet.
      const prior = byId.get(world.id);
      byId.set(world.id, prior ? { ...prior, ...world } : world);
    }
  }
  return [...byId.values()];
}
// config-reader.mjs — Phase D (olam-config-store-unification): a host-cp-local,
// DEPENDENCY-FREE reader for a single dotted value out of `config.json`.
//
// # Why a copy lives here (not an `@olam/core` import)
//
// host-cp is a pure `.mjs` package with NO `@olam/core` dependency — it cannot
// import the TypeScript cloud-state resolver, and a relative reach into
// `packages/core/src/...` would (a) couple host-cp to core's source layout and
// (b) fail to resolve in the published/container build where core is not a
// sibling on disk. The canonical zero-dep reader is
// `packages/core/src/cloud-state/read-config-value.mjs`; this module INLINES the
// same logic (Phase D tracker explicitly permits copy-inlining the tiny reader)
// and adds host-cp's container-aware `config.json` directory resolution.
//
// # Container path resolution
//
// host-cp reads operator state from a bind-mount, NOT from `~/.olam` directly:
// compose.yaml mounts `${HOME}/.olam → /data`, so inside the container the
// canonical config lives at `/data/config.json` (os.homedir() → /root, which is
// the ephemeral container layer — the WRONG place, the same bug fixed for
// plan.db / plan-chat-secret). `olamConfigDir()` resolves the directory holding
// `config.json` honouring, in order:
// 1. process.env.OLAM_HOME (explicit override — D2 requirement)
// 2. '/data' when hostCpMode() === 'container' (OLAM_HOST_CP_MODE, else /.dockerenv auto-detect; the compose bind-mount target)
// 3. path.join(os.homedir(), '.olam') (bare-node install — no behaviour change)
//
// Returns the resolved value or `null` (file absent, bad JSON, or path miss) —
// NEVER throws, so a fail-open caller degrades gracefully to its legacy legs.
import { readFileSync, existsSync } from 'node:fs';
import os from 'node:os';
import path from 'node:path';
/**
* Deployment-mode detection, identical to server.mjs: container when an
* explicit OLAM_HOST_CP_MODE says so, else auto-detected from `/.dockerenv`
* (created by the docker runtime on container start). Re-derived here (rather
* than imported from server.mjs) so this module has no server.mjs dependency —
* server.mjs imports THIS, not the reverse.
*
* @returns {'container' | 'bare'}
*/
function hostCpMode() {
  // An explicitly set OLAM_HOST_CP_MODE wins verbatim (even an empty string).
  const explicit = process.env.OLAM_HOST_CP_MODE;
  if (explicit == null) {
    // Unset: auto-detect from the marker file the docker runtime creates.
    return existsSync('/.dockerenv') ? 'container' : 'bare';
  }
  return explicit;
}
/**
* Resolve the directory that CONTAINS `config.json` (the `~/.olam` ROOT, or its
* container `/data` equivalent). Re-reads process.env on every call so a direnv
* org-switch or a late OLAM_HOME export is observed (no module-load capture).
*
* @returns {string}
*/
export function olamConfigDir() {
  // Read the environment on every call — never cache at module load.
  const { OLAM_HOME: override } = process.env;
  if (override) {
    // A non-empty OLAM_HOME is an explicit override and wins outright.
    return override;
  }
  return hostCpMode() === 'container'
    ? '/data'
    : path.join(os.homedir(), '.olam');
}
/**
* Absolute path to the canonical `config.json` host-cp reads.
* @returns {string}
*/
export function configJsonPath() {
  // Resolve the directory first so the env is re-read on every call.
  const dir = olamConfigDir();
  return path.join(dir, 'config.json');
}
/**
 * Read a dotted path (e.g. `cloud.urls.anthropic-base-url`) out of the
 * container/host `config.json`.
 *
 * Only `.` splits segments, so dash-containing keys like `kg-proxy-url`
 * are fine. Each segment must be an OWN property of the current node:
 * inherited members such as `constructor` or `toString` are treated as
 * misses, so a crafted dotpath can never surface a prototype function.
 *
 * Fail-open: a non-string `dotpath`, an absent or unreadable file, corrupt
 * JSON, or any path miss returns `null`. Never throws.
 *
 * @param {string} dotpath
 * @returns {string | number | boolean | object | null} The leaf value, the
 *   sub-object for an interior path, or null when not set.
 */
export function readConfigValue(dotpath) {
  if (typeof dotpath !== 'string') return null;
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(configJsonPath(), 'utf8'));
  } catch {
    // File absent / unreadable, or corrupt JSON → not set (fail-open).
    return null;
  }
  let cur = parsed;
  for (const seg of dotpath.split('.')) {
    // Own-property check only: `seg in cur` would also match prototype members.
    if (cur === null || typeof cur !== 'object' || !Object.hasOwn(cur, seg)) return null;
    cur = cur[seg];
  }
  return cur === undefined ? null : cur;
}
/**
* String-typed convenience: returns a non-empty trimmed string leaf, else null.
* Trims to match the legacy file-read helpers (which `.trim()` file contents).
*
* @param {string} dotpath
* @returns {string | null}
*/
export function readConfigString(dotpath) {
  const raw = readConfigValue(dotpath);
  // Non-strings collapse to '' so a single emptiness check covers every miss.
  const trimmed = typeof raw === 'string' ? raw.trim() : '';
  return trimmed === '' ? null : trimmed;
}
// Phase F-2-B (B3): fetch a per-world container's X-Olam-Secret via the
// docker-socket-proxy sidecar (container mode) OR via `docker exec` (bare-
// node mode — host-cp running as a plain Node process on the host).
//
// The secret lives at `/tmp/olam-container-secret` inside the world's
// devbox container. Phase E init wrote it (`chmod 400` owned by root —
// world-app user has no write permission, T9 mitigation) and the
// per-world CP's `requireAuth` middleware compares against it. Host CP
// reads the secret server-side and injects `X-Olam-Secret` on proxied
// requests, so the browser never sees the secret directly.
//
// Container mode (`dockerHost = 'tcp://docker-socket-proxy:2375'`):
// 1. POST /containers/<name>/exec
// body: { Cmd: ['cat', '/tmp/olam-container-secret'], AttachStdout: true, AttachStderr: true }
// → { Id: '<exec-id>' }
// 2. POST /exec/<exec-id>/start
// body: { Detach: false, Tty: false }
// → response stream containing the file bytes (raw multiplexed
// stdout/stderr per Docker exec protocol)
//
// The exec endpoint is whitelisted in the socket-proxy (EXEC=1).
//
// Bare-node mode (`dockerHost = 'docker-cli'`):
// Spawn `docker exec <containerName> cat /tmp/olam-container-secret` via
// child_process. No socket-proxy on the host; the docker CLI on the
// operator's $PATH is the canonical access path. Same `olam-<id>-devbox`
// naming convention applies. ~10 ms of process-spawn overhead per miss
// is fine because the secret is cached for OLAM_SECRET_CACHE_TTL_SEC
// (default 300 s).
import { spawnSync } from 'node:child_process';
/**
* Read /tmp/olam-container-secret from a world's devbox container.
* Throws on any non-2xx response from the socket-proxy or on the
* file being empty (the world's CP is misconfigured if it is).
*
* @param {object} args
* @param {string} args.worldId
* @param {string} args.dockerHost Either `tcp://...` for socket-proxy
* mode or the sentinel `'docker-cli'` for bare-node mode.
* @param {(host: string, init: RequestInit) => Promise<Response>} [args.fetchImpl]
* injectable for tests; defaults to global fetch (Node 22+)
* @returns {Promise<string>} the secret (trimmed of trailing whitespace)
*/
export async function fetchContainerSecret({ worldId, dockerHost, fetchImpl = globalThis.fetch }) {
  // Devbox containers are named `olam-<worldId>-devbox`.
  const containerName = `olam-${worldId}-devbox`;
  const catSecret = ['cat', '/tmp/olam-container-secret'];

  // Shared tail of both modes: an empty (post-trim) secret is an error.
  const ensureSecret = (candidate) => {
    if (!candidate) {
      throw new Error(`/tmp/olam-container-secret empty in container ${containerName}`);
    }
    return candidate;
  };

  // ── Bare-node mode: run `docker exec … cat …` through the docker CLI. ──
  if (dockerHost === 'docker-cli') {
    const { error, status, stdout, stderr } = spawnSync(
      'docker',
      ['exec', containerName, ...catSecret],
      { encoding: 'utf-8' },
    );
    if (error) {
      throw new Error(`docker exec ${containerName} cat ... failed: ${error.message}`);
    }
    if (status !== 0) {
      throw new Error(
        `docker exec ${containerName} cat ... exit ${status}: ${(stderr || '').trim()}`,
      );
    }
    return ensureSecret((stdout || '').trim());
  }

  // ── Container mode: two POSTs against the Docker Engine API. ──
  // `tcp://host:port` is addressed over plain HTTP.
  const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
  const postJson = (url, payload) =>
    fetchImpl(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

  // Step 1: create the exec instance and obtain its id.
  const createRes = await postJson(
    `${apiBase}/containers/${encodeURIComponent(containerName)}/exec`,
    {
      Cmd: catSecret,
      AttachStdout: true,
      AttachStderr: true,
      Tty: false,
    },
  );
  if (!createRes.ok) {
    throw new Error(
      `socket-proxy POST /containers/${containerName}/exec failed: ${createRes.status} ${createRes.statusText}`,
    );
  }
  const createBody = await createRes.json();
  const execId = createBody.Id;
  if (!execId) {
    throw new Error(`socket-proxy /exec did not return Id: ${JSON.stringify(createBody)}`);
  }

  // Step 2: start the exec; the response body is the multiplexed
  // stdout/stderr stream, handed to decodeDockerExecStream below.
  const startRes = await postJson(`${apiBase}/exec/${execId}/start`, { Detach: false, Tty: false });
  if (!startRes.ok) {
    throw new Error(
      `socket-proxy POST /exec/${execId}/start failed: ${startRes.status} ${startRes.statusText}`,
    );
  }
  const rawStream = new Uint8Array(await startRes.arrayBuffer());
  const stdoutBytes = decodeDockerExecStream(rawStream);
  const decoded = new TextDecoder('utf-8').decode(stdoutBytes);
  return ensureSecret(decoded.trim());
}
/**
 * Decode Docker's multiplexed exec stream — keep only stdout (stream id 1).
 *
 * Stream format: each frame is an 8-byte header followed by its payload.
 * Header byte 0 is the stream id (0=stdin, 1=stdout, 2=stderr); bytes 1-3
 * are reserved; bytes 4-7 are the payload length as a big-endian uint32.
 *
 * The length is read as UNSIGNED. Combining the bytes with `<<` / `|` alone
 * yields a signed 32-bit value, so a header with the top bit set would give
 * a negative length and could stall or rewind the cursor (an endless loop on
 * malformed input). A frame whose declared length runs past the end of the
 * buffer contributes whatever bytes are actually present. Trailing bytes
 * shorter than a header are ignored.
 *
 * @param {Uint8Array} buf Raw response body of `POST /exec/<id>/start`.
 * @returns {Uint8Array} Concatenated stdout payloads (empty when there are none).
 */
export function decodeDockerExecStream(buf) {
  const stdoutFrames = [];
  let total = 0;
  let i = 0;
  while (i + 8 <= buf.byteLength) {
    const streamId = buf[i];
    // Big-endian uint32 at offset i+4..i+8; `>>> 0` reinterprets as unsigned.
    const len = ((buf[i + 4] << 24) | (buf[i + 5] << 16) | (buf[i + 6] << 8) | buf[i + 7]) >>> 0;
    const start = i + 8;
    // subarray clamps to the buffer end, so a truncated frame is safe.
    const payload = buf.subarray(start, start + len);
    if (streamId === 1) {
      stdoutFrames.push(payload);
      total += payload.byteLength;
    }
    // Always advances by at least the header size, so the loop terminates.
    i = start + len;
  }
  const merged = new Uint8Array(total);
  let off = 0;
  for (const frame of stdoutFrames) {
    merged.set(frame, off);
    off += frame.byteLength;
  }
  return merged;
}
// crystallize-planning — atomic-or-compensating chunk-copy from a planning
// session (_planning world) into a freshly provisioned real world.
//
// APPEND-ONLY CONSTRAINT: The chunks table has a NO_DELETE + NO_UPDATE
// trigger (chunks_append_only_trigger). If chunk-copy fails mid-batch,
// any chunks already INSERTed under the new worldId STAY in the database.
// Compensating cleanup only calls destroyWorld (world container teardown) —
// it CANNOT delete the orphaned chunks. Those orphan chunks are harmless:
// • idx_chunks_planning only covers world_id='_planning' rows.
// • The destroyed world container no longer exists, so no subscriber
// will ever observe those orphans through the normal shape proxy.
// • Any future re-crystallize creates a fresh worldId, fresh session_id.
//
// IDEMPOTENCY:
// • If crystallize_status is 'crystallized' (with a stored worldId),
// return immediately — the work is already done.
// • If crystallize_status is 'in_progress', we cannot safely resume
// (we don't know how far the previous copy got, and the chunk INSERT
// is not idempotent by worldId+sessionId alone — the PRIMARY KEY is
// (message_id, seq), so the same chunk could be re-inserted into a
// different new session without collision). Safe behavior: return
// the current status so the UI can display "in progress" and the
// operator can force-retry after manual inspection.
//
// SLUG RULE: lowercased, non-alphanum → hyphens, max 40 chars.
// Matches the dev-substrate stub in plan-chat-spa/src/server/index.ts
// (confirmed by reading that file's crystallize stub, around line 983).
import { randomUUID } from 'node:crypto';
import { PLANNING_WORLD_ID } from '@olam/chunks/schema';
import { setCrystallizeStatus } from './planning-sessions.mjs';
/**
 * Slug a plan title into a world-name-safe string.
 *
 * Lowercased; every run of non-alphanumerics becomes a single hyphen;
 * leading/trailing hyphens removed; capped at 40 chars. The trailing-hyphen
 * trim is applied again AFTER the cap, because cutting at 40 chars can land
 * directly after a separator and would otherwise leave a slug ending in `-`.
 * Falls back to 'plan' if the result is empty.
 *
 * @param {string} title
 * @returns {string} A slug of 1–40 chars matching /^[a-z0-9]+(-[a-z0-9]+)*$/.
 */
function slugTitle(title) {
  const base = title
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 40)
    // The cut may have exposed a separator as the last character.
    .replace(/-+$/, '');
  return base || 'plan';
}
/**
* Read the current crystallize_status + crystallized_world_id for a session.
*
* @param {object} pool
* @param {string} sessionId
* @returns {Promise<{crystallize_status: string, crystallized_world_id: string | null}>}
*/
async function readCrystallizeState(pool, sessionId) {
  const { rows } = await pool.query(
    `SELECT crystallize_status, crystallized_world_id
FROM planning_sessions
WHERE session_id = $1`,
    [sessionId],
  );
  // No row for this session: report it as 'open' with no world attached.
  if (rows.length === 0) {
    return { crystallize_status: 'open', crystallized_world_id: null };
  }
  const { crystallize_status, crystallized_world_id } = rows[0];
  return {
    crystallize_status,
    // Normalise a missing/undefined column to null.
    crystallized_world_id: crystallized_world_id ?? null,
  };
}
/**
* SELECT all planning chunks for a session, ordered by seq.
*
* @param {object} pool
* @param {string} sessionId
* @returns {Promise<Array<{world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type}>>}
*/
async function selectPlanningChunks(pool, sessionId) {
  // Scope the read to the planning world + the given session.
  const params = [PLANNING_WORLD_ID, sessionId];
  const { rows } = await pool.query(
    `SELECT world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type
FROM chunks
WHERE world_id = $1 AND session_id = $2
ORDER BY seq ASC`,
    params,
  );
  return rows;
}
/**
* INSERT a single chunk into the new world's session.
* Uses the original message_id + seq verbatim; only world_id and
* session_id change to point at the new world's session.
*
* @param {object} pool
* @param {object} chunk — row from the planning session
* @param {string} newWorldId
* @param {string} newSessionId
* @returns {Promise<void>}
*/
async function insertChunkIntoNewWorld(pool, chunk, newWorldId, newSessionId) {
  // Everything except world_id / session_id is carried over verbatim.
  const {
    message_id: messageId,
    seq,
    actor_id: actorId,
    actor_type: actorType,
    role,
    chunk: body,
    chunk_type: chunkType,
  } = chunk;
  const values = [newWorldId, newSessionId, messageId, seq, actorId, actorType, role, body, chunkType];
  await pool.query(
    `INSERT INTO chunks
(world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
    values,
  );
}
/**
* INSERT a system marker chunk into the ORIGINAL planning session to
* leave an audit trail of crystallization. The marker lands at
* world_id='_planning' + the original sessionId.
*
* @param {object} pool
* @param {string} sessionId — original planning session id
* @param {string} worldId — newly created world id
* @param {number} phaseCount — number of phases in the plan
* @returns {Promise<void>}
*/
async function insertMarkerChunk(pool, sessionId, worldId, phaseCount) {
  const messageId = randomUUID();

  // Place the marker one past the session's current highest seq
  // (COALESCE makes an empty session start at 0).
  const { rows } = await pool.query(
    `SELECT COALESCE(MAX(seq), -1) AS max_seq
FROM chunks
WHERE world_id = $1 AND session_id = $2`,
    [PLANNING_WORLD_ID, sessionId],
  );
  const nextSeq = Number(rows[0].max_seq) + 1;

  const plural = phaseCount === 1 ? '' : 's';
  const text = `Plan crystallized into world "${worldId}" (${phaseCount} phase${plural}).`;
  const actor = 'system';

  await pool.query(
    `INSERT INTO chunks
(world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
    [PLANNING_WORLD_ID, sessionId, messageId, nextSeq, actor, actor, actor, text, 'text'],
  );
}
/**
* crystallizePlanningSession
*
* 4-phase atomic-or-compensating process:
* 1. Set crystallize_status='in_progress'
* 2. Call createWorld({ name: slugged-title }) → { id: worldId }
* 3. SELECT all chunks in _planning/sessionId; INSERT each into new world
* 4. Set crystallize_status='crystallized' (with worldId); INSERT marker chunk
*
* Compensating pattern on partial failure:
* - If createWorld throws: set status='failed', rethrow. destroyWorld NOT called.
* - If chunk-copy throws mid-batch: set status='failed', call destroyWorld(worldId),
* rethrow. Orphan chunks already INSERTed stay (append-only; see file header).
*
* Idempotency:
* - Already 'crystallized': return immediately without re-running.
* - Already 'in_progress': return current status (safe short-circuit; see header).
*
* @param {object} opts
* @param {object} opts.pool — pg.Pool-compatible with .query()
* @param {string} opts.sessionId — planning session to crystallize
* @param {string} opts.planTitle — plan title (used for world name slug)
* @param {Array} opts.planPhases — array of phase objects (name, acceptance, risks?)
* @param {Function} opts.createWorld — async ({ name }) => { id: string, ... }
* @param {Function} opts.destroyWorld — async (worldId) => void
*
* @returns {Promise<{worldId: string, status: string, new_session_id: string}>}
* @throws on failure (crystallize_status already set to 'failed' when thrown)
*/
export async function crystallizePlanningSession({
  pool,
  sessionId,
  planTitle,
  planPhases,
  createWorld,
  destroyWorld,
}) {
  // Resolve the phase count up front. It is only needed for the marker text,
  // and a missing/non-array `planPhases` must not blow up in Phase 4 AFTER the
  // session has already been marked crystallized.
  const phaseCount = Array.isArray(planPhases) ? planPhases.length : 0;

  // ── Idempotency guard ────────────────────────────────────────────────────
  const currentState = await readCrystallizeState(pool, sessionId);
  if (currentState.crystallize_status === 'crystallized') {
    // Work already done: report the stored world; no new session is created.
    return {
      worldId: currentState.crystallized_world_id,
      status: `crystallized:${currentState.crystallized_world_id}`,
      new_session_id: null,
    };
  }
  if (currentState.crystallize_status === 'in_progress') {
    // Cannot safely resume without knowing how far the copy got.
    // Return current status so the UI shows 'in_progress'.
    return {
      worldId: currentState.crystallized_world_id,
      status: 'in_progress',
      new_session_id: null,
    };
  }

  // ── Phase 1: mark in_progress ────────────────────────────────────────────
  await setCrystallizeStatus({ pool, sessionId, status: 'in_progress', worldId: null });

  // ── Phase 2: create world ────────────────────────────────────────────────
  let worldId;
  try {
    const worldName = slugTitle(planTitle);
    const world = await createWorld({ name: worldName });
    worldId = world.id;
  } catch (err) {
    // Nothing was provisioned, so there is nothing to compensate.
    await setCrystallizeStatus({ pool, sessionId, status: 'failed', worldId: null });
    throw err;
  }

  // ── Phase 3: copy chunks into new world ──────────────────────────────────
  const newSessionId = randomUUID();
  try {
    const chunks = await selectPlanningChunks(pool, sessionId);
    // Sequential on purpose: inserts happen one at a time, in seq order.
    for (const chunk of chunks) {
      await insertChunkIntoNewWorld(pool, chunk, worldId, newSessionId);
    }
  } catch (err) {
    try {
      await setCrystallizeStatus({ pool, sessionId, status: 'failed', worldId: null });
    } finally {
      // Compensation runs even when recording 'failed' itself throws —
      // otherwise the freshly created world would leak. If the status write
      // did throw, that error propagates once teardown has been attempted.
      try {
        await destroyWorld(worldId);
      } catch {
        // Compensating destroy failure is non-fatal — the world may already
        // be partially torn down or the destroy operation may not be
        // reversible. Log is left to the caller's context.
      }
    }
    throw err;
  }

  // ── Phase 4: mark crystallized + insert marker ───────────────────────────
  await setCrystallizeStatus({ pool, sessionId, status: 'crystallized', worldId });
  await insertMarkerChunk(pool, sessionId, worldId, phaseCount);
  return {
    worldId,
    status: `crystallized:${worldId}`,
    new_session_id: newSessionId,
  };
}
/**
* dispatch-persister.mjs — persist the last dispatch for each world.
*
* The world watchdog's recovery hook reads this to replay the last
* unanswered prompt when it auto-recovers a wedged claude process.
*
* Contract:
* persist({ worldId, messageId, prompt, source, statePath?, now? })
* Atomically writes ~/.olam/worlds/<worldId>/state/last-dispatch.json.
* Overwrites any previous file — only the LATEST dispatch matters for
* replay. Atomic write (tmp + fs.rename) prevents partial-write residue
* from corrupting recovery reads.
*
* read({ worldId, statePath? })
* Returns { messageId, prompt, dispatchedAt, source } or null.
* null on ENOENT (no dispatch persisted yet) — never throws.
* null on JSON parse error (logs + skips) — never throws on corrupt file.
*
* Multiple worlds are independent: world A and world B have separate files.
* Multiple concurrent persist() calls for the SAME world are safe — each
* write is a rename of a tmp file so the worst case is one write winning.
*
* @see docs/architecture/world-watchdog.md
*/
import fs from 'node:fs/promises';
import path from 'node:path';
import os from 'node:os';
// Default base path under which per-world state directories live:
// `<home>/.olam/worlds`. Resolved once, at module load, from os.homedir().
// lastDispatchPath() uses it as the default for its `stateBase` parameter.
const DEFAULT_STATE_BASE = path.join(os.homedir(), '.olam', 'worlds');
/**
 * Build the location of a world's `last-dispatch.json` file:
 * `<stateBase>/<worldId>/state/last-dispatch.json`.
 *
 * @param {string} worldId Identifier of the world whose state file is wanted.
 * @param {string} [stateBase] Root directory to resolve against; defaults to
 *   DEFAULT_STATE_BASE (tests pass their own).
 * @returns {string} The joined file path.
 */
export function lastDispatchPath(worldId, stateBase = DEFAULT_STATE_BASE) {
  const worldStateDir = path.join(stateBase, worldId, 'state');
  return path.join(worldStateDir, 'last-dispatch.json');
}
/**
 * Persist the last dispatch for a world.
 *
 * Writes `{ messageId, prompt, dispatchedAt, source }` as pretty-printed JSON
 * to `statePath` (or, when omitted, to `lastDispatchPath(worldId)`), replacing
 * any previous file. The write is atomic: the record is first written to a
 * temp file next to the target and then renamed over it.
 *
 * The temp file name is unique per call. A shared `<file>.tmp` name would let
 * two concurrent calls for the same world interleave their writes and make the
 * slower `rename` fail with ENOENT; with unique names every call succeeds and
 * the last rename wins.
 *
 * @param {{
 *   worldId: string,
 *   messageId: string,
 *   prompt: string,
 *   source: string,
 *   statePath?: string,
 *   now?: () => number,
 * }} opts `statePath` is the full path of the target JSON file; `now` returns
 *   epoch milliseconds and is injectable for tests.
 * @returns {Promise<void>}
 * @throws Rejects with the underlying fs error when mkdir/write/rename fails
 *   (the temp file is removed best-effort first).
 */
export async function persist({
  worldId,
  messageId,
  prompt,
  source,
  statePath,
  now = () => Date.now(),
}) {
  const filePath = statePath ?? lastDispatchPath(worldId);
  const dir = path.dirname(filePath);
  // Per-call unique suffix (pid + random token). Uniqueness, not secrecy, is
  // the goal here, so Math.random() is sufficient.
  const uniqueSuffix = `${process.pid}.${Math.random().toString(36).slice(2)}`;
  const tmpPath = `${filePath}.${uniqueSuffix}.tmp`;
  const record = {
    messageId,
    prompt,
    dispatchedAt: new Date(now()).toISOString(),
    source,
  };
  // Ensure the directory exists.
  await fs.mkdir(dir, { recursive: true });
  try {
    // Atomic write: write to the temp file, then rename over the target.
    await fs.writeFile(tmpPath, `${JSON.stringify(record, null, 2)}\n`, 'utf8');
    await fs.rename(tmpPath, filePath);
  } catch (err) {
    // Don't leave temp residue behind; cleanup failure must not mask `err`.
    await fs.rm(tmpPath, { force: true }).catch(() => {});
    throw err;
  }
}
/**
 * Fire-and-forget wrapper around persist() for the dispatch call-sites.
 *
 * Starts the write without awaiting it and attaches a rejection handler that
 * emits a console warning tagged with `logSource` (falling back to `source`
 * when `logSource` is not supplied). `logSource` is stripped from the options
 * before they are handed to persist(); everything else is forwarded as-is.
 *
 * @param {{
 *   worldId: string,
 *   messageId: string,
 *   prompt: string,
 *   source: string,
 *   logSource?: string,
 *   statePath?: string,
 *   now?: () => number,
 * }} opts
 * @returns {void}
 */
export function safePersistLastDispatch(opts) {
  const { logSource: tagOverride, ...persistOpts } = opts;
  const tag = tagOverride === undefined ? opts.source : tagOverride;

  const warnOnFailure = (err) => {
    const reason = err?.message ?? err;
    console.warn(`[${tag}] persistLastDispatch failed (non-fatal): ${reason}`);
  };

  // Intentionally not awaited: a persistence failure must never block dispatch.
  persist(persistOpts).catch(warnOnFailure);
}
/**
 * Load the most recently persisted dispatch record for a world.
 *
 * Fail-soft by design — every failure mode resolves to `null`:
 *   - file missing (ENOENT): silent
 *   - any other read error: logged via console.error
 *   - invalid JSON: logged via console.error
 *   - JSON that is not an object with string `messageId`, `prompt`,
 *     `dispatchedAt` and `source`: logged via console.error
 *
 * Only those four fields are returned; extra keys in the file are dropped.
 *
 * @param {{
 *   worldId: string,
 *   statePath?: string,
 * }} opts `statePath` is the full file path; defaults to lastDispatchPath(worldId).
 * @returns {Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null>}
 */
export async function read({ worldId, statePath }) {
  const filePath = statePath ?? lastDispatchPath(worldId);

  let text;
  try {
    text = await fs.readFile(filePath, 'utf8');
  } catch (err) {
    // A missing file just means nothing was dispatched yet — stay quiet.
    // Anything else (e.g. permissions) is worth a log line, but still soft.
    if (err?.code !== 'ENOENT') {
      console.error(`[dispatch-persister] readFile ${filePath}: ${err?.message ?? err}`);
    }
    return null;
  }

  let record;
  try {
    record = JSON.parse(text);
  } catch (err) {
    console.error(`[dispatch-persister] ${filePath}: JSON parse error: ${err?.message ?? err}`);
    return null;
  }

  // Basic shape validation — a corrupt file is skipped, never thrown on.
  const requiredStringFields = ['messageId', 'prompt', 'dispatchedAt', 'source'];
  const hasExpectedShape =
    typeof record === 'object' &&
    record !== null &&
    requiredStringFields.every((field) => typeof record[field] === 'string');
  if (!hasExpectedShape) {
    console.error(`[dispatch-persister] ${filePath}: unexpected shape, skipping`);
    return null;
  }

  const { messageId, prompt, dispatchedAt, source } = record;
  return { messageId, prompt, dispatchedAt, source };
}
// Phase F-2-B (B3): subscribe to docker events stream and invalidate
// the secret cache on lifecycle events for known worlds.
//
// M2 ship gate: `docker restart <world>; within 10s, proxy call returns
// 200 not 401`. The 10s budget is dominated by docker-events latency
// (events fire ~1s after the docker daemon emits them) + JSON parse +
// cache invalidate (<100ms). 10s is conservative.
//
// Stream format: Docker sends NDJSON — newline-delimited JSON events.
// Each event has shape:
// {"Type":"container","Action":"start","Actor":{"Attributes":{"name":"<container-name>"}},...}
// We filter `Type === 'container'` && `Action ∈ INVALIDATING_ACTIONS` and
// extract the worldId from the container name to invalidate the secret cache.
//
// Dogfood incident (2026-05-08): host-cp returned `secret_fetch_failed`
// 502 / `unauthorized 401` after operators ran `docker start <devbox>`
// on previously-exited world containers. Two bugs combined:
// 1. The action filter excluded `start`. After SIGKILL → exit, the
// operator's `docker start` emits a `start` event (NOT `restart`),
// which the filter dropped — so the stale cached secret survived.
// 2. The container-name regex was `/^(.+)-devbox$/`, predating the
// `olam-` prefix added in Phase F-2-D. Even when the filter did
// fire, it invalidated the wrong cache key (`olam-foo` instead of
// `foo`), so the actual cache entry stayed.
// Both are fixed below; tests use production naming to prevent drift.
import http from 'node:http';
import { spawn } from 'node:child_process';
import { getDockerRequestOptions } from './lib/docker-request-options.mjs';
/**
 * Container lifecycle events that may change the per-world secret.
 *
 * - `start` — fresh boot of a previously-exited container; secret is
 *   regenerated by Phase E init, cache MUST drop the old value.
 * - `restart` — implicit stop+start; same secret-regeneration semantics.
 * - `stop` / `die` / `kill` — secret is no longer reachable; invalidating
 *   prevents host-cp from handing out a stale value the moment
 *   `docker start` brings the container back.
 *
 * `pause` / `unpause` are intentionally excluded — those don't change the
 * secret, and invalidating would force an unnecessary docker-exec on
 * resume.
 *
 * This single list is used twice in this module: as the `event` filter sent
 * to the Docker Engine API `/events` endpoint, and as the allow-list that
 * handleEvent() checks `event.Action` against — so both transports agree on
 * which actions count.
 */
const INVALIDATING_ACTIONS = ['start', 'restart', 'stop', 'die', 'kill'];
/**
 * Subscribe to docker events. Returns a stop function. Auto-reconnects
 * (fixed 2s delay) on transient errors — the events stream is long-lived;
 * a daemon restart breaks the connection but the subscription recovers.
 *
 * Two transports feed the same handleEvent() path:
 *   - HTTP: `GET /events?filters=…` against the Docker Engine API, using
 *     the request options returned by getDockerRequestOptions().
 *   - CLI: a spawned `docker events` child whose stdout is parsed as NDJSON.
 *
 * @param {object} args
 * @param {string} args.dockerHost Either a docker host string (`unix:…`
 *   selects the 'kubernetes' request options, anything else 'compose') or
 *   the sentinel `'docker-cli'` for bare-node mode (spawns `docker events`
 *   via child_process).
 * @param {(worldId: string) => void} args.onWorldRestart
 *   called when a known world starts/restarts/stops/dies/is killed
 * @param {(info: { worldId: string, action: string, exitCode?: number }) => void} [args.onWorldLifecycleEvent]
 *   Additive observer (Killshot #2): fires alongside onWorldRestart with
 *   the raw docker action + exitCode when present. Optional — existing
 *   callers (tests, etc.) don't supply it.
 * @param {(message: string) => void} [args.log] defaults to console.log
 * @returns {() => void} stop function: cancels any pending reconnect, tears
 *   down the active request / child process, and prevents further reconnects.
 */
export function subscribeDockerEvents({ dockerHost, onWorldRestart, onWorldLifecycleEvent, log = console.log }) {
  let stopped = false;
  let activeReq = null;
  let activeProc = null;
  let reconnectTimer = null;

  /**
   * Build a stream 'data' listener that reassembles NDJSON lines across
   * chunk boundaries and dispatches each parsed event to handleEvent().
   * Shared by both transports so their parsing cannot drift apart.
   *
   * @param {(event: any) => void} [normalize] optional in-place fix-up applied
   *   to each parsed event before dispatch; errors it throws are logged the
   *   same way as JSON parse errors.
   */
  function createNdjsonListener(normalize) {
    let buf = '';
    return (chunk) => {
      buf += chunk;
      // NDJSON: split on newlines; the last fragment may be partial and
      // stays in `buf` until the next chunk completes it.
      let nl;
      while ((nl = buf.indexOf('\n')) !== -1) {
        const line = buf.slice(0, nl);
        buf = buf.slice(nl + 1);
        if (!line.trim()) continue;
        try {
          const event = JSON.parse(line);
          if (normalize) normalize(event);
          handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log });
        } catch (err) {
          log(`docker-events: parse error on line: ${line.slice(0, 120)} (${err.message})`);
        }
      }
    };
  }

  // CLI output may carry the action under `status` and omit `Type`; bring it
  // to the HTTP API shape that handleEvent() expects. The CLI is already
  // filtered to type=container, so a missing Type is safe to default.
  function normalizeCliEvent(event) {
    if (event.status && !event.Action) event.Action = event.status;
    if (event.Type === undefined) event.Type = 'container';
  }

  // Bare-node mode: shell out to `docker events --format json` and parse
  // its stdout as NDJSON. Same semantic as the HTTP path; different
  // transport. Eliminates the `tcp://docker-cli` URL-construction crash.
  function connectViaCli() {
    if (stopped) return;
    const filters = ['--filter', 'type=container'];
    log('docker-events: spawning `docker events --format json`');
    const child = spawn(
      'docker',
      ['events', '--format', '{{json .}}', ...filters],
      { stdio: ['ignore', 'pipe', 'pipe'] },
    );
    activeProc = child;
    child.stdout.setEncoding('utf-8');
    child.stdout.on('data', createNdjsonListener(normalizeCliEvent));
    child.stderr.on('data', (chunk) => {
      const text = String(chunk).trim();
      if (text) log(`docker-events: stderr: ${text}`);
    });
    child.on('exit', (code, signal) => {
      // Only clear the handle if it still points at this child, so a late
      // exit from an old child cannot drop the reference to a newer one.
      if (activeProc === child) activeProc = null;
      log(`docker-events: child exited code=${code} signal=${signal}; reconnecting`);
      scheduleReconnect();
    });
    child.on('error', (err) => {
      log(`docker-events: spawn error: ${err.message}; reconnecting`);
      scheduleReconnect();
    });
  }

  function connect() {
    if (stopped) return;
    if (dockerHost === 'docker-cli') {
      return connectViaCli();
    }
    // Docker Engine API: GET /events?filters=...
    // Filter: type=container AND event ∈ INVALIDATING_ACTIONS
    // (Note: `filters` is a JSON-stringified map of string arrays.)
    //
    // B8 fix (Phase 2 recovery round-2): use getDockerRequestOptions(substrate)
    // instead of constructing a URL from dockerHost. The old code did:
    //   new URL('/events', dockerHost.replace(/^tcp:\/\//, 'http://'))
    // On kubernetes, dockerHost = 'unix:///var/run/docker.sock' — the replace
    // is a no-op, `unix:` is not a valid http URL base, and Node throws
    // ERR_INVALID_URL. The options-spread form routes through socketPath
    // (kubernetes) or host+port (compose), which Node's http module
    // understands natively. No URL construction needed.
    const substrate = dockerHost.startsWith('unix:') ? 'kubernetes' : 'compose';
    const filters = JSON.stringify({
      type: ['container'],
      event: INVALIDATING_ACTIONS,
    });
    const requestPath = `/events?filters=${encodeURIComponent(filters)}`;
    const dockerOpts = getDockerRequestOptions(substrate);
    const connLabel = substrate === 'kubernetes'
      ? `unix:${dockerOpts.socketPath}/events`
      : `http://${dockerOpts.host}:${dockerOpts.port}/events`;
    log(`docker-events: connecting to ${connLabel}`);
    activeReq = http.get({ ...dockerOpts, path: requestPath }, (res) => {
      if (res.statusCode !== 200) {
        log(`docker-events: unexpected status ${res.statusCode}; will retry`);
        // Drain the unwanted body: an unconsumed response keeps its socket
        // (and memory) tied up, and this path repeats on every retry.
        res.resume();
        scheduleReconnect();
        return;
      }
      res.setEncoding('utf-8');
      res.on('data', createNdjsonListener());
      res.on('end', () => {
        log('docker-events: stream closed; reconnecting');
        scheduleReconnect();
      });
      res.on('error', (err) => {
        log(`docker-events: stream error: ${err.message}; reconnecting`);
        scheduleReconnect();
      });
    });
    activeReq.on('error', (err) => {
      log(`docker-events: connect error: ${err.message}; reconnecting`);
      scheduleReconnect();
    });
  }

  // Idempotent: several failure callbacks may fire for one broken
  // connection, but at most one reconnect is ever pending.
  function scheduleReconnect() {
    if (stopped) return;
    if (reconnectTimer) return;
    reconnectTimer = setTimeout(() => {
      reconnectTimer = null;
      connect();
    }, 2000); // 2s backoff
  }

  connect();

  return function stop() {
    stopped = true;
    if (reconnectTimer) {
      clearTimeout(reconnectTimer);
      reconnectTimer = null;
    }
    if (activeReq) {
      activeReq.destroy();
      activeReq = null;
    }
    if (activeProc) {
      try { activeProc.kill('SIGTERM'); } catch { /* ignore */ }
      activeProc = null;
    }
  };
}
/**
 * Route one docker event: when it is an invalidating lifecycle action on a
 * world container, notify `onWorldRestart` (and the optional lifecycle
 * observer). Anything else is ignored. Container naming convention:
 * `olam-<worldId>-devbox` (per packages/adapters/src/docker/container.ts:67).
 *
 * Exported for unit testing.
 *
 * @param {{ Type?: string, Action?: string, Actor?: { Attributes?: Record<string, string> } }} event
 * @param {{ onWorldRestart: (worldId: string) => void, onWorldLifecycleEvent?: (info: { worldId: string, action: string, exitCode?: number }) => void, log: (m: string) => void }} ctx
 */
export function handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log }) {
  const isContainerEvent = event?.Type === 'container';
  if (!isContainerEvent) return;

  const action = event.Action ?? '';
  if (!INVALIDATING_ACTIONS.includes(action)) return;

  const attributes = event.Actor?.Attributes;
  const rawName = attributes?.name;
  if (!rawName) return;

  // Docker sometimes prepends a slash to container names; drop it.
  const containerName = rawName.startsWith('/') ? rawName.substring(1) : rawName;

  // Match the production naming `olam-<worldId>-devbox` literally. Requiring
  // both the `olam-` prefix and the `-devbox` suffix keeps other containers
  // that merely start with `olam-` (e.g. `olam-host-cp`) out of the
  // cache-invalidate path.
  const nameMatch = containerName.match(/^olam-(.+)-devbox$/);
  if (nameMatch === null) return;
  const [, worldId] = nameMatch;

  log(`docker-events: ${action} on ${containerName} → invalidating ${worldId}`);
  onWorldRestart(worldId);

  // Killshot #2 (additive): the lifecycle observer is optional.
  if (!onWorldLifecycleEvent) return;

  // The exit code arrives as a string attribute. Forward it only when it
  // converts to a finite number; otherwise forward `undefined` so the
  // consumer sees an unambiguous "no exit code observed".
  let exitCode;
  const rawExit = attributes?.exitCode;
  if (rawExit !== undefined) {
    const numericExit = Number(rawExit);
    if (Number.isFinite(numericExit)) exitCode = numericExit;
  }

  try {
    onWorldLifecycleEvent({ worldId, action, exitCode });
  } catch (err) {
    // Best-effort instrumentation: an observer failure must not break the
    // cache-invalidate hot path.
    log(`docker-events: onWorldLifecycleEvent threw for ${worldId}: ${err.message}`);
  }
}
// Container-engine identity for host-cp.
//
// Phase 1a / A1: defaults to "docker"; switches to "kubernetes" when running
// inside a K8s pod (autodetected via KUBERNETES_SERVICE_HOST). Operators can
// override either way via OLAM_HOST_CP_ENGINE.
//
// This module exists separately from server.mjs to keep the engine-resolution
// logic pure (no I/O, no mkdir, no global side-effects) so unit tests can
// import it without triggering server startup. server.mjs imports
// resolveHostCpEngine from here and computes its module-level HOST_CP_ENGINE
// constant.
//
// KubernetesEngine adapter (Phase B / PR3) consumes the same env variables
// when constructing the engine; the context-allowlist guard (T6 / Decision 10)
// lives inside that adapter, not here. This module is "what name to surface
// in the X-Olam-Engine response header" — nothing more.
/**
 * Resolve the active container-engine identity for host-cp.
 *
 * Precedence (matches HOST_CP_MODE convention at server.mjs:85-87):
 *   1. Explicit env override: OLAM_HOST_CP_ENGINE=docker|kubernetes
 *   2. Autodetect: KUBERNETES_SERVICE_HOST set → "kubernetes"
 *   3. Default: "docker"
 *
 * A blank override (empty or whitespace-only, e.g. `OLAM_HOST_CP_ENGINE=` in
 * an env file) is treated as "not set" and falls through to autodetect
 * rather than being returned as an empty engine name. Surrounding whitespace
 * on a real override is trimmed. The override value is otherwise returned
 * as-is — it is not checked against the docker|kubernetes set.
 *
 * @param {NodeJS.ProcessEnv} [env=process.env] - environment to inspect.
 * @returns {string} - engine identity surfaced via X-Olam-Engine header.
 */
export function resolveHostCpEngine(env = process.env) {
  const override = String(env.OLAM_HOST_CP_ENGINE ?? '').trim();
  if (override !== '') return override;
  return env.KUBERNETES_SERVICE_HOST ? 'kubernetes' : 'docker';
}
// E1 (Phase E — olam-repos-and-runbooks): read ~/.olam/config.json and
// expose it to the host-CP API endpoints (/api/repos, /api/runbooks).
//
// Never crashes: missing file → empty arrays, corrupt JSON → { error }.
// Mirrors the workspace-catalog.mjs pattern: pure function, env-driven
// path, no side effects at module load time.
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
// Default location of the global config file, resolved once at module load:
// the OLAM_CONFIG_PATH env var when it is set, otherwise ~/.olam/config.json.
// NOTE: `??` only falls back on null/undefined, so an OLAM_CONFIG_PATH that
// is set to an empty string is used as-is rather than replaced by the default.
const DEFAULT_CONFIG_PATH =
process.env.OLAM_CONFIG_PATH ??
path.join(os.homedir(), '.olam', 'config.json');
/**
* @typedef {object} RepoEntry
* @property {string} name
* @property {string} path
* @property {string} [description]
* @property {number} [addedAt]
* @property {number} [updatedAt]
*/
/**
* @typedef {object} Runbook
* @property {string} name
* @property {string[]} repos
* @property {number} [updatedAt]
* @property {Record<string, Record<string, number>>} [portMap]
* @property {Record<string, Record<string, string>>} [env]
*/
/**
* @typedef {{ repos: RepoEntry[], runbooks: Runbook[] }} GlobalConfig
* @typedef {{ error: string }} ConfigError
*/
/**
 * Load the global olam config from disk. Never throws.
 *   - Missing file → `{ repos: [], runbooks: [] }`
 *   - Unreadable file (permissions, path is a directory, …) → `{ error: string }`
 *   - Corrupt JSON, or JSON that is not a plain object → `{ error: string }`
 *   - Success → `{ repos: RepoEntry[], runbooks: Runbook[] }`; a `repos` /
 *     `runbooks` value that is absent or not an array becomes `[]`.
 *     Individual entries are NOT validated.
 *
 * The file is read in a single step and "missing" is detected from the read
 * error itself. A separate exists-check followed by a read would report a
 * file that disappears in between as a read failure instead of as missing.
 *
 * @param {string} [configPath] defaults to DEFAULT_CONFIG_PATH
 * @returns {GlobalConfig | ConfigError}
 */
export function loadGlobalConfig(configPath = DEFAULT_CONFIG_PATH) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, 'utf-8');
  } catch (err) {
    // ENOENT: the file (or a parent directory) does not exist.
    // ENOTDIR: a path component is a regular file. Both mean "no config".
    if (err?.code === 'ENOENT' || err?.code === 'ENOTDIR') {
      return { repos: [], runbooks: [] };
    }
    return { error: `Failed to read ${configPath}: ${err?.message ?? err}` };
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    return { error: `Invalid JSON in ${configPath}: ${err.message}` };
  }

  // Valid JSON can still be null, a scalar, or an array — none are a config.
  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return { error: `${configPath} does not contain a JSON object` };
  }

  return {
    repos: Array.isArray(parsed.repos) ? parsed.repos : [],
    runbooks: Array.isArray(parsed.runbooks) ? parsed.runbooks : [],
  };
}
// W4 — Halt-shape detection for the host-cp chunk-write proxy.
//
// When plan-DO's dispatchPlanningAgent (W1) trips a guardrail, it
// emits a chunk with chunk_type='goal_mode_assumption' and content
// matching: `[assumption: <cap>-tripped — spent $X.XXXX of $Y]` (or
// similar shape per GuardrailState.haltChunkText()).
//
// host-cp's /api/plan-chat proxy passes the chunk through to the
// chunks substrate AND, if it detects a halt-shaped chunk, broadcasts
// a typed `plan.halted` event on host-stream so the SPA's
// PlanHaltBanner subscriber fires.
//
// Extracted as a pure fn so it can be unit-tested without booting
// the host-cp server.
// Matches the START of a halt chunk's text (anchored with `^` only, so any
// trailing text such as ` of $Y]` is ignored).
//   group 1 — the tripped cap: usd | turns | tool_calls | wall_clock
//   group 2 — optional: the digits/dots following `— spent $`; undefined when
//             the "spent" clause is absent
const HALT_RE =
/^\[assumption:\s*(usd|turns|tool_calls|wall_clock)-tripped(?:\s*—\s*spent\s*\$([0-9.]+))?/;
/**
 * Recognise a halt-shaped chunk and pull out the fields the `plan.halted`
 * event carries.
 *
 * Yields null when any of these hold:
 *   - `chunk` is falsy or not an object
 *   - `chunk.chunk_type` is not 'goal_mode_assumption'
 *   - `chunk.chunk` is not a string, or does not match HALT_RE
 *
 * Otherwise yields the payload:
 *   - plan_id / operator_id: taken from `session_id` / `operator_id`,
 *     'unknown' when those are null/undefined
 *   - halt_reason: the tripped cap captured by the regex
 *   - usd_spent_so_far: parsed spent amount, or undefined when not captured
 *   - halted_at: Date.now() at detection time
 */
export function detectHaltChunk(chunk) {
  const isObject = Boolean(chunk) && typeof chunk === 'object';
  if (!isObject) return null;

  const { chunk_type: chunkType, chunk: content } = chunk;
  if (chunkType !== 'goal_mode_assumption' || typeof content !== 'string') {
    return null;
  }

  const match = HALT_RE.exec(content);
  if (match === null) return null;
  const [, haltReason, spentText] = match;

  return {
    plan_id: chunk.session_id ?? 'unknown',
    operator_id: chunk.operator_id ?? 'unknown',
    halt_reason: haltReason,
    usd_spent_so_far: spentText ? Number.parseFloat(spentText) : undefined,
    halted_at: Date.now(),
  };
}
// Phase A → E (sse-consolidation): server-side multiplexed-SSE broadcaster.
//
// Single endpoint /api/host-stream replaces ~20 SPA polling loops. Hooks
// subscribe to typed events on one connection instead of opening one
// setInterval-loop per resource.
//
// Mirrors planOrchestrator.addEventSink fanout pattern verbatim — same
// per-sink ServerResponse Set, same `event: <name>\ndata: <json>\n\n`
// wire format, same cleanup-on-disconnect contract. Differences:
//
// - Keyed by event TYPE rather than conversationId (the broadcaster is
// global to the host-cp, not per-conversation).
// - Caches last-known payload per event type so reconnecting clients
// receive an immediate snapshot replay before live updates resume.
// - No turn-buffering — snapshots are idempotent so reconnect == latest.
//
// Phase E adds operational polish:
// - E1: per-event-type trailing-edge debounce (default 100ms).
// Coalesces broadcast storms during world boot.
// - E2: per-sink 25s heartbeat (`:\n\n` comment) to keep idle SSE
// connections alive across most proxy 60s timeouts.
// - E3: backpressure-aware writes — slow sinks queue up to a bounded
// in-memory buffer; overflow drops oldest events with an
// `:overflow` comment so consumers know they missed updates.
// - E4: per-event-type broadcast counter + sink count metric line.
// - E5: the metrics tick ALSO broadcasts a `stream.health` typed event
// carrying the same counters it logs, so any SPA tab can observe
// live stream health (sink count, per-event broadcast rates,
// overflow drops) without polling. Snapshot-cached like every
// other state event — reconnecting clients replay the last
// health payload immediately (first-paint parity). Opt out via
// `deps.healthEvents = false`.
//
// Pure module: no docker, no DB, no global clock except `setInterval`
// for the heartbeat/metrics timers (injectable in tests). Wiring those
// sources to broadcast(...) lives in server.mjs (A4 + A5).
//
// References:
// - packages/host-cp/src/server.mjs:1531 SSE writer template
// - packages/host-cp/src/plan-orchestrator.mjs:967 addEventSink shape
// - docs/plans/sse-consolidation/plan-source.md full design
// - docs/plans/sse-consolidation/phase-e-tasks.md E1-E4 acceptance
import crypto from 'node:crypto';
/**
* @typedef {object} HostStreamDeps
* @property {(message: string) => void} [log] defaults to no-op
* @property {object} [debounceMs] per-event-type debounce override
* @property {number} [debounceMs.default] default trailing-edge ms (Phase E1)
* @property {number} [heartbeatMs] per-sink heartbeat interval (Phase E2)
* @property {number} [metricsMs] per-broadcaster metrics tick (Phase E4)
* @property {boolean} [healthEvents] broadcast `stream.health` on each metrics tick (Phase E5; default true)
* @property {number} [maxQueuedPerSink] bounded queue size (Phase E3)
* @property {() => number} [now] injectable clock for `stream.health.at` (tests)
* @property {(cb: () => void, ms: number) => any} [setTimer] injectable setInterval (tests)
* @property {(handle: any) => void} [clearTimer] injectable clearInterval (tests)
*/
/**
* @typedef {object} HostStream
* @property {(res: import('node:http').ServerResponse) => () => void} addSink
* @property {(eventType: string, payload: unknown) => number} broadcast
* @property {() => Record<string, unknown>} snapshot
* @property {() => void} close
* @property {() => number} sinkCount
* @property {() => HostStreamMetrics} metrics
* @property {() => void} flushDebounced test-only — fire all pending coalesced broadcasts immediately
*/
/**
* @typedef {object} HostStreamMetrics
* @property {Record<string, number>} events per-event-type broadcasts since last reset
* @property {number} sinks current active-sink count
* @property {number} overflows total `:overflow` drops since last reset
*/
/**
* Payload wire-shape for the `stream.health` event (Phase E5). A
* point-in-time projection of the broadcaster's own observability
* counters, emitted on each metrics tick. `events` carries the
* per-event-type broadcast counts accrued during the just-elapsed
* interval (reset afterward), so consumers see a per-interval RATE
* rather than a monotonic total. `at` is the wall-clock emit time so a
* reconnecting client can tell how stale the replayed snapshot is.
*
* @typedef {object} StreamHealthPayload
* @property {Record<string, number>} events per-event broadcasts during the interval
* @property {number} sinks active-sink count at emit time
* @property {number} overflows `:overflow` drops during the interval
* @property {number} intervalMs the metrics-tick cadence that produced this payload
* @property {number} at Date.now() at emit time
*/
/**
 * Event type emitted by the metrics tick (Phase E5). Broadcast on every
 * tick while `deps.healthEvents` is enabled, carrying a StreamHealthPayload.
 */
export const STREAM_HEALTH_EVENT = 'stream.health';
/**
 * Skill Forge promote progress (spa-harness-forge Phase C / C32). Broadcast by
 * the host-side promote job runner as it advances a promote_jobs row, so the
 * SPA's /forge editor reflects status live (push-not-poll). Payload shape:
 *   { jobId, artifactId, status: 'promoting'|'published'|'failed', pr_url?, error? }
 * `broadcast()` is generic, so emitting is just `broadcast(FORGE_PROMOTE_EVENT, …)`.
 * Nothing in this module emits this event itself; the constant only names it.
 */
export const FORGE_PROMOTE_EVENT = 'forge.promote';
// Fallbacks used by createHostStream() when the matching `deps` field is
// not supplied (null/undefined).
// Trailing-edge debounce window in ms (overridden by deps.debounceMs.default).
const DEFAULT_DEBOUNCE_MS = 100;
// Per-sink heartbeat interval in ms (overridden by deps.heartbeatMs).
const DEFAULT_HEARTBEAT_MS = 25_000;
// Metrics tick interval in ms (overridden by deps.metricsMs; the tick is
// only started when the resolved value is > 0).
const DEFAULT_METRICS_MS = 60_000;
// Max chunks buffered per backpressured sink before the oldest is dropped
// (overridden by deps.maxQueuedPerSink).
const DEFAULT_MAX_QUEUED = 64;
/**
 * Event types that opt INTO the trailing-edge debounce (Phase E1). The
 * members — `world.snapshot`, `tunnels.snapshot`, `servers.snapshot`,
 * `listening.snapshot` — are all idempotent state-replay events where
 * "last writer wins" is correct and a 100ms cap on update propagation
 * is acceptable. Latency-sensitive events (`question.pending`) and
 * connect-only events (`ready`) stay immediate by NOT being in this set.
 *
 * Per-event-type overrides via `deps.debounceMs[type] = 0` force any
 * event off the debounce path; a non-zero override flips it on with a
 * custom window.
 *
 * Maintenance rule: when a new idempotent snapshot event is introduced,
 * add its type here; callers should not need to opt in per call.
 */
const DEFAULT_DEBOUNCED_EVENTS = new Set([
'world.snapshot',
'tunnels.snapshot',
'servers.snapshot',
'listening.snapshot',
]);
/**
* Create a host-stream broadcaster. Stateless w.r.t. the request — all
* source-of-truth wiring (docker events, worlds.db, etc.) is done by
* the caller via repeated `broadcast()` invocations.
*
* @param {HostStreamDeps} [deps]
* @returns {HostStream}
*/
export function createHostStream(deps = {}) {
const log = deps.log ?? (() => {});
const defaultDebounceMs = deps.debounceMs?.default ?? DEFAULT_DEBOUNCE_MS;
const heartbeatMs = deps.heartbeatMs ?? DEFAULT_HEARTBEAT_MS;
const metricsMs = deps.metricsMs ?? DEFAULT_METRICS_MS;
const healthEvents = deps.healthEvents ?? true;
const now = deps.now ?? (() => Date.now());
const maxQueuedPerSink = deps.maxQueuedPerSink ?? DEFAULT_MAX_QUEUED;
const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
/**
* @typedef {object} SinkState
* @property {import('node:http').ServerResponse} res
* @property {string[]} queue
* @property {boolean} paused true while waiting for a `drain` event
* @property {boolean} draining true while flushQueue is iterating
* @property {boolean} drainListenerAttached
* @property {any | null} heartbeatHandle
* @property {number} overflows
*/
/** @type {Map<import('node:http').ServerResponse, SinkState>} */
const sinks = new Map();
/** @type {Map<string, unknown>} last-known payload per event type */
const snapshots = new Map();
/** @type {Map<string, any>} pending debounce timers per event type */
const debounceTimers = new Map();
/** Per-event-type broadcast counters since last metrics flush. */
const eventCounters = new Map();
let overflowCounter = 0;
let closed = false;
let metricsHandle = null;
function formatEvent(eventType, payload) {
return `event: ${eventType}\ndata: ${JSON.stringify(payload)}\n\n`;
}
/**
* Queue-aware write. If the underlying socket's `res.write` returns
* `false` we buffer the chunk in the per-sink queue and register a
* one-shot `drain` listener to flush it when the kernel reports the
* socket is writable again. On overflow we emit `:overflow` so
* consumers know they missed updates and drop oldest.
*
* @returns {boolean} true if the chunk was accepted (synchronously or
* queued) — false only when the sink is dead and was removed.
*/
function writeSafe(state, chunk) {
const { res } = state;
if (res.writableEnded || res.destroyed) return false;
// If a previous write reported backpressure (returned false), queue
// unconditionally — preserves event ordering. The drain handler
// flushes the queue in FIFO order.
if (state.paused) {
enqueue(state, chunk);
return true;
}
try {
const ok = res.write(chunk);
if (ok) return true;
// Returned false — kernel buffer is full. Switch to queue mode so
// subsequent writes don't race past this one.
state.paused = true;
attachDrain(state);
return true;
} catch {
// Sink already closed — drop it; further writes would throw.
teardownSink(res);
return false;
}
}
function enqueue(state, chunk) {
if (state.queue.length >= maxQueuedPerSink) {
// Drop oldest, emit :overflow comment when the drain eventually
// flushes. The overflow comment is enqueued (not written directly)
// so consumers see it inline with surrounding events.
state.queue.shift();
state.overflows += 1;
overflowCounter += 1;
if (!state.queue.some((s) => s === ':overflow\n\n')) {
state.queue.unshift(':overflow\n\n');
}
}
state.queue.push(chunk);
attachDrain(state);
}
function attachDrain(state) {
if (state.drainListenerAttached) return;
const { res } = state;
if (typeof res.once !== 'function') return; // testing-sink fallback
state.drainListenerAttached = true;
res.once('drain', () => {
state.drainListenerAttached = false;
flushQueue(state);
});
}
function flushQueue(state) {
const { res } = state;
if (state.draining) return;
state.draining = true;
state.paused = false;
try {
while (state.queue.length > 0) {
if (res.writableEnded || res.destroyed) {
state.queue.length = 0;
break;
}
const next = state.queue[0];
let ok = false;
try {
ok = res.write(next);
} catch {
teardownSink(res);
return;
}
state.queue.shift();
if (!ok) {
state.paused = true;
attachDrain(state);
break;
}
}
} finally {
state.draining = false;
}
}
function teardownSink(res) {
const state = sinks.get(res);
if (!state) return;
if (state.heartbeatHandle) {
try { clearTimer(state.heartbeatHandle); } catch { /* ignore */ }
state.heartbeatHandle = null;
}
state.queue.length = 0;
sinks.delete(res);
}
function doBroadcast(eventType, payload) {
if (closed) return 0;
snapshots.set(eventType, payload);
eventCounters.set(eventType, (eventCounters.get(eventType) ?? 0) + 1);
const chunk = formatEvent(eventType, payload);
let reached = 0;
// Snapshot the iteration order so concurrent sink removal during
// a write doesn't skip a sibling sink.
for (const state of [...sinks.values()]) {
if (writeSafe(state, chunk)) reached += 1;
}
return reached;
}
/**
 * Fire every pending debounced broadcast immediately: cancel its timer,
 * drop the bookkeeping entry, then broadcast the latest coalesced payload.
 */
function flushDebounced() {
  debounceTimers.forEach((pending, eventType) => {
    clearTimeout(pending.handle);
    debounceTimers.delete(eventType);
    doBroadcast(eventType, pending.payload);
  });
}
/**
 * Metrics tick: log the counters accumulated since the previous tick,
 * optionally broadcast them as a stream-health event, then reset them.
 *
 * Completely idle ticks (no events, no sinks, no overflows) are silent.
 */
function logMetrics() {
  const idle = eventCounters.size === 0 && sinks.size === 0 && overflowCounter === 0;
  if (idle) return;
  /** @type {Record<string, number>} */
  const events = {};
  eventCounters.forEach((count, type) => {
    events[type] = count;
  });
  const overflowSuffix = overflowCounter > 0 ? ` overflows=${overflowCounter}` : '';
  log(`events=${JSON.stringify(events)} sinks=${sinks.size}${overflowSuffix}`);
  // Phase E5: mirror the same counters onto the stream so SPA tabs can
  // watch stream health without polling. The payload is built before the
  // reset below, so it describes this interval only. The broadcast bumps
  // its own event counter, but the reset right after wipes that, so the
  // next interval does not count this tick's emit. It goes through
  // doBroadcast directly (no debounce): the tick cadence already limits it.
  if (healthEvents) {
    /** @type {StreamHealthPayload} */
    const payload = {
      events,
      sinks: sinks.size,
      overflows: overflowCounter,
      intervalMs: metricsMs,
      at: now(),
    };
    doBroadcast(STREAM_HEALTH_EVENT, payload);
  }
  eventCounters.clear();
  overflowCounter = 0;
}
// Start the metrics tick eagerly — operators want visibility from
// boot, not just after the first event lands.
// A non-positive `metricsMs` disables the tick entirely: no timer is
// created and `metricsHandle` keeps whatever value it had before.
if (metricsMs > 0) {
// NOTE(review): `setTimer` is presumably the injectable repeating-timer
// factory (the heartbeat comment says "every heartbeatMs") — confirm.
metricsHandle = setTimer(logMetrics, metricsMs);
// Don't pin the event loop just for metrics in tests / shutdown paths.
// Guarded because an injected timer may not return an object with `unref`.
if (metricsHandle && typeof metricsHandle.unref === 'function') metricsHandle.unref();
}
return {
addSink(res) {
if (closed) {
// Best-effort: end the response so the client sees the channel
// closing instead of hanging on an empty stream.
try { res.end(); } catch { /* ignore */ }
return () => {};
}
const state = /** @type {SinkState} */ ({
res,
queue: [],
paused: false,
draining: false,
drainListenerAttached: false,
heartbeatHandle: null,
overflows: 0,
});
sinks.set(res, state);
// Replay last-known snapshot for every event type so the new
// subscriber gets current state without waiting for the next change.
// Sorting keeps test assertions deterministic.
const types = [...snapshots.keys()].sort();
for (const type of types) {
writeSafe(state, formatEvent(type, snapshots.get(type)));
}
// Phase E2: per-sink heartbeat. Write a comment line every
// `heartbeatMs` so the SSE channel survives idle proxies. The
// comment is invisible to client EventSource listeners (the
// browser passes only `event:`/`data:` lines through), so this
// does NOT trigger any handler — it's pure connection-keepalive.
if (heartbeatMs > 0) {
state.heartbeatHandle = setTimer(() => {
// Use writeSafe so backpressure / overflow handling applies
// uniformly. Heartbeats that fail to flush are uninteresting
// — the regular broadcast loop will discover the dead sink.
writeSafe(state, ':\n\n');
}, heartbeatMs);
if (state.heartbeatHandle && typeof state.heartbeatHandle.unref === 'function') {
state.heartbeatHandle.unref();
}
}
return () => {
teardownSink(res);
};
},
broadcast(eventType, payload) {
if (closed) return 0;
if (typeof eventType !== 'string' || eventType.length === 0) {
throw new TypeError('broadcast: eventType must be a non-empty string');
}
// Phase E1: opt-in trailing-edge debounce.
// - DEFAULT_DEBOUNCED_EVENTS opts the canonical snapshot events
// into trailing-edge coalescing. Last writer wins because those
// events are idempotent state replays.
// - Every other event type bypasses the timer and writes
// immediately — preserves the Phase A synchronous broadcast
// contract that existing tests / consumers depend on.
// - Per-event-type overrides via `deps.debounceMs[eventType]`
// win in both directions (set to 0 to disable, or specify a
// custom window).
// - `flushDebounced()` is exposed for tests that want to assert
// immediate effects without waiting for the timer.
let debounceFor;
const override = deps.debounceMs?.[eventType];
if (override !== undefined) {
debounceFor = override;
} else if (DEFAULT_DEBOUNCED_EVENTS.has(eventType)) {
debounceFor = defaultDebounceMs;
} else {
debounceFor = 0;
}
if (debounceFor <= 0) {
// Take the immediate path; flush any pending coalesce for this
// type first so order is preserved.
const pending = debounceTimers.get(eventType);
if (pending) {
clearTimeout(pending.handle);
debounceTimers.delete(eventType);
}
return doBroadcast(eventType, payload);
}
// Coalesce: keep the latest payload, restart the trailing timer.
const pending = debounceTimers.get(eventType);
if (pending) clearTimeout(pending.handle);
const handle = setTimeout(() => {
debounceTimers.delete(eventType);
doBroadcast(eventType, payload);
}, debounceFor);
if (typeof handle.unref === 'function') handle.unref();
debounceTimers.set(eventType, { handle, payload });
// Returns sinks.size as an approximation; the actual broadcast
// will happen after the trailing-edge delay. Tests assert via the
// sink writes anyway.
return sinks.size;
},
snapshot() {
/** @type {Record<string, unknown>} */
const out = {};
for (const [type, payload] of snapshots) out[type] = payload;
return out;
},
close() {
if (closed) return;
closed = true;
// Cancel pending debounce timers — anything still queued is
// discarded; we don't write to sinks during shutdown.
for (const [, info] of debounceTimers) clearTimeout(info.handle);
debounceTimers.clear();
if (metricsHandle) {
try { clearTimer(metricsHandle); } catch { /* ignore */ }
metricsHandle = null;
}
for (const [res, state] of [...sinks.entries()]) {
if (state.heartbeatHandle) {
try { clearTimer(state.heartbeatHandle); } catch { /* ignore */ }
}
try { res.end(); } catch { /* ignore */ }
sinks.delete(res);
}
log('closed');
},
sinkCount() {
return sinks.size;
},
metrics() {
/** @type {Record<string, number>} */
const events = {};
for (const [type, count] of eventCounters) events[type] = count;
return { events, sinks: sinks.size, overflows: overflowCounter };
},
flushDebounced,
};
}
/**
 * Create a new random stream id for the `ready` event payload: 8 random
 * bytes rendered as 16 lowercase hex characters. Exported so route
 * handlers can put the same id in log lines and on the wire.
 *
 * @returns {string}
 */
export function newStreamId() {
  const idBytes = crypto.randomBytes(8);
  return idBytes.toString('hex');
}
/**
* listening-server-poller.mjs
* Discovers listening TCP ports inside a world's devbox container.
* Dual-mode: Docker HTTP API (container) vs docker exec CLI (bare-node).
* Cache TTL: 10s per world.
*/
import { spawnSync } from 'node:child_process';
// Transport selector. When the DOCKER_HOST env var is unset, the sentinel
// 'docker-cli' makes getListeningServers shell out to `docker exec`; any
// other value is used as the Docker HTTP API base (a leading tcp:// is
// rewritten to http://).
const DOCKER_HOST = process.env.DOCKER_HOST ?? 'docker-cli';
// Skip well-known infra ports — these are always running and not user servers
const INFRA_PORTS = new Set([8080, 7681, 7682]);
// Per-world cache: worldId → { ts, servers, error? }
// Failed lookups are cached too, so a missing container is not re-probed
// on every call within the TTL.
const cache = new Map();
// Maximum age (ms) of a cache entry before getListeningServers re-probes.
const CACHE_TTL_MS = 10_000;
/**
 * Name of the devbox container that belongs to a world.
 *
 * @param {string} worldId
 * @returns {string} `olam-<worldId>-devbox`
 */
function worldContainerName(worldId) {
  const containerName = `olam-${worldId}-devbox`;
  return containerName;
}
/**
 * Parse `ss -tlnp` output into server rows.
 *
 * Two column layouts are accepted, because `ss` only prints the leading
 * Netid column when more than one socket family is listed:
 *
 *   State  Recv-Q Send-Q Local Address:Port Peer Address:Port Process
 *   LISTEN 0      128    0.0.0.0:5173       0.0.0.0:*         users:(("vite",pid=42,fd=8))
 *
 *   Netid State  Recv-Q Send-Q Local Address:Port Peer Address:Port Process
 *   tcp   LISTEN 0      128    0.0.0.0:5173       0.0.0.0:*         users:(("vite",pid=42,fd=8))
 *
 * The first line is always treated as the header and skipped. Rows on
 * INFRA_PORTS and rows without a positive numeric port are dropped. A
 * port bound on several addresses yields one row per binding.
 *
 * @param {string} stdout
 * @returns {Array<{port: number, pid: string, cmd: string}>}
 */
export function parseSsOutput(stdout) {
  const rows = stdout.trim().split('\n').slice(1); // skip header
  const results = [];
  for (const row of rows) {
    const parts = row.trim().split(/\s+/);
    // A Netid column shifts every field right by one; detect it by where
    // the State value sits. Without Netid, parts[1] is the numeric Recv-Q.
    const offset = parts[1] === 'LISTEN' ? 1 : 0;
    if (parts.length < 5 + offset) continue;
    // Local Address:Port (e.g. "0.0.0.0:5173" or "*:5173" or ":::5173")
    const localAddr = parts[3 + offset];
    const colonIdx = localAddr.lastIndexOf(':');
    if (colonIdx === -1) continue;
    const port = Number.parseInt(localAddr.slice(colonIdx + 1), 10);
    if (!Number.isFinite(port) || port <= 0) continue;
    if (INFRA_PORTS.has(port)) continue;
    // Extract pid and cmd from process column: users:(("vite",pid=42,fd=8))
    // The column is absent when ss lacks permission to see the owner.
    const processCol = parts.slice(4 + offset).join(' ');
    const pid = /pid=(\d+)/.exec(processCol)?.[1] ?? '';
    const cmd = /"([^"]+)"/.exec(processCol)?.[1] ?? '';
    results.push({ port, pid, cmd });
  }
  return results;
}
/**
 * Record a failed lookup in the cache and return it. Failures are cached
 * like successes so a missing container is not re-probed within the TTL.
 *
 * @param {string} worldId
 * @returns {{ts: number, servers: [], error: string}}
 */
function cacheFailedLookup(worldId) {
  const entry = { ts: Date.now(), servers: [], error: 'container not running' };
  cache.set(worldId, entry);
  return entry;
}
/**
 * Fetch listening servers for a world. Returns cached result if <10s old.
 *
 * Runs `ss -tlnp` inside the world's devbox container, via `docker exec`
 * when DOCKER_HOST is the 'docker-cli' sentinel and via the Docker HTTP
 * exec API otherwise. Never rejects: any failure (non-zero exit, HTTP
 * error on exec create or exec start, timeout, thrown error) yields a
 * cached `{ servers: [], error: 'container not running' }` entry.
 *
 * @param {string} worldId
 * @returns {Promise<{ts: number, servers: Array<{port: number, pid: string, cmd: string}>, error?: string}>}
 */
export async function getListeningServers(worldId) {
  const cached = cache.get(worldId);
  if (cached && Date.now() - cached.ts < CACHE_TTL_MS) return cached;
  const containerName = worldContainerName(worldId);
  try {
    let stdout;
    if (DOCKER_HOST === 'docker-cli') {
      // NOTE: spawnSync blocks the event loop for up to the 3 s timeout.
      const result = spawnSync(
        'docker', ['exec', containerName, 'ss', '-tlnp'],
        { encoding: 'utf-8', timeout: 3000 },
      );
      if (result.status !== 0 || result.error) return cacheFailedLookup(worldId);
      stdout = result.stdout ?? '';
    } else {
      const apiBase = DOCKER_HOST.replace(/^tcp:\/\//, 'http://');
      const execCreate = await fetch(
        `${apiBase}/containers/${encodeURIComponent(containerName)}/exec`,
        {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({
            AttachStdout: true,
            AttachStderr: false,
            Cmd: ['ss', '-tlnp'],
          }),
          signal: AbortSignal.timeout(3000),
        },
      );
      if (!execCreate.ok) return cacheFailedLookup(worldId);
      const { Id: execId } = await execCreate.json();
      const execStart = await fetch(`${apiBase}/exec/${execId}/start`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ Detach: false, Tty: false }),
        signal: AbortSignal.timeout(3000),
      });
      // An error response carries a JSON error body, not a multiplexed
      // stream; demuxing it would be cached as "no servers" with no error.
      if (!execStart.ok) return cacheFailedLookup(worldId);
      // Docker exec start streams multiplexed output (8-byte header per frame)
      const buf = await execStart.arrayBuffer();
      stdout = demuxDockerStream(Buffer.from(buf));
    }
    const entry = { ts: Date.now(), servers: parseSsOutput(stdout) };
    cache.set(worldId, entry);
    return entry;
  } catch {
    // Deliberately best-effort: callers only need "nothing discoverable".
    return cacheFailedLookup(worldId);
  }
}
/**
 * Strip Docker stream multiplexing headers (8 bytes per frame: [stream, 0, 0, 0, size32be]).
 *
 * Frame payloads are concatenated as bytes and decoded once, so a
 * multi-byte UTF-8 character split across two frames survives intact
 * (decoding frame by frame turned it into replacement characters). Frames
 * of every stream type are kept. A truncated final frame contributes the
 * bytes that are present; fewer than 8 trailing bytes are ignored.
 *
 * @param {Buffer} buf
 * @returns {string}
 */
function demuxDockerStream(buf) {
  const FRAME_HEADER_BYTES = 8;
  const payloads = [];
  let offset = 0;
  while (offset + FRAME_HEADER_BYTES <= buf.length) {
    const size = buf.readUInt32BE(offset + 4);
    const start = offset + FRAME_HEADER_BYTES;
    // subarray clamps to the buffer end and, unlike the deprecated
    // Buffer#slice, is the documented way to take a view without copying.
    payloads.push(buf.subarray(start, start + size));
    offset = start + size;
  }
  return Buffer.concat(payloads).toString('utf-8');
}
export { parseSsOutput as _parseSsOutputForTests };
/**
* Phase E2 (olam-dogfood-vision): LocalWorldsSource implementation.
*
* Wraps host-cp's existing dockerode-driven world enumeration in a
* WorldsSource-shaped object so E4's composition layer can fan out
* across multiple sources (local + future Pylon cloud) and merge.
*
* The class deliberately takes its dependencies via factory function
* injection rather than reaching into server.mjs's module-level state
* directly. Two reasons:
* 1. Testability — vitest can pass mocked getWorldsRegistry +
* fetchWorldServices without spinning up the full host-cp
* server.mjs.
* 2. Module-cycle avoidance — server.mjs imports this module, so
* this module CANNOT import server.mjs back without a cycle.
*
* Returns the same shape as the pre-E2 GET /api/worlds response with
* a single addition: `source: 'local'` on every entry.
*
* @typedef {import('./worlds-source.mjs').WorldsSource} WorldsSource
* @typedef {import('./worlds-source.mjs').WorldSummary} WorldSummary
* @typedef {import('./worlds-source.mjs').ServiceInfo} ServiceInfo
*/
/**
* @typedef {object} LocalWorldsSourceDeps
* @property {() => Record<string, number>} getWorldsRegistry
* Returns current WORLDS map (worldId → host_port). Called fresh
* per list() so post-list registry mutations are visible immediately.
* @property {(worldId: string) => string | null} getWorldName
* Returns the operator-set friendly name OR null if absent.
* @property {(worldId: string) => Promise<ServiceInfo[]>} fetchWorldServices
* Probes per-world services (atlas-core, diner-app, ttyd, per-world CP).
* Same function the pre-E2 handler called inline.
*/
/**
 * Build the 'local' WorldsSource on top of injected host-cp accessors.
 *
 * `list()` reads the registry afresh on every call, probes every world's
 * services in parallel and returns one summary per registry entry, in
 * registry order. Each summary carries `source: 'local'` plus the
 * pre-E2 `host_port` field.
 *
 * @param {LocalWorldsSourceDeps} deps
 * @returns {WorldsSource}
 */
export function createLocalWorldsSource(deps) {
  /**
   * Summarise a single registry entry.
   * Status mirrors pre-E2 behavior:
   *   - unknown:  no services reported (container down/missing)
   *   - running:  at least one service answers a probe
   *   - starting: services are listed but none answers yet
   */
  const describeWorld = async ([id, host_port]) => {
    const services = await deps.fetchWorldServices(id);
    /** @type {'running' | 'starting' | 'unknown'} */
    let status = 'unknown';
    if (services.length !== 0) {
      status = services.some((svc) => svc.live) ? 'running' : 'starting';
    }
    /** @type {WorldSummary} */
    const summary = {
      id,
      name: deps.getWorldName(id),
      status,
      services,
      source: 'local',
    };
    // host_port is local-source metadata that WorldSummary does not
    // declare; it is kept because SPA + CLI consumers still read it.
    return /** @type {WorldSummary & {host_port: number}} */ ({
      ...summary,
      host_port,
    });
  };
  return {
    name: 'local',
    async list() {
      const registryEntries = Object.entries(deps.getWorldsRegistry());
      return Promise.all(registryEntries.map(describeWorld));
    },
  };
}
// Phase C Task C3 — hand-rolled Prometheus metrics registry for host-cp.
//
// Emits exactly two metric families:
// http_requests_total{service,route,method,status_code} counter
// http_request_duration_seconds{service,route,method} histogram
//
// TAXONOMY COMPLIANCE (NON-NEGOTIABLE):
// ONLY {service, route, method, status_code} labels allowed.
// BANNED: world_id, trace_id, user_id, request_id, operator_id.
// world_id surfaces via Prometheus exemplars in Phase D — NOT labels.
//
// No external npm deps — Prometheus text exposition is simple enough to
// produce with template literals. Avoids the prom-client footprint on a
// host-side service that has no other dependency on metrics tooling.
// ─── Route mapping ────────────────────────────────────────────────────────
//
// Raw req.url is a cardinality bomb: every unique URL is a new time series.
// We normalize dynamic path segments to stable patterns before labelling.
//
// RULES (first match wins):
// /health → /health
// /api/bootstrap → /api/bootstrap
// /metrics → /metrics
// /api/host-stream → /api/host-stream
// /api/worlds/{id}/credentials/... → /api/worlds/:id/credentials/:action
// /api/worlds/{id}/tunnels/... → /api/worlds/:id/tunnels
// /api/worlds/{id}/pr → /api/worlds/:id/pr
// /api/worlds/{id}/progress → /api/worlds/:id/progress
// /api/worlds (no id) → /api/worlds
// /api/world/{id}/** → /api/world/:id/* (proxy routes)
// /api/admin/registry/... → /api/admin/registry
// /api/admin/upgrade → /api/admin/upgrade
// /api/admin/world-pr → /api/admin/world-pr
// /api/admin/world-pr/{id} → /api/admin/world-pr/:id
// /api/auth/credentials/... → /api/auth/credentials
// /api/auth/... → /api/auth
// /api/plan/conversations/{id}/... → /api/plan/conversations/:id
// /api/plan/conversations → /api/plan/conversations
// /api/plan/** → /api/plan
// /api/auth/events → /api/auth/events
// /api/version/status → /api/version/status
// /api/repos → /api/repos
// /api/runbooks → /api/runbooks
// /api/workspaces/match → /api/workspaces/match
// /api/workspaces → /api/workspaces
// /api/projects → /api/projects
// /api/processes/** → /api/processes
// /v1/chunks/** → /v1/chunks
// /v1/worlds/** → /v1/worlds
// /assets/** → /assets (SPA static assets)
// (other GET to static paths) → /static
// (unknown) → /unknown
// Paths that are their own route label (matched after trailing-slash trim).
const ROUTE_EXACT_MATCHES = new Set([
  '/health',
  '/api/bootstrap',
  '/metrics',
  '/api/host-stream',
  '/api/auth/events',
  '/api/version/status',
  '/api/repos',
  '/api/runbooks',
  '/api/workspaces/match',
  '/api/workspaces',
  '/api/projects',
  '/api/worlds',
  '/api/plan/conversations',
  '/api/plan/personas',
  '/api/admin/upgrade',
  '/api/admin/world-pr',
  '/api/admin/registry',
]);
// [prefix, label] pairs, checked in order; the first matching prefix wins.
const ROUTE_PREFIX_RULES = [
  ['/api/world/', '/api/world/:id/*'],
  ['/api/admin/registry/', '/api/admin/registry'],
  ['/api/admin/world-pr/', '/api/admin/world-pr/:id'],
  ['/api/auth/credentials', '/api/auth/credentials'],
  ['/api/auth/', '/api/auth'],
  ['/api/plan/conversations/', '/api/plan/conversations/:id'],
  ['/api/plan/', '/api/plan'],
  ['/api/processes', '/api/processes'],
  ['/api/servers', '/api/processes'],
  ['/v1/chunks', '/v1/chunks'],
  ['/v1/worlds', '/v1/worlds'],
  ['/assets/', '/assets'],
];
// SPA HTML fallback prefixes (sub-routes like /worlds, /plan/...).
const ROUTE_SPA_PREFIXES = ['/worlds', '/plan', '/workspaces'];
/**
 * Map a request pathname to a low-cardinality route label for metrics.
 *
 * Order of evaluation: exact matches, then the `/api/worlds/<id>` family,
 * then ordered prefix rules, then SPA fallback routes (`/static`), and
 * finally `/unknown`.
 *
 * @param {string} pathname
 * @returns {string}
 */
export function pathToRoute(pathname) {
  // Drop one trailing slash so `/x/` matches `/x` (a bare `/` is kept).
  const p = pathname.length > 1 ? pathname.replace(/\/$/, '') : pathname;
  if (ROUTE_EXACT_MATCHES.has(p)) return p;
  if (p.startsWith('/api/worlds/')) {
    if (p.includes('/credentials/')) return '/api/worlds/:id/credentials/:action';
    if (p.includes('/tunnels')) return '/api/worlds/:id/tunnels';
    if (p.endsWith('/pr')) return '/api/worlds/:id/pr';
    if (p.endsWith('/progress')) return '/api/worlds/:id/progress';
    return '/api/worlds/:id';
  }
  const prefixRule = ROUTE_PREFIX_RULES.find(([prefix]) => p.startsWith(prefix));
  if (prefixRule !== undefined) return prefixRule[1];
  const isSpaRoute = p === '/' || ROUTE_SPA_PREFIXES.some((prefix) => p.startsWith(prefix));
  return isSpaRoute ? '/static' : '/unknown';
}
// ─── In-memory registry ───────────────────────────────────────────────────
// Upper bounds, in seconds and ascending, of the duration histogram
// buckets. The implicit +Inf bucket is added at render time.
const HISTOGRAM_BUCKETS = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5];
/** @type {Map<string, number>} labelSet → count */
const _counters = new Map();
/**
 * Per label-set histogram state.
 * `buckets[i]` holds only the observations whose first fitting bound is
 * HISTOGRAM_BUCKETS[i] (non-cumulative); renderMetrics accumulates them.
 * @type {Map<string, {buckets: number[], sum: number, count: number}>}
 */
const _histograms = new Map();
/**
 * Build the registry map key for a label set by joining its values with
 * a NUL separator.
 *
 * @param {string[]} parts label values in canonical order
 * @returns {string}
 */
function _labelKey(parts) {
  const separator = '\x00';
  return parts.join(separator);
}
/**
 * Drop every recorded counter and histogram. FOR TESTS ONLY — never call
 * in production code. Exported under its own name so consumers that
 * import only the exports they need never pull it in by accident.
 */
export function _resetForTest() {
  for (const registry of [_counters, _histograms]) {
    registry.clear();
  }
}
/**
 * Add one to the http_requests_total series identified by the labels.
 *
 * @param {string} service
 * @param {string} route — MUST be a normalized route pattern
 * @param {string} method
 * @param {string} statusCode
 */
export function incRequest(service, route, method, statusCode) {
  const seriesKey = _labelKey([service, route, method, statusCode]);
  const previous = _counters.get(seriesKey) ?? 0;
  _counters.set(seriesKey, previous + 1);
}
/**
 * Observe http_request_duration_seconds.
 *
 * Each observation increments only the first bucket whose upper bound is
 * >= `seconds`; renderMetrics turns these per-range counts into the
 * cumulative form. Observations above the largest bound touch no bucket
 * and are represented by the +Inf bucket, which renders as `count`.
 *
 * Observations that are not numbers, or are NaN, are ignored. A single
 * such value would otherwise corrupt `sum` for the life of the process
 * (NaN, or string concatenation for a string argument).
 *
 * @param {string} service
 * @param {string} route
 * @param {string} method
 * @param {number} seconds
 */
export function observeDuration(service, route, method, seconds) {
  if (typeof seconds !== 'number' || Number.isNaN(seconds)) return;
  const key = _labelKey([service, route, method]);
  let h = _histograms.get(key);
  if (!h) {
    h = { buckets: new Array(HISTOGRAM_BUCKETS.length).fill(0), sum: 0, count: 0 };
    _histograms.set(key, h);
  }
  const bucketIdx = HISTOGRAM_BUCKETS.findIndex((upperBound) => seconds <= upperBound);
  if (bucketIdx !== -1) h.buckets[bucketIdx] += 1;
  h.sum += seconds;
  h.count += 1;
}
// ─── Prometheus text exposition ───────────────────────────────────────────
/**
 * Escape a label value for the Prometheus text format: backslash, newline
 * and double quote are backslash-escaped. The value is stringified first.
 *
 * @param {unknown} v
 * @returns {string}
 */
function escapeLabelValue(v) {
  const escapes = { '\\': '\\\\', '\n': '\\n', '"': '\\"' };
  // One pass over the three special characters; each is mapped exactly once.
  return String(v).replace(/[\\\n"]/g, (ch) => escapes[ch]);
}
/**
 * Render a label object as a Prometheus label set, `{k1="v1",k2="v2"}`,
 * in the object's own key order, with every value escaped.
 *
 * @param {Record<string, string>} labels
 * @returns {string}
 */
function labelSet(labels) {
  const rendered = [];
  for (const [key, value] of Object.entries(labels)) {
    rendered.push(`${key}="${escapeLabelValue(value)}"`);
  }
  return `{${rendered.join(',')}}`;
}
/**
 * Produce the full Prometheus text exposition for both metric families.
 *
 * Counter series come first, then for every histogram series its buckets
 * in cumulative form (each `le` line is the running total of the
 * per-range counts), the `+Inf` bucket, `_sum` and `_count`. The output
 * always ends with a newline.
 *
 * @returns {string}
 */
export function renderMetrics() {
  const out = [
    '# HELP http_requests_total Total number of HTTP requests handled.',
    '# TYPE http_requests_total counter',
  ];
  _counters.forEach((count, key) => {
    const [service, route, method, status_code] = key.split('\x00');
    const labels = labelSet({ service, route, method, status_code });
    out.push(`http_requests_total${labels} ${count}`);
  });
  out.push(
    '# HELP http_request_duration_seconds HTTP request duration in seconds (histogram).',
    '# TYPE http_request_duration_seconds histogram',
  );
  _histograms.forEach((h, key) => {
    const [service, route, method] = key.split('\x00');
    const base = { service, route, method };
    let runningTotal = 0;
    HISTOGRAM_BUCKETS.forEach((upperBound, i) => {
      runningTotal += h.buckets[i];
      const labels = labelSet({ ...base, le: String(upperBound) });
      out.push(`http_request_duration_seconds_bucket${labels} ${runningTotal}`);
    });
    // +Inf covers everything, including observations above the last bound.
    out.push(`http_request_duration_seconds_bucket${labelSet({ ...base, le: '+Inf' })} ${h.count}`);
    out.push(`http_request_duration_seconds_sum${labelSet(base)} ${h.sum}`);
    out.push(`http_request_duration_seconds_count${labelSet(base)} ${h.count}`);
  });
  return `${out.join('\n')}\n`;
}
// ─── Request instrumentation wrapper ─────────────────────────────────────
/**
 * Wrap an async request handler so every request is instrumented.
 *
 * The wrapper:
 *   1. Starts a high-resolution timer.
 *   2. Calls the original handler.
 *   3. Derives a stable route pattern from req.url.
 *   4. Records counter + histogram using the response's status code.
 *
 * Status code capture: `res.writeHead` is wrapped (and left wrapped) to
 * remember the explicit status. When the handler never calls writeHead,
 * `res.statusCode` is used, and 200 as a last resort. The duration covers
 * the time until the handler's promise settles. Metrics are recorded even
 * when the handler throws; the handler's error still propagates.
 *
 * A request target that cannot be parsed as a URL (for example `//`) is
 * recorded under the `/unknown` route. Previously the URL constructor
 * threw inside `finally`, which rejected the wrapper and lost the metric.
 *
 * @param {string} serviceName — emitted as the `service` label
 * @param {(req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse) => Promise<void>} handler
 * @returns {(req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse) => Promise<void>}
 */
export function instrumentHandler(serviceName, handler) {
  return async (req, res) => {
    const start = performance.now();
    let capturedStatus = null;
    const origWriteHead = res.writeHead.bind(res);
    res.writeHead = (status, ...rest) => {
      capturedStatus = status;
      return origWriteHead(status, ...rest);
    };
    try {
      await handler(req, res);
    } finally {
      const durationSec = (performance.now() - start) / 1000;
      let pathname = null;
      try {
        pathname = new URL(req.url ?? '/', 'http://localhost').pathname;
      } catch {
        // Malformed request target: fall through to the '/unknown' label.
        pathname = null;
      }
      const route = pathname === null ? '/unknown' : pathToRoute(pathname);
      const method = (req.method ?? 'GET').toUpperCase();
      const statusCode = String(capturedStatus ?? res.statusCode ?? 200);
      incRequest(serviceName, route, method, statusCode);
      observeDuration(serviceName, route, method, durationSec);
    }
  };
}
/**
* op-side-longpoll.mjs — Operator-side long-poll loop.
*
* Maintains a persistent outbound HTTPS connection from host-cp to
* plan-DO's /v1/op-poll endpoint, waiting for local-Docker dispatch
* work to appear. Feature-flag-gated behind OLAM_OPSIDE_LONGPOLL=1
* (default OFF — no behavior change when unset).
*
* ELI5: like installing a phone line to the cloud planner.
* Your local machine stays connected, ready to receive coding tasks.
* Nothing calls yet — the phone just sits there waiting (v1).
*
* Phase D ships the plumbing and tests it behind a flag. Future cells
* (#3/#6/#7) wire actual producers on the plan-DO side.
*
* Circuit-breaker: 10 consecutive errors → 60 s pause →
* counter resets and polling resumes. Prevents hammering the server
* during outages.
*
* Reconnect delay: 1000ms base + uniform 0-500ms jitter.
* All structured log events emitted to console via JSON objects.
*
* @module op-side-longpoll
*/
// Reconnect delay constants.
const RECONNECT_BASE_MS = 1000;
const RECONNECT_JITTER_MS = 500;
// Circuit-breaker constants.
const CIRCUIT_BREAKER_THRESHOLD = 10;
const CIRCUIT_BREAKER_PAUSE_MS = 60_000;
// Long-poll timeout — plan-DO blocks up to 25 s; we give a 5 s margin.
const POLL_TIMEOUT_MS = 30_000;
/**
 * Delay before the next poll attempt: the fixed base plus a whole number
 * of jitter milliseconds drawn uniformly from [0, RECONNECT_JITTER_MS).
 * Exported so tests can supply their own RNG and assert the formula.
 *
 * @param {() => number} [randFn] - RNG returning a value in [0, 1); defaults to Math.random.
 * @returns {number} Delay in milliseconds.
 */
export function reconnectDelay(randFn = Math.random) {
  const jitterMs = Math.floor(randFn() * RECONNECT_JITTER_MS);
  return RECONNECT_BASE_MS + jitterMs;
}
/** @type {ReturnType<typeof setTimeout> | null} */
let pollTimer = null;
/** @type {boolean} */
let running = false;
/** @type {number} */
let consecutiveErrors = 0;
/** @type {string | null} */
let activeCloudUrl = null;
/** @type {string | null} */
let activeAuth = null;
/**
 * Write one structured log event to stdout as a single JSON line via
 * console.log, matching host-cp's logging style. Every record starts with
 * `event` and an ISO-8601 `ts`; keys in `extra` are merged in afterwards
 * and therefore override either on a name clash.
 *
 * @param {string} event
 * @param {Record<string, unknown>} [extra]
 */
function emit(event, extra = {}) {
  const record = { event, ts: new Date().toISOString(), ...extra };
  console.log(JSON.stringify(record));
}
/**
 * Sleep for ms milliseconds. Returns a Promise that resolves when the
 * timer fires.
 *
 * The timer handle is stored in the module-level `pollTimer` so that
 * stopPoll() can cancel it. There is no cancellation token: if the timer
 * is cleared before it fires, the returned promise never settles, so the
 * awaiting loop iteration simply never resumes.
 *
 * Only one sleep is tracked at a time — each call overwrites `pollTimer`.
 *
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
return new Promise((resolve) => {
pollTimer = setTimeout(resolve, ms);
});
}
/**
 * Perform one long-poll round trip: GET `<cloudUrl>/v1/op-poll` with the
 * given Authorization header and return the parsed JSON body.
 *
 * The request is aborted after POLL_TIMEOUT_MS. A single trailing slash
 * on `cloudUrl` is tolerated. Rejects when the request fails, is aborted,
 * returns a non-2xx status, or the body is not valid JSON.
 *
 * @param {string} cloudUrl  Base URL (e.g. https://plan-do.example.com)
 * @param {string} auth      Authorization header value
 * @returns {Promise<{ work: null | { worldId: string, dispatchSpec: unknown } }>}
 */
async function pollOnce(cloudUrl, auth) {
  const aborter = new AbortController();
  const deadline = setTimeout(() => aborter.abort(), POLL_TIMEOUT_MS);
  try {
    const endpoint = `${cloudUrl.replace(/\/$/, '')}/v1/op-poll`;
    const response = await fetch(endpoint, {
      method: 'GET',
      headers: { Authorization: auth },
      signal: aborter.signal,
    });
    if (response.ok) {
      // Awaited inside the try so the abort deadline also covers the body read.
      return await response.json();
    }
    throw new Error(`op-poll returned ${response.status}`);
  } finally {
    clearTimeout(deadline);
  }
}
/**
 * Identity of the loop started by the most recent startPoll(). A loop
 * whose token no longer matches has been superseded and must exit without
 * touching shared state. stopPoll() deliberately leaves it unchanged.
 *
 * @type {symbol | null}
 */
let activeLoopToken = null;
/**
 * The main poll loop. Runs until stopPoll() is called or a newer loop
 * supersedes it.
 *
 * State transitions:
 *   idle
 *     → connecting (emit op-poll-connect)
 *     → got a response (emit op-poll-timeout with the returned `work`)
 *     → wait reconnect delay
 *     → connecting again
 *
 * On error:
 *   → consecutiveErrors++
 *   → emit op-poll-error
 *   → if consecutiveErrors >= threshold: circuit-breaker open
 *       emit op-poll-circuit-open, reset counter, wait 60 s
 *   → else: wait reconnect delay
 *
 * Without the token check, stopPoll() followed by startPoll() while a
 * poll was still in flight left the old loop alive: it woke up, saw
 * `running === true` again and kept polling next to the new loop.
 *
 * @param {symbol | null} [token] identity of this loop; defaults to the active one
 * @returns {Promise<void>}
 */
async function pollLoop(token = activeLoopToken) {
  const superseded = () => token !== activeLoopToken;
  while (running && !superseded()) {
    emit('op-poll-connect', { cloud_url: activeCloudUrl });
    try {
      const result = await pollOnce(activeCloudUrl, activeAuth);
      if (superseded()) break;
      // Any successful response resets the error counter.
      consecutiveErrors = 0;
      const delay = reconnectDelay();
      emit('op-poll-timeout', { work: result.work, reconnect_in_ms: delay });
      if (!running) break;
      await sleep(delay);
    } catch (err) {
      if (superseded()) break;
      consecutiveErrors++;
      const message = err instanceof Error ? err.message : String(err);
      emit('op-poll-error', {
        error: message,
        consecutive_errors: consecutiveErrors,
      });
      if (consecutiveErrors >= CIRCUIT_BREAKER_THRESHOLD) {
        emit('op-poll-circuit-open', {
          consecutive_errors: consecutiveErrors,
          pause_ms: CIRCUIT_BREAKER_PAUSE_MS,
        });
        consecutiveErrors = 0;
        if (!running) break;
        await sleep(CIRCUIT_BREAKER_PAUSE_MS);
      } else {
        if (!running) break;
        await sleep(reconnectDelay());
      }
    }
  }
}
/**
 * Start the operator-side long-poll loop.
 *
 * No-op if already running. This function does NOT read
 * OLAM_OPSIDE_LONGPOLL itself; the caller is expected to gate on the
 * flag. Call it AFTER server.listen() so polling never delays startup.
 *
 * @param {string} cloudUrl  Base URL of the plan-DO deployment.
 * @param {string} auth      Authorization header value for Basic auth.
 */
export function startPoll(cloudUrl, auth) {
  if (running) return;
  running = true;
  consecutiveErrors = 0;
  activeCloudUrl = cloudUrl;
  activeAuth = auth;
  // A fresh token retires any earlier loop that is still awaiting a poll.
  activeLoopToken = Symbol('op-poll-loop');
  // Fire-and-forget; poll errors are caught inside pollLoop.
  void pollLoop(activeLoopToken);
}
/**
 * Stop the operator-side long-poll loop.
 *
 * Cancels any in-progress sleep timer, so a sleeping loop never resumes.
 * A loop that is waiting on a poll response exits once that response
 * arrives. Idempotent.
 */
export function stopPoll() {
  running = false;
  if (pollTimer !== null) {
    clearTimeout(pollTimer);
    pollTimer = null;
  }
}
// C4 — macOS panic-log counter.
//
// Note: phase-c-tasks.md originally listed a browser SPA path for this.
// SPAs can't shell out — `child_process` is Node-only. Correct home
// is host-cp (Node) which brokers operator-machine state through
// host-stream. host-cp exposes a typed event consumers can subscribe to.
//
// Implementation:
// `log show --predicate 'eventMessage CONTAINS "panic"' --last <N>d`
// writes to stdout; we count the lines that begin with a YYYY-MM-DD
// timestamp (one per log entry), which skips the header line and any
// "is empty" sentinel.
//
// Platform guard:
// On non-darwin platforms, getPanicCount returns null + emits a
// `[panic-counter]` warning to stderr. Callers branch on null →
// skip the delta + don't emit the Slack message.
//
// Sampling cadence:
// Baseline: at olam-cli startup OR on first /plan/new visit
// Per-session: at plan completion (cloud-mode only)
//
// Cost note:
// `log show` is expensive (~200ms-2s depending on system log size).
// Cache the baseline + only re-sample on demand. Don't poll.
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import { platform } from 'node:os';
const execFileP = promisify(execFile);
const PANIC_PREDICATE = 'eventMessage CONTAINS "panic"';
const DEFAULT_TIMEOUT_MS = 30_000;
/**
 * Count the unified-log entries mentioning "panic" over the last N days.
 *
 * Resolves to null — never rejects — when the platform is not darwin or
 * when the `log` command fails or times out; callers treat null as
 * "no signal; skip the delta". A warning goes to stderr in both cases
 * unless `opts.silent` is set.
 *
 * @param {number} [last_n_days] look-back window in days (default 7)
 * @param {{silent?: boolean, timeoutMs?: number, execFileFn?: Function}} [opts]
 *   `execFileFn` replaces the promisified execFile (for tests).
 * @returns {Promise<number | null>}
 */
export async function getPanicCount(last_n_days = 7, opts = {}) {
  const os = platform();
  if (os !== 'darwin') {
    if (!opts.silent) {
      process.stderr.write(`[panic-counter] platform=${os} is not darwin; returning null\n`);
    }
    return null;
  }
  const run = opts.execFileFn ?? execFileP;
  try {
    const args = ['show', '--predicate', PANIC_PREDICATE, '--last', `${last_n_days}d`];
    const execOptions = {
      timeout: opts.timeoutMs ?? DEFAULT_TIMEOUT_MS,
      maxBuffer: 10 * 1024 * 1024,
    };
    const { stdout } = await run('log', args, execOptions);
    // `log show` prepends a header + may emit an "is empty" sentinel, so
    // only lines that start with a timestamp count as entries.
    let entries = 0;
    for (const line of stdout.split('\n')) {
      if (/^\d{4}-\d{2}-\d{2}/.test(line)) entries += 1;
    }
    return entries;
  } catch (err) {
    if (!opts.silent) {
      const msg = err instanceof Error ? err.message : String(err);
      process.stderr.write(`[panic-counter] log command failed: ${msg}\n`);
    }
    return null;
  }
}
/**
 * Pure delta math: `after - before`.
 *
 * Returns null ("no signal") when either input is not a finite number —
 * null, undefined, NaN, ±Infinity, or any non-number. A positive delta means
 * the panic count went UP over the session; a negative delta means it went
 * DOWN. Both are valid — the caller frames the Slack message appropriately.
 *
 * @param {number | null} before - baseline count (null when unavailable).
 * @param {number | null} after - count sampled at the end of the session.
 * @returns {number | null}
 */
export function computePanicDelta(before, after) {
  // Number.isFinite never coerces, so null / strings / NaN are all rejected
  // here instead of leaking into the formatted summary as "NaN".
  if (!Number.isFinite(before) || !Number.isFinite(after)) return null;
  return after - before;
}
/**
 * Render the before/after panic counts as a one-line, plain-English summary
 * for a Slack message body.
 *
 * @param {number | null} before
 * @param {number | null} after
 * @returns {string} an "n/a" line when computePanicDelta yields null, a
 *   "steady" line for a zero delta, otherwise a "down N" / "up N" line.
 */
export function formatDeltaSummary(before, after) {
  const change = computePanicDelta(before, after);
  if (change === null) return 'Panic delta: n/a (counter unavailable this session).';
  if (change === 0) {
    return `Panic count steady: ${before} → ${after} (no change this session).`;
  }
  return change < 0
    ? `Panic count down ${Math.abs(change)}: ${before} → ${after}.`
    : `Panic count up ${change}: ${before} → ${after}.`;
}
// plan-chat-proxy-headers.mjs — header handling for host-cp's /api/plan-chat/*
// passthrough proxy (server.mjs). Extracted as pure helpers so the F3 (T9)
// operator-chunk broker-secret contract is unit-testable without booting the
// whole host-cp server.
//
// F3 (T9) boundary: host-cp's /api/plan-chat/* proxy is the TRUSTED operator
// surface (SPA browser → host-cp → plan-chat-service). A WORLD process never
// routes through this proxy — it talks to plan-chat-service directly via
// host.docker.internal. So:
// - Client-supplied `x-olam-broker-secret` is ALWAYS stripped (a client must
// not be able to smuggle the operator-chunk authority secret through).
// - The real secret is injected by the proxy itself (injectBrokerSecret),
// only when configured, so the operator's own SPA interject is authorised
// while a world process — which can't present the secret — is rejected by
// plan-chat-service's gate.
// Header names (lowercase) that are never copied to the upstream request.
const HOP_BY_HOP = new Set(['host', 'connection', 'content-length']);
// Header (lowercase) that carries the operator-chunk broker secret.
const BROKER_SECRET_HEADER = 'x-olam-broker-secret';
/**
 * Build the upstream header map for a /api/plan-chat/* proxy request.
 * Drops hop-by-hop headers AND any client-supplied broker secret (F3).
 *
 * Names are matched case-insensitively so the filter still holds when the
 * incoming map was not lowercased (for example a hand-built object rather
 * than Node's `req.headers`). Forwarded entries keep their original key.
 * Array values are joined with ", "; non-string, non-array values (such as
 * `undefined`) are omitted.
 *
 * @param {Record<string, string | string[] | undefined>} reqHeaders
 * @returns {Record<string, string>}
 */
export function buildPlanChatProxyHeaders(reqHeaders) {
  const headers = {};
  for (const [name, value] of Object.entries(reqHeaders ?? {})) {
    const lowered = name.toLowerCase();
    if (HOP_BY_HOP.has(lowered)) continue;
    // F3 — never forward a CLIENT-supplied broker secret, whatever its casing.
    if (lowered === BROKER_SECRET_HEADER) continue;
    if (Array.isArray(value)) headers[name] = value.join(', ');
    else if (typeof value === 'string') headers[name] = value;
  }
  return headers;
}
/**
 * Add the operator-chunk broker secret to the upstream headers, but only when
 * one is configured. A non-string or empty secret leaves `headers` untouched
 * (the original note says the downstream gate then runs in its default
 * ungated-but-loud mode). The passed-in object is mutated and returned.
 *
 * @param {Record<string, string>} headers
 * @param {string | undefined} operatorChunkSecret
 * @returns {Record<string, string>} the same `headers` object
 */
export function injectBrokerSecret(headers, operatorChunkSecret) {
  const isConfigured =
    typeof operatorChunkSecret === 'string' && operatorChunkSecret.length > 0;
  if (!isConfigured) return headers;
  headers[BROKER_SECRET_HEADER] = operatorChunkSecret;
  return headers;
}
// Bearer-secret management for plan-chat-service.mjs.
//
// Mirrors the agent-memory-service pattern from the sibling olam-agent-memory
// repo: a single 0600 file at ~/.olam/plan-chat-secret holds the bearer
// hex string. Helpers generate, read, and rotate atomically. Rotation
// writes to a tmpfile and renames; mid-rotation reads see either the old
// or new value, never a partial write.
//
// Inside the Docker container, os.homedir() → /root, but compose.yaml mounts
// ${HOME}/.olam → /data. Without an env override, the bearer would be written
// to /root/.olam/plan-chat-secret (container ephemeral layer) and lost on
// every `docker compose up --force-recreate` (i.e. every `olam upgrade`).
// OLAM_PLAN_CHAT_SECRET_PATH is set to /data/plan-chat-secret in compose.yaml
// and k8s/manifests/30-configmap.yaml so all reads/writes land in the
// bind-mounted host directory. On bare-host installs (no container) the env
// var is unset and the path falls back to ~/.olam/plan-chat-secret — no
// behaviour change. Mirrors precedent commit 5b21d1f2 (PR #440) for plan.db.
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import crypto from 'node:crypto';
// Phase D (olam-config-store-unification): consult config.json's
// `cloud.secrets.plan-chat-secret` value before the legacy secret FILES.
// Dep-free reader (host-cp has no @olam/core dep) with container-aware /data
// path resolution — see config-reader.mjs header.
import { readConfigString, olamConfigDir } from './config-reader.mjs';
/**
 * Pick the plan-chat-secret file location under the Olam config directory
 * returned by olamConfigDir() (described in config-reader.mjs as
 * container-aware).
 *
 * Candidates, in order:
 *   1. <configDir>/secrets/plan-chat-secret  (canonical)
 *   2. <configDir>/plan-chat-secret          (legacy)
 * The first one that exists on disk is returned. When neither exists the
 * canonical path is returned so that a later write lands there.
 *
 * @returns {string}
 */
function resolvePlanChatSecretPath() {
  const configDir = olamConfigDir();
  const canonical = path.join(configDir, 'secrets', 'plan-chat-secret');
  const legacy = path.join(configDir, 'plan-chat-secret');
  const existing = [canonical, legacy].find((candidate) => fs.existsSync(candidate));
  return existing ?? canonical;
}
// Bearer file location, fixed once at module load: the
// OLAM_PLAN_CHAT_SECRET_PATH env var when it is defined, otherwise whatever
// resolvePlanChatSecretPath() picks at that moment.
export const SECRET_PATH =
process.env.OLAM_PLAN_CHAT_SECRET_PATH ?? resolvePlanChatSecretPath();
// Directory containing SECRET_PATH.
export const SECRET_DIR = path.dirname(SECRET_PATH);
// Number of random bytes per bearer (hex-encoded by generateSecret).
const SECRET_BYTES = 32; // 64 hex chars
// File mode applied to the secret file by writeSecret (owner read/write only).
const SECRET_MODE = 0o600;
/**
 * Produce a new random bearer: SECRET_BYTES bytes from crypto.randomBytes,
 * hex-encoded (two hex characters per byte).
 *
 * @returns {string}
 */
export function generateSecret() {
  const entropy = crypto.randomBytes(SECRET_BYTES);
  return entropy.toString('hex');
}
/**
 * Read a bearer value out of a single secret FILE.
 *
 * @param {string} secretPath
 * @returns {string | null} the file's trimmed contents, or null when the file
 *   does not exist (ENOENT) or holds only whitespace.
 * @throws any other read error (e.g. a permission error) is rethrown as-is.
 */
function readSecretFile(secretPath) {
  let raw;
  try {
    raw = fs.readFileSync(secretPath, 'utf8');
  } catch (err) {
    const isMissing =
      Boolean(err) && typeof err === 'object' && 'code' in err && err.code === 'ENOENT';
    if (isMissing) return null;
    throw err;
  }
  const bearer = raw.trim();
  return bearer === '' ? null : bearer;
}
/**
 * Read the plan-chat bearer. Returns null if absent; read errors other than
 * "file not found" propagate from readSecretFile.
 *
 * Default path (secretPath omitted, or equal to SECRET_PATH):
 *   1. the secret FILE at SECRET_PATH, when it holds a non-empty value;
 *   2. otherwise config.json `cloud.secrets.plan-chat-secret` via
 *      readConfigString;
 *   3. otherwise null.
 * The file leg is consulted FIRST so an operator's existing on-disk bearer is
 * returned unchanged; config.json only fills in when no file value exists.
 *
 * Any other secretPath (write/rotate read-backs, tests): file-only — the
 * config.json leg is skipped so the caller gets deterministic file semantics.
 *
 * @param {string} [secretPath=SECRET_PATH]
 * @returns {string | null}
 */
export function readSecret(secretPath = SECRET_PATH) {
  const fileValue = readSecretFile(secretPath);
  if (fileValue !== null) return fileValue;
  // A path other than the module default never falls through to config.json.
  if (secretPath !== SECRET_PATH) return null;
  return readConfigString('cloud.secrets.plan-chat-secret');
}
/**
 * Write the bearer to disk atomically: the value (plus a trailing newline) is
 * written to a sibling tmpfile, which is then renamed over `secretPath`.
 *
 * Creates the parent directory of `secretPath` if missing (mode 0700). The
 * tmpfile is created with SECRET_MODE and chmod'ed to SECRET_MODE again before
 * the rename, since the `mode` option only applies when the file is newly
 * created.
 *
 * If ANY step after the tmpfile name is chosen fails — including the write
 * itself — the tmpfile is removed (best effort) so a partially written secret
 * is not left behind, and the original error is rethrown.
 *
 * @param {string} value - non-empty bearer to persist.
 * @param {string} [secretPath=SECRET_PATH]
 * @throws {Error} when `value` is not a non-empty string, or on any fs failure.
 */
export function writeSecret(value, secretPath = SECRET_PATH) {
  if (typeof value !== 'string' || value.length === 0) {
    throw new Error('plan-chat-secret: refusing to write empty bearer');
  }
  fs.mkdirSync(path.dirname(secretPath), { recursive: true, mode: 0o700 });
  const tmp = `${secretPath}.tmp-${process.pid}-${Date.now()}`;
  try {
    fs.writeFileSync(tmp, `${value}\n`, { mode: SECRET_MODE });
    fs.chmodSync(tmp, SECRET_MODE);
    fs.renameSync(tmp, secretPath);
  } catch (err) {
    // Best-effort cleanup; the tmpfile may not exist if the write never started.
    try { fs.unlinkSync(tmp); } catch { /* swallow */ }
    throw err;
  }
}
/**
 * Return the existing bearer when readSecret finds one; otherwise generate a
 * fresh bearer, persist it with writeSecret, and return it.
 *
 * NOTE(review): writeSecret renames its tmpfile over the destination, so two
 * processes that both find no bearer may each return a different value while
 * only the later rename survives on disk — confirm callers tolerate this.
 *
 * @param {string} [secretPath=SECRET_PATH]
 * @returns {string}
 */
export function ensureSecret(secretPath = SECRET_PATH) {
  let bearer = readSecret(secretPath);
  if (!bearer) {
    bearer = generateSecret();
    writeSecret(bearer, secretPath);
  }
  return bearer;
}
/**
 * Rotate the bearer: generate a replacement, persist it via writeSecret
 * (tmpfile + rename), and hand the new value back.
 * Callers should restart any running plan-chat-service so it re-reads.
 *
 * @param {string} [secretPath=SECRET_PATH]
 * @returns {string} the newly written bearer
 */
export function rotateSecret(secretPath = SECRET_PATH) {
  const replacement = generateSecret();
  writeSecret(replacement, secretPath);
  return replacement;
}
/**
 * Constant-time compare. Returns true iff both inputs are non-empty strings
 * whose UTF-8 encodings are byte-equal. Avoids leaking timing on bearer
 * comparison.
 *
 * The length guard is done on the encoded BYTE length, not the string length:
 * crypto.timingSafeEqual throws a RangeError when its buffers differ in byte
 * length, and two strings with the same `.length` can encode to different
 * byte counts (e.g. 'é' vs 'e'). Mismatched inputs therefore return false
 * instead of throwing.
 *
 * @param {unknown} a
 * @param {unknown} b
 * @returns {boolean}
 */
export function timingSafeEqual(a, b) {
  if (typeof a !== 'string' || typeof b !== 'string') return false;
  if (a.length === 0 || b.length === 0) return false;
  const left = Buffer.from(a, 'utf8');
  const right = Buffer.from(b, 'utf8');
  if (left.length !== right.length) return false;
  return crypto.timingSafeEqual(left, right);
}

Sorry, the diff of this file is too big to display

// plan-orchestrator.mjs — Phase 2: multi-persona conversation coordinator.
//
// Architecture:
// - AgentRegistry holds one pi AgentSession per (conversationId, personaId).
// - HandoffEngine forks the session tree when the active persona changes.
// - All persona turns share one session.jsonl per conversation.
// - SSE sinks are an in-process Set<ServerResponse> per conversationId.
//
// Credentials:
// - Uses the Olam auth-service vault (same as the rest of host-cp).
// - No ANTHROPIC_API_KEY required; tokens fetched on demand via auth-service.
import path from 'node:path';
import os from 'node:os';
import fs from 'node:fs';
import { randomUUID } from 'node:crypto';
import Database from 'better-sqlite3';
import { SessionManager } from '@mariozechner/pi-coding-agent';
import { PERSONAS, DEFAULT_PERSONA_ID, getPersona } from './plan/personas.mjs';
import { AgentRegistry } from './plan/agent-registry.mjs';
import { HandoffEngine } from './plan/handoff-engine.mjs';
import { RopeEngine } from './plan/rope-engine.mjs';
import { loadAuthorityConfig } from './plan/authority-config.mjs';
import { isPathVaultUrl, ensurePathVaultProxy } from './plan/path-vault-proxy.mjs';
// Phase D (olam-config-store-unification): config.json reader (dep-free, copied
// from packages/core/src/cloud-state/read-config-value.mjs — host-cp has no
// @olam/core dep). Container-aware /data path resolution lives in config-reader.mjs.
import { readConfigString, olamConfigDir } from './config-reader.mjs';
// ── Cloud path-vault fallback ───────────────────────────────────────────────
//
// When the local auth-service vault has no Claude credential, the plan agent can
// instead reach Claude through the operator's cloud path-vault URL. Resolution
// mirrors server.mjs readAnthropicBaseUrl() (kept independent so this module has
// no server.mjs dependency):
// 1. OLAM_ANTHROPIC_BASE_URL env var
// 2. config.json `cloud.urls.anthropic-base-url`
// 3. <config dir>/anthropic-base-url file (legacy; /data or ~/.olam)
// 4. ANTHROPIC_BASE_URL env var
//
// Only PATH-FORMAT vault URLs (https://host/auth/<sub>/<secret>) are usable as a
// fallback — they self-authenticate, so no live token is required.
/** Placeholder api-key handed to the agent runtime in path-vault mode. The
* path prefix is the real credential; the proxy strips this header.
* It is the `token` of the `mode: 'path-vault'` credential returned by
* PlanOrchestrator's #resolveCredential. */
const PATH_VAULT_PLACEHOLDER_KEY = 'path-vault-proxy';
/**
 * Resolve the configured Anthropic base URL for the path-vault fallback.
 *
 * Sources, first non-empty wins:
 *   1. OLAM_ANTHROPIC_BASE_URL env var (trimmed)
 *   2. config.json `cloud.urls.anthropic-base-url` (any non-null value)
 *   3. <olamConfigDir()>/anthropic-base-url file contents (trimmed)
 *   4. ANTHROPIC_BASE_URL env var (trimmed)
 *
 * Env values are trimmed BEFORE the emptiness check, so a whitespace-only
 * variable counts as unset and the later sources are still consulted.
 *
 * @returns {string} the configured Anthropic base URL, or '' if none.
 */
function readAnthropicBaseUrlForFallback() {
  const fromOlamEnv = (process.env['OLAM_ANTHROPIC_BASE_URL'] ?? '').trim();
  if (fromOlamEnv.length > 0) return fromOlamEnv;
  // Phase D: config.json leg sits BETWEEN the two env legs (mirrors
  // server.mjs readAnthropicBaseUrl + resolver.ts getAnthropicBaseUrl).
  const fromConfig = readConfigString('cloud.urls.anthropic-base-url');
  if (fromConfig !== null) return fromConfig;
  try {
    // Legacy fallback under the container-aware config dir (/data or ~/.olam).
    const file = path.join(olamConfigDir(), 'anthropic-base-url');
    const content = fs.readFileSync(file, 'utf-8').trim();
    if (content.length > 0) return content;
  } catch {
    // file absent or unreadable — fall through to the shell env leg
  }
  const fromShellEnv = (process.env['ANTHROPIC_BASE_URL'] ?? '').trim();
  if (fromShellEnv.length > 0) return fromShellEnv;
  return '';
}
// ── Paths ─────────────────────────────────────────────────────────────────────
//
// Inside the Docker container, os.homedir() → /root, but compose.yaml mounts
// ${HOME}/.olam → /data. Without env overrides, plan.db would be written to
// /root/.olam/plan.db (container ephemeral layer) and lost on every
// `docker compose up --force-recreate` (i.e. every `olam upgrade`).
//
// OLAM_PLAN_DB_PATH and OLAM_PLAN_DIR are set to /data/plan.db and /data/plan
// in compose.yaml so all writes land in the bind-mounted host directory.
// On bare-host installs (no container) neither env var is set and the paths
// fall back to the original ~/.olam locations — no behaviour change.
//
// Paths are resolved at construction time (not module load) so tests can pass
// explicit paths via constructor opts without any module re-import tricks.
/**
 * Default plan.db location: the OLAM_PLAN_DB_PATH env var when it is defined,
 * otherwise `<home>/.olam/plan.db`.
 * @returns {string}
 */
function defaultPlanDbPath() {
  const override = process.env.OLAM_PLAN_DB_PATH;
  if (override !== undefined && override !== null) return override;
  return path.join(os.homedir(), '.olam', 'plan.db');
}
/**
 * Default plan session directory: the OLAM_PLAN_DIR env var when it is
 * defined, otherwise `<home>/.olam/plan`.
 * @returns {string}
 */
function defaultPlanDir() {
  const override = process.env.OLAM_PLAN_DIR;
  if (override !== undefined && override !== null) return override;
  return path.join(os.homedir(), '.olam', 'plan');
}
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Create (or overwrite) `sessionFile` so that it contains exactly one line:
 * the JSON session header `{ type, version, id, timestamp, cwd }`.
 *
 * @param {string} sessionFile - path of the session.jsonl file to write.
 * @param {string} sessionId - stored as the header's `id`.
 */
function initSessionFile(sessionFile, sessionId) {
  const headerLine = JSON.stringify({
    type: 'session',
    version: 3,
    id: sessionId,
    timestamp: new Date().toISOString(),
    cwd: os.homedir(),
  });
  fs.writeFileSync(sessionFile, `${headerLine}\n`);
}
/**
 * Derive a short title from the first user message content.
 *
 * Whitespace is trimmed and collapsed to single spaces. Text that fits within
 * `maxLen` UTF-16 code units is returned unchanged. Longer text is cut at
 * `maxLen`, backed up to the last space inside that cut (a word boundary) when
 * there is one, and suffixed with "…". When there is no usable space and the
 * cut would end on the first half of a surrogate pair, that half is dropped so
 * the title never contains a broken character.
 *
 * Non-string or blank content yields "(empty)" instead of throwing.
 *
 * @param {string} content
 * @param {number} [maxLen=40]
 * @returns {string}
 */
export function deriveTitle(content, maxLen = 40) {
  const trimmed = typeof content === 'string' ? content.trim().replace(/\s+/g, ' ') : '';
  if (!trimmed) return '(empty)';
  if (trimmed.length <= maxLen) return trimmed;
  let cut = trimmed.slice(0, maxLen);
  const lastSpace = cut.lastIndexOf(' ');
  if (lastSpace > 0) {
    cut = cut.slice(0, lastSpace);
  } else {
    // 0xD800–0xDBFF is the high-surrogate range: its partner was sliced off.
    const tail = cut.charCodeAt(cut.length - 1);
    if (tail >= 0xd800 && tail <= 0xdbff) cut = cut.slice(0, -1);
  }
  return `${cut}…`;
}
// ── PlanOrchestrator ──────────────────────────────────────────────────────────
export class PlanOrchestrator {
/** better-sqlite3 Database handle opened in the constructor. */
#db;
/** Directory under which each conversation gets its own `<id>/` session folder. */
#planDir;
/** Auth-service URL exactly as passed to the constructor. */
#authServiceUrl;
/** Auth-service secret exactly as passed to the constructor. */
#authServiceSecret;
/** AgentRegistry instance created in the constructor. */
#registry;
/** HandoffEngine instance built around #registry. */
#handoffEngine;
/** RopeEngine instance wired to #registry, #db, and #broadcast. */
#ropeEngine;
/** Tracks the active persona per conversationId: Map<conversationId, personaId> */
#activePersona = new Map();
/**
* SSE response sinks per conversationId; #broadcast writes every event to
* each response in the set.
* @type {Map<string, Set<import('node:http').ServerResponse>>}
*/
#sinks = new Map();
/**
* Ring buffer of in-flight SSE events per conversationId.
* Populated while a turn is active; cleared after all persona turn_complete events.
* Used by drainReplayBuffer to replay missed events on reconnect.
* @type {Map<string, Array<{event: string, data: object}>>}
*/
#activeTurns = new Map();
/**
* Number of persona turn_complete events still pending per conversationId.
* Replay buffer is only cleared when this reaches 0.
* @type {Map<string, number>}
*/
#pendingPersonaCount = new Map();
/**
* Mutable current-chunk refs per conversationId.
* ChunkEmitter updates these; read_sidebar tool reads them.
* @type {Map<string, { current: string|null }>}
*/
#currentChunkRefs = new Map();
/**
* Build the orchestrator: resolve the plan.db / plan-dir paths, create the
* agent registry and engines, optionally migrate a legacy plan.db, open the
* database, and make sure the schema (tables, indexes, `pinned` column) exists.
*
* @param {{
* authServiceUrl: string,
* authServiceSecret: string,
* planDbPath?: string,
* planDirPath?: string,
* }} opts
*
* planDbPath / planDirPath default to OLAM_PLAN_DB_PATH / OLAM_PLAN_DIR env vars,
* falling back to ~/.olam/plan.db and ~/.olam/plan. Pass explicitly in tests to
* avoid touching real home-dir paths.
*/
constructor({ authServiceUrl, authServiceSecret, planDbPath, planDirPath } = {}) {
this.#authServiceUrl = authServiceUrl;
this.#authServiceSecret = authServiceSecret;
// Pre-env-override location of plan.db; used only as the migration source below.
const legacyDbPath = path.join(os.homedir(), '.olam', 'plan.db');
// Track whether the caller injected an explicit DB path (used to skip the
// legacy-path migration below — tests inject tmpDir paths and must not
// inherit the operator's real plan.db).
const planDbPathInjected = planDbPath !== undefined;
const resolvedDbPath = planDbPath ?? defaultPlanDbPath();
this.#planDir = planDirPath ?? defaultPlanDir();
this.#registry = new AgentRegistry({ authServiceUrl, authServiceSecret });
this.#handoffEngine = new HandoffEngine(this.#registry);
// The DB's parent directory must exist before the copy / open below.
fs.mkdirSync(path.dirname(resolvedDbPath), { recursive: true });
// One-time migration: if the resolved DB path differs from the legacy default and
// the target doesn't exist yet, copy any existing DB from the old location.
// This preserves conversations on a hot-restart after deploying the compose.yaml fix.
// On full container recreate the legacy path is already gone — this is a no-op.
//
// Skip when the caller injected an explicit planDbPath — that's the unit-
// test shape (each test owns a tmpDir db). Pre-fix history: tests on a host
// with a populated `~/.olam/plan.db` got every `listConversations()` query
// polluted by real operator data because the migration eagerly copied the
// legacy file into the test's tmpDir.
if (
!planDbPathInjected &&
resolvedDbPath !== legacyDbPath &&
!fs.existsSync(resolvedDbPath) &&
fs.existsSync(legacyDbPath)
) {
try {
fs.copyFileSync(legacyDbPath, resolvedDbPath);
console.info('[plan] Migrated plan.db from legacy path to', resolvedDbPath);
} catch (err) {
// A failed copy is logged and ignored; the Database below then opens a new file.
console.warn('[plan] plan.db migration failed (non-fatal):', err.message);
}
}
this.#db = new Database(resolvedDbPath);
// Idempotent schema bootstrap: every statement uses IF NOT EXISTS.
this.#db.exec(`
CREATE TABLE IF NOT EXISTS plan_conversations (
id TEXT PRIMARY KEY,
title TEXT,
persona TEXT NOT NULL DEFAULT 'brainstorm',
created_at INTEGER NOT NULL,
last_turn_at INTEGER
);
CREATE TABLE IF NOT EXISTS plan_turns (
id TEXT PRIMARY KEY,
conversation_id TEXT NOT NULL REFERENCES plan_conversations(id),
role TEXT NOT NULL,
content TEXT NOT NULL DEFAULT '',
persona TEXT,
from_persona TEXT,
to_persona TEXT,
mode TEXT,
fork_node_id TEXT,
created_at INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS plan_turns_conv_idx
ON plan_turns(conversation_id, created_at);
-- Phase 4B: lookout agent registry per conversation
CREATE TABLE IF NOT EXISTS plan_lookout_agents (
conversation_id TEXT NOT NULL,
persona_id TEXT NOT NULL,
muted INTEGER NOT NULL DEFAULT 0,
mode TEXT NOT NULL DEFAULT 'observe',
created_at INTEGER NOT NULL,
PRIMARY KEY (conversation_id, persona_id)
);
-- Phase 4B: sidebar signals from lookout agents
CREATE TABLE IF NOT EXISTS plan_sidebar_signals (
id TEXT PRIMARY KEY,
conversation_id TEXT NOT NULL,
agent_id TEXT NOT NULL,
urgency TEXT NOT NULL DEFAULT 'p2',
reason TEXT NOT NULL DEFAULT '',
content TEXT NOT NULL DEFAULT '',
chunk_id TEXT NOT NULL,
created_at INTEGER NOT NULL,
status TEXT NOT NULL DEFAULT 'active',
tension_subject TEXT,
parent_signal_id TEXT
);
CREATE INDEX IF NOT EXISTS plan_sidebar_conv_idx
ON plan_sidebar_signals(conversation_id, created_at);
CREATE INDEX IF NOT EXISTS plan_sidebar_chunk_idx
ON plan_sidebar_signals(chunk_id);
`);
// Migration guard: add pinned column if the table predates this feature.
const planConvCols = this.#db.prepare(`PRAGMA table_info(plan_conversations)`).all();
if (!planConvCols.some(c => c.name === 'pinned')) {
this.#db.exec(`ALTER TABLE plan_conversations ADD COLUMN pinned INTEGER NOT NULL DEFAULT 0`);
}
const authorityConfig = loadAuthorityConfig();
// The rope engine gets a broadcast callback that forwards to #broadcast.
this.#ropeEngine = new RopeEngine({
registry: this.#registry,
db: this.#db,
broadcast: (cId, evt, data) => this.#broadcast(cId, evt, data),
authorityConfig,
});
}
// ── Auth-service credential fetching ──────────────────────────────────────
/**
* Fetch a Claude credential token for an about-to-run turn.
*
* Returns a real vault token when the local vault has one, OR a placeholder
* token in cloud path-vault fallback mode. As a SIDE EFFECT it points the
* AgentRegistry at the right Anthropic base URL (localhost proxy in path-vault
* mode, cleared otherwise) BEFORE any runtime is built — every persona /
* rope / handoff runtime resolves its model base URL from the registry.
*
* Used by all turn-dispatch call sites (dispatch, rope-engine, handoff-engine)
* via `fetchToken: () => this.#fetchToken()`, so the fallback applies uniformly
* without changing those call sites.
*
* @returns {Promise<string>}
*/
async #fetchToken() {
const cred = await this.#resolveCredential();
// setAnthropicBaseUrl points runtimes at the localhost path-vault proxy in
// fallback mode (else clears it). Guard for registries that predate the
// method or are test doubles — the path-vault override is best-effort.
if (typeof this.#registry.setAnthropicBaseUrl === 'function') {
this.#registry.setAnthropicBaseUrl(cred.mode === 'path-vault' ? cred.baseUrl : null);
}
return cred.token;
}
/**
* Resolve a credential for the plan agent, preferring the local auth-service
* vault and falling back to the operator's cloud path-vault URL when the local
* vault is empty.
*
* @typedef {{ mode: 'vault', token: string }
* | { mode: 'path-vault', token: string, baseUrl: string }} CredentialResolution
*
* @returns {Promise<CredentialResolution>}
*/
async #resolveCredential() {
// 1. Prefer the local vault. When it has a credential, behavior is unchanged.
// Call the registry directly (NOT #fetchToken) — #fetchToken delegates
// back here, so going through it would recurse.
try {
const token = await this.#registry.fetchToken('claude');
return { mode: 'vault', token };
} catch (err) {
// Only fall back on a missing credential — surface real auth-service errors
// (timeouts, 5xx) so they don't get masked by the path-vault path.
if (err?.code && err.code !== 'NO_CREDENTIAL') throw err;
}
// 2. Fall back to the cloud path-vault URL, if configured + path-format.
const baseUrl = readAnthropicBaseUrlForFallback();
if (!isPathVaultUrl(baseUrl)) {
// No usable fallback — re-raise the original NO_CREDENTIAL shape so callers
// (hasCredential / dispatch) behave exactly as before.
const e = new Error('no active claude credential in vault');
e.code = 'NO_CREDENTIAL';
throw e;
}
const localBaseUrl = await ensurePathVaultProxy(baseUrl);
return { mode: 'path-vault', token: PATH_VAULT_PLACEHOLDER_KEY, baseUrl: localBaseUrl };
}
/**
* Lightweight check — returns true when a credential is reachable, either from
* the local vault OR the cloud path-vault fallback.
* @returns {Promise<boolean>}
*/
async hasCredential() {
try {
await this.#resolveCredential();
return true;
} catch {
return false;
}
}
// ── Conversation management ───────────────────────────────────────────────
/**
* @param {{ title?: string }} [opts]
* @returns {{ id: string, title: string|null, persona: string, created_at: number }}
*/
createConversation({ title } = {}) {
const id = randomUUID();
const created_at = Date.now();
const sessionDir = path.join(this.#planDir, id);
fs.mkdirSync(sessionDir, { recursive: true });
initSessionFile(path.join(sessionDir, 'session.jsonl'), id);
this.#db
.prepare(
`INSERT INTO plan_conversations (id, title, persona, created_at)
VALUES (?, ?, ?, ?)`,
)
.run(id, title ?? null, DEFAULT_PERSONA_ID, created_at);
this.#activePersona.set(id, DEFAULT_PERSONA_ID);
return { id, title: title ?? null, persona: DEFAULT_PERSONA_ID, created_at };
}
/** @returns {Array<{id, title, pinned, created_at, last_turn_at, persona, snippet}>} */
listConversations() {
return this.#db
.prepare(
`SELECT
c.id, c.title, c.pinned, c.created_at, c.last_turn_at, c.persona,
(SELECT pt.content FROM plan_turns pt
WHERE pt.conversation_id = c.id
ORDER BY pt.created_at DESC LIMIT 1) AS snippet
FROM plan_conversations c
ORDER BY c.pinned DESC, COALESCE(c.last_turn_at, c.created_at) DESC, c.rowid DESC`,
)
.all();
}
/**
* Patch a conversation's title and/or pinned state.
* @param {string} id
* @param {{ title?: string, pinned?: boolean }} updates
* @returns {object|null} Updated row, or null if not found.
*/
patchConversation(id, updates) {
const parts = [];
const values = [];
if (updates.title !== undefined) {
parts.push('title = ?');
values.push(updates.title || null);
}
if (updates.pinned !== undefined) {
parts.push('pinned = ?');
values.push(updates.pinned ? 1 : 0);
}
if (parts.length === 0) return null;
values.push(id);
const changed = this.#db
.prepare(`UPDATE plan_conversations SET ${parts.join(', ')} WHERE id = ?`)
.run(...values);
if (changed.changes === 0) return null;
return this.#db
.prepare(`SELECT id, title, pinned, created_at, last_turn_at, persona FROM plan_conversations WHERE id = ?`)
.get(id) ?? null;
}
/**
* Delete a conversation and all its associated data.
* @param {string} id
* @returns {boolean} true if deleted, false if not found.
*/
deleteConversation(id) {
const exists = this.#db
.prepare(`SELECT 1 FROM plan_conversations WHERE id = ?`)
.get(id);
if (!exists) return false;
this.#db.prepare(`DELETE FROM plan_turns WHERE conversation_id = ?`).run(id);
this.#db.prepare(`DELETE FROM plan_lookout_agents WHERE conversation_id = ?`).run(id);
this.#db.prepare(`DELETE FROM plan_sidebar_signals WHERE conversation_id = ?`).run(id);
this.#db.prepare(`DELETE FROM plan_conversations WHERE id = ?`).run(id);
this.#activePersona.delete(id);
this.#sinks.delete(id);
this.#activeTurns.delete(id);
this.#currentChunkRefs.delete(id);
const sessionDir = path.join(this.#planDir, id);
try { fs.rmSync(sessionDir, { recursive: true }); } catch { /* ok if missing */ }
return true;
}
/**
* @param {string} id
* @returns {{ id, title, persona, created_at, last_turn_at, tree } | null}
*/
getConversation(id) {
const row = this.#db
.prepare(
`SELECT id, title, persona, created_at, last_turn_at
FROM plan_conversations WHERE id = ?`,
)
.get(id);
if (!row) return null;
const sessionFile = path.join(this.#planDir, id, 'session.jsonl');
let tree = [];
try {
const mgr = SessionManager.open(sessionFile, path.join(this.#planDir, id));
tree = mgr.getTree();
} catch {
// Session file missing or corrupt — return empty tree.
}
return { ...row, tree };
}
// ── Active persona management ─────────────────────────────────────────────
/**
* @param {string} conversationId
* @returns {string} Active persona ID.
*/
getActivePersona(conversationId) {
if (this.#activePersona.has(conversationId)) {
return this.#activePersona.get(conversationId);
}
const row = this.#db
.prepare(`SELECT persona FROM plan_conversations WHERE id = ?`)
.get(conversationId);
const personaId = row?.persona ?? DEFAULT_PERSONA_ID;
this.#activePersona.set(conversationId, personaId);
return personaId;
}
/**
* Set the active default persona for a conversation (does NOT trigger a handoff).
* @param {string} conversationId
* @param {string} personaId
*/
setActivePersona(conversationId, personaId) {
this.#activePersona.set(conversationId, personaId);
this.#db
.prepare(`UPDATE plan_conversations SET persona = ? WHERE id = ?`)
.run(personaId, conversationId);
}
// ── SSE broadcast ─────────────────────────────────────────────────────────
/**
* Deliver one SSE event to every sink registered for a conversation.
*
* When the conversation has an entry in #activeTurns, the event is appended to
* that replay buffer first — before the sink check — so it is buffered even if
* nobody is currently connected.
*
* NOTE(review): the early return for "no sinks" also skips the turn_complete
* bookkeeping at the bottom, so in that case the replay buffer is not cleared
* and the pending persona count is not decremented. Confirm this is intended
* (e.g. to keep the buffer for a reconnecting client) rather than an oversight.
*
* @param {string} conversationId
* @param {string} eventName - SSE `event:` field.
* @param {object} data - JSON-serialised into the SSE `data:` field.
*/
#broadcast(conversationId, eventName, data) {
// Buffer event while a turn is active for reconnect replay.
const buf = this.#activeTurns.get(conversationId);
if (buf) {
buf.push({ event: eventName, data });
}
const sinks = this.#sinks.get(conversationId);
if (!sinks || sinks.size === 0) return;
// Serialise once; the same frame goes to every sink.
const chunk = `event: ${eventName}\ndata: ${JSON.stringify(data)}\n\n`;
for (const res of sinks) {
try { res.write(chunk); } catch { /* client disconnected */ }
}
// Clear buffer only when all pending personas have completed.
if (eventName === 'turn_complete') {
// A missing counter is treated as 1, i.e. a single pending persona.
const pending = (this.#pendingPersonaCount.get(conversationId) ?? 1) - 1;
if (pending <= 0) {
this.#activeTurns.delete(conversationId);
this.#pendingPersonaCount.delete(conversationId);
} else {
this.#pendingPersonaCount.set(conversationId, pending);
}
}
}
// ── Lookout agent management ──────────────────────────────────────────────
/**
* Invite a persona as a lookout for a conversation.
* @param {string} conversationId
* @param {string} personaId
* @returns {{ persona_id: string, state: string, muted: boolean, mode: string }}
*/
inviteLookout(conversationId, personaId) {
const now = Date.now();
this.#db
.prepare(`INSERT OR IGNORE INTO plan_lookout_agents (conversation_id, persona_id, muted, mode, created_at) VALUES (?, ?, 0, 'observe', ?)`)
.run(conversationId, personaId, now);
const agent = { persona_id: personaId, state: 'listening', muted: false, mode: 'observe' };
this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: 'listening' });
return agent;
}
/**
* Update muted status (or mode) for a lookout agent.
* @param {string} conversationId
* @param {string} personaId
* @param {{ muted?: boolean, mode?: string }} updates
* @returns {{ persona_id: string, state: string, muted: boolean, mode: string } | null}
*/
updateLookout(conversationId, personaId, { muted, mode } = {}) {
const row = this.#db
.prepare(`SELECT * FROM plan_lookout_agents WHERE conversation_id = ? AND persona_id = ?`)
.get(conversationId, personaId);
if (!row) return null;
const newMuted = muted !== undefined ? (muted ? 1 : 0) : row.muted;
const newMode = mode ?? row.mode;
this.#db
.prepare(`UPDATE plan_lookout_agents SET muted = ?, mode = ? WHERE conversation_id = ? AND persona_id = ?`)
.run(newMuted, newMode, conversationId, personaId);
const newState = newMuted ? 'idle' : 'listening';
this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: newState });
return { persona_id: personaId, state: newState, muted: !!newMuted, mode: newMode };
}
/**
* Remove a lookout agent.
* @param {string} conversationId
* @param {string} personaId
*/
uninviteLookout(conversationId, personaId) {
this.#db
.prepare(`DELETE FROM plan_lookout_agents WHERE conversation_id = ? AND persona_id = ?`)
.run(conversationId, personaId);
}
/**
* List active lookout agents for a conversation.
* @param {string} conversationId
* @returns {Array<{ persona_id: string, state: string, muted: boolean, mode: string }>}
*/
listLookoutAgents(conversationId) {
const rows = this.#db
.prepare(`SELECT persona_id, muted, mode FROM plan_lookout_agents WHERE conversation_id = ?`)
.all(conversationId);
return rows.map((r) => ({
persona_id: r.persona_id,
state: r.muted ? 'idle' : 'listening',
muted: !!r.muted,
mode: r.mode,
}));
}
// ── Sidebar signal management ─────────────────────────────────────────────
/**
* Dismiss a sidebar signal.
* @param {string} conversationId
* @param {string} signalId
* @returns {boolean}
*/
dismissSignal(conversationId, signalId) {
const info = this.#db
.prepare(`UPDATE plan_sidebar_signals SET status = 'dismissed' WHERE id = ? AND conversation_id = ?`)
.run(signalId, conversationId);
return info.changes > 0;
}
/**
* Mark a sidebar signal as used (for next turn context).
* @param {string} conversationId
* @param {string} signalId
* @returns {boolean}
*/
useSignal(conversationId, signalId) {
const info = this.#db
.prepare(`UPDATE plan_sidebar_signals SET status = 'used' WHERE id = ? AND conversation_id = ?`)
.run(signalId, conversationId);
return info.changes > 0;
}
/**
* List sidebar signals for a conversation (optionally filtered by chunk_id).
* @param {string} conversationId
* @param {string} [chunkId]
* @returns {Array<object>}
*/
listSignals(conversationId, chunkId) {
if (chunkId) {
return this.#db
.prepare(`SELECT * FROM plan_sidebar_signals WHERE conversation_id = ? AND chunk_id = ? ORDER BY created_at ASC`)
.all(conversationId, chunkId);
}
return this.#db
.prepare(`SELECT * FROM plan_sidebar_signals WHERE conversation_id = ? ORDER BY created_at ASC`)
.all(conversationId);
}
// ── Lookout analysis ──────────────────────────────────────────────────────
/**
* Persona-specific heuristics for lookout analysis.
* Returns { shouldComment: boolean, urgency, content, reason, tension_subject? }
* or null if no comment warranted.
*
* @param {string} personaId
* @param {string} content — chunk content to analyze
* @returns {{ urgency: string, content: string, reason: string, tension_subject?: string } | null}
*/
#analyzeChunkHeuristic(personaId, content) {
const lower = content.toLowerCase();
if (personaId === 'scout') {
// Scout: flag unsubstantiated claims and factual assertions
const claimPatterns = [
/\b(research shows|studies (show|indicate|suggest)|data (shows|indicates|suggests))\b/i,
/\b\d+(\.\d+)?\s*%\b/,
/\b(always|never|all|every|none|no one)\b/i,
/\b(proven|definitive|certain|guaranteed|undeniable)\b/i,
/\b(industry standard|best practice|widely accepted)\b/i,
];
const matched = claimPatterns.find((p) => p.test(content));
if (matched) {
return {
urgency: 'p2',
reason: 'Factual claim without cited source',
content: 'This response contains claims that should be verified with evidence. What data or sources back this up?',
};
}
// Scout spark: look for unexplored data angles
if (lower.includes('option') || lower.includes('approach') || lower.includes('strategy')) {
if (Math.random() < 0.3) {
return {
urgency: 'spark',
reason: 'Potential evidence gap',
content: '_What metrics or signals would tell us which option is actually better here?_',
};
}
}
}
if (personaId === 'pm') {
// PM: flag scope ambiguity and missing requirements
const scopePatterns = [
/\b(could|might|maybe|perhaps|possibly|potentially)\b/i,
/\b(later|eventually|someday|future)\b/i,
/\b(depends on|unclear|tbd|to be determined)\b/i,
];
const matched = scopePatterns.find((p) => p.test(content));
if (matched) {
return {
urgency: 'p1',
reason: 'Scope ambiguity detected',
content: 'Scope boundary needs clarification. What specifically is in vs. out for this iteration?',
};
}
// PM: flag missing success criteria
if ((lower.includes('implement') || lower.includes('build') || lower.includes('create')) && !lower.includes('success') && !lower.includes('metric') && !lower.includes('goal')) {
if (Math.random() < 0.4) {
return {
urgency: 'p2',
reason: 'Missing acceptance criteria',
content: 'What does done look like here? Define the measurable success criteria before building.',
};
}
}
}
if (personaId === 'brainstorm') {
// Brainstorm: flag premature convergence on a single option
const convergencePatterns = [
/\b(the (best|right|correct|only) (way|approach|solution|option))\b/i,
/\b(we should|we must|we need to|the answer is)\b/i,
/\b(obviously|clearly|simply|just)\b/i,
];
const matched = convergencePatterns.find((p) => p.test(content));
if (matched) {
return {
urgency: 'spark',
reason: 'Early convergence on one path',
content: '_Before narrowing: what\'s the alternative that explicitly rejects this approach? What would it look like?_',
};
}
}
return null;
}
/**
* Run lookout analysis for all active lookout agents after a turn completes.
* Emits sidebar_entry SSE events for any signals generated.
*
* @param {string} conversationId
* @param {string} chunkId — the turn ID used as chunk reference
* @param {string} chunkContent — the assistant's response text
* @param {string} chunkPersona — which persona produced the chunk
*/
async #runLookoutAnalysis(conversationId, chunkId, chunkContent, chunkPersona) {
const lookouts = this.#db
.prepare(`SELECT persona_id, muted FROM plan_lookout_agents WHERE conversation_id = ? AND muted = 0`)
.all(conversationId);
for (const lookout of lookouts) {
const { persona_id: personaId } = lookout;
// Skip if this is the persona that produced the chunk
if (personaId === chunkPersona) continue;
// Emit thinking state
this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: 'thinking' });
// Small async gap to let the SSE event reach the client before analysis
await new Promise((resolve) => setTimeout(resolve, 300 + Math.random() * 700));
try {
const analysis = this.#analyzeChunkHeuristic(personaId, chunkContent);
if (analysis) {
const signalId = randomUUID();
const now = Date.now();
this.#db
.prepare(
`INSERT INTO plan_sidebar_signals (id, conversation_id, agent_id, urgency, reason, content, chunk_id, created_at, status, tension_subject)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'active', ?)`,
)
.run(signalId, conversationId, personaId, analysis.urgency, analysis.reason, analysis.content, chunkId, now, analysis.tension_subject ?? null);
const signal = {
id: signalId,
agent_id: personaId,
urgency: analysis.urgency,
reason: analysis.reason,
content: analysis.content,
chunk_id: chunkId,
created_at: now,
status: 'active',
tension_subject: analysis.tension_subject ?? null,
parent_signal_id: null,
};
this.#broadcast(
conversationId,
analysis.urgency === 'p0' ? 'interrupt' : 'sidebar_entry',
{ signal },
);
}
} catch (err) {
console.error(`[plan] lookout analysis error ${conversationId}/${personaId}:`, err.message);
}
// Return to listening state
this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: 'listening' });
}
}
// ── Persona subscription setup ────────────────────────────────────────────
/**
* Wire pi event listeners for a session so tokens + turn_complete events are
* forwarded to SSE clients.
*
* @param {string} conversationId
* @param {string} personaId
* @param {import('@mariozechner/pi-coding-agent').AgentSession} session
*/
#wireSessionEvents(conversationId, personaId, session) {
session.subscribe((event) => {
if (event.type === 'message_update') {
const ae = event.assistantMessageEvent;
if (ae.type === 'text_delta') {
this.#broadcast(conversationId, 'token', { delta: ae.delta, persona: personaId });
}
} else if (event.type === 'agent_end') {
const msgs = event.messages;
const last = msgs[msgs.length - 1];
let persistedText = '';
let turnId = last?.id ?? randomUUID();
// Persist the assistant turn so history loads correctly.
if (last) {
const text = (last.content ?? [])
.filter((c) => c.type === 'text')
.map((c) => c.text ?? '')
.join('');
if (text) {
persistedText = text;
const now = Date.now();
this.#db
.prepare(
`INSERT OR IGNORE INTO plan_turns
(id, conversation_id, role, content, persona, created_at)
VALUES (?, ?, 'assistant', ?, ?, ?)`,
)
.run(turnId, conversationId, text, personaId, now);
}
}
this.#broadcast(conversationId, 'turn_complete', {
turnId,
persona: personaId,
finishReason: last?.stopReason ?? 'end_turn',
});
this.#db
.prepare(`UPDATE plan_conversations SET last_turn_at = ? WHERE id = ?`)
.run(Date.now(), conversationId);
// Trigger lookout analysis asynchronously — does not block the turn.
if (persistedText) {
this.#runLookoutAnalysis(conversationId, turnId, persistedText, personaId)
.catch((err) => console.error('[plan] lookout run error:', err.message));
}
}
});
}
// ── Public API ────────────────────────────────────────────────────────────
/**
* Submit a user turn to one or more personas in parallel.
* When mentionedPersonas contains 2+ IDs, each receives its own AgentSession
* and streams tokens with per-persona attribution via SSE `persona` field.
* Returns immediately; tokens stream over SSE.
*
* @param {{
* conversationId: string,
* content: string,
* personaOverride?: string,
* mentionedPersonas?: string[],
* }} params
* @returns {Promise<{ turnId: string, persona: string }>}
*/
async submitTurn({ conversationId, content, personaOverride, mentionedPersonas }) {
const row = this.#db
.prepare(`SELECT id, title FROM plan_conversations WHERE id = ?`)
.get(conversationId);
if (!row) {
const err = new Error('conversation not found');
err.code = 'NOT_FOUND';
throw err;
}
const now = Date.now();
// Determine which personas will receive this turn.
// Multi-persona: user @-mentioned 2+ personas explicitly.
// Single-persona: use explicit override or the conversation's active persona.
const personasToDispatch = (mentionedPersonas?.length ?? 0) > 1
? mentionedPersonas
: [personaOverride ?? this.getActivePersona(conversationId)];
// Open (or reset) the replay buffer; track how many turn_complete events are expected.
this.#activeTurns.set(conversationId, []);
this.#pendingPersonaCount.set(conversationId, personasToDispatch.length);
// Set title from first user message if still null.
if (row.title === null) {
this.#db
.prepare(`UPDATE plan_conversations SET title = ? WHERE id = ?`)
.run(deriveTitle(content), conversationId);
}
// Persist the user turn once (regardless of how many personas respond).
this.#db
.prepare(
`INSERT INTO plan_turns (id, conversation_id, role, content, created_at)
VALUES (?, ?, 'user', ?, ?)`,
)
.run(randomUUID(), conversationId, content, now);
const isSinglePersona = personasToDispatch.length === 1;
// Dispatch to each persona. For multi-persona turns, skip rope enrichment —
// the user explicitly chose all participants, so no auto-delegation is needed.
await Promise.all(personasToDispatch.map(async (pId) => {
const onStubCall = (event) => {
this.#broadcast(conversationId, 'tool_stub_call', { persona: pId, ...event });
};
// Refresh credential before each turn. MUST precede getAgent(): in cloud
// path-vault fallback mode #fetchToken points the registry at the localhost
// proxy base URL, and getAgent() bakes that base URL into the runtime's
// model when it first builds the (cached) runtime.
const token = await this.#fetchToken();
const { session, authStorage } = await this.#registry.getAgent(conversationId, pId, { onStubCall });
// Wire events on first use (idempotent because pi de-duplicates subscribers).
this.#wireSessionEvents(conversationId, pId, session);
authStorage.setRuntimeApiKey('anthropic', token);
let promptContent = content;
if (isSinglePersona) {
// Pre-turn autoRope enrichment (Phase D): run any persona's autoRope rules
// before the caller's session sees the content. Keeps pm_gathering_context
// backward-compat; rope_start/rope_complete are emitted by RopeEngine.
promptContent = await this.#ropeEngine.autoDelegateIfNeeded({
conversationId,
callerPersonaId: pId,
content,
fetchToken: () => this.#fetchToken(),
});
}
session.prompt(promptContent).catch((err) => {
console.error(`[plan] prompt error ${conversationId}/${pId}:`, err.message);
this.#broadcast(conversationId, 'error', {
message: err.message,
code: err.code ?? 'PROMPT_ERROR',
});
});
}));
const turnId = randomUUID();
return { turnId, persona: personasToDispatch[0] };
}
/**
* Execute a handoff, switching the default active persona.
*
* @param {{
* conversationId: string,
* toPersona: string,
* mode?: 'full' | 'distilled' | 'quoted',
* selectedTurnIds?: string[],
* }} params
* @returns {Promise<{ handoffId: string, forkNodeId: string | null, seededTurnCount: number }>}
*/
async handoff({ conversationId, toPersona, mode = 'full', selectedTurnIds = [] }) {
const row = this.#db
.prepare(`SELECT id FROM plan_conversations WHERE id = ?`)
.get(conversationId);
if (!row) {
const err = new Error('conversation not found');
err.code = 'NOT_FOUND';
throw err;
}
const fromPersona = this.getActivePersona(conversationId);
const onStubCall = (event) => {
this.#broadcast(conversationId, 'tool_stub_call', { persona: toPersona, ...event });
};
const result = await this.#handoffEngine.handoff({
conversationId,
fromPersona,
toPersona,
mode,
selectedTurnIds,
fetchToken: () => this.#fetchToken(),
onStubCall,
});
// Update the active persona for this conversation.
this.setActivePersona(conversationId, toPersona);
// Persist handoff marker so history replay can reconstruct it.
this.#db
.prepare(
`INSERT OR IGNORE INTO plan_turns
(id, conversation_id, role, content, from_persona, to_persona, mode, fork_node_id, created_at)
VALUES (?, ?, 'handoff', '', ?, ?, ?, ?, ?)`,
)
.run(result.handoffId, conversationId, fromPersona, toPersona, mode, result.forkNodeId ?? null, Date.now());
// Broadcast the handoff event to SSE clients.
this.#broadcast(conversationId, 'handoff', {
handoffId: result.handoffId,
fromPersona,
toPersona,
mode,
forkNodeId: result.forkNodeId,
});
// Wire events for the new persona's session.
try {
const { session } = await this.#registry.getAgent(conversationId, toPersona, { onStubCall });
this.#wireSessionEvents(conversationId, toPersona, session);
} catch {
// Best-effort — events will be wired on first turn if this fails.
}
return result;
}
/**
* Replay buffered in-flight SSE events to a reconnecting client.
* Call this before addEventSink so the client gets events it missed.
* No-op if no turn is active.
*
* @param {string} conversationId
* @param {import('node:http').ServerResponse} res
*/
drainReplayBuffer(conversationId, res) {
const buf = this.#activeTurns.get(conversationId);
if (!buf || buf.length === 0) return;
for (const { event, data } of buf) {
try {
res.write(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`);
} catch { /* client closed before drain completed */ }
}
}
/**
* Register an SSE sink for a conversation. Returns a cleanup function.
* @param {string} conversationId
* @param {import('node:http').ServerResponse} res
* @returns {() => void}
*/
addEventSink(conversationId, res) {
if (!this.#sinks.has(conversationId)) {
this.#sinks.set(conversationId, new Set());
}
this.#sinks.get(conversationId).add(res);
return () => {
const s = this.#sinks.get(conversationId);
if (s) s.delete(res);
};
}
/**
* Return the ordered turn list for a conversation (for history replay).
* Each turn is one of:
* { role:'user'|'assistant', content, persona?, created_at }
* { role:'handoff', from_persona, to_persona, mode, fork_node_id, created_at }
* @param {string} conversationId
* @returns {Array<object>}
*/
getTurns(conversationId) {
return this.#db
.prepare(
`SELECT id, role, content, persona, from_persona, to_persona, mode, fork_node_id, created_at
FROM plan_turns
WHERE conversation_id = ?
ORDER BY created_at ASC`,
)
.all(conversationId);
}
/** Expose persona list for the /api/plan/personas endpoint. */
listPersonas() {
return PERSONAS.map((p) => ({
id: p.id,
displayName: p.displayName,
model: p.model,
toolNames: p.toolNames,
systemPromptPreview: p.systemPrompt.length > 120
? p.systemPrompt.slice(0, 117) + '...'
: p.systemPrompt,
}));
}
}
/**
* Plan progress parser — reads phase-*-tasks.md trackers to derive
* phase/task state for the inbox progress bar.
*
* @module plan-progress
*/
import { readdirSync, readFileSync, statSync } from 'node:fs';
import path from 'node:path';
// Activity at most this old lets readPlanProgress report the first pending task as 'working'.
const WORKING_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes
/**
 * Parse simple `key: value` pairs from a leading YAML frontmatter block
 * (`---` … `---`). Handles single-line scalar values only — enough for
 * feature/phase keys. Lines that are not `key: value` are ignored.
 * Both LF and CRLF line endings are supported.
 *
 * @param {string} content
 * @returns {Record<string, string>} parsed pairs; empty when there is no frontmatter
 */
function parseFrontmatter(content) {
  const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---/);
  if (!match) return {};
  const result = {};
  // Split on either line-ending style. Splitting on '\n' alone leaves a
  // trailing '\r' on CRLF lines, and since `.` never matches '\r' the
  // `(.+)$` below would then reject every line but the last.
  for (const line of match[1].split(/\r?\n/)) {
    const m = line.match(/^([\w-]+):\s*(.+)$/);
    if (m) result[m[1]] = m[2].trim();
  }
  return result;
}
/**
 * Collect task definitions from everything that follows the "## Task list"
 * heading. A task is a level-3 heading whose first token is an uppercase
 * letter followed by digits, e.g.
 *   ### A0 — name
 *   ### B1 step 5 — multi-part name
 * A leading em-dash / hyphen separator is removed from the name; a heading
 * with no text after the id uses the id as its name.
 *
 * @param {string} content
 * @returns {Array<{id: string, name: string}>} tasks in document order
 */
function extractTaskDefs(content) {
  const section = content.match(/^## Task list\s*\n([\s\S]*)/m);
  if (!section) return [];
  const headings = section[1].matchAll(/^###\s+([A-Z]\d+)\b([^\n]*)/gm);
  return Array.from(headings, ([, id, tail]) => {
    // Strip leading em-dash, double-hyphen, or plain hyphen separator
    const label = tail.trim().replace(/^\s*[—\-]{1,2}\s*/, '').trim();
    return { id, name: label || id };
  });
}
/**
 * Collect completed task IDs from the first `<!-- CP0 log … -->` comment.
 * Inside that comment, every line that starts with a task id followed by an
 * opening parenthesis counts, e.g.
 *   A0 (2026-05-05): ...
 *   A2 (2026-05-05, rebase): ...
 *
 * @param {string} content
 * @returns {Set<string>} ids in order of appearance; empty when there is no log
 */
function extractCp0Completed(content) {
  const logBlock = content.match(/<!--\s*CP0 log[\s\S]*?-->/);
  if (!logBlock) return new Set();
  const entries = logBlock[0].matchAll(/^([A-Z]\d+)\s*\(/gm);
  return new Set(Array.from(entries, (entry) => entry[1]));
}
/**
 * Collect completed task IDs from an item-format table under "## Status".
 * A row counts when its third cell is `done` (case-insensitive), e.g.
 *   | A1 | Tool loader index | done |
 * Only the Status section is scanned: it ends at the next level-2 heading or
 * at the end of the document.
 *
 * @param {string} content
 * @returns {Set<string>} ids in order of appearance; empty without a Status section
 */
function extractItemTableCompleted(content) {
  const done = new Set();
  // Deliberately no `m` flag: `$` has to mean end-of-string, otherwise the
  // lazy capture would stop after the first table line.
  const section = content.match(/## Status\s*\n([\s\S]*?)(?=\n##\s|$)/);
  if (!section) return done;
  for (const row of section[1].matchAll(/^\|\s*([A-Z]\d+)\s*\|[^|]+\|\s*done\s*\|/gim)) {
    done.add(row[1]);
  }
  return done;
}
/**
 * Read the done count from a count-format table row, e.g.
 *   | done | 3 |
 * The first matching row anywhere in the content wins; `done` is matched
 * case-insensitively.
 *
 * @param {string} content
 * @returns {number|null} the count, or null when no such row exists
 */
function extractDoneCount(content) {
  const row = content.match(/\|\s*done\s*\|\s*(\d+)\s*\|/i);
  if (!row) return null;
  return Number.parseInt(row[1], 10);
}
/**
 * Resolve the feature slug from a branch name or by scanning docs/plans/.
 *
 * Strategy:
 *   1. Strip "feat/" prefix, any nested path, and an optional "-phase-X"
 *      suffix from the branch.
 *   2. Exact match against plans subdirectory names.
 *   3. Prefix match (branch slug starts with a plan dir name). When several
 *      directories qualify, the longest — i.e. most specific — name wins, so
 *      the result does not depend on directory listing order.
 *   4. Only when no branch is given: most-recently-modified plans dir that
 *      contains at least one phase tracker file.
 *
 * @param {string} repoPath - path to the git checkout
 * @param {string|null} branch
 * @returns {string|null} the plan directory name, or null when docs/plans is
 *   unreadable or nothing matches
 */
function resolveFeatureSlug(repoPath, branch) {
  const plansDir = path.join(repoPath, 'docs', 'plans');
  let entries;
  try {
    entries = readdirSync(plansDir, { withFileTypes: true })
      .filter((d) => d.isDirectory())
      .map((d) => d.name);
  } catch {
    return null;
  }
  if (branch) {
    // Strip feat/ prefix, any nested path, and trailing -phase-X suffix
    const slug = branch
      .replace(/^feat\//, '')
      .replace(/\/.*$/, '')
      .replace(/-phase-[a-z]$/, '');
    // Exact match
    if (entries.includes(slug)) return slug;
    // Prefix match: prefer the longest directory name. Taking the first hit
    // would let "plan" shadow "plan-chat" (or not) purely by readdir order.
    let bestPrefix = null;
    for (const dir of entries) {
      if (slug.startsWith(dir) && (bestPrefix === null || dir.length > bestPrefix.length)) {
        bestPrefix = dir;
      }
    }
    // Branch provided but no name match — don't guess (bestPrefix stays null).
    return bestPrefix;
  }
  // No branch: fallback to most-recently-modified dir with phase tracker files
  let newest = null;
  let newestMtime = 0;
  for (const dir of entries) {
    const dirPath = path.join(plansDir, dir);
    try {
      const files = readdirSync(dirPath);
      if (!files.some((f) => /^phase-[a-z]-tasks\.md$/.test(f))) continue;
      const mtime = statSync(dirPath).mtimeMs;
      if (mtime > newestMtime) {
        newestMtime = mtime;
        newest = dir;
      }
    } catch {
      // skip unreadable entries
    }
  }
  return newest;
}
/**
 * Turn one phase tracker file into a phase record with per-task status.
 *
 * Completion comes from the CP0 log and the item-format Status table; when a
 * count-format "done" row exists it takes precedence and the first N tasks
 * are treated as complete. The first incomplete task seen through the shared
 * `state` object (across all phases) is the only one that can be 'working',
 * and only when the world was recently active.
 *
 * @param {string} filePath
 * @param {boolean} isRecentlyActive - whether the world had recent activity
 * @param {{ workingMarked: boolean }} state - mutable flag shared across phases
 * @returns {{ id: string, name: string, status: string, tasks: Array }|null}
 *   null when the file cannot be read or defines no tasks
 */
function parseTrackerFile(filePath, isRecentlyActive, state) {
  let content;
  try {
    content = readFileSync(filePath, 'utf8');
  } catch {
    return null;
  }

  // Phase ID: frontmatter "phase" field or filename "phase-X-tasks.md"
  const frontmatter = parseFrontmatter(content);
  const fileNameMatch = path.basename(filePath).match(/^phase-([a-z])-tasks\.md$/);
  const phaseId = frontmatter.phase || fileNameMatch?.[1] || '?';

  const taskDefs = extractTaskDefs(content);
  if (taskDefs.length === 0) return null;

  // Completion sources: CP0 log + item table are merged; the count row overrides.
  const completedIds = new Set([
    ...extractCp0Completed(content),
    ...extractItemTableCompleted(content),
  ]);
  const doneCount = extractDoneCount(content);

  const tasks = [];
  taskDefs.forEach(({ id, name }, index) => {
    const isComplete = doneCount === null ? completedIds.has(id) : index < doneCount;
    let status = 'pending';
    if (isComplete) {
      status = 'complete';
    } else if (!state.workingMarked) {
      // First pending task across all phases = candidate for "working"
      state.workingMarked = true;
      if (isRecentlyActive) status = 'working';
    }
    tasks.push({ id, name, status });
  });

  let phaseStatus = 'pending';
  if (tasks.every((task) => task.status === 'complete')) {
    phaseStatus = 'complete';
  } else if (tasks.some((task) => task.status === 'working')) {
    phaseStatus = 'working';
  }
  return { id: phaseId, name: `Phase ${phaseId.toUpperCase()}`, status: phaseStatus, tasks };
}
/**
 * Build the plan progress summary for a world's git checkout.
 *
 * Resolves the feature directory under docs/plans/, parses every
 * `phase-<letter>-tasks.md` file in name order and returns the phases that
 * define at least one task.
 *
 * @param {string} repoPath - absolute path to the git checkout
 * @param {string|null} branch - current branch name (e.g. "feat/foo-phase-a")
 * @param {{ lastActivityAtMs?: number|null }} [opts] - `lastActivityAtMs` is
 *   compared against Date.now(); activity within WORKING_THRESHOLD_MS counts
 *   as recent
 * @returns {{ feature: string, phases: Array }|null}
 *   null when no plan tracker is found (caller falls back to legacy bar)
 */
export function readPlanProgress(repoPath, branch, { lastActivityAtMs = null } = {}) {
  const feature = resolveFeatureSlug(repoPath, branch);
  if (!feature) return null;

  const featureDir = path.join(repoPath, 'docs', 'plans', feature);
  let trackerFiles;
  try {
    const isTracker = (fileName) => /^phase-[a-z]-tasks\.md$/.test(fileName);
    trackerFiles = readdirSync(featureDir).filter(isTracker).sort();
  } catch {
    return null;
  }
  if (trackerFiles.length === 0) return null;

  // Without a timestamp the world is never considered recently active.
  const isRecentlyActive =
    lastActivityAtMs != null && Date.now() - lastActivityAtMs <= WORKING_THRESHOLD_MS;

  // Shared across phases so only one task overall can be marked "working".
  const state = { workingMarked: false };
  const phases = [];
  for (const fileName of trackerFiles) {
    const phase = parseTrackerFile(path.join(featureDir, fileName), isRecentlyActive, state);
    if (phase) phases.push(phase);
  }
  return phases.length === 0 ? null : { feature, phases };
}
// planning-sessions — host-cp surface for creating and inspecting in-flight
// planning sessions stored under world_id = PLANNING_WORLD_ID ('_planning').
//
// Formalises what the plan-chat-spa dev substrate does ad hoc:
//
// createPlanningSession({ actorId, pool })
// Seeds a session with one 'system' chunk so the Electric shape subscriber
// gets a non-empty initial response on its first long-poll cycle. Also
// INSERTs a row into the planning_sessions sidecar table inside the same
// transaction so no partial state can exist (chunk written, no metadata row).
// Returns the allocated world_id, session_id, and the inserted seed chunk.
//
// loadPlanningSession({ pool, sessionId })
// Lightweight metadata read: chunk count, first/last timestamps, first
// operator content (for title derivation). SPA still streams live chunks
// via the existing /v1/shape proxy — this is metadata-only.
//
// recordPlanningSession({ pool, sessionId, actorId, summary })
// UPSERT into planning_sessions. Used by createPlanningSession (wrapped in
// a transaction) and later to update the summary as the session evolves.
//
// setCrystallizeStatus({ pool, sessionId, status, worldId })
// UPDATE planning_sessions.crystallize_status + crystallized_world_id.
// Throws if status is not in PLANNING_SESSION_STATUSES.
//
// listPlanningSessions({ pool, actorId, limit })
// SELECT rows for actorId, ordered created_at DESC. Returns array.
//
// None of these functions call validateChunkInput — that's for the public POST
// surface. INSERTs here are built directly against the chunks column list.
//
// Pool errors surface loudly (never swallowed) so the caller sees the full
// pg error message and can diagnose connectivity or constraint failures.
import { randomUUID } from 'node:crypto';
import { PLANNING_WORLD_ID, PLANNING_SESSION_STATUSES } from '@olam/chunks/schema';
/**
 * Insert or update the planning_sessions row identified by sessionId.
 *
 * A new session_id inserts a fresh row. An existing one has its summary
 * overwritten and updated_at refreshed; linear_issue_id is only filled in
 * when the stored value is NULL, and no other column is touched.
 *
 * @param {object} opts
 * @param {object} opts.pool — anything exposing a pg-style `.query(sql, params)`
 * @param {string} opts.sessionId
 * @param {string} opts.actorId
 * @param {string | null} [opts.summary]
 * @param {string | null} [opts.linearIssueId] — LinearAgent (handoff principle 6):
 *   the Linear issue this session is driven by. Immutable once set — subsequent
 *   upserts preserve the recorded id (COALESCE), so an ordinary update (null)
 *   never clears a Linear link.
 * @returns {Promise<void>}
 */
export async function recordPlanningSession({ pool, sessionId, actorId, summary = null, linearIssueId = null }) {
  const upsertSql = `INSERT INTO planning_sessions (session_id, actor_id, summary, linear_issue_id)
VALUES ($1, $2, $3, $4)
ON CONFLICT (session_id) DO UPDATE
SET summary = EXCLUDED.summary,
updated_at = NOW(),
linear_issue_id = COALESCE(planning_sessions.linear_issue_id, EXCLUDED.linear_issue_id)`;
  const params = [sessionId, actorId, summary, linearIssueId];
  await pool.query(upsertSql, params);
}
/**
 * Look up the ACTIVE planning session for a Linear issue (handoff principle 6).
 * "Active" means `archived_at IS NULL`. When several rows qualify the most
 * recently created one is returned. A falsy `linearIssueId` short-circuits
 * to null without querying.
 *
 * A new dispatch for the same issue resumes the returned session; archived
 * runs are never matched, so an archived issue that is re-opened later
 * starts fresh.
 *
 * @param {object} opts
 * @param {object} opts.pool
 * @param {string | null | undefined} opts.linearIssueId
 * @returns {Promise<string | null>} the session_id, or null when none is active
 */
export async function findActiveLinearSession({ pool, linearIssueId }) {
  if (!linearIssueId) return null;
  const lookupSql = `SELECT session_id FROM planning_sessions
WHERE linear_issue_id = $1 AND archived_at IS NULL
ORDER BY created_at DESC
LIMIT 1`;
  const { rows } = await pool.query(lookupSql, [linearIssueId]);
  const newest = rows?.[0];
  return newest?.session_id ?? null;
}
/**
 * Archive every still-active planning session of a Linear issue (handoff
 * principle 6) by stamping `archived_at`, so that a later dispatch for the
 * same issue is NOT resumed and starts fresh. Meant to be called when the
 * Linear issue is archived. Idempotent: rows that already carry an
 * `archived_at` are left alone. A falsy `linearIssueId` returns 0 without
 * querying.
 *
 * @param {object} opts
 * @param {object} opts.pool
 * @param {string | null | undefined} opts.linearIssueId
 * @returns {Promise<number>} number of sessions archived by this call
 */
export async function archiveLinearSession({ pool, linearIssueId }) {
  if (!linearIssueId) return 0;
  const archiveSql = `UPDATE planning_sessions
SET archived_at = NOW(), updated_at = NOW()
WHERE linear_issue_id = $1 AND archived_at IS NULL`;
  const outcome = await pool.query(archiveSql, [linearIssueId]);
  const archivedCount = outcome?.rowCount;
  return archivedCount ?? 0;
}
/**
 * Create a new in-flight planning session under world_id='_planning'.
 *
 * Allocates a fresh session_id (UUID v4) and writes, in this order:
 *   1. a seed chunk (actor_type='system', seq=0) so the Electric shape
 *      subscriber receives a non-empty initial long-poll response;
 *   2. a planning_sessions sidecar row (via recordPlanningSession) so
 *      listPlanningSessions can return it immediately.
 *
 * When `pool.connect` is a function both writes run on one client inside
 * BEGIN/COMMIT, so they succeed or roll back together — a chunk without a
 * planning_sessions row is the partial state this prevents. Pools that only
 * implement `.query()` (test stubs) run the same two writes sequentially
 * without a transaction.
 *
 * Failure handling on the transactional path: the error that aborted the
 * transaction is always the one rethrown. If the ROLLBACK itself fails, that
 * failure is logged and the client is released with `destroy = true` so a
 * connection in an unknown transaction state is not handed back to the pool.
 *
 * @param {object} opts
 * @param {string} opts.actorId — actor_id to attribute the seed chunk to
 *                                (typically 'system' or the host-cp service id)
 * @param {object} opts.pool   — pg.Pool-compatible object with .query() and
 *                                optionally .connect() for transactional clients.
 * @returns {Promise<{
 *   world_id: string,
 *   session_id: string,
 *   seed_chunk: {
 *     world_id: string, session_id: string, message_id: string, seq: number,
 *     actor_id: string, actor_type: string, role: string, chunk: string,
 *     chunk_type: string,
 *   },
 * }>}
 */
export async function createPlanningSession({ actorId, pool }) {
  const sessionId = randomUUID();
  const messageId = randomUUID();
  const seq = 0;
  const actorType = 'system';
  const role = 'system';
  const chunk = 'Planning session created.';
  const chunkType = 'text';

  // Both writes, against whichever executor (transactional client or bare
  // pool) the caller's pool supports. Shared so the two paths cannot drift.
  const writeSeedAndSidecar = async (db) => {
    await db.query(
      `INSERT INTO chunks
(world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
      [PLANNING_WORLD_ID, sessionId, messageId, seq, actorId, actorType, role, chunk, chunkType],
    );
    await recordPlanningSession({ pool: db, sessionId, actorId, summary: null });
  };

  if (typeof pool.connect === 'function') {
    // Real pg.Pool: run both writes in one transaction on a dedicated client.
    const client = await pool.connect();
    let rollbackFailed = false;
    try {
      await client.query('BEGIN');
      await writeSeedAndSidecar(client);
      await client.query('COMMIT');
    } catch (err) {
      try {
        await client.query('ROLLBACK');
      } catch (rollbackErr) {
        // Never let a failed ROLLBACK replace the error that caused it.
        rollbackFailed = true;
        console.error('[planning-sessions] ROLLBACK failed:', rollbackErr?.message ?? rollbackErr);
      }
      throw err;
    } finally {
      if (rollbackFailed) {
        // Transaction state is unknown — discard the connection.
        client.release(true);
      } else {
        client.release();
      }
    }
  } else {
    // Flat path for test stubs: queries run sequentially on the stub pool.
    await writeSeedAndSidecar(pool);
  }

  return {
    world_id: PLANNING_WORLD_ID,
    session_id: sessionId,
    seed_chunk: {
      world_id: PLANNING_WORLD_ID,
      session_id: sessionId,
      message_id: messageId,
      seq,
      actor_id: actorId,
      actor_type: actorType,
      role,
      chunk,
      chunk_type: chunkType,
    },
  };
}
/**
 * Set crystallize_status and crystallized_world_id on a planning session and
 * refresh its updated_at. The status is validated before any query is sent.
 * Note that crystallized_world_id is always written, so omitting `worldId`
 * stores NULL.
 *
 * @param {object} opts
 * @param {object} opts.pool
 * @param {string} opts.sessionId
 * @param {string} opts.status — must be in PLANNING_SESSION_STATUSES
 * @param {string | null} [opts.worldId] — required when status='crystallized'
 *   (documented expectation; not enforced here)
 * @returns {Promise<void>}
 * @throws {Error} when `status` is not one of PLANNING_SESSION_STATUSES
 */
export async function setCrystallizeStatus({ pool, sessionId, status, worldId = null }) {
  const isKnownStatus = PLANNING_SESSION_STATUSES.includes(status);
  if (!isKnownStatus) {
    const allowed = PLANNING_SESSION_STATUSES.join(', ');
    throw new Error(`setCrystallizeStatus: invalid status "${status}"; must be one of ${allowed}`);
  }
  const updateSql = `UPDATE planning_sessions
SET crystallize_status = $2,
crystallized_world_id = $3,
updated_at = NOW()
WHERE session_id = $1`;
  await pool.query(updateSql, [sessionId, status, worldId]);
}
/**
 * Create a multi-turn DISPATCH session (multi-turn-cloud-sandbox-dispatch
 * Phase A2). Unlike createPlanningSession — which serves planning-flow
 * crystallization sessions under world_id='_planning' — this writes a
 * planning_sessions row with session_type='dispatch' and the caller's own
 * world_id, budget cap and unpriced-model flag.
 *
 * No seed chunk is written: dispatch sessions accumulate chunks from the
 * agent runtime (via /v1/chunks), and their Electric shape subscribers can
 * wait for the first real agent chunk.
 *
 * The insert uses ON CONFLICT (session_id) DO NOTHING, so calling this with
 * an id that already exists leaves the stored row untouched and still
 * resolves with that id.
 *
 * @param {object} opts
 * @param {object} opts.pool
 * @param {string} opts.actorId — non-empty string
 * @param {string} opts.worldId — operator-supplied; identifies the dispatch
 *   target world (NOT the '_planning' sentinel used by createPlanningSession).
 * @param {number | null} [opts.budgetUsdCap=null] — per-session budget cap;
 *   null = uncapped. When null AND `OLAM_SESSION_BUDGET_DEFAULT_USD` is set,
 *   the env-default applies at /v1/dispatch-turn check time (Phase D); here
 *   the row is recorded exactly as supplied.
 * @param {boolean} [opts.allowUnpricedModels=false] — opt session into the
 *   pricingForModel-returns-null fallback (Plan A T11 mitigation; default
 *   refuses unknown models with 502).
 * @param {string | null} [opts.sessionId=null] — optional pre-chosen session
 *   id; a UUID is generated when null or undefined
 * @returns {Promise<{ session_id: string }>}
 * @throws {Error} when actorId or worldId is missing or not a string
 */
export async function createDispatchSession({
  pool,
  actorId,
  worldId,
  budgetUsdCap = null,
  allowUnpricedModels = false,
  sessionId: providedSessionId = null,
}) {
  const isUsableId = (value) => typeof value === 'string' && value !== '';
  if (!isUsableId(actorId)) {
    throw new Error('createDispatchSession: actorId required');
  }
  if (!isUsableId(worldId)) {
    throw new Error('createDispatchSession: worldId required');
  }

  // A6 (Decision 9 always-on threading): callers MAY supply session_id to
  // target an existing planning_sessions row (e.g. /api/cloud-dispatch
  // pre-creating the thread before forwarding to plan-DO). ON CONFLICT DO
  // NOTHING covers the race where the SPA called /v1/sessions/create
  // concurrently AND server-side cloud-dispatch pre-created the same row.
  const session_id = providedSessionId ?? randomUUID();
  const insertSql = `INSERT INTO planning_sessions
(session_id, actor_id, session_type, world_id, budget_usd_cap, allow_unpriced_models)
VALUES ($1, $2, 'dispatch', $3, $4, $5)
ON CONFLICT (session_id) DO NOTHING`;
  await pool.query(insertSql, [session_id, actorId, worldId, budgetUsdCap, allowUnpricedModels]);
  return { session_id };
}
/**
 * Try to claim the in-flight turn lock of a dispatch session.
 *
 * A single UPDATE sets in_flight_turn_id (plus the started-at / last-turn
 * timestamps) only when the row is a 'dispatch' session whose
 * in_flight_turn_id is currently NULL, and RETURNs the session_id. A
 * returned row means this call won the lock; an empty result means the
 * session does not exist, is not a dispatch session, or the lock is held.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing an async query(sql, params)
 * @param {string} opts.sessionId
 * @param {string} opts.turnId — turn identifier to store as the lock holder
 * @returns {Promise<boolean>} true when the lock was claimed, false otherwise
 */
export async function claimDispatchTurnLock({ pool, sessionId, turnId }) {
  const claimSql = `UPDATE planning_sessions
SET in_flight_turn_id = $1,
in_flight_turn_started_at = NOW(),
last_turn_at = NOW()
WHERE session_id = $2
AND session_type = 'dispatch'
AND in_flight_turn_id IS NULL
RETURNING session_id`;
  const outcome = await pool.query(claimSql, [turnId, sessionId]);
  const updatedRows = outcome.rows?.length ?? 0;
  return updatedRows > 0;
}
/**
 * Release the in-flight turn lock of a dispatch session by resetting
 * in_flight_turn_id and in_flight_turn_started_at to NULL. Intended to be
 * called once a dispatch finishes, whether it succeeded or failed.
 *
 * The UPDATE is unconditional with respect to the current lock holder: it
 * clears whatever turn id is stored on the matching 'dispatch' row.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing an async query(sql, params)
 * @param {string} opts.sessionId
 * @returns {Promise<void>}
 */
export async function clearDispatchTurnLock({ pool, sessionId }) {
  const releaseSql = `UPDATE planning_sessions
SET in_flight_turn_id = NULL,
in_flight_turn_started_at = NULL
WHERE session_id = $1
AND session_type = 'dispatch'`;
  await pool.query(releaseSql, [sessionId]);
}
/**
 * Mark a dispatch session as halted ("block next turn").
 *
 * One UPDATE stamps halted_at with NOW() and also clears the in-flight turn
 * lock columns. This function only touches the database row; it does not
 * interact with any running container.
 *
 * The statement is scoped to session_type='dispatch' AND the given actor_id,
 * so a session owned by another actor is never modified.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing an async query(sql, params)
 * @param {string} opts.sessionId
 * @param {string} opts.actorId — owner the session must belong to
 * @returns {Promise<boolean>} true when a row was updated; false when no
 *   matching session exists for this actor
 */
export async function haltDispatchSession({ pool, sessionId, actorId }) {
  const haltSql = `UPDATE planning_sessions
SET halted_at = NOW(),
in_flight_turn_id = NULL,
in_flight_turn_started_at = NULL
WHERE session_id = $1
AND session_type = 'dispatch'
AND actor_id = $2
RETURNING session_id`;
  const outcome = await pool.query(haltSql, [sessionId, actorId]);
  const touched = outcome.rows?.length ?? 0;
  return touched > 0;
}
/**
 * Lift a halt from a dispatch session by setting halted_at back to NULL.
 *
 * Running it against a session that is not halted simply rewrites NULL with
 * NULL, so repeated calls are harmless. The UPDATE is scoped to
 * session_type='dispatch' AND the given actor_id.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing an async query(sql, params)
 * @param {string} opts.sessionId
 * @param {string} opts.actorId — owner the session must belong to
 * @returns {Promise<boolean>} true when a row was updated; false when no
 *   matching session exists for this actor
 */
export async function reactivateDispatchSession({ pool, sessionId, actorId }) {
  const reactivateSql = `UPDATE planning_sessions
SET halted_at = NULL
WHERE session_id = $1
AND session_type = 'dispatch'
AND actor_id = $2
RETURNING session_id`;
  const outcome = await pool.query(reactivateSql, [sessionId, actorId]);
  const touched = outcome.rows?.length ?? 0;
  return touched > 0;
}
/**
 * Fetch one dispatch session, restricted to rows owned by the caller.
 *
 * The lookup filters on session_id, session_type='dispatch' and actor_id, so
 * "not found" and "owned by someone else" are indistinguishable: both yield
 * null. Values are normalised before being returned: total_usd and
 * budget_usd_cap are passed through Number() (a missing total becomes 0, a
 * missing cap stays null), allow_unpriced_models is coerced to a boolean,
 * and absent world_id / halted_at become null.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing an async query(sql, params)
 * @param {string} opts.sessionId
 * @param {string} opts.actorId
 * @returns {Promise<null | {
 *   session_id: string,
 *   world_id: string | null,
 *   actor_id: string,
 *   total_usd: number,
 *   budget_usd_cap: number | null,
 *   allow_unpriced_models: boolean,
 *   halted_at: string | null,
 * }>}
 */
export async function getDispatchSession({ pool, sessionId, actorId }) {
  const selectSql = `SELECT session_id, world_id, actor_id,
total_usd, budget_usd_cap, allow_unpriced_models,
halted_at
FROM planning_sessions
WHERE session_id = $1
AND session_type = 'dispatch'
AND actor_id = $2`;
  const outcome = await pool.query(selectSql, [sessionId, actorId]);

  const found = outcome.rows?.[0];
  if (!found) {
    return null;
  }

  const rawCap = found.budget_usd_cap;
  // `== null` covers both null and undefined, mirroring the two explicit checks.
  const budgetCap = rawCap == null ? null : Number(rawCap);

  return {
    session_id: found.session_id,
    world_id: found.world_id ?? null,
    actor_id: found.actor_id,
    total_usd: Number(found.total_usd ?? 0),
    budget_usd_cap: budgetCap,
    allow_unpriced_models: Boolean(found.allow_unpriced_models),
    halted_at: found.halted_at ?? null,
  };
}
/**
 * Return the sessions stored for one actor, newest first (created_at DESC),
 * capped at `limit` rows. The query filters on actor_id only; rows are handed
 * back exactly as the driver produced them.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing an async query(sql, params)
 * @param {string} opts.actorId
 * @param {number} [opts.limit=50] — maximum number of rows
 * @returns {Promise<Array<{
 *   session_id: string,
 *   summary: string | null,
 *   crystallize_status: string,
 *   crystallized_world_id: string | null,
 *   created_at: string,
 *   updated_at: string,
 * }>>}
 */
export async function listPlanningSessions({ pool, actorId, limit = 50 }) {
  const listSql = `SELECT session_id, summary, crystallize_status, crystallized_world_id,
created_at, updated_at
FROM planning_sessions
WHERE actor_id = $1
ORDER BY created_at DESC
LIMIT $2`;
  const { rows } = await pool.query(listSql, [actorId, limit]);
  return rows;
}
/**
 * Return an actor's non-archived DISPATCH sessions, most recently active
 * first: ordered by last_turn_at DESC with NULLs last, ties broken by
 * created_at DESC, capped at `limit` rows.
 *
 * In contrast to listPlanningSessions, only session_type='dispatch' rows with
 * archived_at IS NULL are selected, and the projection carries the
 * multi-turn columns (total_usd, budget_usd_cap, in_flight_turn_id,
 * halted_at, last_turn_at). Rows are returned as produced by the driver,
 * without numeric coercion.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing an async query(sql, params)
 * @param {string} opts.actorId
 * @param {number} [opts.limit=50] — maximum number of rows
 * @returns {Promise<Array<{
 *   session_id: string,
 *   world_id: string | null,
 *   total_usd: string,
 *   budget_usd_cap: string | null,
 *   in_flight_turn_id: string | null,
 *   halted_at: string | null,
 *   last_turn_at: string | null,
 *   created_at: string,
 *   summary: string | null,
 * }>>}
 */
export async function listDispatchSessions({ pool, actorId, limit = 50 }) {
  const listSql = `SELECT session_id, world_id,
total_usd, budget_usd_cap,
in_flight_turn_id, halted_at,
last_turn_at, created_at,
summary
FROM planning_sessions
WHERE actor_id = $1
AND session_type = 'dispatch'
AND archived_at IS NULL
ORDER BY last_turn_at DESC NULLS LAST, created_at DESC
LIMIT $2`;
  const { rows } = await pool.query(listSql, [actorId, limit]);
  return rows;
}
/**
 * Summarise the chunks recorded for a planning session.
 *
 * Both queries are restricted to world_id = PLANNING_WORLD_ID and the given
 * session_id:
 *   1. An aggregate over `chunks` yielding the chunk count plus the earliest
 *      and latest created_at.
 *   2. Only when at least one chunk exists: the earliest chunk whose
 *      actor_type is 'operator'.
 *
 * The function is read-only and reports `exists: false` (with null
 * timestamps and content) when the session has no chunks.
 *
 * @param {object} opts
 * @param {object} opts.pool — pg.Pool-compatible object with a .query(sql, params) method
 * @param {string} opts.sessionId — id of the planning session to inspect
 * @returns {Promise<{
 *   world_id: string,
 *   session_id: string,
 *   exists: boolean,
 *   chunk_count: number,
 *   first_chunk_at: string | null,
 *   last_chunk_at: string | null,
 *   first_operator_content: string | null,
 * }>}
 */
export async function loadPlanningSession({ pool, sessionId }) {
  const scopeParams = () => [PLANNING_WORLD_ID, sessionId];

  const aggregateSql = `SELECT COUNT(*) AS chunk_count,
MIN(created_at) AS first_chunk_at,
MAX(created_at) AS last_chunk_at
FROM chunks
WHERE world_id = $1 AND session_id = $2`;
  const aggregate = (await pool.query(aggregateSql, scopeParams())).rows[0];

  // COUNT(*) may arrive as a string from the driver; normalise to a number.
  const chunkCount = Number(aggregate.chunk_count);
  const exists = chunkCount > 0;

  let firstOperatorContent = null;
  if (exists) {
    const firstOperatorSql = `SELECT chunk
FROM chunks
WHERE world_id = $1 AND session_id = $2 AND actor_type = 'operator'
ORDER BY created_at ASC
LIMIT 1`;
    const operatorRows = (await pool.query(firstOperatorSql, scopeParams())).rows;
    firstOperatorContent = operatorRows.length > 0 ? operatorRows[0].chunk : null;
  }

  return {
    world_id: PLANNING_WORLD_ID,
    session_id: sessionId,
    exists,
    chunk_count: chunkCount,
    first_chunk_at: exists ? aggregate.first_chunk_at : null,
    last_chunk_at: exists ? aggregate.last_chunk_at : null,
    first_operator_content: firstOperatorContent,
  };
}
/**
* port-bridge-manager.mjs
* Manages socat sidecar containers that bridge host port → world devbox port.
* Dual-mode: Docker HTTP API (container) vs docker CLI (bare-node).
*/
import { spawnSync } from 'node:child_process';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
// Docker endpoint. When the DOCKER_HOST env var is unset, the sentinel
// 'docker-cli' is used; dockerApiBase() treats that sentinel as "no HTTP API"
// and the docker CLI is used instead.
const DOCKER_HOST = process.env.DOCKER_HOST ?? 'docker-cli';
// Image reference used when creating bridge containers.
const SOCAT_IMAGE = 'alpine/socat';
// Explicitly tagged reference used by the CLI `docker pull` fallback.
const SOCAT_IMAGE_TAGGED = 'alpine/socat:latest';
// Inclusive host-port range scanned by allocateHostPort().
const HOST_PORT_MIN = 25000;
const HOST_PORT_MAX = 25999;
// Container ports that exposePort() refuses to bridge.
const INFRA_PORTS = new Set([8080, 7681, 7682]);
// Location of the persisted bridge registry (JSON). Overridable through the
// OLAM_PORT_BRIDGES_PATH env var and reassigned at runtime by configure().
let BRIDGES_PATH =
process.env.OLAM_PORT_BRIDGES_PATH ??
path.join(os.homedir(), '.olam', 'port-bridges.json');
// Host address used for port bindings and in returned bridge URLs;
// reassigned at runtime by configure().
let HOST_IP = '127.0.0.1';
// In-memory bridge registry, persisted by saveState() and filled by loadState().
// key: `${worldId}:${containerPort}` → { worldId, containerPort, hostPort, containerId, containerName }
const registry = new Map();
/**
 * Override module-level settings at runtime.
 *
 * A truthy `bridgesPath` that differs from the current BRIDGES_PATH replaces
 * it and triggers loadState() against the new file. A truthy `hostIp`
 * replaces HOST_IP. Falsy or unchanged values are ignored.
 *
 * @param {object} opts
 * @param {string} [opts.bridgesPath] — new location of the bridge state file
 * @param {string} [opts.hostIp] — new host address for bindings and URLs
 */
export function configure({ bridgesPath, hostIp }) {
  const pathChanged = Boolean(bridgesPath) && bridgesPath !== BRIDGES_PATH;
  if (pathChanged) {
    BRIDGES_PATH = bridgesPath;
    loadState();
  }

  if (hostIp) {
    HOST_IP = hostIp;
  }
}
/**
 * Build the registry key identifying one bridge: "<worldId>:<containerPort>".
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @returns {string}
 */
function bridgeKey(worldId, containerPort) {
  const key = `${worldId}:${containerPort}`;
  return key;
}
/**
 * Derive the container name used for a world's bridge on a given port:
 * "olam-<worldId>-bridge-<containerPort>".
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @returns {string}
 */
function bridgeContainerName(worldId, containerPort) {
  const containerName = `olam-${worldId}-bridge-${containerPort}`;
  return containerName;
}
/**
 * Merge the bridge entries persisted at BRIDGES_PATH into the in-memory
 * registry.
 *
 * A missing file, or file content that does not parse to an object, is
 * silently ignored. Entries are added on top of whatever the registry
 * already holds (the registry is not cleared first). Any read/parse error is
 * logged to stderr and otherwise swallowed, so this never throws.
 */
function loadState() {
  try {
    if (!fs.existsSync(BRIDGES_PATH)) {
      return;
    }
    const persisted = JSON.parse(fs.readFileSync(BRIDGES_PATH, 'utf-8'));
    if (!persisted || typeof persisted !== 'object') {
      return;
    }
    Object.entries(persisted).forEach(([key, entry]) => {
      registry.set(key, entry);
    });
  } catch (err) {
    console.error(`port-bridge-manager: loadState failed: ${err.message}`);
  }
}
/**
 * Persist the in-memory registry to BRIDGES_PATH as pretty-printed JSON.
 *
 * The parent directory is created when needed. The content is first written
 * to a sibling temp file (suffixed with the pid and a timestamp) and then
 * renamed over the target, so readers do not observe a half-written file.
 * Failures are logged to stderr and swallowed; this never throws.
 */
function saveState() {
  try {
    fs.mkdirSync(path.dirname(BRIDGES_PATH), { recursive: true });

    const snapshot = {};
    registry.forEach((entry, key) => {
      snapshot[key] = entry;
    });

    const tempPath = `${BRIDGES_PATH}.tmp-${process.pid}-${Date.now()}`;
    const serialized = JSON.stringify(snapshot, null, 2);
    fs.writeFileSync(tempPath, serialized, 'utf-8');
    fs.renameSync(tempPath, BRIDGES_PATH);
  } catch (err) {
    console.error(`port-bridge-manager: saveState failed: ${err.message}`);
  }
}
/**
 * Pick the lowest port in [HOST_PORT_MIN, HOST_PORT_MAX] that no registry
 * entry currently claims as its hostPort. Only the in-memory registry is
 * consulted; the port is not probed on the host.
 *
 * @returns {number | null} a free port, or null when the range is exhausted
 */
function allocateHostPort() {
  const taken = new Set();
  for (const { hostPort } of registry.values()) {
    taken.add(hostPort);
  }

  let candidate = HOST_PORT_MIN;
  while (candidate <= HOST_PORT_MAX) {
    if (!taken.has(candidate)) {
      return candidate;
    }
    candidate += 1;
  }
  return null;
}
/**
 * Resolve the base URL of the Docker HTTP API.
 *
 * @returns {Promise<string | null>} null when DOCKER_HOST is the 'docker-cli'
 *   sentinel (bare-node mode: use the docker CLI); otherwise DOCKER_HOST with
 *   a leading "tcp://" rewritten to "http://"
 */
async function dockerApiBase() {
  if (DOCKER_HOST === 'docker-cli') {
    // bare-node: no socket proxy HTTP API
    return null;
  }
  return DOCKER_HOST.replace(/^tcp:\/\//, 'http://');
}
/**
 * Decide whether a docker error message means "the image is not available
 * locally", i.e. a pull followed by a retry is worth attempting. Several
 * phrasings are recognised (case-insensitively) because the CLI and the HTTP
 * API word this failure differently.
 *
 * @param {string} message — stderr or response body text; falsy yields false
 * @returns {boolean}
 */
function isImageMissingError(message) {
  const missingImagePattern =
    /Unable to find image|pull access denied|manifest unknown|No such image|not found in (the )?(repository|registry)/i;
  return Boolean(message) && missingImagePattern.test(message);
}
/**
 * Run `docker pull` for SOCAT_IMAGE_TAGGED through the docker CLI
 * (synchronously, with a 60 s timeout). Used as the bare-node fallback when
 * creating a bridge fails because the image is missing.
 *
 * @returns {{ok: boolean, stderr: string}} ok is true when the command exited
 *   with status 0; stderr carries the trimmed stderr text, or the trimmed
 *   stdout when stderr is empty
 */
function pullSocatViaCli() {
  const { status, stderr, stdout } = spawnSync('docker', ['pull', SOCAT_IMAGE_TAGGED], {
    encoding: 'utf-8',
    timeout: 60_000,
  });
  const errorText = (stderr ?? '').trim();
  const outputText = (stdout ?? '').trim();
  return {
    ok: status === 0,
    stderr: errorText || outputText,
  };
}
/**
 * Pull SOCAT_IMAGE (tag "latest") through the Docker HTTP API. Used as the
 * container-mode fallback when creating a bridge fails because the image is
 * missing. The request is aborted after 60 s.
 *
 * On a successful response the body is read to completion before reporting
 * success, since the endpoint streams pull progress. Every failure,
 * including network errors and timeouts, is reported through the result
 * object; this function does not throw.
 *
 * @param {string} apiBase — Docker HTTP API base URL
 * @returns {Promise<{ok: boolean, stderr: string}>}
 */
async function pullSocatViaHttpApi(apiBase) {
  try {
    const pullUrl = `${apiBase}/images/create?fromImage=${encodeURIComponent(SOCAT_IMAGE)}&tag=latest`;
    const resp = await fetch(pullUrl, {
      method: 'POST',
      signal: AbortSignal.timeout(60_000),
    });

    if (resp.ok) {
      // Drain the streaming progress body so the pull runs to completion.
      await resp.text();
      return { ok: true, stderr: '' };
    }

    const detail = await resp.text().catch(() => '');
    return { ok: false, stderr: `pull failed: ${resp.status} ${detail}` };
  } catch (err) {
    return { ok: false, stderr: err?.message ?? String(err) };
  }
}
/**
 * Create and start the socat container that bridges `hostPort` on HOST_IP to
 * `containerPort` of the world's devbox container (`olam-<worldId>-devbox`)
 * on the world's docker network (`olam-<worldId>`).
 *
 * Two execution modes, chosen by dockerApiBase():
 *   - bare-node (no API base): a single `docker run -d ...` via the CLI.
 *   - container mode: Docker HTTP API create + start.
 *
 * In both modes, a failure that looks like "image missing" triggers one pull
 * of the socat image followed by one retry; `pulledImage` reports whether
 * that fallback ran. In container mode, a 409 on create (name already taken)
 * is resolved by inspecting the existing container and returning its id
 * without issuing a start request.
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @param {number} hostPort
 * @returns {Promise<{containerId: string, pulledImage: boolean}>}
 * @throws {Error} when the pull fallback fails, or docker run / create /
 *   start reports an error
 */
async function createBridgeContainer(worldId, containerPort, hostPort) {
  const name = bridgeContainerName(worldId, containerPort);
  const networkName = `olam-${worldId}`;
  const devboxName = `olam-${worldId}-devbox`;
  const socatCmd = `TCP-LISTEN:${containerPort},fork,reuseaddr TCP:${devboxName}:${containerPort}`;
  const apiBase = await dockerApiBase();

  if (!apiBase) {
    // ── bare-node mode: drive the docker CLI ───────────────────────────────
    const runArgs = [
      'run', '-d',
      '--name', name,
      '--network', networkName,
      '-p', `${HOST_IP}:${hostPort}:${containerPort}`,
      '--label', `olam.world.id=${worldId}`,
      '--label', 'olam.role=server-bridge',
      '--restart', 'unless-stopped',
      SOCAT_IMAGE,
      `TCP-LISTEN:${containerPort},fork,reuseaddr`,
      `TCP:${devboxName}:${containerPort}`,
    ];
    const dockerRun = () => spawnSync('docker', runArgs, { encoding: 'utf-8', timeout: 10000 });

    let run = dockerRun();
    let pulledViaCli = false;

    // Missing-image fallback: pull once, then retry the run once.
    const imageMissing = run.status !== 0 && isImageMissingError(run.stderr ?? '');
    if (imageMissing) {
      const pull = pullSocatViaCli();
      if (!pull.ok) {
        throw new Error(
          `alpine/socat image missing and pull failed: ${pull.stderr || 'unknown error'}`,
        );
      }
      pulledViaCli = true;
      run = dockerRun();
    }

    if (run.status !== 0) {
      throw new Error(run.stderr?.trim() || 'docker run failed');
    }
    return { containerId: run.stdout.trim(), pulledImage: pulledViaCli };
  }

  // ── container mode: Docker HTTP API ──────────────────────────────────────
  const portBindings = {
    [`${containerPort}/tcp`]: [{ HostIp: HOST_IP, HostPort: String(hostPort) }],
  };
  const createPayload = JSON.stringify({
    Image: SOCAT_IMAGE,
    Cmd: socatCmd.split(' '),
    Labels: {
      'olam.world.id': worldId,
      'olam.role': 'server-bridge',
    },
    HostConfig: {
      NetworkMode: networkName,
      PortBindings: portBindings,
      RestartPolicy: { Name: 'unless-stopped' },
    },
  });
  const requestCreate = () =>
    fetch(`${apiBase}/containers/create?name=${encodeURIComponent(name)}`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: createPayload,
      signal: AbortSignal.timeout(10000),
    });

  let createResp = await requestCreate();
  let pulledImage = false;

  // A 404 on create is expected to carry a "No such image" style body when
  // the image is absent; any other 404 is a hard failure.
  if (!createResp.ok && createResp.status === 404) {
    const notFoundBody = await createResp.text().catch(() => '');
    if (!isImageMissingError(notFoundBody)) {
      throw new Error(`container create failed: 404 ${notFoundBody}`);
    }
    const pull = await pullSocatViaHttpApi(apiBase);
    if (!pull.ok) {
      throw new Error(
        `alpine/socat image missing and pull failed: ${pull.stderr || 'unknown error'}`,
      );
    }
    pulledImage = true;
    createResp = await requestCreate();
  }

  if (!createResp.ok) {
    const failureBody = await createResp.text().catch(() => '');
    // 409: a container with this name already exists — reuse its id.
    if (createResp.status === 409) {
      const inspectResp = await fetch(
        `${apiBase}/containers/${encodeURIComponent(name)}/json`,
        { signal: AbortSignal.timeout(3000) },
      );
      if (inspectResp.ok) {
        const existing = await inspectResp.json();
        return { containerId: existing.Id, pulledImage };
      }
    }
    throw new Error(`container create failed: ${createResp.status} ${failureBody}`);
  }

  const created = await createResp.json();
  const containerId = created.Id;
  const startResp = await fetch(
    `${apiBase}/containers/${encodeURIComponent(containerId)}/start`,
    { method: 'POST', signal: AbortSignal.timeout(5000) },
  );
  // 304 means the container was already started, which is fine.
  if (!startResp.ok && startResp.status !== 304) {
    throw new Error(`container start failed: ${startResp.status}`);
  }
  return { containerId, pulledImage };
}
/**
 * Force-remove a bridge container, best-effort.
 *
 * The container is addressed by `containerId` when truthy, otherwise by
 * `containerName`. In bare-node mode this runs `docker rm -f`; in container
 * mode it issues a DELETE with force=true against the Docker HTTP API.
 * Request failures in container mode are deliberately ignored, and the CLI
 * exit status is not inspected.
 *
 * @param {string} containerName
 * @param {string} [containerId]
 * @returns {Promise<void>}
 */
async function removeBridgeContainer(containerName, containerId) {
  const target = containerId || containerName;
  const apiBase = await dockerApiBase();

  if (apiBase) {
    // Force remove (stop + delete in one call)
    const deleteUrl = `${apiBase}/containers/${encodeURIComponent(target)}?force=true`;
    await fetch(deleteUrl, {
      method: 'DELETE',
      signal: AbortSignal.timeout(5000),
    }).catch(() => { /* best-effort */ });
    return;
  }

  spawnSync('docker', ['rm', '-f', target], { encoding: 'utf-8', timeout: 5000 });
}
/**
 * Make a world's container port reachable from the host through a socat
 * bridge container.
 *
 * Idempotent per (worldId, containerPort): when the registry already holds a
 * bridge for that pair, its details are returned and nothing is created.
 * Otherwise a host port is allocated, the bridge container is created, and
 * the new entry is stored in the registry and persisted.
 *
 * `pulledImage: true` is present on the result only when creating the
 * container required pulling the socat image; the key is omitted otherwise
 * (and always omitted for an already-registered bridge).
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @returns {Promise<{hostPort: number, containerPort: number, url: string, containerId: string, pulledImage?: boolean}>}
 * @throws {Error} when the port is reserved for infrastructure, no host port
 *   is free, or the bridge container cannot be created
 */
export async function exposePort(worldId, containerPort) {
  if (INFRA_PORTS.has(containerPort)) {
    throw new Error(`port ${containerPort} is reserved for infrastructure`);
  }

  const describe = (boundHostPort, boundContainerPort, id) => ({
    hostPort: boundHostPort,
    containerPort: boundContainerPort,
    url: `http://${HOST_IP}:${boundHostPort}`,
    containerId: id,
  });

  const key = bridgeKey(worldId, containerPort);
  const known = registry.get(key);
  if (known) {
    return describe(known.hostPort, known.containerPort, known.containerId);
  }

  const hostPort = allocateHostPort();
  if (hostPort === null) {
    throw new Error('no host ports available in range 25000–25999');
  }

  const containerName = bridgeContainerName(worldId, containerPort);
  const created = await createBridgeContainer(worldId, containerPort, hostPort);
  const { containerId } = created;

  registry.set(key, { worldId, containerPort, hostPort, containerId, containerName });
  saveState();

  const bridge = describe(hostPort, containerPort, containerId);
  // The flag is attached only when true so callers that never pulled do not
  // see an extra key on the result.
  if (created.pulledImage) {
    bridge.pulledImage = true;
  }
  return bridge;
}
/**
 * Tear down the bridge for one (worldId, containerPort) pair.
 *
 * The registry entry is dropped and the state persisted before the container
 * removal is attempted. When no bridge is registered for the pair, nothing
 * happens.
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @returns {Promise<void>}
 */
export async function removePort(worldId, containerPort) {
  const key = bridgeKey(worldId, containerPort);
  const bridge = registry.get(key);
  if (bridge) {
    registry.delete(key);
    saveState();
    const { containerName, containerId } = bridge;
    await removeBridgeContainer(containerName, containerId);
  }
}
/**
 * Tear down every bridge registered for a world (used when the world is
 * destroyed).
 *
 * Matching entries are collected first, then handled one at a time: each is
 * removed from the registry and its container removal is attempted, ignoring
 * failures. The state file is rewritten once at the end, and only when at
 * least one bridge was removed.
 *
 * @param {string} worldId
 * @returns {Promise<void>}
 */
export async function killWorld(worldId) {
  const doomed = [...registry].filter(([, bridge]) => bridge.worldId === worldId);

  for (const [key, bridge] of doomed) {
    registry.delete(key);
    await removeBridgeContainer(bridge.containerName, bridge.containerId).catch(() => {});
  }

  if (doomed.length > 0) {
    saveState();
  }
}
/**
 * Report the bridges currently registered for a world, in registry order.
 * URLs are built from the current HOST_IP at call time.
 *
 * @param {string} worldId
 * @returns {Array<{containerPort: number, hostPort: number, url: string}>}
 */
export function getWorldBridges(worldId) {
  return Array.from(registry.values())
    .filter((bridge) => bridge.worldId === worldId)
    .map(({ containerPort, hostPort }) => ({
      containerPort,
      hostPort,
      url: `http://${HOST_IP}:${hostPort}`,
    }));
}
// Module-load side effect: fill the in-memory registry from the state file
// at BRIDGES_PATH (a missing or unreadable file leaves the registry empty).
loadState();
/**
* In-memory cache for GitHub PR data with TTL and concurrent-fetch coalescing.
*
* @module pr-cache
*/
// Base URL prepended to every GitHub REST API path requested by this module.
const GH_API_BASE = 'https://api.github.com';
// How long (in milliseconds) a cached PR entry is served without refetching.
const TTL_MS = 30_000;
/**
 * Extract the repository coordinates from a GitHub pull-request URL.
 *
 * The pattern is searched for anywhere in the string, so trailing path
 * segments (e.g. "/files") do not prevent a match.
 *
 * @param {string} prUrl e.g. https://github.com/owner/repo/pull/123
 * @returns {{ owner: string, repo: string, number: number } | null} null when
 *   the string does not contain a github.com/<owner>/<repo>/pull/<n> segment
 */
function parsePrUrl(prUrl) {
  const match = /github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/.exec(prUrl);
  if (match === null) {
    return null;
  }
  const [, owner, repo, prNumber] = match;
  return { owner, repo, number: parseInt(prNumber, 10) };
}
/**
 * Collapse a list of check runs into one overall status.
 *
 * Precedence: any run concluded as 'failure', 'timed_out' or
 * 'action_required' makes the result 'failing'; otherwise any run that is
 * 'queued', 'in_progress' or has no conclusion yet makes it 'pending';
 * otherwise the result is 'passing'. Every other conclusion (for example
 * 'success' or 'skipped') is treated as non-blocking.
 *
 * @param {Array<{conclusion: string|null, status: string}>} checkRuns
 * @returns {'pending'|'passing'|'failing'|null} null for a missing or empty list
 */
function reduceCheckRuns(checkRuns) {
  if (!checkRuns || checkRuns.length === 0) {
    return null;
  }

  const failedConclusions = new Set(['failure', 'timed_out', 'action_required']);
  const activeStatuses = new Set(['queued', 'in_progress']);

  let sawFailure = false;
  let sawPending = false;
  for (const { conclusion, status } of checkRuns) {
    if (failedConclusions.has(conclusion)) {
      sawFailure = true;
    } else if (activeStatuses.has(status) || conclusion === null) {
      sawPending = true;
    }
  }

  if (sawFailure) {
    return 'failing';
  }
  return sawPending ? 'pending' : 'passing';
}
/**
* @typedef {object} PrCacheEntry
* @property {number} fetchedAt
* @property {'open'|'merged'|'closed'|null} prState
* @property {number|null} prNumber
* @property {'pending'|'passing'|'failing'|null} prChecks
* @property {Promise<PrData>|null} promise
*/
/**
* @typedef {object} PrData
* @property {'open'|'merged'|'closed'|null} state
* @property {number|null} number
* @property {'pending'|'passing'|'failing'|null} checks
*/
/**
 * Fetch PR data from the GitHub REST API: PR state plus a reduced status of
 * the check runs on the PR's head commit.
 *
 * Steps:
 *   1. GET /repos/{owner}/{repo}/pulls/{number} — yields state, merged_at and
 *      the head SHA. A 'closed' PR with a merged_at timestamp is reported as
 *      'merged'.
 *   2. GET /repos/{owner}/{repo}/commits/{sha}/check-runs?per_page=100 — the
 *      runs are folded by reduceCheckRuns(). The page size is raised to the
 *      API maximum because the default page is much smaller; a failing run
 *      beyond the first page would otherwise be missed and the PR could be
 *      reported as passing. Runs beyond the first 100 are still not
 *      inspected.
 *
 * Each request is aborted after 10 s. A failure of the check-runs request is
 * non-fatal (checks stays null); a failure of the PR request itself
 * propagates as a rejection, except for non-2xx responses, which resolve
 * with null state/checks.
 *
 * @param {string} prUrl
 * @param {() => Promise<string|null>} getToken — resolves to a GitHub token,
 *   or null to send unauthenticated requests
 * @returns {Promise<PrData>}
 */
async function fetchPrData(prUrl, getToken) {
  const parsed = parsePrUrl(prUrl);
  if (!parsed) return { state: null, number: null, checks: null };

  const token = await getToken();
  /** @type {HeadersInit} */
  const headers = { Accept: 'application/vnd.github+json' };
  if (token) headers.Authorization = `token ${token}`;

  const repoBase = `${GH_API_BASE}/repos/${parsed.owner}/${parsed.repo}`;

  // Fetch PR metadata
  const prResp = await fetch(`${repoBase}/pulls/${parsed.number}`, {
    headers,
    signal: AbortSignal.timeout(10_000),
  });
  if (!prResp.ok) {
    return { state: null, number: parsed.number, checks: null };
  }
  const prData = await prResp.json();

  let state = prData.state ?? null;
  if (state === 'closed' && prData.merged_at) state = 'merged';

  const sha = prData.head?.sha ?? null;
  if (!sha) {
    return { state, number: parsed.number, checks: null };
  }

  // Fetch check runs for the head SHA, asking for the largest page the API
  // allows so that PRs with many checks are not judged on a partial list.
  let checks = null;
  try {
    const checksResp = await fetch(`${repoBase}/commits/${sha}/check-runs?per_page=100`, {
      headers,
      signal: AbortSignal.timeout(10_000),
    });
    if (checksResp.ok) {
      const checksData = await checksResp.json();
      const runs = Array.isArray(checksData.check_runs) ? checksData.check_runs : [];
      checks = reduceCheckRuns(runs);
    }
  } catch {
    // Non-fatal — return null checks
  }

  return { state, number: parsed.number, checks };
}
/**
 * Build an in-memory cache of GitHub PR data keyed by PR URL.
 *
 * Entries are considered fresh for TTL_MS after a successful fetch. While a
 * fetch for a URL is in flight, further lookups for the same URL wait on
 * that same request instead of starting another one.
 *
 * @returns {{ getPr: (prUrl: string, getToken: () => Promise<string|null>) => Promise<PrData|null>, deletePr: (prUrl: string) => void }}
 */
export function createPrCache() {
  /** @type {Map<string, PrCacheEntry>} */
  const cache = new Map();

  /** Resolve a pending fetch, mapping a rejection to null. */
  const settleOrNull = async (pending) => {
    try {
      return await pending;
    } catch {
      return null;
    }
  };

  /**
   * Look up PR data, serving from cache when fresh and sharing any request
   * already in flight. Never rejects: fetch failures resolve to null.
   *
   * @param {string} prUrl
   * @param {() => Promise<string|null>} getToken
   * @returns {Promise<PrData|null>} null for a falsy URL or a failed fetch
   */
  async function getPr(prUrl, getToken) {
    if (!prUrl) {
      return null;
    }

    const now = Date.now();
    const cached = cache.get(prUrl);

    if (cached) {
      // A request is already running for this URL — piggyback on it.
      if (cached.promise) {
        return settleOrNull(cached.promise);
      }
      // Settled entry that is still within its TTL.
      if (now - cached.fetchedAt < TTL_MS) {
        return { state: cached.prState, number: cached.prNumber, checks: cached.prChecks };
      }
    }

    // Nothing usable: start a fetch and record it before yielding, so that
    // concurrent callers find the in-flight promise.
    const recordSuccess = (data) => {
      cache.set(prUrl, {
        fetchedAt: Date.now(),
        prState: data.state,
        prNumber: data.number,
        prChecks: data.checks,
        promise: null,
      });
      return data;
    };
    const recordFailure = (err) => {
      // Drop the in-flight marker so the next lookup retries.
      const current = cache.get(prUrl);
      if (current && current.promise) {
        cache.set(prUrl, { ...current, promise: null });
      }
      throw err;
    };
    const inFlight = fetchPrData(prUrl, getToken).then(recordSuccess, recordFailure);

    // Keep the previous (stale) values alongside the in-flight promise.
    cache.set(prUrl, {
      fetchedAt: cached ? cached.fetchedAt : 0,
      prState: cached ? cached.prState : null,
      prNumber: cached ? cached.prNumber : null,
      prChecks: cached ? cached.prChecks : null,
      promise: inFlight,
    });

    return settleOrNull(inFlight);
  }

  /**
   * Forget the entry for a PR URL (for example when its world is destroyed).
   *
   * @param {string} prUrl
   */
  function deletePr(prUrl) {
    cache.delete(prUrl);
  }

  return { getPr, deletePr };
}
/**
* PR merge poller for auto-destroying worlds whose PR has merged.
*
* State machine per world:
* open -> merged (on GitHub reports merged)
* merged -> merged_destroyed (after grace period, if auto_destroy_on_merge)
*/
// Base URL prepended to every GitHub REST API path requested by the poller.
const GH_API_BASE = 'https://api.github.com';
/**
 * Pull owner, repository and PR number out of a GitHub pull-request URL.
 * The pattern may match anywhere inside the string.
 *
 * @param {string} prUrl e.g. https://github.com/org/repo/pull/123
 * @returns {{ owner: string, repo: string, number: number } | null} null when
 *   no github.com/<owner>/<repo>/pull/<n> segment is present
 */
function parsePrUrl(prUrl) {
  const prUrlPattern = /github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/;
  const captured = prUrlPattern.exec(prUrl);
  return captured
    ? { owner: captured[1], repo: captured[2], number: parseInt(captured[3], 10) }
    : null;
}
/**
 * Create a poller that watches worlds with an open PR and destroys a world
 * once its PR has merged and a grace period has elapsed.
 *
 * Per-world state transitions written to `prStateStore`:
 *   open   -> merged            (GitHub reports the PR closed with merged_at)
 *   merged -> merged_destroyed  (grace period elapsed; skipped when the
 *                                entry has auto_destroy_on_merge === false)
 *
 * `prStateStore` must provide get(worldId), set(worldId, patch) and
 * getWorldsToWatch() (entries carrying worldId, pr_url, pr_state and
 * optionally auto_destroy_on_merge).
 *
 * The first poll happens one `pollIntervalMs` after start(). When no GitHub
 * token is available the poller warns once, stops itself and refuses to be
 * started again.
 *
 * @param {{
 *   prStateStore: ReturnType<typeof import('./world-pr-state.mjs').createWorldPrStateStore>,
 *   getGhToken: () => Promise<string|null>,
 *   destroyWorld: (worldId: string) => Promise<void>,
 *   pollIntervalMs?: number,
 *   gracePeriodMs?: number,
 * }} opts
 * @returns {{ start: () => void, stop: () => void }}
 */
export function createPrMergePoller({
  prStateStore,
  getGhToken,
  destroyWorld,
  pollIntervalMs = 300_000,
  gracePeriodMs = 600_000,
}) {
  // Upper bound for one GitHub request. Worlds are checked sequentially, so
  // without a bound a single hung connection would stall the whole poll.
  const GH_REQUEST_TIMEOUT_MS = 10_000;

  let intervalId = null;
  let disabled = false;
  let warnedOnce = false;
  // Track in-flight grace timers so stop() can clear them
  const graceTimers = new Map();

  /**
   * Destroy a world whose grace period elapsed and record the outcome.
   * The success line is logged only after destroyWorld() resolved; a failure
   * is logged as an error instead.
   */
  async function destroyAndMark(worldId) {
    const entry = prStateStore.get(worldId);
    const prUrl = entry?.pr_url ?? '(unknown)';
    const mergedAt = entry?.pr_merged_at ?? '(unknown)';
    try {
      await destroyWorld(worldId);
      console.log(
        `[pr-merge-poller] auto-destroyed world ${worldId}: PR ${prUrl} merged at ${mergedAt}, destroyed at ${new Date().toISOString()}`,
      );
    } catch (err) {
      console.error(`[pr-merge-poller] destroyWorld failed for ${worldId}:`, err.message);
    }
    // NOTE(review): the state advances to 'merged_destroyed' even when
    // destroyWorld() failed, so a failed destroy is not retried by the
    // poller. Kept as-is; confirm this is the intended trade-off.
    prStateStore.set(worldId, { pr_state: 'merged_destroyed' });
    graceTimers.delete(worldId);
  }

  /** Arm the grace timer for a world unless one is already pending. */
  function scheduleGrace(worldId) {
    if (graceTimers.has(worldId)) return; // already scheduled
    const id = setTimeout(() => {
      destroyAndMark(worldId).catch((err) => {
        console.error(`[pr-merge-poller] destroyAndMark error for ${worldId}:`, err.message);
      });
    }, gracePeriodMs);
    graceTimers.set(worldId, id);
  }

  /**
   * Ask GitHub whether the world's PR has merged; on merge, record it and
   * (unless opted out) arm the grace timer. All failures are logged and
   * leave the state untouched so the next poll retries.
   */
  async function checkPr(worldId, entry, ghToken) {
    const parsed = parsePrUrl(entry.pr_url);
    if (!parsed) {
      console.warn(`[pr-merge-poller] cannot parse PR URL for ${worldId}: ${entry.pr_url}`);
      return;
    }
    const apiUrl = `${GH_API_BASE}/repos/${parsed.owner}/${parsed.repo}/pulls/${parsed.number}`;
    let data;
    try {
      const resp = await fetch(apiUrl, {
        headers: { Authorization: `token ${ghToken}`, Accept: 'application/vnd.github+json' },
        signal: AbortSignal.timeout(GH_REQUEST_TIMEOUT_MS),
      });
      if (!resp.ok) {
        console.warn(`[pr-merge-poller] GH API ${resp.status} for ${worldId}`);
        return;
      }
      data = await resp.json();
    } catch (err) {
      // Includes the abort raised when the request exceeds the timeout.
      console.warn(`[pr-merge-poller] fetch failed for ${worldId}:`, err.message);
      return;
    }
    const isMerged = data.state === 'closed' && data.merged_at != null;
    if (!isMerged) return;
    prStateStore.set(worldId, {
      pr_state: 'merged',
      pr_merged_at: data.merged_at,
    });
    if (entry.auto_destroy_on_merge === false) return;
    scheduleGrace(worldId);
  }

  /** One pass over all watched worlds. */
  async function pollOnce() {
    const ghToken = await getGhToken();
    if (!ghToken) {
      if (!warnedOnce) {
        console.warn(
          'pr-merge-poller: no GH token found (GH_TOKEN/GITHUB_TOKEN env or /gh-config/hosts.yml); PR polling disabled',
        );
        warnedOnce = true;
      }
      disabled = true;
      stop();
      return;
    }
    const worlds = prStateStore.getWorldsToWatch();
    for (const entry of worlds) {
      const { worldId, ...rest } = entry;
      if (rest.pr_state === 'open') {
        await checkPr(worldId, rest, ghToken);
      } else if (rest.pr_state === 'merged') {
        // Resume grace timer for merged entries that survived a restart
        if (rest.auto_destroy_on_merge !== false) {
          scheduleGrace(worldId);
        }
      }
    }
  }

  /** Begin polling; a no-op when already running or permanently disabled. */
  function start() {
    if (intervalId !== null || disabled) return;
    intervalId = setInterval(() => {
      pollOnce().catch((err) => {
        console.error('[pr-merge-poller] pollOnce error:', err.message);
      });
    }, pollIntervalMs);
  }

  /** Stop polling and cancel every pending grace timer. */
  function stop() {
    if (intervalId !== null) {
      clearInterval(intervalId);
      intervalId = null;
    }
    for (const id of graceTimers.values()) {
      clearTimeout(id);
    }
    graceTimers.clear();
  }

  return { start, stop };
}
/**
* PR Nanny — host-side daemon that watches all worlds' open PRs and
* dispatches fixes via `olam dispatch` when CI/reviews block them.
*
* Extends the pr-merge-poller loop pattern. Runs at 60s cadence.
* Opt-out: OLAM_PR_NANNY=0 (default: enabled).
*
* State machine per PR (stored in world-pr-state.json nanny_* fields):
* watching → dispatching → (paused | escalated | halted)
*
* Halt conditions (stop dispatching but keep watching):
* 1. dispatch_count >= MAX_DISPATCHES (configurable, default 5)
* 2. wall-clock since first dispatch >= MAX_WALL_CLOCK_MIN (default 60)
* 3. same-root-cause loop detected (last 2 dispatch summaries identical)
* 4. operator manual pause
*
* Tier escalation (PR #N tier-escalation):
* On each retry, the nanny advances to the next tier in `escalationTiers`
* (stored per-world in nanny_current_tier) instead of repeating the same
* model. When the chain is exhausted, emits `dispatch.tier-exhausted` on
* the host-stream and falls back to existing operator escalation.
*/
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import { pickNextTier } from './dispatch/tier-escalator.mjs';
import { safePersistLastDispatch } from './dispatch-persister.mjs';
// Promise-returning wrapper around child_process.execFile.
const execFileAsync = promisify(execFile);
// Base URL prepended to every GitHub REST API path requested by the nanny.
const GH_API_BASE = 'https://api.github.com';
// Known external-blocker CI check name patterns.
// When ALL failing checks match these patterns, the PR is not actionable
// (the root cause is infrastructure/release-pipeline, not the world's code).
// Patterns are case-insensitive and unanchored, so they match anywhere in a
// check name.
const EXTERNAL_BLOCKER_PATTERNS = [
/detect-image-scopes/i,
/publish-mcp-auth/i,
/retag-mcp-auth/i,
/bootstrap.*publish/i,
/release.*pipeline/i,
/ghcr.*push/i,
];
/**
 * Tell whether a CI check name matches any of the known external-blocker
 * patterns in EXTERNAL_BLOCKER_PATTERNS.
 *
 * @param {string} checkName
 * @returns {boolean}
 */
function isExternalBlockerCheck(checkName) {
  for (const pattern of EXTERNAL_BLOCKER_PATTERNS) {
    if (pattern.test(checkName)) {
      return true;
    }
  }
  return false;
}
/**
 * Decide whether a PR is blocked purely by external (non-actionable) checks.
 *
 * Only checks concluded as 'failure' or 'action_required' count as failing.
 * The result is true when there is at least one failing check and every
 * failing check's name is an external-blocker pattern match.
 *
 * @param {Array<{name: string, conclusion: string|null}>} checks
 * @returns {boolean}
 */
export function isExternalBlocker(checks) {
  const failingConclusions = ['failure', 'action_required'];
  const failingNames = checks
    .filter(({ conclusion }) => failingConclusions.includes(conclusion))
    .map(({ name }) => name);

  if (failingNames.length === 0) {
    return false;
  }
  return failingNames.every((name) => isExternalBlockerCheck(name));
}
/**
 * Split a GitHub pull-request URL into owner, repository and PR number.
 * The pattern may match anywhere inside the string.
 *
 * @param {string} prUrl e.g. https://github.com/org/repo/pull/123
 * @returns {{ owner: string, repo: string, number: number } | null} null when
 *   no github.com/<owner>/<repo>/pull/<n> segment is present
 */
function parsePrUrl(prUrl) {
  const hit = /github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/.exec(prUrl);
  if (!hit) {
    return null;
  }
  const owner = hit[1];
  const repo = hit[2];
  const number = parseInt(hit[3], 10);
  return { owner, repo, number };
}
/**
 * Create the PR nanny: a poller that watches each tracked world's open PR,
 * and when CI is failing for reasons the world can act on, consults Codex
 * and dispatches a fix prompt to the world (escalating the model tier on
 * retries).
 *
 * Returns no-op `start`/`stop` when the env var OLAM_PR_NANNY is '0'.
 *
 * Robustness notes:
 *   - Polls never overlap. If a poll is still running when the interval
 *     fires again, the new tick is skipped, so two polls cannot read the
 *     same dispatch count and dispatch twice.
 *   - A non-numeric `maxDispatches` / `maxWallClockMin` (e.g. a malformed
 *     env override parsed to NaN) falls back to the documented defaults
 *     (5 dispatches / 60 minutes) instead of silently disabling the cap.
 *
 * @param {{
 *   prStateStore: ReturnType<import('./world-pr-state.mjs').createWorldPrStateStore>,
 *   getGhToken: () => Promise<string|null>,
 *   dispatchToWorld: (worldId: string, prompt: string, opts?: { tier?: string }) => Promise<void>,
 *   consultCodex: (ctx: string) => Promise<string>,
 *   broadcastTierEvent?: (eventType: string, payload: unknown) => void,
 *   pollIntervalMs?: number,
 *   maxDispatches?: number,
 *   maxWallClockMin?: number,
 * }} opts
 * @returns {{ start: () => void, stop: () => void }}
 */
export function createPrNanny({
  prStateStore,
  getGhToken,
  dispatchToWorld,
  consultCodex,
  broadcastTierEvent = () => {},
  pollIntervalMs = 60_000,
  maxDispatches = parseInt(process.env.OLAM_PR_NANNY_MAX_DISPATCHES ?? '5', 10),
  maxWallClockMin = parseInt(process.env.OLAM_PR_NANNY_MAX_WALL_CLOCK_MIN ?? '60', 10),
}) {
  const enabled = (process.env.OLAM_PR_NANNY ?? '1') !== '0';
  if (!enabled) return { start() {}, stop() {} };

  // `x >= NaN` is always false, so a NaN cap would never halt. Fall back to
  // the documented defaults when the configured value is not a finite number.
  const dispatchCap = Number.isFinite(maxDispatches) ? maxDispatches : 5;
  const wallClockCapMin = Number.isFinite(maxWallClockMin) ? maxWallClockMin : 60;

  let intervalId = null;
  let warnedOnce = false;
  // True while a poll is running; guards against overlapping polls.
  let pollInFlight = false;

  /** A check run counts as failing on 'failure' or 'action_required'. */
  const isFailingCheck = (c) => c.conclusion === 'failure' || c.conclusion === 'action_required';

  /**
   * Fetch CI check runs for the PR's head SHA. Any HTTP or network failure
   * yields an empty list (treated by the caller as "nothing to act on").
   *
   * @param {string} owner @param {string} repo @param {number} prNumber @param {string} ghToken
   * @returns {Promise<Array<{name: string, conclusion: string|null, status: string}>>}
   */
  async function fetchChecks(owner, repo, prNumber, ghToken) {
    try {
      const headers = { Authorization: `token ${ghToken}`, Accept: 'application/vnd.github+json' };
      // First get the PR head SHA
      const prRes = await fetch(
        `${GH_API_BASE}/repos/${owner}/${repo}/pulls/${prNumber}`,
        { headers },
      );
      if (!prRes.ok) return [];
      const prData = await prRes.json();
      const sha = prData.head?.sha;
      if (!sha) return [];
      const checkRes = await fetch(
        `${GH_API_BASE}/repos/${owner}/${repo}/commits/${sha}/check-runs?per_page=100`,
        { headers },
      );
      if (!checkRes.ok) return [];
      const checkData = await checkRes.json();
      return (checkData.check_runs ?? []).map((r) => ({
        name: r.name,
        conclusion: r.conclusion,
        status: r.status,
      }));
    } catch {
      return [];
    }
  }

  /**
   * Evaluate one world's PR and dispatch a fix when warranted.
   *
   * @param {string} worldId
   * @param {object} entry current pr-state entry
   * @param {string} ghToken
   */
  async function processWorld(worldId, entry, ghToken) {
    if (entry.nanny_paused || entry.nanny_escalated) return;
    if (entry.pr_state !== 'open') return;
    const parsed = parsePrUrl(entry.pr_url);
    if (!parsed) return;

    // Halt: dispatch cap
    const dispatchCount = entry.nanny_dispatch_count ?? 0;
    if (dispatchCount >= dispatchCap) return;

    // Halt: wall-clock ceiling
    if (entry.nanny_first_dispatch_at) {
      const elapsedMin = (Date.now() - new Date(entry.nanny_first_dispatch_at).getTime()) / 60_000;
      if (elapsedMin >= wallClockCapMin) return;
    }

    const checks = await fetchChecks(parsed.owner, parsed.repo, parsed.number, ghToken);
    // No failing check (covers "no checks", "all passing" and "still
    // running") means there is nothing to fix yet.
    const failingChecks = checks.filter(isFailingCheck);
    if (failingChecks.length === 0) return;

    // External blocker — do not dispatch
    if (isExternalBlocker(checks)) {
      prStateStore.set(worldId, { nanny_external_blocker: true });
      return;
    }
    prStateStore.set(worldId, { nanny_external_blocker: false });

    const failingNames = failingChecks.map((c) => c.name).join(', ');
    const prompt = `CI is failing on PR ${entry.pr_url}. Failing checks: ${failingNames}. Investigate the root cause, fix the code, and push.`;

    // Halt: same-root-cause loop detection
    if (entry.nanny_last_dispatch_prompt && entry.nanny_last_dispatch_prompt === prompt) {
      console.log(`[pr-nanny] loop detected for ${worldId} — same prompt as last dispatch, halting`);
      prStateStore.set(worldId, { nanny_loop_halted: true });
      return;
    }

    // Consult Codex before dispatching. A failed consult is fail-open.
    const codexCtx = `World ${worldId} has a failing PR: ${entry.pr_url}. Failing CI checks: ${failingNames}. Should we dispatch a fix? Answer: agree, push-back, or rethink.`;
    let verdict = 'agree';
    try {
      verdict = await consultCodex(codexCtx);
    } catch (err) {
      console.warn(`[pr-nanny] codex consult failed for ${worldId}: ${err.message} — defaulting to agree`);
    }
    if (verdict === 'push-back') {
      prStateStore.set(worldId, { nanny_paused: true, nanny_pause_reason: 'codex_pushback' });
      console.log(`[pr-nanny] Codex push-back for ${worldId} — pausing nanny`);
      return;
    }
    if (verdict === 'rethink') {
      prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'codex_rethink' });
      console.log(`[pr-nanny] Codex rethink for ${worldId} — escalating`);
      return;
    }

    // ── Tier escalation (PR #938) ───────────────────────────────────────────
    //
    // `nanny_escalation_tiers` is set by the olam_dispatch caller via the
    // escalationTiers schema field and persisted here by server.mjs when the
    // world is registered for nanny tracking. Defaults to ['sonnet'] when
    // absent (no escalation, no cost surprise).
    //
    // `nanny_current_tier` tracks the model tier used by the LAST dispatch for
    // this PR. On first dispatch (dispatchCount === 0) it is undefined, and we
    // use escalationTiers[0] as the starting tier. On retries we advance the
    // chain via pickNextTier. This is the pr-state store (option c from the
    // design doc) — it persists across polls and matches the nanny_* field
    // pattern already established by nanny_dispatch_count et al.
    const escalationTiers = entry.nanny_escalation_tiers ?? ['sonnet'];
    const currentTier = entry.nanny_current_tier ?? escalationTiers[0] ?? 'sonnet';
    let tierForThisDispatch = currentTier;
    if (dispatchCount > 0) {
      // This is a retry — try to escalate the tier.
      const nextTier = pickNextTier(currentTier, escalationTiers);
      if (nextTier !== null) {
        tierForThisDispatch = nextTier;
        broadcastTierEvent('dispatch.escalated', {
          worldId,
          fromTier: currentTier,
          toTier: nextTier,
          reason: 'retry-after-failure',
        });
        console.log(`[pr-nanny] tier escalated for ${worldId}: ${currentTier} → ${nextTier}`);
      } else {
        // Chain exhausted — emit tier-exhausted and fall back to operator escalation.
        broadcastTierEvent('dispatch.tier-exhausted', {
          worldId,
          exhaustedTier: currentTier,
          escalationTiers,
        });
        console.log(`[pr-nanny] tier chain exhausted for ${worldId} (last tier: ${currentTier}) — escalating to operator`);
        prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'tier_exhausted' });
        return;
      }
    }

    // Dispatch fix. State is only advanced after dispatchToWorld resolves, so
    // a failed dispatch does not consume a slot of the dispatch cap.
    try {
      safePersistLastDispatch({
        worldId,
        messageId: `nanny-${worldId}-${Date.now()}`,
        prompt,
        source: 'pr-nanny',
      });
      await dispatchToWorld(worldId, prompt, { tier: tierForThisDispatch });
      const now = new Date().toISOString();
      prStateStore.set(worldId, {
        nanny_dispatch_count: dispatchCount + 1,
        nanny_first_dispatch_at: entry.nanny_first_dispatch_at ?? now,
        nanny_last_dispatch_at: now,
        nanny_last_dispatch_prompt: prompt,
        nanny_current_tier: tierForThisDispatch,
      });
      console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${dispatchCap}, tier: ${tierForThisDispatch})`);
    } catch (err) {
      console.error(`[pr-nanny] dispatch failed for ${worldId}: ${err.message}`);
    }
  }

  /**
   * Run one pass over every watched world. Worlds are processed one at a
   * time; an error in one world is logged and does not stop the others.
   * Returns immediately when a previous pass is still running.
   */
  async function pollOnce() {
    if (pollInFlight) return;
    pollInFlight = true;
    try {
      const ghToken = await getGhToken();
      if (!ghToken) {
        if (!warnedOnce) {
          console.warn('[pr-nanny] no GH token — CI polling disabled');
          warnedOnce = true;
        }
        return;
      }
      const worlds = prStateStore.getWorldsToWatch();
      for (const { worldId, ...entry } of worlds) {
        try {
          await processWorld(worldId, entry, ghToken);
        } catch (err) {
          console.error(`[pr-nanny] processWorld error for ${worldId}: ${err.message}`);
        }
      }
    } finally {
      pollInFlight = false;
    }
  }

  /** Start polling: one immediate poll, then one every pollIntervalMs. Idempotent. */
  function start() {
    if (intervalId !== null) return;
    // Immediate first poll
    pollOnce().catch((err) => console.error('[pr-nanny] pollOnce error:', err.message));
    intervalId = setInterval(() => {
      pollOnce().catch((err) => console.error('[pr-nanny] pollOnce error:', err.message));
    }, pollIntervalMs);
  }

  /** Stop scheduling further polls. A poll already in flight runs to completion. */
  function stop() {
    if (intervalId !== null) {
      clearInterval(intervalId);
      intervalId = null;
    }
  }

  return { start, stop };
}
/**
 * Default Codex consultation: shells out to the host-side `codex` CLI with
 * a 30 s timeout and maps its reply to a verdict.
 *
 * Fail-open: any error (CLI missing, timeout, non-zero exit) yields 'agree'.
 * A reply that starts with neither "push" nor "rethink" also yields 'agree'.
 *
 * @param {string} ctx situation description embedded in the review prompt
 * @returns {Promise<'agree'|'push-back'|'rethink'>}
 */
export async function defaultConsultCodex(ctx) {
  try {
    const cliArgs = [
      '--quiet',
      '--model', 'codex-mini-latest',
      `Adversarial review — is this a good idea? ${ctx} Reply with exactly one word: agree, push-back, or rethink.`,
    ];
    const result = await execFileAsync('codex', cliArgs, { timeout: 30_000 });
    const reply = result.stdout.trim().toLowerCase();
    if (reply.startsWith('push')) {
      return 'push-back';
    }
    return reply.startsWith('rethink') ? 'rethink' : 'agree';
  } catch {
    // fail-open: if codex unavailable, dispatch anyway
    return 'agree';
  }
}
/**
 * Default dispatch: shell out to `olam dispatch <worldId> <prompt>` with a
 * 60 s timeout. Rejects when the command fails or times out.
 *
 * NOTE(review): createPrNanny calls its dispatcher with a third
 * `{ tier }` argument; this default does not accept or forward it, so
 * the selected tier has no effect on this path — confirm whether the
 * CLI should receive it.
 *
 * @param {string} worldId
 * @param {string} prompt
 * @returns {Promise<void>}
 */
export async function defaultDispatchToWorld(worldId, prompt) {
  const cliArgs = ['dispatch', worldId, prompt];
  await execFileAsync('olam', cliArgs, { timeout: 60_000 });
}
/**
* process-poller.mjs — per-world docker top SSE fanout.
*
* Dual-mode: HTTP API when DOCKER_HOST != 'docker-cli'; spawnSync otherwise.
*
* NOTE: process argv may contain secrets (--api-key=, --token=). Post-v1 audit needed. (S1)
*/
import { spawnSync } from 'node:child_process';
// Read once at module load. When the env var is unset the sentinel
// 'docker-cli' is used, which makes fetchProcesses shell out to the
// `docker` CLI; any other value is used as the Docker HTTP API base
// (a leading `tcp://` is rewritten to `http://`).
const DOCKER_HOST = process.env.DOCKER_HOST ?? 'docker-cli';
/**
* @typedef {{ pid: string, user: string, cpu: string, mem: string, started: string, state: string, command: string }} ProcessRow
*/
/**
 * Build the devbox container name for a world.
 *
 * @param {string} worldId
 * @returns {string} `olam-<worldId>-devbox`
 */
function worldContainerName(worldId) {
  return 'olam-'.concat(worldId, '-devbox');
}
/**
 * Parse docker top JSON (Titles + Processes arrays) into normalized rows.
 * Falls back gracefully (empty array) if the input is not JSON or lacks
 * either array.
 * lstart is stored as a raw string — no Date parse (T1).
 *
 * Column lookup is case-insensitive. An exact title match wins over a
 * substring match, so `PID` is chosen over `PPID` (and `USER` over
 * `RUSER`) regardless of column order; substring matching remains as the
 * fallback so `%CPU` / `%MEM` / `COMMAND` style titles still resolve.
 *
 * Missing columns yield '' — except cpu and mem, which yield '0'.
 * A process row that is not an array is treated as an empty row.
 *
 * @param {string} stdout
 * @returns {ProcessRow[]}
 */
function parseDockerTop(stdout) {
  let parsed;
  try {
    parsed = JSON.parse(stdout);
  } catch {
    return [];
  }
  const titles = parsed?.Titles;
  const processes = parsed?.Processes;
  if (!Array.isArray(titles) || !Array.isArray(processes)) return [];

  // Lower-cased titles; non-string titles can never match.
  const normalizedTitles = titles.map((t) => (typeof t === 'string' ? t.trim().toLowerCase() : null));

  // Find a column index by title: exact match first, then substring.
  function idx(name) {
    const needle = name.toLowerCase();
    const exact = normalizedTitles.indexOf(needle);
    if (exact !== -1) return exact;
    return normalizedTitles.findIndex((t) => t !== null && t.includes(needle));
  }

  // First candidate title that resolves to a column, or -1.
  function firstIdx(candidates) {
    for (const candidate of candidates) {
      const i = idx(candidate);
      if (i !== -1) return i;
    }
    return -1;
  }

  const pidIdx = idx('pid');
  const userIdx = idx('user');
  const cpuIdx = idx('cpu');
  const memIdx = idx('mem');
  // Accept LSTART, STIME, START_TIME, or anything containing START such as
  // STARTED (T1: store as raw string).
  const startIdx = firstIdx(['lstart', 'stime', 'start_time', 'start']);
  const stateIdx = idx('stat');
  // The command column may be titled "COMMAND" or "CMD".
  const cmdIdx = firstIdx(['command', 'cmd']);

  // Read one cell as a trimmed string; `fallback` applies only when the
  // column itself is absent.
  function cell(row, i, fallback) {
    return i !== -1 ? String(row[i] ?? '').trim() : fallback;
  }

  return processes.map((rawRow) => {
    const row = Array.isArray(rawRow) ? rawRow : [];
    return {
      pid: cell(row, pidIdx, ''),
      user: cell(row, userIdx, ''),
      cpu: cell(row, cpuIdx, '0'),
      mem: cell(row, memIdx, '0'),
      started: cell(row, startIdx, ''),
      state: cell(row, stateIdx, ''),
      command: cell(row, cmdIdx, ''),
    };
  });
}
/**
 * Fetch processes for a world container.
 * Returns {ts, processes, error?}.
 * Non-running containers return an empty array + error field (T3); so does
 * any failure to reach Docker (the error text is always
 * 'container not running').
 *
 * Two transports, selected by DOCKER_HOST:
 *   - 'docker-cli': `docker top <container> -eo <cols>` via spawnSync; the
 *     tabular output is split into Titles/Processes and handed to
 *     parseDockerTop.
 *   - anything else: Docker HTTP API `/containers/<name>/top?ps_args=…`,
 *     whose JSON body is handed to parseDockerTop directly.
 *
 * @param {string} worldId
 * @returns {Promise<{ts: number, processes: ProcessRow[], error?: string}>}
 */
async function fetchProcesses(worldId) {
  const containerName = worldContainerName(worldId);
  // Docker passes the ps arguments verbatim to ps(1). A bare column list
  // (`pid,user,...` or `pid user ...`) is parsed by ps as a process-ID
  // list, not a column selector, and fails with "ps: error: process ID
  // list syntax error" — which then surfaces as a misleading "container
  // not running". The correct form is `-eo <cols>`: all processes (`-e`)
  // projected onto specific columns (`-o`). BOTH transports use this same
  // string so they cannot drift apart.
  //
  // `stime` (single-word start time, e.g. "10:00" or "Feb11") is used
  // instead of `lstart`, whose multi-word values ("Mon May 4 14:00:00 2026")
  // would break the whitespace-split parsing of the CLI path.
  const psArgs = '-eo pid,user,pcpu,pmem,stime,stat,cmd';
  // Number of columns requested above; everything from the last column on
  // is the command, which may itself contain spaces.
  const columnCount = 7;
  try {
    if (DOCKER_HOST === 'docker-cli') {
      // Bare-node mode: spawnSync blocks ~50ms at 5s cadence (P2 — acceptable).
      const result = spawnSync(
        'docker',
        ['top', containerName, ...psArgs.split(' ')],
        { encoding: 'utf-8', timeout: 3000 },
      );
      if (result.status !== 0 || result.error) {
        return { ts: Date.now(), processes: [], error: 'container not running' };
      }
      // docker top bare CLI outputs tabular text, not JSON. Wrap it for parseDockerTop.
      const text = (result.stdout ?? '').trim();
      if (text === '') return { ts: Date.now(), processes: [] };
      const lines = text.split('\n');
      // First line is the header row; remaining are process rows.
      const titleFields = lines[0].trim().split(/\s+/);
      const dataRows = lines.slice(1).map((line) => {
        const parts = line.trim().split(/\s+/);
        // CMD may contain spaces — rejoin everything from the 7th token on.
        if (parts.length > columnCount) {
          return [...parts.slice(0, columnCount - 1), parts.slice(columnCount - 1).join(' ')];
        }
        return parts;
      });
      const wrapped = JSON.stringify({ Titles: titleFields, Processes: dataRows });
      return { ts: Date.now(), processes: parseDockerTop(wrapped) };
    }
    // Container mode: Docker HTTP API.
    const apiBase = DOCKER_HOST.replace(/^tcp:\/\//, 'http://');
    const url = `${apiBase}/containers/${encodeURIComponent(containerName)}/top?ps_args=${encodeURIComponent(psArgs)}`;
    const resp = await fetch(url, { signal: AbortSignal.timeout(3000) });
    if (!resp.ok) {
      return { ts: Date.now(), processes: [], error: 'container not running' };
    }
    const body = await resp.text();
    return { ts: Date.now(), processes: parseDockerTop(body) };
  } catch {
    // Network error, timeout, or spawn failure: report as not running
    // rather than rejecting, so pollers never see an unhandled rejection.
    return { ts: Date.now(), processes: [], error: 'container not running' };
  }
}
/**
 * One-shot process snapshot for a world — delegates to fetchProcesses and
 * resolves with its result unchanged.
 *
 * @param {string} worldId
 */
export async function getProcessSnapshot(worldId) {
  const snapshot = await fetchProcesses(worldId);
  return snapshot;
}
// ── SSE fanout state ─────────────────────────────────────────────────
/**
 * Per-world subscriber registry, keyed by worldId.
 *
 * An entry is created by subscribeToProcesses when the first subscriber for
 * a world arrives (starting the poll and heartbeat timers) and is deleted,
 * with both timers cleared, when the last subscriber disconnects.
 *
 * @type {Map<string, {pollTimer: ReturnType<typeof setInterval>, heartbeatTimer: ReturnType<typeof setInterval>, subscribers: Set<import('node:http').ServerResponse>}>}
 */
const worldPollers = new Map();
/**
 * Push one `processes` SSE event to every subscriber of a world. Does
 * nothing when the world has no registry entry. Write errors are ignored
 * per subscriber so one dead connection cannot block the others.
 *
 * @param {string} worldId
 * @param {{ts: number, processes: ProcessRow[], error?: string}} data
 */
function broadcast(worldId, data) {
  const poller = worldPollers.get(worldId);
  if (!poller) return;
  const frame = `event: processes\ndata: ${JSON.stringify(data)}\n\n`;
  poller.subscribers.forEach((subscriber) => {
    try {
      subscriber.write(frame);
    } catch {
      /* subscriber gone; cleanup fires on close */
    }
  });
}
/**
 * Attach an SSE response to a world's process stream.
 *
 * The first subscriber for a world starts two shared timers: a 5 s poll
 * that broadcasts a fresh snapshot and a 25 s heartbeat comment. Every new
 * subscriber also gets one immediate snapshot. When the last subscriber
 * goes away both timers are cleared and the registry entry is removed.
 *
 * SSE headers are written BEFORE adding to the Set (T2: prevents leak if close
 * fires before headers are flushed — the cleanup handler is safe to call even
 * with an empty Set).
 *
 * @param {string} worldId
 * @param {import('node:http').ServerResponse} res
 */
export function subscribeToProcesses(worldId, res) {
  // Headers first, synchronously, before the subscriber Set is touched (T2).
  res.writeHead(200, {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'X-Accel-Buffering': 'no',
  });

  if (!worldPollers.has(worldId)) {
    // First subscriber — start the shared poll + heartbeat timers.
    const pollTimer = setInterval(async () => {
      broadcast(worldId, await fetchProcesses(worldId));
    }, 5000);
    const heartbeatTimer = setInterval(() => {
      const current = worldPollers.get(worldId);
      if (!current) return;
      current.subscribers.forEach((subscriber) => {
        try {
          subscriber.write(': heartbeat\n\n');
        } catch {
          /* ignore */
        }
      });
    }, 25000);
    worldPollers.set(worldId, { pollTimer, heartbeatTimer, subscribers: new Set() });
  }
  worldPollers.get(worldId).subscribers.add(res);

  // Immediate first snapshot so the client doesn't wait for the 5 s tick.
  fetchProcesses(worldId).then((snapshot) => {
    try {
      res.write(`event: processes\ndata: ${JSON.stringify(snapshot)}\n\n`);
    } catch {
      /* gone */
    }
  });

  // Disconnect handling — runs at most once even though both 'close' and
  // 'finish' are wired to it.
  let released = false;
  const release = () => {
    if (released) return;
    released = true;
    const current = worldPollers.get(worldId);
    if (!current) return;
    current.subscribers.delete(res);
    if (current.subscribers.size !== 0) return;
    // Last subscriber left: stop the timers and drop the entry.
    clearInterval(current.pollTimer);
    clearInterval(current.heartbeatTimer);
    worldPollers.delete(worldId);
  };
  res.on('close', release);
  res.on('finish', release);
}
// Export parseDockerTop for unit tests.
export { parseDockerTop };
// Phase F-2-B (B3): host CP HTTP proxy.
//
// Rewrites incoming requests under `/api/world/<id>/<route...>` to the
// per-world CP at `<perWorldBase>/<route...>` with `X-Olam-Secret`
// injected server-side.
//
// Pattern lifted from `packages/cloudflare-worker/src/index.ts:462-551`
// (`proxyContainer`). CF Worker uses Workers' `fetch()`; host CP uses
// Node's `http.request` so SSE streams flow byte-for-byte without
// buffering. Verbatim passthrough on /hooks/* and /api/auth/* (D8) is
// implemented in B4 (this module is JSON-API-only — B4 wraps).
import http from 'node:http';
/**
* Default upstream-request timeout for proxied per-world CP calls. SSE
* streams (`/api/stream`, `/hooks/*` long-poll) MUST opt out — they
* intentionally hold the socket open. Everything else should respond
* within a few seconds; if the per-world CP wedges (slow sqlite,
* tmux command stuck, long docker exec), this prevents the host-cp
* connection from hanging until the OS RSTs it. The browser sees a
* clean 504 instead of Safari's TypeError "Load failed", and useLanes /
* useReadiness can retry on a known status code.
*
* 10s matches the longest legitimate handler we've measured (cold
* sqlite open + readiness query) with headroom.
*
* @internal exported for test override
*/
export const DEFAULT_PROXY_TIMEOUT_MS = 10_000;
/**
 * Split a `/api/world/<id>/<route...>` request path into the world id and
 * the remainder to forward.
 *
 * The match is anchored at `^/api/world/`, so `/api/worlds` (the plural
 * worlds-list endpoint) never matches, and an empty world id never matches.
 * The remainder is whatever follows the id — a `/...` path, a `?...` query
 * or a `#...` fragment — and defaults to '/' when nothing follows.
 *
 * @param {string} path
 * @returns {{ worldId: string, subPath: string } | null} null when the path does not match
 */
export function parseProxyPath(path) {
  const hit = /^\/api\/world\/([^/?#]+)(\/.*|\?.*|#.*)?$/.exec(path);
  if (hit === null) return null;
  const [, worldId, remainder] = hit;
  return { worldId, subPath: remainder ?? '/' };
}
/**
 * Build the base URL of a per-world CP from its host port.
 *
 * The world registry stores port offsets today (canonical port is
 * `19080 + offset`); for B3 this function takes the resolved port
 * directly, with worlds.db integration deferred to B6/B10. The caller
 * (server.mjs) resolves worldId → port and passes the port here.
 *
 * In Docker Compose mode, host-cp sits in its own network and reaches
 * world CPs via `host.docker.internal:<port>` (compose.yaml's
 * extra_hosts: host-gateway). Docker Desktop provides this
 * automatically; Linux needs the `host-gateway` extra-host directive.
 *
 * @param {number} port per-world CP host port (e.g., 20780)
 * @param {string} [host] optional hostname override (default 'host.docker.internal')
 * @returns {string} `http://<host>:<port>`
 */
export function perWorldBase(port, host = 'host.docker.internal') { // bare-node-allow: container-mode default; bare callers pass WORLD_HOST explicitly (server.mjs)
  return 'http://'.concat(host, ':', port);
}
/**
 * Recognize SSE / long-poll paths whose handlers intentionally hold the
 * socket open. These MUST be exempt from the upstream timeout — applying
 * it would kill the stream every 10s. Callers can also force the
 * exemption per request via `streaming: true`.
 *
 * Matches, after dropping the query string: anything ending in
 * `/api/stream`, `/hooks` and everything under `/hooks/`, and
 * `/api/auth/events` with or without a sub-path.
 *
 * @param {string} subPath
 * @returns {boolean}
 */
function isStreamingPath(subPath) {
  // Only the part before the first '?' takes part in matching.
  const [pathOnly] = subPath.split('?');
  if (pathOnly.endsWith('/api/stream')) return true;
  if (pathOnly === '/hooks' || pathOnly.startsWith('/hooks/')) return true;
  return /^\/api\/auth\/events(\/|$)/.test(pathOnly);
}
/**
 * Proxy an incoming request to a per-world CP, injecting X-Olam-Secret.
 *
 * Forwards: method, path (subPath), body bytes, ALL request headers
 * EXCEPT `host` (rewritten) and `x-olam-secret` (overwritten with the
 * injected secret to prevent client spoofing).
 *
 * Returns: status code, ALL response headers (verbatim — D8 contract
 * forwards Set-Cookie, Location, etc. unchanged), body bytes streamed
 * via Node's http.IncomingMessage→ServerResponse pipe (no buffering).
 *
 * Origin pinning: `subPath` comes from the client's URL. URL resolution
 * treats `//host/...` and `/\host/...` as a NEW authority, which would
 * send the request — and the injected secret — to a host of the client's
 * choosing. The resolved target must therefore share `targetBase`'s
 * origin; otherwise the request is rejected with 400 and nothing is sent
 * upstream.
 *
 * Upstream timeout: short-request handlers (≠ SSE) get an upstream
 * socket timeout of `timeoutMs` (defaults to DEFAULT_PROXY_TIMEOUT_MS).
 * On expiry we abort the upstream socket and respond 504 — this
 * converts a wedged per-world CP into a deterministic status code
 * instead of a TCP RST that Safari surfaces as `TypeError: Load
 * failed`. Pass `streaming: true` (or hit a path matching
 * `isStreamingPath`) to opt out.
 *
 * @param {object} args
 * @param {import('node:http').IncomingMessage} args.req
 * @param {import('node:http').ServerResponse} args.res
 * @param {string} args.subPath        e.g., '/api/world' or '/api/stream'
 * @param {string} args.targetBase     e.g., 'http://host.docker.internal:20780'
 * @param {string} args.secret         the X-Olam-Secret value
 * @param {(message: string) => void} [args.log]
 * @param {number} [args.timeoutMs]    per-request upstream timeout; ignored for streams
 * @param {boolean} [args.streaming]   force SSE/long-poll mode (skip timeout)
 */
export function proxyToWorld({
  req,
  res,
  subPath,
  targetBase,
  secret,
  log = console.log,
  timeoutMs = DEFAULT_PROXY_TIMEOUT_MS,
  streaming = false,
}) {
  const base = new URL(targetBase);
  const target = new URL(subPath, base);

  // Refuse to leave the per-world CP's origin (see "Origin pinning" above).
  if (target.origin !== base.origin) {
    log(`proxy: rejected sub-path escaping upstream origin ${base.origin}`);
    if (!res.headersSent) {
      res.writeHead(400, { 'Content-Type': 'application/json; charset=utf-8' });
      res.end(JSON.stringify({
        error: 'invalid_sub_path',
        message: 'proxied path must stay on the per-world CP origin',
      }));
    } else {
      res.end();
    }
    return;
  }

  const isStream = streaming || isStreamingPath(subPath);

  // Build outbound headers. Filter `host` (Node will set from URL) +
  // overwrite `x-olam-secret` (defense against client spoofing).
  /** @type {Record<string, string | string[]>} */
  const outHeaders = {};
  for (const [k, v] of Object.entries(req.headers)) {
    if (v === undefined) continue;
    const lower = k.toLowerCase();
    if (lower === 'host' || lower === 'x-olam-secret') continue;
    outHeaders[k] = v;
  }
  outHeaders['x-olam-secret'] = secret;
  outHeaders['x-forwarded-by'] = 'olam-host-cp';

  // Upstream-timeout handle; null whenever no timeout is pending.
  /** @type {ReturnType<typeof setTimeout> | null} */
  let timer = null;
  const clearUpstreamTimer = () => {
    if (timer !== null) {
      clearTimeout(timer);
      timer = null;
    }
  };

  const upstreamReq = http.request(
    target,
    {
      method: req.method ?? 'GET',
      headers: outHeaders,
    },
    (upstreamRes) => {
      // Once headers come back from upstream, the request is no longer
      // "stuck" — clear the timeout so a slow stream-of-body doesn't
      // get killed mid-flight. Streaming handlers that intentionally
      // delay between writes still rely on the no-timeout path.
      clearUpstreamTimer();
      // Verbatim passthrough: status + ALL headers + body bytes.
      // Use res.writeHead so the headers go out atomically with the
      // status line (response.statusCode + setHeader split would race
      // on early body write). statusMessage may be undefined on some
      // upstream paths — fall back to the default.
      res.writeHead(
        upstreamRes.statusCode ?? 502,
        upstreamRes.statusMessage,
        upstreamRes.headers,
      );
      upstreamRes.pipe(res);
    },
  );

  if (!isStream && timeoutMs > 0) {
    timer = setTimeout(() => {
      timer = null;
      log(`proxy: upstream timeout (${timeoutMs}ms) for ${target}`);
      // Destroying the upstream req triggers the 'error' handler with
      // a generic socket error; we pre-empt it with an explicit 504
      // first so the client sees a clean status instead of the generic
      // 502 the error handler would emit.
      if (!res.headersSent) {
        res.writeHead(504, { 'Content-Type': 'application/json; charset=utf-8' });
        res.end(JSON.stringify({
          error: 'upstream_timeout',
          message: `per-world CP did not respond within ${timeoutMs}ms`,
          worldUrl: target.origin,
        }));
      } else {
        res.end();
      }
      try {
        upstreamReq.destroy(new Error('proxy upstream timeout'));
      } catch {
        // already destroyed
      }
    }, timeoutMs);
  }

  // Upstream connection error — don't leak internals to the client.
  upstreamReq.on('error', (err) => {
    clearUpstreamTimer();
    log(`proxy: upstream error for ${target}: ${err.message}`);
    if (!res.headersSent) {
      res.writeHead(502, { 'Content-Type': 'application/json; charset=utf-8' });
      res.end(JSON.stringify({
        error: 'upstream_unreachable',
        message: 'per-world CP did not respond',
        worldUrl: target.origin,
      }));
    } else {
      // Response already started (likely SSE); just close.
      res.end();
    }
  });

  // Client closed early (browser navigated away, Safari unloaded the
  // EventSource, etc.). Tear down the upstream so we don't keep an
  // open socket to the per-world CP for an answer the caller no longer
  // wants. Without this, host-cp leaks sockets per cancelled poll.
  res.on('close', () => {
    clearUpstreamTimer();
    if (!upstreamReq.destroyed) {
      try {
        upstreamReq.destroy();
      } catch {
        // already gone
      }
    }
  });

  // Pipe request body. For GET/HEAD this is a no-op (no body bytes);
  // for POST/PUT/PATCH this streams the body upstream.
  req.pipe(upstreamReq);
}
/**
* Phase E3 (olam-dogfood-vision): PylonWorldsSource skeleton.
*
* Stub implementation of the WorldsSource contract (E1) for Pylon-
* managed cloud worlds. Returns `[]` for now — the actual @pleri/pylon
* SDK integration is intentionally deferred (T5 mitigation: design the
* contract before the SDK lands so consumers don't churn when it does).
*
* The class proves the interface composes: E4 wires this alongside
* LocalWorldsSource into the GET /api/worlds handler so a Pylon-enabled
* deployment fans out across both sources, dedupes by id, and returns
* the union. With this stub returning `[]`, an enabled-but-empty Pylon
* source is a strict no-op over local-only behavior.
*
* Activation: gated by `OLAM_HOST_CP_PYLON_ENABLED=1`. When the env
* var is unset/0/false, server.mjs (E4) does NOT instantiate this
* source — the local-only path is preserved verbatim. When enabled,
* the empty source layers additively on top of local; behavior is
* still observably identical until the SDK ships.
*
* Why a no-op stub instead of waiting for the SDK:
* - Consumers (SPA badge logic in E5, regression tests, CLI lookup)
* can be wired against the contract without blocking on the SDK.
* - Forces E4's composition logic to actually fan out, dedupe, and
* merge — exercising the multi-source path in CI before any cloud
* traffic touches it.
* - Surface-area lock-in: anything missing here surfaces as a
* contract gap NOW, not after the SDK is wired.
*
* @typedef {import('./worlds-source.mjs').WorldsSource} WorldsSource
* @typedef {import('./worlds-source.mjs').WorldSummary} WorldSummary
*/
/**
* @typedef {object} PylonWorldsSourceDeps
* @property {boolean} enabled
* When false, list() short-circuits to `[]` without any Pylon
* interaction. Kept on the deps object (rather than read from
* process.env at construction time) so tests can flip it without
* mutating module-level env state.
*/
/**
 * Build the Pylon-cloud WorldsSource stub. Its `list()` always resolves
 * to an empty array today, whether or not the source is enabled; the
 * `enabled` flag only marks where the SDK call will go.
 *
 * @param {PylonWorldsSourceDeps} [deps] defaults to `{ enabled: false }`
 * @returns {WorldsSource}
 */
export function createPylonWorldsSource(deps = { enabled: false }) {
  const list = async () => {
    if (deps.enabled) {
      // TODO(pylon): wire @pleri/pylon SDK. Expected shape:
      //   const client = new PylonClient({ token: scopedToken });
      //   const cloudWorlds = await client.worlds.list();
      //   return cloudWorlds.map((w) => ({
      //     id: w.id,
      //     name: w.displayName ?? null,
      //     status: mapPylonStatus(w.state),  // 'running' | 'starting' | ...
      //     services: mapPylonServices(w.services),
      //     source: 'pylon-cloud',
      //   }));
      // Until the SDK lands, the source is intentionally empty —
      // proving the interface composes (E4) without committing the
      // mapping shape prematurely.
      return [];
    }
    // Disabled: no Pylon interaction at all.
    return [];
  };
  return { name: 'pylon-cloud', list };
}
// Phase F-2-B (B6): redact sensitive keys from workspace YAML before
// exposing via /api/workspaces.
//
// T11 mitigation. Workspace YAMLs may contain operator-set environment
// variables that include OAuth client secrets, API keys, deployment
// tokens, database passwords. These should NEVER cross the host-cp ↔
// browser boundary.
//
// Strategy: pattern-based recursive redaction. Any object key matching
// SENSITIVE_KEY_PATTERN replaces its value with `[redacted]`. Catches
// the standard naming conventions while remaining permissive on
// non-sensitive keys (we don't false-positive on legitimate config).
//
// The pattern is intentionally broad — it's defensive. If an operator
// names a non-sensitive var with a `_KEY`/`_SECRET`/`_TOKEN`/`_PASSWORD`/
// `_CREDENTIALS`/`_AUTH` suffix — or any name that ends in `PASSWORD`,
// `SECRET`, or `TOKEN`, matched case-insensitively — it gets redacted.
// Operators get a clear signal (the value becomes `[redacted]`) and can
// rename the var if needed.
//
// We deliberately do NOT use the `PROTECTED_ENV_KEYS` set from
// packages/core/src/world/env-setup.ts — that set is for service-
// discovery host/port/URL keys (POSTGRES_HOST, REDIS_URL, etc.), not
// org secrets. The two filters address different surfaces:
// - PROTECTED_ENV_KEYS in core: prevents manifest from overriding
// service-discovery state on the world's runtime env
// - SENSITIVE_KEY_PATTERN here: prevents the host CP API from leaking
// org secrets to the browser
// Both are needed.
export const SENSITIVE_KEY_PATTERN = /(.*_KEY|.*_SECRET|.*_TOKEN|.*_PASSWORD|.*_CREDENTIALS|.*_AUTH|API_KEY|PASSWORD|SECRET|TOKEN)$/i;
/**
 * Recursively redact sensitive values in any JSON-like structure
 * (objects, arrays, primitives). Returns a new value; does not mutate
 * input.
 *
 * For every object key matching SENSITIVE_KEY_PATTERN the whole value
 * (including any nested structure) is replaced with the string
 * `[redacted]`; all other values are processed recursively. Primitives and
 * null are returned as-is.
 *
 * The result object is built with Object.fromEntries, which defines own
 * properties. A plain `out[key] = …` assignment would route an own
 * `__proto__` key (possible in parsed JSON/YAML) through the prototype
 * setter and silently drop it from the output.
 *
 * @param {unknown} value
 * @returns {unknown}
 */
export function redactSensitive(value) {
  if (Array.isArray(value)) {
    return value.map((item) => redactSensitive(item));
  }
  if (value === null || typeof value !== 'object') {
    return value;
  }
  return Object.fromEntries(
    Object.entries(value).map(([key, inner]) => [
      key,
      SENSITIVE_KEY_PATTERN.test(key) ? '[redacted]' : redactSensitive(inner),
    ]),
  );
}
/**
 * Quick predicate: does this key name match SENSITIVE_KEY_PATTERN?
 * Handy for pre-screening keys while iterating large maps.
 *
 * @param {string} key
 * @returns {boolean}
 */
export function isSensitiveKey(key) {
  const hit = SENSITIVE_KEY_PATTERN.exec(key);
  return hit !== null;
}
// redirect.mjs — Phase B3 (plan-chat-spa-supersedes-control-plane).
//
// 301 redirect layer that fronts host-cp's HTTP handler. Maps legacy
// control-plane routes that get deleted in Phase B4 onto their canonical
// successors so live URLs in operator history / bookmarks / Slack do not
// 404 after the deletion lands.
//
// Redirect rules (allow-listed; closed set):
//
// /plan/:id → no-op (falls through to SPA shell;
// plan-chat-spa-side router handles the
// resolver dispatch via useResolveId).
// Implemented as a sentinel so callers
// can short-circuit but the request
// continues to static-serve.
// /world/:id → 301 /worlds?highlight=:id
// /sandbox/:id → 301 /worlds?highlight=:id
// /session/:worldId/plan → 301 /plan/:worldId
//
// EXPLICITLY NOT REDIRECTED (more-specific routes still owned by
// control-plane until Phase E):
// /world/:id/editor /world/:id/events
// /sandbox/:id/editor /sandbox/:id/events
// /inbox/* /workspaces/*
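// /repos /runbooks
// (/design used to be listed here; since Phase E2 an exact `/design` or
// `/design/` is redirected with a hardcoded 301 to `/`.)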
//
// Security (per plan-chat-spa-supersedes-control-plane.md K1 SEC-2):
// - Redirect targets are HARDCODED prefixes (`/plan/`, `/worlds`). No
// caller-supplied target is ever reflected into Location.
// - `:id` segment is validated against RESOLVE_ID_RE before any
// reflection into the Location header; invalid shapes → 400, not
// 301. This kills open-redirect / response-splitting / header-
// injection vectors at the door.
// - `highlight=<id>` query param uses the SAME shape regex. We do not
// trust the inbound URL beyond the regex match (no decoding, no
// surrogate pair handling).
//
// Returns one of:
// { kind: 'redirect', status: 301, location: '<target>' }
// { kind: 'bad-request', status: 400, message: '<reason>' }
// { kind: 'passthrough' } — caller continues normal request flow
import { RESOLVE_ID_RE } from './resolver.mjs';
/**
 * Classify a request pathname as redirect / bad-request / passthrough.
 * Pure: it only inspects the string and builds a verdict object; nothing
 * is read from the request or written to the response.
 *
 * Rules, in evaluation order:
 *   1. /session/:worldId/plan[/]   → 301 /plan/:worldId
 *   2. /design or /design/         → 301 /
 *   3. /(world|sandbox)/:id[/...]  → 301 /worlds?highlight=:id, except
 *      the /editor and /events sub-routes, which pass through.
 *   4. anything else (including /plan/:id) → passthrough.
 * An `:id` that fails RESOLVE_ID_RE yields a 400 verdict instead of a 301.
 *
 * @param {string} pathname - URL.pathname (no querystring, no hash)
 * @returns {{ kind: 'redirect', status: 301, location: string }
 *         | { kind: 'bad-request', status: 400, message: string }
 *         | { kind: 'passthrough' }}
 */
export function evaluateRedirect(pathname) {
  // Non-strings and the empty string are never ours to redirect.
  if (typeof pathname !== 'string' || pathname.length === 0) {
    return { kind: 'passthrough' };
  }

  // Rule 1 — checked first so the `/session/...` prefix is settled before
  // the broader world/sandbox rule runs. Only the worldId segment varies.
  const sessionHit = pathname.match(/^\/session\/([^/]+)\/plan\/?$/);
  if (sessionHit !== null) {
    const candidate = sessionHit[1];
    // Validate before reflecting the segment into the Location value.
    return RESOLVE_ID_RE.test(candidate)
      ? { kind: 'redirect', status: 301, location: `/plan/${candidate}` }
      : {
          kind: 'bad-request',
          status: 400,
          message: 'invalid worldId shape on /session/:worldId/plan',
        };
  }

  // Rule 2 — fixed target, exact match only, so /designfoo and
  // /design/sub are not swept into the redirect.
  if (['/design', '/design/'].includes(pathname)) {
    return { kind: 'redirect', status: 301, location: '/' };
  }

  // Rule 3 — /world/:id and /sandbox/:id, with any trailing sub-path.
  const legacyHit = pathname.match(/^\/(world|sandbox)\/([^/]+)(\/.*)?$/);
  if (legacyHit !== null) {
    const id = legacyHit[2];
    const tail = legacyHit[3] ?? '';
    // These sub-routes are still served elsewhere; leave them alone. The
    // exclusion is checked BEFORE id validation, matching the rule order.
    const keptSubRoutes = ['/editor', '/events'];
    const isKept = keptSubRoutes.some(
      (sub) => tail === sub || tail.startsWith(`${sub}/`),
    );
    if (isKept) {
      return { kind: 'passthrough' };
    }
    return RESOLVE_ID_RE.test(id)
      ? {
          kind: 'redirect',
          status: 301,
          location: `/worlds?highlight=${encodeURIComponent(id)}`,
        }
      : {
          kind: 'bad-request',
          status: 400,
          message: 'invalid id shape on /(world|sandbox)/:id',
        };
  }

  // Rule 4 — /plan/:id deliberately has no rule of its own: emitting a
  // 301 for it would be a self-loop, so it falls through like any other
  // unmatched path.
  return { kind: 'passthrough' };
}
/**
 * Write the response described by an evaluateRedirect() verdict onto a
 * node:http ServerResponse.
 *
 *   - 'redirect'    → 301 with Location + a short plain-text body
 *   - 'bad-request' → 400 with a JSON `{ error, message }` body
 *   - anything else → nothing is written
 *
 * @param {import('node:http').ServerResponse} res
 * @param {ReturnType<typeof evaluateRedirect>} verdict
 * @returns {boolean} `true` when a response was written (the caller must
 *   stop), `false` when the caller should carry on with normal handling.
 */
export function applyRedirect(res, verdict) {
  switch (verdict.kind) {
    case 'redirect': {
      const { location } = verdict;
      const headers = {
        Location: location,
        // Five-minute cache: long enough for a bookmark to settle, short
        // enough that a mistyped URL is not pinned to a stale redirect.
        'Cache-Control': 'public, max-age=300',
        'Content-Type': 'text/plain; charset=utf-8',
      };
      res.writeHead(301, headers);
      res.end(`Moved permanently: ${location}\n`);
      return true;
    }
    case 'bad-request': {
      const payload = { error: 'bad-request', message: verdict.message };
      res.writeHead(400, {
        'Content-Type': 'application/json; charset=utf-8',
        'Cache-Control': 'no-store',
      });
      res.end(JSON.stringify(payload));
      return true;
    }
    default:
      // 'passthrough', and defensively any unrecognised verdict shape:
      // leave the response untouched.
      return false;
  }
}
// resolver.mjs — Phase A A1 (plan-chat-spa-supersedes-control-plane).
//
// Disambiguates a single opaque :id supplied on /plan/:id between
//
// - a planning session (planning_sessions.session_id), or
// - a crystallized world (planning_artifacts.crystallized_world_id), or
// - unresolvable (returns {kind:'unresolved', canonical_id:null}).
//
// Used by plan-chat-spa's useResolveId hook (Phase A A2) so the SPA's
// cold-open path can mount the correct surface without trusting the
// id-shape (sentinel `sess_*` prefix is a hint, not authority — see
// plan-chat-spa-supersedes-control-plane.md K1 SEC-1).
//
// Single SQL query (UNION ALL) so resolution costs one round-trip even
// when the id misses both tables. Bearer auth + rate-limit live in the
// HTTP handler in plan-chat-service.mjs; this helper is pool-pure for
// unit testability.
/**
 * Shape check for the opaque resolver :id: the whole string must be 6 to
 * 80 characters drawn from ASCII letters, digits, '.', '_' and '-'.
 * Mirrors plan-chat-service.mjs's SCOPE_ID_RE; tightened to 6-80 chars so
 * an enumeration attacker can't grind through 1-5 char shapes.
 *
 * The pattern carries no `g`/`y` flag, so `.test()` keeps no `lastIndex`
 * state between calls and the one instance can be shared safely.
 */
export const RESOLVE_ID_RE = /^[A-Za-z0-9._-]{6,80}$/;
/**
 * Look up an opaque id and report whether it names a planning session, a
 * crystallized world, or nothing.
 *
 * An id that is not a string, or that fails RESOLVE_ID_RE, is reported as
 * unresolved without touching the pool. Otherwise exactly one query is
 * issued, with the id passed as the single bound parameter.
 *
 * @param {{ query: (sql: string, params: unknown[]) => Promise<{ rows: unknown[] }> }} pool
 *   A pg-shaped pool. Tests pass a stub; production passes pg.Pool.
 * @param {string} id The candidate id.
 * @returns {Promise<{ kind: 'session' | 'world' | 'unresolved', canonical_id: string | null }>}
 */
export async function resolveId(pool, id) {
  // Fresh object per call so callers never share a mutable sentinel.
  const unresolved = () => ({ kind: 'unresolved', canonical_id: null });

  if (typeof id !== 'string' || !RESOLVE_ID_RE.test(id)) {
    return unresolved();
  }

  // One round-trip covering both tables. Each branch projects the same
  // (kind, canonical_id, rank) columns; ordering by rank with LIMIT 1
  // makes the session row win deterministically if an id ever appears
  // in both tables.
  const sql = `
SELECT kind, canonical_id FROM (
SELECT 'session' AS kind, session_id AS canonical_id, 1 AS rank
FROM planning_sessions
WHERE session_id = $1
UNION ALL
SELECT 'world' AS kind, crystallized_world_id AS canonical_id, 2 AS rank
FROM planning_artifacts
WHERE crystallized_world_id = $1
) AS resolved
ORDER BY rank
LIMIT 1
`;
  const result = await pool.query(sql, [id]);
  const first = result.rows?.[0];
  if (!first) {
    return unresolved();
  }

  // Accept upper-cased column names as well, so pool stubs that surface
  // KIND / CANONICAL_ID still resolve.
  const kind = first.kind ?? first.KIND;
  const canonical_id = first.canonical_id ?? first.CANONICAL_ID;

  // Anything outside the two known kinds, or without a usable id string,
  // is treated the same as "no row".
  const kindIsKnown = kind === 'session' || kind === 'world';
  const idIsUsable = typeof canonical_id === 'string' && canonical_id.length !== 0;
  return kindIsKnown && idIsUsable ? { kind, canonical_id } : unresolved();
}
/**
 * Build a per-key token-bucket rate limiter held in process memory.
 *
 * Each key (the bearer principal, per the callers' convention) starts
 * with `capacity` tokens. Tokens refill continuously at `capacity` per
 * `windowMs`, never exceeding `capacity`; each allowed call spends one.
 * The defaults give 60 requests per minute per key.
 *
 * State lives in a single in-memory Map, so limits are per-process: a
 * multi-instance deployment would need a shared store.
 *
 * @param {object} [opts]
 * @param {number} [opts.capacity] bucket size and refill amount per window (default 60)
 * @param {number} [opts.windowMs] refill window in milliseconds (default 60_000)
 * @param {() => number} [opts.now] clock in epoch ms; injectable for tests
 * @returns {{ take: (key: string) => { allowed: boolean, retryAfterMs: number }, reset: () => void }}
 */
export function createRateLimiter({
  capacity = 60,
  windowMs = 60_000,
  now = () => Date.now(),
} = {}) {
  /** key -> { tokens, lastRefill } */
  const buckets = new Map();

  /** Spend one token for `key`, or report how long until one is available. */
  const take = (key) => {
    const stamp = now();

    // First sighting of a key: start it with a full bucket.
    const known = buckets.get(key);
    const bucket = known ?? { tokens: capacity, lastRefill: stamp };
    if (!known) {
      buckets.set(key, bucket);
    }

    // Credit tokens for the time elapsed since the last refill, capped at
    // `capacity`. A non-advancing clock leaves the bucket untouched.
    const idleMs = stamp - bucket.lastRefill;
    if (idleMs > 0) {
      const earned = (idleMs / windowMs) * capacity;
      bucket.tokens = Math.min(capacity, bucket.tokens + earned);
      bucket.lastRefill = stamp;
    }

    if (bucket.tokens < 1) {
      // Time needed to accrue the missing fraction of one token.
      const shortfall = 1 - bucket.tokens;
      return { allowed: false, retryAfterMs: Math.ceil(shortfall * (windowMs / capacity)) };
    }

    bucket.tokens -= 1;
    return { allowed: true, retryAfterMs: 0 };
  };

  /** Forget every bucket; all keys start full again. */
  const reset = () => {
    buckets.clear();
  };

  return { take, reset };
}
// host-cp request router.
//
// Replaces the long linear `if (url.pathname === ...)` dispatch chain in
// server.mjs with an ordered route table. The table is walked in
// registration order, so route PRECEDENCE is preserved exactly as it was
// in the original if-ladder: the first matching route wins, later routes
// are never consulted once a match handles the request.
//
// Why a table and not a framework:
// - host-cp ships with no external HTTP framework (no express/fastify);
// this matches the existing zero-dep style.
// - The table is a plain data structure, so it is importable + unit
// testable WITHOUT booting server.mjs (which spawns docker-events,
// the auth poller, and the worlds.db reconciler at import time).
// - A route is now a table entry instead of a `return` buried in a
// 1700-line ladder. That kills the silent route-shadowing class: a
// misplaced `return` can no longer swallow a later route, and the
// full set of routes is enumerable (see `router.list()`).
//
// Behavior-preservation contract (load-bearing — see
// __tests__/router.test.mjs):
// 1. Walk order == registration order == original source order.
// 2. A route MATCHES when its matcher returns a truthy match value AND
// (no method filter OR the method matches). The matcher receives
// ({ pathname, method, url }) and returns either a boolean or, for
// regex routes, the RegExpMatchArray (truthy) so the handler can read
// capture groups.
// 3. The FIRST matching route is invoked and dispatch STOPS — identical
// to `if (cond) { ...; return; }`. The handler owns the response.
// 4. A route whose path matches but whose METHOD does not is SKIPPED,
// and the walk continues — identical to the original
// `if (pathMatch && req.method === 'X')` blocks, where a path hit
// with the wrong method fell through to the next `if`.
// 5. If no route matches, dispatch returns `false` so the caller runs
// its terminal 404 — identical to the original fall-through.
//
// The router does NOT add auth, body parsing, or any middleware semantics.
// Those stay exactly where they were in server.mjs (pre-auth routes, the
// auth gate, the plan-chat bypass) — the router only models the part of
// the chain that was a flat sequence of `if` blocks.
/**
* @typedef {object} RouteContext
* @property {string} pathname url.pathname
* @property {string} method req.method (already normalized by node to uppercase)
* @property {URL} url parsed request URL
*/
/**
* A matcher decides whether a route applies to a request, ignoring method.
* Returning a non-boolean truthy value (e.g. a RegExpMatchArray) is
* forwarded to the handler as `ctx.match` so regex routes can read groups.
*
* @typedef {(ctx: RouteContext) => (boolean | RegExpMatchArray | null | undefined)} RouteMatcher
*/
/**
* A handler receives the node req/res plus the parsed url, the matched
* value (for regex routes), and is responsible for writing the response.
* It mirrors the body of an original `if` block. Return value is ignored;
* matching alone terminates dispatch (preserving the `if ... return`
* semantics where reaching the block always handled the request).
*
* @typedef {(req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse, ctx: RouteContext & { match: any }) => unknown | Promise<unknown>} RouteHandler
*/
/**
* @typedef {object} Route
* @property {string} name human label for diagnostics / tests
* @property {string[] | null} methods allowed methods, or null for "any method"
* @property {RouteMatcher} match
* @property {RouteHandler} handler
*/
/**
 * Build an ordered router. Routes are consulted strictly in registration
 * order, so callers should register them in the order the original
 * if-ladder evaluated them.
 *
 * @returns {{
 *   register: (spec: object) => any,
 *   dispatch: (req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse, url: URL) => Promise<boolean>,
 *   list: () => Array<{ name: string, methods: string[] | null }>,
 *   readonly size: number,
 * }}
 */
export function createRouter() {
  /** @type {Route[]} */
  const table = [];

  /**
   * Normalise the `method` option: an array is copied, a single string is
   * wrapped, and anything else (omitted / null) means "any method".
   *
   * @param {string | string[] | null | undefined} method
   * @returns {string[] | null}
   */
  const toMethodList = (method) => {
    if (Array.isArray(method)) return method.slice();
    return typeof method === 'string' ? [method] : null;
  };

  /**
   * Choose the matcher for a spec. Precedence: custom `match`, then exact
   * `path`, then `prefix`, then regex `pattern`. Returns null when the
   * spec supplies none of them.
   *
   * @returns {RouteMatcher | null}
   */
  const toMatcher = ({ match, path, prefix, pattern }) => {
    if (typeof match === 'function') return match;
    if (typeof path === 'string') return (ctx) => ctx.pathname === path;
    if (typeof prefix === 'string') return (ctx) => ctx.pathname.startsWith(prefix);
    // The match array is returned as-is so handlers can read capture groups.
    if (pattern instanceof RegExp) return (ctx) => ctx.pathname.match(pattern);
    return null;
  };

  /**
   * Append a route to the table. Returns the router so calls can chain.
   *
   * @param {object} spec
   * @param {string} spec.name
   * @param {string | string[] | null} [spec.method] single method, list, or null/omitted for any
   * @param {string} [spec.path] exact pathname match
   * @param {string} [spec.prefix] pathname.startsWith(prefix) match
   * @param {RegExp} [spec.pattern] pathname.match(pattern) — match value passed to handler
   * @param {RouteMatcher} [spec.match] custom matcher (takes precedence over path/prefix/pattern)
   * @param {RouteHandler} spec.handler
   * @throws {TypeError} when `handler` is not a function, or when none of
   *   path / prefix / pattern / match is usable.
   */
  function register(spec) {
    const { name, handler } = spec;
    if (typeof handler !== 'function') {
      throw new TypeError(`route "${name}" requires a handler function`);
    }
    const methods = toMethodList(spec.method);
    const match = toMatcher(spec);
    if (match === null) {
      throw new TypeError(
        `route "${name}" requires one of: path, prefix, pattern, or match`,
      );
    }
    table.push({ name, methods, match, handler });
    return api;
  }

  /**
   * Walk the table in registration order and run the first route whose
   * matcher is truthy AND whose method filter admits the request; stop
   * there. A route whose path matches but whose method does not is
   * skipped and the walk continues (it is not turned into a 405).
   *
   * @param {import('node:http').IncomingMessage} req
   * @param {import('node:http').ServerResponse} res
   * @param {URL} url
   * @returns {Promise<boolean>} true if a route handled the request, false to fall through to 404
   */
  async function dispatch(req, res, url) {
    const ctx = { pathname: url.pathname, method: req.method ?? 'GET', url };
    for (const route of table) {
      const hit = route.match(ctx);
      const methodAdmitted =
        Boolean(hit) && (route.methods === null || route.methods.includes(ctx.method));
      if (methodAdmitted) {
        // The handler owns the response; its return value is ignored.
        await route.handler(req, res, { ...ctx, match: hit });
        return true;
      }
    }
    return false;
  }

  /**
   * Enumerate the registered routes for diagnostics, audits and tests.
   * Only the name and the method filter are exposed.
   *
   * @returns {Array<{ name: string, methods: string[] | null }>}
   */
  function list() {
    return table.map(({ name, methods }) => ({ name, methods }));
  }

  const api = {
    register,
    dispatch,
    list,
    /** Number of registered routes. */
    get size() {
      return table.length;
    },
  };
  return api;
}
// Phase F-2-B (B3): per-world secret cache.
//
// Pattern lifted from `packages/cloudflare-worker/src/index.ts:428-446`
// (`getContainerSecret`). CF Worker uses Durable Object storage with a
// 1h TTL; host CP uses an in-memory Map with a 5min TTL (D2 — demoted
// from 1h after the security review pass).
//
// The cache invalidates on two paths:
// 1. TTL expiry (lazy: checked on each `get(worldId)` call)
// 2. Docker events stream (eager: docker-events.mjs subscribes to
// `restart` / `stop` events and calls `invalidate(worldId)` —
// M2 ship gate is "docker restart <world>; within 10s, proxy
// call returns 200 not 401").
/**
* @typedef {object} CacheEntry
* @property {string} secret
* @property {number} expiresAt epoch ms
*/
export class SecretCache {
  /**
   * In-memory, per-world secret cache with a fixed time-to-live.
   *
   * @param {object} opts
   * @param {number} [opts.ttlSec] cache TTL in seconds (default 300 = 5min)
   * @param {() => number} [opts.now] clock injectable for tests
   * @param {(message: string) => void} [opts.log] logger injectable for tests
   */
  constructor({ ttlSec = 300, now = Date.now, log = console.log } = {}) {
    this.ttlMs = ttlSec * 1000;
    /** @type {Map<string, CacheEntry>} */
    this.entries = new Map();
    this.now = now;
    this.log = log;
  }

  /**
   * Fetch the cached secret for a world. Returns null when nothing is
   * cached or when the entry has reached its expiry time; in the latter
   * case the stale entry is removed on the spot. Expiry is only checked
   * here (lazily) and is deliberately silent — logging on every expired
   * read would be noisy under load. The caller is expected to re-fetch
   * and call set().
   *
   * @param {string} worldId
   * @returns {string | null}
   */
  get(worldId) {
    const hit = this.entries.get(worldId);
    if (!hit) {
      return null;
    }
    const expired = hit.expiresAt <= this.now();
    if (!expired) {
      return hit.secret;
    }
    this.entries.delete(worldId);
    return null;
  }

  /**
   * Store a freshly fetched secret, replacing any previous entry. This is
   * the only place an expiry time is assigned, so a read never pushes an
   * entry's lifetime past ttlMs from the most recent set().
   *
   * @param {string} worldId
   * @param {string} secret
   */
  set(worldId, secret) {
    const expiresAt = this.now() + this.ttlMs;
    this.entries.set(worldId, { secret, expiresAt });
  }

  /**
   * Drop one world's entry immediately (the eager path, used when a
   * world restarts or stops). Logs only when something was removed.
   *
   * @param {string} worldId
   * @returns {boolean} true if an entry existed and was removed
   */
  invalidate(worldId) {
    // Map#delete reports whether the key was present, which is exactly
    // the value callers and tests observe.
    const removed = this.entries.delete(worldId);
    if (removed) {
      this.log(`secret-cache: invalidated ${worldId}`);
    }
    return removed;
  }

  /**
   * Empty the cache. Intended for shutdown teardown and for tests.
   */
  clear() {
    this.entries.clear();
  }

  /**
   * List the worldIds that currently have an entry (for diagnostics and
   * tests). Only keys are exposed — never the secrets. Because expiry is
   * lazy, an entry that has expired but not yet been read is still listed.
   *
   * @returns {string[]}
   */
  worldIds() {
    return Array.from(this.entries.keys());
  }
}
// serve-only-config.mjs — host-cp SERVE-ONLY mode gate (Phase A of
// host-cp-gke-serve-only-mode).
//
// host-cp normally runs as a local operator sidecar coupled to the host's
// docker daemon + operator-repo + gh-config. On a managed GKE cluster those
// host-couplings are absent: host-cp only serves plan-chat-spa + the
// host-native `/api/*` surface; world orchestration runs elsewhere.
//
// `OLAM_HOST_CP_SERVE_ONLY=true` switches host-cp into that degraded shape:
// - no docker transport connect, no world discovery
// - no PlanOrchestrator docker wiring, no pr-merge-poller docker/repo deps
// - world-orchestration routes (`/api/world/*`) return a structured 503
// - version-status degrades to 'unknown' (no operator-repo)
//
// The flag defaults OFF — the local docker/k3d FULL mode is byte-for-byte
// unchanged. This module is a tiny pure seam so the gate decision can be
// unit-tested WITHOUT booting server.mjs (which connects docker + binds a
// port at module load and therefore can't be imported in a test).
//
// ONE coarse flag — no granular per-subsystem toggles (plan S1 / YAGNI).
/**
 * Report whether host-cp should run in SERVE-ONLY mode.
 *
 * The flag is parsed strictly: only the exact string `'true'` in
 * `OLAM_HOST_CP_SERVE_ONLY` turns the mode on. Unset, `'1'`, `'false'`,
 * `''`, `'TRUE'` and every other value leave FULL mode in place, so the
 * default stays OFF and the mode cannot be half-enabled by accident.
 *
 * @param {NodeJS.ProcessEnv | Record<string, string | undefined>} [env]
 *   Environment to read `OLAM_HOST_CP_SERVE_ONLY` from. Defaults to
 *   `process.env`. A null environment is tolerated and reads as "off".
 * @returns {boolean} `true` when serve-only mode is active.
 */
export function isServeOnly(env = process.env) {
  const flag = env?.OLAM_HOST_CP_SERVE_ONLY;
  return flag === 'true';
}
/**
 * Structured 503 body for world-orchestration routes that are unavailable
 * in serve-only mode. Reuses the host-cp `/api/*` JSON-error shape
 * (`{ error, message }`) so SPA error handling treats it uniformly.
 *
 * The object is frozen, so this single shared instance cannot be mutated
 * by a caller that sends it.
 *
 * @type {{ error: 'orchestration_unavailable', message: string }}
 */
export const ORCHESTRATION_UNAVAILABLE = Object.freeze({
// Machine-readable error code.
error: 'orchestration_unavailable',
// Human-readable explanation of why the route is unavailable.
message:
'host-cp is in serve-only mode (managed cluster); world orchestration runs elsewhere',
});
/**
 * Decide whether a request targets a world-ORCHESTRATION route, i.e. one
 * that must degrade to a structured 503 in serve-only mode.
 *
 * Counted as orchestration:
 *   - `/api/world/...`   the singular per-world proxy (+ /progress)
 *   - `/v1/worlds/...`   the CLI per-world routes
 *   - `/api/worlds`      for every method except GET and HEAD (creation)
 *   - `/api/worlds/<id>` and anything beneath it (per-world reads and
 *     mutations such as tunnels or destroy)
 *
 * Deliberately NOT orchestration: `GET`/`HEAD /api/worlds`, the bare LIST
 * endpoint, which answers with an empty array in serve-only mode.
 *
 * The breadth matters: guarding only the singular prefix would let a
 * serve-only host-cp execute tunnel/destroy mutations through the plural
 * routes.
 *
 * @param {unknown} pathname URL.pathname (no querystring).
 * @param {string} [method] HTTP method (defaults 'GET').
 * @returns {boolean}
 */
export function isOrchestrationRoute(pathname, method = 'GET') {
  if (typeof pathname !== 'string') {
    return false;
  }

  // Prefixes that are orchestration regardless of method.
  const alwaysOrchestration = ['/api/world/', '/v1/worlds/'];
  if (alwaysOrchestration.some((prefix) => pathname.startsWith(prefix))) {
    return true;
  }

  // Bare collection: reads are the honest-empty LIST, everything else
  // (notably POST = create) is orchestration.
  if (pathname === '/api/worlds') {
    const isRead = method === 'GET' || method === 'HEAD';
    return !isRead;
  }

  // Any per-world subpath under the plural collection.
  return /^\/api\/worlds\/[^/?#]+/.test(pathname);
}

Sorry, the diff of this file is too big to display

// Phase F-2-B (B5): SSE concurrent-connection gate + path detection.
//
// Background. Each open SSE proxy holds:
// - A Node http.ClientRequest to the per-world CP (one fd)
// - The browser's incoming socket (one fd)
// Plus the Node event loop wakes on every chunk. With N worlds × M tabs
// × Sse-per-tab, the FD budget grows linearly. P3 budgets ≤100 concurrent
// SSE proxies; P4 caps at 50 + returns 503 with Retry-After: 30 above
// that. Below the cap there's no impact.
//
// Cap semantics:
// - SseGate.acquire(res) returns a { release } handle when a slot is
// free; at the cap it writes the 503 itself and returns null → reject.
// - release() is idempotent + fire-once (guarded by a `released` flag)
// because Node emits both 'close' and 'finish' on a normal stream
// end. Without idempotency the counter would underflow.
//
// SSE detection is path-based (cheap; runs before opening upstream).
// Three path patterns are treated as SSE today (see SSE_PATH_PATTERNS):
// /api/stream — per-world CP's existing SSE feed
// /api/world/<id>/bootstrap-progress — placeholder for B7's UI strip
// (per-world CP route lands later)
// /api/logs — also matched by SSE_PATH_PATTERNS
// Unanchored inner-route patterns. Each requires the route name to be
// followed by '/', '?', or the end of the string, so e.g. `/api/streams`
// does not count as `/api/stream`.
const SSE_PATH_PATTERNS = [
  /\/api\/stream(?:\/|$|\?)/,
  /\/bootstrap-progress(?:\/|$|\?)/,
  /\/api\/logs(?:\/|$|\?)/,
];
/**
 * Tell whether an upstream subPath is one of the known SSE streams. The
 * subPath is the value emitted by `parseProxyPath()` — everything AFTER
 * `/api/world/<id>` — so the check targets the inner route rather than
 * the `/api/world/<id>` prefix.
 *
 * @param {string} subPath
 * @returns {boolean} true when any entry of SSE_PATH_PATTERNS matches
 */
export function isSsePath(subPath) {
  for (const pattern of SSE_PATH_PATTERNS) {
    if (pattern.test(subPath)) {
      return true;
    }
  }
  return false;
}
export class SseGate {
  /**
   * Counting gate that caps the number of concurrently open SSE proxies.
   *
   * @param {object} opts
   * @param {number} [opts.maxConcurrent] default 50 (P4 cap)
   * @param {(message: string) => void} [opts.log]
   * @throws {Error} when maxConcurrent is not a number >= 1 (this includes
   *   NaN, e.g. the result of Number() on an unset env var)
   */
  constructor({ maxConcurrent = 50, log = console.log } = {}) {
    // Written as `!(x >= 1)` rather than `x < 1` on purpose: NaN and
    // non-numeric strings compare false against everything, so `x < 1`
    // would wave them through and `active >= NaN` would then never trip —
    // silently disabling the cap.
    if (!(maxConcurrent >= 1)) {
      throw new Error('SseGate: maxConcurrent must be >= 1');
    }
    this.maxConcurrent = maxConcurrent;
    this.active = 0;
    this.log = log;
  }

  /**
   * Try to acquire a slot. If at cap, returns null + writes a 503 (with
   * Retry-After: 30 and a JSON body) to res. Caller MUST check the
   * return value and must not write to res again after a null.
   *
   * @param {import('node:http').ServerResponse} res
   * @returns {{ release: () => void } | null}
   */
  acquire(res) {
    if (this.active >= this.maxConcurrent) {
      res.writeHead(503, {
        'Content-Type': 'application/json; charset=utf-8',
        'Retry-After': '30',
      });
      res.end(JSON.stringify({
        error: 'sse_capacity_reached',
        active: this.active,
        cap: this.maxConcurrent,
        retry_after_sec: 30,
        message: 'host CP has reached the SSE concurrent-connection cap. Retry after the indicated delay or close idle SPA tabs.',
      }));
      this.log(`sse-gate: 503 — cap reached (active=${this.active}, cap=${this.maxConcurrent})`);
      return null;
    }
    this.active++;
    // One-shot release: the flag makes repeated calls harmless, so the
    // same closure can be attached to several lifecycle events without
    // the counter underflowing.
    let released = false;
    const release = () => {
      if (released) return;
      released = true;
      this.active--;
    };
    return { release };
  }

  /**
   * Diagnostics for /health.
   *
   * @returns {{ active: number, cap: number }}
   */
  stats() {
    return {
      active: this.active,
      cap: this.maxConcurrent,
    };
  }
}
/**
 * Attach an SSE-gate release callback to a ServerResponse so the slot is
 * given back however the response ends.
 *
 * A response can end via 'close' (client went away, or an error tore the
 * connection down) or via 'finish' (res.end() completed), and a normal
 * stream end emits both. The callback is therefore registered for both
 * events; this relies on `release` being idempotent — as the closure
 * returned by SseGate#acquire is — so the counter drops exactly once
 * per acquire().
 *
 * @param {import('node:http').ServerResponse} res
 * @param {() => void} release
 */
export function wireRelease(res, release) {
  for (const lifecycleEvent of ['close', 'finish']) {
    res.on(lifecycleEvent, release);
  }
}
// packages/host-cp/src/tasks-route.mjs
//
// B2.2: mount @olam/tasks-write-api's framework-neutral handlers under
// /api/tasks/*. host-cp owns the pg.Pool (per D-B-19 olam-local-PG-primary);
// wraps it via pgPoolExecutor (B2.1.1 adapter) and passes as HandlerDeps.pglite
// (duck-typed; PgExecutor's query/exec/transaction match PGlite's shape).
//
// Auth model: leverages host-cp's existing StartupToken bearer gate (Authorization:
// Bearer <token>). Per-request scopes + olamNodeId come from headers:
// X-Olam-Node-Id: UUID of the caller's olam node (sets RLS scope per D-B-23)
// X-Olam-Session-Id: UUID of the caller's session row (FK for task_claims)
// X-Olam-Tasks-Scopes: comma-separated scope list (tasks-create,tasks-claim,
// tasks-state-update,tasks-query). Trust model: bearer
// token gates access; scope header lets the caller declare
// narrower intent.
//
// Deviation from B2.2 plan spec: spec called for JWT + auth-service integration;
// host-cp uses opaque tokens (StartupToken) and HTTP calls auth-service via HTTP.
// JWT scope encoding deferred to Phase D++ when multi-user auth lands; for v1,
// the existing bearer + per-request header model is sufficient (single-operator;
// 127.0.0.1:19000 only per host-cp threat model).
import pg from 'pg';
// Treat BIGINT (OID 20) as Number, not the default string. The tasks schema's
// `version` column is BIGINT but stays well within Number-safe range; without
// this parser pg returns the value as a string, while the task-store types
// declare it as `number` (and PGlite returns BigInt by default). A BigInt
// reaching JSON.stringify throws — that caused /api/tasks 500s with
// "Do not know how to serialize a BigInt" during the CLI E2E proof.
// SQL NULL (null/undefined here) is passed through as null.
// NOTE(review): this registers on the shared `pg.types` object when the
// module is imported, so it presumably applies to every pg client in the
// process, not just this route's pool — confirm that is intended.
pg.types.setTypeParser(20, (v) => (v == null ? null : Number.parseInt(v, 10)));
// Module-level lazy state, all cleared again by _resetForTests().
// writeApi: the dynamically imported @olam/tasks-write-api module.
let writeApi = null; // lazy-load tasks-write-api to keep cold-path light
// executor: ensureExecutor() returns this directly whenever it is truthy.
let executor = null;
// pool: the pg.Pool created by ensureExecutor(); ended by _resetForTests().
let pool = null;
// Allow-list for the X-Olam-Tasks-Scopes header; parseAuth() drops any
// scope name that is not in this set.
const VALID_SCOPES = new Set(['tasks-create', 'tasks-claim', 'tasks-state-update', 'tasks-query']);
// Canonical 8-4-4-4-12 hex UUID, case-insensitive, anchored at both ends.
// Used by parseAuth() to validate the node-id and session-id headers.
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
/**
 * Load @olam/tasks-write-api on first use and memoise the module in
 * `writeApi`; later calls return the cached module without importing.
 *
 * The package is built TypeScript (ESM dist). If it has not been built
 * the dynamic import rejects and nothing is cached, so the failure stays
 * loud — the operator must run
 * `npm run build --workspace=@olam/tasks-write-api` before host-cp starts.
 *
 * @returns {Promise<object>} the tasks-write-api module namespace
 */
async function ensureWriteApi() {
  if (!writeApi) {
    writeApi = await import('@olam/tasks-write-api');
  }
  return writeApi;
}
/**
 * Return the process-wide PgExecutor for the tasks routes, creating the
 * pg.Pool and wrapping it on first use.
 *
 * The executor is memoised in `executor`. Without that assignment the
 * early-return check at the top could never succeed and every request
 * would construct a brand-new pg.Pool (up to 8 connections each) that is
 * never ended — a steady connection leak.
 *
 * Precondition: ensureWriteApi() must have resolved first, because the
 * adapter factory (`pgPoolExecutor`) is read from the loaded module.
 *
 * @returns {object} the executor produced by writeApi.pgPoolExecutor(pool)
 * @throws {Error} when OLAM_LOCAL_PG_URL is not set
 */
function ensureExecutor() {
  if (executor) return executor;
  const connectionString = process.env.OLAM_LOCAL_PG_URL;
  if (!connectionString) {
    throw new Error(
      'tasks-route: OLAM_LOCAL_PG_URL not set. Bring up Docker PG: docker compose -f packages/infra/docker-compose.local-electric.yml up -d, then export OLAM_LOCAL_PG_URL=postgres://postgres:olam@localhost:54331/olam_tasks',
    );
  }
  // Reuse a pool left over from an earlier attempt whose wrapping step
  // threw, instead of stacking another pool on top of it.
  pool ??= new pg.Pool({ connectionString, max: 8 });
  // Cache the adapter so subsequent requests share this one pool.
  executor = writeApi.pgPoolExecutor(pool);
  return executor;
}
/**
 * Extract the per-request tasks identity from the request headers.
 *
 * Reads X-Olam-Node-Id and X-Olam-Session-Id (both must match UUID_RE)
 * and X-Olam-Tasks-Scopes (comma-separated; entries are trimmed and
 * anything not in VALID_SCOPES is dropped). Returns null if either id is
 * missing or malformed, or if no recognised scope remains.
 *
 * @param {import('node:http').IncomingMessage} req
 * @returns {{ olamNodeId: string, sessionId: string, scopes: string[] } | null}
 */
function parseAuth(req) {
  const { headers } = req;
  const nodeHeader = headers['x-olam-node-id'];
  const sessionHeader = headers['x-olam-session-id'];

  const isUuid = (raw) => Boolean(raw) && UUID_RE.test(String(raw));
  if (!isUuid(nodeHeader) || !isUuid(sessionHeader)) {
    return null;
  }

  // Keep only recognised scope names, preserving their order (and any
  // duplicates) as sent.
  const granted = [];
  for (const piece of String(headers['x-olam-tasks-scopes'] ?? '').split(',')) {
    const scope = piece.trim();
    if (VALID_SCOPES.has(scope)) {
      granted.push(scope);
    }
  }
  if (granted.length === 0) {
    return null;
  }

  return {
    olamNodeId: String(nodeHeader),
    sessionId: String(sessionHeader),
    scopes: granted,
  };
}
/**
 * Read and JSON-parse the request body.
 *
 *   - GET / HEAD            → `{}` without touching the stream
 *   - empty body            → `{}`
 *   - body is valid JSON    → the parsed value
 *   - body is invalid JSON  → `{ __invalid: true }` (the caller turns this
 *     into a 400)
 *
 * Chunks are collected as bytes and decoded as UTF-8 ONCE at the end.
 * Appending each chunk to a string would decode every chunk on its own,
 * so a multi-byte character split across two chunks would come out as
 * replacement characters.
 *
 * @param {import('node:http').IncomingMessage} req
 * @returns {Promise<unknown>} rejects if the request stream emits 'error'
 */
async function readBody(req) {
  if (req.method === 'GET' || req.method === 'HEAD') return {};
  return new Promise((resolve, reject) => {
    /** @type {Uint8Array[]} */
    const chunks = [];
    req.on('data', (chunk) => {
      // A stream with an encoding already set hands over strings; convert
      // them back to bytes so both cases share a single decode path.
      chunks.push(typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : chunk);
    });
    req.on('end', () => {
      const raw = Buffer.concat(chunks).toString('utf8');
      if (!raw) return resolve({});
      try {
        resolve(JSON.parse(raw));
      } catch {
        // Surface bad JSON as a sentinel value rather than a rejection.
        resolve({ __invalid: true });
      }
    });
    req.on('error', reject);
  });
}
/**
 * Send a JSON envelope with the given HTTP status.
 *
 * Serialisation is BigInt-safe: any bigint in the envelope is emitted as
 * a JSON number, because plain JSON.stringify throws on BigInt. Per the
 * file's own notes the affected fields (e.g. a task's `version`) stay
 * within Number range, so the conversion is treated as lossless.
 *
 * @param {import('node:http').ServerResponse} res
 * @param {number} status
 * @param {unknown} envelope
 */
function sendEnvelope(res, status, envelope) {
  /** JSON.stringify replacer: downgrade bigint → number, pass the rest. */
  const bigintAsNumber = (_key, value) => {
    if (typeof value === 'bigint') {
      return Number(value);
    }
    return value;
  };
  res.statusCode = status;
  res.setHeader('Content-Type', 'application/json');
  const payload = JSON.stringify(envelope, bigintAsNumber);
  res.end(payload);
}
/**
 * Dispatch a /api/tasks/* request to the matching tasks-write-api handler.
 *
 * Only `/api/tasks` itself and paths beneath `/api/tasks/` are claimed.
 * The match is made on a segment boundary: a bare `startsWith('/api/tasks')`
 * would also claim look-alikes such as `/api/tasksfoo`, and because
 * routing below keys on segment COUNT, `POST /api/tasksfoo` would then be
 * dispatched to the create handler.
 *
 * Routes:
 *   POST /api/tasks                 → createHandler
 *   GET  /api/tasks                 → queryHandler
 *   POST /api/tasks/claim           → claimHandler
 *   GET  /api/tasks/distill         → distillHandler
 *   POST /api/tasks/:id/heartbeat   → heartbeatHandler
 *   POST /api/tasks/:id/complete    → completeHandler
 *   POST /api/tasks/:id/update      → updateHandler
 *   anything else under /api/tasks  → 404 envelope
 *
 * Failure responses written here: 500 when the write-api module cannot be
 * loaded or a handler throws, 503 when the executor cannot be created,
 * 401 for missing/malformed auth headers, 400 for an invalid JSON body.
 *
 * @param {import('node:http').IncomingMessage} req
 * @param {import('node:http').ServerResponse} res
 * @param {URL} url
 * @returns {Promise<boolean>} true if handled (a response was written);
 *   false if the path is not ours and the caller should continue to the
 *   next dispatcher in server.mjs.
 */
export async function dispatchTasksRoute(req, res, url) {
  const pathname = url.pathname;
  // Segment-boundary match — see the doc comment above.
  if (pathname !== '/api/tasks' && !pathname.startsWith('/api/tasks/')) return false;

  // Lazy initialise on first request (avoids boot-time crash when PG not up).
  let api;
  try {
    api = await ensureWriteApi();
  } catch (e) {
    sendEnvelope(res, 500, { success: false, data: null, error: `tasks-write-api unbuilt: ${e.message}` });
    return true;
  }
  let exec;
  try {
    exec = ensureExecutor();
  } catch (e) {
    sendEnvelope(res, 503, { success: false, data: null, error: e.message });
    return true;
  }

  const auth = parseAuth(req);
  if (!auth) {
    sendEnvelope(res, 401, {
      success: false,
      data: null,
      error: 'Missing or malformed X-Olam-Node-Id / X-Olam-Session-Id / X-Olam-Tasks-Scopes headers',
    });
    return true;
  }

  const body = await readBody(req);
  if (body && body.__invalid) {
    sendEnvelope(res, 400, { success: false, data: null, error: 'Invalid JSON body' });
    return true;
  }

  // Route matching — minimal pattern (host-cp's existing if-ladder style).
  const segments = pathname.split('/').filter(Boolean); // ['api','tasks',...]
  const ctx = { auth, params: {}, query: Object.fromEntries(url.searchParams) };
  // The handlers take the executor under the `pglite` key.
  const deps = { pglite: exec };
  try {
    let response;
    if (segments.length === 2 && req.method === 'POST') {
      response = await api.createHandler({ ...ctx, body }, deps);
    } else if (segments.length === 2 && req.method === 'GET') {
      response = await api.queryHandler({ ...ctx, body }, deps);
    } else if (segments.length === 3 && segments[2] === 'claim' && req.method === 'POST') {
      response = await api.claimHandler({ ...ctx, body }, deps);
    } else if (segments.length === 3 && segments[2] === 'distill' && req.method === 'GET') {
      response = await api.distillHandler({ ...ctx, body }, deps);
    } else if (segments.length === 4 && segments[3] === 'heartbeat' && req.method === 'POST') {
      ctx.params.id = segments[2];
      response = await api.heartbeatHandler({ ...ctx, body }, deps);
    } else if (segments.length === 4 && segments[3] === 'complete' && req.method === 'POST') {
      ctx.params.id = segments[2];
      response = await api.completeHandler({ ...ctx, body }, deps);
    } else if (segments.length === 4 && segments[3] === 'update' && req.method === 'POST') {
      ctx.params.id = segments[2];
      response = await api.updateHandler({ ...ctx, body }, deps);
    } else {
      sendEnvelope(res, 404, { success: false, data: null, error: `Unknown /api/tasks route: ${req.method} ${pathname}` });
      return true;
    }
    sendEnvelope(res, response.status, response.envelope);
    return true;
  } catch (e) {
    // A throwing handler becomes a 500 envelope; the request still counts
    // as handled.
    console.error('[tasks-route] handler error:', e);
    sendEnvelope(res, 500, { success: false, data: null, error: e?.message ?? 'internal error' });
    return true;
  }
}
/**
 * Test surface — lets unit tests wipe this module's cached state between
 * cases. Clears the cached write API and executor, asks the pool (when
 * one exists) to end, and drops the pool reference. A rejected `end()`
 * is deliberately ignored: this is best-effort teardown.
 */
export function _resetForTests() {
  const ignoreRejection = () => undefined;

  writeApi = null;
  executor = null;

  if (pool) {
    pool.end().catch(ignoreRejection);
  }
  pool = null;
}
// Upgrade-trigger: spawn an ephemeral `olam upgrade` runner container.
//
// The user clicks "Run upgrade" in the dashboard → host-cp's
// /api/admin/upgrade endpoint asks the docker daemon (via the
// socket-proxy sidecar) to create + start a one-off container that
// runs the olam CLI's full upgrade pipeline against the local stack.
//
// Why a separate container (and not a child process inside host-cp)?
// `olam upgrade` recreates host-cp itself as part of the atomic
// tag-swap. If the orchestrator lived inside host-cp, the moment it
// asked docker to stop the old host-cp container the orchestrator
// would die with it — leaving no one to start the new container.
// A sibling container survives host-cp's recreate.
//
// Why this same image (not a purpose-built `olam-upgrader`)?
// The host-cp image already has Node, the olam CLI, the docker CLI,
// and the docker compose plugin baked in by Dockerfile. Reusing it
// means there's nothing extra to publish, and the upgrader is
// guaranteed to ship from the same source SHA as the host-cp it
// replaces. The upgrader's `Cmd` overrides host-cp's default CMD
// so it runs the CLI instead of starting the server.
//
// Security note (single-user-trusted-local-dev assumption):
// POST /api/admin/upgrade requires the host-cp auth token. Anyone
// with that token can already spawn arbitrary commands inside
// running devboxes via the existing exec path; spawning an upgrader
// container does not meaningfully widen the blast radius for the
// single-user model. Multi-user / hosted deployments will need a
// tighter policy (capability bit, user-scoped tokens).
import http from 'node:http';
/**
 * Default upgrader entrypoint: a single `sh -c` script that logs docker
 * in to GHCR (where the host-cp / auth / devbox images live), then runs
 * the full atomic-swap pipeline via `olam upgrade -y`.
 *
 * Auth resolution order:
 *   1. `$GH_TOKEN` env var (set on host-cp via compose; operator
 *      typically resolves it from `gh auth token` before `olam host-cp
 *      start`). Required path on macOS — the host's `gh` keeps the
 *      token in Keychain, which doesn't follow into a Linux container.
 *   2. `gh auth token` against the mounted ~/.config/gh. Works for Linux
 *      operators whose gh keeps the token in the config dir directly.
 *   3. No token: a warning is printed to stderr, the `docker login` step
 *      is skipped, and the upgrade still runs. Pulls from ghcr.io are
 *      then expected to be rejected by the daemon, so the upgrader exits
 *      non-zero with the daemon's error visible in `docker logs`.
 *
 * The steps are joined with ` && `, so any failing step stops the
 * script. For that reason the login step is written as
 * `if …; then …; fi` rather than `[ -n "$TOKEN" ] && …`: a bare test
 * that evaluates false would itself count as a failed step and abort
 * the chain before `olam upgrade` ever ran, defeating case 3 above.
 *
 * Wrapping the auth + upgrade in a single `sh -c` invocation lets the
 * `docker login` stage feed credentials directly into the docker
 * daemon without leaking the token through shared volumes.
 */
const DEFAULT_UPGRADER_CMD = [
  'sh',
  '-c',
  [
    'set -e',
    // Resolve the GH token. Prefer the env var (works on every OS);
    // fall back to `gh auth token`, which only helps where gh stores the
    // token in its config dir (i.e. not on macOS Keychain hosts).
    'TOKEN="${GH_TOKEN:-$(gh auth token 2>/dev/null || true)}"',
    'if [ -z "$TOKEN" ]; then echo "[upgrader] no GH_TOKEN; ghcr pulls will fail" >&2; fi',
    // Authenticate against ghcr.io only when a token was resolved.
    // `oauth2` is GitHub's canonical username placeholder for PAT-style
    // tokens. A failed login still aborts the chain (the `if` compound
    // returns the login's exit status).
    'if [ -n "$TOKEN" ]; then echo "$TOKEN" | docker login ghcr.io -u oauth2 --password-stdin; fi',
    // The CLI resolves `packages/host-cp/compose.yaml` relative to its
    // current working directory. The operator's olam repo is bind-mounted
    // at /workspace by spawnUpgraderContainer; cd there so the lookup
    // succeeds. Without this the recreate step fails with `open
    // <cwd>/packages/host-cp/compose.yaml: no such file or directory`
    // because the npm-installed @pleri/olam-cli package does NOT bundle
    // the compose file (it's repo-source only).
    'cd /workspace',
    // Then run the upgrade. The CLI handles pull-by-digest, atomic
    // swap, recreate, and the post-recreate /api/version/status
    // round-trip itself.
    'olam upgrade -y',
  ].join(' && '),
];
/**
 * Spawn the upgrader: create a one-off container through the docker
 * HTTP API (`POST /containers/create`) and start it
 * (`POST /containers/{id}/start`). Resolves with the container ID and
 * generated name on a successful start; throws on any failure path so
 * the caller can surface a clean 500 with the daemon's reason.
 *
 * @param {object} args
 * @param {string} args.dockerHost             tcp://docker-socket-proxy:2375 or 'docker-cli'
 *                                             ('docker-cli' is rejected — bare-node is unsupported)
 * @param {string} args.olamHomeHostPath       e.g. /Users/ernie/.olam
 * @param {string} args.dockerSockHostPath     e.g. /var/run/docker.sock
 * @param {string} args.image                  upgrader image (defaults to host-cp's own image)
 * @param {string} [args.ghConfigHostPath]     operator's ~/.config/gh; bind-mounted ro for
 *                                             `gh auth token` to work inside the upgrader
 *                                             (Linux fallback only; macOS uses GH_TOKEN env)
 * @param {string} [args.ghToken]              pre-resolved GH token (typically read from
 *                                             host-cp's GH_TOKEN env via compose). Passed
 *                                             to the upgrader as $GH_TOKEN so `docker login
 *                                             ghcr.io` works on macOS hosts whose Keychain-
 *                                             backed gh config can't be read inside a Linux
 *                                             container.
 * @param {string} args.repoHostPath           operator's olam repo path on the host
 *                                             (required). Bind-mounted read-only into the
 *                                             upgrader at /workspace so the CLI's
 *                                             cwd-relative compose-file lookup resolves (the
 *                                             npm package doesn't bundle
 *                                             packages/host-cp/compose.yaml).
 * @param {string} [args.operatorHomeHostPath] operator's $HOME on the host. Passed as the
 *                                             upgrader's HOME env so docker-compose's
 *                                             `${HOME}` interpolation in bind sources
 *                                             resolves to a daemon-visible path. Falls back
 *                                             to /root when unset or empty.
 * @param {ReadonlyArray<string>} [args.cmd]   override the upgrade command for tests
 * @param {(host: string, init: object) => Promise<Response>} [args.fetchImpl]
 * @param {(message: string) => void} [args.log]
 * @returns {Promise<{ id: string, name: string }>}
 * @throws {Error} when a required argument is missing, when
 *   `dockerHost === 'docker-cli'`, or when the daemon rejects the
 *   create/start call (any non-OK status; 304 on start is accepted).
 */
export async function spawnUpgraderContainer({
  dockerHost,
  olamHomeHostPath,
  dockerSockHostPath,
  image,
  ghConfigHostPath,
  ghToken,
  repoHostPath,
  operatorHomeHostPath,
  cmd = DEFAULT_UPGRADER_CMD,
  fetchImpl = globalThis.fetch,
  log = console.log,
}) {
  if (!olamHomeHostPath) {
    throw new Error('OLAM_HOME_HOST_PATH not set; cannot bind-mount operator state');
  }
  if (!dockerSockHostPath) {
    throw new Error('OLAM_DOCKER_SOCK_HOST_PATH not set; upgrader cannot reach docker daemon');
  }
  if (!image) {
    throw new Error('upgrader image not configured (OLAM_UPGRADER_IMAGE)');
  }
  if (!repoHostPath) {
    throw new Error(
      'OLAM_REPO_HOST_PATH not set; upgrader cannot find packages/host-cp/compose.yaml',
    );
  }
  // Without this guard a missing dockerHost surfaced as an opaque
  // "Cannot read properties of undefined (reading 'replace')" TypeError.
  if (!dockerHost) {
    throw new Error('dockerHost not set; upgrader cannot reach the docker API');
  }
  // Bare-node (operator's host docker CLI on PATH) is documented but
  // out of scope for the trigger feature — container + unix-socket paths
  // are supported (compose stack and k8s hostPath socket mount).
  if (dockerHost === 'docker-cli') {
    // The literals below (`unix:///var/run/docker.sock` and `tcp://docker-socket-proxy:2375`)
    // are diagnostic text naming the deployment shapes that ARE supported,
    // not hostnames being used as transport — error-message-only.
    throw new Error(
      'upgrade-trigger requires a docker socket (unix:///var/run/docker.sock via k8s hostPath mount, ' + // bare-node-allow: diagnostic-text
        'or tcp://docker-socket-proxy:2375 via compose); bare-node not yet supported. ' + // bare-node-allow: diagnostic-text
        'For k8s: ensure the cluster was created with ' +
        '--volume /var/run/docker.sock:/var/run/docker.sock@server:* ' +
        'and olam doctor reports probeDockerSocketBindMount [PASS].',
    );
  }

  const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
  const containerName = `olam-upgrader-${Date.now()}`;

  // Resolve the upgrader's home directory ONCE so the HOME env and the
  // bind-mount targets can never disagree. Previously HOME used `??`
  // while the binds used a truthiness check, so an empty-string
  // operatorHomeHostPath produced `HOME=` alongside mounts under /root.
  //
  // /root is the back-compat default for tests that don't pass the
  // operator host path; production callers (server.mjs) pass
  // operatorHomeHostPath through.
  const upgraderHome = operatorHomeHostPath || '/root';

  // Container create body. AutoRemove cleans up on exit so we don't
  // accumulate stopped upgrader carcasses; HostConfig.Binds gives the
  // CLI everything it needs (operator state + docker socket + repo).
  const createBody = {
    Image: image,
    Cmd: [...cmd],
    Entrypoint: [], // override host-cp's tini entrypoint; olam CLI is self-contained
    Env: [
      // HOME serves two roles:
      //   - The CLI uses HOME to resolve ~/.olam (we bind-mount the
      //     operator's ~/.olam into the upgrader so the CLI sees its
      //     state).
      //   - docker-compose interpolates `${HOME}` in bind sources of
      //     compose.yaml. The daemon resolves those bind sources on
      //     the HOST filesystem, so HOME must be a path the daemon
      //     can find (typically the operator's host $HOME).
      `HOME=${upgraderHome}`,
      // Non-interactive mode + auto-yes are belt-and-braces: -y flag is
      // also passed in Cmd, but env is the canonical way to opt out of
      // tty prompts when stdin is closed.
      'OLAM_NON_INTERACTIVE=1',
      'CI=1',
      // GH token forwarded for the docker-login-to-ghcr step. Only
      // included when host-cp received it (compose set GH_TOKEN); the
      // wrapper's auth fallback handles the unset case explicitly.
      ...(ghToken ? [`GH_TOKEN=${ghToken}`] : []),
    ],
    HostConfig: {
      AutoRemove: true,
      // Bind sources are resolved by the docker daemon on the HOST
      // filesystem, so target paths must match the host's view too —
      // when the upgrader's `docker compose up` recreates host-cp,
      // compose's ${HOME} interpolation pulls the HOME value set in Env
      // above. We keep the source==target convention for ~/.olam so the
      // path is identical inside and out.
      Binds: [
        `${olamHomeHostPath}:${upgraderHome}/.olam`,
        `${dockerSockHostPath}:/var/run/docker.sock`,
        // Operator's repo bind-mounted read-only at /workspace. The
        // wrapper cds here so the CLI's relative compose-file lookup
        // resolves to `<repo>/packages/host-cp/compose.yaml`.
        `${repoHostPath}:/workspace:ro`,
        // Optional read-only gh config bind so `gh auth token` can run
        // inside the upgrader. When unset, GH_TOKEN is the only way the
        // wrapper can obtain credentials.
        ...(ghConfigHostPath ? [`${ghConfigHostPath}:${upgraderHome}/.config/gh:ro`] : []),
      ],
      // Same network as host-cp so the upgrader can reach the
      // docker-socket-proxy + auth-service if it needs to during the
      // verification phase. Falls through to docker.sock for daemon
      // operations.
      NetworkMode: 'olam-host-cp-internal',
    },
    Labels: {
      'olam.role': 'upgrader',
      'olam.spawned-by': 'host-cp',
      'olam.spawned-at': new Date().toISOString(),
    },
  };

  log(`[upgrade] creating upgrader container ${containerName} from ${image}`);
  const createUrl = `${apiBase}/containers/create?name=${encodeURIComponent(containerName)}`;
  const createRes = await fetchImpl(createUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(createBody),
  });
  if (!createRes.ok) {
    const detail = await safeReadBody(createRes);
    throw new Error(
      `daemon rejected POST /containers/create: ${createRes.status} ${createRes.statusText} ${detail}`.trim(),
    );
  }
  const created = await createRes.json();
  // `?.` so a literal `null` JSON body reports "returned no Id" instead
  // of a TypeError.
  const containerId = created?.Id;
  if (!containerId) {
    throw new Error(`POST /containers/create returned no Id: ${JSON.stringify(created)}`);
  }

  log(`[upgrade] starting upgrader ${containerId.slice(0, 12)}`);
  const startUrl = `${apiBase}/containers/${encodeURIComponent(containerId)}/start`;
  const startRes = await fetchImpl(startUrl, { method: 'POST' });
  if (!startRes.ok && startRes.status !== 304) {
    // 304 Not Modified = already started; treat as success.
    const detail = await safeReadBody(startRes);
    throw new Error(
      `daemon rejected POST /containers/${containerId}/start: ${startRes.status} ${detail}`.trim(),
    );
  }
  return { id: containerId, name: containerName };
}
/**
 * Best-effort read of a response body as text, for human-readable error
 * surfaces in 500 responses. The text is trimmed and capped at 512
 * characters; any failure while reading (or a non-text body) yields an
 * empty string instead of throwing.
 *
 * @param {Response} res
 * @returns {Promise<string>}
 */
async function safeReadBody(res) {
  const MAX_CHARS = 512;
  try {
    const raw = await res.text();
    const trimmed = raw.trim();
    if (trimmed.length <= MAX_CHARS) {
      return trimmed;
    }
    return trimmed.substring(0, MAX_CHARS);
  } catch {
    return '';
  }
}
// Version detection for Phase 1 of self-upgrade.
//
// Compares each component's baked OLAM_BUILD_SHA against the operator's
// local repo HEAD (mounted read-only at /operator-repo). Reports upgrade
// availability without triggering any automatic action — Phase 1 is
// detection only.
import fs from 'node:fs';
import path from 'node:path';
/** @typedef {'ok' | 'behind' | 'unknown'} VersionState */
/**
* @typedef {Object} ComponentVersion
* @property {string} running - SHA baked into the running image
* @property {string} latest - SHA of operator's local HEAD (or 'unknown')
* @property {boolean} upgradeAvailable
*/
/**
* @typedef {Object} VersionSnapshot
* @property {ComponentVersion} hostCp
* @property {ComponentVersion} authService
* @property {ComponentVersion} devbox
* @property {string} operatorHead - resolved HEAD or 'unknown'
* @property {string} checkedAt - ISO timestamp
* @property {string} cliVersion - operator's CLI semver (e.g. "0.1.69") or 'unknown'
*/
/**
 * Read the operator's local repo HEAD straight from the `.git` directory
 * (no git binary involved).
 *
 * Candidate repo roots, in order: the `OLAM_REPO_PATH` env var (when
 * set), then `/operator-repo` (the compose-mounted path). For each
 * candidate:
 *   - a symbolic HEAD (`ref: refs/heads/<branch>`) is resolved through
 *     the loose ref file, then through `packed-refs`;
 *   - a detached HEAD (40 hex chars) is returned as-is.
 * A candidate that is missing, unresolvable, or throws while being read
 * is skipped and the next one is tried.
 *
 * @returns {string} the resolved SHA, or 'unknown' when no candidate resolves
 */
export function readOperatorHead() {
  const SYMBOLIC_PREFIX = 'ref: ';
  const repoRoots = [process.env.OLAM_REPO_PATH, '/operator-repo'].filter(Boolean);

  for (const repoRoot of repoRoots) {
    try {
      const headPath = path.join(repoRoot, '.git', 'HEAD');
      if (!fs.existsSync(headPath)) continue;
      const head = fs.readFileSync(headPath, 'utf-8').trim();

      if (!head.startsWith(SYMBOLIC_PREFIX)) {
        // Detached HEAD — the file holds the SHA itself. Anything else
        // is unrecognised; move on to the next candidate.
        if (/^[0-9a-f]{40}$/i.test(head)) return head;
        continue;
      }

      // Symbolic ref → prefer the loose ref file.
      const refName = head.slice(SYMBOLIC_PREFIX.length);
      const loosePath = path.join(repoRoot, '.git', refName);
      if (fs.existsSync(loosePath)) {
        return fs.readFileSync(loosePath, 'utf-8').trim();
      }

      // Fall back to packed-refs ("<sha> <ref>" per line, '#' = header).
      const packedPath = path.join(repoRoot, '.git', 'packed-refs');
      if (!fs.existsSync(packedPath)) continue;
      for (const entry of fs.readFileSync(packedPath, 'utf-8').split('\n')) {
        if (entry.startsWith('#')) continue;
        const fields = entry.trim().split(' ');
        if (fields[1] === refName) return fields[0];
      }
    } catch {
      // silently try next candidate
    }
  }
  return 'unknown';
}
/**
 * Decide whether two build SHAs point at different commits.
 *
 * SHAs may be full (40 hex chars) or abbreviated (7+ hex chars from
 * `--short`), so they are treated as the same commit when either one is
 * a case-insensitive prefix of the other. If either side is the literal
 * 'unknown' no upgrade can be asserted and the result is false.
 *
 * @param {string} running
 * @param {string} latest
 * @returns {boolean} true only when both SHAs are known and differ
 */
export function isUpgradeAvailable(running, latest) {
  const UNKNOWN = 'unknown';
  if ([running, latest].includes(UNKNOWN)) return false;

  const runningSha = running.toLowerCase();
  const latestSha = latest.toLowerCase();
  const sameCommit = runningSha.startsWith(latestSha) || latestSha.startsWith(runningSha);
  return !sameCommit;
}
/**
 * Ask the auth-service which build it is running: GET
 * `<authServiceUrl>/health` (5 s timeout) and return the string
 * `buildSha` field of the JSON body.
 *
 * @param {string} authServiceUrl
 * @returns {Promise<string>} the SHA, or 'unknown' on a non-OK response,
 *   a missing / non-string `buildSha`, or any network or parse error
 */
export async function fetchAuthServiceSha(authServiceUrl) {
  const UNKNOWN = 'unknown';
  try {
    const response = await fetch(`${authServiceUrl}/health`, {
      signal: AbortSignal.timeout(5000),
    });
    if (!response.ok) return UNKNOWN;

    const payload = await response.json();
    const isObject = Boolean(payload) && typeof payload === 'object';
    if (!isObject || !('buildSha' in payload)) return UNKNOWN;

    const { buildSha } = payload;
    return typeof buildSha === 'string' ? buildSha : UNKNOWN;
  } catch {
    return UNKNOWN;
  }
}
/**
 * Inspect a locally-tagged docker image (by reference such as
 * `ghcr.io/pleri/olam-host-cp:latest`) through the docker HTTP API
 * (`GET /images/<ref>/json`, 5 s timeout) and return the value of the
 * first `OLAM_BUILD_SHA=` entry in its `Config.Env`.
 *
 * This is the "what's the latest published image we'd swap to?" signal
 * for the upgrade comparator. It replaces the prior `operatorHead`
 * (operator's local git HEAD), which over-reports upgradeAvailable
 * whenever an SPA-only PR merges between releases.
 *
 * @param {string} dockerApiBase
 * @param {string} imageRef  e.g. "ghcr.io/pleri/olam-host-cp:latest"
 * @returns {Promise<string>} the baked SHA, or 'unknown' when the
 *   request fails or is non-OK (e.g. image not pulled, API
 *   unreachable), or the env entry is missing
 */
export async function fetchLatestImageSha(dockerApiBase, imageRef) {
  const UNKNOWN = 'unknown';
  const SHA_ENV_PREFIX = 'OLAM_BUILD_SHA=';
  try {
    const inspectUrl = `${dockerApiBase}/images/${encodeURIComponent(imageRef)}/json`;
    const response = await fetch(inspectUrl, { signal: AbortSignal.timeout(5000) });
    if (!response.ok) return UNKNOWN;

    const image = await response.json();
    // Anything other than { Config: { Env: [...] } } counts as unknown.
    const envList = image?.Config?.Env;
    if (!Array.isArray(envList)) return UNKNOWN;

    const shaEntry = envList.find(
      (item) => typeof item === 'string' && item.startsWith(SHA_ENV_PREFIX),
    );
    return shaEntry === undefined ? UNKNOWN : shaEntry.slice(SHA_ENV_PREFIX.length);
  } catch {
    return UNKNOWN;
  }
}
/**
 * Fetch the devbox image SHA through the docker HTTP API in two steps
 * (each with a 5 s timeout):
 *   1. list containers with the name filter `olam-devbox` and take the
 *      first entry's `ImageID`;
 *   2. inspect that image and return the value of the first
 *      `OLAM_BUILD_SHA=` entry in its `Config.Env`.
 *
 * @param {string} dockerApiBase  e.g. "http://docker-socket-proxy:2375" or "http://localhost:2375"
 * @returns {Promise<string>} the baked SHA, or 'unknown' if any step fails
 */
export async function fetchDevboxImageSha(dockerApiBase) {
  const UNKNOWN = 'unknown';
  const SHA_ENV_PREFIX = 'OLAM_BUILD_SHA=';
  const withTimeout = () => ({ signal: AbortSignal.timeout(5000) });

  try {
    // NOTE(review): the filter value is the literal `olam-devbox`; whether
    // that also matches names shaped like `olam-<x>-devbox` depends on the
    // daemon's name-filter semantics — confirm against real container names.
    const nameFilter = encodeURIComponent(JSON.stringify({ name: ['olam-devbox'] }));
    const listing = await fetch(
      `${dockerApiBase}/containers/json?filters=${nameFilter}`,
      withTimeout(),
    );
    if (!listing.ok) return UNKNOWN;

    const containers = await listing.json();
    if (!Array.isArray(containers) || containers.length === 0) return UNKNOWN;

    // Take the first listed container (presumably the newest — the code
    // relies on the daemon's default ordering; verify) and use its image.
    const [firstContainer] = containers;
    const imageId = firstContainer['ImageID'];
    if (typeof imageId !== 'string' || imageId === '') return UNKNOWN;

    const inspection = await fetch(
      `${dockerApiBase}/images/${encodeURIComponent(imageId)}/json`,
      withTimeout(),
    );
    if (!inspection.ok) return UNKNOWN;

    const image = await inspection.json();
    const envList = image?.Config?.Env;
    if (!Array.isArray(envList)) return UNKNOWN;

    const shaEntry = envList.find(
      (item) => typeof item === 'string' && item.startsWith(SHA_ENV_PREFIX),
    );
    return shaEntry === undefined ? UNKNOWN : shaEntry.slice(SHA_ENV_PREFIX.length);
  } catch {
    return UNKNOWN;
  }
}
/**
 * Build a full VersionSnapshot from all available sources.
 *
 * "running" per component:
 *   - hostCp: this process's `OLAM_BUILD_SHA` env (or 'unknown');
 *   - authService: `fetchAuthServiceSha`;
 *   - devbox: `fetchDevboxImageSha`.
 * "latest" per component: the SHA baked into the locally-pulled
 * `:latest` image — what `olam upgrade` would swap us to next — with
 * `pickLatest` falling back to the operator's repo HEAD when that image
 * can't be read (first run, or stack never upgraded) so the banner
 * still surfaces SOMETHING. The fallback may over-report between
 * SPA-only PR merges and the next image rebuild.
 *
 * @param {{
 *   authServiceUrl: string;
 *   dockerApiBase: string;
 * }} opts
 * @returns {Promise<VersionSnapshot>}
 */
export async function buildVersionSnapshot({ authServiceUrl, dockerApiBase }) {
  const operatorHead = readOperatorHead();

  // NOTE: the docker tag is `olam-auth` (no `-service` suffix); the npm
  // workspace is `auth-service`. The two diverged historically.
  const publishedTags = [
    'ghcr.io/pleri/olam-host-cp:latest',
    'ghcr.io/pleri/olam-auth:latest',
    'ghcr.io/pleri/olam-devbox:latest',
  ];

  // All five lookups run concurrently.
  const [authSha, devboxSha, ...publishedShas] = await Promise.all([
    fetchAuthServiceSha(authServiceUrl),
    fetchDevboxImageSha(dockerApiBase),
    ...publishedTags.map((tag) => fetchLatestImageSha(dockerApiBase, tag)),
  ]);
  const [hostCpPublished, authPublished, devboxPublished] = publishedShas;

  /** Assemble one ComponentVersion from a running SHA + published SHA. */
  const describeComponent = (running, publishedSha) => {
    const latest = pickLatest(publishedSha, operatorHead);
    return {
      running,
      latest,
      upgradeAvailable: isUpgradeAvailable(running, latest),
    };
  };

  const hostCpRunning = process.env.OLAM_BUILD_SHA ?? 'unknown';

  // CLI version is propagated by `olam host-cp start` via the
  // OLAM_CLI_VERSION env (see packages/cli/src/commands/host-cp.ts
  // buildComposeEnv). Falls back to host-cp's own package.json when
  // an older CLI started this container without setting the env.
  const cliVersion =
    process.env.OLAM_CLI_VERSION || readHostCpPackageVersion() || 'unknown';

  return {
    hostCp: describeComponent(hostCpRunning, hostCpPublished),
    authService: describeComponent(authSha, authPublished),
    devbox: describeComponent(devboxSha, devboxPublished),
    operatorHead,
    checkedAt: new Date().toISOString(),
    cliVersion,
  };
}
/**
 * Choose the "latest" SHA for a component. The published-image SHA wins
 * whenever it is usable (truthful "would swap to"); the operator's local
 * git HEAD is only the fallback for when the image isn't pulled (e.g.
 * cold-start before first `olam upgrade`), because HEAD over-reports
 * when SPA-only PRs land between image rebuilds.
 *
 * @param {string} publishedImageSha
 * @param {string} operatorHead
 * @returns {string} `publishedImageSha` unless it is empty or 'unknown'
 */
export function pickLatest(publishedImageSha, operatorHead) {
  const unusable = !publishedImageSha || publishedImageSha === 'unknown';
  return unusable ? operatorHead : publishedImageSha;
}
/**
 * Read host-cp's bundled package.json version as the CLI-version
 * fallback when OLAM_CLI_VERSION isn't propagated. Looks for
 * `package.json` one and two directories above this module (the
 * container Dockerfile is said to copy the manifest into /app) and
 * returns the first non-empty string `version` found.
 *
 * Candidates are resolved as file URLs relative to `import.meta.url`
 * and handed to `fs` directly. Going through `new URL(...).pathname`
 * instead would keep percent-encoding (a space becomes `%20`) and
 * mishandle Windows drive paths, so the manifest would never be found
 * for such install locations.
 *
 * Best-effort: an unreadable or malformed candidate is skipped and the
 * next one is tried.
 *
 * @returns {string | null} the version, or null when none could be read
 */
function readHostCpPackageVersion() {
  for (const relative of ['../package.json', '../../package.json']) {
    try {
      const manifestUrl = new URL(relative, import.meta.url);
      if (!fs.existsSync(manifestUrl)) continue;
      const pkg = JSON.parse(fs.readFileSync(manifestUrl, 'utf-8'));
      if (typeof pkg?.version === 'string' && pkg.version.length > 0) return pkg.version;
    } catch {
      // best-effort — fall through to the next candidate
    }
  }
  return null;
}
// Phase F-2-B (B6): workspace + project catalog for host CP.
//
// Reads workspace YAML files from `~/.olam/workspaces/*.yaml` (mounted
// at `/data/workspaces` inside the host-cp container per compose.yaml).
// Provides three endpoints' worth of data:
//
// 1. /api/workspaces — list all workspaces (redacted)
// 2. /api/projects — deduplicated project union
// 3. POST /api/workspaces/match — exact set-equality matching
// for D13's project-first
// create-world flow
import fs from 'node:fs';
import path from 'node:path';
import YAML from 'yaml';
import { redactSensitive } from './redact.mjs';
/**
* @typedef {object} Project
* @property {string} name
* @property {string} [url]
* @property {string} [path]
* @property {string} [branch]
*/
/**
* @typedef {object} Workspace
* @property {string} name
* @property {Project[]} repos project list (called `repos` in YAML)
* @property {Record<string, unknown>} [defaults]
* @property {Record<string, unknown>} [services]
* @property {Record<string, unknown>} [image]
* @property {Record<string, unknown>} [host_ui]
* @property {number} [updatedAt]
*/
/**
 * Load all workspace YAMLs (`*.yaml` / `*.yml`) from a directory.
 * Returns an array sorted by name. Invalid files are logged + skipped
 * (don't bring down the whole list because one file is malformed):
 *   - unparseable YAML, or a document that is not an object;
 *   - a missing/empty `name`;
 *   - a `name` that is not a string (e.g. `name: 123`) — it would
 *     otherwise crash the final `localeCompare` sort for every caller.
 * `repos` is normalised to an array: a missing value becomes `[]`, and
 * a non-list value is logged and replaced by `[]` so downstream
 * iteration (`.map`, `for…of`) cannot throw on it.
 *
 * A missing directory is logged and yields `[]`.
 *
 * @param {string} dir
 * @param {(message: string) => void} [log]
 * @returns {Workspace[]}
 */
export function loadWorkspaces(dir, log = console.log) {
  if (!fs.existsSync(dir)) {
    log(`workspace-catalog: directory ${dir} does not exist`);
    return [];
  }
  /** @type {Workspace[]} */
  const out = [];
  for (const entry of fs.readdirSync(dir)) {
    if (!entry.endsWith('.yaml') && !entry.endsWith('.yml')) continue;
    const filePath = path.join(dir, entry);
    try {
      const raw = fs.readFileSync(filePath, 'utf-8');
      const parsed = YAML.parse(raw);
      if (!parsed || typeof parsed !== 'object' || !parsed.name) {
        log(`workspace-catalog: skipping ${entry} (no .name field)`);
        continue;
      }
      if (typeof parsed.name !== 'string') {
        log(`workspace-catalog: skipping ${entry} (.name is not a string)`);
        continue;
      }
      // Normalize: ensure `repos` is always an array.
      let repos = parsed.repos ?? [];
      if (!Array.isArray(repos)) {
        log(`workspace-catalog: ${entry} has a non-list .repos; treating as empty`);
        repos = [];
      }
      out.push({ ...parsed, repos });
    } catch (err) {
      log(`workspace-catalog: failed to parse ${entry}: ${err.message}`);
    }
  }
  return out.sort((a, b) => a.name.localeCompare(b.name));
}
/**
 * Shape the workspace list for the /api/workspaces response by passing
 * it through `redactSensitive` and returning whatever that produces.
 *
 * @param {Workspace[]} workspaces
 * @returns {Workspace[]}
 */
export function workspacesForApi(workspaces) {
  const redacted = redactSensitive(workspaces);
  return /** @type {Workspace[]} */ (redacted);
}
/**
 * /api/projects response: the union of every workspace's projects,
 * deduplicated by project name and sorted by name.
 *
 * The dedup key is case-sensitive — "Atlas Core" and "atlas-core" would
 * be distinct, which matches the workspace YAML convention of using
 * kebab-case throughout. The FIRST occurrence of a name supplies the
 * url/path/branch metadata (as a shallow copy); later occurrences are
 * ignored. Entries without a name are skipped.
 *
 * @param {Workspace[]} workspaces
 * @returns {Project[]}
 */
export function projectsFromWorkspaces(workspaces) {
  /** @type {Map<string, Project>} */
  const firstSeen = new Map();

  for (const workspace of workspaces) {
    const repoList = workspace.repos ?? [];
    for (const project of repoList) {
      const key = project?.name;
      if (!key || firstSeen.has(key)) continue;
      firstSeen.set(key, { ...project });
    }
  }

  const projects = Array.from(firstSeen.values());
  projects.sort((left, right) => left.name.localeCompare(right.name));
  return projects;
}
/**
 * POST /api/workspaces/match request body: { projects: string[] }.
 * Returns the workspaces whose set of project names is EXACTLY the
 * requested set — neither a subset nor a superset qualifies. Duplicate
 * and nameless repos inside a workspace do not affect its name set.
 * The result is sorted by workspace name for response stability.
 *
 * Cost is O(W × P) for W workspaces with P projects each on average;
 * workspaces are small (<10 projects each), so direct iteration is fine.
 *
 * @param {Workspace[]} workspaces
 * @param {string[]} projectNames
 * @returns {Workspace[]}
 */
export function matchWorkspacesByProjects(workspaces, projectNames) {
  const wanted = new Set(projectNames);
  const projectNameSet = (workspace) =>
    new Set((workspace.repos ?? []).map((repo) => repo.name).filter(Boolean));

  /** @type {Workspace[]} */
  const hits = [];
  for (const workspace of workspaces) {
    if (!setsEqual(wanted, projectNameSet(workspace))) continue;
    hits.push(workspace);
  }
  hits.sort((left, right) => left.name.localeCompare(right.name));
  return hits;
}
/**
 * Set equality: true iff both sets have the same size and every member
 * of `a` is also in `b`.
 *
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {boolean}
 */
function setsEqual(a, b) {
  return a.size === b.size && [...a].every((member) => b.has(member));
}
/**
* WorldActivityTracker — periodic scanner that turns each active world's
* Claude session JSONL into `thought_count` + `total_cost_usd` updates on
* the `worlds` table (~/.olam/worlds.db), plus a `world.activity.tick`
* event on the host-stream broadcaster.
*
* Closes #965. Pre-fix, `olam_status <world>` always reported
* `Cost $0.0000 / Thoughts 0` because nothing wrote those columns after
* world creation. Rico (the orchestrator) reads those fields to decide
* whether a world is progressing or stalled, so as far as it was
* concerned every world was frozen.
*
* Design notes:
* - **JSONL path is operator-configurable.** Default contract per #965
* is `~/.olam/worlds/<id>/state/claude-main.jsonl`; override the
* template via `OLAM_WORLD_JSONL_PATH_TEMPLATE`. On this host the
* producer for the default path is not yet shipped (Claude Code
* writes to `~/.claude/projects/<sanitized>/<uuid>.jsonl` by
* default), so values stay at 0 until either the producer lands or
* the env override repoints the scanner.
* - **Dedupe by `message.id`.** Claude SDK JSONL emits multiple lines
* per assistant API turn (one per content block), each carrying the
* SAME `message.id` + the SAME `usage` block. Naive sum-by-line
* double-counts. We dedupe by `message.id` for usage totals and
* count unique-message-id as `thoughtCount`.
* - **Idempotent.** Re-scanning the same JSONL produces the same
* numbers; safe to run at any cadence.
* - **Fail-soft per world.** A bad JSONL line, missing file, or
* unreadable handle never crashes the loop — the failing world is
* skipped with a debug log and the next world proceeds.
*
* Cadence: `OLAM_WORLD_ACTIVITY_TICK_MS` (default 60_000).
*
* Wire-in: `server.mjs` constructs once with `{ db, broadcaster }` after
* both are ready and calls `.stop()` from the SIGTERM/SIGINT handler.
*
* @see ../host-stream.mjs broadcaster API
* @see ../worlds-db-source.mjs read-only DB open pattern (model for
* `tryOpenDb` here, though tracker WRITES not reads).
*/
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import readline from 'node:readline';
import { createRequire } from 'node:module';
// CommonJS-style `require` bound to this ES module's URL. Its call sites
// are outside this excerpt — presumably it loads a CommonJS-only
// dependency; confirm before removing.
const require = createRequire(import.meta.url);
// TODO(rates): source live model rates from auth-service or a config
// file. For now we anchor on a flat per-million-token baseline ($3 input
// / $15 output) — the issue surface is "value advances post-creation",
// not "is dollar-accurate to 4 decimals". When per-model rates land,
// pluck the model id from the assistant message and dispatch.
// NOTE(review): this comment originally attributed the $3 / $15 baseline
// to Claude Opus; those figures look like Sonnet-tier list pricing —
// confirm which model the rates are meant to approximate.
// USD charged per one million input tokens when estimating world cost.
const INPUT_USD_PER_M_TOKENS = 3.0;
// USD charged per one million output tokens when estimating world cost.
const OUTPUT_USD_PER_M_TOKENS = 15.0;
// Default scan cadence in milliseconds (60 s); per the module header it
// can be overridden with OLAM_WORLD_ACTIVITY_TICK_MS.
const DEFAULT_TICK_MS = 60_000;
/**
 * Resolve a per-world JSONL path from an operator-supplied template
 * string. Every `{worldId}` placeholder is replaced with `worldId`, and
 * a leading `~/` is then expanded to `os.homedir()`.
 *
 * The id is inserted verbatim: a replacer function is used instead of a
 * replacement string so that `$`-sequences inside `worldId` (`$&`,
 * `$$`, `` $` `` …) are not interpreted as replacement patterns.
 *
 * @param {string} template  e.g. `~/.olam/worlds/{worldId}/state/claude-main.jsonl`
 * @param {string} worldId
 * @returns {string} the resolved path (only a leading `~/` is expanded;
 *   other paths are returned as substituted)
 */
export function resolveJsonlPath(template, worldId) {
  const swapped = template.replaceAll('{worldId}', () => worldId);
  if (swapped.startsWith('~/')) {
    return path.join(os.homedir(), swapped.slice(2));
  }
  return swapped;
}
/**
 * Read one JSONL transcript and aggregate its assistant activity.
 *
 * Only rows with `type === 'assistant'` whose `message.id` is a string are
 * counted, and each message id is counted once. Unparseable lines are
 * ignored. If the file cannot be read (the stream errors at any point) the
 * all-zero stats object is returned instead of throwing.
 *
 * @param {string} jsonlPath
 * @returns {Promise<{thoughtCount:number, inputTokens:number, outputTokens:number, costUsd:number, lastActivityAt:string|null}>}
 */
export async function scanWorldJsonl(jsonlPath) {
  const countedIds = new Set();
  const totals = { input: 0, output: 0 };
  let newestTimestamp = null;

  // Fold a single raw line into the running totals. Anything that is not a
  // countable assistant row is dropped without touching the totals.
  const absorbLine = (text) => {
    if (!text) return;
    let record;
    try {
      record = JSON.parse(text);
    } catch {
      // Partial writes have been observed while a session is active.
      return;
    }
    if (!record || record.type !== 'assistant') return;
    const message = record.message;
    if (!message || typeof message !== 'object') return;
    if (typeof message.id !== 'string') return;
    if (countedIds.has(message.id)) return;
    countedIds.add(message.id);

    const usage = message.usage;
    if (usage && typeof usage === 'object') {
      if (Number.isFinite(usage.input_tokens)) {
        totals.input += Number(usage.input_tokens);
      }
      if (Number.isFinite(usage.output_tokens)) {
        totals.output += Number(usage.output_tokens);
      }
    }

    const stamp = record.timestamp;
    // String comparison orders ISO-8601 stamps correctly as long as they
    // share one zone.
    if (typeof stamp === 'string' && (newestTimestamp === null || stamp > newestTimestamp)) {
      newestTimestamp = stamp;
    }
  };

  let source;
  try {
    source = fs.createReadStream(jsonlPath, { encoding: 'utf8' });
  } catch {
    return zeroStats();
  }

  // A missing file surfaces through the stream's 'error' event rather than
  // a synchronous throw, so it is raced against the line reader below.
  const failure = new Promise((_resolve, reject) => {
    source.on('error', reject);
  });
  const reader = readline.createInterface({ input: source, crlfDelay: Infinity });
  const drained = (async () => {
    for await (const text of reader) {
      absorbLine(text);
    }
  })();

  try {
    await Promise.race([drained, failure]);
  } catch {
    return zeroStats();
  } finally {
    try { source.destroy(); } catch { /* ignore */ }
  }

  const costUsd =
    (totals.input / 1_000_000) * INPUT_USD_PER_M_TOKENS +
    (totals.output / 1_000_000) * OUTPUT_USD_PER_M_TOKENS;
  return {
    thoughtCount: countedIds.size,
    inputTokens: totals.input,
    outputTokens: totals.output,
    costUsd,
    lastActivityAt: newestTimestamp,
  };
}
// Builds a fresh all-zero stats object (same shape scanWorldJsonl returns).
// A new object is created on every call.
function zeroStats() {
  const counters = { thoughtCount: 0, inputTokens: 0, outputTokens: 0 };
  return { ...counters, costUsd: 0, lastActivityAt: null };
}
/**
* @typedef {object} WorldActivityTrackerDeps
* @property {string} [dbPath] Path to worlds.db; defaults to
* `OLAM_WORLDS_DB` env var or `~/.olam/worlds.db`.
* @property {object} [broadcaster] Object with `.broadcast(type, payload)`
* (e.g. the return of `createHostStream`). Optional — when absent
* events are skipped but DB writes still happen.
* @property {number} [intervalMs] Tick cadence. Defaults to
* `OLAM_WORLD_ACTIVITY_TICK_MS` env or 60000.
* @property {string} [jsonlPathTemplate] JSONL path template.
* `{worldId}` is replaced per world. Defaults to
* `OLAM_WORLD_JSONL_PATH_TEMPLATE` env or
* `~/.olam/worlds/{worldId}/state/claude-main.jsonl`.
* @property {(msg: string) => void} [log] Defaults to `console.log`.
* @property {(msg: string) => void} [debug] Optional verbose log; defaults
* to no-op (debug-level skips on missing JSONL would be noisy).
* @property {(cb: () => void, ms: number) => any} [setTimer] Injectable
* `setInterval` for tests.
* @property {(handle: any) => void} [clearTimer] Injectable
* `clearInterval` for tests.
* @property {() => Date} [now] Clock injection for tests.
*/
/**
* @typedef {object} WorldActivityTrackerHandle
* @property {() => void} stop
* @property {() => Promise<number>} tickNow Run one tick immediately
* instead of waiting for the timer; resolves to the count of worlds
* processed. Exposed for tests.
*/
/**
 * Start the world activity tracker. Returns a `{ stop, tickNow }`
 * handle. Safe to call before the worlds.db file exists — each tick is
 * skipped (with a debug log) until the file appears.
 *
 * The tick cadence is the first usable value (a finite number > 0) among
 * `deps.intervalMs`, the `OLAM_WORLD_ACTIVITY_TICK_MS` env var, and
 * `DEFAULT_TICK_MS`. Unusable values (e.g. a non-numeric env var, which
 * parses to NaN) are logged and ignored instead of being handed to the
 * timer.
 *
 * @param {WorldActivityTrackerDeps} [deps]
 * @returns {WorldActivityTrackerHandle}
 */
export function startWorldActivityTracker(deps = {}) {
  const log = deps.log ?? ((m) => console.log(`[world-activity] ${m}`));
  const debug = deps.debug ?? (() => {});
  const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
  const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
  const now = deps.now ?? (() => new Date());

  // Resolve the cadence defensively: Node coerces a NaN delay to 1ms, which
  // would turn a typo in the env var into a busy loop.
  const isUsableInterval = (ms) => Number.isFinite(ms) && ms > 0;
  const rawEnvInterval = process.env.OLAM_WORLD_ACTIVITY_TICK_MS;
  const intervalCandidates = [
    deps.intervalMs,
    rawEnvInterval === undefined ? undefined : Number.parseInt(rawEnvInterval, 10),
  ];
  let intervalMs;
  for (const candidate of intervalCandidates) {
    if (candidate === undefined || candidate === null) continue;
    if (isUsableInterval(candidate)) {
      intervalMs = candidate;
      break;
    }
    log(`ignoring invalid tick interval: ${candidate}`);
  }
  if (intervalMs === undefined) intervalMs = DEFAULT_TICK_MS;

  const dbPath =
    deps.dbPath ??
    process.env.OLAM_WORLDS_DB ??
    path.join(os.homedir(), '.olam', 'worlds.db');
  const jsonlPathTemplate =
    deps.jsonlPathTemplate ??
    process.env.OLAM_WORLD_JSONL_PATH_TEMPLATE ??
    '~/.olam/worlds/{worldId}/state/claude-main.jsonl';
  const broadcaster = deps.broadcaster ?? null;
  let stopped = false;
  let inFlight = false;
  let intervalHandle = null;

  /**
   * One tick: open DB, read active worlds, scan each JSONL, write back,
   * emit event. Returns the count of worlds processed.
   *
   * @returns {Promise<number>}
   */
  async function tick() {
    if (stopped) return 0;
    if (inFlight) {
      // Skip overlap — slow filesystem must not pile up ticks.
      debug('tick skipped: previous tick still in flight');
      return 0;
    }
    inFlight = true;
    let db = null;
    let processed = 0;
    try {
      let Database;
      try {
        Database = require('better-sqlite3');
      } catch (err) {
        // better-sqlite3 unavailable (e.g. container without native
        // build) — degrade instead of crashing.
        log(`better-sqlite3 unavailable; skipping tick: ${err.message}`);
        return 0;
      }
      try {
        db = new Database(dbPath, { fileMustExist: true });
      } catch (err) {
        // SQLITE_CANTOPEN (file absent) is the expected first-boot
        // case; everything else is worth surfacing.
        if (err.code !== 'SQLITE_CANTOPEN') {
          log(`open ${dbPath} failed: ${err.message}`);
        } else {
          debug(`${dbPath} not present yet; skipping tick`);
        }
        return 0;
      }
      let activeWorlds;
      try {
        activeWorlds = db
          .prepare(
            "SELECT id FROM worlds WHERE status NOT IN ('destroyed', 'failed')",
          )
          .all();
      } catch (err) {
        log(`query active worlds failed: ${err.message}`);
        return 0;
      }
      const updateStmt = db.prepare(
        `UPDATE worlds
SET thought_count = ?,
total_cost_usd = ?,
updated_at = ?
WHERE id = ?`,
      );
      for (const row of activeWorlds) {
        // Honour stop() between worlds so shutdown is not held up by a
        // long list.
        if (stopped) break;
        const worldId = row.id;
        if (typeof worldId !== 'string') continue;
        const jsonlPath = resolveJsonlPath(jsonlPathTemplate, worldId);
        let stats;
        try {
          stats = await scanWorldJsonl(jsonlPath);
        } catch (err) {
          // Defence in depth — scanWorldJsonl is already fail-soft, but
          // this catches anything unforeseen at the call seam.
          debug(`scan ${worldId} failed: ${err.message}`);
          continue;
        }
        const updatedAt = now().toISOString();
        try {
          updateStmt.run(
            stats.thoughtCount,
            stats.costUsd,
            updatedAt,
            worldId,
          );
        } catch (err) {
          log(`update ${worldId} failed: ${err.message}`);
          continue;
        }
        if (broadcaster && typeof broadcaster.broadcast === 'function') {
          try {
            broadcaster.broadcast('world.activity.tick', {
              worldId,
              thoughtCount: stats.thoughtCount,
              costUsd: stats.costUsd,
              inputTokens: stats.inputTokens,
              outputTokens: stats.outputTokens,
              lastActivityAt: stats.lastActivityAt,
              updatedAt,
            });
          } catch (err) {
            // A broadcast failure must not undo the DB write that already
            // succeeded; the world still counts as processed.
            log(`broadcast ${worldId} failed: ${err.message}`);
          }
        }
        processed += 1;
      }
    } finally {
      if (db) {
        try { db.close(); } catch { /* ignore */ }
      }
      inFlight = false;
    }
    return processed;
  }

  // Kick off an initial tick on next event-loop turn so callers can
  // attach test spies before any DB work happens.
  setImmediate(() => {
    if (stopped) return;
    void tick().catch((err) => {
      log(`initial tick crashed: ${err?.message ?? err}`);
    });
  });
  intervalHandle = setTimer(() => {
    void tick().catch((err) => {
      log(`tick crashed: ${err?.message ?? err}`);
    });
  }, intervalMs);
  // Don't pin the event loop on shutdown.
  if (intervalHandle && typeof intervalHandle.unref === 'function') {
    intervalHandle.unref();
  }
  log(
    `started: db=${dbPath} template=${jsonlPathTemplate} interval=${intervalMs}ms`,
  );
  return {
    stop() {
      if (stopped) return;
      stopped = true;
      if (intervalHandle !== null) {
        try { clearTimer(intervalHandle); } catch { /* ignore */ }
        intervalHandle = null;
      }
    },
    tickNow: tick,
  };
}
// Phase F-2-D follow-up: persistent world-name store.
//
// Background: world.id is the docker container suffix (e.g. `gold-arc-1454`)
// and is immutable. Operators want a separate human-friendly `name`
// (e.g. "Refactor the auth module") so the worlds list reads like a
// task board instead of a string of CSS-color-words.
//
// Storage: a single JSON file at /data/world-names.json (mounted from
// ~/.olam/world-names.json on the host). Atomic write via tmp+rename so
// concurrent PATCHes can't half-write the file. Read-on-demand with a
// tiny in-process cache keyed off mtime so steady-state GET /api/worlds
// doesn't reread the file every poll.
//
// Schema:
// { "<worldId>": "<name>", ... }
//
// Names are arbitrary UTF-8 strings, capped at NAME_MAX_LEN to keep
// the file small + the UI sane.
import fs from 'node:fs';
import path from 'node:path';
const NAME_MAX_LEN = 120;
/**
* @typedef {object} WorldNamesStore
* @property {() => Record<string, string>} all
* @property {(id: string) => string | null} get
* @property {(id: string, name: string) => string} set
* @property {(id: string) => void} remove
*/
/**
 * Create a JSON-backed world-names store rooted at `filePath`.
 * Resilient to a missing file (treats as empty); resilient to a
 * malformed file (logs + treats as empty).
 *
 * Lookups only ever see the store's own entries: ids that happen to match
 * `Object.prototype` members (`constructor`, `toString`, …) are treated as
 * absent unless they were explicitly stored.
 *
 * @param {string} filePath
 * @returns {WorldNamesStore}
 */
export function createWorldNamesStore(filePath) {
  /** @type {Record<string, string>} */
  let cache = {};
  // mtime of the file version held in `cache`; -1 = never read, 0 = file
  // absent or unreadable at the last attempt.
  let cacheMtimeMs = -1;

  // Refresh `cache` from disk unless the file's mtime matches the cached one.
  function readFromDisk() {
    if (!fs.existsSync(filePath)) {
      cache = {};
      cacheMtimeMs = 0;
      return;
    }
    try {
      const stat = fs.statSync(filePath);
      if (stat.mtimeMs === cacheMtimeMs) return; // cache hit
      const raw = fs.readFileSync(filePath, 'utf-8');
      const parsed = JSON.parse(raw);
      if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
        // Keep string values only. Object.fromEntries defines own data
        // properties, so even a `__proto__` key survives as a normal entry.
        cache = Object.fromEntries(
          Object.entries(parsed).filter(([, value]) => typeof value === 'string'),
        );
      } else {
        cache = {};
      }
      cacheMtimeMs = stat.mtimeMs;
    } catch (err) {
      console.error(`world-names-store: failed to read ${filePath}: ${err.message}`);
      cache = {};
      cacheMtimeMs = 0;
    }
  }

  // Persist `cache` via tmp-file + rename so readers never observe a
  // half-written file, then remember the new mtime.
  function writeToDisk() {
    const dir = path.dirname(filePath);
    fs.mkdirSync(dir, { recursive: true });
    const tmp = `${filePath}.tmp-${process.pid}-${Date.now()}`;
    fs.writeFileSync(tmp, JSON.stringify(cache, null, 2), 'utf-8');
    fs.renameSync(tmp, filePath);
    try {
      cacheMtimeMs = fs.statSync(filePath).mtimeMs;
    } catch {
      cacheMtimeMs = 0;
    }
  }

  /** @returns {Record<string, string>} a shallow copy of every stored name */
  function all() {
    readFromDisk();
    return { ...cache };
  }

  /**
   * @param {string} id
   * @returns {string | null} the stored name, or null when none is stored
   */
  function get(id) {
    readFromDisk();
    // Own-property check: `cache.constructor` etc. must not leak out.
    return Object.hasOwn(cache, id) ? cache[id] : null;
  }

  /**
   * @param {string} id
   * @param {string} name
   * @returns {string} the normalized name actually stored
   * @throws {Error} when `id` is not a non-empty string or `name`
   *   normalizes to nothing
   */
  function set(id, name) {
    if (typeof id !== 'string' || id.length === 0) {
      throw new Error('worldId must be a non-empty string');
    }
    const normalized = normalizeName(name);
    if (normalized === null) {
      throw new Error('name must be a non-empty string (after trim)');
    }
    readFromDisk();
    cache = { ...cache, [id]: normalized };
    writeToDisk();
    return normalized;
  }

  /**
   * Delete the name stored for `id`. No-op (and no disk write) when the id
   * has no stored name.
   *
   * @param {string} id
   */
  function remove(id) {
    readFromDisk();
    if (!Object.hasOwn(cache, id)) return;
    const next = { ...cache };
    delete next[id];
    cache = next;
    writeToDisk();
  }

  return { all, get, set, remove };
}
/**
 * Normalize a name input. Collapses every whitespace run to a single
 * space, trims, and caps the result at NAME_MAX_LEN UTF-16 code units.
 * Returns null for non-string or empty/whitespace-only input.
 *
 * The cap never leaves half of a surrogate pair behind: when the cut would
 * fall between the two halves of an astral character (e.g. an emoji), the
 * whole character is dropped.
 *
 * @param {unknown} input
 * @returns {string | null}
 */
export function normalizeName(input) {
  if (typeof input !== 'string') return null;
  const collapsed = input.replace(/\s+/g, ' ').trim();
  if (collapsed.length === 0) return null;
  if (collapsed.length <= NAME_MAX_LEN) return collapsed;

  let capped = collapsed.slice(0, NAME_MAX_LEN);
  const lastKept = capped.charCodeAt(capped.length - 1);
  const firstDropped = collapsed.charCodeAt(NAME_MAX_LEN);
  const cutSplitsPair =
    lastKept >= 0xd800 && lastKept <= 0xdbff &&
    firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  if (cutSplitsPair) {
    // A lone high surrogate is not a valid character; drop it.
    capped = capped.slice(0, -1);
  }
  // The cut may have landed right after a space.
  return capped.trimEnd();
}
/**
 * Derive a human-friendly name from an initial task / dispatch text.
 * Takes the first sentence (everything before the first `.`, `?`, `!` or
 * newline), collapses whitespace, and caps it at ~60 chars at a word
 * boundary so the UI doesn't truncate mid-word.
 * Returns null for non-string or empty input, or when nothing usable is
 * left — caller falls back to id.
 *
 * @param {unknown} taskText
 * @returns {string | null}
 */
export function inferNameFromTask(taskText) {
  if (typeof taskText !== 'string') return null;
  const trimmed = taskText.trim();
  if (trimmed.length === 0) return null;
  // Split BEFORE collapsing whitespace: collapsing first would turn every
  // newline into a space and the newline terminator could never match.
  const [rawSentence = ''] = trimmed.split(/[.!?\n]/);
  const firstSentence = rawSentence.replace(/\s+/g, ' ').trim();
  const SOFT_CAP = 60;
  if (firstSentence.length <= SOFT_CAP) return firstSentence || null;
  // Cap at a word boundary close to SOFT_CAP so we don't dangle half a
  // word. A boundary in the first half would discard too much, so in that
  // case the hard cut is kept.
  const head = firstSentence.slice(0, SOFT_CAP);
  const lastSpace = head.lastIndexOf(' ');
  const truncated = lastSpace > 30 ? head.slice(0, lastSpace) : head;
  // Strip trailing separators left by the cut; an empty result means there
  // is no usable name.
  return truncated.replace(/[\s,;:—–-]+$/u, '') || null;
}
import fs from 'node:fs';
import path from 'node:path';
/**
* @typedef {object} PrStateEntry
* @property {string} pr_url
* @property {number|null} pr_number
* @property {string|null} pr_repo
* @property {string|null} pr_created_at
* @property {'open'|'merged'|'merged_destroyed'} pr_state
* @property {string|null} pr_merged_at
* @property {boolean} auto_destroy_on_merge
*/
/**
 * Create a JSON-backed store of per-world pull-request state rooted at
 * `filePath`. A missing file is treated as empty; a malformed file is
 * logged and treated as empty. Writes go through a tmp file + rename.
 *
 * Only the store's own entries are visible: world ids that collide with
 * `Object.prototype` members are treated as absent, and non-object values
 * found in the file are ignored by `getWorldsToWatch` rather than crashing
 * it.
 *
 * @param {string} filePath
 */
export function createWorldPrStateStore(filePath) {
  /** @type {Record<string, PrStateEntry>} */
  let cache = {};
  // mtime of the file version held in `cache`; -1 = never read, 0 = file
  // absent or unreadable at the last attempt.
  let cacheMtimeMs = -1;

  // Refresh `cache` from disk unless the file's mtime matches the cached one.
  function readFromDisk() {
    if (!fs.existsSync(filePath)) {
      cache = {};
      cacheMtimeMs = 0;
      return;
    }
    try {
      const stat = fs.statSync(filePath);
      if (stat.mtimeMs === cacheMtimeMs) return;
      const raw = fs.readFileSync(filePath, 'utf-8');
      const parsed = JSON.parse(raw);
      cache = parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? parsed : {};
      cacheMtimeMs = stat.mtimeMs;
    } catch (err) {
      console.error(`world-pr-state: failed to read ${filePath}: ${err.message}`);
      cache = {};
      cacheMtimeMs = 0;
    }
  }

  // Persist `cache` via tmp-file + rename, then remember the new mtime.
  function writeToDisk() {
    const dir = path.dirname(filePath);
    fs.mkdirSync(dir, { recursive: true });
    const tmp = `${filePath}.tmp-${process.pid}-${Date.now()}`;
    fs.writeFileSync(tmp, JSON.stringify(cache, null, 2), 'utf-8');
    fs.renameSync(tmp, filePath);
    try {
      cacheMtimeMs = fs.statSync(filePath).mtimeMs;
    } catch {
      cacheMtimeMs = 0;
    }
  }

  // True for values that can be treated as a PR-state record.
  const isRecord = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  // The store's own value for `worldId`, or null (never an inherited member).
  const ownEntry = (worldId) =>
    (Object.hasOwn(cache, worldId) ? cache[worldId] : null) ?? null;

  /** @returns {Record<string, PrStateEntry>} shallow copy of all entries */
  function getAll() {
    readFromDisk();
    return { ...cache };
  }

  /** @param {string} worldId */
  function get(worldId) {
    readFromDisk();
    return ownEntry(worldId);
  }

  /**
   * Upsert — merges data with the existing entry.
   * @param {string} worldId
   * @param {Partial<PrStateEntry>} data
   */
  function set(worldId, data) {
    readFromDisk();
    const current = ownEntry(worldId);
    // Merge only onto a real record; a corrupt (non-object) value is replaced.
    const existing = isRecord(current) ? current : {};
    cache = { ...cache, [worldId]: { ...existing, ...data } };
    writeToDisk();
  }

  /**
   * Delete the entry for `worldId`. No-op (and no disk write) when absent.
   * @param {string} worldId
   */
  function remove(worldId) {
    readFromDisk();
    if (!Object.hasOwn(cache, worldId)) return;
    const next = { ...cache };
    delete next[worldId];
    cache = next;
    writeToDisk();
  }

  /**
   * Entries that have a PR url and are not yet `merged_destroyed`, each
   * flattened to `{ worldId, ...entry }`.
   */
  function getWorldsToWatch() {
    readFromDisk();
    return Object.entries(cache)
      .filter(([, entry]) =>
        isRecord(entry) && Boolean(entry.pr_url) && entry.pr_state !== 'merged_destroyed')
      .map(([worldId, entry]) => ({ worldId, ...entry }));
  }

  return { getAll, get, set, remove, getWorldsToWatch };
}
/**
* World progress computation — maps world state onto the 8-phase ladder
* shown in the inbox row progress bar.
*
* @module world-progress
*/
import path from 'node:path';
import { homedir } from 'node:os';
import { createRequire } from 'node:module';
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import { readPlanProgress } from './plan-progress.mjs';
const execFileAsync = promisify(execFile);
// Local copy of the helper in @olam/core/src/world-paths.mjs. It is
// duplicated on purpose: host-cp's slim Docker image does NOT bundle
// @olam/core (see server.mjs ~L560 for the architectural decision). The
// two definitions must be kept in sync until the host-cp image build is
// taught to vendor workspace deps.
const WORLD_DB_FILENAME = 'world.db';

// Returns the location of the per-world database file inside a workspace
// directory.
function getWorldDbPath(workspacePath) {
  const segments = [workspacePath, WORLD_DB_FILENAME];
  return path.join(...segments);
}
/**
 * Phase ladder definition, in display order. Each phase's `index` is its
 * 1-based position on the ladder.
 * @type {Array<{name: string, index: number}>}
 */
const PHASES = [
  'starting',
  'implementing',
  'committing',
  'pushing',
  'in_review',
  'ci_failed',
  'ready',
  'merged',
].map((name, position) => ({ name, index: position + 1 }));
// Number of rungs on the ladder.
const PHASE_TOTAL = PHASES.length;
// A world with no activity for longer than this (5 minutes) is shown as idle.
const IDLE_THRESHOLD_MS = 300_000;
/**
 * Map observable world state onto a phase name.
 *
 * Precedence: a merged PR always wins; otherwise an existing PR is
 * classified by its checks; otherwise git state and, last, the thought
 * count decide.
 *
 * @param {{
 *   thoughts: number,
 *   commitsAhead: number,
 *   pushed: boolean,
 *   prUrl: string|null,
 *   prChecks: 'pending'|'passing'|'failing'|null,
 *   prState: 'open'|'merged'|'closed'|null,
 * }} state
 * @returns {string} phase name
 */
export function determinePhase(state) {
  const { thoughts, commitsAhead, pushed, prUrl, prChecks, prState } = state;

  if (prState === 'merged') return 'merged';

  if (!prUrl) {
    // Pre-PR ladder, most advanced rung first.
    if (pushed) return 'pushing';
    if (commitsAhead >= 1) return 'committing';
    return thoughts >= 30 ? 'implementing' : 'starting';
  }

  if (prChecks === 'failing') return 'ci_failed';
  // Anything short of "passing on an open PR" (pending or unknown checks,
  // or a PR that is not open) still counts as in review.
  const readyToMerge = prChecks === 'passing' && prState === 'open';
  return readyToMerge ? 'ready' : 'in_review';
}
/**
 * Build the fallback progress payload for a world: first phase, nothing
 * counted, no git or PR information, no plan.
 *
 * @param {string} worldId
 * @returns {object}
 */
export function makeSafeResponse(worldId) {
  const phaseDefaults = {
    phase: 'starting',
    phaseIndex: 1,
    phaseTotal: PHASE_TOTAL,
    isIdle: false,
  };
  const activityDefaults = { thoughts: 0, lastActivityAt: null, runtimeMs: 0 };
  const gitDefaults = { commitsAhead: 0, pushed: false };
  const prDefaults = { prUrl: null, prNumber: null, prChecks: null, prState: null };
  return {
    worldId,
    ...phaseDefaults,
    ...activityDefaults,
    ...gitDefaults,
    ...prDefaults,
    plan: null,
  };
}
/**
 * Read a world row from worlds.db.
 *
 * Best-effort: any failure (driver not installed, file missing, query
 * error) yields null. The database handle is always closed, including when
 * the query throws.
 *
 * @param {string} dbPath
 * @param {string} worldId
 * @returns {{ branch: string, repos: string[], workspacePath: string, createdAt: string } | null}
 *   `repos` is always an array: a `repos` column that is not (or does not
 *   parse to) an array is reported as `[]`.
 */
function defaultReadWorldRow(dbPath, worldId) {
  let db = null;
  try {
    const Database = createRequire(import.meta.url)('better-sqlite3');
    db = new Database(dbPath, { readonly: true });
    // NOTE(review): this runs on a read-only handle; if it throws, the
    // outer catch turns it into a null result — confirm that is intended.
    db.pragma('journal_mode = WAL');
    const row = db.prepare(
      'SELECT branch, repos, workspace_path, created_at FROM worlds WHERE id = ?',
    ).get(worldId);
    if (!row) return null;
    let repos = [];
    try {
      const parsed = typeof row.repos === 'string' ? JSON.parse(row.repos) : row.repos;
      // Callers index into `repos`, so never hand back null/objects/etc.
      if (Array.isArray(parsed)) repos = parsed;
    } catch {
      repos = [];
    }
    return {
      branch: row.branch ?? 'main',
      repos,
      workspacePath: row.workspace_path ?? '',
      createdAt: row.created_at ?? null,
    };
  } catch {
    return null;
  } finally {
    // Release the handle on every path; a failed close must not mask the
    // result that was already computed.
    if (db) {
      try { db.close(); } catch { /* ignore */ }
    }
  }
}
/**
 * Read thought count and last activity from a world.db.
 *
 * Best-effort: any failure (driver not installed, file missing, table
 * missing) yields `{ count: 0, lastAt: null }`. The database handle is
 * always closed, including when the query throws.
 *
 * @param {string} dbPath
 * @returns {{ count: number, lastAt: string|null }}
 */
function defaultReadThoughts(dbPath) {
  let db = null;
  try {
    const Database = createRequire(import.meta.url)('better-sqlite3');
    db = new Database(dbPath, { readonly: true });
    // NOTE(review): this runs on a read-only handle; if it throws, the
    // outer catch turns it into the zero result — confirm that is intended.
    db.pragma('journal_mode = WAL');
    const row = db
      .prepare('SELECT COUNT(*) AS cnt, MAX(created_at) AS last_at FROM thought_nodes')
      .get();
    return {
      count: Number(row?.cnt ?? 0),
      lastAt: row?.last_at ?? null,
    };
  } catch {
    return { count: 0, lastAt: null };
  } finally {
    // Release the handle on every path; a failed close must not mask the
    // result that was already computed.
    if (db) {
      try { db.close(); } catch { /* ignore */ }
    }
  }
}
/**
 * Ask git how many commits `HEAD` has that `origin/main` does not, for the
 * worktree at `worktreePath`. Any git failure, a timeout (5s), or
 * unparseable output counts as 0.
 *
 * @param {string} worktreePath
 * @returns {Promise<number>}
 */
async function defaultGitCommitsAhead(worktreePath) {
  const gitArgs = ['-C', worktreePath, 'rev-list', 'origin/main..HEAD', '--count'];
  try {
    const result = await execFileAsync('git', gitArgs, { timeout: 5000 });
    const count = parseInt(result.stdout.trim(), 10);
    if (!Number.isFinite(count)) return 0;
    return count;
  } catch {
    return 0;
  }
}
/**
 * Report whether `origin/<branch>` resolves in the worktree at
 * `worktreePath`. Any git failure or a timeout (5s) is reported as
 * "not pushed".
 *
 * @param {string} worktreePath
 * @param {string} branch
 * @returns {Promise<boolean>}
 */
async function defaultGitIsPushed(worktreePath, branch) {
  const remoteRef = `origin/${branch}`;
  const gitArgs = ['-C', worktreePath, 'rev-parse', '--quiet', '--verify', remoteRef];
  let resolved = false;
  try {
    await execFileAsync('git', gitArgs, { timeout: 5000 });
    resolved = true;
  } catch {
    resolved = false;
  }
  return resolved;
}
/**
 * Compute the current progress state for a world.
 *
 * Never rejects: when the world row cannot be read, or anything throws
 * along the way, the safe/default response from `makeSafeResponse` is
 * returned instead.
 *
 * `runtimeMs` is always a finite, non-negative number; a missing or
 * unparseable `createdAt` yields 0. A world row whose `repos` is not an
 * array is treated as having no repos (the workspace itself is used as the
 * worktree).
 *
 * @param {string} worldId
 * @param {{
 *   worldsDbPath?: string,
 *   prCache?: { getPr: (prUrl: string, getToken: () => Promise<string|null>) => Promise<{state:string|null,number:number|null,checks:string|null}|null> },
 *   prStateStore?: { get: (worldId: string) => object|null },
 *   getGhToken?: () => Promise<string|null>,
 *   _readWorldRow?: (dbPath: string, worldId: string) => object|null,
 *   _readThoughts?: (dbPath: string) => { count: number, lastAt: string|null },
 *   _gitCommitsAhead?: (worktreePath: string) => Promise<number>,
 *   _gitIsPushed?: (worktreePath: string, branch: string) => Promise<boolean>,
 * }} [deps]
 * @returns {Promise<object>}
 */
export async function computeProgress(worldId, deps = {}) {
  const safe = makeSafeResponse(worldId);
  try {
    const {
      worldsDbPath = process.env.OLAM_WORLDS_DB ?? path.join(homedir(), '.olam/worlds.db'),
      prCache = null,
      prStateStore = null,
      getGhToken = async () => null,
      _readWorldRow = defaultReadWorldRow,
      _readThoughts = defaultReadThoughts,
      _gitCommitsAhead = defaultGitCommitsAhead,
      _gitIsPushed = defaultGitIsPushed,
    } = deps;

    // Read world row
    const worldRow = _readWorldRow(worldsDbPath, worldId);
    if (!worldRow) return safe;
    const { branch, repos, workspacePath, createdAt } = worldRow;
    // The first repo (when there is one) is the worktree git is asked about.
    const repoList = Array.isArray(repos) ? repos : [];
    const worktreePath =
      repoList.length > 0 ? path.join(workspacePath, repoList[0]) : workspacePath;

    // Runtime since creation. An unparseable createdAt gives NaN from
    // getTime(), and Math.max(0, NaN) is NaN — so validate before use.
    const createdAtMs = createdAt ? new Date(createdAt).getTime() : Number.NaN;
    const runtimeMs = Number.isFinite(createdAtMs)
      ? Math.max(0, Date.now() - createdAtMs)
      : 0;

    // Read thoughts
    const thoughtsDbPath = getWorldDbPath(workspacePath);
    const { count: thoughts, lastAt: thoughtsLastAt } = _readThoughts(thoughtsDbPath);

    // Git state — the two probes are independent, so run them together.
    const [commitsAhead, pushed] = await Promise.all([
      _gitCommitsAhead(worktreePath),
      _gitIsPushed(worktreePath, branch),
    ]);

    // PR state — check prStateStore first
    let prUrl = null;
    let prNumber = null;
    let prState = null;
    let prChecks = null;
    if (prStateStore) {
      const prEntry = prStateStore.get(worldId);
      if (prEntry) {
        prUrl = prEntry.pr_url ?? null;
        prNumber = prEntry.pr_number ?? null;
        // Normalize merged_destroyed → merged, and 'none' → null
        const rawState = prEntry.pr_state ?? null;
        prState = rawState === 'merged_destroyed' ? 'merged' : (rawState === 'none' ? null : rawState);
      }
    }

    // Live PR data from cache
    if (prUrl && prCache) {
      try {
        const livePr = await prCache.getPr(prUrl, getGhToken);
        if (livePr) {
          prChecks = livePr.checks;
          // Update state if live data shows merged
          if (livePr.state === 'merged') prState = 'merged';
          if (livePr.number != null) prNumber = livePr.number;
        }
      } catch {
        // Non-fatal: fall back to what the store said.
      }
    }

    // Determine phase
    const phase = determinePhase({ thoughts, commitsAhead, pushed, prUrl, prChecks, prState });
    const phaseEntry = PHASES.find((p) => p.name === phase) ?? PHASES[0];

    // Idle overlay — only for implementing or committing phases
    const lastActivityAtMs = thoughtsLastAt ? new Date(thoughtsLastAt).getTime() : null;
    let isIdle = false;
    if (phase === 'implementing' || phase === 'committing') {
      if (
        lastActivityAtMs !== null &&
        !Number.isNaN(lastActivityAtMs) &&
        Date.now() - lastActivityAtMs > IDLE_THRESHOLD_MS
      ) {
        isIdle = true;
      }
    }

    // Plan progress — additive; null when no tracker found
    const plan = readPlanProgress(worktreePath, branch, { lastActivityAtMs });

    return {
      worldId,
      phase,
      phaseIndex: phaseEntry.index,
      phaseTotal: PHASE_TOTAL,
      isIdle,
      thoughts,
      lastActivityAt: thoughtsLastAt ?? null,
      runtimeMs,
      commitsAhead,
      pushed,
      prUrl,
      prNumber,
      prChecks,
      prState,
      plan,
    };
  } catch {
    return safe;
  }
}
// Service enrichment (Phase F-2-D dogfood fix) — extracted from server.mjs.
//
// Fetch port bindings for a world's container via docker-socket-proxy
// inspect, map each to a clickable URL tagged with well-known internal
// ports, and probe each for actual reachability.
//
// Extracted as a standalone module so the probe + enrichment logic can be
// unit-tested in isolation (server.mjs has module-level side effects that
// make direct import impractical). The two host-specific values that the
// inline version read from server.mjs module constants — HOST_FOR_WORLD and
// DOCKER_HOST — are injected as a `deps` object so the functions stay pure
// and deterministically testable.
// Display names for container-internal ports, keyed by internal port
// number. fetchWorldServices labels any port not listed here as
// `App (port <n>)`.
export const WELL_KNOWN_PORTS = {
3000: 'atlas-core (Rails)',
5175: 'diner-app (Vite)',
7681: 'Terminal (ttyd)',
7682: 'Terminal Shell (ttyd)',
8080: 'Per-world CP',
};
/**
 * Quick liveness probe against a service URL. Returns true if the
 * service responds with ANY HTTP response (1xx-5xx) — we don't care
 * about status codes because each app has its own conventions (Vite
 * 200s on /, ttyd may 401, Rails may 500 on /, the per-world CP 200s).
 * What matters is that something is listening.
 *
 * Probed from inside the host-cp container so we use hostForWorld
 * (host.docker.internal on macOS/Windows, 172.17.0.1 on Linux) — the
 * SPA's own 127.0.0.1:<port> URL is unreachable from container-side.
 *
 * A HEAD request is tried first; if it fails for any reason, a GET is
 * tried as a fallback. Each attempt has its own 800ms timeout, so a dead
 * service can take up to ~1.6s to be reported as not live (services are
 * probed in parallel by the caller, so that is also the worst case added
 * to the whole response).
 *
 * Response bodies are never read; they are cancelled right away so the
 * GET fallback does not keep a connection busy streaming a page nobody
 * consumes.
 *
 * @param {number} hostPort
 * @param {{ hostForWorld: string }} deps
 * @returns {Promise<boolean>}
 */
export async function probeServiceLive(hostPort, { hostForWorld }) {
  const ATTEMPT_TIMEOUT_MS = 800;
  const probeUrl = `http://${hostForWorld}:${hostPort}/`;

  // Best-effort release of an unread response body.
  const discardBody = (res) => {
    try {
      const pending = res.body?.cancel();
      if (pending && typeof pending.catch === 'function') pending.catch(() => {});
    } catch {
      /* nothing to release */
    }
  };

  // One request with the given method; true when any HTTP response arrived.
  const attempt = async (method) => {
    const res = await fetch(probeUrl, {
      method,
      signal: AbortSignal.timeout(ATTEMPT_TIMEOUT_MS),
      redirect: 'manual',
    });
    discardBody(res);
    return res.status > 0;
  };

  try {
    return await attempt('HEAD');
  } catch {
    // ECONNREFUSED, timeout, DNS — or a server that simply closes on HEAD
    // (e.g. ttyd). Retry with GET so picky upstream behavior does not
    // produce a false negative.
    try {
      return await attempt('GET');
    } catch {
      return false;
    }
  }
}
/**
 * Get the running container's port bindings from socket-proxy + map
 * each to a clickable URL. Each service is then probed in parallel
 * for actual reachability — the docker port mapping just tells us
 * what's CONFIGURED; the probe confirms what's actually LISTENING.
 *
 * Container data comes from `docker inspect` when `dockerHost` is the
 * literal `'docker-cli'`, and from the Docker HTTP API at `dockerHost`
 * (with a `tcp://` prefix rewritten to `http://`) otherwise. Both paths
 * are asynchronous and limited to 2s.
 *
 * Returns [] on any docker-inspect failure (container missing, socket-
 * proxy down) so the API still returns a valid worlds list. The result is
 * sorted by ascending internal port.
 *
 * @param {string} worldId
 * @param {{ hostForWorld: string, dockerHost: string }} deps
 * @returns {Promise<Array<{name: string, host_port: number, internal_port: number, url: string, live: boolean}>>}
 */
export async function fetchWorldServices(worldId, { hostForWorld, dockerHost }) {
  const containerName = `olam-${worldId}-devbox`;
  let data;
  try {
    if (dockerHost === 'docker-cli') {
      // Bare-node mode: shell out to `docker inspect` instead of HTTP.
      // Same fix pattern as fetchContainerSecret (PR #108). Without
      // this, the services array is always empty in bare-node and the
      // SPA can't find the ttyd host port → terminal renders blank.
      //
      // Run it asynchronously: a synchronous spawn would block the event
      // loop for the whole docker call. A non-zero exit, a missing docker
      // binary, or the timeout rejects and lands in the outer catch.
      const [{ execFile }, { promisify }] = await Promise.all([
        import('node:child_process'),
        import('node:util'),
      ]);
      const { stdout } = await promisify(execFile)(
        'docker',
        ['inspect', containerName],
        { encoding: 'utf-8', timeout: 2000 },
      );
      const arr = JSON.parse(stdout || '[]');
      data = Array.isArray(arr) && arr.length > 0 ? arr[0] : null;
      if (!data) return [];
    } else {
      const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
      const res = await fetch(`${apiBase}/containers/${encodeURIComponent(containerName)}/json`, {
        signal: AbortSignal.timeout(2000),
      });
      if (!res.ok) return [];
      data = await res.json();
    }

    const ports = data?.NetworkSettings?.Ports ?? {};
    const draft = [];
    for (const [internal, bindings] of Object.entries(ports)) {
      // Exposed-but-unpublished ports have no bindings; skip them.
      if (!Array.isArray(bindings) || bindings.length === 0) continue;
      // Keys look like "8080/tcp"; only the first binding is used.
      const internalPort = parseInt(internal.split('/')[0], 10);
      const hostPort = parseInt(bindings[0].HostPort, 10);
      if (!Number.isFinite(internalPort) || !Number.isFinite(hostPort)) continue;
      draft.push({
        name: WELL_KNOWN_PORTS[internalPort] ?? `App (port ${internalPort})`,
        host_port: hostPort,
        internal_port: internalPort,
        url: `http://127.0.0.1:${hostPort}`,
      });
    }

    // Probe each service in parallel for actual reachability. Adds a
    // `live: boolean` field. The UI dims chips for non-live services
    // so operators can see what's configured-but-down vs configured-
    // and-up at a glance.
    const liveResults = await Promise.all(
      draft.map((s) => probeServiceLive(s.host_port, { hostForWorld })),
    );
    const services = draft.map((s, i) => ({ ...s, live: liveResults[i] }));
    // Stable order: ascending internal port number.
    services.sort((a, b) => a.internal_port - b.internal_port);
    return services;
  } catch {
    return [];
  }
}
import { spawn } from 'node:child_process';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
// Deployment-mode settings. server.mjs overrides them through configure();
// the initial values below are safe for bare-node use, which keeps the
// module usable in tests that never call configure().
let HOST_FOR_WORLD = process.env.OLAM_HOST_FOR_WORLD ?? '127.0.0.1';
let TUNNELS_PATH =
  process.env.OLAM_WORLD_TUNNELS_PATH ??
  path.join(os.homedir(), '.olam', 'world-tunnels.json');

/**
 * Inject the deployment-specific host and persistence path. server.mjs
 * calls this right after it resolves HOST_FOR_WORLD and WORLD_TUNNELS_PATH
 * from its deployment-mode branch, so this module does not have to
 * re-derive container-specific literals (host.docker.internal, /data/…).
 *
 * The host is always replaced. The tunnels path is replaced — and
 * loadState() re-run against it — only when it differs from the current
 * one, so container-mode persistence is loaded from /data/ rather than
 * ~/.olam/.
 */
export function configure(options) {
  const { hostForWorld, tunnelsPath } = options;
  HOST_FOR_WORLD = hostForWorld;
  const pathUnchanged = tunnelsPath === TUNNELS_PATH;
  if (pathUnchanged) return;
  TUNNELS_PATH = tunnelsPath;
  loadState();
}
// How long startTunnel waits for cloudflared to print a public URL before
// giving up.
const TUNNEL_TIMEOUT_MS = 30_000;
// NOTE(review): not referenced in the visible part of this module —
// presumably the timeout for probing an existing tunnel; confirm at its
// use site.
const PROBE_TIMEOUT_MS = 3_000;
// Matches the quick-tunnel URL that startTunnel scans for in cloudflared's
// stdout/stderr.
const URL_PATTERN = /https:\/\/[a-z0-9-]+\.trycloudflare\.com/;
// Lifecycle states stored in each registry entry's `status` field.
export const STATUS = {
IDLE: 'idle',
STARTING: 'starting',
RUNNING: 'running',
ERROR: 'error',
STALE: 'stale',
};
/**
 * Raised by startTunnel when the requested service already has a tunnel
 * in the starting or running state. Carries the offending `worldId` and
 * `serviceName`.
 */
export class AlreadyStartingError extends Error {
  constructor(worldId, serviceName) {
    const detail = `tunnel for ${serviceName} in world ${worldId} is already starting`;
    super(detail);
    Object.assign(this, { name: 'AlreadyStartingError', worldId, serviceName });
  }
}
/**
 * Raised by startTunnel when no public URL was obtained for a service.
 * Carries the affected `worldId` and `serviceName`.
 */
export class TunnelTimeoutError extends Error {
  constructor(worldId, serviceName) {
    const detail = `tunnel for ${serviceName} in world ${worldId} timed out after 30s with no URL`;
    super(detail);
    Object.assign(this, { name: 'TunnelTimeoutError', worldId, serviceName });
  }
}
// In-memory tunnel table.
// Key: `${worldId}:${serviceName}` → {worldId, serviceName, port, status, url, process?}
const registry = new Map();

// Builds the registry key for one service of one world.
function tunnelKey(worldId, serviceName) {
  const parts = [`${worldId}`, `${serviceName}`];
  return parts.join(':');
}
// Merge the tunnel entries persisted at TUNNELS_PATH into the in-memory
// registry. Every loaded entry gets `process: null`, since a process handle
// is never written to disk. A missing file, or content that is not a JSON
// object, leaves the registry untouched; read/parse errors are logged and
// swallowed.
function loadState() {
  try {
    if (!fs.existsSync(TUNNELS_PATH)) return;
    const persisted = JSON.parse(fs.readFileSync(TUNNELS_PATH, 'utf-8'));
    const isKeyedObject =
      persisted && typeof persisted === 'object' && !Array.isArray(persisted);
    if (!isKeyedObject) return;
    Object.keys(persisted).forEach((key) => {
      registry.set(key, { ...persisted[key], process: null });
    });
  } catch (err) {
    console.error(`world-tunnel-manager: loadState failed: ${err.message}`);
  }
}
// Write the registry to TUNNELS_PATH as pretty-printed JSON, leaving out
// each entry's live `process` handle. The file is written to a temporary
// sibling and renamed into place. Failures are logged and swallowed.
function saveState() {
  try {
    fs.mkdirSync(path.dirname(TUNNELS_PATH), { recursive: true });
    const snapshot = {};
    registry.forEach((entry, key) => {
      // eslint-disable-next-line no-unused-vars
      const { process: _child, ...persistable } = entry;
      snapshot[key] = persistable;
    });
    const scratchFile = `${TUNNELS_PATH}.tmp-${process.pid}-${Date.now()}`;
    const serialized = JSON.stringify(snapshot, null, 2);
    fs.writeFileSync(scratchFile, serialized, 'utf-8');
    fs.renameSync(scratchFile, TUNNELS_PATH);
  } catch (err) {
    console.error(`world-tunnel-manager: saveState failed: ${err.message}`);
  }
}
/**
 * Start a cloudflared quick-tunnel for a world service.
 * Resolves with the assigned trycloudflare.com URL.
 * Rejects with AlreadyStartingError if the service is already starting/running.
 * Rejects with TunnelTimeoutError if no URL is obtained — because none was
 * emitted within TUNNEL_TIMEOUT_MS, because cloudflared failed to spawn or
 * exited first, or because the tunnel was stopped while starting.
 *
 * Whenever the start fails, the cloudflared child is terminated so a
 * timed-out tunnel cannot linger as an unmanaged process. The entry's
 * status is only updated while this child is still the entry's current
 * process, so a tunnel stopped through stopTunnel keeps its idle status
 * when the child exits afterwards.
 *
 * @param {string} worldId
 * @param {string} serviceName
 * @param {number} port host-side port (i.e. the published port on this machine)
 * @returns {Promise<string>} the public tunnel URL
 */
export async function startTunnel(worldId, serviceName, port) {
  const key = tunnelKey(worldId, serviceName);
  const existing = registry.get(key);
  if (existing && (existing.status === STATUS.STARTING || existing.status === STATUS.RUNNING)) {
    throw new AlreadyStartingError(worldId, serviceName);
  }
  const entry = {
    worldId,
    serviceName,
    port,
    status: STATUS.STARTING,
    url: null,
    process: null,
  };
  registry.set(key, entry);
  saveState();
  const target = `http://${HOST_FOR_WORLD}:${port}`;
  const child = spawn('cloudflared', ['tunnel', '--url', target], {
    stdio: ['ignore', 'pipe', 'pipe'],
    detached: false,
  });
  entry.process = child;

  // True while this child is still the process the entry is tracking, i.e.
  // nobody (stopTunnel) has detached it in the meantime.
  const ownsEntry = () => entry.process === child;

  return new Promise((resolve, reject) => {
    let settled = false;

    // Settle the promise exactly once: with a URL on success, with null on
    // any failure.
    function settle(resolvedUrl) {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      if (resolvedUrl && ownsEntry()) {
        entry.status = STATUS.RUNNING;
        entry.url = resolvedUrl;
        saveState();
        resolve(resolvedUrl);
        return;
      }
      // Failure: make sure the child does not outlive the attempt. kill()
      // on a process that never started or already exited is harmless.
      try { child.kill('SIGTERM'); } catch { /* already dead */ }
      if (ownsEntry()) {
        entry.status = STATUS.ERROR;
        entry.process = null;
        saveState();
      }
      reject(new TunnelTimeoutError(worldId, serviceName));
    }

    const timer = setTimeout(() => settle(null), TUNNEL_TIMEOUT_MS);

    // cloudflared may print the URL on either stream; take the first match.
    function scanChunk(chunk) {
      const lines = chunk.toString().split('\n');
      for (const line of lines) {
        const match = URL_PATTERN.exec(line);
        if (match) { settle(match[0]); return; }
      }
    }
    child.stdout.on('data', scanChunk);
    child.stderr.on('data', scanChunk);
    child.on('error', (err) => {
      console.error(`world-tunnel-manager: cloudflared spawn error: ${err.message}`);
      settle(null);
    });
    child.on('exit', (code) => {
      if (!settled) {
        console.error(`world-tunnel-manager: cloudflared exited (code ${code}) before URL`);
        settle(null);
        return;
      }
      // Process died after URL was emitted (tunnel dropped). Skip when the
      // entry no longer tracks this child — it was stopped on purpose or
      // already marked as failed.
      if (ownsEntry()) {
        entry.status = STATUS.ERROR;
        entry.process = null;
        saveState();
      }
    });
  });
}
/**
 * Stop the tunnel of one service and reset its registry entry.
 *
 * Sends SIGTERM to the tunnel's child process when one is attached, then
 * marks the entry IDLE with no URL and persists the registry. Does nothing
 * when the service has no registry entry.
 *
 * @param {string} worldId
 * @param {string} serviceName
 */
export function stopTunnel(worldId, serviceName) {
  const tunnel = registry.get(tunnelKey(worldId, serviceName));
  if (!tunnel) return;
  const child = tunnel.process;
  if (child) {
    try {
      child.kill('SIGTERM');
    } catch {
      /* already dead */
    }
    tunnel.process = null;
  }
  Object.assign(tunnel, { status: STATUS.IDLE, url: null });
  saveState();
}
/**
 * Snapshot the tunnel state of every world, grouped by worldId.
 *
 * Per the original module notes this feeds the host-stream broadcaster's
 * `tunnels.snapshot` push, replacing the SPA's per-row `usePublishedTunnels`
 * polling. Child-process handles are never included in the snapshot.
 *
 * @returns {{ [worldId: string]: Array<{name: string, port: number, url: string|null, status: string}> }}
 */
export function getAllTunnels() {
  /** @type {Record<string, Array<{name: string, port: number, url: string|null, status: string}>>} */
  const grouped = {};
  registry.forEach(({ worldId, serviceName, port, url, status }) => {
    const bucket = grouped[worldId] || (grouped[worldId] = []);
    bucket.push({ name: serviceName, port, url, status });
  });
  return grouped;
}
/**
 * List the tunnel state of every service registered for one world, in
 * registry insertion order. Child-process handles are not included.
 *
 * @param {string} worldId
 * @returns {Array<{name: string, port: number, url: string|null, status: string}>}
 */
export function getWorldTunnels(worldId) {
  return Array.from(registry.values())
    .filter((tunnel) => tunnel.worldId === worldId)
    .map(({ serviceName, port, url, status }) => ({
      name: serviceName,
      port,
      url,
      status,
    }));
}
/**
 * Tear down every tunnel that belongs to a world (used when the world is
 * destroyed): SIGTERM each attached child process, drop the entries from the
 * registry, and persist once if anything was removed.
 * Safe to call repeatedly; a world without tunnels is a no-op.
 *
 * @param {string} worldId
 */
export function killWorld(worldId) {
  const doomed = [...registry].filter(([, tunnel]) => tunnel.worldId === worldId);
  for (const [, tunnel] of doomed) {
    if (!tunnel.process) continue;
    try {
      tunnel.process.kill('SIGTERM');
    } catch {
      /* already dead */
    }
    tunnel.process = null;
  }
  doomed.forEach(([key]) => registry.delete(key));
  if (doomed.length !== 0) saveState();
}
/**
 * On startup, probe each persisted "running" tunnel with an HTTP GET bounded
 * by PROBE_TIMEOUT_MS. A tunnel whose probe throws (network error, timeout)
 * or answers with a non-2xx status is marked STALE so the UI can surface a
 * Re-publish affordance.
 *
 * An entry is only marked stale if it is still the same RUNNING tunnel that
 * was probed: entries that were stopped, restarted or removed while the probe
 * was in flight are left untouched. The registry is persisted once, after all
 * probes have finished, and only if something changed.
 *
 * @returns {Promise<void>}
 */
export async function probeAllOnStartup() {
  const toProbe = [];
  for (const [key, entry] of registry) {
    if (entry.status === STATUS.RUNNING && entry.url) {
      toProbe.push({ key, url: entry.url });
    }
  }
  let changed = false;
  await Promise.all(
    toProbe.map(async ({ key, url }) => {
      let reachable = false;
      try {
        const res = await fetch(url, {
          signal: AbortSignal.timeout(PROBE_TIMEOUT_MS),
        });
        reachable = res.ok;
        // Only the status matters; discard the body so the underlying
        // connection is released promptly instead of waiting for GC.
        try { await res.body?.cancel(); } catch { /* body already consumed or locked */ }
      } catch {
        reachable = false;
      }
      if (reachable) return;
      const entry = registry.get(key);
      // Do not clobber state that changed while the probe was in flight.
      if (!entry || entry.status !== STATUS.RUNNING || entry.url !== url) return;
      entry.status = STATUS.STALE;
      changed = true;
    }),
  );
  if (changed) saveState();
}
/**
 * Send SIGTERM to every child process still attached to a registry entry and
 * detach it. Entry status/url are left as they are and nothing is persisted.
 */
function killAll() {
  registry.forEach((tunnel) => {
    const child = tunnel.process;
    if (!child) return;
    try {
      child.kill('SIGTERM');
    } catch {
      /* already dead */
    }
    tunnel.process = null;
  });
}
// Best-effort cleanup: terminate every attached cloudflared child when this
// process receives SIGTERM or is exiting, so tunnels do not outlive their owner.
// NOTE(review): installing a SIGTERM listener removes Node's default
// terminate-on-SIGTERM behaviour, and killAll itself never exits — some other
// handler must end the process. Confirm against the server entry point.
process.on('SIGTERM', killAll);
process.on('exit', killAll);
// Initialise on module load using env-var or bare-node default path.
// configure() re-runs loadState() when server.mjs provides a different path
// (container mode: /data/world-tunnels.json vs the ~/.olam default above).
loadState();
/**
* world-watchdog-pid-lookup.mjs — host-visible PID lookup for the world watchdog.
*
* Uses `docker top <containerId>` to enumerate processes inside a world's
* container and returns the host-visible PID of the claude process.
*
* `docker top` output format (Linux Docker / Colima):
* UID PID PPID C STIME TTY TIME CMD
* root 1234 1 0 10:00 ? 00:00:00 node /usr/local/bin/claude ...
*
* The PID column (index 1 in default ps output) is already the host-visible
* PID. On Mac/Colima the container runs inside a Linux VM so `docker top`
* returns PIDs within the VM's PID namespace — these are NOT the macOS host
* PIDs, but they ARE the PIDs visible from within the Linux layer (where
* /proc reads happen). This is the same namespace the watchdog probes use
* when reading /proc/<pid>/wchan etc., so the PIDs are correct for probe use.
*
* Inject `docker` for tests (avoids spawning real docker processes).
*
* @see docs/architecture/world-watchdog.md
*/
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
// Promise-returning wrapper around child_process.execFile; resolves with { stdout, stderr }.
const execFileAsync = promisify(execFile);
/**
 * Production implementation of the `dockerTop` dependency: runs the real
 * `docker top <containerId>` CLI (no shell involved) with a 5 second timeout.
 *
 * @param {string} containerId
 * @returns {Promise<string>} raw stdout of `docker top <containerId>`
 */
async function defaultDockerTop(containerId) {
  const argv = ['top', containerId];
  const result = await execFileAsync('docker', argv, { timeout: 5_000 });
  return result.stdout;
}
/**
 * Extract the PIDs of claude processes from `docker top` output.
 *
 * Expects the default column layout (ps -ef style):
 *   UID PID PPID C STIME TTY TIME CMD
 * The first non-blank line is treated as the header and ignored. Each
 * remaining line is split on runs of whitespace; column 1 is the PID and
 * columns 7+ are re-joined with single spaces to form CMD (so repeated
 * whitespace inside CMD is collapsed). Rows with fewer than 8 columns or a
 * non-positive / non-numeric PID are skipped.
 *
 * A row matches when CMD is the `claude` binary itself (bare or as the last
 * path segment of some token) or a `node…` process running a path ending in
 * `claude`.
 *
 * @param {string} stdout Raw output from `docker top <id>`
 * @returns {number[]} PIDs of matching claude processes, sorted ascending.
 */
export function parseDockerTopOutput(stdout) {
  const CLAUDE_BINARY = /(?:^|\/)claude(\s|$)/;
  const NODE_RUNNING_CLAUDE = /node[^\s]*\s+.*[/\\]claude(?:\s|$)/;
  const rows = stdout.split('\n').filter((row) => row.trim().length > 0);
  // Header only, or nothing at all.
  if (rows.length < 2) return [];
  return rows
    .slice(1)
    .map((row) => row.trim().split(/\s+/))
    .filter((cols) => cols.length >= 8)
    .map((cols) => ({ pid: parseInt(cols[1], 10), cmd: cols.slice(7).join(' ') }))
    .filter(({ pid }) => Number.isFinite(pid) && pid > 0)
    .filter(({ cmd }) => CLAUDE_BINARY.test(cmd) || NODE_RUNNING_CLAUDE.test(cmd))
    .map(({ pid }) => pid)
    .sort((a, b) => a - b);
}
/**
 * Resolve the PID of the claude process running inside a container, as
 * reported by `docker top`.
 *
 * When several claude processes match, the lowest PID wins (heuristic: the
 * supervisor process is assumed to have been started before its workers).
 *
 * Fail-soft:
 *   - empty/missing containerId            → null (docker is never invoked)
 *   - `dockerTop` throws / rejects         → null, and the failure is logged
 *   - no claude process in the container   → null (silent)
 *
 * @param {{
 *   containerId: string,
 *   dockerTop?: (containerId: string) => Promise<string>,
 *   log?: (msg: string) => void,
 * }} opts
 * @returns {Promise<number | null>}
 */
export async function findClaudePid({
  containerId,
  dockerTop = defaultDockerTop,
  log = (m) => console.log(`[world-watchdog-pid-lookup] ${m}`),
}) {
  if (!containerId) return null;
  let listing;
  try {
    listing = await dockerTop(containerId);
  } catch (err) {
    log(`docker top ${containerId} failed: ${err?.message ?? err}`);
    return null;
  }
  // The parser returns PIDs in ascending order, so the first one is the lowest.
  const [lowest = null] = parseDockerTopOutput(listing);
  return lowest;
}
/**
* world-watchdog-probes.mjs — pure probe functions for the world watchdog.
*
* Three readers extract raw signals from the Linux /proc filesystem:
* - readWchan(pid, opts) → string | null
* - readCloseWaitSockets(pid, opts) → Array<{remoteIp, remotePort}>
* - readCpuPercent(pid, windowMs, opts) → number | null
*
* One pure classifier turns those signals into a verdict:
* - classify({ wchan, closeWaitCount, cpuPercent }) → 'healthy'|'suspect'|'wedged'
*
* All readers are fail-soft: any I/O error or parse error returns
* null / [] / 0 rather than throwing. The classifier treats null inputs as
* the signal not firing (conservative — only promotes to 'wedged' when all
* three signals are conclusive).
*
* Test injection: pass `opts.procRoot` to redirect /proc reads to a fixture
* directory (e.g. src/__tests__/fixtures/proc-gold-elk-5574/).
*
* CLOSE_WAIT threshold note (deviation from D2): Decision D2 specifies
* filtering CLOSE_WAIT by peer hostname (*.anthropic.com | auth-worker.*).
* DNS resolution at every tick is unreliable under network stress (exactly
* when the watchdog must be most accurate). The gold-elk-5574 forensic data
* shows ≥3 CLOSE_WAIT to ANY peer is already diagnostic — a healthy claude
* process has 0-1 CLOSE_WAIT sockets under normal operation. The classifier
* therefore uses count ≥ 3 without hostname filtering. This deviation is
* documented in docs/architecture/world-watchdog.md Signal 2.
*
* @see docs/architecture/world-watchdog.md
* @see packages/host-cp/src/__tests__/world-watchdog-probes.test.mjs
*/
import fs from 'node:fs/promises';
import path from 'node:path';
// Clock ticks per second used to convert the utime/stime counters read from
// /proc/<pid>/stat into seconds of CPU time (ticks / LINUX_HZ = CPU-seconds).
// Those counters are reported in USER_HZ units (the value returned by
// sysconf(_SC_CLK_TCK)), which is 100 on mainstream Linux builds regardless
// of the kernel's internal scheduler tick rate (CONFIG_HZ = 100/250/1000).
// NOTE(review): the value is hard-coded rather than queried; an environment
// with a different USER_HZ would scale readCpuPercent() results accordingly.
const LINUX_HZ = 100;
// Value of the `st` (state) column in /proc/net/tcp for a socket in CLOSE_WAIT.
const CLOSE_WAIT_STATE = '08';
/**
 * Read `<procRoot>/<pid>/wchan` — the kernel wait channel of the process's
 * main thread.
 *
 * @param {number|string} pid Process ID.
 * @param {{ procRoot?: string }} [opts]
 *   `procRoot` defaults to '/proc'; override for tests.
 * @returns {Promise<string|null>}
 *   The trimmed wchan string (e.g. 'futex_wait_queue', 'epoll_wait'), or null
 *   when the file cannot be read or is empty after trimming.
 */
export async function readWchan(pid, opts = {}) {
  const root = opts.procRoot ?? '/proc';
  const file = path.join(root, String(pid), 'wchan');
  const text = await fs.readFile(file, 'utf8').catch(() => null);
  if (text === null) return null;
  const channel = text.trim();
  return channel === '' ? null : channel;
}
/**
 * Collect the CLOSE_WAIT sockets listed in `<procRoot>/<pid>/net/tcp` and
 * `<procRoot>/<pid>/net/tcp6`.
 *
 * Each table is whitespace-separated with a header row; columns (0-based) are
 * 0 = sl, 1 = local_address, 2 = rem_address (hex `IP:port`), 3 = st (hex
 * state). Rows whose state equals CLOSE_WAIT_STATE are returned regardless of
 * the remote peer. Rows with an unparsable remote address are skipped, and a
 * table that cannot be read is skipped without error.
 *
 * NOTE(review): on Linux, /proc/<pid>/net/* describes the process's network
 * namespace rather than only that process's own sockets — confirm this is
 * the intended scope for the signal.
 *
 * @param {number|string} pid Process ID.
 * @param {{ procRoot?: string }} [opts]
 * @returns {Promise<Array<{remoteIp: string, remotePort: number}>>}
 *   CLOSE_WAIT socket descriptors (tcp rows first, then tcp6); empty on error
 *   or when nothing matches.
 */
export async function readCloseWaitSockets(pid, opts = {}) {
  const root = opts.procRoot ?? '/proc';
  const sockets = [];
  for (const table of ['tcp', 'tcp6']) {
    const tablePath = path.join(root, String(pid), 'net', table);
    let text;
    try {
      text = await fs.readFile(tablePath, 'utf8');
    } catch {
      // ENOENT: pid gone or proto not available — skip, not an error.
      continue;
    }
    // Drop the header row, then walk the socket rows.
    for (const rawRow of text.split('\n').slice(1)) {
      const row = rawRow.trim();
      if (row === '') continue;
      const [, , remote, state] = row.split(/\s+/);
      // Short rows leave `state` undefined, which never equals the CLOSE_WAIT code.
      if (state !== CLOSE_WAIT_STATE) continue;
      const sep = remote.lastIndexOf(':');
      if (sep < 0) continue;
      const remoteIp = parseHexIp(remote.slice(0, sep));
      const remotePort = parseInt(remote.slice(sep + 1), 16);
      if (remoteIp === null || !Number.isFinite(remotePort)) continue;
      sockets.push({ remoteIp, remotePort });
    }
  }
  return sockets;
}
/**
 * Measure CPU utilisation for a process over a time window.
 *
 * Reads <procRoot>/<pid>/stat twice, `windowMs` apart, and computes:
 *   cpuPercent = (utime+stime delta) / LINUX_HZ / windowSeconds * 100
 *
 * `windowSeconds` is the wall-clock time that actually elapsed between the
 * two samples (per the `now` clock), never less than the requested
 * `windowMs`. Timers can fire late on a busy event loop; dividing by the
 * nominal window would then overstate CPU usage. Flooring at `windowMs`
 * keeps results unchanged when `sleep`/`now` are mocked to return instantly.
 *
 * @param {number|string} pid Process ID.
 * @param {number} windowMs Measurement window in milliseconds (must be > 0).
 * @param {{ procRoot?: string, sleep?: (ms: number) => Promise<void>, now?: () => number }} [opts]
 *   `sleep`    — injectable delay function (default: real setTimeout).
 *   `now`      — injectable millisecond clock (default: Date.now).
 *   `procRoot` — injectable proc root for tests.
 * @returns {Promise<number|null>}
 *   CPU percent (0–100+; above 100 for multi-threaded load), or null when a
 *   sample cannot be read/parsed, the counters went backwards, or `windowMs`
 *   is not a positive number.
 */
export async function readCpuPercent(pid, windowMs, opts = {}) {
  const procRoot = opts.procRoot ?? '/proc';
  const sleep = opts.sleep ?? ((ms) => new Promise((r) => setTimeout(r, ms)));
  const now = opts.now ?? Date.now;
  const statPath = path.join(procRoot, String(pid), 'stat');
  const before = await readStatTimes(statPath);
  if (before === null) return null;
  const startedAt = now();
  await sleep(windowMs);
  const after = await readStatTimes(statPath);
  if (after === null) return null;
  const elapsedMs = now() - startedAt;
  const deltaTicks = (after.utime + after.stime) - (before.utime + before.stime);
  if (deltaTicks < 0) return null;
  // Rejects 0, negatives and NaN/undefined (which would otherwise yield NaN).
  if (!(windowMs > 0)) return null;
  const effectiveMs = Number.isFinite(elapsedMs) && elapsedMs > windowMs ? elapsedMs : windowMs;
  // deltaTicks / LINUX_HZ = CPU-seconds consumed; effectiveMs / 1000 = wall-clock seconds.
  return (deltaTicks / LINUX_HZ / (effectiveMs / 1000)) * 100;
}
// ── Internal helpers ──────────────────────────────────────────────────────────
/**
 * Read a /proc/<pid>/stat file and extract its utime and stime counters.
 *
 * Layout: `pid (comm) state ppid pgrp session tty_nr tpgid flags minflt
 * cminflt majflt cmajflt utime stime ...`. `comm` may itself contain spaces
 * and parentheses, so parsing starts after the LAST ')'. Counting from the
 * first field after it (state = 0), utime is index 11 and stime index 12.
 *
 * @param {string} statPath
 * @returns {Promise<{utime: number, stime: number}|null>}
 *   null when the file cannot be read, has no ')', has too few fields, or
 *   either counter is not numeric.
 */
async function readStatTimes(statPath) {
  const text = await fs.readFile(statPath, 'utf8').catch(() => null);
  if (text === null) return null;
  const commEnd = text.lastIndexOf(')');
  if (commEnd < 0) return null;
  const tail = text.slice(commEnd + 1).trim().split(/\s+/);
  if (tail.length < 13) return null;
  const [utime, stime] = tail.slice(11, 13).map((field) => parseInt(field, 10));
  return Number.isFinite(utime) && Number.isFinite(stime) ? { utime, stime } : null;
}
/**
 * Parse a hex-encoded IP address from /proc/net/tcp{,6} format.
 *
 * IPv4: 8 hex chars — one 32-bit word with its bytes in reverse order
 *       (e.g. "0100007F" → "127.0.0.1").
 * IPv6: 32 hex chars — four such 32-bit words; the bytes of each word are
 *       reversed and then paired into eight 16-bit groups, returned
 *       uncompressed and in the input's letter case
 *       (e.g. "…0000000001000000" → "0000:…:0000:0001").
 *
 * The byte reversal assumes the table was produced on a little-endian host.
 *
 * @param {string} hexIp
 * @returns {string|null}
 *   The formatted address, or null when the input is not a string, is not
 *   exactly 8 or 32 characters long, or contains any non-hex character.
 */
function parseHexIp(hexIp) {
  // Validate up front: parseInt would silently accept a partly valid pair
  // such as "1G", and the IPv6 path never parses at all.
  if (typeof hexIp !== 'string' || !/^[0-9a-fA-F]+$/.test(hexIp)) return null;
  if (hexIp.length !== 8 && hexIp.length !== 32) return null;
  // Reverse the four bytes of one 8-char word.
  const reverseWord = (word) => [
    word.slice(6, 8),
    word.slice(4, 6),
    word.slice(2, 4),
    word.slice(0, 2),
  ];
  if (hexIp.length === 8) {
    return reverseWord(hexIp).map((byte) => parseInt(byte, 16)).join('.');
  }
  const groups = [];
  for (let offset = 0; offset < 32; offset += 8) {
    const [b0, b1, b2, b3] = reverseWord(hexIp.slice(offset, offset + 8));
    // Pair bytes into 16-bit groups for IPv6 notation.
    groups.push(b0 + b1, b2 + b3);
  }
  return groups.join(':');
}
// ── Classifier ───────────────────────────────────────────────────────────────
/**
* @typedef {'healthy'|'suspect'|'wedged'} WatchdogVerdict
*/
/**
 * Turn the three probe signals into a watchdog verdict.
 *
 * A signal "fires" when:
 *   - wchan is exactly 'futex_wait_queue'
 *   - closeWaitCount is a number ≥ 3
 *   - cpuPercent is a number < 1
 * All three firing → 'wedged'; one or two → 'suspect'; none → 'healthy'.
 * Null (or otherwise non-matching) inputs simply do not fire.
 *
 * @param {{ wchan: string|null, closeWaitCount: number|null, cpuPercent: number|null }} signals
 * @returns {WatchdogVerdict}
 */
export function classify({ wchan, closeWaitCount, cpuPercent }) {
  const fired = [
    wchan === 'futex_wait_queue',
    typeof closeWaitCount === 'number' && closeWaitCount >= 3,
    typeof cpuPercent === 'number' && cpuPercent < 1,
  ].filter(Boolean).length;
  switch (fired) {
    case 3:
      return 'wedged';
    case 0:
      return 'healthy';
    default:
      return 'suspect';
  }
}
/**
* world-watchdog-recovery.mjs — recovery hook for wedged claude processes.
*
* Isolated from world-watchdog.mjs so kill + replay logic is independently
* mockable in tests without touching the watchdog's ticker.
*
* API:
* createRecovery({ autoRecoverMode, leakyBucket, broadcaster, persister,
* replay, processKill, log })
* → { onWedgedVerdict({ worldId, pid }): Promise<void> }
*
* Three modes (from compute.autoRecover in .olam/config.yaml):
* false — no-op; recovery never fires even on wedged verdict (DEFAULT)
* 'dry-run' — emits all breadcrumbs, never calls processKill or replay
* true — SIGKILL pid + read last-dispatch + replay; rate-limited
*
* Rate-limit: B2 leaky-bucket (3/hour/world). 4th wedge in window emits
* world.watchdog.recovery.budget_exhausted and skips all action.
*
* Replay stub: the `replay` dep is accepted as an injected function. In
* server.mjs it is wired to a console.warn stub + breadcrumb until the
* operator runs the B3 idempotence probe and signs off. See TODO below.
*
* @see docs/architecture/world-watchdog.md Recovery section
* @see packages/host-cp/src/lib/leaky-bucket.mjs
* @see packages/host-cp/src/dispatch-persister.mjs
*/
/**
 * @typedef {false|true|'dry-run'} AutoRecoverMode
*/
/**
* @typedef {object} RecoveryDeps
* @property {false|true|'dry-run'} autoRecoverMode
* Passed from server.mjs which reads config.compute.autoRecover.
* Default false if config unavailable.
* @property {{ tryConsume(key: string): { allowed: boolean, retryAfterMs?: number, totalInWindow: number } }} leakyBucket
* B2 leaky-bucket instance. Keyed by worldId.
* @property {{ broadcast(type: string, payload: object): void }} [broadcaster]
* Host-stream broadcaster. Optional — when absent, breadcrumbs are skipped.
* @property {{ read({ worldId: string }): Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null> }} persister
* B4 dispatch-persister read function.
* @property {(opts: { worldId: string, messageId: string, prompt: string }) => Promise<void>} replay
* Opaque dispatch helper. Injected dep — DO NOT implement dispatch here.
* In server.mjs this is wired to a stub until operator signs off on B3 probe.
* @property {(pid: number) => void} [processKill]
* process.kill indirection so tests can spy without actually killing.
* Defaults to process.kill.
* @property {(msg: string) => void} [log]
* Logger. Defaults to console.log with [world-watchdog-recovery] prefix.
*/
/**
* @typedef {object} RecoveryHandle
* @property {(opts: { worldId: string, pid: number|null }) => Promise<void>} onWedgedVerdict
*/
/**
 * Create a recovery handle.
 *
 * The returned `onWedgedVerdict` acts according to `autoRecoverMode`:
 *   - `true`      — SIGKILL the pid (via `processKill`), then call `replay`
 *                   with the last persisted dispatch if there is one.
 *   - `'dry-run'` — emit the start/complete breadcrumbs and log the planned
 *                   action; never kills or replays.
 *   - anything else (including `false`, the string `'false'`, `null`, …)
 *                 — no-op. The mode is matched strictly so a mis-typed or
 *                   stringified config value can never cause a kill.
 *
 * Pids that are not positive integers are ignored: `process.kill` treats
 * 0 and negative values as process-group targets, which must never be
 * signalled from here.
 *
 * NOTE(review): the default `processKill` signals `pid` in this process's
 * own PID namespace; the caller must supply a pid that is valid there —
 * confirm for VM-backed Docker setups.
 *
 * @param {RecoveryDeps} deps
 * @returns {RecoveryHandle}
 */
export function createRecovery({
  autoRecoverMode = false,
  leakyBucket,
  broadcaster = null,
  persister,
  replay,
  processKill = (pid) => process.kill(pid, 'SIGKILL'),
  log = (m) => console.log(`[world-watchdog-recovery] ${m}`),
} = {}) {
  /**
   * Emit a breadcrumb via broadcaster (fail-soft).
   *
   * @param {string} type
   * @param {object} payload
   */
  function broadcast(type, payload) {
    if (!broadcaster || typeof broadcaster.broadcast !== 'function') return;
    try {
      broadcaster.broadcast(type, payload);
    } catch (err) {
      log(`broadcast ${type} failed: ${err?.message ?? err}`);
    }
  }
  /**
   * Handle a 2-tick-confirmed wedged verdict for a world.
   *
   * Called by world-watchdog.mjs on verdict-transition only (suspect → wedged),
   * NOT on steady-state re-wedge.
   *
   * @param {{ worldId: string, pid: number|null }} opts
   * @returns {Promise<void>}
   */
  async function onWedgedVerdict({ worldId, pid }) {
    // Fail closed: only the two documented active modes may proceed.
    if (autoRecoverMode !== true && autoRecoverMode !== 'dry-run') return;
    // PID null → watchdog hasn't resolved a real PID yet (Phase A stub case);
    // skip silently — there is nothing to kill.
    if (pid === null || pid === undefined) return;
    // Never hand 0 / negative / non-integer values to process.kill: those
    // address process groups (or throw) rather than the wedged process.
    if (!Number.isInteger(pid) || pid <= 0) {
      log(`worldId=${worldId}: ignoring invalid pid=${String(pid)}; skipping recovery`);
      return;
    }
    // Rate-limit gate.
    const bucket = leakyBucket.tryConsume(worldId);
    if (!bucket.allowed) {
      broadcast('world.watchdog.recovery.budget_exhausted', {
        worldId,
        retryAfterMs: bucket.retryAfterMs,
        totalInWindow: bucket.totalInWindow,
      });
      log(`worldId=${worldId}: budget exhausted (${bucket.totalInWindow} in window); skipping recovery`);
      return;
    }
    // Read last persisted dispatch for replay. A read failure is logged and
    // treated the same as "no last dispatch".
    let lastDispatch = null;
    try {
      lastDispatch = await persister.read({ worldId });
    } catch (err) {
      log(`worldId=${worldId}: persister.read failed: ${err?.message ?? err}`);
    }
    broadcast('world.watchdog.recovery.start', {
      worldId,
      pid,
      mode: autoRecoverMode,
      lastDispatchMessageId: lastDispatch?.messageId ?? null,
    });
    // dry-run — log planned action but do NOT kill.
    if (autoRecoverMode === 'dry-run') {
      log(`worldId=${worldId}: dry-run — would SIGKILL pid=${pid}${lastDispatch ? ` + replay messageId=${lastDispatch.messageId}` : ' (no last-dispatch)'}`);
      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: 'dry-run',
        replayed: false,
      });
      return;
    }
    // mode=true — act.
    try {
      // 1. SIGKILL the wedged process.
      processKill(pid);
      log(`worldId=${worldId}: SIGKILL sent to pid=${pid}`);
      // 2. Replay or note absence of last-dispatch.
      if (!lastDispatch) {
        broadcast('world.watchdog.recovery.restart_without_replay', {
          worldId,
          pid,
        });
        log(`worldId=${worldId}: no last-dispatch; killed without replay`);
      } else {
        // TODO: wire real replay once operator has run the B3 idempotence probe
        // and confirmed dispatch is idempotent for the substrates in use.
        // Until then this stub logs and emits a breadcrumb so the stub path
        // is visible in production logs. See B3 probe + operator review gate B6.
        broadcast('world.watchdog.recovery.replay_stub', {
          worldId,
          prompt: lastDispatch.prompt,
        });
        log(`worldId=${worldId}: replay stub hit — real replay deferred pending B3 sign-off`);
        await replay({
          worldId,
          messageId: lastDispatch.messageId,
          prompt: lastDispatch.prompt,
        });
      }
      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: true,
        replayed: !!lastDispatch,
      });
    } catch (err) {
      // Covers both a failed kill and a failed replay.
      log(`worldId=${worldId}: recovery failed: ${err?.message ?? err}`);
      broadcast('world.watchdog.recovery.failed', {
        worldId,
        pid,
        error: err?.message ?? String(err),
      });
    }
  }
  return { onWedgedVerdict };
}
/**
* world-watchdog.mjs — periodic watchdog that probes each active world's
* `claude` PID for the three wedge signals (wchan + CLOSE_WAIT + CPU) and
* emits `world.watchdog.tick` events on the host-stream broadcaster.
*
* Design:
* - Mirrors `world-activity-tracker.mjs` shape exactly: `startWorldWatchdog(deps)`
* returns `{ stop, tickNow }`.
* - Per-world 2-tick confirm: a `'wedged'` classification is only emitted
* after TWO consecutive ticks with the wedge signature. A single-tick
* wedge emits `'suspect'`. A healthy tick resets the streak.
* - Per-world fail-soft: a probe error for one world never skips other worlds.
* - `OLAM_WORLD_WATCHDOG_DISABLED=1` → `start()` is a no-op (returns stub).
* - Cadence: `OLAM_WORLD_WATCHDOG_TICK_MS` env or `intervalMs` dep (default 30_000).
*
* v1 stub: `getClaudePidForWorld(worldId)` returns null for all worlds in
* Phase A. When null, the tick still fires but all probe signals are null,
* producing `verdict: 'unknown'`. Real PID lookup (docker inspect →
* /proc/<hostPid>/status NSpid field) is wired in a follow-up.
* This is documented here and in docs/architecture/world-watchdog.md.
*
* Wire-in: `server.mjs` constructs once after broadcaster is ready and calls
* `.stop()` from the SIGTERM/SIGINT handler. Gated on `!SERVE_ONLY`.
*
* @see docs/architecture/world-watchdog.md
* @see packages/host-cp/src/world-watchdog-probes.mjs
* @see packages/host-cp/src/world-activity-tracker.mjs (shape reference)
*/
import {
readWchan,
readCloseWaitSockets,
readCpuPercent,
classify,
} from './world-watchdog-probes.mjs';
// Recovery hook (B5). Optional dep — when absent (recovery is null/undefined),
// the watchdog behaves exactly as Phase A: detection-only, no kill, no replay.
// Wire via startWorldWatchdog({ recovery: createRecovery({...}) }) in server.mjs.
// Default tick cadence (ms); used when neither `deps.intervalMs` nor the
// OLAM_WORLD_WATCHDOG_TICK_MS environment variable supplies one.
const DEFAULT_TICK_MS = 30_000;
// CPU measurement window (ms) handed to readCpuPercent for each probed world.
// It is far shorter than the tick cadence, but worlds are probed one after
// another within a tick, so a tick's duration grows with the number of worlds;
// a tick that fires while the previous one is still in flight is skipped.
const CPU_WINDOW_MS = 500;
/**
* @typedef {object} WorldWatchdogDeps
* @property {object} [broadcaster] Object with `.broadcast(type, payload)`.
* Optional — when absent events are skipped but state tracking still works.
* @property {number} [intervalMs] Tick cadence in ms. Defaults to
* `OLAM_WORLD_WATCHDOG_TICK_MS` env or 30_000.
* @property {() => Promise<string[]>} [listActiveWorlds]
* Returns an array of active world IDs to probe each tick.
* Defaults to returning [].
* @property {(worldId: string) => Promise<number|null>} [getClaudePidForWorld]
* Returns the host-side PID of the claude process for a world, or null.
* v1 default: always returns null (all worlds → verdict 'unknown').
* @property {{ procRoot?: string }} [probes]
* Injectable probe options (procRoot for tests).
* @property {{ onWedgedVerdict(opts: { worldId: string, pid: number|null }): Promise<void> }} [recovery]
* Optional recovery handle (from world-watchdog-recovery.mjs). When present,
* called once on verdict-transition to 'wedged' (suspect → wedged), NOT on
* steady-state re-wedge. When absent, detection-only (Phase A behaviour).
* @property {(msg: string) => void} [log] Defaults to `console.log`.
* @property {(msg: string) => void} [debug] Defaults to no-op.
* @property {(cb: () => void, ms: number) => any} [setTimer]
* Injectable `setInterval` for tests.
* @property {(handle: any) => void} [clearTimer]
* Injectable `clearInterval` for tests.
* @property {() => Date} [now] Clock injection for tests.
*/
/**
* @typedef {object} WorldWatchdogHandle
* @property {() => void} stop
* @property {() => Promise<number>} tickNow Run one tick immediately (returns
* the count of worlds processed). Exposed for tests.
* @property {(worldId: string) => object|null} getVerdict
* Returns the latest in-memory verdict entry for a world, or null if no tick
* has fired yet. Used by the HTTP endpoint (A5).
*/
/**
* Per-world state tracked between ticks for the 2-tick confirm.
*
* @typedef {object} WorldWatchdogState
* @property {'healthy'|'suspect'|'wedged'|'unknown'} lastClassification
* The raw classification from the previous tick (before 2-tick confirm).
* @property {'healthy'|'suspect'|'wedged'|'unknown'} lastVerdict
* The emitted verdict (post-confirm).
* @property {string} lastTickAt ISO-8601 timestamp of last tick.
* @property {object|null} lastSignals The signals from the last tick.
* @property {number|null} lastPid The PID probed last tick.
*/
/**
* Start the world watchdog. Returns a `{ stop, tickNow, getVerdict }` handle.
*
* Honoring `OLAM_WORLD_WATCHDOG_DISABLED=1`: if the env var is set, returns
* a no-op stub immediately without starting the interval or making any probe
* calls.
*
* @param {WorldWatchdogDeps} [deps]
* @returns {WorldWatchdogHandle}
*/
export function startWorldWatchdog(deps = {}) {
// Honour kill switch — return a no-op stub.
if (process.env.OLAM_WORLD_WATCHDOG_DISABLED === '1') {
return {
stop() {},
tickNow: async () => 0,
getVerdict: () => null,
};
}
const log = deps.log ?? ((m) => console.log(`[world-watchdog] ${m}`));
const debug = deps.debug ?? (() => {});
const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
const now = deps.now ?? (() => new Date());
const intervalMs =
deps.intervalMs ??
parseInt(process.env.OLAM_WORLD_WATCHDOG_TICK_MS ?? `${DEFAULT_TICK_MS}`, 10);
const broadcaster = deps.broadcaster ?? null;
const listActiveWorlds = deps.listActiveWorlds ?? (async () => []);
const getClaudePidForWorld = deps.getClaudePidForWorld ?? (async (_id) => null);
const probeOpts = deps.probes ?? {};
// Recovery hook — null when not configured (Phase A / default-off behaviour).
const recovery = deps.recovery ?? null;
// Per-world state map: worldId → WorldWatchdogState.
/** @type {Map<string, WorldWatchdogState>} */
const worldState = new Map();
let stopped = false;
let inFlight = false;
let intervalHandle = null;
/**
* Probe a single world and update its state. Returns the verdict emitted.
*
* @param {string} worldId
* @returns {Promise<'healthy'|'suspect'|'wedged'|'unknown'>}
*/
async function probeWorld(worldId) {
  const pid = await getClaudePidForWorld(worldId);

  let wchan = null;
  let closeWaitSockets = [];
  let cpuPercent = null;
  if (pid !== null) {
    // All probes are fail-soft — they return null/[] on I/O error.
    [wchan, closeWaitSockets, cpuPercent] = await Promise.all([
      readWchan(pid, probeOpts),
      readCloseWaitSockets(pid, probeOpts),
      readCpuPercent(pid, CPU_WINDOW_MS, probeOpts),
    ]);
  }
  const closeWaitCount = closeWaitSockets.length;

  // Without a pid there is nothing to sample: signals are null and the raw
  // classification is 'unknown'.
  const signals = pid !== null ? { wchan, closeWaitCount, cpuPercent } : null;
  const rawClassification = pid !== null
    ? classify({ wchan, closeWaitCount, cpuPercent })
    : 'unknown';

  // 2-tick confirm: only emit 'wedged' if BOTH this tick AND the previous tick
  // classified as 'wedged'. A first 'wedged' tick is reported as 'suspect';
  // every other classification is emitted unchanged.
  const prev = worldState.get(worldId);
  let verdict = rawClassification;
  if (rawClassification === 'wedged' && prev?.lastClassification !== 'wedged') {
    verdict = 'suspect';
  }

  const tickAt = now().toISOString();
  // Update per-world state. The RAW classification is stored (not the
  // verdict) so the next tick can confirm a wedge.
  worldState.set(worldId, {
    lastClassification: rawClassification,
    lastVerdict: verdict,
    lastTickAt: tickAt,
    lastSignals: signals,
    lastPid: pid,
  });

  // Recovery hook — fire ONCE on verdict-transition to 'wedged' (not on
  // steady-state re-wedge). Guard: prev?.lastVerdict !== 'wedged' ensures
  // only the suspect→wedged transition triggers, not wedged→wedged.
  if (verdict === 'wedged' && recovery !== null && prev?.lastVerdict !== 'wedged') {
    const reportRecoveryFailure = (err) => {
      log(`recovery.onWedgedVerdict ${worldId} failed: ${err?.message ?? err}`);
    };
    try {
      // Fire-and-forget. Promise.resolve() tolerates a hook that returns a
      // plain value, and the surrounding try/catch tolerates a hook that
      // throws synchronously — either way the broadcast below still happens
      // and the verdict is still returned.
      void Promise.resolve(recovery.onWedgedVerdict({ worldId, pid })).catch(reportRecoveryFailure);
    } catch (err) {
      reportRecoveryFailure(err);
    }
  }

  // Emit broadcaster event; a failing broadcaster is logged, never rethrown.
  if (broadcaster && typeof broadcaster.broadcast === 'function') {
    try {
      broadcaster.broadcast('world.watchdog.tick', {
        worldId,
        verdict,
        signals,
        pid,
        lastTickAt: tickAt,
      });
    } catch (err) {
      log(`broadcast ${worldId} failed: ${err?.message ?? err}`);
    }
  }
  return verdict;
}
/**
 * Run a single watchdog pass over every active world.
 *
 * Passes never overlap: if a previous pass is still running (or the watchdog
 * has been stopped) this call does nothing and resolves to 0. A failure to
 * list worlds is logged and also resolves to 0. Non-string world ids are
 * ignored, and a world whose probe throws is skipped without being counted.
 *
 * @returns {Promise<number>} number of worlds probed successfully
 */
async function tick() {
  if (stopped) {
    return 0;
  }
  if (inFlight) {
    debug('tick skipped: previous tick still in flight');
    return 0;
  }

  inFlight = true;
  try {
    let activeWorlds;
    try {
      activeWorlds = await listActiveWorlds();
    } catch (err) {
      log(`listActiveWorlds failed: ${err?.message ?? err}`);
      return 0;
    }

    let probedCount = 0;
    for (const worldId of activeWorlds) {
      // stop() may be called while an earlier probe was awaited.
      if (stopped) {
        break;
      }
      if (typeof worldId === 'string') {
        try {
          await probeWorld(worldId);
          probedCount += 1;
        } catch (err) {
          // Per-world fail-soft: one bad world doesn't crash the loop.
          debug(`probe ${worldId} failed: ${err?.message ?? err}`);
        }
      }
    }
    return probedCount;
  } finally {
    // Always release the guard, even when the pass bails out early.
    inFlight = false;
  }
}
// Kick off an initial tick on next event-loop turn so callers can
// attach test spies before any probe work happens.
// The `stopped` re-check covers a stop() call made before this callback runs.
setImmediate(() => {
if (stopped) return;
void tick().catch((err) => {
log(`initial tick crashed: ${err?.message ?? err}`);
});
});
// Recurring ticks. A rejection from tick() is logged here rather than left
// as a floating promise. NOTE(review): setTimer/clearTimer are defined
// outside this fragment — presumably injectable interval functions; confirm.
intervalHandle = setTimer(() => {
void tick().catch((err) => {
log(`tick crashed: ${err?.message ?? err}`);
});
}, intervalMs);
// Don't pin the event loop on shutdown.
// unref() is feature-detected because the handle may not expose it.
if (intervalHandle && typeof intervalHandle.unref === 'function') {
intervalHandle.unref();
}
log(`started: interval=${intervalMs}ms`);
// Handle returned to the caller: stop(), tickNow() and getVerdict().
return {
/**
* Stop the watchdog. Safe to call more than once: later calls are no-ops.
* Setting `stopped` also makes tick() return early and break out of its
* per-world loop. Errors thrown while clearing the timer are ignored.
*/
stop() {
if (stopped) return;
stopped = true;
if (intervalHandle !== null) {
try { clearTimer(intervalHandle); } catch { /* ignore */ }
intervalHandle = null;
}
},
// Run one tick on demand; this is the same function the timer invokes.
tickNow: tick,
/**
* Return the latest in-memory verdict entry for a world.
* Returns null if no tick has fired for this world yet.
*
* @param {string} worldId
* @returns {WorldWatchdogState|null}
*/
getVerdict(worldId) {
return worldState.get(worldId) ?? null;
},
};
}
/**
* WorldsDbSource — reconcile loop that reads ~/.olam/worlds.db and
* auto-registers running worlds into host-cp's in-memory registry.
*
* Two triggers (belt-and-suspenders):
* 1. fs.watch on the worlds.db file — fires within ~100ms of a write
* 2. 30s setInterval backstop — catches cases where fs.watch silently
* misses events (network filesystems, some Linux kernels)
*
* Uses better-sqlite3 for synchronous, lightweight reads. If the module
* is not installed (e.g., no native build in the container), the module
* logs a warning and exits without crashing the server.
*
* DB handle: deliberately NOT cached across reconcile calls. A long-lived
* readonly connection with the DB bind-mounted across the docker boundary
* does not reliably pick up writes committed on the host side — the host
* writer appends to the WAL, but the container reader's snapshot is stuck
* at the point the handle was first opened. Closing and reopening on every
* reconcile forces a new read transaction that sees all committed WAL
* frames. Cost: ~1 ms per call at a 30 s interval — negligible. This
* eliminates the entire class of "olam create world vanishes within 30 s"
* bugs (regression confirmed: ember-elk-9191 removed by reconciler despite
* being present in worlds.db with status=running).
*
* Interface: thin wrapper so a future "remote" source (cloud orchestrator)
* can drop in via the same WorldsSource interface in worlds-source.mjs.
*/
import fs from 'node:fs';
import { createRequire } from 'node:module';
const require = createRequire(import.meta.url);
/**
* @typedef {object} WorldsDbSourceDeps
* @property {string} dbPath Path to worlds.db (OLAM_WORLDS_DB or ~/.olam/worlds.db)
* @property {string} dockerHost Docker API base URL (tcp://host:port)
* @property {string} worldHost Host used to reach world CPs (127.0.0.1 or host.docker.internal)
* @property {() => Record<string, number>} getRegistry Current WORLDS map
* @property {(id: string, port: number) => void} onWorldAdded Called when a new running world is found
* @property {(id: string) => void} onWorldRemoved Called when a running world disappears
* @property {(msg: string) => void} [log]
*/
/**
 * Look up the host port published for a world's control plane by asking the
 * Docker API to inspect the `olam-<worldId>-devbox` container.
 *
 * Resolves to null whenever the port cannot be determined: non-OK HTTP
 * status, no binding for 8080/tcp, an unparsable HostPort, or any error
 * raised by the request (including the 3 s timeout).
 *
 * @param {string} worldId
 * @param {string} dockerHost e.g. 'tcp://docker-socket-proxy:2375'
 * @returns {Promise<number | null>}
 */
async function getWorldPortFromDocker(worldId, dockerHost) {
  // The Docker API is reached over plain HTTP; only the scheme is rewritten.
  const baseUrl = dockerHost.replace(/^tcp:\/\//, 'http://');
  const devboxName = `olam-${worldId}-devbox`;

  try {
    const inspectUrl = `${baseUrl}/containers/${encodeURIComponent(devboxName)}/json`;
    const response = await fetch(inspectUrl, { signal: AbortSignal.timeout(3000) });
    if (!response.ok) {
      return null;
    }

    const inspected = await response.json();
    // Per-world CP runs on internal port 8080; host port is the published binding.
    const published = inspected?.NetworkSettings?.Ports?.['8080/tcp'];
    const hasBinding = Array.isArray(published) && published.length > 0;
    if (!hasBinding) {
      return null;
    }

    const parsedPort = Number.parseInt(published[0].HostPort, 10);
    if (Number.isFinite(parsedPort)) {
      return parsedPort;
    }
    return null;
  } catch {
    // Network failure, timeout, bad JSON, malformed binding — all mean "unknown".
    return null;
  }
}
/**
 * Start the worlds-db reconcile loop. Returns a stop function.
 *
 * Each reconcile pass:
 *   1. opens worlds.db read-only, reads the ids with status 'running', and
 *      closes the handle again straight away (a fresh handle per pass is
 *      deliberate — see the module header);
 *   2. calls `onWorldRemoved` for registry entries that are not running;
 *   3. calls `onWorldAdded` for running worlds missing from the registry,
 *      once their published port has been resolved through Docker.
 *
 * Passes are triggered by fs.watch events on the DB file and by a 30 s
 * interval. They are serialised: a trigger that arrives while a pass is
 * running schedules exactly one follow-up pass instead of running
 * concurrently. Errors thrown by the injected callbacks are logged and never
 * propagate, so a misbehaving callback cannot produce an unhandled rejection.
 *
 * @param {WorldsDbSourceDeps} deps
 * @returns {{ stop: () => void }}
 */
export function startWorldsDbReconciler(deps) {
  const { dbPath, dockerHost, getRegistry, onWorldAdded, onWorldRemoved, log = console.log } = deps;
  let stopped = false;
  let watcher = null;
  let passInFlight = false; // a reconcile pass is currently running
  let rerunRequested = false; // a trigger arrived while a pass was running

  // Read the ids of all running worlds, or null when the DB cannot be read.
  // Entirely synchronous, so the handle never outlives this call.
  function readRunningIds() {
    let database;
    try {
      // Dynamic require — gracefully degrade if better-sqlite3 is not installed.
      // better-sqlite3 is CommonJS-only; createRequire enables sync dynamic loading in ESM.
      const Database = require('better-sqlite3');
      database = new Database(dbPath, { readonly: true, fileMustExist: true });
      log(`[worlds-db] opened ${dbPath}`);
    } catch (err) {
      if (err?.code === 'MODULE_NOT_FOUND') {
        log('[worlds-db] better-sqlite3 not available; skipping DB reconciler');
      } else if (err?.code !== 'SQLITE_CANTOPEN') {
        log(`[worlds-db] failed to open ${dbPath}: ${err?.message ?? err}`);
      }
      return null;
    }
    try {
      const rows = database.prepare("SELECT id FROM worlds WHERE status = 'running'").all();
      return new Set(rows.map((row) => row.id));
    } catch (err) {
      log(`[worlds-db] query failed: ${err?.message ?? err}`);
      return null;
    } finally {
      // Always close — the next pass opens a fresh handle.
      try { database.close(); } catch { /* ignore */ }
    }
  }

  // Invoke a caller-supplied callback; a throw is logged, not propagated, so
  // one bad world cannot abort the rest of the pass.
  function notify(label, id, invoke) {
    try {
      invoke();
    } catch (err) {
      log(`[worlds-db] ${label} ${id} failed: ${err?.message ?? err}`);
    }
  }

  // One reconcile pass over a single DB snapshot.
  async function reconcileOnce() {
    const runningIds = readRunningIds();
    if (runningIds === null) return;
    const registry = getRegistry();

    // Remove worlds that are registered but no longer 'running' in DB.
    // Done BEFORE any await so the decision is based on the snapshot just
    // read, not on one that aged while Docker lookups were in flight.
    for (const id of Object.keys(registry)) {
      if (runningIds.has(id)) continue;
      log(`[worlds-db] reconcile: removing ${id} (not running in DB)`);
      notify('onWorldRemoved', id, () => onWorldRemoved(id));
    }

    // Add worlds that are running in DB but missing from registry.
    for (const id of runningIds) {
      // Own-property check: `in` would also match inherited keys such as
      // 'constructor' and silently skip a world with that id.
      if (Object.hasOwn(registry, id)) continue;
      const port = await getWorldPortFromDocker(id, dockerHost);
      // stop() may have been called while the lookup was pending.
      if (stopped) return;
      if (port === null) {
        log(`[worlds-db] world ${id} running in DB but no docker port found; skipping`);
        continue;
      }
      log(`[worlds-db] reconcile: adding ${id} → :${port}`);
      notify('onWorldAdded', id, () => onWorldAdded(id, port));
    }
  }

  // Serialised entry point used by every trigger. Never rejects.
  async function reconcile() {
    if (stopped) return;
    if (passInFlight) {
      // Coalesce bursts of triggers into a single follow-up pass.
      rerunRequested = true;
      return;
    }
    passInFlight = true;
    try {
      do {
        rerunRequested = false;
        try {
          await reconcileOnce();
        } catch (err) {
          log(`[worlds-db] reconcile failed: ${err?.message ?? err}`);
        }
      } while (rerunRequested && !stopped);
    } finally {
      passInFlight = false;
    }
  }

  // Create the fs.watch watcher (throws if fs.watch itself throws). An
  // 'error' listener is attached because an FSWatcher error without one
  // would surface as an uncaught exception.
  function watchDbFile() {
    const created = fs.watch(dbPath, { persistent: false }, () => {
      void reconcile();
    });
    created.on('error', (err) => {
      log(`[worlds-db] fs.watch error: ${err?.message ?? err}; relying on 30s poll`);
      try { created.close(); } catch { /* ignore */ }
      // Clearing the slot lets the interval below re-create the watcher.
      if (watcher === created) watcher = null;
    });
    watcher = created;
  }

  // Watch the DB file for changes (fast path).
  if (fs.existsSync(dbPath)) {
    try {
      watchDbFile();
    } catch (err) {
      log(`[worlds-db] fs.watch failed: ${err.message}; relying on 30s poll`);
    }
    // Initial reconcile on startup.
    void reconcile();
  } else {
    log(`[worlds-db] ${dbPath} not found; will poll every 30s`);
  }

  // 30s backstop poll. Also watches for the file to appear.
  const interval = setInterval(() => {
    if (!watcher && fs.existsSync(dbPath)) {
      // File appeared since startup — set up watcher now.
      try {
        watchDbFile();
        log(`[worlds-db] ${dbPath} appeared; watcher started`);
      } catch { /* fs.watch failure is non-fatal */ }
    }
    void reconcile();
  }, 30_000);

  return {
    stop() {
      stopped = true;
      clearInterval(interval);
      if (watcher) {
        try { watcher.close(); } catch { /* ignore */ }
        watcher = null;
      }
    },
  };
}
/**
* Phase E1 (olam-dogfood-vision): WorldsSource interface.
*
* Single narrow boundary that both LocalWorldsSource (today's
* dockerode-driven enumeration) and PylonWorldsSource (future cloud
* worlds) implement. The interface is the entire contract — there is
* no shared abstract class, no shared base, no shared utility module.
*
* Per Phase E plan (S1 contract carried through C-phase): the wire
* shape IS the abstraction. Sources implementing this interface are
* free to pick any backend (dockerode, Pylon SDK, mock, sqlite cache
* — anything) as long as `list()` returns the WorldSummary shape.
*
* Deliberately narrow:
* - `name` — discriminator for the source. SPA uses this to render
* the per-world `source` chip (E5).
* - `list()` — read-only enumeration. NO mutations. Mutations stay
* on host-cp's existing endpoints (POST /api/worlds delegation,
* DELETE via per-world CP, etc.). T5 mitigation: keeping the
* surface narrow lets the future Pylon SDK integration extend
* `list()`'s implementation without forcing a contract change
* across consumers.
*
* This is a `.mjs` file (matches host-cp's existing module style).
* Type information is conveyed via JSDoc; consumers reading via
* TypeScript get the shape via `// @ts-check` + JSDoc inference.
*
* @typedef {object} ServiceInfo
* @property {string} name
* @property {number} host_port
* @property {number} internal_port
* @property {string} url
* @property {boolean} live
*
* @typedef {object} WorldSummary
* @property {string} id
* @property {string | null} name
* @property {'running' | 'starting' | 'unknown' | 'failed'} status
* @property {ServiceInfo[]} services
* @property {'local' | 'pylon-cloud'} source
*
* @typedef {object} WorldsSource
* @property {'local' | 'pylon-cloud'} name
* @property {() => Promise<WorldSummary[]>} list
*/
// Export the source-name discriminators so consumers don't repeat the
// literal strings. Both implementations + E4's composition layer
// + E5's SPA badge logic reference this.
// Frozen so that no importer can mutate the shared list at runtime; the
// `@type {const}` cast keeps the literal tuple type for type-checked consumers.
export const SOURCE_NAMES = Object.freeze(/** @type {const} */ (['local', 'pylon-cloud']));
// `WorldsSource` is a TYPE export — no runtime symbol. Consumers
// import it via JSDoc references:
// /** @type {import('./worlds-source.mjs').WorldsSource} */
// or in TypeScript:
// import type { WorldsSource } from './worlds-source.mjs';
//
// Test files exercising the interface treat it as duck-typed: any
// object with the right shape passes structural compatibility.

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display