@pleri/olam-cli
Advanced tools
@@ -1,1 +0,1 @@ | ||
| {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;GAKG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AACrD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,yBAAyB,EAAE,MAAM,mCAAmC,CAAC;AAC9E,OAAO,EAAE,0BAA0B,EAAE,MAAM,oCAAoC,CAAC;AAChF
,OAAO,EAAE,8BAA8B,EAAE,MAAM,yCAAyC,CAAC;AACzF,OAAO,EAAE,2BAA2B,EAAE,MAAM,qCAAqC,CAAC;AAClF,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,uBAAuB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,yBAAyB,EAAE,MAAM,sBAAsB,CAAC;AAEjE,MAAM,UAAU,GAAG,cAAc,EAAE,CAAC;AACpC,iFAAiF;AACjF,OAAO,CAAC,GAAG,CAAC,kBAAkB,CAAC,GAAG,UAAU,CAAC;AAE7C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,MAAM,CAAC;KACZ,WAAW,CAAC,+DAA+D,CAAC;IAC7E,4EAA4E;IAC5E,0EAA0E;KACzE,MAAM,CAAC,OAAO,EAAE,6DAA6D,CAAC;KAC9E,OAAO,CAAC,UAAU,CAAC;KACnB,aAAa,CAAC,yBAAyB,EAAE,CAAC,CAAC;AAE9C,gEAAgE;AAChE,6EAA6E;AAC7E,4EAA4E;AAC5E,yEAAyE;AACzE,MAAM,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AACnD,IAAI,YAAY,KAAK,CAAC,CAAC,EAAE,CAAC;IACxB,qEAAqE;IACrE,+EAA+E;IAC/E,iDAAiD;IACjD,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IACzF,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,0DAA0D,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,KAAK;YACtF,qEAAqE,CACxE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AACvD,CAAC;AAED,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,gBAAgB,CAAC
,OAAO,CAAC,CAAC;AAC1B,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,sEAAsE;AACtE,yEAAyE;AACzE,0EAA0E;AAC1E,wEAAwE;AACxE,mBAAmB,CAAC,OAAO,EAAE,EAAE,MAAM,EAAE,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAC/D,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,iBAAiB,CAAC,OAAO,CAAC,CAAC;AAC3B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,mBAAmB,CAAC,OAAO,CAAC,CAAC;AAC7B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,yBAAyB,CAAC,OAAO,CAAC,CAAC;AACnC,0BAA0B,CAAC,OAAO,CAAC,CAAC;AACpC,8BAA8B,CAAC,OAAO,CAAC,CAAC;AACxC,2BAA2B,CAAC,OAAO,CAAC,CAAC;AACrC,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,uBAAuB,CAAC,OAAO,CAAC,CAAC;AACjC,cAAc,CAAC,OAAO,CAAC,CAAC;AAExB,6EAA6E;AAC7E,4EAA4E;AAC5E,uEAAuE;AACvE,iEAAiE;AACjE,KAAK,oBAAoB,CAAC,UAAU,CAAC,CAAC;AAEtC,sEAAsE;AACtE,yEAAyE;AACzE,yEAAyE;AACzE,4EAA4E;AAC5E,kDAAkD;AAClD,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC,EAAE,CAAC;IACrF,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,C
AAC,CAAC;IACpD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;AACtD,CAAC;AAED,wEAAwE;AACxE,0EAA0E;AAC1E,uEAAuE;AACvE,wEAAwE;AACxE,qEAAqE;AACrE,wEAAwE;AACxE,2EAA2E;AAC3E,+DAA+D;AAC/D,EAAE;AACF,wEAAwE;AACxE,0EAA0E;AAC1E,yEAAyE;AACzE,mEAAmE;AACnE,+DAA+D;AAC/D,sBAAsB;AACtB,IAAI,CAAC;IACH,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAC3B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;AACtC,CAAC;AAAC,OAAO,GAAY,EAAE,CAAC;IACtB,UAAU,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,IAAI,GAAG,YAAY,KAAK,IAAI,GAAG,CAAC,KAAK,EAAE,CAAC;QACnE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC;IACzC,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,CAAC;IAC9B,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC;AACnF,CAAC"} | ||
| {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;GAKG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AACrD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,yBAAyB,EAAE,MAAM,mCAAmC,CAAC;AAC9E,OAAO,EAAE,0BAA0B,EAAE,MAAM,oCAAoC,CAAC;AAChF,OAAO,EAAE,8BAA8B,EAAE,MAAM,yCAAyC,CAAC;AACzF,OAAO,EAAE,2BAA2B,EAAE,MAAM,qCAAqC,CAAC;A
AClF,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,sBAAsB,EAAE,MAAM,gCAAgC,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,uBAAuB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,yBAAyB,EAAE,MAAM,sBAAsB,CAAC;AAEjE,MAAM,UAAU,GAAG,cAAc,EAAE,CAAC;AACpC,iFAAiF;AACjF,OAAO,CAAC,GAAG,CAAC,kBAAkB,CAAC,GAAG,UAAU,CAAC;AAE7C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,MAAM,CAAC;KACZ,WAAW,CAAC,+DAA+D,CAAC;IAC7E,4EAA4E;IAC5E,0EAA0E;KACzE,MAAM,CAAC,OAAO,EAAE,6DAA6D,CAAC;KAC9E,OAAO,CAAC,UAAU,CAAC;KACnB,aAAa,CAAC,yBAAyB,EAAE,CAAC,CAAC;AAE9C,gEAAgE;AAChE,6EAA6E;AAC7E,4EAA4E;AAC5E,yEAAyE;AACzE,MAAM,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AACnD,IAAI,YAAY,KAAK,CAAC,CAAC,EAAE,CAAC;IACxB,qEAAqE;IACrE,+EAA+E;IAC/E,iDAAiD;IACjD,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IACzF,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,0DAA0D,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,KAAK;YACtF,qEAAqE,CACxE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AACvD,CAAC;AAED,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,eA
Ae,CAAC,OAAO,CAAC,CAAC;AACzB,sEAAsE;AACtE,yEAAyE;AACzE,0EAA0E;AAC1E,wEAAwE;AACxE,mBAAmB,CAAC,OAAO,EAAE,EAAE,MAAM,EAAE,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAC/D,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,iBAAiB,CAAC,OAAO,CAAC,CAAC;AAC3B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,YAAY,CAAC,OAAO,CAAC,CAAC;AACtB,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,eAAe,CAAC,OAAO,CAAC,CAAC;AACzB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,mBAAmB,CAAC,OAAO,CAAC,CAAC;AAC7B,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,UAAU,CAAC,OAAO,CAAC,CAAC;AACpB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,WAAW,CAAC,OAAO,CAAC,CAAC;AACrB,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,aAAa,CAAC,OAAO,CAAC,CAAC;AACvB,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAC1B,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,cAAc,CAAC,OAAO,CAAC,CAAC;AACxB,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,yBAAyB,CAAC,OAAO,CAAC,CAAC;AACnC,0BAA0B,CAAC,OAAO,CAAC,CAAC;AACpC,8BAA8B,CAAC,OAAO,CAAC,CAAC;AACxC,2BAA2B,CAAC,OAAO,CAAC,CAAC;AACrC,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC9B,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC5B,sBAAsB,CAAC,OAAO,CAAC,CAAC;AAChC,qBAAqB,CAAC,OAAO,CAAC,CAAC;AAC/B,uBAAuB,CAAC,OAAO,CAAC,CAAC;AACjC,cAAc,CAAC,OAAO,CAAC,CAAC;AAExB,6EAA6E;AAC7E,4EAA4E;AAC5E,uEAAuE;AACvE,iEAAiE;AACjE,KAAK,oBAAoB,CAAC,UAAU,CAAC,CAAC;AAEtC,sEAAsE;AACtE,yEAAyE;AACzE,yEAAyE;AACzE,4EAA4E;AAC5E,kDAAkD;AAClD,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC,EAAE,CAAC;IACrF,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;IACpD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;AACtD,CAAC;AAED,wEAAwE;AACxE,0EAA0E;AAC1E,uEAAuE;AACvE,wEAAwE;AACxE,qEAAqE;AACrE,wEAAw
E;AACxE,2EAA2E;AAC3E,+DAA+D;AAC/D,EAAE;AACF,wEAAwE;AACxE,0EAA0E;AAC1E,yEAAyE;AACzE,mEAAmE;AACnE,+DAA+D;AAC/D,sBAAsB;AACtB,IAAI,CAAC;IACH,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAC3B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;AACtC,CAAC;AAAC,OAAO,GAAY,EAAE,CAAC;IACtB,UAAU,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,IAAI,GAAG,YAAY,KAAK,IAAI,GAAG,CAAC,KAAK,EAAE,CAAC;QACnE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC;IACzC,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,CAAC;IAC9B,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC;AACnF,CAAC"} |
| { | ||
| "bundledAt": "2026-06-15T08:36:13.799Z", | ||
| "bundledAt": "2026-06-18T05:40:50.182Z", | ||
| "kgFirstSha": "29a9ccce1b115d049e375c4a90eb5cf7c123e610e2d0590270a4db2cdbc64a28" | ||
| } |
+1
-4
| { | ||
| "name": "@pleri/olam-cli", | ||
| "version": "0.1.218", | ||
| "version": "0.1.219", | ||
| "type": "module", | ||
@@ -13,7 +13,5 @@ "bin": { | ||
| "dist/mcp-server.js", | ||
| "dist/image-digests.json", | ||
| "dist/agent-stream", | ||
| "hermes-bundle", | ||
| "hooks", | ||
| "host-cp", | ||
| "memory-hooks", | ||
@@ -43,3 +41,2 @@ "README.md" | ||
| "audit:publish-deps": "node scripts/audit-publish-deps.mjs", | ||
| "audit:cli-bundle-k8s": "node scripts/audit-cli-bundle-k8s.mjs", | ||
| "audit:cli-package-contents": "node scripts/audit-cli-package-contents.mjs", | ||
@@ -46,0 +43,0 @@ "audit:cli-test-coverage": "node ../../scripts/audit-cli-test-coverage.mjs" |
| { | ||
| "auth": "sha256:770ee97ee4d06d2c1b6512ba99421a5fe312393d592df1684fd0d03b3476ff10", | ||
| "host-cp": "sha256:328baca8b9b28ccef1d858aa20e0ab27855604a630132dcadd423990cb376f60", | ||
| "kg-service": "sha256:f97ee90fe1bd5b12cb56d5fbf0d3085c301bb7abeef0dd28d2b2a5c90ab6efbb", | ||
| "memory-service": "sha256:923bff54d2ba3da162a35d3e8ebc6bd440bed6d290a5cff7bae2888281a4e003", | ||
| "mcp-auth": "sha256:eaac2164349e388a70dae0d86c34132f97aa74177a2376cdfa10732e8eadb507", | ||
| "$schema_version": 1, | ||
| "$published_version": "0.1.218", | ||
| "$registry": "ghcr.io/pleri" | ||
| } |
| # Phase F-2-B (B2): olam-host-cp compose stack. | ||
| # | ||
| # Two services on a private internal network: | ||
| # | ||
| # 1. host-cp — the SPA proxy server (B3+ implementation). Exposes | ||
| # port 19000 to the operator's host. Talks to the | ||
| # docker-socket-proxy via `tcp://docker-socket-proxy:2375` | ||
| # (NOT the raw /var/run/docker.sock). | ||
| # | ||
| # 2. docker-socket-proxy | ||
| # — tecnativa/docker-socket-proxy sidecar. Mounts the | ||
| # real /var/run/docker.sock read-only and exposes a | ||
| # whitelisted subset of the Docker API. Whitelist: | ||
| # CONTAINERS=1 — list/inspect (find world IDs) | ||
| # EVENTS=1 — stream restart/stop events | ||
| # (cache invalidation; B3 / T2) | ||
| # EXEC=1 — exec inside containers | ||
| # (read /tmp/olam-container-secret) | ||
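| # IMAGES=1 — inspect images | ||
| # (GET /images/<ref>/json) | ||
| # POST=1 — allow POST verbs on the | ||
| # whitelisted endpoints (exec) | ||
| # Everything else is denied (volumes, | ||
| # networks, swarm, build, push, etc.). T6 + T8 | ||
| # mitigation: blast-radius reduction vs raw socket. | ||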
| # | ||
| # Bring up: `docker compose -f packages/host-cp/compose.yaml up -d` | ||
| # Tear down: `docker compose -f packages/host-cp/compose.yaml down` | ||
| services: | ||
| olam-host-cp: | ||
| container_name: olam-host-cp | ||
| # Image-only — operator's `olam bootstrap` pulls the digest-pinned | ||
| # `ghcr.io/pleri/olam-host-cp:latest` (digest from image-digests.json) | ||
| # and tags it as the local `:latest` BEFORE compose up. No `build:` | ||
| # in this file — fresh-install operators don't have the source tree | ||
| # so a `build:` block crashes them with "Dockerfile not found". | ||
| # | ||
| # Local-dev contributors who want to test host-cp source changes | ||
| # use the sibling compose.dev.yaml as an override: | ||
| # | ||
| # docker compose \ | ||
| # -f packages/host-cp/compose.yaml \ | ||
| # -f packages/host-cp/compose.dev.yaml \ | ||
| # up --build -d | ||
| # | ||
| # The CLI's `olam host-cp start` always uses ONLY compose.yaml, so | ||
| # operator boots are never blocked on a missing Dockerfile / build | ||
| # context. | ||
| image: ghcr.io/pleri/olam-host-cp:latest | ||
| ports: | ||
| # Bind to 127.0.0.1 only — single-user-per-host assumption (T4). | ||
| # Multi-user / TLS / remote access lands in Phase G+. | ||
| - "127.0.0.1:19000:19000" | ||
| environment: | ||
| # Connection string for docker-socket-proxy. The proxy listens on | ||
| # tcp://0.0.0.0:2375 inside the internal network. host-cp uses | ||
| # this to enumerate worlds (containers list) + read secrets | ||
| # (containers exec) + subscribe to restart events. | ||
| DOCKER_HOST: "tcp://docker-socket-proxy:2375" | ||
| # Phase F-2-B M2 ship gate: secret cache TTL (5min, demoted from | ||
| # 1h per D2). B3 reads this; B10's m2-cache-invalidate.sh tests | ||
| # the docker-events invalidation path. | ||
| OLAM_SECRET_CACHE_TTL_SEC: "300" | ||
| # Bind operator-facing UI port. Always 19000 in compose. | ||
| OLAM_HOST_CP_PORT: "19000" | ||
| # Token + workspace + world registry mount points. Bind-mounted | ||
| # below; host CP reads these at boot. | ||
| OLAM_HOST_CP_TOKEN_PATH: "/data/host-cp.token" | ||
| OLAM_WORKSPACES_DIR: "/data/workspaces" | ||
| OLAM_WORLDS_DB: "/data/worlds.db" | ||
| OLAM_PR_POLL_INTERVAL_MS: "300000" | ||
| OLAM_MERGE_GRACE_MS: "600000" | ||
| # NOTE: OLAM_REPO_PATH is intentionally NOT passed into the | ||
| # container env. The HOST-side variable names a bind-mount source | ||
| # (a host path like /Users/.../olam — see the volumes block below). | ||
| # Inside the container, the bind-mount target is always | ||
| # `/operator-repo`. Before the fix, the env was passed through: | ||
| # server.mjs consumers (version-status.mjs, /api/prs handler) read | ||
| # it expecting a container-side path, then set `cwd:` to a host path | ||
| # that doesn't exist inside the container — `gh pr list` failed with | ||
| # "not a git repository", and `gh` itself failed with `spawn ENOENT`. | ||
| # Server-side consumers default to `/operator-repo` which is | ||
| # always correct. | ||
| # Auth-service inter-service auth. The secret is shared with the | ||
| # long-lived olam-auth container (generated on first `olam auth | ||
| # up` at ~/.olam/auth-secret). Without it, X-Olam-Secret is never | ||
| # sent and auth-service 401s every host-cp → /credentials/* call, | ||
| # which surfaces in the dashboard as a failed Connect Claude flow. | ||
| OLAM_AUTH_SERVICE_URL: "http://host.docker.internal:9999" | ||
| OLAM_AUTH_SECRET: "${OLAM_AUTH_SECRET:-}" | ||
| # Operator's CLI version, propagated by `olam host-cp start` via | ||
| # buildComposeEnv. Surfaces in /api/version/status so the | ||
| # dashboard's TopNav can render "the version we're working on." | ||
| # Empty when older CLI versions render this compose; the server | ||
| # falls back to host-cp's own package.json. | ||
| OLAM_CLI_VERSION: "${OLAM_CLI_VERSION:-}" | ||
| # Upgrade-trigger feature: host-cp uses these to construct bind | ||
| # mounts on the spawned upgrader container. The upgrader runs | ||
| # `olam upgrade -y` and needs (a) the operator's ~/.olam state, | ||
| # (b) the docker socket so the CLI can talk to the daemon. Both | ||
| # are HOST-side paths because docker resolves bind sources on | ||
| # the daemon, not inside the requesting container. | ||
| OLAM_HOME_HOST_PATH: "${HOME}/.olam" | ||
| OLAM_DOCKER_SOCK_HOST_PATH: "/var/run/docker.sock" | ||
| # Operator's olam repo path on the host. The upgrader needs this | ||
| # bind-mounted so the CLI's cwd-relative `packages/host-cp/compose.yaml` | ||
| # lookup resolves. Defaults to the `OLAM_REPO_PATH` already used by | ||
| # host-cp for version-detection (mounted at /operator-repo:ro). | ||
| OLAM_REPO_HOST_PATH: "${OLAM_REPO_PATH:-${HOME}/Projects/ein-sof/olam}" | ||
| # Operator's $HOME on the host. Forwarded to the upgrader as HOME | ||
| # so `${HOME}` interpolation in compose.yaml's bind sources | ||
| # resolves to a path the docker daemon can find. Inside the | ||
| # upgrader container HOME defaults to /root, which the daemon | ||
| # rejects when used as a bind source ("path not shared from | ||
| # the host"). Without this the recreate step fails right at the | ||
| # last hop of the upgrade pipeline. | ||
| OLAM_OPERATOR_HOME_HOST_PATH: "${HOME}" | ||
| # GitHub CLI config bind for the upgrader. The CLI runs | ||
| # `gh auth token | docker login ghcr.io ...` before `olam upgrade` | ||
| # so the spawned container can pull GHCR images even though the | ||
| # host's ~/.docker/config.json uses a Keychain credsStore that | ||
| # doesn't follow into a Linux container. The gh config is also | ||
| # mounted into host-cp itself (see the `volumes:` block below) for | ||
| # `gh pr list` — this is the same path, mounted again for the upgrader. | ||
| OLAM_GH_CONFIG_HOST_PATH: "${HOME}/.config/gh" | ||
| # GitHub token used by the upgrader to `docker login ghcr.io` so it can | ||
| # pull the host-cp / auth / devbox images by digest. Resolved from | ||
| # the operator's `gh auth token` BEFORE compose up (or set explicitly | ||
| # via the GH_TOKEN env var). If unset, the upgrader falls back to | ||
| # `gh auth token` against the mounted ~/.config/gh — which works | ||
| # only on Linux operators (macOS keeps the token in Keychain, not in | ||
| # ~/.config/gh). | ||
| GH_TOKEN: "${GH_TOKEN:-}" | ||
| # Optional override for the upgrader image. Defaults to the | ||
| # currently-running host-cp image (which has the olam CLI + | ||
| # docker CLI + gh CLI baked in by Dockerfile). | ||
| OLAM_UPGRADER_IMAGE: "${OLAM_UPGRADER_IMAGE:-ghcr.io/pleri/olam-host-cp:latest}" | ||
| # Plan DB persistence fix (Bug 1): os.homedir() inside the container is | ||
| # /root, but ~/.olam is bind-mounted to /data — not /root/.olam. Without | ||
| # these overrides, plan.db lands in the container's ephemeral layer and is | ||
| # destroyed by every `docker compose up --force-recreate` (i.e. olam upgrade). | ||
| # Pointing to /data/ routes all writes through the bind-mount to the host. | ||
| OLAM_PLAN_DB_PATH: "/data/plan.db" | ||
| OLAM_PLAN_DIR: "/data/plan" | ||
| # Same /root vs /data bind-mount issue applies to the plan-chat bearer | ||
| # gateway. Without this override, plan-chat-secret.mjs reads from | ||
| # /root/.olam/plan-chat-secret (container ephemeral, missing) and | ||
| # /agent-runtime/trigger answers HTTP 500. Routing through /data | ||
| # surfaces the on-disk bearer created by ensureSecret() on host FS. | ||
| OLAM_PLAN_CHAT_SECRET_PATH: "/data/plan-chat-secret" | ||
| volumes: | ||
| # ~/.olam/ from operator's home → /data/ inside container. B4 | ||
| # writes the startup token here (chmod 600). B6 reads workspaces | ||
| # + worlds.db from here. ~/.olam/ is the canonical operator-state | ||
| # directory established by the Olam CLI; consistent with the | ||
| # devbox container's mount layout. | ||
| - ${HOME}/.olam:/data | ||
| - ${HOME}/.config/gh:/gh-config:ro | ||
| # Operator's olam repo mounted read-only so host-cp can poll | ||
| # .git/refs/heads/main to detect when a new version is available. | ||
| # The path inside the container is always /operator-repo. | ||
| # On the host: OLAM_REPO_PATH env var, or defaults to | ||
| # $HOME/Projects/ein-sof/olam. If the path doesn't exist, the | ||
| # mount is a no-op and version detection reports "operator-repo not mounted". | ||
| - ${OLAM_REPO_PATH:-${HOME}/Projects/ein-sof/olam}:/operator-repo:ro | ||
| depends_on: | ||
| docker-socket-proxy: | ||
| condition: service_started | ||
| networks: | ||
| - olam-host-cp-internal | ||
| restart: unless-stopped | ||
| docker-socket-proxy: | ||
| container_name: olam-docker-socket-proxy | ||
| # Pin to a specific tag, not :latest. Update via Renovate / dependabot. | ||
| # tecnativa/docker-socket-proxy:0.3.0 (2024-10-22) — last tagged | ||
| # release as of plan-pass-2 emit. T8 mitigation: pinning prevents | ||
| # supply-chain drift on the sidecar. | ||
| image: tecnativa/docker-socket-proxy:0.3.0 | ||
| environment: | ||
| # Whitelist matches plan D5 + T6/T8: host CP needs exactly these | ||
| # five settings (CONTAINERS, EVENTS, EXEC, IMAGES, POST). EVERYTHING else stays at the proxy default | ||
| # (deny). Audit periodically; widen with explicit justification. | ||
| CONTAINERS: "1" | ||
| EVENTS: "1" | ||
| EXEC: "1" | ||
| # Allows GET /images/<ref>/json. Needed by version-status.mjs to | ||
| # resolve the baked OLAM_BUILD_SHA of locally-pulled images | ||
| # (host-cp + auth-service + devbox `:latest` tags) so the | ||
| # upgrade comparator can answer "is there a newer image I'd | ||
| # actually swap to?" — see PR #459 for the comparator rewrite | ||
| # and `fetchLatestImageSha`. Without this, both the new | ||
| # comparator AND the pre-existing fetchDevboxImageSha fall back | ||
| # to 'unknown', producing the over-reporting "Upgrade available" | ||
| # banner regression. Socket is mounted :ro so this remains | ||
| # read-only inspect; no container mutation surface. | ||
| IMAGES: "1" | ||
| # tecnativa/docker-socket-proxy 0.3.0 requires POST=1 to allow | ||
| # POST verbs on whitelisted endpoints (exec creation requires | ||
| # POST /containers/<id>/exec + POST /exec/<id>/start). Phase | ||
| # F-2-D dogfood revealed the missing perm. | ||
| POST: "1" | ||
| # Optional: lower log verbosity. Default is INFO; DEBUG floods | ||
| # logs in dev. Comment out for troubleshooting. | ||
| LOG_LEVEL: "warning" | ||
| volumes: | ||
| # Mount the host's docker socket READ-ONLY. The proxy is the only | ||
| # consumer of the raw socket. host-cp talks to the proxy over | ||
| # TCP (port 2375 on the internal network). | ||
| - /var/run/docker.sock:/var/run/docker.sock:ro | ||
| networks: | ||
| - olam-host-cp-internal | ||
| restart: unless-stopped | ||
| networks: | ||
| olam-host-cp-internal: | ||
| name: olam-host-cp-internal | ||
| driver: bridge | ||
| # Internal-only: no host port published; host-cp <-> proxy traffic | ||
| # never leaves the docker network. |
| # Host-side docker-socket-proxy for the olam kubernetes substrate. | ||
| # | ||
| # Background — round-4 wave-2 R4-W2-F (kuro-bear retest 2026-05-21): | ||
| # on macOS + colima + virtiofs, containerd's OCI runtime spec generator | ||
| # calls stat() on docker.sock hostPath bind mounts; virtiofs returns | ||
| # ENOTSUP for stat/statx on socket files; pod creation fails. The R3-A | ||
| # two-volume hostPath approach is unrecoverable on virtiofs. | ||
| # | ||
| # This compose file provisions the docker-socket-proxy AS A HOST-SIDE | ||
| # CONTAINER (sibling to k3d on the operator's docker daemon), NOT as a | ||
| # pod inside the k3d cluster. The in-cluster Service in | ||
| # packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml is | ||
| # `type: ExternalName` aliasing `host.k3d.internal` — cluster pods reach | ||
| # THIS container via that DNS handle. | ||
| # | ||
| # Architecture mirrors the compose substrate's pattern (see | ||
| # packages/host-cp/compose.yaml:170-210). Same image, same allowlist, | ||
| # same restart policy. The only difference: this proxy publishes to | ||
| # the operator host on 127.0.0.1:2375 so k3d nodes can reach it via | ||
| # host.k3d.internal — the compose-substrate sibling stays internal-only. | ||
| # | ||
| # Operator UX: `olam upgrade -y` Step 0.7 auto-starts this on macOS via | ||
| # `docker compose -f <this-file> up -d`. Linux operators get a no-op | ||
| # (Step 0.7 is platform-gated). See docs/operator/kubernetes-substrate-beta.md. | ||
| services: | ||
| docker-socket-proxy: | ||
| container_name: olam-host-side-docker-socket-proxy | ||
| # tecnativa/docker-socket-proxy:0.3.0 — matches the compose substrate's | ||
| # pin verbatim. T8 supply-chain: pinning prevents drift. Update via | ||
| # Renovate / dependabot. | ||
| image: tecnativa/docker-socket-proxy:0.3.0 | ||
| environment: | ||
| # Whitelist matches packages/host-cp/compose.yaml:181-202 verbatim. | ||
| # Anything outside this list stays at proxy default (deny). | ||
| CONTAINERS: "1" | ||
| EVENTS: "1" | ||
| EXEC: "1" | ||
| # IMAGES=1 needed for GET /images/<ref>/json (version-status.mjs | ||
| # fetchLatestImageSha). Socket is :ro so this is read-only inspect. | ||
| IMAGES: "1" | ||
| # POST=1 required since tecnativa 0.3.0 for exec creation | ||
| # (POST /containers/<id>/exec + POST /exec/<id>/start). See | ||
| # packages/host-cp/compose.yaml:195-199 for the F-2-D dogfood | ||
| # finding that surfaced this. | ||
| POST: "1" | ||
| LOG_LEVEL: "warning" | ||
| ports: | ||
| # Publish to operator host on 127.0.0.1:2375 ONLY. k3d nodes reach | ||
| # this via host.k3d.internal:2375. Binding to 127.0.0.1 (not | ||
| # 0.0.0.0) is T1 mitigation: docker API surface stays loopback-only | ||
| # on a single-tenant operator machine. | ||
| - "127.0.0.1:2375:2375" | ||
| volumes: | ||
| # Read-only mount of the host's docker socket. The proxy is the | ||
| # only consumer of the raw socket on the operator's Mac. | ||
| - /var/run/docker.sock:/var/run/docker.sock:ro | ||
| restart: unless-stopped |
| apiVersion: v1 | ||
| kind: Namespace | ||
| metadata: | ||
| name: olam | ||
| labels: | ||
| name: olam | ||
| olam.io/component: host-stack |
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: olam-host-cp | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack |
| # Phase 1b Decision 19: Role scoped to resourceNames: ["olam-host-cp"] on | ||
| # apps/v1 deployments. Without this scope, the in-cluster ServiceAccount | ||
| # could patch ANY Deployment in the namespace. This is the load-bearing | ||
| # security guardrail — preserve verbatim. | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: olam-host-cp | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack | ||
| rules: | ||
| - apiGroups: ["apps"] | ||
| resources: ["deployments"] | ||
| resourceNames: ["olam-host-cp"] | ||
| verbs: ["get", "patch", "watch"] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: olam-host-cp | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: olam-host-cp | ||
| namespace: olam | ||
| roleRef: | ||
| kind: Role | ||
| name: olam-host-cp | ||
| apiGroup: rbac.authorization.k8s.io |
| # ConfigMap for olam-host-cp environment. Sensitive values (OLAM_AUTH_SECRET, | ||
| # GH_TOKEN) are NOT here — they live in the Secret (see templates/40-secret-template.yaml). | ||
| # Operators apply the Secret separately before applying the manifests. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-host-cp-env | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack | ||
| data: | ||
| # Auth service URL. Default targets host.docker.internal for Colima/Docker | ||
| # Desktop k3d setups. Override when auth-service runs elsewhere (e.g. via | ||
| # an ExternalName Service pointing at the host gateway). | ||
| # | ||
| # Port :9999 matches the published port in AuthContainerController.start() | ||
| # (packages/core/src/auth/container.ts) — the value was historically :8000, | ||
| # which never matched any running auth-service version and surfaced as | ||
| # {"error":"auth_service_unavailable","message":"fetch failed"} | ||
| # on /api/auth/* calls. Verified during the K3d-HTTPS PR live bring-up; | ||
| # see docs/runbooks/k3d-https-setup.md. | ||
| OLAM_AUTH_SERVICE_URL: "http://host.docker.internal:9999" | ||
# Docker socket proxy — DNS name of the docker-socket-proxy ExternalName Service inside the namespace. | ||
| DOCKER_HOST: "tcp://docker-socket-proxy:2375" | ||
| # Host-cp server port — must match the Service targetPort in 60-service.yaml. | ||
| OLAM_HOST_CP_PORT: "19000" | ||
| # Operator state paths (resolved inside the K3s node via hostPath volumes). | ||
| OLAM_HOST_CP_TOKEN_PATH: "/data/host-cp.token" | ||
| OLAM_WORKSPACES_DIR: "/data/workspaces" | ||
| OLAM_WORLDS_DB: "/data/worlds.db" | ||
| OLAM_PLAN_DB_PATH: "/data/plan.db" | ||
| OLAM_PLAN_DIR: "/data/plan" | ||
| # Phase B Model B: bearer file is now sourced from the shared | ||
| # olam-plan-chat-secret Kubernetes Secret (mounted at /etc/olam-plan-chat/). | ||
| # Two readers, one source-of-truth — replaces the per-pod /data/plan-chat-secret | ||
| # file that couldn't be shared across pods on RWO PVCs. The plan-chat-service | ||
| # pod also mounts the SAME Secret at the SAME path so bearer comparisons | ||
| # work both ways. | ||
| OLAM_PLAN_CHAT_SECRET_PATH: "/etc/olam-plan-chat/secret" | ||
| # In-cluster plan-chat-service URL. Rewritten by upgrade-kubernetes.ts step 2.5 | ||
| # (buildK8sDnsUrl) — the default below is a sane fallback for raw | ||
| # `kubectl apply -f` operators who skip the CLI wrapper. | ||
| PLAN_CHAT_SERVICE_URL: "http://olam-plan-chat-service.olam.svc.cluster.local:3200" | ||
| # NDJSON span sink + recovery ledger — route to the writable PVC mount at | ||
| # /data rather than the default ~/.olam/logs (which resolves to | ||
| # /home/node/.olam/logs and is not writable with readOnlyRootFilesystem: true). | ||
| OLAM_TRACE_LOG_PATH: "/data/logs/host.trace.ndjson" | ||
| OLAM_RECOVERY_LEDGER_PATH: "/data/logs/recovery-ledger.ndjson" | ||
| # Tunable defaults. | ||
| OLAM_SECRET_CACHE_TTL_SEC: "300" | ||
| OLAM_PR_POLL_INTERVAL_MS: "300000" | ||
| OLAM_MERGE_GRACE_MS: "600000" | ||
| # World watchdog — periodic probe of each active world's claude PID for the | ||
| # three wedge signals (wchan + CLOSE_WAIT + CPU). Detection-only in Phase A. | ||
# Set OLAM_WORLD_WATCHDOG_DISABLED=1 in the deployment env to disable the watchdog (kill switch). | ||
| OLAM_WORLD_WATCHDOG_TICK_MS: "30000" |
| # PersistentVolumeClaim for olam-host-cp /data volume — k3d substrate default. | ||
| # | ||
| # Why PVC instead of hostPath: | ||
| # hostPath volumes on k3d nodes resolve to paths INSIDE the k3d node | ||
| # container — not the operator's host filesystem. A bare k3d cluster has | ||
| # an empty node filesystem, so a hostPath at /host/.olam is always empty. | ||
| # Additionally, fsGroup does NOT relabel hostPath volumes (only PVCs / | ||
| # emptyDir / projected volumes), so UID-1000 pods cannot write to | ||
| # root-owned hostPath mounts even when fsGroup: 1000 is set. | ||
| # | ||
| # local-path StorageClass ships with k3d by default (rancher/local-path-provisioner). | ||
| # On non-k3d clusters, substitute with the appropriate StorageClass name (D24, | ||
| # operator-editable). For managed clusters (GKE, EKS, AKS) use the GKE-variant | ||
| # manifest instead: packages/host-cp/k8s/manifests/gke/45-pvc.yaml (storageClassName: | ||
| # standard-rwo). See docs/architecture/peripheral-services-on-k3s.md Decision #3 | ||
| # for the full per-cluster storageclass table. | ||
| apiVersion: v1 | ||
| kind: PersistentVolumeClaim | ||
| metadata: | ||
| name: olam-host-cp-data | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| storageClassName: local-path | ||
| resources: | ||
| requests: | ||
| storage: 5Gi |
| # Deployment for olam-host-cp. | ||
| # | ||
| # Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model. | ||
| # Digest resolves to ghcr.io/pleri/olam-host-cp:0.1.168 (multi-arch index). | ||
| # Pinned to the last image built before PRs #915/#919/#920/#921 introduced | ||
| # lifecycle/, observability/, and recovery/ module directories — those PRs | ||
| # updated server.mjs imports but the Dockerfile was not updated to COPY | ||
| # the new directories, so all images from 0.1.169+ crash with | ||
| # ERR_MODULE_NOT_FOUND. The Dockerfile fix (COPY lifecycle/ / observability/ | ||
| # / recovery/) lands in PR #940; the next release will ship a working image. | ||
| # At that point, refresh this digest via the instructions below. | ||
| # To update: resolve the new tag's digest via: | ||
| # TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-host-cp:pull&service=ghcr.io" | jq -r .token) | ||
| # curl -sI -H "Authorization: Bearer $TOKEN" \ | ||
| # -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \ | ||
| # https://ghcr.io/v2/pleri/olam-host-cp/manifests/<tag> | grep docker-content-digest | ||
| # | ||
| # securityContext: conservative defaults per T6/T7 threat model. | ||
| # Operators who need to relax these (e.g. for debugging) must pass | ||
| # --accept-security-regression (Phase C, Decision D14) — out of scope here. | ||
| # | ||
| # Volume requirements for k3d: | ||
| # olam-home (/data): backed by a PersistentVolumeClaim (45-pvc.yaml). | ||
| # An init container (chown-data) runs `chown -R 1000:1000 /data` as root | ||
| # before the main container starts, granting UID-1000 write access on the | ||
# freshly-provisioned PV. fsGroup alone is insufficient for local-path PVs. | ||
| # | ||
| # docker access — NO LONGER VIA hostPath (changed in olam-k3d-on-mac- | ||
| # substrate-decision Phase B B2, 2026-05-21). The previous R3-A two-volume | ||
| # hostPath pattern is retracted: round-4 R4-W2-F showed virtiofs returns | ||
| # ENOTSUP on stat/statx of socket files, and that failure is unrecoverable | ||
| # at the containerd OCI runtime layer. host-cp now reaches docker via TCP | ||
| # through the docker-socket-proxy ExternalName Service in the olam | ||
| # namespace (packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml), | ||
| # which kube-dns resolves as a CNAME to host.k3d.internal. The actual | ||
| # proxy container runs on the operator's docker daemon (sibling to k3d), | ||
| # started by `olam upgrade` Step 0.7. See also | ||
| # packages/host-cp/src/lib/docker-request-options.mjs (both substrates now | ||
| # return identical TCP options). | ||
| # | ||
| # The operator's k3d cluster create command is therefore simpler — no | ||
| # `--volume $HOME/.colima/default/:/host-colima/@server:*` flag needed. | ||
| # See docs/operator/kubernetes-substrate-beta.md for the current install | ||
| # command. | ||
| # | ||
| # gh-config (/gh-config) and operator-repo (/operator-repo) remain hostPath | ||
| # volumes that resolve to paths inside the k3d node container. | ||
# OPERATORS MUST pass matching `--volume` flags (targeting the node paths | ||
# /host/.config/gh and /host/olam) when creating the k3d cluster. | ||
# Without these flags the gh-config and operator-repo mounts will be empty. | ||
| # The pod will still start — features that depend on GitHub auth or the | ||
| # operator repo will fail gracefully. | ||
| apiVersion: apps/v1 | ||
| kind: Deployment | ||
| metadata: | ||
| name: olam-host-cp | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack | ||
| spec: | ||
| replicas: 1 | ||
| strategy: | ||
| type: RollingUpdate | ||
| rollingUpdate: | ||
| maxSurge: 1 | ||
| maxUnavailable: 0 | ||
| selector: | ||
| matchLabels: | ||
| app: olam-host-cp | ||
| template: | ||
| metadata: | ||
| labels: | ||
| app: olam-host-cp | ||
| spec: | ||
| # B9 (round 2 recovery): disable k8s automatic Service env injection. | ||
| # Without this, k8s injects OLAM_<UPPER-NAME>_SERVICE_HOST/_PORT env vars | ||
| # into all Pods in the namespace. These collide with olam's own config env | ||
| # vars (e.g. OLAM_KG_SERVICE_PORT) causing Python's int() to crash on the | ||
| # auto-injected "tcp://..." string. Decision #4 (no app-code rename; field | ||
| # removes the collision class entirely). GA since k8s 1.13; we target 1.30+. | ||
| enableServiceLinks: false | ||
| # R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret | ||
| # created by `olam upgrade` step 0.4 when GH_TOKEN is available. Allows | ||
| # pulling from ghcr.io/pleri/* without anonymous rate limits. | ||
| imagePullSecrets: | ||
| - name: ghcr-pull | ||
| serviceAccountName: olam-host-cp | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| runAsGroup: 1000 | ||
| fsGroup: 1000 | ||
| initContainers: | ||
| - name: chown-data | ||
| # busybox:1.36 — sha256-pinned per T4 threat model. | ||
| # To update: docker pull busybox:1.36 && docker inspect busybox:1.36 --format '{{index .RepoDigests 0}}' | ||
| image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662 | ||
| imagePullPolicy: IfNotPresent | ||
| # Run as root to chown the freshly-provisioned PV to UID 1000. | ||
| # The pod-level runAsNonRoot: true is overridden here deliberately. | ||
| # The main container still runs as UID 1000 with all security defaults intact. | ||
| securityContext: | ||
| runAsUser: 0 | ||
| runAsNonRoot: false | ||
| allowPrivilegeEscalation: false | ||
| command: ["chown", "-R", "1000:1000", "/data"] | ||
| volumeMounts: | ||
| - name: olam-home | ||
| mountPath: /data | ||
| # socket-perm init container REMOVED in olam-k3d-on-mac-substrate-decision | ||
| # Phase B B2 (2026-05-21). The R3-A two-volume hostPath approach for | ||
| # docker.sock has been retracted: round-4 R4-W2-F showed virtiofs | ||
| # ENOTSUP on socket-file stat blocks the mount entirely. host-cp now | ||
| # reaches docker via TCP through the docker-socket-proxy ExternalName | ||
| # Service in the olam namespace (see | ||
| # packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml). | ||
| # The proxy itself runs on the operator's docker daemon (sibling to | ||
| # k3d), started by `olam upgrade` Step 0.7 — not inside this Pod. | ||
| containers: | ||
| - name: olam-host-cp | ||
| image: ghcr.io/pleri/olam-host-cp@sha256:328baca8b9b28ccef1d858aa20e0ab27855604a630132dcadd423990cb376f60 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| readOnlyRootFilesystem: true | ||
| allowPrivilegeEscalation: false | ||
| capabilities: | ||
| drop: ["ALL"] | ||
| ports: | ||
| - name: http | ||
| containerPort: 19000 | ||
| protocol: TCP | ||
| env: | ||
| # World watchdog — tick cadence (from ConfigMap default = 30s). | ||
| # Override per-operator to tune probe frequency. | ||
| - name: OLAM_WORLD_WATCHDOG_TICK_MS | ||
| valueFrom: | ||
| configMapKeyRef: | ||
| name: olam-host-cp-env | ||
| key: OLAM_WORLD_WATCHDOG_TICK_MS | ||
| # Set to "1" to disable the world-watchdog entirely (emergency kill switch). | ||
| # Unset by default — watchdog runs in detection-only mode. | ||
| # - name: OLAM_WORLD_WATCHDOG_DISABLED | ||
| # value: "1" | ||
| envFrom: | ||
| - configMapRef: | ||
| name: olam-host-cp-env | ||
| - secretRef: | ||
| name: olam-host-cp-secret | ||
| volumeMounts: | ||
| - name: olam-home | ||
| mountPath: /data | ||
| - name: gh-config | ||
| mountPath: /gh-config | ||
| readOnly: true | ||
| - name: operator-repo | ||
| mountPath: /operator-repo | ||
| readOnly: true | ||
| - name: tmp | ||
| mountPath: /tmp | ||
| # Phase B Model B: shared olam-plan-chat-secret mounted read-only | ||
| # so renderSpaShell can inject window.__OLAM_PLAN_CHAT_BEARER__. | ||
| # Plan-chat-service mounts the SAME Secret at the SAME path so | ||
| # bearer compares match across pods. | ||
| - name: plan-chat-secret | ||
| mountPath: /etc/olam-plan-chat | ||
| readOnly: true | ||
| # docker-socket volumeMount REMOVED in olam-k3d-on-mac-substrate- | ||
| # decision Phase B B2. Docker access now goes via TCP to the | ||
| # docker-socket-proxy ExternalName Service in the olam namespace. | ||
| # host-cp's `getDockerRequestOptions('kubernetes')` returns | ||
| # `{ host: 'docker-socket-proxy', port: 2375 }` (collapsed to the | ||
| # same value as the compose substrate's branch — see | ||
| # packages/host-cp/src/lib/docker-request-options.mjs). | ||
| readinessProbe: | ||
| httpGet: | ||
| path: /health | ||
| port: 19000 | ||
| initialDelaySeconds: 5 | ||
| periodSeconds: 5 | ||
| timeoutSeconds: 3 | ||
| failureThreshold: 6 | ||
| livenessProbe: | ||
| httpGet: | ||
| path: /health | ||
| port: 19000 | ||
| initialDelaySeconds: 30 | ||
| periodSeconds: 20 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 3 | ||
| resources: | ||
| requests: | ||
| cpu: "50m" | ||
| memory: "256Mi" | ||
| limits: | ||
| cpu: "1000m" | ||
| memory: "1Gi" | ||
| volumes: | ||
| - name: olam-home | ||
| persistentVolumeClaim: | ||
| claimName: olam-host-cp-data | ||
| - name: gh-config | ||
| hostPath: | ||
| path: /host/.config/gh | ||
| type: DirectoryOrCreate | ||
| - name: operator-repo | ||
| hostPath: | ||
| path: /host/olam | ||
| type: DirectoryOrCreate | ||
| - name: tmp | ||
| emptyDir: {} | ||
| - name: plan-chat-secret | ||
| secret: | ||
| secretName: olam-plan-chat-secret | ||
| defaultMode: 0400 | ||
| items: | ||
| - key: PLAN_CHAT_SECRET | ||
| path: secret | ||
| # host-colima + docker-socket volumes REMOVED in olam-k3d-on-mac- | ||
| # substrate-decision Phase B B2 (2026-05-21). R3-A's two-volume | ||
| # hostPath approach is fully retracted: round-4 R4-W2-F demonstrated | ||
| # virtiofs ENOTSUP on socket-file stat is unrecoverable at the | ||
| # containerd OCI runtime layer (kubelet bypass via R4-W2-E was | ||
| # necessary-but-not-sufficient). host-cp now reaches docker via TCP | ||
| # through the docker-socket-proxy ExternalName Service — see | ||
| # packages/host-cp/k8s/manifests/docker-socket-proxy/60-service.yaml. | ||
| # The proxy itself runs on the operator's docker daemon (sibling to | ||
| # k3d), started by `olam upgrade` Step 0.7 on macOS. |
| # ClusterIP Service for olam-host-cp. | ||
| # | ||
| # Two ways to reach the SPA externally: | ||
| # 1. (preferred) Traefik IngressRoute at https://olam.local:<traefik-https-port> | ||
| # Terminates TLS at the cluster edge, unlocks HTTP/2 multiplexing for | ||
| # Electric SQL long-polls. See 70-ingressroute.yaml + 65-tls-secret-template.yaml.tmpl. | ||
| # The pod itself stays HTTP-only — Traefik handles TLS at the edge. | ||
| # 2. (fallback) kubectl port-forward -n olam svc/olam-host-cp 19000:19000 | ||
| # Plain HTTP/1.1; hits browser's 6-conn-per-origin cap under Electric load. | ||
| # | ||
| # ClusterIP (not NodePort) preserves the "127.0.0.1-only" single-user-per-host | ||
| # invariant — exposure is via Traefik's LoadBalancer or port-forward, not by | ||
| # binding pod ports on every node interface. | ||
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: olam-host-cp | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack | ||
| spec: | ||
| type: ClusterIP | ||
| selector: | ||
| app: olam-host-cp | ||
| ports: | ||
| - name: http | ||
| port: 19000 | ||
| targetPort: 19000 | ||
| protocol: TCP |
| # TLS secret template for olam-host-cp Traefik IngressRoute. | ||
| # | ||
| # DO NOT apply this template directly — the placeholders `__TLS_CRT_BASE64__` | ||
| # and `__TLS_KEY_BASE64__` are substituted at apply time by | ||
| # `olam services tls-install` (packages/cli/src/commands/services-tls.ts), | ||
| # which uses `mkcert` to mint a locally-trusted certificate for the SAN list | ||
| # olam.local 127.0.0.1 ::1 | ||
| # and then `kubectl apply -f -` against the rendered manifest. | ||
| # | ||
| # Why a Secret of type kubernetes.io/tls (instead of a plain Opaque secret): | ||
| # Traefik's IngressRoute TLS resolver requires this exact type — it reads | ||
| # tls.crt + tls.key fields by convention. Using Opaque would silently fail | ||
| # the handshake at request time. | ||
| # | ||
| # Why the cert covers SANs (not just CN): modern browsers (Chrome 58+, Brave, | ||
| # Safari, Firefox) ignore the certificate CN entirely and only honour SANs. | ||
| # Without `127.0.0.1` + `::1` in the SAN list, hitting the IP directly fails | ||
| # even though the cert is "valid for olam.local". | ||
| # | ||
| # Renewal: certs minted by mkcert are valid ~2 years and 3 months. The | ||
| # tls-install command checks NotAfter and regenerates when within 30 days | ||
| # of expiry. To force regeneration: `kubectl -n olam delete secret olam-host-cp-tls` | ||
| # and re-run `olam services tls-install`. | ||
| apiVersion: v1 | ||
| kind: Secret | ||
| metadata: | ||
| name: olam-host-cp-tls | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack | ||
| type: kubernetes.io/tls | ||
| data: | ||
| tls.crt: __TLS_CRT_BASE64__ | ||
| tls.key: __TLS_KEY_BASE64__ |
| # Traefik IngressRoute terminating TLS at the cluster edge for olam-host-cp. | ||
| # | ||
| # Topology: | ||
| # Browser --HTTPS/h2--> Traefik :443 (LoadBalancer / k3d NodePort) | ||
| # | | ||
| # | (TLS terminated; cleartext inside cluster) | ||
| # v | ||
| # olam-host-cp:19000 (ClusterIP, HTTP/1.1 internal) | ||
| # | | ||
| # v | ||
| # plan-chat-service:3200 (and other peripherals) | ||
| # | ||
| # Why terminate TLS at Traefik (NOT at host-cp): host-cp is a Node/Hono | ||
| # server tuned for cleartext HTTP. Pushing TLS into the pod would force a | ||
| # second cert-distribution mechanism (Secret → volumeMount → server.mjs | ||
| # reload) and double the operational surface. Traefik already owns cert | ||
| # lifecycle in production (cert-manager + Let's Encrypt), so dev-mode | ||
| # mkcert at the same boundary keeps prod parity tight. | ||
| # | ||
| # Why HTTP/2 matters: TanStack DB / Electric SQL opens N long-poll | ||
| # connections per browser tab (one per shape subscription). Without h2 | ||
| # multiplexing they queue against the browser's 6-connection-per-origin | ||
| # cap, leading to the "25-second pending requests" symptom Electric users | ||
| # hit on HTTP/1.1. Traefik 2.x advertises h2 over TLS via ALPN by default; | ||
| # no extra config needed. | ||
| # | ||
| # Why Host(olam.local) instead of a wildcard: the cert is minted for that | ||
| # exact SAN. Traefik routes based on SNI, so the host-rule must match the | ||
| # cert subject or the TLS handshake completes but the route 404s. | ||
| # | ||
| # Operator MUST add `127.0.0.1 olam.local` to /etc/hosts before this works. | ||
| # `olam services tls-install` prints the line + sudo command — it does NOT | ||
| # auto-edit (touching /etc/hosts behind the operator's back is a foot-gun). | ||
| apiVersion: traefik.io/v1alpha1 | ||
| kind: IngressRoute | ||
| metadata: | ||
| # Distinct name avoids collision with packages/peripheral-services' | ||
| # `olam-host-cp` IngressRoute (the legacy `web`-entrypoint + path-based | ||
| # router that 50+ SPA fetch sites still depend on). The `-https` variant | ||
| # adds a SECOND ingress that matches Host(olam.local) on `websecure` and | ||
| # terminates TLS via the operator-minted Secret. Both coexist; the legacy | ||
| # one keeps `http://<lb>/api/...` working, this one unlocks HTTP/2. | ||
| name: olam-host-cp-https | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack | ||
| spec: | ||
| entryPoints: | ||
| - websecure | ||
| routes: | ||
| - match: Host(`olam.local`) | ||
| kind: Rule | ||
| services: | ||
| - name: olam-host-cp | ||
| port: 19000 | ||
| tls: | ||
| secretName: olam-host-cp-tls |
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: olam-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-auth-service | ||
| olam.io/component: peripheral |
| # Phase 1a Decision 19: Role scoped to resourceNames: ["olam-auth-service"] on | ||
| # apps/v1 deployments. Without this scope, the in-cluster ServiceAccount | ||
| # could patch ANY Deployment in the namespace. This is the load-bearing | ||
| # security guardrail — preserve verbatim. | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: olam-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-auth-service | ||
| olam.io/component: peripheral | ||
| rules: | ||
| - apiGroups: ["apps"] | ||
| resources: ["deployments"] | ||
| resourceNames: ["olam-auth-service"] | ||
| verbs: ["get", "patch", "watch"] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: olam-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-auth-service | ||
| olam.io/component: peripheral | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: olam-auth-service | ||
| namespace: olam | ||
| roleRef: | ||
| kind: Role | ||
| name: olam-auth-service | ||
| apiGroup: rbac.authorization.k8s.io |
| # ConfigMap for olam-auth-service environment. Sensitive values (AUTH_DB_SECRET, | ||
| # API keys) are NOT here — they live in the Secret (see templates/auth-service-secret-template.yaml). | ||
| # Operators apply the Secret separately before applying the manifests. | ||
| # | ||
# Inter-peripheral URL placeholders (e.g. OLAM_MCP_AUTH_SERVICE_URL) are set to | ||
| # cluster-internal DNS names. These are resolved by Phase C substitution; | ||
| # operators running Phase 2 Beta may override them directly. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-auth-service-env | ||
| namespace: olam | ||
| labels: | ||
| app: olam-auth-service | ||
| olam.io/component: peripheral | ||
| data: | ||
| # Port auth-service listens on. Must match 60-service.yaml targetPort. | ||
| OLAM_AUTH_PORT: "9999" | ||
| # Data directory — backed by the PVC mounted at /data. | ||
| OLAM_AUTH_DATA_PATH: "/data/auth" | ||
| # URL of mcp-auth-service (cluster-internal DNS). Override in non-k3d environments. | ||
| OLAM_MCP_AUTH_SERVICE_URL: "http://olam-mcp-auth-service.olam.svc.cluster.local:9998" | ||
| # Credential vault poll interval. | ||
| OLAM_CREDENTIAL_POLL_MS: "60000" | ||
| # R3-B (Decision R3-#2): bind on all interfaces so the k8s readiness probe | ||
| # (hitting the pod IP 10.42.x.x:9999) succeeds. Default in image source was | ||
| # 127.0.0.1 which caused CrashLoopBackOff in k8s. ConfigMap override is the | ||
| # second defense layer; the image source default was also changed to 0.0.0.0. | ||
| AUTH_BIND: "0.0.0.0" |
| # PersistentVolumeClaim for olam-auth-service /data volume. | ||
| # | ||
| # Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml | ||
| # for the full rationale (fsGroup, k3d node filesystem, etc.). | ||
| # | ||
| # local-path StorageClass ships with k3d by default (rancher/local-path-provisioner). | ||
| # On non-k3d clusters, substitute storageClassName with your cluster's provisioner. | ||
| # D24: storageClassName operator-editable — edit the field below for non-k3d substrates. | ||
| apiVersion: v1 | ||
| kind: PersistentVolumeClaim | ||
| metadata: | ||
| name: olam-auth-data | ||
| namespace: olam | ||
| labels: | ||
| app: olam-auth-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| # D24: operator-editable. k3d default is local-path. Change for non-k3d substrates. | ||
| storageClassName: local-path | ||
| resources: | ||
| requests: | ||
| # D25: auth-service PVC size 5Gi. | ||
| storage: 5Gi |
| # Deployment for olam-auth-service. | ||
| # | ||
| # Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model. | ||
| # Digest resolves to ghcr.io/pleri/olam-auth:latest (multi-arch index). | ||
| # NOTE (B1): image name is olam-auth (NOT olam-auth-service) — matches the | ||
| # actual GHCR package name published by release.yml publish-auth job. | ||
| # To update: resolve the new tag's digest via: | ||
| # TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-auth:pull&service=ghcr.io" | jq -r .token) | ||
| # curl -sI -H "Authorization: Bearer $TOKEN" \ | ||
| # -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \ | ||
| # https://ghcr.io/v2/pleri/olam-auth/manifests/<tag> | grep docker-content-digest | ||
| # Or use: node scripts/refresh-manifest-digests.mjs | ||
| # | ||
| # securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot, | ||
| # readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs. | ||
| # | ||
| # D17: auth-service does NOT mount /var/run/docker.sock (Phase 2 k8s pods | ||
| # cannot reach docker.sock — no hostPath socket mount). | ||
| # | ||
| # chown-data init container: grants UID-1000 write access on the freshly- | ||
| # provisioned PV (fsGroup alone is insufficient for local-path PVs). | ||
| apiVersion: apps/v1 | ||
| kind: Deployment | ||
| metadata: | ||
| name: olam-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-auth-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| replicas: 1 | ||
| strategy: | ||
| type: RollingUpdate | ||
| rollingUpdate: | ||
| maxSurge: 1 | ||
| maxUnavailable: 0 | ||
| selector: | ||
| matchLabels: | ||
| app: olam-auth-service | ||
| template: | ||
| metadata: | ||
| labels: | ||
| app: olam-auth-service | ||
| spec: | ||
| # B9 (round 2 recovery): disable k8s automatic Service env injection. | ||
# See packages/host-cp/k8s/manifests/host-cp/50-deployment.yaml for rationale. | ||
| enableServiceLinks: false | ||
| # R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret | ||
| # created by `olam upgrade` step 0.4 when GH_TOKEN is available. | ||
| imagePullSecrets: | ||
| - name: ghcr-pull | ||
| serviceAccountName: olam-auth-service | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| runAsGroup: 1000 | ||
| fsGroup: 1000 | ||
| initContainers: | ||
| - name: chown-data | ||
| # busybox:1.36 — sha256-pinned per T4 threat model. | ||
| image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsUser: 0 | ||
| runAsNonRoot: false | ||
| allowPrivilegeEscalation: false | ||
| command: ["chown", "-R", "1000:1000", "/data"] | ||
| volumeMounts: | ||
| - name: auth-data | ||
| mountPath: /data | ||
| containers: | ||
| - name: olam-auth-service | ||
| image: ghcr.io/pleri/olam-auth@sha256:770ee97ee4d06d2c1b6512ba99421a5fe312393d592df1684fd0d03b3476ff10 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| readOnlyRootFilesystem: true | ||
| allowPrivilegeEscalation: false | ||
| capabilities: | ||
| drop: ["ALL"] | ||
| ports: | ||
| - name: http | ||
| containerPort: 9999 | ||
| protocol: TCP | ||
| envFrom: | ||
| - configMapRef: | ||
| name: olam-auth-service-env | ||
| - secretRef: | ||
| name: olam-auth-service-secret | ||
| volumeMounts: | ||
| - name: auth-data | ||
| mountPath: /data | ||
| - name: tmp | ||
| mountPath: /tmp | ||
| readinessProbe: | ||
| httpGet: | ||
| path: /health | ||
| port: 9999 | ||
| initialDelaySeconds: 5 | ||
| periodSeconds: 5 | ||
| timeoutSeconds: 3 | ||
| failureThreshold: 6 | ||
| livenessProbe: | ||
| httpGet: | ||
| path: /health | ||
| port: 9999 | ||
| initialDelaySeconds: 30 | ||
| periodSeconds: 20 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 3 | ||
| resources: | ||
| requests: | ||
| cpu: "50m" | ||
| memory: "128Mi" | ||
| limits: | ||
| cpu: "500m" | ||
| memory: "512Mi" | ||
| volumes: | ||
| - name: auth-data | ||
| persistentVolumeClaim: | ||
| claimName: olam-auth-data | ||
| - name: tmp | ||
| emptyDir: {} |
| # ClusterIP Service for olam-auth-service. | ||
| # Port 9999 — consumed by host-cp and other peripherals via cluster-internal DNS. | ||
| # Operator surfaces externally via: | ||
| # kubectl port-forward -n olam svc/olam-auth-service 9999:9999 | ||
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: olam-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-auth-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| type: ClusterIP | ||
| selector: | ||
| app: olam-auth-service | ||
| ports: | ||
| - name: http | ||
| port: 9999 | ||
| targetPort: 9999 | ||
| protocol: TCP |
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: olam-chunks-electric | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-electric | ||
| olam.io/component: substrate |
| # Electric does not call the Kubernetes API. Empty Role kept for layout parity. | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: olam-chunks-electric | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-electric | ||
| olam.io/component: substrate | ||
| rules: [] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: olam-chunks-electric | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-electric | ||
| olam.io/component: substrate | ||
| roleRef: | ||
| apiGroup: rbac.authorization.k8s.io | ||
| kind: Role | ||
| name: olam-chunks-electric | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: olam-chunks-electric | ||
| namespace: olam |
| # ConfigMap for olam-chunks-electric. | ||
| # | ||
| # ELECTRIC_INSECURE=true disables Electric's API-secret-token gate. Acceptable | ||
| # in a single-operator local-dev k3d cluster (the Service is ClusterIP — no | ||
| # external reachability). For multi-tenant deploys, set ELECTRIC_INSECURE=false | ||
| # and provision ELECTRIC_SECRET via a Secret instead. | ||
| # | ||
# DATABASE_URL is composed at runtime in the Deployment: its `env:` entry | ||
# interpolates $(POSTGRES_PASSWORD), which is sourced from the chunks-postgres | ||
# Secret. It is NOT stored here. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-chunks-electric-env | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-electric | ||
| olam.io/component: substrate | ||
| data: | ||
| ELECTRIC_INSECURE: "true" | ||
| ELECTRIC_PORT: "3000" | ||
| ELECTRIC_HTTP_API_PORT: "3000" | ||
| ELECTRIC_LOG_LEVEL: "info" |
# Electric's HTTP server state lives in memory, and the replication slot lives | ||
# in Postgres. No persistent state is strictly required, but a small PVC is | ||
# kept for parity with other peripherals — Electric writes its persisted-shape | ||
# index to /app/persistent by default, and the PVC backs that path. | ||
| apiVersion: v1 | ||
| kind: PersistentVolumeClaim | ||
| metadata: | ||
| name: olam-chunks-electric-data | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-electric | ||
| olam.io/component: substrate | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| storageClassName: local-path | ||
| resources: | ||
| requests: | ||
| storage: 1Gi |
| # Deployment for olam-chunks-electric. | ||
| # | ||
| # Electric SQL — Postgres logical-replication → HTTP long-poll shape proxy. | ||
| # Single replica (replication slot is single-writer). | ||
| # | ||
| # Image: electricsql/electric:1.6.8 — sha256-pinned per T4 threat model. | ||
| # Resolves to the same digest as :latest at 2026-05-27; refresh when the | ||
| # upstream cuts a new release that closes a security advisory. | ||
| apiVersion: apps/v1 | ||
| kind: Deployment | ||
| metadata: | ||
| name: olam-chunks-electric | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-electric | ||
| olam.io/component: substrate | ||
| spec: | ||
| replicas: 1 | ||
| strategy: | ||
| # Recreate (NOT RollingUpdate) — Electric holds a postgres replication | ||
| # slot; two pods running at once would fight for the same slot and one | ||
| # would crashloop. | ||
| type: Recreate | ||
| selector: | ||
| matchLabels: | ||
| app: olam-chunks-electric | ||
| template: | ||
| metadata: | ||
| labels: | ||
| app: olam-chunks-electric | ||
| spec: | ||
| enableServiceLinks: false | ||
| serviceAccountName: olam-chunks-electric | ||
| containers: | ||
| - name: electric | ||
| image: electricsql/electric:1.6.8@sha256:a716f2affde44d5b991bdd1492876d9d6bddbcae5c98411327614575cd8f9eec | ||
| imagePullPolicy: IfNotPresent | ||
| ports: | ||
| - name: http | ||
| containerPort: 3000 | ||
| protocol: TCP | ||
| envFrom: | ||
| - configMapRef: | ||
| name: olam-chunks-electric-env | ||
| env: | ||
| # DATABASE_URL composition. POSTGRES_PASSWORD is sourced from the | ||
| # chunks-postgres Secret (rendered by k8s-secret-render.ts). | ||
| - name: POSTGRES_PASSWORD | ||
| valueFrom: | ||
| secretKeyRef: | ||
| name: olam-chunks-postgres-secret | ||
| key: POSTGRES_PASSWORD | ||
| - name: DATABASE_URL | ||
| value: "postgres://postgres:$(POSTGRES_PASSWORD)@olam-chunks-postgres.olam.svc.cluster.local:5432/chunks?sslmode=disable" | ||
| volumeMounts: | ||
| - name: persistent | ||
| mountPath: /app/persistent | ||
| readinessProbe: | ||
| httpGet: | ||
| path: /v1/health | ||
| port: 3000 | ||
| initialDelaySeconds: 10 | ||
| periodSeconds: 5 | ||
| timeoutSeconds: 3 | ||
| failureThreshold: 12 | ||
| livenessProbe: | ||
| httpGet: | ||
| path: /v1/health | ||
| port: 3000 | ||
| initialDelaySeconds: 60 | ||
| periodSeconds: 20 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 3 | ||
| resources: | ||
| requests: | ||
| cpu: "100m" | ||
| memory: "256Mi" | ||
| limits: | ||
| cpu: "1000m" | ||
| memory: "1Gi" | ||
| volumes: | ||
| - name: persistent | ||
| persistentVolumeClaim: | ||
| claimName: olam-chunks-electric-data |
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: olam-chunks-electric | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-electric | ||
| olam.io/component: substrate | ||
| spec: | ||
| type: ClusterIP | ||
| selector: | ||
| app: olam-chunks-electric | ||
| ports: | ||
| - name: http | ||
| port: 3000 | ||
| targetPort: 3000 | ||
| protocol: TCP |
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: olam-chunks-postgres | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| olam.io/component: substrate |
| # Minimal-privilege RBAC for chunks-postgres. The pod does not call the | ||
| # Kubernetes API; this Role exists to make the per-service apply order | ||
| # (10/20/30/45/50/60) uniform across peripherals + substrate. | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: olam-chunks-postgres | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| olam.io/component: substrate | ||
| rules: [] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: olam-chunks-postgres | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| olam.io/component: substrate | ||
| roleRef: | ||
| apiGroup: rbac.authorization.k8s.io | ||
| kind: Role | ||
| name: olam-chunks-postgres | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: olam-chunks-postgres | ||
| namespace: olam |
| # ConfigMap for olam-chunks-postgres. | ||
| # | ||
| # Two ConfigMaps in one file: | ||
| # | ||
| # 1. olam-chunks-postgres-env — non-secret env vars (POSTGRES_USER, POSTGRES_DB, PGDATA). | ||
| # POSTGRES_PASSWORD lives in the Secret rendered by | ||
| # packages/cli/src/lib/k8s-secret-render.ts. | ||
| # | ||
| # 2. olam-chunks-postgres-initdb-sql — the chunks schema. Mounted at | ||
| # /docker-entrypoint-initdb.d/01-chunks.sql so | ||
| # the postgres image's entrypoint auto-applies it | ||
| # on FIRST init (empty data dir). Subsequent | ||
| # restarts skip the directory by design. | ||
| # | ||
| # Source-of-truth: packages/chunks/src/schema.ts | ||
| # (SCHEMA_SQL export). The CI gate | ||
| # `audit:chunks-schema-parity` (follow-up) will | ||
| # fail when this ConfigMap drifts from | ||
| # SCHEMA_VERSION-tagged schema.ts. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-chunks-postgres-env | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| olam.io/component: substrate | ||
| data: | ||
| POSTGRES_USER: "postgres" | ||
| POSTGRES_DB: "chunks" | ||
| # PGDATA must point at a subdirectory of the PVC mount, not its root — | ||
| # the PVC root may carry the local-path provisioner's lost+found dir, | ||
| # which postgres's initdb rejects ("data directory not empty"). | ||
| PGDATA: "/var/lib/postgresql/data/pgdata" | ||
| --- | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-chunks-postgres-initdb-sql | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| olam.io/component: substrate | ||
| data: | ||
| # MIRRORS packages/chunks/src/schema.ts SCHEMA_VERSION=2. | ||
| # Idempotent: CREATE TABLE IF NOT EXISTS / ADD COLUMN IF NOT EXISTS / | ||
| # DO blocks with EXCEPTION-WHEN-{undefined_object,duplicate_object}. | ||
| 01-chunks.sql: | | ||
| CREATE TABLE IF NOT EXISTS chunks ( | ||
| world_id TEXT NOT NULL, | ||
| session_id TEXT NOT NULL, | ||
| message_id TEXT NOT NULL, | ||
| seq INTEGER NOT NULL, | ||
| actor_id TEXT NOT NULL, | ||
| actor_type TEXT NOT NULL CHECK (actor_type IN ('agent', 'operator', 'codex', 'system')), | ||
| role TEXT NOT NULL CHECK (role IN ('user', 'assistant', 'tool', 'system')), | ||
| chunk TEXT NOT NULL, | ||
| chunk_type TEXT NOT NULL DEFAULT 'text' CHECK (chunk_type IN ('text', 'tool_use', 'goal_mode_assumption', 'dispatch_overflow')), | ||
| created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(), | ||
| PRIMARY KEY (message_id, seq) | ||
| ); | ||
| ALTER TABLE chunks ADD COLUMN IF NOT EXISTS chunk_type TEXT NOT NULL DEFAULT 'text'; | ||
| DO $$ BEGIN | ||
| ALTER TABLE chunks DROP CONSTRAINT IF EXISTS chunks_chunk_type_check; | ||
| EXCEPTION WHEN undefined_object THEN NULL; | ||
| END $$; | ||
| DO $$ BEGIN | ||
| ALTER TABLE chunks ADD CONSTRAINT chunks_chunk_type_check | ||
| CHECK (chunk_type IN ('text', 'tool_use', 'goal_mode_assumption', 'dispatch_overflow')); | ||
| EXCEPTION WHEN duplicate_object THEN NULL; | ||
| END $$; | ||
| CREATE INDEX IF NOT EXISTS chunks_world_session_seq | ||
| ON chunks (world_id, session_id, seq); | ||
| CREATE INDEX IF NOT EXISTS chunks_world_session_created | ||
| ON chunks (world_id, session_id, created_at); | ||
| CREATE INDEX IF NOT EXISTS idx_chunks_planning | ||
| ON chunks (session_id, seq) | ||
| WHERE world_id = '_planning'; | ||
| CREATE TABLE IF NOT EXISTS planning_sessions ( | ||
| session_id TEXT PRIMARY KEY, | ||
| actor_id TEXT NOT NULL, | ||
| summary TEXT, | ||
| crystallize_status TEXT NOT NULL DEFAULT 'open' | ||
| CHECK (crystallize_status IN ('open', 'in_progress', 'crystallized', 'failed', 'abandoned')), | ||
| crystallized_world_id TEXT, | ||
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), | ||
| updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() | ||
| ); | ||
| CREATE INDEX IF NOT EXISTS idx_planning_sessions_created_at | ||
| ON planning_sessions (created_at DESC); | ||
| ALTER TABLE planning_sessions ADD COLUMN IF NOT EXISTS session_source TEXT; | ||
| CREATE OR REPLACE FUNCTION chunks_append_only_trigger() | ||
| RETURNS trigger AS $body$ | ||
| BEGIN | ||
| RAISE EXCEPTION 'chunks is append-only; % forbidden', TG_OP; | ||
| END; | ||
| $body$ LANGUAGE plpgsql; | ||
| DROP TRIGGER IF EXISTS chunks_no_update ON chunks; | ||
| CREATE TRIGGER chunks_no_update | ||
| BEFORE UPDATE ON chunks | ||
| FOR EACH ROW EXECUTE FUNCTION chunks_append_only_trigger(); | ||
| DROP TRIGGER IF EXISTS chunks_no_delete ON chunks; | ||
| CREATE TRIGGER chunks_no_delete | ||
| BEFORE DELETE ON chunks | ||
| FOR EACH ROW EXECUTE FUNCTION chunks_append_only_trigger(); | ||
| CREATE TABLE IF NOT EXISTS message_usage ( | ||
| world_id TEXT NOT NULL, | ||
| session_id TEXT NOT NULL, | ||
| message_id TEXT NOT NULL, | ||
| actor_id TEXT NOT NULL, | ||
| model TEXT NOT NULL, | ||
| input_tokens INTEGER NOT NULL DEFAULT 0, | ||
| output_tokens INTEGER NOT NULL DEFAULT 0, | ||
| cache_read_tokens INTEGER NOT NULL DEFAULT 0, | ||
| cache_create_tokens INTEGER NOT NULL DEFAULT 0, | ||
| created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(), | ||
| PRIMARY KEY (message_id, actor_id) | ||
| ); | ||
| CREATE INDEX IF NOT EXISTS message_usage_session_created | ||
| ON message_usage (session_id, created_at); | ||
| CREATE OR REPLACE FUNCTION message_usage_append_only_trigger() | ||
| RETURNS trigger AS $body$ | ||
| BEGIN | ||
| RAISE EXCEPTION 'message_usage is append-only; % forbidden', TG_OP; | ||
| END; | ||
| $body$ LANGUAGE plpgsql; | ||
| DROP TRIGGER IF EXISTS message_usage_no_update ON message_usage; | ||
| CREATE TRIGGER message_usage_no_update | ||
| BEFORE UPDATE ON message_usage | ||
| FOR EACH ROW EXECUTE FUNCTION message_usage_append_only_trigger(); | ||
| DROP TRIGGER IF EXISTS message_usage_no_delete ON message_usage; | ||
| CREATE TRIGGER message_usage_no_delete | ||
| BEFORE DELETE ON message_usage | ||
| FOR EACH ROW EXECUTE FUNCTION message_usage_append_only_trigger(); | ||
| CREATE TABLE IF NOT EXISTS planning_artifacts ( | ||
| id TEXT PRIMARY KEY, | ||
| world_id TEXT NOT NULL, | ||
| session_id TEXT NOT NULL, | ||
| type TEXT NOT NULL CHECK (type IN ('commit_plan', 'component_scaffold', 'design_jam')), | ||
| title TEXT NOT NULL, | ||
| body JSONB NOT NULL, | ||
| status TEXT NOT NULL DEFAULT 'open' | ||
| CHECK (status IN ('open', 'crystallized', 'failed', 'archived')), | ||
| linear_issue_url TEXT, | ||
| crystallized_world_id TEXT, | ||
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), | ||
| updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() | ||
| ); | ||
| CREATE INDEX IF NOT EXISTS idx_planning_artifacts_session | ||
| ON planning_artifacts (session_id, created_at); | ||
| CREATE INDEX IF NOT EXISTS idx_planning_artifacts_world | ||
| ON planning_artifacts (world_id, status); | ||
| CREATE OR REPLACE FUNCTION planning_artifacts_touch_updated_at() | ||
| RETURNS trigger AS $body$ | ||
| BEGIN | ||
| NEW.updated_at = NOW(); | ||
| RETURN NEW; | ||
| END; | ||
| $body$ LANGUAGE plpgsql; | ||
| DROP TRIGGER IF EXISTS planning_artifacts_touch ON planning_artifacts; | ||
| CREATE TRIGGER planning_artifacts_touch | ||
| BEFORE UPDATE ON planning_artifacts | ||
| FOR EACH ROW EXECUTE FUNCTION planning_artifacts_touch_updated_at(); |
| # PVC for the chunks-postgres data directory. | ||
| # | ||
| # Sized 10Gi for local-dev. Chunks rows are small (~1KB each) so even a | ||
| # busy single-operator world rarely cracks 1Gi; the headroom is for the | ||
| # message_usage + planning_artifacts sidecar tables. | ||
| # | ||
| # accessModes: ReadWriteOnce — postgres is a StatefulSet with replicas=1. | ||
| # k3d's local-path provisioner only supports RWO; the in-cluster postgres | ||
| # pattern is single-writer by design (no operator-managed HA). | ||
| apiVersion: v1 | ||
| kind: PersistentVolumeClaim | ||
| metadata: | ||
| name: olam-chunks-postgres-data | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| olam.io/component: substrate | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| storageClassName: local-path | ||
| resources: | ||
| requests: | ||
| storage: 10Gi |
| # StatefulSet for olam-chunks-postgres. | ||
| # | ||
| # Why StatefulSet vs Deployment: even with replicas=1 the StatefulSet gives | ||
| # stable network identity (olam-chunks-postgres-0 inside the headless service) | ||
| # and ordered termination semantics — both useful when Electric's replication | ||
| # slot survives pod restarts. | ||
| # | ||
| # args override: postgres requires wal_level=logical for Electric SQL's | ||
| # logical-replication subscription. The image's default postgresql.conf | ||
| # ships wal_level=replica; the -c flags passed via the container args take | ||
| # precedence. max_replication_slots / max_wal_senders are set explicitly | ||
| # too (both to 10) — Electric holds one slot per database. | ||
| # | ||
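| # securityContext: the Debian-based postgres image runs as uid 999 by default. | ||
| # fsGroup=999 on the pod ensures the PVC mount is group-owned by 999 so | ||
| # postgres can write its data dir. NOTE(review): the pinned image is the | ||
| # alpine variant, where the postgres user is uid/gid 70 — confirm fsGroup. | ||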
| apiVersion: apps/v1 | ||
| kind: StatefulSet | ||
| metadata: | ||
| name: olam-chunks-postgres | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| olam.io/component: substrate | ||
| spec: | ||
| replicas: 1 | ||
| serviceName: olam-chunks-postgres | ||
| selector: | ||
| matchLabels: | ||
| app: olam-chunks-postgres | ||
| template: | ||
| metadata: | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| spec: | ||
| enableServiceLinks: false | ||
| serviceAccountName: olam-chunks-postgres | ||
| securityContext: | ||
| fsGroup: 999 | ||
| containers: | ||
| - name: postgres | ||
| # postgres:16-alpine — sha256-pinned per T4 threat model. | ||
| image: postgres:16-alpine@sha256:16bc17c64a573ef34162af9298258d1aec548232985b33ed7b1eac33ba35c229 | ||
| imagePullPolicy: IfNotPresent | ||
| args: | ||
| - postgres | ||
| - -c | ||
| - wal_level=logical | ||
| - -c | ||
| - max_replication_slots=10 | ||
| - -c | ||
| - max_wal_senders=10 | ||
| ports: | ||
| - name: postgres | ||
| containerPort: 5432 | ||
| protocol: TCP | ||
| envFrom: | ||
| - configMapRef: | ||
| name: olam-chunks-postgres-env | ||
| - secretRef: | ||
| name: olam-chunks-postgres-secret | ||
| volumeMounts: | ||
| - name: data | ||
| mountPath: /var/lib/postgresql/data | ||
| - name: initdb | ||
| mountPath: /docker-entrypoint-initdb.d | ||
| readOnly: true | ||
| readinessProbe: | ||
| exec: | ||
| command: | ||
| - sh | ||
| - -c | ||
| - pg_isready -U postgres -d chunks -h 127.0.0.1 | ||
| initialDelaySeconds: 5 | ||
| periodSeconds: 5 | ||
| timeoutSeconds: 3 | ||
| failureThreshold: 12 | ||
| livenessProbe: | ||
| exec: | ||
| command: | ||
| - sh | ||
| - -c | ||
| - pg_isready -U postgres -h 127.0.0.1 | ||
| initialDelaySeconds: 30 | ||
| periodSeconds: 20 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 3 | ||
| resources: | ||
| requests: | ||
| cpu: "100m" | ||
| memory: "256Mi" | ||
| limits: | ||
| cpu: "1000m" | ||
| memory: "1Gi" | ||
| volumes: | ||
| - name: data | ||
| persistentVolumeClaim: | ||
| claimName: olam-chunks-postgres-data | ||
| - name: initdb | ||
| configMap: | ||
| name: olam-chunks-postgres-initdb-sql |
| # Headless Service for olam-chunks-postgres StatefulSet. | ||
| # | ||
| # clusterIP: None gives the StatefulSet's pod stable DNS: | ||
| # olam-chunks-postgres-0.olam-chunks-postgres.olam.svc.cluster.local | ||
| # Callers (plan-chat-service, chunks-electric) connect via the shorter | ||
| # olam-chunks-postgres.olam.svc.cluster.local form, which for a headless | ||
| # Service resolves directly to the IP of the single backing pod. | ||
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: olam-chunks-postgres | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| olam.io/component: substrate | ||
| spec: | ||
| clusterIP: None | ||
| selector: | ||
| app: olam-chunks-postgres | ||
| ports: | ||
| - name: postgres | ||
| port: 5432 | ||
| targetPort: 5432 | ||
| protocol: TCP |
| # ExternalName Service for the host-side docker-socket-proxy. | ||
| # | ||
| # Provides in-cluster DNS for pods to reach the host-side proxy | ||
| # container (defined in packages/host-cp/k8s/host-side/docker-socket-proxy.compose.yaml). | ||
| # The Service has NO backing Pod — `type: ExternalName` is a kube-dns | ||
| # CNAME alias to `host.k3d.internal`, the gateway address that k3d | ||
| # auto-provisions inside every node container. | ||
| # | ||
| # Decision #7 (round-4 plan pass 2): Universal across all k8s substrates | ||
| # (macOS+colima+virtiofs, Linux native k3d, WSL2). One codepath; the | ||
| # per-Pod cost of running an in-cluster proxy elsewhere is invisible | ||
| # against the maintenance tax of OS-conditional Service generation. | ||
| # | ||
| # Why ExternalName and not in-cluster Pod with hostPath: | ||
| # the in-cluster Pod would itself need to bind /var/run/docker.sock | ||
| # from the lima VM, hitting the same class of virtiofs ENOTSUP failure | ||
| # as R4-W2-F. The proxy must live OUTSIDE the k3d cluster, on the | ||
| # operator's colima docker daemon. ExternalName makes that | ||
| # transparent to consumers: host-cp configures | ||
| # { host: 'docker-socket-proxy', port: 2375 } regardless of where | ||
| # the actual proxy container lives. | ||
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: docker-socket-proxy | ||
| namespace: olam | ||
| labels: | ||
| app: docker-socket-proxy | ||
| olam.io/component: host-stack | ||
| spec: | ||
| type: ExternalName | ||
| externalName: host.k3d.internal | ||
| ports: | ||
| - name: tcp-2375 | ||
| port: 2375 | ||
| targetPort: 2375 | ||
| protocol: TCP |
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| olam.io/component: peripheral |
| # Phase 1a Decision 19: Role scoped to resourceNames: ["olam-kg-service"] on | ||
| # apps/v1 deployments. Without this scope, the in-cluster ServiceAccount | ||
| # could patch ANY Deployment in the namespace. This is the load-bearing | ||
| # security guardrail — preserve verbatim. | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| olam.io/component: peripheral | ||
| rules: | ||
| - apiGroups: ["apps"] | ||
| resources: ["deployments"] | ||
| resourceNames: ["olam-kg-service"] | ||
| verbs: ["get", "patch", "watch"] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| olam.io/component: peripheral | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| roleRef: | ||
| kind: Role | ||
| name: olam-kg-service | ||
| apiGroup: rbac.authorization.k8s.io |
| # ConfigMap for olam-kg-service environment. Sensitive values live in | ||
| # the Secret (see templates/kg-service-secret-template.yaml). | ||
| # Operators apply the Secret separately before applying the manifests. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-kg-service-env | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| olam.io/component: peripheral | ||
| data: | ||
| # Port kg-service listens on. Must match 60-service.yaml targetPort. | ||
| OLAM_KG_PORT: "9997" | ||
| # Data directory — backed by the PVC mounted at /data. | ||
| OLAM_KG_DATA_PATH: "/data/kg" | ||
| # URL of auth-service (cluster-internal DNS). Override in non-k3d environments. | ||
| OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999" | ||
| # R3-B (Decision R3-#2): kg-service source (server.py) uses OLAM_KG_SERVICE_BIND | ||
| # and defaults to 127.0.0.1. In k8s the readiness and liveness probes hit the | ||
| # pod IP, so a 127.0.0.1-only listener fails them and the liveness restarts end | ||
| # in CrashLoopBackOff. This ConfigMap override forces an all-interfaces bind | ||
| # without requiring an image rebuild. | ||
| OLAM_KG_SERVICE_BIND: "0.0.0.0" |
| # PersistentVolumeClaim for olam-kg-service /data volume. | ||
| # | ||
| # Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml | ||
| # for the full rationale (fsGroup, k3d node filesystem, etc.). | ||
| # | ||
| # local-path StorageClass ships with k3d by default (rancher/local-path-provisioner). | ||
| # On non-k3d clusters, substitute storageClassName with your cluster's provisioner. | ||
| # D24: storageClassName operator-editable — edit the field below for non-k3d substrates. | ||
| apiVersion: v1 | ||
| kind: PersistentVolumeClaim | ||
| metadata: | ||
| name: olam-kg-data | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| # D24: operator-editable. k3d default is local-path. Change for non-k3d substrates. | ||
| storageClassName: local-path | ||
| resources: | ||
| requests: | ||
| # D25: kg-service PVC size 10Gi (larger: graph index grows with codebase). | ||
| storage: 10Gi |
| # Deployment for olam-kg-service. | ||
| # | ||
| # Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model. | ||
| # Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.0 (multi-arch index). | ||
| # To update: resolve the new tag's digest via: | ||
| # TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-kg-service:pull&service=ghcr.io" | jq -r .token) | ||
| # curl -sI -H "Authorization: Bearer $TOKEN" \ | ||
| # -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \ | ||
| # https://ghcr.io/v2/pleri/olam-kg-service/manifests/<tag> | grep docker-content-digest | ||
| # | ||
| # securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot, | ||
| # readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs. | ||
| apiVersion: apps/v1 | ||
| kind: Deployment | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| replicas: 1 | ||
| strategy: | ||
| type: RollingUpdate | ||
| rollingUpdate: | ||
| maxSurge: 1 | ||
| maxUnavailable: 0 | ||
| selector: | ||
| matchLabels: | ||
| app: olam-kg-service | ||
| template: | ||
| metadata: | ||
| labels: | ||
| app: olam-kg-service | ||
| spec: | ||
| # B9 (round 2 recovery): disable k8s automatic Service env injection. | ||
| # See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale. | ||
| enableServiceLinks: false | ||
| # R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret | ||
| # created by `olam upgrade` step 0.4 when GH_TOKEN is available. | ||
| imagePullSecrets: | ||
| - name: ghcr-pull | ||
| serviceAccountName: olam-kg-service | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| runAsGroup: 1000 | ||
| fsGroup: 1000 | ||
| initContainers: | ||
| - name: chown-data | ||
| # busybox:1.36 — sha256-pinned per T4 threat model. | ||
| image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsUser: 0 | ||
| runAsNonRoot: false | ||
| allowPrivilegeEscalation: false | ||
| command: ["chown", "-R", "1000:1000", "/data"] | ||
| volumeMounts: | ||
| - name: kg-data | ||
| mountPath: /data | ||
| containers: | ||
| - name: olam-kg-service | ||
| image: ghcr.io/pleri/olam-kg-service@sha256:f97ee90fe1bd5b12cb56d5fbf0d3085c301bb7abeef0dd28d2b2a5c90ab6efbb | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| readOnlyRootFilesystem: true | ||
| allowPrivilegeEscalation: false | ||
| capabilities: | ||
| drop: ["ALL"] | ||
| ports: | ||
| - name: http | ||
| containerPort: 9997 | ||
| protocol: TCP | ||
| envFrom: | ||
| - configMapRef: | ||
| name: olam-kg-service-env | ||
| - secretRef: | ||
| name: olam-kg-service-secret | ||
| volumeMounts: | ||
| - name: kg-data | ||
| mountPath: /data | ||
| - name: tmp | ||
| mountPath: /tmp | ||
| readinessProbe: | ||
| httpGet: | ||
| path: /health | ||
| port: 9997 | ||
| initialDelaySeconds: 5 | ||
| periodSeconds: 5 | ||
| timeoutSeconds: 3 | ||
| failureThreshold: 6 | ||
| livenessProbe: | ||
| httpGet: | ||
| path: /health | ||
| port: 9997 | ||
| initialDelaySeconds: 30 | ||
| periodSeconds: 20 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 3 | ||
| resources: | ||
| requests: | ||
| cpu: "100m" | ||
| memory: "256Mi" | ||
| limits: | ||
| cpu: "1000m" | ||
| memory: "1Gi" | ||
| volumes: | ||
| - name: kg-data | ||
| persistentVolumeClaim: | ||
| claimName: olam-kg-data | ||
| - name: tmp | ||
| emptyDir: {} |
| # ClusterIP Service for olam-kg-service. | ||
| # Port 9997 — consumed by agents and host-cp via cluster-internal DNS. | ||
| # Operator surfaces externally via: | ||
| # kubectl port-forward -n olam svc/olam-kg-service 9997:9997 | ||
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| type: ClusterIP | ||
| selector: | ||
| app: olam-kg-service | ||
| ports: | ||
| - name: http | ||
| port: 9997 | ||
| targetPort: 9997 | ||
| protocol: TCP |
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: olam-mcp-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-mcp-auth-service | ||
| olam.io/component: peripheral |
| # Phase 1a Decision 19: Role scoped to resourceNames: ["olam-mcp-auth-service"] on | ||
| # apps/v1 deployments. Without this scope, the in-cluster ServiceAccount | ||
| # could patch ANY Deployment in the namespace. This is the load-bearing | ||
| # security guardrail — preserve verbatim. | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: olam-mcp-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-mcp-auth-service | ||
| olam.io/component: peripheral | ||
| rules: | ||
| - apiGroups: ["apps"] | ||
| resources: ["deployments"] | ||
| resourceNames: ["olam-mcp-auth-service"] | ||
| verbs: ["get", "patch", "watch"] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: olam-mcp-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-mcp-auth-service | ||
| olam.io/component: peripheral | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: olam-mcp-auth-service | ||
| namespace: olam | ||
| roleRef: | ||
| kind: Role | ||
| name: olam-mcp-auth-service | ||
| apiGroup: rbac.authorization.k8s.io |
| # ConfigMap for olam-mcp-auth-service environment. Sensitive values live in | ||
| # the Secret (see templates/mcp-auth-service-secret-template.yaml). | ||
| # Operators apply the Secret separately before applying the manifests. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-mcp-auth-service-env | ||
| namespace: olam | ||
| labels: | ||
| app: olam-mcp-auth-service | ||
| olam.io/component: peripheral | ||
| data: | ||
| # Port mcp-auth-service listens on. Must match 60-service.yaml targetPort. | ||
| OLAM_MCP_AUTH_PORT: "9998" | ||
| # Data directory — backed by the PVC mounted at /data. | ||
| OLAM_MCP_AUTH_DATA_PATH: "/data/mcp-auth" | ||
| # URL of auth-service (cluster-internal DNS). Override in non-k3d environments. | ||
| OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999" | ||
| # R3-B defensive (Decision R3-#2): mcp-auth-service source already defaults to | ||
| # 0.0.0.0 (MCP_AUTH_BIND env var) but ConfigMap override is explicit defense | ||
| # against a future image regression reverting to 127.0.0.1. | ||
| MCP_AUTH_BIND: "0.0.0.0" |
| # PersistentVolumeClaim for olam-mcp-auth-service /data volume. | ||
| # | ||
| # Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml | ||
| # for the full rationale (fsGroup, k3d node filesystem, etc.). | ||
| # | ||
| # local-path StorageClass ships with k3d by default (rancher/local-path-provisioner). | ||
| # On non-k3d clusters, substitute storageClassName with your cluster's provisioner. | ||
| # D24: storageClassName operator-editable — edit the field below for non-k3d substrates. | ||
| apiVersion: v1 | ||
| kind: PersistentVolumeClaim | ||
| metadata: | ||
| name: olam-mcp-auth-data | ||
| namespace: olam | ||
| labels: | ||
| app: olam-mcp-auth-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| # D24: operator-editable. k3d default is local-path. Change for non-k3d substrates. | ||
| storageClassName: local-path | ||
| resources: | ||
| requests: | ||
| # D25: mcp-auth-service PVC size 5Gi. | ||
| storage: 5Gi |
| # Deployment for olam-mcp-auth-service. | ||
| # | ||
| # Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model. | ||
| # Digest is the one ghcr.io/pleri/olam-mcp-auth:latest resolved to when pinned (multi-arch index). | ||
| # NOTE (B1): image name is olam-mcp-auth (NOT olam-mcp-auth-service) — matches the | ||
| # actual GHCR package name published by release.yml publish-mcp-auth job. | ||
| # To update: resolve the new tag's digest via: | ||
| # TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-mcp-auth:pull&service=ghcr.io" | jq -r .token) | ||
| # curl -sI -H "Authorization: Bearer $TOKEN" \ | ||
| # -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \ | ||
| # https://ghcr.io/v2/pleri/olam-mcp-auth/manifests/<tag> | grep docker-content-digest | ||
| # Or use: node scripts/refresh-manifest-digests.mjs | ||
| # | ||
| # securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot, | ||
| # readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs. | ||
| # | ||
| # D17 (LOAD-BEARING): mcp-auth-service MUST NOT mount /var/run/docker.sock. | ||
| # Phase 2 architecture: k8s pods cannot reach docker.sock. No hostPath socket | ||
| # mount here — mcp-auth-service authenticates MCP clients via JWT, not Docker. | ||
| apiVersion: apps/v1 | ||
| kind: Deployment | ||
| metadata: | ||
| name: olam-mcp-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-mcp-auth-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| replicas: 1 | ||
| strategy: | ||
| type: RollingUpdate | ||
| rollingUpdate: | ||
| maxSurge: 1 | ||
| maxUnavailable: 0 | ||
| selector: | ||
| matchLabels: | ||
| app: olam-mcp-auth-service | ||
| template: | ||
| metadata: | ||
| labels: | ||
| app: olam-mcp-auth-service | ||
| spec: | ||
| # B9 (round 2 recovery): disable k8s automatic Service env injection. | ||
| # See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale. | ||
| enableServiceLinks: false | ||
| # R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret | ||
| # created by `olam upgrade` step 0.4 when GH_TOKEN is available. | ||
| imagePullSecrets: | ||
| - name: ghcr-pull | ||
| serviceAccountName: olam-mcp-auth-service | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| runAsGroup: 1000 | ||
| fsGroup: 1000 | ||
| initContainers: | ||
| - name: chown-data | ||
| # busybox:1.36 — sha256-pinned per T4 threat model. | ||
| image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsUser: 0 | ||
| runAsNonRoot: false | ||
| allowPrivilegeEscalation: false | ||
| command: ["chown", "-R", "1000:1000", "/data"] | ||
| volumeMounts: | ||
| - name: mcp-auth-data | ||
| mountPath: /data | ||
| containers: | ||
| - name: olam-mcp-auth-service | ||
| image: ghcr.io/pleri/olam-mcp-auth@sha256:eaac2164349e388a70dae0d86c34132f97aa74177a2376cdfa10732e8eadb507 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| readOnlyRootFilesystem: true | ||
| allowPrivilegeEscalation: false | ||
| capabilities: | ||
| drop: ["ALL"] | ||
| ports: | ||
| - name: http | ||
| containerPort: 9998 | ||
| protocol: TCP | ||
| envFrom: | ||
| - configMapRef: | ||
| name: olam-mcp-auth-service-env | ||
| - secretRef: | ||
| name: olam-mcp-auth-service-secret | ||
| volumeMounts: | ||
| - name: mcp-auth-data | ||
| mountPath: /data | ||
| - name: tmp | ||
| mountPath: /tmp | ||
| readinessProbe: | ||
| httpGet: | ||
| path: /health | ||
| port: 9998 | ||
| initialDelaySeconds: 5 | ||
| periodSeconds: 5 | ||
| timeoutSeconds: 3 | ||
| failureThreshold: 6 | ||
| livenessProbe: | ||
| httpGet: | ||
| path: /health | ||
| port: 9998 | ||
| initialDelaySeconds: 30 | ||
| periodSeconds: 20 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 3 | ||
| resources: | ||
| requests: | ||
| cpu: "50m" | ||
| memory: "128Mi" | ||
| limits: | ||
| cpu: "500m" | ||
| memory: "512Mi" | ||
| volumes: | ||
| - name: mcp-auth-data | ||
| persistentVolumeClaim: | ||
| claimName: olam-mcp-auth-data | ||
| - name: tmp | ||
| emptyDir: {} | ||
| # D17 (LOAD-BEARING): NO docker.sock volume or hostPath mount here. | ||
| # mcp-auth-service does not need Docker access in Phase 2 k8s architecture. |
| # ClusterIP Service for olam-mcp-auth-service. | ||
| # Port 9998 — consumed by other peripherals and host-cp via cluster-internal DNS. | ||
| # Operator surfaces externally via: | ||
| # kubectl port-forward -n olam svc/olam-mcp-auth-service 9998:9998 | ||
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: olam-mcp-auth-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-mcp-auth-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| type: ClusterIP | ||
| selector: | ||
| app: olam-mcp-auth-service | ||
| ports: | ||
| - name: http | ||
| port: 9998 | ||
| targetPort: 9998 | ||
| protocol: TCP |
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: olam-memory-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-memory-service | ||
| olam.io/component: peripheral |
| # Phase 1a Decision 19: Role scoped to resourceNames: ["olam-memory-service"] on | ||
| # apps/v1 deployments. Without this scope, the in-cluster ServiceAccount | ||
| # could patch ANY Deployment in the namespace. This is the load-bearing | ||
| # security guardrail — preserve verbatim. | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: olam-memory-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-memory-service | ||
| olam.io/component: peripheral | ||
| rules: | ||
| - apiGroups: ["apps"] | ||
| resources: ["deployments"] | ||
| resourceNames: ["olam-memory-service"] | ||
| verbs: ["get", "patch", "watch"] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: olam-memory-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-memory-service | ||
| olam.io/component: peripheral | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: olam-memory-service | ||
| namespace: olam | ||
| roleRef: | ||
| kind: Role | ||
| name: olam-memory-service | ||
| apiGroup: rbac.authorization.k8s.io |
| # ConfigMap for olam-memory-service environment. Sensitive values live in | ||
| # the Secret (see templates/memory-service-secret-template.yaml). | ||
| # Operators apply the Secret separately before applying the manifests. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-memory-service-env | ||
| namespace: olam | ||
| labels: | ||
| app: olam-memory-service | ||
| olam.io/component: peripheral | ||
| data: | ||
| # Port memory-service listens on. Must match 60-service.yaml targetPort. | ||
| OLAM_MEMORY_PORT: "3111" | ||
| # Data directory — backed by the PVC mounted at /data. | ||
| OLAM_MEMORY_DATA_PATH: "/data/memory" | ||
| # URL of auth-service (cluster-internal DNS). Override in non-k3d environments. | ||
| OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999" | ||
| # Health path exposed at /agentmemory/livez (D15 — do not change). | ||
| OLAM_MEMORY_HEALTH_PATH: "/agentmemory/livez" | ||
| # R3-B defensive (Decision R3-#2): memory-service Dockerfile already sets | ||
| # AGENTMEMORY_HOST=0.0.0.0 but ConfigMap override is explicit defense against | ||
| # a future image regression reverting to 127.0.0.1. | ||
| AGENTMEMORY_HOST: "0.0.0.0" | ||
| # III_REST_PORT is the env var the agentmemory CLI wrapper reads when it | ||
| # polls its iii subprocess for readiness (cli.mjs:155 — | ||
| # `process.env["III_REST_PORT"] || "3111"`). The iii engine itself binds | ||
| # the port declared in iii-config.yaml's iii-http worker (overridden via the | ||
| # olam-memory-service-iii-config ConfigMap to 3110, so it does not | ||
| # collide with the metrics-proxy on 3111). Without this env var the | ||
| # wrapper polls 3111 until it gives up, prints "iii-engine did not become ready", | ||
| # and exits — entrypoint propagates the exit, container restarts, and | ||
| # the liveness probe returns 502 from the proxy (its backend was never | ||
| # up). Must equal the iii-http port in 35-configmap-iii-config.yaml. | ||
| III_REST_PORT: "3110" |
| # Overrides the iii-config.yaml shipped inside the agentmemory image so the | ||
| # iii engine binds the INTERNAL port (3110) instead of the EXTERNAL port | ||
| # (3111). The shipped yaml hardcodes `port: 3111` and the agentmemory CLI | ||
| # reads its bind from yaml (NOT from the AGENTMEMORY_PORT env var), so | ||
| # entrypoint.sh's `AGENTMEMORY_PORT=3110` override has no effect. | ||
| # | ||
| # Without this override, the engine and the metrics-proxy both try to bind | ||
| # 0.0.0.0:3111. The proxy starts first and wins the port; the engine fails | ||
| # silently. Probes to /agentmemory/livez hit the proxy and get forwarded to | ||
| # 127.0.0.1:3110, where nothing is listening — proxy returns 502, readiness | ||
| # fails, container restarts. | ||
| # | ||
| # Mounted at /usr/local/lib/node_modules/@agentmemory/agentmemory/dist/iii-config.yaml | ||
| # via subPath in 50-deployment.yaml. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-memory-service-iii-config | ||
| namespace: olam | ||
| labels: | ||
| app: olam-memory-service | ||
| olam.io/component: peripheral | ||
| data: | ||
| iii-config.yaml: | | ||
| workers: | ||
| - name: iii-http | ||
| config: | ||
| port: 3110 | ||
| host: 0.0.0.0 | ||
| default_timeout: 180000 | ||
| cors: | ||
| allowed_origins: ["http://localhost:3111", "http://localhost:3113", "http://127.0.0.1:3111", "http://127.0.0.1:3113"] | ||
| allowed_methods: [GET, POST, PUT, DELETE, OPTIONS] | ||
| - name: iii-state | ||
| config: | ||
| adapter: | ||
| name: kv | ||
| config: | ||
| store_method: file_based | ||
| file_path: ./data/state_store.db | ||
| - name: iii-queue | ||
| config: | ||
| adapter: | ||
| name: builtin | ||
| - name: iii-pubsub | ||
| config: | ||
| adapter: | ||
| name: local | ||
| - name: iii-cron | ||
| config: | ||
| adapter: | ||
| name: kv | ||
| - name: iii-stream | ||
| config: | ||
| port: 3112 | ||
| host: 0.0.0.0 | ||
| adapter: | ||
| name: kv | ||
| config: | ||
| store_method: file_based | ||
| file_path: ./data/stream_store | ||
| - name: iii-observability | ||
| config: | ||
| enabled: true | ||
| service_name: agentmemory | ||
| exporter: memory | ||
| sampling_ratio: 1.0 | ||
| metrics_enabled: true | ||
| logs_enabled: true | ||
| logs_console_output: true | ||
| - name: iii-exec | ||
| config: | ||
| watch: | ||
| - src/**/*.ts | ||
| exec: | ||
| - node dist/index.mjs |
| # PersistentVolumeClaim for olam-memory-service /data volume. | ||
| # | ||
| # Why PVC instead of hostPath: see packages/host-cp/k8s/manifests/host-cp/45-pvc.yaml | ||
| # for the full rationale (fsGroup, k3d node filesystem, etc.). | ||
| # | ||
| # local-path StorageClass ships with k3d by default (rancher/local-path-provisioner). | ||
| # On non-k3d clusters, substitute storageClassName with your cluster's provisioner. | ||
| # D24: storageClassName operator-editable — edit the field below for non-k3d substrates. | ||
| apiVersion: v1 | ||
| kind: PersistentVolumeClaim | ||
| metadata: | ||
| name: olam-memory-data | ||
| namespace: olam | ||
| labels: | ||
| app: olam-memory-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| # D24: operator-editable. k3d default is local-path. Change for non-k3d substrates. | ||
| storageClassName: local-path | ||
| resources: | ||
| requests: | ||
| # D25: memory-service PVC size 5Gi. | ||
| storage: 5Gi |
| # Deployment for olam-memory-service. | ||
| # | ||
| # Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model. | ||
| # Digest resolves to ghcr.io/pleri/olam-memory-service:0.1.0 (multi-arch index). | ||
| # To update: resolve the new tag's digest via: | ||
| # TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-memory-service:pull&service=ghcr.io" | jq -r .token) | ||
| # curl -sI -H "Authorization: Bearer $TOKEN" \ | ||
| # -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \ | ||
| # https://ghcr.io/v2/pleri/olam-memory-service/manifests/<tag> | grep docker-content-digest | ||
| # | ||
| # securityContext: conservative defaults per T6/T7 threat model (runAsNonRoot, | ||
| # readOnlyRootFilesystem). /tmp backed by emptyDir for transient write needs. | ||
| # | ||
| # D15 (LOAD-BEARING): readinessProbe and livenessProbe path MUST be | ||
| # /agentmemory/livez (not /health). Source: DEFAULT_HEALTH_PATH in | ||
| # packages/core/src/services-status/memory-probe.ts:18. | ||
| apiVersion: apps/v1 | ||
| kind: Deployment | ||
| metadata: | ||
| name: olam-memory-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-memory-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| replicas: 1 | ||
| strategy: | ||
| type: RollingUpdate | ||
| rollingUpdate: | ||
| maxSurge: 1 | ||
| maxUnavailable: 0 | ||
| selector: | ||
| matchLabels: | ||
| app: olam-memory-service | ||
| template: | ||
| metadata: | ||
| labels: | ||
| app: olam-memory-service | ||
| spec: | ||
| # B9 (round 2 recovery): disable k8s automatic Service env injection. | ||
| # See packages/host-cp/k8s/manifests/50-deployment.yaml for rationale. | ||
| enableServiceLinks: false | ||
| # R3-C (Decision R3-#3): imagePullSecrets references the ghcr-pull Secret | ||
| # created by `olam upgrade` step 0.4 when GH_TOKEN is available. | ||
| imagePullSecrets: | ||
| - name: ghcr-pull | ||
| serviceAccountName: olam-memory-service | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| runAsGroup: 1000 | ||
| fsGroup: 1000 | ||
| initContainers: | ||
| - name: chown-data | ||
| # busybox:1.36 — sha256-pinned per T4 threat model. | ||
| image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsUser: 0 | ||
| runAsNonRoot: false | ||
| allowPrivilegeEscalation: false | ||
| command: ["chown", "-R", "1000:1000", "/data"] | ||
| volumeMounts: | ||
| - name: memory-data | ||
| mountPath: /data | ||
| containers: | ||
| - name: olam-memory-service | ||
| # image first appears on GHCR after Phase B's publish-memory-service | ||
| # job fires on the first release post-merge. Remove the | ||
| # bootstrap-placeholder comment + run `npm run refresh:manifest-digests` | ||
| # once ghcr.io/pleri/olam-memory-service has a real published digest. | ||
| # bootstrap-placeholder: pre-publish; refresh after first release | ||
| image: ghcr.io/pleri/olam-memory-service@sha256:923bff54d2ba3da162a35d3e8ebc6bd440bed6d290a5cff7bae2888281a4e003 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| readOnlyRootFilesystem: true | ||
| allowPrivilegeEscalation: false | ||
| capabilities: | ||
| drop: ["ALL"] | ||
| ports: | ||
| - name: http | ||
| containerPort: 3111 | ||
| protocol: TCP | ||
| envFrom: | ||
| - configMapRef: | ||
| name: olam-memory-service-env | ||
| - secretRef: | ||
| name: olam-memory-service-secret | ||
| volumeMounts: | ||
| - name: memory-data | ||
| mountPath: /data | ||
| - name: tmp | ||
| mountPath: /tmp | ||
| # Overrides the shipped iii-config.yaml so the engine binds the | ||
| # internal port (3110) instead of colliding with the metrics-proxy | ||
| # on 3111. See 35-configmap-iii-config.yaml for full rationale. | ||
| - name: iii-config-override | ||
| mountPath: /usr/local/lib/node_modules/@agentmemory/agentmemory/dist/iii-config.yaml | ||
| subPath: iii-config.yaml | ||
| readOnly: true | ||
| readinessProbe: | ||
| httpGet: | ||
| # D15 (LOAD-BEARING): memory-service health path is /agentmemory/livez. | ||
| # Source: DEFAULT_HEALTH_PATH in packages/core/src/services-status/memory-probe.ts:18. | ||
| # Do NOT change to /health — that endpoint does not exist on this service. | ||
| path: /agentmemory/livez | ||
| port: 3111 | ||
| initialDelaySeconds: 5 | ||
| periodSeconds: 5 | ||
| timeoutSeconds: 3 | ||
| failureThreshold: 6 | ||
| livenessProbe: | ||
| httpGet: | ||
| # D15 (LOAD-BEARING): same path as readinessProbe. | ||
| path: /agentmemory/livez | ||
| port: 3111 | ||
| initialDelaySeconds: 30 | ||
| periodSeconds: 20 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 3 | ||
| resources: | ||
| requests: | ||
| cpu: "50m" | ||
| memory: "256Mi" | ||
| limits: | ||
| cpu: "500m" | ||
| memory: "1Gi" | ||
| volumes: | ||
| - name: memory-data | ||
| persistentVolumeClaim: | ||
| claimName: olam-memory-data | ||
| - name: tmp | ||
| emptyDir: {} | ||
| - name: iii-config-override | ||
| configMap: | ||
| name: olam-memory-service-iii-config |
| # ClusterIP Service for olam-memory-service. | ||
| # Port 3111 — consumed by host-cp and agents via cluster-internal DNS. | ||
| # Operator surfaces externally via: | ||
| # kubectl port-forward -n olam svc/olam-memory-service 3111:3111 | ||
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: olam-memory-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-memory-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| type: ClusterIP | ||
| selector: | ||
| app: olam-memory-service | ||
| ports: | ||
| - name: http | ||
| port: 3111 | ||
| targetPort: 3111 | ||
| protocol: TCP |
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: olam-plan-chat-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-plan-chat-service | ||
| olam.io/component: peripheral |
| # plan-chat-service does not need to read or write any Kubernetes API objects. | ||
| # A no-op Role + RoleBinding documents the minimal-privilege stance and | ||
| # keeps the file present so audit:cli-bundle-k8s does not skip this peripheral. | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: olam-plan-chat-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-plan-chat-service | ||
| olam.io/component: peripheral | ||
| rules: [] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: olam-plan-chat-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-plan-chat-service | ||
| olam.io/component: peripheral | ||
| roleRef: | ||
| apiGroup: rbac.authorization.k8s.io | ||
| kind: Role | ||
| name: olam-plan-chat-service | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: olam-plan-chat-service | ||
| namespace: olam |
| # ConfigMap for olam-plan-chat-service. | ||
| # | ||
| # plan-chat-service.mjs (packages/host-cp/src/plan-chat-service.mjs) reads | ||
| # these env vars at startup. See the file header for the canonical names. | ||
| # | ||
| # DATABASE_URL: deliberately NOT set in this ConfigMap. It points at the | ||
| # in-cluster chunks-postgres StatefulSet's Service and embeds | ||
| # the password from olam-chunks-postgres-secret using the | ||
| # env-var substitution syntax `$(POSTGRES_PASSWORD)`. | ||
| # | ||
| # kubelet only expands `$(VAR)` references in values declared | ||
| # in the container's env: section, not in values loaded from a | ||
| # ConfigMap key. So OLAM_PLAN_CHAT_DATABASE_URL is kept OUT of | ||
| # this ConfigMap and assembled in the Deployment's env: section | ||
| # instead, where POSTGRES_PASSWORD is first injected from the | ||
| # Secret via valueFrom.secretKeyRef and then referenced as | ||
| # $(POSTGRES_PASSWORD). See 50-deployment.yaml. | ||
| # | ||
| # ELECTRIC_URL: chunks-electric ClusterIP. No auth (ELECTRIC_INSECURE=true on | ||
| # that service in local-dev mode). | ||
| # | ||
| # SECRET_PATH: filesystem path where the olam-plan-chat-secret Secret is | ||
| # mounted (see volumeMounts in 50-deployment.yaml). The mount | ||
| # key is "secret" → file `/etc/olam-plan-chat/secret`. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-plan-chat-service-env | ||
| namespace: olam | ||
| labels: | ||
| app: olam-plan-chat-service | ||
| olam.io/component: peripheral | ||
| data: | ||
| OLAM_PLAN_CHAT_PORT: "3200" | ||
| OLAM_PLAN_CHAT_ELECTRIC_URL: "http://olam-chunks-electric.olam.svc.cluster.local:3000" | ||
| OLAM_PLAN_CHAT_SECRET_PATH: "/etc/olam-plan-chat/secret" |
| # PersistentVolumeClaim for olam-plan-chat-service /data volume. | ||
| # | ||
| # plan-chat-service is mostly stateless (DB lives in chunks-postgres, secret | ||
| # lives in olam-plan-chat-secret), but ships a /data PVC for parity with | ||
| # the other peripherals. Used for any transient state the service decides | ||
| # to spool (e.g. planning-session resumption buffers). | ||
| # | ||
| # local-path StorageClass ships with k3d by default. On non-k3d clusters, | ||
| # substitute storageClassName with your cluster's provisioner. | ||
| apiVersion: v1 | ||
| kind: PersistentVolumeClaim | ||
| metadata: | ||
| name: olam-plan-chat-service-data | ||
| namespace: olam | ||
| labels: | ||
| app: olam-plan-chat-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| storageClassName: local-path | ||
| resources: | ||
| requests: | ||
| storage: 1Gi |
| # Deployment for olam-plan-chat-service. | ||
| # | ||
| # Image strategy: REUSES the olam-host-cp image. Per the package layout, | ||
| # plan-chat-service.mjs is a sibling under packages/host-cp/src/, and the | ||
| # host-cp image's WORKDIR=/app already contains it at /app/src/plan-chat-service.mjs. | ||
| # The single shared image avoids version-drift between the two binaries that | ||
| # share plan-chat-secret.mjs (bearer-auth logic), planning-sessions.mjs, | ||
| # crystallize-planning.mjs, and resolver.mjs. | ||
| # | ||
| # The command override replaces the host-cp default | ||
| # ENTRYPOINT (`node src/server.mjs`) with the plan-chat-service entrypoint. | ||
| # | ||
| # Image: pinned to the SAME digest as host-cp's 50-deployment.yaml. Refresh | ||
| # both in lockstep via scripts/refresh-manifest-digests.mjs on every release. | ||
| apiVersion: apps/v1 | ||
| kind: Deployment | ||
| metadata: | ||
| name: olam-plan-chat-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-plan-chat-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| replicas: 1 | ||
| strategy: | ||
| type: RollingUpdate | ||
| rollingUpdate: | ||
| maxSurge: 1 | ||
| maxUnavailable: 0 | ||
| selector: | ||
| matchLabels: | ||
| app: olam-plan-chat-service | ||
| template: | ||
| metadata: | ||
| labels: | ||
| app: olam-plan-chat-service | ||
| spec: | ||
| enableServiceLinks: false | ||
| imagePullSecrets: | ||
| - name: ghcr-pull | ||
| serviceAccountName: olam-plan-chat-service | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| runAsGroup: 1000 | ||
| fsGroup: 1000 | ||
| initContainers: | ||
| # chown-data: identical to memory-service pattern. The plan-chat-data RWO | ||
| # PVC mounts as root-owned on local-path; this brings it to 1000:1000. | ||
| - name: chown-data | ||
| image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsUser: 0 | ||
| runAsNonRoot: false | ||
| allowPrivilegeEscalation: false | ||
| command: ["chown", "-R", "1000:1000", "/data"] | ||
| volumeMounts: | ||
| - name: plan-chat-data | ||
| mountPath: /data | ||
| containers: | ||
| - name: olam-plan-chat-service | ||
| # Reuses the host-cp image (same source tree, same node_modules). | ||
| # Digest pinned in lockstep with packages/host-cp/k8s/manifests/50-deployment.yaml. | ||
| image: ghcr.io/pleri/olam-host-cp@sha256:20d84b6d490c633bc5a158b0f7f849152aba3cf1d2d45657360f627d8d41ec3f | ||
| imagePullPolicy: IfNotPresent | ||
| # Override the host-cp ENTRYPOINT. plan-chat-service.mjs exports | ||
| # startService(); we boot it via -e import-and-call. | ||
| command: ["node"] | ||
| args: | ||
| - "-e" | ||
| - "import('/app/src/plan-chat-service.mjs').then(m => m.startService()).catch(e => { console.error('[plan-chat-service]', e); process.exit(1); });" | ||
| workingDir: /app | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| allowPrivilegeEscalation: false | ||
| capabilities: | ||
| drop: ["ALL"] | ||
| ports: | ||
| - name: http | ||
| containerPort: 3200 | ||
| protocol: TCP | ||
| envFrom: | ||
| - configMapRef: | ||
| name: olam-plan-chat-service-env | ||
| env: | ||
| # DATABASE_URL composition. Same pattern as chunks-electric. | ||
| - name: POSTGRES_PASSWORD | ||
| valueFrom: | ||
| secretKeyRef: | ||
| name: olam-chunks-postgres-secret | ||
| key: POSTGRES_PASSWORD | ||
| - name: OLAM_PLAN_CHAT_DATABASE_URL | ||
| value: "postgres://postgres:$(POSTGRES_PASSWORD)@olam-chunks-postgres.olam.svc.cluster.local:5432/chunks" | ||
| volumeMounts: | ||
| - name: plan-chat-data | ||
| mountPath: /data | ||
| - name: plan-chat-secret | ||
| mountPath: /etc/olam-plan-chat | ||
| readOnly: true | ||
| readinessProbe: | ||
| httpGet: | ||
| path: /livez | ||
| port: 3200 | ||
| initialDelaySeconds: 10 | ||
| periodSeconds: 5 | ||
| timeoutSeconds: 3 | ||
| failureThreshold: 12 | ||
| livenessProbe: | ||
| httpGet: | ||
| path: /livez | ||
| port: 3200 | ||
| initialDelaySeconds: 60 | ||
| periodSeconds: 20 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 3 | ||
| resources: | ||
| requests: | ||
| cpu: "50m" | ||
| memory: "256Mi" | ||
| limits: | ||
| cpu: "500m" | ||
| memory: "1Gi" | ||
| volumes: | ||
| - name: plan-chat-data | ||
| persistentVolumeClaim: | ||
| claimName: olam-plan-chat-service-data | ||
| - name: plan-chat-secret | ||
| secret: | ||
| secretName: olam-plan-chat-secret | ||
| defaultMode: 0400 | ||
| items: | ||
| - key: PLAN_CHAT_SECRET | ||
| path: secret |
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: olam-plan-chat-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-plan-chat-service | ||
| olam.io/component: peripheral | ||
| spec: | ||
| type: ClusterIP | ||
| selector: | ||
| app: olam-plan-chat-service | ||
| ports: | ||
| - name: http | ||
| port: 3200 | ||
| targetPort: 3200 | ||
| protocol: TCP |
| # Secret TEMPLATE for olam-host-cp. | ||
| # | ||
| # This file is a TEMPLATE — it MUST NOT be applied directly without substituting | ||
| # the placeholder values. The placeholders are intentionally invalid; a raw | ||
| # `kubectl apply` will result in auth-service 401s rather than silently shipping | ||
| # fake credentials. | ||
| # | ||
| # Preferred substitution (keeps secrets out of git): | ||
| # kubectl create secret generic olam-host-cp-secret -n olam \ | ||
| # --from-literal=OLAM_AUTH_SECRET="$(cat ~/.olam/auth-secret)" \ | ||
| # --from-literal=GH_TOKEN="$(gh auth token)" \ | ||
| # --dry-run=client -o yaml | kubectl apply -f - | ||
| # | ||
| # This template lives in packages/host-cp/k8s/templates/ (NOT manifests/) | ||
| # so that `kubectl apply -f manifests/` does NOT apply it — operators must | ||
| # explicitly handle Secret provisioning before applying the manifests. | ||
| apiVersion: v1 | ||
| kind: Secret | ||
| metadata: | ||
| name: olam-host-cp-secret | ||
| namespace: olam | ||
| labels: | ||
| app: olam-host-cp | ||
| olam.io/component: host-stack | ||
| type: Opaque | ||
| stringData: | ||
| # Shared bearer secret between host-cp and the long-lived olam-auth process. | ||
| # Source: cat ~/.olam/auth-secret | ||
| OLAM_AUTH_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_AUTH_SECRET" | ||
| # GitHub token for GHCR image pulls and the /api/prs endpoint. | ||
| # Source: gh auth token | ||
| GH_TOKEN: "REPLACE_ME_FROM_GH_AUTH_TOKEN" |
| # Secret TEMPLATE for olam-auth-service. | ||
| # | ||
| # This file is a TEMPLATE — it MUST NOT be applied directly without substituting | ||
| # the placeholder values. The placeholders are intentionally invalid; a raw | ||
| # `kubectl apply` will result in auth failures rather than silently shipping | ||
| # fake credentials. | ||
| # | ||
| # Preferred substitution (keeps secrets out of git): | ||
| # kubectl create secret generic olam-auth-service-secret -n olam \ | ||
| # --from-literal=OLAM_AUTH_DB_SECRET="$(cat ~/.olam/auth-db-secret)" \ | ||
| # --dry-run=client -o yaml | kubectl apply -f - | ||
| # | ||
| # This template lives in packages/host-cp/k8s/templates/ (NOT manifests/) | ||
| # so that `kubectl apply -f manifests/auth-service/` does NOT apply it — | ||
| # operators must explicitly handle Secret provisioning before applying manifests. | ||
| apiVersion: v1 | ||
| kind: Secret | ||
| metadata: | ||
| name: olam-auth-service-secret | ||
| namespace: olam | ||
| labels: | ||
| app: olam-auth-service | ||
| olam.io/component: peripheral | ||
| type: Opaque | ||
| stringData: | ||
| # Shared database encryption secret for the credential vault. | ||
| # Source: cat ~/.olam/auth-db-secret | ||
| OLAM_AUTH_DB_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_AUTH_DB_SECRET" |
| # Secret TEMPLATE for olam-chunks-postgres. | ||
| # | ||
| # The CLI generates a random 64-char hex POSTGRES_PASSWORD on first apply | ||
| # (via k8s-secret-render.ts generate-if-missing). The Secret is consumed by: | ||
| # - chunks-postgres StatefulSet (envFrom → POSTGRES_PASSWORD) | ||
| # - chunks-electric Deployment (env: valueFrom.secretKeyRef) | ||
| # - plan-chat-service Deployment (env: valueFrom.secretKeyRef) | ||
| # | ||
| # All three read the SAME value from this one Secret. The secret-renderer | ||
| # persists generated values in ~/.olam/k8s-secrets-state.json, so reapply | ||
| # is idempotent (no rotation unless --rotate-secrets). | ||
| apiVersion: v1 | ||
| kind: Secret | ||
| metadata: | ||
| name: olam-chunks-postgres-secret | ||
| namespace: olam | ||
| labels: | ||
| app: olam-chunks-postgres | ||
| olam.io/component: substrate | ||
| type: Opaque | ||
| stringData: | ||
| # Postgres superuser password. Generated by the CLI's secret-renderer on | ||
| # first apply (no host-side file to read; this is in-cluster-only state). | ||
| POSTGRES_PASSWORD: "REPLACE_ME_GENERATE_RANDOM_HEX" |
| # Secret TEMPLATE for olam-kg-service. | ||
| # | ||
| # This file is a TEMPLATE — it MUST NOT be applied directly without substituting | ||
| # the placeholder values. The placeholders are intentionally invalid; a raw | ||
| # `kubectl apply` will result in auth failures rather than silently shipping | ||
| # fake credentials. | ||
| # | ||
| # Preferred substitution (keeps secrets out of git): | ||
| # kubectl create secret generic olam-kg-service-secret -n olam \ | ||
| # --from-literal=OLAM_KG_BEARER_TOKEN="$(cat ~/.olam/kg-bearer-token)" \ | ||
| # --dry-run=client -o yaml | kubectl apply -f - | ||
| # | ||
| # This template lives in packages/host-cp/k8s/templates/ (NOT manifests/) | ||
| # so that `kubectl apply -f manifests/kg-service/` does NOT apply it — | ||
| # operators must explicitly handle Secret provisioning before applying manifests. | ||
| apiVersion: v1 | ||
| kind: Secret | ||
| metadata: | ||
| name: olam-kg-service-secret | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| olam.io/component: peripheral | ||
| type: Opaque | ||
| stringData: | ||
| # Bearer token for in-cluster KG query authentication. | ||
| # Source: cat ~/.olam/kg-bearer-token | ||
| OLAM_KG_BEARER_TOKEN: "REPLACE_ME_FROM_HOME_DOTOLAM_KG_BEARER_TOKEN" |
| # Secret TEMPLATE for olam-mcp-auth-service. | ||
| # | ||
| # This file is a TEMPLATE — it MUST NOT be applied directly without substituting | ||
| # the placeholder values. The placeholders are intentionally invalid; a raw | ||
| # `kubectl apply` will result in auth failures rather than silently shipping | ||
| # fake credentials. | ||
| # | ||
| # Preferred substitution (keeps secrets out of git): | ||
| # kubectl create secret generic olam-mcp-auth-service-secret -n olam \ | ||
| # --from-literal=OLAM_MCP_AUTH_JWT_SECRET="$(cat ~/.olam/mcp-auth-jwt-secret)" \ | ||
| # --dry-run=client -o yaml | kubectl apply -f - | ||
| # | ||
| # This template lives in packages/host-cp/k8s/templates/ (NOT manifests/) | ||
| # so that `kubectl apply -f manifests/mcp-auth-service/` does NOT apply it — | ||
| # operators must explicitly handle Secret provisioning before applying manifests. | ||
| apiVersion: v1 | ||
| kind: Secret | ||
| metadata: | ||
| name: olam-mcp-auth-service-secret | ||
| namespace: olam | ||
| labels: | ||
| app: olam-mcp-auth-service | ||
| olam.io/component: peripheral | ||
| type: Opaque | ||
| stringData: | ||
| # JWT signing secret for MCP client authentication. | ||
| # Source: cat ~/.olam/mcp-auth-jwt-secret | ||
| OLAM_MCP_AUTH_JWT_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_MCP_AUTH_JWT_SECRET" |
| # Secret TEMPLATE for olam-memory-service. | ||
| # | ||
| # This file is a TEMPLATE — it MUST NOT be applied directly without substituting | ||
| # the placeholder values. The placeholders are intentionally invalid; a raw | ||
| # `kubectl apply` will result in auth failures rather than silently shipping | ||
| # fake credentials. | ||
| # | ||
| # Preferred substitution (keeps secrets out of git): | ||
| # kubectl create secret generic olam-memory-service-secret -n olam \ | ||
| # --from-literal=OLAM_MEMORY_BEARER_SECRET="$(cat ~/.olam/memory-bearer-secret)" \ | ||
| # --dry-run=client -o yaml | kubectl apply -f - | ||
| # | ||
| # This template lives in packages/host-cp/k8s/templates/ (NOT manifests/) | ||
| # so that `kubectl apply -f manifests/memory-service/` does NOT apply it — | ||
| # operators must explicitly handle Secret provisioning before applying manifests. | ||
| apiVersion: v1 | ||
| kind: Secret | ||
| metadata: | ||
| name: olam-memory-service-secret | ||
| namespace: olam | ||
| labels: | ||
| app: olam-memory-service | ||
| olam.io/component: peripheral | ||
| type: Opaque | ||
| stringData: | ||
| # Bearer secret for the memory-service HTTP API (matches OLAM_MEMORY_BEARER_SECRET | ||
| # used by host-cp and agents that call the memory endpoints). | ||
| # Source: cat ~/.olam/memory-bearer-secret | ||
| OLAM_MEMORY_BEARER_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_MEMORY_BEARER_SECRET" |
| # Secret TEMPLATE for olam-plan-chat-secret. | ||
| # | ||
| # This file is a TEMPLATE — it MUST NOT be applied directly without substituting | ||
| # the placeholder values. The placeholders are intentionally invalid; a raw | ||
| # `kubectl apply` will result in auth failures rather than silently shipping | ||
| # fake credentials. | ||
| # | ||
| # Preferred substitution (keeps secrets out of git): | ||
| # kubectl create secret generic olam-plan-chat-secret -n olam \ | ||
| # --from-literal=PLAN_CHAT_SECRET="$(cat ~/.olam/plan-chat-secret)" \ | ||
| # --dry-run=client -o yaml | kubectl apply -f - | ||
| # | ||
| # This template lives in packages/host-cp/k8s/templates/ (NOT manifests/) | ||
| # so that `kubectl apply -f manifests/plan-chat-service/` does NOT apply it — | ||
| # operators must explicitly handle Secret provisioning before applying manifests. | ||
| # | ||
| # Architecture: this Secret is mounted by BOTH the host-cp pod (so its | ||
| # renderSpaShell can inject window.__OLAM_PLAN_CHAT_BEARER__) AND the | ||
| # plan-chat-service pod (so its bearer-auth gate timing-safe-compares incoming | ||
| # Authorization: Bearer headers against the same value). One source-of-truth, | ||
| # two readers — replaces the previous "/data/plan-chat-secret in host-cp PVC" | ||
| # pattern that couldn't be shared across pods (RWO PVC). | ||
# Opaque Secret carrying the shared plan-chat bearer token.
# The value below is a placeholder and must be substituted before use.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: olam-plan-chat-secret
  namespace: olam
  labels:
    olam.io/component: substrate
stringData:
  # Shared bearer secret guarding plan-chat-service's POST /v1/chunks and
  # GET /v1/shape endpoints. host-cp injects the same value into
  # window.__OLAM_PLAN_CHAT_BEARER__.
  # Source: cat ~/.olam/plan-chat-secret
  PLAN_CHAT_SECRET: "REPLACE_ME_FROM_HOME_DOTOLAM_PLAN_CHAT_SECRET"
| // classifyStartupFailure — pure mapping from evidence shape to bucket. | ||
| // | ||
| // Precedence rules (walked top-down; first match wins): | ||
| // | ||
| // 1. processExitCode !== undefined → ProviderProcessGone | ||
| // The agent process is dead; nothing else matters. This is the | ||
| // highest-confidence signal because it's observable from outside | ||
| // the container (docker exit code, child_process exit). | ||
| // | ||
| // 2. pluginErrors.length > 0 → PluginStartupFailed | ||
| // Boot-time stderr from a plugin/skill source is definitive. | ||
| // Comes before transport/handshake checks because a failed | ||
| // plugin can leave transport+mcp in 'pending' permanently. | ||
| // | ||
| // 3. transportStatus === 'failed' → TransportDead | ||
| // Channel-open never succeeded — agent is alive but unreachable. | ||
| // | ||
| // 4. mcpHandshakeStatus === 'failed' → McpHandshakeStall | ||
| // Channel opened, MCP handshake explicitly failed. | ||
| // | ||
| // 5. mcpHandshakeStatus === 'pending' | ||
| // AND elapsedSecondsSinceCreation > 30 → McpHandshakeStall | ||
| // Time-bounded inference: a never-completed handshake after 30s | ||
| // is the stall signal even without an explicit failure marker. | ||
| // | ||
| // 6. lastPhase === 'TrustRequired' | ||
| // AND elapsedSecondsSinceCreation > 10 → TrustGateUnanswered | ||
| // Agent reached the trust gate; no approval ever came back. | ||
| // 10s is the operator's attention budget — past that, the | ||
| // agent is silently stuck on a human gate. | ||
| // | ||
| // 7. promptSentAt !== undefined | ||
| // AND firstThoughtAt === undefined → PromptMisdelivery | ||
| // Dispatch landed on the host side but the agent never produced | ||
| // a first thought — the prompt didn't reach the agent process. | ||
| // | ||
| // 8. lastPhase === 'TrustRequired' → TrustGateUnanswered (fallback) | ||
| // Stuck at the trust gate even under 10s — still the most likely | ||
| // explanation for a Failed transition from that phase. | ||
| // | ||
| // 9. fallthrough → PromptMisdelivery | ||
| // The classifier is total: every Failed transition gets a bucket. | ||
| // PromptMisdelivery is the most operator-actionable "we don't | ||
| // know why but the dispatch path is the prime suspect" default. | ||
| // | ||
| // Tests in __tests__/classify.test.mjs assert exactly one case per | ||
| // bucket. The function is pure: no I/O, no side effects, deterministic | ||
| // — same evidence in always yields the same bucket out. | ||
| import { WorldStartupFailureKind } from './failure-kinds.mjs'; | ||
/** A 'pending' MCP handshake older than this many seconds is treated as a stall. */
const MCP_HANDSHAKE_STALL_THRESHOLD_SECONDS = 30;
/** A world sitting in 'TrustRequired' longer than this many seconds is treated as unanswered. */
const TRUST_GATE_UNANSWERED_THRESHOLD_SECONDS = 10;

/**
 * Map a WorldStartupEvidence bundle to its WorldStartupFailureKind.
 *
 * Rules are evaluated top-down and the first match wins. The function performs
 * no I/O and never mutates `evidence`. A bundle whose `pluginErrors` field is
 * missing is treated as having no plugin errors, so a partially assembled
 * bundle still gets a bucket instead of a TypeError.
 *
 * @param {import('./evidence.mjs').WorldStartupEvidence} evidence
 * @returns {import('./failure-kinds.mjs').WorldStartupFailureKind}
 */
export function classifyStartupFailure(evidence) {
  // 1. Process exited — terminal signal, short-circuits all other checks.
  if (evidence.processExitCode !== undefined) {
    return WorldStartupFailureKind.ProviderProcessGone;
  }

  // 2. Plugin boot errors — definitive boot-time failure.
  //    `?.`/`??` keep the classifier total when pluginErrors was never set.
  const pluginErrorCount = evidence.pluginErrors?.length ?? 0;
  if (pluginErrorCount > 0) {
    return WorldStartupFailureKind.PluginStartupFailed;
  }

  // 3. Transport explicitly failed — agent alive but unreachable.
  if (evidence.transportStatus === 'failed') {
    return WorldStartupFailureKind.TransportDead;
  }

  // 4. MCP handshake explicitly failed.
  if (evidence.mcpHandshakeStatus === 'failed') {
    return WorldStartupFailureKind.McpHandshakeStall;
  }

  // 5. MCP handshake pending past threshold — inferred stall.
  if (
    evidence.mcpHandshakeStatus === 'pending' &&
    evidence.elapsedSecondsSinceCreation > MCP_HANDSHAKE_STALL_THRESHOLD_SECONDS
  ) {
    return WorldStartupFailureKind.McpHandshakeStall;
  }

  // 6. Stuck on trust gate past operator-attention threshold.
  if (
    evidence.lastPhase === 'TrustRequired' &&
    evidence.elapsedSecondsSinceCreation > TRUST_GATE_UNANSWERED_THRESHOLD_SECONDS
  ) {
    return WorldStartupFailureKind.TrustGateUnanswered;
  }

  // 7. Prompt sent but agent never produced a first thought.
  if (evidence.promptSentAt !== undefined && evidence.firstThoughtAt === undefined) {
    return WorldStartupFailureKind.PromptMisdelivery;
  }

  // 8. Still at trust gate under threshold — bucket as trust-gate.
  if (evidence.lastPhase === 'TrustRequired') {
    return WorldStartupFailureKind.TrustGateUnanswered;
  }

  // 9. Total-function fallback.
  return WorldStartupFailureKind.PromptMisdelivery;
}
| // recordWorldLifecycle — the single broadcast helper every host-cp | ||
| // surface uses to emit a lifecycle transition. | ||
| // | ||
| // Emits TWO event types on the host-stream: | ||
| // | ||
| // 1. event: 'world.lifecycle' → live SSE consumers (SPA, MCP, etc.). | ||
| // Shape: { worldId, phase, at, evidence?, failureKind? } | ||
| // | ||
| // 2. event: 'span' → NDJSON trace sink (PR #915 + follow-ups). | ||
| // Shape: { name: 'world.lifecycle', startedAt: at, endedAt: at, | ||
| // attributes: { worldId, phase, evidence?, failureKind? }, | ||
| // exit: { _tag: 'Success' | 'Failure', reason? } } | ||
| // | ||
| // The dual-emit keeps live consumers and trace consumers on the same | ||
| // substrate without either path coupling to the other. The README jq | ||
| // example `select(.name == "world.lifecycle" ...)` matches the span | ||
| // emission; the SPA's `useHostStream().subscribe('world.lifecycle', ...)` | ||
| // matches the live emission. | ||
| // | ||
| // Failed transitions auto-classify via classifyStartupFailure(evidence) | ||
| // when caller passes evidence but omits an explicit failureKind. Callers | ||
| // MAY provide their own failureKind to override the inference (e.g. | ||
| // docker SIGKILL — the caller knows it was ProviderProcessGone before | ||
| // the classifier could trip its time-thresholds). | ||
| import { TERMINAL_PHASES, WorldLifecyclePhase } from './phases.mjs'; | ||
| import { classifyStartupFailure } from './classify.mjs'; | ||
| import { redactSensitive } from '../observability/redactor.mjs'; | ||
| /** | ||
| * @typedef {object} HostStreamLike | ||
| * @property {(eventType: string, payload: unknown) => unknown} broadcast | ||
| */ | ||
| /** | ||
| * @typedef {object} WorldLifecycleEvent | ||
| * @property {string} worldId | ||
| * @property {import('./phases.mjs').WorldLifecyclePhase} phase | ||
| * @property {number} at | ||
| * @property {import('./evidence.mjs').WorldStartupEvidence} [evidence] | ||
| * @property {import('./failure-kinds.mjs').WorldStartupFailureKind} [failureKind] | ||
| */ | ||
/**
 * Emit a world lifecycle transition on both the `world.lifecycle` and `span`
 * host-stream channels.
 *
 * The live payload is `{ worldId, phase, at, evidence?, failureKind? }`. The
 * span payload mirrors it as a point-in-time span (`startedAt === endedAt`)
 * whose `exit` is `Failure` for the Failed phase and `Success` otherwise.
 * When the phase is Failed, evidence is supplied and no `failureKind` is
 * given, the kind is inferred with `classifyStartupFailure`.
 *
 * @param {HostStreamLike} hostStream
 * @param {object} args
 * @param {string} args.worldId
 * @param {import('./phases.mjs').WorldLifecyclePhase} args.phase
 * @param {number} [args.at] epoch timestamp; any non-finite or non-number
 *   value (including NaN and ±Infinity) falls back to `Date.now()`
 * @param {import('./evidence.mjs').WorldStartupEvidence} [args.evidence]
 * @param {import('./failure-kinds.mjs').WorldStartupFailureKind} [args.failureKind]
 * @returns {WorldLifecycleEvent} the payload that was broadcast (test convenience)
 * @throws {TypeError} when `hostStream.broadcast` is not a function, or when
 *   `worldId` / `phase` are missing
 */
export function recordWorldLifecycle(hostStream, args) {
  if (!hostStream || typeof hostStream.broadcast !== 'function') {
    throw new TypeError('recordWorldLifecycle: hostStream.broadcast is required');
  }
  if (typeof args?.worldId !== 'string' || args.worldId.length === 0) {
    throw new TypeError('recordWorldLifecycle: worldId is required');
  }
  if (typeof args?.phase !== 'string') {
    throw new TypeError('recordWorldLifecycle: phase is required');
  }

  // `typeof NaN === 'number'`, so a plain typeof check would let NaN/Infinity
  // through and stamp the event with an unusable time. Number.isFinite does
  // not coerce, so non-numbers still fall back to Date.now() as before.
  const at = Number.isFinite(args.at) ? args.at : Date.now();

  // Resolve failureKind: explicit override > classifier inference > undefined.
  let failureKind = args.failureKind;
  if (
    failureKind === undefined &&
    args.phase === WorldLifecyclePhase.Failed &&
    args.evidence !== undefined
  ) {
    failureKind = classifyStartupFailure(args.evidence);
  }

  /** @type {WorldLifecycleEvent} */
  const livePayload = {
    worldId: args.worldId,
    phase: args.phase,
    at,
  };
  // NOTE(review): evidence is redacted once per payload. Whether
  // redactSensitive returns a fresh object is not visible here, so the two
  // calls are kept separate rather than sharing one result.
  if (args.evidence !== undefined) livePayload.evidence = redactSensitive(args.evidence);
  if (failureKind !== undefined) livePayload.failureKind = failureKind;
  hostStream.broadcast('world.lifecycle', livePayload);

  // Mirror as a span so the NDJSON trace sink (PR #915) records it.
  // Lifecycle transitions are point-in-time events — startedAt === endedAt.
  /** @type {Record<string, unknown>} */
  const spanAttributes = {
    worldId: args.worldId,
    phase: args.phase,
  };
  if (args.evidence !== undefined) spanAttributes.evidence = redactSensitive(args.evidence);
  if (failureKind !== undefined) spanAttributes.failureKind = failureKind;

  /** @type {{ _tag: 'Success' | 'Failure', reason?: string }} */
  const exit =
    args.phase === WorldLifecyclePhase.Failed
      ? { _tag: 'Failure', reason: failureKind ?? 'unclassified' }
      : { _tag: 'Success' };

  hostStream.broadcast('span', {
    name: 'world.lifecycle',
    startedAt: at,
    endedAt: at,
    attributes: spanAttributes,
    exit,
  });

  return livePayload;
}
| /** Re-export so callers don't need to import both modules. */ | ||
| export { WorldLifecyclePhase, TERMINAL_PHASES }; |
| // WorldStartupEvidence — the typed bundle the classifier consumes. | ||
| // | ||
| // Every Failed lifecycle transition carries one of these. Fields are | ||
| // strict-optional (undefined, not null) so consumers can use the | ||
| // presence/absence as a signal directly (a set `promptSentAt` together with | ||
| // `firstThoughtAt === undefined` is itself the PromptMisdelivery signal). | ||
| /** | ||
| * @typedef {'pending' | 'ok' | 'failed'} HandshakeStatus | ||
| */ | ||
| /** | ||
| * @typedef {object} WorldStartupEvidence | ||
| * @property {string} worldId | ||
| * @property {import('./phases.mjs').WorldLifecyclePhase} lastPhase | ||
| * @property {number} lastPhaseAt epoch ms | ||
| * @property {number} [promptSentAt] undefined if no dispatch ever sent | ||
| * @property {number} [firstThoughtAt] undefined if no thoughts ever produced | ||
| * @property {HandshakeStatus} mcpHandshakeStatus | ||
| * @property {HandshakeStatus} transportStatus | ||
| * @property {string[]} pluginErrors captured stderr lines from plugin boot | ||
| * @property {number} [processExitCode] | ||
| * @property {number} elapsedSecondsSinceCreation | ||
| */ | ||
/**
 * Build the starting evidence bundle for a world that has just been spawned.
 *
 * The bundle starts in the 'Spawning' phase with both the MCP handshake and
 * the transport marked 'pending', no plugin errors, and zero elapsed seconds.
 * The optional fields (`promptSentAt`, `firstThoughtAt`, `processExitCode`)
 * are left absent.
 *
 * @param {string} worldId
 * @param {number} [now] timestamp stored as `lastPhaseAt`; defaults to `Date.now()`
 * @returns {WorldStartupEvidence}
 */
export function emptyEvidence(worldId, now = Date.now()) {
  const phaseState = { lastPhase: 'Spawning', lastPhaseAt: now };
  const channelState = { mcpHandshakeStatus: 'pending', transportStatus: 'pending' };

  /** @type {WorldStartupEvidence} */
  const evidence = {
    worldId,
    ...phaseState,
    ...channelState,
    pluginErrors: [],
    elapsedSecondsSinceCreation: 0,
  };
  return evidence;
}
| // World startup failure buckets — the six canonical classes the | ||
| // classifier maps every observed Failed transition into. | ||
| // | ||
| // Declaration order here is NOT the classifier's precedence. The classifier | ||
| // in classify.mjs walks its own documented rule list, which starts with | ||
| // ProviderProcessGone and PluginStartupFailed and ends with | ||
| // PromptMisdelivery as the total-function fallback. Adding a 7th | ||
| // bucket requires updating the classifier precedence and the | ||
| // `world.lifecycle.Failed` consumers in the SPA + NDJSON trace. | ||
| /** | ||
| * @typedef {| 'PromptMisdelivery' | ||
| * | 'TransportDead' | ||
| * | 'TrustGateUnanswered' | ||
| * | 'McpHandshakeStall' | ||
| * | 'PluginStartupFailed' | ||
| * | 'ProviderProcessGone'} WorldStartupFailureKind | ||
| */ | ||
/**
 * Frozen name → value record of every startup failure bucket. Each key maps
 * to the identical string.
 *
 * @type {Readonly<Record<WorldStartupFailureKind, WorldStartupFailureKind>>}
 */
export const WorldStartupFailureKind = Object.freeze({
  /** Dispatch sent but agent never received it (transport mismatch). */
  PromptMisdelivery: 'PromptMisdelivery',
  /** stdin/stdout/IPC channel never opened. */
  TransportDead: 'TransportDead',
  /** Agent reached TrustRequired, no approval ever arrived. */
  TrustGateUnanswered: 'TrustGateUnanswered',
  /** MCP server connection initialized but never completed handshake. */
  McpHandshakeStall: 'McpHandshakeStall',
  /** Plugin or skill source failed to load on boot. */
  PluginStartupFailed: 'PluginStartupFailed',
  /** Agent (Claude Code) process exited before responding. */
  ProviderProcessGone: 'ProviderProcessGone',
});

/**
 * Frozen list of the failure kinds in the declaration order of the record
 * above (string keys enumerate in insertion order).
 */
export const WORLD_STARTUP_FAILURE_KIND_ORDER = Object.freeze(
  Object.values(WorldStartupFailureKind),
);
/**
 * Type guard: true only for strings that appear in
 * WORLD_STARTUP_FAILURE_KIND_ORDER.
 *
 * @param {unknown} value
 * @returns {value is WorldStartupFailureKind}
 */
export function isWorldStartupFailureKind(value) {
  if (typeof value !== 'string') {
    return false;
  }
  return WORLD_STARTUP_FAILURE_KIND_ORDER.some((kind) => kind === value);
}
| // Barrel re-export for the lifecycle module. Importers should pull | ||
| // from '@olam/host-cp/lifecycle' (or the relative path equivalent) | ||
| // rather than reaching into individual files. | ||
| export { | ||
| WorldLifecyclePhase, | ||
| WORLD_LIFECYCLE_PHASE_ORDER, | ||
| TERMINAL_PHASES, | ||
| isWorldLifecyclePhase, | ||
| } from './phases.mjs'; | ||
| export { | ||
| WorldStartupFailureKind, | ||
| WORLD_STARTUP_FAILURE_KIND_ORDER, | ||
| isWorldStartupFailureKind, | ||
| } from './failure-kinds.mjs'; | ||
| export { emptyEvidence } from './evidence.mjs'; | ||
| export { classifyStartupFailure } from './classify.mjs'; | ||
| export { recordWorldLifecycle } from './emit.mjs'; |
| // World lifecycle phases — the canonical FSM every Olam world walks | ||
| // through from spawn to terminal state. | ||
| // | ||
| // Order is load-bearing: a world's `lastPhase` is a monotonic high-water | ||
| // mark, and the classifier's precedence rules in classify.mjs assume | ||
| // this ordering when deciding which failure bucket to attribute a stall | ||
| // to. Do NOT reorder without updating the classifier. | ||
| /** | ||
| * @typedef {'Spawning' | 'TrustRequired' | 'ReadyForPrompt' | 'Running' | 'Finished' | 'Failed'} WorldLifecyclePhase | ||
| */ | ||
/**
 * Frozen name → value record of every lifecycle phase. Each key maps to the
 * identical string.
 *
 * @type {Readonly<Record<WorldLifecyclePhase, WorldLifecyclePhase>>}
 */
export const WorldLifecyclePhase = Object.freeze({
  /** Container or worktree created; before any code runs inside. */
  Spawning: 'Spawning',
  /** Agent process up; awaiting trust-gate approval. */
  TrustRequired: 'TrustRequired',
  /** Trust granted; awaiting initial dispatch. */
  ReadyForPrompt: 'ReadyForPrompt',
  /** Actively processing dispatch. */
  Running: 'Running',
  /** Completed successfully. Terminal. */
  Finished: 'Finished',
  /** Terminal failure. Carries an evidence bundle + classified failure kind. */
  Failed: 'Failed',
});

/** Phases in canonical order. Useful for ordinal comparison. */
export const WORLD_LIFECYCLE_PHASE_ORDER = Object.freeze([
  WorldLifecyclePhase.Spawning,
  WorldLifecyclePhase.TrustRequired,
  WorldLifecyclePhase.ReadyForPrompt,
  WorldLifecyclePhase.Running,
  WorldLifecyclePhase.Finished,
  WorldLifecyclePhase.Failed,
]);

/**
 * Build a Set whose `add` / `delete` / `clear` methods throw.
 *
 * `Object.freeze` alone only locks an object's own properties. A Set keeps
 * its members in internal state that freeze does not touch, so a merely
 * frozen Set can still be mutated. Shadowing the mutators on the instance
 * closes that gap for ordinary method calls. It cannot stop a deliberate
 * `Set.prototype.add.call(set, x)`.
 *
 * @param {Iterable<string>} values initial (and only) members
 * @param {string} label name used in the thrown error message
 * @returns {ReadonlySet<string>}
 */
function readOnlySet(values, label) {
  const members = new Set(values);
  const reject = () => {
    throw new TypeError(`${label} is read-only`);
  };
  for (const mutator of ['add', 'delete', 'clear']) {
    // defineProperty defaults make the shadowing method non-writable and
    // non-configurable.
    Object.defineProperty(members, mutator, { value: reject });
  }
  return Object.freeze(members);
}

/** Terminal phases — no transitions out. Read-only: mutator calls throw. */
export const TERMINAL_PHASES = readOnlySet(
  [WorldLifecyclePhase.Finished, WorldLifecyclePhase.Failed],
  'TERMINAL_PHASES',
);
/**
 * Type guard: true only for strings that appear in
 * WORLD_LIFECYCLE_PHASE_ORDER.
 *
 * @param {unknown} value
 * @returns {value is WorldLifecyclePhase}
 */
export function isWorldLifecyclePhase(value) {
  if (typeof value !== 'string') {
    return false;
  }
  return WORLD_LIFECYCLE_PHASE_ORDER.some((phase) => phase === value);
}
| #!/usr/bin/env bash | ||
| # grafana-port-forward.sh — e2e smoke test: Grafana installs via Helm, | ||
| # port-forward is accessible, Loki datasource | ||
| # is pre-wired and reachable. | ||
| # | ||
| # Usage: scripts/e2e/grafana-port-forward.sh | ||
| # | ||
| # Pre-conditions: | ||
| # - kubectl context is set to a live k8s cluster (does NOT spin up k3d) | ||
| # - helm binary available | ||
| # - jq binary available | ||
| # - grafana Helm repo added (helm repo add grafana https://grafana.github.io/helm-charts) | ||
| # - Loki is already installed (scripts/e2e/loki-ingest.sh ran successfully | ||
| # OR `helm status olam-loki -n monitoring` is healthy) | ||
| # | ||
| # Idempotency: `helm upgrade --install` is idempotent; re-runs succeed on an | ||
| # existing cluster. The Secret is applied via --dry-run | kubectl apply. | ||
| # On re-runs the existing admin password is reused rather than rotated, | ||
| # because Grafana keeps the password from its first deploy (see Step 1). | ||
| # The olam-dashboards ConfigMap is applied before helm install so | ||
| # Grafana's volume mount finds the ConfigMap on first boot. | ||
| # | ||
| # Cleanup: port-forward is killed on exit; Helm release is left in place so | ||
| # downstream tasks can reuse the same cluster. | ||
| # | ||
| # Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B2, B3 | ||
| # Chart: grafana/grafana 8.5.2 (pinned; latest stable 2026-05-20) | ||
| set -euo pipefail | ||
# Fixed settings for this smoke test.
NAMESPACE="monitoring"
GRAFANA_RELEASE="olam-grafana"
GRAFANA_CHART_VERSION="8.5.2"
LOCAL_PORT="3000"
GRAFANA_SVC_PORT="80"
PF_BIND_SECONDS=5

# log MESSAGE... — write a prefixed progress line to stderr.
log() {
  printf '[grafana-port-forward] %s\n' "$*" >&2
}

# fail MESSAGE... — write a prefixed FAIL line to stderr and exit with status 1.
fail() {
  printf '[grafana-port-forward] FAIL: %s\n' "$*" >&2
  exit 1
}
| # ------------------------------------------------------------------------- | ||
| # Cleanup trap — kill port-forward on exit; leave Helm release in place | ||
| # ------------------------------------------------------------------------- | ||
# PID of the background port-forward; empty until Step 5 starts it.
PF_PID=""

# cleanup — EXIT handler that stops the port-forward if it is still running.
cleanup() {
  # Nothing was started yet.
  [[ -n "$PF_PID" ]] || return 0
  # Signal only a live process; losing the race to its exit is not an error.
  if kill -0 "$PF_PID" 2>/dev/null; then
    kill "$PF_PID" 2>/dev/null || true
  fi
}
trap cleanup EXIT
| # ------------------------------------------------------------------------- | ||
| # Pre-flight | ||
| # ------------------------------------------------------------------------- | ||
# Every tool the script shells out to must be on PATH before any work starts.
for required_tool in helm kubectl curl openssl; do
  command -v "$required_tool" >/dev/null 2>&1 || fail "$required_tool not installed"
done
command -v jq >/dev/null 2>&1 || fail "jq not installed (required for B3 dashboard assertion)"

# The current kubectl context must point at a reachable cluster.
if ! kubectl cluster-info >/dev/null 2>&1; then
  fail "kubectl: no reachable cluster; set KUBECONFIG"
fi
log "pre-flight checks passed"
| # ------------------------------------------------------------------------- | ||
| # Ensure grafana Helm repo is present (idempotent — safe to re-run) | ||
| # ------------------------------------------------------------------------- | ||
# `helm repo add` errors are ignored so re-runs stay quiet; the update below
# still runs against the grafana repo either way.
helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
helm repo update grafana

# Verify Loki is already installed (B2 depends on B1).
helm status "olam-loki" -n "$NAMESPACE" >/dev/null 2>&1 \
  || fail "olam-loki Helm release not found in namespace $NAMESPACE — run scripts/e2e/loki-ingest.sh first"
log "Loki pre-condition satisfied (olam-loki release found)"
| # ------------------------------------------------------------------------- | ||
| # Step 1: Resolve admin password (preserve existing on idempotent re-run) | ||
| # ------------------------------------------------------------------------- | ||
| # Grafana persists the admin password in its internal SQLite on first | ||
| # deploy. Subsequent helm upgrades do NOT re-read GF_SECURITY_ADMIN_PASSWORD | ||
| # from the env (env value is set once at pod-start and not refreshed). So | ||
| # on a re-run, rotating the Secret leaves the in-Grafana password stale | ||
| # and breaks API auth. | ||
| # | ||
| # Idempotency contract: if the Secret already exists, reuse its current | ||
| # password. The Secret's value matches Grafana's stored value (set in | ||
| # concert on first install). Only generate a new password when the | ||
| # Secret doesn't exist yet — i.e. true first deploy. | ||
# First deploy: no Secret yet, so mint a password. Otherwise read the stored
# one back so it keeps matching what Grafana already holds.
if ! kubectl get secret olam-grafana-admin --namespace "$NAMESPACE" >/dev/null 2>&1; then
  log "generating fresh admin password (first deploy)"
  GRAFANA_ADMIN_PW="$(openssl rand -base64 24)"
else
  log "reusing existing admin password from Secret olam-grafana-admin"
  GRAFANA_ADMIN_PW="$(
    kubectl get secret olam-grafana-admin --namespace "$NAMESPACE" \
      --output jsonpath='{.data.admin-password}' | base64 -d
  )"
fi
export GRAFANA_ADMIN_PW
| # ------------------------------------------------------------------------- | ||
| # Step 2: Create / update the admin Secret idempotently | ||
| # ------------------------------------------------------------------------- | ||
log "applying Secret olam-grafana-admin in namespace $NAMESPACE"
# Render the Secret client-side, then pipe it through `kubectl apply` so the
# same command works for both create and update.
kubectl create secret generic olam-grafana-admin \
  --namespace "$NAMESPACE" \
  --from-literal=admin-user=admin \
  --from-literal=admin-password="$GRAFANA_ADMIN_PW" \
  --dry-run=client --output yaml |
  kubectl apply -f -
log "Secret applied"
| # ------------------------------------------------------------------------- | ||
| # Step 3a: Apply olam-dashboards ConfigMap BEFORE helm install | ||
| # so Grafana's volume mount finds it on first boot (B3). | ||
| # The ConfigMap is generated from grafana-dashboards/*.json by | ||
| # packages/peripheral-services/scripts/sync-grafana-dashboards.sh. | ||
| # ------------------------------------------------------------------------- | ||
# Repository root as reported by git for the script's directory; falls back to
# the current directory when git cannot resolve one.
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"

# Default to the monorepo source directory. A published @pleri/olam-cli
# install has no monorepo: there `olam setup` exports
# OLAM_BUNDLE_ROOT=<install>/host-cp, and the bundled
# peripheral-services/{helm-values,manifests} directory is used instead.
PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
  PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
fi

CONFIGMAP_MANIFEST="$PERIPHERAL_SERVICES_DIR/manifests/80-grafana-dashboard-configmap.yaml"
if [[ ! -f "$CONFIGMAP_MANIFEST" ]]; then
  # A missing manifest is only a warning; the install continues without it.
  log "WARN: $CONFIGMAP_MANIFEST not found — Grafana will warn 'ConfigMap not found' until B3 is deployed"
else
  log "applying olam-dashboards ConfigMap from $CONFIGMAP_MANIFEST"
  kubectl apply -f "$CONFIGMAP_MANIFEST"
  log "ConfigMap applied"
fi
| # ------------------------------------------------------------------------- | ||
| # Step 3: Helm upgrade --install | ||
| # ------------------------------------------------------------------------- | ||
log "installing grafana/grafana ($GRAFANA_RELEASE) in namespace $NAMESPACE"
# Arguments are collected in an array so each flag sits on its own line
# without backslash continuations. OLAM_HELM_TIMEOUT overrides the 600s wait.
grafana_helm_args=(
  upgrade --install "$GRAFANA_RELEASE" grafana/grafana
  --version "$GRAFANA_CHART_VERSION"
  --namespace "$NAMESPACE"
  --create-namespace
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml"
  --wait
  --timeout "${OLAM_HELM_TIMEOUT:-600s}"
)
helm "${grafana_helm_args[@]}"
log "Grafana Helm install complete"
| # ------------------------------------------------------------------------- | ||
| # Step 4: Wait for Grafana pod Ready | ||
| # ------------------------------------------------------------------------- | ||
log "waiting for Grafana pod Ready (120s)"
# Block until the pods labelled app.kubernetes.io/name=grafana report Ready.
kubectl wait pod \
  --namespace "$NAMESPACE" \
  --selector "app.kubernetes.io/name=grafana" \
  --for=condition=ready \
  --timeout=120s
log "Grafana pod Ready"
| # ------------------------------------------------------------------------- | ||
| # Step 5: Start port-forward in background | ||
| # ------------------------------------------------------------------------- | ||
log "port-forwarding svc/$GRAFANA_RELEASE $LOCAL_PORT:$GRAFANA_SVC_PORT in namespace $NAMESPACE"
# Run the forward in the background and remember its PID for the EXIT trap.
kubectl port-forward --namespace "$NAMESPACE" "svc/$GRAFANA_RELEASE" "${LOCAL_PORT}:${GRAFANA_SVC_PORT}" &
PF_PID=$!
log "port-forward PID $PF_PID; waiting ${PF_BIND_SECONDS}s for bind"
sleep "$PF_BIND_SECONDS"

# A forward that already died cannot serve the assertions below.
if ! kill -0 "$PF_PID" 2>/dev/null; then
  fail "port-forward process exited prematurely"
fi
| # ------------------------------------------------------------------------- | ||
| # Diagnostic helper — called on assertion failure | ||
| # ------------------------------------------------------------------------- | ||
# dump_diagnostics — print the last 50 lines of the Grafana pod logs to stderr.
#
# Everything is written to stderr so that nothing lands in a caller's
# command-substitution capture. The previous `2>&1 >&2` applied the
# redirections left to right: stderr was pointed at stdout first, then stdout
# was pointed at the already-redirected fd 2. Both streams therefore ended up
# on stdout, and the logs were swallowed whenever this ran inside `$(...)`.
# A `kubectl logs` failure is ignored: diagnostics are best-effort.
dump_diagnostics() {
  log "DIAGNOSTIC: last 50 lines of Grafana pod logs:"
  kubectl logs -n "$NAMESPACE" \
    -l "app.kubernetes.io/name=grafana" \
    --tail=50 >&2 || true
}
| # ------------------------------------------------------------------------- | ||
| # Step 6: Assertion 1 — /api/health returns 200 with database: ok | ||
| # ------------------------------------------------------------------------- | ||
log "asserting Grafana health (GET /api/health)"
# The failure handler runs in the main shell, not inside the $(...) subshell.
# That way `fail` terminates the script directly instead of relying on
# `set -e`, and nothing the handler prints can be captured into
# HEALTH_RESPONSE.
if ! HEALTH_RESPONSE=$(
  curl -sf \
    -u "admin:${GRAFANA_ADMIN_PW}" \
    "http://localhost:${LOCAL_PORT}/api/health"
); then
  dump_diagnostics
  fail "GET /api/health failed — Grafana not reachable on port $LOCAL_PORT"
fi

# jq -e exits non-zero unless the filter evaluates to true.
if ! echo "$HEALTH_RESPONSE" | jq -e '.database == "ok"' >/dev/null 2>&1; then
  log "DIAGNOSTIC: /api/health response:"
  echo "$HEALTH_RESPONSE" >&2
  dump_diagnostics
  fail '/api/health returned database != "ok" — Grafana DB layer not healthy'
fi
log "PASS: /api/health → database: ok"
| # ------------------------------------------------------------------------- | ||
| # Step 7: Assertion 2 — /api/datasources includes Loki entry with cluster URL | ||
| # ------------------------------------------------------------------------- | ||
log "asserting Loki datasource pre-wired (GET /api/datasources)"
# The failure handler runs in the main shell, not inside the $(...) subshell.
# That way `fail` terminates the script directly instead of relying on
# `set -e`, and nothing the handler prints can be captured into DS_RESPONSE.
if ! DS_RESPONSE=$(
  curl -sf \
    -u "admin:${GRAFANA_ADMIN_PW}" \
    "http://localhost:${LOCAL_PORT}/api/datasources"
); then
  dump_diagnostics
  fail "GET /api/datasources failed"
fi

EXPECTED_URL="olam-loki.monitoring.svc.cluster.local:3100"

# Check 1: at least one datasource of type "loki" exists.
if ! echo "$DS_RESPONSE" | jq -e 'map(select(.type == "loki")) | length >= 1' >/dev/null 2>&1; then
  log "DIAGNOSTIC: /api/datasources response:"
  echo "$DS_RESPONSE" >&2
  dump_diagnostics
  fail "datasources response contains no 'loki' type entry — datasource not provisioned"
fi

# Check 2: a loki datasource's URL contains the expected cluster-local host:port.
if ! echo "$DS_RESPONSE" | jq -e --arg url "$EXPECTED_URL" 'map(select(.type == "loki" and (.url | contains($url)))) | length >= 1' >/dev/null 2>&1; then
  log "DIAGNOSTIC: /api/datasources response:"
  echo "$DS_RESPONSE" >&2
  dump_diagnostics
  fail "Loki datasource URL does not contain '$EXPECTED_URL' — check grafana-values.yaml datasources block"
fi
log "PASS: Loki datasource found with cluster-local URL $EXPECTED_URL"
| # ------------------------------------------------------------------------- | ||
| # Step 7b: Assertion 2b — dashboard provider loaded olam-home (catches mount-path bugs) | ||
| # ------------------------------------------------------------------------- | ||
log "asserting olam-home dashboard visible in /api/search (catches ConfigMap mount failures)"
# A failed request is tolerated here: it leaves DASHBOARDS empty, and the jq
# check below then reports the failure together with diagnostics.
DASHBOARDS=$(
  curl -sf \
    -u "admin:${GRAFANA_ADMIN_PW}" \
    "http://localhost:${LOCAL_PORT}/api/search?type=dash-db&query=olam" \
    || true
)

# Exactly one search hit must carry uid "olam-home".
if echo "$DASHBOARDS" | jq -e 'map(select(.uid == "olam-home")) | length == 1' >/dev/null 2>&1; then
  log "PASS: olam-home dashboard found via /api/search"
else
  log "DIAGNOSTIC: /api/search response:"
  echo "$DASHBOARDS" >&2
  dump_diagnostics
  fail "olam-home dashboard not found in /api/search — check ConfigMap mount path and dashboard provider config"
fi
| # ------------------------------------------------------------------------- | ||
| # Step 8: Assertion 3 — olam-home dashboard present (B3) | ||
| # ------------------------------------------------------------------------- | ||
log "asserting olam-home dashboard present (GET /api/dashboards/uid/olam-home)"
# The failure handler runs in the main shell, not inside the $(...) subshell.
# That way `fail` terminates the script directly instead of relying on
# `set -e`, and nothing the handler prints can be captured into
# DASHBOARD_RESPONSE.
if ! DASHBOARD_RESPONSE=$(
  curl -sf \
    -u "admin:${GRAFANA_ADMIN_PW}" \
    "http://localhost:${LOCAL_PORT}/api/dashboards/uid/olam-home"
); then
  dump_diagnostics
  fail "GET /api/dashboards/uid/olam-home failed — dashboard not found or Grafana unreachable"
fi

# The returned document must describe the dashboard with uid "olam-home".
if ! echo "$DASHBOARD_RESPONSE" | jq -e '.dashboard.uid == "olam-home"' >/dev/null 2>&1; then
  log "DIAGNOSTIC: /api/dashboards/uid/olam-home response:"
  echo "$DASHBOARD_RESPONSE" >&2
  dump_diagnostics
  fail "olam-home dashboard uid mismatch or missing — check ConfigMap provisioning and Grafana provider config"
fi
log "PASS: olam-home dashboard present with uid=olam-home"

# -------------------------------------------------------------------------
# Final
# -------------------------------------------------------------------------
log "PASS: Grafana port-forward accessible; Loki datasource pre-wired; olam-home dashboard provisioned — Tasks B2+B3 verified"
exit 0
| #!/usr/bin/env bash | ||
| # kyverno-cardinality-mutate.sh — Phase C C8 follow-up e2e smoke test. | ||
| # | ||
| # Verifies that the Kyverno ClusterPolicy | ||
| # `enforce-cardinality-labeldrop` mutates incoming ServiceMonitor and | ||
| # PodMonitor objects at admission time, regardless of authorship, | ||
| # closing codex's "policy by convention" gap on PR #783. | ||
| # | ||
| # Test approach: | ||
| # 1. helm-install Kyverno (pinned 3.8.1) into the `kyverno` namespace. | ||
| # 2. Apply the ClusterPolicy. | ||
| # 3. POSITIVE test: apply ServiceMonitor `kyverno-mutate-positive-test` | ||
| # with selector `app: kyverno-mutate-positive-test` (no backing Service) | ||
| # and NO metricRelabelings; assert Kyverno mutated it; delete immediately. | ||
| # 4. IDEMPOTENCY test: apply ServiceMonitor `kyverno-mutate-idempotency-test` | ||
| # with selector `app: kyverno-mutate-idempotency-test` (different non-existent | ||
| # label) and the labeldrop already present; assert count stays at 1; delete. | ||
| # 5. SCRAPE-VERIFICATION test: deploy synthetic `kyverno-emitter` (Service + | ||
| # Deployment + ConfigMap) + dedicated ServiceMonitor `kyverno-emitter-sm` | ||
| # applied WITHOUT metricRelabelings; assert Kyverno mutates the SM at admission; | ||
| # wait for pod Ready; poll Prometheus for http_requests_total; assert | ||
| # world_id label is ABSENT. | ||
| # | ||
| # Key design decision: POSITIVE and IDEMPOTENCY tests use selectors that match | ||
| # no real Service, so they are isolated from each other and from the SCRAPE test. | ||
| # A single dedicated SM (`kyverno-emitter-sm`) owns the emitter endpoint, so | ||
| # prometheus-operator can reliably reconcile exactly one scrape config for it. | ||
| # Root cause of the prior failure (PR #828 CI run 26239574154): two SMs | ||
| # (naive-violator + pre-armoured-violator) competed for the same | ||
| # `app: kyverno-emitter` Endpoints; operator never reconciled either. | ||
| # | ||
| # Pre-conditions: | ||
| # - kube-prometheus-stack installed (cardinality-drop.sh ran). | ||
| # - kubectl context set to a live cluster; helm + jq + curl available. | ||
| # | ||
| # Idempotency: kubectl apply is idempotent; helm upgrade --install is | ||
| # idempotent. Cleanup trap removes synthetic resources on exit. The | ||
| # ClusterPolicy + Kyverno install are LEFT in the cluster (permanent | ||
| # C8 fixtures). | ||
| # | ||
| # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — C8 | ||
| # codex review on PR #783 ("policy by convention" finding) | ||
| # PR #828 CI run 26239574154 (competing-SM root cause) | ||
# Strict mode: abort on a failing command, an unset variable, or a failed
# pipeline stage.
set -euo pipefail

# --- Kyverno install ------------------------------------------------------
KYVERNO_NAMESPACE=kyverno
KYVERNO_VERSION=3.8.1

# --- Test fixtures and Prometheus access ----------------------------------
TEST_NAMESPACE=monitoring
# Local end of the Prometheus port-forward. 9090 and 9091 may already be in
# use by sibling Phase C scripts.
PROM_LOCAL_PORT=9092
# Seconds to sleep after starting `kubectl port-forward` before probing it.
PF_BIND_SECONDS=5

# --- Scrape polling -------------------------------------------------------
# Total polling budget in seconds; OLAM_PROM_DISCOVERY_TIMEOUT overrides it.
# Bumped from 180s: one CI attempt observed kyverno-emitter still not scraped
# at 180s.
TARGET_DISCOVERY_TIMEOUT="${OLAM_PROM_DISCOVERY_TIMEOUT:-240}"
# Seconds between two consecutive Prometheus queries.
SCRAPE_POLL_INTERVAL=10
# Write all arguments, joined by spaces and tagged with the script prefix, as
# one line on stderr.
log() {
  local message="$*"
  printf '%s %s\n' '[kyverno-mutate]' "$message" >&2
}
# Report a fatal problem on stderr and terminate the script with status 1.
# The EXIT trap, when installed, still runs.
fail() {
  local reason="$*"
  printf '%s FAIL: %s\n' '[kyverno-mutate]' "$reason" >&2
  exit 1
}
# Locate the peripheral-services assets (helm values + manifests).
#
# REPO_ROOT is the git toplevel as seen from this script's directory, or the
# current working directory when git cannot resolve one.
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"

# Two layouts are supported:
#   - published @pleri/olam-cli install (no monorepo): `olam setup` exports
#     OLAM_BUNDLE_ROOT=<install>/host-cp and the assets live directly under it;
#   - monorepo checkout: OLAM_BUNDLE_ROOT is unset (or empty) and the assets
#     live under packages/peripheral-services/.
case "${OLAM_BUNDLE_ROOT:-}" in
  "") PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services" ;;
  *)  PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services" ;;
esac
# -------------------------------------------------------------------------
# Exit handler.
#
# Stops the Prometheus port-forward (when one was started) and deletes every
# synthetic resource this script can create. Each step is best-effort:
# errors are silenced so the handler never masks the script's own exit code.
# The Kyverno chart and the ClusterPolicy are deliberately left installed
# (permanent C8 fixtures).
# -------------------------------------------------------------------------
PROM_PF_PID=""
cleanup() {
  # PROM_PF_PID stays empty until the port-forward is launched.
  if [[ -n "$PROM_PF_PID" ]]; then
    kill "$PROM_PF_PID" 2>/dev/null || true
  fi

  log "removing synthetic resources (idempotent)"

  # kind/name pairs in deletion order. The two mutation-test ServiceMonitors
  # are normally deleted inline already; --ignore-not-found makes repeating
  # the delete harmless. The remaining four belong to the scrape test.
  local -a synthetic=(
    servicemonitor/kyverno-mutate-positive-test
    servicemonitor/kyverno-mutate-idempotency-test
    servicemonitor/kyverno-emitter-sm
    deployment/kyverno-emitter
    service/kyverno-emitter-svc
    configmap/kyverno-emitter-config
  )
  local target
  for target in "${synthetic[@]}"; do
    kubectl delete "${target%%/*}" "${target#*/}" -n "$TEST_NAMESPACE" \
      --ignore-not-found=true 2>/dev/null || true
  done
}
trap cleanup EXIT
# -------------------------------------------------------------------------
# Pre-flight: required CLIs, a reachable cluster, and the Prometheus stack
# this test builds on. Any miss aborts via fail().
# -------------------------------------------------------------------------
for required_tool in helm kubectl curl jq; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    fail "$required_tool not installed"
  fi
done

if ! kubectl cluster-info >/dev/null 2>&1; then
  fail "kubectl: no reachable cluster; set KUBECONFIG"
fi

# kube-prom-stack must already be up: the test needs the ServiceMonitor CRD
# and a running Prometheus.
if ! kubectl get crd servicemonitors.monitoring.coreos.com >/dev/null 2>&1; then
  fail "ServiceMonitor CRD not present — run prom-no-double-grafana.sh first"
fi

# NOTE(review): a label-selector `kubectl get` can exit 0 even when nothing
# matches, so this probably only catches API errors — confirm the label value
# against the installed chart before tightening it to a non-empty check.
if ! kubectl get deployment -n "$TEST_NAMESPACE" \
  -l "app.kubernetes.io/name=prometheus-operator" >/dev/null 2>&1; then
  fail "prometheus-operator not found in $TEST_NAMESPACE — run prom-no-double-grafana.sh first"
fi

log "pre-flight checks passed"
# -------------------------------------------------------------------------
# Step 1: helm-install Kyverno
#
# `helm repo add` / `helm repo update` are best-effort (errors ignored);
# `helm upgrade --install` covers both fresh install and upgrade. `--wait`
# blocks until the release's pods are Ready — the admission webhook has to be
# live before the ClusterPolicy or any test ServiceMonitor is applied.
# Only the last 8 lines of helm output are shown; `pipefail` still propagates
# a helm failure through the `tail`.
# -------------------------------------------------------------------------
log "ensuring kyverno helm repo is configured"
helm repo add kyverno https://kyverno.github.io/kyverno/ >/dev/null 2>&1 || true
helm repo update kyverno >/dev/null 2>&1 || true

log "installing kyverno chart $KYVERNO_VERSION (waits for admission webhook Ready)"
helm upgrade --install olam-kyverno kyverno/kyverno \
  --version "$KYVERNO_VERSION" \
  --namespace "$KYVERNO_NAMESPACE" \
  --create-namespace \
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/kyverno-values.yaml" \
  --wait --timeout "${OLAM_HELM_TIMEOUT:-600s}" 2>&1 | tail -8

# Sanity: the admission-controller Deployment exists.
# `kubectl get` with a label selector exits 0 even when nothing matches, so
# the exit status alone can never detect a missing Deployment; list the
# matches by name and require a non-empty result instead.
admission_deployments="$(kubectl get deployment -n "$KYVERNO_NAMESPACE" \
  -l "app.kubernetes.io/component=admission-controller" -o name 2>/dev/null || true)"
if [[ -z "$admission_deployments" ]]; then
  fail "kyverno admission controller not found in $KYVERNO_NAMESPACE"
fi

log "waiting for kyverno admission webhook to be registered with apiserver"
# Webhook registration is the last thing Kyverno does after pod-Ready, so poll
# (every 5s, up to 120s) for its ValidatingWebhookConfiguration. `elapsed`
# reaches 120 only when no iteration broke out early.
for (( elapsed = 0; elapsed < 120; elapsed += 5 )); do
  if kubectl get validatingwebhookconfiguration kyverno-policy-validating-webhook-cfg \
    >/dev/null 2>&1; then
    log "kyverno webhooks registered after ${elapsed}s"
    break
  fi
  sleep 5
done
if (( elapsed >= 120 )); then
  fail "kyverno webhook registration timed out after 120s"
fi
# -------------------------------------------------------------------------
# Step 2: Apply the ClusterPolicy
#
# After applying, poll `.status.ready` every 3s for up to 60s. Not seeing
# "true" in that window is only a warning: the status field can lag, and the
# mutation tests that follow are the real proof that the policy is active.
# -------------------------------------------------------------------------
log "applying ClusterPolicy enforce-cardinality-labeldrop"
kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/96-kyverno-cardinality-mutate.yaml"

log "waiting up to 60s for ClusterPolicy to be Ready"
for (( elapsed = 0; elapsed < 60; elapsed += 3 )); do
  # A failed lookup counts as "not ready yet".
  policy_state="$(kubectl get clusterpolicy enforce-cardinality-labeldrop \
    -o jsonpath='{.status.ready}' 2>/dev/null || echo "")"
  if [[ "$policy_state" == "true" ]]; then
    log "ClusterPolicy Ready after ${elapsed}s"
    break
  fi
  sleep 3
done
# The counter reaches 60 only when the loop never broke out.
if (( elapsed >= 60 )); then
  log "WARN: ClusterPolicy status.ready not observed within 60s; proceeding (status field can lag)"
fi
# -------------------------------------------------------------------------
# Step 3: POSITIVE test — mutation only, no backing Service
#
# The ServiceMonitor selects `app: kyverno-mutate-positive-test`, a label no
# real Service carries, so it never competes for Endpoints; it exists purely
# to exercise the Kyverno admission webhook. It is deleted right after the
# assertion so the ServiceMonitor space is clean for the scrape test.
# -------------------------------------------------------------------------
log "POSITIVE test: applying naive ServiceMonitor (no metricRelabelings, non-Service-backed selector)"
kubectl apply -f - <<'EOF'
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kyverno-mutate-positive-test
  namespace: monitoring
  labels:
    release: olam-prom
spec:
  namespaceSelector:
    matchNames:
      - monitoring
  selector:
    matchLabels:
      app: kyverno-mutate-positive-test
  endpoints:
    - port: metrics
      interval: 15s
      # NOTE: deliberately NO metricRelabelings — Kyverno must inject it.
EOF

# Read the object back as stored by the apiserver (i.e. after admission) and
# pull out the first endpoint's metricRelabelings, defaulting to [].
positive_sm_json="$(kubectl get servicemonitor kyverno-mutate-positive-test -n "$TEST_NAMESPACE" -o json)"
positive_relabelings="$(jq -r '.spec.endpoints[0].metricRelabelings // [] | tojson' <<<"$positive_sm_json")"
log "kyverno-mutate-positive-test metricRelabelings after admission: $positive_relabelings"

# Count labeldrop rules whose regex mentions world_id; at least one must exist.
positive_hits="$(jq 'map(select(.action == "labeldrop" and (.regex | contains("world_id")))) | length' <<<"$positive_relabelings")"
if [ "$positive_hits" -lt 1 ]; then
  log "actual policy state:"
  kubectl get clusterpolicy enforce-cardinality-labeldrop -o yaml >&2 || true
  fail "POSITIVE test FAILED: Kyverno did not inject labeldrop into naive ServiceMonitor — third-party bypass gap NOT closed"
fi
log "PASS: naive ServiceMonitor was mutated at admission (labeldrop injected)"

log "deleting kyverno-mutate-positive-test (mutation-only test; SM space clean for scrape test)"
kubectl delete servicemonitor kyverno-mutate-positive-test -n "$TEST_NAMESPACE" --ignore-not-found=true
# -------------------------------------------------------------------------
# Step 4: IDEMPOTENCY test — mutation only, no backing Service
#
# The ServiceMonitor already carries the labeldrop rule and selects
# `app: kyverno-mutate-idempotency-test` (distinct from the positive test and
# from the scrape test; no real Service). After admission there must still be
# exactly one matching rule. Deleted right after the assertion.
# -------------------------------------------------------------------------
log "IDEMPOTENCY test: applying pre-armoured ServiceMonitor (labeldrop already present)"
kubectl apply -f - <<'EOF'
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kyverno-mutate-idempotency-test
  namespace: monitoring
  labels:
    release: olam-prom
spec:
  namespaceSelector:
    matchNames:
      - monitoring
  selector:
    matchLabels:
      app: kyverno-mutate-idempotency-test
  endpoints:
    - port: metrics
      interval: 15s
      metricRelabelings:
        - action: labeldrop
          regex: 'world_id|trace_id|user_id|request_id|operator_id'
EOF

# Count the labeldrop rules mentioning world_id on the stored object.
idempotency_sm_json="$(kubectl get servicemonitor kyverno-mutate-idempotency-test -n "$TEST_NAMESPACE" -o json)"
labeldrop_total="$(jq '.spec.endpoints[0].metricRelabelings | map(select(.action == "labeldrop" and (.regex | contains("world_id")))) | length' <<<"$idempotency_sm_json")"
log "kyverno-mutate-idempotency-test labeldrop count: $labeldrop_total"

if [ "$labeldrop_total" -ne 1 ]; then
  kubectl get servicemonitor kyverno-mutate-idempotency-test -n "$TEST_NAMESPACE" -o yaml >&2
  fail "IDEMPOTENCY test FAILED: expected 1 labeldrop entry, got $labeldrop_total — policy double-adds"
fi
log "PASS: pre-armoured ServiceMonitor has exactly 1 labeldrop (no double-add)"

log "deleting kyverno-mutate-idempotency-test (mutation-only test; SM space clean for scrape test)"
kubectl delete servicemonitor kyverno-mutate-idempotency-test -n "$TEST_NAMESPACE" --ignore-not-found=true
# -------------------------------------------------------------------------
# Step 5: SCRAPE-VERIFICATION test — dedicated SM + Service + Pod
#
# Layout: a single ServiceMonitor (`kyverno-emitter-sm`) selects a single
# Service (`kyverno-emitter-svc`). Nothing else created by this script selects
# `app: kyverno-emitter`, so prometheus-operator has one clean scrape config
# to reconcile.
#
# Flow:
#   1. deploy the emitter (ConfigMap + Deployment + Service) — this section;
#   2. apply the ServiceMonitor WITHOUT metricRelabelings, so Kyverno's
#      admission webhook has to fire during real scrape setup;
#   3. confirm the stored spec carries the labeldrop;
#   4. poll Prometheus for http_requests_total and assert that world_id is
#      absent from every returned series.
#
# Mirrors the pattern from dashboards-have-data.sh (one dedicated SM with a
# co-located Service in the `monitoring` namespace).
#
# The emitter is a small Python HTTP server that answers GET /metrics with the
# ConfigMap's `metrics` text (mounted at /config/metrics) and 404 otherwise.
# -------------------------------------------------------------------------
log "SCRAPE-VERIFICATION test: deploying synthetic kyverno-emitter (emits http_requests_total{world_id})"
kubectl apply -f - <<'EOF'
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: kyverno-emitter-config
  namespace: monitoring
data:
  metrics: |
    # HELP http_requests_total Synthetic counter; world_id is the cardinality bomb
    # TYPE http_requests_total counter
    http_requests_total{world_id="kyverno-world",route="/api",method="GET",status_code="200"} 1
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kyverno-emitter
  namespace: monitoring
  labels:
    app: kyverno-emitter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kyverno-emitter
  template:
    metadata:
      labels:
        app: kyverno-emitter
    spec:
      containers:
        - name: emitter
          image: python:3.11-alpine
          ports:
            - containerPort: 8080
          command: ["python3", "-c"]
          args:
            - |
              import http.server
              with open('/config/metrics') as f: METRICS = f.read().encode()
              class H(http.server.BaseHTTPRequestHandler):
                  def do_GET(self):
                      if self.path != '/metrics':
                          self.send_response(404); self.end_headers(); return
                      self.send_response(200)
                      self.send_header('Content-Type', 'text/plain; version=0.0.4; charset=utf-8')
                      self.end_headers()
                      self.wfile.write(METRICS)
                  def log_message(self, *a): pass
              http.server.HTTPServer(('0.0.0.0', 8080), H).serve_forever()
          volumeMounts:
            - name: config
              mountPath: /config
      volumes:
        - name: config
          configMap:
            name: kyverno-emitter-config
---
apiVersion: v1
kind: Service
metadata:
  name: kyverno-emitter-svc
  namespace: monitoring
  labels:
    app: kyverno-emitter
spec:
  selector:
    app: kyverno-emitter
  ports:
    - name: metrics
      port: 8080
      targetPort: 8080
EOF

# Block (up to 120s) until the emitter rollout completes, so the scrape poll
# later is not racing pod startup.
log "waiting for kyverno-emitter deployment Ready"
kubectl rollout status deployment/kyverno-emitter -n "$TEST_NAMESPACE" --timeout=120s
# The dedicated ServiceMonitor is submitted WITHOUT metricRelabelings on
# purpose: Kyverno has to add the labeldrop at admission. This shows the
# policy fires on the object that actually drives the scrape, not only on the
# POSITIVE test fixture.
log "applying kyverno-emitter-sm (no metricRelabelings — Kyverno must inject)"
kubectl apply -f - <<'EOF'
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kyverno-emitter-sm
  namespace: monitoring
  labels:
    release: olam-prom
spec:
  namespaceSelector:
    matchNames:
      - monitoring
  selector:
    matchLabels:
      app: kyverno-emitter
  endpoints:
    - port: metrics
      interval: 15s
      # NOTE: NO metricRelabelings — Kyverno must inject the labeldrop at admission.
EOF

# Belt-and-suspenders: read the stored object back and require at least one
# labeldrop rule whose regex mentions world_id.
emitter_sm_json="$(kubectl get servicemonitor kyverno-emitter-sm -n "$TEST_NAMESPACE" -o json)"
emitter_relabelings="$(jq -r '.spec.endpoints[0].metricRelabelings // [] | tojson' <<<"$emitter_sm_json")"
log "kyverno-emitter-sm metricRelabelings after admission: $emitter_relabelings"

emitter_hits="$(jq 'map(select(.action == "labeldrop" and (.regex | contains("world_id")))) | length' <<<"$emitter_relabelings")"
if [ "$emitter_hits" -lt 1 ]; then
  log "actual policy state:"
  kubectl get clusterpolicy enforce-cardinality-labeldrop -o yaml >&2 || true
  fail "SCRAPE-VERIFICATION test FAILED: Kyverno did not mutate kyverno-emitter-sm at admission"
fi
log "PASS: kyverno-emitter-sm was mutated at admission (labeldrop injected)"
# Expose Prometheus locally. The PID is recorded so the EXIT trap can stop
# the forward; after a fixed bind delay, `kill -0` confirms it is still alive.
log "port-forwarding svc/prometheus-operated $PROM_LOCAL_PORT:9090"
kubectl port-forward -n "$TEST_NAMESPACE" "svc/prometheus-operated" "${PROM_LOCAL_PORT}:9090" &
PROM_PF_PID=$!
sleep "$PF_BIND_SECONDS"
if ! kill -0 "$PROM_PF_PID" 2>/dev/null; then
  fail "Prometheus port-forward exited prematurely"
fi
PROM_URL="http://localhost:${PROM_LOCAL_PORT}"

# Poll for the metric itself rather than for target discovery.
#
# Why: kube-prometheus-stack's default relabelling derives `job` from the k8s
# Service name, and polling /targets by job proved brittle in earlier
# ingress-integration runs (operator reconciliation races, dropped-target
# filtering, rare CRD revision lag). What matters here is whether the mutated
# relabel rule was applied to a real sample, so the metric is queried directly.
#
# NOTE(review): the query is not scoped to the emitter; treating every
# returned series as coming through kyverno-emitter-sm assumes no other
# scrape target exports http_requests_total — confirm for the target cluster.
log "polling Prometheus for http_requests_total samples (up to ${TARGET_DISCOVERY_TIMEOUT}s)"
scrape_seen=false
elapsed=0
RESULT=""
while [ "$elapsed" -lt "$TARGET_DISCOVERY_TIMEOUT" ]; do
  # A failed request yields an empty RESULT and is simply retried.
  RESULT="$(curl -sf "${PROM_URL}/api/v1/query?query=http_requests_total" 2>/dev/null || echo "")"
  if [ -n "$RESULT" ]; then
    SERIES_COUNT="$(echo "$RESULT" | jq '.data.result | length' 2>/dev/null || echo "0")"
    if [ "$SERIES_COUNT" -ge 1 ]; then
      log "http_requests_total returned $SERIES_COUNT series after ${elapsed}s"
      scrape_seen=true
      break
    fi
  fi
  sleep "$SCRAPE_POLL_INTERVAL"
  elapsed=$((elapsed + SCRAPE_POLL_INTERVAL))
done

# Timed out: dump everything useful for triage, then fail.
if [ "$scrape_seen" != "true" ]; then
  log "Active targets snapshot for diagnosis:"
  curl -sf "${PROM_URL}/api/v1/targets" | jq '.data.activeTargets[] | {job: .labels.job, service: .labels.service, namespace: .labels.namespace, health: .health, lastError: .lastError}' >&2 || true
  log "ServiceMonitor kyverno-emitter-sm status:"
  kubectl get servicemonitor kyverno-emitter-sm -n "$TEST_NAMESPACE" -o yaml >&2 || true
  log "prometheus-operator log tail (last 50 lines):"
  kubectl logs -n "$TEST_NAMESPACE" -l "app.kubernetes.io/name=prometheus-operator" --tail=50 >&2 || true
  fail "Prometheus did not scrape kyverno-emitter within ${TARGET_DISCOVERY_TIMEOUT}s"
fi

# Leak check: no returned series may still carry a world_id label.
SERIES_COUNT="$(jq '.data.result | length' <<<"$RESULT")"
world_id_leaked="$(jq 'any(.data.result[]; .metric | has("world_id"))' <<<"$RESULT")"
if [ "$world_id_leaked" = "true" ]; then
  jq '.data.result[] | .metric' <<<"$RESULT" >&2
  fail "world_id label leaked into Prometheus — Kyverno-mutated relabel did NOT take effect at scrape time"
fi

log "PASS: kyverno-emitter scraped via kyverno-emitter-sm; world_id absent at scrape time"
log "PASS: C8 verified — Kyverno mutates third-party-shaped ServiceMonitors at admission and the mutation takes effect at scrape time"
exit 0
| #!/usr/bin/env bash | ||
| # loki-ingest.sh — e2e smoke test: Loki single-binary installs, Promtail tails, | ||
| # OAuth query-param scrubbing verified (code=REDACTED, no raw token). | ||
| # | ||
| # Usage: scripts/e2e/loki-ingest.sh | ||
| # | ||
| # Pre-conditions: | ||
| # - kubectl context is set to a live k8s cluster (does NOT spin up k3d) | ||
| # - helm binary available | ||
| # - grafana Helm repo added (helm repo add grafana https://grafana.github.io/helm-charts) | ||
| # | ||
| # This script is invoked by the A12 harness (scripts/test-ingress-integration/) | ||
| # after cluster-up.sh. It can also be run manually against any live cluster. | ||
| # | ||
| # Idempotency: `helm upgrade --install` is idempotent; re-runs succeed on an | ||
| # existing cluster. The synthetic pod is cleaned up regardless of | ||
| # pass/fail via a trap. | ||
| # | ||
| # Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B1 | ||
| # Chart: grafana/loki 6.7.4 (pinned; latest stable 2026-05-20) | ||
| # Chart: grafana/promtail 6.16.6 (latest stable 2026-05-20) | ||
# Strict mode: abort on a failing command, an unset variable, or a failed
# pipeline stage.
set -euo pipefail

# --- Helm releases --------------------------------------------------------
NAMESPACE=monitoring
LOKI_RELEASE=olam-loki
PROMTAIL_RELEASE=olam-promtail

# --- Synthetic workload ---------------------------------------------------
SYNTHETIC_POD=loki-e2e-synthetic

# --- Loki access ----------------------------------------------------------
LOKI_PORT=3100
# Local end of the port-forward; chosen to avoid clashing with any host-level
# Loki.
LOCAL_PORT=13100

# --- Timing ---------------------------------------------------------------
# Seconds to wait between launching the synthetic pod and querying Loki.
# Rationale recorded by the original author: the tail → ingest cycle is an
# inotify event (near-instant), Promtail pipeline processing (~1s) and the
# Loki write path (typically <5s in practice), so 10s is conservative for a
# single log line in a lightly loaded cluster.
INGEST_LAG_SECONDS=10
# Write all arguments, joined by spaces and tagged with the script prefix, as
# one line on stderr.
log() {
  local message="$*"
  printf '%s %s\n' '[loki-ingest]' "$message" >&2
}
# Report a fatal problem on stderr and terminate the script with status 1.
# The EXIT trap, when installed, still runs.
fail() {
  local reason="$*"
  printf '%s FAIL: %s\n' '[loki-ingest]' "$reason" >&2
  exit 1
}
# -------------------------------------------------------------------------
# Exit handler: stop the Loki port-forward if it is still running and delete
# the synthetic pod from the `default` namespace. Both steps are best-effort.
# -------------------------------------------------------------------------
PF_PID=""
cleanup() {
  # PF_PID stays empty until the port-forward is launched; `kill -0` only
  # probes whether that process is still alive.
  if [[ -n "$PF_PID" ]]; then
    if kill -0 "$PF_PID" 2>/dev/null; then
      kill "$PF_PID" 2>/dev/null || true
    fi
  fi
  kubectl delete pod "$SYNTHETIC_POD" -n default --ignore-not-found=true 2>/dev/null || true
}
trap cleanup EXIT
# -------------------------------------------------------------------------
# Pre-flight: required CLIs and a reachable cluster. Any miss aborts via
# fail().
# -------------------------------------------------------------------------
for required_tool in helm kubectl curl; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    fail "$required_tool not installed"
  fi
done
if ! kubectl cluster-info >/dev/null 2>&1; then
  fail "kubectl: no reachable cluster; set KUBECONFIG"
fi
log "pre-flight checks passed"
# -------------------------------------------------------------------------
# Locate the helm values so `helm -f` paths work from any invocation cwd.
#
# REPO_ROOT is the git toplevel as seen from this script's directory, or the
# current working directory when git cannot resolve one.
#
# Published @pleri/olam-cli installs (no monorepo): `olam setup` exports
# OLAM_BUNDLE_ROOT=<install>/host-cp and the bundled peripheral-services
# directory sits directly under it. Monorepo callers leave the variable unset
# (or empty) and the directory is taken from packages/peripheral-services/.
# -------------------------------------------------------------------------
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
# First expansion is empty unless OLAM_BUNDLE_ROOT is set and non-empty; the
# second fills in the monorepo location in that case.
PERIPHERAL_SERVICES_DIR="${OLAM_BUNDLE_ROOT:+${OLAM_BUNDLE_ROOT}/peripheral-services}"
PERIPHERAL_SERVICES_DIR="${PERIPHERAL_SERVICES_DIR:-${REPO_ROOT}/packages/peripheral-services}"
# -------------------------------------------------------------------------
# Make sure the grafana Helm repo is registered. A failing `repo add` is
# ignored so re-runs stay safe; a failing `repo update` still aborts the
# script under strict mode.
# -------------------------------------------------------------------------
if ! helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null; then
  : # best-effort: carry on with whatever repo configuration already exists
fi
helm repo update grafana
# -------------------------------------------------------------------------
# Step 1: Install / upgrade Loki (single-binary mode)
#
# Chart version is pinned to 6.7.4; the namespace is created on demand.
# `--wait` blocks until the release is ready or the timeout
# (OLAM_HELM_TIMEOUT, default 600s) expires.
# -------------------------------------------------------------------------
log "installing grafana/loki ($LOKI_RELEASE) in namespace $NAMESPACE"
loki_install_args=(
  --version 6.7.4
  --namespace "$NAMESPACE"
  --create-namespace
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml"
  --wait
  --timeout "${OLAM_HELM_TIMEOUT:-600s}"
)
helm upgrade --install "$LOKI_RELEASE" grafana/loki "${loki_install_args[@]}"
log "loki helm install complete"
# -------------------------------------------------------------------------
# Step 2: Install / upgrade Promtail
#
# Chart version is pinned to 6.16.6. No --create-namespace here: the
# namespace already exists after Step 1. The timeout is a fixed 120s.
# -------------------------------------------------------------------------
log "installing grafana/promtail ($PROMTAIL_RELEASE) in namespace $NAMESPACE"
promtail_install_args=(
  --version 6.16.6
  --namespace "$NAMESPACE"
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml"
  --wait
  --timeout 120s
)
helm upgrade --install "$PROMTAIL_RELEASE" grafana/promtail "${promtail_install_args[@]}"
log "promtail helm install complete"
# -------------------------------------------------------------------------
# Step 3: Wait for Loki pod Ready
#
# Blocks for up to 120s on every pod labelled app.kubernetes.io/name=loki in
# the target namespace; a timeout aborts the script under strict mode.
# -------------------------------------------------------------------------
log "waiting for Loki pod Ready (120s)"
loki_pod_selector="app.kubernetes.io/name=loki"
kubectl wait pod \
  -l "$loki_pod_selector" \
  -n "$NAMESPACE" \
  --for=condition=ready \
  --timeout=120s
log "loki pod Ready"
# -------------------------------------------------------------------------
# Step 4: Emit one synthetic log line carrying raw OAuth material.
#
# A one-shot busybox pod in `default` echoes a single line that contains all
# four patterns the scrubbing pipeline is expected to rewrite:
#   ?code=SECRETTOKEN123                  → code=REDACTED
#   &access_token=SECRETTOKEN456          → access_token=REDACTED
#   &state=SESSION789                     → state=REDACTED
#   Authorization: Bearer SECRETBEARER000 → Authorization: Bearer REDACTED
#
# Expected outcome (asserted in Step 7): the line reaches Loki with all four
# REDACTED markers present and none of the raw values.
# -------------------------------------------------------------------------
log "launching synthetic pod (prints all 4 raw token patterns)"
synthetic_line='GET http://example.com/callback?code=SECRETTOKEN123&access_token=SECRETTOKEN456&state=SESSION789 HTTP/1.1 Authorization: Bearer SECRETBEARER000'
kubectl run "$SYNTHETIC_POD" \
  --image=busybox \
  --restart=Never \
  -n default \
  -- sh -c "echo \"${synthetic_line}\""
# -------------------------------------------------------------------------
# Step 5: Wait for Promtail tail + ingest lag
# -------------------------------------------------------------------------
log "waiting ${INGEST_LAG_SECONDS}s for Promtail to tail and ingest synthetic log"
sleep "$INGEST_LAG_SECONDS"

# -------------------------------------------------------------------------
# Step 6: Port-forward Loki and query
#
# The forward runs in the background; its PID is kept so the EXIT trap can
# stop it. After a short bind delay the process is probed with `kill -0`, so a
# forward that died immediately is reported as such instead of surfacing
# later as an unexplained curl error.
# -------------------------------------------------------------------------
log "port-forwarding Loki svc to localhost:${LOCAL_PORT}"
kubectl port-forward \
  "svc/${LOKI_RELEASE}" \
  "${LOCAL_PORT}:${LOKI_PORT}" \
  -n "$NAMESPACE" &
PF_PID=$!
# Give port-forward a moment to establish
sleep 2
kill -0 "$PF_PID" 2>/dev/null \
  || fail "Loki port-forward exited prematurely"

# Query window: the last 5 minutes, as nanosecond epoch timestamps. The first
# `date` form is BSD/macOS syntax, the fallback is GNU syntax.
query_start_s="$(date -u -v-5M +%s 2>/dev/null || date -u -d '5 minutes ago' +%s)"
query_end_s="$(date -u +%s)"

# Fetch lines from the `default` namespace that contain "REDACTED". Only
# lines that went through at least one scrub stage match this filter; Step 7
# then checks the response for every REDACTED marker and for any raw token
# that survived alongside them.
log "querying Loki for scrubbed entries"
# Without the explicit `|| fail`, a curl error here would abort the script
# under strict mode with no message at all.
QUERY_RESPONSE="$(
  curl -s -G \
    "http://localhost:${LOCAL_PORT}/loki/api/v1/query_range" \
    --data-urlencode 'query={namespace="default"} |= "REDACTED"' \
    --data-urlencode "start=${query_start_s}000000000" \
    --data-urlencode "end=${query_end_s}000000000" \
    --data-urlencode 'limit=50'
)" || fail "Loki query_range request failed — port-forward not ready or Loki unreachable"
# -------------------------------------------------------------------------
# Step 7: Assertions — verify all 4 scrub patterns
#
# Contract (matches Phase B spec + promtail-values.yaml):
#   ?code=SECRETTOKEN123                  → code=REDACTED          (absent: SECRETTOKEN123)
#   &access_token=SECRETTOKEN456          → access_token=REDACTED  (absent: SECRETTOKEN456)
#   &state=SESSION789                     → state=REDACTED         (absent: SESSION789)
#   Authorization: Bearer SECRETBEARER000 → Bearer REDACTED        (absent: SECRETBEARER000)
# -------------------------------------------------------------------------
log "asserting scrubbing correctness (all 4 patterns)"

# Dump triage material to stderr: the raw Loki response followed by the last
# 50 lines of Promtail logs. Never fails — a broken `kubectl logs` is ignored.
diag() {
  log "DIAGNOSTIC: Loki query response:"
  echo "$QUERY_RESPONSE" >&2
  log "DIAGNOSTIC: last 50 lines of Promtail logs:"
  # Redirect stdout to stderr only. The previous `2>&1 >&2` was evaluated left
  # to right and therefore pointed BOTH streams at stdout, splitting the
  # Promtail logs away from the rest of the diagnostics.
  kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=promtail --tail=50 >&2 || true
}
# True when the Loki response contains the literal text $1.
#
# Done with an in-shell substring match instead of `echo … | grep -q`: under
# `set -o pipefail`, grep exiting on its first match can kill the writing
# side with SIGPIPE on a large response, which makes the whole pipeline
# report failure. For the "raw token absent" checks that would turn a leak
# into a false PASS.
response_has() {
  [[ "$QUERY_RESPONSE" == *"$1"* ]]
}

# Assertion 1: the response carries a result block at all.
if ! response_has '"result"'; then
  diag
  fail "Loki returned no result block — Promtail may not have ingested the synthetic log yet"
fi

# One row per scrub stage: expected marker | stage label | raw secret.
scrub_cases=(
  'code=REDACTED|code=|SECRETTOKEN123'
  'access_token=REDACTED|access_token=|SECRETTOKEN456'
  'state=REDACTED|state=|SESSION789'
  'Bearer REDACTED|Authorization Bearer|SECRETBEARER000'
)

# Assertions 2a-2d: every scrubbed marker is present.
for scrub_case in "${scrub_cases[@]}"; do
  IFS='|' read -r marker stage raw_secret <<<"$scrub_case"
  if ! response_has "$marker"; then
    diag
    fail "'${marker}' not found in Loki response — ${stage} scrub stage not working"
  fi
done

# Assertions 3a-3d: no raw secret survived next to the markers.
for scrub_case in "${scrub_cases[@]}"; do
  IFS='|' read -r marker stage raw_secret <<<"$scrub_case"
  if response_has "$raw_secret"; then
    diag
    fail "raw token '${raw_secret}' (${stage}) found in Loki response — scrubbing pipeline is NOT working"
  fi
done

log "PASS: all 4 scrub patterns verified — code=REDACTED, access_token=REDACTED, state=REDACTED, Bearer REDACTED present; all raw tokens absent"
exit 0
| // NDJSON span sink — zero-config observability for host-cp. | ||
| // | ||
| // Subscribes to the host-stream broadcaster and writes one JSON line per | ||
| // `span` event to ~/.olam/logs/host.trace.ndjson. Each span carries the | ||
| // minimum surface needed for `jq`-based triage: identity, timing, exit. | ||
| // | ||
| // Wire shape per line: | ||
| // { traceId, spanId, parentSpanId, name, startedAt, durationMs, | ||
| // attributes, events[], exit: { _tag: 'Success'|'Failure', reason? } } | ||
| // | ||
| // Rotation: single level — at 50MB the file is renamed to `.1` and a | ||
| // fresh file is opened. The previous `.1` (if any) is overwritten. We | ||
| // keep at most one prior generation; deeper retention belongs to the | ||
| // operator's normal disk-management tooling. | ||
| // | ||
| // Override path with OLAM_TRACE_LOG_PATH (set to /dev/null in tests that | ||
| // don't care about file output, or to a temp file to assert on writes). | ||
| import { open, mkdir, rename } from 'node:fs/promises'; | ||
| import { join, dirname } from 'node:path'; | ||
| import { homedir } from 'node:os'; | ||
| import { redactSensitive } from './redactor.mjs'; | ||
| const DEFAULT_ROTATE_BYTES = 50 * 1024 * 1024; | ||
| const DEFAULT_LOG_PATH = | ||
| process.env.OLAM_TRACE_LOG_PATH ?? | ||
| join(homedir(), '.olam', 'logs', 'host.trace.ndjson'); | ||
/**
 * Create an NDJSON span sink that appends one JSON line per span to `logPath`.
 *
 * Writes are serialized through a promise chain so lines never interleave,
 * and every write error is swallowed: tracing is best-effort and must not
 * take the host down. When the running byte count reaches `rotateBytes` the
 * file is renamed to `<logPath>.1` and a fresh file is opened.
 *
 * @param {object} [options]
 * @param {string} [options.logPath] File to append to (parent directories are created).
 * @param {number} [options.rotateBytes] Size threshold, in bytes, that triggers rotation.
 * @param {object} [options.hostStream] Optional broadcaster. When it exposes
 *   `addSink`, an SSE adapter is attached so `span` events are recorded.
 *   Assumes `addSink` returns an unsubscribe function — TODO confirm against host-stream.
 * @returns {Promise<{ recordSpan: (span?: object) => Promise<void>, close: () => Promise<void> }>}
 */
export async function createNdjsonSpanSink({
  logPath = DEFAULT_LOG_PATH,
  rotateBytes = DEFAULT_ROTATE_BYTES,
  hostStream,
} = {}) {
  await mkdir(dirname(logPath), { recursive: true });
  let fh = await open(logPath, 'a');
  // Seed from the current size so bytes left by a previous run count toward rotation.
  let bytesWritten = (await fh.stat()).size;
  let closed = false;
  let chain = Promise.resolve();

  // Rotate to `<logPath>.1`. `fh` is nulled before closing so that a failure
  // anywhere in here leaves the sink in a recoverable state (writeLine
  // reopens lazily) instead of holding a closed handle forever.
  async function rotate() {
    const previous = fh;
    fh = null;
    await previous.close();
    try {
      await rename(logPath, `${logPath}.1`);
    } catch {
      // Best effort: if the rename fails we keep appending to the same file.
    }
    fh = await open(logPath, 'a');
    // Reset even when the rename failed so the next attempt happens after
    // another `rotateBytes` of output rather than on every single write.
    bytesWritten = 0;
  }

  async function writeLine(line) {
    if (closed) return;
    if (fh === null) {
      // A previous rotation died half-way; recover by reopening the log.
      fh = await open(logPath, 'a');
      bytesWritten = (await fh.stat()).size;
    }
    await fh.write(line);
    bytesWritten += Buffer.byteLength(line);
    if (bytesWritten >= rotateBytes) {
      await rotate();
    }
  }

  // Normalize a span into the wire record and queue it for writing.
  function recordSpan(span = {}) {
    const {
      name, startedAt, endedAt, attributes, events, exit,
      traceId, spanId, parentSpanId, reason,
    } = span;
    const haveTimes = typeof endedAt === 'number' && typeof startedAt === 'number';
    const durationMs = haveTimes ? endedAt - startedAt : null;

    // An explicit, well-formed exit wins. Otherwise a span without both
    // timestamps is recorded as a Failure, and a timed span as a Success.
    let finalExit;
    if (exit && typeof exit === 'object' && (exit._tag === 'Success' || exit._tag === 'Failure')) {
      finalExit = exit._tag === 'Failure' && exit.reason !== undefined
        ? { _tag: 'Failure', reason: exit.reason }
        : { _tag: exit._tag };
    } else if (!haveTimes) {
      finalExit = reason !== undefined ? { _tag: 'Failure', reason } : { _tag: 'Failure' };
    } else {
      finalExit = { _tag: 'Success' };
    }

    const record = {
      traceId: traceId ?? null,
      spanId: spanId ?? null,
      parentSpanId: parentSpanId ?? null,
      name: name ?? null,
      startedAt: startedAt ?? null,
      durationMs,
      attributes: redactSensitive(attributes ?? {}),
      events: redactSensitive(events ?? []),
      exit: finalExit,
    };

    // Chain the write behind earlier ones; the catch keeps one failed write
    // from poisoning the chain for every later span.
    const next = chain.then(() => writeLine(JSON.stringify(record) + '\n')).catch(() => {});
    chain = next;
    return next;
  }

  let detach = null;
  if (hostStream && typeof hostStream.addSink === 'function') {
    detach = hostStream.addSink(createSseSpanAdapter((payload) => recordSpan(payload)));
  }

  return {
    recordSpan,
    async close() {
      if (closed) return;
      if (detach) detach();
      // Drain queued writes BEFORE flipping the closed flag — `writeLine`
      // bails on `closed`, so flipping first would silently drop spans
      // recorded just prior to shutdown.
      await chain;
      closed = true;
      if (fh !== null) {
        try { await fh.close(); } catch { /* already closed */ }
      }
    },
  };
}
/**
 * Subscribe an NDJSON sink to `@olam/auth-client`'s `betaResponseEmitter`.
 * Each `beta-response` event becomes a `withCredential.beta-response` span
 * with the beta payload exploded onto `attributes` — downstream `jq`
 * consumers can query e.g.
 *
 *   jq 'select(.name == "withCredential.beta-response")
 *       | {ts: .startedAt, cred: .attributes.credentialName,
 *          cache: .attributes.cacheStatus,
 *          thinking: .attributes.thinkingTokens,
 *          latencyMs: .durationMs}' ~/.olam/logs/host.trace.ndjson
 *
 * Wire is opt-in (call from server boot). Returns a detach function so the
 * subscription can be removed in tests or on shutdown.
 *
 * Pure additive: spans flowing from other sources (docker lifecycle,
 * plan-orchestrator, etc.) are unaffected.
 *
 * @param {object} params
 * @param {{ recordSpan: Function }} params.sink Sink receiving one span per event.
 * @param {{ on: Function, off?: Function, removeListener?: Function }} params.emitter
 *   Event source; must expose `on`. Detaching uses `off`, falling back to
 *   `removeListener` when `off` is not available.
 * @returns {() => unknown} Detach function that removes the subscription.
 * @throws {Error} When `sink.recordSpan` or `emitter.on` is missing.
 */
export function attachBetaResponseEvents({ sink, emitter }) {
  if (!sink || typeof sink.recordSpan !== 'function') {
    throw new Error('attachBetaResponseEvents: sink.recordSpan required');
  }
  if (!emitter || typeof emitter.on !== 'function') {
    throw new Error('attachBetaResponseEvents: emitter.on required');
  }

  const handler = (info) => {
    const now = Date.now();
    // Only a finite, non-negative latency yields a usable start time. NaN,
    // Infinity, or a negative value would produce a NaN / future `startedAt`,
    // so those fall back to 0 exactly like a missing latency does.
    const reported = info?.latencyMs;
    const latency = Number.isFinite(reported) && reported >= 0 ? reported : 0;
    sink.recordSpan({
      name: 'withCredential.beta-response',
      startedAt: now - latency,
      endedAt: now,
      attributes: {
        credentialName: info?.credentialName ?? null,
        credId: info?.credId ?? null,
        // Copy so later mutation of the emitter's payload cannot alter the span.
        betas: Array.isArray(info?.betas) ? [...info.betas] : [],
        cacheStatus: info?.cacheStatus ?? null,
        thinkingTokens: info?.tokenCounts?.thinking ?? null,
        statusCode: typeof info?.statusCode === 'number' ? info.statusCode : null,
        extraHeaders: info?.extraHeaders && typeof info.extraHeaders === 'object'
          ? { ...info.extraHeaders }
          : {},
      },
      exit: { _tag: 'Success' },
    });
  };

  emitter.on('beta-response', handler);

  return () => {
    // Only `on` is validated above, so do not assume `off` exists.
    if (typeof emitter.off === 'function') {
      return emitter.off('beta-response', handler);
    }
    if (typeof emitter.removeListener === 'function') {
      return emitter.removeListener('beta-response', handler);
    }
    return undefined;
  };
}
// Minimal ServerResponse look-alike handed to host-stream's `addSink`.
// Incoming chunks are accumulated and cut into SSE frames at every blank
// line (`event: <type>\ndata: <json>\n\n`). Frames carrying `event: span`
// have their first `data: ` line JSON-parsed and passed to `onSpan`; every
// other frame is dropped without comment. host-stream replays per-type
// snapshots on attach, but the sink is attached at boot before any span has
// been broadcast, so that replay carries nothing for us in practice.
function createSseSpanAdapter(onSpan) {
  const FRAME_SEPARATOR = '\n\n';
  const DATA_PREFIX = 'data: ';
  let pending = '';

  const isSpanFrame = (frame) =>
    frame.startsWith('event: span\n') || frame.includes('\nevent: span\n');

  // Handle one complete frame: ignore non-span frames and frames without data.
  const dispatch = (frame) => {
    if (!isSpanFrame(frame)) return;
    const dataLine = frame.split('\n').find((line) => line.startsWith(DATA_PREFIX));
    if (dataLine === undefined) return;
    try {
      onSpan(JSON.parse(dataLine.slice(DATA_PREFIX.length)));
    } catch { /* malformed frame */ }
  };

  return {
    writableEnded: false,
    destroyed: false,
    write(chunk) {
      pending += String(chunk);
      for (
        let cut = pending.indexOf(FRAME_SEPARATOR);
        cut !== -1;
        cut = pending.indexOf(FRAME_SEPARATOR)
      ) {
        const frame = pending.slice(0, cut);
        // Advance the buffer before dispatching so the frame is consumed
        // regardless of what the callback does.
        pending = pending.slice(cut + FRAME_SEPARATOR.length);
        dispatch(frame);
      }
      return true;
    },
    once() { /* no drain handling needed — in-memory adapter never backpressures */ },
    end() { this.writableEnded = true; },
  };
}
| #!/usr/bin/env bash | ||
| # prom-no-double-grafana.sh — Phase C Task C1 e2e smoke test. | ||
| # | ||
| # Verifies: | ||
| # 1. kube-prometheus-stack installs (Prometheus pod becomes Ready). | ||
| # 2. ServiceMonitor CRD is Established before Phase B charts are upgraded. | ||
| # 3. Phase B charts (Loki + Promtail + Grafana) are helm-upgraded to pick up | ||
| # serviceMonitor.enabled: true now that the CRD exists. | ||
| # 4. Exactly one Grafana Deployment is running in the cluster (no double-Grafana). | ||
| # 5. Phase B's Grafana (olam-grafana) has exactly one Prometheus datasource | ||
| # provisioned (from grafana-values.yaml datasources block added in C1). | ||
| # 6. Prometheus is scraping at least one active target. | ||
| # | ||
| # Pre-conditions: | ||
| # - kubectl context is set to a live k8s cluster. | ||
| # - Phase B e2e (loki-ingest.sh + grafana-port-forward.sh + grafana-dashboard-persistence.sh) | ||
| # has already run: olam-loki, olam-promtail, and olam-grafana releases are installed. | ||
| # - The olam-grafana-admin Secret exists (created by grafana-port-forward.sh). | ||
| # - helm, kubectl, curl, jq binaries available. | ||
| # | ||
| # Chart: prometheus-community/kube-prometheus-stack 85.2.0 (pinned; latest stable 2026-05-21). | ||
| # | ||
| # Idempotency: helm upgrade --install is idempotent; re-runs on an existing | ||
| # cluster succeed. Port-forwards are killed on exit via trap. | ||
| # | ||
| # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C1 | ||
| set -euo pipefail | ||
| NAMESPACE="monitoring" | ||
| PROM_RELEASE="olam-prom" | ||
| PROM_CHART_VERSION="85.2.0" | ||
| GRAFANA_RELEASE="olam-grafana" | ||
| GRAFANA_LOCAL_PORT="3001" # avoid collision if phase-b-e2e left a port-forward on 3000 | ||
| GRAFANA_SVC_PORT="80" | ||
| PROM_LOCAL_PORT="9090" | ||
| PF_BIND_SECONDS=5 | ||
| log() { printf '[prom-no-double-grafana] %s\n' "$*" >&2; } | ||
| fail() { printf '[prom-no-double-grafana] FAIL: %s\n' "$*" >&2; exit 1; } | ||
| # ------------------------------------------------------------------------- | ||
| # Resolve repo root so helm -f paths work regardless of invocation cwd | ||
| # ------------------------------------------------------------------------- | ||
| REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)" | ||
| # When invoked from a published @pleri/olam-cli install (no monorepo), `olam | ||
| # setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled | ||
| # peripheral-services/{helm-values,manifests} directory is reachable. | ||
| # Monorepo callers leave it unset; the script falls back to the source dir | ||
| # under packages/peripheral-services/. | ||
| if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then | ||
| PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services" | ||
| else | ||
| PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services" | ||
| fi | ||
| # ------------------------------------------------------------------------- | ||
| # Cleanup trap — kill port-forwards on exit; leave Helm releases in place | ||
| # ------------------------------------------------------------------------- | ||
| GRAFANA_PF_PID="" | ||
| PROM_PF_PID="" | ||
# cleanup — EXIT trap handler. Terminates whichever kubectl port-forward
# processes were started (PIDs recorded in GRAFANA_PF_PID / PROM_PF_PID).
# An empty PID or an already-dead process is not an error.
cleanup() {
  local pf_pid
  for pf_pid in "$GRAFANA_PF_PID" "$PROM_PF_PID"; do
    if [[ -n "$pf_pid" ]]; then
      kill "$pf_pid" 2>/dev/null || true
    fi
  done
}
| trap cleanup EXIT | ||
| # ------------------------------------------------------------------------- | ||
| # Pre-flight | ||
| # ------------------------------------------------------------------------- | ||
| command -v helm >/dev/null 2>&1 || fail "helm not installed" | ||
| command -v kubectl >/dev/null 2>&1 || fail "kubectl not installed" | ||
| command -v curl >/dev/null 2>&1 || fail "curl not installed" | ||
| command -v jq >/dev/null 2>&1 || fail "jq not installed" | ||
| kubectl cluster-info >/dev/null 2>&1 || fail "kubectl: no reachable cluster; set KUBECONFIG" | ||
| log "pre-flight checks passed" | ||
| # Verify Phase B pre-conditions | ||
| for release in olam-loki olam-promtail "$GRAFANA_RELEASE"; do | ||
| helm status "$release" -n "$NAMESPACE" >/dev/null 2>&1 \ | ||
| || fail "Phase B release '$release' not found in namespace $NAMESPACE — run phase-b-e2e first" | ||
| done | ||
| log "Phase B pre-conditions satisfied (olam-loki, olam-promtail, olam-grafana releases found)" | ||
| # ------------------------------------------------------------------------- | ||
| # Step 1: Add prometheus-community repo and install kube-prometheus-stack | ||
| # ------------------------------------------------------------------------- | ||
| helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true | ||
| helm repo update prometheus-community | ||
| log "installing prometheus-community/kube-prometheus-stack ($PROM_RELEASE) version $PROM_CHART_VERSION" | ||
| helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stack \ | ||
| --version "$PROM_CHART_VERSION" \ | ||
| --namespace "$NAMESPACE" \ | ||
| --create-namespace \ | ||
| -f "$PERIPHERAL_SERVICES_DIR/helm-values/kube-prom-stack-values.yaml" \ | ||
| --wait \ | ||
| --timeout "${OLAM_HELM_TIMEOUT:-600s}" | ||
| log "kube-prometheus-stack helm install complete" | ||
| # ------------------------------------------------------------------------- | ||
| # Step 2: Wait for ServiceMonitor CRD to be Established | ||
| # This is the gate before upgrading Phase B charts — the CRD must exist | ||
| # for serviceMonitor.enabled: true to produce a valid ServiceMonitor object. | ||
| # ------------------------------------------------------------------------- | ||
| log "waiting for ServiceMonitor CRD to be Established (60s)" | ||
| kubectl wait \ | ||
| --for=condition=established \ | ||
| crd/servicemonitors.monitoring.coreos.com \ | ||
| --timeout=60s | ||
| log "ServiceMonitor CRD Established" | ||
| # ------------------------------------------------------------------------- | ||
| # Step 3: Helm-upgrade Phase B charts to enable ServiceMonitor at RUNTIME | ||
| # | ||
| # The source-of-truth values files keep serviceMonitor.enabled: false so a | ||
| # standalone Phase B install (without kube-prometheus-stack) does not | ||
| # hard-fail with "no matches for kind ServiceMonitor". We flip the toggle | ||
| # at runtime here, AFTER the CRD is Established, via --set overrides. This | ||
| # preserves Phase B's standalone-installability invariant while wiring | ||
| # Prometheus discovery when kube-prom-stack is present. | ||
| # | ||
| # NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor) | ||
| # — chart-version-specific path. | ||
| # ------------------------------------------------------------------------- | ||
| # Chart version pins MUST match the ones in phase-b-e2e's loki-ingest.sh + | ||
| # grafana-port-forward.sh. Without --version, helm pulls latest from the repo; | ||
| # the latest charts may reference new template values not present in our | ||
| # values files (e.g., Loki 6.8.x references .Values.loki.ui.enabled which is | ||
| # nil in our 6.7.4-shaped values, producing a nil-pointer template error | ||
| # during upgrade). | ||
| LOKI_CHART_VERSION="6.7.4" | ||
| PROMTAIL_CHART_VERSION="6.16.6" | ||
| GRAFANA_CHART_VERSION="8.5.2" | ||
| log "upgrading Phase B charts with runtime --set serviceMonitor.enabled=true (pinned versions)" | ||
| helm upgrade olam-loki grafana/loki \ | ||
| --version "$LOKI_CHART_VERSION" \ | ||
| --namespace "$NAMESPACE" \ | ||
| -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \ | ||
| --wait \ | ||
| --timeout "${OLAM_HELM_TIMEOUT:-600s}" \ | ||
| --reuse-values \ | ||
| --set monitoring.serviceMonitor.enabled=true | ||
| log "olam-loki upgraded (ServiceMonitor enabled)" | ||
| helm upgrade olam-promtail grafana/promtail \ | ||
| --version "$PROMTAIL_CHART_VERSION" \ | ||
| --namespace "$NAMESPACE" \ | ||
| -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \ | ||
| --wait \ | ||
| --timeout "${OLAM_HELM_TIMEOUT:-600s}" \ | ||
| --reuse-values \ | ||
| --set serviceMonitor.enabled=true | ||
| log "olam-promtail upgraded (ServiceMonitor enabled)" | ||
| helm upgrade "$GRAFANA_RELEASE" grafana/grafana \ | ||
| --version "$GRAFANA_CHART_VERSION" \ | ||
| --namespace "$NAMESPACE" \ | ||
| -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \ | ||
| --wait \ | ||
| --timeout "${OLAM_HELM_TIMEOUT:-600s}" \ | ||
| --reuse-values \ | ||
| --set serviceMonitor.enabled=true | ||
| log "$GRAFANA_RELEASE upgraded (ServiceMonitor enabled; Prometheus datasource provisioned)" | ||
| # ------------------------------------------------------------------------- | ||
| # Step 4: Wait for Prometheus pod Ready | ||
| # ------------------------------------------------------------------------- | ||
| log "waiting for Prometheus pod Ready (300s)" | ||
| kubectl wait \ | ||
| --for=condition=ready pod \ | ||
| -l "app.kubernetes.io/name=prometheus" \ | ||
| -n "$NAMESPACE" \ | ||
| --timeout=300s | ||
| log "Prometheus pod Ready" | ||
| # ------------------------------------------------------------------------- | ||
| # Step 5: Assertion — exactly one Grafana Deployment in namespace $NAMESPACE | ||
| # This catches any regression where kube-prometheus-stack's bundled Grafana | ||
| # sub-chart accidentally gets enabled. | ||
| # ------------------------------------------------------------------------- | ||
| log "asserting exactly 1 Grafana Deployment in namespace $NAMESPACE" | ||
| GRAFANA_DEPS=$(kubectl get deployment \ | ||
| -n "$NAMESPACE" \ | ||
| -l "app.kubernetes.io/name=grafana" \ | ||
| -o name \ | ||
| | wc -l \ | ||
| | tr -d ' ') | ||
| if [ "$GRAFANA_DEPS" != "1" ]; then | ||
| log "FAIL: expected exactly 1 Grafana Deployment, found $GRAFANA_DEPS" | ||
| kubectl get deployment -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" >&2 | ||
| fail "double-Grafana detected — kube-prometheus-stack's grafana.enabled must be false" | ||
| fi | ||
| log "PASS: exactly 1 Grafana Deployment found" | ||
| # ------------------------------------------------------------------------- | ||
| # Step 6: Assertion — Grafana has exactly one Prometheus datasource | ||
| # Re-read the admin password from the Secret (grafana-port-forward.sh created it). | ||
| # Use port 3001 to avoid colliding with any live phase-b-e2e port-forward on 3000. | ||
| # ------------------------------------------------------------------------- | ||
| log "reading admin password from Secret olam-grafana-admin" | ||
| GRAFANA_ADMIN_PW=$(kubectl get secret olam-grafana-admin \ | ||
| -n "$NAMESPACE" \ | ||
| -o jsonpath='{.data.admin-password}' \ | ||
| | base64 -d) | ||
| log "port-forwarding svc/$GRAFANA_RELEASE $GRAFANA_LOCAL_PORT:$GRAFANA_SVC_PORT" | ||
| kubectl port-forward \ | ||
| -n "$NAMESPACE" \ | ||
| "svc/$GRAFANA_RELEASE" \ | ||
| "${GRAFANA_LOCAL_PORT}:${GRAFANA_SVC_PORT}" & | ||
| GRAFANA_PF_PID=$! | ||
| log "waiting ${PF_BIND_SECONDS}s for Grafana port-forward to bind" | ||
| sleep "$PF_BIND_SECONDS" | ||
| kill -0 "$GRAFANA_PF_PID" 2>/dev/null \ | ||
| || fail "Grafana port-forward process exited prematurely" | ||
| log "asserting exactly 1 Prometheus datasource in Grafana (GET /api/datasources)" | ||
| DATASOURCES=$(curl -sf \ | ||
| -u "admin:${GRAFANA_ADMIN_PW}" \ | ||
| "http://localhost:${GRAFANA_LOCAL_PORT}/api/datasources" \ | ||
| || { kubectl logs -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" --tail=30 >&2 || true | ||
| fail "GET /api/datasources failed — Grafana not reachable on port $GRAFANA_LOCAL_PORT"; }) | ||
| if ! echo "$DATASOURCES" | jq -e 'map(select(.type == "prometheus")) | length == 1' >/dev/null 2>&1; then | ||
| log "FAIL: Grafana does not have exactly 1 Prometheus datasource" | ||
| echo "$DATASOURCES" | jq . >&2 | ||
| fail "Prometheus datasource not provisioned — check datasources block in grafana-values.yaml" | ||
| fi | ||
| PROM_URL=$(echo "$DATASOURCES" | jq -r 'map(select(.type == "prometheus")) | .[0].url') | ||
| log "PASS: Grafana has exactly 1 Prometheus datasource (url=$PROM_URL)" | ||
| # ------------------------------------------------------------------------- | ||
| # Step 7: Assertion — Prometheus is scraping at least one active target | ||
| # ------------------------------------------------------------------------- | ||
| log "port-forwarding svc/prometheus-operated $PROM_LOCAL_PORT:9090" | ||
| kubectl port-forward \ | ||
| -n "$NAMESPACE" \ | ||
| "svc/prometheus-operated" \ | ||
| "${PROM_LOCAL_PORT}:9090" & | ||
| PROM_PF_PID=$! | ||
| log "waiting ${PF_BIND_SECONDS}s for Prometheus port-forward to bind" | ||
| sleep "$PF_BIND_SECONDS" | ||
| kill -0 "$PROM_PF_PID" 2>/dev/null \ | ||
| || fail "Prometheus port-forward process exited prematurely" | ||
| log "querying Prometheus /api/v1/targets for active targets" | ||
| TARGETS=$(curl -sf "http://localhost:${PROM_LOCAL_PORT}/api/v1/targets" \ | ||
| || fail "GET /api/v1/targets failed — Prometheus not reachable on port $PROM_LOCAL_PORT") | ||
| ACTIVE=$(echo "$TARGETS" | jq '.data.activeTargets | length') | ||
| if [ "$ACTIVE" -lt 1 ]; then | ||
| log "FAIL: Prometheus has 0 active scrape targets" | ||
| echo "$TARGETS" | jq '.data.activeTargets' >&2 | ||
| fail "Prometheus has no active targets — check ServiceMonitor CRD and scrapeConfig" | ||
| fi | ||
| log "PASS: $ACTIVE active scrape target(s) found in Prometheus" | ||
| # ------------------------------------------------------------------------- | ||
| # Assertion C4: Recording rules from 95-prom-recording-rules.yaml are loaded | ||
| # | ||
| # The 9[0-9]-prom-* glob in apply-manifests.sh skips this file (requires | ||
| # kube-prom-stack CRDs to exist). We kubectl apply it here, then poll | ||
| # /api/v1/rules until the olam-http-aggregations group appears. | ||
| # The port-forward on PROM_LOCAL_PORT is already open from Step 7 above. | ||
| # ------------------------------------------------------------------------- | ||
| PROM_URL="http://localhost:${PROM_LOCAL_PORT}" | ||
| log "applying 95-prom-recording-rules.yaml (skipped by apply-manifests due to 9[0-9]-prom-* filter)" | ||
| kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/95-prom-recording-rules.yaml" | ||
| # Prometheus operator reconcile + config reload can take ~60-90s (C2 lesson). | ||
| # Poll /api/v1/rules until our group appears (up to 180s). | ||
| RECORDING_RULES_TIMEOUT=180 | ||
| log "polling ${PROM_URL}/api/v1/rules for olam-http-aggregations group (up to ${RECORDING_RULES_TIMEOUT}s)" | ||
| elapsed=0 | ||
| while [ "$elapsed" -lt "$RECORDING_RULES_TIMEOUT" ]; do | ||
| if curl -sf "${PROM_URL}/api/v1/rules" 2>/dev/null \ | ||
| | jq -e '.data.groups[] | select(.name == "olam-http-aggregations") | .rules[] | select(.name == "olam:http_requests:rate5m_by_service")' >/dev/null 2>&1; then | ||
| log "PASS: olam-http-aggregations rule group loaded after ${elapsed}s" | ||
| break | ||
| fi | ||
| sleep 10 | ||
| elapsed=$((elapsed + 10)) | ||
| done | ||
| if [ "$elapsed" -ge "$RECORDING_RULES_TIMEOUT" ]; then | ||
| log "FAIL: olam-http-aggregations rule group not found in /api/v1/rules within ${RECORDING_RULES_TIMEOUT}s" | ||
| curl -sf "${PROM_URL}/api/v1/rules" | jq '.data.groups[] | .name' >&2 || true | ||
| fail "PrometheusRule not loaded by operator" | ||
| fi | ||
| # ------------------------------------------------------------------------- | ||
| # Final | ||
| # ------------------------------------------------------------------------- | ||
| log "PASS: kube-prometheus-stack installed; single Grafana confirmed; Prometheus datasource provisioned; $ACTIVE active target(s); recording rules loaded — Tasks C1+C4 verified" | ||
| exit 0 |
| // Privacy Guard — regex-based auto-redactor for trace + recovery ledger. | ||
| // | ||
| // Deep-walks an object, finds string values, applies an ordered list of | ||
| // regex patterns, returns a redacted COPY (immutable; input untouched). | ||
| // Each match is replaced with `<redacted:<kind>>`. | ||
| // | ||
| // Default-ON patterns (7): anthropic, openai, aws, gh-pat, jwt, bearer, slack. | ||
| // Opt-in (env-gated): email PII (OLAM_REDACT_PII=1), high-entropy strings | ||
| // (OLAM_REDACT_HIGH_ENTROPY=1). Hard short-circuit: OLAM_REDACTION_DISABLED=1. | ||
| // | ||
| // Precedence matters: anthropic runs before openai (otherwise the OpenAI | ||
| // `sk-...` regex would steal `sk-ant-...` and emit the wrong tag). Bearer | ||
| // runs after the high-specificity key patterns so a bearer-wrapped key | ||
| // gets the tighter tag. | ||
// Always-on patterns, applied in array order. Order is significant: the
// Anthropic pattern must run before the broader OpenAI `sk-` pattern, and
// the bearer pattern runs last so an already-redacted key keeps its tag.
const DEFAULT_PATTERNS = [
  { kind: 'anthropic-key', re: /\bsk-ant-(?:api|admin)[A-Za-z0-9_-]{20,}\b/g },
  { kind: 'openai-key', re: /\bsk-(?:proj-)?[A-Za-z0-9_-]{32,}\b/g },
  { kind: 'aws-key', re: /\bAKIA[A-Z0-9]{16}\b/g },
  { kind: 'gh-token', re: /\bgh[poursa]_[A-Za-z0-9_]{36,}\b/g },
  { kind: 'jwt', re: /\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/g },
  { kind: 'slack-token', re: /\bxox[abposr]-[A-Za-z0-9-]{10,}\b/g },
  { kind: 'bearer', re: /Bearer\s+[A-Za-z0-9._~+/-]+=*/gi, replacement: 'Bearer <redacted:bearer>' },
];
// Opt-in: applied only when OLAM_REDACT_PII=1.
const EMAIL_PATTERN = { kind: 'email', re: /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi };
// Opt-in: applied only when OLAM_REDACT_HIGH_ENTROPY=1.
// NOTE(review): unlike EMAIL_PATTERN this has no `i` flag, so only runs of
// upper-case letters, digits, `_` and `-` match — confirm that is intended.
const HIGH_ENTROPY_PATTERN = { kind: 'high-entropy', re: /\b[A-Z0-9_-]{32,}\b/g };
const HIGH_ENTROPY_ALLOWLIST = new Set(['UUID', 'CHUNK_ID', '__filename', '__dirname']);

// Apply every enabled pattern to one string and return the redacted result.
// Returns the input untouched when OLAM_REDACTION_DISABLED=1.
function redactString(s) {
  if (process.env.OLAM_REDACTION_DISABLED === '1') return s;
  let out = s;
  for (const { kind, re, replacement } of DEFAULT_PATTERNS) {
    out = out.replace(re, replacement ?? `<redacted:${kind}>`);
  }
  if (process.env.OLAM_REDACT_PII === '1') {
    out = out.replace(EMAIL_PATTERN.re, `<redacted:${EMAIL_PATTERN.kind}>`);
  }
  if (process.env.OLAM_REDACT_HIGH_ENTROPY === '1') {
    out = out.replace(HIGH_ENTROPY_PATTERN.re, (m) =>
      HIGH_ENTROPY_ALLOWLIST.has(m) || /^<redacted:/.test(m) ? m : `<redacted:${HIGH_ENTROPY_PATTERN.kind}>`,
    );
  }
  return out;
}

/**
 * Deep-walk `value`, redacting strings. Returns a new value; input is
 * never mutated. Primitives + null pass through unchanged (except strings,
 * which are run through `redactString`). Cycles produce `'<cycle>'`; an
 * object that is merely referenced from several places (no cycle) is
 * redacted at each occurrence.
 *
 * With OLAM_REDACTION_DISABLED=1 the input is returned as-is (same reference).
 *
 * @template T
 * @param {T} value
 * @returns {T}
 */
export function redactSensitive(value) {
  if (process.env.OLAM_REDACTION_DISABLED === '1') return value;
  return walk(value, new WeakSet());
}

// Recursive worker for `redactSensitive`. `ancestors` holds only the objects
// on the current path from the root, so revisiting one of them is a genuine
// cycle. Objects are removed again on the way out; otherwise a second,
// non-cyclic reference to the same object would be misreported as a cycle
// and its contents dropped from the output.
function walk(value, ancestors) {
  if (typeof value === 'string') return redactString(value);
  if (value === null || typeof value !== 'object') return value;
  if (ancestors.has(value)) return '<cycle>';
  ancestors.add(value);
  try {
    if (Array.isArray(value)) return value.map((v) => walk(v, ancestors));
    const out = {};
    for (const k of Object.keys(value)) {
      out[k] = walk(value[k], ancestors);
    }
    return out;
  } finally {
    ancestors.delete(value);
  }
}
| // Trace summary — operator triage digest over the NDJSON span trace. | ||
| // | ||
| // The NDJSON span sink (see `ndjson-span-sink.mjs`) writes one JSON line | ||
| // per span to ~/.olam/logs/host.trace.ndjson. Operators triage it today | ||
| // with hand-typed `jq` one-liners (README § Observability): "longest 5 | ||
| // spans", "all failed spans", "failure-kind tally". This module codifies | ||
| // those recipes into ONE digest so the common questions get one answer | ||
| // without remembering jq incantations. | ||
| // | ||
| // Design: | ||
| // - `summarizeSpans(spans, opts)` is PURE — no I/O. Given an array of | ||
| // parsed span records (the exact shape the sink writes) it returns a | ||
| // digest object. This is the unit-testable core. | ||
| // - `parseTrace(ndjsonText)` turns raw file bytes into { spans, skipped }. | ||
| // Malformed lines (truncated tail line, partial write mid-rotation) | ||
| // are COUNTED, never thrown — triage tooling must survive a corrupt | ||
| // line, not die on it. | ||
| // - `summarizeTraceFile(path, opts)` is the thin file-reading wrapper. | ||
| // - `formatDigest(digest)` renders a human-readable report for the CLI. | ||
| // | ||
| // Read-only + additive: this module never writes the trace, never changes | ||
| // the line schema. It only READS fields the sink already emits | ||
| // (durationMs, exit._tag, exit.reason, name, attributes.failureKind). | ||
| import { readFile } from 'node:fs/promises'; | ||
| const DEFAULT_TOP_N = 5; | ||
/**
 * Parse NDJSON trace text into spans, tolerating malformed lines.
 *
 * Blank lines are ignored. A line is counted in `skipped` (never thrown)
 * when it is not valid JSON, or when it parses to something that cannot be
 * a span record — a bare number, string, boolean, `null`, or an array. The
 * latter matters because a truncated tail line can still be valid JSON
 * (e.g. `123`), and such values must not be reported as spans.
 *
 * @param {string} text raw file contents (`null`/`undefined` is treated as empty)
 * @returns {{ spans: object[], skipped: number }}
 */
export function parseTrace(text) {
  const spans = [];
  let skipped = 0;
  if (text == null) return { spans, skipped };

  for (const rawLine of String(text).split('\n')) {
    // trim() also strips a trailing `\r` from CRLF-terminated files.
    const line = rawLine.trim();
    if (line === '') continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Truncated tail line or a partial write straddling rotation — the
      // append-only log can leave one half-line. Triage must not crash on
      // it; count and move on.
      skipped += 1;
      continue;
    }

    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      skipped += 1;
      continue;
    }
    spans.push(parsed);
  }
  return { spans, skipped };
}
// True when the span's recorded exit is tagged 'Failure'. Tolerates spans
// that are null/undefined or that carry no `exit` at all.
function isFailure(span) {
  const exitTag = span?.exit?._tag;
  return exitTag === 'Failure';
}
/**
 * Compute a triage digest over parsed spans. Pure.
 *
 * `topN` bounds both the `slowest` and the `recentFailures` lists. It is
 * coerced with `Number()`; a positive value is floored, and anything else
 * (0, negative, NaN) yields empty lists for both.
 *
 * @param {object[]} spans  parsed span records; a non-array is treated as empty
 * @param {{ topN?: number }} [opts]
 * @returns {{
 *   totalSpans: number,
 *   failures: number,
 *   successes: number,
 *   failureRate: number,
 *   slowest: object[],
 *   recentFailures: object[],
 *   failureReasons: { reason: string, count: number }[],
 *   failureKinds: { kind: string, count: number }[],
 *   byName: { name: string, count: number, failures: number, meanMs: number|null, maxMs: number|null }[],
 * }}
 */
export function summarizeSpans(spans, { topN = DEFAULT_TOP_N } = {}) {
  const list = Array.isArray(spans) ? spans : [];

  // Normalize once so both top-N lists agree. Without this, topN = 0 gave an
  // empty `slowest` but ALL failures in `recentFailures`, because
  // `slice(-0)` is `slice(0)`; negative and NaN values were equally skewed.
  const requested = Number(topN);
  const limit = requested > 0 ? Math.floor(requested) : 0;

  const totalSpans = list.length;
  const failingSpans = list.filter(isFailure);
  const failures = failingSpans.length;
  const successes = totalSpans - failures;
  const failureRate = totalSpans === 0 ? 0 : failures / totalSpans;

  // Slowest spans by durationMs. Spans with a null duration (in-flight or
  // missing endedAt) are excluded — they carry no comparable cost signal.
  const timed = list.filter((s) => typeof s?.durationMs === 'number');
  const slowest = [...timed]
    .sort((a, b) => b.durationMs - a.durationMs)
    .slice(0, limit)
    .map(projectSpan);

  // Recent failures — the trace is append-only, so the last failures in
  // file order are the most recent. Take the tail, newest first.
  const recentFailures = limit === 0
    ? []
    : failingSpans.slice(-limit).reverse().map(projectSpan);

  const failureReasons = tally(
    failingSpans,
    (s) => (s?.exit?.reason != null ? String(s.exit.reason) : '(no reason)'),
    'reason',
  );

  // failureKind is the world.lifecycle attribute the README already greps
  // for; surface it as a first-class tally regardless of span name so
  // recovery-relevant failures aggregate even when span names differ.
  const failureKinds = tally(
    list.filter((s) => s?.attributes?.failureKind != null),
    (s) => String(s.attributes.failureKind),
    'kind',
  );

  const byName = aggregateByName(list);

  return {
    totalSpans,
    failures,
    successes,
    failureRate,
    slowest,
    recentFailures,
    failureReasons,
    failureKinds,
    byName,
  };
}
/**
 * Reduce a raw span to the compact shape used in digest listings.
 *
 * Missing fields become `null`; `durationMs` and `startedAt` are kept only
 * when they are numbers, and a present `exit.reason` is stringified.
 *
 * @param {object} s
 * @returns {{ name: any, traceId: any, spanId: any, durationMs: number|null, startedAt: number|null, reason: string|null }}
 */
function projectSpan(s) {
  const numberOrNull = (value) => (typeof value === 'number' ? value : null);
  const rawReason = s?.exit?.reason;
  return {
    name: s?.name ?? null,
    traceId: s?.traceId ?? null,
    spanId: s?.spanId ?? null,
    durationMs: numberOrNull(s?.durationMs),
    startedAt: numberOrNull(s?.startedAt),
    reason: rawReason != null ? String(rawReason) : null,
  };
}
// Count how many spans map to each key produced by `keyFn`, emitting one
// `{ count, [label]: key }` row per distinct key. The caller picks the name
// of the key field (`reason` for failure reasons, `kind` for failure
// kinds). Rows come back ordered by count, highest first, so the dominant
// cause leads.
function tally(spans, keyFn, label) {
  const occurrences = new Map();
  for (const span of spans) {
    const bucket = keyFn(span);
    const seenSoFar = occurrences.get(bucket) ?? 0;
    occurrences.set(bucket, seenSoFar + 1);
  }
  const rows = Array.from(occurrences, ([key, count]) => ({ count, [label]: key }));
  rows.sort((left, right) => right.count - left.count);
  return rows;
}
/**
 * Roll spans up by name: how many there were, how many failed, and the
 * mean and maximum duration over the spans that carry a numeric
 * `durationMs`. Spans without a name are pooled under '(unnamed)'.
 * Rows are ordered by count, busiest first.
 */
function aggregateByName(spans) {
  const buckets = new Map();
  for (const span of spans) {
    const label = span?.name != null ? String(span.name) : '(unnamed)';
    if (!buckets.has(label)) {
      buckets.set(label, { name: label, count: 0, failures: 0, durSum: 0, durCount: 0, maxMs: null });
    }
    const bucket = buckets.get(label);
    bucket.count += 1;
    if (isFailure(span)) bucket.failures += 1;

    const duration = span?.durationMs;
    if (typeof duration !== 'number') continue;
    bucket.durSum += duration;
    bucket.durCount += 1;
    bucket.maxMs = bucket.maxMs === null ? duration : Math.max(bucket.maxMs, duration);
  }

  const rows = [];
  for (const { name, count, failures, durSum, durCount, maxMs } of buckets.values()) {
    // Mean is null (not 0) when no span in the group had a usable duration.
    const meanMs = durCount === 0 ? null : durSum / durCount;
    rows.push({ name, count, failures, meanMs, maxMs });
  }
  rows.sort((left, right) => right.count - left.count);
  return rows;
}
/**
 * Load a trace file from disk and produce its digest.
 *
 * A file that does not exist (ENOENT) is not an error: the result is the
 * empty digest flagged `missing: true`, so an operator who has not
 * generated any spans yet sees a clean zero-state. Every other read error
 * is rethrown unchanged.
 *
 * @param {string} path
 * @param {{ topN?: number }} [opts]
 * @returns {Promise<object>} the digest plus `skipped` and `missing` fields
 */
export async function summarizeTraceFile(path, opts = {}) {
  let contents;
  try {
    contents = await readFile(path, 'utf8');
  } catch (err) {
    if (err?.code !== 'ENOENT') throw err;
    return { ...summarizeSpans([], opts), skipped: 0, missing: true };
  }
  const parsed = parseTrace(contents);
  return { ...summarizeSpans(parsed.spans, opts), skipped: parsed.skipped, missing: false };
}
/**
 * Format a millisecond duration for the plain-text report.
 *
 * @param {number|null|undefined} ms
 * @returns {string} '—' for a missing value, seconds with two decimals
 *   (e.g. '1.50s') once the value rounds to 1000 ms or more, otherwise
 *   whole milliseconds (e.g. '12ms').
 */
function fmtMs(ms) {
  if (ms == null) return '—';
  // Pick the unit from the ROUNDED value: a fractional duration such as
  // 999.6 (means are fractional) would otherwise print as "1000ms".
  const wholeMs = Math.round(ms);
  if (wholeMs >= 1000) return `${(ms / 1000).toFixed(2)}s`;
  return `${wholeMs}ms`;
}
/**
 * Turn a digest into the plain-text report printed by the CLI.
 *
 * The report is a header line, a one-line totals summary, then one section
 * per non-empty list in the digest (slowest, recent failures, failure
 * kinds, failure reasons, per-name aggregates). A digest flagged `missing`
 * yields only the header and a "nothing recorded" note.
 *
 * @param {ReturnType<typeof summarizeSpans> & { skipped?: number, missing?: boolean, path?: string }} digest
 * @returns {string}
 */
export function formatDigest(digest) {
  const header = `Trace summary${digest.path ? ` (${digest.path})` : ''}`;
  if (digest.missing) {
    return [header, ' no trace file yet — nothing recorded.'].join('\n');
  }

  const out = [header];
  // Optional " [traceId]" suffix shared by the span listings.
  const traceTag = (entry) => (entry.traceId ? ` [${entry.traceId}]` : '');
  // Append a blank separator, a section title, then the section rows.
  const addSection = (title, rows) => {
    out.push('', title);
    for (const row of rows) out.push(row);
  };

  const pct = (digest.failureRate * 100).toFixed(1);
  const skippedNote = digest.skipped ? ` · ${digest.skipped} malformed line(s) skipped` : '';
  out.push(
    ` ${digest.totalSpans} spans · ${digest.failures} failed (${pct}%) · ${digest.successes} ok` + skippedNote,
  );

  if (digest.slowest.length) {
    addSection(
      `Top ${digest.slowest.length} slowest:`,
      digest.slowest.map(
        (s) => ` ${fmtMs(s.durationMs).padStart(7)} ${s.name ?? '(unnamed)'}${traceTag(s)}`,
      ),
    );
  }

  if (digest.recentFailures.length) {
    addSection(
      `Recent failures (${digest.recentFailures.length}):`,
      digest.recentFailures.map(
        (f) => ` ${f.name ?? '(unnamed)'}: ${f.reason ?? '(no reason)'}${traceTag(f)}`,
      ),
    );
  }

  if (digest.failureKinds.length) {
    addSection(
      'Failure kinds:',
      digest.failureKinds.map((k) => ` ${String(k.count).padStart(4)} ${k.kind}`),
    );
  }

  if (digest.failureReasons.length) {
    addSection(
      'Failure reasons:',
      digest.failureReasons.map((r) => ` ${String(r.count).padStart(4)} ${r.reason}`),
    );
  }

  if (digest.byName.length) {
    addSection(
      'By span name (count · failures · mean · max):',
      digest.byName.map(
        (n) =>
          ` ${String(n.count).padStart(5)} · ${String(n.failures).padStart(4)}f · ${fmtMs(n.meanMs).padStart(7)} · ${fmtMs(n.maxMs).padStart(7)} ${n.name}`,
      ),
    );
  }

  return out.join('\n');
}
| # Grafana Helm values — k3s-ingress-observability Phase B Task B2 | ||
| # | ||
| # STANDALONE grafana/grafana chart per OQ-p3-4 + Decision 16. | ||
| # - This is NOT the Grafana bundled with kube-prometheus-stack. | ||
| # - Phase C kube-prometheus-stack MUST set `grafana.enabled: false` | ||
| # explicitly to prevent a second Grafana Deployment from landing. | ||
| # - Port-forward only — NEVER expose via Traefik IngressRoute. | ||
| # See T7 in DESIGN.md: secret exfil mitigated by no ingress surface. | ||
| # | ||
| # Chart: grafana/grafana; pinned to 8.5.2 (latest stable as of 2026-05-20). | ||
| # Upgrade discipline: chart version is embedded in the e2e script comment. | ||
| # ------------------------------------------------------------------------- | ||
| # Admin credentials — loaded from a pre-existing Secret, NOT from chart | ||
| # values. Secret is created by scripts/e2e/grafana-port-forward.sh before | ||
| # helm install, or by the operator following the procedure in | ||
| # packages/peripheral-services/manifests/README.md (§ "Grafana admin secret"). | ||
| # The placeholder manifest (70-grafana-secret.yaml) was removed 2026-05-21 | ||
| # (dogfood finding #4) because `kubectl apply` would overwrite the operator's | ||
| # pre-created Secret with the placeholder value. | ||
| # ------------------------------------------------------------------------- | ||
| admin: | ||
| existingSecret: olam-grafana-admin | ||
| userKey: admin-user | ||
| passwordKey: admin-password | ||
| # ------------------------------------------------------------------------- | ||
| # Service: ClusterIP only. | ||
| # Decision 16: port-forward only; never ingress-routed. | ||
| # Access: `kubectl port-forward -n monitoring svc/olam-grafana 3000:80` | ||
| # ------------------------------------------------------------------------- | ||
| service: | ||
| type: ClusterIP | ||
| port: 80 | ||
| # ------------------------------------------------------------------------- | ||
| # Ingress: disabled. | ||
| # Decision 16 + OQ-p3-4: Grafana is never exposed via Traefik IngressRoute. | ||
| # Port-forward is the sole operator access path. Enabling ingress here would | ||
| # silently violate the access-control intent even if no IngressRoute manifest | ||
| # is committed. | ||
| # ------------------------------------------------------------------------- | ||
| ingress: | ||
| enabled: false # Decision 16: port-forward only; never ingress-routed | ||
| # ------------------------------------------------------------------------- | ||
| # Datasources: Loki (default) + Prometheus (added in Phase C Task C1). | ||
| # | ||
| # Dual-chart pattern: | ||
| # - kube-prometheus-stack (C1) provides Prometheus. Its bundled Grafana | ||
| # sub-chart is disabled (grafana.enabled: false in kube-prom-stack-values.yaml). | ||
| # - This standalone grafana/grafana chart (Phase B) is the only Grafana. | ||
| # - The Prometheus datasource URL points at `prometheus-operated`, which is | ||
| # the in-cluster Service that kube-prometheus-stack's Prometheus Operator | ||
| # creates for the managed Prometheus StatefulSet. | ||
| # - timeInterval: 15s matches the scrape interval in kube-prom-stack-values.yaml | ||
| # so Grafana's step calculation aligns with actual data granularity. | ||
| # - exemplarTraceIdDestinations.datasourceUid: tempo is harmless until Phase D | ||
| # adds Tempo; Grafana silently ignores unknown datasource UIDs. | ||
| # | ||
| # editable: false prevents accidental operator drift across sessions. | ||
| # ------------------------------------------------------------------------- | ||
| datasources: | ||
| datasources.yaml: | ||
| apiVersion: 1 | ||
| datasources: | ||
| - name: Loki | ||
| type: loki | ||
| access: proxy | ||
| url: http://olam-loki.monitoring.svc.cluster.local:3100 | ||
| isDefault: true | ||
| editable: false | ||
| - name: Prometheus | ||
| type: prometheus | ||
| access: proxy | ||
| url: http://prometheus-operated.monitoring.svc.cluster.local:9090 | ||
| isDefault: false | ||
| editable: false | ||
| jsonData: | ||
| timeInterval: 15s # matches scrape interval in kube-prom-stack-values.yaml | ||
| exemplarTraceIdDestinations: | ||
| - name: trace_id | ||
| datasourceUid: tempo # Phase D may add Tempo; harmless until then | ||
| # ------------------------------------------------------------------------- | ||
| # Dashboard provisioner: file-based ConfigMap mount. | ||
| # B3 lands the olam-dashboards ConfigMap and the actual JSON files. | ||
| # B2 wires the loader so B3's ConfigMap is picked up automatically. | ||
| # ------------------------------------------------------------------------- | ||
| dashboardProviders: | ||
| dashboardproviders.yaml: | ||
| apiVersion: 1 | ||
| providers: | ||
| - name: olam-default | ||
| orgId: 1 | ||
| folder: 'Olam' | ||
| type: file | ||
| disableDeletion: true | ||
| updateIntervalSeconds: 30 | ||
| allowUiUpdates: false | ||
| options: | ||
| path: /var/lib/grafana/dashboards/olam-default | ||
| # Wire the volume mount — B3 creates this ConfigMap with the actual JSON. | ||
| # Grafana will warn "ConfigMap olam-dashboards not found" until B3 lands; | ||
| # this is benign and does not block Grafana startup. | ||
| dashboardsConfigMaps: | ||
| olam-default: olam-dashboards # B3 creates this ConfigMap | ||
| # ------------------------------------------------------------------------- | ||
| # Resources: tuned for single-operator k3s (<256Mi idle typical). | ||
| # P2 acceptance criterion: <500MB idle / <1GB typical across full LGTM stack. | ||
| # ------------------------------------------------------------------------- | ||
| resources: | ||
| requests: | ||
| cpu: 50m | ||
| memory: 128Mi | ||
| limits: | ||
| cpu: 200m | ||
| memory: 256Mi # P2: keeps Grafana within its share of the LGTM RAM budget | ||
| # ------------------------------------------------------------------------- | ||
| # Persistence: disabled for Phase B. | ||
| # Grafana state (dashboards, users) lives in ConfigMaps / values files. | ||
| # Phase C may enable a PV if fine-grained alert state or annotations | ||
| # accumulate. For now, stateless Grafana is simpler and matches S2. | ||
| # ------------------------------------------------------------------------- | ||
| persistence: | ||
| enabled: false # S2: ConfigMap-mounted dashboards; no PV needed in Phase B | ||
| # ------------------------------------------------------------------------- | ||
| # ServiceMonitor: Phase C Prometheus scrapes Grafana's /metrics endpoint. | ||
| # Disabled in Phase B: the ServiceMonitor CRD (monitoring.coreos.com/v1) is | ||
| # shipped by kube-prometheus-stack in Phase C. The earlier "enable now to | ||
| # avoid a Phase C helm upgrade" rationale was wrong — Phase C will need a | ||
| # helm upgrade anyway to wire Prometheus scrape targets. Flipping this on | ||
| # pre-CRD breaks the install on chart versions that hard-validate. | ||
| # ------------------------------------------------------------------------- | ||
| serviceMonitor: | ||
| # Disabled in the source-of-truth values file so a standalone Phase B install | ||
| # (without kube-prometheus-stack) does not hard-fail when the CRD is absent. | ||
| # The C1 e2e script flips this on at RUNTIME via | ||
| # helm upgrade ... --reuse-values --set serviceMonitor.enabled=true | ||
| # AFTER kube-prom-stack has installed the ServiceMonitor CRD. | ||
| enabled: false | ||
| # ------------------------------------------------------------------------- | ||
| # Grafana.ini overrides: anonymous access disabled (default); only | ||
| # setting the server root_url so port-forward URLs render correctly | ||
| # in email / share links (cosmetic; not a security seam). | ||
| # ------------------------------------------------------------------------- | ||
| grafana.ini: | ||
| server: | ||
| root_url: "%(protocol)s://%(domain)s:%(http_port)s/" | ||
| analytics: | ||
| reporting_enabled: false # no telemetry to grafana.com | ||
| check_for_updates: false | ||
| security: | ||
| allow_embedding: false |
| # kube-prometheus-stack Helm values — k3s-ingress-observability Phase C Task C1 | ||
| # | ||
| # Chart: prometheus-community/kube-prometheus-stack; pinned to 85.2.0 | ||
| # (latest stable as of 2026-05-21). | ||
| # Upgrade discipline: pin in this file + e2e script comment must stay in sync. | ||
| # | ||
| # CRITICAL: grafana.enabled MUST stay false. | ||
| # Phase B ships a standalone grafana/grafana chart (olam-grafana release). | ||
| # kube-prometheus-stack's bundled Grafana sub-chart is disabled to prevent | ||
| # a second Grafana Deployment from landing in the cluster. | ||
| # Decision 16 + OQ-p3-4: Phase B's standalone Grafana is canonical. | ||
| # Enabling the sub-chart here would violate that decision and create two | ||
| # Grafana instances — caught by prom-no-double-grafana.sh's single-Grafana | ||
| # assertion. | ||
| # | ||
| # Resource budget summary (Phase C contribution to P2 target <500MB idle / <1GB typical): | ||
| # prometheus-operator: 128Mi req / 512Mi limit | ||
| # prometheus: 512Mi req / 2Gi limit | ||
| # node-exporter: 64Mi req / 128Mi limit | ||
| # kube-state-metrics: 128Mi req / 256Mi limit | ||
| # Total C1 addition: ~832Mi req / ~3Gi limit (spread across nodes) | ||
| # | ||
| # Retention policy (Decision 14): scrape 15s / retention 15d / size cap 10GiB. | ||
| # The size cap (T10 TSDB corruption mitigation) is the hard guard; retention 15d | ||
| # is advisory — the size cap enforces first. | ||
| # | ||
| # Alertmanager: shipped disabled in C1; enabled as of C2, which lands the first | ||
| # alert rule (OlamActiveSeriesHigh, cardinality 80k) — see `alertmanager.enabled: true` | ||
| # below. No receivers are configured in this file. | ||
| # ------------------------------------------------------------------------- | ||
| # CARDINALITY ENFORCEMENT — Task C2 (T1 cardinality bomb / P4 <100k active series) | ||
| # | ||
| # Goal: strip high-cardinality labels (world_id, trace_id, user_id, | ||
| # request_id, operator_id) from every scraped series BEFORE TSDB ingest. | ||
| # | ||
| # Architecture finding (helm template verified, 2026-05-21): | ||
| # The prometheus-operator Prometheus CR has NO global metricRelabelConfigs | ||
| # field. The Prometheus CR spec exposes only per-ServiceMonitor endpoint | ||
| # metricRelabelings. There is no chart-level "apply to all scrapes" slot. | ||
| # | ||
| # Enforcement strategy (two-layer): | ||
| # Layer 1 — chart-managed ServiceMonitors: set metricRelabelings on every | ||
| # ServiceMonitor the chart controls (coreDns, prometheusOperator, | ||
| # prometheus self-scrape, node-exporter). Belt-and-suspenders; these | ||
| # services don't emit world_id etc. in practice, but the rule is free. | ||
| # Note: kube-state-metrics sub-chart has no metricRelabelings slot in | ||
| # its prometheus.monitor section at chart version 85.2.0 — omitted. | ||
| # Layer 2 — user-deployed ServiceMonitors: the cardinality-drop.sh e2e | ||
| # script's synthetic violator ServiceMonitor carries the same labeldrop | ||
| # rule (release: olam-prom label + metricRelabelings). New services | ||
| # MUST include the same block — enforced by docs + code review. | ||
| # | ||
| # Why labeldrop is the right action: | ||
| # action: labeldrop removes the matched labels from ALL series that carry | ||
| # them, regardless of metric name. This is the same semantic as Promtail's | ||
| # pipeline drop stages (promtail-values.yaml) — both layers stay in sync. | ||
| # world_id surfaces in dashboards via EXEMPLARS (Decision 9), not labels. | ||
| # | ||
| # Regex covers all five taxonomy labels from observability-label-taxonomy: | ||
| # world_id, trace_id, user_id, request_id, operator_id | ||
| # ------------------------------------------------------------------------- | ||
| _cardinalityLabeldrop: &cardinality-labeldrop | ||
| - action: labeldrop | ||
| regex: 'world_id|trace_id|user_id|request_id|operator_id' | ||
| # ------------------------------------------------------------------------- | ||
| # HARD REQUIREMENT: grafana sub-chart is off. | ||
| # See top-of-file comment for rationale. | ||
| # ------------------------------------------------------------------------- | ||
| grafana: | ||
| enabled: false # HARD: Decision 16 + OQ-p3-4 — standalone Grafana (olam-grafana) is canonical | ||
| # ------------------------------------------------------------------------- | ||
| # Alertmanager: enabled as of C2 (first alert rule, OlamActiveSeriesHigh). | ||
| # History: C1 shipped without alertmanager; C2 enabled it when the first alert rule landed. | ||
| # ------------------------------------------------------------------------- | ||
| alertmanager: | ||
| enabled: true # C2: first alert rule (OlamActiveSeriesHigh) lands; alertmanager enabled | ||
| serviceMonitor: | ||
| metricRelabelings: *cardinality-labeldrop | ||
| # ------------------------------------------------------------------------- | ||
| # Default kube-controller-manager / scheduler / proxy / etcd monitors. | ||
| # These ServiceMonitors don't work on k3d/k3s because the endpoints are not | ||
| # exposed via the usual ports. Disabling avoids noisy "endpoint not found" | ||
| # warnings and scrape failures on every Prometheus eval cycle. | ||
| # ------------------------------------------------------------------------- | ||
| kubeControllerManager: | ||
| enabled: false | ||
| kubeScheduler: | ||
| enabled: false | ||
| kubeProxy: | ||
| enabled: false | ||
| kubeEtcd: | ||
| enabled: false | ||
| # kube-apiserver and kubelet DO work on k3d but generate high-cardinality | ||
| # label combinations. Disable for now; re-evaluate when per-service /metrics | ||
| # (C3) and cardinality enforcement (C2) are in place. | ||
| kubeApiServer: | ||
| enabled: false | ||
| kubelet: | ||
| enabled: false | ||
| # ------------------------------------------------------------------------- | ||
| # Default alerting rules: off. | ||
| # The bundled default rules generate Alertmanager receivers and PrometheusRule | ||
| # objects for kubelet, etcd, apiserver, etc. — most don't fire on k3d anyway | ||
| # and add noise before C2's focused cardinality rule lands. | ||
| # C2 will add targeted PrometheusRule objects separately. | ||
| # ------------------------------------------------------------------------- | ||
| defaultRules: | ||
| create: false | ||
| # ------------------------------------------------------------------------- | ||
| # coreDns — ServiceMonitor with labeldrop (Layer 1 cardinality enforcement) | ||
| # ------------------------------------------------------------------------- | ||
| coreDns: | ||
| serviceMonitor: | ||
| metricRelabelings: *cardinality-labeldrop | ||
| # ------------------------------------------------------------------------- | ||
| # CRDs: install via chart (default: true, explicit for clarity). | ||
| # These CRDs (ServiceMonitor, PodMonitor, PrometheusRule, etc.) are required | ||
| # before Phase B's loki/promtail/grafana charts can have serviceMonitor.enabled:true. | ||
| # Phase C's e2e script waits for servicemonitors.monitoring.coreos.com to be | ||
| # Established before helm-upgrading the Phase B charts. | ||
| # ------------------------------------------------------------------------- | ||
| crds: | ||
| enabled: true | ||
| # ------------------------------------------------------------------------- | ||
| # Prometheus Operator | ||
| # ------------------------------------------------------------------------- | ||
| prometheusOperator: | ||
| enabled: true | ||
| serviceMonitor: | ||
| metricRelabelings: *cardinality-labeldrop | ||
| resources: | ||
| requests: | ||
| cpu: 100m | ||
| memory: 128Mi | ||
| limits: | ||
| cpu: 500m | ||
| memory: 512Mi | ||
| # ------------------------------------------------------------------------- | ||
| # Prometheus core — Decision 14: scrape 15s / retention 15d / 10GiB cap | ||
| # ------------------------------------------------------------------------- | ||
| prometheus: | ||
| serviceMonitor: | ||
| metricRelabelings: *cardinality-labeldrop | ||
| prometheusSpec: | ||
| scrapeInterval: 15s # Decision 14 | ||
| evaluationInterval: 15s | ||
| retention: 15d # Decision 14 — advisory; size cap enforces first | ||
| retentionSize: 10GiB # Decision 14 — T10 TSDB corruption prevention | ||
| walCompression: true | ||
| enableAdminAPI: false # security: admin API allows snapshot deletion + series deletion | ||
| enableRemoteWriteReceiver: false # not a remote-write target; no inbound writes | ||
| logLevel: warn # info is noisy at 15s scrape cycle | ||
| resources: | ||
| requests: | ||
| cpu: 200m | ||
| memory: 512Mi | ||
| limits: | ||
| cpu: 1000m | ||
| memory: 2Gi | ||
| # PersistentVolume for TSDB. 12Gi = 10GiB retention cap + ~20% headroom. | ||
| # local-path provisioner is used on k3d; cloud providers use their default SC. | ||
| storageSpec: | ||
| volumeClaimTemplate: | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| resources: | ||
| requests: | ||
| storage: 12Gi # 10GiB retention + 20% headroom for in-flight segments | ||
| # ------------------------------------------------------------------------- | ||
| # Node exporter — keep enabled (host-level metrics: CPU, memory, disk, net). | ||
| # ------------------------------------------------------------------------- | ||
| nodeExporter: | ||
| enabled: true | ||
| prometheus-node-exporter: | ||
| prometheus: | ||
| monitor: | ||
| metricRelabelings: *cardinality-labeldrop | ||
| resources: | ||
| requests: | ||
| cpu: 30m | ||
| memory: 64Mi | ||
| limits: | ||
| cpu: 100m | ||
| memory: 128Mi | ||
| # ------------------------------------------------------------------------- | ||
| # kube-state-metrics — keep enabled (k8s-level metrics: pod phases, deployments). | ||
| # ------------------------------------------------------------------------- | ||
| kubeStateMetrics: | ||
| enabled: true | ||
| kube-state-metrics: | ||
| resources: | ||
| requests: | ||
| cpu: 50m | ||
| memory: 128Mi | ||
| limits: | ||
| cpu: 200m | ||
| memory: 256Mi | ||
| # ------------------------------------------------------------------------- | ||
| # Datasource auto-discovery note: | ||
| # kube-prometheus-stack's grafana.sidecar.datasources is N/A (grafana sub-chart | ||
| # is off). Phase B's standalone Grafana (grafana-values.yaml) has been updated | ||
| # in this same C1 PR to include a Prometheus datasource entry pointing at: | ||
| # http://prometheus-operated.monitoring.svc.cluster.local:9090 | ||
| # This is the in-cluster Service that kube-prometheus-stack creates for the | ||
| # Prometheus StatefulSet (created by the Prometheus Operator from the | ||
| # Prometheus CR above). | ||
| # ------------------------------------------------------------------------- |
| # Kyverno Helm values — k3s-ingress-observability Phase C C8 follow-up. | ||
| # | ||
| # Kyverno is the policy-as-code layer for cluster-wide cardinality | ||
| # enforcement (closes codex's C2 concern on PR #783). The companion | ||
| # ClusterPolicy in | ||
| # `packages/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml` | ||
| # mutates every incoming ServiceMonitor and PodMonitor to inject the | ||
| # labeldrop rule before the object is persisted — so a third-party | ||
| # chart (or hand-rolled object) cannot bypass the layer-2 | ||
| # per-ServiceMonitor enforcement landed in C2. | ||
| # | ||
| # Chart: kyverno/kyverno; pinned to 3.8.1 (app v1.18.1, 2026-05-21 latest stable). | ||
| # Upgrade discipline: this pin AND the helm-install line in | ||
| # `scripts/e2e/kyverno-cardinality-mutate.sh` must stay in sync. | ||
| # | ||
| # Footprint posture (single-operator k3s scale): | ||
| # We only run admission-time mutation. The ClusterPolicy uses | ||
| # `spec.background: false`, so the background-scan controller is | ||
| # unused. Cleanup + reports controllers are also dead weight for | ||
| # a single ClusterPolicy with no PolicyExceptions — they're disabled | ||
| # so Kyverno's pod count stays minimal (1 pod, not 4). | ||
| # | ||
| # Footprint (Phase C C8 contribution to P2 target <500MB idle / <1GB typical): | ||
| # admissionController: 256Mi req / 512Mi limit (chart default 128Mi/384Mi) | ||
| # Total addition: ~256Mi req / ~512Mi limit | ||
| # | ||
| # If/when we want policy reports populated for observability dashboards, | ||
| # flip `reportsController.enabled: true` and the `features.policyReports` | ||
| # block below. Same for cleanup. | ||
| # | ||
| # Resource limits — tuned upward from chart default for admission webhook | ||
| # stability under burst churn (kube-prom-stack ships ~10 ServiceMonitors at | ||
| # once during `helm upgrade`, which arrives as a burst of AdmissionReviews). | ||
| # ------------------------------------------------------------------------- | ||
| # Disable controllers we don't need | ||
| # ------------------------------------------------------------------------- | ||
| backgroundController: | ||
| enabled: false # ClusterPolicy is admission-only (background: false) | ||
| cleanupController: | ||
| enabled: false # no CleanupPolicy objects in this repo | ||
| reportsController: | ||
| enabled: false # no policy-reports surface wired into Grafana yet | ||
| # ------------------------------------------------------------------------- | ||
| # Features — admissionReports + policyReports remain ON inside the | ||
| # admission controller itself even when the standalone reports controller | ||
| # is disabled. This keeps `kubectl get clusterpolicyreport` queryable | ||
| # during dogfood; the reports controller would only AGGREGATE them | ||
| # cluster-wide, which we don't need yet. | ||
| # ------------------------------------------------------------------------- | ||
| features: | ||
| admissionReports: | ||
| enabled: true | ||
| policyReports: | ||
| enabled: true | ||
| # Background scan is N/A — the policy uses background: false. Explicit | ||
| # off avoids the controller scheduling unnecessary scan workers even | ||
| # when the controller pod is disabled above. | ||
| backgroundScan: | ||
| enabled: false | ||
| # Logging volume defaults are fine; level 2 = info-ish. | ||
| logging: | ||
| format: text | ||
| verbosity: 2 | ||
| # ------------------------------------------------------------------------- | ||
| # Admission controller — the only pod we run. | ||
| # ------------------------------------------------------------------------- | ||
| admissionController: | ||
| replicas: 1 # single-operator k3s scale; HA is N/A for dogfood | ||
| rbac: | ||
| create: true # ClusterPolicy needs cluster-wide watch on ServiceMonitor + PodMonitor | ||
| container: | ||
| resources: | ||
| requests: | ||
| cpu: 100m | ||
| memory: 256Mi | ||
| limits: | ||
| cpu: 500m | ||
| memory: 512Mi |
| # Loki Helm values — k3s-ingress-observability Phase B Task B1 | ||
| # | ||
| # Single-binary mode (Decision-16 + Phase B scope): | ||
| # Distributed mode (microservices) adds 5+ independent Deployments + a Minio | ||
| # or S3 backend for object storage — pure overhead for a single-operator | ||
| # k3s install where Loki's write throughput is bounded by one Promtail | ||
| # DaemonSet and a handful of containers. SingleBinary collapses all roles | ||
| # (ingester, querier, compactor) into one Pod, fits within the <500MB idle | ||
| # LGTM RAM target (P2), and is trivially replaceable if scale demands change. | ||
| # | ||
| # See: docs/plans/k3s-ingress-observability/DESIGN.md (P2, S2) | ||
| # | ||
| # Chart: grafana/loki; pinned to 6.7.4 (latest stable as of 2026-05-20). | ||
| # Upgrade discipline: chart version is embedded in the e2e script comment. | ||
| deploymentMode: SingleBinary | ||
| loki: | ||
| auth_enabled: false # single-tenant; multi-tenancy adds header overhead with no benefit here | ||
| commonConfig: | ||
| replication_factor: 1 # single-binary; no replicas = no cross-replica consistency needed | ||
| # ------------------------------------------------------------------------- | ||
| # Storage backend: filesystem (tsdb index, schema v13; local PV). | ||
| # Object storage (S3/GCS/MinIO) deferred to fatbox multi-org Phase F+. | ||
| # For single-operator k3s, local PV is simpler and sufficient. | ||
| # ------------------------------------------------------------------------- | ||
| storage: | ||
| type: filesystem | ||
| schemaConfig: | ||
| configs: | ||
| - from: "2024-01-01" | ||
| store: tsdb | ||
| object_store: filesystem | ||
| schema: v13 | ||
| index: | ||
| prefix: loki_index_ | ||
| period: 24h | ||
| # ------------------------------------------------------------------------- | ||
| # Retention: 7 days (168h) per Performance budget acceptance criterion #6. | ||
| # compactor.retention_enabled enables deletion; ring config required for | ||
| # single-binary mode. | ||
| # ------------------------------------------------------------------------- | ||
| limits_config: | ||
| retention_period: 168h # 7 days | ||
| ingestion_rate_mb: 4 # per-tenant ingestion cap (single tenant) | ||
| ingestion_burst_size_mb: 8 | ||
| max_query_series: 5000 # cap log-derived queries from going wide (P3 <3s p95) | ||
| max_entries_limit_per_query: 5000 | ||
| compactor: | ||
| retention_enabled: true | ||
| delete_request_store: filesystem | ||
| compaction_interval: 10m | ||
| working_directory: /var/loki/compactor | ||
| ingester: | ||
| chunk_idle_period: 30m # flush to storage; appropriate for low write rate | ||
| chunk_retain_period: 1m | ||
| max_chunk_age: 2h | ||
| # Self-metrics endpoint — Phase C Prometheus scrapes this. | ||
| # Server block exposed on port 3100 (default); /metrics is always available. | ||
| singleBinary: | ||
| replicas: 1 | ||
| # ------------------------------------------------------------------------- | ||
| # Persistence: 10Gi PV. | ||
| # | ||
| # Rationale: 7-day retention at olam scale (<500 containers, access logs | ||
| # estimated 1–2MB/day compressed) → ~100MB typical stored, well within the | ||
| # <1GB typical-usage target of acceptance criterion #6. 10Gi is ~100x the | ||
| # typical footprint (10x the 1GB target), leaving headroom for burst (failed | ||
| # deploy loops, chatty containers). Cloud provider default SC is fine; on | ||
| # bare-metal k3s the local-path provisioner is used. | ||
| # ------------------------------------------------------------------------- | ||
| persistence: | ||
| enabled: true | ||
| size: 10Gi # ~100× the 7-day typical (~100MB); 10× the <1GB usage target per AC#6 | ||
| # ------------------------------------------------------------------------- | ||
| # Resources: memory limit 512Mi per task spec. | ||
| # Typical usage at olam scale: <200MB idle (tsdb index + block cache). | ||
| # 512Mi limit prevents compaction spikes from triggering OOM on the node. | ||
| # ------------------------------------------------------------------------- | ||
| resources: | ||
| requests: | ||
| cpu: 100m | ||
| memory: 128Mi | ||
| limits: | ||
| cpu: 500m | ||
| memory: 512Mi # P2: <500MB idle / <1GB typical; limit prevents spike OOM | ||
| # ------------------------------------------------------------------------- | ||
| # Self-metrics for Phase C Prometheus scrape. | ||
| # The ServiceMonitor is declared here but left disabled (see the note on | ||
| # serviceMonitor.enabled below); it is switched on at runtime in Phase C. | ||
| # ------------------------------------------------------------------------- | ||
| monitoring: | ||
| selfMonitoring: | ||
| enabled: false # disables the bundled GrafanaAgent sub-chart dependency | ||
| grafanaAgent: | ||
| installOperator: false | ||
| serviceMonitor: | ||
| # Disabled in the source-of-truth values file so a standalone Phase B install | ||
| # (without kube-prometheus-stack) does not hard-fail when the CRD is absent. | ||
| # The C1 e2e script flips this on at RUNTIME via | ||
| # helm upgrade ... --reuse-values --set monitoring.serviceMonitor.enabled=true | ||
| # AFTER kube-prom-stack has installed the ServiceMonitor CRD. | ||
| # NOTE: Loki chart 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor). | ||
| enabled: false | ||
| # ------------------------------------------------------------------------- | ||
| # Backend, read and write targets: disabled for SingleBinary mode. | ||
| # These are simple-scalable-mode components and must be off or the chart | ||
| # emits validation errors when deploymentMode=SingleBinary. | ||
| # ------------------------------------------------------------------------- | ||
| backend: | ||
| replicas: 0 | ||
| read: | ||
| replicas: 0 | ||
| write: | ||
| replicas: 0 | ||
| # Grafana agent / canary: not needed; disable to keep resource footprint minimal. | ||
| lokiCanary: | ||
| enabled: false | ||
| test: | ||
| enabled: false | ||
| # ------------------------------------------------------------------------- | ||
| # Sub-component slimming — chart 6.7.4 defaults include nginx gateway + | ||
| # two Memcached clusters + minio + sidecar watchers that single-binary | ||
| # mode doesn't need. Each adds image-pull and Ready-wait time. Disabling | ||
| # all of them brings the install Ready-time within the harness budget. | ||
| # If a future scenario needs query-result caching, re-evaluate | ||
| # resultsCache specifically. | ||
| # ------------------------------------------------------------------------- | ||
| # nginx routing front; Promtail writes direct to single-binary :3100 | ||
| gateway: | ||
| enabled: false | ||
| # Memcached cluster — overhead for single-binary | ||
| chunksCache: | ||
| enabled: false | ||
| # second Memcached cluster — overhead for single-binary | ||
| resultsCache: | ||
| enabled: false | ||
| # minio is off because storage.type=filesystem, but be explicit | ||
| minio: | ||
| enabled: false | ||
| # Sidecar that watches ConfigMaps for runtime config reloads — we don't ship one. | ||
| sidecar: | ||
| rules: | ||
| enabled: false | ||
| datasources: | ||
| enabled: false | ||
| configs: | ||
| enabled: false |
| # Promtail Helm values — Phase A Task A5 staging (Phase B consumes) | ||
| # | ||
| # Tails every container's stdout; ships to Loki single-binary (Phase B installs Loki). | ||
| # Per OQ-p3-6: Traefik native config can redact HEADERS but NOT URL query params — | ||
| # query-param scrubbing for `?token=`, `?code=`, `?access_token=`, `?state=` happens | ||
| # HERE at Promtail ingest via pipeline_stages.replace regex. | ||
| # | ||
| # Resource limits per OQ-p3-37 (Promtail OOM risk under chatty container-cp 100ms cadence): | ||
| # - memory limit 256Mi | ||
| # - pipeline_stages.limit rate 100 lines/sec/stream | ||
| # | ||
| # Scrape config matches every pod log; namespace-scope labels are added so Loki LogQL queries | ||
| # can filter by service / namespace / pod. | ||
| # | ||
| # SECURITY NOTE — replace stage regex semantics (load-bearing): | ||
| # Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches. | ||
| # The `replace` field is a Go text/template string; `${1}` is NOT valid Go | ||
| # template syntax and silently becomes a literal. The correct pattern is: | ||
| # expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part | ||
| # replace: 'REDACTED' — replace captured secret with literal | ||
| # See promtail-values.yaml header comment for full details. | ||
| deploymentMode: DaemonSet | ||
| resources: | ||
| requests: | ||
| cpu: 50m | ||
| memory: 64Mi | ||
| limits: | ||
| cpu: 200m | ||
| memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory | ||
| config: | ||
| clients: | ||
| - url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push | ||
| snippets: | ||
| pipelineStages: | ||
| # 1. Parse JSON access logs from Traefik (key field present in JSON line) | ||
| - match: | ||
| selector: '{container="traefik"}' | ||
| stages: | ||
| - json: | ||
| expressions: | ||
| request_method: RequestMethod | ||
| request_path: RequestPath | ||
| status: DownstreamStatus | ||
| request_id: requestId | ||
| service: ServiceName | ||
| router: RouterName | ||
| # 2. Scrub OAuth/token values from URL query params and Authorization headers. | ||
| # | ||
| # IMPORTANT — capture group semantics: | ||
| # The replace stage replaces each CAPTURE GROUP with the `replace` template | ||
| # value. Capture groups must wrap ONLY the secret value, not the surrounding | ||
| # context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so | ||
| # it is preserved in the output while only the secret is replaced. | ||
| - replace: | ||
| # OAuth code= callback values — capture only the token value after `code=` | ||
| expression: '(?:\?|&)code=([^&\s]+)' | ||
| replace: 'REDACTED' | ||
| - replace: | ||
| # Bearer / access tokens in query strings — capture only the value | ||
| expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)' | ||
| replace: 'REDACTED' | ||
| - replace: | ||
| # OAuth state param (may carry session info) — capture only the value | ||
| expression: '(?:\?|&)state=([^&\s]+)' | ||
| replace: 'REDACTED' | ||
| - replace: | ||
| # Authorization header Bearer value — capture only the token after `Bearer ` | ||
| expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)' | ||
| replace: 'REDACTED' | ||
| # 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37) | ||
| - limit: | ||
| rate: 100 # max log lines/sec per stream | ||
| burst: 200 | ||
| drop: true # drop excess lines; do NOT block tail | ||
| # 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance) | ||
| - labels: | ||
| service: # from Traefik JSON access log; matches taxonomy `service` label | ||
| router: # Traefik router name | ||
| status: # HTTP status code (within taxonomy) | ||
| # Retention is configured on Loki side (Phase B), not Promtail. | ||
| # Sample retention target: 7 days per Performance budget Row. | ||
| serviceMonitor: | ||
| enabled: true # Prometheus (Phase C) scrapes Promtail's own /metrics for self-observability |
| # Promtail Helm values — k3s-ingress-observability Phase B Task B1 (production) | ||
| # | ||
| # Production Promtail values. Staging copy at promtail-staging.yaml has the | ||
| # same scrubbing pipeline shape; this file sets the Loki client URL + | ||
| # production resource limits. | ||
| # | ||
| # Scrubbing pipeline: | ||
| # - 4 `replace` stages: code=, token/access_token/api_key/secret=, state=, Authorization | ||
| # - `limit` stage: rate=100/burst=200/drop=true (OQ-p3-37: Promtail OOM under chatty containers) | ||
| # Client URL: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push | ||
| # Service name `olam-loki` is the Helm release name used in scripts/e2e/loki-ingest.sh | ||
| # (`helm upgrade --install olam-loki grafana/loki ...`); the chart's Service | ||
| # is named after the release, so `olam-loki` is the in-cluster DNS hostname. | ||
| # | ||
| # SECURITY NOTE — replace stage regex semantics (load-bearing): | ||
| # Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches. | ||
| # The `replace` field is a Go text/template string; `${1}` is NOT valid Go | ||
| # template syntax and silently becomes a literal. The correct pattern is: | ||
| # expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part | ||
| # replace: 'REDACTED' — replace captured secret with literal | ||
| # This leaves the surrounding context (e.g. `?code=`) intact and redacts only | ||
| # the value. The broken pattern `(\?|&)code=[^&\s]+` with `replace: '${1}code=REDACTED'` | ||
| # was the root cause of the Phase B scrubbing regression (PR #776). | ||
| # | ||
| # See: docs/plans/k3s-ingress-observability/DESIGN.md (T8, T9) | ||
| deploymentMode: DaemonSet | ||
| resources: | ||
| requests: | ||
| cpu: 50m | ||
| memory: 64Mi | ||
| limits: | ||
| cpu: 200m | ||
| memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory | ||
| config: | ||
| clients: | ||
| - url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push | ||
| snippets: | ||
| pipelineStages: | ||
| # 1. Parse JSON access logs from Traefik (key field present in JSON line) | ||
| - match: | ||
| selector: '{container="traefik"}' | ||
| stages: | ||
| - json: | ||
| expressions: | ||
| request_method: RequestMethod | ||
| request_path: RequestPath | ||
| status: DownstreamStatus | ||
| request_id: requestId | ||
| service: ServiceName | ||
| router: RouterName | ||
| # 2. Scrub OAuth/token values from URL query params and Authorization headers. | ||
| # | ||
| # IMPORTANT — capture group semantics: | ||
| # The replace stage replaces each CAPTURE GROUP with the `replace` template | ||
| # value. Capture groups must wrap ONLY the secret value, not the surrounding | ||
| # context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so | ||
| # it is preserved in the output while only the secret is replaced. | ||
| - replace: | ||
| # OAuth code= callback values — capture only the token value after `code=` | ||
| expression: '(?:\?|&)code=([^&\s]+)' | ||
| replace: 'REDACTED' | ||
| - replace: | ||
| # Bearer / access tokens in query strings — capture only the value | ||
| expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)' | ||
| replace: 'REDACTED' | ||
| - replace: | ||
| # OAuth state param (may carry session info) — capture only the value | ||
| expression: '(?:\?|&)state=([^&\s]+)' | ||
| replace: 'REDACTED' | ||
| - replace: | ||
| # Authorization header Bearer value — capture only the token after `Bearer ` | ||
| expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)' | ||
| replace: 'REDACTED' | ||
| # 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37) | ||
| - limit: | ||
| rate: 100 # max log lines/sec per stream | ||
| burst: 200 | ||
| drop: true # drop excess lines; do NOT block tail | ||
| # 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance) | ||
| - labels: | ||
| service: # from Traefik JSON access log; matches taxonomy `service` label | ||
| router: # Traefik router name | ||
| status: # HTTP status code (within taxonomy) | ||
| # Retention is configured on Loki side (loki-values.yaml: 7 days / 168h). | ||
| serviceMonitor: | ||
| # Disabled in the source-of-truth values file so a standalone Phase B install | ||
| # (without kube-prometheus-stack) does not hard-fail with | ||
| # "no matches for kind ServiceMonitor in version monitoring.coreos.com/v1". | ||
| # The C1 e2e script flips this on at RUNTIME via | ||
| # helm upgrade ... --reuse-values --set serviceMonitor.enabled=true | ||
| # AFTER kube-prom-stack has installed the ServiceMonitor CRD. Source-of-truth | ||
| # stays standalone-friendly; runtime override wires Prometheus discovery. | ||
| enabled: false |
| # Traefik Helm values — k3s-ingress-observability Phase A Task A3 | ||
| # Pinned NodePort 30080 per OQ-p3-7 (world hooks bake this URL). | ||
| # Structured JSON access logs ready for Phase A Task A5 + Phase B Promtail pickup. | ||
| deployment: | ||
| replicas: 1 # SPOF mitigation = host systemd watchdog (Phase A Task A11), not HA replicas | ||
| ports: | ||
| web: | ||
| port: 8000 | ||
| expose: | ||
| default: true | ||
| exposedPort: 80 | ||
| nodePort: 30080 # PIN (OQ-p3-7); world hooks reach via host.docker.internal:30080 | ||
| protocol: TCP | ||
| websecure: | ||
| port: 8443 | ||
| expose: | ||
| default: true | ||
| exposedPort: 443 | ||
| nodePort: 30443 | ||
| protocol: TCP | ||
| # v1: HTTPS deferred to fatbox multi-org (Out-of-scope of this plan); TLS not configured. | ||
| service: | ||
| type: NodePort | ||
| # Structured access logs to stdout — Promtail picks up in Phase B. | ||
| # Authorization header redaction here; URL query-param scrubbing happens | ||
| # at Promtail pipeline_stages.replace per OQ-p3-6 (Traefik can't scrub query params natively). | ||
| logs: | ||
| general: | ||
| level: INFO | ||
| format: json | ||
| access: | ||
| enabled: true | ||
| format: json | ||
| fields: | ||
| headers: | ||
| defaultMode: keep | ||
| names: | ||
| Authorization: redact | ||
| Cookie: redact | ||
| # Built-in /metrics for Phase C Prometheus scrape | ||
| metrics: | ||
| prometheus: | ||
| enabled: true | ||
| addEntryPointsLabels: true | ||
| addRoutersLabels: true | ||
| addServicesLabels: true | ||
| # Dashboard disabled in cluster — operator uses Grafana (Phase B) | ||
| ingressRoute: | ||
| dashboard: | ||
| enabled: false | ||
| # IngressRoute CRD enabled | ||
| providers: | ||
| kubernetesCRD: | ||
| enabled: true | ||
| allowCrossNamespace: false # explicit; matches namespace-isolation strategy from A1 | ||
| kubernetesIngress: | ||
| enabled: false # CRD-only; vanilla Ingress not supported in this stack | ||
| # Resource bounds — observability stack target <500MB RAM idle (P2) | ||
| resources: | ||
| requests: | ||
| cpu: 100m | ||
| memory: 64Mi | ||
| limits: | ||
| cpu: 500m | ||
| memory: 256Mi |
| # Namespace for k3s-ingress-observability peripheral services | ||
| # (Traefik installs to kube-system; observability stack to monitoring; this is for IngressRoute CRDs targeting olam services) | ||
| apiVersion: v1 | ||
| kind: Namespace | ||
| metadata: | ||
| name: olam |
| # 24-deploy-kg-service.yaml — kg-service Service + Deployment for local k3s dogfood. | ||
| # | ||
| # Bridges the gap between Phase C's ServiceMonitor (92-servicemonitor-kg-service.yaml) | ||
| # and a running service. The ServiceMonitor targets namespace `olam`, | ||
| # label `app: olam-kg-service`, port name `http` — this manifest satisfies that | ||
| # contract so Prometheus can scrape kg-service's /metrics endpoint. | ||
| # | ||
| # Canonical per-service manifest tree: packages/host-cp/k8s/manifests/kg-service/ | ||
| # This file is the "peripheral-services entry point" view — it folds the | ||
| # Service and Deployment, together with their ServiceAccount, Role/RoleBinding, | ||
| # ConfigMap and PVC, into a single file for `kubectl apply -f manifests/` convenience. | ||
| # | ||
| # Secrets prerequisite: operator MUST create `olam-kg-service-secret` in the | ||
| # `olam` namespace BEFORE applying this manifest. See README.md § Secrets. | ||
| # | ||
| # Image: pinned to sha256 digest (not :latest) per T4 threat model. | ||
| # Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158. | ||
| # To update: | ||
| # TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-kg-service:pull&service=ghcr.io" | jq -r .token) | ||
| # curl -sI -H "Authorization: Bearer $TOKEN" \ | ||
| # -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \ | ||
| # https://ghcr.io/v2/pleri/olam-kg-service/manifests/<tag> | grep docker-content-digest | ||
| # | ||
| # Memory: bge-small-en-v1.5 ONNX model is pre-cached in the image (~90 MB). | ||
| # Container needs ≥512Mi to load the model + serve requests. Limit set to 1Gi. | ||
| # | ||
| # Apply-manifests.sh: this file is SKIPPED by the phase-a-e2e harness | ||
| # (apply-manifests.sh skip-list includes 2[3-4]-deploy-*) because the | ||
| # harness cluster has no operator secrets or kg-data PVC. | ||
| # Operator-side `kubectl apply -f manifests/` applies it. | ||
| --- | ||
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| app.kubernetes.io/managed-by: olam | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| app.kubernetes.io/managed-by: olam | ||
| rules: | ||
| - apiGroups: ["apps"] | ||
| resources: ["deployments"] | ||
| resourceNames: ["olam-kg-service"] | ||
| verbs: ["get", "patch", "watch"] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| app.kubernetes.io/managed-by: olam | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| roleRef: | ||
| kind: Role | ||
| name: olam-kg-service | ||
| apiGroup: rbac.authorization.k8s.io | ||
| --- | ||
| # ConfigMap — non-sensitive env vars. | ||
| # Sensitive values (OLAM_KG_BEARER_TOKEN) live in `olam-kg-service-secret`. | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-kg-service-env | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| app.kubernetes.io/managed-by: olam | ||
| data: | ||
| # Port kg-service listens on — must match Service targetPort below. | ||
| OLAM_KG_SERVICE_PORT: "9997" | ||
| # CRITICAL: kg-service defaults to 127.0.0.1 bind. In k8s the readiness | ||
| # probe hits the pod IP, so 127.0.0.1-only listener causes probe failures. | ||
| # Force all-interfaces bind without requiring an image rebuild. | ||
| OLAM_KG_SERVICE_BIND: "0.0.0.0" | ||
| # Data directory — backed by the PVC mounted at /data. | ||
| OLAM_KG_DATA_PATH: "/data/kg" | ||
| # Auth-service URL — cluster-internal DNS (olam namespace). | ||
| OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999" | ||
| --- | ||
| # PersistentVolumeClaim — backs /data (KG index + savings telemetry). | ||
| # 10Gi: graph index grows with codebase size. See kg-service/45-pvc.yaml rationale. | ||
| # local-path StorageClass ships with k3d. Substitute for non-k3d clusters. | ||
| apiVersion: v1 | ||
| kind: PersistentVolumeClaim | ||
| metadata: | ||
| name: olam-kg-data | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| app.kubernetes.io/managed-by: olam | ||
| spec: | ||
| accessModes: | ||
| - ReadWriteOnce | ||
| storageClassName: local-path | ||
| resources: | ||
| requests: | ||
| storage: 10Gi | ||
| --- | ||
| apiVersion: apps/v1 | ||
| kind: Deployment | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| app: olam-kg-service | ||
| app.kubernetes.io/managed-by: olam | ||
| spec: | ||
| replicas: 1 | ||
| strategy: | ||
| type: RollingUpdate | ||
| rollingUpdate: | ||
| maxSurge: 1 | ||
| maxUnavailable: 0 | ||
| selector: | ||
| matchLabels: | ||
| app: olam-kg-service | ||
| template: | ||
| metadata: | ||
| labels: | ||
| app: olam-kg-service | ||
| spec: | ||
| # Disable k8s automatic Service env injection. | ||
| # Without this, k8s injects OLAM_KG_SERVICE_PORT as "tcp://..." which | ||
| # breaks Python's int() parse of the port env var. | ||
| enableServiceLinks: false | ||
| imagePullSecrets: | ||
| - name: ghcr-pull | ||
| serviceAccountName: olam-kg-service | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| runAsGroup: 1000 | ||
| fsGroup: 1000 | ||
| initContainers: | ||
| - name: chown-data | ||
| # busybox:1.36 — sha256-pinned per T4 threat model. | ||
| image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsUser: 0 | ||
| runAsNonRoot: false | ||
| allowPrivilegeEscalation: false | ||
| command: ["chown", "-R", "1000:1000", "/data"] | ||
| volumeMounts: | ||
| - name: kg-data | ||
| mountPath: /data | ||
| containers: | ||
| - name: olam-kg-service | ||
| # Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158 | ||
| # Run `npm run refresh:manifest-digests` to update. | ||
| image: ghcr.io/pleri/olam-kg-service@sha256:72030f3054315e7ebf575f6dcb9b4965e1ddee13ea7bfdeb0bde32253beeb1c7 | ||
| imagePullPolicy: IfNotPresent | ||
| securityContext: | ||
| runAsNonRoot: true | ||
| runAsUser: 1000 | ||
| readOnlyRootFilesystem: true | ||
| allowPrivilegeEscalation: false | ||
| capabilities: | ||
| drop: ["ALL"] | ||
| ports: | ||
| # CRITICAL: port name `http` must match ServiceMonitor | ||
| # 92-servicemonitor-kg-service.yaml endpoints[0].port. | ||
| - name: http | ||
| containerPort: 9997 | ||
| protocol: TCP | ||
| envFrom: | ||
| - configMapRef: | ||
| name: olam-kg-service-env | ||
| - secretRef: | ||
| name: olam-kg-service-secret | ||
| volumeMounts: | ||
| - name: kg-data | ||
| mountPath: /data | ||
| - name: tmp | ||
| mountPath: /tmp | ||
| readinessProbe: | ||
| # kg-service returns {"ok":true,"ready":true} once bge-small model loads. | ||
| # initialDelaySeconds 30 gives the model warmup thread time to complete. | ||
| httpGet: | ||
| path: /health | ||
| port: 9997 | ||
| initialDelaySeconds: 30 | ||
| periodSeconds: 5 | ||
| timeoutSeconds: 3 | ||
| failureThreshold: 12 | ||
| livenessProbe: | ||
| httpGet: | ||
| path: /health | ||
| port: 9997 | ||
| initialDelaySeconds: 60 | ||
| periodSeconds: 20 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 3 | ||
| resources: | ||
| requests: | ||
| cpu: "100m" | ||
| # bge-small ONNX model requires ~400Mi at runtime; 512Mi is the | ||
| # minimum viable request. Set higher if OOM-killed on first classify. | ||
| memory: "512Mi" | ||
| limits: | ||
| cpu: "1000m" | ||
| # 1Gi: bge-small model (~90 MB on disk, ~400Mi loaded) + index cache + request headroom. | ||
| memory: "1Gi" | ||
| volumes: | ||
| - name: kg-data | ||
| persistentVolumeClaim: | ||
| claimName: olam-kg-data | ||
| - name: tmp | ||
| emptyDir: {} | ||
| --- | ||
| # Service — exposes kg-service to the cluster. | ||
| # CRITICAL: `name: http` matches 92-servicemonitor-kg-service.yaml endpoints[0].port. | ||
| # Namespace `olam` matches ServiceMonitor's namespaceSelector.matchNames. | ||
| apiVersion: v1 | ||
| kind: Service | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| labels: | ||
| # CRITICAL: matches 92-servicemonitor-kg-service.yaml spec.selector.matchLabels. | ||
| app: olam-kg-service | ||
| app.kubernetes.io/managed-by: olam | ||
| spec: | ||
| type: ClusterIP | ||
| selector: | ||
| app: olam-kg-service | ||
| ports: | ||
| # CRITICAL: name `http` matches ServiceMonitor endpoints[0].port. | ||
| - name: http | ||
| port: 9997 | ||
| targetPort: 9997 | ||
| protocol: TCP |
| # IngressRoute — host-cp (bare /api/* per Decision 3 hybrid routing) | ||
| # host-cp preserves 50+ existing SPA fetch sites at /api/* (no strip-prefix). | ||
| apiVersion: traefik.io/v1alpha1 | ||
| kind: IngressRoute | ||
| metadata: | ||
| name: olam-host-cp | ||
| namespace: olam | ||
| spec: | ||
| entryPoints: | ||
| - web | ||
| routes: | ||
| # host-cp is the catch-all (per Decision 3 hybrid routing); explicit low priority | ||
| # so service-prefix routes (kg, agent-memory, etc.) win when their longer prefix matches. | ||
| # Default Traefik priority is rule-string length; OR'd rules inflate the host-cp aggregate | ||
| # ABOVE more-specific PathPrefix matches, causing /api/kg/* to land on host-cp incorrectly. | ||
| # Explicit priority avoids the silent precedence bug (caught in PR #736 live-validation). | ||
| - match: PathPrefix(`/api/`) || PathPrefix(`/session/`) || PathPrefix(`/v1/`) || Path(`/health`) | ||
| kind: Rule | ||
| priority: 10 | ||
| services: | ||
| - name: olam-host-cp | ||
| port: 19000 |
| # IngressRoute — kg-service via /api/kg/* strip-prefix (Decision 3 new-services pattern) | ||
| apiVersion: traefik.io/v1alpha1 | ||
| kind: Middleware | ||
| metadata: | ||
| name: strip-api-kg | ||
| namespace: olam | ||
| spec: | ||
| stripPrefix: | ||
| prefixes: | ||
| - /api/kg | ||
| --- | ||
| apiVersion: traefik.io/v1alpha1 | ||
| kind: IngressRoute | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: olam | ||
| spec: | ||
| entryPoints: | ||
| - web | ||
| routes: | ||
| # Priority 100 > host-cp's 10 so /api/kg/* wins over host-cp's catch-all /api/*. | ||
| - match: PathPrefix(`/api/kg/`) | ||
| kind: Rule | ||
| priority: 100 | ||
| services: | ||
| - name: olam-kg-service | ||
| port: 9997 | ||
| middlewares: | ||
| - name: strip-api-kg |
| # IngressRoute — agent-memory via /api/agent-memory/* strip-prefix (Decision 3 new-services pattern) | ||
| apiVersion: traefik.io/v1alpha1 | ||
| kind: Middleware | ||
| metadata: | ||
| name: strip-api-agent-memory | ||
| namespace: olam | ||
| spec: | ||
| stripPrefix: | ||
| prefixes: | ||
| - /api/agent-memory | ||
| --- | ||
| apiVersion: traefik.io/v1alpha1 | ||
| kind: IngressRoute | ||
| metadata: | ||
| name: olam-agent-memory | ||
| namespace: olam | ||
| spec: | ||
| entryPoints: | ||
| - web | ||
| routes: | ||
| # Priority 100 > host-cp's 10 so /api/agent-memory/* wins over host-cp's catch-all /api/*. | ||
| - match: PathPrefix(`/api/agent-memory/`) | ||
| kind: Rule | ||
| priority: 100 | ||
| services: | ||
| - name: olam-memory-service | ||
| port: 3111 # Real memory-service listen port (per packages/memory-service/src/worker.ts:206 + AGENTMEMORY_HOST_INTERNAL_URL in container.ts:101). Pass-1 plan said 3112 (incorrect); A6 corrects to 3111. | ||
| middlewares: | ||
| - name: strip-api-agent-memory |
| # NetworkPolicy — olam namespace ingress fence (Phase A Task A9) | ||
| # | ||
| # Defense-in-depth: even if a world agent escapes its container or steals a | ||
| # bearer token, NetworkPolicy ensures it can only reach olam services via the | ||
| # Traefik ingress path (which enforces bearer auth on world-originated calls | ||
| # per A6 — see packages/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml). | ||
| # Direct pod-to-pod access bypassing ingress is denied. | ||
| # | ||
| # Enforcement matrix — two separate enforcement paths exist; the comment below | ||
| # previously conflated them (corrected 2026-05-21, see dogfood incident finding #2): | ||
| # | ||
| # k3d/k3s with --disable-network-policy=false (production k3s default): | ||
| # k3s ships a built-in NetworkPolicy controller that enforces NetworkPolicies | ||
| # via iptables rules, INDEPENDENT of the CNI. Flannel itself does not enforce, | ||
| # but the k3s controller does. Result: NetworkPolicies ARE enforced even on | ||
| # default Flannel k3s/k3d clusters — this is what the operator's colima+k3d | ||
| # dogfood cluster experienced (the fence was live despite using Flannel). | ||
| # | ||
| # k3d/k3s with --disable-network-policy=true (this harness — cluster-up.sh): | ||
| # The harness explicitly passes --k3s-arg '--disable-network-policy@server:*' | ||
| # to disable the k3s built-in controller. With the controller off, enforcement | ||
| # depends entirely on the CNI: Flannel = no enforcement; Calico = enforced. | ||
| # The harness uses Calico precisely so tests exercise real enforcement. | ||
| # | ||
| # Production k3s (default, no --disable-network-policy): | ||
| # Controller-enforced via iptables unless the operator explicitly disables it. | ||
| # | ||
| # See docs/architecture/networkpolicy-fence.md for the full environment matrix | ||
| # and docs/incidents/2026-05-21-phase-c-dogfood.md (finding #2) for the live | ||
| # evidence that k3s' bundled controller enforces on Flannel clusters. | ||
| # | ||
| # Threat mitigated: T6 (world→host SSRF via unauthenticated ingress route). | ||
| # Companion mitigations (do not remove A6 + A9 together): bearer auth (A6), | ||
| # 127.0.0.1 bind on host-cp + kube-apiserver (OS-level, separate from k8s). | ||
| apiVersion: networking.k8s.io/v1 | ||
| kind: NetworkPolicy | ||
| metadata: | ||
| name: olam-ingress-fence | ||
| namespace: olam | ||
| labels: | ||
| app.kubernetes.io/part-of: olam | ||
| app.kubernetes.io/component: security-fence | ||
| olam.io/phase: a | ||
| olam.io/task: a9 | ||
| spec: | ||
| # Selects every pod in the olam namespace. Intra-namespace traffic is allowed | ||
| # explicitly below (second rule) so olam services can call each other; other | ||
| # cross-namespace and external traffic must traverse Traefik (first rule), | ||
| # except Prometheus scrapes from the monitoring namespace (third rule). | ||
| podSelector: {} | ||
| policyTypes: | ||
| - Ingress | ||
| ingress: | ||
| # Allow inbound from Traefik (canonical ingress path). The label selector | ||
| # matches the standard Helm-chart label that k3s' bundled Traefik install | ||
| # sets (`app.kubernetes.io/name: traefik`); also matched by the upstream | ||
| # `traefik/traefik` chart used by Phase A Task A3. | ||
| - from: | ||
| - namespaceSelector: | ||
| matchLabels: | ||
| kubernetes.io/metadata.name: kube-system | ||
| podSelector: | ||
| matchLabels: | ||
| app.kubernetes.io/name: traefik | ||
| # Allow intra-namespace pod-to-pod traffic — olam services may call each | ||
| # other directly (host-cp → kg-service, etc.) without round-tripping | ||
| # through Traefik. Audit log on world-originated calls still fires at the | ||
| # bearer-auth layer (A6), so this allowance does not weaken T6 mitigation. | ||
| - from: | ||
| - podSelector: {} | ||
| # Allow inbound from the monitoring namespace — Phase C's Prometheus | ||
| # (kube-prometheus-stack) scrapes pod IPs directly for /metrics | ||
| # collection. Without this rule, ServiceMonitor targets in `olam` ns | ||
| # appear "up" but yield 0 samples (the scrape connection silently fails | ||
| # at CNI level on enforcing CNIs). Surfaced during 2026-05-21 operator | ||
| # dogfood — see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #2. | ||
| # Scope: monitoring → olam ingress only (not the reverse direction). | ||
| - from: | ||
| - namespaceSelector: | ||
| matchLabels: | ||
| kubernetes.io/metadata.name: monitoring |
| # NetworkPolicy — monitoring namespace default-deny + same-namespace allow | ||
| # (Phase A Task A9; companion to 60-networkpolicy-ingress.yaml) | ||
| # | ||
| # Loki + Prometheus + Grafana accept inbound ONLY from pods in the same | ||
| # `monitoring` namespace (intra-stack: Promtail → Loki, Grafana → Loki + Prom, | ||
| # kube-prometheus-stack scrape targets within the stack). Cross-namespace | ||
| # traffic — including from `olam` (host-cp, kg-service, agent-memory) and | ||
| # kube-system (Traefik) — is denied. | ||
| # | ||
| # Operator access pattern is `kubectl port-forward -n monitoring svc/grafana | ||
| # 3000` (Decision 16). port-forward is tunneled through the kube-apiserver and | ||
| # the node's kubelet directly into the target pod, NOT over pod-to-pod | ||
| # networking, so it bypasses NetworkPolicy by design. | ||
| # | ||
| # Decision 17 forbids any IngressRoute / Ingress that exposes Loki / Prom / | ||
| # Grafana from outside the cluster; audit:no-ingress-route enforces that at | ||
| # commit time, and this NetworkPolicy is the runtime defense-in-depth layer | ||
| # (caught even if the audit is bypassed or a Helm chart renders a route). | ||
| # | ||
| # Forward-declaration note: Loki + Prometheus land in Phase B/C. Until those | ||
| # manifests add pods to the `monitoring` namespace, this policy applies to an | ||
| # empty pod set and is a no-op. Phase B/C pods need no extra labels: the | ||
| # policy's empty podSelector matches every pod in the namespace, and the | ||
| # ingress rule keys off the namespace label `kubernetes.io/metadata.name: | ||
| # monitoring` (carried by the Namespace object), not off pod labels. | ||
| # | ||
| # Enforcement requires NetworkPolicy-capable CNI (see 60-* doc block). | ||
| # Threat mitigated: T7 (Grafana admin secret exfil) + secondary T6 mitigation. | ||
| --- | ||
| # Forward-declare the monitoring namespace so the NetworkPolicy below has a | ||
| # valid target. Phase B/C kube-prometheus-stack installs into this namespace | ||
| # and may add labels — its install MUST NOT delete the namespace; Helm uses | ||
| # `--create-namespace=false` once this manifest seeds it. | ||
| apiVersion: v1 | ||
| kind: Namespace | ||
| metadata: | ||
| name: monitoring | ||
| labels: | ||
| kubernetes.io/metadata.name: monitoring | ||
| app.kubernetes.io/part-of: olam-observability | ||
| olam.io/phase: a | ||
| olam.io/task: a9 | ||
| --- | ||
| apiVersion: networking.k8s.io/v1 | ||
| kind: NetworkPolicy | ||
| metadata: | ||
| name: monitoring-default-deny | ||
| namespace: monitoring | ||
| labels: | ||
| app.kubernetes.io/part-of: olam-observability | ||
| app.kubernetes.io/component: security-fence | ||
| olam.io/phase: a | ||
| olam.io/task: a9 | ||
| spec: | ||
| # Selects every pod in the monitoring namespace. Phase B/C pods (loki, | ||
| # prometheus, grafana, promtail, alertmanager — whatever the chart renders) | ||
| # all match this empty selector automatically. | ||
| podSelector: {} | ||
| policyTypes: | ||
| - Ingress | ||
| ingress: | ||
| # Allow inbound only from same-namespace pods. Cross-namespace traffic | ||
| # (olam services, kube-system Traefik, default ns) is denied — see header | ||
| # for why this is the correct posture (operator uses kubectl port-forward, | ||
| # which is tunneled via the kube-apiserver and kubelet rather than the pod | ||
| # network, and so is not subject to NetworkPolicy). | ||
| - from: | ||
| - namespaceSelector: | ||
| matchLabels: | ||
| kubernetes.io/metadata.name: monitoring |
| # ---------------------------------------------------------------------------- | ||
| # GENERATED FILE — DO NOT EDIT DIRECTLY | ||
| # | ||
| # Source: packages/peripheral-services/grafana-dashboards/*.json | ||
| # Regenerate: packages/peripheral-services/scripts/sync-grafana-dashboards.sh | ||
| # | ||
| # This ConfigMap is consumed by the grafana/grafana Helm chart via | ||
| # dashboardsConfigMaps.olam-default: olam-dashboards | ||
| # as wired in packages/peripheral-services/helm-values/grafana-values.yaml. | ||
| # | ||
| # Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B3 | ||
| # ---------------------------------------------------------------------------- | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: olam-dashboards | ||
| namespace: monitoring | ||
| labels: | ||
| app.kubernetes.io/name: grafana | ||
| app.kubernetes.io/managed-by: olam | ||
| grafana_dashboard: "1" | ||
| data: | ||
| host-cp.json: | | ||
| { | ||
| "uid": "host-cp", | ||
| "title": "Host-CP — Service Drill-in", | ||
| "description": "Per-route SLIs for host-cp. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. Use the route dropdown to scope a single route or view all. The world_id variable is forwarded from olam-home for context.", | ||
| "tags": ["olam", "drill-in", "phase-c", "host-cp"], | ||
| "timezone": "browser", | ||
| "refresh": "30s", | ||
| "schemaVersion": 39, | ||
| "version": 1, | ||
| "time": { | ||
| "from": "now-1h", | ||
| "to": "now" | ||
| }, | ||
| "timepicker": {}, | ||
| "templating": { | ||
| "list": [ | ||
| { | ||
| "name": "world_id", | ||
| "label": "World", | ||
| "type": "query", | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "query": { | ||
| "qryType": 2, | ||
| "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))", | ||
| "step": "" | ||
| }, | ||
| "refresh": 2, | ||
| "sort": 1, | ||
| "multi": false, | ||
| "includeAll": true, | ||
| "allValue": ".+", | ||
| "current": { "selected": false, "text": "All", "value": "$__all" } | ||
| }, | ||
| { | ||
| "name": "route", | ||
| "label": "Route", | ||
| "type": "query", | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"host-cp\"}, route)", | ||
| "refresh": 2, | ||
| "sort": 1, | ||
| "multi": true, | ||
| "includeAll": true, | ||
| "allValue": ".+", | ||
| "current": { "selected": false, "text": "All", "value": "$__all" } | ||
| } | ||
| ] | ||
| }, | ||
| "annotations": { | ||
| "list": [] | ||
| }, | ||
| "panels": [ | ||
| { | ||
| "id": 1, | ||
| "type": "timeseries", | ||
| "title": "Request rate by route", | ||
| "description": "Requests per second for each host-cp route over the last 5 minutes (pre-computed by C4 recording rule). Spikes indicate traffic surges; a route going to zero indicates it stopped receiving traffic.", | ||
| "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_requests:rate5m_by_service_route{service=\"host-cp\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 2, | ||
| "type": "timeseries", | ||
| "title": "5xx error rate by route", | ||
| "description": "5xx responses per second per host-cp route (C4 recording rule). A non-zero value on a route warrants investigation. Correlate with the error ratio panel below to understand severity relative to total traffic.", | ||
| "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_errors:rate5m_by_service_route{service=\"host-cp\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 3, | ||
| "type": "timeseries", | ||
| "title": "Latency p50 by route", | ||
| "description": "Median (p50) request duration per host-cp route in seconds (C4 recording rule). Represents typical user-perceived latency. Sustained increases above baseline indicate a regression or upstream dependency slowdown.", | ||
| "gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "s", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"host-cp\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 4, | ||
| "type": "timeseries", | ||
| "title": "Latency p95 by route", | ||
| "description": "95th-percentile request duration per host-cp route in seconds (C4 recording rule). Captures the tail latency experienced by the slowest 5% of requests. The primary SLI for detecting latency regressions before they affect most users.", | ||
| "gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "s", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"host-cp\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 5, | ||
| "type": "timeseries", | ||
| "title": "Latency p99 by route", | ||
| "description": "99th-percentile request duration per host-cp route in seconds (C4 recording rule). Worst-case latency tail. High p99 with stable p50/p95 often indicates a specific slow code path or resource contention under load.", | ||
| "gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "s", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"host-cp\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 6, | ||
| "type": "stat", | ||
| "title": "Error ratio (5xx / total) by route", | ||
| "description": "Fraction of requests returning 5xx per host-cp route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. A route showing red means roughly 1-in-20 (or more) requests are failing — investigate immediately.", | ||
| "gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "percentunit", | ||
| "thresholds": { | ||
| "mode": "absolute", | ||
| "steps": [ | ||
| { "color": "green", "value": null }, | ||
| { "color": "yellow", "value": 0.01 }, | ||
| { "color": "red", "value": 0.05 } | ||
| ] | ||
| }, | ||
| "color": { "mode": "thresholds" } | ||
| } | ||
| }, | ||
| "options": { | ||
| "reduceOptions": { "calcs": ["lastNotNull"] }, | ||
| "orientation": "auto", | ||
| "textMode": "auto", | ||
| "colorMode": "background", | ||
| "graphMode": "none", | ||
| "justifyMode": "center" | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_errors:ratio_by_service_route{service=\"host-cp\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": true, | ||
| "range": false | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| } | ||
| kg-service.json: | | ||
| { | ||
| "uid": "kg-service", | ||
| "title": "KG-Service — Service Drill-in", | ||
| "description": "Per-route SLIs for kg-service. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. kg-service exposes 4 routes: /health, /classify, /build, /status. Use the route dropdown to scope a single route. The world_id variable is forwarded from olam-home for context.", | ||
| "tags": ["olam", "drill-in", "phase-c", "kg-service"], | ||
| "timezone": "browser", | ||
| "refresh": "30s", | ||
| "schemaVersion": 39, | ||
| "version": 1, | ||
| "time": { | ||
| "from": "now-1h", | ||
| "to": "now" | ||
| }, | ||
| "timepicker": {}, | ||
| "templating": { | ||
| "list": [ | ||
| { | ||
| "name": "world_id", | ||
| "label": "World", | ||
| "type": "query", | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "query": { | ||
| "qryType": 2, | ||
| "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))", | ||
| "step": "" | ||
| }, | ||
| "refresh": 2, | ||
| "sort": 1, | ||
| "multi": false, | ||
| "includeAll": true, | ||
| "allValue": ".+", | ||
| "current": { "selected": false, "text": "All", "value": "$__all" } | ||
| }, | ||
| { | ||
| "name": "route", | ||
| "label": "Route", | ||
| "type": "query", | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"kg-service\"}, route)", | ||
| "refresh": 2, | ||
| "sort": 1, | ||
| "multi": true, | ||
| "includeAll": true, | ||
| "allValue": ".+", | ||
| "current": { "selected": false, "text": "All", "value": "$__all" } | ||
| } | ||
| ] | ||
| }, | ||
| "annotations": { | ||
| "list": [] | ||
| }, | ||
| "panels": [ | ||
| { | ||
| "id": 1, | ||
| "type": "timeseries", | ||
| "title": "Request rate by route", | ||
| "description": "Requests per second for each kg-service route over the last 5 minutes (pre-computed by C4 recording rule). /classify is the hot path; /build is infrequent; /health should be near-constant. A drop in /classify with stable /health suggests the classifier is being bypassed or the caller is down.", | ||
| "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_requests:rate5m_by_service_route{service=\"kg-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 2, | ||
| "type": "timeseries", | ||
| "title": "5xx error rate by route", | ||
| "description": "5xx responses per second per kg-service route (C4 recording rule). Errors on /classify indicate the graph classifier is failing; errors on /build indicate a KG rebuild failure. Either warrants immediate investigation as they affect agent search quality.", | ||
| "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_errors:rate5m_by_service_route{service=\"kg-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 3, | ||
| "type": "timeseries", | ||
| "title": "Latency p50 by route", | ||
| "description": "Median (p50) request duration per kg-service route in seconds (C4 recording rule). /classify latency drives agent dispatch latency directly; a rising p50 on /classify means agents wait longer for graph routing decisions.", | ||
| "gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "s", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"kg-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 4, | ||
| "type": "timeseries", | ||
| "title": "Latency p95 by route", | ||
| "description": "95th-percentile request duration per kg-service route in seconds (C4 recording rule). kg-service is a synchronous dependency for in-world search; a high p95 on /classify directly contributes to the >6min diagnosis-time problem this observability stack is solving.", | ||
| "gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "s", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"kg-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 5, | ||
| "type": "timeseries", | ||
| "title": "Latency p99 by route", | ||
| "description": "99th-percentile request duration per kg-service route in seconds (C4 recording rule). Worst-case latency tail. A high p99 on /build (graph rebuild) with stable /classify p99 is expected; the inverse (stable /build, high /classify p99) indicates classifier graph complexity growth.", | ||
| "gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "s", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"kg-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 6, | ||
| "type": "stat", | ||
| "title": "Error ratio (5xx / total) by route", | ||
| "description": "Fraction of requests returning 5xx per kg-service route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. kg-service is fail-open for /classify (returns empty result on error); a high error ratio here means callers are silently getting degraded graph routing with no local error signal.", | ||
| "gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "percentunit", | ||
| "thresholds": { | ||
| "mode": "absolute", | ||
| "steps": [ | ||
| { "color": "green", "value": null }, | ||
| { "color": "yellow", "value": 0.01 }, | ||
| { "color": "red", "value": 0.05 } | ||
| ] | ||
| }, | ||
| "color": { "mode": "thresholds" } | ||
| } | ||
| }, | ||
| "options": { | ||
| "reduceOptions": { "calcs": ["lastNotNull"] }, | ||
| "orientation": "auto", | ||
| "textMode": "auto", | ||
| "colorMode": "background", | ||
| "graphMode": "none", | ||
| "justifyMode": "center" | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_errors:ratio_by_service_route{service=\"kg-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": true, | ||
| "range": false | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| } | ||
| memory-service.json: | | ||
| { | ||
| "uid": "memory-service", | ||
| "title": "Memory-Service — Service Drill-in", | ||
| "description": "Per-route SLIs for memory-service. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. memory-service's traffic flows through the in-container Node front-door (packages/memory-service/src/metrics-proxy.mjs) which short-circuits /metrics and instruments every agentmemory engine route ({service,route,method,status_code} taxonomy). Use the route dropdown to scope a single agentmemory endpoint. The world_id variable is forwarded from olam-home for context.", | ||
| "tags": ["olam", "drill-in", "phase-c", "memory-service"], | ||
| "timezone": "browser", | ||
| "refresh": "30s", | ||
| "schemaVersion": 39, | ||
| "version": 1, | ||
| "time": { | ||
| "from": "now-1h", | ||
| "to": "now" | ||
| }, | ||
| "timepicker": {}, | ||
| "templating": { | ||
| "list": [ | ||
| { | ||
| "name": "world_id", | ||
| "label": "World", | ||
| "type": "query", | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "query": { | ||
| "qryType": 2, | ||
| "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))", | ||
| "step": "" | ||
| }, | ||
| "refresh": 2, | ||
| "sort": 1, | ||
| "multi": false, | ||
| "includeAll": true, | ||
| "allValue": ".+", | ||
| "current": { "selected": false, "text": "All", "value": "$__all" } | ||
| }, | ||
| { | ||
| "name": "route", | ||
| "label": "Route", | ||
| "type": "query", | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"memory-service\"}, route)", | ||
| "refresh": 2, | ||
| "sort": 1, | ||
| "multi": true, | ||
| "includeAll": true, | ||
| "allValue": ".+", | ||
| "current": { "selected": false, "text": "All", "value": "$__all" } | ||
| } | ||
| ] | ||
| }, | ||
| "annotations": { | ||
| "list": [] | ||
| }, | ||
| "panels": [ | ||
| { | ||
| "id": 1, | ||
| "type": "timeseries", | ||
| "title": "Request rate by route", | ||
| "description": "Requests per second for each memory-service route over the last 5 minutes (pre-computed by C4 recording rule). /agentmemory/mcp/call is the hot path that agents drive — every memory_save / memory_recall lands there. /agentmemory/livez is the readiness probe (near-constant ~0.2 rps from k8s). /agentmemory/export is bridge-debounced (~1 per ~10s burst). A drop in mcp/call with stable livez indicates the agentmemory engine is up but receiving no traffic — caller-side issue.", | ||
| "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_requests:rate5m_by_service_route{service=\"memory-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 2, | ||
| "type": "timeseries", | ||
| "title": "5xx error rate by route", | ||
| "description": "5xx responses per second per memory-service route (C4 recording rule). Errors on /agentmemory/mcp/call indicate the iii engine is rejecting MCP tool calls — typical causes are bearer-auth failures or the engine entering a degraded state. Errors on /agentmemory/import indicate restore failures; the bridge's snapshot will retry on the next mutator-write.", | ||
| "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_errors:rate5m_by_service_route{service=\"memory-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 3, | ||
| "type": "timeseries", | ||
| "title": "Latency p50 by route", | ||
| "description": "Median (p50) request duration per memory-service route in seconds (C4 recording rule). /agentmemory/mcp/call p50 is a direct driver of agent-memory recall+save latency in the agent loop. Sustained rise on mcp/call p50 points to engine index size growth or iii-config tuning regressions.", | ||
| "gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "s", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"memory-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 4, | ||
| "type": "timeseries", | ||
| "title": "Latency p95 by route", | ||
| "description": "95th-percentile request duration per memory-service route in seconds (C4 recording rule). memory-service is a synchronous dependency for agent recall paths — high p95 on /agentmemory/mcp/call directly contributes to the >6min diagnosis-time problem this observability stack is solving. /agentmemory/export p95 spikes are expected at snapshot boundaries but should fall back inside 1s.", | ||
| "gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "s", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"memory-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 5, | ||
| "type": "timeseries", | ||
| "title": "Latency p99 by route", | ||
| "description": "99th-percentile request duration per memory-service route in seconds (C4 recording rule). Worst-case tail. /agentmemory/import is intentionally heavy (~1s+ for a full corpus restore on cold-start) so a high p99 there with stable mcp/call p99 is expected. The inverse — stable import, rising mcp/call p99 — is the leading indicator for engine-side index degradation.", | ||
| "gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "s", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"memory-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 6, | ||
| "type": "stat", | ||
| "title": "Error ratio (5xx / total) by route", | ||
| "description": "Fraction of requests returning 5xx per memory-service route (C4 recording rule). Green < 1%; yellow 1-5%; red >= 5%. /agentmemory/mcp/call errors silently degrade agent memory recall quality (callers fall through to no-context paths). /agentmemory/livez errors here indicate the proxy is healthy but the engine is unreachable — check container logs.", | ||
| "gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 }, | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "percentunit", | ||
| "thresholds": { | ||
| "mode": "absolute", | ||
| "steps": [ | ||
| { "color": "green", "value": null }, | ||
| { "color": "yellow", "value": 0.01 }, | ||
| { "color": "red", "value": 0.05 } | ||
| ] | ||
| }, | ||
| "color": { "mode": "thresholds" } | ||
| } | ||
| }, | ||
| "options": { | ||
| "reduceOptions": { "calcs": ["lastNotNull"] }, | ||
| "orientation": "auto", | ||
| "textMode": "auto", | ||
| "colorMode": "background", | ||
| "graphMode": "none", | ||
| "justifyMode": "center" | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "prometheus", "uid": "prometheus" }, | ||
| "expr": "olam:http_errors:ratio_by_service_route{service=\"memory-service\",route=~\"$route\"}", | ||
| "legendFormat": "{{route}}", | ||
| "instant": true, | ||
| "range": false | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| } | ||
| olam-home.json: | | ||
| { | ||
| "uid": "olam-home", | ||
| "title": "Olam Home", | ||
| "description": "Operator's at-a-glance view. Top row: are the 5 olam peripheral services up? Middle row: how loaded are they? Bottom row: which worlds are doing dispatch work right now? Use the world_id dropdown to scope the bottom row (and host-cp/world-cp middle panels) to a specific world. Pinned 3-row IA per Phase B acceptance criteria #8. Click the host-cp, kg-service, or memory-service health panel to drill into the per-service dashboard.", | ||
| "tags": ["olam", "home", "phase-b"], | ||
| "timezone": "browser", | ||
| "refresh": "30s", | ||
| "schemaVersion": 39, | ||
| "version": 2, | ||
| "time": { | ||
| "from": "now-1h", | ||
| "to": "now" | ||
| }, | ||
| "timepicker": {}, | ||
| "templating": { | ||
| "list": [ | ||
| { | ||
| "name": "world_id", | ||
| "label": "World", | ||
| "type": "query", | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "query": { | ||
| "qryType": 2, | ||
| "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))", | ||
| "step": "" | ||
| }, | ||
| "refresh": 2, | ||
| "sort": 1, | ||
| "multi": false, | ||
| "includeAll": true, | ||
| "allValue": ".+", | ||
| "current": { "selected": false, "text": "All", "value": "$__all" } | ||
| } | ||
| ] | ||
| }, | ||
| "annotations": { | ||
| "list": [] | ||
| }, | ||
| "panels": [ | ||
| { | ||
| "id": 1, | ||
| "type": "stat", | ||
| "title": "host-cp", | ||
| "description": "Green if host-cp logged at least 1 line in the last 60s; red = silent / crashed.", | ||
| "gridPos": { "x": 0, "y": 0, "w": 5, "h": 4 }, | ||
| "links": [ | ||
| { | ||
| "title": "Drill into host-cp", | ||
| "url": "/d/host-cp/host-cp-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}", | ||
| "targetBlank": false | ||
| } | ||
| ], | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "thresholds": { | ||
| "mode": "absolute", | ||
| "steps": [ | ||
| { "color": "red", "value": null }, | ||
| { "color": "green", "value": 1 } | ||
| ] | ||
| }, | ||
| "mappings": [ | ||
| { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } }, | ||
| { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } } | ||
| ], | ||
| "unit": "short", | ||
| "color": { "mode": "thresholds" } | ||
| } | ||
| }, | ||
| "options": { | ||
| "reduceOptions": { "calcs": ["lastNotNull"] }, | ||
| "orientation": "auto", | ||
| "textMode": "auto", | ||
| "colorMode": "background", | ||
| "graphMode": "none", | ||
| "justifyMode": "center" | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "sum(count_over_time({service=\"host-cp\"}[1m]))", | ||
| "legendFormat": "host-cp", | ||
| "instant": true, | ||
| "range": false | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 2, | ||
| "type": "stat", | ||
| "title": "kg-service", | ||
| "description": "Green if kg-service logged at least 1 line in the last 60s; red = silent / crashed.", | ||
| "gridPos": { "x": 5, "y": 0, "w": 5, "h": 4 }, | ||
| "links": [ | ||
| { | ||
| "title": "Drill into kg-service", | ||
| "url": "/d/kg-service/kg-service-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}", | ||
| "targetBlank": false | ||
| } | ||
| ], | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "thresholds": { | ||
| "mode": "absolute", | ||
| "steps": [ | ||
| { "color": "red", "value": null }, | ||
| { "color": "green", "value": 1 } | ||
| ] | ||
| }, | ||
| "mappings": [ | ||
| { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } }, | ||
| { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } } | ||
| ], | ||
| "unit": "short", | ||
| "color": { "mode": "thresholds" } | ||
| } | ||
| }, | ||
| "options": { | ||
| "reduceOptions": { "calcs": ["lastNotNull"] }, | ||
| "orientation": "auto", | ||
| "textMode": "auto", | ||
| "colorMode": "background", | ||
| "graphMode": "none", | ||
| "justifyMode": "center" | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "sum(count_over_time({service=\"kg-service\"}[1m]))", | ||
| "legendFormat": "kg-service", | ||
| "instant": true, | ||
| "range": false | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 3, | ||
| "type": "stat", | ||
| "title": "agent-memory", | ||
| "description": "Green if agent-memory logged at least 1 line in the last 60s; red = silent / crashed.", | ||
| "gridPos": { "x": 10, "y": 0, "w": 4, "h": 4 }, | ||
| "links": [ | ||
| { | ||
| "title": "Drill into memory-service", | ||
| "url": "/d/memory-service/memory-service-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}", | ||
| "targetBlank": false | ||
| } | ||
| ], | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "thresholds": { | ||
| "mode": "absolute", | ||
| "steps": [ | ||
| { "color": "red", "value": null }, | ||
| { "color": "green", "value": 1 } | ||
| ] | ||
| }, | ||
| "mappings": [ | ||
| { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } }, | ||
| { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } } | ||
| ], | ||
| "unit": "short", | ||
| "color": { "mode": "thresholds" } | ||
| } | ||
| }, | ||
| "options": { | ||
| "reduceOptions": { "calcs": ["lastNotNull"] }, | ||
| "orientation": "auto", | ||
| "textMode": "auto", | ||
| "colorMode": "background", | ||
| "graphMode": "none", | ||
| "justifyMode": "center" | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "sum(count_over_time({service=\"agent-memory\"}[1m]))", | ||
| "legendFormat": "agent-memory", | ||
| "instant": true, | ||
| "range": false | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 4, | ||
| "type": "stat", | ||
| "title": "traefik", | ||
| "description": "Green if traefik logged at least 1 line in the last 60s; red = silent / crashed.", | ||
| "gridPos": { "x": 14, "y": 0, "w": 5, "h": 4 }, | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "thresholds": { | ||
| "mode": "absolute", | ||
| "steps": [ | ||
| { "color": "red", "value": null }, | ||
| { "color": "green", "value": 1 } | ||
| ] | ||
| }, | ||
| "mappings": [ | ||
| { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } }, | ||
| { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } } | ||
| ], | ||
| "unit": "short", | ||
| "color": { "mode": "thresholds" } | ||
| } | ||
| }, | ||
| "options": { | ||
| "reduceOptions": { "calcs": ["lastNotNull"] }, | ||
| "orientation": "auto", | ||
| "textMode": "auto", | ||
| "colorMode": "background", | ||
| "graphMode": "none", | ||
| "justifyMode": "center" | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "sum(count_over_time({service=\"traefik\"}[1m]))", | ||
| "legendFormat": "traefik", | ||
| "instant": true, | ||
| "range": false | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 5, | ||
| "type": "stat", | ||
| "title": "world-cp", | ||
| "description": "Green if any world-cp instance logged at least 1 line in the last 60s. Aggregated across world_id labels per Promtail drop-rules.", | ||
| "gridPos": { "x": 19, "y": 0, "w": 5, "h": 4 }, | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "thresholds": { | ||
| "mode": "absolute", | ||
| "steps": [ | ||
| { "color": "red", "value": null }, | ||
| { "color": "green", "value": 1 } | ||
| ] | ||
| }, | ||
| "mappings": [ | ||
| { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } }, | ||
| { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } } | ||
| ], | ||
| "unit": "short", | ||
| "color": { "mode": "thresholds" } | ||
| } | ||
| }, | ||
| "options": { | ||
| "reduceOptions": { "calcs": ["lastNotNull"] }, | ||
| "orientation": "auto", | ||
| "textMode": "auto", | ||
| "colorMode": "background", | ||
| "graphMode": "none", | ||
| "justifyMode": "center" | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "sum(count_over_time({service=\"world-cp\"}[1m]))", | ||
| "legendFormat": "world-cp", | ||
| "instant": true, | ||
| "range": false | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 6, | ||
| "type": "timeseries", | ||
| "title": "Aggregate success rate", | ||
| "description": "Total 2xx/3xx log lines per second across all services. Proxy for overall throughput.", | ||
| "gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "none" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "sum(rate({job=~\".+\"} |~ \"(?:200|201|204|301|302)\" [1m]))", | ||
| "legendFormat": "2xx/3xx rate", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 7, | ||
| "type": "timeseries", | ||
| "title": "Aggregate error rate", | ||
| "description": "Total error/panic/fatal log lines per second across all services. Spikes indicate incidents.", | ||
| "gridPos": { "x": 12, "y": 4, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { | ||
| "mode": "fixed", | ||
| "fixedColor": "red" | ||
| }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "none" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "sum(rate({job=~\".+\"} |~ \"(?i)error|panic|fatal\" [1m]))", | ||
| "legendFormat": "error/panic/fatal rate", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 8, | ||
| "type": "timeseries", | ||
| "title": "World-dispatch activity (top 10 worlds)", | ||
| "description": "Dispatch log lines per 5m per world, filtered by the world_id dropdown. world_id is a JSON field (not a Loki label); extracted via json parser. Select 'All' to see all worlds; select a specific world_id to drill down.", | ||
| "gridPos": { "x": 0, "y": 12, "w": 24, "h": 8 }, | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "short", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "topk(10, sum by (world_id) (\n count_over_time(\n {service=\"host-cp\"}\n |~ \"dispatch\"\n | json\n | __error__ = \"\"\n | world_id =~ \"${world_id}\"\n [5m]\n )\n))", | ||
| "legendFormat": "world {{world_id}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| } | ||
| request-rate.json: | | ||
| { | ||
| "uid": "request-rate", | ||
| "title": "Request Rate / Error Rate (Log-Derived)", | ||
| "description": "Per-service request rate + error rate derived from Loki logs. Phase B-only — kube-prometheus-stack will replace these with native HTTP metrics in Phase C.", | ||
| "tags": ["olam", "rate", "phase-b"], | ||
| "timezone": "browser", | ||
| "refresh": "30s", | ||
| "schemaVersion": 39, | ||
| "version": 1, | ||
| "time": { | ||
| "from": "now-1h", | ||
| "to": "now" | ||
| }, | ||
| "timepicker": {}, | ||
| "templating": { | ||
| "list": [ | ||
| { | ||
| "name": "world_id", | ||
| "label": "World", | ||
| "type": "query", | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "query": { | ||
| "qryType": 2, | ||
| "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))", | ||
| "step": "" | ||
| }, | ||
| "refresh": 2, | ||
| "sort": 1, | ||
| "multi": false, | ||
| "includeAll": true, | ||
| "allValue": ".+", | ||
| "current": { "selected": false, "text": "All", "value": "$__all" } | ||
| }, | ||
| { | ||
| "name": "service", | ||
| "label": "Service", | ||
| "type": "query", | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "query": { "qryType": 1, "label": "service", "stream": "{job=~\".+\"}" }, | ||
| "refresh": 2, | ||
| "sort": 1, | ||
| "multi": true, | ||
| "includeAll": true, | ||
| "allValue": ".+", | ||
| "current": { "selected": false, "text": "All", "value": "$__all" } | ||
| } | ||
| ] | ||
| }, | ||
| "annotations": { | ||
| "list": [] | ||
| }, | ||
| "panels": [ | ||
| { | ||
| "id": 1, | ||
| "type": "timeseries", | ||
| "title": "Request rate by service", | ||
| "description": "Log line rate per second per service. Uses log volume as a proxy for request rate — appropriate for Phase B before Prometheus HTTP metrics land in Phase C.", | ||
| "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "sum by (service) (rate({service=~\"${service:regex}\"}[1m]))", | ||
| "legendFormat": "{{service}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 2, | ||
| "type": "timeseries", | ||
| "title": "Error rate by service", | ||
| "description": "Log lines matching error|panic|fatal per second per service. Spikes here warrant drill-down in the Ad-hoc LogQL panel below.", | ||
| "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "reqps", | ||
| "color": { "mode": "palette-classic" }, | ||
| "custom": { | ||
| "lineWidth": 2, | ||
| "fillOpacity": 10, | ||
| "showPoints": "never" | ||
| } | ||
| } | ||
| }, | ||
| "options": { | ||
| "tooltip": { "mode": "multi", "sort": "desc" }, | ||
| "legend": { "displayMode": "list", "placement": "bottom" } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "sum by (service) (rate({service=~\"${service:regex}\"} |~ \"(?i)error|panic|fatal\" [1m]))", | ||
| "legendFormat": "{{service}}", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 3, | ||
| "type": "table", | ||
| "title": "Top-5 endpoints (last 5m)", | ||
| "description": "Top 5 request paths by volume, derived from Traefik JSON access logs. Only Traefik has access-log-grade request_path (per B1 Promtail JSON stage); other services don't extract this field.", | ||
| "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 }, | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": { | ||
| "unit": "short", | ||
| "color": { "mode": "palette-classic" } | ||
| }, | ||
| "overrides": [ | ||
| { | ||
| "matcher": { "id": "byName", "options": "Value" }, | ||
| "properties": [ | ||
| { "id": "displayName", "value": "requests" } | ||
| ] | ||
| } | ||
| ] | ||
| }, | ||
| "options": { | ||
| "showHeader": true, | ||
| "footer": { "show": false } | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "topk(5, sum by (request_path) (count_over_time({service=\"traefik\"} | json | __error__ = \"\" | request_path != \"\" [5m])))", | ||
| "legendFormat": "", | ||
| "instant": true, | ||
| "range": false | ||
| } | ||
| ], | ||
| "transformations": [ | ||
| { "id": "reduce", "options": { "reducers": ["sum"] } } | ||
| ] | ||
| }, | ||
| { | ||
| "id": 4, | ||
| "type": "logs", | ||
| "title": "Ad-hoc LogQL (edit me)", | ||
| "description": "Operator escape hatch. Edit the query inline; use LogQL syntax. world_id filter via JSON pipeline because Loki doesn't promote world_id as a stream label.", | ||
| "gridPos": { "x": 0, "y": 16, "w": 24, "h": 10 }, | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "fieldConfig": { | ||
| "defaults": {}, | ||
| "overrides": [] | ||
| }, | ||
| "options": { | ||
| "showTime": true, | ||
| "wrapLogMessage": false, | ||
| "dedupStrategy": "exact", | ||
| "showLabels": false, | ||
| "showCommonLabels": false, | ||
| "sortOrder": "Descending", | ||
| "prettifyLogMessage": false, | ||
| "enableLogDetails": true | ||
| }, | ||
| "targets": [ | ||
| { | ||
| "datasource": { "type": "loki", "uid": "loki" }, | ||
| "expr": "{service=~\"${service:regex}\"} | json | __error__ = \"\" | world_id =~ \"${world_id}\"", | ||
| "legendFormat": "", | ||
| "instant": false, | ||
| "range": true | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| } | ||
| # 90-prom-alert-cardinality.yaml — Phase C Task C2 cardinality alert rule. | ||
| # | ||
| # PrometheusRule CR: fires OlamActiveSeriesHigh when prometheus_tsdb_head_series | ||
| # exceeds 80k (80% of the 100k active-series cap defined by P4). | ||
| # | ||
| # ruleSelector match: the Prometheus CR rendered by kube-prom-stack 85.2.0 uses | ||
| # ruleSelector: matchLabels: release: "olam-prom" | ||
| # (verified via `helm template ... | grep -A3 ruleSelector`). | ||
| # The label below MUST match or this rule is silently ignored by Prometheus. | ||
| # | ||
| # Alertmanager: enabled in kube-prom-stack-values.yaml from C2 onwards. | ||
| # Receivers: not yet configured (C2 scope = rule landing; receiver config is C4+). | ||
| # Alertmanager will route the alert to its default null receiver until receivers | ||
| # are wired — this is intentional. The alert is visible in the Prometheus UI | ||
| # at /alerts regardless of receiver config. | ||
| # | ||
| # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C2 | ||
| # T1 (cardinality bomb) + P4 (<100k active series) | ||
| --- | ||
| apiVersion: monitoring.coreos.com/v1 | ||
| kind: PrometheusRule | ||
| metadata: | ||
| name: olam-cardinality | ||
| namespace: monitoring | ||
| labels: | ||
| app.kubernetes.io/name: olam-prometheus-rules | ||
| app.kubernetes.io/managed-by: olam | ||
| # REQUIRED: matches Prometheus CR's ruleSelector (release: "olam-prom"). | ||
| # Verified via helm template output, 2026-05-21. | ||
| release: olam-prom | ||
| spec: | ||
| groups: | ||
| - name: olam-cardinality | ||
| interval: 30s | ||
| rules: | ||
| - alert: OlamActiveSeriesHigh | ||
| expr: | | ||
| prometheus_tsdb_head_series > 80000 | ||
| for: 5m | ||
| labels: | ||
| severity: warning | ||
| scope: cardinality | ||
| annotations: | ||
| summary: "Active series above 80k threshold (80% of 100k cap)" | ||
| description: | | ||
| prometheus_tsdb_head_series is {{ $value | humanize }} — within 20% | ||
| of the 100k cardinality budget (P4). Investigate which service is | ||
| emitting a new high-cardinality label, OR add a DROP rule to | ||
| kube-prom-stack-values.yaml metricRelabelings for that ServiceMonitor. | ||
| Runbook: docs/architecture/observability-cardinality.md (TBD — C4+) |
| # 91-servicemonitor-host-cp.yaml — Phase C Task C3 ServiceMonitor for host-cp. | ||
| # | ||
| # Registers host-cp's /metrics endpoint with Prometheus for scraping. | ||
| # | ||
| # NOTE: This manifest requires the ServiceMonitor CRD installed by | ||
| # kube-prometheus-stack (Phase C Task C1). It is SKIPPED by | ||
| # apply-manifests.sh (which targets the Phase A ingress harness) and is | ||
| # applied by the phase-c-e2e harness after kube-prom-stack ships CRDs. | ||
| # | ||
| # Namespace placement (CRITICAL — C2 dogfood lesson): | ||
| # ServiceMonitors MUST live in the `monitoring` namespace to be discovered | ||
| # by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor | ||
| # in any other namespace is silently ignored by default RBAC. | ||
| # | ||
| # Label compliance: | ||
| # `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector | ||
| # (verified via `helm template ... | grep -A3 serviceMonitorSelector`). | ||
| # | ||
| # Target selector: | ||
| # Matches the host-cp Service by its `app: olam-host-cp` label. Adjust if | ||
| # the Service label differs in the target cluster (check | ||
| # `kubectl get svc -n olam -l app=olam-host-cp`). | ||
| # | ||
| # metricRelabelings (layer-2 cardinality enforcement): | ||
| # Mirrors the `*cardinality-labeldrop` YAML anchor from | ||
| # kube-prom-stack-values.yaml. host-cp's /metrics is taxonomy-compliant | ||
| # (only {service,route,method,status_code} labels), but the labeldrop rule | ||
| # is present as defense-in-depth: if a future code change accidentally | ||
| # emits a banned label (world_id etc.), this ServiceMonitor drops it before | ||
| # ingest so the cardinality cap is never breached. | ||
| # | ||
| # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3 | ||
| # T1 (cardinality bomb) + P4 (<100k active series) | ||
| --- | ||
| apiVersion: monitoring.coreos.com/v1 | ||
| kind: ServiceMonitor | ||
| metadata: | ||
| name: olam-host-cp | ||
| namespace: monitoring | ||
| labels: | ||
| app.kubernetes.io/name: olam-host-cp-monitor | ||
| app.kubernetes.io/managed-by: olam | ||
| # REQUIRED: matches Prometheus CR's serviceMonitorSelector. | ||
| release: olam-prom | ||
| spec: | ||
| # Discover the host-cp Service in the olam namespace. | ||
| namespaceSelector: | ||
| matchNames: | ||
| - olam | ||
| selector: | ||
| matchLabels: | ||
| app: olam-host-cp | ||
| endpoints: | ||
| - port: http | ||
| path: /metrics | ||
| interval: 15s | ||
| # Preserve the application-emitted `service` label. Without honorLabels, | ||
| # Prometheus's target-label injection (where `service` = the k8s Service | ||
| # name `olam-host-cp`) overrides the application's own `service=host-cp` | ||
| # value, moving the app's value into `exported_service`. The C5 drill-in | ||
| # dashboards filter on `service=host-cp`, so without honorLabels their | ||
| # panels show empty data. Surfaced during 2026-05-21 operator dogfood — | ||
| # see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3. | ||
| honorLabels: true | ||
| # Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop | ||
| # in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels | ||
| # even if the service accidentally emits them. | ||
| metricRelabelings: | ||
| - action: labeldrop | ||
| regex: 'world_id|trace_id|user_id|request_id|operator_id' |
| # 92-servicemonitor-kg-service.yaml — Phase C Task C3 ServiceMonitor for kg-service. | ||
| # | ||
| # Registers kg-service's /metrics endpoint with Prometheus for scraping. | ||
| # | ||
| # NOTE: This manifest requires the ServiceMonitor CRD installed by | ||
| # kube-prometheus-stack (Phase C Task C1). It is SKIPPED by | ||
| # apply-manifests.sh (which targets the Phase A ingress harness) and is | ||
| # applied by the phase-c-e2e harness after kube-prom-stack ships CRDs. | ||
| # | ||
| # Namespace placement (CRITICAL — C2 dogfood lesson): | ||
| # ServiceMonitors MUST live in the `monitoring` namespace to be discovered | ||
| # by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor | ||
| # in any other namespace is silently ignored by default RBAC. | ||
| # | ||
| # Label compliance: | ||
| # `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector | ||
| # (verified via `helm template ... | grep -A3 serviceMonitorSelector`). | ||
| # | ||
| # Target selector: | ||
| # Matches the kg-service Service by its `app: olam-kg-service` label. Adjust | ||
| # if the Service label differs in the target cluster (check | ||
| # `kubectl get svc -n olam -l app=olam-kg-service`). | ||
| # | ||
| # metricRelabelings (layer-2 cardinality enforcement): | ||
| # Mirrors the `*cardinality-labeldrop` YAML anchor from | ||
| # kube-prom-stack-values.yaml. kg-service's /metrics is taxonomy-compliant | ||
| # (only {service,route,method,status_code} labels), but the labeldrop rule | ||
| # is present as defense-in-depth: if a future code change accidentally | ||
| # emits a banned label (world_id etc.), this ServiceMonitor drops it before | ||
| # ingest so the cardinality cap is never breached. | ||
| # | ||
| # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3 | ||
| # T1 (cardinality bomb) + P4 (<100k active series) | ||
| --- | ||
| apiVersion: monitoring.coreos.com/v1 | ||
| kind: ServiceMonitor | ||
| metadata: | ||
| name: olam-kg-service | ||
| namespace: monitoring | ||
| labels: | ||
| app.kubernetes.io/name: olam-kg-service-monitor | ||
| app.kubernetes.io/managed-by: olam | ||
| # REQUIRED: matches Prometheus CR's serviceMonitorSelector. | ||
| release: olam-prom | ||
| spec: | ||
| # Discover the kg-service Service in the olam namespace. | ||
| namespaceSelector: | ||
| matchNames: | ||
| - olam | ||
| selector: | ||
| matchLabels: | ||
| app: olam-kg-service | ||
| endpoints: | ||
| - port: http | ||
| path: /metrics | ||
| interval: 15s | ||
| # Preserve the application-emitted `service` label. Without honorLabels, | ||
| # Prometheus's target-label injection (where `service` = the k8s Service | ||
| # name `olam-kg-service`) overrides the application's own `service=kg-service` | ||
| # value, moving the app's value into `exported_service`. The C5 drill-in | ||
| # dashboards filter on `service=kg-service`, so without honorLabels their | ||
| # panels show empty data. Surfaced during 2026-05-21 operator dogfood — | ||
| # see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3. | ||
| honorLabels: true | ||
| # Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop | ||
| # in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels | ||
| # even if the service accidentally emits them. | ||
| metricRelabelings: | ||
| - action: labeldrop | ||
| regex: 'world_id|trace_id|user_id|request_id|operator_id' |
| # 93-servicemonitor-memory-service.yaml — Phase C Task C3 closure ServiceMonitor. | ||
| # | ||
| # Registers memory-service's /metrics endpoint with Prometheus for scraping. | ||
| # C3 originally shipped instrumentation for host-cp + kg-service (PR #787) but | ||
| # DEFERRED memory-service because the third-party `agentmemory` Node CLI that | ||
| # runs in k3s exposes no /metrics endpoint. This PR closes that deferral by | ||
| # shipping a small Node HTTP front-door (packages/memory-service/src/metrics-proxy.mjs) | ||
| # inside the container image: external traffic hits the proxy on :3111, the | ||
| # proxy short-circuits /metrics + forwards everything else to agentmemory on | ||
| # loopback :3110. End-state matches the host-cp/kg-service shape so the ServiceMonitor | ||
| # pattern below is a near-clone of 91-servicemonitor-host-cp.yaml. | ||
| # | ||
| # NOTE: This manifest requires the ServiceMonitor CRD installed by | ||
| # kube-prometheus-stack (Phase C Task C1). It is SKIPPED by | ||
| # apply-manifests.sh (which targets the Phase A ingress harness) and is | ||
| # applied by the phase-c-e2e harness after kube-prom-stack ships CRDs. | ||
| # | ||
| # Namespace placement (CRITICAL — C2 dogfood lesson): | ||
| # ServiceMonitors MUST live in the `monitoring` namespace to be discovered | ||
| # by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor | ||
| # in any other namespace is silently ignored by default RBAC. | ||
| # | ||
| # Label compliance: | ||
| # `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector. | ||
| # | ||
| # Target selector: | ||
| # Matches the memory-service Service by its `app: olam-memory-service` label. | ||
| # The Service is defined in packages/host-cp/k8s/manifests/memory-service/60-service.yaml | ||
| # (port `http` -> targetPort 3111). The 50-traefik-ingressroute-agent-memory.yaml | ||
| # IngressRoute references the same Service for /api/agent-memory/* traffic. | ||
| # | ||
| # Image rollout dependency: | ||
| # The proxy lives inside the container image. Until the next release pipeline | ||
| # refreshes ghcr.io/pleri/olam-memory-service with the post-C3-closure | ||
| # Dockerfile (npm run refresh:manifest-digests), this ServiceMonitor will scrape | ||
| # a target that responds 404 to /metrics. Prometheus tolerates that: the scrape | ||
| # is recorded as failed (target shown DOWN, up=0, no samples ingested) and is | ||
| # retried every interval. When the new image lands, scraping | ||
| # | ||
| # metricRelabelings (layer-2 cardinality enforcement): | ||
| # Mirrors the `*cardinality-labeldrop` YAML anchor from | ||
| # kube-prom-stack-values.yaml. memory-service's /metrics is taxonomy-compliant | ||
| # (only {service,route,method,status_code} labels), but the labeldrop rule | ||
| # is present as defense-in-depth: if a future code change accidentally | ||
| # emits a banned label (world_id etc.), this ServiceMonitor drops it before | ||
| # ingest so the cardinality cap is never breached. | ||
| # | ||
| # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3 | ||
| # T1 (cardinality bomb) + P4 (<100k active series). | ||
| --- | ||
| apiVersion: monitoring.coreos.com/v1 | ||
| kind: ServiceMonitor | ||
| metadata: | ||
| name: olam-memory-service | ||
| namespace: monitoring | ||
| labels: | ||
| app.kubernetes.io/name: olam-memory-service-monitor | ||
| app.kubernetes.io/managed-by: olam | ||
| # REQUIRED: matches Prometheus CR's serviceMonitorSelector. | ||
| release: olam-prom | ||
| spec: | ||
| # Discover the memory-service Service in the olam namespace. | ||
| namespaceSelector: | ||
| matchNames: | ||
| - olam | ||
| selector: | ||
| matchLabels: | ||
| app: olam-memory-service | ||
| endpoints: | ||
| - port: http | ||
| path: /metrics | ||
| interval: 15s | ||
| # Preserve the application-emitted `service` label. Without honorLabels, | ||
| # Prometheus's target-label injection (where `service` = the k8s Service | ||
| # name `olam-memory-service`) overrides the application's own | ||
| # `service=memory-service` value, moving the app's value into | ||
| # `exported_service`. The C5 drill-in dashboards filter on | ||
| # `service=memory-service`, so without honorLabels their panels show | ||
| # empty data. Same lesson as the host-cp/kg-service ServiceMonitors — | ||
| # see docs/incidents/2026-05-21-phase-c-dogfood.md finding #3. | ||
| honorLabels: true | ||
| # Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop | ||
| # in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels | ||
| # even if the service accidentally emits them. | ||
| metricRelabelings: | ||
| - action: labeldrop | ||
| regex: 'world_id|trace_id|user_id|request_id|operator_id' |
| # 95-prom-recording-rules.yaml — Phase C Task C4 | ||
| # | ||
| # Naming convention: olam:<metric>:<aggregation> | ||
| # | ||
| # olam — project namespace prefix (all project recording rules share this) | ||
| # <metric> — the base Prometheus metric being aggregated (without _bucket/_total suffix | ||
| # when the aggregation already implies the source type) | ||
| # <aggregation> — describes what was computed + the grouping dimensions, e.g. | ||
| # p95_by_service_route, rate5m_by_service, ratio_by_service_route | ||
| # | ||
| # Modeled on the community convention from | ||
| # https://prometheus.io/docs/practices/rules/#naming — <level>:<metric>:<ops>. | ||
| # The <aggregation> suffix encodes BOTH the operation (p95, rate5m, ratio) and | ||
| # the grouping dimensions (_by_service, _by_service_route) so dashboard panels | ||
| # can select the pre-computed series without further aggregation. | ||
| # | ||
| # Source metrics (provided by C3 — host-cp + kg-service ServiceMonitors): | ||
| # http_request_duration_seconds_bucket{service, route, method, status_code, le} | ||
| # http_requests_total{service, route, method, status_code} | ||
| # | ||
| # rule group interval: 30s — twice the scrape interval (15s × 2). Balances | ||
| # freshness vs evaluation CPU; at 30s each window is re-evaluated twice per | ||
| # minute, keeping percentiles and rates responsive without hammering the TSDB. | ||
| # | ||
| # NOTE: recording rules intentionally reference NO banned labels | ||
| # (world_id, trace_id, user_id, request_id, operator_id). C2's labeldrop at | ||
| # scrape time strips them before ingest; even if a metric slipped through, | ||
| # referencing them here would suppress results. Defense-in-depth: don't type | ||
| # them at all. | ||
| # | ||
| # Applied by: scripts/e2e/prom-no-double-grafana.sh (C4 assertion block) | ||
| # Skipped by: scripts/test-ingress-integration/apply-manifests.sh | ||
| # (9[0-9]-prom-* glob) — requires kube-prom-stack CRDs to be present. | ||
| apiVersion: monitoring.coreos.com/v1 | ||
| kind: PrometheusRule | ||
| metadata: | ||
| name: olam-recording-rules | ||
| namespace: monitoring | ||
| labels: | ||
| app.kubernetes.io/name: olam-prometheus-rules | ||
| app.kubernetes.io/managed-by: olam | ||
| release: olam-prom # must match kube-prom-stack ruleSelector (verified C2) | ||
| spec: | ||
| groups: | ||
| - name: olam-http-aggregations | ||
| interval: 30s | ||
| rules: | ||
| # ============================================================ | ||
| # Latency percentiles per service+route — Phase C Task C4 | ||
| # Source: http_request_duration_seconds_bucket (C3) | ||
| # ============================================================ | ||
| - record: olam:http_request_duration_seconds:p50_by_service_route | ||
| expr: | | ||
| histogram_quantile(0.50, sum by (service, route, le) ( | ||
| rate(http_request_duration_seconds_bucket[5m]) | ||
| )) | ||
| - record: olam:http_request_duration_seconds:p95_by_service_route | ||
| expr: | | ||
| histogram_quantile(0.95, sum by (service, route, le) ( | ||
| rate(http_request_duration_seconds_bucket[5m]) | ||
| )) | ||
| - record: olam:http_request_duration_seconds:p99_by_service_route | ||
| expr: | | ||
| histogram_quantile(0.99, sum by (service, route, le) ( | ||
| rate(http_request_duration_seconds_bucket[5m]) | ||
| )) | ||
| # Aggregate p95 across all routes (per-service summary) | ||
| - record: olam:http_request_duration_seconds:p95_by_service | ||
| expr: | | ||
| histogram_quantile(0.95, sum by (service, le) ( | ||
| rate(http_request_duration_seconds_bucket[5m]) | ||
| )) | ||
| # ============================================================ | ||
| # Request rate per service+route | ||
| # Source: http_requests_total (C3) | ||
| # ============================================================ | ||
| - record: olam:http_requests:rate5m_by_service_route | ||
| expr: | | ||
| sum by (service, route) (rate(http_requests_total[5m])) | ||
| # Aggregate request rate per service | ||
| - record: olam:http_requests:rate5m_by_service | ||
| expr: | | ||
| sum by (service) (rate(http_requests_total[5m])) | ||
| # ============================================================ | ||
| # Error rate (status_code >= 500) per service+route | ||
| # 4xx are client errors and are intentionally excluded from | ||
| # the error ratio — only server-side failures count. | ||
| # ============================================================ | ||
| - record: olam:http_errors:rate5m_by_service_route | ||
| expr: | | ||
| sum by (service, route) ( | ||
| rate(http_requests_total{status_code=~"5.."}[5m]) | ||
| ) | ||
| # Error ratio (errors / total) per service+route. | ||
| # Returns NaN when total rate is 0 (no traffic) — dashboards | ||
| # should handle NaN as "no data" rather than "0% error rate". | ||
| - record: olam:http_errors:ratio_by_service_route | ||
| expr: | | ||
| sum by (service, route) (rate(http_requests_total{status_code=~"5.."}[5m])) | ||
| / | ||
| sum by (service, route) (rate(http_requests_total[5m])) |
| # 96-kyverno-cardinality-mutate.yaml — Phase C C8 follow-up. | ||
| # | ||
| # Closes codex's C2 concern: per-ServiceMonitor metricRelabelings is | ||
| # "policy by convention". A third-party ServiceMonitor or PodMonitor that | ||
| # olam doesn't author can bypass the labeldrop and reintroduce the | ||
| # cardinality bomb (T1). YAML anchors in kube-prom-stack-values.yaml keep | ||
| # Olam-owned manifests DRY but don't make the cluster safe. | ||
| # | ||
| # This ClusterPolicy mutates EVERY incoming ServiceMonitor and PodMonitor | ||
| # at admission time — regardless of who created it (chart, kubectl, operator, | ||
| # CI, GitOps) — to ensure the cardinality labeldrop rule is present on | ||
| # every endpoint. Once persisted, the prometheus-operator renders the | ||
| # relabel into Prometheus's scrape config. | ||
| # | ||
| # Why mutate-only (not validate): | ||
| # Validate would block a chart install or operator action mid-stride | ||
| # if a third-party ServiceMonitor lacks the rule. Mutate is the better | ||
| # posture: silently ensure the rule is present without breaking | ||
| # legitimate installs. Defense-in-depth still lives in TWO layers: | ||
| # (a) admission-time mutation (this policy) | ||
| # (b) per-ServiceMonitor metricRelabelings in | ||
| # kube-prom-stack-values.yaml + 9x-servicemonitor-*.yaml. | ||
| # | ||
| # Idempotency contract: | ||
| # Mutation must NOT add a duplicate labeldrop entry. Achieved by | ||
| # two-rule split per kind, each with a precondition that the labeldrop | ||
| # is currently ABSENT. Once present, neither rule fires: | ||
| # - Rule A (handle absent/empty case): preconditions: | ||
| # metricRelabelings is null/missing OR empty array. | ||
| # JSON patch: `add /spec/endpoints/{i}/metricRelabelings` with | ||
| # a single-element array containing our rule. | ||
| # - Rule B (handle existing-but-no-labeldrop case): preconditions: | ||
| # metricRelabelings is a non-empty array AND no entry has | ||
| # `action: labeldrop` with `regex` mentioning `world_id`. | ||
| # JSON patch: `add /spec/endpoints/{i}/metricRelabelings/-` | ||
| # appending our rule. | ||
| # | ||
| # Verified behavior (kyverno-cardinality-mutate.sh asserts): | ||
| # - Bare ServiceMonitor (no metricRelabelings) → Rule A injects | ||
| # - ServiceMonitor with metricRelabelings: [] → Rule A injects (replaces empty) | ||
| # - ServiceMonitor with unrelated metricRelabelings entries → Rule B appends | ||
| # - ServiceMonitor with matching labeldrop already present → NEITHER rule fires (idempotent) | ||
| # - Mixed: some endpoints lack it, others have it → only the lacking endpoints are mutated | ||
| # | ||
| # Background scan: OFF (background: false). Existing ServiceMonitors at | ||
| # install time are NOT auto-mutated. Re-apply them to trigger admission, | ||
| # or rely on the C2 per-ServiceMonitor metricRelabelings as the failsafe. | ||
| # | ||
| # failurePolicy: Ignore. Kyverno webhook timeout / pod outage MUST NOT | ||
| # block ServiceMonitor admission — the C2 layer-2 rules still protect | ||
| # Olam-owned monitors. Trade-off accepted: during Kyverno downtime, a | ||
| # brand-new third-party ServiceMonitor could land without the labeldrop. | ||
| # The 80k active-series PrometheusRule alert (Phase C C2, | ||
| # 90-prom-alert-cardinality.yaml) is the runtime detector that fires | ||
| # if this gap is exploited. | ||
| # | ||
| # Refs: | ||
| # - docs/plans/k3s-ingress-observability/phase-c-tasks.md — C8 | ||
| # - codex review on PR #783 ("policy by convention" finding) | ||
| # - https://kyverno.io/docs/writing-policies/mutate/ | ||
| # - https://kyverno.io/docs/writing-policies/mutate/#foreach | ||
| --- | ||
| apiVersion: kyverno.io/v1 | ||
| kind: ClusterPolicy | ||
| metadata: | ||
| name: enforce-cardinality-labeldrop | ||
| labels: | ||
| app.kubernetes.io/part-of: olam | ||
| olam.io/phase: c-followup | ||
| annotations: | ||
| policies.kyverno.io/title: "Cluster-wide cardinality labeldrop enforcement" | ||
| policies.kyverno.io/category: "Observability" | ||
| policies.kyverno.io/severity: high | ||
| policies.kyverno.io/subject: "ServiceMonitor, PodMonitor" | ||
| policies.kyverno.io/description: >- | ||
| Ensures every ServiceMonitor and PodMonitor carries a metricRelabelings | ||
| labeldrop rule for high-cardinality labels (world_id, trace_id, user_id, | ||
| request_id, operator_id) on every endpoint. Closes the "third-party chart | ||
| bypasses C2 labeldrop" gap surfaced during PR #783 review. | ||
| spec: | ||
| background: false | ||
| failurePolicy: Ignore | ||
| mutateExistingOnPolicyUpdate: false | ||
| rules: | ||
| # --------------------------------------------------------------------- | ||
| # ServiceMonitor — Rule A: metricRelabelings absent or empty | ||
| # --------------------------------------------------------------------- | ||
| - name: inject-labeldrop-sm-absent | ||
| match: | ||
| any: | ||
| - resources: | ||
| kinds: | ||
| - monitoring.coreos.com/v1/ServiceMonitor | ||
| mutate: | ||
| foreach: | ||
| - list: "request.object.spec.endpoints" | ||
| preconditions: | ||
| all: | ||
| # not_null() falls back to `[]` when the field is null/missing, so | ||
| # length() is 0 for an absent field as well as for an empty array. | ||
| - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}" | ||
| operator: Equals | ||
| value: 0 | ||
| patchesJson6902: |- | ||
| - op: add | ||
| path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings" | ||
| value: | ||
| - action: labeldrop | ||
| regex: "world_id|trace_id|user_id|request_id|operator_id" | ||
| # --------------------------------------------------------------------- | ||
| # ServiceMonitor — Rule B: metricRelabelings has entries, but no | ||
| # matching labeldrop for our banned-label regex. | ||
| # | ||
| # We test `contains(regex, 'world_id')` rather than equality so that | ||
| # operators who include additional banned labels in their own regex | ||
| # don't trigger duplicate injection. This is the idempotency hinge. | ||
| # --------------------------------------------------------------------- | ||
| - name: inject-labeldrop-sm-append | ||
| match: | ||
| any: | ||
| - resources: | ||
| kinds: | ||
| - monitoring.coreos.com/v1/ServiceMonitor | ||
| mutate: | ||
| foreach: | ||
| - list: "request.object.spec.endpoints" | ||
| preconditions: | ||
| all: | ||
| - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}" | ||
| operator: GreaterThan | ||
| value: 0 | ||
| - key: >- | ||
| {{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }} | ||
| operator: Equals | ||
| value: 0 | ||
| patchesJson6902: |- | ||
| - op: add | ||
| path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings/-" | ||
| value: | ||
| action: labeldrop | ||
| regex: "world_id|trace_id|user_id|request_id|operator_id" | ||
| # --------------------------------------------------------------------- | ||
| # PodMonitor — Rule A: podMetricsEndpoints[*].metricRelabelings absent | ||
| # --------------------------------------------------------------------- | ||
| - name: inject-labeldrop-pm-absent | ||
| match: | ||
| any: | ||
| - resources: | ||
| kinds: | ||
| - monitoring.coreos.com/v1/PodMonitor | ||
| mutate: | ||
| foreach: | ||
| - list: "request.object.spec.podMetricsEndpoints" | ||
| preconditions: | ||
| all: | ||
| - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}" | ||
| operator: Equals | ||
| value: 0 | ||
| patchesJson6902: |- | ||
| - op: add | ||
| path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings" | ||
| value: | ||
| - action: labeldrop | ||
| regex: "world_id|trace_id|user_id|request_id|operator_id" | ||
| # --------------------------------------------------------------------- | ||
| # PodMonitor — Rule B: metricRelabelings exists, no labeldrop | ||
| # --------------------------------------------------------------------- | ||
| - name: inject-labeldrop-pm-append | ||
| match: | ||
| any: | ||
| - resources: | ||
| kinds: | ||
| - monitoring.coreos.com/v1/PodMonitor | ||
| mutate: | ||
| foreach: | ||
| - list: "request.object.spec.podMetricsEndpoints" | ||
| preconditions: | ||
| all: | ||
| - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}" | ||
| operator: GreaterThan | ||
| value: 0 | ||
| - key: >- | ||
| {{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }} | ||
| operator: Equals | ||
| value: 0 | ||
| patchesJson6902: |- | ||
| - op: add | ||
| path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings/-" | ||
| value: | ||
| action: labeldrop | ||
| regex: "world_id|trace_id|user_id|request_id|operator_id" |
| // Recovery engine — the single entry point for bounded auto-attempts. | ||
| // | ||
| // Key invariants: | ||
| // 1. ONE attempt per (worldId, failureKind) pair. The ledger enforces | ||
| // idempotency: a second call with the same key runs no steps and returns | ||
| // a new entry with outcome='escalated' (reusing the prior entry's scenario). | ||
| // 2. Concurrent calls for the same (worldId, failureKind) key fire only | ||
| // ONE attempt. An in-flight Map holds the running Promise; concurrent | ||
| // callers await the same Promise. | ||
| // 3. Steps execute in order. First failing step short-circuits to | ||
| // outcome='failed'; subsequent steps are NOT executed. | ||
| // 4. All attempts (success, failed, escalated) are written to the ledger. | ||
| // | ||
| // The engine is async and pure-functional with respect to the host-stream: | ||
| // callers (server.mjs) are responsible for emitting the recovery.* events | ||
| // AFTER receiving the returned RecoveryLedgerEntry. The engine does not | ||
| // broadcast directly, keeping it testable without a hostStream fixture. | ||
| import { findScenarioForKind } from './scenarios.mjs'; | ||
| import { appendLedgerEntry, findPriorEntry } from './ledger.mjs'; | ||
| import { runStep } from './step-runners.mjs'; | ||
| import { DEFAULT_LEDGER_PATH } from './ledger.mjs'; | ||
| /** | ||
| * @typedef {import('./ledger.mjs').RecoveryLedgerEntry} RecoveryLedgerEntry | ||
| * @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull | ||
| */ | ||
| // In-flight promise map: key = `${worldId}::${failureKind ?? 'null'}` → Promise<RecoveryLedgerEntry> | ||
| /** @type {Map<string, Promise<RecoveryLedgerEntry>>} */ | ||
| const _inFlight = new Map(); | ||
/**
 * Public entry point: run at most one concurrent recovery per
 * (worldId, failureKind) key.
 *
 * A caller that arrives while an attempt for the same key is still pending
 * receives that attempt's promise instead of starting a second one. The key
 * is released from the in-flight map as soon as the attempt settles.
 *
 * @param {string} worldId
 * @param {object} [evidence] — WorldStartupEvidence, optional
 * @param {FailureKindOrNull} [failureKind] — classified bucket, or null for non-FSM triggers
 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} [opts]
 * @returns {Promise<RecoveryLedgerEntry>}
 */
export function attemptRecovery(worldId, evidence, failureKind = null, opts = {}) {
  const dedupeKey = `${worldId}::${failureKind ?? 'null'}`;
  let pending = _inFlight.get(dedupeKey);
  if (!pending) {
    const release = () => {
      _inFlight.delete(dedupeKey);
    };
    pending = _attempt(worldId, evidence, failureKind, opts).finally(release);
    _inFlight.set(dedupeKey, pending);
  }
  return pending;
}
/**
 * Internal: run the actual attempt for one (worldId, failureKind) key.
 *
 * Resolves with a ledger entry for step failures AND for ledger I/O failures
 * instead of rejecting:
 *   - a step that throws → outcome='failed', remaining steps are skipped;
 *   - the ledger cannot be read → no step runs (the one-attempt guarantee
 *     cannot be verified), outcome='failed', nothing is written;
 *   - the ledger cannot be written → the failure is logged and the entry is
 *     still returned to the caller.
 *
 * @param {string} worldId
 * @param {object} [evidence]
 * @param {FailureKindOrNull} failureKind
 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} opts
 * @returns {Promise<RecoveryLedgerEntry>}
 */
async function _attempt(worldId, evidence, failureKind, opts) {
  const { ledgerPath = DEFAULT_LEDGER_PATH, log = (msg) => console.warn(`[recovery] ${msg}`) } = opts;
  const startedAt = Date.now();
  const describeError = (err) => err?.message ?? String(err);

  // Build an entry; endedAt is stamped at the moment the entry is created.
  const makeEntry = (scenario, stepsRun, outcome, errorReason) =>
    /** @type {RecoveryLedgerEntry} */ ({
      worldId,
      failureKind: failureKind ?? null,
      scenario,
      stepsRun,
      startedAt,
      endedAt: Date.now(),
      outcome,
      ...(errorReason !== undefined ? { errorReason } : {}),
    });

  // Best-effort persistence: a write failure must not turn into a rejection.
  const persist = async (entry) => {
    try {
      await appendLedgerEntry(entry, ledgerPath);
    } catch (err) {
      log(`recovery: ledger write failed — ${describeError(err)} (worldId=${worldId})`);
    }
    return entry;
  };

  // Idempotency check: a prior entry for this key means the single bounded
  // attempt was already spent, so escalate without running any step.
  let prior;
  try {
    prior = await findPriorEntry(worldId, failureKind, ledgerPath);
  } catch (err) {
    // Without a readable ledger we cannot prove that no attempt ran before.
    // Refuse to run steps; do not write, so the key is not consumed.
    const reason = `ledger read failed: ${describeError(err)}`;
    log(`recovery: aborted, cannot verify idempotency — ${reason} (worldId=${worldId})`);
    return makeEntry(findScenarioForKind(failureKind)?.name ?? 'unmatched', [], 'failed', reason);
  }
  if (prior !== undefined) {
    const escalated = await persist(
      makeEntry(prior.scenario, [], 'escalated', `prior attempt already recorded (outcome=${prior.outcome})`),
    );
    log(`recovery idempotency: escalated (worldId=${worldId}, kind=${failureKind})`);
    return escalated;
  }

  // Find the scenario.
  const scenario = findScenarioForKind(failureKind);
  if (!scenario) {
    const unmatched = await persist(makeEntry('unmatched', [], 'failed', 'no scenario matched'));
    log(`recovery: no scenario for kind=${failureKind} (worldId=${worldId})`);
    return unmatched;
  }
  log(`recovery: starting scenario="${scenario.name}" for worldId=${worldId}`);

  // Execute steps in order, short-circuit on first failure. A step is
  // recorded in stepsRun before it runs, so the failing step is included.
  /** @type {import('./recipes.mjs').RecoveryStep[]} */
  const stepsRun = [];
  /** @type {string | undefined} */
  let errorReason;
  /** @type {'success' | 'failed'} */
  let outcome = 'success';
  for (const step of scenario.recipe.steps) {
    stepsRun.push(step);
    try {
      await runStep(step, { worldId, evidence, log });
    } catch (err) {
      outcome = 'failed';
      errorReason = `step "${step.kind}" threw: ${describeError(err)}`;
      log(`recovery: step failed — ${errorReason}`);
      break;
    }
  }

  const entry = await persist(makeEntry(scenario.name, stepsRun, outcome, errorReason));
  log(`recovery: scenario="${scenario.name}" outcome=${outcome} (worldId=${worldId})`);
  return entry;
}
/**
 * Test-only helper: forget every pending attempt tracked in the in-flight
 * map so one test's state cannot leak into the next. The underlying promises
 * are not cancelled; only the map entries are dropped.
 */
export function _clearInFlight() {
  _inFlight.clear();
}
| // Recovery module barrel export. | ||
| // | ||
| // Public surface: | ||
| // - attemptRecovery — the engine entry point for callers (server.mjs) | ||
| // - FAILURE_SCENARIOS — the 7 named scenarios | ||
| // - findScenarioForKind — scenario lookup by failureKind | ||
| // - appendLedgerEntry / readAllLedgerEntries / findPriorEntry — ledger I/O | ||
| // - setStepRunnerSeams — test seam injection for step runners | ||
| // | ||
| // Internal: | ||
| // - _clearInFlight — test helper; not intended for production use | ||
| export { attemptRecovery, _clearInFlight } from './engine.mjs'; | ||
| export { FAILURE_SCENARIOS, findScenarioForKind } from './scenarios.mjs'; | ||
| export { appendLedgerEntry, readAllLedgerEntries, findPriorEntry, DEFAULT_LEDGER_PATH } from './ledger.mjs'; | ||
| export { runStep, setStepRunnerSeams } from './step-runners.mjs'; |
| // RecoveryLedger — append-only NDJSON persistence for recovery attempts. | ||
| // | ||
| // Each attempt writes one JSON line to the ledger file. The file grows | ||
| // monotonically; entries are never updated in-place. This keeps the | ||
| // ledger auditable and safe to tail/parse with `jq` while the process | ||
| // is running. | ||
| // | ||
| // Default path: ~/.olam/logs/recovery-ledger.ndjson | ||
| // Override: set OLAM_RECOVERY_LEDGER_PATH (useful in tests — point at a | ||
| // tmp file to isolate test runs from the real operator ledger). | ||
| import { open, mkdir, access } from 'node:fs/promises'; | ||
| import { join, dirname } from 'node:path'; | ||
| import { homedir } from 'node:os'; | ||
| import { createReadStream } from 'node:fs'; | ||
| import { createInterface } from 'node:readline'; | ||
| import { redactSensitive } from '../observability/redactor.mjs'; | ||
/**
 * Resolve the ledger location from the environment.
 *
 * An unset OR empty OLAM_RECOVERY_LEDGER_PATH falls back to
 * ~/.olam/logs/recovery-ledger.ndjson. An empty value (e.g. `VAR=` in an env
 * file) must not be used as a path: it would make every ledger open fail.
 *
 * @param {Record<string, string | undefined>} [env] — defaults to process.env
 * @returns {string}
 */
function resolveDefaultLedgerPath(env = process.env) {
  const override = env.OLAM_RECOVERY_LEDGER_PATH;
  if (typeof override === 'string' && override !== '') {
    return override;
  }
  return join(homedir(), '.olam', 'logs', 'recovery-ledger.ndjson');
}

/** Ledger path used when callers do not pass one; resolved once at module load. */
export const DEFAULT_LEDGER_PATH = resolveDefaultLedgerPath();
| /** | ||
| * @typedef {object} RecoveryLedgerEntry | ||
| * @property {string} worldId | ||
| * @property {string | null} failureKind — WorldStartupFailureKind or null (non-FSM trigger) | ||
| * @property {string} scenario — kebab-case scenario name, or 'unmatched' | ||
| * @property {import('./recipes.mjs').RecoveryStep[]} stepsRun — steps actually executed (may be partial on failure) | ||
| * @property {number} startedAt — epoch ms | ||
| * @property {number} endedAt — epoch ms | ||
| * @property {'success' | 'failed' | 'escalated'} outcome | ||
| * @property {string} [errorReason] — set on failed/escalated outcomes | ||
| */ | ||
/**
 * Append one RecoveryLedgerEntry to the ledger as a single NDJSON line.
 *
 * The parent directory is created on demand. The entry is passed through
 * redactSensitive() before it is serialized. The file handle is closed even
 * when the write fails.
 *
 * @param {RecoveryLedgerEntry} entry
 * @param {string} [ledgerPath]
 * @returns {Promise<void>}
 */
export async function appendLedgerEntry(entry, ledgerPath = DEFAULT_LEDGER_PATH) {
  const parentDir = dirname(ledgerPath);
  await mkdir(parentDir, { recursive: true });

  // 'a' = append mode; the file is created if it does not exist yet.
  const handle = await open(ledgerPath, 'a');
  try {
    const line = `${JSON.stringify(redactSensitive(entry))}\n`;
    await handle.write(line);
  } finally {
    await handle.close();
  }
}
/**
 * Read all entries from the ledger (in append order).
 *
 * Tolerant of damage: blank lines, lines that are not valid JSON, and lines
 * that parse to something other than a plain object (null, numbers, strings,
 * arrays) are skipped, so callers can rely on every element being an object.
 * A ledger file that does not exist yet yields an empty array.
 *
 * @param {string} [ledgerPath]
 * @returns {Promise<RecoveryLedgerEntry[]>}
 */
export async function readAllLedgerEntries(ledgerPath = DEFAULT_LEDGER_PATH) {
  /** @type {RecoveryLedgerEntry[]} */
  const entries = [];
  // Check existence before streaming — createReadStream reports ENOENT as an
  // error event rather than a synchronous throw. An explicit access check
  // keeps the "not yet written" path simple.
  try {
    await access(ledgerPath);
  } catch {
    return entries; // File does not exist yet.
  }
  const stream = createReadStream(ledgerPath, { encoding: 'utf8' });
  const rl = createInterface({ input: stream, crlfDelay: Infinity });
  for await (const line of rl) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      continue; // Malformed line — skip and continue.
    }
    // `null`, `42`, `"x"` and `[...]` are valid JSON but not ledger entries;
    // letting them through would make property access on an entry throw.
    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      entries.push(parsed);
    }
  }
  return entries;
}
/**
 * Look up the newest ledger entry recorded for a (worldId, failureKind) pair.
 *
 * The whole ledger is read and scanned from the end, so the last matching
 * line wins. An undefined failureKind is treated as null.
 *
 * @param {string} worldId
 * @param {string|null} failureKind
 * @param {string} [ledgerPath]
 * @returns {Promise<RecoveryLedgerEntry | undefined>} undefined when no entry matches
 */
export async function findPriorEntry(worldId, failureKind, ledgerPath = DEFAULT_LEDGER_PATH) {
  const entries = await readAllLedgerEntries(ledgerPath);
  let idx = entries.length;
  // Newest entries are at the end of the append-only file.
  while (idx-- > 0) {
    const candidate = entries[idx];
    if (candidate.worldId !== worldId) continue;
    if (candidate.failureKind === (failureKind ?? null)) {
      return candidate;
    }
  }
  return undefined;
}
| // Recovery step types and recipe interface — the discriminated union of | ||
| // all named steps that can appear in a RecoveryRecipe. | ||
| // | ||
| // Step runners for each kind live in step-runners.mjs. The engine in | ||
| // engine.mjs iterates a recipe's steps array and dispatches each to the | ||
| // appropriate runner. | ||
| // | ||
| // A RecoveryRecipe is an ordered list of steps. Steps execute in order; | ||
| // the first failing step short-circuits to a 'failed' outcome. | ||
| /** | ||
| * @typedef {{ kind: 'NotifyOperator', message?: string }} NotifyOperatorStep | ||
| * @typedef {{ kind: 'ResendTrustPrompt' }} ResendTrustPromptStep | ||
| * @typedef {{ kind: 'WaitFor', durationMs: number }} WaitForStep | ||
| * @typedef {{ kind: 'RestartTransport' }} RestartTransportStep | ||
| * @typedef {{ kind: 'ResendDispatch' }} ResendDispatchStep | ||
| * @typedef {{ kind: 'RestartWorker' }} RestartWorkerStep | ||
| * @typedef {{ kind: 'RestartMcpServer', serverName: string }} RestartMcpServerStep | ||
| * @typedef {{ kind: 'RetryHandshake', timeoutMs: number }} RetryHandshakeStep | ||
| * @typedef {{ kind: 'ReadPluginErrors' }} ReadPluginErrorsStep | ||
| * @typedef {{ kind: 'RestartPlugin', pluginName: string }} RestartPluginStep | ||
| * @typedef {{ kind: 'RebaseBranch' }} RebaseBranchStep | ||
| * @typedef {{ kind: 'CleanBuild' }} CleanBuildStep | ||
| * | ||
| * @typedef {| NotifyOperatorStep | ||
| * | ResendTrustPromptStep | ||
| * | WaitForStep | ||
| * | RestartTransportStep | ||
| * | ResendDispatchStep | ||
| * | RestartWorkerStep | ||
| * | RestartMcpServerStep | ||
| * | RetryHandshakeStep | ||
| * | ReadPluginErrorsStep | ||
| * | RestartPluginStep | ||
| * | RebaseBranchStep | ||
| * | CleanBuildStep | ||
| * } RecoveryStep | ||
| */ | ||
| /** | ||
| * @typedef {object} RecoveryRecipe | ||
| * @property {string} scenarioName — human-readable name of the scenario | ||
| * @property {RecoveryStep[]} steps — ordered list of steps to execute | ||
| */ | ||
| export {}; |
| // Recovery scenarios — named mappings from WorldStartupFailureKind (or a | ||
| // special non-FSM signal) to a deterministic RecoveryRecipe. | ||
| // | ||
| // Order within each recipe is load-bearing: steps execute in sequence, | ||
| // first failure short-circuits. Designed for ONE bounded auto-attempt; | ||
| // callers MUST NOT retry a scenario — the engine's idempotency guard | ||
| // enforces this at the (worldId, failureKind) level. | ||
| // | ||
| // The 'stale-branch' scenario has no failureKind (null) — it is triggered | ||
| // by a non-FSM signal (e.g. CI indicating the branch is stale). The engine | ||
| // accepts null as a valid key but treats it as a distinct bucket. | ||
| /** | ||
| * @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep | ||
| * @typedef {import('./recipes.mjs').RecoveryRecipe} RecoveryRecipe | ||
| * @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull | ||
| */ | ||
| /** | ||
| * @typedef {object} FailureScenario | ||
| * @property {string} name — kebab-case identifier | ||
| * @property {FailureKindOrNull} failureKind — the FSM bucket this scenario handles (null = non-FSM trigger) | ||
| * @property {string} description — one-line human summary | ||
| * @property {RecoveryRecipe} recipe | ||
| */ | ||
/**
 * Assemble one scenario record. The recipe's scenarioName always mirrors the
 * scenario's own name, so it is derived here instead of being repeated.
 *
 * @param {string} name — kebab-case identifier
 * @param {FailureKindOrNull} failureKind
 * @param {string} description
 * @param {RecoveryStep[]} steps — ordered; order is load-bearing
 * @returns {FailureScenario}
 */
function defineScenario(name, failureKind, description, steps) {
  return {
    name,
    failureKind,
    description,
    recipe: { scenarioName: name, steps },
  };
}

/**
 * The named recovery scenarios, one per failure bucket. Only the outer array
 * is frozen; the scenario objects themselves are plain objects.
 *
 * @type {readonly FailureScenario[]}
 */
export const FAILURE_SCENARIOS = Object.freeze([
  defineScenario(
    'trust-gate-stuck',
    'TrustGateUnanswered',
    'Agent reached TrustRequired but no trust approval arrived within the timeout.',
    [
      { kind: 'NotifyOperator', message: 'Trust gate unanswered — re-sending trust prompt.' },
      { kind: 'ResendTrustPrompt' },
      { kind: 'WaitFor', durationMs: 30_000 },
    ],
  ),
  defineScenario(
    'prompt-misdelivery',
    'PromptMisdelivery',
    'Dispatch was sent but the agent never received it (transport mismatch).',
    [{ kind: 'RestartTransport' }, { kind: 'ResendDispatch' }],
  ),
  defineScenario(
    'transport-dead',
    'TransportDead',
    'stdin/stdout/IPC channel never opened.',
    [{ kind: 'RestartTransport' }, { kind: 'RestartWorker' }],
  ),
  defineScenario(
    'mcp-handshake-stall',
    'McpHandshakeStall',
    'MCP server connection initialized but never completed handshake.',
    [
      { kind: 'RestartMcpServer', serverName: 'default' },
      { kind: 'RetryHandshake', timeoutMs: 15_000 },
    ],
  ),
  defineScenario(
    'plugin-startup-failed',
    'PluginStartupFailed',
    'Plugin or skill source failed to load on boot.',
    [
      { kind: 'ReadPluginErrors' },
      { kind: 'RestartPlugin', pluginName: 'default' },
      { kind: 'ResendDispatch' },
    ],
  ),
  defineScenario(
    'provider-process-gone',
    'ProviderProcessGone',
    'Agent (Claude Code) process exited before responding.',
    [{ kind: 'RestartWorker' }],
  ),
  // Non-FSM trigger: keyed by null rather than a WorldStartupFailureKind.
  defineScenario(
    'stale-branch',
    null,
    'Branch is stale relative to base — rebase + clean build required.',
    [{ kind: 'RebaseBranch' }, { kind: 'CleanBuild' }],
  ),
]);
/**
 * Return the first scenario whose failureKind is strictly equal to the given
 * value. Pass null to select the non-FSM scenario. Yields undefined when no
 * scenario is registered for the value.
 *
 * @param {FailureKindOrNull} failureKind
 * @returns {FailureScenario | undefined}
 */
export function findScenarioForKind(failureKind) {
  for (const candidate of FAILURE_SCENARIOS) {
    if (candidate.failureKind === failureKind) {
      return candidate;
    }
  }
  return undefined;
}
| // Step runners — one async function per RecoveryStep kind. | ||
| // | ||
| // FULLY IMPLEMENTED: | ||
| // RestartMcpServer — kills the named MCP server process and waits for it to | ||
| // restart by polling the health endpoint. | ||
| // RetryHandshake — re-initiates the MCP handshake sequence with a timeout | ||
| // derived from the step's timeoutMs field. | ||
| // | ||
| // STUB (TODO killshot-3-follow-up): | ||
| // All other step kinds log intent and return success. The stubs are | ||
| // intentionally not no-ops — they emit a console.warn so operators can see | ||
| // which steps fired without actually changing system state. | ||
| import { setTimeout as sleep } from 'node:timers/promises'; | ||
| /** | ||
| * @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep | ||
| * | ||
| * @typedef {object} StepContext | ||
| * @property {string} worldId | ||
| * @property {object} [evidence] — WorldStartupEvidence bundle, may be undefined for non-FSM triggers | ||
| * @property {(msg: string) => void} [log] — optional logger; defaults to console.warn | ||
| */ | ||
/**
 * Execute one recovery step for a world.
 *
 * `RestartMcpServer` and `RetryHandshake` are delegated to their runners.
 * Every other known kind is a stub: it only emits a `[stub] …` line through
 * the logger and resolves. An unrecognised kind rejects.
 *
 * Rejects if the step fails — the engine catches and short-circuits.
 *
 * @param {RecoveryStep} step
 * @param {StepContext} ctx — `ctx.log` is used when provided; otherwise
 *   messages go to `console.warn` with a `[recovery]` prefix
 * @returns {Promise<void>}
 */
export async function runStep(step, ctx) {
  const emit = ctx.log ?? ((msg) => console.warn(`[recovery] ${msg}`));
  const kind = step.kind;

  // The two kinds that have real runners.
  if (kind === 'RestartMcpServer') {
    return restartMcpServer(step.serverName, ctx, emit);
  }
  if (kind === 'RetryHandshake') {
    return retryHandshake(step.timeoutMs, ctx, emit);
  }

  // --- STUBS (TODO killshot-3-follow-up) ---
  // Each entry lazily builds the line the stub logs. A Map (rather than a
  // plain object) keeps kinds such as "toString" from matching by accident.
  const stubLines = new Map([
    ['NotifyOperator', () => `[stub] NotifyOperator: ${step.message ?? '(no message)'} — worldId=${ctx.worldId}`],
    ['ResendTrustPrompt', () => `[stub] ResendTrustPrompt — worldId=${ctx.worldId}`],
    // The stub does not actually wait — a real implementation would
    // integrate with the world's state machine timeout.
    ['WaitFor', () => `[stub] WaitFor ${step.durationMs}ms — worldId=${ctx.worldId} (short-circuiting to 0ms in stub)`],
    ['RestartTransport', () => `[stub] RestartTransport — worldId=${ctx.worldId}`],
    ['ResendDispatch', () => `[stub] ResendDispatch — worldId=${ctx.worldId}`],
    ['RestartWorker', () => `[stub] RestartWorker — worldId=${ctx.worldId}`],
    ['ReadPluginErrors', () => `[stub] ReadPluginErrors — worldId=${ctx.worldId}`],
    ['RestartPlugin', () => `[stub] RestartPlugin: ${step.pluginName} — worldId=${ctx.worldId}`],
    ['RebaseBranch', () => `[stub] RebaseBranch — worldId=${ctx.worldId}`],
    ['CleanBuild', () => `[stub] CleanBuild — worldId=${ctx.worldId}`],
  ]);

  const buildLine = stubLines.get(kind);
  if (buildLine !== undefined) {
    emit(buildLine());
    return;
  }

  // Reached only when a new step kind was added without a runner here.
  throw new Error(`runStep: unknown step kind "${step.kind}"`);
}
// ─── RestartMcpServer ────────────────────────────────────────────────────────
// Interval between handshake probes and the overall budget for the server to
// come back. Both are overridable via setStepRunnerSeams for testing.
let _mcpRestartPollMs = 500;
let _mcpRestartTimeoutMs = 10_000;
/**
 * Restart the named MCP server and wait until it is reachable again.
 *
 * Sequence:
 *   1. Invoke the `_execRestartSignal` seam to request the restart.
 *   2. Call the `_probeMcpHandshake` seam repeatedly, sleeping
 *      `_mcpRestartPollMs` between attempts, until it reports the server
 *      alive or `_mcpRestartTimeoutMs` has elapsed.
 *
 * In the current host-cp architecture, MCP servers are child processes
 * spawned by the in-world container-cp, NOT by host-cp directly, so host-cp
 * cannot signal them itself. For the bounded scope of Killshot #3 the default
 * seams only log their intent; real exec wiring is tracked as a follow-up.
 *
 * @param {string} serverName
 * @param {StepContext} ctx
 * @param {(msg: string) => void} log
 * @returns {Promise<void>} resolves once a probe succeeds
 * @throws {Error} when no probe succeeds before the timeout
 */
async function restartMcpServer(serverName, ctx, log) {
  log(`RestartMcpServer: restarting "${serverName}" for worldId=${ctx.worldId}`);

  // Request the restart. In production this would exec into the container
  // (host-cp's Docker API /exec path on the devbox container) and SIGTERM the
  // mcp-server process.
  await _execRestartSignal(serverName, ctx.worldId, log);

  // Probe until success or until the time budget is spent.
  const giveUpAt = Date.now() + _mcpRestartTimeoutMs;
  for (let probeCount = 1; Date.now() < giveUpAt; probeCount += 1) {
    if (await _probeMcpHandshake(serverName, ctx.worldId, log)) {
      log(`RestartMcpServer: "${serverName}" came back after ${probeCount} probe(s)`);
      return;
    }
    await sleep(_mcpRestartPollMs);
  }

  throw new Error(
    `RestartMcpServer: "${serverName}" did not come back within ${_mcpRestartTimeoutMs}ms`,
  );
}
// ─── RetryHandshake ──────────────────────────────────────────────────────────
/**
 * Re-initiate the MCP handshake and wait up to `timeoutMs` for it to succeed.
 *
 * The handshake follows the MCP JSON-RPC initialize → initialized pattern.
 * host-cp's role is to ask the in-world MCP coordinator to re-run it (via the
 * `_sendHandshakeInitialize` seam) and then poll the `_probeHandshakeComplete`
 * seam. The pause between polls is a tenth of the timeout, capped at 500 ms.
 *
 * @param {number} timeoutMs
 * @param {StepContext} ctx
 * @param {(msg: string) => void} log
 * @returns {Promise<void>} resolves once a probe reports completion
 * @throws {Error} when no probe succeeds before the timeout
 */
async function retryHandshake(timeoutMs, ctx, log) {
  log(`RetryHandshake: initiating handshake for worldId=${ctx.worldId} timeout=${timeoutMs}ms`);
  await _sendHandshakeInitialize(ctx.worldId, log);

  const pauseMs = Math.min(500, Math.floor(timeoutMs / 10));
  const giveUpAt = Date.now() + timeoutMs;
  while (Date.now() < giveUpAt) {
    const completed = await _probeHandshakeComplete(ctx.worldId, log);
    if (!completed) {
      await sleep(pauseMs);
      continue;
    }
    log(`RetryHandshake: handshake succeeded for worldId=${ctx.worldId}`);
    return;
  }

  throw new Error(
    `RetryHandshake: handshake did not complete within ${timeoutMs}ms for worldId=${ctx.worldId}`,
  );
}
// ─── Seam functions (injectable for testing) ─────────────────────────────────
//
// These four bindings are the I/O boundaries of the step runners. They are
// declared with `let` so setStepRunnerSeams() can swap them for deterministic
// stubs in tests. The defaults below only log what they would do.

/** @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<void>} */
let _execRestartSignal = async function execRestartSignal(serverName, worldId, log) {
  // Production: Docker exec into the devbox container for this world
  // (`olam-<worldId>-devbox`) and SIGTERM the mcp-server process by name.
  //
  // Until the Docker exec channel is wired (killshot-3-follow-up), log only:
  log(`[seam] execRestartSignal: would exec SIGTERM mcp-${serverName} in olam-${worldId}-devbox`);
};

/** @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<boolean>} */
let _probeMcpHandshake = async function probeMcpHandshake(serverName, worldId, log) {
  log(`[seam] probeMcpHandshake: would probe mcp-${serverName} alive in olam-${worldId}-devbox`);
  // Optimistic default: reports the server as back. A real implementation
  // would query the in-world MCP registry or ping a health endpoint.
  return true;
};

/** @type {(worldId: string, log: (m: string) => void) => Promise<void>} */
let _sendHandshakeInitialize = async function sendHandshakeInitialize(worldId, log) {
  log(`[seam] sendHandshakeInitialize: would send MCP initialize for worldId=${worldId}`);
};

/** @type {(worldId: string, log: (m: string) => void) => Promise<boolean>} */
let _probeHandshakeComplete = async function probeHandshakeComplete(worldId, log) {
  log(`[seam] probeHandshakeComplete: would probe handshake complete for worldId=${worldId}`);
  // Optimistic default.
  return true;
};
/**
 * Swap seam functions and/or timing constants, typically from a test.
 *
 * Function overrides are applied when truthy; timing overrides are applied
 * when they are numbers (so `0` is honoured). Anything omitted keeps its
 * current value.
 *
 * @param {{
 *   execRestartSignal?: typeof _execRestartSignal,
 *   probeMcpHandshake?: typeof _probeMcpHandshake,
 *   sendHandshakeInitialize?: typeof _sendHandshakeInitialize,
 *   probeHandshakeComplete?: typeof _probeHandshakeComplete,
 *   mcpRestartTimeoutMs?: number,
 *   mcpRestartPollMs?: number,
 * }} overrides
 * @returns {() => void} cleanup — resets all six values to what they were
 *   when this call was made
 */
export function setStepRunnerSeams(overrides = {}) {
  // Snapshot everything up front so the cleanup restores the full prior state.
  const snapshot = [
    _execRestartSignal,
    _probeMcpHandshake,
    _sendHandshakeInitialize,
    _probeHandshakeComplete,
    _mcpRestartTimeoutMs,
    _mcpRestartPollMs,
  ];

  const {
    execRestartSignal,
    probeMcpHandshake,
    sendHandshakeInitialize,
    probeHandshakeComplete,
    mcpRestartTimeoutMs,
    mcpRestartPollMs,
  } = overrides;

  if (execRestartSignal) {
    _execRestartSignal = execRestartSignal;
  }
  if (probeMcpHandshake) {
    _probeMcpHandshake = probeMcpHandshake;
  }
  if (sendHandshakeInitialize) {
    _sendHandshakeInitialize = sendHandshakeInitialize;
  }
  if (probeHandshakeComplete) {
    _probeHandshakeComplete = probeHandshakeComplete;
  }
  if (typeof mcpRestartTimeoutMs === 'number') {
    _mcpRestartTimeoutMs = mcpRestartTimeoutMs;
  }
  if (typeof mcpRestartPollMs === 'number') {
    _mcpRestartPollMs = mcpRestartPollMs;
  }

  return function restoreStepRunnerSeams() {
    [
      _execRestartSignal,
      _probeMcpHandshake,
      _sendHandshakeInitialize,
      _probeHandshakeComplete,
      _mcpRestartTimeoutMs,
      _mcpRestartPollMs,
    ] = snapshot;
  };
}
| // agent-runtime-trigger — Phase B B7 (minimum-demo cut) host-side launch hook. | ||
| // | ||
| // When the SPA opens the plan-tab for a (worldId, sessionId), it POSTs | ||
| // here; host-cp idempotently spawns the agent-stream-launch supervisor | ||
| // inside the world's devbox container via `docker exec`. The supervisor | ||
| // (PID 1 within the spawned exec session) then fork-spawns driver + | ||
| // codex runners that long-poll host-cp's /v1/shape. | ||
| // | ||
| // Demo-cut simplifications (per minimum-demo decision; full B7 in follow-up): | ||
| // - In-memory idempotency map keyed by `(worldId, sessionId)`. Restart of | ||
| // host-cp loses state; second call after restart re-issues docker exec, | ||
| // which the supervisor's idempotency check (B6-full's flock + PID-file) | ||
| // would catch. B6-minimum has no such check → restart of host-cp + | ||
| // re-trigger may spawn two supervisors. Acceptable for single-operator | ||
| // local demo; full B7 + B6-full close this. | ||
| // - Uses shared-secret bearer (from `~/.olam/plan-chat-secret` per the | ||
| // existing plan-chat-service contract). JWT scope-claim migration is B9. | ||
| // - No conversation_id ↔ (worldId, sessionId) join-table (A1.4 | ||
| // §migration-schema open question). For demo, the supervisor is | ||
| // keyed by (worldId, sessionId) directly; codex's APPROVE chunks | ||
| // write under (worldId, sessionId) — `conversation_id` plumbing | ||
| // deferred until lookouts (B3) need it. | ||
| // - No host-cp restart cleanup of dead supervisor entries (the in-memory | ||
| // map only tracks live spawns; container crash + re-trigger DOES | ||
| // re-spawn). | ||
| // | ||
| // Source: docs/design/olam-plan-chat-agent-runtime.md `lifecycle` + | ||
| // `bake-in-seam` sections, minimum-demo cut. | ||
| import { spawnSync, spawn } from 'node:child_process'; | ||
| const SPAWN_TIMEOUT_MS = 10_000; | ||
| // Default container-side path for the supervisor binary. | ||
| // In source-mode (OLAM_DEV=1): the operator's built host dist is bind-mounted | ||
| // read-only at /opt/olam/agent-stream/dist (Phase B1, olam-world-bundle-freshness). | ||
| // The mount overlays the image-baked dist, so this path always resolves to the | ||
| // freshest available binary — no docker cp required. | ||
| // In install-mode / cloud: the image-baked dist (devbox.runtime.glibc.Dockerfile | ||
| // lines 263-287 bake step) is the fallback; the path is the same. | ||
| const DEFAULT_SUPERVISOR_PATH = '/opt/olam/agent-stream/dist/agent-stream-launch.js'; | ||
| /** | ||
| * @typedef {object} TriggerArgs | ||
| * @property {string} worldId | ||
| * @property {string} sessionId | ||
| * @property {string} hostCpUrl — URL the container reaches host-cp at | ||
| * (e.g. `http://host.docker.internal:3112`) | ||
| * @property {string} bearer — shared-secret token (read from | ||
| * `~/.olam/plan-chat-secret` server-side; never passed in from SPA) | ||
| * @property {string} [dockerHost='docker-cli'] — `'docker-cli'` for bare-node | ||
| * mode; `tcp://...` for container mode (docker-socket-proxy) | ||
| * @property {string} [supervisorPath] — override for tests | ||
| * @property {(cmd: string, args: string[], opts?: object) => any} [spawnSyncImpl] | ||
| * — injectable for tests; defaults to node:child_process spawnSync | ||
| * @property {(cmd: string, args: string[], opts?: object) => any} [spawnImpl] | ||
| * — injectable for tests; defaults to node:child_process spawn (detached) | ||
| */ | ||
/**
 * Registry of supervisors this host-cp process has already launched, keyed
 * by the string `key(worldId, sessionId)` produces. Lives in memory only, so
 * it is empty again after a host-cp restart.
 *
 * Entries record `pid` for docker-cli spawns and `execId` for socket-proxy
 * spawns.
 *
 * @type {Map<string, {spawnedAt: number, pid?: number, execId?: string}>}
 */
const liveSpawns = new Map();
/**
 * Build the `liveSpawns` lookup key for a world/session pair.
 *
 * @param {string} worldId
 * @param {string} sessionId
 * @returns {string} `<worldId>::<sessionId>`
 */
function key(worldId, sessionId) {
  const separator = '::';
  return `${worldId}${separator}${sessionId}`;
}
/**
 * Idempotently spawn the agent-stream supervisor inside the world's devbox
 * container (`olam-<worldId>-devbox`).
 *
 * A `(worldId, sessionId)` pair that is already present in the in-memory
 * `liveSpawns` registry is not spawned again; the recorded entry is reported
 * instead. Otherwise the container is verified to be running and the
 * supervisor is launched detached, either through the docker CLI
 * (`dockerHost === 'docker-cli'`) or through the Docker Engine API behind a
 * socket proxy (`dockerHost` starting with `tcp://`).
 *
 * NOTE(review): in docker-cli mode the bearer is passed as `-e KEY=value` on
 * the docker command line, so it is visible in the host's process list while
 * the CLI runs. Left unchanged here; confirm this is acceptable under the
 * single-operator threat model.
 *
 * @param {TriggerArgs} args
 * @returns {Promise<
 *   | {status: 'spawned', container: string, pid?: number}
 *   | {status: 'spawned', container: string, execId: string}
 *   | {status: 'already-running', container: string, spawnedAt: number, pid?: number, execId?: string}
 * >}
 * @throws {Error} when a required argument is missing, the container is
 *   missing or not running, the docker CLI / socket proxy reports a failure,
 *   or `dockerHost` is neither `'docker-cli'` nor a `tcp://` URL
 */
export async function triggerAgentRuntime(args) {
  const {
    worldId,
    sessionId,
    hostCpUrl,
    bearer,
    dockerHost = 'docker-cli',
    supervisorPath = DEFAULT_SUPERVISOR_PATH,
    spawnSyncImpl = spawnSync,
    spawnImpl = spawn,
  } = args;
  if (!worldId || !sessionId || !hostCpUrl || !bearer) {
    throw new Error(
      'triggerAgentRuntime: worldId, sessionId, hostCpUrl, bearer all required',
    );
  }

  const k = key(worldId, sessionId);
  const containerName = `olam-${worldId}-devbox`;

  const existing = liveSpawns.get(k);
  if (existing) {
    return {
      status: 'already-running',
      container: containerName,
      spawnedAt: existing.spawnedAt,
      pid: existing.pid,
      // Socket-proxy spawns have no pid; surface the exec id that was
      // recorded for them so the caller still gets an identifier. The key is
      // omitted entirely for CLI spawns to keep that reply shape unchanged.
      ...(existing.execId !== undefined ? { execId: existing.execId } : {}),
    };
  }

  const request = { k, containerName, worldId, sessionId, hostCpUrl, bearer, supervisorPath };

  // Bare-node mode: shell out to `docker exec --detach` so the SPA's HTTP
  // request returns promptly; the supervisor lives until SIGTERM.
  if (dockerHost === 'docker-cli') {
    return _spawnViaDockerCli(request, spawnSyncImpl, spawnImpl);
  }

  // Container mode (docker-socket-proxy on tcp://<host>:<port>).
  if (dockerHost.startsWith('tcp://')) {
    return _spawnViaSocketProxy(request, dockerHost.replace(/^tcp:\/\//, 'http://'));
  }

  throw new Error(
    `triggerAgentRuntime: unsupported dockerHost mode '${dockerHost}'`,
  );
}

/**
 * docker-cli transport: verify the container with `docker inspect`, then
 * launch the supervisor with a detached `docker exec --detach`.
 *
 * @private
 */
function _spawnViaDockerCli(request, spawnSyncImpl, spawnImpl) {
  const { k, containerName, worldId, sessionId, hostCpUrl, bearer, supervisorPath } = request;

  // `docker inspect` exits 1 if the container is not found; exits 0 with
  // the running state on stdout if it is.
  const inspect = spawnSyncImpl(
    'docker',
    ['inspect', '--format', '{{.State.Running}}', containerName],
    { encoding: 'utf-8', timeout: SPAWN_TIMEOUT_MS },
  );
  if (inspect.error) {
    throw new Error(
      `docker inspect ${containerName} failed: ${inspect.error.message}`,
    );
  }
  if (inspect.status !== 0) {
    throw new Error(
      `docker inspect ${containerName} exit ${inspect.status}: ${(inspect.stderr || '').trim()}`,
    );
  }
  const runningState = (inspect.stdout || '').trim();
  if (runningState !== 'true') {
    throw new Error(
      `container ${containerName} is not running (state: ${runningState})`,
    );
  }

  // -e flags inject the runtime env; the supervisor binary path is the last
  // positional argument.
  const env = {
    HOST_CP_URL: hostCpUrl,
    HOST_CP_BEARER: bearer,
    WORLD_ID: worldId,
    SESSION_ID: sessionId,
  };
  const execArgs = ['exec', '--detach'];
  for (const [name, value] of Object.entries(env)) {
    execArgs.push('-e', `${name}=${value}`);
  }
  execArgs.push(containerName, 'node', supervisorPath);

  const detached = spawnImpl('docker', execArgs, {
    stdio: 'ignore',
    detached: true,
  });
  const entry = { spawnedAt: Date.now(), pid: detached.pid };

  // A ChildProcess emits 'error' asynchronously when the process could not
  // be spawned. Without a listener that event becomes an uncaught exception
  // in host-cp, and the registry would keep claiming the supervisor is live.
  detached.on?.('error', (err) => {
    if (liveSpawns.get(k) === entry) {
      liveSpawns.delete(k);
    }
    console.warn(
      `[agent-runtime-trigger] docker exec for ${containerName} failed to start: ${err?.message ?? err}`,
    );
  });
  detached.unref?.();

  liveSpawns.set(k, entry);
  return {
    status: 'spawned',
    container: containerName,
    pid: detached.pid,
  };
}

/**
 * Socket-proxy transport. Two-step Docker API exec: POST
 * /containers/<name>/exec creates an exec instance, then POST
 * /exec/<id>/start with Detach=true runs it in the background.
 *
 * Every request carries an abort timeout so an unresponsive proxy cannot
 * hang the caller indefinitely.
 *
 * @private
 */
async function _spawnViaSocketProxy(request, apiBase) {
  const { k, containerName, worldId, sessionId, hostCpUrl, bearer, supervisorPath } = request;
  const containerUrl = `${apiBase}/containers/${encodeURIComponent(containerName)}`;

  // Step 0: verify the container is running.
  const inspectRes = await fetch(`${containerUrl}/json`, {
    signal: AbortSignal.timeout(SPAWN_TIMEOUT_MS),
  });
  if (!inspectRes.ok) {
    throw new Error(
      `socket-proxy GET /containers/${containerName}/json: ${inspectRes.status} ${inspectRes.statusText}`,
    );
  }
  const inspect = await inspectRes.json();
  if (!inspect?.State?.Running) {
    throw new Error(
      `container ${containerName} is not running (state: ${JSON.stringify(inspect?.State)})`,
    );
  }

  // Step 1: create exec instance with env injection.
  const createRes = await fetch(`${containerUrl}/exec`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      Cmd: ['node', supervisorPath],
      Env: [
        `HOST_CP_URL=${hostCpUrl}`,
        `HOST_CP_BEARER=${bearer}`,
        `WORLD_ID=${worldId}`,
        `SESSION_ID=${sessionId}`,
      ],
      AttachStdout: false,
      AttachStderr: false,
      Tty: false,
    }),
    signal: AbortSignal.timeout(SPAWN_TIMEOUT_MS),
  });
  if (!createRes.ok) {
    const errBody = await createRes.text().catch(() => '<no body>');
    throw new Error(
      `socket-proxy POST /containers/${containerName}/exec: ${createRes.status} — ${errBody}`,
    );
  }
  const created = await createRes.json();
  const execId = created?.Id;
  if (!execId) {
    // Without an id the start call below would target /exec/undefined/start.
    throw new Error(
      `socket-proxy POST /containers/${containerName}/exec: response had no exec Id`,
    );
  }

  // Step 2: start exec in detached mode.
  const startRes = await fetch(`${apiBase}/exec/${execId}/start`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ Detach: true, Tty: false }),
    signal: AbortSignal.timeout(SPAWN_TIMEOUT_MS),
  });
  if (!startRes.ok) {
    const errBody = await startRes.text().catch(() => '<no body>');
    throw new Error(
      `socket-proxy POST /exec/${execId}/start: ${startRes.status} — ${errBody}`,
    );
  }

  liveSpawns.set(k, { spawnedAt: Date.now(), execId });
  return {
    status: 'spawned',
    container: containerName,
    execId,
  };
}
/**
 * Test-only reset: empties the in-memory live-spawns registry.
 *
 * Production code must NEVER call this. Once the registry forgets a
 * `(worldId, sessionId)` pair, the next trigger for that pair spawns a
 * second supervisor.
 *
 * @returns {void}
 */
export function _clearLiveSpawnsForTests() {
  liveSpawns.clear();
}
/**
 * Observability helper: snapshot of the live-spawns registry.
 *
 * The returned Map is a shallow copy, so adding or removing keys on it does
 * not affect the registry. The entry objects themselves are shared.
 *
 * @returns {ReadonlyMap<string, {spawnedAt: number, pid?: number, execId?: string}>}
 */
export function getLiveSpawns() {
  const snapshot = new Map(liveSpawns.entries());
  return snapshot;
}
| /** | ||
| * Operator-facing diagnostic for auth-service authentication failures. | ||
| * | ||
| * Pre-fix, an empty OLAM_AUTH_SECRET (compose.yaml's | ||
| * `${OLAM_AUTH_SECRET:-}` interpolation when the operator's shell | ||
| * didn't export it) silently 401'd every host-cp → auth-service | ||
| * call. The SPA showed "0 credentials" with no log line explaining | ||
| * why. Logging a clear hint — both at boot when the env var is empty | ||
| * AND on the first runtime 401 — turns a silent footgun into a | ||
| * grep-able warning. | ||
| * | ||
| * Lives in its own file (not server.mjs) so unit tests can import it | ||
| * without triggering server.mjs's top-level mkdir + http.listen side | ||
| * effects. | ||
| */ | ||
/**
 * Build the operator hint logged when host-cp cannot authenticate against
 * the auth-service.
 *
 * @param {object} ctx
 * @param {string} ctx.authServiceUrl
 *   The configured auth-service base URL — echoed in the message so the
 *   operator can cross-reference it with their compose env.
 * @param {boolean} ctx.hasSecret
 *   Truthy when host-cp's OLAM_AUTH_SECRET is set (so a 401 means the value
 *   does not match); falsy when it is empty (the original silent-fail mode).
 * @returns {string}
 *   A single-line message safe for `console.warn` / docker-compose-logs.
 */
export function authSecretHint({ authServiceUrl, hasSecret }) {
  const subject = `[auth] auth-service at ${authServiceUrl}`;

  // Each message is kept as space-joined fragments to stay on one line.
  const secretMissing = [
    `${subject} is configured but`,
    'OLAM_AUTH_SECRET is empty — every credentials/* call will 401.',
    'Set the env var to the contents of ~/.olam/auth-secret (or run',
    "'olam host-cp start' so the CLI loads it for you).",
  ];
  const secretMismatch = [
    `${subject} returned 401 even though`,
    'OLAM_AUTH_SECRET is set — the secret does NOT match the value the',
    'auth-service container is using. Check that both containers were',
    'started from the same ~/.olam/auth-secret file and recreate them',
    'together if the file changed.',
  ];

  return (hasSecret ? secretMismatch : secretMissing).join(' ');
}
| // Phase F-2-B (B4): startup-token authentication for host CP. | ||
| // | ||
| // On boot: generate a 32-byte hex token (or reuse the file if it | ||
| // exists), write to `~/.olam/host-cp.token` with mode 0600, cache in | ||
| // memory. Middleware on all non-static, non-bootstrap routes validates | ||
| // the request via: | ||
| // - Cookie `olam_host_cp_token=<value>` | ||
| // - OR Authorization: Bearer <value> | ||
| // Reject 401 if neither matches. | ||
| // | ||
| // Threat model (T4 mitigation): | ||
| // - Bound to 127.0.0.1:19000 only (compose.yaml). No public exposure. | ||
| // - Single-user-per-host assumption; multi-user is Phase G+. | ||
| // - Token file is chmod 600 owned by the operator. Browser tabs on | ||
| // the same machine that try to hit :19000 are blocked unless they | ||
| // have the token (cookie or header). | ||
| // - /api/bootstrap returns the token unauthenticated. Rationale: | ||
| // anything local that can hit 127.0.0.1:19000 can also read | ||
| // ~/.olam/host-cp.token (same OS-level privilege boundary). This | ||
| // just removes a UX friction step. NOT acceptable in multi-user | ||
| // mode (Phase G+ uses cookie-with-Secure+HttpOnly via real auth). | ||
| import crypto from 'node:crypto'; | ||
| import fs from 'node:fs'; | ||
| import path from 'node:path'; | ||
/**
 * Startup-token store and request authenticator for host CP.
 *
 * Holds one token that is persisted at `tokenPath` and cached in memory, and
 * checks incoming requests against it (Bearer header or
 * `olam_host_cp_token` cookie).
 */
export class StartupToken {
  /**
   * @param {object} opts
   * @param {string} opts.tokenPath absolute path to the token file
   * @param {() => string} [opts.generate] defaults to 32-byte hex via crypto.randomBytes
   * @param {(message: string) => void} [opts.log] defaults to console.log
   * @param {typeof fs} [opts.fs] injectable for tests
   * @throws {Error} when `tokenPath` is missing or not absolute
   */
  constructor({ tokenPath, generate, log = console.log, fs: fsImpl = fs }) {
    if (!tokenPath || !path.isAbsolute(tokenPath)) {
      throw new Error('StartupToken: tokenPath must be an absolute path');
    }
    this.tokenPath = tokenPath;
    this.generate = generate ?? (() => crypto.randomBytes(32).toString('hex'));
    this.log = log;
    this.fs = fsImpl;
    /** @type {string | null} */
    this.token = null;
  }

  /**
   * Ensure the token exists in memory + on disk. Call once at server boot
   * before listen(). Idempotent: subsequent calls return the cached value.
   *
   * Behavior:
   *   - If tokenPath exists and holds at least 16 characters: cache and
   *     return it. (The lifecycle CLI's `olam host-cp start` may have
   *     written the token before the container starts; the operator-visible
   *     value must be reused, not regenerated.)
   *   - If tokenPath exists but is shorter than that: treat it as corrupted
   *     and regenerate.
   *   - Else: generate a new token and write it with mode 0600.
   *
   * The parent directory is created first when it does not exist.
   *
   * @returns {string}
   */
  ensure() {
    if (this.token) return this.token;

    const dir = path.dirname(this.tokenPath);
    if (!this.fs.existsSync(dir)) {
      this.fs.mkdirSync(dir, { recursive: true });
    }

    if (!this.fs.existsSync(this.tokenPath)) {
      this.token = this._writeNew();
      return this.token;
    }

    const raw = this.fs.readFileSync(this.tokenPath, 'utf-8').trim();
    if (raw.length < 16) {
      // Defensive: a too-short token is almost certainly a corrupted file.
      // Regenerate rather than accept.
      this.log(`auth: existing token at ${this.tokenPath} too short (${raw.length}); regenerating`);
      this.token = this._writeNew();
    } else {
      this.token = raw;
      this.log(`auth: reused existing token at ${this.tokenPath}`);
    }
    return this.token;
  }

  /**
   * Generate a token and persist it at `tokenPath` with owner-only access.
   *
   * @private
   * @returns {string} the new token
   */
  _writeNew() {
    const t = this.generate();
    this.fs.writeFileSync(this.tokenPath, t, { mode: 0o600 });
    // writeFileSync's `mode` only takes effect when it creates the file. On
    // the regenerate path the file already exists and would keep whatever
    // (possibly wider) permissions it had, so tighten them explicitly.
    // Best-effort: an injected fs without chmodSync is tolerated, and a
    // chmod failure is reported without aborting boot.
    try {
      this.fs.chmodSync?.(this.tokenPath, 0o600);
    } catch (err) {
      this.log(`auth: could not restrict permissions on ${this.tokenPath}: ${err?.message ?? err}`);
    }
    this.log(`auth: generated new token at ${this.tokenPath} (${t.length} chars)`);
    return t;
  }

  /**
   * Check request authorization. Accepts the token either as
   * `Authorization: Bearer <token>` (scheme matched case-insensitively, as
   * HTTP auth schemes are) or as the `olam_host_cp_token` cookie.
   *
   * Always false until `ensure()` has populated the token.
   *
   * @param {import('node:http').IncomingMessage} req
   * @returns {boolean}
   */
  isAuthorized(req) {
    if (!this.token) return false;

    // Bearer header
    const authHeader = req.headers['authorization'];
    if (typeof authHeader === 'string' && /^bearer /i.test(authHeader)) {
      const got = authHeader.slice('Bearer '.length).trim();
      if (this._compare(got)) return true;
    }

    // Cookie
    const cookieHeader = req.headers['cookie'];
    if (typeof cookieHeader === 'string') {
      const cookies = parseCookies(cookieHeader);
      const got = cookies['olam_host_cp_token'];
      if (got && this._compare(got)) return true;
    }

    return false;
  }

  /**
   * Compare a presented value with the stored token using
   * crypto.timingSafeEqual, so the time taken does not depend on where the
   * first mismatching byte is.
   *
   * @private
   * @param {string} got
   * @returns {boolean}
   */
  _compare(got) {
    if (!this.token || typeof got !== 'string') return false;
    const presented = Buffer.from(got);
    const expected = Buffer.from(this.token);
    // timingSafeEqual throws on unequal byte lengths. Check the encoded
    // length here (not the UTF-16 string length) so multi-byte input is
    // rejected without relying on a caught exception.
    if (presented.length !== expected.length) return false;
    return crypto.timingSafeEqual(presented, expected);
  }
}
/**
 * Turn a `Cookie` request header into a name → value object.
 *
 * Pairs are split on `;` and trimmed. Only the first `=` separates name from
 * value, so values may themselves contain `=` (e.g. base64 padding). A pair
 * without `=` is stored with an empty value; empty segments are ignored. When
 * a name repeats, the last occurrence wins.
 *
 * @param {string} header
 * @returns {Record<string, string>}
 */
export function parseCookies(header) {
  /** @type {Record<string, string>} */
  const jar = {};
  const segments = header
    .split(';')
    .map((segment) => segment.trim())
    .filter((segment) => segment !== '');

  for (const segment of segments) {
    const cut = segment.indexOf('=');
    const name = cut === -1 ? segment : segment.slice(0, cut).trim();
    const value = cut === -1 ? '' : segment.slice(cut + 1).trim();
    jar[name] = value;
  }
  return jar;
}
| /** | ||
| * Boot-time reconciler — sync worlds.db with live docker state. | ||
| * | ||
| * Problem (issue #963): after Colima / userspace restart, host-cp can | ||
| * start with worlds.db rows that no longer reflect docker reality. The | ||
| * existing `worlds-db-source.mjs` reconciler runs DB→registry (reads | ||
| * 'running' rows and adds them to in-memory WORLDS). It does NOT heal | ||
| * the inverse case: a container is alive on docker but worlds.db has | ||
| * no row (Hazel coral-sky-2478 scenario), or worlds.db says a world is | ||
| * running but the container is gone (orphaned row). | ||
| * | ||
| * This module fills both gaps with a one-shot pass at boot: | ||
| * | ||
| * 1. List `olam-*-devbox` containers via the docker API. | ||
| * 2. For each container, derive the worldId (strip prefix + suffix). | ||
| * 3. Cross-check against worlds.db rows: | ||
| * - container alive, row exists → no-op | ||
| * - container alive, row missing → INSERT (status=reconciled) | ||
| * - row says running/active, container missing → UPDATE status=orphaned | ||
| * | ||
| * Fail-soft: if the docker daemon is unreachable OR better-sqlite3 is | ||
| * not available, the function logs a warning and returns without | ||
| * throwing. Server boot continues. | ||
| * | ||
| * Idempotent: a second invocation against the same docker + DB state | ||
| * produces no further changes (existing rows are skipped at step 3a, | ||
| * already-orphaned rows are skipped at step 3c). | ||
| * | ||
| * Coordination with issue #962: the dedup logic in `olam create` handles | ||
| * per-call deduplication; this reconciler handles boot-time cleanup. | ||
| * They don't conflict — both operate on the worlds.db source-of-truth. | ||
| */ | ||
| import { createRequire } from 'node:module'; | ||
| const require = createRequire(import.meta.url); | ||
| const CONTAINER_NAME_PATTERN = /^\/?(olam-(.+)-devbox)$/; | ||
| /** | ||
| * @typedef {object} ReconcileDeps | ||
| * @property {string} dbPath Path to worlds.db | ||
| * @property {() => Promise<string[] | null>} listContainerNames Returns null when docker is unreachable | ||
| * @property {(msg: string) => void} [log] Defaults to console.log | ||
| * @property {() => string} [now] ISO timestamp generator (overridable for tests) | ||
| * @property {(path: string) => unknown | null} [openDb] Overridable DB opener (tests inject fakes) | ||
| */ | ||
| /** | ||
| * @typedef {object} ReconcileSummary | ||
| * @property {number} inserted Number of new rows inserted (reconciled containers) | ||
| * @property {number} orphaned Number of rows transitioned to status='orphaned' | ||
| * @property {number} skipped Containers/rows where no change was needed | ||
| * @property {boolean} dockerUnreachable | ||
| * @property {boolean} dbUnavailable | ||
| */ | ||
/**
 * Derive the worldId embedded in a devbox container name.
 *
 * Both `olam-foo-bar-1234-devbox` and `/olam-foo-bar-1234-devbox` are
 * accepted (the docker API prefixes container names with a slash); the
 * result for either is `foo-bar-1234`.
 *
 * @param {string} name
 * @returns {string | null} the worldId, or `null` when `name` is not a
 *   string or does not match the devbox naming pattern
 */
export function extractWorldIdFromContainerName(name) {
  if (typeof name !== 'string') {
    return null;
  }
  // Capture group 2 is the part between the `olam-` prefix and `-devbox` suffix.
  const worldId = CONTAINER_NAME_PATTERN.exec(name)?.[2];
  return worldId || null;
}
/**
 * Default docker container lister. Issues
 * `GET <dockerApiBase>/containers/json` with a `name: ['olam-']` filter and a
 * 5 second abort timeout, then flattens every container's `Names` array.
 *
 * Fail-soft: returns null on ANY failure so the caller treats docker as
 * unreachable. That includes a successful response whose JSON body is not an
 * array. Returning `[]` in that case would tell the reconciler "docker is
 * reachable and has no containers", which would orphan every live world.
 *
 * @param {string} dockerApiBase e.g. 'http://docker-socket-proxy:2375'
 * @param {(msg: string) => void} [log] Defaults to console.log
 * @returns {Promise<string[] | null>} Container names as reported by docker,
 *   or null when docker could not be queried.
 */
export async function defaultListContainerNames(dockerApiBase, log = console.log) {
  if (!dockerApiBase || dockerApiBase === 'http://localhost:2375') {
    // Empty base or the localhost default: treated as "no API available in
    // this deployment mode" (bare-node / docker-cli).
    log('[boot-reconciler] docker API unavailable (bare-node mode); skipping');
    return null;
  }
  try {
    const filters = encodeURIComponent(JSON.stringify({ name: ['olam-'] }));
    const url = `${dockerApiBase}/containers/json?filters=${filters}`;
    const res = await fetch(url, { signal: AbortSignal.timeout(5000) });
    if (!res.ok) {
      log(`[boot-reconciler] docker /containers/json returned ${res.status}; skipping`);
      return null;
    }
    const data = await res.json();
    if (!Array.isArray(data)) {
      // An unexpected payload is a failure, not an empty container list.
      log('[boot-reconciler] docker /containers/json returned a non-array payload; skipping');
      return null;
    }
    return data.flatMap((container) => {
      const list = container?.Names;
      return Array.isArray(list) ? list.filter((n) => typeof n === 'string') : [];
    });
  } catch (err) {
    // `err` may be a non-Error value (e.g. a thrown string); never crash here.
    log(`[boot-reconciler] docker query failed: ${err?.message ?? String(err)}; skipping`);
    return null;
  }
}
/**
 * Default DB opener. Loads better-sqlite3 lazily (at call time, through the
 * module-level `require`) so a missing native build degrades gracefully
 * instead of crashing host-cp boot. The database is opened with
 * `fileMustExist: true`, so a missing file is reported rather than created.
 *
 * Never throws: every failure is logged and mapped to null, including
 * failures where the thrown value is not an Error object.
 *
 * @param {string} dbPath
 * @param {(msg: string) => void} [log] Defaults to console.log
 * @returns {unknown | null} The opened database handle, or null on failure.
 */
export function defaultOpenDb(dbPath, log = console.log) {
  try {
    const Database = require('better-sqlite3');
    return new Database(dbPath, { fileMustExist: true });
  } catch (err) {
    const code = err?.code;
    if (code === 'MODULE_NOT_FOUND') {
      log('[boot-reconciler] better-sqlite3 not available; skipping');
    } else if (code === 'SQLITE_CANTOPEN') {
      log(`[boot-reconciler] ${dbPath} not found; nothing to reconcile`);
    } else {
      // Guarded like the branches above: `err` may be null or a non-Error.
      log(`[boot-reconciler] failed to open ${dbPath}: ${err?.message ?? String(err)}`);
    }
    return null;
  }
}
/**
 * Run a single boot-time reconciliation pass between live docker containers
 * and the `worlds` table. Dependency-injected for testability.
 *
 * Steps:
 *   1. List container names; a null/non-array result (or a throwing lister)
 *      marks docker unreachable and ends the pass.
 *   2. Open the DB; a null handle (or a throwing opener) marks the DB
 *      unavailable and ends the pass.
 *   3. Pass 1: insert a `status='reconciled'` row for each live container
 *      with no DB row. Pass 2: mark rows whose status is running / active /
 *      creating but whose container is missing as `status='orphaned'`.
 *
 * Fail-soft: this function resolves with a summary and does not reject for
 * docker, DB-open, query, statement-preparation or per-row failures; each is
 * logged. A statement that cannot be prepared (e.g. schema drift) only
 * disables its own pass. The DB handle is always closed.
 *
 * @param {ReconcileDeps} deps
 * @returns {Promise<ReconcileSummary>}
 */
export async function reconcileWorldsWithDocker(deps) {
  const log = deps.log ?? console.log;
  const now = deps.now ?? (() => new Date().toISOString());
  const openDb = deps.openDb ?? ((p) => defaultOpenDb(p, log));
  const describe = (err) => err?.message ?? String(err);
  const summary = {
    inserted: 0,
    orphaned: 0,
    skipped: 0,
    dockerUnreachable: false,
    dbUnavailable: false,
  };

  let containerNames;
  try {
    containerNames = await deps.listContainerNames();
  } catch (err) {
    // An injected lister that throws is treated like one that returns null.
    log(`[boot-reconciler] container listing failed: ${describe(err)}; skipping`);
    containerNames = null;
  }
  if (!Array.isArray(containerNames)) {
    summary.dockerUnreachable = true;
    return summary;
  }

  const liveWorldIds = new Set();
  for (const name of containerNames) {
    const worldId = extractWorldIdFromContainerName(name);
    if (worldId) liveWorldIds.add(worldId);
  }

  let db;
  try {
    db = openDb(deps.dbPath);
  } catch (err) {
    log(`[boot-reconciler] failed to open ${deps.dbPath}: ${describe(err)}`);
    db = null;
  }
  if (!db) {
    summary.dbUnavailable = true;
    return summary;
  }

  // Prepare a statement without letting a schema mismatch abort boot.
  const prepareOrNull = (sql, label) => {
    try {
      return db.prepare(sql);
    } catch (err) {
      log(`[boot-reconciler] failed to prepare ${label} statement: ${describe(err)}; skipping that pass`);
      return null;
    }
  };

  try {
    /** @type {Array<{ id: string, status: string }>} */
    let rows;
    try {
      rows = db.prepare('SELECT id, status FROM worlds').all();
    } catch (err) {
      log(`[boot-reconciler] query failed: ${describe(err)}; skipping`);
      summary.dbUnavailable = true;
      return summary;
    }
    const dbWorlds = new Map(rows.map((r) => [r.id, r.status]));

    // Pass 1: containers alive but missing from DB → insert.
    const insertStmt = prepareOrNull(
      `INSERT INTO worlds
         (id, name, status, repos, branch, port_offset, workspace_path,
          compute_provider, total_cost_usd, thought_count, created_at, updated_at)
       VALUES (?, ?, 'reconciled', '[]', 'main', 0, ?, 'docker', 0, 0, ?, ?)`,
      'insert',
    );
    for (const worldId of liveWorldIds) {
      if (dbWorlds.has(worldId)) {
        summary.skipped += 1;
        continue;
      }
      if (!insertStmt) continue;
      const ts = now();
      const workspacePath = `~/.olam/worlds/${worldId}`;
      try {
        insertStmt.run(worldId, worldId, workspacePath, ts, ts);
        summary.inserted += 1;
        log(`[boot-reconciler] inserted reconciled row for ${worldId} (container alive, no DB row)`);
      } catch (err) {
        log(`[boot-reconciler] failed to insert ${worldId}: ${describe(err)}`);
      }
    }

    // Pass 2: DB says alive but container missing → mark orphaned.
    // Works from the row snapshot taken above, so rows inserted in pass 1
    // are not revisited.
    const orphanStmt = prepareOrNull(
      `UPDATE worlds SET status = 'orphaned', updated_at = ? WHERE id = ?`,
      'orphan',
    );
    const aliveStatuses = new Set(['running', 'active', 'creating']);
    if (orphanStmt) {
      for (const [worldId, status] of dbWorlds) {
        if (liveWorldIds.has(worldId)) continue;
        if (!aliveStatuses.has(status)) continue;
        try {
          orphanStmt.run(now(), worldId);
          summary.orphaned += 1;
          log(`[boot-reconciler] marked ${worldId} as orphaned (was '${status}', container missing)`);
        } catch (err) {
          log(`[boot-reconciler] failed to mark ${worldId} orphaned: ${describe(err)}`);
        }
      }
    }

    log(
      `[boot-reconciler] complete: inserted=${summary.inserted} orphaned=${summary.orphaned} ` +
        `skipped=${summary.skipped} live-containers=${liveWorldIds.size}`,
    );
  } finally {
    try { db.close?.(); } catch { /* ignore */ }
  }
  return summary;
}
| // bootstrap-selective.mjs — Phase D1 helper, collapsed to a wildcard in | ||
| // Phase E5 (ATOMIC SERVING CUTOVER). | ||
| // | ||
| // Determines whether a SPA shell render path should SKIP the host-cp | ||
| // BOOTSTRAP_SCRIPT injection (cookie-bootstrap + fetch/EventSource | ||
| // rewrite shim) and instead let the served SPA's own auth resolver + | ||
| // world-fetch shim handle auth. | ||
| // | ||
| // Phase E5: plan-chat-spa is now host-cp's SOLE served SPA. Its bundle | ||
| // re-homes the cookie-bootstrap + world-fetch-rewrite + 401-recover shim | ||
| // (packages/plan-chat-spa/src/lib/worldFetch.ts, installed at the top of | ||
| // src/main.tsx — Phase C). Therefore host-cp NEVER needs to inject | ||
| // BOOTSTRAP_SCRIPT anymore: every path is a "planning" (== SPA-owned) | ||
| // path. isPlanningPath() is collapsed to a wildcard accordingly. | ||
| // | ||
| // Reversal: set isPlanningPath to consult BOOTSTRAP_NOOP_PLANNING_PATHS | ||
| // again (restore the prefix-match body below) to re-narrow the no-op to | ||
| // the explicit planning prefixes; or, for full pre-D behaviour, also set | ||
| // BOOTSTRAP_NOOP_PLANNING_PATHS to []. The const is retained as the | ||
| // documented revert seam. | ||
| // | ||
| // Per K1 SCP-3 + phase-d-tasks.md D1 + phase-e-tasks.md E2. | ||
/**
 * Path prefixes formerly owned by plan-chat-spa under the Phase D selective
 * no-op. Kept only as the documented revert seam: isPlanningPath() no longer
 * reads this list, and re-narrowing the bootstrap no-op to the planning
 * surfaces means restoring a prefix-match body there that consults it.
 *
 * Each surface is listed twice: the bare segment ("/plan") and the
 * trailing-slash form ("/plan/"), the latter serving as the prefix for
 * "/plan/<rest>".
 *
 * Frozen so the list cannot be mutated at runtime.
 *
 * @type {readonly string[]}
 */
export const BOOTSTRAP_NOOP_PLANNING_PATHS = Object.freeze(['/plan', '/plan/']);
/**
 * Phase E5 wildcard predicate: every string pathname counts as an SPA-owned
 * ("planning") path, so callers skip host-cp's BOOTSTRAP_SCRIPT injection for
 * all of them. The served SPA bundle carries its own cookie-bootstrap and
 * world-fetch-rewrite shim.
 *
 * Only non-string input yields false (defensive: a non-string pathname is
 * never a real served path).
 *
 * @param {unknown} pathname
 * @returns {boolean} true for any string, false otherwise.
 */
export function isPlanningPath(pathname) {
  if (typeof pathname !== 'string') {
    return false;
  }
  return true;
}
| /** | ||
| * Phase E4 (olam-dogfood-vision): WorldsSource composition + dedup. | ||
| * | ||
| * Runs every configured WorldsSource (E1) in parallel and dedupes by | ||
| * `id`. Source-array order expresses precedence: the LAST source to | ||
| * claim an id wins on collision. server.mjs (E4 wiring via | ||
| * `buildWorldsSources`) orders sources `[localSource, pylonSource]` | ||
| * so cloud-side metadata overrides local stubs when the Pylon SDK | ||
| * eventually returns real data for a world that's also docker- | ||
| * resident locally. | ||
| * | ||
| * The function is intentionally pure + dep-free (no env reads, no | ||
| * http, no module-level state) so vitest can drive it with two mock | ||
| * sources to assert dedup direction without spinning up the server. | ||
| * | ||
| * ## Failure-mode contract (CP3 audit follow-up — closes CRIT/HIGH-1+2) | ||
| * | ||
| * Robustness goals: | ||
| * 1. **One bad source must NOT take down the union.** Pylon SDK | ||
| * transient outages, auth errors, network blips — these MUST | ||
| * degrade to "cloud worlds missing this poll" rather than | ||
| * "/api/worlds endpoint hangs". Achieved via `Promise.allSettled` | ||
| * + per-source try/log/treat-as-empty. | ||
| * 2. **Slow sources MUST NOT extend wall time past the SPA poll | ||
| * cadence.** The SPA polls every 4s (Worlds.tsx:124); a Pylon | ||
| * `client.worlds.list()` that takes 8s would block, queue | ||
| * sockets, and pile up overlapping fetches. Achieved via | ||
| * per-source `Promise.race` with `timeoutMs` (default 2000ms, | ||
| * matching the existing docker-inspect timeout in | ||
| * fetchWorldServices). A timed-out source is treated as `[]` for | ||
| * this poll. | ||
| * 3. **A failing source must produce a log line, not a silent | ||
| * empty.** Operators need to see "[worlds-source] pylon-cloud | ||
| * list() failed: <err>" in the host-cp boot log so the | ||
| * degradation is observable. | ||
| * | ||
| * ## Dedup semantics on collision (CP3 audit follow-up — closes HIGH-4) | ||
| * | ||
| * Whole-record replacement (the pre-audit behavior) blanks fields the | ||
| * later source doesn't populate. Concrete example: Pylon returns | ||
| * `{services: undefined}` (or omits the field entirely) for a freshly- | ||
| * claimed world while Local has `{services: [4 entries]}`. Whole- | ||
| * record replacement would drop the local services array; the SPA | ||
| * would render the world with no clickable links until Pylon | ||
| * back-fills. | ||
| * | ||
| * Field-merge (the post-audit behavior): later source's defined | ||
| * fields override earlier; earlier source's fields are preserved | ||
| * where the later source omits them. `id` and `source` always come | ||
| * from the later source (the precedence contract). Implementation: | ||
| * `{ ...byId.get(id), ...world }` — ES spread skips own properties | ||
| * with value `undefined` only if the producer ELIDES them; explicit | ||
| * `field: undefined` does override. Therefore source authors should | ||
| * OMIT fields they don't manage rather than setting them to | ||
| * `undefined` / `[]`. | ||
| * | ||
| * @typedef {import('./worlds-source.mjs').WorldsSource} WorldsSource | ||
| * @typedef {import('./worlds-source.mjs').WorldSummary} WorldSummary | ||
| */ | ||
| /** | ||
| * @typedef {object} ComposeWorldsSourcesOptions | ||
| * @property {number} [timeoutMs=8000] | ||
| * Per-source timeout cap. A source whose `list()` doesn't settle | ||
| * within this budget is reported via `onSourceError` and replaced by | ||
| * its last-known-good result (or skipped when none is cached) for | ||
| * this composition pass. Defaults to DEFAULT_TIMEOUT_MS (8000 ms), | ||
| * which bounds the /api/worlds path's worst-case wall time. | ||
| * @property {(sourceName: string, err: unknown) => void} [onSourceError] | ||
| * Invoked when a source rejects or times out. Defaults to | ||
| * `console.error('[worlds-source] <name> list() failed:', err)`. | ||
| * Tests inject a spy to assert log behavior without polluting | ||
| * stderr. | ||
| */ | ||
| const DEFAULT_TIMEOUT_MS = 8000; | ||
| /** | ||
| * Per-source last-known-good cache. Keyed by source.name → WorldSummary[]. | ||
| * When a source resolves successfully, its output is stored here. When a | ||
| * source rejects or times out, we fall back to the cached value so the | ||
| * dashboard shows stale data rather than blanking. Stale data self-heals | ||
| * on the next successful poll. | ||
| * | ||
| * Process-local, no TTL — the running server is authoritative. Tests that | ||
| * need a clean slate should call _resetLastKnownGoodCache(). | ||
| * | ||
| * @type {Map<string, import('./worlds-source.mjs').WorldSummary[]>} | ||
| */ | ||
| const _lastKnownGood = new Map(); | ||
/**
 * Race a promise against a deadline. If `ms` elapses first, the returned
 * promise rejects with an Error naming the source, so `onSourceError` can log
 * something useful. The timer is cleared once the race settles either way,
 * so no timer is left pending.
 *
 * @template T
 * @param {Promise<T>} promise Work to bound.
 * @param {number} ms Deadline in milliseconds.
 * @param {string} sourceName Included in the timeout error message.
 * @returns {Promise<T>}
 */
function withTimeout(promise, ms, sourceName) {
  /** @type {ReturnType<typeof setTimeout> | null} */
  let handle = null;
  const deadline = new Promise((_resolve, reject) => {
    const expire = () => {
      reject(new Error(`source "${sourceName}" timed out after ${ms}ms`));
    };
    handle = setTimeout(expire, ms);
  });
  const cancelTimer = () => {
    if (handle !== null) clearTimeout(handle);
  };
  return Promise.race([promise, deadline]).finally(cancelTimer);
}
/**
 * Test-only hook: empties the per-source last-known-good cache so the next
 * composition pass starts without any stale fallback data.
 *
 * @returns {void}
 */
export function _resetLastKnownGoodCache() {
  _lastKnownGood.clear();
}
/**
 * Run every source's `list()` concurrently (each bounded by `withTimeout`)
 * and merge the results into one list keyed by `id`.
 *
 * Failure handling, per source:
 *   - a rejection, a timeout, a SYNCHRONOUS throw from `list()`, or a result
 *     that is not an array is reported through `onSourceError`;
 *   - the source's last-known-good result is used instead when one is
 *     cached, otherwise the source contributes nothing to this pass;
 *   - only array results are written to the last-known-good cache.
 * One misbehaving source therefore never rejects the whole composition.
 *
 * @param {WorldsSource[]} sources
 *   Sources to compose. Order expresses precedence: later wins.
 * @param {ComposeWorldsSourcesOptions} [options]
 * @returns {Promise<WorldSummary[]>}
 *   Deduped union of every source's `list()` output, keyed by `id`.
 *   On collision: fields from later source override earlier where
 *   present; earlier fields preserved where later source omits them.
 */
export async function composeWorldsSources(sources, options = {}) {
  if (sources.length === 0) return [];
  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const onSourceError =
    options.onSourceError ??
    ((name, err) => {
      console.error(`[worlds-source] ${name} list() failed:`, err);
    });

  // Convert a synchronous throw from list() into a rejection so it is
  // isolated by Promise.allSettled like any other per-source failure.
  const startList = (source) => {
    try {
      return Promise.resolve(source.list());
    } catch (err) {
      return Promise.reject(err);
    }
  };

  const settled = await Promise.allSettled(
    sources.map((s) => withTimeout(startList(s), timeoutMs, s.name)),
  );

  /** @type {Map<string, WorldSummary>} */
  const byId = new Map();
  for (const [index, result] of settled.entries()) {
    const source = sources[index];
    let resolved;
    if (result.status === 'fulfilled' && Array.isArray(result.value)) {
      resolved = result.value;
      _lastKnownGood.set(source.name, resolved);
    } else {
      const reason =
        result.status === 'rejected'
          ? result.reason
          : new TypeError(`source "${source.name}" list() resolved to a non-array value`);
      onSourceError(source.name, reason);
      // Prefer stale data over blanking; skip the source if nothing is cached.
      resolved = _lastKnownGood.get(source.name);
      if (!resolved) continue;
    }
    for (const world of resolved) {
      // Field-merge on collision: later source overrides earlier
      // where present; earlier preserved where later omits. Keeps
      // fields populated by an earlier source intact when a later source
      // claims the same world without supplying them.
      const prior = byId.get(world.id);
      byId.set(world.id, prior ? { ...prior, ...world } : world);
    }
  }
  return [...byId.values()];
}
| // config-reader.mjs — Phase D (olam-config-store-unification): a host-cp-local, | ||
| // DEPENDENCY-FREE reader for a single dotted value out of `config.json`. | ||
| // | ||
| // # Why a copy lives here (not an `@olam/core` import) | ||
| // | ||
| // host-cp is a pure `.mjs` package with NO `@olam/core` dependency — it cannot | ||
| // import the TypeScript cloud-state resolver, and a relative reach into | ||
| // `packages/core/src/...` would (a) couple host-cp to core's source layout and | ||
| // (b) fail to resolve in the published/container build where core is not a | ||
| // sibling on disk. The canonical zero-dep reader is | ||
| // `packages/core/src/cloud-state/read-config-value.mjs`; this module INLINES the | ||
| // same logic (Phase D tracker explicitly permits copy-inlining the tiny reader) | ||
| // and adds host-cp's container-aware `config.json` directory resolution. | ||
| // | ||
| // # Container path resolution | ||
| // | ||
| // host-cp reads operator state from a bind-mount, NOT from `~/.olam` directly: | ||
| // compose.yaml mounts `${HOME}/.olam → /data`, so inside the container the | ||
| // canonical config lives at `/data/config.json` (os.homedir() → /root, which is | ||
| // the ephemeral container layer — the WRONG place, the same bug fixed for | ||
| // plan.db / plan-chat-secret). `olamConfigDir()` resolves the directory holding | ||
| // `config.json` honouring, in order: | ||
| // 1. process.env.OLAM_HOME (explicit override — D2 requirement) | ||
| // 2. '/data' when HOST_CP_MODE==='container' (the compose bind-mount target) | ||
| // 3. path.join(os.homedir(), '.olam') (bare-node install — no behaviour change) | ||
| // | ||
| // Returns the resolved value or `null` (file absent, bad JSON, or path miss) — | ||
| // NEVER throws, so a fail-open caller degrades gracefully to its legacy legs. | ||
| import { readFileSync, existsSync } from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
/**
 * Resolve host-cp's deployment mode without depending on server.mjs
 * (server.mjs imports this module, not the other way round).
 *
 * An OLAM_HOST_CP_MODE environment value, when set, is returned verbatim.
 * Otherwise the mode is auto-detected: 'container' if `/.dockerenv` exists,
 * else 'bare'.
 *
 * @returns {string} Normally 'container' or 'bare'; any explicit
 *   OLAM_HOST_CP_MODE value is passed through unchanged.
 */
function hostCpMode() {
  const explicit = process.env.OLAM_HOST_CP_MODE;
  if (explicit !== undefined && explicit !== null) {
    return explicit;
  }
  return existsSync('/.dockerenv') ? 'container' : 'bare';
}
/**
 * Locate the directory that holds `config.json`. Resolution order:
 *   1. a non-empty OLAM_HOME environment value;
 *   2. '/data' when hostCpMode() reports 'container';
 *   3. `<home>/.olam` otherwise.
 *
 * process.env is read on every call (nothing is captured at module load), so
 * a late OLAM_HOME export is picked up.
 *
 * @returns {string}
 */
export function olamConfigDir() {
  const { OLAM_HOME: override } = process.env;
  if (override) {
    return override;
  }
  return hostCpMode() === 'container' ? '/data' : path.join(os.homedir(), '.olam');
}
/**
 * Full path of the `config.json` host-cp reads: the file named `config.json`
 * inside the directory returned by olamConfigDir().
 *
 * @returns {string}
 */
export function configJsonPath() {
  const configDir = olamConfigDir();
  return path.join(configDir, 'config.json');
}
/**
 * Read a dotted path (e.g. `cloud.urls.anthropic-base-url`) out of the
 * `config.json` located by configJsonPath().
 *
 * Only `.` splits segments, so dash-containing keys like `kg-proxy-url` are
 * fine. Each segment must be an OWN property of the current object:
 * inherited names such as `constructor` or `toString` are misses, not values.
 *
 * Fail-open and non-throwing: a non-string `dotpath`, an absent or unreadable
 * file, corrupt JSON, or a path miss all yield `null`.
 *
 * @param {string} dotpath
 * @returns {string | number | boolean | object | null} The leaf value, the
 *   sub-object for an interior path, or `null` when not set.
 */
export function readConfigValue(dotpath) {
  if (typeof dotpath !== 'string') return null;
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(configJsonPath(), 'utf8'));
  } catch {
    // File absent / unreadable, or corrupt JSON → treated as "not set".
    return null;
  }
  let cur = parsed;
  for (const seg of dotpath.split('.')) {
    // Own-property check: `in` would also match Object.prototype members.
    if (cur === null || typeof cur !== 'object' || !Object.hasOwn(cur, seg)) return null;
    cur = cur[seg];
  }
  return cur ?? null;
}
/**
 * String-only convenience over readConfigValue(): yields the trimmed value
 * when the leaf is a string with non-whitespace content, otherwise null
 * (non-string leaves, misses and blank strings all map to null).
 *
 * @param {string} dotpath
 * @returns {string | null}
 */
export function readConfigString(dotpath) {
  const raw = readConfigValue(dotpath);
  const trimmed = typeof raw === 'string' ? raw.trim() : '';
  return trimmed.length > 0 ? trimmed : null;
}
| // Phase F-2-B (B3): fetch a per-world container's X-Olam-Secret via the | ||
| // docker-socket-proxy sidecar (container mode) OR via `docker exec` (bare- | ||
| // node mode — host-cp running as a plain Node process on the host). | ||
| // | ||
| // The secret lives at `/tmp/olam-container-secret` inside the world's | ||
| // devbox container. Phase E init wrote it (`chmod 400` owned by root — | ||
| // world-app user has no write permission, T9 mitigation) and the | ||
| // per-world CP's `requireAuth` middleware compares against it. Host CP | ||
| // reads the secret server-side and injects `X-Olam-Secret` on proxied | ||
| // requests, so the browser never sees the secret directly. | ||
| // | ||
| // Container mode (`dockerHost = 'tcp://docker-socket-proxy:2375'`): | ||
| // 1. POST /containers/<name>/exec | ||
| // body: { Cmd: ['cat', '/tmp/olam-container-secret'], AttachStdout: true, AttachStderr: true } | ||
| // → { Id: '<exec-id>' } | ||
| // 2. POST /exec/<exec-id>/start | ||
| // body: { Detach: false, Tty: false } | ||
| // → response stream containing the file bytes (raw multiplexed | ||
| // stdout/stderr per Docker exec protocol) | ||
| // | ||
| // The exec endpoint is whitelisted in the socket-proxy (EXEC=1). | ||
| // | ||
| // Bare-node mode (`dockerHost = 'docker-cli'`): | ||
| // Spawn `docker exec <containerName> cat /tmp/olam-container-secret` via | ||
| // child_process. No socket-proxy on the host; the docker CLI on the | ||
| // operator's $PATH is the canonical access path. Same `olam-<id>-devbox` | ||
| // naming convention applies. ~10 ms of process-spawn overhead per miss | ||
| // is fine because the secret is cached for OLAM_SECRET_CACHE_TTL_SEC | ||
| // (default 300 s). | ||
| import { spawnSync } from 'node:child_process'; | ||
/**
 * Read /tmp/olam-container-secret from a world's devbox container
 * (`olam-<worldId>-devbox`).
 *
 * Two access paths, selected by `dockerHost`:
 *   - `'docker-cli'`: runs `docker exec <container> cat <file>` synchronously
 *     via spawnSync.
 *   - anything else: treated as a Docker API endpoint (`tcp://` is rewritten
 *     to `http://`); creates an exec instance, starts it, and decodes the
 *     multiplexed response stream.
 *
 * Both paths are bounded by `timeoutMs` so a hung docker CLI or socket-proxy
 * cannot stall the caller indefinitely (the CLI path blocks the event loop
 * while it runs).
 *
 * @param {object} args
 * @param {string} args.worldId
 * @param {string} args.dockerHost Either `tcp://...` for socket-proxy
 *   mode or the sentinel `'docker-cli'` for bare-node mode.
 * @param {(host: string, init: RequestInit) => Promise<Response>} [args.fetchImpl]
 *   injectable for tests; defaults to global fetch
 * @param {number} [args.timeoutMs=10000] Upper bound for the CLI invocation,
 *   or for the whole create+start HTTP exchange.
 * @returns {Promise<string>} the secret, trimmed of surrounding whitespace
 * @throws {Error} when the CLI fails or exits non-zero, when either HTTP call
 *   is non-2xx, when no exec Id is returned, when the deadline passes, or
 *   when the secret is empty.
 */
export async function fetchContainerSecret({
  worldId,
  dockerHost,
  fetchImpl = globalThis.fetch,
  timeoutMs = 10_000,
}) {
  // Container naming convention: `olam-${worldId}-devbox`.
  const containerName = `olam-${worldId}-devbox`;

  // Bare-node mode: shell out to docker exec directly (argv form, no shell).
  if (dockerHost === 'docker-cli') {
    const result = spawnSync(
      'docker',
      ['exec', containerName, 'cat', '/tmp/olam-container-secret'],
      { encoding: 'utf-8', timeout: timeoutMs },
    );
    if (result.error) {
      // Includes spawn failures and the timeout case.
      throw new Error(`docker exec ${containerName} cat ... failed: ${result.error.message}`);
    }
    if (result.status !== 0) {
      throw new Error(
        `docker exec ${containerName} cat ... exit ${result.status}: ${(result.stderr || '').trim()}`,
      );
    }
    const secret = (result.stdout || '').trim();
    if (!secret) {
      throw new Error(`/tmp/olam-container-secret empty in container ${containerName}`);
    }
    return secret;
  }

  // Container mode: HTTP via the docker-socket-proxy sidecar.
  // Docker API: tcp://host:port → http://host:port
  const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
  // One deadline shared by both requests and the body read.
  const signal = AbortSignal.timeout(timeoutMs);

  // Step 1: create exec instance
  const createUrl = `${apiBase}/containers/${encodeURIComponent(containerName)}/exec`;
  const createRes = await fetchImpl(createUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      Cmd: ['cat', '/tmp/olam-container-secret'],
      AttachStdout: true,
      AttachStderr: true,
      Tty: false,
    }),
    signal,
  });
  if (!createRes.ok) {
    throw new Error(
      `socket-proxy POST /containers/${containerName}/exec failed: ${createRes.status} ${createRes.statusText}`,
    );
  }
  const createBody = await createRes.json();
  const execId = createBody.Id;
  if (!execId) {
    throw new Error(`socket-proxy /exec did not return Id: ${JSON.stringify(createBody)}`);
  }

  // Step 2: start exec and read the multiplexed stream (8-byte header per
  // frame + payload; see decodeDockerExecStream).
  const startUrl = `${apiBase}/exec/${execId}/start`;
  const startRes = await fetchImpl(startUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ Detach: false, Tty: false }),
    signal,
  });
  if (!startRes.ok) {
    throw new Error(
      `socket-proxy POST /exec/${execId}/start failed: ${startRes.status} ${startRes.statusText}`,
    );
  }
  const buf = new Uint8Array(await startRes.arrayBuffer());

  // Keep stdout only; an empty result means the secret file is empty or
  // could not be read.
  const stdoutBytes = decodeDockerExecStream(buf);
  const secret = new TextDecoder('utf-8').decode(stdoutBytes).trim();
  if (!secret) {
    throw new Error(`/tmp/olam-container-secret empty in container ${containerName}`);
  }
  return secret;
}
/**
 * Decode Docker's multiplexed exec stream — keep only stdout (stream id 1).
 * Stream format: each frame is 8-byte header + payload. Header byte 0
 * is the stream id (0=stdin, 1=stdout, 2=stderr); bytes 4-7 are the
 * payload length as big-endian uint32. Bytes 1-3 are reserved (zero).
 *
 * The length is read as an UNSIGNED 32-bit value. JavaScript's bitwise
 * operators yield signed 32-bit results, so without the `>>> 0` a length with
 * the high bit set would become negative and the cursor could stall or move
 * backwards, looping forever on a malformed stream.
 *
 * A final frame whose declared length exceeds the remaining bytes contributes
 * whatever bytes are present; trailing bytes shorter than a header are
 * ignored.
 *
 * @param {Uint8Array} buf Raw response bytes.
 * @returns {Uint8Array} Concatenated stdout payloads (a fresh array).
 */
export function decodeDockerExecStream(buf) {
  const HEADER_BYTES = 8;
  const chunks = [];
  let total = 0;
  let offset = 0;
  while (offset + HEADER_BYTES <= buf.byteLength) {
    const streamId = buf[offset];
    // Big-endian uint32 at offset+4..offset+8, forced unsigned.
    const len =
      ((buf[offset + 4] << 24) |
        (buf[offset + 5] << 16) |
        (buf[offset + 6] << 8) |
        buf[offset + 7]) >>>
      0;
    const start = offset + HEADER_BYTES;
    if (streamId === 1) {
      // subarray clamps to the buffer end, so a truncated frame is safe.
      const payload = buf.subarray(start, start + len);
      chunks.push(payload);
      total += payload.byteLength;
    }
    offset = start + len;
  }

  const merged = new Uint8Array(total);
  let written = 0;
  for (const chunk of chunks) {
    merged.set(chunk, written);
    written += chunk.byteLength;
  }
  return merged;
}
| // crystallize-planning — atomic-or-compensating chunk-copy from a planning | ||
| // session (_planning world) into a freshly provisioned real world. | ||
| // | ||
| // APPEND-ONLY CONSTRAINT: The chunks table has a NO_DELETE + NO_UPDATE | ||
| // trigger (chunks_append_only_trigger). If chunk-copy fails mid-batch, | ||
| // any chunks already INSERTed under the new worldId STAY in the database. | ||
| // Compensating cleanup only calls destroyWorld (world container teardown) — | ||
| // it CANNOT delete the orphaned chunks. Those orphan chunks are harmless: | ||
| // • idx_chunks_planning only covers world_id='_planning' rows. | ||
| // • The destroyed world container no longer exists, so no subscriber | ||
| // will ever observe those orphans through the normal shape proxy. | ||
| // • Any future re-crystallize creates a fresh worldId, fresh session_id. | ||
| // | ||
| // IDEMPOTENCY: | ||
| // • If crystallize_status is 'crystallized' (with a stored worldId), | ||
| // return immediately — the work is already done. | ||
| // • If crystallize_status is 'in_progress', we cannot safely resume | ||
| // (we don't know how far the previous copy got, and the chunk INSERT | ||
| // is not idempotent by worldId+sessionId alone — the PRIMARY KEY is | ||
| // (message_id, seq), so the same chunk could be re-inserted into a | ||
| // different new session without collision). Safe behavior: return | ||
| // the current status so the UI can display "in progress" and the | ||
| // operator can force-retry after manual inspection. | ||
| // | ||
| // SLUG RULE: lowercased, non-alphanum → hyphens, max 40 chars. | ||
| // Matches the dev-substrate stub in plan-chat-spa/src/server/index.ts | ||
| // (confirmed by reading that file's crystallize stub, around line 983). | ||
| import { randomUUID } from 'node:crypto'; | ||
| import { PLANNING_WORLD_ID } from '@olam/chunks/schema'; | ||
| import { setCrystallizeStatus } from './planning-sessions.mjs'; | ||
/**
 * Slug a plan title into a world-name-safe string.
 * Lowercased, each run of non-alphanumerics → one hyphen, max 40 chars, no
 * leading or trailing hyphen. Falls back to 'plan' if the result is empty
 * (including for a null/undefined title).
 *
 * Trailing hyphens are stripped AFTER truncation: cutting at 40 characters
 * can land directly on a hyphen, which trimming before the cut would leave
 * dangling at the end of the slug.
 *
 * NOTE(review): the file header says this rule mirrors a stub in
 * plan-chat-spa; if that stub trims before truncating, the two differ only
 * for titles whose 40th slug character is a hyphen — confirm.
 *
 * @param {string} title
 * @returns {string}
 */
function slugTitle(title) {
  const base = String(title ?? '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+/, '')
    .slice(0, 40)
    .replace(/-+$/, '');
  return base || 'plan';
}
/**
 * Look up the crystallization state recorded for a planning session.
 *
 * A session with no `planning_sessions` row is reported as
 * `{ crystallize_status: 'open', crystallized_world_id: null }`. A missing
 * world id on an existing row is normalised to null.
 *
 * @param {object} pool Object exposing `query(sql, params)` that resolves to
 *   `{ rows }`.
 * @param {string} sessionId
 * @returns {Promise<{crystallize_status: string, crystallized_world_id: string | null}>}
 */
async function readCrystallizeState(pool, sessionId) {
  const sql = `SELECT crystallize_status, crystallized_world_id
     FROM planning_sessions
     WHERE session_id = $1`;
  const { rows } = await pool.query(sql, [sessionId]);
  if (rows.length === 0) {
    return { crystallize_status: 'open', crystallized_world_id: null };
  }
  const [{ crystallize_status, crystallized_world_id }] = rows;
  return {
    crystallize_status,
    crystallized_world_id: crystallized_world_id ?? null,
  };
}
/**
 * Fetch every chunk recorded for a planning session, lowest seq first.
 *
 * The lookup is scoped to the planning world (PLANNING_WORLD_ID) plus the
 * given session id; rows are handed back exactly as the driver returns them.
 *
 * @param {object} pool — anything exposing an async query(text, params)
 * @param {string} sessionId
 * @returns {Promise<Array<{world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type}>>}
 */
async function selectPlanningChunks(pool, sessionId) {
  const sql = `SELECT world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type
       FROM chunks
      WHERE world_id = $1 AND session_id = $2
      ORDER BY seq ASC`;
  const params = [PLANNING_WORLD_ID, sessionId];
  const { rows } = await pool.query(sql, params);
  return rows;
}
/**
 * Copy one planning chunk into the freshly created world's session.
 *
 * message_id, seq, actor fields, role, chunk text and chunk_type are taken
 * from the source row unchanged; only world_id and session_id are swapped
 * for the new world's values.
 *
 * @param {object} pool — anything exposing an async query(text, params)
 * @param {object} chunk — row from the planning session
 * @param {string} newWorldId
 * @param {string} newSessionId
 * @returns {Promise<void>}
 */
async function insertChunkIntoNewWorld(pool, chunk, newWorldId, newSessionId) {
  const sql = `INSERT INTO chunks
       (world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type)
     VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`;

  // Columns carried over verbatim, in the order the INSERT lists them.
  const carriedOver = [
    chunk.message_id,
    chunk.seq,
    chunk.actor_id,
    chunk.actor_type,
    chunk.role,
    chunk.chunk,
    chunk.chunk_type,
  ];

  await pool.query(sql, [newWorldId, newSessionId, ...carriedOver]);
}
/**
 * Append a system "crystallized" marker to the ORIGINAL planning session
 * so the planning transcript carries an audit trail of the hand-off.
 *
 * The marker is written under PLANNING_WORLD_ID + the original sessionId,
 * with a fresh message id and seq = (current max seq for the session) + 1,
 * or 0 when the session has no chunks yet.
 *
 * NOTE(review): the MAX(seq) read and the INSERT are two separate
 * statements, so two concurrent writers to the same session could pick the
 * same seq — confirm whether callers serialise writes per session.
 *
 * @param {object} pool — anything exposing an async query(text, params)
 * @param {string} sessionId — original planning session id
 * @param {string} worldId — newly created world id
 * @param {number} phaseCount — number of phases in the plan
 * @returns {Promise<void>}
 */
async function insertMarkerChunk(pool, sessionId, worldId, phaseCount) {
  const messageId = randomUUID();

  // Place the marker right after the last chunk currently in the session.
  const { rows: seqRows } = await pool.query(
    `SELECT COALESCE(MAX(seq), -1) AS max_seq
       FROM chunks
      WHERE world_id = $1 AND session_id = $2`,
    [PLANNING_WORLD_ID, sessionId],
  );
  const markerSeq = Number(seqRows[0].max_seq) + 1;

  const markerText = `Plan crystallized into world "${worldId}" (${phaseCount} phase${phaseCount === 1 ? '' : 's'}).`;
  const SYSTEM = 'system';
  const params = [
    PLANNING_WORLD_ID,
    sessionId,
    messageId,
    markerSeq,
    SYSTEM, // actor_id
    SYSTEM, // actor_type
    SYSTEM, // role
    markerText,
    'text',
  ];

  await pool.query(
    `INSERT INTO chunks
       (world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type)
     VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
    params,
  );
}
/**
 * crystallizePlanningSession
 *
 * 4-phase atomic-or-compensating process:
 *   1. Set crystallize_status='in_progress'
 *   2. Call createWorld({ name: slugged-title }) → { id: worldId }
 *   3. SELECT all chunks in _planning/sessionId; INSERT each into new world
 *   4. Set crystallize_status='crystallized' (with worldId); INSERT marker chunk
 *
 * Compensating pattern on partial failure:
 *   - If createWorld throws (or returns no id): set status='failed', rethrow.
 *     destroyWorld is NOT called.
 *   - If chunk-copy throws mid-batch: set status='failed', call
 *     destroyWorld(worldId), rethrow. Chunks already INSERTed are left in place.
 *   - The 'failed' status write and destroyWorld are both best-effort: if
 *     either throws, the ORIGINAL error is still the one rethrown, and a
 *     failing status write never prevents destroyWorld from running.
 *
 * Idempotency:
 *   - Already 'crystallized': return immediately without re-running.
 *   - Already 'in_progress': return current status without resuming.
 *
 * @param {object}   opts
 * @param {object}   opts.pool         — pg.Pool-compatible with .query()
 * @param {string}   opts.sessionId    — planning session to crystallize
 * @param {string}   opts.planTitle    — plan title (used for world name slug)
 * @param {Array}    opts.planPhases   — array of phase objects; a missing or
 *                                       non-array value counts as 0 phases
 * @param {Function} opts.createWorld  — async ({ name }) => { id: string, ... }
 * @param {Function} opts.destroyWorld — async (worldId) => void
 *
 * @returns {Promise<{worldId: string, status: string, new_session_id: string}>}
 *   new_session_id is null on the two idempotent short-circuit paths.
 * @throws rethrows the createWorld / chunk-copy error after attempting to set
 *   crystallize_status='failed'; errors from phase 4 propagate unchanged.
 */
export async function crystallizePlanningSession({
  pool,
  sessionId,
  planTitle,
  planPhases,
  createWorld,
  destroyWorld,
}) {
  // ── Idempotency guard ────────────────────────────────────────────────────
  const currentState = await readCrystallizeState(pool, sessionId);
  if (currentState.crystallize_status === 'crystallized') {
    return {
      worldId: currentState.crystallized_world_id,
      status: `crystallized:${currentState.crystallized_world_id}`,
      new_session_id: null,
    };
  }
  if (currentState.crystallize_status === 'in_progress') {
    // Cannot safely resume without knowing how far the copy got.
    // Return current status so the UI shows 'in_progress'.
    return {
      worldId: currentState.crystallized_world_id,
      status: 'in_progress',
      new_session_id: null,
    };
  }

  // Failure-path bookkeeping must never replace the root-cause error nor
  // short-circuit the compensation that follows it, so it is best-effort.
  const markFailed = async () => {
    try {
      await setCrystallizeStatus({ pool, sessionId, status: 'failed', worldId: null });
    } catch (statusErr) {
      console.warn(
        `[crystallize] could not mark session ${sessionId} as failed: ${statusErr?.message ?? statusErr}`,
      );
    }
  };

  // Resolved up front so a missing planPhases cannot throw after the session
  // has already been marked 'crystallized'.
  const phaseCount = Array.isArray(planPhases) ? planPhases.length : 0;

  // ── Phase 1: mark in_progress ────────────────────────────────────────────
  await setCrystallizeStatus({ pool, sessionId, status: 'in_progress', worldId: null });

  // ── Phase 2: create world ────────────────────────────────────────────────
  let worldId;
  try {
    const worldName = slugTitle(planTitle);
    const world = await createWorld({ name: worldName });
    if (world?.id == null) {
      // Without an id there is nothing to copy chunks into.
      throw new Error('createWorld returned no world id');
    }
    worldId = world.id;
  } catch (err) {
    await markFailed();
    throw err;
  }

  // ── Phase 3: copy chunks into new world ──────────────────────────────────
  const newSessionId = randomUUID();
  try {
    const chunks = await selectPlanningChunks(pool, sessionId);
    for (const chunk of chunks) {
      // Sequential on purpose: chunks are inserted one at a time, in seq order.
      await insertChunkIntoNewWorld(pool, chunk, worldId, newSessionId);
    }
  } catch (err) {
    await markFailed();
    try {
      await destroyWorld(worldId);
    } catch {
      // Compensating destroy failure is non-fatal — the world may already
      // be partially torn down or the destroy operation may not be
      // reversible. Log is left to the caller's context.
    }
    throw err;
  }

  // ── Phase 4: mark crystallized + insert marker ───────────────────────────
  await setCrystallizeStatus({ pool, sessionId, status: 'crystallized', worldId });
  await insertMarkerChunk(pool, sessionId, worldId, phaseCount);
  return {
    worldId,
    status: `crystallized:${worldId}`,
    new_session_id: newSessionId,
  };
}
| /** | ||
| * dispatch-persister.mjs — persist the last dispatch for each world. | ||
| * | ||
| * The world watchdog's recovery hook reads this to replay the last | ||
| * unanswered prompt when it auto-recovers a wedged claude process. | ||
| * | ||
| * Contract: | ||
| * persist({ worldId, messageId, prompt, source, statePath?, now? }) | ||
| * Atomically writes ~/.olam/worlds/<worldId>/state/last-dispatch.json. | ||
| * Overwrites any previous file — only the LATEST dispatch matters for | ||
| * replay. Atomic write (tmp + fs.rename) prevents partial-write residue | ||
| * from corrupting recovery reads. | ||
| * | ||
| * read({ worldId, statePath? }) | ||
| * Returns { messageId, prompt, dispatchedAt, source } or null. | ||
| * null on ENOENT (no dispatch persisted yet) — never throws. | ||
| * null on JSON parse error (logs + skips) — never throws on corrupt file. | ||
| * | ||
| * Multiple worlds are independent: world A and world B have separate files. | ||
| * Multiple concurrent persist() calls for the SAME world are safe — each | ||
| * write is a rename of a tmp file so the worst case is one write winning. | ||
| * | ||
| * @see docs/architecture/world-watchdog.md | ||
| */ | ||
| import fs from 'node:fs/promises'; | ||
| import path from 'node:path'; | ||
| import os from 'node:os'; | ||
// Root directory that holds one state sub-directory per world
// (resolved against the current user's home directory).
const DEFAULT_STATE_BASE = path.join(os.homedir(), '.olam', 'worlds');
/**
 * Build the location of a world's last-dispatch.json:
 * `<stateBase>/<worldId>/state/last-dispatch.json`.
 *
 * @param {string} worldId
 * @param {string} [stateBase] Base directory override (for tests).
 * @returns {string}
 */
export function lastDispatchPath(worldId, stateBase = DEFAULT_STATE_BASE) {
  const worldStateDir = path.join(stateBase, worldId, 'state');
  return path.join(worldStateDir, 'last-dispatch.json');
}
/**
 * Persist the last dispatch for a world.
 *
 * Writes `{ messageId, prompt, dispatchedAt, source }` as pretty-printed
 * JSON to `statePath` (or to lastDispatchPath(worldId) when no statePath is
 * given), replacing any previous file. The parent directory is created on
 * demand.
 *
 * The write goes to a temp file that is then renamed over the target. The
 * temp name is unique per call: with a shared `<file>.tmp`, two overlapping
 * persist() calls would write into the same temp file and the slower one
 * would fail its rename with ENOENT after the faster one moved the file
 * away. With unique names the worst case is simply "last rename wins".
 * If the write or rename fails, the temp file is removed (best-effort) and
 * the original error is rethrown.
 *
 * @param {{
 *   worldId: string,
 *   messageId: string,
 *   prompt: string,
 *   source: string,
 *   statePath?: string,
 *   now?: () => number,
 * }} opts
 * @returns {Promise<void>}
 */
export async function persist({
  worldId,
  messageId,
  prompt,
  source,
  statePath,
  now = () => Date.now(),
}) {
  const filePath = statePath ?? lastDispatchPath(worldId);
  const dir = path.dirname(filePath);
  // pid + wall clock + random suffix: unique across processes and across
  // overlapping calls inside one process.
  const uniqueSuffix = `${process.pid}.${Date.now().toString(36)}.${Math.random().toString(36).slice(2)}`;
  const tmpPath = `${filePath}.${uniqueSuffix}.tmp`;
  const record = {
    messageId,
    prompt,
    dispatchedAt: new Date(now()).toISOString(),
    source,
  };
  // Ensure the directory exists.
  await fs.mkdir(dir, { recursive: true });
  try {
    // Atomic write: write to the temp file then rename over the target.
    await fs.writeFile(tmpPath, `${JSON.stringify(record, null, 2)}\n`, 'utf8');
    await fs.rename(tmpPath, filePath);
  } catch (err) {
    // Don't leave temp residue behind; cleanup failure must not mask `err`.
    await fs.rm(tmpPath, { force: true }).catch(() => {});
    throw err;
  }
}
/**
 * Fire-and-forget front door to persist() for the dispatch call-sites.
 *
 * Keeps the void/.catch boilerplate in one place so the enrichment sites
 * share identical failure handling. The persist() promise is deliberately
 * not awaited; a rejection is reported through console.warn, tagged with
 * `logSource` (defaulting to `source`). `logSource` itself is stripped
 * before the remaining options are forwarded to persist().
 *
 * @param {{
 *   worldId: string,
 *   messageId: string,
 *   prompt: string,
 *   source: string,
 *   logSource?: string,
 *   statePath?: string,
 *   now?: () => number,
 * }} opts
 * @returns {void}
 */
export function safePersistLastDispatch(opts) {
  const { logSource: tag = opts.source, ...forwarded } = opts;

  const reportFailure = (err) => {
    const reason = err?.message ?? err;
    console.warn(`[${tag}] persistLastDispatch failed (non-fatal): ${reason}`);
  };

  // Intentionally floating: callers must not wait on (or fail because of) the write.
  void persist(forwarded).catch(reportFailure);
}
/**
 * Load the most recently persisted dispatch for a world.
 *
 * Fail-soft by design — every problem resolves to null instead of throwing:
 *   - file absent (ENOENT): null, silently
 *   - any other read error: logged via console.error, then null
 *   - invalid JSON: logged, then null
 *   - parsed value is not an object whose messageId / prompt /
 *     dispatchedAt / source are all strings: logged, then null
 *
 * Only those four fields are returned; anything else in the file is dropped.
 *
 * @param {{
 *   worldId: string,
 *   statePath?: string,
 * }} opts
 * @returns {Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null>}
 */
export async function read({ worldId, statePath }) {
  const filePath = statePath ?? lastDispatchPath(worldId);

  let text;
  try {
    text = await fs.readFile(filePath, 'utf8');
  } catch (err) {
    // A missing file just means nothing was persisted yet — not worth a log line.
    if (err?.code !== 'ENOENT') {
      console.error(`[dispatch-persister] readFile ${filePath}: ${err?.message ?? err}`);
    }
    return null;
  }

  let data;
  try {
    data = JSON.parse(text);
  } catch (err) {
    console.error(`[dispatch-persister] ${filePath}: JSON parse error: ${err?.message ?? err}`);
    return null;
  }

  const requiredStrings = ['messageId', 'prompt', 'dispatchedAt', 'source'];
  const wellFormed =
    typeof data === 'object' &&
    data !== null &&
    requiredStrings.every((key) => typeof data[key] === 'string');
  if (!wellFormed) {
    console.error(`[dispatch-persister] ${filePath}: unexpected shape, skipping`);
    return null;
  }

  const { messageId, prompt, dispatchedAt, source } = data;
  return { messageId, prompt, dispatchedAt, source };
}
| // Phase F-2-B (B3): subscribe to docker events stream and invalidate | ||
| // the secret cache on lifecycle events for known worlds. | ||
| // | ||
| // M2 ship gate: `docker restart <world>; within 10s, proxy call returns | ||
| // 200 not 401`. The 10s budget is dominated by docker-events latency | ||
| // (events fire ~1s after the docker daemon emits them) + JSON parse + | ||
| // cache invalidate (<100ms). 10s is conservative. | ||
| // | ||
| // Stream format: Docker sends NDJSON — newline-delimited JSON events. | ||
| // Each event has shape: | ||
| // {"Type":"container","Action":"start","Actor":{"Attributes":{"name":"<container-name>"}},...} | ||
| // We filter `Type === 'container'` && `Action ∈ INVALIDATING_ACTIONS` and | ||
| // extract the worldId from the container name to invalidate the secret cache. | ||
| // | ||
| // Dogfood incident (2026-05-08): host-cp returned `secret_fetch_failed` | ||
| // 502 / `unauthorized 401` after operators ran `docker start <devbox>` | ||
| // on previously-exited world containers. Two bugs combined: | ||
| // 1. The action filter excluded `start`. After SIGKILL → exit, the | ||
| // operator's `docker start` emits a `start` event (NOT `restart`), | ||
| // which the filter dropped — so the stale cached secret survived. | ||
| // 2. The container-name regex was `/^(.+)-devbox$/`, predating the | ||
| // `olam-` prefix added in Phase F-2-D. Even when the filter did | ||
| // fire, it invalidated the wrong cache key (`olam-foo` instead of | ||
| // `foo`), so the actual cache entry stayed. | ||
| // Both are fixed below; tests use production naming to prevent drift. | ||
| import http from 'node:http'; | ||
| import { spawn } from 'node:child_process'; | ||
| import { getDockerRequestOptions } from './lib/docker-request-options.mjs'; | ||
/**
 * Container lifecycle events that may change the per-world secret.
 *
 * - `start`   — fresh boot of a previously-exited container; secret is
 *               regenerated by Phase E init, cache MUST drop the old value.
 * - `restart` — implicit stop+start; same secret-regeneration semantics.
 * - `stop` / `die` / `kill` — secret is no longer reachable; invalidating
 *               prevents host-cp from handing out a stale value the moment
 *               `docker start` brings the container back.
 *
 * `pause` / `unpause` are intentionally excluded — those don't change the
 * secret, and invalidating would force an unnecessary docker-exec on
 * resume.
 */
const INVALIDATING_ACTIONS = ['start', 'restart', 'stop', 'die', 'kill'];
/**
 * Subscribe to docker events. Returns a stop function. Auto-reconnects
 * (fixed 2s delay) whenever the stream ends, errors, answers with a
 * non-200 status, or the CLI child exits.
 *
 * Two transports feed the same handleEvent() pipeline:
 *   - HTTP: GET /events on the Docker Engine API, with request options
 *     supplied by getDockerRequestOptions(substrate).
 *   - CLI:  `docker events` spawned as a child process when dockerHost is
 *     the sentinel 'docker-cli'.
 * Both deliver NDJSON; each complete line is JSON-parsed and passed to
 * handleEvent. Lines that fail to parse (or whose handling throws) are
 * logged and skipped so one bad event cannot end the subscription.
 *
 * @param {object} args
 * @param {string} args.dockerHost  Either `tcp://...` for socket-proxy
 *   mode, `unix:...` for a local socket, or the sentinel `'docker-cli'`
 *   for bare-node mode (spawns `docker events` via child_process).
 * @param {(worldId: string) => void} args.onWorldRestart
 *   called when a known world starts/restarts/stops/dies/is killed
 * @param {(info: { worldId: string, action: string, exitCode?: number }) => void} [args.onWorldLifecycleEvent]
 *   Additive observer (Killshot #2): fires alongside onWorldRestart with
 *   the raw docker action + exitCode when present. Optional — existing
 *   callers (tests, etc.) don't supply it.
 * @param {(message: string) => void} [args.log]
 * @returns {() => void} stop function
 */
export function subscribeDockerEvents({ dockerHost, onWorldRestart, onWorldLifecycleEvent, log = console.log }) {
  let stopped = false;
  let activeReq = null;
  let activeProc = null;
  let reconnectTimer = null;

  // Build a 'data' listener for an NDJSON stream. Chunks are buffered until
  // a newline arrives (the trailing fragment may be a partial line); each
  // complete, non-blank line is parsed and dispatched. `normalize`, when
  // given, may patch the parsed event in place before dispatch.
  function createNdjsonListener(normalize) {
    let buf = '';
    return (chunk) => {
      buf += chunk;
      let nl;
      while ((nl = buf.indexOf('\n')) !== -1) {
        const line = buf.slice(0, nl);
        buf = buf.slice(nl + 1);
        if (!line.trim()) continue;
        try {
          const event = JSON.parse(line);
          if (normalize) normalize(event);
          handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log });
        } catch (err) {
          log(`docker-events: parse error on line: ${line.slice(0, 120)} (${err.message})`);
        }
      }
    };
  }

  // Bare-node mode: shell out to `docker events` and parse its stdout as
  // NDJSON. Same semantic as the HTTP path; different transport.
  // Eliminates the `tcp://docker-cli` URL-construction crash.
  function connectViaCli() {
    if (stopped) return;
    const filters = ['--filter', 'type=container'];
    log('docker-events: spawning `docker events --format json`');
    const child = spawn(
      'docker',
      ['events', '--format', '{{json .}}', ...filters],
      { stdio: ['ignore', 'pipe', 'pipe'] },
    );
    activeProc = child;
    child.stdout.setEncoding('utf-8');
    child.stdout.on('data', createNdjsonListener((event) => {
      // CLI shape uses `status` instead of HTTP API's `Action`; normalize.
      if (event.status && !event.Action) event.Action = event.status;
      // The CLI is already filtered to type=container; fill in a missing Type.
      if (event.Type === undefined) event.Type = 'container';
    }));
    child.stderr.on('data', (chunk) => {
      const text = String(chunk).trim();
      if (text) log(`docker-events: stderr: ${text}`);
    });
    child.on('exit', (code, signal) => {
      // Only forget the handle if it is still ours: a stale child exiting
      // late must not wipe the handle of a newer child spawned by a reconnect.
      if (activeProc === child) activeProc = null;
      log(`docker-events: child exited code=${code} signal=${signal}; reconnecting`);
      scheduleReconnect();
    });
    child.on('error', (err) => {
      log(`docker-events: spawn error: ${err.message}; reconnecting`);
      scheduleReconnect();
    });
  }

  function connect() {
    if (stopped) return;
    if (dockerHost === 'docker-cli') {
      return connectViaCli();
    }
    // Docker Engine API: GET /events?filters=...
    // Filter: type=container AND event ∈ INVALIDATING_ACTIONS
    // (Note: `filters` takes a JSON-stringified map of string arrays.)
    //
    // B8 fix (Phase 2 recovery round-2): use getDockerRequestOptions(substrate)
    // instead of constructing a URL from dockerHost. The old code did:
    //   new URL('/events', dockerHost.replace(/^tcp:\/\//, 'http://'))
    // On kubernetes, dockerHost = 'unix:///var/run/docker.sock' — the replace
    // is a no-op, `unix:` is not a valid http URL base, and Node throws
    // ERR_INVALID_URL. The options-spread form routes through socketPath
    // (kubernetes) or host+port (compose), which Node's http module
    // understands natively. No URL construction needed.
    const substrate = dockerHost.startsWith('unix:') ? 'kubernetes' : 'compose';
    const filters = JSON.stringify({
      type: ['container'],
      event: INVALIDATING_ACTIONS,
    });
    const requestPath = `/events?filters=${encodeURIComponent(filters)}`;
    const dockerOpts = getDockerRequestOptions(substrate);
    const connLabel = substrate === 'kubernetes'
      ? `unix:${dockerOpts.socketPath}/events`
      : `http://${dockerOpts.host}:${dockerOpts.port}/events`;
    log(`docker-events: connecting to ${connLabel}`);
    const req = http.get({ ...dockerOpts, path: requestPath }, (res) => {
      if (res.statusCode !== 200) {
        log(`docker-events: unexpected status ${res.statusCode}; will retry`);
        // A response with a listener attached must be consumed; otherwise
        // the body is never read and the socket stays tied up. Discard it.
        res.resume();
        scheduleReconnect();
        return;
      }
      res.setEncoding('utf-8');
      res.on('data', createNdjsonListener());
      res.on('end', () => {
        log('docker-events: stream closed; reconnecting');
        scheduleReconnect();
      });
      res.on('error', (err) => {
        log(`docker-events: stream error: ${err.message}; reconnecting`);
        scheduleReconnect();
      });
    });
    activeReq = req;
    req.on('error', (err) => {
      log(`docker-events: connect error: ${err.message}; reconnecting`);
      scheduleReconnect();
    });
  }

  // At most one reconnect is pending at a time; several failure signals for
  // the same broken connection (e.g. 'error' + 'exit') collapse into one.
  function scheduleReconnect() {
    if (stopped) return;
    if (reconnectTimer) return;
    reconnectTimer = setTimeout(() => {
      reconnectTimer = null;
      connect();
    }, 2000); // 2s backoff
  }

  connect();

  return function stop() {
    stopped = true;
    if (reconnectTimer) {
      clearTimeout(reconnectTimer);
      reconnectTimer = null;
    }
    if (activeReq) activeReq.destroy();
    if (activeProc) {
      try { activeProc.kill('SIGTERM'); } catch { /* ignore */ }
      activeProc = null;
    }
  };
}
// Best-effort conversion of Docker's string exit code to a number.
// Returns undefined for anything that is not a finite number or a non-blank
// numeric string. Blank strings are rejected explicitly because Number('')
// is 0, which would be indistinguishable from a real "exited cleanly".
function parseExitCode(raw) {
  if (typeof raw === 'number') return Number.isFinite(raw) ? raw : undefined;
  if (typeof raw !== 'string' || raw.trim() === '') return undefined;
  const parsed = Number(raw);
  return Number.isFinite(parsed) ? parsed : undefined;
}
/**
 * Inspect a docker event and call onWorldRestart if it matches a
 * world container. Container naming convention: `olam-<worldId>-devbox`.
 *
 * Events are ignored unless Type is 'container', Action is one of
 * INVALIDATING_ACTIONS, and the container name (Actor.Attributes.name,
 * with an optional leading '/') matches the naming convention.
 *
 * When onWorldLifecycleEvent is supplied it is called after onWorldRestart
 * with `{ worldId, action, exitCode }`; exitCode is undefined unless the
 * event carries a usable numeric exit code. Exceptions thrown by that
 * observer are logged and swallowed; exceptions from onWorldRestart
 * propagate to the caller.
 *
 * Exported for unit testing.
 *
 * @param {{ Type?: string, Action?: string, Actor?: { Attributes?: Record<string, string> } }} event
 * @param {{ onWorldRestart: (worldId: string) => void, onWorldLifecycleEvent?: (info: { worldId: string, action: string, exitCode?: number }) => void, log: (m: string) => void }} ctx
 */
export function handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log }) {
  if (event?.Type !== 'container') return;
  if (!INVALIDATING_ACTIONS.includes(event.Action ?? '')) return;
  const name = event.Actor?.Attributes?.name;
  // Non-string names cannot be matched against the naming convention.
  if (typeof name !== 'string' || name === '') return;
  // Strip leading slash that Docker sometimes prepends to container names.
  const cleanName = name.startsWith('/') ? name.slice(1) : name;
  // Match the production naming `olam-<worldId>-devbox` literally — the
  // `olam-` prefix was added in Phase F-2-D and was not reflected in the
  // pre-fix regex. Anchoring on the `-devbox` suffix also keeps host-cp's
  // own container (`olam-host-cp`) and the docker-socket-proxy out of the
  // cache-invalidate path even though they happen to start with `olam-`.
  const m = /^olam-(.+)-devbox$/.exec(cleanName);
  if (!m) return;
  const worldId = m[1];
  log(`docker-events: ${event.Action} on ${cleanName} → invalidating ${worldId}`);
  onWorldRestart(worldId);
  // Killshot #2 (additive): also notify the lifecycle observer when one
  // is wired. Docker's `die` events carry the container exit code in
  // Actor.Attributes.exitCode as a string; parse it best-effort and
  // forward blank/non-numeric/missing as undefined so the classifier sees
  // the unambiguous "no exit code observed" signal.
  if (onWorldLifecycleEvent) {
    const action = event.Action ?? '';
    const exitCode = parseExitCode(event.Actor?.Attributes?.exitCode);
    try {
      onWorldLifecycleEvent({ worldId, action, exitCode });
    } catch (err) {
      // The lifecycle observer is best-effort instrumentation; a thrown
      // error here must not break the cache-invalidate hot path.
      log(`docker-events: onWorldLifecycleEvent threw for ${worldId}: ${err.message}`);
    }
  }
}
| // Container-engine identity for host-cp. | ||
| // | ||
| // Phase 1a / A1: defaults to "docker"; switches to "kubernetes" when running | ||
| // inside a K8s pod (autodetected via KUBERNETES_SERVICE_HOST). Operators can | ||
| // override either way via OLAM_HOST_CP_ENGINE. | ||
| // | ||
| // This module exists separately from server.mjs to keep the engine-resolution | ||
| // logic pure (no I/O, no mkdir, no global side-effects) so unit tests can | ||
| // import it without triggering server startup. server.mjs imports | ||
| // resolveHostCpEngine from here and computes its module-level HOST_CP_ENGINE | ||
| // constant. | ||
| // | ||
| // KubernetesEngine adapter (Phase B / PR3) consumes the same env variables | ||
| // when constructing the engine; the context-allowlist guard (T6 / Decision 10) | ||
| // lives inside that adapter, not here. This module is "what name to surface | ||
| // in the X-Olam-Engine response header" — nothing more. | ||
/**
 * Resolve the active container-engine identity for host-cp.
 *
 * Precedence:
 *   1. Explicit env override: OLAM_HOST_CP_ENGINE=docker|kubernetes
 *   2. Autodetect: KUBERNETES_SERVICE_HOST set → "kubernetes"
 *   3. Default: "docker"
 *
 * A blank override (variable exported but empty, or whitespace only) is
 * treated as "not set" so it falls through to autodetection instead of
 * yielding an empty engine name; surrounding whitespace on a real override
 * is trimmed. The override value is otherwise returned as given — it is
 * not validated against the known engine names here.
 *
 * @param {NodeJS.ProcessEnv} [env=process.env] - environment to inspect.
 * @returns {string} - engine identity surfaced via X-Olam-Engine header.
 */
export function resolveHostCpEngine(env = process.env) {
  const rawOverride = env.OLAM_HOST_CP_ENGINE;
  const override = typeof rawOverride === 'string' ? rawOverride.trim() : '';
  if (override !== '') return override;
  return env.KUBERNETES_SERVICE_HOST ? 'kubernetes' : 'docker';
}
| // E1 (Phase E — olam-repos-and-runbooks): read ~/.olam/config.json and | ||
| // expose it to the host-CP API endpoints (/api/repos, /api/runbooks). | ||
| // | ||
| // Never crashes: missing file → empty arrays, corrupt JSON → { error }. | ||
| // Mirrors the workspace-catalog.mjs pattern: pure function, env-driven | ||
| // path, no side effects at module load time. | ||
| import fs from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
// Config location: OLAM_CONFIG_PATH when set, else ~/.olam/config.json.
// Resolved once, when this module is first evaluated.
const DEFAULT_CONFIG_PATH =
  process.env.OLAM_CONFIG_PATH ??
  path.join(os.homedir(), '.olam', 'config.json');
/**
 * @typedef {object} RepoEntry
 * @property {string} name
 * @property {string} path
 * @property {string} [description]
 * @property {number} [addedAt]
 * @property {number} [updatedAt]
 */
/**
 * @typedef {object} Runbook
 * @property {string} name
 * @property {string[]} repos
 * @property {number} [updatedAt]
 * @property {Record<string, Record<string, number>>} [portMap]
 * @property {Record<string, Record<string, string>>} [env]
 */
/**
 * @typedef {{ repos: RepoEntry[], runbooks: Runbook[] }} GlobalConfig
 * @typedef {{ error: string }} ConfigError
 */
/**
 * Load the global olam config from disk. Never throws.
 *   - Missing file  → `{ repos: [], runbooks: [] }`
 *   - Unreadable    → `{ error: string }` (e.g. permissions, path is a directory)
 *   - Corrupt JSON  → `{ error: string }`
 *   - Not an object → `{ error: string }` (null, array, or a primitive)
 *   - Success       → `{ repos: RepoEntry[], runbooks: Runbook[] }`; a
 *                     `repos` / `runbooks` key that is absent or not an
 *                     array is replaced by `[]`. Entries are not validated.
 *
 * The file is read directly instead of being probed with existsSync first:
 * a separate existence check can go stale before the read happens, and it
 * reports "missing" for failures that are not actually a missing file.
 *
 * @param {string} [configPath]
 * @returns {GlobalConfig | ConfigError}
 */
export function loadGlobalConfig(configPath = DEFAULT_CONFIG_PATH) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, 'utf-8');
  } catch (err) {
    // ENOENT: nothing at that path. ENOTDIR: a parent component is a file.
    // Either way there is no config file — report an empty config.
    if (err?.code === 'ENOENT' || err?.code === 'ENOTDIR') {
      return { repos: [], runbooks: [] };
    }
    return { error: `Failed to read ${configPath}: ${err.message}` };
  }
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    return { error: `Invalid JSON in ${configPath}: ${err.message}` };
  }
  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return { error: `${configPath} does not contain a JSON object` };
  }
  return {
    repos: Array.isArray(parsed.repos) ? parsed.repos : [],
    runbooks: Array.isArray(parsed.runbooks) ? parsed.runbooks : [],
  };
}
| // W4 — Halt-shape detection for the host-cp chunk-write proxy. | ||
| // | ||
| // When plan-DO's dispatchPlanningAgent (W1) trips a guardrail, it | ||
| // emits a chunk with chunk_type='goal_mode_assumption' and content | ||
| // matching: `[assumption: <cap>-tripped — spent $X.XXXX of $Y]` (or | ||
| // similar shape per GuardrailState.haltChunkText()). | ||
| // | ||
| // host-cp's /api/plan-chat proxy passes the chunk through to the | ||
| // chunks substrate AND, if it detects a halt-shaped chunk, broadcasts | ||
| // a typed `plan.halted` event on host-stream so the SPA's | ||
| // PlanHaltBanner subscriber fires. | ||
| // | ||
| // Extracted as a pure fn so it can be unit-tested without booting | ||
| // the host-cp server. | ||
// Matches the start of a guardrail halt message, e.g.
//   "[assumption: usd-tripped — spent $1.2345 of $5]"
// Group 1: which cap tripped. Group 2 (optional): the amount spent.
const HALT_RE =
  /^\[assumption:\s*(usd|turns|tool_calls|wall_clock)-tripped(?:\s*—\s*spent\s*\$([0-9.]+))?/;
/**
 * Recognise a halt-shaped chunk and pull out its parts.
 *
 * Yields null unless ALL of the following hold:
 *   - chunk is a non-null object
 *   - chunk.chunk_type is 'goal_mode_assumption'
 *   - chunk.chunk is a string whose beginning matches HALT_RE
 *
 * Otherwise returns the payload for the `plan.halted` event:
 *   - plan_id / operator_id: taken from session_id / operator_id,
 *     'unknown' when absent
 *   - halt_reason: the cap named in the message
 *   - usd_spent_so_far: parsed amount, undefined when the message has none
 *   - halted_at: Date.now() at detection time
 */
export function detectHaltChunk(chunk) {
  const isCandidate =
    Boolean(chunk) &&
    typeof chunk === 'object' &&
    chunk.chunk_type === 'goal_mode_assumption' &&
    typeof chunk.chunk === 'string';
  if (!isCandidate) return null;

  const match = HALT_RE.exec(chunk.chunk);
  if (match === null) return null;

  const [, haltReason, spentText] = match;
  let usdSpent;
  if (spentText) {
    usdSpent = Number.parseFloat(spentText);
  }

  return {
    plan_id: chunk.session_id ?? 'unknown',
    operator_id: chunk.operator_id ?? 'unknown',
    halt_reason: haltReason,
    usd_spent_so_far: usdSpent,
    halted_at: Date.now(),
  };
}
| // Phase A → E (sse-consolidation): server-side multiplexed-SSE broadcaster. | ||
| // | ||
| // Single endpoint /api/host-stream replaces ~20 SPA polling loops. Hooks | ||
| // subscribe to typed events on one connection instead of opening one | ||
| // setInterval-loop per resource. | ||
| // | ||
| // Mirrors planOrchestrator.addEventSink fanout pattern verbatim — same | ||
| // per-sink ServerResponse Set, same `event: <name>\ndata: <json>\n\n` | ||
| // wire format, same cleanup-on-disconnect contract. Differences: | ||
| // | ||
| // - Keyed by event TYPE rather than conversationId (the broadcaster is | ||
| // global to the host-cp, not per-conversation). | ||
| // - Caches last-known payload per event type so reconnecting clients | ||
| // receive an immediate snapshot replay before live updates resume. | ||
| // - No turn-buffering — snapshots are idempotent so reconnect == latest. | ||
| // | ||
| // Phase E adds operational polish: | ||
| // - E1: per-event-type trailing-edge debounce (default 100ms). | ||
| // Coalesces broadcast storms during world boot. | ||
| // - E2: per-sink 25s heartbeat (`:\n\n` comment) to keep idle SSE | ||
| // connections alive across most proxy 60s timeouts. | ||
| // - E3: backpressure-aware writes — slow sinks queue up to a bounded | ||
| // in-memory buffer; overflow drops oldest events with an | ||
| // `:overflow` comment so consumers know they missed updates. | ||
| // - E4: per-event-type broadcast counter + sink count metric line. | ||
| // - E5: the metrics tick ALSO broadcasts a `stream.health` typed event | ||
| // carrying the same counters it logs, so any SPA tab can observe | ||
| // live stream health (sink count, per-event broadcast rates, | ||
| // overflow drops) without polling. Snapshot-cached like every | ||
| // other state event — reconnecting clients replay the last | ||
| // health payload immediately (first-paint parity). Opt out via | ||
| // `deps.healthEvents = false`. | ||
| // | ||
| // Pure module: no docker, no DB, no global clock except `setInterval` | ||
| // for the heartbeat/metrics timers (injectable in tests). Wiring those | ||
| // sources to broadcast(...) lives in server.mjs (A4 + A5). | ||
| // | ||
| // References: | ||
| // - packages/host-cp/src/server.mjs:1531 SSE writer template | ||
| // - packages/host-cp/src/plan-orchestrator.mjs:967 addEventSink shape | ||
| // - docs/plans/sse-consolidation/plan-source.md full design | ||
| // - docs/plans/sse-consolidation/phase-e-tasks.md E1-E4 acceptance | ||
| import crypto from 'node:crypto'; | ||
| /** | ||
| * @typedef {object} HostStreamDeps | ||
| * @property {(message: string) => void} [log] defaults to no-op | ||
| * @property {object} [debounceMs] per-event-type debounce override | ||
| * @property {number} [debounceMs.default] default trailing-edge ms (Phase E1) | ||
| * @property {number} [heartbeatMs] per-sink heartbeat interval (Phase E2) | ||
| * @property {number} [metricsMs] per-broadcaster metrics tick (Phase E4) | ||
| * @property {boolean} [healthEvents] broadcast `stream.health` on each metrics tick (Phase E5; default true) | ||
| * @property {number} [maxQueuedPerSink] bounded queue size (Phase E3) | ||
| * @property {() => number} [now] injectable clock for `stream.health.at` (tests) | ||
| * @property {(cb: () => void, ms: number) => any} [setTimer] injectable setInterval (tests) | ||
| * @property {(handle: any) => void} [clearTimer] injectable clearInterval (tests) | ||
| */ | ||
| /** | ||
| * @typedef {object} HostStream | ||
| * @property {(res: import('node:http').ServerResponse) => () => void} addSink | ||
| * @property {(eventType: string, payload: unknown) => number} broadcast | ||
| * @property {() => Record<string, unknown>} snapshot | ||
| * @property {() => void} close | ||
| * @property {() => number} sinkCount | ||
| * @property {() => HostStreamMetrics} metrics | ||
| * @property {() => void} flushDebounced test-only — fire all pending coalesced broadcasts immediately | ||
| */ | ||
| /** | ||
| * @typedef {object} HostStreamMetrics | ||
| * @property {Record<string, number>} events per-event-type broadcasts since last reset | ||
| * @property {number} sinks current active-sink count | ||
| * @property {number} overflows total `:overflow` drops since last reset | ||
| */ | ||
| /** | ||
| * Payload wire-shape for the `stream.health` event (Phase E5). A | ||
| * point-in-time projection of the broadcaster's own observability | ||
| * counters, emitted on each metrics tick. `events` carries the | ||
| * per-event-type broadcast counts accrued during the just-elapsed | ||
| * interval (reset afterward), so consumers see a per-interval RATE | ||
| * rather than a monotonic total. `at` is the wall-clock emit time so a | ||
| * reconnecting client can tell how stale the replayed snapshot is. | ||
| * | ||
| * @typedef {object} StreamHealthPayload | ||
| * @property {Record<string, number>} events per-event broadcasts during the interval | ||
| * @property {number} sinks active-sink count at emit time | ||
| * @property {number} overflows `:overflow` drops during the interval | ||
| * @property {number} intervalMs the metrics-tick cadence that produced this payload | ||
| * @property {number} at Date.now() at emit time | ||
| */ | ||
/** Event type emitted by the metrics tick (Phase E5). */
export const STREAM_HEALTH_EVENT = 'stream.health';
/**
 * Skill Forge promote progress (spa-harness-forge Phase C / C32). Broadcast by
 * the host-side promote job runner as it advances a promote_jobs row, so the
 * SPA's /forge editor reflects status live (push-not-poll). Payload shape:
 *   { jobId, artifactId, status: 'promoting'|'published'|'failed', pr_url?, error? }
 * `broadcast()` is generic, so emitting is just `broadcast(FORGE_PROMOTE_EVENT, …)`.
 */
export const FORGE_PROMOTE_EVENT = 'forge.promote';
const DEFAULT_DEBOUNCE_MS = 100;
const DEFAULT_HEARTBEAT_MS = 25_000;
const DEFAULT_METRICS_MS = 60_000;
const DEFAULT_MAX_QUEUED = 64;
/**
 * Event types that opt INTO the trailing-edge debounce (Phase E1). The
 * default callers — `world.snapshot`, `tunnels.snapshot`, `servers.snapshot`,
 * `listening.snapshot` — are all idempotent state-replay events where
 * "last writer wins" is correct and a 100ms cap on update propagation
 * is acceptable. Latency-sensitive events (`question.pending`) and
 * connect-only events (`ready`) stay immediate by NOT being in this set.
 *
 * Per-event-type overrides via `deps.debounceMs[type] = 0` force any
 * event off the debounce path; non-zero override flips it on with a
 * custom window. Callers should not need to opt anything new into
 * debouncing — adding a new snapshot event implies adding to this set.
 */
const DEFAULT_DEBOUNCED_EVENTS = new Set([
  'world.snapshot',
  'tunnels.snapshot',
  'servers.snapshot',
  'listening.snapshot',
]);
/**
 * Create a host-stream broadcaster. Stateless w.r.t. the request — all
 * source-of-truth wiring (docker events, worlds.db, etc.) is done by
 * the caller via repeated `broadcast()` invocations.
 *
 * @param {HostStreamDeps} [deps]
 * @returns {HostStream}
 */
export function createHostStream(deps = {}) {
  const log = deps.log ?? (() => {});
  const defaultDebounceMs = deps.debounceMs?.default ?? DEFAULT_DEBOUNCE_MS;
  const heartbeatMs = deps.heartbeatMs ?? DEFAULT_HEARTBEAT_MS;
  const metricsMs = deps.metricsMs ?? DEFAULT_METRICS_MS;
  const healthEvents = deps.healthEvents ?? true;
  const now = deps.now ?? (() => Date.now());
  const maxQueuedPerSink = deps.maxQueuedPerSink ?? DEFAULT_MAX_QUEUED;
  const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
  const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
  /** SSE comment queued ahead of surviving events after a drop. */
  const OVERFLOW_COMMENT = ':overflow\n\n';
  /**
   * @typedef {object} SinkState
   * @property {import('node:http').ServerResponse} res
   * @property {string[]} queue
   * @property {boolean} paused  true while waiting for a `drain` event
   * @property {boolean} draining  true while flushQueue is iterating
   * @property {boolean} drainListenerAttached
   * @property {any | null} heartbeatHandle
   * @property {number} overflows
   */
  /** @type {Map<import('node:http').ServerResponse, SinkState>} */
  const sinks = new Map();
  /** @type {Map<string, unknown>} last-known payload per event type */
  const snapshots = new Map();
  /** @type {Map<string, any>} pending debounce timers per event type */
  const debounceTimers = new Map();
  /** Per-event-type broadcast counters since last metrics flush. */
  const eventCounters = new Map();
  let overflowCounter = 0;
  let closed = false;
  let metricsHandle = null;

  /** Serialize one typed event into the SSE wire format. */
  function formatEvent(eventType, payload) {
    return `event: ${eventType}\ndata: ${JSON.stringify(payload)}\n\n`;
  }

  /**
   * Queue-aware write. If the underlying socket's `res.write` returns
   * `false` we switch the sink to queue mode and register a one-shot
   * `drain` listener to flush later chunks once the socket is writable
   * again. On overflow the oldest queued chunks are dropped and an
   * `:overflow` comment is queued so consumers know they missed updates.
   *
   * @returns {boolean} true if the chunk was accepted (synchronously or
   *   queued) — false only when the sink is dead and was removed.
   */
  function writeSafe(state, chunk) {
    const { res } = state;
    if (res.writableEnded || res.destroyed) {
      // Honour the documented contract: a dead sink is removed, which also
      // stops its heartbeat timer instead of letting it tick forever.
      teardownSink(state);
      return false;
    }
    // If a previous write reported backpressure (returned false), queue
    // unconditionally — preserves event ordering. The drain handler
    // flushes the queue in FIFO order.
    if (state.paused) {
      enqueue(state, chunk);
      return true;
    }
    try {
      const ok = res.write(chunk);
      if (ok) return true;
      // Returned false — the chunk was accepted but the stream buffer is
      // full. Switch to queue mode so subsequent writes don't race past it.
      state.paused = true;
      attachDrain(state);
      return true;
    } catch {
      // Sink already closed — drop it; further writes would throw.
      teardownSink(state);
      return false;
    }
  }

  /**
   * Append a chunk to the sink's bounded queue. At most `maxQueuedPerSink`
   * real chunks are retained; the `:overflow` marker (always at the head
   * when present) does not count against that bound. When the bound is hit
   * the oldest REAL chunk is dropped — never the marker — so the queue
   * cannot grow without limit and every counted overflow is an actual drop.
   */
  function enqueue(state, chunk) {
    const { queue } = state;
    const hadMarker = queue[0] === OVERFLOW_COMMENT;
    const queuedEvents = hadMarker ? queue.length - 1 : queue.length;
    if (queuedEvents >= maxQueuedPerSink) {
      // Lift the marker out of the way so shift() hits real chunks.
      if (hadMarker) queue.shift();
      let dropped = 0;
      while (queue.length > 0 && queue.length >= maxQueuedPerSink) {
        queue.shift();
        dropped += 1;
      }
      state.overflows += dropped;
      overflowCounter += dropped;
      // The marker is queued (not written directly) so consumers see it
      // inline, ahead of the events that survived the drop.
      if (hadMarker || dropped > 0) queue.unshift(OVERFLOW_COMMENT);
    }
    queue.push(chunk);
    attachDrain(state);
  }

  /** Register a single one-shot `drain` listener that flushes the queue. */
  function attachDrain(state) {
    if (state.drainListenerAttached) return;
    const { res } = state;
    if (typeof res.once !== 'function') return; // testing-sink fallback
    state.drainListenerAttached = true;
    res.once('drain', () => {
      state.drainListenerAttached = false;
      flushQueue(state);
    });
  }

  /** Write queued chunks in FIFO order until empty or backpressure returns. */
  function flushQueue(state) {
    const { res } = state;
    if (state.draining) return;
    state.draining = true;
    state.paused = false;
    try {
      while (state.queue.length > 0) {
        if (res.writableEnded || res.destroyed) {
          state.queue.length = 0;
          break;
        }
        const next = state.queue[0];
        let ok = false;
        try {
          ok = res.write(next);
        } catch {
          teardownSink(state);
          return;
        }
        // The chunk was accepted either way; only now remove it.
        state.queue.shift();
        if (!ok) {
          state.paused = true;
          attachDrain(state);
          break;
        }
      }
    } finally {
      state.draining = false;
    }
  }

  /**
   * Stop a sink's heartbeat, discard its queue and unregister it. Keyed by
   * state (not response) so a stale state can never unregister a newer
   * registration of the same response object.
   */
  function teardownSink(state) {
    if (state.heartbeatHandle) {
      try { clearTimer(state.heartbeatHandle); } catch { /* ignore */ }
      state.heartbeatHandle = null;
    }
    state.queue.length = 0;
    if (sinks.get(state.res) === state) sinks.delete(state.res);
  }

  /** Cache the payload, bump the counter and fan out to every sink. */
  function doBroadcast(eventType, payload) {
    if (closed) return 0;
    snapshots.set(eventType, payload);
    eventCounters.set(eventType, (eventCounters.get(eventType) ?? 0) + 1);
    const chunk = formatEvent(eventType, payload);
    let reached = 0;
    // Snapshot the iteration order so concurrent sink removal during
    // a write doesn't skip a sibling sink.
    for (const state of [...sinks.values()]) {
      if (writeSafe(state, chunk)) reached += 1;
    }
    return reached;
  }

  /** Fire every pending coalesced broadcast immediately (test helper). */
  function flushDebounced() {
    for (const [type, info] of debounceTimers) {
      clearTimeout(info.handle);
      debounceTimers.delete(type);
      doBroadcast(type, info.payload);
    }
  }

  /** Metrics tick: log the interval's counters, optionally broadcast them, reset. */
  function logMetrics() {
    if (eventCounters.size === 0 && sinks.size === 0 && overflowCounter === 0) return;
    /** @type {Record<string, number>} */
    const events = {};
    for (const [type, count] of eventCounters) events[type] = count;
    log(`events=${JSON.stringify(events)} sinks=${sinks.size}${overflowCounter > 0 ? ` overflows=${overflowCounter}` : ''}`);
    // Phase E5: broadcast the same counters as a typed `stream.health`
    // event so SPA tabs can observe live stream health without polling.
    // Built from the interval's counters BEFORE the reset below, so the
    // payload is a per-interval rate. The broadcast itself bumps the
    // `stream.health` counter, but the immediately-following reset wipes
    // it — the next interval never double-counts this tick's own emit.
    // Bypasses debounce (immediate path) since each tick is already
    // rate-limited to the metrics cadence.
    if (healthEvents) {
      /** @type {StreamHealthPayload} */
      const payload = {
        events,
        sinks: sinks.size,
        overflows: overflowCounter,
        intervalMs: metricsMs,
        at: now(),
      };
      doBroadcast(STREAM_HEALTH_EVENT, payload);
    }
    eventCounters.clear();
    overflowCounter = 0;
  }

  // Start the metrics tick eagerly — operators want visibility from
  // boot, not just after the first event lands.
  if (metricsMs > 0) {
    metricsHandle = setTimer(logMetrics, metricsMs);
    // Don't pin the event loop just for metrics in tests / shutdown paths.
    if (metricsHandle && typeof metricsHandle.unref === 'function') metricsHandle.unref();
  }

  return {
    addSink(res) {
      if (closed) {
        // Best-effort: end the response so the client sees the channel
        // closing instead of hanging on an empty stream.
        try { res.end(); } catch { /* ignore */ }
        return () => {};
      }
      // Re-registering the same response must not orphan the previous
      // registration's heartbeat timer.
      const previous = sinks.get(res);
      if (previous) teardownSink(previous);
      const state = /** @type {SinkState} */ ({
        res,
        queue: [],
        paused: false,
        draining: false,
        drainListenerAttached: false,
        heartbeatHandle: null,
        overflows: 0,
      });
      sinks.set(res, state);
      // Replay last-known snapshot for every event type so the new
      // subscriber gets current state without waiting for the next change.
      // Sorting keeps test assertions deterministic.
      const types = [...snapshots.keys()].sort();
      for (const type of types) {
        writeSafe(state, formatEvent(type, snapshots.get(type)));
      }
      // Phase E2: per-sink heartbeat. Write a comment line every
      // `heartbeatMs` so the SSE channel survives idle proxies. Comment
      // lines carry no event, so this is pure connection-keepalive.
      // Skipped when the replay above already found the sink dead —
      // otherwise the timer would outlive the registration.
      if (heartbeatMs > 0 && sinks.get(res) === state) {
        state.heartbeatHandle = setTimer(() => {
          // Use writeSafe so backpressure / overflow handling and dead-sink
          // removal apply uniformly.
          writeSafe(state, ':\n\n');
        }, heartbeatMs);
        if (state.heartbeatHandle && typeof state.heartbeatHandle.unref === 'function') {
          state.heartbeatHandle.unref();
        }
      }
      return () => {
        teardownSink(state);
      };
    },
    broadcast(eventType, payload) {
      if (closed) return 0;
      if (typeof eventType !== 'string' || eventType.length === 0) {
        throw new TypeError('broadcast: eventType must be a non-empty string');
      }
      // Phase E1: opt-in trailing-edge debounce.
      //   - DEFAULT_DEBOUNCED_EVENTS opts the canonical snapshot events
      //     into trailing-edge coalescing. Last writer wins because those
      //     events are idempotent state replays.
      //   - Every other event type bypasses the timer and writes
      //     immediately — preserves the Phase A synchronous broadcast
      //     contract that existing tests / consumers depend on.
      //   - Per-event-type overrides via `deps.debounceMs[eventType]`
      //     win in both directions (set to 0 to disable, or specify a
      //     custom window).
      //   - `flushDebounced()` is exposed for tests that want to assert
      //     immediate effects without waiting for the timer.
      let debounceFor;
      const override = deps.debounceMs?.[eventType];
      if (override !== undefined) {
        debounceFor = override;
      } else if (DEFAULT_DEBOUNCED_EVENTS.has(eventType)) {
        debounceFor = defaultDebounceMs;
      } else {
        debounceFor = 0;
      }
      if (debounceFor <= 0) {
        // Immediate path. Cancel (not flush) any pending coalesced payload
        // for this type: it is older than the one being sent now, so
        // letting its timer fire later would overwrite newer state.
        const pending = debounceTimers.get(eventType);
        if (pending) {
          clearTimeout(pending.handle);
          debounceTimers.delete(eventType);
        }
        return doBroadcast(eventType, payload);
      }
      // Coalesce: keep the latest payload, restart the trailing timer.
      const pending = debounceTimers.get(eventType);
      if (pending) clearTimeout(pending.handle);
      const handle = setTimeout(() => {
        debounceTimers.delete(eventType);
        doBroadcast(eventType, payload);
      }, debounceFor);
      if (typeof handle.unref === 'function') handle.unref();
      debounceTimers.set(eventType, { handle, payload });
      // Returns sinks.size as an approximation; the actual broadcast
      // will happen after the trailing-edge delay. Tests assert via the
      // sink writes anyway.
      return sinks.size;
    },
    snapshot() {
      /** @type {Record<string, unknown>} */
      const out = {};
      for (const [type, payload] of snapshots) out[type] = payload;
      return out;
    },
    close() {
      if (closed) return;
      closed = true;
      // Cancel pending debounce timers — anything still queued is
      // discarded; we don't write to sinks during shutdown.
      for (const [, info] of debounceTimers) clearTimeout(info.handle);
      debounceTimers.clear();
      if (metricsHandle) {
        try { clearTimer(metricsHandle); } catch { /* ignore */ }
        metricsHandle = null;
      }
      for (const [res, state] of [...sinks.entries()]) {
        if (state.heartbeatHandle) {
          try { clearTimer(state.heartbeatHandle); } catch { /* ignore */ }
        }
        try { res.end(); } catch { /* ignore */ }
        sinks.delete(res);
      }
      log('closed');
    },
    sinkCount() {
      return sinks.size;
    },
    metrics() {
      /** @type {Record<string, number>} */
      const events = {};
      for (const [type, count] of eventCounters) events[type] = count;
      return { events, sinks: sinks.size, overflows: overflowCounter };
    },
    flushDebounced,
  };
}
/**
 * Produce a new random stream identifier: 8 random bytes rendered as a
 * 16-character lowercase hex string. Intended for the `ready` event payload
 * so route handlers can put the same id on log lines and on the wire.
 *
 * @returns {string}
 */
export function newStreamId() {
  const idBytes = crypto.randomBytes(8);
  return idBytes.toString('hex');
}
| /** | ||
| * listening-server-poller.mjs | ||
| * Discovers listening TCP ports inside a world's devbox container. | ||
| * Dual-mode: Docker HTTP API (container) vs docker exec CLI (bare-node). | ||
| * Cache TTL: 10s per world. | ||
| */ | ||
| import { spawnSync } from 'node:child_process'; | ||
const DOCKER_HOST = process.env.DOCKER_HOST ?? 'docker-cli';
// Skip well-known infra ports — these are always running and not user servers
const INFRA_PORTS = new Set([8080, 7681, 7682]);
// Per-world cache: worldId → { ts, servers, error? }
const cache = new Map();
const CACHE_TTL_MS = 10_000;
/** Name of the devbox container that belongs to the given world. */
function worldContainerName(worldId) {
  return `olam-${worldId}-devbox`;
}
/**
 * Parse `ss -tlnp` output into server rows.
 *
 * `ss` omits the leading Netid column when a single socket type is
 * requested (as with `-t`), and prints it otherwise. Both layouts are
 * accepted:
 *   State   Recv-Q Send-Q Local Address:Port Peer Address:Port Process
 *   LISTEN  0      128    0.0.0.0:5173       0.0.0.0:*         users:(("vite",pid=42,fd=8))
 *
 *   Netid State   Recv-Q Send-Q Local Address:Port Peer Address:Port Process
 *   tcp   LISTEN  0      128    0.0.0.0:5173       0.0.0.0:*         users:(("vite",pid=42,fd=8))
 *
 * The first line is always treated as the header and skipped. Rows whose
 * port is in INFRA_PORTS are omitted. `pid` / `cmd` are empty strings when
 * the process column is absent or carries no match.
 *
 * @param {string} stdout
 * @returns {Array<{port: number, pid: string, cmd: string}>}
 */
export function parseSsOutput(stdout) {
  const lines = stdout.trim().split('\n').slice(1); // skip header
  const results = [];
  for (const line of lines) {
    const parts = line.trim().split(/\s+/);
    // Netid values (tcp, udp, u_str, …) are lowercase while socket states
    // (LISTEN, UNCONN, …) are uppercase, so a lowercase first token means
    // every column is shifted right by one.
    const base = /^[a-z]/.test(parts[0]) ? 1 : 0;
    if (parts.length < base + 5) continue;
    // Local Address:Port (e.g. "0.0.0.0:5173" or "*:5173" or "[::]:5173")
    const localAddr = parts[base + 3];
    const colonIdx = localAddr.lastIndexOf(':');
    if (colonIdx === -1) continue;
    const port = Number.parseInt(localAddr.slice(colonIdx + 1), 10);
    if (!Number.isFinite(port) || port <= 0) continue;
    if (INFRA_PORTS.has(port)) continue;
    // Extract pid and cmd from process column: users:(("vite",pid=42,fd=8))
    const processCol = parts.slice(base + 4).join(' ');
    const pid = /pid=(\d+)/.exec(processCol)?.[1] ?? '';
    const cmd = /"([^"]+)"/.exec(processCol)?.[1] ?? '';
    results.push({ port, pid, cmd });
  }
  return results;
}
/**
 * Fetch listening servers for a world. Returns the cached entry when it is
 * younger than CACHE_TTL_MS; otherwise runs `ss -tlnp` inside the world's
 * devbox container and caches the parsed result.
 *
 * Two transports, selected by DOCKER_HOST:
 *   - 'docker-cli': `docker exec` via spawnSync (3s timeout).
 *   - anything else: Docker HTTP API exec create + exec start, with a
 *     leading `tcp://` rewritten to `http://` (3s timeout per request).
 *
 * Never rejects: every failure is reported — and cached — as
 * `{ servers: [], error: 'container not running' }`.
 *
 * @param {string} worldId
 * @returns {Promise<{ts: number, servers: Array<{port: number, pid: string, cmd: string}>, error?: string}>}
 */
export async function getListeningServers(worldId) {
  const cached = cache.get(worldId);
  if (cached && Date.now() - cached.ts < CACHE_TTL_MS) return cached;

  // Store an entry in the per-world cache and hand it back.
  const remember = (entry) => {
    cache.set(worldId, entry);
    return entry;
  };
  // Failures are cached too, so a missing container is not re-probed on
  // every call within the TTL.
  const unavailable = () =>
    remember({ ts: Date.now(), servers: [], error: 'container not running' });

  const containerName = worldContainerName(worldId);
  try {
    let stdout;
    if (DOCKER_HOST === 'docker-cli') {
      const result = spawnSync(
        'docker', ['exec', containerName, 'ss', '-tlnp'],
        { encoding: 'utf-8', timeout: 3000 },
      );
      if (result.status !== 0 || result.error) return unavailable();
      stdout = result.stdout ?? '';
    } else {
      const apiBase = DOCKER_HOST.replace(/^tcp:\/\//, 'http://');
      const execCreate = await fetch(
        `${apiBase}/containers/${encodeURIComponent(containerName)}/exec`,
        {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({
            AttachStdout: true,
            AttachStderr: false,
            Cmd: ['ss', '-tlnp'],
          }),
          signal: AbortSignal.timeout(3000),
        },
      );
      if (!execCreate.ok) return unavailable();
      const { Id: execId } = await execCreate.json();
      const execStart = await fetch(`${apiBase}/exec/${execId}/start`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ Detach: false, Tty: false }),
        signal: AbortSignal.timeout(3000),
      });
      // A non-2xx start response carries an error body, not a multiplexed
      // stream — demuxing it would feed garbage to the parser.
      if (!execStart.ok) return unavailable();
      // Docker exec start streams multiplexed output (8-byte header per frame)
      const buf = await execStart.arrayBuffer();
      stdout = demuxDockerStream(Buffer.from(buf));
    }
    return remember({ ts: Date.now(), servers: parseSsOutput(stdout) });
  } catch {
    // Deliberate best-effort: network errors, timeouts and malformed
    // responses all collapse into the same "unavailable" entry.
    return unavailable();
  }
}
/**
 * Strip Docker stream multiplexing headers (8 bytes per frame:
 * [stream, 0, 0, 0, size32be]) and return the concatenated frame payloads
 * decoded as UTF-8. Payloads from every stream id are included.
 *
 * Payload bytes are joined BEFORE decoding so a multi-byte character split
 * across two frames is decoded intact rather than as replacement characters.
 * A final frame whose declared size runs past the end of the buffer yields
 * whatever bytes are available; trailing bytes shorter than a header are
 * ignored.
 *
 * @param {Buffer} buf
 * @returns {string}
 */
function demuxDockerStream(buf) {
  const HEADER_BYTES = 8;
  const payloads = [];
  let offset = 0;
  while (offset + HEADER_BYTES <= buf.length) {
    const size = buf.readUInt32BE(offset + 4);
    const start = offset + HEADER_BYTES;
    // subarray clamps to the buffer end, like the deprecated Buffer#slice.
    payloads.push(buf.subarray(start, start + size));
    offset = start + size;
  }
  return Buffer.concat(payloads).toString('utf-8');
}
| export { parseSsOutput as _parseSsOutputForTests }; |
| /** | ||
| * Phase E2 (olam-dogfood-vision): LocalWorldsSource implementation. | ||
| * | ||
| * Wraps host-cp's existing dockerode-driven world enumeration in a | ||
| * WorldsSource-shaped object so E4's composition layer can fan out | ||
| * across multiple sources (local + future Pylon cloud) and merge. | ||
| * | ||
| * The class deliberately takes its dependencies via factory function | ||
| * injection rather than reaching into server.mjs's module-level state | ||
| * directly. Two reasons: | ||
| * 1. Testability — vitest can pass mocked getWorldsRegistry + | ||
| * fetchWorldServices without spinning up the full host-cp | ||
| * server.mjs. | ||
| * 2. Module-cycle avoidance — server.mjs imports this module, so | ||
| * this module CANNOT import server.mjs back without a cycle. | ||
| * | ||
| * Returns the same shape as the pre-E2 GET /api/worlds response with | ||
| * a single addition: `source: 'local'` on every entry. | ||
| * | ||
| * @typedef {import('./worlds-source.mjs').WorldsSource} WorldsSource | ||
| * @typedef {import('./worlds-source.mjs').WorldSummary} WorldSummary | ||
| * @typedef {import('./worlds-source.mjs').ServiceInfo} ServiceInfo | ||
| */ | ||
| /** | ||
| * @typedef {object} LocalWorldsSourceDeps | ||
| * @property {() => Record<string, number>} getWorldsRegistry | ||
| * Returns current WORLDS map (worldId → host_port). Called fresh | ||
| * per list() so post-list registry mutations are visible immediately. | ||
| * @property {(worldId: string) => string | null} getWorldName | ||
| * Returns the operator-set friendly name OR null if absent. | ||
| * @property {(worldId: string) => Promise<ServiceInfo[]>} fetchWorldServices | ||
| * Probes per-world services (atlas-core, diner-app, ttyd, per-world CP). | ||
| * Same function the pre-E2 handler called inline. | ||
| */ | ||
/**
 * Build the `local` worlds source. `list()` reads the registry through
 * `deps.getWorldsRegistry()` on every call, probes each world's services
 * concurrently and resolves to one summary per registry entry.
 *
 * @param {LocalWorldsSourceDeps} deps
 * @returns {WorldsSource}
 */
export function createLocalWorldsSource(deps) {
  /**
   * Derive the world status from its probed services:
   *   - unknown:  no services reported at all
   *   - running:  at least one service is live
   *   - starting: services reported but none is live
   * @returns {'running' | 'starting' | 'unknown'}
   */
  const classify = (services) => {
    if (services.length === 0) return 'unknown';
    const liveCount = services.filter((svc) => svc.live).length;
    return liveCount > 0 ? 'running' : 'starting';
  };

  /** Probe one registry entry and shape it into a summary. */
  const summarize = async ([id, host_port]) => {
    const services = await deps.fetchWorldServices(id);
    const status = classify(services);
    // host_port is not declared on WorldSummary, but it is kept on the
    // object so consumers relying on the older response field still work.
    return /** @type {WorldSummary & {host_port: number}} */ ({
      id,
      name: deps.getWorldName(id),
      status,
      services,
      source: 'local',
      host_port,
    });
  };

  return {
    name: 'local',
    async list() {
      const registryEntries = Object.entries(deps.getWorldsRegistry());
      const pending = registryEntries.map((entry) => summarize(entry));
      return Promise.all(pending);
    },
  };
}
| // Phase C Task C3 — hand-rolled Prometheus metrics registry for host-cp. | ||
| // | ||
| // Emits exactly two metric families: | ||
| // http_requests_total{service,route,method,status_code} counter | ||
| // http_request_duration_seconds{service,route,method} histogram | ||
| // | ||
| // TAXONOMY COMPLIANCE (NON-NEGOTIABLE): | ||
| // ONLY {service, route, method, status_code} labels allowed. | ||
| // BANNED: world_id, trace_id, user_id, request_id, operator_id. | ||
| // world_id surfaces via Prometheus exemplars in Phase D — NOT labels. | ||
| // | ||
| // No external npm deps — Prometheus text exposition is simple enough to | ||
| // produce with template literals. Avoids the prom-client footprint on a | ||
| // host-side service that has no other dependency on metrics tooling. | ||
| // ─── Route mapping ──────────────────────────────────────────────────────── | ||
| // | ||
| // Raw req.url is a cardinality bomb: every unique URL is a new time series. | ||
| // We normalize dynamic path segments to stable patterns before labelling. | ||
| // | ||
| // RULES (first match wins): | ||
| // /health → /health | ||
| // /api/bootstrap → /api/bootstrap | ||
| // /metrics → /metrics | ||
| // /api/host-stream → /api/host-stream | ||
| // /api/worlds/{id}/credentials/... → /api/worlds/:id/credentials/:action | ||
| // /api/worlds/{id}/tunnels/... → /api/worlds/:id/tunnels | ||
| // /api/worlds/{id}/pr → /api/worlds/:id/pr | ||
| // /api/worlds/{id}/progress → /api/worlds/:id/progress | ||
| // /api/worlds (no id) → /api/worlds | ||
| // /api/world/{id}/** → /api/world/:id/* (proxy routes) | ||
| // /api/admin/registry/... → /api/admin/registry | ||
| // /api/admin/upgrade → /api/admin/upgrade | ||
| // /api/admin/world-pr → /api/admin/world-pr | ||
| // /api/admin/world-pr/{id} → /api/admin/world-pr/:id | ||
| // /api/auth/credentials/... → /api/auth/credentials | ||
| // /api/auth/... → /api/auth | ||
| // /api/plan/conversations/{id}/... → /api/plan/conversations/:id | ||
| // /api/plan/conversations → /api/plan/conversations | ||
| // /api/plan/** → /api/plan | ||
| // /api/auth/events → /api/auth/events | ||
| // /api/version/status → /api/version/status | ||
| // /api/repos → /api/repos | ||
| // /api/runbooks → /api/runbooks | ||
| // /api/workspaces/match → /api/workspaces/match | ||
| // /api/workspaces → /api/workspaces | ||
| // /api/projects → /api/projects | ||
| // /api/processes/** → /api/processes | ||
| // /v1/chunks/** → /v1/chunks | ||
| // /v1/worlds/** → /v1/worlds | ||
| // /assets/** → /assets (SPA static assets) | ||
| // (other GET to static paths) → /static | ||
| // (unknown) → /unknown | ||
/** Paths that are their own route label (matched exactly, after normalization). */
const ROUTE_EXACT_MATCHES = new Set([
  '/health',
  '/api/bootstrap',
  '/metrics',
  '/api/host-stream',
  '/api/auth/events',
  '/api/version/status',
  '/api/repos',
  '/api/runbooks',
  '/api/workspaces/match',
  '/api/workspaces',
  '/api/projects',
  '/api/worlds',
  '/api/plan/conversations',
  '/api/plan/personas',
  '/api/admin/upgrade',
  '/api/admin/world-pr',
  '/api/admin/registry',
]);
/** Ordered [prefix, route] pairs; the first matching prefix wins. */
const ROUTE_PREFIX_RULES = [
  ['/api/world/', '/api/world/:id/*'],
  ['/api/admin/registry/', '/api/admin/registry'],
  ['/api/admin/world-pr/', '/api/admin/world-pr/:id'],
  ['/api/auth/credentials', '/api/auth/credentials'],
  ['/api/auth/', '/api/auth'],
  ['/api/plan/conversations/', '/api/plan/conversations/:id'],
  ['/api/plan/', '/api/plan'],
  ['/api/processes', '/api/processes'],
  ['/api/servers', '/api/processes'],
  ['/v1/chunks', '/v1/chunks'],
  ['/v1/worlds', '/v1/worlds'],
  ['/assets/', '/assets'],
  // SPA HTML fallback routes (SPA sub-routes like /worlds, /plan/...)
  ['/worlds', '/static'],
  ['/plan', '/static'],
  ['/workspaces', '/static'],
];
/**
 * Map a request pathname onto a fixed set of route labels so dynamic path
 * segments never become metric label values.
 *
 * @param {string} pathname
 * @returns {string} the normalized route label, or '/unknown' when nothing matches
 */
export function pathToRoute(pathname) {
  // Drop a single trailing slash for matching (a bare "/" stays "/").
  const hasTrailingSlash = pathname.length > 1 && pathname.endsWith('/');
  const p = hasTrailingSlash ? pathname.slice(0, -1) : pathname;

  if (ROUTE_EXACT_MATCHES.has(p)) return p;

  // Per-world routes are checked before the generic prefix table.
  if (p.startsWith('/api/worlds/')) {
    if (p.includes('/credentials/')) return '/api/worlds/:id/credentials/:action';
    if (p.includes('/tunnels')) return '/api/worlds/:id/tunnels';
    if (p.endsWith('/pr')) return '/api/worlds/:id/pr';
    if (p.endsWith('/progress')) return '/api/worlds/:id/progress';
    return '/api/worlds/:id';
  }

  // GET / is served by the SPA HTML fallback.
  if (p === '/') return '/static';

  const rule = ROUTE_PREFIX_RULES.find(([prefix]) => p.startsWith(prefix));
  return rule ? rule[1] : '/unknown';
}
| // ─── In-memory registry ─────────────────────────────────────────────────── | ||
/** Finite histogram bucket upper bounds, ascending; observeDuration compares `seconds` against them. */
const HISTOGRAM_BUCKETS = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5];

/**
 * Counter state: label key (see _labelKey) → number of increments.
 * @type {Map<string, number>}
 */
const _counters = new Map();

/**
 * Histogram state: label key (see _labelKey) → per-bucket counts plus the
 * running sum and observation count.
 * @type {Map<string, {buckets: number[], sum: number, count: number}>}
 */
const _histograms = new Map();

/**
 * Build the Map key for a label set by joining the values with a NUL byte.
 *
 * @param {string[]} parts label values in canonical order
 * @returns {string}
 */
function _labelKey(parts) {
  const separator = '\x00';
  return parts.join(separator);
}
/**
 * Drop every recorded counter and histogram. FOR TESTS ONLY — never call
 * this from production code.
 */
export function _resetForTest() {
  for (const registry of [_counters, _histograms]) {
    registry.clear();
  }
}
/**
 * Add one to the http_requests_total series identified by the four labels.
 *
 * @param {string} service
 * @param {string} route — MUST be a normalized route pattern
 * @param {string} method
 * @param {string} statusCode
 */
export function incRequest(service, route, method, statusCode) {
  const seriesKey = _labelKey([service, route, method, statusCode]);
  // A series that has not been seen yet starts from zero.
  const previous = _counters.get(seriesKey) ?? 0;
  _counters.set(seriesKey, previous + 1);
}
/**
 * Record one http_request_duration_seconds observation.
 *
 * Bucket counts are stored per range, not cumulatively: only the first
 * bucket whose upper bound is >= `seconds` is incremented. An observation
 * above the last bound touches no finite bucket and is reflected only in
 * `sum` and `count`.
 *
 * @param {string} service
 * @param {string} route
 * @param {string} method
 * @param {number} seconds
 */
export function observeDuration(service, route, method, seconds) {
  const seriesKey = _labelKey([service, route, method]);

  let hist = _histograms.get(seriesKey);
  if (hist === undefined) {
    // First observation for this label set: start with all-zero buckets.
    hist = { buckets: HISTOGRAM_BUCKETS.map(() => 0), sum: 0, count: 0 };
    _histograms.set(seriesKey, hist);
  }

  // Index of the first bound that accommodates the observation, or -1.
  const slot = HISTOGRAM_BUCKETS.findIndex((upperBound) => seconds <= upperBound);
  if (slot !== -1) {
    hist.buckets[slot] += 1;
  }

  hist.sum += seconds;
  hist.count += 1;
}
| // ─── Prometheus text exposition ─────────────────────────────────────────── | ||
/**
 * Escape a label value for the Prometheus text format: backslash becomes
 * `\\`, a newline becomes `\n`, and a double quote becomes `\"`.
 * The input is coerced with String() first.
 *
 * @param {unknown} v
 * @returns {string}
 */
function escapeLabelValue(v) {
  // One pass over the three special characters; nothing else is touched.
  return String(v).replace(/[\\\n"]/g, (ch) => (ch === '\n' ? '\\n' : `\\${ch}`));
}
/**
 * Render a labels object as `{k1="v1",k2="v2",...}`, escaping each value.
 * Pairs appear in the object's own enumeration order.
 *
 * @param {Record<string, string>} labels
 * @returns {string}
 */
function labelSet(labels) {
  const rendered = [];
  for (const [name, value] of Object.entries(labels)) {
    rendered.push(`${name}="${escapeLabelValue(value)}"`);
  }
  return `{${rendered.join(',')}}`;
}
/**
 * Produce the Prometheus text exposition for every recorded counter and
 * histogram. The output always ends with a newline.
 *
 * @returns {string}
 */
export function renderMetrics() {
  // ── http_requests_total ─────────────────────────────────────────────
  const out = [
    '# HELP http_requests_total Total number of HTTP requests handled.',
    '# TYPE http_requests_total counter',
  ];
  for (const [seriesKey, total] of _counters) {
    const [service, route, method, status_code] = seriesKey.split('\x00');
    const labels = labelSet({ service, route, method, status_code });
    out.push(`http_requests_total${labels} ${total}`);
  }

  // ── http_request_duration_seconds ───────────────────────────────────
  out.push(
    '# HELP http_request_duration_seconds HTTP request duration in seconds (histogram).',
    '# TYPE http_request_duration_seconds histogram',
  );
  for (const [seriesKey, hist] of _histograms) {
    const [service, route, method] = seriesKey.split('\x00');
    const base = { service, route, method };

    // Stored bucket counts are per range; accumulate them here so each
    // emitted `le` bucket includes every smaller bucket.
    let runningTotal = 0;
    HISTOGRAM_BUCKETS.forEach((upperBound, i) => {
      runningTotal += hist.buckets[i];
      const bucketLabels = labelSet({ ...base, le: String(upperBound) });
      out.push(`http_request_duration_seconds_bucket${bucketLabels} ${runningTotal}`);
    });

    // +Inf covers everything, including observations above the last bound.
    const infLabels = labelSet({ ...base, le: '+Inf' });
    out.push(`http_request_duration_seconds_bucket${infLabels} ${hist.count}`);

    const baseLabels = labelSet(base);
    out.push(`http_request_duration_seconds_sum${baseLabels} ${hist.sum}`);
    out.push(`http_request_duration_seconds_count${baseLabels} ${hist.count}`);
  }

  // Trailing newline.
  return `${out.join('\n')}\n`;
}
| // ─── Request instrumentation wrapper ───────────────────────────────────── | ||
/**
 * Wrap an async request handler so every request is instrumented.
 *
 * The wrapper:
 *   1. Starts a high-resolution timer.
 *   2. Calls the original handler.
 *   3. Derives a route pattern from req.url via pathToRoute().
 *   4. Records counter + histogram using the response's status code.
 *
 * Status code capture: res.writeHead is replaced with a thin wrapper that
 * remembers the status passed to it. When the handler never calls
 * writeHead, res.statusCode is used instead (defaulting to 200).
 *
 * Recording happens in a `finally` block so it also runs when the handler
 * throws. For that reason nothing in that block may throw on its own: a
 * request target that `new URL()` rejects is recorded under '/unknown'
 * instead of replacing the handler's outcome with a TypeError.
 *
 * @param {string} serviceName — emitted as the `service` label
 * @param {(req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse) => Promise<void>} handler
 * @returns {(req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse) => Promise<void>}
 */
export function instrumentHandler(serviceName, handler) {
  return async (req, res) => {
    const start = performance.now();

    // Intercept the status code by wrapping writeHead.
    let capturedStatus = null;
    const origWriteHead = res.writeHead.bind(res);
    res.writeHead = (status, ...rest) => {
      capturedStatus = status;
      return origWriteHead(status, ...rest);
    };

    try {
      await handler(req, res);
    } finally {
      const durationSec = (performance.now() - start) / 1000;

      let route;
      try {
        const { pathname } = new URL(req.url ?? '/', 'http://localhost');
        route = pathToRoute(pathname);
      } catch {
        // Unparseable request target (e.g. "//"): still count the request.
        route = '/unknown';
      }

      const method = (req.method ?? 'GET').toUpperCase();
      const statusCode = String(capturedStatus ?? res.statusCode ?? 200);
      incRequest(serviceName, route, method, statusCode);
      observeDuration(serviceName, route, method, durationSec);
    }
  };
}
| /** | ||
| * op-side-longpoll.mjs — Operator-side long-poll loop. | ||
| * | ||
| * Maintains a persistent outbound HTTPS connection from host-cp to | ||
| * plan-DO's /v1/op-poll endpoint, waiting for local-Docker dispatch | ||
| * work to appear. Feature-flag-gated behind OLAM_OPSIDE_LONGPOLL=1 | ||
| * (default OFF — no behavior change when unset). | ||
| * | ||
| * ELI5: like installing a phone line to the cloud planner. | ||
| * Your local machine stays connected, ready to receive coding tasks. | ||
| * Nothing calls yet — the phone just sits there waiting (v1). | ||
| * | ||
| * Phase D ships the plumbing and tests it behind a flag. Future cells | ||
| * (#3/#6/#7) wire actual producers on the plan-DO side. | ||
| * | ||
| * Circuit-breaker: 10 consecutive errors → 60 s pause → | ||
| * counter resets and polling resumes. Prevents hammering the server | ||
| * during outages. | ||
| * | ||
| * Reconnect delay: 1000ms base + uniform 0-500ms jitter. | ||
| * All structured log events emitted to console via JSON objects. | ||
| * | ||
| * @module op-side-longpoll | ||
| */ | ||
// Reconnect delay: a fixed base plus random jitter below RECONNECT_JITTER_MS.
const RECONNECT_BASE_MS = 1_000;
const RECONNECT_JITTER_MS = 500;

// Circuit breaker: this many consecutive errors triggers one long pause.
const CIRCUIT_BREAKER_THRESHOLD = 10;
const CIRCUIT_BREAKER_PAUSE_MS = 60_000;

// Client-side abort deadline for one long-poll request. plan-DO blocks up
// to 25 s; this leaves a 5 s margin.
const POLL_TIMEOUT_MS = 30_000;

/**
 * Compute the delay before the next poll: RECONNECT_BASE_MS plus
 * floor(rand * RECONNECT_JITTER_MS). With an RNG returning values in
 * [0, 1) the result is an integer in [1000, 1499].
 *
 * @param {() => number} [randFn] - RNG override for tests; defaults to Math.random.
 * @returns {number} Delay in milliseconds.
 */
export function reconnectDelay(randFn = Math.random) {
  const jitterMs = Math.floor(randFn() * RECONNECT_JITTER_MS);
  return RECONNECT_BASE_MS + jitterMs;
}
// ── Module-level loop state (written by startPoll / stopPoll / pollLoop / sleep) ──

/**
 * Handle of the most recent sleep() timer, or null.
 * @type {ReturnType<typeof setTimeout> | null}
 */
let pollTimer = null;

/**
 * True between startPoll() and stopPoll(); pollLoop keeps iterating while set.
 * @type {boolean}
 */
let running = false;

/**
 * Failed polls since the last success or circuit-breaker reset.
 * @type {number}
 */
let consecutiveErrors = 0;

/**
 * Base URL handed to startPoll(), or null before the first start.
 * @type {string | null}
 */
let activeCloudUrl = null;

/**
 * Authorization header value handed to startPoll(), or null before the first start.
 * @type {string | null}
 */
let activeAuth = null;
/**
 * Write one structured log line to stdout via console.log. The record is a
 * JSON object carrying `event`, an ISO-8601 `ts`, and then the fields of
 * `extra` (which are spread last, so they win on key collisions).
 *
 * @param {string} event
 * @param {Record<string, unknown>} [extra]
 */
function emit(event, extra = {}) {
  const record = { event, ts: new Date().toISOString(), ...extra };
  console.log(JSON.stringify(record));
}
/**
 * Sleep for `ms` milliseconds. The timer handle is published in the
 * module-level `pollTimer` so stopPoll() can cancel it.
 *
 * Cancellation semantics: when the handle is cleared with clearTimeout()
 * before it fires, the returned promise never settles, so the awaiting
 * caller simply stays parked.
 *
 * Once the timer fires, `pollTimer` is reset to null so it never refers to
 * a handle that has already run. The reset only happens if `pollTimer`
 * still points at this call's own handle.
 *
 * @param {number} ms
 * @returns {Promise<void>} resolves after the delay unless the timer is cleared first
 */
function sleep(ms) {
  return new Promise((resolve) => {
    const handle = setTimeout(() => {
      if (pollTimer === handle) {
        pollTimer = null;
      }
      resolve();
    }, ms);
    pollTimer = handle;
  });
}
/**
 * Perform one long-poll round trip: GET `<cloudUrl>/v1/op-poll` with the
 * given Authorization header and return the parsed JSON body.
 *
 * The request (including reading the body) is aborted after
 * POLL_TIMEOUT_MS. A non-2xx response is turned into an Error.
 *
 * @param {string} cloudUrl  Base URL (e.g. https://plan-do.example.com); one trailing slash is tolerated
 * @param {string} auth      Authorization header value
 * @returns {Promise<{ work: null | { worldId: string, dispatchSpec: unknown } }>}
 */
async function pollOnce(cloudUrl, auth) {
  const aborter = new AbortController();
  const deadline = setTimeout(() => aborter.abort(), POLL_TIMEOUT_MS);
  try {
    const endpoint = `${cloudUrl.replace(/\/$/, '')}/v1/op-poll`;
    const response = await fetch(endpoint, {
      method: 'GET',
      headers: { Authorization: auth },
      signal: aborter.signal,
    });
    if (response.ok) {
      // Awaited inside the try so the deadline still covers the body read.
      return await response.json();
    }
    throw new Error(`op-poll returned ${response.status}`);
  } finally {
    clearTimeout(deadline);
  }
}
/**
 * The main poll loop. Keeps iterating while `running` is true.
 *
 * Each iteration:
 *   1. emits op-poll-connect and performs one pollOnce();
 *   2. on success, resets the error counter and emits op-poll-timeout with
 *      the response's `work` field and the upcoming reconnect delay;
 *   3. on failure, bumps the error counter and emits op-poll-error. At
 *      CIRCUIT_BREAKER_THRESHOLD consecutive errors it also emits
 *      op-poll-circuit-open, resets the counter, and uses the long
 *      CIRCUIT_BREAKER_PAUSE_MS pause instead of the normal delay;
 *   4. re-checks `running`, then sleeps for the chosen delay.
 *
 * @returns {Promise<void>}
 */
async function pollLoop() {
  while (running) {
    emit('op-poll-connect', { cloud_url: activeCloudUrl });

    // Pause to apply after this iteration; null means "normal reconnect
    // delay, computed only after the running check below".
    let pauseMs = null;
    try {
      const result = await pollOnce(activeCloudUrl, activeAuth);
      // A successful response clears the error streak.
      consecutiveErrors = 0;
      pauseMs = reconnectDelay();
      emit('op-poll-timeout', { work: result.work, reconnect_in_ms: pauseMs });
    } catch (err) {
      consecutiveErrors += 1;
      emit('op-poll-error', {
        error: err instanceof Error ? err.message : String(err),
        consecutive_errors: consecutiveErrors,
      });

      const breakerTripped = consecutiveErrors >= CIRCUIT_BREAKER_THRESHOLD;
      if (breakerTripped) {
        emit('op-poll-circuit-open', {
          consecutive_errors: consecutiveErrors,
          pause_ms: CIRCUIT_BREAKER_PAUSE_MS,
        });
        consecutiveErrors = 0;
      }
      pauseMs = breakerTripped ? CIRCUIT_BREAKER_PAUSE_MS : null;
    }

    if (!running) break;
    await sleep(pauseMs ?? reconnectDelay());
  }
}
/**
 * Start the operator-side long-poll loop. No-op if it is already running.
 *
 * This function does not read OLAM_OPSIDE_LONGPOLL itself, so callers are
 * responsible for checking the feature flag before calling it. Call it
 * AFTER server.listen() to keep it off the process startup path.
 *
 * NOTE(review): if stopPoll() is followed by startPoll() while a request
 * from the previous loop is still in flight, that loop sees
 * `running === true` again when the request settles and keeps polling
 * alongside the new one — confirm callers never restart that quickly.
 *
 * @param {string} cloudUrl  Base URL of the plan-DO deployment.
 * @param {string} auth      Authorization header value sent with each poll.
 */
export function startPoll(cloudUrl, auth) {
  if (!running) {
    activeCloudUrl = cloudUrl;
    activeAuth = auth;
    consecutiveErrors = 0;
    running = true;
    // Fire-and-forget: pollLoop handles request failures itself.
    void pollLoop();
  }
}
/**
 * Stop the operator-side long-poll loop. Idempotent.
 *
 * Clears `running` and cancels any pending sleep timer. A loop that is
 * waiting on an in-flight request observes `running === false` once that
 * request settles and exits then.
 */
export function stopPoll() {
  running = false;
  if (pollTimer === null) return;
  clearTimeout(pollTimer);
  pollTimer = null;
}
| // C4 — macOS panic-log counter. | ||
| // | ||
| // Note: phase-c-tasks.md originally listed a browser SPA path for this. | ||
| // SPAs can't shell out — `child_process` is Node-only. Correct home | ||
| // is host-cp (Node) which brokers operator-machine state through | ||
| // host-stream. host-cp exposes a typed event consumers can subscribe to. | ||
| // | ||
| // Implementation: | ||
| // `log show --predicate 'eventMessage CONTAINS "panic"' --last <N>d` | ||
| // pipes to stdout; we count the lines that start with a YYYY-MM-DD timestamp (each treated as one log entry), which skips the header. | ||
| // | ||
| // Platform guard: | ||
| // On non-darwin platforms, getPanicCount returns null + emits a | ||
| // `[panic-counter]` warning to stderr. Callers branch on null → | ||
| // skip the delta + don't emit the Slack message. | ||
| // | ||
| // Sampling cadence: | ||
| // Baseline: at olam-cli startup OR on first /plan/new visit | ||
| // Per-session: at plan completion (cloud-mode only) | ||
| // | ||
| // Cost note: | ||
| // `log show` is expensive (~200ms-2s depending on system log size). | ||
| // Cache the baseline + only re-sample on demand. Don't poll. | ||
| import { execFile } from 'node:child_process'; | ||
| import { promisify } from 'node:util'; | ||
| import { platform } from 'node:os'; | ||
// Promise-returning execFile; used unless the caller injects opts.execFileFn.
const execFileP = promisify(execFile);
// Predicate handed to `log show --predicate`.
const PANIC_PREDICATE = 'eventMessage CONTAINS "panic"';
// Upper bound on how long the `log` command may run when opts.timeoutMs is unset.
const DEFAULT_TIMEOUT_MS = 30_000;

/**
 * Count `panic`-containing log entries over the last N days by running
 * `log show` and counting the output lines that begin with a YYYY-MM-DD
 * timestamp.
 *
 * Resolves to null — never rejects for these cases — on non-darwin
 * platforms or when the `log` command fails; callers treat null as
 * "no signal; skip the delta". Unless `opts.silent` is set, both cases
 * write a `[panic-counter]` line to stderr.
 *
 * @param {number} [last_n_days] look-back window in days (default 7)
 * @param {{ silent?: boolean, execFileFn?: Function, timeoutMs?: number }} [opts]
 *   `execFileFn` replaces the promisified execFile (useful in tests);
 *   `timeoutMs` overrides DEFAULT_TIMEOUT_MS.
 * @returns {Promise<number | null>}
 */
export async function getPanicCount(last_n_days = 7, opts = {}) {
  const currentPlatform = platform();
  if (currentPlatform !== 'darwin') {
    if (!opts.silent) {
      process.stderr.write(
        `[panic-counter] platform=${currentPlatform} is not darwin; returning null\n`,
      );
    }
    return null;
  }

  const run = opts.execFileFn ?? execFileP;
  try {
    const args = ['show', '--predicate', PANIC_PREDICATE, '--last', `${last_n_days}d`];
    const execOptions = {
      timeout: opts.timeoutMs ?? DEFAULT_TIMEOUT_MS,
      maxBuffer: 10 * 1024 * 1024,
    };
    const { stdout } = await run('log', args, execOptions);

    // `log show` prepends a header + may emit an "is empty" sentinel, so
    // only lines that start with a timestamp count as entries.
    const looksLikeEntry = /^\d{4}-\d{2}-\d{2}/;
    return stdout
      .split('\n')
      .reduce((total, line) => (looksLikeEntry.test(line) ? total + 1 : total), 0);
  } catch (err) {
    if (opts.silent) return null;
    const msg = err instanceof Error ? err.message : String(err);
    process.stderr.write(`[panic-counter] log command failed: ${msg}\n`);
    return null;
  }
}
/**
 * Pure delta math: `after - before`.
 *
 * Returns null ("no signal") when either input is not a finite number —
 * that covers null, undefined, non-number types, NaN and ±Infinity — so a
 * broken sample can never surface as "NaN" in a user-facing message.
 *
 * Sign convention: a positive delta means the count went UP, and a
 * negative delta means it went DOWN. Both are valid results.
 *
 * @param {number | null} before count sampled first
 * @param {number | null} after  count sampled later
 * @returns {number | null}
 */
export function computePanicDelta(before, after) {
  // Number.isFinite does not coerce: it is false for every non-number.
  if (!Number.isFinite(before) || !Number.isFinite(after)) return null;
  return after - before;
}
/**
 * Format the before/after panic counts as a one-line, plain-English
 * summary for a Slack message body.
 *
 * @param {number | null} before
 * @param {number | null} after
 * @returns {string}
 */
export function formatDeltaSummary(before, after) {
  const delta = computePanicDelta(before, after);
  if (delta === null) {
    return 'Panic delta: n/a (counter unavailable this session).';
  }

  const range = `${before} → ${after}`;
  if (delta === 0) {
    return `Panic count steady: ${range} (no change this session).`;
  }
  return delta < 0
    ? `Panic count down ${Math.abs(delta)}: ${range}.`
    : `Panic count up ${delta}: ${range}.`;
}
| // plan-chat-proxy-headers.mjs — header handling for host-cp's /api/plan-chat/* | ||
| // passthrough proxy (server.mjs). Extracted as pure helpers so the F3 (T9) | ||
| // operator-chunk broker-secret contract is unit-testable without booting the | ||
| // whole host-cp server. | ||
| // | ||
| // F3 (T9) boundary: host-cp's /api/plan-chat/* proxy is the TRUSTED operator | ||
| // surface (SPA browser → host-cp → plan-chat-service). A WORLD process never | ||
| // routes through this proxy — it talks to plan-chat-service directly via | ||
| // host.docker.internal. So: | ||
| // - Client-supplied `x-olam-broker-secret` is ALWAYS stripped (a client must | ||
| // not be able to smuggle the operator-chunk authority secret through). | ||
| // - The real secret is injected by the proxy itself (injectBrokerSecret), | ||
| // only when configured, so the operator's own SPA interject is authorised | ||
| // while a world process — which can't present the secret — is rejected by | ||
| // plan-chat-service's gate. | ||
// Request headers the proxy never forwards upstream (lower-case names).
const HOP_BY_HOP = new Set(['host', 'connection', 'content-length']);
// Header carrying the operator-chunk broker secret (lower-case name).
const BROKER_SECRET_HEADER = 'x-olam-broker-secret';

/**
 * Build the upstream header map for a /api/plan-chat/* proxy request.
 * Drops the HOP_BY_HOP headers AND any client-supplied broker secret (F3).
 *
 * Header names are compared case-insensitively, so a map whose keys have
 * not been lower-cased (e.g. `X-Olam-Broker-Secret`) cannot carry a
 * client-supplied secret through. Forwarded headers keep the key exactly
 * as supplied. Array values are joined with ", "; values that are neither
 * strings nor arrays (e.g. undefined) are skipped.
 *
 * @param {Record<string, string | string[] | undefined>} reqHeaders
 * @returns {Record<string, string>}
 */
export function buildPlanChatProxyHeaders(reqHeaders) {
  const headers = {};
  for (const [name, value] of Object.entries(reqHeaders ?? {})) {
    const lowerName = name.toLowerCase();
    if (HOP_BY_HOP.has(lowerName)) continue;
    // F3 — never forward a CLIENT-supplied broker secret.
    if (lowerName === BROKER_SECRET_HEADER) continue;
    if (Array.isArray(value)) headers[name] = value.join(', ');
    else if (typeof value === 'string') headers[name] = value;
  }
  return headers;
}
/**
 * Add the operator-chunk broker secret to the upstream headers when one is
 * configured. Leaves `headers` untouched when the secret is not a
 * non-empty string. Mutates + returns the same `headers` object.
 *
 * @param {Record<string, string>} headers
 * @param {string | undefined} operatorChunkSecret
 * @returns {Record<string, string>}
 */
export function injectBrokerSecret(headers, operatorChunkSecret) {
  const isConfigured =
    typeof operatorChunkSecret === 'string' && operatorChunkSecret !== '';
  if (isConfigured) {
    headers[BROKER_SECRET_HEADER] = operatorChunkSecret;
  }
  return headers;
}
| // Bearer-secret management for plan-chat-service.mjs. | ||
| // | ||
| // Mirrors the agent-memory-service pattern from the sibling olam-agent-memory | ||
| // repo: a single 0600 file at ~/.olam/plan-chat-secret holds the bearer | ||
| // hex string. Helpers generate, read, and rotate atomically. Rotation | ||
| // writes to a tmpfile and renames; mid-rotation reads see either the old | ||
| // or new value, never a partial write. | ||
| // | ||
| // Inside the Docker container, os.homedir() → /root, but compose.yaml mounts | ||
| // ${HOME}/.olam → /data. Without an env override, the bearer would be written | ||
| // to /root/.olam/plan-chat-secret (container ephemeral layer) and lost on | ||
| // every `docker compose up --force-recreate` (i.e. every `olam upgrade`). | ||
| // OLAM_PLAN_CHAT_SECRET_PATH is set to /data/plan-chat-secret in compose.yaml | ||
| // and k8s/manifests/30-configmap.yaml so all reads/writes land in the | ||
| // bind-mounted host directory. On bare-host installs (no container) the env | ||
| // var is unset and the path falls back to ~/.olam/plan-chat-secret — no | ||
| // behaviour change. Mirrors precedent commit 5b21d1f2 (PR #440) for plan.db. | ||
| import fs from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
| import crypto from 'node:crypto'; | ||
| // Phase D (olam-config-store-unification): consult config.json's | ||
| // `cloud.secrets.plan-chat-secret` value before the legacy secret FILES. | ||
| // Dep-free reader (host-cp has no @olam/core dep) with container-aware /data | ||
| // path resolution — see config-reader.mjs header. | ||
| import { readConfigString, olamConfigDir } from './config-reader.mjs'; | ||
/**
 * Pick the plan-chat-secret file path under the config dir returned by
 * olamConfigDir(). Candidates, in order:
 *   1. <configDir>/secrets/plan-chat-secret  (canonical)
 *   2. <configDir>/plan-chat-secret          (legacy)
 * The first one that exists on disk wins. When neither exists, the
 * canonical path is returned so that new writes land there.
 *
 * @returns {string}
 */
function resolvePlanChatSecretPath() {
  const configDir = olamConfigDir();
  const canonical = path.join(configDir, 'secrets', 'plan-chat-secret');
  const legacy = path.join(configDir, 'plan-chat-secret');
  const firstExisting = [canonical, legacy].find((candidate) => fs.existsSync(candidate));
  return firstExisting ?? canonical;
}
/**
 * Bearer file path, fixed at module load: OLAM_PLAN_CHAT_SECRET_PATH when
 * set, otherwise whatever resolvePlanChatSecretPath() picks.
 */
export const SECRET_PATH = process.env.OLAM_PLAN_CHAT_SECRET_PATH ?? resolvePlanChatSecretPath();

/** Directory containing SECRET_PATH. */
export const SECRET_DIR = path.dirname(SECRET_PATH);

// 32 random bytes → 64 hex chars.
const SECRET_BYTES = 32;
// Owner read/write only.
const SECRET_MODE = 0o600;
/**
 * Generate a fresh bearer: SECRET_BYTES random bytes from
 * crypto.randomBytes, hex-encoded.
 *
 * @returns {string}
 */
export function generateSecret() {
  const entropy = crypto.randomBytes(SECRET_BYTES);
  return entropy.toString('hex');
}
/**
 * Read a bearer value out of a single secret FILE.
 *
 * @param {string} secretPath
 * @returns {string | null} the trimmed file content, or null when the file
 *   does not exist (ENOENT) or contains only whitespace
 * @throws any other read error (e.g. permission denied) is rethrown as-is
 */
function readSecretFile(secretPath) {
  let raw;
  try {
    raw = fs.readFileSync(secretPath, 'utf8');
  } catch (err) {
    const isMissingFile = typeof err === 'object' && err !== null && err.code === 'ENOENT';
    if (isMissingFile) return null;
    throw err;
  }
  const bearer = raw.trim();
  return bearer === '' ? null : bearer;
}
/**
 * Read the plan-chat bearer. Returns null if absent; file read errors
 * other than "not found" propagate from readSecretFile().
 *
 * DEFAULT path (no argument, or an argument equal to SECRET_PATH):
 *   1. the secret FILE at SECRET_PATH wins when it holds a value, so an
 *      operator's existing on-disk bearer is never shadowed;
 *   2. otherwise config.json `cloud.secrets.plan-chat-secret`;
 *   3. otherwise null.
 *
 * EXPLICIT path (any other secretPath — write/rotate read-backs, tests):
 * file-only. The config.json leg is skipped so callers that own a specific
 * path keep deterministic file semantics.
 *
 * @param {string} [secretPath]
 * @returns {string | null}
 */
export function readSecret(secretPath = SECRET_PATH) {
  const bearer = readSecretFile(secretPath);
  if (bearer !== null) return bearer;

  const usingDefaultPath = secretPath === SECRET_PATH;
  return usingDefaultPath ? readConfigString('cloud.secrets.plan-chat-secret') : null;
}
/**
 * Write the bearer to disk atomically.
 *
 * Steps: create the parent directory of `secretPath` if missing (mode 0700
 * for newly created directories), write `value` plus a trailing newline to
 * a sibling tmpfile, force the tmpfile to SECRET_MODE, then rename it over
 * `secretPath`. Readers therefore see either the old or the new content.
 *
 * If any step after the directory creation fails — including the tmpfile
 * write itself — the tmpfile is removed on a best-effort basis and the
 * original error is rethrown.
 *
 * @param {string} value        non-empty bearer to persist
 * @param {string} [secretPath] destination file; defaults to SECRET_PATH
 * @throws {Error} when `value` is not a non-empty string, or on any fs failure
 */
export function writeSecret(value, secretPath = SECRET_PATH) {
  if (typeof value !== 'string' || value.length === 0) {
    throw new Error('plan-chat-secret: refusing to write empty bearer');
  }
  fs.mkdirSync(path.dirname(secretPath), { recursive: true, mode: 0o700 });
  const tmp = `${secretPath}.tmp-${process.pid}-${Date.now()}`;
  try {
    fs.writeFileSync(tmp, `${value}\n`, { mode: SECRET_MODE });
    // The `mode` option only applies when the file is created, so set it
    // explicitly in case the tmp path already existed.
    fs.chmodSync(tmp, SECRET_MODE);
    fs.renameSync(tmp, secretPath);
  } catch (err) {
    try {
      fs.unlinkSync(tmp);
    } catch {
      /* best-effort cleanup; the original error below is what matters */
    }
    throw err;
  }
}
/**
 * Return the existing bearer when readSecret() finds one; otherwise
 * generate a new one, persist it with writeSecret(), and return it.
 *
 * NOTE(review): writeSecret() renames its tmpfile over the destination, so
 * if two processes both find no bearer and race here, each returns its own
 * freshly generated value while only the later rename stays on disk —
 * confirm whether callers need to re-read after calling this.
 *
 * @param {string} [secretPath]
 * @returns {string}
 */
export function ensureSecret(secretPath = SECRET_PATH) {
  const current = readSecret(secretPath);
  if (!current) {
    const minted = generateSecret();
    writeSecret(minted, secretPath);
    return minted;
  }
  return current;
}
/**
 * Rotate the bearer: generate a replacement, write it atomically via
 * writeSecret(), and return it. Callers should restart any running
 * plan-chat-service so it re-reads the new value.
 *
 * @param {string} [secretPath]
 * @returns {string} the new bearer
 */
export function rotateSecret(secretPath = SECRET_PATH) {
  const replacement = generateSecret();
  writeSecret(replacement, secretPath);
  return replacement;
}
/**
 * Constant-time string compare. Returns true iff both arguments are
 * non-empty strings whose UTF-8 encodings are byte-equal. Avoids leaking
 * timing on bearer comparison.
 *
 * The length guard compares the ENCODED byte lengths, not the UTF-16
 * string lengths. crypto.timingSafeEqual throws a RangeError when its
 * buffers differ in byte length, which two strings of equal `.length` can
 * do once non-ASCII characters are involved (e.g. 'é' vs 'e').
 *
 * @param {unknown} a
 * @param {unknown} b
 * @returns {boolean}
 */
export function timingSafeEqual(a, b) {
  if (typeof a !== 'string' || typeof b !== 'string') return false;
  if (a.length === 0 || b.length === 0) return false;
  const left = Buffer.from(a);
  const right = Buffer.from(b);
  if (left.length !== right.length) return false;
  return crypto.timingSafeEqual(left, right);
}
Sorry, the diff of this file is too big to display
| // plan-orchestrator.mjs — Phase 2: multi-persona conversation coordinator. | ||
| // | ||
| // Architecture: | ||
| // - AgentRegistry holds one pi AgentSession per (conversationId, personaId). | ||
| // - HandoffEngine forks the session tree when the active persona changes. | ||
| // - All persona turns share one session.jsonl per conversation. | ||
| // - SSE sinks are an in-process Set<ServerResponse> per conversationId. | ||
| // | ||
| // Credentials: | ||
| // - Uses the Olam auth-service vault (same as the rest of host-cp). | ||
| // - No ANTHROPIC_API_KEY required; tokens fetched on demand via auth-service. | ||
| import path from 'node:path'; | ||
| import os from 'node:os'; | ||
| import fs from 'node:fs'; | ||
| import { randomUUID } from 'node:crypto'; | ||
| import Database from 'better-sqlite3'; | ||
| import { SessionManager } from '@mariozechner/pi-coding-agent'; | ||
| import { PERSONAS, DEFAULT_PERSONA_ID, getPersona } from './plan/personas.mjs'; | ||
| import { AgentRegistry } from './plan/agent-registry.mjs'; | ||
| import { HandoffEngine } from './plan/handoff-engine.mjs'; | ||
| import { RopeEngine } from './plan/rope-engine.mjs'; | ||
| import { loadAuthorityConfig } from './plan/authority-config.mjs'; | ||
| import { isPathVaultUrl, ensurePathVaultProxy } from './plan/path-vault-proxy.mjs'; | ||
| // Phase D (olam-config-store-unification): config.json reader (dep-free, copied | ||
| // from packages/core/src/cloud-state/read-config-value.mjs — host-cp has no | ||
| // @olam/core dep). Container-aware /data path resolution lives in config-reader.mjs. | ||
| import { readConfigString, olamConfigDir } from './config-reader.mjs'; | ||
| // ── Cloud path-vault fallback ─────────────────────────────────────────────── | ||
| // | ||
| // When the local auth-service vault has no Claude credential, the plan agent can | ||
| // instead reach Claude through the operator's cloud path-vault URL. Resolution | ||
| // mirrors server.mjs readAnthropicBaseUrl() (kept independent so this module has | ||
| // no server.mjs dependency): | ||
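| // 1. OLAM_ANTHROPIC_BASE_URL env var | ||
| // 2. config.json `cloud.urls.anthropic-base-url` | ||
| // 3. anthropic-base-url file under the config dir (/data in the container, else ~/.olam) | ||
| // 4. ANTHROPIC_BASE_URL env var | ||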
| // | ||
| // Only PATH-FORMAT vault URLs (https://host/auth/<sub>/<secret>) are usable as a | ||
| // fallback — they self-authenticate, so no live token is required. | ||
/**
 * Placeholder api-key handed to the agent runtime in path-vault mode.
 * The path prefix is the real credential; the proxy strips this header.
 */
const PATH_VAULT_PLACEHOLDER_KEY = 'path-vault-proxy';
/**
 * Resolve the configured Anthropic base URL. Sources are tried in order;
 * the first one that yields a value wins:
 *   1. OLAM_ANTHROPIC_BASE_URL env var (trimmed)
 *   2. config.json `cloud.urls.anthropic-base-url` (returned as-is)
 *   3. `anthropic-base-url` file under olamConfigDir() (trimmed; any error
 *      while locating or reading it is treated as "no value")
 *   4. ANTHROPIC_BASE_URL env var (trimmed)
 *
 * NOTE(review): an env var holding only whitespace is non-empty before the
 * trim, so it ends the search and yields '' — confirm that is intended.
 *
 * @returns {string} the configured Anthropic base URL, or '' if none.
 */
function readAnthropicBaseUrlForFallback() {
  const olamEnv = process.env['OLAM_ANTHROPIC_BASE_URL'];
  if (olamEnv) return olamEnv.trim();

  // Phase D: the config.json leg sits BETWEEN the two env legs.
  const configured = readConfigString('cloud.urls.anthropic-base-url');
  if (configured !== null) return configured;

  // Legacy file under the config dir.
  let fromFile = '';
  try {
    const legacyFile = path.join(olamConfigDir(), 'anthropic-base-url');
    fromFile = fs.readFileSync(legacyFile, 'utf-8').trim();
  } catch {
    // file absent — fall through
  }
  if (fromFile.length > 0) return fromFile;

  const shellEnv = process.env['ANTHROPIC_BASE_URL'];
  return shellEnv ? shellEnv.trim() : '';
}
| // ── Paths ───────────────────────────────────────────────────────────────────── | ||
| // | ||
| // Inside the Docker container, os.homedir() → /root, but compose.yaml mounts | ||
| // ${HOME}/.olam → /data. Without env overrides, plan.db would be written to | ||
| // /root/.olam/plan.db (container ephemeral layer) and lost on every | ||
| // `docker compose up --force-recreate` (i.e. every `olam upgrade`). | ||
| // | ||
| // OLAM_PLAN_DB_PATH and OLAM_PLAN_DIR are set to /data/plan.db and /data/plan | ||
| // in compose.yaml so all writes land in the bind-mounted host directory. | ||
| // On bare-host installs (no container) neither env var is set and the paths | ||
| // fall back to the original ~/.olam locations — no behaviour change. | ||
| // | ||
| // Paths are resolved at construction time (not module load) so tests can pass | ||
| // explicit paths via constructor opts without any module re-import tricks. | ||
/**
 * Default location of plan.db: OLAM_PLAN_DB_PATH when set to a non-empty
 * value, otherwise ~/.olam/plan.db.
 *
 * @returns {string}
 */
function defaultPlanDbPath() {
  // `||` rather than `??`: a variable that is set but empty (e.g.
  // `OLAM_PLAN_DB_PATH=` in an env file) must count as unset — an empty string
  // is never a usable database path.
  return process.env.OLAM_PLAN_DB_PATH || path.join(os.homedir(), '.olam', 'plan.db');
}
/**
 * Default directory for per-conversation session data: OLAM_PLAN_DIR when set
 * to a non-empty value, otherwise ~/.olam/plan.
 *
 * @returns {string}
 */
function defaultPlanDir() {
  // `||` rather than `??`: a set-but-empty OLAM_PLAN_DIR must fall back to the
  // home-dir default instead of yielding '' as the plan directory.
  return process.env.OLAM_PLAN_DIR || path.join(os.homedir(), '.olam', 'plan');
}
| // ── Helpers ─────────────────────────────────────────────────────────────────── | ||
/**
 * Write a fresh session file consisting of a single JSON header line.
 * Any existing file at the same path is overwritten.
 *
 * @param {string} sessionFile  Path of the session.jsonl file to create.
 * @param {string} sessionId    Identifier recorded in the header's `id` field.
 */
function initSessionFile(sessionFile, sessionId) {
  const headerLine = JSON.stringify({
    type: 'session',
    version: 3,
    id: sessionId,
    timestamp: new Date().toISOString(),
    cwd: os.homedir(),
  });
  // JSONL: one JSON document per line, hence the trailing newline.
  fs.writeFileSync(sessionFile, `${headerLine}\n`);
}
/**
 * Derive a short title from the first user message content.
 *
 * Whitespace runs are collapsed to single spaces. Content longer than `maxLen`
 * UTF-16 code units is cut back to the last word boundary inside that window
 * (or hard-cut when the window has no space) and an ellipsis is appended, so a
 * truncated title can be up to `maxLen + 1` units long.
 *
 * @param {string} content  Message text; null/undefined are treated as empty.
 * @param {number} [maxLen=40]
 * @returns {string} The title, or '(empty)' when there is no visible text.
 */
export function deriveTitle(content, maxLen = 40) {
  const trimmed = String(content ?? '').trim().replace(/\s+/g, ' ');
  if (!trimmed) return '(empty)';
  if (trimmed.length <= maxLen) return trimmed;

  let cut = trimmed.slice(0, maxLen);
  const lastSpace = cut.lastIndexOf(' ');
  if (lastSpace > 0) {
    cut = cut.slice(0, lastSpace);
  } else if (/[\uD800-\uDBFF]$/.test(cut)) {
    // Hard cut landed between the two halves of a surrogate pair (e.g. an
    // emoji): drop the orphaned high surrogate so the title stays well-formed.
    cut = cut.slice(0, -1);
  }
  return `${cut}…`;
}
| // ── PlanOrchestrator ────────────────────────────────────────────────────────── | ||
| export class PlanOrchestrator { | ||
  /** SQLite handle for plan.db (opened in the constructor). */
  #db;
  /** Root directory holding one sub-directory (with session.jsonl) per conversation. */
  #planDir;
  /** Auth-service URL as passed to the constructor (also handed to the AgentRegistry). */
  #authServiceUrl;
  /** Auth-service secret as passed to the constructor (also handed to the AgentRegistry). */
  #authServiceSecret;
  /** AgentRegistry instance; used here to fetch Claude tokens and set the Anthropic base URL. */
  #registry;
  /** HandoffEngine built on top of the registry. */
  #handoffEngine;
  /** RopeEngine wired to the registry, the database and the SSE broadcast function. */
  #ropeEngine;
  /** Tracks the active persona per conversationId: Map<conversationId, personaId> */
  #activePersona = new Map();
  /**
   * Connected SSE response streams per conversationId.
   * @type {Map<string, Set<import('node:http').ServerResponse>>}
   */
  #sinks = new Map();
  /**
   * Ring buffer of in-flight SSE events per conversationId.
   * Populated while a turn is active; cleared after all persona turn_complete events.
   * Used by drainReplayBuffer to replay missed events on reconnect.
   * @type {Map<string, Array<{event: string, data: object}>>}
   */
  #activeTurns = new Map();
  /**
   * Number of persona turn_complete events still pending per conversationId.
   * Replay buffer is only cleared when this reaches 0.
   * @type {Map<string, number>}
   */
  #pendingPersonaCount = new Map();
  /**
   * Mutable current-chunk refs per conversationId.
   * ChunkEmitter updates these; read_sidebar tool reads them.
   * @type {Map<string, { current: string|null }>}
   */
  #currentChunkRefs = new Map();
/**
 * Build the orchestrator: resolve storage paths, optionally migrate a legacy
 * plan.db, open the database, ensure the schema exists, and construct the
 * registry / handoff / rope collaborators.
 *
 * @param {{
 *   authServiceUrl: string,
 *   authServiceSecret: string,
 *   planDbPath?: string,
 *   planDirPath?: string,
 * }} opts
 *
 * planDbPath / planDirPath default to OLAM_PLAN_DB_PATH / OLAM_PLAN_DIR env vars,
 * falling back to ~/.olam/plan.db and ~/.olam/plan. Pass explicitly in tests to
 * avoid touching real home-dir paths.
 */
constructor({ authServiceUrl, authServiceSecret, planDbPath, planDirPath } = {}) {
  this.#authServiceUrl = authServiceUrl;
  this.#authServiceSecret = authServiceSecret;
  // Pre-container default location; only used as the migration source below.
  const legacyDbPath = path.join(os.homedir(), '.olam', 'plan.db');
  // Track whether the caller injected an explicit DB path (used to skip the
  // legacy-path migration below — tests inject tmpDir paths and must not
  // inherit the operator's real plan.db).
  const planDbPathInjected = planDbPath !== undefined;
  const resolvedDbPath = planDbPath ?? defaultPlanDbPath();
  this.#planDir = planDirPath ?? defaultPlanDir();
  this.#registry = new AgentRegistry({ authServiceUrl, authServiceSecret });
  this.#handoffEngine = new HandoffEngine(this.#registry);
  // The DB's parent directory must exist before the migration copy and the open.
  fs.mkdirSync(path.dirname(resolvedDbPath), { recursive: true });
  // One-time migration: if the resolved DB path differs from the legacy default and
  // the target doesn't exist yet, copy any existing DB from the old location.
  // This preserves conversations on a hot-restart after deploying the compose.yaml fix.
  // On full container recreate the legacy path is already gone — this is a no-op.
  //
  // Skip when the caller injected an explicit planDbPath — that's the unit-
  // test shape (each test owns a tmpDir db). Pre-fix history: tests on a host
  // with a populated `~/.olam/plan.db` got every `listConversations()` query
  // polluted by real operator data because the migration eagerly copied the
  // legacy file into the test's tmpDir.
  if (
    !planDbPathInjected &&
    resolvedDbPath !== legacyDbPath &&
    !fs.existsSync(resolvedDbPath) &&
    fs.existsSync(legacyDbPath)
  ) {
    try {
      fs.copyFileSync(legacyDbPath, resolvedDbPath);
      console.info('[plan] Migrated plan.db from legacy path to', resolvedDbPath);
    } catch (err) {
      // Best-effort: on failure the open below proceeds without the legacy data.
      console.warn('[plan] plan.db migration failed (non-fatal):', err.message);
    }
  }
  this.#db = new Database(resolvedDbPath);
  // Idempotent schema bootstrap — every statement is IF NOT EXISTS.
  this.#db.exec(`
CREATE TABLE IF NOT EXISTS plan_conversations (
id TEXT PRIMARY KEY,
title TEXT,
persona TEXT NOT NULL DEFAULT 'brainstorm',
created_at INTEGER NOT NULL,
last_turn_at INTEGER
);
CREATE TABLE IF NOT EXISTS plan_turns (
id TEXT PRIMARY KEY,
conversation_id TEXT NOT NULL REFERENCES plan_conversations(id),
role TEXT NOT NULL,
content TEXT NOT NULL DEFAULT '',
persona TEXT,
from_persona TEXT,
to_persona TEXT,
mode TEXT,
fork_node_id TEXT,
created_at INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS plan_turns_conv_idx
ON plan_turns(conversation_id, created_at);
-- Phase 4B: lookout agent registry per conversation
CREATE TABLE IF NOT EXISTS plan_lookout_agents (
conversation_id TEXT NOT NULL,
persona_id TEXT NOT NULL,
muted INTEGER NOT NULL DEFAULT 0,
mode TEXT NOT NULL DEFAULT 'observe',
created_at INTEGER NOT NULL,
PRIMARY KEY (conversation_id, persona_id)
);
-- Phase 4B: sidebar signals from lookout agents
CREATE TABLE IF NOT EXISTS plan_sidebar_signals (
id TEXT PRIMARY KEY,
conversation_id TEXT NOT NULL,
agent_id TEXT NOT NULL,
urgency TEXT NOT NULL DEFAULT 'p2',
reason TEXT NOT NULL DEFAULT '',
content TEXT NOT NULL DEFAULT '',
chunk_id TEXT NOT NULL,
created_at INTEGER NOT NULL,
status TEXT NOT NULL DEFAULT 'active',
tension_subject TEXT,
parent_signal_id TEXT
);
CREATE INDEX IF NOT EXISTS plan_sidebar_conv_idx
ON plan_sidebar_signals(conversation_id, created_at);
CREATE INDEX IF NOT EXISTS plan_sidebar_chunk_idx
ON plan_sidebar_signals(chunk_id);
`);
  // Migration guard: add pinned column if the table predates this feature.
  const planConvCols = this.#db.prepare(`PRAGMA table_info(plan_conversations)`).all();
  if (!planConvCols.some(c => c.name === 'pinned')) {
    this.#db.exec(`ALTER TABLE plan_conversations ADD COLUMN pinned INTEGER NOT NULL DEFAULT 0`);
  }
  const authorityConfig = loadAuthorityConfig();
  // The rope engine gets the private broadcast via a closure, so events it
  // emits go through the same buffering and fan-out as the orchestrator's own.
  this.#ropeEngine = new RopeEngine({
    registry: this.#registry,
    db: this.#db,
    broadcast: (cId, evt, data) => this.#broadcast(cId, evt, data),
    authorityConfig,
  });
}
| // ── Auth-service credential fetching ────────────────────────────────────── | ||
| /** | ||
| * Fetch a Claude credential token for an about-to-run turn. | ||
| * | ||
| * Returns a real vault token when the local vault has one, OR a placeholder | ||
| * token in cloud path-vault fallback mode. As a SIDE EFFECT it points the | ||
| * AgentRegistry at the right Anthropic base URL (localhost proxy in path-vault | ||
| * mode, cleared otherwise) BEFORE any runtime is built — every persona / | ||
| * rope / handoff runtime resolves its model base URL from the registry. | ||
| * | ||
| * Used by all turn-dispatch call sites (dispatch, rope-engine, handoff-engine) | ||
| * via `fetchToken: () => this.#fetchToken()`, so the fallback applies uniformly | ||
| * without changing those call sites. | ||
| * | ||
| * @returns {Promise<string>} | ||
| */ | ||
| async #fetchToken() { | ||
| const cred = await this.#resolveCredential(); | ||
| // setAnthropicBaseUrl points runtimes at the localhost path-vault proxy in | ||
| // fallback mode (else clears it). Guard for registries that predate the | ||
| // method or are test doubles — the path-vault override is best-effort. | ||
| if (typeof this.#registry.setAnthropicBaseUrl === 'function') { | ||
| this.#registry.setAnthropicBaseUrl(cred.mode === 'path-vault' ? cred.baseUrl : null); | ||
| } | ||
| return cred.token; | ||
| } | ||
| /** | ||
| * Resolve a credential for the plan agent, preferring the local auth-service | ||
| * vault and falling back to the operator's cloud path-vault URL when the local | ||
| * vault is empty. | ||
| * | ||
| * @typedef {{ mode: 'vault', token: string } | ||
| * | { mode: 'path-vault', token: string, baseUrl: string }} CredentialResolution | ||
| * | ||
| * @returns {Promise<CredentialResolution>} | ||
| */ | ||
| async #resolveCredential() { | ||
| // 1. Prefer the local vault. When it has a credential, behavior is unchanged. | ||
| // Call the registry directly (NOT #fetchToken) — #fetchToken delegates | ||
| // back here, so going through it would recurse. | ||
| try { | ||
| const token = await this.#registry.fetchToken('claude'); | ||
| return { mode: 'vault', token }; | ||
| } catch (err) { | ||
| // Only fall back on a missing credential — surface real auth-service errors | ||
| // (timeouts, 5xx) so they don't get masked by the path-vault path. | ||
| if (err?.code && err.code !== 'NO_CREDENTIAL') throw err; | ||
| } | ||
| // 2. Fall back to the cloud path-vault URL, if configured + path-format. | ||
| const baseUrl = readAnthropicBaseUrlForFallback(); | ||
| if (!isPathVaultUrl(baseUrl)) { | ||
| // No usable fallback — re-raise the original NO_CREDENTIAL shape so callers | ||
| // (hasCredential / dispatch) behave exactly as before. | ||
| const e = new Error('no active claude credential in vault'); | ||
| e.code = 'NO_CREDENTIAL'; | ||
| throw e; | ||
| } | ||
| const localBaseUrl = await ensurePathVaultProxy(baseUrl); | ||
| return { mode: 'path-vault', token: PATH_VAULT_PLACEHOLDER_KEY, baseUrl: localBaseUrl }; | ||
| } | ||
| /** | ||
| * Lightweight check — returns true when a credential is reachable, either from | ||
| * the local vault OR the cloud path-vault fallback. | ||
| * @returns {Promise<boolean>} | ||
| */ | ||
| async hasCredential() { | ||
| try { | ||
| await this.#resolveCredential(); | ||
| return true; | ||
| } catch { | ||
| return false; | ||
| } | ||
| } | ||
| // ── Conversation management ─────────────────────────────────────────────── | ||
| /** | ||
| * @param {{ title?: string }} [opts] | ||
| * @returns {{ id: string, title: string|null, persona: string, created_at: number }} | ||
| */ | ||
| createConversation({ title } = {}) { | ||
| const id = randomUUID(); | ||
| const created_at = Date.now(); | ||
| const sessionDir = path.join(this.#planDir, id); | ||
| fs.mkdirSync(sessionDir, { recursive: true }); | ||
| initSessionFile(path.join(sessionDir, 'session.jsonl'), id); | ||
| this.#db | ||
| .prepare( | ||
| `INSERT INTO plan_conversations (id, title, persona, created_at) | ||
| VALUES (?, ?, ?, ?)`, | ||
| ) | ||
| .run(id, title ?? null, DEFAULT_PERSONA_ID, created_at); | ||
| this.#activePersona.set(id, DEFAULT_PERSONA_ID); | ||
| return { id, title: title ?? null, persona: DEFAULT_PERSONA_ID, created_at }; | ||
| } | ||
| /** @returns {Array<{id, title, pinned, created_at, last_turn_at, persona, snippet}>} */ | ||
| listConversations() { | ||
| return this.#db | ||
| .prepare( | ||
| `SELECT | ||
| c.id, c.title, c.pinned, c.created_at, c.last_turn_at, c.persona, | ||
| (SELECT pt.content FROM plan_turns pt | ||
| WHERE pt.conversation_id = c.id | ||
| ORDER BY pt.created_at DESC LIMIT 1) AS snippet | ||
| FROM plan_conversations c | ||
| ORDER BY c.pinned DESC, COALESCE(c.last_turn_at, c.created_at) DESC, c.rowid DESC`, | ||
| ) | ||
| .all(); | ||
| } | ||
| /** | ||
| * Patch a conversation's title and/or pinned state. | ||
| * @param {string} id | ||
| * @param {{ title?: string, pinned?: boolean }} updates | ||
| * @returns {object|null} Updated row, or null if not found. | ||
| */ | ||
| patchConversation(id, updates) { | ||
| const parts = []; | ||
| const values = []; | ||
| if (updates.title !== undefined) { | ||
| parts.push('title = ?'); | ||
| values.push(updates.title || null); | ||
| } | ||
| if (updates.pinned !== undefined) { | ||
| parts.push('pinned = ?'); | ||
| values.push(updates.pinned ? 1 : 0); | ||
| } | ||
| if (parts.length === 0) return null; | ||
| values.push(id); | ||
| const changed = this.#db | ||
| .prepare(`UPDATE plan_conversations SET ${parts.join(', ')} WHERE id = ?`) | ||
| .run(...values); | ||
| if (changed.changes === 0) return null; | ||
| return this.#db | ||
| .prepare(`SELECT id, title, pinned, created_at, last_turn_at, persona FROM plan_conversations WHERE id = ?`) | ||
| .get(id) ?? null; | ||
| } | ||
| /** | ||
| * Delete a conversation and all its associated data. | ||
| * @param {string} id | ||
| * @returns {boolean} true if deleted, false if not found. | ||
| */ | ||
| deleteConversation(id) { | ||
| const exists = this.#db | ||
| .prepare(`SELECT 1 FROM plan_conversations WHERE id = ?`) | ||
| .get(id); | ||
| if (!exists) return false; | ||
| this.#db.prepare(`DELETE FROM plan_turns WHERE conversation_id = ?`).run(id); | ||
| this.#db.prepare(`DELETE FROM plan_lookout_agents WHERE conversation_id = ?`).run(id); | ||
| this.#db.prepare(`DELETE FROM plan_sidebar_signals WHERE conversation_id = ?`).run(id); | ||
| this.#db.prepare(`DELETE FROM plan_conversations WHERE id = ?`).run(id); | ||
| this.#activePersona.delete(id); | ||
| this.#sinks.delete(id); | ||
| this.#activeTurns.delete(id); | ||
| this.#currentChunkRefs.delete(id); | ||
| const sessionDir = path.join(this.#planDir, id); | ||
| try { fs.rmSync(sessionDir, { recursive: true }); } catch { /* ok if missing */ } | ||
| return true; | ||
| } | ||
| /** | ||
| * @param {string} id | ||
| * @returns {{ id, title, persona, created_at, last_turn_at, tree } | null} | ||
| */ | ||
| getConversation(id) { | ||
| const row = this.#db | ||
| .prepare( | ||
| `SELECT id, title, persona, created_at, last_turn_at | ||
| FROM plan_conversations WHERE id = ?`, | ||
| ) | ||
| .get(id); | ||
| if (!row) return null; | ||
| const sessionFile = path.join(this.#planDir, id, 'session.jsonl'); | ||
| let tree = []; | ||
| try { | ||
| const mgr = SessionManager.open(sessionFile, path.join(this.#planDir, id)); | ||
| tree = mgr.getTree(); | ||
| } catch { | ||
| // Session file missing or corrupt — return empty tree. | ||
| } | ||
| return { ...row, tree }; | ||
| } | ||
| // ── Active persona management ───────────────────────────────────────────── | ||
| /** | ||
| * @param {string} conversationId | ||
| * @returns {string} Active persona ID. | ||
| */ | ||
| getActivePersona(conversationId) { | ||
| if (this.#activePersona.has(conversationId)) { | ||
| return this.#activePersona.get(conversationId); | ||
| } | ||
| const row = this.#db | ||
| .prepare(`SELECT persona FROM plan_conversations WHERE id = ?`) | ||
| .get(conversationId); | ||
| const personaId = row?.persona ?? DEFAULT_PERSONA_ID; | ||
| this.#activePersona.set(conversationId, personaId); | ||
| return personaId; | ||
| } | ||
| /** | ||
| * Set the active default persona for a conversation (does NOT trigger a handoff). | ||
| * @param {string} conversationId | ||
| * @param {string} personaId | ||
| */ | ||
| setActivePersona(conversationId, personaId) { | ||
| this.#activePersona.set(conversationId, personaId); | ||
| this.#db | ||
| .prepare(`UPDATE plan_conversations SET persona = ? WHERE id = ?`) | ||
| .run(personaId, conversationId); | ||
| } | ||
| // ── SSE broadcast ───────────────────────────────────────────────────────── | ||
| #broadcast(conversationId, eventName, data) { | ||
| // Buffer event while a turn is active for reconnect replay. | ||
| const buf = this.#activeTurns.get(conversationId); | ||
| if (buf) { | ||
| buf.push({ event: eventName, data }); | ||
| } | ||
| const sinks = this.#sinks.get(conversationId); | ||
| if (!sinks || sinks.size === 0) return; | ||
| const chunk = `event: ${eventName}\ndata: ${JSON.stringify(data)}\n\n`; | ||
| for (const res of sinks) { | ||
| try { res.write(chunk); } catch { /* client disconnected */ } | ||
| } | ||
| // Clear buffer only when all pending personas have completed. | ||
| if (eventName === 'turn_complete') { | ||
| const pending = (this.#pendingPersonaCount.get(conversationId) ?? 1) - 1; | ||
| if (pending <= 0) { | ||
| this.#activeTurns.delete(conversationId); | ||
| this.#pendingPersonaCount.delete(conversationId); | ||
| } else { | ||
| this.#pendingPersonaCount.set(conversationId, pending); | ||
| } | ||
| } | ||
| } | ||
| // ── Lookout agent management ────────────────────────────────────────────── | ||
| /** | ||
| * Invite a persona as a lookout for a conversation. | ||
| * @param {string} conversationId | ||
| * @param {string} personaId | ||
| * @returns {{ persona_id: string, state: string, muted: boolean, mode: string }} | ||
| */ | ||
| inviteLookout(conversationId, personaId) { | ||
| const now = Date.now(); | ||
| this.#db | ||
| .prepare(`INSERT OR IGNORE INTO plan_lookout_agents (conversation_id, persona_id, muted, mode, created_at) VALUES (?, ?, 0, 'observe', ?)`) | ||
| .run(conversationId, personaId, now); | ||
| const agent = { persona_id: personaId, state: 'listening', muted: false, mode: 'observe' }; | ||
| this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: 'listening' }); | ||
| return agent; | ||
| } | ||
| /** | ||
| * Update muted status (or mode) for a lookout agent. | ||
| * @param {string} conversationId | ||
| * @param {string} personaId | ||
| * @param {{ muted?: boolean, mode?: string }} updates | ||
| * @returns {{ persona_id: string, state: string, muted: boolean, mode: string } | null} | ||
| */ | ||
| updateLookout(conversationId, personaId, { muted, mode } = {}) { | ||
| const row = this.#db | ||
| .prepare(`SELECT * FROM plan_lookout_agents WHERE conversation_id = ? AND persona_id = ?`) | ||
| .get(conversationId, personaId); | ||
| if (!row) return null; | ||
| const newMuted = muted !== undefined ? (muted ? 1 : 0) : row.muted; | ||
| const newMode = mode ?? row.mode; | ||
| this.#db | ||
| .prepare(`UPDATE plan_lookout_agents SET muted = ?, mode = ? WHERE conversation_id = ? AND persona_id = ?`) | ||
| .run(newMuted, newMode, conversationId, personaId); | ||
| const newState = newMuted ? 'idle' : 'listening'; | ||
| this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: newState }); | ||
| return { persona_id: personaId, state: newState, muted: !!newMuted, mode: newMode }; | ||
| } | ||
| /** | ||
| * Remove a lookout agent. | ||
| * @param {string} conversationId | ||
| * @param {string} personaId | ||
| */ | ||
| uninviteLookout(conversationId, personaId) { | ||
| this.#db | ||
| .prepare(`DELETE FROM plan_lookout_agents WHERE conversation_id = ? AND persona_id = ?`) | ||
| .run(conversationId, personaId); | ||
| } | ||
| /** | ||
| * List active lookout agents for a conversation. | ||
| * @param {string} conversationId | ||
| * @returns {Array<{ persona_id: string, state: string, muted: boolean, mode: string }>} | ||
| */ | ||
| listLookoutAgents(conversationId) { | ||
| const rows = this.#db | ||
| .prepare(`SELECT persona_id, muted, mode FROM plan_lookout_agents WHERE conversation_id = ?`) | ||
| .all(conversationId); | ||
| return rows.map((r) => ({ | ||
| persona_id: r.persona_id, | ||
| state: r.muted ? 'idle' : 'listening', | ||
| muted: !!r.muted, | ||
| mode: r.mode, | ||
| })); | ||
| } | ||
| // ── Sidebar signal management ───────────────────────────────────────────── | ||
| /** | ||
| * Dismiss a sidebar signal. | ||
| * @param {string} conversationId | ||
| * @param {string} signalId | ||
| * @returns {boolean} | ||
| */ | ||
| dismissSignal(conversationId, signalId) { | ||
| const info = this.#db | ||
| .prepare(`UPDATE plan_sidebar_signals SET status = 'dismissed' WHERE id = ? AND conversation_id = ?`) | ||
| .run(signalId, conversationId); | ||
| return info.changes > 0; | ||
| } | ||
| /** | ||
| * Mark a sidebar signal as used (for next turn context). | ||
| * @param {string} conversationId | ||
| * @param {string} signalId | ||
| * @returns {boolean} | ||
| */ | ||
| useSignal(conversationId, signalId) { | ||
| const info = this.#db | ||
| .prepare(`UPDATE plan_sidebar_signals SET status = 'used' WHERE id = ? AND conversation_id = ?`) | ||
| .run(signalId, conversationId); | ||
| return info.changes > 0; | ||
| } | ||
| /** | ||
| * List sidebar signals for a conversation (optionally filtered by chunk_id). | ||
| * @param {string} conversationId | ||
| * @param {string} [chunkId] | ||
| * @returns {Array<object>} | ||
| */ | ||
| listSignals(conversationId, chunkId) { | ||
| if (chunkId) { | ||
| return this.#db | ||
| .prepare(`SELECT * FROM plan_sidebar_signals WHERE conversation_id = ? AND chunk_id = ? ORDER BY created_at ASC`) | ||
| .all(conversationId, chunkId); | ||
| } | ||
| return this.#db | ||
| .prepare(`SELECT * FROM plan_sidebar_signals WHERE conversation_id = ? ORDER BY created_at ASC`) | ||
| .all(conversationId); | ||
| } | ||
| // ── Lookout analysis ────────────────────────────────────────────────────── | ||
| /** | ||
| * Persona-specific heuristics for lookout analysis. | ||
| * Returns { shouldComment: boolean, urgency, content, reason, tension_subject? } | ||
| * or null if no comment warranted. | ||
| * | ||
| * @param {string} personaId | ||
| * @param {string} content — chunk content to analyze | ||
| * @returns {{ urgency: string, content: string, reason: string, tension_subject?: string } | null} | ||
| */ | ||
| #analyzeChunkHeuristic(personaId, content) { | ||
| const lower = content.toLowerCase(); | ||
| if (personaId === 'scout') { | ||
| // Scout: flag unsubstantiated claims and factual assertions | ||
| const claimPatterns = [ | ||
| /\b(research shows|studies (show|indicate|suggest)|data (shows|indicates|suggests))\b/i, | ||
| /\b\d+(\.\d+)?\s*%\b/, | ||
| /\b(always|never|all|every|none|no one)\b/i, | ||
| /\b(proven|definitive|certain|guaranteed|undeniable)\b/i, | ||
| /\b(industry standard|best practice|widely accepted)\b/i, | ||
| ]; | ||
| const matched = claimPatterns.find((p) => p.test(content)); | ||
| if (matched) { | ||
| return { | ||
| urgency: 'p2', | ||
| reason: 'Factual claim without cited source', | ||
| content: 'This response contains claims that should be verified with evidence. What data or sources back this up?', | ||
| }; | ||
| } | ||
| // Scout spark: look for unexplored data angles | ||
| if (lower.includes('option') || lower.includes('approach') || lower.includes('strategy')) { | ||
| if (Math.random() < 0.3) { | ||
| return { | ||
| urgency: 'spark', | ||
| reason: 'Potential evidence gap', | ||
| content: '_What metrics or signals would tell us which option is actually better here?_', | ||
| }; | ||
| } | ||
| } | ||
| } | ||
| if (personaId === 'pm') { | ||
| // PM: flag scope ambiguity and missing requirements | ||
| const scopePatterns = [ | ||
| /\b(could|might|maybe|perhaps|possibly|potentially)\b/i, | ||
| /\b(later|eventually|someday|future)\b/i, | ||
| /\b(depends on|unclear|tbd|to be determined)\b/i, | ||
| ]; | ||
| const matched = scopePatterns.find((p) => p.test(content)); | ||
| if (matched) { | ||
| return { | ||
| urgency: 'p1', | ||
| reason: 'Scope ambiguity detected', | ||
| content: 'Scope boundary needs clarification. What specifically is in vs. out for this iteration?', | ||
| }; | ||
| } | ||
| // PM: flag missing success criteria | ||
| if ((lower.includes('implement') || lower.includes('build') || lower.includes('create')) && !lower.includes('success') && !lower.includes('metric') && !lower.includes('goal')) { | ||
| if (Math.random() < 0.4) { | ||
| return { | ||
| urgency: 'p2', | ||
| reason: 'Missing acceptance criteria', | ||
| content: 'What does done look like here? Define the measurable success criteria before building.', | ||
| }; | ||
| } | ||
| } | ||
| } | ||
| if (personaId === 'brainstorm') { | ||
| // Brainstorm: flag premature convergence on a single option | ||
| const convergencePatterns = [ | ||
| /\b(the (best|right|correct|only) (way|approach|solution|option))\b/i, | ||
| /\b(we should|we must|we need to|the answer is)\b/i, | ||
| /\b(obviously|clearly|simply|just)\b/i, | ||
| ]; | ||
| const matched = convergencePatterns.find((p) => p.test(content)); | ||
| if (matched) { | ||
| return { | ||
| urgency: 'spark', | ||
| reason: 'Early convergence on one path', | ||
| content: '_Before narrowing: what\'s the alternative that explicitly rejects this approach? What would it look like?_', | ||
| }; | ||
| } | ||
| } | ||
| return null; | ||
| } | ||
| /** | ||
| * Run lookout analysis for all active lookout agents after a turn completes. | ||
| * Emits sidebar_entry SSE events for any signals generated. | ||
| * | ||
| * @param {string} conversationId | ||
| * @param {string} chunkId — the turn ID used as chunk reference | ||
| * @param {string} chunkContent — the assistant's response text | ||
| * @param {string} chunkPersona — which persona produced the chunk | ||
| */ | ||
| async #runLookoutAnalysis(conversationId, chunkId, chunkContent, chunkPersona) { | ||
| const lookouts = this.#db | ||
| .prepare(`SELECT persona_id, muted FROM plan_lookout_agents WHERE conversation_id = ? AND muted = 0`) | ||
| .all(conversationId); | ||
| for (const lookout of lookouts) { | ||
| const { persona_id: personaId } = lookout; | ||
| // Skip if this is the persona that produced the chunk | ||
| if (personaId === chunkPersona) continue; | ||
| // Emit thinking state | ||
| this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: 'thinking' }); | ||
| // Small async gap to let the SSE event reach the client before analysis | ||
| await new Promise((resolve) => setTimeout(resolve, 300 + Math.random() * 700)); | ||
| try { | ||
| const analysis = this.#analyzeChunkHeuristic(personaId, chunkContent); | ||
| if (analysis) { | ||
| const signalId = randomUUID(); | ||
| const now = Date.now(); | ||
| this.#db | ||
| .prepare( | ||
| `INSERT INTO plan_sidebar_signals (id, conversation_id, agent_id, urgency, reason, content, chunk_id, created_at, status, tension_subject) | ||
| VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'active', ?)`, | ||
| ) | ||
| .run(signalId, conversationId, personaId, analysis.urgency, analysis.reason, analysis.content, chunkId, now, analysis.tension_subject ?? null); | ||
| const signal = { | ||
| id: signalId, | ||
| agent_id: personaId, | ||
| urgency: analysis.urgency, | ||
| reason: analysis.reason, | ||
| content: analysis.content, | ||
| chunk_id: chunkId, | ||
| created_at: now, | ||
| status: 'active', | ||
| tension_subject: analysis.tension_subject ?? null, | ||
| parent_signal_id: null, | ||
| }; | ||
| this.#broadcast( | ||
| conversationId, | ||
| analysis.urgency === 'p0' ? 'interrupt' : 'sidebar_entry', | ||
| { signal }, | ||
| ); | ||
| } | ||
| } catch (err) { | ||
| console.error(`[plan] lookout analysis error ${conversationId}/${personaId}:`, err.message); | ||
| } | ||
| // Return to listening state | ||
| this.#broadcast(conversationId, 'agent_state', { persona_id: personaId, state: 'listening' }); | ||
| } | ||
| } | ||
| // ── Persona subscription setup ──────────────────────────────────────────── | ||
| /** | ||
| * Wire pi event listeners for a session so tokens + turn_complete events are | ||
| * forwarded to SSE clients. | ||
| * | ||
| * @param {string} conversationId | ||
| * @param {string} personaId | ||
| * @param {import('@mariozechner/pi-coding-agent').AgentSession} session | ||
| */ | ||
| #wireSessionEvents(conversationId, personaId, session) { | ||
| session.subscribe((event) => { | ||
| if (event.type === 'message_update') { | ||
| const ae = event.assistantMessageEvent; | ||
| if (ae.type === 'text_delta') { | ||
| this.#broadcast(conversationId, 'token', { delta: ae.delta, persona: personaId }); | ||
| } | ||
| } else if (event.type === 'agent_end') { | ||
| const msgs = event.messages; | ||
| const last = msgs[msgs.length - 1]; | ||
| let persistedText = ''; | ||
| let turnId = last?.id ?? randomUUID(); | ||
| // Persist the assistant turn so history loads correctly. | ||
| if (last) { | ||
| const text = (last.content ?? []) | ||
| .filter((c) => c.type === 'text') | ||
| .map((c) => c.text ?? '') | ||
| .join(''); | ||
| if (text) { | ||
| persistedText = text; | ||
| const now = Date.now(); | ||
| this.#db | ||
| .prepare( | ||
| `INSERT OR IGNORE INTO plan_turns | ||
| (id, conversation_id, role, content, persona, created_at) | ||
| VALUES (?, ?, 'assistant', ?, ?, ?)`, | ||
| ) | ||
| .run(turnId, conversationId, text, personaId, now); | ||
| } | ||
| } | ||
| this.#broadcast(conversationId, 'turn_complete', { | ||
| turnId, | ||
| persona: personaId, | ||
| finishReason: last?.stopReason ?? 'end_turn', | ||
| }); | ||
| this.#db | ||
| .prepare(`UPDATE plan_conversations SET last_turn_at = ? WHERE id = ?`) | ||
| .run(Date.now(), conversationId); | ||
| // Trigger lookout analysis asynchronously — does not block the turn. | ||
| if (persistedText) { | ||
| this.#runLookoutAnalysis(conversationId, turnId, persistedText, personaId) | ||
| .catch((err) => console.error('[plan] lookout run error:', err.message)); | ||
| } | ||
| } | ||
| }); | ||
| } | ||
| // ── Public API ──────────────────────────────────────────────────────────── | ||
| /** | ||
| * Submit a user turn to one or more personas in parallel. | ||
| * When mentionedPersonas contains 2+ IDs, each receives its own AgentSession | ||
| * and streams tokens with per-persona attribution via SSE `persona` field. | ||
| * Returns immediately; tokens stream over SSE. | ||
| * | ||
| * @param {{ | ||
| * conversationId: string, | ||
| * content: string, | ||
| * personaOverride?: string, | ||
| * mentionedPersonas?: string[], | ||
| * }} params | ||
| * @returns {Promise<{ turnId: string, persona: string }>} | ||
| */ | ||
| async submitTurn({ conversationId, content, personaOverride, mentionedPersonas }) { | ||
| const row = this.#db | ||
| .prepare(`SELECT id, title FROM plan_conversations WHERE id = ?`) | ||
| .get(conversationId); | ||
| if (!row) { | ||
| const err = new Error('conversation not found'); | ||
| err.code = 'NOT_FOUND'; | ||
| throw err; | ||
| } | ||
| const now = Date.now(); | ||
| // Determine which personas will receive this turn. | ||
| // Multi-persona: user @-mentioned 2+ personas explicitly. | ||
| // Single-persona: use explicit override or the conversation's active persona. | ||
| const personasToDispatch = (mentionedPersonas?.length ?? 0) > 1 | ||
| ? mentionedPersonas | ||
| : [personaOverride ?? this.getActivePersona(conversationId)]; | ||
| // Open (or reset) the replay buffer; track how many turn_complete events are expected. | ||
| this.#activeTurns.set(conversationId, []); | ||
| this.#pendingPersonaCount.set(conversationId, personasToDispatch.length); | ||
| // Set title from first user message if still null. | ||
| if (row.title === null) { | ||
| this.#db | ||
| .prepare(`UPDATE plan_conversations SET title = ? WHERE id = ?`) | ||
| .run(deriveTitle(content), conversationId); | ||
| } | ||
| // Persist the user turn once (regardless of how many personas respond). | ||
| this.#db | ||
| .prepare( | ||
| `INSERT INTO plan_turns (id, conversation_id, role, content, created_at) | ||
| VALUES (?, ?, 'user', ?, ?)`, | ||
| ) | ||
| .run(randomUUID(), conversationId, content, now); | ||
| const isSinglePersona = personasToDispatch.length === 1; | ||
| // Dispatch to each persona. For multi-persona turns, skip rope enrichment — | ||
| // the user explicitly chose all participants, so no auto-delegation is needed. | ||
| await Promise.all(personasToDispatch.map(async (pId) => { | ||
| const onStubCall = (event) => { | ||
| this.#broadcast(conversationId, 'tool_stub_call', { persona: pId, ...event }); | ||
| }; | ||
| // Refresh credential before each turn. MUST precede getAgent(): in cloud | ||
| // path-vault fallback mode #fetchToken points the registry at the localhost | ||
| // proxy base URL, and getAgent() bakes that base URL into the runtime's | ||
| // model when it first builds the (cached) runtime. | ||
| const token = await this.#fetchToken(); | ||
| const { session, authStorage } = await this.#registry.getAgent(conversationId, pId, { onStubCall }); | ||
| // Wire events on first use (idempotent because pi de-duplicates subscribers). | ||
| this.#wireSessionEvents(conversationId, pId, session); | ||
| authStorage.setRuntimeApiKey('anthropic', token); | ||
| let promptContent = content; | ||
| if (isSinglePersona) { | ||
| // Pre-turn autoRope enrichment (Phase D): run any persona's autoRope rules | ||
| // before the caller's session sees the content. Keeps pm_gathering_context | ||
| // backward-compat; rope_start/rope_complete are emitted by RopeEngine. | ||
| promptContent = await this.#ropeEngine.autoDelegateIfNeeded({ | ||
| conversationId, | ||
| callerPersonaId: pId, | ||
| content, | ||
| fetchToken: () => this.#fetchToken(), | ||
| }); | ||
| } | ||
| session.prompt(promptContent).catch((err) => { | ||
| console.error(`[plan] prompt error ${conversationId}/${pId}:`, err.message); | ||
| this.#broadcast(conversationId, 'error', { | ||
| message: err.message, | ||
| code: err.code ?? 'PROMPT_ERROR', | ||
| }); | ||
| }); | ||
| })); | ||
| const turnId = randomUUID(); | ||
| return { turnId, persona: personasToDispatch[0] }; | ||
| } | ||
| /** | ||
| * Execute a handoff, switching the default active persona. | ||
| * | ||
| * @param {{ | ||
| * conversationId: string, | ||
| * toPersona: string, | ||
| * mode?: 'full' | 'distilled' | 'quoted', | ||
| * selectedTurnIds?: string[], | ||
| * }} params | ||
| * @returns {Promise<{ handoffId: string, forkNodeId: string | null, seededTurnCount: number }>} | ||
| */ | ||
| async handoff({ conversationId, toPersona, mode = 'full', selectedTurnIds = [] }) { | ||
| const row = this.#db | ||
| .prepare(`SELECT id FROM plan_conversations WHERE id = ?`) | ||
| .get(conversationId); | ||
| if (!row) { | ||
| const err = new Error('conversation not found'); | ||
| err.code = 'NOT_FOUND'; | ||
| throw err; | ||
| } | ||
| const fromPersona = this.getActivePersona(conversationId); | ||
| const onStubCall = (event) => { | ||
| this.#broadcast(conversationId, 'tool_stub_call', { persona: toPersona, ...event }); | ||
| }; | ||
| const result = await this.#handoffEngine.handoff({ | ||
| conversationId, | ||
| fromPersona, | ||
| toPersona, | ||
| mode, | ||
| selectedTurnIds, | ||
| fetchToken: () => this.#fetchToken(), | ||
| onStubCall, | ||
| }); | ||
| // Update the active persona for this conversation. | ||
| this.setActivePersona(conversationId, toPersona); | ||
| // Persist handoff marker so history replay can reconstruct it. | ||
| this.#db | ||
| .prepare( | ||
| `INSERT OR IGNORE INTO plan_turns | ||
| (id, conversation_id, role, content, from_persona, to_persona, mode, fork_node_id, created_at) | ||
| VALUES (?, ?, 'handoff', '', ?, ?, ?, ?, ?)`, | ||
| ) | ||
| .run(result.handoffId, conversationId, fromPersona, toPersona, mode, result.forkNodeId ?? null, Date.now()); | ||
| // Broadcast the handoff event to SSE clients. | ||
| this.#broadcast(conversationId, 'handoff', { | ||
| handoffId: result.handoffId, | ||
| fromPersona, | ||
| toPersona, | ||
| mode, | ||
| forkNodeId: result.forkNodeId, | ||
| }); | ||
| // Wire events for the new persona's session. | ||
| try { | ||
| const { session } = await this.#registry.getAgent(conversationId, toPersona, { onStubCall }); | ||
| this.#wireSessionEvents(conversationId, toPersona, session); | ||
| } catch { | ||
| // Best-effort — events will be wired on first turn if this fails. | ||
| } | ||
| return result; | ||
| } | ||
| /** | ||
| * Replay buffered in-flight SSE events to a reconnecting client. | ||
| * Call this before addEventSink so the client gets events it missed. | ||
| * No-op if no turn is active. | ||
| * | ||
| * @param {string} conversationId | ||
| * @param {import('node:http').ServerResponse} res | ||
| */ | ||
| drainReplayBuffer(conversationId, res) { | ||
| const buf = this.#activeTurns.get(conversationId); | ||
| if (!buf || buf.length === 0) return; | ||
| for (const { event, data } of buf) { | ||
| try { | ||
| res.write(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`); | ||
| } catch { /* client closed before drain completed */ } | ||
| } | ||
| } | ||
| /** | ||
| * Register an SSE sink for a conversation. Returns a cleanup function. | ||
| * @param {string} conversationId | ||
| * @param {import('node:http').ServerResponse} res | ||
| * @returns {() => void} | ||
| */ | ||
| addEventSink(conversationId, res) { | ||
| if (!this.#sinks.has(conversationId)) { | ||
| this.#sinks.set(conversationId, new Set()); | ||
| } | ||
| this.#sinks.get(conversationId).add(res); | ||
| return () => { | ||
| const s = this.#sinks.get(conversationId); | ||
| if (s) s.delete(res); | ||
| }; | ||
| } | ||
| /** | ||
| * Return the ordered turn list for a conversation (for history replay). | ||
| * Each turn is one of: | ||
| * { role:'user'|'assistant', content, persona?, created_at } | ||
| * { role:'handoff', from_persona, to_persona, mode, fork_node_id, created_at } | ||
| * @param {string} conversationId | ||
| * @returns {Array<object>} | ||
| */ | ||
| getTurns(conversationId) { | ||
| return this.#db | ||
| .prepare( | ||
| `SELECT id, role, content, persona, from_persona, to_persona, mode, fork_node_id, created_at | ||
| FROM plan_turns | ||
| WHERE conversation_id = ? | ||
| ORDER BY created_at ASC`, | ||
| ) | ||
| .all(conversationId); | ||
| } | ||
| /** Expose persona list for the /api/plan/personas endpoint. */ | ||
| listPersonas() { | ||
| return PERSONAS.map((p) => ({ | ||
| id: p.id, | ||
| displayName: p.displayName, | ||
| model: p.model, | ||
| toolNames: p.toolNames, | ||
| systemPromptPreview: p.systemPrompt.length > 120 | ||
| ? p.systemPrompt.slice(0, 117) + '...' | ||
| : p.systemPrompt, | ||
| })); | ||
| } | ||
| } |
| /** | ||
| * Plan progress parser — reads phase-*-tasks.md trackers to derive | ||
| * phase/task state for the inbox progress bar. | ||
| * | ||
| * @module plan-progress | ||
| */ | ||
| import { readdirSync, readFileSync, statSync } from 'node:fs'; | ||
| import path from 'node:path'; | ||
| const WORKING_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes | ||
/**
 * Parse simple key:value pairs from a YAML frontmatter block (---…---).
 * Handles single-line scalar values only — enough for feature/phase keys.
 * Works with both LF and CRLF line endings.
 *
 * @param {string} content
 * @returns {Record<string, string>} empty object when there is no frontmatter
 */
function parseFrontmatter(content) {
  const block = content.match(/^---\r?\n([\s\S]*?)\r?\n---/);
  if (!block) return {};
  const result = {};
  // Split on \r?\n: a bare '\n' split leaves a trailing '\r' on CRLF lines,
  // which `.` cannot match, so every non-final key would be dropped.
  for (const line of block[1].split(/\r?\n/)) {
    const pair = line.match(/^([\w-]+):\s*(.+)$/);
    if (pair) result[pair[1]] = pair[2].trim();
  }
  return result;
}
/**
 * Extract task definitions from the "## Task list" section.
 * Matches headings like:
 *   ### A0 — name
 *   ### B1 step 5 — multi-part name
 *
 * Only headings between "## Task list" and the next level-2 ("## ") heading
 * are considered, so look-alike "### X1" headings in later sections are not
 * counted as tasks.
 *
 * @param {string} content
 * @returns {Array<{id: string, name: string}>}
 */
function extractTaskDefs(content) {
  const sectionMatch = content.match(/^## Task list\s*\n([\s\S]*)/m);
  if (!sectionMatch) return [];
  const afterHeading = sectionMatch[1];
  // `^##\s` matches "## Foo" but not "### Foo" (third char is '#', not space).
  const nextSectionAt = afterHeading.search(/^##\s/m);
  const taskSection =
    nextSectionAt === -1 ? afterHeading : afterHeading.slice(0, nextSectionAt);
  const tasks = [];
  for (const [, id, rest] of taskSection.matchAll(/^###\s+([A-Z]\d+)\b([^\n]*)/gm)) {
    // Strip leading em-dash, double-hyphen, or plain hyphen separator;
    // fall back to the id when nothing is left.
    const name = rest.trim().replace(/^\s*[—\-]{1,2}\s*/, '').trim() || id;
    tasks.push({ id, name });
  }
  return tasks;
}
/**
 * Collect the task IDs recorded in the "CP0 log" HTML comment block.
 * A line counts when it starts with an ID followed by "(", e.g.
 *   A0 (2026-05-05): ...
 *   A2 (2026-05-05, rebase): ...
 *
 * @param {string} content
 * @returns {Set<string>} empty when the comment block is absent
 */
function extractCp0Completed(content) {
  const logBlock = content.match(/<!--\s*CP0 log[\s\S]*?-->/);
  if (!logBlock) return new Set();
  const hits = logBlock[0].matchAll(/^([A-Z]\d+)\s*\(/gm);
  return new Set(Array.from(hits, (hit) => hit[1]));
}
/**
 * Collect the IDs of rows marked "done" in an item-format Status table.
 * Example row:  | A1 | Tool loader index | done |
 *
 * @param {string} content
 * @returns {Set<string>} empty when there is no "## Status" section
 */
function extractItemTableCompleted(content) {
  // Deliberately no `m` flag on the section regex: `$` has to mean
  // end-of-string so the lazy quantifier spans the whole table instead of
  // stopping at the end of its first line.
  const section = /## Status\s*\n([\s\S]*?)(?=\n##\s|$)/.exec(content);
  if (section === null) return new Set();
  const doneRows = section[1].matchAll(/^\|\s*([A-Z]\d+)\s*\|[^|]+\|\s*done\s*\|/gim);
  return new Set(Array.from(doneRows, (row) => row[1]));
}
/**
 * Read the done count from a count-format Status table.
 * Uses the first row shaped like:  | done | 3 |
 *
 * @param {string} content
 * @returns {number|null} the count, or null when no such row exists
 */
function extractDoneCount(content) {
  const row = /\|\s*done\s*\|\s*(\d+)\s*\|/i.exec(content);
  if (row === null) return null;
  return parseInt(row[1], 10);
}
/**
 * Resolve the feature slug from a branch name or by scanning docs/plans/.
 *
 * Strategy:
 *   1. Strip "feat/" prefix, any nested path, and an optional "-phase-X"
 *      suffix from the branch.
 *   2. Exact match against plans subdirectory names.
 *   3. Prefix match (branch slug starts with a plan dir name). When several
 *      directories qualify, the longest name wins, so the result does not
 *      depend on directory listing order.
 *   4. Only when no branch is given: most-recently-modified plans dir that
 *      has phase trackers.
 *
 * @param {string} repoPath - path to the git checkout
 * @param {string|null} branch
 * @returns {string|null} null when docs/plans is unreadable or nothing matches
 */
function resolveFeatureSlug(repoPath, branch) {
  const plansDir = path.join(repoPath, 'docs', 'plans');
  let entries;
  try {
    entries = readdirSync(plansDir, { withFileTypes: true })
      .filter((d) => d.isDirectory())
      .map((d) => d.name);
  } catch {
    return null;
  }
  if (branch) {
    const slug = branch
      .replace(/^feat\//, '')
      .replace(/\/.*$/, '')
      .replace(/-phase-[a-z]$/, '');
    // Exact match
    if (entries.includes(slug)) return slug;
    // Prefix match: prefer the most specific (longest) directory name.
    let best = null;
    for (const dir of entries) {
      if (slug.startsWith(dir) && (best === null || dir.length > best.length)) {
        best = dir;
      }
    }
    // Branch provided but no name match — don't guess (best stays null).
    return best;
  }
  // No branch: fallback to most-recently-modified dir with phase tracker files
  let newest = null;
  let newestMtime = 0;
  for (const dir of entries) {
    const dirPath = path.join(plansDir, dir);
    try {
      const files = readdirSync(dirPath);
      if (!files.some((f) => /^phase-[a-z]-tasks\.md$/.test(f))) continue;
      const mtime = statSync(dirPath).mtimeMs;
      if (mtime > newestMtime) {
        newestMtime = mtime;
        newest = dir;
      }
    } catch {
      // skip unreadable entries
    }
  }
  return newest;
}
/**
 * Turn one phase tracker file into its phase/task state.
 *
 * Completion comes from the CP0 log and the item-format Status table; when a
 * count-format "done" row exists it takes precedence and the first N tasks
 * are treated as complete. The first incomplete task seen across all phases
 * claims the shared `state.workingMarked` flag and is reported as 'working'
 * when the world was recently active, otherwise 'pending'.
 *
 * @param {string} filePath
 * @param {boolean} isRecentlyActive - whether the world had recent activity
 * @param {{ workingMarked: boolean }} state - mutable flag shared across phases
 * @returns {{ id: string, name: string, status: string, tasks: Array }|null}
 *   null when the file cannot be read or defines no tasks
 */
function parseTrackerFile(filePath, isRecentlyActive, state) {
  let content;
  try {
    content = readFileSync(filePath, 'utf8');
  } catch {
    return null;
  }

  // Phase ID: frontmatter "phase" field, else the letter in "phase-X-tasks.md".
  const { phase: declaredPhase } = parseFrontmatter(content);
  const fileNamePhase = path.basename(filePath).match(/^phase-([a-z])-tasks\.md$/)?.[1];
  const phaseId = declaredPhase || fileNamePhase || '?';
  const phaseName = `Phase ${phaseId.toUpperCase()}`;

  const taskDefs = extractTaskDefs(content);
  if (taskDefs.length === 0) return null;

  const completedIds = new Set([
    ...extractCp0Completed(content),
    ...extractItemTableCompleted(content),
  ]);
  const doneCount = extractDoneCount(content);

  const tasks = [];
  taskDefs.forEach(({ id, name }, index) => {
    // Count format is authoritative when present.
    const done = doneCount === null ? completedIds.has(id) : index < doneCount;
    let status = 'complete';
    if (!done) {
      status = 'pending';
      if (!state.workingMarked) {
        // First pending task across all phases = candidate for "working".
        state.workingMarked = true;
        if (isRecentlyActive) status = 'working';
      }
    }
    tasks.push({ id, name, status });
  });

  let phaseStatus = 'pending';
  if (tasks.every((t) => t.status === 'complete')) {
    phaseStatus = 'complete';
  } else if (tasks.some((t) => t.status === 'working')) {
    phaseStatus = 'working';
  }
  return { id: phaseId, name: phaseName, status: phaseStatus, tasks };
}
/**
 * Read plan progress from a world's git checkout.
 *
 * Resolves the feature directory under docs/plans/, parses every
 * phase-X-tasks.md tracker in sorted filename order, and returns the phases
 * that parsed successfully.
 *
 * @param {string} repoPath - absolute path to the git checkout
 * @param {string|null} branch - current branch name (e.g. "feat/foo-phase-a")
 * @param {{ lastActivityAtMs?: number|null }} [opts]
 * @returns {{ feature: string, phases: Array }|null}
 *   null when no plan tracker is found (caller falls back to legacy bar)
 */
export function readPlanProgress(repoPath, branch, { lastActivityAtMs = null } = {}) {
  const feature = resolveFeatureSlug(repoPath, branch);
  if (!feature) return null;

  const featureDir = path.join(repoPath, 'docs', 'plans', feature);
  let trackerNames;
  try {
    trackerNames = readdirSync(featureDir)
      .filter((f) => /^phase-[a-z]-tasks\.md$/.test(f))
      .sort();
  } catch {
    return null;
  }
  if (trackerNames.length === 0) return null;

  // "Recently active" = last activity within the working threshold.
  const isRecentlyActive =
    lastActivityAtMs != null && Date.now() - lastActivityAtMs <= WORKING_THRESHOLD_MS;

  // Shared across phases so only one task in the whole plan can be "working".
  const state = { workingMarked: false };
  const phases = [];
  for (const name of trackerNames) {
    const phase = parseTrackerFile(path.join(featureDir, name), isRecentlyActive, state);
    if (phase) phases.push(phase);
  }
  return phases.length === 0 ? null : { feature, phases };
}
| // planning-sessions — host-cp surface for creating and inspecting in-flight | ||
| // planning sessions stored under world_id = PLANNING_WORLD_ID ('_planning'). | ||
| // | ||
| // Formalises what the plan-chat-spa dev substrate does ad hoc: | ||
| // | ||
| // createPlanningSession({ actorId, pool }) | ||
| // Seeds a session with one 'system' chunk so the Electric shape subscriber | ||
| // gets a non-empty initial response on its first long-poll cycle. Also | ||
| // INSERTs a row into the planning_sessions sidecar table inside the same | ||
| // transaction so no partial state can exist (chunk written, no metadata row). | ||
| // Returns the allocated world_id, session_id, and the inserted seed chunk. | ||
| // | ||
| // loadPlanningSession({ pool, sessionId }) | ||
| // Lightweight metadata read: chunk count, first/last timestamps, first | ||
| // operator content (for title derivation). SPA still streams live chunks | ||
| // via the existing /v1/shape proxy — this is metadata-only. | ||
| // | ||
| // recordPlanningSession({ pool, sessionId, actorId, summary }) | ||
| // UPSERT into planning_sessions. Used by createPlanningSession (wrapped in | ||
| // a transaction) and later to update the summary as the session evolves. | ||
| // | ||
| // setCrystallizeStatus({ pool, sessionId, status, worldId }) | ||
| // UPDATE planning_sessions.crystallize_status + crystallized_world_id. | ||
| // Throws if status is not in PLANNING_SESSION_STATUSES. | ||
| // | ||
| // listPlanningSessions({ pool, actorId, limit }) | ||
| // SELECT rows for actorId, ordered created_at DESC. Returns array. | ||
| // | ||
// None of these functions call validateChunkInput — that's for the public POST
// surface. INSERTs here are built directly against the chunks column list.
| // | ||
| // Pool errors surface loudly (never swallowed) so the caller sees the full | ||
| // pg error message and can diagnose connectivity or constraint failures. | ||
| import { randomUUID } from 'node:crypto'; | ||
| import { PLANNING_WORLD_ID, PLANNING_SESSION_STATUSES } from '@olam/chunks/schema'; | ||
/**
 * Insert-or-update the planning_sessions row for `sessionId`.
 *
 * A first call inserts the row. A later call for the same session_id rewrites
 * `summary`, bumps `updated_at`, and keeps an already-recorded
 * `linear_issue_id` (COALESCE), so an ordinary update that passes null never
 * clears a Linear link. Other columns are not touched by the update branch.
 *
 * @param {object} opts
 * @param {object} opts.pool - object exposing `.query(sql, params)`
 * @param {string} opts.sessionId
 * @param {string} opts.actorId
 * @param {string | null} [opts.summary]
 * @param {string | null} [opts.linearIssueId] — LinearAgent (handoff principle 6):
 *   the Linear issue this session is driven by; immutable once set.
 * @returns {Promise<void>}
 */
export async function recordPlanningSession({ pool, sessionId, actorId, summary = null, linearIssueId = null }) {
  const upsertSql =
    `INSERT INTO planning_sessions (session_id, actor_id, summary, linear_issue_id)
     VALUES ($1, $2, $3, $4)
     ON CONFLICT (session_id) DO UPDATE
       SET summary = EXCLUDED.summary,
           updated_at = NOW(),
           linear_issue_id = COALESCE(planning_sessions.linear_issue_id, EXCLUDED.linear_issue_id)`;
  const values = [sessionId, actorId, summary, linearIssueId];
  await pool.query(upsertSql, values);
}
/**
 * Look up the ACTIVE planning session for a Linear issue (handoff principle 6).
 * "Active" means `archived_at IS NULL`; archived rows are never matched. When
 * several active rows exist, the most recently created one is returned.
 *
 * @param {object} opts
 * @param {object} opts.pool - object exposing `.query(sql, params)`
 * @param {string | null | undefined} opts.linearIssueId
 * @returns {Promise<string | null>} the session_id, or null when no id was
 *   given or no active session exists (no query is issued without an id)
 */
export async function findActiveLinearSession({ pool, linearIssueId }) {
  if (!linearIssueId) return null;
  const result = await pool.query(
    `SELECT session_id FROM planning_sessions
      WHERE linear_issue_id = $1 AND archived_at IS NULL
      ORDER BY created_at DESC
      LIMIT 1`,
    [linearIssueId],
  );
  const newest = result.rows?.[0];
  return newest?.session_id ?? null;
}
/**
 * Archive every still-active planning session of a Linear issue (handoff
 * principle 6) by stamping `archived_at`, so a later dispatch for the same
 * issue starts fresh instead of resuming. Rows that are already archived are
 * left alone, which makes repeated calls harmless.
 *
 * @param {object} opts
 * @param {object} opts.pool - object exposing `.query(sql, params)`
 * @param {string | null | undefined} opts.linearIssueId
 * @returns {Promise<number>} number of sessions archived (0 without an id)
 */
export async function archiveLinearSession({ pool, linearIssueId }) {
  if (!linearIssueId) return 0;
  const outcome = await pool.query(
    `UPDATE planning_sessions
        SET archived_at = NOW(), updated_at = NOW()
      WHERE linear_issue_id = $1 AND archived_at IS NULL`,
    [linearIssueId],
  );
  const archived = outcome?.rowCount;
  return archived ?? 0;
}
/**
 * Create a new in-flight planning session under world_id = PLANNING_WORLD_ID.
 *
 * Allocates a fresh session_id and message_id (UUID v4) and then:
 *   1. INSERTs a seed chunk (actor_type='system', seq=0) so the Electric shape
 *      subscriber receives a non-empty initial long-poll response.
 *   2. INSERTs a planning_sessions sidecar row (via recordPlanningSession) so
 *      listPlanningSessions can return it immediately.
 *
 * When `pool.connect()` exists, both writes run on one client inside
 * BEGIN/COMMIT, so they succeed or roll back together. On failure the ORIGINAL
 * error is always the one rethrown: a ROLLBACK that itself fails is logged and
 * the client is destroyed instead of being returned to the pool, rather than
 * letting the rollback error mask the root cause.
 *
 * Pools without `.connect()` (test stubs) take the flat path: the two writes
 * run sequentially on `pool.query` with no transaction.
 *
 * @param {object} opts
 * @param {string} opts.actorId — actor_id to attribute the seed chunk to
 *   (typically 'system' or the host-cp service id)
 * @param {object} opts.pool — pg.Pool-compatible object with .query() and
 *   optionally .connect() for transactional clients.
 * @returns {Promise<{
 *   world_id: string,
 *   session_id: string,
 *   seed_chunk: {
 *     world_id: string, session_id: string, message_id: string, seq: number,
 *     actor_id: string, actor_type: string, role: string, chunk: string,
 *     chunk_type: string,
 *   },
 * }>}
 */
export async function createPlanningSession({ actorId, pool }) {
  const sessionId = randomUUID();
  const messageId = randomUUID();
  const seq = 0;
  const actorType = 'system';
  const role = 'system';
  const chunk = 'Planning session created.';
  const chunkType = 'text';

  const insertChunkSql =
    `INSERT INTO chunks
       (world_id, session_id, message_id, seq, actor_id, actor_type, role, chunk, chunk_type)
     VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`;
  const insertChunkParams = [
    PLANNING_WORLD_ID, sessionId, messageId, seq, actorId, actorType, role, chunk, chunkType,
  ];

  if (typeof pool.connect === 'function') {
    // Real pg.Pool: run both writes on one client inside a transaction.
    const client = await pool.connect();
    let rollbackFailed = false;
    try {
      await client.query('BEGIN');
      await client.query(insertChunkSql, insertChunkParams);
      await recordPlanningSession({ pool: client, sessionId, actorId, summary: null });
      await client.query('COMMIT');
    } catch (err) {
      try {
        await client.query('ROLLBACK');
      } catch (rollbackErr) {
        // Keep the root cause as the thrown error; just record the rollback failure.
        rollbackFailed = true;
        console.error('[planning-sessions] ROLLBACK failed:', rollbackErr.message);
      }
      throw err;
    } finally {
      if (rollbackFailed) {
        // The client may still be inside an aborted transaction — destroy it
        // instead of handing it back to the pool.
        client.release(true);
      } else {
        client.release();
      }
    }
  } else {
    // Flat path for test stubs: queries run sequentially on the stub pool.
    await pool.query(insertChunkSql, insertChunkParams);
    await recordPlanningSession({ pool, sessionId, actorId, summary: null });
  }

  return {
    world_id: PLANNING_WORLD_ID,
    session_id: sessionId,
    seed_chunk: {
      world_id: PLANNING_WORLD_ID,
      session_id: sessionId,
      message_id: messageId,
      seq,
      actor_id: actorId,
      actor_type: actorType,
      role,
      chunk,
      chunk_type: chunkType,
    },
  };
}
/**
 * Write crystallize_status and crystallized_world_id for a planning session
 * and bump updated_at. Note that crystallized_world_id is always written:
 * omitting `worldId` stores NULL.
 *
 * @param {object} opts
 * @param {object} opts.pool - object exposing `.query(sql, params)`
 * @param {string} opts.sessionId
 * @param {string} opts.status — must be in PLANNING_SESSION_STATUSES
 * @param {string | null} [opts.worldId] — required when status='crystallized'
 * @returns {Promise<void>}
 * @throws {Error} when `status` is not one of PLANNING_SESSION_STATUSES
 */
export async function setCrystallizeStatus({ pool, sessionId, status, worldId = null }) {
  const isKnownStatus = PLANNING_SESSION_STATUSES.includes(status);
  if (!isKnownStatus) {
    const allowed = PLANNING_SESSION_STATUSES.join(', ');
    throw new Error(`setCrystallizeStatus: invalid status "${status}"; must be one of ${allowed}`);
  }
  const updateSql =
    `UPDATE planning_sessions
        SET crystallize_status = $2,
            crystallized_world_id = $3,
            updated_at = NOW()
      WHERE session_id = $1`;
  await pool.query(updateSql, [sessionId, status, worldId]);
}
/**
 * Create a multi-turn DISPATCH session (multi-turn-cloud-sandbox-dispatch
 * Phase A2). Unlike createPlanningSession, the row is written with
 * session_type='dispatch' and the caller-supplied world_id, and no seed chunk
 * is inserted — dispatch sessions accumulate chunks from the agent runtime.
 *
 * The INSERT uses ON CONFLICT (session_id) DO NOTHING, so passing the id of an
 * existing row leaves that row untouched and still returns the id. This covers
 * the race where two callers try to pre-create the same session (A6,
 * Decision 9 always-on threading).
 *
 * @param {object} opts
 * @param {object} opts.pool - object exposing `.query(sql, params)`
 * @param {string} opts.actorId - required, non-empty string
 * @param {string} opts.worldId — operator-supplied; identifies the dispatch
 *   target world (NOT the '_planning' sentinel used by createPlanningSession).
 * @param {number | null} [opts.budgetUsdCap=null] — per-session budget cap;
 *   null = uncapped. Recorded exactly as supplied.
 * @param {boolean} [opts.allowUnpricedModels=false] — opt session into the
 *   pricingForModel-returns-null fallback.
 * @param {string | null} [opts.sessionId=null] — session id to use; a UUID is
 *   generated when omitted or nullish.
 * @returns {Promise<{ session_id: string }>}
 * @throws {Error} when actorId or worldId is missing or not a string
 */
export async function createDispatchSession({
  pool,
  actorId,
  worldId,
  budgetUsdCap = null,
  allowUnpricedModels = false,
  sessionId: providedSessionId = null,
}) {
  const isNonEmptyString = (value) => Boolean(value) && typeof value === 'string';
  if (!isNonEmptyString(actorId)) {
    throw new Error('createDispatchSession: actorId required');
  }
  if (!isNonEmptyString(worldId)) {
    throw new Error('createDispatchSession: worldId required');
  }

  const session_id = providedSessionId ?? randomUUID();
  const insertSql =
    `INSERT INTO planning_sessions
       (session_id, actor_id, session_type, world_id, budget_usd_cap, allow_unpriced_models)
     VALUES ($1, $2, 'dispatch', $3, $4, $5)
     ON CONFLICT (session_id) DO NOTHING`;
  await pool.query(insertSql, [session_id, actorId, worldId, budgetUsdCap, allowUnpricedModels]);
  return { session_id };
}
/**
 * Try to claim the in-flight turn lock of a dispatch session
 * (multi-turn-cloud-sandbox-dispatch Phase A3 — Decision 4 + T5 mitigation).
 *
 * Test-and-set in a single statement: the UPDATE only touches the row when
 * in_flight_turn_id IS NULL and RETURNs it. Of two concurrent attempts, the
 * one whose UPDATE returns a row holds the lock; the other gets an empty
 * result and the caller is expected to answer 409.
 *
 * @param {object} opts
 * @param {object} opts.pool - object exposing `.query(sql, params)`
 * @param {string} opts.sessionId
 * @param {string} opts.turnId — operator-or-server-generated turn UUID
 * @returns {Promise<boolean>} true if lock claimed, false if already held
 *   (or no matching dispatch session exists)
 */
export async function claimDispatchTurnLock({ pool, sessionId, turnId }) {
  const claimSql =
    `UPDATE planning_sessions
        SET in_flight_turn_id = $1,
            in_flight_turn_started_at = NOW(),
            last_turn_at = NOW()
      WHERE session_id = $2
        AND session_type = 'dispatch'
        AND in_flight_turn_id IS NULL
      RETURNING session_id`;
  const { rows } = await pool.query(claimSql, [turnId, sessionId]);
  return Boolean(rows?.length);
}
/**
 * Release the in-flight turn lock of a dispatch session by nulling
 * in_flight_turn_id and in_flight_turn_started_at. Intended to run after a
 * dispatch finishes, whether it succeeded or failed.
 *
 * @param {object} opts
 * @param {object} opts.pool - object exposing `.query(sql, params)`
 * @param {string} opts.sessionId
 * @returns {Promise<void>}
 */
export async function clearDispatchTurnLock({ pool, sessionId }) {
  const releaseSql =
    `UPDATE planning_sessions
        SET in_flight_turn_id = NULL,
            in_flight_turn_started_at = NULL
      WHERE session_id = $1
        AND session_type = 'dispatch'`;
  await pool.query(releaseSql, [sessionId]);
}
/**
 * Halt a dispatch session — the operator-driven "block next turn" state (T13).
 *
 * Stamps halted_at with NOW() and drops the in-flight turn lock in the same
 * statement. Until reactivateDispatchSession clears halted_at, further
 * /v1/dispatch-turn calls return 409 'session_halted'. A container that is
 * already running is NOT stopped; it finishes its current turn naturally,
 * which is why the UX wording is "Block next turn" rather than "Stop"
 * (Plan A Phase C C6).
 *
 * The update is scoped by actor_id for ownership isolation.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing `.query(sql, params)`
 * @param {string} opts.sessionId
 * @param {string} opts.actorId
 * @returns {Promise<boolean>} true if a session row was updated; false if
 *   the session_id was not found / not owned by actorId.
 */
export async function haltDispatchSession({ pool, sessionId, actorId }) {
  const haltSql = `UPDATE planning_sessions
        SET halted_at = NOW(),
            in_flight_turn_id = NULL,
            in_flight_turn_started_at = NULL
      WHERE session_id = $1
        AND session_type = 'dispatch'
        AND actor_id = $2
    RETURNING session_id`;
  const outcome = await pool.query(haltSql, [sessionId, actorId]);
  const touchedRows = outcome.rows?.length ?? 0;
  return touchedRows > 0;
}
/**
 * Lift a halt from a dispatch session by resetting halted_at to NULL, so
 * later /v1/dispatch-turn calls may claim the lock again. Idempotent:
 * clearing an already-null halted_at changes nothing.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing `.query(sql, params)`
 * @param {string} opts.sessionId
 * @param {string} opts.actorId
 * @returns {Promise<boolean>} true if a session row was updated; false if
 *   the session_id was not found / not owned by actorId.
 */
export async function reactivateDispatchSession({ pool, sessionId, actorId }) {
  const reactivateSql = `UPDATE planning_sessions
        SET halted_at = NULL
      WHERE session_id = $1
        AND session_type = 'dispatch'
        AND actor_id = $2
    RETURNING session_id`;
  const outcome = await pool.query(reactivateSql, [sessionId, actorId]);
  const touchedRows = outcome.rows?.length ?? 0;
  return touchedRows > 0;
}
/**
 * Fetch one dispatch session, restricted to rows owned by the caller's
 * actor_id (ownership check). Yields the metadata needed for the budget check
 * and the plan-DO forward, or null when no owned row matches.
 *
 * Numeric columns are coerced with Number(); a missing total_usd becomes 0 and
 * a missing budget_usd_cap stays null.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing `.query(sql, params)`
 * @param {string} opts.sessionId
 * @param {string} opts.actorId
 * @returns {Promise<null | {
 *   session_id: string,
 *   world_id: string | null,
 *   actor_id: string,
 *   total_usd: number,
 *   budget_usd_cap: number | null,
 *   allow_unpriced_models: boolean,
 *   halted_at: string | null,
 * }>}
 */
export async function getDispatchSession({ pool, sessionId, actorId }) {
  const lookupSql = `SELECT session_id, world_id, actor_id,
            total_usd, budget_usd_cap, allow_unpriced_models,
            halted_at
       FROM planning_sessions
      WHERE session_id = $1
        AND session_type = 'dispatch'
        AND actor_id = $2`;
  const outcome = await pool.query(lookupSql, [sessionId, actorId]);
  const record = outcome.rows?.[0];
  if (!record) {
    return null;
  }
  const rawCap = record.budget_usd_cap;
  return {
    session_id: record.session_id,
    world_id: record.world_id ?? null,
    actor_id: record.actor_id,
    total_usd: Number(record.total_usd ?? 0),
    // `== null` matches both null and undefined: "no cap" stays null.
    budget_usd_cap: rawCap == null ? null : Number(rawCap),
    allow_unpriced_models: Boolean(record.allow_unpriced_models),
    halted_at: record.halted_at ?? null,
  };
}
/**
 * Return the planning sessions that belong to one actor, newest first
 * (created_at DESC), capped at `limit` rows. The rows come back exactly as the
 * pool produced them.
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing `.query(sql, params)`
 * @param {string} opts.actorId
 * @param {number} [opts.limit=50]
 * @returns {Promise<Array<{
 *   session_id: string,
 *   summary: string | null,
 *   crystallize_status: string,
 *   crystallized_world_id: string | null,
 *   created_at: string,
 *   updated_at: string,
 * }>>}
 */
export async function listPlanningSessions({ pool, actorId, limit = 50 }) {
  const listSql = `SELECT session_id, summary, crystallize_status, crystallized_world_id,
            created_at, updated_at
       FROM planning_sessions
      WHERE actor_id = $1
      ORDER BY created_at DESC
      LIMIT $2`;
  const { rows } = await pool.query(listSql, [actorId, limit]);
  return rows;
}
/**
 * Return an actor's multi-turn DISPATCH sessions, most recently active first
 * (last_turn_at DESC, NULLs last, then created_at DESC). Archived sessions
 * are left out.
 *
 * Unlike listPlanningSessions, only `session_type='dispatch'` rows are
 * selected, and the projection carries the multi-turn columns (total_usd,
 * in_flight_turn_id, halted_at, etc.) rendered by the SPA's SessionsListView
 * (Phase C C3).
 *
 * @param {object} opts
 * @param {object} opts.pool — object exposing `.query(sql, params)`
 * @param {string} opts.actorId
 * @param {number} [opts.limit=50]
 * @returns {Promise<Array<{
 *   session_id: string,
 *   world_id: string | null,
 *   total_usd: string,
 *   budget_usd_cap: string | null,
 *   in_flight_turn_id: string | null,
 *   halted_at: string | null,
 *   last_turn_at: string | null,
 *   created_at: string,
 *   summary: string | null,
 * }>>}
 */
export async function listDispatchSessions({ pool, actorId, limit = 50 }) {
  const listSql = `SELECT session_id, world_id,
            total_usd, budget_usd_cap,
            in_flight_turn_id, halted_at,
            last_turn_at, created_at,
            summary
       FROM planning_sessions
      WHERE actor_id = $1
        AND session_type = 'dispatch'
        AND archived_at IS NULL
      ORDER BY last_turn_at DESC NULLS LAST, created_at DESC
      LIMIT $2`;
  const { rows } = await pool.query(listSql, [actorId, limit]);
  return rows;
}
/**
 * Gather lightweight metadata about an existing in-flight planning session.
 *
 * Up to two queries run, both scoped to world_id=PLANNING_WORLD_ID AND
 * session_id=<sessionId>:
 *   1. Aggregate: chunk_count, first_chunk_at, last_chunk_at.
 *   2. First operator content: the earliest chunk with actor_type='operator',
 *      which the SPA uses to derive a session title. This query is skipped
 *      when the aggregate reports no chunks.
 *
 * Live chunks reach the SPA through the existing /v1/shape proxy; this
 * function is metadata-only and does NOT subscribe to any Electric shape.
 *
 * @param {object} opts
 * @param {object} opts.pool — pg.Pool-compatible object with a .query(sql, params) method
 * @param {string} opts.sessionId — UUID of the planning session to inspect
 * @returns {Promise<{
 *   world_id: string,
 *   session_id: string,
 *   exists: boolean,
 *   chunk_count: number,
 *   first_chunk_at: string | null,
 *   last_chunk_at: string | null,
 *   first_operator_content: string | null,
 * }>}
 */
export async function loadPlanningSession({ pool, sessionId }) {
  const summary = await pool.query(
    `SELECT COUNT(*) AS chunk_count,
            MIN(created_at) AS first_chunk_at,
            MAX(created_at) AS last_chunk_at
       FROM chunks
      WHERE world_id = $1 AND session_id = $2`,
    [PLANNING_WORLD_ID, sessionId],
  );
  const stats = summary.rows[0];
  const chunkCount = Number(stats.chunk_count);
  const exists = chunkCount > 0;

  if (!exists) {
    // Nothing recorded for this session: skip the second query entirely.
    return {
      world_id: PLANNING_WORLD_ID,
      session_id: sessionId,
      exists,
      chunk_count: chunkCount,
      first_chunk_at: null,
      last_chunk_at: null,
      first_operator_content: null,
    };
  }

  const firstOperator = await pool.query(
    `SELECT chunk
       FROM chunks
      WHERE world_id = $1 AND session_id = $2 AND actor_type = 'operator'
      ORDER BY created_at ASC
      LIMIT 1`,
    [PLANNING_WORLD_ID, sessionId],
  );
  const firstOperatorContent =
    firstOperator.rows.length > 0 ? firstOperator.rows[0].chunk : null;

  return {
    world_id: PLANNING_WORLD_ID,
    session_id: sessionId,
    exists,
    chunk_count: chunkCount,
    first_chunk_at: stats.first_chunk_at,
    last_chunk_at: stats.last_chunk_at,
    first_operator_content: firstOperatorContent,
  };
}
| /** | ||
| * port-bridge-manager.mjs | ||
| * Manages socat sidecar containers that bridge host port → world devbox port. | ||
| * Dual-mode: Docker HTTP API (container) vs docker CLI (bare-node). | ||
| */ | ||
| import { spawnSync } from 'node:child_process'; | ||
| import fs from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
// Docker endpoint from the environment. The sentinel 'docker-cli' (used when
// DOCKER_HOST is unset) makes dockerApiBase() return null, which selects the
// docker CLI code paths instead of the HTTP API.
const DOCKER_HOST = process.env.DOCKER_HOST ?? 'docker-cli';
// Image used when creating bridge containers (untagged name).
const SOCAT_IMAGE = 'alpine/socat';
// Fully tagged reference used by the CLI `docker pull` fallback.
const SOCAT_IMAGE_TAGGED = 'alpine/socat:latest';
// Inclusive host-port range scanned by allocateHostPort().
const HOST_PORT_MIN = 25000;
const HOST_PORT_MAX = 25999;
// Container ports that exposePort() refuses to bridge.
const INFRA_PORTS = new Set([8080, 7681, 7682]);
// Location of the persisted bridge registry (JSON). Reassigned by configure().
let BRIDGES_PATH =
  process.env.OLAM_PORT_BRIDGES_PATH ??
  path.join(os.homedir(), '.olam', 'port-bridges.json');
// Host IP used for port bindings and for the URLs handed back to callers.
// Reassigned by configure().
let HOST_IP = '127.0.0.1';
// key: `${worldId}:${containerPort}` → { worldId, containerPort, hostPort, containerId, containerName }
const registry = new Map();
/**
 * Override module-level settings at runtime.
 *
 * When `bridgesPath` is given and differs from the current path, the path is
 * switched and loadState() runs against the new file. Entries already held in
 * the in-memory registry are kept; loaded entries are added on top of them
 * (same keys are overwritten).
 *
 * Calling `configure()` with no argument is a no-op rather than a
 * destructuring TypeError.
 *
 * @param {object} [opts]
 * @param {string} [opts.bridgesPath] — new location of the persisted registry
 * @param {string} [opts.hostIp] — host IP for port bindings and returned URLs
 * @returns {void}
 */
export function configure({ bridgesPath, hostIp } = {}) {
  if (bridgesPath && bridgesPath !== BRIDGES_PATH) {
    BRIDGES_PATH = bridgesPath;
    loadState();
  }
  if (hostIp) {
    HOST_IP = hostIp;
  }
}
/**
 * Build the registry key for a bridge: `<worldId>:<containerPort>`.
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @returns {string}
 */
function bridgeKey(worldId, containerPort) {
  const key = `${worldId}:${containerPort}`;
  return key;
}
/**
 * Build the Docker container name of a bridge:
 * `olam-<worldId>-bridge-<containerPort>`.
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @returns {string}
 */
function bridgeContainerName(worldId, containerPort) {
  const name = `olam-${worldId}-bridge-${containerPort}`;
  return name;
}
/**
 * Read the persisted bridge registry from BRIDGES_PATH and copy every
 * top-level entry into the in-memory `registry` map.
 *
 * A missing file, or a JSON value that is not an object, leaves the registry
 * untouched. Any read/parse error is logged to stderr and swallowed.
 *
 * @returns {void}
 */
function loadState() {
  try {
    if (!fs.existsSync(BRIDGES_PATH)) {
      return;
    }
    const parsed = JSON.parse(fs.readFileSync(BRIDGES_PATH, 'utf-8'));
    const isObjectLike = parsed !== null && typeof parsed === 'object';
    if (!isObjectLike) {
      return;
    }
    Object.entries(parsed).forEach(([key, entry]) => {
      registry.set(key, entry);
    });
  } catch (err) {
    console.error(`port-bridge-manager: loadState failed: ${err.message}`);
  }
}
/**
 * Persist the in-memory registry to BRIDGES_PATH as pretty-printed JSON.
 *
 * The parent directory is created when needed. The data is written to a
 * sibling temp file (suffixed with pid and timestamp) and then renamed over
 * the target, so the target is replaced in a single rename step. Failures are
 * logged to stderr and swallowed.
 *
 * @returns {void}
 */
function saveState() {
  try {
    fs.mkdirSync(path.dirname(BRIDGES_PATH), { recursive: true });
    const snapshot = {};
    registry.forEach((entry, key) => {
      snapshot[key] = entry;
    });
    const scratchPath = `${BRIDGES_PATH}.tmp-${process.pid}-${Date.now()}`;
    const serialized = JSON.stringify(snapshot, null, 2);
    fs.writeFileSync(scratchPath, serialized, 'utf-8');
    fs.renameSync(scratchPath, BRIDGES_PATH);
  } catch (err) {
    console.error(`port-bridge-manager: saveState failed: ${err.message}`);
  }
}
/**
 * Pick the lowest port in [HOST_PORT_MIN, HOST_PORT_MAX] that no registry
 * entry currently uses as its hostPort. Only the registry is consulted.
 *
 * @returns {number | null} the free port, or null when the range is exhausted
 */
function allocateHostPort() {
  const taken = new Set();
  for (const entry of registry.values()) {
    taken.add(entry.hostPort);
  }
  let candidate = HOST_PORT_MIN;
  while (candidate <= HOST_PORT_MAX) {
    if (!taken.has(candidate)) {
      return candidate;
    }
    candidate += 1;
  }
  return null;
}
/**
 * Resolve the base URL of the Docker HTTP API.
 *
 * @returns {Promise<string | null>} null in bare-node mode (DOCKER_HOST is
 *   the 'docker-cli' sentinel, so there is no socket-proxy HTTP API);
 *   otherwise DOCKER_HOST with a leading `tcp://` rewritten to `http://`.
 */
async function dockerApiBase() {
  if (DOCKER_HOST === 'docker-cli') {
    // bare-node: no socket proxy HTTP API
    return null;
  }
  return DOCKER_HOST.replace(/^tcp:\/\//, 'http://');
}
/**
 * Decide whether a docker error message means the image is absent, in which
 * case retrying after a `docker pull` can help. The pattern covers the
 * phrasings Docker uses across its CLI and HTTP API surfaces.
 *
 * @param {string} message — error text to inspect
 * @returns {boolean} false for an empty/missing message
 */
function isImageMissingError(message) {
  const missingImagePattern =
    /Unable to find image|pull access denied|manifest unknown|No such image|not found in (the )?(repository|registry)/i;
  return Boolean(message) && missingImagePattern.test(message);
}
/**
 * Pull alpine/socat:latest via docker CLI. Used by the bare-node bridge
 * create path's fallback retry. 60s budget — image is ~5MB; real pull
 * is typically <2s.
 *
 * The reported `stderr` is the process's stderr, falling back to stdout. When
 * both are empty — which is what happens when the process could not be
 * spawned at all (docker binary missing) or was killed by the timeout — the
 * message of the spawn error is reported instead, so callers do not end up
 * with an empty diagnostic.
 *
 * @returns {{ok: boolean, stderr: string}}
 */
function pullSocatViaCli() {
  const r = spawnSync('docker', ['pull', SOCAT_IMAGE_TAGGED], {
    encoding: 'utf-8',
    timeout: 60_000,
  });
  const output = (r.stderr ?? '').trim() || (r.stdout ?? '').trim();
  return {
    ok: r.status === 0,
    // spawnSync sets `error` when the child could not be started or timed out.
    stderr: output || (r.error?.message ?? ''),
  };
}
/**
 * Scan a drained image-pull progress stream for an error record.
 *
 * The stream is treated as newline-delimited JSON; lines that are blank or
 * not valid JSON are ignored.
 *
 * @param {string} progressBody — full response body of the pull request
 * @returns {string | null} the first error message found, or null
 */
function findPullStreamError(progressBody) {
  for (const line of progressBody.split('\n')) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    let record;
    try {
      record = JSON.parse(trimmed);
    } catch {
      continue;
    }
    const detail = record?.error ?? record?.errorDetail?.message;
    if (detail) return String(detail);
  }
  return null;
}
/**
 * Pull alpine/socat:latest via Docker HTTP API. Used by the container-mode
 * bridge create path's fallback retry. Streams the pull progress body so
 * Docker actually performs the pull (it's a streaming endpoint).
 *
 * Because the endpoint answers 200 before the pull has finished, a failure
 * part-way through is reported as an error record inside the progress stream
 * rather than through the HTTP status. The drained stream is therefore
 * inspected, and an error record makes the result `ok: false`.
 *
 * @param {string} apiBase — Docker HTTP API base URL
 * @returns {Promise<{ok: boolean, stderr: string}>} never rejects; transport
 *   errors and timeouts (60s) are reported through `stderr`.
 */
async function pullSocatViaHttpApi(apiBase) {
  try {
    const resp = await fetch(
      `${apiBase}/images/create?fromImage=${encodeURIComponent(SOCAT_IMAGE)}&tag=latest`,
      { method: 'POST', signal: AbortSignal.timeout(60_000) },
    );
    if (!resp.ok) {
      const body = await resp.text().catch(() => '');
      return { ok: false, stderr: `pull failed: ${resp.status} ${body}` };
    }
    // Drain the streaming progress body — Docker only completes the pull
    // when the response is consumed.
    const progress = await resp.text();
    const streamError = findPullStreamError(progress);
    if (streamError) {
      return { ok: false, stderr: `pull failed: ${streamError}` };
    }
    return { ok: true, stderr: '' };
  } catch (err) {
    return { ok: false, stderr: err?.message ?? String(err) };
  }
}
/**
 * Create and start a socat bridge container.
 *
 * Returns `{ containerId, pulledImage }` — `pulledImage: true` indicates the
 * function had to fall back to `docker pull alpine/socat:latest` (issue #964
 * — preflight in `olam services up` should normally have already pulled it).
 *
 * Two modes, selected by dockerApiBase():
 *   - bare-node (no API base): a single `docker run -d` through the CLI.
 *   - container mode: create + start through the Docker HTTP API. When the
 *     create call answers 409 because a container with the same name already
 *     exists, that container's id is looked up and it is still started, so a
 *     bridge that was created earlier but is not running does not get
 *     reported as live. Starting an already-running container answers 304,
 *     which is accepted.
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @param {number} hostPort
 * @returns {Promise<{containerId: string, pulledImage: boolean}>}
 * @throws {Error} when the image is missing and cannot be pulled, or when the
 *   container cannot be created or started.
 */
async function createBridgeContainer(worldId, containerPort, hostPort) {
  const name = bridgeContainerName(worldId, containerPort);
  const networkName = `olam-${worldId}`;
  const devboxName = `olam-${worldId}-devbox`;
  // socat argv, shared by both modes: listen on containerPort inside the
  // bridge container and forward each connection to the world's devbox.
  const socatArgs = [
    `TCP-LISTEN:${containerPort},fork,reuseaddr`,
    `TCP:${devboxName}:${containerPort}`,
  ];
  const apiBase = await dockerApiBase();

  if (!apiBase) {
    // bare-node: use docker CLI
    const args = [
      'run', '-d',
      '--name', name,
      '--network', networkName,
      '-p', `${HOST_IP}:${hostPort}:${containerPort}`,
      '--label', `olam.world.id=${worldId}`,
      '--label', 'olam.role=server-bridge',
      '--restart', 'unless-stopped',
      SOCAT_IMAGE,
      ...socatArgs,
    ];
    const runDocker = () => spawnSync('docker', args, { encoding: 'utf-8', timeout: 10000 });
    let result = runDocker();
    let pulledImage = false;
    // Issue #964 fallback: if docker run failed because the image is missing,
    // pull it and retry once. This covers hosts where `olam services up`
    // didn't run the preflight (e.g. fresh Hazel install, docker restart
    // pruned the image, etc.).
    if (result.status !== 0 && isImageMissingError(result.stderr ?? '')) {
      const pull = pullSocatViaCli();
      if (!pull.ok) {
        throw new Error(
          `alpine/socat image missing and pull failed: ${pull.stderr || 'unknown error'}`,
        );
      }
      pulledImage = true;
      result = runDocker();
    }
    if (result.status !== 0) {
      // `result.error` is set when docker could not be spawned or timed out;
      // stderr is empty in that case.
      throw new Error(result.stderr?.trim() || result.error?.message || 'docker run failed');
    }
    return { containerId: result.stdout.trim(), pulledImage };
  }

  // container mode: Docker HTTP API
  const createBody = {
    Image: SOCAT_IMAGE,
    Cmd: socatArgs,
    Labels: {
      'olam.world.id': worldId,
      'olam.role': 'server-bridge',
    },
    HostConfig: {
      NetworkMode: networkName,
      PortBindings: {
        [`${containerPort}/tcp`]: [{ HostIp: HOST_IP, HostPort: String(hostPort) }],
      },
      RestartPolicy: { Name: 'unless-stopped' },
    },
  };
  const doCreate = () => fetch(
    `${apiBase}/containers/create?name=${encodeURIComponent(name)}`,
    {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(createBody),
      signal: AbortSignal.timeout(10000),
    },
  );
  let createResp = await doCreate();
  let pulledImage = false;
  // Issue #964 fallback for HTTP API path. Docker returns 404 with a body
  // like {"message":"No such image: alpine/socat:latest"} when the image
  // is missing.
  if (createResp.status === 404) {
    const body = await createResp.text().catch(() => '');
    if (!isImageMissingError(body)) {
      throw new Error(`container create failed: 404 ${body}`);
    }
    const pull = await pullSocatViaHttpApi(apiBase);
    if (!pull.ok) {
      throw new Error(
        `alpine/socat image missing and pull failed: ${pull.stderr || 'unknown error'}`,
      );
    }
    pulledImage = true;
    createResp = await doCreate();
  }

  let containerId;
  if (createResp.ok) {
    ({ Id: containerId } = await createResp.json());
  } else {
    const body = await createResp.text().catch(() => '');
    // If container already exists (409), try to get its ID
    if (createResp.status === 409) {
      const inspectResp = await fetch(
        `${apiBase}/containers/${encodeURIComponent(name)}/json`,
        { signal: AbortSignal.timeout(3000) },
      );
      if (inspectResp.ok) {
        const info = await inspectResp.json();
        containerId = info.Id;
      }
    }
    if (!containerId) {
      throw new Error(`container create failed: ${createResp.status} ${body}`);
    }
  }

  // Start both freshly created and pre-existing containers; 304 means the
  // container was already running.
  const startResp = await fetch(`${apiBase}/containers/${encodeURIComponent(containerId)}/start`, {
    method: 'POST',
    signal: AbortSignal.timeout(5000),
  });
  if (!startResp.ok && startResp.status !== 304) {
    throw new Error(`container start failed: ${startResp.status}`);
  }
  return { containerId, pulledImage };
}
/**
 * Force-remove a bridge container, addressed by its id when known and by its
 * name otherwise.
 *
 * In bare-node mode this shells out to `docker rm -f` and ignores the result.
 * In container mode it issues a forced DELETE against the Docker HTTP API; a
 * rejected request is swallowed (best-effort).
 *
 * @param {string} containerName
 * @param {string} [containerId]
 * @returns {Promise<void>}
 */
async function removeBridgeContainer(containerName, containerId) {
  const target = containerId || containerName;
  const apiBase = await dockerApiBase();
  if (apiBase) {
    // Force remove (stop + delete in one call)
    const url = `${apiBase}/containers/${encodeURIComponent(target)}?force=true`;
    const init = { method: 'DELETE', signal: AbortSignal.timeout(5000) };
    await fetch(url, init).catch(() => { /* best-effort */ });
    return;
  }
  spawnSync('docker', ['rm', '-f', target], { encoding: 'utf-8', timeout: 5000 });
}
/**
 * Expose a world's container port through a socat bridge.
 *
 * Idempotent per (worldId, containerPort): when the registry already holds a
 * bridge for that pair, its details are returned and nothing is created.
 * Otherwise a host port is allocated, the bridge container is created, and
 * the new entry is recorded and persisted.
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @returns {Promise<{hostPort: number, containerPort: number, url: string, containerId: string, pulledImage?: boolean}>}
 * @throws {Error} when the port is reserved for infrastructure, when no host
 *   port is free, or when the bridge container cannot be created.
 */
export async function exposePort(worldId, containerPort) {
  if (INFRA_PORTS.has(containerPort)) {
    throw new Error(`port ${containerPort} is reserved for infrastructure`);
  }

  const key = bridgeKey(worldId, containerPort);
  const known = registry.get(key);
  if (known) {
    return {
      hostPort: known.hostPort,
      containerPort: known.containerPort,
      url: `http://${HOST_IP}:${known.hostPort}`,
      containerId: known.containerId,
    };
  }

  const hostPort = allocateHostPort();
  if (hostPort === null) {
    throw new Error('no host ports available in range 25000–25999');
  }

  const containerName = bridgeContainerName(worldId, containerPort);
  const created = await createBridgeContainer(worldId, containerPort, hostPort);
  const { containerId } = created;
  registry.set(key, { worldId, containerPort, hostPort, containerId, containerName });
  saveState();

  const bridge = {
    hostPort,
    containerPort,
    url: `http://${HOST_IP}:${hostPort}`,
    containerId,
  };
  // Only attach pulledImage when true so existing callers/tests don't see
  // an unexpected key when the preflight succeeded.
  return created.pulledImage ? { ...bridge, pulledImage: true } : bridge;
}
/**
 * Tear down the bridge for one (worldId, containerPort) pair.
 *
 * Does nothing when no bridge is registered. Otherwise the registry entry is
 * dropped and persisted first, and the container is removed afterwards.
 *
 * @param {string} worldId
 * @param {number} containerPort
 * @returns {Promise<void>}
 */
export async function removePort(worldId, containerPort) {
  const key = bridgeKey(worldId, containerPort);
  const bridge = registry.get(key);
  if (!bridge) {
    return;
  }
  const { containerName, containerId } = bridge;
  registry.delete(key);
  saveState();
  await removeBridgeContainer(containerName, containerId);
}
/**
 * Remove every bridge that belongs to a world. Called on world destroy.
 *
 * Matching entries are collected first, then removed one at a time: the
 * registry entry is deleted and the container removal is awaited, with a
 * rejected removal ignored. The registry is persisted once at the end, and
 * only when at least one bridge was removed.
 *
 * @param {string} worldId
 * @returns {Promise<void>}
 */
export async function killWorld(worldId) {
  const doomed = [...registry].filter(([, entry]) => entry.worldId === worldId);
  for (const [key, entry] of doomed) {
    registry.delete(key);
    await removeBridgeContainer(entry.containerName, entry.containerId).catch(() => {});
  }
  if (doomed.length > 0) {
    saveState();
  }
}
/**
 * List the bridges currently registered for a world, in registry order.
 * URLs are built from the current HOST_IP.
 *
 * @param {string} worldId
 * @returns {Array<{containerPort: number, hostPort: number, url: string}>}
 */
export function getWorldBridges(worldId) {
  return Array.from(registry.values())
    .filter((entry) => entry.worldId === worldId)
    .map((entry) => ({
      containerPort: entry.containerPort,
      hostPort: entry.hostPort,
      url: `http://${HOST_IP}:${entry.hostPort}`,
    }));
}
// Populate the in-memory registry from BRIDGES_PATH once, at module load.
loadState();
| /** | ||
| * In-memory cache for GitHub PR data with TTL and concurrent-fetch coalescing. | ||
| * | ||
| * @module pr-cache | ||
| */ | ||
// Base URL for all GitHub REST requests made by this module.
const GH_API_BASE = 'https://api.github.com';
// How long a fetched PR entry is served from cache before it is refetched (30 s).
const TTL_MS = 30_000;
/**
 * Extract owner, repo and PR number from a GitHub pull-request URL.
 *
 * @param {string} prUrl e.g. https://github.com/owner/repo/pull/123
 * @returns {{ owner: string, repo: string, number: number } | null} null when
 *   the text contains no `github.com/<owner>/<repo>/pull/<digits>` segment
 */
function parsePrUrl(prUrl) {
  const match = /github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/.exec(prUrl);
  if (match === null) {
    return null;
  }
  const [, owner, repo, rawNumber] = match;
  return { owner, repo, number: Number.parseInt(rawNumber, 10) };
}
/**
 * Collapse a list of check runs into one overall status.
 *
 * Precedence: any failing run makes the result 'failing'; otherwise any run
 * that has not concluded makes it 'pending'; otherwise 'passing'.
 *
 * A run counts as failing when its conclusion is failure, timed_out,
 * action_required, cancelled or startup_failure — a cancelled or
 * never-started run did not pass, so it must not be reported as 'passing'.
 * A run counts as pending when it is queued / in_progress or has no
 * conclusion yet (null or missing).
 *
 * @param {Array<{conclusion: string|null, status: string}>} checkRuns
 * @returns {'pending'|'passing'|'failing'|null} null for an empty/missing list
 */
function reduceCheckRuns(checkRuns) {
  if (!checkRuns || checkRuns.length === 0) return null;
  const failingConclusions = new Set([
    'failure',
    'timed_out',
    'action_required',
    'cancelled',
    'startup_failure',
  ]);
  let hasPending = false;
  for (const run of checkRuns) {
    const { conclusion, status } = run;
    // Failure outranks everything else, so the scan can stop here.
    if (failingConclusions.has(conclusion)) return 'failing';
    if (status === 'queued' || status === 'in_progress' || conclusion == null) {
      hasPending = true;
    }
  }
  return hasPending ? 'pending' : 'passing';
}
| /** | ||
| * @typedef {object} PrCacheEntry | ||
| * @property {number} fetchedAt | ||
| * @property {'open'|'merged'|'closed'|null} prState | ||
| * @property {number|null} prNumber | ||
| * @property {'pending'|'passing'|'failing'|null} prChecks | ||
| * @property {Promise<PrData>|null} promise | ||
| */ | ||
| /** | ||
| * @typedef {object} PrData | ||
| * @property {'open'|'merged'|'closed'|null} state | ||
| * @property {number|null} number | ||
| * @property {'pending'|'passing'|'failing'|null} checks | ||
| */ | ||
/**
 * Fetch PR data from GitHub API.
 *
 * Two requests are made, each with a 10 s timeout: the pull request itself,
 * then the check runs of its head commit. The check-runs request asks for
 * 100 runs per page (the API maximum); with the default page size a PR with
 * many checks would have its status computed from a truncated list.
 *
 * Outcomes:
 *   - URL cannot be parsed            → all fields null
 *   - PR request not OK               → state and checks null, number set
 *   - PR has no head SHA              → checks null
 *   - check-runs request fails/throws → checks null (non-fatal)
 * A thrown error from the PR request itself propagates to the caller.
 *
 * @param {string} prUrl
 * @param {() => Promise<string|null>} getToken — resolves the GitHub token;
 *   a falsy token sends the requests unauthenticated
 * @returns {Promise<PrData>}
 */
async function fetchPrData(prUrl, getToken) {
  const parsed = parsePrUrl(prUrl);
  if (!parsed) return { state: null, number: null, checks: null };
  const { owner, repo, number } = parsed;

  const token = await getToken();
  /** @type {HeadersInit} */
  const headers = { Accept: 'application/vnd.github+json' };
  if (token) headers.Authorization = `token ${token}`;

  const repoApi = `${GH_API_BASE}/repos/${owner}/${repo}`;

  // Fetch PR metadata
  const prResp = await fetch(`${repoApi}/pulls/${number}`, {
    headers,
    signal: AbortSignal.timeout(10_000),
  });
  if (!prResp.ok) {
    return { state: null, number, checks: null };
  }
  const prData = await prResp.json();
  let state = prData.state ?? null;
  // GitHub reports merged PRs as "closed"; merged_at tells them apart.
  if (state === 'closed' && prData.merged_at) state = 'merged';

  const sha = prData.head?.sha ?? null;
  if (!sha) {
    return { state, number, checks: null };
  }

  // Fetch check runs for the head SHA
  let checks = null;
  try {
    const checksResp = await fetch(`${repoApi}/commits/${sha}/check-runs?per_page=100`, {
      headers,
      signal: AbortSignal.timeout(10_000),
    });
    if (checksResp.ok) {
      const checksData = await checksResp.json();
      const runs = Array.isArray(checksData.check_runs) ? checksData.check_runs : [];
      checks = reduceCheckRuns(runs);
    }
  } catch {
    // Non-fatal — return null checks
  }
  return { state, number, checks };
}
/**
 * Build a PR-data cache with a TTL and coalescing of concurrent fetches.
 *
 * Each cached URL maps to an entry holding the last known PR data, the time
 * it was fetched, and — while a fetch is running — the promise of that fetch.
 *
 * @returns {{ getPr: (prUrl: string, getToken: () => Promise<string|null>) => Promise<PrData|null>, deletePr: (prUrl: string) => void }}
 */
export function createPrCache() {
  /** @type {Map<string, PrCacheEntry>} */
  const entries = new Map();

  /**
   * Await a fetch promise, turning a rejection into null.
   *
   * @param {Promise<PrData>} pending
   * @returns {Promise<PrData|null>}
   */
  async function settleOrNull(pending) {
    try {
      return await pending;
    } catch {
      return null;
    }
  }

  /**
   * Resolve PR data for a URL: serve a fresh cache entry, join a fetch that
   * is already running, or start a new fetch. Never rejects; a failed fetch
   * yields null.
   *
   * @param {string} prUrl
   * @param {() => Promise<string|null>} getToken
   * @returns {Promise<PrData|null>}
   */
  async function getPr(prUrl, getToken) {
    if (!prUrl) return null;

    const startedAt = Date.now();
    const cached = entries.get(prUrl);

    if (cached) {
      // In-flight fetch — coalesce
      if (cached.promise) {
        return settleOrNull(cached.promise);
      }
      // Fresh cache hit
      if (startedAt - cached.fetchedAt < TTL_MS) {
        return { state: cached.prState, number: cached.prNumber, checks: cached.prChecks };
      }
    }

    // Stale or missing — start new fetch
    const storeResult = (data) => {
      entries.set(prUrl, {
        fetchedAt: Date.now(),
        prState: data.state,
        prNumber: data.number,
        prChecks: data.checks,
        promise: null,
      });
      return data;
    };
    const releaseOnError = (err) => {
      // Clear promise on error so next call retries
      const current = entries.get(prUrl);
      if (current && current.promise) {
        entries.set(prUrl, { ...current, promise: null });
      }
      throw err;
    };
    const pending = fetchPrData(prUrl, getToken).then(storeResult, releaseOnError);

    // Keep the previous values (if any) visible while the refresh is running.
    entries.set(prUrl, {
      fetchedAt: cached ? cached.fetchedAt : 0,
      prState: cached ? cached.prState : null,
      prNumber: cached ? cached.prNumber : null,
      prChecks: cached ? cached.prChecks : null,
      promise: pending,
    });

    return settleOrNull(pending);
  }

  /**
   * Evict a PR entry from the cache (call on world destroy).
   *
   * @param {string} prUrl
   */
  function deletePr(prUrl) {
    entries.delete(prUrl);
  }

  return { getPr, deletePr };
}
| /** | ||
| * PR merge poller for auto-destroying worlds whose PR has merged. | ||
| * | ||
| * State machine per world: | ||
| * open -> merged (on GitHub reports merged) | ||
| * merged -> merged_destroyed (after grace period, if auto_destroy_on_merge) | ||
| */ | ||
// Base URL for the GitHub REST requests made by the poller.
const GH_API_BASE = 'https://api.github.com';
/**
 * Extract owner, repo and PR number from a GitHub pull-request URL.
 * @param {string} prUrl e.g. https://github.com/org/repo/pull/123
 * @returns {{ owner: string, repo: string, number: number } | null} null when
 *   the text contains no `github.com/<owner>/<repo>/pull/<digits>` segment
 */
function parsePrUrl(prUrl) {
  const match = /github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/.exec(prUrl);
  if (match === null) {
    return null;
  }
  const [, owner, repo, rawNumber] = match;
  return { owner, repo, number: Number.parseInt(rawNumber, 10) };
}
/**
 * Build the PR merge poller.
 *
 * `start()` arms an interval that polls every watched world. For a world
 * whose PR is still 'open', GitHub is asked whether it has been merged; on a
 * merge the store is updated to 'merged' and, unless the entry has
 * `auto_destroy_on_merge === false`, a grace timer is scheduled. When the
 * timer fires the world is destroyed and marked 'merged_destroyed'. Entries
 * already in state 'merged' get their grace timer (re)scheduled on each poll,
 * which resumes entries that survived a restart.
 *
 * If no GitHub token is available the poller warns once, disables itself and
 * stops. `stop()` clears the interval and all pending grace timers.
 *
 * @param {{
 *   prStateStore: ReturnType<typeof import('./world-pr-state.mjs').createWorldPrStateStore>,
 *   getGhToken: () => Promise<string|null>,
 *   destroyWorld: (worldId: string) => Promise<void>,
 *   pollIntervalMs?: number,
 *   gracePeriodMs?: number,
 * }} opts
 * @returns {{ start: () => void, stop: () => void }}
 */
export function createPrMergePoller({
  prStateStore,
  getGhToken,
  destroyWorld,
  pollIntervalMs = 300_000,
  gracePeriodMs = 600_000,
}) {
  // Upper bound for one GitHub request, so a hung connection cannot stall a
  // whole poll pass (same budget as the PR cache uses).
  const GH_FETCH_TIMEOUT_MS = 10_000;

  let intervalId = null;
  let disabled = false;
  let warnedOnce = false;
  // Track in-flight grace timers so stop() can clear them
  const graceTimers = new Map();

  /**
   * Destroy a world and mark it 'merged_destroyed'. The entry is marked even
   * when destroyWorld fails; the failure is logged. The "auto-destroyed" line
   * is only logged after destroyWorld has actually succeeded.
   */
  async function destroyAndMark(worldId) {
    const entry = prStateStore.get(worldId);
    const prUrl = entry?.pr_url ?? '(unknown)';
    const mergedAt = entry?.pr_merged_at ?? '(unknown)';
    try {
      await destroyWorld(worldId);
      console.log(
        `[pr-merge-poller] auto-destroyed world ${worldId}: PR ${prUrl} merged at ${mergedAt}, destroyed at ${new Date().toISOString()}`,
      );
    } catch (err) {
      console.error(`[pr-merge-poller] destroyWorld failed for ${worldId}:`, err?.message ?? err);
    }
    prStateStore.set(worldId, { pr_state: 'merged_destroyed' });
    graceTimers.delete(worldId);
  }

  /** Schedule the post-merge destroy for a world, at most once at a time. */
  function scheduleGrace(worldId) {
    if (graceTimers.has(worldId)) return; // already scheduled
    const id = setTimeout(() => {
      destroyAndMark(worldId).catch((err) => {
        console.error(`[pr-merge-poller] destroyAndMark error for ${worldId}:`, err?.message ?? err);
      });
    }, gracePeriodMs);
    graceTimers.set(worldId, id);
  }

  /** Ask GitHub whether an open PR has merged and record/schedule if so. */
  async function checkPr(worldId, entry, ghToken) {
    const parsed = parsePrUrl(entry.pr_url);
    if (!parsed) {
      console.warn(`[pr-merge-poller] cannot parse PR URL for ${worldId}: ${entry.pr_url}`);
      return;
    }
    const apiUrl = `${GH_API_BASE}/repos/${parsed.owner}/${parsed.repo}/pulls/${parsed.number}`;
    let data;
    try {
      const resp = await fetch(apiUrl, {
        headers: { Authorization: `token ${ghToken}`, Accept: 'application/vnd.github+json' },
        signal: AbortSignal.timeout(GH_FETCH_TIMEOUT_MS),
      });
      if (!resp.ok) {
        console.warn(`[pr-merge-poller] GH API ${resp.status} for ${worldId}`);
        return;
      }
      data = await resp.json();
    } catch (err) {
      console.warn(`[pr-merge-poller] fetch failed for ${worldId}:`, err?.message ?? err);
      return;
    }
    // GitHub reports a merged PR as closed with a non-null merged_at.
    const isMerged = data.state === 'closed' && data.merged_at != null;
    if (!isMerged) return;
    prStateStore.set(worldId, {
      pr_state: 'merged',
      pr_merged_at: data.merged_at,
    });
    if (entry.auto_destroy_on_merge === false) return;
    scheduleGrace(worldId);
  }

  /** One pass over all watched worlds. */
  async function pollOnce() {
    const ghToken = await getGhToken();
    if (!ghToken) {
      if (!warnedOnce) {
        console.warn(
          'pr-merge-poller: no GH token found (GH_TOKEN/GITHUB_TOKEN env or /gh-config/hosts.yml); PR polling disabled',
        );
        warnedOnce = true;
      }
      disabled = true;
      stop();
      return;
    }
    const worlds = prStateStore.getWorldsToWatch();
    for (const entry of worlds) {
      const { worldId, ...rest } = entry;
      if (rest.pr_state === 'open') {
        await checkPr(worldId, rest, ghToken);
      } else if (rest.pr_state === 'merged') {
        // Resume grace timer for merged entries that survived a restart
        if (rest.auto_destroy_on_merge !== false) {
          scheduleGrace(worldId);
        }
      }
    }
  }

  /** Arm the polling interval; no-op when already running or disabled. */
  function start() {
    if (intervalId !== null || disabled) return;
    intervalId = setInterval(() => {
      pollOnce().catch((err) => {
        console.error('[pr-merge-poller] pollOnce error:', err?.message ?? err);
      });
    }, pollIntervalMs);
  }

  /** Clear the polling interval and every pending grace timer. */
  function stop() {
    if (intervalId !== null) {
      clearInterval(intervalId);
      intervalId = null;
    }
    for (const id of graceTimers.values()) {
      clearTimeout(id);
    }
    graceTimers.clear();
  }

  return { start, stop };
}
| /** | ||
| * PR Nanny — host-side daemon that watches all worlds' open PRs and | ||
| * dispatches fixes via `olam dispatch` when CI/reviews block them. | ||
| * | ||
| * Extends the pr-merge-poller loop pattern. Runs at 60s cadence. | ||
| * Opt-out: OLAM_PR_NANNY=0 (default: enabled). | ||
| * | ||
| * State machine per PR (stored in world-pr-state.json nanny_* fields): | ||
| * watching → dispatching → (paused | escalated | halted) | ||
| * | ||
| * Halt conditions (stop dispatching but keep watching): | ||
| * 1. dispatch_count >= MAX_DISPATCHES (configurable, default 5) | ||
| * 2. wall-clock since first dispatch >= MAX_WALL_CLOCK_MIN (default 60) | ||
| * 3. same-root-cause loop detected (last 2 dispatch summaries identical) | ||
| * 4. operator manual pause | ||
| * | ||
| * Tier escalation (PR #N tier-escalation): | ||
| * On each retry, the nanny advances to the next tier in `escalationTiers` | ||
| * (stored per-world in nanny_current_tier) instead of repeating the same | ||
| * model. When the chain is exhausted, emits `dispatch.tier-exhausted` on | ||
| * the host-stream and falls back to existing operator escalation. | ||
| */ | ||
| import { execFile } from 'node:child_process'; | ||
| import { promisify } from 'node:util'; | ||
| import { pickNextTier } from './dispatch/tier-escalator.mjs'; | ||
| import { safePersistLastDispatch } from './dispatch-persister.mjs'; | ||
| const execFileAsync = promisify(execFile); | ||
| const GH_API_BASE = 'https://api.github.com'; | ||
// CI check names that indicate an external blocker.
// A PR whose failing checks ALL match one of these is treated as not
// actionable: the failure comes from infrastructure / the release pipeline
// rather than from the world's own code.
const EXTERNAL_BLOCKER_PATTERNS = [
  /detect-image-scopes/i,
  /publish-mcp-auth/i,
  /retag-mcp-auth/i,
  /bootstrap.*publish/i,
  /release.*pipeline/i,
  /ghcr.*push/i,
];

/**
 * Tell whether a single check name matches any external-blocker pattern.
 *
 * @param {string} checkName
 * @returns {boolean}
 */
function isExternalBlockerCheck(checkName) {
  for (const pattern of EXTERNAL_BLOCKER_PATTERNS) {
    if (pattern.test(checkName)) {
      return true;
    }
  }
  return false;
}
/**
 * Decide whether a PR is blocked purely by external (infrastructure) checks.
 *
 * Only checks whose conclusion is 'failure' or 'action_required' count as
 * failing. The result is true when there is at least one failing check and
 * every failing check's name is an external-blocker name.
 *
 * @param {Array<{name: string, conclusion: string|null}>} checks
 * @returns {boolean}
 */
export function isExternalBlocker(checks) {
  const failingConclusions = new Set(['failure', 'action_required']);
  const failingNames = checks
    .filter(({ conclusion }) => failingConclusions.has(conclusion))
    .map((check) => check.name);
  if (failingNames.length === 0) {
    return false;
  }
  return failingNames.every((name) => isExternalBlockerCheck(name));
}
/**
 * Extract owner, repository and PR number from a GitHub pull-request URL.
 *
 * The pattern is searched anywhere in the input, so trailing segments such as
 * `/files` are tolerated.
 *
 * @param {string} prUrl e.g. https://github.com/org/repo/pull/123
 * @returns {{ owner: string, repo: string, number: number } | null} null when the input does not contain a PR URL
 */
function parsePrUrl(prUrl) {
  const match = /github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/.exec(prUrl);
  if (match === null) {
    return null;
  }
  const [, owner, repo, prNumber] = match;
  return { owner, repo, number: Number.parseInt(prNumber, 10) };
}
/**
 * Create the PR nanny: a poller that inspects the CI checks of every watched
 * world's open PR and dispatches a fix prompt to the world when checks fail.
 *
 * Returns no-op `start`/`stop` when `OLAM_PR_NANNY` is `'0'`.
 *
 * Per world and poll, no dispatch happens when any of these holds:
 *   - the entry is paused or escalated, or its PR is not open / not parseable;
 *   - the dispatch cap or the wall-clock ceiling has been reached;
 *   - no check has conclusion 'failure' or 'action_required';
 *   - all failing checks are external blockers (sets nanny_external_blocker);
 *   - the prompt equals the previous dispatch's prompt (sets nanny_loop_halted);
 *   - the consulted verdict is 'push-back' (pauses) or 'rethink' (escalates);
 *   - on a retry, the tier chain is exhausted (escalates).
 *
 * @param {{
 *   prStateStore: ReturnType<import('./world-pr-state.mjs').createWorldPrStateStore>,
 *   getGhToken: () => Promise<string|null>,
 *   dispatchToWorld: (worldId: string, prompt: string, opts?: { tier?: string }) => Promise<void>,
 *   consultCodex: (ctx: string) => Promise<string>,
 *   broadcastTierEvent?: (eventType: string, payload: unknown) => void,
 *   pollIntervalMs?: number,
 *   maxDispatches?: number,
 *   maxWallClockMin?: number,
 * }} opts
 * @returns {{ start: () => void, stop: () => void }}
 */
export function createPrNanny({
  prStateStore,
  getGhToken,
  dispatchToWorld,
  consultCodex,
  broadcastTierEvent = () => {},
  pollIntervalMs = 60_000,
  maxDispatches = parseInt(process.env.OLAM_PR_NANNY_MAX_DISPATCHES ?? '5', 10),
  maxWallClockMin = parseInt(process.env.OLAM_PR_NANNY_MAX_WALL_CLOCK_MIN ?? '60', 10),
}) {
  const enabled = (process.env.OLAM_PR_NANNY ?? '1') !== '0';
  if (!enabled) return { start() {}, stop() {} };

  // An empty or unparseable env override makes parseInt return NaN, and every
  // `x >= NaN` comparison is false — which would silently remove the safety
  // cap. Fall back to the documented default instead.
  function sanitizeLimit(label, value, fallback) {
    if (!Number.isNaN(value)) return value;
    console.warn(`[pr-nanny] ${label} is not a number — using default ${fallback}`);
    return fallback;
  }
  const dispatchCap = sanitizeLimit('maxDispatches', maxDispatches, 5);
  const wallClockCapMin = sanitizeLimit('maxWallClockMin', maxWallClockMin, 60);

  let intervalId = null;
  let warnedOnce = false;
  // Guards against overlapping polls: a poll can outlive the interval while it
  // awaits the codex consult / dispatch, and a second poll working from the
  // same not-yet-updated entries would dispatch the same world twice.
  let pollInFlight = false;

  const isFailedCheck = (check) =>
    check.conclusion === 'failure' || check.conclusion === 'action_required';
  const ghHeaders = (ghToken) => ({
    Authorization: `token ${ghToken}`,
    Accept: 'application/vnd.github+json',
  });

  /**
   * Fetch CI check runs for the PR's head SHA. Any API or network failure
   * yields an empty list.
   * @param {string} owner @param {string} repo @param {number} prNumber @param {string} ghToken
   * @returns {Promise<Array<{name: string, conclusion: string|null, status: string}>>}
   */
  async function fetchChecks(owner, repo, prNumber, ghToken) {
    try {
      // First get the PR head SHA.
      const prRes = await fetch(
        `${GH_API_BASE}/repos/${owner}/${repo}/pulls/${prNumber}`,
        { headers: ghHeaders(ghToken) },
      );
      if (!prRes.ok) return [];
      const prData = await prRes.json();
      const sha = prData.head?.sha;
      if (!sha) return [];
      const checkRes = await fetch(
        `${GH_API_BASE}/repos/${owner}/${repo}/commits/${sha}/check-runs?per_page=100`,
        { headers: ghHeaders(ghToken) },
      );
      if (!checkRes.ok) return [];
      const checkData = await checkRes.json();
      return (checkData.check_runs ?? []).map((run) => ({
        name: run.name,
        conclusion: run.conclusion,
        status: run.status,
      }));
    } catch {
      return [];
    }
  }

  /**
   * Evaluate one world and dispatch a fix when warranted.
   *
   * @param {string} worldId
   * @param {object} entry current pr-state entry
   * @param {string} ghToken
   */
  async function processWorld(worldId, entry, ghToken) {
    if (entry.nanny_paused || entry.nanny_escalated) return;
    if (entry.pr_state !== 'open') return;
    const parsed = parsePrUrl(entry.pr_url);
    if (!parsed) return;

    // Halt: dispatch cap
    const dispatchCount = entry.nanny_dispatch_count ?? 0;
    if (dispatchCount >= dispatchCap) return;

    // Halt: wall-clock ceiling
    if (entry.nanny_first_dispatch_at) {
      const elapsedMin = (Date.now() - new Date(entry.nanny_first_dispatch_at).getTime()) / 60_000;
      if (elapsedMin >= wallClockCapMin) return;
    }

    const checks = await fetchChecks(parsed.owner, parsed.repo, parsed.number, ghToken);
    // Nothing to do unless at least one check failed (this also covers the
    // "no checks" and "everything passing" cases).
    const failing = checks.filter(isFailedCheck);
    if (failing.length === 0) return;

    // External blocker — do not dispatch
    if (isExternalBlocker(checks)) {
      prStateStore.set(worldId, { nanny_external_blocker: true });
      return;
    }
    prStateStore.set(worldId, { nanny_external_blocker: false });

    const failingNames = failing.map((check) => check.name).join(', ');
    const prompt = `CI is failing on PR ${entry.pr_url}. Failing checks: ${failingNames}. Investigate the root cause, fix the code, and push.`;

    // Halt: same-root-cause loop detection
    if (entry.nanny_last_dispatch_prompt && entry.nanny_last_dispatch_prompt === prompt) {
      console.log(`[pr-nanny] loop detected for ${worldId} — same prompt as last dispatch, halting`);
      prStateStore.set(worldId, { nanny_loop_halted: true });
      return;
    }

    // Consult Codex before dispatching; a failed consult defaults to 'agree'.
    const codexCtx = `World ${worldId} has a failing PR: ${entry.pr_url}. Failing CI checks: ${failingNames}. Should we dispatch a fix? Answer: agree, push-back, or rethink.`;
    let verdict = 'agree';
    try {
      verdict = await consultCodex(codexCtx);
    } catch (err) {
      console.warn(`[pr-nanny] codex consult failed for ${worldId}: ${err.message} — defaulting to agree`);
    }
    if (verdict === 'push-back') {
      prStateStore.set(worldId, { nanny_paused: true, nanny_pause_reason: 'codex_pushback' });
      console.log(`[pr-nanny] Codex push-back for ${worldId} — pausing nanny`);
      return;
    }
    if (verdict === 'rethink') {
      prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'codex_rethink' });
      console.log(`[pr-nanny] Codex rethink for ${worldId} — escalating`);
      return;
    }

    // ── Tier escalation (PR #938) ───────────────────────────────────────────
    //
    // `nanny_escalation_tiers` defaults to ['sonnet'] when absent (no
    // escalation, no cost surprise).
    //
    // `nanny_current_tier` tracks the model tier used by the LAST dispatch for
    // this PR. On first dispatch (dispatchCount === 0) it is undefined, and we
    // use escalationTiers[0] as the starting tier. On retries we advance the
    // chain via pickNextTier.
    const escalationTiers = entry.nanny_escalation_tiers ?? ['sonnet'];
    const currentTier = entry.nanny_current_tier ?? escalationTiers[0] ?? 'sonnet';
    let tierForThisDispatch = currentTier;
    if (dispatchCount > 0) {
      // This is a retry — try to escalate the tier.
      const nextTier = pickNextTier(currentTier, escalationTiers);
      if (nextTier === null) {
        // Chain exhausted — emit tier-exhausted and fall back to operator escalation.
        broadcastTierEvent('dispatch.tier-exhausted', {
          worldId,
          exhaustedTier: currentTier,
          escalationTiers,
        });
        console.log(`[pr-nanny] tier chain exhausted for ${worldId} (last tier: ${currentTier}) — escalating to operator`);
        prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'tier_exhausted' });
        return;
      }
      tierForThisDispatch = nextTier;
      broadcastTierEvent('dispatch.escalated', {
        worldId,
        fromTier: currentTier,
        toTier: nextTier,
        reason: 'retry-after-failure',
      });
      console.log(`[pr-nanny] tier escalated for ${worldId}: ${currentTier} → ${nextTier}`);
    }

    // Dispatch fix. State is only advanced after the dispatch succeeded.
    try {
      safePersistLastDispatch({
        worldId,
        messageId: `nanny-${worldId}-${Date.now()}`,
        prompt,
        source: 'pr-nanny',
      });
      await dispatchToWorld(worldId, prompt, { tier: tierForThisDispatch });
      const now = new Date().toISOString();
      prStateStore.set(worldId, {
        nanny_dispatch_count: dispatchCount + 1,
        nanny_first_dispatch_at: entry.nanny_first_dispatch_at ?? now,
        nanny_last_dispatch_at: now,
        nanny_last_dispatch_prompt: prompt,
        nanny_current_tier: tierForThisDispatch,
      });
      console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${dispatchCap}, tier: ${tierForThisDispatch})`);
    } catch (err) {
      console.error(`[pr-nanny] dispatch failed for ${worldId}: ${err.message}`);
    }
  }

  // One pass over every watched world; skipped while a previous pass is running.
  async function pollOnce() {
    if (pollInFlight) return;
    pollInFlight = true;
    try {
      const ghToken = await getGhToken();
      if (!ghToken) {
        if (!warnedOnce) {
          console.warn('[pr-nanny] no GH token — CI polling disabled');
          warnedOnce = true;
        }
        return;
      }
      const worlds = prStateStore.getWorldsToWatch();
      for (const { worldId, ...entry } of worlds) {
        try {
          await processWorld(worldId, entry, ghToken);
        } catch (err) {
          console.error(`[pr-nanny] processWorld error for ${worldId}: ${err.message}`);
        }
      }
    } finally {
      pollInFlight = false;
    }
  }

  function start() {
    if (intervalId !== null) return;
    // Immediate first poll
    pollOnce().catch((err) => console.error('[pr-nanny] pollOnce error:', err.message));
    intervalId = setInterval(() => {
      pollOnce().catch((err) => console.error('[pr-nanny] pollOnce error:', err.message));
    }, pollIntervalMs);
  }

  function stop() {
    if (intervalId !== null) {
      clearInterval(intervalId);
      intervalId = null;
    }
  }

  return { start, stop };
}
/**
 * Default Codex consultation: runs the host-side `codex` CLI (30s timeout) and
 * maps the start of its lower-cased, trimmed output to a verdict.
 *
 * Fail-open: if the CLI cannot be run or errors, the verdict is 'agree' so the
 * dispatch goes ahead anyway. Any reply not starting with "push" or "rethink"
 * is also 'agree'.
 *
 * @param {string} ctx
 * @returns {Promise<'agree'|'push-back'|'rethink'>}
 */
export async function defaultConsultCodex(ctx) {
  let reply;
  try {
    const question = `Adversarial review — is this a good idea? ${ctx} Reply with exactly one word: agree, push-back, or rethink.`;
    const cliArgs = ['--quiet', '--model', 'codex-mini-latest', question];
    const result = await execFileAsync('codex', cliArgs, { timeout: 30_000 });
    reply = result.stdout.trim().toLowerCase();
  } catch {
    // fail-open: if codex unavailable, dispatch anyway
    return 'agree';
  }
  if (reply.startsWith('push')) {
    return 'push-back';
  }
  return reply.startsWith('rethink') ? 'rethink' : 'agree';
}
/**
 * Default dispatch: runs `olam dispatch <worldId> <prompt>` as a child process
 * with a 60s timeout. Rejects when the command fails or times out.
 *
 * @param {string} worldId
 * @param {string} prompt
 * @returns {Promise<void>}
 */
export async function defaultDispatchToWorld(worldId, prompt) {
  const cliArgs = ['dispatch', worldId, prompt];
  const execOptions = { timeout: 60_000 };
  await execFileAsync('olam', cliArgs, execOptions);
}
| /** | ||
| * process-poller.mjs — per-world docker top SSE fanout. | ||
| * | ||
| * Dual-mode: HTTP API when DOCKER_HOST != 'docker-cli'; spawnSync otherwise. | ||
| * | ||
| * NOTE: process argv may contain secrets (--api-key=, --token=). Post-v1 audit needed. (S1) | ||
| */ | ||
| import { spawnSync } from 'node:child_process'; | ||
| const DOCKER_HOST = process.env.DOCKER_HOST ?? 'docker-cli'; | ||
| /** | ||
| * @typedef {{ pid: string, user: string, cpu: string, mem: string, started: string, state: string, command: string }} ProcessRow | ||
| */ | ||
function worldContainerName(worldId) {
  // Container name is the world id wrapped in a fixed prefix and suffix.
  const prefix = 'olam';
  const suffix = 'devbox';
  return `${prefix}-${worldId}-${suffix}`;
}
/**
 * Parse docker top JSON (Titles + Processes arrays) into normalized rows.
 *
 * Returns an empty list when the input is not JSON or lacks either array.
 * Columns are located by title, case-insensitively: an exact title match wins,
 * otherwise the first title containing the name is used (so `%CPU` still
 * satisfies `cpu`, while `PID` is never confused with an earlier `PPID`).
 * Missing columns yield '' — or '0' for cpu/mem. All values are trimmed
 * strings; the start time is stored as a raw string — no Date parse (T1).
 *
 * @param {string} stdout
 * @returns {ProcessRow[]}
 */
function parseDockerTop(stdout) {
  let parsed;
  try {
    parsed = JSON.parse(stdout);
  } catch {
    return [];
  }
  const titles = parsed?.Titles;
  const processes = parsed?.Processes;
  if (!Array.isArray(titles) || !Array.isArray(processes)) return [];

  // Lower-cased titles; non-string titles can never match.
  const normalizedTitles = titles.map((title) =>
    (typeof title === 'string' ? title.trim().toLowerCase() : null));

  // Column index for a title: exact match first, partial match as fallback.
  function idx(name) {
    const needle = name.toLowerCase();
    const exact = normalizedTitles.indexOf(needle);
    if (exact !== -1) return exact;
    return normalizedTitles.findIndex((title) => title !== null && title.includes(needle));
  }

  // Index of the first candidate title that is present, or -1.
  function firstIdx(...candidates) {
    for (const candidate of candidates) {
      const i = idx(candidate);
      if (i !== -1) return i;
    }
    return -1;
  }

  const pidIdx = idx('pid');
  const userIdx = idx('user');
  const cpuIdx = idx('cpu');
  const memIdx = idx('mem');
  // Accept LSTART, STARTED, STIME, or START_TIME (T1: store as raw string).
  const startIdx = firstIdx('lstart', 'stime', 'start_time', 'start');
  const stateIdx = idx('stat');
  // CMD may be titled "CMD", "COMMAND", or "cmd".
  const cmdIdx = firstIdx('command', 'cmd');

  // Trimmed string value of a cell, or `fallback` when the column is absent.
  const cell = (row, i, fallback = '') => (i !== -1 ? String(row[i] ?? '').trim() : fallback);

  return processes.map((row) => ({
    pid: cell(row, pidIdx),
    user: cell(row, userIdx),
    cpu: cell(row, cpuIdx, '0'),
    mem: cell(row, memIdx, '0'),
    started: cell(row, startIdx),
    state: cell(row, stateIdx),
    command: cell(row, cmdIdx),
  }));
}
/**
 * Fetch processes for a world container.
 *
 * Uses the `docker top` CLI when DOCKER_HOST is 'docker-cli', otherwise the
 * Docker HTTP API at DOCKER_HOST (a `tcp://` prefix is rewritten to `http://`).
 * Never rejects: a failed command, a non-2xx response or any thrown error all
 * produce an empty process list plus `error: 'container not running'` (T3).
 *
 * @param {string} worldId
 * @returns {Promise<{ts: number, processes: ProcessRow[], error?: string}>}
 */
async function fetchProcesses(worldId) {
  const containerName = worldContainerName(worldId);
  // Both paths hand these arguments to ps(1). A bare column list such as
  // `pid,user,...` is read by ps as a process-ID *list*, not a column
  // selector, and fails with "process ID list syntax error". The correct form
  // is `-eo <cols>`: select all processes (`-e`) and project columns (`-o`).
  //
  // `stime` is used instead of `lstart` because it is a single word; the CLI
  // path below splits rows on whitespace and would break on a multi-word
  // timestamp such as "Mon May 4 14:00:00 2026".
  const psArgs = '-eo pid,user,pcpu,pmem,stime,stat,cmd';
  const notRunning = () => ({ ts: Date.now(), processes: [], error: 'container not running' });
  try {
    if (DOCKER_HOST === 'docker-cli') {
      // Bare-node mode: spawnSync blocks ~50ms at 5s cadence (P2 — acceptable).
      // Everything after the container name is forwarded to ps, so the CLI
      // path gets the same `-eo <cols>` selector as the HTTP path.
      const result = spawnSync(
        'docker',
        ['top', containerName, ...psArgs.split(' ')],
        { encoding: 'utf-8', timeout: 3000 },
      );
      if (result.status !== 0 || result.error) {
        return notRunning();
      }
      // docker top bare CLI outputs tabular text, not JSON. Wrap it for parseDockerTop.
      const lines = (result.stdout ?? '').trim().split('\n');
      // First line is the header row; remaining are process rows.
      const titleFields = lines[0].trim().split(/\s+/);
      const dataRows = lines.slice(1).map((line) => {
        const parts = line.trim().split(/\s+/);
        // CMD may contain spaces — rejoin everything from the 7th token on.
        if (parts.length > 7) {
          return [...parts.slice(0, 6), parts.slice(6).join(' ')];
        }
        return parts;
      });
      const wrapped = JSON.stringify({ Titles: titleFields, Processes: dataRows });
      return { ts: Date.now(), processes: parseDockerTop(wrapped) };
    }
    // Container mode: Docker HTTP API.
    const apiBase = DOCKER_HOST.replace(/^tcp:\/\//, 'http://');
    const url = `${apiBase}/containers/${encodeURIComponent(containerName)}/top?ps_args=${encodeURIComponent(psArgs)}`;
    const resp = await fetch(url, { signal: AbortSignal.timeout(3000) });
    if (!resp.ok) {
      return notRunning();
    }
    const body = await resp.text();
    return { ts: Date.now(), processes: parseDockerTop(body) };
  } catch {
    return notRunning();
  }
}
/**
 * One-off process snapshot for a world — delegates to fetchProcesses and
 * resolves with its result unchanged.
 *
 * @param {string} worldId
 */
export async function getProcessSnapshot(worldId) {
  const snapshot = await fetchProcesses(worldId);
  return snapshot;
}
// ── SSE fanout state ─────────────────────────────────────────────────
/**
 * Registry of active per-world pollers, keyed by world id. Each value holds
 * the two interval timers and the set of subscribed SSE responses.
 * @type {Map<string, {pollTimer: ReturnType<typeof setInterval>, heartbeatTimer: ReturnType<typeof setInterval>, subscribers: Set<import('node:http').ServerResponse>}>}
 */
const worldPollers = new Map();

/**
 * Send one `processes` SSE event to every subscriber of a world.
 * Does nothing when the world has no registered poller; a subscriber whose
 * write throws is skipped.
 *
 * @param {string} worldId
 * @param {{ts: number, processes: ProcessRow[], error?: string}} data
 */
function broadcast(worldId, data) {
  const registration = worldPollers.get(worldId);
  if (registration) {
    const frame = ['event: processes', `data: ${JSON.stringify(data)}`, '', ''].join('\n');
    registration.subscribers.forEach((subscriber) => {
      try {
        subscriber.write(frame);
      } catch {
        /* subscriber gone; cleanup fires on close */
      }
    });
  }
}
/**
 * Attach an SSE response to a world's process stream.
 *
 * The first subscriber of a world starts a 5s poll timer and a 25s heartbeat
 * timer; when the last subscriber goes away both timers are cleared and the
 * world is dropped from the registry. Every new subscriber also gets one
 * immediate snapshot.
 *
 * SSE headers are written BEFORE the response joins the subscriber Set (T2).
 *
 * @param {string} worldId
 * @param {import('node:http').ServerResponse} res
 */
export function subscribeToProcesses(worldId, res) {
  // Headers first, synchronously, before touching the subscriber Set (T2).
  const sseHeaders = {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'X-Accel-Buffering': 'no',
  };
  res.writeHead(200, sseHeaders);

  let registration = worldPollers.get(worldId);
  if (!registration) {
    // First subscriber — start the poll + heartbeat timers.
    const pollTick = async () => {
      const snapshot = await fetchProcesses(worldId);
      broadcast(worldId, snapshot);
    };
    const heartbeatTick = () => {
      const current = worldPollers.get(worldId);
      if (!current) return;
      for (const subscriber of current.subscribers) {
        try {
          subscriber.write(': heartbeat\n\n');
        } catch {
          /* ignore */
        }
      }
    };
    const pollTimer = setInterval(pollTick, 5000);
    const heartbeatTimer = setInterval(heartbeatTick, 25000);
    registration = { pollTimer, heartbeatTimer, subscribers: new Set() };
    worldPollers.set(worldId, registration);
  }
  registration.subscribers.add(res);

  // Immediate first snapshot so the client doesn't wait for the first poll.
  fetchProcesses(worldId).then((snapshot) => {
    try {
      res.write(`event: processes\ndata: ${JSON.stringify(snapshot)}\n\n`);
    } catch {
      /* gone */
    }
  });

  // Disconnect handling; the flag makes it run at most once even though both
  // 'close' and 'finish' are wired to it.
  let released = false;
  const release = () => {
    if (released) return;
    released = true;
    const current = worldPollers.get(worldId);
    if (!current) return;
    current.subscribers.delete(res);
    if (current.subscribers.size > 0) return;
    // Last subscriber left — tear the pollers down.
    clearInterval(current.pollTimer);
    clearInterval(current.heartbeatTimer);
    worldPollers.delete(worldId);
  };
  res.on('close', release);
  res.on('finish', release);
}
| // Export parseDockerTop for unit tests. | ||
| export { parseDockerTop }; |
| // Phase F-2-B (B3): host CP HTTP proxy. | ||
| // | ||
| // Rewrites incoming requests under `/api/world/<id>/<route...>` to the | ||
| // per-world CP at `<perWorldBase>/<route...>` with `X-Olam-Secret` | ||
| // injected server-side. | ||
| // | ||
| // Pattern lifted from `packages/cloudflare-worker/src/index.ts:462-551` | ||
| // (`proxyContainer`). CF Worker uses Workers' `fetch()`; host CP uses | ||
| // Node's `http.request` so SSE streams flow byte-for-byte without | ||
| // buffering. Verbatim passthrough on /hooks/* and /api/auth/* (D8) is | ||
| // implemented in B4 (this module is JSON-API-only — B4 wraps). | ||
| import http from 'node:http'; | ||
| /** | ||
| * Default upstream-request timeout for proxied per-world CP calls. SSE | ||
| * streams (`/api/stream`, `/hooks/*` long-poll) MUST opt out — they | ||
| * intentionally hold the socket open. Everything else should respond | ||
| * within a few seconds; if the per-world CP wedges (slow sqlite, | ||
| * tmux command stuck, long docker exec), this prevents the host-cp | ||
| * connection from hanging until the OS RSTs it. The browser sees a | ||
| * clean 504 instead of Safari's TypeError "Load failed", and useLanes / | ||
| * useReadiness can retry on a known status code. | ||
| * | ||
| * 10s matches the longest legitimate handler we've measured (cold | ||
| * sqlite open + readiness query) with headroom. | ||
| * | ||
| * @internal exported for test override | ||
| */ | ||
| export const DEFAULT_PROXY_TIMEOUT_MS = 10_000; | ||
/**
 * Split a `/api/world/<id>/<route...>` request path into its world id and the
 * remainder to forward.
 *
 * The match is anchored at `^/api/world/`, so `/api/worlds` (the plural
 * worlds-list endpoint) does not match, and neither does an empty world id.
 * The remainder keeps its leading `/`, `?` or `#`; when there is none the
 * sub-path is `/`.
 *
 * @param {string} path
 * @returns {{ worldId: string, subPath: string } | null} null when the path does not match
 */
export function parseProxyPath(path) {
  const match = /^\/api\/world\/([^/?#]+)(\/.*|\?.*|#.*)?$/.exec(path);
  if (match === null) {
    return null;
  }
  const [, worldId, remainder] = match;
  return { worldId, subPath: remainder ?? '/' };
}
/**
 * Build the base URL of a per-world CP from its host port.
 *
 * The caller resolves worldId → port and passes the port in. The default host
 * is `host.docker.internal`, the name host-cp uses to reach world CPs when it
 * runs in its own Docker Compose network; bare-node callers pass their own
 * host explicitly.
 *
 * @param {number} port per-world CP host port (e.g., 20780)
 * @param {string} [host] optional hostname override (default 'host.docker.internal')
 * @returns {string} `http://<host>:<port>`
 */
export function perWorldBase(port, host = 'host.docker.internal') { // bare-node-allow: container-mode default; bare callers pass WORLD_HOST explicitly (server.mjs)
  const authority = `${host}:${port}`;
  return `http://${authority}`;
}
/**
 * Tell whether a sub-path belongs to an SSE / long-poll handler that holds the
 * socket open on purpose and therefore must be exempt from the upstream
 * timeout. The query string is ignored.
 *
 * Streaming paths are: anything ending in `/api/stream`, `/hooks` and
 * everything below it, and `/api/auth/events` and everything below it.
 *
 * @param {string} subPath
 * @returns {boolean}
 */
function isStreamingPath(subPath) {
  // Strip query string before matching.
  const [pathOnly] = subPath.split('?');
  if (pathOnly.endsWith('/api/stream')) {
    return true;
  }
  if (pathOnly === '/hooks' || pathOnly.startsWith('/hooks/')) {
    return true;
  }
  return /^\/api\/auth\/events(\/|$)/.test(pathOnly);
}
| /** | ||
| * Proxy an incoming request to a per-world CP, injecting X-Olam-Secret. | ||
| * | ||
| * Forwards: method, path (subPath), body bytes, ALL request headers | ||
| * EXCEPT `host` (rewritten) and `x-olam-secret` (overwritten with the | ||
| * injected secret to prevent client spoofing). | ||
| * | ||
| * Returns: status code, ALL response headers (verbatim — D8 contract | ||
| * forwards Set-Cookie, Location, etc. unchanged), body bytes streamed | ||
| * via Node's http.IncomingMessage→ServerResponse pipe (no buffering). | ||
| * | ||
| * Upstream timeout: short-request handlers (≠ SSE) get an upstream | ||
| * socket timeout of `timeoutMs` (defaults to DEFAULT_PROXY_TIMEOUT_MS). | ||
| * On expiry we abort the upstream socket and respond 504 — this | ||
| * converts a wedged per-world CP into a deterministic status code | ||
| * instead of a TCP RST that Safari surfaces as `TypeError: Load | ||
| * failed`. Pass `streaming: true` (or hit a path matching | ||
| * `isStreamingPath`) to opt out. | ||
| * | ||
| * @param {object} args | ||
| * @param {import('node:http').IncomingMessage} args.req | ||
| * @param {import('node:http').ServerResponse} args.res | ||
| * @param {string} args.subPath e.g., '/api/world' or '/api/stream' | ||
| * @param {string} args.targetBase e.g., 'http://host.docker.internal:20780' | ||
| * @param {string} args.secret the X-Olam-Secret value | ||
| * @param {(message: string) => void} [args.log] | ||
| * @param {number} [args.timeoutMs] per-request upstream timeout; ignored for streams | ||
| * @param {boolean} [args.streaming] force SSE/long-poll mode (skip timeout) | ||
| */ | ||
export function proxyToWorld({
  req,
  res,
  subPath,
  targetBase,
  secret,
  log = console.log,
  timeoutMs = DEFAULT_PROXY_TIMEOUT_MS,
  streaming = false,
}) {
  // Resolve the upstream URL and refuse anything that leaves the world's
  // origin. The secret header is attached below, so a subPath that
  // re-targets the request would hand X-Olam-Secret to another host.
  const target = resolveUpstreamTarget(subPath, targetBase);
  if (target === null) {
    log(`proxy: rejected subPath ${JSON.stringify(subPath)} (does not resolve inside the world origin)`);
    if (!res.headersSent) {
      res.writeHead(400, { 'Content-Type': 'application/json; charset=utf-8' });
      res.end(JSON.stringify({
        error: 'invalid_proxy_path',
        message: 'proxy path must resolve inside the per-world CP origin',
      }));
    } else {
      res.end();
    }
    return;
  }

  const isStream = streaming || isStreamingPath(subPath);

  // Build outbound headers. Filter `host` (Node will set from URL) +
  // overwrite `x-olam-secret` (defense against client spoofing).
  /** @type {Record<string, string | string[]>} */
  const outHeaders = {};
  for (const [k, v] of Object.entries(req.headers)) {
    if (v === undefined) continue;
    const lower = k.toLowerCase();
    if (lower === 'host' || lower === 'x-olam-secret') continue;
    outHeaders[k] = v;
  }
  outHeaders['x-olam-secret'] = secret;
  outHeaders['x-forwarded-by'] = 'olam-host-cp';

  // Upstream-timeout timer; stays null for streams and once it is cleared.
  /** @type {ReturnType<typeof setTimeout> | null} */
  let timer = null;
  const clearTimer = () => {
    if (timer !== null) {
      clearTimeout(timer);
      timer = null;
    }
  };
  // Set when the client side of the exchange has closed, so the upstream
  // teardown that follows is not mistaken for an upstream failure.
  let clientGone = false;

  const upstreamReq = http.request(
    target,
    {
      method: req.method ?? 'GET',
      headers: outHeaders,
    },
    (upstreamRes) => {
      // Once headers come back from upstream, the request is no longer
      // "stuck" — clear the timeout so a slow stream-of-body doesn't
      // get killed mid-flight.
      clearTimer();
      // Verbatim passthrough: status + ALL headers + body bytes, written
      // atomically with the status line. statusMessage may be undefined —
      // writeHead then falls back to the default reason phrase.
      res.writeHead(
        upstreamRes.statusCode ?? 502,
        upstreamRes.statusMessage,
        upstreamRes.headers,
      );
      // pipe() only ends `res` on a clean upstream 'end'. If the upstream
      // connection dies mid-body the client would otherwise wait forever
      // on a response that will never finish; tear the connection down so
      // the truncation is visible instead of looking like a complete body.
      upstreamRes.on('close', () => {
        if (clientGone || upstreamRes.complete || res.writableEnded) return;
        log(`proxy: upstream closed mid-response for ${target}`);
        res.destroy();
      });
      upstreamRes.pipe(res);
    },
  );

  if (!isStream && timeoutMs > 0) {
    timer = setTimeout(() => {
      timer = null;
      log(`proxy: upstream timeout (${timeoutMs}ms) for ${target}`);
      // Destroying the upstream req triggers the 'error' handler with
      // a generic socket error; we pre-empt it with an explicit 504
      // first so the client sees a clean status instead of the generic
      // 502 the error handler would emit.
      if (!res.headersSent) {
        res.writeHead(504, { 'Content-Type': 'application/json; charset=utf-8' });
        res.end(JSON.stringify({
          error: 'upstream_timeout',
          message: `per-world CP did not respond within ${timeoutMs}ms`,
          worldUrl: target.origin,
        }));
      } else {
        res.end();
      }
      try {
        upstreamReq.destroy(new Error('proxy upstream timeout'));
      } catch {
        // already destroyed
      }
    }, timeoutMs);
  }

  // Upstream connection error — don't leak internals to the client.
  upstreamReq.on('error', (err) => {
    clearTimer();
    log(`proxy: upstream error for ${target}: ${err.message}`);
    if (!res.headersSent) {
      res.writeHead(502, { 'Content-Type': 'application/json; charset=utf-8' });
      res.end(JSON.stringify({
        error: 'upstream_unreachable',
        message: 'per-world CP did not respond',
        worldUrl: target.origin,
      }));
    } else {
      // Response already started (likely SSE); just close.
      res.end();
    }
  });

  // Client closed early (browser navigated away, Safari unloaded the
  // EventSource, etc.). Tear down the upstream so we don't keep an
  // open socket to the per-world CP for an answer the caller no longer
  // wants. Without this, host-cp leaks sockets per cancelled poll.
  res.on('close', () => {
    clientGone = true;
    clearTimer();
    if (!upstreamReq.destroyed) {
      try {
        upstreamReq.destroy();
      } catch {
        // already gone
      }
    }
  });

  // Pipe request body. For GET/HEAD this is a no-op (no body bytes);
  // for POST/PUT/PATCH this streams the body upstream.
  req.pipe(upstreamReq);
}

/**
 * Resolve `subPath` against `targetBase` and return the URL only when it
 * stays on the base's origin; otherwise return null.
 *
 * `new URL(input, base)` lets an absolute ('https://other/x') or
 * protocol-relative ('//other/x') input replace the base host, so the
 * origin comparison is what actually pins the request to the world.
 * A malformed `targetBase` still throws (server misconfiguration); a
 * malformed `subPath` is reported as null.
 *
 * @param {string} subPath
 * @param {string} targetBase
 * @returns {URL | null}
 */
function resolveUpstreamTarget(subPath, targetBase) {
  const worldOrigin = new URL(targetBase).origin;
  try {
    const candidate = new URL(subPath, targetBase);
    return candidate.origin === worldOrigin ? candidate : null;
  } catch {
    // Unparseable subPath — the caller answers 400.
    return null;
  }
}
| /** | ||
| * Phase E3 (olam-dogfood-vision): PylonWorldsSource skeleton. | ||
| * | ||
| * Stub implementation of the WorldsSource contract (E1) for Pylon- | ||
| * managed cloud worlds. Returns `[]` for now — the actual @pleri/pylon | ||
| * SDK integration is intentionally deferred (T5 mitigation: design the | ||
| * contract before the SDK lands so consumers don't churn when it does). | ||
| * | ||
| * The class proves the interface composes: E4 wires this alongside | ||
| * LocalWorldsSource into the GET /api/worlds handler so a Pylon-enabled | ||
| * deployment fans out across both sources, dedupes by id, and returns | ||
| * the union. With this stub returning `[]`, an enabled-but-empty Pylon | ||
| * source is a strict no-op over local-only behavior. | ||
| * | ||
| * Activation: gated by `OLAM_HOST_CP_PYLON_ENABLED=1`. When the env | ||
| * var is unset/0/false, server.mjs (E4) does NOT instantiate this | ||
| * source — the local-only path is preserved verbatim. When enabled, | ||
| * the empty source layers additively on top of local; behavior is | ||
| * still observably identical until the SDK ships. | ||
| * | ||
| * Why a no-op stub instead of waiting for the SDK: | ||
| * - Consumers (SPA badge logic in E5, regression tests, CLI lookup) | ||
| * can be wired against the contract without blocking on the SDK. | ||
| * - Forces E4's composition logic to actually fan out, dedupe, and | ||
| * merge — exercising the multi-source path in CI before any cloud | ||
| * traffic touches it. | ||
| * - Surface-area lock-in: anything missing here surfaces as a | ||
| * contract gap NOW, not after the SDK is wired. | ||
| * | ||
| * @typedef {import('./worlds-source.mjs').WorldsSource} WorldsSource | ||
| * @typedef {import('./worlds-source.mjs').WorldSummary} WorldSummary | ||
| */ | ||
| /** | ||
| * @typedef {object} PylonWorldsSourceDeps | ||
| * @property {boolean} enabled | ||
| * When false, list() short-circuits to `[]` without any Pylon | ||
| * interaction. Kept on the deps object (rather than read from | ||
| * process.env at construction time) so tests can flip it without | ||
| * mutating module-level env state. | ||
| */ | ||
/**
 * Build the Pylon-cloud WorldsSource. Today this is a placeholder: its
 * `list()` always resolves to an empty array, whether or not the source
 * is enabled.
 *
 * @param {PylonWorldsSourceDeps} [deps] defaults to `{ enabled: false }`
 * @returns {WorldsSource}
 */
export function createPylonWorldsSource(deps = { enabled: false }) {
  // `deps.enabled` is read on every call (not captured once) so a test can
  // flip the flag on the same deps object between calls.
  const list = async () => {
    if (deps.enabled) {
      // TODO(pylon): wire @pleri/pylon SDK. Expected shape:
      //   const client = new PylonClient({ token: scopedToken });
      //   const cloudWorlds = await client.worlds.list();
      //   return cloudWorlds.map((w) => ({
      //     id: w.id,
      //     name: w.displayName ?? null,
      //     status: mapPylonStatus(w.state), // 'running' | 'starting' | ...
      //     services: mapPylonServices(w.services),
      //     source: 'pylon-cloud',
      //   }));
      // Until the SDK lands the enabled path is deliberately empty too, so
      // the mapping shape is not committed prematurely.
      return [];
    }
    // Disabled: short-circuit without any Pylon interaction.
    return [];
  };

  return { name: 'pylon-cloud', list };
}
| // Phase F-2-B (B6): redact sensitive keys from workspace YAML before | ||
| // exposing via /api/workspaces. | ||
| // | ||
| // T11 mitigation. Workspace YAMLs may contain operator-set environment | ||
| // variables that include OAuth client secrets, API keys, deployment | ||
| // tokens, database passwords. These should NEVER cross the host-cp ↔ | ||
| // browser boundary. | ||
| // | ||
| // Strategy: pattern-based recursive redaction. Any object key matching | ||
| // SENSITIVE_KEY_PATTERN replaces its value with `[redacted]`. Catches | ||
| // the standard naming conventions while remaining permissive on | ||
| // non-sensitive keys (we don't false-positive on legitimate config). | ||
| // | ||
| // The pattern is intentionally broad — it's defensive. If an operator | ||
| // names a non-sensitive var with a `_KEY`/`_SECRET`/`_TOKEN`/`_PASSWORD`/ | ||
| // `_CREDENTIALS` suffix, it gets redacted. Operators get a clear signal | ||
| // (the value becomes `[redacted]`) and can rename the var if needed. | ||
| // | ||
| // We deliberately do NOT use the `PROTECTED_ENV_KEYS` set from | ||
| // packages/core/src/world/env-setup.ts — that set is for service- | ||
| // discovery host/port/URL keys (POSTGRES_HOST, REDIS_URL, etc.), not | ||
| // org secrets. The two filters address different surfaces: | ||
| // - PROTECTED_ENV_KEYS in core: prevents manifest from overriding | ||
| // service-discovery state on the world's runtime env | ||
| // - SENSITIVE_KEY_PATTERN here: prevents the host CP API from leaking | ||
| // org secrets to the browser | ||
| // Both are needed. | ||
// Case-insensitive, end-anchored: a key is sensitive when it ENDS with one
// of the listed suffixes (e.g. `DB_PASSWORD`, `github_token`, `clientSecret`).
export const SENSITIVE_KEY_PATTERN = /(.*_KEY|.*_SECRET|.*_TOKEN|.*_PASSWORD|.*_CREDENTIALS|.*_AUTH|API_KEY|PASSWORD|SECRET|TOKEN)$/i;
/**
 * Recursively redact sensitive values in any JSON-like structure
 * (objects, arrays, primitives). Returns a new value; does not mutate
 * input.
 *
 * - Arrays are mapped element by element.
 * - For objects, every own enumerable key matching SENSITIVE_KEY_PATTERN
 *   has its whole value (scalar, array or object) replaced by the string
 *   `[redacted]`; all other values are redacted recursively.
 * - Primitives and null are returned unchanged.
 *
 * The result object is assembled with Object.fromEntries so that every
 * key — including an own `__proto__` key, which YAML/JSON parsers can
 * produce — becomes a plain own data property. Assigning `out[k] = v`
 * would instead run the `__proto__` setter: the key would vanish from the
 * output and the result would inherit properties chosen by the input.
 *
 * @param {unknown} value
 * @returns {unknown}
 */
export function redactSensitive(value) {
  if (Array.isArray(value)) {
    return value.map((item) => redactSensitive(item));
  }
  if (value === null || typeof value !== 'object') {
    return value;
  }
  return Object.fromEntries(
    Object.entries(value).map(([key, child]) => [
      key,
      SENSITIVE_KEY_PATTERN.test(key) ? '[redacted]' : redactSensitive(child),
    ]),
  );
}
/**
 * Report whether a single key name matches SENSITIVE_KEY_PATTERN, i.e.
 * whether redaction would blank the value stored under it. Handy as a
 * cheap pre-filter when walking a large map.
 *
 * @param {string} key
 * @returns {boolean}
 */
export function isSensitiveKey(key) {
  const verdict = SENSITIVE_KEY_PATTERN.test(key);
  return verdict;
}
| // redirect.mjs — Phase B3 (plan-chat-spa-supersedes-control-plane). | ||
| // | ||
| // 301 redirect layer that fronts host-cp's HTTP handler. Maps legacy | ||
| // control-plane routes that get deleted in Phase B4 onto their canonical | ||
| // successors so live URLs in operator history / bookmarks / Slack do not | ||
| // 404 after the deletion lands. | ||
| // | ||
| // Redirect rules (allow-listed; closed set): | ||
| // | ||
| // /plan/:id → no-op (falls through to SPA shell; | ||
| // plan-chat-spa-side router handles the | ||
| // resolver dispatch via useResolveId). | ||
| // Implemented as a sentinel so callers | ||
| // can short-circuit but the request | ||
| // continues to static-serve. | ||
| // /world/:id → 301 /worlds?highlight=:id | ||
| // /sandbox/:id → 301 /worlds?highlight=:id | ||
| // /session/:worldId/plan → 301 /plan/:worldId | ||
| // | ||
| // EXPLICITLY NOT REDIRECTED (more-specific routes still owned by | ||
| // control-plane until Phase E): | ||
| // /world/:id/editor /world/:id/events | ||
| // /sandbox/:id/editor /sandbox/:id/events | ||
| // /inbox/* /workspaces/* | ||
| // /repos /runbooks | ||
| // (/design is no longer in this list: since Phase E2 it 301s to `/`.) | ||
| // | ||
| // Security (per plan-chat-spa-supersedes-control-plane.md K1 SEC-2): | ||
| // - Redirect targets are HARDCODED prefixes (`/plan/`, `/worlds`). No | ||
| // caller-supplied target is ever reflected into Location. | ||
| // - `:id` segment is validated against RESOLVE_ID_RE before any | ||
| // reflection into the Location header; invalid shapes → 400, not | ||
| // 301. This kills open-redirect / response-splitting / header- | ||
| // injection vectors at the door. | ||
| // - `highlight=<id>` query param uses the SAME shape regex. We do not | ||
| // trust the inbound URL beyond the regex match (no decoding, no | ||
| // surrogate pair handling). | ||
| // | ||
| // Returns one of: | ||
| // { kind: 'redirect', status: 301, location: '<target>' } | ||
| // { kind: 'bad-request', status: 400, message: '<reason>' } | ||
| // { kind: 'passthrough' } — caller continues normal request flow | ||
| import { RESOLVE_ID_RE } from './resolver.mjs'; | ||
/**
 * Classify a request pathname for the legacy-route redirect layer.
 * Side-effect free: nothing is read from the request body and nothing is
 * written to the response.
 *
 * Rules, checked in this order:
 *   1. /session/:worldId/plan[/]            → 301 /plan/:worldId
 *   2. /design or /design/ (exact)          → 301 /
 *   3. /(world|sandbox)/:id[/...]           → 301 /worlds?highlight=:id,
 *      except the /editor and /events sub-routes, which pass through.
 *   4. anything else (including /plan/:id)  → passthrough
 * An id that fails RESOLVE_ID_RE yields a 400 verdict, never a redirect,
 * so nothing unvalidated is reflected into Location.
 *
 * @param {string} pathname - URL.pathname (no querystring, no hash)
 * @returns {{ kind: 'redirect', status: 301, location: string }
 *         | { kind: 'bad-request', status: 400, message: string }
 *         | { kind: 'passthrough' }}
 */
export function evaluateRedirect(pathname) {
  // Verdict builders — each call produces a fresh object.
  const passthrough = () => ({ kind: 'passthrough' });
  const movedTo = (location) => ({ kind: 'redirect', status: 301, location });
  const badRequest = (message) => ({ kind: 'bad-request', status: 400, message });

  const usable = typeof pathname === 'string' && pathname.length > 0;
  if (!usable) return passthrough();

  // Rule 1 runs first so the `/session/...` prefix is settled before the
  // world/sandbox catch-all is consulted.
  const sessionHit = /^\/session\/([^/]+)\/plan\/?$/.exec(pathname);
  if (sessionHit !== null) {
    const [, worldId] = sessionHit;
    return RESOLVE_ID_RE.test(worldId)
      ? movedTo(`/plan/${worldId}`)
      : badRequest('invalid worldId shape on /session/:worldId/plan');
  }

  // Rule 2: exact match only, so /designfoo and /design/sub are untouched.
  if (['/design', '/design/'].includes(pathname)) {
    return movedTo('/');
  }

  // Rule 3.
  const legacyHit = /^\/(world|sandbox)\/([^/]+)(\/.*)?$/.exec(pathname);
  if (legacyHit === null) {
    // Rule 4. /plan/:id lands here on purpose: the SPA shell serves it, and
    // emitting a 301 would be a self-loop.
    return passthrough();
  }

  const id = legacyHit[2];
  const tail = legacyHit[3];
  // Sub-routes the control-plane still owns; checked BEFORE id validation,
  // so they pass through whatever the id looks like.
  const stillOwned = tail !== undefined
    && ['/editor', '/events'].some((sub) => tail === sub || tail.startsWith(`${sub}/`));
  if (stillOwned) return passthrough();

  if (!RESOLVE_ID_RE.test(id)) {
    return badRequest('invalid id shape on /(world|sandbox)/:id');
  }
  return movedTo(`/worlds?highlight=${encodeURIComponent(id)}`);
}
/**
 * Write the response that corresponds to a redirect verdict.
 *
 *   - 'redirect'    → 301 with Location plus a short plain-text body
 *   - 'bad-request' → 400 with a JSON error body
 *   - 'passthrough' or any unrecognised kind → nothing is written
 *
 * @param {import('node:http').ServerResponse} res
 * @param {ReturnType<typeof evaluateRedirect>} verdict
 * @returns {boolean} true when the response has been sent (the caller must
 *   stop), false when the caller should carry on with normal handling.
 */
export function applyRedirect(res, verdict) {
  switch (verdict.kind) {
    case 'redirect': {
      const { location } = verdict;
      const headers = {
        Location: location,
        // Five-minute cache: bookmarks converge on the new URL, while a
        // mistyped URL does not stay pinned to a stale redirect.
        'Cache-Control': 'public, max-age=300',
        'Content-Type': 'text/plain; charset=utf-8',
      };
      res.writeHead(301, headers);
      res.end(`Moved permanently: ${location}\n`);
      return true;
    }
    case 'bad-request': {
      const payload = { error: 'bad-request', message: verdict.message };
      res.writeHead(400, {
        'Content-Type': 'application/json; charset=utf-8',
        'Cache-Control': 'no-store',
      });
      res.end(JSON.stringify(payload));
      return true;
    }
    default:
      // 'passthrough', and defensively any unknown verdict shape.
      return false;
  }
}
| // resolver.mjs — Phase A A1 (plan-chat-spa-supersedes-control-plane). | ||
| // | ||
| // Disambiguates a single opaque :id supplied on /plan/:id between | ||
| // | ||
| // - a planning session (planning_sessions.session_id), or | ||
| // - a crystallized world (planning_artifacts.crystallized_world_id), or | ||
| // - unresolvable (returns {kind:'unresolved', canonical_id:null}). | ||
| // | ||
| // Used by plan-chat-spa's useResolveId hook (Phase A A2) so the SPA's | ||
| // cold-open path can mount the correct surface without trusting the | ||
| // id-shape (sentinel `sess_*` prefix is a hint, not authority — see | ||
| // plan-chat-spa-supersedes-control-plane.md K1 SEC-1). | ||
| // | ||
| // Single SQL query (UNION ALL) so resolution costs one round-trip even | ||
| // when the id misses both tables. Bearer auth + rate-limit live in the | ||
| // HTTP handler in plan-chat-service.mjs; this helper is pool-pure for | ||
| // unit testability. | ||
/**
 * Accepted shape for a resolver :id — 6 to 80 characters drawn from
 * letters, digits, dot, underscore and hyphen. The 6-character floor
 * keeps very short ids out of reach of enumeration.
 */
export const RESOLVE_ID_RE = /^[A-Za-z0-9._-]{6,80}$/;
/**
 * Look an opaque id up as a planning session or a crystallized world.
 *
 * Ids that are not strings or fail RESOLVE_ID_RE are answered as
 * unresolved without touching the pool. Otherwise exactly one query is
 * issued, with the id as its only bind parameter.
 *
 * @param {{ query: (sql: string, params: unknown[]) => Promise<{ rows: unknown[] }> }} pool
 *   Anything pg-shaped: production passes pg.Pool, tests pass a stub.
 * @param {string} id The candidate id.
 * @returns {Promise<{ kind: 'session' | 'world' | 'unresolved', canonical_id: string | null }>}
 */
export async function resolveId(pool, id) {
  const unresolved = () => ({ kind: 'unresolved', canonical_id: null });

  const shapeOk = typeof id === 'string' && RESOLVE_ID_RE.test(id);
  if (!shapeOk) return unresolved();

  // One round-trip for both tables. Each branch projects the same
  // (kind, canonical_id) pair plus a rank; ORDER BY rank + LIMIT 1 makes
  // the session branch win deterministically if an id exists in both.
  const sql = `
    SELECT kind, canonical_id FROM (
      SELECT 'session' AS kind, session_id AS canonical_id, 1 AS rank
      FROM planning_sessions
      WHERE session_id = $1
      UNION ALL
      SELECT 'world' AS kind, crystallized_world_id AS canonical_id, 2 AS rank
      FROM planning_artifacts
      WHERE crystallized_world_id = $1
    ) AS resolved
    ORDER BY rank
    LIMIT 1
  `;
  const result = await pool.query(sql, [id]);
  const rows = result.rows;
  const first = rows ? rows[0] : rows;
  if (!first) return unresolved();

  // Accept upper-cased column names as well, so pool stubs (and drivers
  // that fold identifier case) both work.
  const kind = first.kind ?? first.KIND;
  const canonicalId = first.canonical_id ?? first.CANONICAL_ID;

  const kindKnown = kind === 'session' || kind === 'world';
  const idUsable = typeof canonicalId === 'string' && canonicalId.length > 0;
  if (!kindKnown || !idUsable) return unresolved();

  return { kind, canonical_id: canonicalId };
}
/**
 * In-memory token-bucket rate limiter keyed by caller (one bucket per key;
 * intended key: the bearer principal). It exists so an authenticated
 * caller cannot enumerate ids at line rate.
 *
 * Each bucket starts full with `capacity` tokens and refills continuously
 * at `capacity` tokens per `windowMs` (defaults: 60 per 60 000 ms), capped
 * at `capacity`. State lives in a per-instance Map, so limits are per
 * process; sharing them across instances would need an external store.
 *
 * @param {object} [opts]
 * @param {number} [opts.capacity] bucket size, and tokens refilled per window
 * @param {number} [opts.windowMs] refill window in milliseconds
 * @param {() => number} [opts.now] clock in epoch ms (injectable for tests)
 * @returns {{ take: (key: unknown) => { allowed: boolean, retryAfterMs: number }, reset: () => void }}
 */
export function createRateLimiter({
  capacity = 60,
  windowMs = 60_000,
  now = () => Date.now(),
} = {}) {
  /** key -> { tokens, lastRefill } */
  const buckets = new Map();

  // Fetch the bucket for `key`, creating a full one stamped `at` on first use.
  const bucketFor = (key, at) => {
    const existing = buckets.get(key);
    if (existing) return existing;
    const fresh = { tokens: capacity, lastRefill: at };
    buckets.set(key, fresh);
    return fresh;
  };

  return {
    /**
     * Try to spend one token for `key`. On refusal, `retryAfterMs` is the
     * time until the bucket holds a whole token again, rounded up.
     */
    take(key) {
      const at = now();
      const bucket = bucketFor(key, at);

      // Credit tokens in proportion to the time since the last refill. A
      // clock that has not advanced (or went backwards) credits nothing
      // and leaves the refill stamp untouched.
      const sinceRefill = at - bucket.lastRefill;
      if (sinceRefill > 0) {
        const earned = (sinceRefill / windowMs) * capacity;
        bucket.tokens = Math.min(capacity, bucket.tokens + earned);
        bucket.lastRefill = at;
      }

      if (bucket.tokens < 1) {
        const shortfall = 1 - bucket.tokens;
        return { allowed: false, retryAfterMs: Math.ceil(shortfall * (windowMs / capacity)) };
      }
      bucket.tokens -= 1;
      return { allowed: true, retryAfterMs: 0 };
    },

    /** Forget every bucket; all keys start full again. */
    reset() {
      buckets.clear();
    },
  };
}
| // host-cp request router. | ||
| // | ||
| // Replaces the long linear `if (url.pathname === ...)` dispatch chain in | ||
| // server.mjs with an ordered route table. The table is walked in | ||
| // registration order, so route PRECEDENCE is preserved exactly as it was | ||
| // in the original if-ladder: the first matching route wins, later routes | ||
| // are never consulted once a match handles the request. | ||
| // | ||
| // Why a table and not a framework: | ||
| // - host-cp ships with no external HTTP framework (no express/fastify); | ||
| // this matches the existing zero-dep style. | ||
| // - The table is a plain data structure, so it is importable + unit | ||
| // testable WITHOUT booting server.mjs (which spawns docker-events, | ||
| // the auth poller, and the worlds.db reconciler at import time). | ||
| // - A route is now a table entry instead of a `return` buried in a | ||
| // 1700-line ladder. That kills the silent route-shadowing class: a | ||
| // misplaced `return` can no longer swallow a later route, and the | ||
| // full set of routes is enumerable (see `router.list()`). | ||
| // | ||
| // Behavior-preservation contract (load-bearing — see | ||
| // __tests__/router.test.mjs): | ||
| // 1. Walk order == registration order == original source order. | ||
| // 2. A route MATCHES when its matcher returns a truthy match value AND | ||
| // (no method filter OR the method matches). The matcher receives | ||
| // ({ pathname, method, url }) and returns either a boolean or, for | ||
| // regex routes, the RegExpMatchArray (truthy) so the handler can read | ||
| // capture groups. | ||
| // 3. The FIRST matching route is invoked and dispatch STOPS — identical | ||
| // to `if (cond) { ...; return; }`. The handler owns the response. | ||
| // 4. A route whose path matches but whose METHOD does not is SKIPPED, | ||
| // and the walk continues — identical to the original | ||
| // `if (pathMatch && req.method === 'X')` blocks, where a path hit | ||
| // with the wrong method fell through to the next `if`. | ||
| // 5. If no route matches, dispatch returns `false` so the caller runs | ||
| // its terminal 404 — identical to the original fall-through. | ||
| // | ||
| // The router does NOT add auth, body parsing, or any middleware semantics. | ||
| // Those stay exactly where they were in server.mjs (pre-auth routes, the | ||
| // auth gate, the plan-chat bypass) — the router only models the part of | ||
| // the chain that was a flat sequence of `if` blocks. | ||
| /** | ||
| * @typedef {object} RouteContext | ||
| * @property {string} pathname url.pathname | ||
| * @property {string} method req.method (already normalized by node to uppercase) | ||
| * @property {URL} url parsed request URL | ||
| */ | ||
| /** | ||
| * A matcher decides whether a route applies to a request, ignoring method. | ||
| * Returning a non-boolean truthy value (e.g. a RegExpMatchArray) is | ||
| * forwarded to the handler as `ctx.match` so regex routes can read groups. | ||
| * | ||
| * @typedef {(ctx: RouteContext) => (boolean | RegExpMatchArray | null | undefined)} RouteMatcher | ||
| */ | ||
| /** | ||
| * A handler receives the node req/res plus the parsed url, the matched | ||
| * value (for regex routes), and is responsible for writing the response. | ||
| * It mirrors the body of an original `if` block. Return value is ignored; | ||
| * matching alone terminates dispatch (preserving the `if ... return` | ||
| * semantics where reaching the block always handled the request). | ||
| * | ||
| * @typedef {(req: import('node:http').IncomingMessage, res: import('node:http').ServerResponse, ctx: RouteContext & { match: any }) => unknown | Promise<unknown>} RouteHandler | ||
| */ | ||
| /** | ||
| * @typedef {object} Route | ||
| * @property {string} name human label for diagnostics / tests | ||
| * @property {string[] | null} methods allowed methods, or null for "any method" | ||
| * @property {RouteMatcher} match | ||
| * @property {RouteHandler} handler | ||
| */ | ||
/**
 * Create an ordered router. Routes are matched in the order they are
 * registered — register in the SAME order the original if-ladder ran.
 *
 * The returned object exposes `register`, `dispatch`, `list` and a
 * read-only `size` (number of registered routes).
 */
export function createRouter() {
  /** @type {Route[]} */
  const routes = [];

  /**
   * Register a route. Returns the router for chaining.
   *
   * Matcher precedence when several are supplied: `match`, then `path`,
   * then `prefix`, then `pattern`.
   *
   * @param {object} spec
   * @param {string} spec.name
   * @param {string | string[] | null} [spec.method] single method, list, or null/omitted for any
   * @param {string} [spec.path] exact pathname match
   * @param {string} [spec.prefix] pathname.startsWith(prefix) match
   * @param {RegExp} [spec.pattern] pathname.match(pattern) — match value passed to handler
   * @param {RouteMatcher} [spec.match] custom matcher (overrides path/prefix/pattern)
   * @param {RouteHandler} spec.handler
   * @throws {TypeError} when `handler` is not a function or no matcher is given
   */
  function register(spec) {
    const { name, method, path, prefix, pattern } = spec;
    const handler = spec.handler;
    if (typeof handler !== 'function') {
      throw new TypeError(`route "${name}" requires a handler function`);
    }

    // Copy a method list so later changes to the caller's array cannot
    // alter the route. Omitted or null → any method.
    /** @type {string[] | null} */
    let methods = null;
    if (Array.isArray(method)) methods = method.slice();
    else if (typeof method === 'string') methods = [method];

    /** @type {RouteMatcher} */
    let match;
    if (typeof spec.match === 'function') {
      match = spec.match;
    } else if (typeof path === 'string') {
      match = (ctx) => ctx.pathname === path;
    } else if (typeof prefix === 'string') {
      match = (ctx) => ctx.pathname.startsWith(prefix);
    } else if (pattern instanceof RegExp) {
      match = (ctx) => ctx.pathname.match(pattern);
    } else {
      throw new TypeError(
        `route "${name}" requires one of: path, prefix, pattern, or match`,
      );
    }

    routes.push({ name, methods, match, handler });
    return api;
  }

  /**
   * Walk the table in registration order. Invokes the first route whose
   * matcher is truthy AND whose method filter admits the request, then
   * stops. A path-match with a non-admitted method is skipped (the walk
   * continues), preserving the original `if (pathMatch && method===X)`
   * fall-through.
   *
   * @param {import('node:http').IncomingMessage} req
   * @param {import('node:http').ServerResponse} res
   * @param {URL} url
   * @returns {Promise<boolean>} true if a route handled the request, false to fall through to 404
   */
  async function dispatch(req, res, url) {
    const ctx = { pathname: url.pathname, method: req.method ?? 'GET', url };
    for (const route of routes) {
      const matched = route.match(ctx);
      if (!matched) continue;
      // Path matched. Now gate on method — a mismatch is a SKIP, not a
      // 405, exactly mirroring the original if-ladder fall-through.
      if (route.methods !== null && !route.methods.includes(ctx.method)) {
        continue;
      }
      // The handler gets its own context object carrying the match value.
      await route.handler(req, res, { ...ctx, match: matched });
      return true;
    }
    return false;
  }

  /**
   * Enumerate registered routes (name + methods) for diagnostics, audits,
   * and tests. Pure read of the table: each entry is a fresh object with
   * its own copy of the methods array, so changing the returned snapshot
   * cannot change which methods a route admits.
   *
   * @returns {Array<{ name: string, methods: string[] | null }>}
   */
  function list() {
    return routes.map(({ name, methods }) => ({
      name,
      methods: methods === null ? null : [...methods],
    }));
  }

  const api = { register, dispatch, list, get size() { return routes.length; } };
  return api;
}
| // Phase F-2-B (B3): per-world secret cache. | ||
| // | ||
| // Pattern lifted from `packages/cloudflare-worker/src/index.ts:428-446` | ||
| // (`getContainerSecret`). CF Worker uses Durable Object storage with a | ||
| // 1h TTL; host CP uses an in-memory Map with a 5min TTL (D2 — demoted | ||
| // from 1h after the security review pass). | ||
| // | ||
| // The cache invalidates on two paths: | ||
| // 1. TTL expiry (lazy: checked on each `get(worldId)` call) | ||
| // 2. Docker events stream (eager: docker-events.mjs subscribes to | ||
| // `restart` / `stop` events and calls `invalidate(worldId)` — | ||
| // M2 ship gate is "docker restart <world>; within 10s, proxy | ||
| // call returns 200 not 401"). | ||
| /** | ||
| * @typedef {object} CacheEntry | ||
| * @property {string} secret | ||
| * @property {number} expiresAt epoch ms | ||
| */ | ||
export class SecretCache {
  /**
   * In-memory per-world secret cache with a fixed TTL per entry.
   *
   * @param {object} opts
   * @param {number} [opts.ttlSec] cache TTL in seconds (default 300 = 5min)
   * @param {() => number} [opts.now] clock injectable for tests
   * @param {(message: string) => void} [opts.log] logger injectable for tests
   */
  constructor({ ttlSec = 300, now = Date.now, log = console.log } = {}) {
    this.ttlMs = ttlSec * 1000;
    /** @type {Map<string, CacheEntry>} */
    this.entries = new Map();
    this.now = now;
    this.log = log;
  }

  /**
   * Look up a cached secret. Returns null if absent OR expired; on a miss
   * the caller must re-fetch and call set(). An expired entry is removed
   * on the way out. Expiry is deliberately silent — logging on every
   * lookup would be noisy under load.
   *
   * @param {string} worldId
   * @returns {string | null}
   */
  get(worldId) {
    const entry = this.entries.get(worldId);
    if (!entry) return null;
    if (entry.expiresAt <= this.now()) {
      this.entries.delete(worldId);
      return null;
    }
    return entry.secret;
  }

  /**
   * Cache a freshly-fetched secret. Overrides any prior entry. The
   * `set` path is the only place TTL is reset — ensures a cache hit
   * never extends beyond ttlMs from the most recent fetch.
   *
   * @param {string} worldId
   * @param {string} secret
   */
  set(worldId, secret) {
    this.entries.set(worldId, {
      secret,
      expiresAt: this.now() + this.ttlMs,
    });
  }

  /**
   * Eager invalidation: drop the entry for `worldId` and log that it was
   * dropped. Returns true if an entry was present (test-observable);
   * nothing is logged when there was none.
   *
   * @param {string} worldId
   * @returns {boolean}
   */
  invalidate(worldId) {
    const had = this.entries.has(worldId);
    if (had) {
      this.entries.delete(worldId);
      this.log(`secret-cache: invalidated ${worldId}`);
    }
    return had;
  }

  /**
   * Drop everything. Used at shutdown for clean teardown; also useful
   * in tests.
   */
  clear() {
    this.entries.clear();
  }

  /**
   * Remove every entry whose TTL has lapsed. get() only evicts the one id
   * it is asked about, so without a sweep an expired secret for a world
   * that is never looked up again would stay in memory indefinitely.
   *
   * @returns {number} how many entries were removed
   */
  purgeExpired() {
    const at = this.now();
    let removed = 0;
    for (const [worldId, entry] of this.entries) {
      if (entry.expiresAt <= at) {
        this.entries.delete(worldId);
        removed += 1;
      }
    }
    return removed;
  }

  /**
   * Snapshot of worldIds that currently have a live (non-expired) secret.
   * Expired entries are evicted first so the snapshot agrees with what
   * get() would return. Returns just the keys — never the secrets.
   *
   * @returns {string[]}
   */
  worldIds() {
    this.purgeExpired();
    return [...this.entries.keys()];
  }
}
| // serve-only-config.mjs — host-cp SERVE-ONLY mode gate (Phase A of | ||
| // host-cp-gke-serve-only-mode). | ||
| // | ||
| // host-cp normally runs as a local operator sidecar coupled to the host's | ||
| // docker daemon + operator-repo + gh-config. On a managed GKE cluster those | ||
| // host-couplings are absent: host-cp only serves plan-chat-spa + the | ||
| // host-native `/api/*` surface; world orchestration runs elsewhere. | ||
| // | ||
| // `OLAM_HOST_CP_SERVE_ONLY=true` switches host-cp into that degraded shape: | ||
| // - no docker transport connect, no world discovery | ||
| // - no PlanOrchestrator docker wiring, no pr-merge-poller docker/repo deps | ||
| // - world-orchestration routes (`/api/world/*`) return a structured 503 | ||
| // - version-status degrades to 'unknown' (no operator-repo) | ||
| // | ||
| // The flag defaults OFF — the local docker/k3d FULL mode is byte-for-byte | ||
| // unchanged. This module is a tiny pure seam so the gate decision can be | ||
| // unit-tested WITHOUT booting server.mjs (which connects docker + binds a | ||
| // port at module load and therefore can't be imported in a test). | ||
| // | ||
| // ONE coarse flag — no granular per-subsystem toggles (plan S1 / YAGNI). | ||
/**
 * Report whether host-cp should run in SERVE-ONLY mode.
 *
 * The check is an exact string comparison: `OLAM_HOST_CP_SERVE_ONLY` must be
 * the literal `'true'`. Every other value — unset, `'1'`, `'false'`, `''`,
 * `'TRUE'` — leaves host-cp in FULL mode, so the flag is OFF by default and
 * cannot be switched on by a near-miss value.
 *
 * @param {NodeJS.ProcessEnv | Record<string, string | undefined>} [env]
 *   Environment to read `OLAM_HOST_CP_SERVE_ONLY` from. Defaults to
 *   `process.env`.
 * @returns {boolean} `true` only when serve-only mode is active.
 */
export function isServeOnly(env = process.env) {
  // Optional chaining keeps an explicit `null` env from throwing.
  const flag = env?.OLAM_HOST_CP_SERVE_ONLY;
  return flag === 'true';
}
/**
 * Frozen JSON body sent with the 503 that world-orchestration routes return
 * while host-cp is in serve-only mode. It follows the `{ error, message }`
 * shape so clients can handle it like any other `/api/*` JSON error.
 *
 * @type {{ error: 'orchestration_unavailable', message: string }}
 */
export const ORCHESTRATION_UNAVAILABLE = Object.freeze({
  error: 'orchestration_unavailable',
  message: 'host-cp is in serve-only mode (managed cluster); world orchestration runs elsewhere',
});
/**
 * Decide whether a request targets a world-ORCHESTRATION route, i.e. one that
 * must be answered with the structured 503 while host-cp is in serve-only
 * mode.
 *
 * Covered surface:
 *   - `/api/world/<id>/...`  — singular per-world CP proxy (+ /progress)
 *   - `/v1/worlds/...`       — CLI per-world routes
 *   - `/api/worlds/<id>...`  — plural per-world routes (e.g. tunnels, destroy)
 *   - `/api/worlds` with any method other than GET/HEAD (e.g. create via POST)
 *
 * The breadth is deliberate: guarding only the singular prefix would let
 * per-world mutations such as `POST /api/worlds/<id>/tunnels` run on a
 * serve-only host-cp.
 *
 * NOT orchestration: `GET`/`HEAD /api/worlds` (the bare LIST endpoint), which
 * stays reachable in serve-only mode.
 *
 * @param {unknown} pathname URL.pathname (no querystring).
 * @param {string} [method] HTTP method (defaults 'GET').
 * @returns {boolean}
 */
export function isOrchestrationRoute(pathname, method = 'GET') {
  if (typeof pathname !== 'string') {
    return false;
  }
  // Bare plural collection: reads pass through, everything else is blocked.
  if (pathname === '/api/worlds') {
    const isListRead = method === 'GET' || method === 'HEAD';
    return !isListRead;
  }
  // Prefixes that are orchestration regardless of method.
  const blockedPrefixes = ['/api/world/', '/v1/worlds/'];
  if (blockedPrefixes.some((prefix) => pathname.startsWith(prefix))) {
    return true;
  }
  // Any per-world subpath under the plural collection (needs a non-empty id).
  return /^\/api\/worlds\/[^/?#]+/.test(pathname);
}
Sorry, the diff of this file is too big to display
| // Phase F-2-B (B5): SSE concurrent-connection gate + path detection. | ||
| // | ||
| // Background. Each open SSE proxy holds: | ||
| // - A Node http.ClientRequest to the per-world CP (one fd) | ||
| // - The browser's incoming socket (one fd) | ||
| // Plus the Node event loop wakes on every chunk. With N worlds × M tabs | ||
| // × Sse-per-tab, the FD budget grows linearly. P3 budgets ≤100 concurrent | ||
| // SSE proxies; P4 caps at 50 + returns 503 with Retry-After: 30 above | ||
| // that. Below the cap there's no impact. | ||
| // | ||
| // Cap semantics: | ||
| // - acquire() returns a { release } handle if we're allowed to open; | ||
| // null → rejected (a 503 has already been written to the response). | ||
| // - release() is idempotent + fire-once (guarded by a `released` flag) | ||
| // because Node emits both 'close' and 'finish' on a normal stream | ||
| // end. Without idempotency the counter would underflow. | ||
| // | ||
| // SSE detection is path-based (cheap; runs before opening upstream). | ||
| // Three patterns are SSE today: | ||
| // /api/stream — per-world CP's existing SSE feed | ||
| // /api/world/<id>/bootstrap-progress — placeholder for B7's UI strip | ||
| // (per-world CP route lands later) | ||
| // /api/logs — also matched by SSE_PATH_PATTERNS below | ||
// Route fragments treated as SSE streams. Each pattern is unanchored at the
// start and requires the fragment to be followed by `/`, `?`, or end of string.
const SSE_PATH_PATTERNS = [
  /\/api\/stream(?:\/|$|\?)/,
  /\/bootstrap-progress(?:\/|$|\?)/,
  /\/api\/logs(?:\/|$|\?)/,
];
/**
 * Tell whether an upstream subPath is one of the known SSE stream routes.
 *
 * The subPath is the inner route produced by `parseProxyPath()` — the part
 * AFTER `/api/world/<id>` — so matching is done against the inner route
 * rather than the `/api/world/<id>` prefix.
 *
 * @param {string} subPath
 * @returns {boolean} true when any entry of SSE_PATH_PATTERNS matches.
 */
export function isSsePath(subPath) {
  for (const pattern of SSE_PATH_PATTERNS) {
    if (pattern.test(subPath)) {
      return true;
    }
  }
  return false;
}
/**
 * Counting gate that caps the number of concurrently open SSE proxies.
 * `acquire()` hands out a slot (or answers 503 when the cap is reached) and
 * the returned `release` closure gives the slot back exactly once.
 */
export class SseGate {
  /**
   * @param {object} opts
   * @param {number} [opts.maxConcurrent] default 50 (P4 cap)
   * @param {(message: string) => void} [opts.log]
   * @throws {Error} when maxConcurrent is not a value >= 1 (including NaN).
   */
  constructor({ maxConcurrent = 50, log = console.log } = {}) {
    // `!(x >= 1)` instead of `x < 1`: NaN compares false in both directions,
    // so the `<` form would accept NaN and `active >= NaN` in acquire() would
    // then never trip — silently disabling the cap.
    if (!(maxConcurrent >= 1)) {
      throw new Error('SseGate: maxConcurrent must be >= 1');
    }
    this.maxConcurrent = maxConcurrent;
    this.active = 0;
    this.log = log;
  }

  /**
   * Try to acquire a slot. If at cap, returns null + writes a 503 (with
   * `Retry-After: 30` and a JSON body) to res. Caller MUST check the return
   * value.
   *
   * @param {import('node:http').ServerResponse} res
   * @returns {{ release: () => void } | null}
   */
  acquire(res) {
    if (this.active >= this.maxConcurrent) {
      res.writeHead(503, {
        'Content-Type': 'application/json; charset=utf-8',
        'Retry-After': '30',
      });
      res.end(JSON.stringify({
        error: 'sse_capacity_reached',
        active: this.active,
        cap: this.maxConcurrent,
        retry_after_sec: 30,
        message: 'host CP has reached the SSE concurrent-connection cap. Retry after the indicated delay or close idle SPA tabs.',
      }));
      this.log(`sse-gate: 503 — cap reached (active=${this.active}, cap=${this.maxConcurrent})`);
      return null;
    }
    this.active++;
    // Per-slot flag: release may be wired to several response events, but the
    // counter must only be decremented once or it would underflow.
    let released = false;
    const release = () => {
      if (released) return;
      released = true;
      this.active--;
    };
    return { release };
  }

  /**
   * Diagnostics for /health.
   *
   * @returns {{ active: number, cap: number }}
   */
  stats() {
    return {
      active: this.active,
      cap: this.maxConcurrent,
    };
  }
}
/**
 * Attach an SSE-gate `release` callback to a ServerResponse so the slot is
 * returned however the response terminates.
 *
 * Node's http response emits 'close' (client disconnected) and 'finish'
 * (response.end() called) on different code paths, so the callback is
 * registered for both:
 *   - browser closes tab    → 'close' on res
 *   - upstream EOF + res.end → 'finish' on res
 *   - error in proxy         → 'close' on res
 *
 * `release` may therefore be invoked more than once; it is expected to be
 * idempotent (the closure returned by SseGate#acquire is).
 *
 * @param {import('node:http').ServerResponse} res
 * @param {() => void} release
 */
export function wireRelease(res, release) {
  for (const eventName of ['close', 'finish']) {
    res.on(eventName, release);
  }
}
| // packages/host-cp/src/tasks-route.mjs | ||
| // | ||
| // B2.2: mount @olam/tasks-write-api's framework-neutral handlers under | ||
| // /api/tasks/*. host-cp owns the pg.Pool (per D-B-19 olam-local-PG-primary); | ||
| // wraps it via pgPoolExecutor (B2.1.1 adapter) and passes as HandlerDeps.pglite | ||
| // (duck-typed; PgExecutor's query/exec/transaction match PGlite's shape). | ||
| // | ||
| // Auth model: leverages host-cp's existing StartupToken bearer gate (Authorization: | ||
| // Bearer <token>). Per-request scopes + olamNodeId come from headers: | ||
| // X-Olam-Node-Id: UUID of the caller's olam node (sets RLS scope per D-B-23) | ||
| // X-Olam-Session-Id: UUID of the caller's session row (FK for task_claims) | ||
| // X-Olam-Tasks-Scopes: comma-separated scope list (tasks-create,tasks-claim, | ||
| // tasks-state-update,tasks-query). Trust model: bearer | ||
| // token gates access; scope header lets the caller declare | ||
| // narrower intent. | ||
| // | ||
| // Deviation from B2.2 plan spec: spec called for JWT + auth-service integration; | ||
| // host-cp uses opaque tokens (StartupToken) and calls auth-service over HTTP. | ||
| // JWT scope encoding deferred to Phase D++ when multi-user auth lands; for v1, | ||
| // the existing bearer + per-request header model is sufficient (single-operator; | ||
| // 127.0.0.1:19000 only per host-cp threat model). | ||
| import pg from 'pg'; | ||
// Parse BIGINT (type OID 20) columns into Number instead of pg's default
// string. The tasks schema's `version` column is BIGINT but is expected to
// stay well within the Number-safe range, and the task-store types declare it
// as `number`. Keeping the value out of BigInt form also matters for
// responses: JSON.stringify throws on BigInt ("Do not know how to serialize a
// BigInt"), which previously caused /api/tasks 500s.
pg.types.setTypeParser(20, (raw) => {
  if (raw == null) {
    return null;
  }
  return Number.parseInt(raw, 10);
});
// Scope names accepted in the X-Olam-Tasks-Scopes header; anything else is
// dropped during header parsing.
const VALID_SCOPES = new Set(['tasks-create', 'tasks-claim', 'tasks-state-update', 'tasks-query']);
// Canonical 8-4-4-4-12 hex UUID, case-insensitive.
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
// Lazily-populated module state (all start empty; see _resetForTests).
let writeApi = null; // tasks-write-api module, loaded on first use to keep the cold path light
let executor = null; // cached executor built around the pool
let pool = null; // pg.Pool owned by this module
/**
 * Load `@olam/tasks-write-api` on first use and cache the module in the
 * module-level `writeApi` binding.
 *
 * The package is built TS (ESM dist); if it has not been built the dynamic
 * import rejects and the error propagates to the caller — the operator must
 * run `npm run build --workspace=@olam/tasks-write-api` before host-cp starts.
 *
 * @returns {Promise<object>} the imported module namespace.
 */
async function ensureWriteApi() {
  if (!writeApi) {
    writeApi = await import('@olam/tasks-write-api');
  }
  return writeApi;
}
/**
 * Return the shared executor, creating the pg.Pool and wrapping it with
 * `writeApi.pgPoolExecutor` on first use.
 *
 * The result is stored in the module-level `executor` binding so later calls
 * reuse the same pool. (Previously the executor was returned without being
 * cached, so every call constructed a brand-new pg.Pool and orphaned the
 * previous one.)
 *
 * @returns {object} executor wrapping the module's pg.Pool.
 * @throws {Error} when OLAM_LOCAL_PG_URL is unset, or when the write API has
 *   not been loaded yet (ensureWriteApi must resolve first).
 */
function ensureExecutor() {
  if (executor) return executor;
  const connectionString = process.env.OLAM_LOCAL_PG_URL;
  if (!connectionString) {
    throw new Error(
      'tasks-route: OLAM_LOCAL_PG_URL not set. Bring up Docker PG: docker compose -f packages/infra/docker-compose.local-electric.yml up -d, then export OLAM_LOCAL_PG_URL=postgres://postgres:olam@localhost:54331/olam_tasks',
    );
  }
  // pgPoolExecutor comes from the dynamically imported module. Check before
  // building a pool so a wrong call order fails with a clear message and
  // without leaving an unused pool behind.
  if (!writeApi) {
    throw new Error('tasks-route: tasks-write-api not loaded; ensureWriteApi() must resolve before ensureExecutor()');
  }
  const freshPool = new pg.Pool({ connectionString, max: 8 });
  executor = writeApi.pgPoolExecutor(freshPool);
  // Publish the pool only once the executor exists, so `pool` and `executor`
  // are always set together.
  pool = freshPool;
  return executor;
}
/**
 * Extract the caller's identity and declared scopes from request headers.
 *
 * Reads `x-olam-node-id`, `x-olam-session-id` (both must be UUIDs) and
 * `x-olam-tasks-scopes` (comma-separated; entries outside VALID_SCOPES are
 * dropped). Returns null when either id is missing/malformed or when no
 * valid scope remains.
 *
 * @param {{ headers: Record<string, string | string[] | undefined> }} req
 * @returns {{ olamNodeId: string, sessionId: string, scopes: string[] } | null}
 */
function parseAuth(req) {
  const { headers } = req;
  const rawNodeId = headers['x-olam-node-id'];
  const rawSessionId = headers['x-olam-session-id'];
  const rawScopes = String(headers['x-olam-tasks-scopes'] ?? '');

  const scopes = [];
  for (const piece of rawScopes.split(',')) {
    const candidate = piece.trim();
    if (VALID_SCOPES.has(candidate)) {
      scopes.push(candidate);
    }
  }

  const looksLikeUuid = (value) => Boolean(value) && UUID_RE.test(String(value));
  if (!looksLikeUuid(rawNodeId) || !looksLikeUuid(rawSessionId)) {
    return null;
  }
  if (scopes.length === 0) {
    return null;
  }
  return { olamNodeId: String(rawNodeId), sessionId: String(rawSessionId), scopes };
}
/**
 * Read and JSON-parse a request body.
 *
 * - GET/HEAD requests resolve to `{}` without touching the stream.
 * - An empty body resolves to `{}`.
 * - A body that is not valid JSON resolves to the sentinel
 *   `{ __invalid: true }` (the caller turns that into a 400).
 * - A stream 'error' rejects the promise.
 *
 * Chunks are collected as Buffers and decoded once at the end. Decoding each
 * chunk separately (string `+=`) corrupts any multi-byte UTF-8 character that
 * happens to straddle a chunk boundary.
 *
 * @param {import('node:http').IncomingMessage} req
 * @returns {Promise<any>} parsed JSON value, `{}`, or `{ __invalid: true }`.
 */
async function readBody(req) {
  if (req.method === 'GET' || req.method === 'HEAD') return {};
  return new Promise((resolve, reject) => {
    const chunks = [];
    req.on('data', (chunk) => {
      // Streams with an encoding set deliver strings; normalise to Buffer.
      chunks.push(typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : chunk);
    });
    req.on('end', () => {
      const raw = Buffer.concat(chunks).toString('utf8');
      if (!raw) {
        resolve({});
        return;
      }
      try {
        resolve(JSON.parse(raw));
      } catch {
        resolve({ __invalid: true });
      }
    });
    req.on('error', reject);
  });
}
/**
 * Write a JSON envelope to the response and end it.
 *
 * Serialization is BigInt-safe: JSON.stringify throws on BigInt by default,
 * so any bigint value in the envelope is emitted as a JSON number. The
 * upstream task-store wraps `version` in BigInt(); those values are expected
 * to stay within the safe Number range, so the conversion is lossless in
 * practice.
 *
 * @param {import('node:http').ServerResponse} res
 * @param {number} status HTTP status code.
 * @param {unknown} envelope JSON-serialisable payload.
 */
function sendEnvelope(res, status, envelope) {
  const bigintAsNumber = (_key, value) => {
    if (typeof value === 'bigint') {
      return Number(value);
    }
    return value;
  };
  res.statusCode = status;
  res.setHeader('Content-Type', 'application/json');
  const payload = JSON.stringify(envelope, bigintAsNumber);
  res.end(payload);
}
/**
 * Dispatch a /api/tasks/* request. Returns true if handled; false if route
 * doesn't match (caller continues to next dispatcher in server.mjs).
 *
 * Only `/api/tasks` itself and paths under `/api/tasks/` are claimed. A bare
 * `startsWith('/api/tasks')` would also claim siblings such as
 * `/api/tasksfoo`, and because routing below only looks at the segment count,
 * `POST /api/tasksfoo` would have been dispatched to createHandler.
 *
 * Routes:
 *   POST /api/tasks                 → createHandler
 *   GET  /api/tasks                 → queryHandler
 *   POST /api/tasks/claim           → claimHandler
 *   GET  /api/tasks/distill         → distillHandler
 *   POST /api/tasks/<id>/heartbeat  → heartbeatHandler
 *   POST /api/tasks/<id>/complete   → completeHandler
 *   POST /api/tasks/<id>/update     → updateHandler
 *
 * Failure responses (all sent as `{ success: false, data: null, error }`):
 *   500 write API could not be loaded · 503 executor unavailable ·
 *   401 auth headers missing/malformed · 400 invalid JSON body ·
 *   404 unknown route · 500 handler threw.
 *
 * @param {import('node:http').IncomingMessage} req
 * @param {import('node:http').ServerResponse} res
 * @param {URL} url parsed request URL (pathname + searchParams are read).
 * @returns {Promise<boolean>}
 */
export async function dispatchTasksRoute(req, res, url) {
  const pathname = url.pathname;
  if (pathname !== '/api/tasks' && !pathname.startsWith('/api/tasks/')) return false;

  // Lazy initialise on first request (avoids boot-time crash when PG not up).
  let api;
  try {
    api = await ensureWriteApi();
  } catch (e) {
    sendEnvelope(res, 500, { success: false, data: null, error: `tasks-write-api unbuilt: ${e.message}` });
    return true;
  }
  let exec;
  try {
    exec = ensureExecutor();
  } catch (e) {
    sendEnvelope(res, 503, { success: false, data: null, error: e.message });
    return true;
  }

  const auth = parseAuth(req);
  if (!auth) {
    sendEnvelope(res, 401, {
      success: false,
      data: null,
      error: 'Missing or malformed X-Olam-Node-Id / X-Olam-Session-Id / X-Olam-Tasks-Scopes headers',
    });
    return true;
  }

  const body = await readBody(req);
  if (body && body.__invalid) {
    sendEnvelope(res, 400, { success: false, data: null, error: 'Invalid JSON body' });
    return true;
  }

  // Route matching on the segments after ['api', 'tasks'].
  const segments = pathname.split('/').filter(Boolean);
  const [, , third, fourth] = segments;
  const method = req.method;
  const perTaskActions = new Map([
    ['heartbeat', 'heartbeatHandler'],
    ['complete', 'completeHandler'],
    ['update', 'updateHandler'],
  ]);
  const params = {};
  let handlerName = null;
  if (segments.length === 2) {
    if (method === 'POST') handlerName = 'createHandler';
    else if (method === 'GET') handlerName = 'queryHandler';
  } else if (segments.length === 3) {
    if (third === 'claim' && method === 'POST') handlerName = 'claimHandler';
    else if (third === 'distill' && method === 'GET') handlerName = 'distillHandler';
  } else if (segments.length === 4 && method === 'POST' && perTaskActions.has(fourth)) {
    handlerName = perTaskActions.get(fourth);
    params.id = third;
  }

  if (handlerName === null) {
    sendEnvelope(res, 404, { success: false, data: null, error: `Unknown /api/tasks route: ${req.method} ${pathname}` });
    return true;
  }

  const ctx = { auth, params, query: Object.fromEntries(url.searchParams), body };
  const deps = { pglite: exec };
  try {
    const response = await api[handlerName](ctx, deps);
    sendEnvelope(res, response.status, response.envelope);
  } catch (e) {
    console.error('[tasks-route] handler error:', e);
    sendEnvelope(res, 500, { success: false, data: null, error: e?.message ?? 'internal error' });
  }
  return true;
}
/**
 * Test surface: put the module back into its pristine state between cases.
 * Drops the cached write API and executor, and ends the pg pool if one was
 * created (errors from `pool.end()` are deliberately ignored).
 */
export function _resetForTests() {
  writeApi = null;
  executor = null;
  if (pool) {
    // Fire-and-forget: teardown failures must not fail the calling test.
    pool.end().catch(() => undefined);
  }
  pool = null;
}
| // Upgrade-trigger: spawn an ephemeral `olam upgrade` runner container. | ||
| // | ||
| // The user clicks "Run upgrade" in the dashboard → host-cp's | ||
| // /api/admin/upgrade endpoint asks the docker daemon (via the | ||
| // socket-proxy sidecar) to create + start a one-off container that | ||
| // runs the olam CLI's full upgrade pipeline against the local stack. | ||
| // | ||
| // Why a separate container (and not a child process inside host-cp)? | ||
| // `olam upgrade` recreates host-cp itself as part of the atomic | ||
| // tag-swap. If the orchestrator lived inside host-cp, the moment it | ||
| // asked docker to stop the old host-cp container the orchestrator | ||
| // would die with it — leaving no one to start the new container. | ||
| // A sibling container survives host-cp's recreate. | ||
| // | ||
| // Why this same image (not a purpose-built `olam-upgrader`)? | ||
| // The host-cp image already has Node, the olam CLI, the docker CLI, | ||
| // and the docker compose plugin baked in by Dockerfile. Reusing it | ||
| // means there's nothing extra to publish, and the upgrader is | ||
| // guaranteed to ship from the same source SHA as the host-cp it | ||
| // replaces. The upgrader's `Cmd` overrides host-cp's default CMD | ||
| // so it runs the CLI instead of starting the server. | ||
| // | ||
| // Security note (single-user-trusted-local-dev assumption): | ||
| // POST /api/admin/upgrade requires the host-cp auth token. Anyone | ||
| // with that token can already spawn arbitrary commands inside | ||
| // running devboxes via the existing exec path; spawning an upgrader | ||
| // container does not meaningfully widen the blast radius for the | ||
| // single-user model. Multi-user / hosted deployments will need a | ||
| // tighter policy (capability bit, user-scoped tokens). | ||
| import http from 'node:http'; | ||
/**
 * Default upgrader entrypoint. Authenticates docker against GHCR (where
 * the host-cp / auth / devbox images live), then runs the full
 * atomic-swap pipeline.
 *
 * Auth resolution order:
 *   1. `$GH_TOKEN` env var (set on host-cp via compose; operator
 *      typically resolves it from `gh auth token` before `olam host-cp
 *      start`). Required path on macOS — the host's `gh` keeps the
 *      token in Keychain, which doesn't follow into a Linux container.
 *   2. `gh auth token` against the mounted ~/.config/gh. Works for Linux
 *      operators whose gh keeps the token in the config dir directly.
 *   3. No token: a warning is printed, `docker login` is skipped, and the
 *      upgrade still runs. Any pull that needs credentials is then expected
 *      to fail inside `olam upgrade`, surfacing in `docker logs`.
 *
 * The steps are joined with ` && `, so the first failing step stops the
 * script with a non-zero status. The login step is therefore written as an
 * `if … fi` block: a bare `[ -n "$TOKEN" ] && docker login …` evaluates to
 * false when the token is empty and would short-circuit the rest of the
 * chain, so `olam upgrade` would never run in case 3.
 *
 * Wrapping the auth + upgrade in a single `sh -c` invocation lets the
 * `docker login` stage feed credentials directly into the docker
 * daemon without leaking the token through shared volumes.
 */
const DEFAULT_UPGRADER_CMD = [
  'sh',
  '-c',
  [
    'set -e',
    // Resolve the GH token. Prefer the env var (works on every OS); fall
    // back to `gh auth token`, which only helps when the mounted gh config
    // actually contains the token (not the case for Keychain-backed hosts).
    'TOKEN="${GH_TOKEN:-$(gh auth token 2>/dev/null || true)}"',
    'if [ -z "$TOKEN" ]; then echo "[upgrader] no GH_TOKEN; ghcr pulls will fail" >&2; fi',
    // Authenticate against ghcr.io only when a token is available. `oauth2`
    // is used as the username placeholder for token logins. A failed login
    // makes the `if` block fail and aborts the chain.
    'if [ -n "$TOKEN" ]; then echo "$TOKEN" | docker login ghcr.io -u oauth2 --password-stdin; fi',
    // The CLI resolves `packages/host-cp/compose.yaml` relative to its
    // current working directory, and the npm-installed @pleri/olam-cli
    // package does NOT bundle that file (it's repo-source only). The
    // operator's olam repo is bind-mounted at /workspace by
    // spawnUpgraderContainer, so cd there first. Without this the recreate
    // step fails with `open <cwd>/packages/host-cp/compose.yaml: no such
    // file or directory`.
    'cd /workspace',
    // Then run the upgrade. The CLI handles pull-by-digest, atomic
    // swap, recreate, and the post-recreate /api/version/status
    // round-trip itself.
    'olam upgrade -y',
  ].join(' && '),
];
/**
 * Spawn the upgrader. Creates a one-off container through the docker API
 * (`POST /containers/create`, then `POST /containers/<id>/start`) and resolves
 * with the container ID on a successful start; throws on any failure path so
 * the caller can surface a clean 500 with the daemon's reason.
 *
 * @param {object} args
 * @param {string} args.dockerHost          tcp://docker-socket-proxy:2375 or 'docker-cli'
 * @param {string} args.olamHomeHostPath    e.g. /Users/ernie/.olam
 * @param {string} args.dockerSockHostPath  e.g. /var/run/docker.sock
 * @param {string} args.image               upgrader image (defaults to host-cp's own image)
 * @param {string} [args.ghConfigHostPath]  operator's ~/.config/gh; bind-mounted ro for
 *                                          `gh auth token` to work inside the upgrader
 *                                          (Linux fallback only; macOS uses GH_TOKEN env)
 * @param {string} [args.ghToken]           pre-resolved GH token (typically read from
 *                                          host-cp's GH_TOKEN env via compose). Passed
 *                                          to the upgrader as $GH_TOKEN so `docker login
 *                                          ghcr.io` works on macOS hosts whose Keychain-
 *                                          backed gh config can't be read inside a Linux
 *                                          container.
 * @param {string} args.repoHostPath        operator's olam repo path on the host. Bind-
 *                                          mounted into the upgrader at /workspace so
 *                                          the CLI's cwd-relative compose-file lookup
 *                                          resolves (the npm package doesn't bundle
 *                                          packages/host-cp/compose.yaml). Required.
 * @param {string} [args.operatorHomeHostPath] operator's $HOME on the host. Passed as the
 *                                          upgrader's HOME env so docker-compose's
 *                                          `${HOME}` interpolation in bind sources
 *                                          resolves to a daemon-visible path. Unset or
 *                                          empty falls back to /root.
 * @param {ReadonlyArray<string>} [args.cmd] override the upgrade command for tests
 * @param {(host: string, init: object) => Promise<Response>} [args.fetchImpl]
 * @param {(message: string) => void} [args.log]
 * @returns {Promise<{ id: string, name: string }>}
 * @throws {Error} when a required argument is missing, when dockerHost is
 *   'docker-cli', or when the daemon rejects the create/start call.
 */
export async function spawnUpgraderContainer({
  dockerHost,
  olamHomeHostPath,
  dockerSockHostPath,
  image,
  ghConfigHostPath,
  ghToken,
  repoHostPath,
  operatorHomeHostPath,
  cmd = DEFAULT_UPGRADER_CMD,
  fetchImpl = globalThis.fetch,
  log = console.log,
}) {
  if (!olamHomeHostPath) {
    throw new Error('OLAM_HOME_HOST_PATH not set; cannot bind-mount operator state');
  }
  if (!dockerSockHostPath) {
    throw new Error('OLAM_DOCKER_SOCK_HOST_PATH not set; upgrader cannot reach docker daemon');
  }
  if (!image) {
    throw new Error('upgrader image not configured (OLAM_UPGRADER_IMAGE)');
  }
  if (!repoHostPath) {
    throw new Error(
      'OLAM_REPO_HOST_PATH not set; upgrader cannot find packages/host-cp/compose.yaml',
    );
  }
  // Fail with a readable message instead of the TypeError that
  // `dockerHost.replace` would raise further down.
  if (!dockerHost) {
    throw new Error('dockerHost not set; upgrade-trigger cannot reach the docker API');
  }
  // Bare-node (operator's host docker CLI on PATH) is documented but
  // out of scope for the trigger feature.
  if (dockerHost === 'docker-cli') {
    // The literals below (`unix:///var/run/docker.sock` and `tcp://docker-socket-proxy:2375`)
    // are diagnostic text naming the deployment shapes that ARE supported,
    // not hostnames being used as transport — error-message-only.
    throw new Error(
      'upgrade-trigger requires a docker socket (unix:///var/run/docker.sock via k8s hostPath mount, ' + // bare-node-allow: diagnostic-text
        'or tcp://docker-socket-proxy:2375 via compose); bare-node not yet supported. ' + // bare-node-allow: diagnostic-text
        'For k8s: ensure the cluster was created with ' +
        '--volume /var/run/docker.sock:/var/run/docker.sock@server:* ' +
        'and olam doctor reports probeDockerSocketBindMount [PASS].',
    );
  }

  // Only the tcp:// scheme is rewritten; any other value is used as-is.
  const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
  const containerName = `olam-upgrader-${Date.now()}`;

  // HOME inside the upgrader. One value feeds both the HOME env and the bind
  // targets so they can never disagree. `||` (not `??`) so an empty string —
  // e.g. an env var that is present but blank — also falls back to /root
  // rather than producing `HOME=` next to binds under /root.
  const containerHome = operatorHomeHostPath || '/root';

  // Container create body. AutoRemove cleans up on exit so we don't
  // accumulate stopped upgrader carcasses; HostConfig.Binds gives the
  // CLI everything it needs (operator state + docker socket + repo).
  const createBody = {
    Image: image,
    Cmd: [...cmd],
    Entrypoint: [], // override host-cp's tini entrypoint; olam CLI is self-contained
    Env: [
      // HOME serves two roles:
      //   - The CLI uses HOME to resolve ~/.olam (bind-mounted below).
      //   - docker-compose interpolates `${HOME}` in bind sources of
      //     compose.yaml. The daemon resolves those bind sources on the
      //     HOST filesystem, so HOME must be a path the daemon can find
      //     (typically the operator's host $HOME).
      `HOME=${containerHome}`,
      // Non-interactive mode + auto-yes are belt-and-braces: -y is also
      // passed in the default Cmd, but env is the canonical way to opt out
      // of tty prompts when stdin is closed.
      'OLAM_NON_INTERACTIVE=1',
      'CI=1',
      // GH token forwarded for the docker-login-to-ghcr step; only
      // included when the caller supplied one.
      ...(ghToken ? [`GH_TOKEN=${ghToken}`] : []),
    ],
    HostConfig: {
      AutoRemove: true,
      // Bind sources are resolved by the docker daemon on the HOST
      // filesystem. ~/.olam is mounted under the HOME set above so the
      // CLI finds it at `$HOME/.olam`.
      Binds: [
        `${olamHomeHostPath}:${containerHome}/.olam`,
        `${dockerSockHostPath}:/var/run/docker.sock`,
        // Operator's repo bind-mounted read-only at /workspace, where the
        // default command cds before running the CLI.
        `${repoHostPath}:/workspace:ro`,
        // Optional gh config bind (read-only); only needed for the
        // `gh auth token` fallback when no GH_TOKEN is forwarded.
        ...(ghConfigHostPath ? [`${ghConfigHostPath}:${containerHome}/.config/gh:ro`] : []),
      ],
      // Same network as host-cp so the upgrader can reach the
      // docker-socket-proxy + auth-service if it needs to during the
      // verification phase.
      NetworkMode: 'olam-host-cp-internal',
    },
    Labels: {
      'olam.role': 'upgrader',
      'olam.spawned-by': 'host-cp',
      'olam.spawned-at': new Date().toISOString(),
    },
  };

  log(`[upgrade] creating upgrader container ${containerName} from ${image}`);
  const createUrl = `${apiBase}/containers/create?name=${encodeURIComponent(containerName)}`;
  const createRes = await fetchImpl(createUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(createBody),
  });
  if (!createRes.ok) {
    const detail = await safeReadBody(createRes);
    throw new Error(
      `daemon rejected POST /containers/create: ${createRes.status} ${createRes.statusText} ${detail}`.trim(),
    );
  }
  const created = await createRes.json();
  const containerId = created.Id;
  if (!containerId) {
    throw new Error(`POST /containers/create returned no Id: ${JSON.stringify(created)}`);
  }

  log(`[upgrade] starting upgrader ${containerId.slice(0, 12)}`);
  const startUrl = `${apiBase}/containers/${encodeURIComponent(containerId)}/start`;
  const startRes = await fetchImpl(startUrl, { method: 'POST' });
  if (!startRes.ok && startRes.status !== 304) {
    // 304 Not Modified = already started; treat as success.
    const detail = await safeReadBody(startRes);
    throw new Error(
      `daemon rejected POST /containers/${containerId}/start: ${startRes.status} ${detail}`.trim(),
    );
  }
  return { id: containerId, name: containerName };
}
/**
 * Best-effort read of a response body for error messages.
 *
 * Returns the body text, trimmed and capped at 512 characters. Any failure
 * while reading (or an unusable body) yields an empty string instead of
 * throwing, so callers can always interpolate the result.
 *
 * @param {Response} res
 * @returns {Promise<string>}
 */
async function safeReadBody(res) {
  const maxChars = 512;
  try {
    const raw = await res.text();
    const trimmed = raw.trim();
    return trimmed.slice(0, maxChars);
  } catch {
    return '';
  }
}
| // Version detection for Phase 1 of self-upgrade. | ||
| // | ||
| // Compares each component's baked OLAM_BUILD_SHA against the operator's | ||
| // local repo HEAD (mounted read-only at /operator-repo). Reports upgrade | ||
| // availability without triggering any automatic action — Phase 1 is | ||
| // detection only. | ||
| import fs from 'node:fs'; | ||
| import path from 'node:path'; | ||
| /** @typedef {'ok' | 'behind' | 'unknown'} VersionState */ | ||
| /** | ||
| * @typedef {Object} ComponentVersion | ||
| * @property {string} running - SHA baked into the running image | ||
| * @property {string} latest - SHA of operator's local HEAD (or 'unknown') | ||
| * @property {boolean} upgradeAvailable | ||
| */ | ||
| /** | ||
| * @typedef {Object} VersionSnapshot | ||
| * @property {ComponentVersion} hostCp | ||
| * @property {ComponentVersion} authService | ||
| * @property {ComponentVersion} devbox | ||
| * @property {string} operatorHead - resolved HEAD or 'unknown' | ||
| * @property {string} checkedAt - ISO timestamp | ||
| * @property {string} cliVersion - operator's CLI semver (e.g. "0.1.69") or 'unknown' | ||
| */ | ||
/**
 * Read the operator's local repo HEAD commit id straight from the
 * `.git` directory (no `git` binary is invoked).
 *
 * Candidate repo roots, tried in order:
 *   1. the `OLAM_REPO_PATH` env var (when set and non-empty)
 *   2. `/operator-repo`
 *
 * For each candidate a symbolic HEAD (`ref: refs/heads/...`) is resolved
 * through the loose ref file first and `packed-refs` second; a detached
 * HEAD is used as-is. Values that are not a full hex object id are
 * ignored, so an empty or truncated ref file never leaks out as a "SHA".
 *
 * Returns 'unknown' when no candidate yields a commit id, or on any
 * read error.
 *
 * @returns {string}
 */
export function readOperatorHead() {
  // Full object id: 40 hex chars (SHA-1) or 64 hex chars (SHA-256 repos).
  const isFullSha = (value) => /^(?:[0-9a-f]{40}|[0-9a-f]{64})$/i.test(value);
  const candidates = [process.env.OLAM_REPO_PATH, '/operator-repo'].filter(Boolean);

  for (const repoPath of candidates) {
    try {
      const gitDir = path.join(repoPath, '.git');
      const headFile = path.join(gitDir, 'HEAD');
      if (!fs.existsSync(headFile)) continue;

      const headContent = fs.readFileSync(headFile, 'utf-8').trim();
      if (!headContent.startsWith('ref: ')) {
        // Detached HEAD — the file holds the commit id itself.
        if (isFullSha(headContent)) return headContent;
        continue;
      }

      // Symbolic ref (e.g. "ref: refs/heads/main") → loose ref file first.
      const refPath = headContent.slice('ref: '.length);
      const looseRef = path.join(gitDir, refPath);
      if (fs.existsSync(looseRef)) {
        const looseSha = fs.readFileSync(looseRef, 'utf-8').trim();
        if (isFullSha(looseSha)) return looseSha;
        // Empty/corrupt loose ref: fall through to packed-refs.
      }

      const packedRefs = path.join(gitDir, 'packed-refs');
      if (fs.existsSync(packedRefs)) {
        for (const line of fs.readFileSync(packedRefs, 'utf-8').split('\n')) {
          if (line.startsWith('#')) continue; // header line
          const [sha, ref] = line.trim().split(' ');
          if (ref === refPath && isFullSha(sha)) return sha;
        }
      }
    } catch {
      // Unreadable candidate — deliberately best-effort, try the next one.
    }
  }
  return 'unknown';
}
/**
 * Decide whether `latest` is a different build than `running`.
 *
 * The literal 'unknown' on either side means "cannot tell" and yields
 * false. Otherwise the two SHAs are compared case-insensitively, and
 * they count as the same build when either one is a prefix of the
 * other (so a short SHA matches its full-length form).
 *
 * @param {string} running
 * @param {string} latest
 * @returns {boolean}
 */
export function isUpgradeAvailable(running, latest) {
  const UNKNOWN = 'unknown';
  if ([running, latest].includes(UNKNOWN)) return false;

  const runningSha = running.toLowerCase();
  const latestSha = latest.toLowerCase();
  const sameBuild = runningSha.startsWith(latestSha) || latestSha.startsWith(runningSha);
  return !sameBuild;
}
/**
 * Ask the auth-service which build it is running.
 *
 * Issues `GET <authServiceUrl>/health` with a 5 second abort timeout and
 * returns the `buildSha` field of the JSON body when it is a string.
 * Any other outcome — non-2xx status, non-object body, missing or
 * non-string field, network/parse failure — yields 'unknown'.
 *
 * @param {string} authServiceUrl
 * @returns {Promise<string>}
 */
export async function fetchAuthServiceSha(authServiceUrl) {
  const UNKNOWN = 'unknown';
  try {
    const response = await fetch(`${authServiceUrl}/health`, {
      signal: AbortSignal.timeout(5000),
    });
    if (!response.ok) return UNKNOWN;

    const payload = await response.json();
    const isRecord = Boolean(payload) && typeof payload === 'object';
    if (!isRecord || !('buildSha' in payload)) return UNKNOWN;

    const { buildSha } = payload;
    return typeof buildSha === 'string' ? buildSha : UNKNOWN;
  } catch {
    return UNKNOWN;
  }
}
/**
 * Inspect an image through the docker HTTP API and return the value of
 * its baked `OLAM_BUILD_SHA` environment entry.
 *
 * Issues `GET <dockerApiBase>/images/<url-encoded imageRef>/json` with a
 * 5 second abort timeout and scans `Config.Env` for the first string
 * entry starting with `OLAM_BUILD_SHA=`. Returns 'unknown' when the
 * request fails or is non-2xx, when the response lacks a `Config.Env`
 * array, or when no such entry exists.
 *
 * The snapshot builder in this module uses it as the "latest published
 * image" signal for each component's `:latest` tag.
 *
 * @param {string} dockerApiBase
 * @param {string} imageRef e.g. "ghcr.io/pleri/olam-host-cp:latest"
 * @returns {Promise<string>}
 */
export async function fetchLatestImageSha(dockerApiBase, imageRef) {
  const UNKNOWN = 'unknown';
  const SHA_ENV_PREFIX = 'OLAM_BUILD_SHA=';
  try {
    const inspectUrl = `${dockerApiBase}/images/${encodeURIComponent(imageRef)}/json`;
    const response = await fetch(inspectUrl, { signal: AbortSignal.timeout(5000) });
    if (!response.ok) return UNKNOWN;

    const inspected = await response.json();
    const config = inspected && typeof inspected === 'object' ? inspected['Config'] : null;
    const envList = config && typeof config === 'object' ? config['Env'] : null;
    if (!Array.isArray(envList)) return UNKNOWN;

    const shaEntry = envList.find(
      (entry) => typeof entry === 'string' && entry.startsWith(SHA_ENV_PREFIX),
    );
    return shaEntry === undefined ? UNKNOWN : shaEntry.slice(SHA_ENV_PREFIX.length);
  } catch {
    return UNKNOWN;
  }
}
/**
 * Resolve the build SHA of the devbox image that is currently in use.
 *
 * Two docker HTTP API calls, each with a 5 second abort timeout:
 *   1. `GET /containers/json` filtered by name `olam-devbox`; the FIRST
 *      container in the response is used (NOTE(review): the original
 *      author treats that as the most recently created one — confirm
 *      against the daemon's ordering).
 *   2. `GET /images/<ImageID>/json` for that container's image, whose
 *      `Config.Env` is scanned for an `OLAM_BUILD_SHA=` entry.
 *
 * Returns 'unknown' if any step fails or yields nothing usable.
 *
 * @param {string} dockerApiBase e.g. "http://docker-socket-proxy:2375" or "http://localhost:2375"
 * @returns {Promise<string>}
 */
export async function fetchDevboxImageSha(dockerApiBase) {
  const UNKNOWN = 'unknown';
  const SHA_ENV_PREFIX = 'OLAM_BUILD_SHA=';
  const withTimeout = () => ({ signal: AbortSignal.timeout(5000) });
  try {
    // Step 1: find a devbox container and take its image id.
    const nameFilter = encodeURIComponent(JSON.stringify({ name: ['olam-devbox'] }));
    const listResponse = await fetch(
      `${dockerApiBase}/containers/json?filters=${nameFilter}`,
      withTimeout(),
    );
    if (!listResponse.ok) return UNKNOWN;
    const listed = await listResponse.json();
    if (!Array.isArray(listed) || listed.length === 0) return UNKNOWN;

    const firstContainer = listed[0];
    const rawImageId = firstContainer['ImageID'];
    const imageId = typeof rawImageId === 'string' ? rawImageId : null;
    if (!imageId) return UNKNOWN;

    // Step 2: inspect that image and pull the baked SHA out of its env.
    const imageResponse = await fetch(
      `${dockerApiBase}/images/${encodeURIComponent(imageId)}/json`,
      withTimeout(),
    );
    if (!imageResponse.ok) return UNKNOWN;
    const inspected = await imageResponse.json();
    const config = inspected && typeof inspected === 'object' ? inspected['Config'] : null;
    const envList = config && typeof config === 'object' ? config['Env'] : null;
    if (!Array.isArray(envList)) return UNKNOWN;

    const shaEntry = envList.find(
      (entry) => typeof entry === 'string' && entry.startsWith(SHA_ENV_PREFIX),
    );
    return shaEntry === undefined ? UNKNOWN : shaEntry.slice(SHA_ENV_PREFIX.length);
  } catch {
    return UNKNOWN;
  }
}
/**
 * Assemble a full VersionSnapshot for host-cp, auth-service and devbox.
 *
 * "running" SHAs come from: `process.env.OLAM_BUILD_SHA` (host-cp),
 * the auth-service health endpoint, and the devbox container's image.
 * "latest" SHAs come from the locally present `:latest` image tags,
 * falling back (via `pickLatest`) to the operator's repo HEAD when a
 * tag cannot be inspected. All five remote lookups run concurrently.
 *
 * `cliVersion` is `OLAM_CLI_VERSION` when set and non-empty (per the
 * original author, propagated by `olam host-cp start`), else host-cp's
 * own package.json version, else 'unknown'.
 *
 * @param {{
 *   authServiceUrl: string;
 *   dockerApiBase: string;
 * }} opts
 * @returns {Promise<VersionSnapshot>}
 */
export async function buildVersionSnapshot({ authServiceUrl, dockerApiBase }) {
  const operatorHead = readOperatorHead();

  // NOTE: the auth image tag is `olam-auth` (no `-service` suffix) even
  // though the component is called auth-service everywhere else.
  const latestTags = [
    'ghcr.io/pleri/olam-host-cp:latest',
    'ghcr.io/pleri/olam-auth:latest',
    'ghcr.io/pleri/olam-devbox:latest',
  ];
  const [authRunning, devboxRunning, hostCpPublished, authPublished, devboxPublished] =
    await Promise.all([
      fetchAuthServiceSha(authServiceUrl),
      fetchDevboxImageSha(dockerApiBase),
      ...latestTags.map((tag) => fetchLatestImageSha(dockerApiBase, tag)),
    ]);
  const hostCpRunning = process.env.OLAM_BUILD_SHA ?? 'unknown';

  // One component entry: prefer the published image SHA as "latest",
  // fall back to operatorHead, then derive upgradeAvailable from both.
  const describeComponent = (running, published) => {
    const latest = pickLatest(published, operatorHead);
    return {
      running,
      latest,
      upgradeAvailable: isUpgradeAvailable(running, latest),
    };
  };

  const cliVersion =
    process.env.OLAM_CLI_VERSION || readHostCpPackageVersion() || 'unknown';

  return {
    hostCp: describeComponent(hostCpRunning, hostCpPublished),
    authService: describeComponent(authRunning, authPublished),
    devbox: describeComponent(devboxRunning, devboxPublished),
    operatorHead,
    checkedAt: new Date().toISOString(),
    cliVersion,
  };
}
/**
 * Choose which SHA to report as a component's "latest".
 *
 * The published-image SHA wins whenever it is a non-empty value other
 * than the literal 'unknown'; otherwise the operator's repo HEAD is
 * returned unchanged (which may itself be 'unknown').
 *
 * @param {string} publishedImageSha
 * @param {string} operatorHead
 * @returns {string}
 */
export function pickLatest(publishedImageSha, operatorHead) {
  const hasPublishedSha = Boolean(publishedImageSha) && publishedImageSha !== 'unknown';
  return hasPublishedSha ? publishedImageSha : operatorHead;
}
/**
 * Look up a `version` string from a package.json located one or two
 * directories above this module (nearest first). Serves as the
 * CLI-version fallback when `OLAM_CLI_VERSION` is not set.
 *
 * Best-effort: any filesystem or JSON error ends the search and the
 * function returns null, as it does when no manifest carries a
 * non-empty string version.
 *
 * @returns {string | null}
 */
function readHostCpPackageVersion() {
  try {
    const moduleDir = path.dirname(new URL(import.meta.url).pathname);
    const manifestPaths = [
      path.join(moduleDir, '..', 'package.json'),
      path.join(moduleDir, '..', '..', 'package.json'),
    ];
    for (const manifestPath of manifestPaths) {
      if (!fs.existsSync(manifestPath)) continue;
      const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
      const version = manifest.version;
      if (typeof version === 'string' && version.length > 0) return version;
    }
  } catch {
    // best-effort: fall through to null
  }
  return null;
}
| // Phase F-2-B (B6): workspace + project catalog for host CP. | ||
| // | ||
| // Reads workspace YAML files from `~/.olam/workspaces/*.yaml` (mounted | ||
| // at `/data/workspaces` inside the host-cp container per compose.yaml). | ||
| // Provides three endpoints' worth of data: | ||
| // | ||
| // 1. /api/workspaces — list all workspaces (redacted) | ||
| // 2. /api/projects — deduplicated project union | ||
| // 3. POST /api/workspaces/match — exact set-equality matching | ||
| // for D13's project-first | ||
| // create-world flow | ||
| import fs from 'node:fs'; | ||
| import path from 'node:path'; | ||
| import YAML from 'yaml'; | ||
| import { redactSensitive } from './redact.mjs'; | ||
| /** | ||
| * @typedef {object} Project | ||
| * @property {string} name | ||
| * @property {string} [url] | ||
| * @property {string} [path] | ||
| * @property {string} [branch] | ||
| */ | ||
| /** | ||
| * @typedef {object} Workspace | ||
| * @property {string} name | ||
| * @property {Project[]} repos project list (called `repos` in YAML) | ||
| * @property {Record<string, unknown>} [defaults] | ||
| * @property {Record<string, unknown>} [services] | ||
| * @property {Record<string, unknown>} [image] | ||
| * @property {Record<string, unknown>} [host_ui] | ||
| * @property {number} [updatedAt] | ||
| */ | ||
/**
 * Load every workspace YAML (`*.yaml` / `*.yml`) found directly inside
 * `dir` and return the workspaces sorted by name.
 *
 * One bad file never brings down the whole list. A file is logged and
 * skipped when it fails to read or parse, when it has no `name`, or
 * when `name` is not a string (a non-string name such as YAML
 * `name: 123` would otherwise crash the `localeCompare` sort below).
 *
 * `repos` is normalised to an array: a missing/null value becomes `[]`,
 * and any other non-array value is logged and replaced by `[]` so
 * downstream consumers can iterate it safely.
 *
 * A missing directory is logged and yields an empty list.
 *
 * @param {string} dir
 * @param {(message: string) => void} [log]
 * @returns {Workspace[]}
 */
export function loadWorkspaces(dir, log = console.log) {
  if (!fs.existsSync(dir)) {
    log(`workspace-catalog: directory ${dir} does not exist`);
    return [];
  }
  /** @type {Workspace[]} */
  const out = [];
  for (const entry of fs.readdirSync(dir)) {
    if (!entry.endsWith('.yaml') && !entry.endsWith('.yml')) continue;
    const filePath = path.join(dir, entry);
    try {
      const parsed = YAML.parse(fs.readFileSync(filePath, 'utf-8'));
      if (!parsed || typeof parsed !== 'object' || !parsed.name) {
        log(`workspace-catalog: skipping ${entry} (no .name field)`);
        continue;
      }
      if (typeof parsed.name !== 'string') {
        log(`workspace-catalog: skipping ${entry} (.name is not a string)`);
        continue;
      }
      let repos = parsed.repos ?? [];
      if (!Array.isArray(repos)) {
        log(`workspace-catalog: ${entry} has a non-list .repos; treating as empty`);
        repos = [];
      }
      out.push({ ...parsed, repos });
    } catch (err) {
      log(`workspace-catalog: failed to parse ${entry}: ${err.message}`);
    }
  }
  return out.sort((a, b) => a.name.localeCompare(b.name));
}
/**
 * Shape the workspace list for the /api/workspaces response by passing
 * it through `redactSensitive` and returning whatever that produces.
 *
 * @param {Workspace[]} workspaces
 * @returns {Workspace[]}
 */
export function workspacesForApi(workspaces) {
  const redacted = /** @type {Workspace[]} */ (redactSensitive(workspaces));
  return redacted;
}
/**
 * Build the /api/projects payload: the union of every workspace's
 * `repos`, de-duplicated by project name and sorted by name.
 *
 * - The dedup key is the exact (case-sensitive) `name`.
 * - The FIRST occurrence of a name wins; its fields are shallow-copied
 *   and later entries with the same name are ignored.
 * - Entries that are null/undefined or have no truthy `name` are skipped.
 * - A workspace without `repos` contributes nothing.
 *
 * @param {Workspace[]} workspaces
 * @returns {Project[]}
 */
export function projectsFromWorkspaces(workspaces) {
  /** @type {Map<string, Project>} */
  const firstSeen = new Map();
  const everyRepo = workspaces.flatMap((ws) => [...(ws.repos ?? [])]);
  for (const repo of everyRepo) {
    const key = repo?.name;
    if (!key || firstSeen.has(key)) continue;
    firstSeen.set(key, { ...repo });
  }
  return Array.from(firstSeen.values()).sort((left, right) =>
    left.name.localeCompare(right.name),
  );
}
/**
 * Back end of POST /api/workspaces/match (body: `{ projects: string[] }`).
 *
 * Returns the workspaces whose set of project names EXACTLY equals the
 * requested set — no subsets, no supersets — sorted by workspace name.
 * Duplicate names on either side collapse (set semantics). Repo entries
 * that are null/undefined or have no truthy `name` are ignored, matching
 * how `projectsFromWorkspaces` treats them; previously a null entry
 * threw while reading `.name`.
 *
 * The input array is not mutated; the result is a new array holding the
 * same workspace objects.
 *
 * @param {Workspace[]} workspaces
 * @param {string[]} projectNames
 * @returns {Workspace[]}
 */
export function matchWorkspacesByProjects(workspaces, projectNames) {
  const target = new Set(projectNames);
  const matches = workspaces.filter((ws) => {
    const wsNames = new Set((ws.repos ?? []).map((repo) => repo?.name).filter(Boolean));
    return setsEqual(target, wsNames);
  });
  return matches.sort((a, b) => a.name.localeCompare(b.name));
}
/**
 * Set equality: true iff both sets have the same size and every member
 * of `a` is also a member of `b`.
 *
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {boolean}
 */
function setsEqual(a, b) {
  const sameSize = a.size === b.size;
  return sameSize && [...a].every((member) => b.has(member));
}
| /** | ||
| * WorldActivityTracker — periodic scanner that turns each active world's | ||
| * Claude session JSONL into `thought_count` + `total_cost_usd` updates on | ||
| * the `worlds` table (~/.olam/worlds.db), plus a `world.activity.tick` | ||
| * event on the host-stream broadcaster. | ||
| * | ||
| * Closes #965. Pre-fix, `olam_status <world>` always reported | ||
| * `Cost $0.0000 / Thoughts 0` because nothing wrote those columns after | ||
| * world creation. Rico (the orchestrator) reads those fields to decide | ||
| * whether a world is progressing or stalled, so as far as it was | ||
| * concerned every world was frozen. | ||
| * | ||
| * Design notes: | ||
| * - **JSONL path is operator-configurable.** Default contract per #965 | ||
| * is `~/.olam/worlds/<id>/state/claude-main.jsonl`; override the | ||
| * template via `OLAM_WORLD_JSONL_PATH_TEMPLATE`. On this host the | ||
| * producer for the default path is not yet shipped (Claude Code | ||
| * writes to `~/.claude/projects/<sanitized>/<uuid>.jsonl` by | ||
| * default), so values stay at 0 until either the producer lands or | ||
| * the env override repoints the scanner. | ||
| * - **Dedupe by `message.id`.** Claude SDK JSONL emits multiple lines | ||
| * per assistant API turn (one per content block), each carrying the | ||
| * SAME `message.id` + the SAME `usage` block. Naive sum-by-line | ||
| * double-counts. We dedupe by `message.id` for usage totals and | ||
| * count unique-message-id as `thoughtCount`. | ||
| * - **Idempotent.** Re-scanning the same JSONL produces the same | ||
| * numbers; safe to run at any cadence. | ||
| * - **Fail-soft per world.** A bad JSONL line, missing file, or | ||
| * unreadable handle never crashes the loop — the failing world is | ||
| * skipped with a debug log and the next world proceeds. | ||
| * | ||
| * Cadence: `OLAM_WORLD_ACTIVITY_TICK_MS` (default 60_000). | ||
| * | ||
| * Wire-in: `server.mjs` constructs once with `{ db, broadcaster }` after | ||
| * both are ready and calls `.stop()` from the SIGTERM/SIGINT handler. | ||
| * | ||
| * @see ../host-stream.mjs broadcaster API | ||
| * @see ../worlds-db-source.mjs read-only DB open pattern (model for | ||
| * `tryOpenDb` here, though tracker WRITES not reads). | ||
| */ | ||
| import fs from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
| import readline from 'node:readline'; | ||
| import { createRequire } from 'node:module'; | ||
| const require = createRequire(import.meta.url); | ||
| // TODO(rates): source live model rates from auth-service or a config | ||
| // file. For now we anchor on Claude Opus per-million baseline ($3 input | ||
| // / $15 output) — the issue surface is "value advances post-creation", | ||
| // not "is dollar-accurate to 4 decimals". When per-model rates land, | ||
| // pluck the model id from the assistant message and dispatch. | ||
| const INPUT_USD_PER_M_TOKENS = 3.0; | ||
| const OUTPUT_USD_PER_M_TOKENS = 15.0; | ||
| const DEFAULT_TICK_MS = 60_000; | ||
/**
 * Resolve a per-world JSONL path from an operator-supplied template.
 *
 * Every `{worldId}` placeholder in the template is replaced with
 * `worldId` verbatim, then a leading `~/` is expanded to
 * `os.homedir()`.
 *
 * The substitution uses a replacer function rather than a replacement
 * string, so `$`-sequences inside the id (`$&`, `$1`, `$$`, ...) are
 * inserted literally instead of being interpreted as replacement
 * patterns.
 *
 * @param {string} template
 * @param {string} worldId
 * @returns {string}
 */
export function resolveJsonlPath(template, worldId) {
  const swapped = template.replace(/\{worldId\}/g, () => worldId);
  if (swapped.startsWith('~/')) {
    return path.join(os.homedir(), swapped.slice(2));
  }
  return swapped;
}
/**
 * Scan one session JSONL file and return aggregate activity stats.
 *
 * Only rows with `type === 'assistant'` and a string `message.id` are
 * considered, and each `message.id` is counted once (its first line
 * wins): later lines with the same id are skipped entirely, including
 * their `usage` and `timestamp`. Blank and unparseable lines are
 * ignored.
 *
 * - `thoughtCount`   number of distinct message ids
 * - `inputTokens` / `outputTokens`  sums of finite `usage.*_tokens`
 * - `costUsd`        tokens priced with the module's per-million rates
 * - `lastActivityAt` greatest `timestamp` string among counted rows
 *                    (plain string comparison), or null
 *
 * Never rejects: a missing/unreadable file or a stream error resolves
 * to all-zero stats. The readline interface and the stream are both
 * released on every path, so a failed read leaves no pending iterator.
 *
 * @param {string} jsonlPath
 * @returns {Promise<{thoughtCount:number, inputTokens:number, outputTokens:number, costUsd:number, lastActivityAt:string|null}>}
 */
export async function scanWorldJsonl(jsonlPath) {
  const seenMessageIds = new Set();
  let inputTokens = 0;
  let outputTokens = 0;
  let lastTimestamp = null;

  let stream;
  try {
    stream = fs.createReadStream(jsonlPath, { encoding: 'utf8' });
  } catch {
    // Synchronous failure (e.g. an invalid path argument) — return zeros.
    return zeroStats();
  }
  // createReadStream reports ENOENT & co. through the 'error' event, not
  // a throw; surface it as a rejection so it can be raced against the
  // line reader below.
  const errorPromise = new Promise((_, reject) => {
    stream.on('error', reject);
  });
  const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });

  const linesPromise = (async () => {
    for await (const line of rl) {
      if (!line) continue;
      let row;
      try {
        row = JSON.parse(line);
      } catch {
        // Partial writes happen while a session is active; skip them.
        continue;
      }
      if (!row || row.type !== 'assistant') continue;
      const msg = row.message;
      if (!msg || typeof msg !== 'object') continue;
      const messageId = typeof msg.id === 'string' ? msg.id : null;
      if (messageId === null) continue;
      if (seenMessageIds.has(messageId)) continue;
      seenMessageIds.add(messageId);

      const usage = msg.usage;
      if (usage && typeof usage === 'object') {
        // Number.isFinite is true only for real finite numbers, so the
        // values can be added directly.
        if (Number.isFinite(usage.input_tokens)) inputTokens += usage.input_tokens;
        if (Number.isFinite(usage.output_tokens)) outputTokens += usage.output_tokens;
      }
      if (typeof row.timestamp === 'string') {
        // String comparison orders ISO-8601 timestamps correctly as long
        // as they share one zone/format.
        if (lastTimestamp === null || row.timestamp > lastTimestamp) {
          lastTimestamp = row.timestamp;
        }
      }
    }
  })();

  try {
    await Promise.race([linesPromise, errorPromise]);
  } catch {
    return zeroStats();
  } finally {
    // Close the interface first so a still-pending line iterator ends
    // instead of waiting forever on a dead stream, then drop the stream.
    try { rl.close(); } catch { /* ignore */ }
    try { stream.destroy(); } catch { /* ignore */ }
  }

  const costUsd =
    (inputTokens / 1_000_000) * INPUT_USD_PER_M_TOKENS +
    (outputTokens / 1_000_000) * OUTPUT_USD_PER_M_TOKENS;
  return {
    thoughtCount: seenMessageIds.size,
    inputTokens,
    outputTokens,
    costUsd,
    lastActivityAt: lastTimestamp,
  };
}
/**
 * Fresh all-zero stats record, returned when a JSONL file cannot be
 * read. A new object is created on every call.
 *
 * @returns {{thoughtCount:number, inputTokens:number, outputTokens:number, costUsd:number, lastActivityAt:null}}
 */
function zeroStats() {
  const counters = { thoughtCount: 0, inputTokens: 0, outputTokens: 0, costUsd: 0 };
  return { ...counters, lastActivityAt: null };
}
| /** | ||
| * @typedef {object} WorldActivityTrackerDeps | ||
| * @property {string} [dbPath] Path to worlds.db; defaults to | ||
| * `OLAM_WORLDS_DB` env var or `~/.olam/worlds.db`. | ||
| * @property {object} [broadcaster] Object with `.broadcast(type, payload)` | ||
| * (e.g. the return of `createHostStream`). Optional — when absent | ||
| * events are skipped but DB writes still happen. | ||
| * @property {number} [intervalMs] Tick cadence. Defaults to | ||
| * `OLAM_WORLD_ACTIVITY_TICK_MS` env or 60000. | ||
| * @property {string} [jsonlPathTemplate] JSONL path template. | ||
| * `{worldId}` is replaced per world. Defaults to | ||
| * `OLAM_WORLD_JSONL_PATH_TEMPLATE` env or | ||
| * `~/.olam/worlds/{worldId}/state/claude-main.jsonl`. | ||
| * @property {(msg: string) => void} [log] Defaults to `console.log`. | ||
| * @property {(msg: string) => void} [debug] Optional verbose log; defaults | ||
| * to no-op (debug-level skips on missing JSONL would be noisy). | ||
| * @property {(cb: () => void, ms: number) => any} [setTimer] Injectable | ||
| * `setInterval` for tests. | ||
| * @property {(handle: any) => void} [clearTimer] Injectable | ||
| * `clearInterval` for tests. | ||
| * @property {() => Date} [now] Clock injection for tests. | ||
| */ | ||
| /** | ||
| * @typedef {object} WorldActivityTrackerHandle | ||
| * @property {() => void} stop | ||
| * @property {() => Promise<number>} tickNow Run one tick immediately | ||
| * (async; resolves to the count of worlds processed). Exposed for tests. | ||
| */ | ||
/**
 * Start the world activity tracker. Returns a `{ stop, tickNow }`
 * handle. Safe to call before the worlds.db file exists — each tick is
 * skipped (with a debug log) until the file appears.
 *
 * Scheduling: one tick is queued with `setImmediate`, then one fires
 * every `intervalMs` through the (injectable) interval timer. Ticks
 * never overlap: a tick that fires while the previous one is still
 * running returns 0 immediately.
 *
 * Interval resolution: `deps.intervalMs` is used as given. Otherwise
 * `OLAM_WORLD_ACTIVITY_TICK_MS` is parsed as a base-10 integer; a
 * missing, empty, non-numeric or non-positive value falls back to
 * `DEFAULT_TICK_MS`. Without that guard a bad value reaches
 * `setInterval` as NaN/0, which Node coerces to a 1ms delay and turns
 * the scanner into a busy loop.
 *
 * @param {WorldActivityTrackerDeps} [deps]
 * @returns {WorldActivityTrackerHandle}
 */
export function startWorldActivityTracker(deps = {}) {
  const log = deps.log ?? ((m) => console.log(`[world-activity] ${m}`));
  const debug = deps.debug ?? (() => {});
  const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
  const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
  const now = deps.now ?? (() => new Date());

  let intervalMs = deps.intervalMs;
  if (intervalMs === undefined || intervalMs === null) {
    const rawTick = process.env.OLAM_WORLD_ACTIVITY_TICK_MS;
    const parsedTick = Number.parseInt(rawTick ?? '', 10);
    if (Number.isFinite(parsedTick) && parsedTick > 0) {
      intervalMs = parsedTick;
    } else {
      if (rawTick !== undefined && rawTick !== '') {
        log(
          `ignoring invalid OLAM_WORLD_ACTIVITY_TICK_MS=${JSON.stringify(rawTick)}; ` +
            `using ${DEFAULT_TICK_MS}ms`,
        );
      }
      intervalMs = DEFAULT_TICK_MS;
    }
  }

  const dbPath =
    deps.dbPath ??
    process.env.OLAM_WORLDS_DB ??
    path.join(os.homedir(), '.olam', 'worlds.db');
  const jsonlPathTemplate =
    deps.jsonlPathTemplate ??
    process.env.OLAM_WORLD_JSONL_PATH_TEMPLATE ??
    '~/.olam/worlds/{worldId}/state/claude-main.jsonl';
  const broadcaster = deps.broadcaster ?? null;

  let stopped = false;
  let inFlight = false;
  let intervalHandle = null;

  /**
   * One tick: open DB, read active worlds, scan each JSONL, write back,
   * emit event. Returns the count of worlds processed.
   *
   * @returns {Promise<number>}
   */
  async function tick() {
    if (stopped) return 0;
    if (inFlight) {
      // Skip overlap — slow filesystem must not pile up ticks.
      debug('tick skipped: previous tick still in flight');
      return 0;
    }
    inFlight = true;
    let db = null;
    let processed = 0;
    try {
      let Database;
      try {
        Database = require('better-sqlite3');
      } catch (err) {
        // better-sqlite3 unavailable (e.g. container without native
        // build) — skip this tick rather than crash.
        log(`better-sqlite3 unavailable; skipping tick: ${err.message}`);
        return 0;
      }
      try {
        db = new Database(dbPath, { fileMustExist: true });
      } catch (err) {
        // SQLITE_CANTOPEN (file absent) is the expected first-boot
        // case; everything else is worth surfacing.
        if (err.code !== 'SQLITE_CANTOPEN') {
          log(`open ${dbPath} failed: ${err.message}`);
        } else {
          debug(`${dbPath} not present yet; skipping tick`);
        }
        return 0;
      }
      let activeWorlds;
      try {
        activeWorlds = db
          .prepare(
            "SELECT id FROM worlds WHERE status NOT IN ('destroyed', 'failed')",
          )
          .all();
      } catch (err) {
        log(`query active worlds failed: ${err.message}`);
        return 0;
      }
      const updateStmt = db.prepare(
        `UPDATE worlds
         SET thought_count = ?,
             total_cost_usd = ?,
             updated_at = ?
         WHERE id = ?`,
      );
      for (const row of activeWorlds) {
        // Honour stop() between worlds so shutdown is not held up by a
        // long list.
        if (stopped) break;
        const worldId = row.id;
        if (typeof worldId !== 'string') continue;
        const jsonlPath = resolveJsonlPath(jsonlPathTemplate, worldId);
        let stats;
        try {
          stats = await scanWorldJsonl(jsonlPath);
        } catch (err) {
          // Defence in depth — scanWorldJsonl is already fail-soft, but
          // this catches anything unforeseen at the call seam.
          debug(`scan ${worldId} failed: ${err.message}`);
          continue;
        }
        const updatedAt = now().toISOString();
        try {
          updateStmt.run(
            stats.thoughtCount,
            stats.costUsd,
            updatedAt,
            worldId,
          );
        } catch (err) {
          log(`update ${worldId} failed: ${err.message}`);
          continue;
        }
        if (broadcaster && typeof broadcaster.broadcast === 'function') {
          try {
            broadcaster.broadcast('world.activity.tick', {
              worldId,
              thoughtCount: stats.thoughtCount,
              costUsd: stats.costUsd,
              inputTokens: stats.inputTokens,
              outputTokens: stats.outputTokens,
              lastActivityAt: stats.lastActivityAt,
              updatedAt,
            });
          } catch (err) {
            // A failing subscriber must not undo the DB write or stop
            // the remaining worlds.
            log(`broadcast ${worldId} failed: ${err.message}`);
          }
        }
        processed += 1;
      }
    } finally {
      if (db) {
        try { db.close(); } catch { /* ignore */ }
      }
      inFlight = false;
    }
    return processed;
  }

  // Kick off an initial tick on next event-loop turn so callers can
  // attach test spies before any DB work happens.
  setImmediate(() => {
    if (stopped) return;
    void tick().catch((err) => {
      log(`initial tick crashed: ${err?.message ?? err}`);
    });
  });
  intervalHandle = setTimer(() => {
    void tick().catch((err) => {
      log(`tick crashed: ${err?.message ?? err}`);
    });
  }, intervalMs);
  // Don't pin the event loop on shutdown.
  if (intervalHandle && typeof intervalHandle.unref === 'function') {
    intervalHandle.unref();
  }
  log(
    `started: db=${dbPath} template=${jsonlPathTemplate} interval=${intervalMs}ms`,
  );

  return {
    stop() {
      if (stopped) return;
      stopped = true;
      if (intervalHandle !== null) {
        try { clearTimer(intervalHandle); } catch { /* ignore */ }
        intervalHandle = null;
      }
    },
    tickNow: tick,
  };
}
| // Phase F-2-D follow-up: persistent world-name store. | ||
| // | ||
| // Background: world.id is the docker container suffix (e.g. `gold-arc-1454`) | ||
| // and is immutable. Operators want a separate human-friendly `name` | ||
| // (e.g. "Refactor the auth module") so the worlds list reads like a | ||
| // task board instead of a string of CSS-color-words. | ||
| // | ||
| // Storage: a single JSON file at /data/world-names.json (mounted from | ||
| // ~/.olam/world-names.json on the host). Atomic write via tmp+rename so | ||
| // concurrent PATCHes can't half-write the file. Read-on-demand with a | ||
| // tiny in-process cache keyed off mtime so steady-state GET /api/worlds | ||
| // doesn't reread the file every poll. | ||
| // | ||
| // Schema: | ||
| // { "<worldId>": "<name>", ... } | ||
| // | ||
| // Names are arbitrary UTF-8 strings, capped at NAME_MAX_LEN to keep | ||
| // the file small + the UI sane. | ||
| import fs from 'node:fs'; | ||
| import path from 'node:path'; | ||
/** Upper bound on a stored world name, counted in UTF-16 code units (`String#length`). */
const NAME_MAX_LEN = 120;
| /** | ||
| * @typedef {object} WorldNamesStore | ||
| * @property {() => Record<string, string>} all | ||
| * @property {(id: string) => string | null} get | ||
| * @property {(id: string, name: string) => string} set | ||
| * @property {(id: string) => void} remove | ||
| */ | ||
/**
 * Create a JSON-backed world-names store rooted at `filePath`.
 *
 * The file holds a flat `{ "<worldId>": "<name>" }` object. A missing file is
 * treated as empty; a malformed file is logged and treated as empty. Reads go
 * through an in-process cache that is refreshed whenever the file's mtime
 * differs from the one recorded at the last read/write.
 *
 * Lookups only ever consult OWN properties of the cache, so ids that collide
 * with `Object.prototype` members (`constructor`, `toString`, ...) behave like
 * any other unknown id.
 *
 * @param {string} filePath
 * @returns {WorldNamesStore}
 */
export function createWorldNamesStore(filePath) {
  /** @type {Record<string, string>} */
  let cache = {};
  let cacheMtimeMs = -1;

  /** Refresh `cache` from disk unless the file's mtime matches the cached one. */
  function readFromDisk() {
    if (!fs.existsSync(filePath)) {
      cache = {};
      cacheMtimeMs = 0;
      return;
    }
    try {
      const stat = fs.statSync(filePath);
      if (stat.mtimeMs === cacheMtimeMs) return; // cache hit
      const parsed = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
      const isPlainObject = parsed && typeof parsed === 'object' && !Array.isArray(parsed);
      // Object.fromEntries defines own data properties, so a key such as
      // "__proto__" is kept as data instead of going through the prototype
      // setter (where a string value would be silently discarded).
      cache = isPlainObject
        ? Object.fromEntries(Object.entries(parsed).filter(([, value]) => typeof value === 'string'))
        : {};
      cacheMtimeMs = stat.mtimeMs;
    } catch (err) {
      console.error(`world-names-store: failed to read ${filePath}: ${err.message}`);
      cache = {};
      cacheMtimeMs = 0;
    }
  }

  /**
   * Persist `next` via tmp-file + rename and only then adopt it as the cache,
   * so a failed write never leaves an un-persisted name visible to readers.
   *
   * @param {Record<string, string>} next
   */
  function writeToDisk(next) {
    fs.mkdirSync(path.dirname(filePath), { recursive: true });
    const tmp = `${filePath}.tmp-${process.pid}-${Date.now()}`;
    try {
      fs.writeFileSync(tmp, JSON.stringify(next, null, 2), 'utf-8');
      fs.renameSync(tmp, filePath);
    } catch (err) {
      // Don't leave a stray tmp file behind when the write or rename fails.
      try { fs.rmSync(tmp, { force: true }); } catch { /* best-effort cleanup */ }
      throw err;
    }
    cache = next;
    try {
      cacheMtimeMs = fs.statSync(filePath).mtimeMs;
    } catch {
      cacheMtimeMs = 0;
    }
  }

  /** @returns {Record<string, string>} a shallow copy of every stored name */
  function all() {
    readFromDisk();
    return { ...cache };
  }

  /**
   * @param {string} id
   * @returns {string | null} the stored name, or null when the id has none
   */
  function get(id) {
    readFromDisk();
    return Object.hasOwn(cache, id) ? cache[id] : null;
  }

  /**
   * @param {string} id
   * @param {string} name
   * @returns {string} the normalized name actually stored
   * @throws {Error} when `id` is not a non-empty string or `name` normalizes to nothing
   */
  function set(id, name) {
    if (typeof id !== 'string' || id.length === 0) {
      throw new Error('worldId must be a non-empty string');
    }
    const normalized = normalizeName(name);
    if (normalized === null) {
      throw new Error('name must be a non-empty string (after trim)');
    }
    readFromDisk();
    writeToDisk({ ...cache, [id]: normalized });
    return normalized;
  }

  /**
   * Delete the name stored for `id`. No-op (and no disk write) when absent.
   *
   * @param {string} id
   */
  function remove(id) {
    readFromDisk();
    if (!Object.hasOwn(cache, id)) return;
    const next = { ...cache };
    delete next[id];
    writeToDisk(next);
  }

  return { all, get, set, remove };
}
/**
 * Normalize a name input. Collapses every whitespace run to a single space,
 * trims, and caps the length at `maxLen` UTF-16 code units. The cap never
 * splits a surrogate pair: if the cut would leave a dangling high surrogate
 * (half of an emoji / astral character), that code unit is dropped too.
 * Returns null for non-string, empty, or whitespace-only input.
 *
 * @param {unknown} input
 * @param {number} [maxLen=NAME_MAX_LEN] maximum length of the result
 * @returns {string | null}
 */
export function normalizeName(input, maxLen = NAME_MAX_LEN) {
  if (typeof input !== 'string') return null;
  const collapsed = input.replace(/\s+/g, ' ').trim();
  if (collapsed.length === 0) return null;
  if (collapsed.length <= maxLen) return collapsed;

  let capped = collapsed.slice(0, maxLen);
  // A high surrogate (0xD800-0xDBFF) at the very end has lost its partner to
  // the cut; keeping it would store an ill-formed string.
  const lastUnit = capped.charCodeAt(capped.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    capped = capped.slice(0, -1);
  }
  return capped.trimEnd() || null;
}
/**
 * Derive a human-friendly name from an initial task / dispatch text.
 * Takes the first sentence (text up to the first `.`/`?`/`!`/line break),
 * collapses whitespace, and caps it at ~60 chars at a word boundary so the
 * UI doesn't truncate mid-word.
 * Returns null when nothing usable remains — caller falls back to id.
 *
 * @param {unknown} taskText
 * @returns {string | null}
 */
export function inferNameFromTask(taskText) {
  if (typeof taskText !== 'string') return null;
  const SOFT_CAP = 60;

  // Split BEFORE collapsing whitespace: collapsing first would turn every
  // line break into a space, so a newline could never end the sentence.
  const [firstSegment = ''] = taskText.trim().split(/[.!?\r\n]/);
  const firstSentence = firstSegment.replace(/\s+/g, ' ').trim();
  if (firstSentence.length <= SOFT_CAP) return firstSentence || null;

  // Cap at a word boundary close to SOFT_CAP so we don't dangle half a word.
  // A space in the first half is ignored: cutting there would lose too much.
  const head = firstSentence.slice(0, SOFT_CAP);
  const lastSpace = head.lastIndexOf(' ');
  const truncated = lastSpace > 30 ? head.slice(0, lastSpace) : head;
  // Stripping trailing separators can consume the whole string (e.g. a long
  // run of dashes); report that as "no name" rather than an empty string.
  return truncated.replace(/[\s,;:—–-]+$/u, '') || null;
}
| import fs from 'node:fs'; | ||
| import path from 'node:path'; | ||
| /** | ||
| * @typedef {object} PrStateEntry | ||
| * @property {string} pr_url | ||
| * @property {number|null} pr_number | ||
| * @property {string|null} pr_repo | ||
| * @property {string|null} pr_created_at | ||
| * @property {'open'|'merged'|'merged_destroyed'} pr_state | ||
| * @property {string|null} pr_merged_at | ||
| * @property {boolean} auto_destroy_on_merge | ||
| */ | ||
/**
 * Create a JSON-backed store of per-world pull-request state rooted at
 * `filePath`. The file holds `{ "<worldId>": PrStateEntry }`.
 *
 * A missing file is treated as empty; an unreadable or malformed file is
 * logged and treated as empty. Top-level values that are not plain objects
 * are ignored on load so one bad entry cannot break iteration. Reads go
 * through an in-process cache refreshed whenever the file's mtime changes.
 * Lookups consult OWN properties only, so ids such as `constructor` are
 * treated like any other unknown id.
 *
 * @param {string} filePath
 */
export function createWorldPrStateStore(filePath) {
  /** @type {Record<string, PrStateEntry>} */
  let cache = {};
  let cacheMtimeMs = -1;

  const isRecord = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  /** Refresh `cache` from disk unless the file's mtime matches the cached one. */
  function readFromDisk() {
    if (!fs.existsSync(filePath)) {
      cache = {};
      cacheMtimeMs = 0;
      return;
    }
    try {
      const stat = fs.statSync(filePath);
      if (stat.mtimeMs === cacheMtimeMs) return;
      const parsed = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
      cache = isRecord(parsed)
        ? Object.fromEntries(Object.entries(parsed).filter(([, entry]) => isRecord(entry)))
        : {};
      cacheMtimeMs = stat.mtimeMs;
    } catch (err) {
      console.error(`world-pr-state: failed to read ${filePath}: ${err.message}`);
      cache = {};
      cacheMtimeMs = 0;
    }
  }

  /**
   * Persist `next` via tmp-file + rename and only then adopt it as the cache,
   * so a failed write never leaves un-persisted state visible to readers.
   *
   * @param {Record<string, PrStateEntry>} next
   */
  function writeToDisk(next) {
    fs.mkdirSync(path.dirname(filePath), { recursive: true });
    const tmp = `${filePath}.tmp-${process.pid}-${Date.now()}`;
    try {
      fs.writeFileSync(tmp, JSON.stringify(next, null, 2), 'utf-8');
      fs.renameSync(tmp, filePath);
    } catch (err) {
      // Don't leave a stray tmp file behind when the write or rename fails.
      try { fs.rmSync(tmp, { force: true }); } catch { /* best-effort cleanup */ }
      throw err;
    }
    cache = next;
    try {
      cacheMtimeMs = fs.statSync(filePath).mtimeMs;
    } catch {
      cacheMtimeMs = 0;
    }
  }

  /** @returns {Record<string, PrStateEntry>} shallow copy of every entry */
  function getAll() {
    readFromDisk();
    return { ...cache };
  }

  /**
   * @param {string} worldId
   * @returns {PrStateEntry | null}
   */
  function get(worldId) {
    readFromDisk();
    return Object.hasOwn(cache, worldId) ? cache[worldId] : null;
  }

  /**
   * Upsert — merges data with the existing entry.
   * @param {string} worldId
   * @param {Partial<PrStateEntry>} data
   */
  function set(worldId, data) {
    readFromDisk();
    const existing = Object.hasOwn(cache, worldId) ? cache[worldId] : {};
    writeToDisk({ ...cache, [worldId]: { ...existing, ...data } });
  }

  /**
   * Delete the entry for `worldId`. No-op (and no disk write) when absent.
   * @param {string} worldId
   */
  function remove(worldId) {
    readFromDisk();
    if (!Object.hasOwn(cache, worldId)) return;
    const next = { ...cache };
    delete next[worldId];
    writeToDisk(next);
  }

  /**
   * Entries that have a PR url and are not yet in the `merged_destroyed`
   * state, each flattened to `{ worldId, ...entry }`.
   */
  function getWorldsToWatch() {
    readFromDisk();
    return Object.entries(cache)
      .filter(([, entry]) => isRecord(entry) && entry.pr_url && entry.pr_state !== 'merged_destroyed')
      .map(([worldId, entry]) => ({ worldId, ...entry }));
  }

  return { getAll, get, set, remove, getWorldsToWatch };
}
| /** | ||
| * World progress computation — maps world state onto the 8-phase ladder | ||
| * shown in the inbox row progress bar. | ||
| * | ||
| * @module world-progress | ||
| */ | ||
| import path from 'node:path'; | ||
| import { homedir } from 'node:os'; | ||
| import { createRequire } from 'node:module'; | ||
| import { execFile } from 'node:child_process'; | ||
| import { promisify } from 'node:util'; | ||
| import { readPlanProgress } from './plan-progress.mjs'; | ||
/** Promise-returning form of `child_process.execFile`. */
const execFileAsync = promisify(execFile);

// Mirror of @olam/core/src/world-paths.mjs, inlined on purpose: host-cp's
// slim Docker image does NOT bundle @olam/core (see server.mjs ~L560 for the
// architectural decision). Keep both definitions in sync until the host-cp
// image build is taught to vendor workspace deps.
const WORLD_DB_FILENAME = 'world.db';

/**
 * Path of the `world.db` file directly inside a workspace directory.
 *
 * @param {string} workspacePath
 * @returns {string}
 */
function getWorldDbPath(workspacePath) {
  const segments = [workspacePath, WORLD_DB_FILENAME];
  return path.join(...segments);
}
/**
 * Phase ladder definition: phase names in ladder order, each paired with its
 * 1-based position.
 * @type {Array<{name: string, index: number}>}
 */
const PHASES = [
  'starting',
  'implementing',
  'committing',
  'pushing',
  'in_review',
  'ci_failed',
  'ready',
  'merged',
].map((name, position) => ({ name, index: position + 1 }));

/** Number of rungs on the ladder. */
const PHASE_TOTAL = PHASES.length;

/** Inactivity window after which a world counts as idle: 5 minutes. */
const IDLE_THRESHOLD_MS = 300_000;
/**
 * Map observable world state onto a phase name.
 *
 * Precedence: a merged PR wins outright; otherwise, when a PR url exists the
 * check status decides (`failing` → ci_failed, `passing` on an open PR →
 * ready, anything else → in_review); without a PR the git/thought signals
 * decide (pushed → pushing, ≥1 commit ahead → committing, ≥30 thoughts →
 * implementing, else starting).
 *
 * @param {{
 *   thoughts: number,
 *   commitsAhead: number,
 *   pushed: boolean,
 *   prUrl: string|null,
 *   prChecks: 'pending'|'passing'|'failing'|null,
 *   prState: 'open'|'merged'|'closed'|null,
 * }} state
 * @returns {string} phase name
 */
export function determinePhase({ thoughts, commitsAhead, pushed, prUrl, prChecks, prState }) {
  if (prState === 'merged') return 'merged';

  if (!prUrl) {
    // No PR yet: fall back to local git / activity signals.
    if (pushed) return 'pushing';
    if (commitsAhead >= 1) return 'committing';
    return thoughts >= 30 ? 'implementing' : 'starting';
  }

  switch (prChecks) {
    case 'failing':
      return 'ci_failed';
    case 'passing':
      return prState === 'open' ? 'ready' : 'in_review';
    default:
      return 'in_review';
  }
}
/**
 * Build the safe/default progress response for a world: first phase, no
 * activity, no git progress, no PR, no plan.
 *
 * @param {string} worldId
 * @returns {object}
 */
export function makeSafeResponse(worldId) {
  const phaseDefaults = { phase: 'starting', phaseIndex: 1, phaseTotal: PHASE_TOTAL };
  const activityDefaults = { isIdle: false, thoughts: 0, lastActivityAt: null, runtimeMs: 0 };
  const gitDefaults = { commitsAhead: 0, pushed: false };
  const prDefaults = { prUrl: null, prNumber: null, prChecks: null, prState: null };
  return {
    worldId,
    ...phaseDefaults,
    ...activityDefaults,
    ...gitDefaults,
    ...prDefaults,
    plan: null,
  };
}
/**
 * Read a world row from worlds.db.
 *
 * Best-effort: any failure (driver missing, file missing, bad schema) yields
 * null. The database handle is always closed, including when the pragma or
 * the query throws. `repos` is decoded from JSON when stored as a string and
 * falls back to `[]` when it is missing, unparseable, or not an array.
 *
 * @param {string} dbPath
 * @param {string} worldId
 * @returns {{ branch: string, repos: string[], workspacePath: string, createdAt: string|null } | null}
 */
function defaultReadWorldRow(dbPath, worldId) {
  let db = null;
  try {
    const Database = createRequire(import.meta.url)('better-sqlite3');
    db = new Database(dbPath, { readonly: true });
    db.pragma('journal_mode = WAL');
    const row = db
      .prepare('SELECT branch, repos, workspace_path, created_at FROM worlds WHERE id = ?')
      .get(worldId);
    if (!row) return null;

    let repos = [];
    try {
      const decoded = typeof row.repos === 'string' ? JSON.parse(row.repos) : row.repos;
      if (Array.isArray(decoded)) repos = decoded;
    } catch {
      repos = [];
    }

    return {
      branch: row.branch ?? 'main',
      repos,
      workspacePath: row.workspace_path ?? '',
      createdAt: row.created_at ?? null,
    };
  } catch {
    return null;
  } finally {
    if (db) {
      try { db.close(); } catch { /* ignore */ }
    }
  }
}
/**
 * Read thought count and last activity from a world.db.
 *
 * Best-effort: any failure yields `{ count: 0, lastAt: null }`. The database
 * handle is always closed, including when the pragma or the query throws.
 *
 * @param {string} dbPath
 * @returns {{ count: number, lastAt: string|null }}
 */
function defaultReadThoughts(dbPath) {
  let db = null;
  try {
    const Database = createRequire(import.meta.url)('better-sqlite3');
    db = new Database(dbPath, { readonly: true });
    db.pragma('journal_mode = WAL');
    const row = db
      .prepare('SELECT COUNT(*) AS cnt, MAX(created_at) AS last_at FROM thought_nodes')
      .get();
    return {
      count: Number(row?.cnt ?? 0),
      lastAt: row?.last_at ?? null,
    };
  } catch {
    return { count: 0, lastAt: null };
  } finally {
    if (db) {
      try { db.close(); } catch { /* ignore */ }
    }
  }
}
/**
 * Count the commits on HEAD that are not on `origin/main` for a git worktree
 * (`git rev-list origin/main..HEAD --count`, 5 s timeout).
 * Resolves to 0 when git fails or prints something that is not a number.
 *
 * @param {string} worktreePath
 * @returns {Promise<number>}
 */
async function defaultGitCommitsAhead(worktreePath) {
  try {
    const gitArgs = ['-C', worktreePath, 'rev-list', 'origin/main..HEAD', '--count'];
    const result = await execFileAsync('git', gitArgs, { timeout: 5000 });
    const ahead = Number.parseInt(result.stdout.trim(), 10);
    if (Number.isFinite(ahead)) return ahead;
    return 0;
  } catch {
    return 0;
  }
}
/**
 * Check whether `origin/<branch>` resolves in the worktree
 * (`git rev-parse --quiet --verify`, 5 s timeout). Any git failure — missing
 * ref, missing repo, timeout — counts as "not pushed".
 *
 * @param {string} worktreePath
 * @param {string} branch
 * @returns {Promise<boolean>}
 */
async function defaultGitIsPushed(worktreePath, branch) {
  let pushed = true;
  try {
    const verifyArgs = ['-C', worktreePath, 'rev-parse', '--quiet', '--verify', `origin/${branch}`];
    await execFileAsync('git', verifyArgs, { timeout: 5000 });
  } catch {
    pushed = false;
  }
  return pushed;
}
/**
 * Compute the current progress state for a world.
 *
 * Gathers the world row, thought stats, git state and PR state (persisted
 * store first, then live cache), derives the phase, and returns the full
 * progress payload. Never throws: a missing world row or any unexpected
 * error yields the safe default response from `makeSafeResponse`.
 *
 * `runtimeMs` is always a finite, non-negative number — a missing or
 * unparseable `createdAt` yields 0 rather than NaN.
 *
 * @param {string} worldId
 * @param {{
 *   worldsDbPath?: string,
 *   prCache?: { getPr: (prUrl: string, getToken: () => Promise<string|null>) => Promise<{state:string|null,number:number|null,checks:string|null}|null> },
 *   prStateStore?: { get: (worldId: string) => object|null },
 *   getGhToken?: () => Promise<string|null>,
 *   _readWorldRow?: (dbPath: string, worldId: string) => object|null,
 *   _readThoughts?: (dbPath: string) => { count: number, lastAt: string|null },
 *   _gitCommitsAhead?: (worktreePath: string) => Promise<number>,
 *   _gitIsPushed?: (worktreePath: string, branch: string) => Promise<boolean>,
 * }} [deps] injectable collaborators; the `_`-prefixed ones default to the module's own readers
 * @returns {Promise<object>}
 */
export async function computeProgress(worldId, deps = {}) {
  const safe = makeSafeResponse(worldId);
  try {
    const {
      worldsDbPath = process.env.OLAM_WORLDS_DB ?? path.join(homedir(), '.olam/worlds.db'),
      prCache = null,
      prStateStore = null,
      getGhToken = async () => null,
      _readWorldRow = defaultReadWorldRow,
      _readThoughts = defaultReadThoughts,
      _gitCommitsAhead = defaultGitCommitsAhead,
      _gitIsPushed = defaultGitIsPushed,
    } = deps;

    // Read world row
    const worldRow = _readWorldRow(worldsDbPath, worldId);
    if (!worldRow) return safe;
    const { branch, repos, workspacePath, createdAt } = worldRow;
    // The first repo (when any) is the worktree that git state is read from.
    const worktreePath = repos.length > 0 ? path.join(workspacePath, repos[0]) : workspacePath;

    // Runtime since creation; an unparseable timestamp must not leak NaN.
    const createdAtMs = createdAt ? new Date(createdAt).getTime() : Number.NaN;
    const runtimeMs = Number.isNaN(createdAtMs) ? 0 : Math.max(0, Date.now() - createdAtMs);

    // Read thoughts
    const thoughtsDbPath = getWorldDbPath(workspacePath);
    const { count: thoughts, lastAt: thoughtsLastAt } = _readThoughts(thoughtsDbPath);
    // Parsed once; reused by the idle overlay and the plan-progress reader.
    const lastActivityAtMs = thoughtsLastAt ? new Date(thoughtsLastAt).getTime() : null;

    // Git state
    const [commitsAhead, pushed] = await Promise.all([
      _gitCommitsAhead(worktreePath),
      _gitIsPushed(worktreePath, branch),
    ]);

    // PR state — check prStateStore first
    let prUrl = null;
    let prNumber = null;
    let prState = null;
    let prChecks = null;
    if (prStateStore) {
      const prEntry = prStateStore.get(worldId);
      if (prEntry) {
        prUrl = prEntry.pr_url ?? null;
        prNumber = prEntry.pr_number ?? null;
        // Normalize merged_destroyed → merged, and the 'none' marker → null.
        const rawState = prEntry.pr_state ?? null;
        prState = rawState === 'merged_destroyed' ? 'merged' : (rawState === 'none' ? null : rawState);
      }
    }

    // Live PR data from cache
    if (prUrl && prCache) {
      try {
        const livePr = await prCache.getPr(prUrl, getGhToken);
        if (livePr) {
          prChecks = livePr.checks;
          // Update state if live data shows merged
          if (livePr.state === 'merged') prState = 'merged';
          if (livePr.number != null) prNumber = livePr.number;
        }
      } catch {
        // Non-fatal: fall back to the persisted PR state.
      }
    }

    // Determine phase
    const phase = determinePhase({ thoughts, commitsAhead, pushed, prUrl, prChecks, prState });
    const phaseEntry = PHASES.find((p) => p.name === phase) ?? PHASES[0];

    // Idle overlay — only for implementing or committing phases
    const idleEligible = phase === 'implementing' || phase === 'committing';
    const isIdle =
      idleEligible &&
      lastActivityAtMs !== null &&
      !Number.isNaN(lastActivityAtMs) &&
      Date.now() - lastActivityAtMs > IDLE_THRESHOLD_MS;

    // Plan progress — additive; null when no tracker found
    const plan = readPlanProgress(worktreePath, branch, { lastActivityAtMs });

    return {
      worldId,
      phase,
      phaseIndex: phaseEntry.index,
      phaseTotal: PHASE_TOTAL,
      isIdle,
      thoughts,
      lastActivityAt: thoughtsLastAt ?? null,
      runtimeMs,
      commitsAhead,
      pushed,
      prUrl,
      prNumber,
      prChecks,
      prState,
      plan,
    };
  } catch {
    return safe;
  }
}
| // Service enrichment (Phase F-2-D dogfood fix) — extracted from server.mjs. | ||
| // | ||
| // Fetch port bindings for a world's container via docker-socket-proxy | ||
| // inspect, map each to a clickable URL tagged with well-known internal | ||
| // ports, and probe each for actual reachability. | ||
| // | ||
| // Extracted as a standalone module so the probe + enrichment logic can be | ||
| // unit-tested in isolation (server.mjs has module-level side effects that | ||
| // make direct import impractical). The two host-specific values that the | ||
| // inline version read from server.mjs module constants — HOST_FOR_WORLD and | ||
| // DOCKER_HOST — are injected as a `deps` object so the functions stay pure | ||
| // and deterministically testable. | ||
/** Display labels for well-known container-internal ports, keyed by port number. */
export const WELL_KNOWN_PORTS = Object.fromEntries([
  [3000, 'atlas-core (Rails)'],
  [5175, 'diner-app (Vite)'],
  [7681, 'Terminal (ttyd)'],
  [7682, 'Terminal Shell (ttyd)'],
  [8080, 'Per-world CP'],
]);
/**
 * Quick liveness probe against a service URL. Returns true if the
 * service responds with ANY HTTP response (1xx-5xx) — status codes are
 * ignored because each app has its own conventions (Vite 200s on /, ttyd
 * may 401, Rails may 500 on /, the per-world CP 200s). What matters is
 * that something is listening.
 *
 * Probed from inside the host-cp container, hence `hostForWorld`
 * (host.docker.internal on macOS/Windows, 172.17.0.1 on Linux) — the
 * SPA's own 127.0.0.1:<port> URL is unreachable from container-side.
 *
 * Tries HEAD first, then GET if the HEAD request fails for any reason
 * (some servers, e.g. ttyd, close the connection on HEAD). Each attempt has
 * its own 800ms timeout, so the worst case for one probe is two timeouts
 * back to back. Redirects are not followed, and any response body is
 * cancelled unread so the probe does not hold the connection open.
 *
 * @param {number} hostPort
 * @param {{ hostForWorld: string }} deps
 * @returns {Promise<boolean>}
 */
export async function probeServiceLive(hostPort, { hostForWorld }) {
  const probeUrl = `http://${hostForWorld}:${hostPort}/`;
  for (const method of ['HEAD', 'GET']) {
    try {
      const res = await fetch(probeUrl, {
        method,
        signal: AbortSignal.timeout(800),
        redirect: 'manual',
      });
      const { status } = res;
      // Only the status matters; discard the body so the socket is released.
      try { await res.body?.cancel(); } catch { /* body already closed */ }
      return status > 0;
    } catch {
      // ECONNREFUSED, timeout, DNS, abrupt close — fall through to the
      // next method; after the last one the service counts as not-live.
    }
  }
  return false;
}
/**
 * List the published ports of a world's devbox container
 * (`olam-<worldId>-devbox`) as clickable services, each probed for
 * reachability. The docker port mapping only says what is CONFIGURED; the
 * probe (`live`) says what is actually LISTENING.
 *
 * Container metadata comes from `docker inspect` when `dockerHost` is the
 * literal `'docker-cli'` (bare-node mode), otherwise from the Docker HTTP
 * API at `dockerHost` (a `tcp://` prefix is rewritten to `http://`). Both
 * lookups use a 2s timeout.
 *
 * Returns [] on any failure (container missing, socket-proxy down, bad
 * JSON, ...) so the API still returns a valid worlds list.
 *
 * @param {string} worldId
 * @param {{ hostForWorld: string, dockerHost: string }} deps
 * @returns {Promise<Array<{name: string, host_port: number, internal_port: number, url: string, live: boolean}>>}
 */
export async function fetchWorldServices(worldId, { hostForWorld, dockerHost }) {
  const containerName = `olam-${worldId}-devbox`;
  try {
    let inspect;
    if (dockerHost === 'docker-cli') {
      // Bare-node mode: shell out to `docker inspect` instead of HTTP.
      // Without this the services array is always empty in bare-node and the
      // SPA can't find the ttyd host port → terminal renders blank.
      const { spawnSync } = await import('node:child_process');
      const { status, stdout } = spawnSync(
        'docker',
        ['inspect', containerName],
        { encoding: 'utf-8', timeout: 2000 },
      );
      if (status !== 0) return [];
      const inspected = JSON.parse(stdout || '[]');
      inspect = Array.isArray(inspected) && inspected.length > 0 ? inspected[0] : null;
      if (!inspect) return [];
    } else {
      const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://');
      const inspectUrl = `${apiBase}/containers/${encodeURIComponent(containerName)}/json`;
      const res = await fetch(inspectUrl, { signal: AbortSignal.timeout(2000) });
      if (!res.ok) return [];
      inspect = await res.json();
    }

    // One draft service per port that has at least one host binding; the
    // first binding's host port is the one exposed.
    const portMap = inspect?.NetworkSettings?.Ports ?? {};
    const draft = Object.entries(portMap).flatMap(([internal, bindings]) => {
      if (!Array.isArray(bindings) || bindings.length === 0) return [];
      const internalPort = parseInt(internal.split('/')[0], 10);
      const hostPort = parseInt(bindings[0].HostPort, 10);
      if (!Number.isFinite(internalPort) || !Number.isFinite(hostPort)) return [];
      return [{
        name: WELL_KNOWN_PORTS[internalPort] ?? `App (port ${internalPort})`,
        host_port: hostPort,
        internal_port: internalPort,
        url: `http://127.0.0.1:${hostPort}`,
      }];
    });

    // Probe every service in parallel and attach the result as `live`, so the
    // UI can tell configured-but-down from configured-and-up at a glance.
    const services = await Promise.all(
      draft.map(async (svc) => ({
        ...svc,
        live: await probeServiceLive(svc.host_port, { hostForWorld }),
      })),
    );

    // Stable order: ascending internal port number.
    services.sort((left, right) => left.internal_port - right.internal_port);
    return services;
  } catch {
    return [];
  }
}
| import { spawn } from 'node:child_process'; | ||
| import fs from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
// Deployment-mode values; server.mjs overrides them through configure().
// The defaults are bare-node-safe, so tests can use the module without
// calling configure() first.
let HOST_FOR_WORLD = process.env.OLAM_HOST_FOR_WORLD ?? '127.0.0.1';
let TUNNELS_PATH =
  process.env.OLAM_WORLD_TUNNELS_PATH ?? path.join(os.homedir(), '.olam', 'world-tunnels.json');

/**
 * Inject the deployment-specific host and persistence path. Intended to be
 * called by server.mjs right after it resolves HOST_FOR_WORLD and
 * WORLD_TUNNELS_PATH, so this module never has to re-derive
 * container-specific literals (host.docker.internal, /data/…).
 *
 * `hostForWorld` always replaces the current value. `tunnelsPath` replaces
 * the current path — and triggers loadState() against the new file — only
 * when it differs from the path currently in effect.
 *
 * @param {{ hostForWorld: string, tunnelsPath: string }} options
 */
export function configure({ hostForWorld, tunnelsPath }) {
  HOST_FOR_WORLD = hostForWorld;
  if (tunnelsPath === TUNNELS_PATH) return;
  TUNNELS_PATH = tunnelsPath;
  loadState();
}
/** How long startTunnel waits for cloudflared to print a URL. */
const TUNNEL_TIMEOUT_MS = 30 * 1000;
/** Per-URL timeout for the startup reachability probe. */
const PROBE_TIMEOUT_MS = 3 * 1000;
/** Matches the quick-tunnel URL in cloudflared's output. */
const URL_PATTERN = /https:\/\/[a-z0-9-]+\.trycloudflare\.com/;

/** Tunnel lifecycle states as stored in the registry. */
export const STATUS = {
  IDLE: 'idle',
  STARTING: 'starting',
  RUNNING: 'running',
  ERROR: 'error',
  STALE: 'stale',
};
/** Thrown by startTunnel when the service's tunnel is already starting or running. */
export class AlreadyStartingError extends Error {
  /**
   * @param {string} worldId
   * @param {string} serviceName
   */
  constructor(worldId, serviceName) {
    const message = `tunnel for ${serviceName} in world ${worldId} is already starting`;
    super(message);
    Object.assign(this, { name: 'AlreadyStartingError', worldId, serviceName });
  }
}
/** Rejection used by startTunnel when no tunnel URL was obtained. */
export class TunnelTimeoutError extends Error {
  /**
   * @param {string} worldId
   * @param {string} serviceName
   */
  constructor(worldId, serviceName) {
    const message = `tunnel for ${serviceName} in world ${worldId} timed out after 30s with no URL`;
    super(message);
    Object.assign(this, { name: 'TunnelTimeoutError', worldId, serviceName });
  }
}
// In-memory tunnel registry.
// Key: `${worldId}:${serviceName}` → {worldId, serviceName, port, status, url, process?}
const registry = new Map();

/**
 * Registry key for one service of one world.
 *
 * @param {string} worldId
 * @param {string} serviceName
 * @returns {string}
 */
function tunnelKey(worldId, serviceName) {
  return ''.concat(worldId, ':', serviceName);
}
/**
 * Load persisted tunnel entries from TUNNELS_PATH into the registry.
 *
 * Entries come back without a child process (`process: null`). Because of
 * that, an entry persisted as `starting` can never finish starting: it is
 * reset to `idle` (url cleared) so startTunnel does not reject it forever
 * with AlreadyStartingError. Values that are not plain objects are skipped.
 * A missing file is a no-op; read/parse failures are logged, never thrown.
 */
function loadState() {
  try {
    if (!fs.existsSync(TUNNELS_PATH)) return;
    const data = JSON.parse(fs.readFileSync(TUNNELS_PATH, 'utf-8'));
    if (!data || typeof data !== 'object' || Array.isArray(data)) return;
    for (const [key, entry] of Object.entries(data)) {
      if (!entry || typeof entry !== 'object' || Array.isArray(entry)) continue;
      const restored = { ...entry, process: null };
      if (restored.status === STATUS.STARTING) {
        // The process that was starting this tunnel is not attached to this
        // registry entry, so nothing will ever move it out of `starting`.
        restored.status = STATUS.IDLE;
        restored.url = null;
      }
      registry.set(key, restored);
    }
  } catch (err) {
    console.error(`world-tunnel-manager: loadState failed: ${err.message}`);
  }
}
/**
 * Persist the registry to TUNNELS_PATH as JSON, minus each entry's live
 * `process` handle. Written to a tmp file and renamed into place. Failures
 * are logged, never thrown.
 */
function saveState() {
  try {
    fs.mkdirSync(path.dirname(TUNNELS_PATH), { recursive: true });
    const snapshot = {};
    registry.forEach((entry, key) => {
      // eslint-disable-next-line no-unused-vars
      const { process: _child, ...persisted } = entry;
      snapshot[key] = persisted;
    });
    const serialized = JSON.stringify(snapshot, null, 2);
    const tmp = `${TUNNELS_PATH}.tmp-${process.pid}-${Date.now()}`;
    fs.writeFileSync(tmp, serialized, 'utf-8');
    fs.renameSync(tmp, TUNNELS_PATH);
  } catch (err) {
    console.error(`world-tunnel-manager: saveState failed: ${err.message}`);
  }
}
/**
 * Start a cloudflared quick-tunnel for a world service.
 * Resolves with the assigned trycloudflare.com URL.
 * Rejects with AlreadyStartingError if the service is already starting/running.
 * Rejects with TunnelTimeoutError if no URL is emitted within
 * TUNNEL_TIMEOUT_MS, if cloudflared fails to spawn, or if it exits before
 * printing a URL.
 *
 * On any such failure the child process is terminated so that no orphaned
 * cloudflared keeps running. Status changes driven by the child (error on
 * failure, error when the tunnel later drops) are applied only while the
 * registry entry still points at this child — once stopTunnel/killWorld has
 * detached it, the status they set is left alone.
 *
 * @param {string} worldId
 * @param {string} serviceName
 * @param {number} port host-side port (i.e. the published port on this machine)
 * @returns {Promise<string>} the public tunnel URL
 */
export async function startTunnel(worldId, serviceName, port) {
  const key = tunnelKey(worldId, serviceName);
  const existing = registry.get(key);
  if (existing && (existing.status === STATUS.STARTING || existing.status === STATUS.RUNNING)) {
    throw new AlreadyStartingError(worldId, serviceName);
  }

  const entry = {
    worldId,
    serviceName,
    port,
    status: STATUS.STARTING,
    url: null,
    process: null,
  };
  registry.set(key, entry);
  saveState();

  const target = `http://${HOST_FOR_WORLD}:${port}`;
  const child = spawn('cloudflared', ['tunnel', '--url', target], {
    stdio: ['ignore', 'pipe', 'pipe'],
    detached: false,
  });
  entry.process = child;

  return new Promise((resolve, reject) => {
    let settled = false;
    const timer = setTimeout(() => settle(null), TUNNEL_TIMEOUT_MS);

    /** Flip the entry to `error` unless it was detached from this child. */
    function markErrorIfOwned() {
      if (entry.process !== child) return;
      entry.status = STATUS.ERROR;
      entry.process = null;
      saveState();
    }

    /** Settle the promise exactly once: a URL resolves, null rejects. */
    function settle(resolvedUrl) {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      if (resolvedUrl) {
        entry.status = STATUS.RUNNING;
        entry.url = resolvedUrl;
        saveState();
        resolve(resolvedUrl);
        return;
      }
      markErrorIfOwned();
      // A cloudflared that timed out is still alive; don't leave it behind.
      try { child.kill('SIGTERM'); } catch { /* already dead */ }
      reject(new TunnelTimeoutError(worldId, serviceName));
    }

    // cloudflared may print the URL on either stream.
    function scanChunk(chunk) {
      for (const line of chunk.toString().split('\n')) {
        const match = URL_PATTERN.exec(line);
        if (match) { settle(match[0]); return; }
      }
    }
    child.stdout.on('data', scanChunk);
    child.stderr.on('data', scanChunk);

    child.on('error', (err) => {
      console.error(`world-tunnel-manager: cloudflared spawn error: ${err.message}`);
      settle(null);
    });

    child.on('exit', (code) => {
      if (!settled) {
        console.error(`world-tunnel-manager: cloudflared exited (code ${code}) before URL`);
        settle(null);
        return;
      }
      // Process died after the promise settled. If the entry still owns this
      // child the tunnel dropped on its own; if not, it was stopped on
      // purpose and its status must not be overwritten.
      markErrorIfOwned();
    });
  });
}
/**
 * Stop the tunnel of one service: SIGTERM its process (if any), reset the
 * entry to idle with no URL, and persist. No-op if the service has no
 * registry entry.
 *
 * @param {string} worldId
 * @param {string} serviceName
 */
export function stopTunnel(worldId, serviceName) {
  const entry = registry.get(tunnelKey(worldId, serviceName));
  if (!entry) return;

  const { process: child } = entry;
  if (child) {
    try { child.kill('SIGTERM'); } catch { /* already dead */ }
    entry.process = null;
  }
  Object.assign(entry, { status: STATUS.IDLE, url: null });
  saveState();
}
/**
 * Snapshot of the tunnel state of ALL worlds, grouped by worldId. Used by
 * the host-stream broadcaster (sse-consolidation Phase B-bonus) to push a
 * `tunnels.snapshot` whenever the registry changes — replaces the SPA's
 * per-row `usePublishedTunnels` poll loop.
 *
 * @returns {{ [worldId: string]: Array<{name: string, port: number, url: string|null, status: string}> }}
 */
export function getAllTunnels() {
  /** @type {Record<string, Array<{name: string, port: number, url: string|null, status: string}>>} */
  const byWorld = {};
  registry.forEach(({ worldId, serviceName, port, url, status }) => {
    const bucket = byWorld[worldId] || (byWorld[worldId] = []);
    bucket.push({ name: serviceName, port, url, status });
  });
  return byWorld;
}
/**
 * List the tunnel state of every service registered for a single world.
 *
 * @param {string} worldId
 * @returns {Array<{name: string, port: number, url: string|null, status: string}>}
 *   One descriptor per registry entry whose worldId matches; empty when none do.
 */
export function getWorldTunnels(worldId) {
  return [...registry.values()]
    .filter((entry) => entry.worldId === worldId)
    .map(({ serviceName, port, url, status }) => ({
      name: serviceName,
      port,
      url,
      status,
    }));
}
/**
 * Terminate and forget every tunnel that belongs to a world.
 *
 * Intended for world destruction. Each matching entry's child process (if any)
 * receives SIGTERM, the entry is removed from the registry, and the registry
 * is persisted once if anything was removed. Calling it for a world without
 * tunnels changes nothing.
 *
 * @param {string} worldId
 */
export function killWorld(worldId) {
  // Snapshot the matching entries first so the registry is not mutated while
  // it is being iterated.
  const doomed = [...registry].filter(([, entry]) => entry.worldId === worldId);

  for (const [key, entry] of doomed) {
    if (entry.process) {
      try {
        entry.process.kill('SIGTERM');
      } catch {
        /* already dead */
      }
      entry.process = null;
    }
    registry.delete(key);
  }

  if (doomed.length !== 0) saveState();
}
/**
 * Startup health check for persisted tunnels.
 *
 * Every entry recorded as RUNNING with a URL is fetched (bounded by
 * PROBE_TIMEOUT_MS). If the request fails, times out, or answers with a
 * non-2xx status, the entry is flagged STALE and the registry is persisted so
 * the UI can offer a Re-publish affordance. All probes run concurrently.
 *
 * @returns {Promise<void>}
 */
export async function probeAllOnStartup() {
  const candidates = [...registry]
    .filter(([, entry]) => entry.status === STATUS.RUNNING && entry.url)
    .map(([key, entry]) => ({ key, url: entry.url }));

  // Resolves true only for a successful (2xx) response; any failure is false.
  const isReachable = async (url) => {
    try {
      const res = await fetch(url, {
        signal: AbortSignal.timeout(PROBE_TIMEOUT_MS),
      });
      return res.ok;
    } catch {
      return false;
    }
  };

  await Promise.all(
    candidates.map(async ({ key, url }) => {
      if (await isReachable(url)) return;
      // Re-read the entry: it may have been removed while the probe was in flight.
      const entry = registry.get(key);
      if (!entry) return;
      entry.status = STATUS.STALE;
      saveState();
    }),
  );
}
/**
 * Send SIGTERM to every tunnel child process still attached to a registry
 * entry and drop the handles. Registry statuses are left untouched and nothing
 * is persisted.
 */
function killAll() {
  for (const entry of registry.values()) {
    if (!entry.process) continue;
    try {
      entry.process.kill('SIGTERM');
    } catch {
      /* already dead */
    }
    entry.process = null;
  }
}

// Make sure tunnel children do not outlive this process.
process.on('SIGTERM', killAll);
process.on('exit', killAll);

// Initialise on module load using env-var or bare-node default path.
// configure() re-runs loadState() when server.mjs provides a different path
// (container mode: /data/world-tunnels.json vs the ~/.olam default above).
loadState();
| /** | ||
| * world-watchdog-pid-lookup.mjs — host-visible PID lookup for the world watchdog. | ||
| * | ||
| * Uses `docker top <containerId>` to enumerate processes inside a world's | ||
| * container and returns the host-visible PID of the claude process. | ||
| * | ||
| * `docker top` output format (Linux Docker / Colima): | ||
| * UID PID PPID C STIME TTY TIME CMD | ||
| * root 1234 1 0 10:00 ? 00:00:00 node /usr/local/bin/claude ... | ||
| * | ||
| * The PID column (index 1 in default ps output) is already the host-visible | ||
| * PID. On Mac/Colima the container runs inside a Linux VM so `docker top` | ||
| * returns PIDs within the VM's PID namespace — these are NOT the macOS host | ||
| * PIDs, but they ARE the PIDs visible from within the Linux layer (where | ||
| * /proc reads happen). This is the same namespace the watchdog probes use | ||
| * when reading /proc/<pid>/wchan etc., so the PIDs are correct for probe use. | ||
| * | ||
| * Inject `docker` for tests (avoids spawning real docker processes). | ||
| * | ||
| * @see docs/architecture/world-watchdog.md | ||
| */ | ||
| import { execFile } from 'node:child_process'; | ||
| import { promisify } from 'node:util'; | ||
// Promise-returning flavour of child_process.execFile.
const execFileAsync = promisify(execFile);

/**
 * Production implementation of the `dockerTop` dependency: runs the real
 * `docker top <containerId>` CLI with a 5 second timeout.
 *
 * @param {string} containerId
 * @returns {Promise<string>} stdout from `docker top <containerId>`;
 *   rejects if the command fails or times out.
 */
async function defaultDockerTop(containerId) {
  const result = await execFileAsync('docker', ['top', containerId], {
    timeout: 5_000,
  });
  return result.stdout;
}
/**
 * Extract the PIDs of claude processes from `docker top` output.
 *
 * The output is expected in the default `ps -ef` layout:
 *   UID PID PPID C STIME TTY TIME CMD
 * i.e. column 1 is the PID and everything from column 7 onward is the command.
 * The first non-blank line is treated as the header and skipped. Rows with
 * fewer than 8 whitespace-separated columns or a non-positive / non-numeric
 * PID are ignored.
 *
 * @param {string} stdout Raw output from `docker top <id>`
 * @returns {number[]} PIDs of matching claude processes, sorted ascending.
 */
export function parseDockerTopOutput(stdout) {
  const rows = stdout.split('\n').filter((row) => row.trim().length > 0);
  if (rows.length < 2) return []; // header only or empty

  // `claude` as a standalone binary (bare or at the end of a path) ...
  const standaloneClaude = /(?:^|\/)claude(\s|$)/;
  // ... or a `node` process whose arguments include a path ending in claude.
  const nodeRunningClaude = /node[^\s]*\s+.*[/\\]claude(?:\s|$)/;

  const matches = rows.slice(1).flatMap((row) => {
    // Every run of whitespace separates columns; the command is rebuilt below
    // by re-joining columns 7+ with single spaces.
    const columns = row.trim().split(/\s+/);
    if (columns.length < 8) return [];

    const pid = parseInt(columns[1], 10);
    if (!(Number.isFinite(pid) && pid > 0)) return [];

    const command = columns.slice(7).join(' ');
    const isClaude = standaloneClaude.test(command) || nodeRunningClaude.test(command);
    return isClaude ? [pid] : [];
  });

  return matches.sort((a, b) => a - b);
}
/**
 * Resolve the PID of the claude process running inside a container.
 *
 * When several claude processes match, the lowest PID is returned (parent
 * process heuristic — the supervisor is assumed to have a lower PID than the
 * workers it spawns).
 *
 * Fail-soft behaviour:
 *   - empty / missing containerId              → null (docker is not invoked)
 *   - `dockerTop` rejects (docker unreachable,
 *     container not found, ...)                → null, failure is logged
 *   - no claude process in the listing         → null (silent)
 *
 * @param {{
 *   containerId: string,
 *   dockerTop?: (containerId: string) => Promise<string>,
 *   log?: (msg: string) => void,
 * }} opts
 *   `dockerTop` and `log` are injectable for tests.
 * @returns {Promise<number | null>}
 */
export async function findClaudePid({
  containerId,
  dockerTop = defaultDockerTop,
  log = (m) => console.log(`[world-watchdog-pid-lookup] ${m}`),
}) {
  if (!containerId) return null;

  let listing;
  try {
    listing = await dockerTop(containerId);
  } catch (err) {
    log(`docker top ${containerId} failed: ${err?.message ?? err}`);
    return null;
  }

  // The parser returns PIDs in ascending order, so the first one is the lowest.
  const [lowestPid = null] = parseDockerTopOutput(listing);
  return lowestPid;
}
| /** | ||
| * world-watchdog-probes.mjs — pure probe functions for the world watchdog. | ||
| * | ||
| * Three readers extract raw signals from the Linux /proc filesystem: | ||
| * - readWchan(pid, opts) → string | null | ||
| * - readCloseWaitSockets(pid, opts) → Array<{remoteIp, remotePort}> | ||
| * - readCpuPercent(pid, windowMs, opts) → number | null | ||
| * | ||
| * One pure classifier turns those signals into a verdict: | ||
| * - classify({ wchan, closeWaitCount, cpuPercent }) → 'healthy'|'suspect'|'wedged' | ||
| * | ||
| * All readers are fail-soft: any I/O error or parse error returns | ||
| * null / [] / 0 rather than throwing. The classifier treats null inputs as | ||
| * the signal not firing (conservative — only promotes to 'wedged' when all | ||
| * three signals are conclusive). | ||
| * | ||
| * Test injection: pass `opts.procRoot` to redirect /proc reads to a fixture | ||
| * directory (e.g. src/__tests__/fixtures/proc-gold-elk-5574/). | ||
| * | ||
| * CLOSE_WAIT threshold note (deviation from D2): Decision D2 specifies | ||
| * filtering CLOSE_WAIT by peer hostname (*.anthropic.com | auth-worker.*). | ||
| * DNS resolution at every tick is unreliable under network stress (exactly | ||
| * when the watchdog must be most accurate). The gold-elk-5574 forensic data | ||
| * shows ≥3 CLOSE_WAIT to ANY peer is already diagnostic — a healthy claude | ||
| * process has 0-1 CLOSE_WAIT sockets under normal operation. The classifier | ||
| * therefore uses count ≥ 3 without hostname filtering. This deviation is | ||
| * documented in docs/architecture/world-watchdog.md Signal 2. | ||
| * | ||
| * @see docs/architecture/world-watchdog.md | ||
| * @see packages/host-cp/src/__tests__/world-watchdog-probes.test.mjs | ||
| */ | ||
| import fs from 'node:fs/promises'; | ||
| import path from 'node:path'; | ||
| // HZ — clock-tick rate used to convert /proc/<pid>/stat CPU counters to seconds. | ||
| // utime/stime are reported in clock ticks (USER_HZ, i.e. sysconf(_SC_CLK_TCK)), | ||
| // not in kernel jiffies, so the conversion does not depend on the kernel's | ||
| // internal CONFIG_HZ (100, 250 or 1000). We divide the tick delta by HZ to get | ||
| // seconds of CPU time, then compare to the wall-clock window. USER_HZ is 100 on | ||
| // virtually all Linux systems, including container environments. | ||
| const LINUX_HZ = 100; | ||
| // /proc/net/tcp state byte for CLOSE_WAIT. | ||
| const CLOSE_WAIT_STATE = '08'; | ||
/**
 * Read the wait channel (`wchan`) recorded for a process.
 *
 * Reads `<procRoot>/<pid>/wchan` and returns its trimmed content.
 *
 * @param {number|string} pid Process ID.
 * @param {{ procRoot?: string }} [opts]
 *   `procRoot` defaults to '/proc'; override for tests.
 * @returns {Promise<string|null>}
 *   The wchan string (e.g. 'futex_wait_queue', 'epoll_wait'), or null when the
 *   file cannot be read or contains only whitespace.
 */
export async function readWchan(pid, opts = {}) {
  const root = opts.procRoot ?? '/proc';
  const target = path.join(root, String(pid), 'wchan');

  let raw;
  try {
    raw = await fs.readFile(target, 'utf8');
  } catch {
    return null;
  }

  const channel = raw.trim();
  return channel === '' ? null : channel;
}
/**
 * Collect the CLOSE_WAIT sockets listed under `<procRoot>/<pid>/net/tcp` and
 * `<procRoot>/<pid>/net/tcp6`.
 *
 * Each table is whitespace-separated with a header row; column 2 (0-based) is
 * the remote `hexIP:hexPort` and column 3 the hex connection state. Rows whose
 * state equals CLOSE_WAIT_STATE are returned regardless of the remote peer.
 * A table that cannot be read (pid gone, protocol unavailable) is skipped.
 *
 * NOTE(review): /proc/<pid>/net/* is documented as reflecting the process's
 * network namespace rather than only that process's own sockets — confirm the
 * count is meaningful for containers running more than one process.
 *
 * @param {number|string} pid Process ID.
 * @param {{ procRoot?: string }} [opts]
 *   `procRoot` defaults to '/proc'; override for tests.
 * @returns {Promise<Array<{remoteIp: string, remotePort: number}>>}
 *   CLOSE_WAIT socket descriptors; empty on error or when nothing matches.
 */
export async function readCloseWaitSockets(pid, opts = {}) {
  const root = opts.procRoot ?? '/proc';
  const sockets = [];

  for (const table of ['tcp', 'tcp6']) {
    const tablePath = path.join(root, String(pid), 'net', table);

    let text;
    try {
      text = await fs.readFile(tablePath, 'utf8');
    } catch {
      // ENOENT: pid gone or proto not available — skip, not an error.
      continue;
    }

    // Drop the header row, then inspect each socket row.
    for (const rawRow of text.split('\n').slice(1)) {
      const row = rawRow.trim();
      if (row === '') continue;

      // Columns (0-based): 0 sl, 1 local_address, 2 rem_address, 3 st.
      const columns = row.split(/\s+/);
      if (columns.length < 4) continue;
      if (columns[3] !== CLOSE_WAIT_STATE) continue;

      const remote = columns[2];
      const separator = remote.lastIndexOf(':');
      if (separator === -1) continue;

      const remoteIp = parseHexIp(remote.slice(0, separator));
      const remotePort = parseInt(remote.slice(separator + 1), 16);
      if (remoteIp === null || !Number.isFinite(remotePort)) continue;

      sockets.push({ remoteIp, remotePort });
    }
  }

  return sockets;
}
/**
 * Measure CPU utilisation for a process over a time window.
 *
 * Reads `<procRoot>/<pid>/stat` twice (before + after `windowMs` ms) and computes:
 *   cpuPercent = (utime+stime delta) / (HZ * windowMs / 1000) * 100
 *
 * The denominator is the nominal `windowMs`, not the measured elapsed time.
 *
 * @param {number|string} pid   Process ID.
 * @param {number} windowMs     Measurement window in milliseconds; must be a
 *   finite number greater than zero.
 * @param {{ procRoot?: string, sleep?: (ms: number) => Promise<void>, now?: () => number }} [opts]
 *   `sleep`    — injectable delay function (default: real setTimeout).
 *   `procRoot` — injectable proc root for tests (default: '/proc').
 *   `now`      — accepted for compatibility but not consulted by this function.
 * @returns {Promise<number|null>}
 *   CPU percent (0–100+), or null when the window is invalid, either stat read
 *   fails to parse, or the tick counters went backwards between the two reads.
 */
export async function readCpuPercent(pid, windowMs, opts = {}) {
  // Validate the window up front: a zero, negative, NaN or infinite window can
  // never yield a meaningful percentage, so skip the reads and the sleep
  // entirely (previously NaN leaked through as a NaN result).
  const windowSec = windowMs / 1000;
  if (!Number.isFinite(windowSec) || !(windowSec > 0)) return null;

  const procRoot = opts.procRoot ?? '/proc';
  const sleep = opts.sleep ?? ((ms) => new Promise((r) => setTimeout(r, ms)));
  const statPath = path.join(procRoot, String(pid), 'stat');

  const before = await readStatTimes(statPath);
  if (before === null) return null;

  await sleep(windowMs);

  const after = await readStatTimes(statPath);
  if (after === null) return null;

  const deltaTicks = (after.utime + after.stime) - (before.utime + before.stime);
  // A negative delta means the counters are not from the same process lifetime.
  if (deltaTicks < 0) return null;

  // deltaTicks / HZ = CPU-seconds consumed; divide by the window to get a ratio.
  return (deltaTicks / LINUX_HZ / windowSec) * 100;
}
| // ── Internal helpers ────────────────────────────────────────────────────────── | ||
/**
 * Read a `/proc/<pid>/stat`-formatted file and pull out the utime and stime
 * counters.
 *
 * @param {string} statPath Path of the stat file to read.
 * @returns {Promise<{utime: number, stime: number}|null>}
 *   The two counters as integers, or null when the file cannot be read or does
 *   not have the expected layout.
 */
async function readStatTimes(statPath) {
  const raw = await fs.readFile(statPath, 'utf8').catch(() => null);
  if (raw === null) return null;

  // Layout: pid (comm) state ppid pgrp session ... utime stime ...
  // comm may itself contain spaces and parentheses, so anchor on the LAST ')'.
  const commEnd = raw.lastIndexOf(')');
  if (commEnd < 0) return null;

  // Post-comm fields (0-based): 0 state, 1 ppid, 2 pgrp, 3 session, 4 tty_nr,
  // 5 tpgid, 6 flags, 7 minflt, 8 cminflt, 9 majflt, 10 cmajflt,
  // 11 utime, 12 stime.
  const tail = raw.slice(commEnd + 1).trim().split(/\s+/);
  if (tail.length < 13) return null;

  const [utime, stime] = tail.slice(11, 13).map((field) => parseInt(field, 10));
  const bothNumeric = [utime, stime].every((ticks) => Number.isFinite(ticks));
  return bothNumeric ? { utime, stime } : null;
}
/**
 * Parse a hex-encoded IP address from /proc/net/tcp format.
 *
 * IPv4: 8 hex chars holding one 32-bit word in little-endian byte order
 *       (e.g. "0100007F" → "127.0.0.1").
 * IPv6: 32 hex chars = four 32-bit words, each in little-endian byte order
 *       (e.g. "00000000000000000000000001000000" →
 *       "0000:0000:0000:0000:0000:0000:0000:0001"). Groups are emitted
 *       uncompressed and keep the hex digit case of the input.
 *
 * @param {string} hexIp
 * @returns {string|null}
 *   Dotted-quad / colon-separated address, or null when the input is not a
 *   string of exactly 8 or 32 hexadecimal digits.
 */
function parseHexIp(hexIp) {
  // Validate the whole string up front. parseInt() tolerates trailing garbage
  // ("0G" parses as 0), and the IPv6 branch only slices text, so without this
  // check malformed input would be returned as if it were an address.
  if (typeof hexIp !== 'string' || !/^[0-9a-fA-F]+$/.test(hexIp)) return null;

  // Split one 8-char word into its four byte pairs, reversed into network order.
  const toNetworkOrder = (word) => [
    word.slice(6, 8),
    word.slice(4, 6),
    word.slice(2, 4),
    word.slice(0, 2),
  ];

  if (hexIp.length === 8) {
    return toNetworkOrder(hexIp)
      .map((pair) => parseInt(pair, 16))
      .join('.');
  }

  if (hexIp.length === 32) {
    const groups = [];
    for (let offset = 0; offset < 32; offset += 8) {
      const [b0, b1, b2, b3] = toNetworkOrder(hexIp.slice(offset, offset + 8));
      // Pair bytes into 16-bit groups for IPv6 notation.
      groups.push(b0 + b1, b2 + b3);
    }
    return groups.join(':');
  }

  return null;
}
| // ── Classifier ─────────────────────────────────────────────────────────────── | ||
/**
 * @typedef {'healthy'|'suspect'|'wedged'} WatchdogVerdict
 */
/**
 * Turn one set of probe readings into a watchdog verdict.
 *
 * Three independent signals are evaluated:
 *   - wchan is exactly 'futex_wait_queue'
 *   - closeWaitCount is a number ≥ 3
 *   - cpuPercent is a number < 1
 * All three firing → 'wedged'; one or two → 'suspect'; none → 'healthy'.
 * A null (or otherwise non-numeric) reading never fires its signal.
 *
 * @param {{ wchan: string|null, closeWaitCount: number|null, cpuPercent: number|null }} signals
 * @returns {WatchdogVerdict}
 */
export function classify({ wchan, closeWaitCount, cpuPercent }) {
  const firing = [
    wchan === 'futex_wait_queue',
    typeof closeWaitCount === 'number' && closeWaitCount >= 3,
    typeof cpuPercent === 'number' && cpuPercent < 1,
  ].filter(Boolean).length;

  switch (firing) {
    case 3:
      return 'wedged';
    case 0:
      return 'healthy';
    default:
      return 'suspect';
  }
}
| /** | ||
| * world-watchdog-recovery.mjs — recovery hook for wedged claude processes. | ||
| * | ||
| * Isolated from world-watchdog.mjs so kill + replay logic is independently | ||
| * mockable in tests without touching the watchdog's ticker. | ||
| * | ||
| * API: | ||
| * createRecovery({ autoRecoverMode, leakyBucket, broadcaster, persister, | ||
| * replay, processKill, log }) | ||
| * → { onWedgedVerdict({ worldId, pid }): Promise<void> } | ||
| * | ||
| * Three modes (from compute.autoRecover in .olam/config.yaml): | ||
| * false — no-op; recovery never fires even on wedged verdict (DEFAULT) | ||
| * 'dry-run' — emits all breadcrumbs, never calls processKill or replay | ||
| * true — SIGKILL pid + read last-dispatch + replay; rate-limited | ||
| * | ||
| * Rate-limit: B2 leaky-bucket (3/hour/world). 4th wedge in window emits | ||
| * world.watchdog.recovery.budget_exhausted and skips all action. | ||
| * | ||
| * Replay stub: the `replay` dep is accepted as an injected function. In | ||
| * server.mjs it is wired to a console.warn stub + breadcrumb until the | ||
| * operator runs the B3 idempotence probe and signs off. See TODO below. | ||
| * | ||
| * @see docs/architecture/world-watchdog.md Recovery section | ||
| * @see packages/host-cp/src/lib/leaky-bucket.mjs | ||
| * @see packages/host-cp/src/dispatch-persister.mjs | ||
| */ | ||
| /** | ||
| * @typedef {false|true|'dry-run'} AutoRecoverMode | ||
| */ | ||
| /** | ||
| * @typedef {object} RecoveryDeps | ||
| * @property {false|true|'dry-run'} autoRecoverMode | ||
| * Passed from server.mjs which reads config.compute.autoRecover. | ||
| * Default false if config unavailable. | ||
| * @property {{ tryConsume(key: string): { allowed: boolean, retryAfterMs?: number, totalInWindow: number } }} leakyBucket | ||
| * B2 leaky-bucket instance. Keyed by worldId. | ||
| * @property {{ broadcast(type: string, payload: object): void }} [broadcaster] | ||
| * Host-stream broadcaster. Optional — when absent, breadcrumbs are skipped. | ||
| * @property {{ read({ worldId: string }): Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null> }} persister | ||
| * B4 dispatch-persister read function. | ||
| * @property {(opts: { worldId: string, messageId: string, prompt: string }) => Promise<void>} replay | ||
| * Opaque dispatch helper. Injected dep — DO NOT implement dispatch here. | ||
| * In server.mjs this is wired to a stub until operator signs off on B3 probe. | ||
| * @property {(pid: number) => void} [processKill] | ||
| * process.kill indirection so tests can spy without actually killing. | ||
| * Defaults to process.kill. | ||
| * @property {(msg: string) => void} [log] | ||
| * Logger. Defaults to console.log with [world-watchdog-recovery] prefix. | ||
| */ | ||
| /** | ||
| * @typedef {object} RecoveryHandle | ||
| * @property {(opts: { worldId: string, pid: number|null }) => Promise<void>} onWedgedVerdict | ||
| */ | ||
/**
 * Create a recovery handle for wedged claude processes.
 *
 * Modes (`autoRecoverMode`):
 *   false      — detection-only; `onWedgedVerdict` never acts (default).
 *   'dry-run'  — consumes budget and emits breadcrumbs, never kills or replays.
 *   true       — SIGKILLs the pid, then replays the last persisted dispatch.
 *
 * Fail-closed: only the exact values `true` and `'dry-run'` enable recovery.
 * Anything else (null, 0, the strings 'true' / 'false', ...) is treated as
 * disabled, so a mis-typed config value can never reach the kill path.
 *
 * @param {RecoveryDeps} deps
 * @returns {RecoveryHandle}
 */
export function createRecovery({
  autoRecoverMode = false,
  leakyBucket,
  broadcaster = null,
  persister,
  replay,
  processKill = (pid) => process.kill(pid, 'SIGKILL'),
  log = (m) => console.log(`[world-watchdog-recovery] ${m}`),
} = {}) {
  const isDryRun = autoRecoverMode === 'dry-run';
  const isEnabled = autoRecoverMode === true || isDryRun;
  if (!isEnabled && autoRecoverMode !== false) {
    // Surface the misconfiguration once instead of silently ignoring it.
    log(`unrecognised autoRecoverMode=${JSON.stringify(autoRecoverMode) ?? String(autoRecoverMode)}; recovery disabled`);
  }

  /**
   * Emit a breadcrumb via broadcaster (fail-soft): a missing broadcaster is
   * ignored and a throwing broadcaster is logged, never propagated.
   *
   * @param {string} type
   * @param {object} payload
   */
  function broadcast(type, payload) {
    if (!broadcaster || typeof broadcaster.broadcast !== 'function') return;
    try {
      broadcaster.broadcast(type, payload);
    } catch (err) {
      log(`broadcast ${type} failed: ${err?.message ?? err}`);
    }
  }

  /**
   * Handle a 2-tick-confirmed wedged verdict for a world.
   *
   * Called by world-watchdog.mjs on verdict-transition only (suspect → wedged),
   * NOT on steady-state re-wedge.
   *
   * @param {{ worldId: string, pid: number|null }} opts
   * @returns {Promise<void>}
   */
  async function onWedgedVerdict({ worldId, pid }) {
    // Disabled (or unrecognised) mode → detection-only; never act.
    if (!isEnabled) return;

    // PID null/undefined → watchdog hasn't resolved a real PID yet
    // (Phase A stub case); skip silently — there is nothing to kill.
    if (pid == null) return;

    // Never hand 0, a negative or a non-integer pid to kill: pid 0 and
    // negative pids address whole process groups rather than one process.
    if (!Number.isInteger(pid) || pid <= 0) {
      log(`worldId=${worldId}: refusing recovery for invalid pid=${String(pid)}`);
      return;
    }

    // Rate-limit gate.
    const bucket = leakyBucket.tryConsume(worldId);
    if (!bucket.allowed) {
      broadcast('world.watchdog.recovery.budget_exhausted', {
        worldId,
        retryAfterMs: bucket.retryAfterMs,
        totalInWindow: bucket.totalInWindow,
      });
      log(`worldId=${worldId}: budget exhausted (${bucket.totalInWindow} in window); skipping recovery`);
      return;
    }

    // Read last persisted dispatch for replay. A failed read is logged and
    // treated as "no last-dispatch" so the kill can still proceed.
    let lastDispatch = null;
    try {
      lastDispatch = await persister.read({ worldId });
    } catch (err) {
      log(`worldId=${worldId}: persister.read failed: ${err?.message ?? err}`);
    }

    broadcast('world.watchdog.recovery.start', {
      worldId,
      pid,
      mode: autoRecoverMode,
      lastDispatchMessageId: lastDispatch?.messageId ?? null,
    });

    // dry-run — log planned action but do NOT kill.
    if (isDryRun) {
      log(`worldId=${worldId}: dry-run — would SIGKILL pid=${pid}${lastDispatch ? ` + replay messageId=${lastDispatch.messageId}` : ' (no last-dispatch)'}`);
      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: 'dry-run',
        replayed: false,
      });
      return;
    }

    // mode=true — act.
    try {
      // 1. SIGKILL the wedged process.
      processKill(pid);
      log(`worldId=${worldId}: SIGKILL sent to pid=${pid}`);

      // 2. Replay or note absence of last-dispatch.
      if (!lastDispatch) {
        broadcast('world.watchdog.recovery.restart_without_replay', {
          worldId,
          pid,
        });
        log(`worldId=${worldId}: no last-dispatch; killed without replay`);
      } else {
        // TODO: wire real replay once operator has run the B3 idempotence probe
        // and confirmed dispatch is idempotent for the substrates in use.
        // Until then this stub logs and emits a breadcrumb so the stub path
        // is visible in production logs. See B3 probe + operator review gate B6.
        broadcast('world.watchdog.recovery.replay_stub', {
          worldId,
          prompt: lastDispatch.prompt,
        });
        log(`worldId=${worldId}: replay stub hit — real replay deferred pending B3 sign-off`);
        await replay({
          worldId,
          messageId: lastDispatch.messageId,
          prompt: lastDispatch.prompt,
        });
      }

      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: true,
        replayed: !!lastDispatch,
      });
    } catch (err) {
      // Kill or replay failed — report it, never throw into the watchdog tick.
      log(`worldId=${worldId}: recovery failed: ${err?.message ?? err}`);
      broadcast('world.watchdog.recovery.failed', {
        worldId,
        pid,
        error: err?.message ?? String(err),
      });
    }
  }

  return { onWedgedVerdict };
}
| /** | ||
| * world-watchdog.mjs — periodic watchdog that probes each active world's | ||
| * `claude` PID for the three wedge signals (wchan + CLOSE_WAIT + CPU) and | ||
| * emits `world.watchdog.tick` events on the host-stream broadcaster. | ||
| * | ||
| * Design: | ||
| * - Mirrors `world-activity-tracker.mjs` shape exactly: `startWorldWatchdog(deps)` | ||
| * returns `{ stop, tickNow }`. | ||
| * - Per-world 2-tick confirm: a `'wedged'` classification is only emitted | ||
| * after TWO consecutive ticks with the wedge signature. A single-tick | ||
| * wedge emits `'suspect'`. A healthy tick resets the streak. | ||
| * - Per-world fail-soft: a probe error for one world never skips other worlds. | ||
| * - `OLAM_WORLD_WATCHDOG_DISABLED=1` → `start()` is a no-op (returns stub). | ||
| * - Cadence: `OLAM_WORLD_WATCHDOG_TICK_MS` env or `intervalMs` dep (default 30_000). | ||
| * | ||
| * v1 stub: `getClaudePidForWorld(worldId)` returns null for all worlds in | ||
| * Phase A. When null, the tick still fires but all probe signals are null, | ||
| * producing `verdict: 'unknown'`. Real PID lookup (docker inspect → | ||
| * /proc/<hostPid>/status NSpid field) is wired in a follow-up. | ||
| * This is documented here and in docs/architecture/world-watchdog.md. | ||
| * | ||
| * Wire-in: `server.mjs` constructs once after broadcaster is ready and calls | ||
| * `.stop()` from the SIGTERM/SIGINT handler. Gated on `!SERVE_ONLY`. | ||
| * | ||
| * @see docs/architecture/world-watchdog.md | ||
| * @see packages/host-cp/src/world-watchdog-probes.mjs | ||
| * @see packages/host-cp/src/world-activity-tracker.mjs (shape reference) | ||
| */ | ||
| import { | ||
| readWchan, | ||
| readCloseWaitSockets, | ||
| readCpuPercent, | ||
| classify, | ||
| } from './world-watchdog-probes.mjs'; | ||
| // Recovery hook (B5). Optional dep — when absent (recovery is null/undefined), | ||
| // the watchdog behaves exactly as Phase A: detection-only, no kill, no replay. | ||
| // Wire via startWorldWatchdog({ recovery: createRecovery({...}) }) in server.mjs. | ||
| const DEFAULT_TICK_MS = 30_000; | ||
| // CPU measurement window: shorter than the tick cadence so we don't overlap. | ||
| const CPU_WINDOW_MS = 500; | ||
| /** | ||
| * @typedef {object} WorldWatchdogDeps | ||
| * @property {object} [broadcaster] Object with `.broadcast(type, payload)`. | ||
| * Optional — when absent events are skipped but state tracking still works. | ||
| * @property {number} [intervalMs] Tick cadence in ms. Defaults to | ||
| * `OLAM_WORLD_WATCHDOG_TICK_MS` env or 30_000. | ||
| * @property {() => Promise<string[]>} [listActiveWorlds] | ||
| * Returns an array of active world IDs to probe each tick. | ||
| * Defaults to returning []. | ||
| * @property {(worldId: string) => Promise<number|null>} [getClaudePidForWorld] | ||
| * Returns the host-side PID of the claude process for a world, or null. | ||
| * v1 default: always returns null (all worlds → verdict 'unknown'). | ||
| * @property {{ procRoot?: string }} [probes] | ||
| * Injectable probe options (procRoot for tests). | ||
| * @property {{ onWedgedVerdict(opts: { worldId: string, pid: number|null }): Promise<void> }} [recovery] | ||
| * Optional recovery handle (from world-watchdog-recovery.mjs). When present, | ||
| * called once on verdict-transition to 'wedged' (suspect → wedged), NOT on | ||
| * steady-state re-wedge. When absent, detection-only (Phase A behaviour). | ||
| * @property {(msg: string) => void} [log] Defaults to `console.log`. | ||
| * @property {(msg: string) => void} [debug] Defaults to no-op. | ||
| * @property {(cb: () => void, ms: number) => any} [setTimer] | ||
| * Injectable `setInterval` for tests. | ||
| * @property {(handle: any) => void} [clearTimer] | ||
| * Injectable `clearInterval` for tests. | ||
| * @property {() => Date} [now] Clock injection for tests. | ||
| */ | ||
| /** | ||
| * @typedef {object} WorldWatchdogHandle | ||
| * @property {() => void} stop | ||
| * @property {() => Promise<number>} tickNow Run one tick immediately (returns | ||
| * the count of worlds processed). Exposed for tests. | ||
| * @property {(worldId: string) => object|null} getVerdict | ||
| * Returns the latest in-memory verdict entry for a world, or null if no tick | ||
| * has fired yet. Used by the HTTP endpoint (A5). | ||
| */ | ||
| /** | ||
| * Per-world state tracked between ticks for the 2-tick confirm. | ||
| * | ||
| * @typedef {object} WorldWatchdogState | ||
| * @property {'healthy'|'suspect'|'wedged'|'unknown'} lastClassification | ||
| * The raw classification from the previous tick (before 2-tick confirm). | ||
| * @property {'healthy'|'suspect'|'wedged'|'unknown'} lastVerdict | ||
| * The emitted verdict (post-confirm). | ||
| * @property {string} lastTickAt ISO-8601 timestamp of last tick. | ||
| * @property {object|null} lastSignals The signals from the last tick. | ||
| * @property {number|null} lastPid The PID probed last tick. | ||
| */ | ||
/**
 * Start the world watchdog. Returns a `{ stop, tickNow, getVerdict }` handle.
 *
 * Honoring `OLAM_WORLD_WATCHDOG_DISABLED=1`: if the env var is set, returns
 * a no-op stub immediately without starting the interval or making any probe
 * calls.
 *
 * Tick interval resolution: `deps.intervalMs` wins when provided; otherwise
 * `OLAM_WORLD_WATCHDOG_TICK_MS` is used when it parses to a positive integer;
 * otherwise `DEFAULT_TICK_MS`. An unparsable / non-positive env value is
 * ignored (with a log line) because timers coerce NaN and values below 1 to a
 * 1 ms delay, which would turn the watchdog into a hot loop.
 *
 * Per-world state is kept only for worlds returned by the most recent
 * `listActiveWorlds()` call; entries for worlds that are no longer listed are
 * dropped at the start of each tick so the map cannot grow without bound and
 * `getVerdict` never serves a verdict for a world that is gone.
 *
 * @param {WorldWatchdogDeps} [deps]
 * @returns {WorldWatchdogHandle}
 */
export function startWorldWatchdog(deps = {}) {
  // Honour kill switch — return a no-op stub.
  if (process.env.OLAM_WORLD_WATCHDOG_DISABLED === '1') {
    return {
      stop() {},
      tickNow: async () => 0,
      getVerdict: () => null,
    };
  }

  const log = deps.log ?? ((m) => console.log(`[world-watchdog] ${m}`));
  const debug = deps.debug ?? (() => {});
  const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
  const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
  const now = deps.now ?? (() => new Date());

  // Validate the env-provided interval; an explicit deps.intervalMs is passed
  // through untouched so injected test timers keep their exact value.
  const rawEnvTick = process.env.OLAM_WORLD_WATCHDOG_TICK_MS;
  const envTickMs = Number.parseInt(rawEnvTick ?? '', 10);
  const envTickValid = Number.isFinite(envTickMs) && envTickMs > 0;
  if (rawEnvTick !== undefined && !envTickValid && deps.intervalMs == null) {
    log(`ignoring invalid OLAM_WORLD_WATCHDOG_TICK_MS=${rawEnvTick}; using default`);
  }
  const intervalMs = deps.intervalMs ?? (envTickValid ? envTickMs : DEFAULT_TICK_MS);

  const broadcaster = deps.broadcaster ?? null;
  const listActiveWorlds = deps.listActiveWorlds ?? (async () => []);
  const getClaudePidForWorld = deps.getClaudePidForWorld ?? (async (_id) => null);
  const probeOpts = deps.probes ?? {};
  // Recovery hook — null when not configured (Phase A / default-off behaviour).
  const recovery = deps.recovery ?? null;

  // Per-world state map: worldId → WorldWatchdogState.
  /** @type {Map<string, WorldWatchdogState>} */
  const worldState = new Map();
  let stopped = false;
  let inFlight = false;
  let intervalHandle = null;

  /**
   * Probe a single world and update its state. Returns the verdict emitted.
   *
   * @param {string} worldId
   * @returns {Promise<'healthy'|'suspect'|'wedged'|'unknown'>}
   */
  async function probeWorld(worldId) {
    const pid = await getClaudePidForWorld(worldId);
    let wchan = null;
    let closeWaitSockets = [];
    let cpuPercent = null;
    if (pid !== null) {
      // All probes are fail-soft — they return null/[] on I/O error.
      [wchan, closeWaitSockets, cpuPercent] = await Promise.all([
        readWchan(pid, probeOpts),
        readCloseWaitSockets(pid, probeOpts),
        readCpuPercent(pid, CPU_WINDOW_MS, probeOpts),
      ]);
    }
    const closeWaitCount = closeWaitSockets.length;
    const signals = pid !== null ? { wchan, closeWaitCount, cpuPercent } : null;

    // Classify raw signals; without a PID there is nothing to classify.
    const rawClassification = pid !== null
      ? classify({ wchan, closeWaitCount, cpuPercent })
      : 'unknown';

    // 2-tick confirm: only emit 'wedged' if BOTH this tick AND the previous tick
    // classified as 'wedged'. Otherwise emit the raw classification.
    const prev = worldState.get(worldId);
    let verdict;
    if (rawClassification === 'wedged' && prev?.lastClassification === 'wedged') {
      verdict = 'wedged';
    } else if (rawClassification === 'wedged') {
      // First 'wedged' tick — emit 'suspect' (2-tick confirm pending).
      verdict = 'suspect';
    } else {
      verdict = rawClassification;
    }

    const tickAt = now().toISOString();
    // Update per-world state.
    worldState.set(worldId, {
      lastClassification: rawClassification,
      lastVerdict: verdict,
      lastTickAt: tickAt,
      lastSignals: signals,
      lastPid: pid,
    });

    // Recovery hook — fire ONCE on verdict-transition to 'wedged' (not on
    // steady-state re-wedge). Guard: prev?.lastVerdict !== 'wedged' ensures
    // only the suspect→wedged transition triggers, not wedged→wedged.
    if (verdict === 'wedged' && recovery !== null && prev?.lastVerdict !== 'wedged') {
      const onRecoveryError = (err) => {
        log(`recovery.onWedgedVerdict ${worldId} failed: ${err?.message ?? err}`);
      };
      // Fire-and-forget; fail-soft so a recovery error never skips other worlds.
      // Promise.resolve + try/catch also covers hooks that throw synchronously
      // or return a non-promise, which would otherwise abort this probe before
      // the broadcast below.
      try {
        void Promise.resolve(recovery.onWedgedVerdict({ worldId, pid })).catch(onRecoveryError);
      } catch (err) {
        onRecoveryError(err);
      }
    }

    // Emit broadcaster event.
    if (broadcaster && typeof broadcaster.broadcast === 'function') {
      try {
        broadcaster.broadcast('world.watchdog.tick', {
          worldId,
          verdict,
          signals,
          pid,
          lastTickAt: tickAt,
        });
      } catch (err) {
        log(`broadcast ${worldId} failed: ${err?.message ?? err}`);
      }
    }
    return verdict;
  }

  /**
   * One tick: get active worlds, probe each, return count processed.
   *
   * Returns 0 without probing when the watchdog is stopped, when a previous
   * tick is still in flight, or when `listActiveWorlds` rejects.
   *
   * @returns {Promise<number>}
   */
  async function tick() {
    if (stopped) return 0;
    if (inFlight) {
      debug('tick skipped: previous tick still in flight');
      return 0;
    }
    inFlight = true;
    let processed = 0;
    try {
      let worlds;
      try {
        worlds = await listActiveWorlds();
      } catch (err) {
        log(`listActiveWorlds failed: ${err?.message ?? err}`);
        return 0;
      }
      const ids = [...worlds];

      // Forget worlds that are no longer active so state does not accumulate
      // forever and a stale 'wedged' classification cannot short-circuit the
      // 2-tick confirm if the same id shows up again later.
      const active = new Set(ids);
      for (const knownId of worldState.keys()) {
        if (!active.has(knownId)) worldState.delete(knownId);
      }

      for (const worldId of ids) {
        if (stopped) break;
        if (typeof worldId !== 'string') continue;
        try {
          await probeWorld(worldId);
          processed += 1;
        } catch (err) {
          // Per-world fail-soft: one bad world doesn't crash the loop.
          debug(`probe ${worldId} failed: ${err?.message ?? err}`);
        }
      }
    } finally {
      inFlight = false;
    }
    return processed;
  }

  // Kick off an initial tick on next event-loop turn so callers can
  // attach test spies before any probe work happens.
  setImmediate(() => {
    if (stopped) return;
    void tick().catch((err) => {
      log(`initial tick crashed: ${err?.message ?? err}`);
    });
  });

  intervalHandle = setTimer(() => {
    void tick().catch((err) => {
      log(`tick crashed: ${err?.message ?? err}`);
    });
  }, intervalMs);
  // Don't pin the event loop on shutdown.
  if (intervalHandle && typeof intervalHandle.unref === 'function') {
    intervalHandle.unref();
  }
  log(`started: interval=${intervalMs}ms`);

  return {
    /** Stop ticking. Idempotent; an in-flight tick stops before its next world. */
    stop() {
      if (stopped) return;
      stopped = true;
      if (intervalHandle !== null) {
        try { clearTimer(intervalHandle); } catch { /* ignore */ }
        intervalHandle = null;
      }
    },
    tickNow: tick,
    /**
     * Return the latest in-memory verdict entry for a world.
     * Returns null if no tick has recorded state for this world, or if the
     * world was absent from the most recent active-world listing.
     *
     * @param {string} worldId
     * @returns {WorldWatchdogState|null}
     */
    getVerdict(worldId) {
      return worldState.get(worldId) ?? null;
    },
  };
}
| /** | ||
| * WorldsDbSource — reconcile loop that reads ~/.olam/worlds.db and | ||
| * auto-registers running worlds into host-cp's in-memory registry. | ||
| * | ||
| * Two triggers (belt-and-suspenders): | ||
| * 1. fs.watch on the worlds.db file — fires within ~100ms of a write | ||
| * 2. 30s setInterval backstop — catches cases where fs.watch silently | ||
| * misses events (network filesystems, some Linux kernels) | ||
| * | ||
| * Uses better-sqlite3 for synchronous, lightweight reads. If the module | ||
| * is not installed (e.g., no native build in the container), the module | ||
| * logs a warning and exits without crashing the server. | ||
| * | ||
| * DB handle: deliberately NOT cached across reconcile calls. A long-lived | ||
| * readonly connection with the DB bind-mounted across the docker boundary | ||
| * does not reliably pick up writes committed on the host side — the host | ||
| * writer appends to the WAL, but the container reader's snapshot is stuck | ||
| * at the point the handle was first opened. Closing and reopening on every | ||
| * reconcile forces a new read transaction that sees all committed WAL | ||
| * frames. Cost: ~1 ms per call at a 30 s interval — negligible. This | ||
| * eliminates the entire class of "olam create world vanishes within 30 s" | ||
| * bugs (regression confirmed: ember-elk-9191 removed by reconciler despite | ||
| * being present in worlds.db with status=running). | ||
| * | ||
| * Interface: thin wrapper so a future "remote" source (cloud orchestrator) | ||
| * can drop in via the same WorldsSource interface in worlds-source.mjs. | ||
| */ | ||
| import fs from 'node:fs'; | ||
| import { createRequire } from 'node:module'; | ||
| const require = createRequire(import.meta.url); | ||
| /** | ||
| * @typedef {object} WorldsDbSourceDeps | ||
| * @property {string} dbPath Path to worlds.db (OLAM_WORLDS_DB or ~/.olam/worlds.db) | ||
| * @property {string} dockerHost Docker API base URL (tcp://host:port) | ||
| * @property {string} worldHost Host used to reach world CPs (127.0.0.1 or host.docker.internal) | ||
| * @property {() => Record<string, number>} getRegistry Current WORLDS map | ||
| * @property {(id: string, port: number) => void} onWorldAdded Called when a new running world is found | ||
| * @property {(id: string) => void} onWorldRemoved Called when a running world disappears | ||
| * @property {(msg: string) => void} [log] | ||
| */ | ||
/**
 * Derive the per-world CP host port from docker inspect.
 *
 * Queries `GET {dockerHost}/containers/olam-{worldId}-devbox/json` (with a
 * 3 s timeout) and returns the first published host port bound to the
 * container's internal `8080/tcp`. All bindings are scanned, and only ports
 * in the valid TCP range 1–65535 are accepted, so an empty / "0" / garbage
 * `HostPort` entry does not mask a later usable one.
 *
 * Fail-soft: any network error, timeout, non-2xx response, malformed JSON or
 * missing binding yields `null` rather than throwing.
 *
 * @param {string} worldId
 * @param {string} dockerHost e.g. 'tcp://docker-socket-proxy:2375'
 * @returns {Promise<number | null>}
 */
async function getWorldPortFromDocker(worldId, dockerHost) {
  // tcp:// → http://, and drop trailing slashes so the path below never
  // starts with a double slash.
  const apiBase = dockerHost.replace(/^tcp:\/\//, 'http://').replace(/\/+$/, '');
  const containerName = `olam-${worldId}-devbox`;
  try {
    const res = await fetch(`${apiBase}/containers/${encodeURIComponent(containerName)}/json`, {
      signal: AbortSignal.timeout(3000),
    });
    if (!res.ok) return null;
    const data = await res.json();
    // Per-world CP runs on internal port 8080; host port is the published binding.
    const bindings = data?.NetworkSettings?.Ports?.['8080/tcp'];
    if (!Array.isArray(bindings)) return null;
    for (const binding of bindings) {
      const hostPort = Number.parseInt(binding?.HostPort, 10);
      if (Number.isInteger(hostPort) && hostPort >= 1 && hostPort <= 65535) {
        return hostPort;
      }
    }
    return null;
  } catch {
    // Deliberate best-effort: callers treat null as "no port found".
    return null;
  }
}
/**
 * Start the worlds-db reconcile loop. Returns a stop function.
 *
 * A reconcile pass reads the ids of all rows with `status = 'running'` from
 * `worlds.db`, calls `onWorldAdded(id, port)` for running worlds missing from
 * the registry (port resolved via docker inspect) and `onWorldRemoved(id)` for
 * registered worlds that are no longer running in the DB.
 *
 * Passes are triggered by fs.watch events on the DB file and by a 30 s poll.
 * They are serialized: a trigger that arrives while a pass is running only
 * schedules one follow-up pass. This matters because a pass awaits docker
 * lookups; overlapping passes could otherwise register the same world twice
 * or let an older pass (with a stale `runningIds` snapshot) remove a world a
 * newer pass had just added.
 *
 * The DB handle is opened fresh for every pass and closed as soon as the
 * query has run — it is never cached across passes.
 *
 * @param {WorldsDbSourceDeps} deps
 * @returns {{ stop: () => void }}
 */
export function startWorldsDbReconciler(deps) {
  const { dbPath, dockerHost, getRegistry, onWorldAdded, onWorldRemoved, log = console.log } = deps;
  let stopped = false;
  let watcher = null;
  let passRunning = false;
  let rerunRequested = false;
  // Set once better-sqlite3 fails to load, so the warning is logged a single
  // time instead of on every fs event / poll.
  let sqliteUnavailable = false;

  /**
   * Open a fresh read-only handle, or return null when the module or file is
   * unavailable. A missing file (SQLITE_CANTOPEN) is expected and not logged.
   */
  function openDb() {
    if (sqliteUnavailable) return null;
    try {
      // Dynamic require — gracefully degrade if better-sqlite3 is not installed.
      // better-sqlite3 is CommonJS-only; createRequire enables sync dynamic loading in ESM.
      const Database = require('better-sqlite3');
      const handle = new Database(dbPath, { readonly: true, fileMustExist: true });
      log(`[worlds-db] opened ${dbPath}`);
      return handle;
    } catch (err) {
      if (err?.code === 'MODULE_NOT_FOUND') {
        sqliteUnavailable = true;
        log('[worlds-db] better-sqlite3 not available; skipping DB reconciler');
      } else if (err?.code !== 'SQLITE_CANTOPEN') {
        log(`[worlds-db] failed to open ${dbPath}: ${err?.message ?? err}`);
      }
      return null;
    }
  }

  /**
   * Read the set of running world ids using a short-lived handle.
   * Returns null when the DB cannot be opened or the query fails.
   *
   * @returns {Set<string> | null}
   */
  function readRunningIds() {
    const database = openDb();
    if (!database) return null;
    try {
      const rows = database.prepare("SELECT id FROM worlds WHERE status = 'running'").all();
      return new Set(rows.map((r) => r.id));
    } catch (err) {
      log(`[worlds-db] query failed: ${err?.message ?? err}`);
      return null;
    } finally {
      // Close the handle this pass opened (not a shared variable) — there is
      // no need to hold it across the async docker lookups that follow.
      try { database.close(); } catch { /* best-effort close */ }
    }
  }

  /** Run exactly one reconcile pass against a fresh DB snapshot. */
  async function reconcileOnce() {
    const runningIds = readRunningIds();
    if (runningIds === null) return;
    const registry = getRegistry();

    // Add worlds that are running in DB but missing from registry.
    for (const id of runningIds) {
      if (Object.hasOwn(registry, id)) continue;
      const port = await getWorldPortFromDocker(id, dockerHost);
      // stop() may have been called while the docker lookup was pending.
      if (stopped) return;
      if (port === null) {
        log(`[worlds-db] world ${id} running in DB but no docker port found; skipping`);
        continue;
      }
      log(`[worlds-db] reconcile: adding ${id} → :${port}`);
      onWorldAdded(id, port);
    }

    // Remove worlds that are registered but no longer 'running' in DB.
    for (const id of Object.keys(registry)) {
      if (runningIds.has(id)) continue;
      log(`[worlds-db] reconcile: removing ${id} (not running in DB)`);
      onWorldRemoved(id);
    }
  }

  /**
   * Serialized entry point used by every trigger. Never rejects: errors from
   * a pass (e.g. a throwing callback) are logged so fire-and-forget callers
   * cannot produce an unhandled rejection.
   */
  async function reconcile() {
    if (stopped) return;
    if (passRunning) {
      // Coalesce: remember that something changed and re-run once afterwards.
      rerunRequested = true;
      return;
    }
    passRunning = true;
    try {
      do {
        rerunRequested = false;
        try {
          await reconcileOnce();
        } catch (err) {
          log(`[worlds-db] reconcile failed: ${err?.message ?? err}`);
        }
      } while (rerunRequested && !stopped);
    } finally {
      passRunning = false;
    }
  }

  /**
   * Install the fs.watch fast path. Throws if fs.watch itself throws.
   * An 'error' listener is attached because an FSWatcher error with no
   * listener would be thrown; on error the watcher is dropped so the
   * 30 s poll can re-establish it.
   */
  function startWatcher() {
    const created = fs.watch(dbPath, { persistent: false }, () => {
      void reconcile();
    });
    created.on('error', (err) => {
      log(`[worlds-db] fs.watch error: ${err?.message ?? err}; relying on 30s poll`);
      try { created.close(); } catch { /* best-effort close */ }
      if (watcher === created) watcher = null;
    });
    watcher = created;
  }

  // Watch the DB file for changes (fast path).
  if (fs.existsSync(dbPath)) {
    try {
      startWatcher();
    } catch (err) {
      log(`[worlds-db] fs.watch failed: ${err.message}; relying on 30s poll`);
    }
    // Initial reconcile on startup.
    void reconcile();
  } else {
    log(`[worlds-db] ${dbPath} not found; will poll every 30s`);
  }

  // 30s backstop poll. Also watches for the file to appear.
  const interval = setInterval(() => {
    if (!watcher && fs.existsSync(dbPath)) {
      // File appeared since startup (or the watcher was lost) — set it up now.
      try {
        startWatcher();
        log(`[worlds-db] ${dbPath} appeared; watcher started`);
      } catch { /* fs.watch failure is non-fatal */ }
    }
    void reconcile();
  }, 30_000);

  return {
    /** Stop polling and watching; an in-flight pass exits at its next checkpoint. */
    stop() {
      stopped = true;
      clearInterval(interval);
      if (watcher) {
        try { watcher.close(); } catch { /* best-effort close */ }
        watcher = null;
      }
    },
  };
}
| /** | ||
| * Phase E1 (olam-dogfood-vision): WorldsSource interface. | ||
| * | ||
| * Single narrow boundary that both LocalWorldsSource (today's | ||
| * dockerode-driven enumeration) and PylonWorldsSource (future cloud | ||
| * worlds) implement. The interface is the entire contract — there is | ||
| * no shared abstract class, no shared base, no shared utility module. | ||
| * | ||
| * Per Phase E plan (S1 contract carried through C-phase): the wire | ||
| * shape IS the abstraction. Sources implementing this interface are | ||
| * free to pick any backend (dockerode, Pylon SDK, mock, sqlite cache | ||
| * — anything) as long as `list()` returns the WorldSummary shape. | ||
| * | ||
| * Deliberately narrow: | ||
| * - `name` — discriminator for the source. SPA uses this to render | ||
| * the per-world `source` chip (E5). | ||
| * - `list()` — read-only enumeration. NO mutations. Mutations stay | ||
| * on host-cp's existing endpoints (POST /api/worlds delegation, | ||
| * DELETE via per-world CP, etc.). T5 mitigation: keeping the | ||
| * surface narrow lets the future Pylon SDK integration extend | ||
| * `list()`'s implementation without forcing a contract change | ||
| * across consumers. | ||
| * | ||
| * This is a `.mjs` file (matches host-cp's existing module style). | ||
| * Type information is conveyed via JSDoc; consumers reading via | ||
| * TypeScript get the shape via `// @ts-check` + JSDoc inference. | ||
| * | ||
| * @typedef {object} ServiceInfo | ||
| * @property {string} name | ||
| * @property {number} host_port | ||
| * @property {number} internal_port | ||
| * @property {string} url | ||
| * @property {boolean} live | ||
| * | ||
| * @typedef {object} WorldSummary | ||
| * @property {string} id | ||
| * @property {string | null} name | ||
| * @property {'running' | 'starting' | 'unknown' | 'failed'} status | ||
| * @property {ServiceInfo[]} services | ||
| * @property {'local' | 'pylon-cloud'} source | ||
| * | ||
| * @typedef {object} WorldsSource | ||
| * @property {'local' | 'pylon-cloud'} name | ||
| * @property {() => Promise<WorldSummary[]>} list | ||
| */ | ||
// Export the source-name discriminators so consumers don't repeat
// the literal strings. Both implementations + E4's composition layer
// + E5's SPA badge logic reference this.
// Frozen: this array is shared by every importer, so an accidental
// push/splice in one consumer must not change the set for the others.
export const SOURCE_NAMES = Object.freeze(/** @type {const} */ (['local', 'pylon-cloud']));
| // `WorldsSource` is a TYPE export — no runtime symbol. Consumers | ||
| // import it via JSDoc references: | ||
| // /** @type {import('./worlds-source.mjs').WorldsSource} */ | ||
| // or in TypeScript: | ||
| // import type { WorldsSource } from './worlds-source.mjs'; | ||
| // | ||
| // Test files exercising the interface treat it as duck-typed: any | ||
| // object with the right shape passes structural compatibility. |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
AI-detected potential code anomaly
Supply chain risk: AI has identified unusual behaviors that may pose a security risk.
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Environment variable access
Supply chain risk: Package accesses environment variables, which may be a sign of credential stuffing or data theft.
Found 87 instances
AI-detected potential code anomaly
Supply chain risk: AI has identified unusual behaviors that may pose a security risk.
Found 2 instances
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
43
-75.71%8
-84%4559838
-22.29%34
-82.47%118842
-17.05%