@browserbasehq/stagehand
Advanced tools
Comparing version 1.7.0-alpha-b902192bc7ff8eb02c85150c1fe6f89c2a95b211 to 1.7.0-alpha-ba4ec335a5323648c6016cc480300fd58868311a
@@ -151,7 +151,4 @@ import { Page, BrowserContext, Browser } from '@playwright/test'; | ||
sessionUrl: string; | ||
sessionId: string; | ||
} | ||
interface InitResult { | ||
debugUrl: string; | ||
sessionUrl: string; | ||
} | ||
interface InitFromPageOptions { | ||
@@ -186,2 +183,3 @@ page: Page; | ||
domSettleTimeoutMs?: number; | ||
useTextExtract?: boolean; | ||
} | ||
@@ -207,2 +205,3 @@ type ExtractResult<T extends z.AnyZodObject> = z.infer<T>; | ||
contextPath?: string; | ||
sessionId?: string; | ||
} | ||
@@ -227,2 +226,3 @@ | ||
context: BrowserContext; | ||
browserbaseSessionID?: string; | ||
private env; | ||
@@ -260,3 +260,3 @@ private apiKey; | ||
act({ action, modelName, modelClientOptions, useVision, variables, domSettleTimeoutMs, }: ActOptions): Promise<ActResult>; | ||
extract<T extends z.AnyZodObject>({ instruction, schema, modelName, modelClientOptions, domSettleTimeoutMs, }: ExtractOptions<T>): Promise<ExtractResult<T>>; | ||
extract<T extends z.AnyZodObject>({ instruction, schema, modelName, modelClientOptions, domSettleTimeoutMs, useTextExtract, }: ExtractOptions<T>): Promise<ExtractResult<T>>; | ||
observe(options?: ObserveOptions): Promise<ObserveResult[]>; | ||
@@ -263,0 +263,0 @@ close(): Promise<void>; |
@@ -1,1 +0,1 @@ | ||
export declare const scriptContent = "(() => {\n // lib/dom/xpathUtils.ts\n function getParentElement(node) {\n return isElementNode(node) ? node.parentElement : node.parentNode;\n }\n function getCombinations(attributes, size) {\n const results = [];\n function helper(start, combo) {\n if (combo.length === size) {\n results.push([...combo]);\n return;\n }\n for (let i = start; i < attributes.length; i++) {\n combo.push(attributes[i]);\n helper(i + 1, combo);\n combo.pop();\n }\n }\n helper(0, []);\n return results;\n }\n function isXPathFirstResultElement(xpath, target) {\n try {\n const result = document.evaluate(\n xpath,\n document.documentElement,\n null,\n XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,\n null\n );\n return result.snapshotItem(0) === target;\n } catch (error) {\n console.warn(`Invalid XPath expression: ${xpath}`, error);\n return false;\n }\n }\n function escapeXPathString(value) {\n if (value.includes(\"'\")) {\n if (value.includes('\"')) {\n return \"concat(\" + value.split(/('+)/).map((part) => {\n if (part === \"'\") {\n return `\"'\"`;\n } else if (part.startsWith(\"'\") && part.endsWith(\"'\")) {\n return `\"${part}\"`;\n } else {\n return `'${part}'`;\n }\n }).join(\",\") + \")\";\n } else {\n return `\"${value}\"`;\n }\n } else {\n return `'${value}'`;\n }\n }\n async function generateXPathsForElement(element) {\n if (!element) return [];\n const [complexXPath, standardXPath, idBasedXPath] = await Promise.all([\n generateComplexXPath(element),\n generateStandardXPath(element),\n generatedIdBasedXPath(element)\n ]);\n return [standardXPath, ...idBasedXPath ? 
[idBasedXPath] : [], complexXPath];\n }\n async function generateComplexXPath(element) {\n const parts = [];\n let currentElement = element;\n while (currentElement && (isTextNode(currentElement) || isElementNode(currentElement))) {\n if (isElementNode(currentElement)) {\n const el = currentElement;\n let selector = el.tagName.toLowerCase();\n const attributePriority = [\n \"data-qa\",\n \"data-component\",\n \"data-role\",\n \"role\",\n \"aria-role\",\n \"type\",\n \"name\",\n \"aria-label\",\n \"placeholder\",\n \"title\",\n \"alt\"\n ];\n const attributes = attributePriority.map((attr) => {\n let value = el.getAttribute(attr);\n if (attr === \"href-full\" && value) {\n value = el.getAttribute(\"href\");\n }\n return value ? { attr: attr === \"href-full\" ? \"href\" : attr, value } : null;\n }).filter((attr) => attr !== null);\n let uniqueSelector = \"\";\n for (let i = 1; i <= attributes.length; i++) {\n const combinations = getCombinations(attributes, i);\n for (const combo of combinations) {\n const conditions = combo.map((a) => `@${a.attr}=${escapeXPathString(a.value)}`).join(\" and \");\n const xpath2 = `//${selector}[${conditions}]`;\n if (isXPathFirstResultElement(xpath2, el)) {\n uniqueSelector = xpath2;\n break;\n }\n }\n if (uniqueSelector) break;\n }\n if (uniqueSelector) {\n parts.unshift(uniqueSelector.replace(\"//\", \"\"));\n break;\n } else {\n const parent = getParentElement(el);\n if (parent) {\n const siblings = Array.from(parent.children).filter(\n (sibling) => sibling.tagName === el.tagName\n );\n const index = siblings.indexOf(el) + 1;\n selector += siblings.length > 1 ? 
`[${index}]` : \"\";\n }\n parts.unshift(selector);\n }\n }\n currentElement = getParentElement(currentElement);\n }\n const xpath = \"//\" + parts.join(\"/\");\n return xpath;\n }\n async function generateStandardXPath(element) {\n const parts = [];\n while (element && (isTextNode(element) || isElementNode(element))) {\n let index = 0;\n let hasSameTypeSiblings = false;\n const siblings = element.parentElement ? Array.from(element.parentElement.childNodes) : [];\n for (let i = 0; i < siblings.length; i++) {\n const sibling = siblings[i];\n if (sibling.nodeType === element.nodeType && sibling.nodeName === element.nodeName) {\n index = index + 1;\n hasSameTypeSiblings = true;\n if (sibling.isSameNode(element)) {\n break;\n }\n }\n }\n if (element.nodeName !== \"#text\") {\n const tagName = element.nodeName.toLowerCase();\n const pathIndex = hasSameTypeSiblings ? `[${index}]` : \"\";\n parts.unshift(`${tagName}${pathIndex}`);\n }\n element = element.parentElement;\n }\n return parts.length ? 
`/${parts.join(\"/\")}` : \"\";\n }\n async function generatedIdBasedXPath(element) {\n if (isElementNode(element) && element.id) {\n return `//*[@id='${element.id}']`;\n }\n return null;\n }\n\n // lib/dom/utils.ts\n async function waitForDomSettle() {\n return new Promise((resolve) => {\n const createTimeout = () => {\n return setTimeout(() => {\n resolve();\n }, 2e3);\n };\n let timeout = createTimeout();\n const observer = new MutationObserver(() => {\n clearTimeout(timeout);\n timeout = createTimeout();\n });\n observer.observe(window.document.body, { childList: true, subtree: true });\n });\n }\n window.waitForDomSettle = waitForDomSettle;\n function calculateViewportHeight() {\n return Math.ceil(window.innerHeight * 0.75);\n }\n\n // lib/dom/process.ts\n function isElementNode(node) {\n return node.nodeType === Node.ELEMENT_NODE;\n }\n function isTextNode(node) {\n return node.nodeType === Node.TEXT_NODE && Boolean(node.textContent?.trim());\n }\n async function processDom(chunksSeen) {\n const { chunk, chunksArray } = await pickChunk(chunksSeen);\n const { outputString, selectorMap } = await processElements(chunk);\n console.log(\n `Stagehand (Browser Process): Extracted dom elements:\n${outputString}`\n );\n return {\n outputString,\n selectorMap,\n chunk,\n chunks: chunksArray\n };\n }\n async function processAllOfDom() {\n console.log(\"Stagehand (Browser Process): Processing all of DOM\");\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const totalChunks = Math.ceil(documentHeight / viewportHeight);\n let index = 0;\n const results = [];\n for (let chunk = 0; chunk < totalChunks; chunk++) {\n const result = await processElements(chunk, true, index);\n results.push(result);\n index += Object.keys(result.selectorMap).length;\n }\n await scrollToHeight(0);\n const allOutputString = results.map((result) => result.outputString).join(\"\");\n const allSelectorMap = results.reduce(\n (acc, 
result) => ({ ...acc, ...result.selectorMap }),\n {}\n );\n console.log(\n `Stagehand (Browser Process): All dom elements: ${allOutputString}`\n );\n return {\n outputString: allOutputString,\n selectorMap: allSelectorMap\n };\n }\n async function scrollToHeight(height) {\n window.scrollTo({ top: height, left: 0, behavior: \"smooth\" });\n await new Promise((resolve) => {\n let scrollEndTimer;\n const handleScrollEnd = () => {\n clearTimeout(scrollEndTimer);\n scrollEndTimer = window.setTimeout(() => {\n window.removeEventListener(\"scroll\", handleScrollEnd);\n resolve();\n }, 100);\n };\n window.addEventListener(\"scroll\", handleScrollEnd, { passive: true });\n handleScrollEnd();\n });\n }\n var xpathCache = /* @__PURE__ */ new Map();\n async function processElements(chunk, scrollToChunk = true, indexOffset = 0) {\n console.time(\"processElements:total\");\n const viewportHeight = calculateViewportHeight();\n const chunkHeight = viewportHeight * chunk;\n const maxScrollTop = document.documentElement.scrollHeight - viewportHeight;\n const offsetTop = Math.min(chunkHeight, maxScrollTop);\n if (scrollToChunk) {\n console.time(\"processElements:scroll\");\n await scrollToHeight(offsetTop);\n console.timeEnd(\"processElements:scroll\");\n }\n const candidateElements = [];\n const DOMQueue = [...document.body.childNodes];\n console.log(\"Stagehand (Browser Process): Generating candidate elements\");\n console.time(\"processElements:findCandidates\");\n while (DOMQueue.length > 0) {\n const element = DOMQueue.pop();\n let shouldAddElement = false;\n if (element && isElementNode(element)) {\n const childrenCount = element.childNodes.length;\n for (let i = childrenCount - 1; i >= 0; i--) {\n const child = element.childNodes[i];\n DOMQueue.push(child);\n }\n if (isInteractiveElement(element)) {\n if (isActive(element) && isVisible(element)) {\n shouldAddElement = true;\n }\n }\n if (isLeafElement(element)) {\n if (isActive(element) && isVisible(element)) {\n 
shouldAddElement = true;\n }\n }\n }\n if (element && isTextNode(element) && isTextVisible(element)) {\n shouldAddElement = true;\n }\n if (shouldAddElement) {\n candidateElements.push(element);\n }\n }\n console.timeEnd(\"processElements:findCandidates\");\n const selectorMap = {};\n let outputString = \"\";\n console.log(\n `Stagehand (Browser Process): Processing candidate elements: ${candidateElements.length}`\n );\n console.time(\"processElements:processCandidates\");\n console.time(\"processElements:generateXPaths\");\n const xpathLists = await Promise.all(\n candidateElements.map(async (element) => {\n if (xpathCache.has(element)) {\n return xpathCache.get(element);\n }\n const xpaths = await generateXPathsForElement(element);\n xpathCache.set(element, xpaths);\n return xpaths;\n })\n );\n console.timeEnd(\"processElements:generateXPaths\");\n candidateElements.forEach((element, index) => {\n const xpaths = xpathLists[index];\n let elementOutput = \"\";\n if (isTextNode(element)) {\n const textContent = element.textContent?.trim();\n if (textContent) {\n elementOutput += `${index + indexOffset}:${textContent}\n`;\n }\n } else if (isElementNode(element)) {\n const tagName = element.tagName.toLowerCase();\n const attributes = collectEssentialAttributes(element);\n const openingTag = `<${tagName}${attributes ? 
\" \" + attributes : \"\"}>`;\n const closingTag = `</${tagName}>`;\n const textContent = element.textContent?.trim() || \"\";\n elementOutput += `${index + indexOffset}:${openingTag}${textContent}${closingTag}\n`;\n }\n outputString += elementOutput;\n selectorMap[index + indexOffset] = xpaths;\n });\n console.timeEnd(\"processElements:processCandidates\");\n console.timeEnd(\"processElements:total\");\n return {\n outputString,\n selectorMap\n };\n }\n function collectEssentialAttributes(element) {\n const essentialAttributes = [\n \"id\",\n \"class\",\n \"href\",\n \"src\",\n \"aria-label\",\n \"aria-name\",\n \"aria-role\",\n \"aria-description\",\n \"aria-expanded\",\n \"aria-haspopup\",\n \"type\",\n \"value\"\n ];\n const attrs = essentialAttributes.map((attr) => {\n const value = element.getAttribute(attr);\n return value ? `${attr}=\"${value}\"` : \"\";\n }).filter((attr) => attr !== \"\");\n Array.from(element.attributes).forEach((attr) => {\n if (attr.name.startsWith(\"data-\")) {\n attrs.push(`${attr.name}=\"${attr.value}\"`);\n }\n });\n return attrs.join(\" \");\n }\n window.processDom = processDom;\n window.processAllOfDom = processAllOfDom;\n window.processElements = processElements;\n window.scrollToHeight = scrollToHeight;\n var leafElementDenyList = [\"SVG\", \"IFRAME\", \"SCRIPT\", \"STYLE\", \"LINK\"];\n var interactiveElementTypes = [\n \"A\",\n \"BUTTON\",\n \"DETAILS\",\n \"EMBED\",\n \"INPUT\",\n \"LABEL\",\n \"MENU\",\n \"MENUITEM\",\n \"OBJECT\",\n \"SELECT\",\n \"TEXTAREA\",\n \"SUMMARY\"\n ];\n var interactiveRoles = [\n \"button\",\n \"menu\",\n \"menuitem\",\n \"link\",\n \"checkbox\",\n \"radio\",\n \"slider\",\n \"tab\",\n \"tabpanel\",\n \"textbox\",\n \"combobox\",\n \"grid\",\n \"listbox\",\n \"option\",\n \"progressbar\",\n \"scrollbar\",\n \"searchbox\",\n \"switch\",\n \"tree\",\n \"treeitem\",\n \"spinbutton\",\n \"tooltip\"\n ];\n var interactiveAriaRoles = [\"menu\", \"menuitem\", \"button\"];\n var isVisible = (element) => 
{\n const rect = element.getBoundingClientRect();\n if (rect.width === 0 || rect.height === 0 || rect.top < 0 || rect.top > window.innerHeight) {\n return false;\n }\n if (!isTopElement(element, rect)) {\n return false;\n }\n const visible = element.checkVisibility({\n checkOpacity: true,\n checkVisibilityCSS: true\n });\n return visible;\n };\n var isTextVisible = (element) => {\n const range = document.createRange();\n range.selectNodeContents(element);\n const rect = range.getBoundingClientRect();\n if (rect.width === 0 || rect.height === 0 || rect.top < 0 || rect.top > window.innerHeight) {\n return false;\n }\n const parent = element.parentElement;\n if (!parent) {\n return false;\n }\n if (!isTopElement(parent, rect)) {\n return false;\n }\n const visible = parent.checkVisibility({\n checkOpacity: true,\n checkVisibilityCSS: true\n });\n return visible;\n };\n function isTopElement(elem, rect) {\n const points = [\n { x: rect.left + rect.width * 0.25, y: rect.top + rect.height * 0.25 },\n { x: rect.left + rect.width * 0.75, y: rect.top + rect.height * 0.25 },\n { x: rect.left + rect.width * 0.25, y: rect.top + rect.height * 0.75 },\n { x: rect.left + rect.width * 0.75, y: rect.top + rect.height * 0.75 },\n { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 }\n ];\n return points.some((point) => {\n const topEl = document.elementFromPoint(point.x, point.y);\n let current = topEl;\n while (current && current !== document.body) {\n if (current.isSameNode(elem)) {\n return true;\n }\n current = current.parentElement;\n }\n return false;\n });\n }\n var isActive = (element) => {\n if (element.hasAttribute(\"disabled\") || element.hasAttribute(\"hidden\") || element.getAttribute(\"aria-disabled\") === \"true\") {\n return false;\n }\n return true;\n };\n var isInteractiveElement = (element) => {\n const elementType = element.tagName;\n const elementRole = element.getAttribute(\"role\");\n const elementAriaRole = element.getAttribute(\"aria-role\");\n 
return elementType && interactiveElementTypes.includes(elementType) || elementRole && interactiveRoles.includes(elementRole) || elementAriaRole && interactiveAriaRoles.includes(elementAriaRole);\n };\n var isLeafElement = (element) => {\n if (element.textContent === \"\") {\n return false;\n }\n if (element.childNodes.length === 0) {\n return !leafElementDenyList.includes(element.tagName);\n }\n if (element.childNodes.length === 1 && isTextNode(element.childNodes[0])) {\n return true;\n }\n return false;\n };\n async function pickChunk(chunksSeen) {\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const chunks = Math.ceil(documentHeight / viewportHeight);\n const chunksArray = Array.from({ length: chunks }, (_, i) => i);\n const chunksRemaining = chunksArray.filter((chunk2) => {\n return !chunksSeen.includes(chunk2);\n });\n const currentScrollPosition = window.scrollY;\n const closestChunk = chunksRemaining.reduce((closest, current) => {\n const currentChunkTop = viewportHeight * current;\n const closestChunkTop = viewportHeight * closest;\n return Math.abs(currentScrollPosition - currentChunkTop) < Math.abs(currentScrollPosition - closestChunkTop) ? 
current : closest;\n }, chunksRemaining[0]);\n const chunk = closestChunk;\n if (chunk === void 0) {\n throw new Error(`No chunks remaining to check: ${chunksRemaining}`);\n }\n return {\n chunk,\n chunksArray\n };\n }\n\n // lib/dom/debug.ts\n async function debugDom() {\n window.chunkNumber = 0;\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n }\n function multiSelectorMapToSelectorMap(multiSelectorMap) {\n return Object.fromEntries(\n Object.entries(multiSelectorMap).map(([key, selectors]) => [\n Number(key),\n selectors[0]\n ])\n );\n }\n function drawChunk(selectorMap) {\n if (!window.showChunks) return;\n cleanupMarkers();\n Object.values(selectorMap).forEach((selector) => {\n const element = document.evaluate(\n selector,\n document,\n null,\n XPathResult.FIRST_ORDERED_NODE_TYPE,\n null\n ).singleNodeValue;\n if (element) {\n let rect;\n if (element.nodeType === Node.ELEMENT_NODE) {\n rect = element.getBoundingClientRect();\n } else {\n const range = document.createRange();\n range.selectNodeContents(element);\n rect = range.getBoundingClientRect();\n }\n const color = \"grey\";\n const overlay = document.createElement(\"div\");\n overlay.style.position = \"absolute\";\n overlay.style.left = `${rect.left + window.scrollX}px`;\n overlay.style.top = `${rect.top + window.scrollY}px`;\n overlay.style.padding = \"2px\";\n overlay.style.width = `${rect.width}px`;\n overlay.style.height = `${rect.height}px`;\n overlay.style.backgroundColor = color;\n overlay.className = \"stagehand-marker\";\n overlay.style.opacity = \"0.3\";\n overlay.style.zIndex = \"1000000000\";\n overlay.style.border = \"1px solid\";\n overlay.style.pointerEvents = \"none\";\n document.body.appendChild(overlay);\n }\n });\n }\n async function cleanupDebug() {\n cleanupMarkers();\n cleanupNav();\n }\n function cleanupMarkers() 
{\n const markers = document.querySelectorAll(\".stagehand-marker\");\n markers.forEach((marker) => {\n marker.remove();\n });\n }\n function cleanupNav() {\n const stagehandNavElements = document.querySelectorAll(\".stagehand-nav\");\n stagehandNavElements.forEach((element) => {\n element.remove();\n });\n }\n function setupChunkNav() {\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const totalChunks = Math.ceil(documentHeight / viewportHeight);\n if (window.chunkNumber > 0) {\n const prevChunkButton = document.createElement(\"button\");\n prevChunkButton.className = \"stagehand-nav\";\n prevChunkButton.textContent = \"Previous\";\n prevChunkButton.style.marginLeft = \"50px\";\n prevChunkButton.style.position = \"fixed\";\n prevChunkButton.style.bottom = \"10px\";\n prevChunkButton.style.left = \"50%\";\n prevChunkButton.style.transform = \"translateX(-50%)\";\n prevChunkButton.style.zIndex = \"1000000000\";\n prevChunkButton.onclick = async () => {\n cleanupMarkers();\n cleanupNav();\n window.chunkNumber -= 1;\n window.scrollTo(0, window.chunkNumber * viewportHeight);\n await window.waitForDomSettle();\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n };\n document.body.appendChild(prevChunkButton);\n }\n if (totalChunks > window.chunkNumber) {\n const nextChunkButton = document.createElement(\"button\");\n nextChunkButton.className = \"stagehand-nav\";\n nextChunkButton.textContent = \"Next\";\n nextChunkButton.style.marginRight = \"50px\";\n nextChunkButton.style.position = \"fixed\";\n nextChunkButton.style.bottom = \"10px\";\n nextChunkButton.style.right = \"50%\";\n nextChunkButton.style.transform = \"translateX(50%)\";\n nextChunkButton.style.zIndex = \"1000000000\";\n nextChunkButton.onclick = async () => {\n 
cleanupMarkers();\n cleanupNav();\n window.chunkNumber += 1;\n window.scrollTo(0, window.chunkNumber * viewportHeight);\n await window.waitForDomSettle();\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n };\n document.body.appendChild(nextChunkButton);\n }\n }\n window.debugDom = debugDom;\n window.cleanupDebug = cleanupDebug;\n})();\n"; | ||
export declare const scriptContent = "(() => {\n // lib/dom/xpathUtils.ts\n function getParentElement(node) {\n return isElementNode(node) ? node.parentElement : node.parentNode;\n }\n function getCombinations(attributes, size) {\n const results = [];\n function helper(start, combo) {\n if (combo.length === size) {\n results.push([...combo]);\n return;\n }\n for (let i = start; i < attributes.length; i++) {\n combo.push(attributes[i]);\n helper(i + 1, combo);\n combo.pop();\n }\n }\n helper(0, []);\n return results;\n }\n function isXPathFirstResultElement(xpath, target) {\n try {\n const result = document.evaluate(\n xpath,\n document.documentElement,\n null,\n XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,\n null\n );\n return result.snapshotItem(0) === target;\n } catch (error) {\n console.warn(`Invalid XPath expression: ${xpath}`, error);\n return false;\n }\n }\n function escapeXPathString(value) {\n if (value.includes(\"'\")) {\n if (value.includes('\"')) {\n return \"concat(\" + value.split(/('+)/).map((part) => {\n if (part === \"'\") {\n return `\"'\"`;\n } else if (part.startsWith(\"'\") && part.endsWith(\"'\")) {\n return `\"${part}\"`;\n } else {\n return `'${part}'`;\n }\n }).join(\",\") + \")\";\n } else {\n return `\"${value}\"`;\n }\n } else {\n return `'${value}'`;\n }\n }\n async function generateXPathsForElement(element) {\n if (!element) return [];\n const [complexXPath, standardXPath, idBasedXPath] = await Promise.all([\n generateComplexXPath(element),\n generateStandardXPath(element),\n generatedIdBasedXPath(element)\n ]);\n return [standardXPath, ...idBasedXPath ? 
[idBasedXPath] : [], complexXPath];\n }\n async function generateComplexXPath(element) {\n const parts = [];\n let currentElement = element;\n while (currentElement && (isTextNode(currentElement) || isElementNode(currentElement))) {\n if (isElementNode(currentElement)) {\n const el = currentElement;\n let selector = el.tagName.toLowerCase();\n const attributePriority = [\n \"data-qa\",\n \"data-component\",\n \"data-role\",\n \"role\",\n \"aria-role\",\n \"type\",\n \"name\",\n \"aria-label\",\n \"placeholder\",\n \"title\",\n \"alt\"\n ];\n const attributes = attributePriority.map((attr) => {\n let value = el.getAttribute(attr);\n if (attr === \"href-full\" && value) {\n value = el.getAttribute(\"href\");\n }\n return value ? { attr: attr === \"href-full\" ? \"href\" : attr, value } : null;\n }).filter((attr) => attr !== null);\n let uniqueSelector = \"\";\n for (let i = 1; i <= attributes.length; i++) {\n const combinations = getCombinations(attributes, i);\n for (const combo of combinations) {\n const conditions = combo.map((a) => `@${a.attr}=${escapeXPathString(a.value)}`).join(\" and \");\n const xpath2 = `//${selector}[${conditions}]`;\n if (isXPathFirstResultElement(xpath2, el)) {\n uniqueSelector = xpath2;\n break;\n }\n }\n if (uniqueSelector) break;\n }\n if (uniqueSelector) {\n parts.unshift(uniqueSelector.replace(\"//\", \"\"));\n break;\n } else {\n const parent = getParentElement(el);\n if (parent) {\n const siblings = Array.from(parent.children).filter(\n (sibling) => sibling.tagName === el.tagName\n );\n const index = siblings.indexOf(el) + 1;\n selector += siblings.length > 1 ? 
`[${index}]` : \"\";\n }\n parts.unshift(selector);\n }\n }\n currentElement = getParentElement(currentElement);\n }\n const xpath = \"//\" + parts.join(\"/\");\n return xpath;\n }\n async function generateStandardXPath(element) {\n const parts = [];\n while (element && (isTextNode(element) || isElementNode(element))) {\n let index = 0;\n let hasSameTypeSiblings = false;\n const siblings = element.parentElement ? Array.from(element.parentElement.childNodes) : [];\n for (let i = 0; i < siblings.length; i++) {\n const sibling = siblings[i];\n if (sibling.nodeType === element.nodeType && sibling.nodeName === element.nodeName) {\n index = index + 1;\n hasSameTypeSiblings = true;\n if (sibling.isSameNode(element)) {\n break;\n }\n }\n }\n if (element.nodeName !== \"#text\") {\n const tagName = element.nodeName.toLowerCase();\n const pathIndex = hasSameTypeSiblings ? `[${index}]` : \"\";\n parts.unshift(`${tagName}${pathIndex}`);\n }\n element = element.parentElement;\n }\n return parts.length ? 
`/${parts.join(\"/\")}` : \"\";\n }\n async function generatedIdBasedXPath(element) {\n if (isElementNode(element) && element.id) {\n return `//*[@id='${element.id}']`;\n }\n return null;\n }\n\n // lib/dom/utils.ts\n async function waitForDomSettle() {\n return new Promise((resolve) => {\n const createTimeout = () => {\n return setTimeout(() => {\n resolve();\n }, 2e3);\n };\n let timeout = createTimeout();\n const observer = new MutationObserver(() => {\n clearTimeout(timeout);\n timeout = createTimeout();\n });\n observer.observe(window.document.body, { childList: true, subtree: true });\n });\n }\n window.waitForDomSettle = waitForDomSettle;\n function calculateViewportHeight() {\n return Math.ceil(window.innerHeight * 0.75);\n }\n\n // lib/dom/process.ts\n function isElementNode(node) {\n return node.nodeType === Node.ELEMENT_NODE;\n }\n function isTextNode(node) {\n return node.nodeType === Node.TEXT_NODE && Boolean(node.textContent?.trim());\n }\n async function processDom(chunksSeen) {\n const { chunk, chunksArray } = await pickChunk(chunksSeen);\n const { outputString, selectorMap } = await processElements(chunk);\n console.log(\n `Stagehand (Browser Process): Extracted dom elements:\n${outputString}`\n );\n return {\n outputString,\n selectorMap,\n chunk,\n chunks: chunksArray\n };\n }\n async function processAllOfDom() {\n console.log(\"Stagehand (Browser Process): Processing all of DOM\");\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const totalChunks = Math.ceil(documentHeight / viewportHeight);\n let index = 0;\n const results = [];\n for (let chunk = 0; chunk < totalChunks; chunk++) {\n const result = await processElements(chunk, true, index);\n results.push(result);\n index += Object.keys(result.selectorMap).length;\n }\n await scrollToHeight(0);\n const allOutputString = results.map((result) => result.outputString).join(\"\");\n const allSelectorMap = results.reduce(\n (acc, 
result) => ({ ...acc, ...result.selectorMap }),\n {}\n );\n console.log(\n `Stagehand (Browser Process): All dom elements: ${allOutputString}`\n );\n return {\n outputString: allOutputString,\n selectorMap: allSelectorMap\n };\n }\n async function scrollToHeight(height) {\n window.scrollTo({ top: height, left: 0, behavior: \"smooth\" });\n await new Promise((resolve) => {\n let scrollEndTimer;\n const handleScrollEnd = () => {\n clearTimeout(scrollEndTimer);\n scrollEndTimer = window.setTimeout(() => {\n window.removeEventListener(\"scroll\", handleScrollEnd);\n resolve();\n }, 100);\n };\n window.addEventListener(\"scroll\", handleScrollEnd, { passive: true });\n handleScrollEnd();\n });\n }\n var xpathCache = /* @__PURE__ */ new Map();\n async function processElements(chunk, scrollToChunk = true, indexOffset = 0) {\n console.time(\"processElements:total\");\n const viewportHeight = calculateViewportHeight();\n const chunkHeight = viewportHeight * chunk;\n const maxScrollTop = document.documentElement.scrollHeight - viewportHeight;\n const offsetTop = Math.min(chunkHeight, maxScrollTop);\n if (scrollToChunk) {\n console.time(\"processElements:scroll\");\n await scrollToHeight(offsetTop);\n console.timeEnd(\"processElements:scroll\");\n }\n const candidateElements = [];\n const DOMQueue = [...document.body.childNodes];\n console.log(\"Stagehand (Browser Process): Generating candidate elements\");\n console.time(\"processElements:findCandidates\");\n while (DOMQueue.length > 0) {\n const element = DOMQueue.pop();\n let shouldAddElement = false;\n if (element && isElementNode(element)) {\n const childrenCount = element.childNodes.length;\n for (let i = childrenCount - 1; i >= 0; i--) {\n const child = element.childNodes[i];\n DOMQueue.push(child);\n }\n if (isInteractiveElement(element)) {\n if (isActive(element) && isVisible(element)) {\n shouldAddElement = true;\n }\n }\n if (isLeafElement(element)) {\n if (isActive(element) && isVisible(element)) {\n 
shouldAddElement = true;\n }\n }\n }\n if (element && isTextNode(element) && isTextVisible(element)) {\n shouldAddElement = true;\n }\n if (shouldAddElement) {\n candidateElements.push(element);\n }\n }\n console.timeEnd(\"processElements:findCandidates\");\n const selectorMap = {};\n let outputString = \"\";\n console.log(\n `Stagehand (Browser Process): Processing candidate elements: ${candidateElements.length}`\n );\n console.time(\"processElements:processCandidates\");\n console.time(\"processElements:generateXPaths\");\n const xpathLists = await Promise.all(\n candidateElements.map(async (element) => {\n if (xpathCache.has(element)) {\n return xpathCache.get(element);\n }\n const xpaths = await generateXPathsForElement(element);\n xpathCache.set(element, xpaths);\n return xpaths;\n })\n );\n console.timeEnd(\"processElements:generateXPaths\");\n candidateElements.forEach((element, index) => {\n const xpaths = xpathLists[index];\n let elementOutput = \"\";\n if (isTextNode(element)) {\n const textContent = element.textContent?.trim();\n if (textContent) {\n elementOutput += `${index + indexOffset}:${textContent}\n`;\n }\n } else if (isElementNode(element)) {\n const tagName = element.tagName.toLowerCase();\n const attributes = collectEssentialAttributes(element);\n const openingTag = `<${tagName}${attributes ? 
\" \" + attributes : \"\"}>`;\n const closingTag = `</${tagName}>`;\n const textContent = element.textContent?.trim() || \"\";\n elementOutput += `${index + indexOffset}:${openingTag}${textContent}${closingTag}\n`;\n }\n outputString += elementOutput;\n selectorMap[index + indexOffset] = xpaths;\n });\n console.timeEnd(\"processElements:processCandidates\");\n console.timeEnd(\"processElements:total\");\n return {\n outputString,\n selectorMap\n };\n }\n function collectEssentialAttributes(element) {\n const essentialAttributes = [\n \"id\",\n \"class\",\n \"href\",\n \"src\",\n \"aria-label\",\n \"aria-name\",\n \"aria-role\",\n \"aria-description\",\n \"aria-expanded\",\n \"aria-haspopup\",\n \"type\",\n \"value\"\n ];\n const attrs = essentialAttributes.map((attr) => {\n const value = element.getAttribute(attr);\n return value ? `${attr}=\"${value}\"` : \"\";\n }).filter((attr) => attr !== \"\");\n Array.from(element.attributes).forEach((attr) => {\n if (attr.name.startsWith(\"data-\")) {\n attrs.push(`${attr.name}=\"${attr.value}\"`);\n }\n });\n return attrs.join(\" \");\n }\n function storeDOM() {\n const originalDOM = document.body.cloneNode(true);\n console.log(\"DOM state stored.\");\n return originalDOM.outerHTML;\n }\n function restoreDOM(storedDOM) {\n console.log(\"Restoring DOM\");\n if (storedDOM) {\n document.body.innerHTML = storedDOM;\n } else {\n console.error(\"No DOM state was provided.\");\n }\n }\n function createTextBoundingBoxes() {\n const style = document.createElement(\"style\");\n document.head.appendChild(style);\n if (style.sheet) {\n style.sheet.insertRule(\n `\n .stagehand-highlighted-word, .stagehand-space {\n border: 0px solid orange;\n display: inline-block !important;\n visibility: visible;\n }\n `,\n 0\n );\n style.sheet.insertRule(\n `\n code .stagehand-highlighted-word, code .stagehand-space,\n pre .stagehand-highlighted-word, pre .stagehand-space {\n white-space: pre-wrap;\n display: inline !important;\n }\n `,\n 1\n );\n 
}\n function applyHighlighting(root) {\n root.querySelectorAll(\"body *\").forEach((element) => {\n if (element.closest(\".stagehand-nav, .stagehand-marker\")) {\n return;\n }\n if ([\"SCRIPT\", \"STYLE\", \"IFRAME\", \"INPUT\", \"TEXTAREA\"].includes(\n element.tagName\n )) {\n return;\n }\n const childNodes = Array.from(element.childNodes);\n childNodes.forEach((node) => {\n if (node.nodeType === 3 && node.textContent?.trim().length > 0) {\n const textContent = node.textContent.replace(/\\u00A0/g, \" \");\n const tokens = textContent.split(/(\\s+)/g);\n const fragment = document.createDocumentFragment();\n const parentIsCode = element.tagName === \"CODE\";\n tokens.forEach((token) => {\n const span = document.createElement(\"span\");\n span.textContent = token;\n if (parentIsCode) {\n span.style.whiteSpace = \"pre-wrap\";\n span.style.display = \"inline\";\n }\n span.className = token.trim().length === 0 ? \"stagehand-space\" : \"stagehand-highlighted-word\";\n fragment.appendChild(span);\n });\n if (fragment.childNodes.length > 0 && node.parentNode) {\n element.insertBefore(fragment, node);\n node.remove();\n }\n }\n });\n });\n }\n applyHighlighting(document);\n document.querySelectorAll(\"iframe\").forEach((iframe) => {\n try {\n iframe.contentWindow?.postMessage({ action: \"highlight\" }, \"*\");\n } catch (error) {\n console.error(\"Error accessing iframe content: \", error);\n }\n });\n }\n function getElementBoundingBoxes(xpath) {\n const element = document.evaluate(\n xpath,\n document,\n null,\n XPathResult.FIRST_ORDERED_NODE_TYPE,\n null\n ).singleNodeValue;\n if (!element) return [];\n const isValidText = (text) => text && text.trim().length > 0;\n let dropDownElem = element.querySelector(\"option[selected]\");\n if (!dropDownElem) {\n dropDownElem = element.querySelector(\"option\");\n }\n if (dropDownElem) {\n const elemText = dropDownElem.textContent || \"\";\n if (isValidText(elemText)) {\n const parentRect = element.getBoundingClientRect();\n 
return [\n {\n text: elemText.trim(),\n top: parentRect.top + window.scrollY,\n left: parentRect.left + window.scrollX,\n width: parentRect.width,\n height: parentRect.height\n }\n ];\n } else {\n return [];\n }\n }\n let placeholderText = \"\";\n if ((element.tagName.toLowerCase() === \"input\" || element.tagName.toLowerCase() === \"textarea\") && element.placeholder) {\n placeholderText = element.placeholder;\n } else if (element.tagName.toLowerCase() === \"a\") {\n placeholderText = \"\";\n } else if (element.tagName.toLowerCase() === \"img\") {\n placeholderText = element.alt || \"\";\n }\n const words = element.querySelectorAll(\n \".stagehand-highlighted-word\"\n );\n const boundingBoxes = Array.from(words).map((word) => {\n const rect = word.getBoundingClientRect();\n return {\n text: word.innerText || \"\",\n top: rect.top + window.scrollY,\n left: rect.left + window.scrollX,\n width: rect.width,\n height: rect.height * 0.75\n };\n }).filter(\n (box) => box.width > 0 && box.height > 0 && box.top >= 0 && box.left >= 0 && isValidText(box.text)\n );\n if (boundingBoxes.length === 0) {\n const elementRect = element.getBoundingClientRect();\n return [\n {\n text: placeholderText,\n top: elementRect.top + window.scrollY,\n left: elementRect.left + window.scrollX,\n width: elementRect.width,\n height: elementRect.height * 0.75\n }\n ];\n }\n return boundingBoxes;\n }\n window.processDom = processDom;\n window.processAllOfDom = processAllOfDom;\n window.processElements = processElements;\n window.scrollToHeight = scrollToHeight;\n window.storeDOM = storeDOM;\n window.restoreDOM = restoreDOM;\n window.createTextBoundingBoxes = createTextBoundingBoxes;\n window.getElementBoundingBoxes = getElementBoundingBoxes;\n var leafElementDenyList = [\"SVG\", \"IFRAME\", \"SCRIPT\", \"STYLE\", \"LINK\"];\n var interactiveElementTypes = [\n \"A\",\n \"BUTTON\",\n \"DETAILS\",\n \"EMBED\",\n \"INPUT\",\n \"LABEL\",\n \"MENU\",\n \"MENUITEM\",\n \"OBJECT\",\n \"SELECT\",\n 
\"TEXTAREA\",\n \"SUMMARY\"\n ];\n var interactiveRoles = [\n \"button\",\n \"menu\",\n \"menuitem\",\n \"link\",\n \"checkbox\",\n \"radio\",\n \"slider\",\n \"tab\",\n \"tabpanel\",\n \"textbox\",\n \"combobox\",\n \"grid\",\n \"listbox\",\n \"option\",\n \"progressbar\",\n \"scrollbar\",\n \"searchbox\",\n \"switch\",\n \"tree\",\n \"treeitem\",\n \"spinbutton\",\n \"tooltip\"\n ];\n var interactiveAriaRoles = [\"menu\", \"menuitem\", \"button\"];\n var isVisible = (element) => {\n const rect = element.getBoundingClientRect();\n if (rect.width === 0 || rect.height === 0 || rect.top < 0 || rect.top > window.innerHeight) {\n return false;\n }\n if (!isTopElement(element, rect)) {\n return false;\n }\n const visible = element.checkVisibility({\n checkOpacity: true,\n checkVisibilityCSS: true\n });\n return visible;\n };\n var isTextVisible = (element) => {\n const range = document.createRange();\n range.selectNodeContents(element);\n const rect = range.getBoundingClientRect();\n if (rect.width === 0 || rect.height === 0 || rect.top < 0 || rect.top > window.innerHeight) {\n return false;\n }\n const parent = element.parentElement;\n if (!parent) {\n return false;\n }\n if (!isTopElement(parent, rect)) {\n return false;\n }\n const visible = parent.checkVisibility({\n checkOpacity: true,\n checkVisibilityCSS: true\n });\n return visible;\n };\n function isTopElement(elem, rect) {\n const points = [\n { x: rect.left + rect.width * 0.25, y: rect.top + rect.height * 0.25 },\n { x: rect.left + rect.width * 0.75, y: rect.top + rect.height * 0.25 },\n { x: rect.left + rect.width * 0.25, y: rect.top + rect.height * 0.75 },\n { x: rect.left + rect.width * 0.75, y: rect.top + rect.height * 0.75 },\n { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 }\n ];\n return points.some((point) => {\n const topEl = document.elementFromPoint(point.x, point.y);\n let current = topEl;\n while (current && current !== document.body) {\n if (current.isSameNode(elem)) {\n return 
true;\n }\n current = current.parentElement;\n }\n return false;\n });\n }\n var isActive = (element) => {\n if (element.hasAttribute(\"disabled\") || element.hasAttribute(\"hidden\") || element.getAttribute(\"aria-disabled\") === \"true\") {\n return false;\n }\n return true;\n };\n var isInteractiveElement = (element) => {\n const elementType = element.tagName;\n const elementRole = element.getAttribute(\"role\");\n const elementAriaRole = element.getAttribute(\"aria-role\");\n return elementType && interactiveElementTypes.includes(elementType) || elementRole && interactiveRoles.includes(elementRole) || elementAriaRole && interactiveAriaRoles.includes(elementAriaRole);\n };\n var isLeafElement = (element) => {\n if (element.textContent === \"\") {\n return false;\n }\n if (element.childNodes.length === 0) {\n return !leafElementDenyList.includes(element.tagName);\n }\n if (element.childNodes.length === 1 && isTextNode(element.childNodes[0])) {\n return true;\n }\n return false;\n };\n async function pickChunk(chunksSeen) {\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const chunks = Math.ceil(documentHeight / viewportHeight);\n const chunksArray = Array.from({ length: chunks }, (_, i) => i);\n const chunksRemaining = chunksArray.filter((chunk2) => {\n return !chunksSeen.includes(chunk2);\n });\n const currentScrollPosition = window.scrollY;\n const closestChunk = chunksRemaining.reduce((closest, current) => {\n const currentChunkTop = viewportHeight * current;\n const closestChunkTop = viewportHeight * closest;\n return Math.abs(currentScrollPosition - currentChunkTop) < Math.abs(currentScrollPosition - closestChunkTop) ? 
current : closest;\n }, chunksRemaining[0]);\n const chunk = closestChunk;\n if (chunk === void 0) {\n throw new Error(`No chunks remaining to check: ${chunksRemaining}`);\n }\n return {\n chunk,\n chunksArray\n };\n }\n\n // lib/dom/debug.ts\n async function debugDom() {\n window.chunkNumber = 0;\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n }\n function multiSelectorMapToSelectorMap(multiSelectorMap) {\n return Object.fromEntries(\n Object.entries(multiSelectorMap).map(([key, selectors]) => [\n Number(key),\n selectors[0]\n ])\n );\n }\n function drawChunk(selectorMap) {\n if (!window.showChunks) return;\n cleanupMarkers();\n Object.values(selectorMap).forEach((selector) => {\n const element = document.evaluate(\n selector,\n document,\n null,\n XPathResult.FIRST_ORDERED_NODE_TYPE,\n null\n ).singleNodeValue;\n if (element) {\n let rect;\n if (element.nodeType === Node.ELEMENT_NODE) {\n rect = element.getBoundingClientRect();\n } else {\n const range = document.createRange();\n range.selectNodeContents(element);\n rect = range.getBoundingClientRect();\n }\n const color = \"grey\";\n const overlay = document.createElement(\"div\");\n overlay.style.position = \"absolute\";\n overlay.style.left = `${rect.left + window.scrollX}px`;\n overlay.style.top = `${rect.top + window.scrollY}px`;\n overlay.style.padding = \"2px\";\n overlay.style.width = `${rect.width}px`;\n overlay.style.height = `${rect.height}px`;\n overlay.style.backgroundColor = color;\n overlay.className = \"stagehand-marker\";\n overlay.style.opacity = \"0.3\";\n overlay.style.zIndex = \"1000000000\";\n overlay.style.border = \"1px solid\";\n overlay.style.pointerEvents = \"none\";\n document.body.appendChild(overlay);\n }\n });\n }\n async function cleanupDebug() {\n cleanupMarkers();\n cleanupNav();\n }\n function cleanupMarkers() 
{\n const markers = document.querySelectorAll(\".stagehand-marker\");\n markers.forEach((marker) => {\n marker.remove();\n });\n }\n function cleanupNav() {\n const stagehandNavElements = document.querySelectorAll(\".stagehand-nav\");\n stagehandNavElements.forEach((element) => {\n element.remove();\n });\n }\n function setupChunkNav() {\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const totalChunks = Math.ceil(documentHeight / viewportHeight);\n if (window.chunkNumber > 0) {\n const prevChunkButton = document.createElement(\"button\");\n prevChunkButton.className = \"stagehand-nav\";\n prevChunkButton.textContent = \"Previous\";\n prevChunkButton.style.marginLeft = \"50px\";\n prevChunkButton.style.position = \"fixed\";\n prevChunkButton.style.bottom = \"10px\";\n prevChunkButton.style.left = \"50%\";\n prevChunkButton.style.transform = \"translateX(-50%)\";\n prevChunkButton.style.zIndex = \"1000000000\";\n prevChunkButton.onclick = async () => {\n cleanupMarkers();\n cleanupNav();\n window.chunkNumber -= 1;\n window.scrollTo(0, window.chunkNumber * viewportHeight);\n await window.waitForDomSettle();\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n };\n document.body.appendChild(prevChunkButton);\n }\n if (totalChunks > window.chunkNumber) {\n const nextChunkButton = document.createElement(\"button\");\n nextChunkButton.className = \"stagehand-nav\";\n nextChunkButton.textContent = \"Next\";\n nextChunkButton.style.marginRight = \"50px\";\n nextChunkButton.style.position = \"fixed\";\n nextChunkButton.style.bottom = \"10px\";\n nextChunkButton.style.right = \"50%\";\n nextChunkButton.style.transform = \"translateX(50%)\";\n nextChunkButton.style.zIndex = \"1000000000\";\n nextChunkButton.onclick = async () => {\n 
cleanupMarkers();\n cleanupNav();\n window.chunkNumber += 1;\n window.scrollTo(0, window.chunkNumber * viewportHeight);\n await window.waitForDomSettle();\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n };\n document.body.appendChild(nextChunkButton);\n }\n }\n window.debugDom = debugDom;\n window.cleanupDebug = cleanupDebug;\n})();\n"; |
@@ -18,1 +18,11 @@ export declare function isElementNode(node: Node): node is Element; | ||
}>; | ||
export declare function storeDOM(): string; | ||
export declare function restoreDOM(storedDOM: string): void; | ||
export declare function createTextBoundingBoxes(): void; | ||
export declare function getElementBoundingBoxes(xpath: string): Array<{ | ||
text: string; | ||
top: number; | ||
left: number; | ||
width: number; | ||
height: number; | ||
}>; |
@@ -5,2 +5,71 @@ import { LLMProvider } from "../llm/LLMProvider"; | ||
import { LLMClient } from "../llm/LLMClient"; | ||
/** | ||
* The `StagehandExtractHandler` class is responsible for extracting structured data from a webpage. | ||
* It provides two approaches: `textExtract` and `domExtract`. `textExtract` is used by default. | ||
* | ||
* Here is what `textExtract` does at a high level: | ||
* | ||
* **1. Wait for the DOM to settle and start DOM debugging.** | ||
* - Ensures the page is fully loaded and stable before extraction. | ||
* | ||
* **2. Store the original DOM before any mutations.** | ||
* - Preserves the initial state of the DOM to restore later. | ||
* - We do this because creating spans around every word in the DOM (see step 4) | ||
* becomes very difficult to revert. Text nodes can be finicky, and directly | ||
* removing the added spans often corrupts the structure of the DOM. | ||
* | ||
* **3. Process the DOM to generate a selector map of candidate elements.** | ||
* - Identifies potential elements that contain the data to extract. | ||
* | ||
* **4. Create text bounding boxes around every word in the webpage.** | ||
* - Wraps words in spans so that their bounding boxes can be used to | ||
* determine their positions on the text-rendered webpage. | ||
* | ||
* **5. Collect all text annotations (with positions and dimensions) from each of the candidate elements.** | ||
* - Gathers text and positional data for each word. | ||
* | ||
* **6. Group annotations by text and deduplicate them based on proximity.** | ||
* - There is no guarantee that the text annotations are unique (candidate elements can be nested). | ||
* - Thus, we must remove duplicate words that are close to each other on the page. | ||
* | ||
* **7. Restore the original DOM after mutations.** | ||
* - Returns the DOM to its original state after processing. | ||
* | ||
* **8. Format the deduplicated annotations into a text representation.** | ||
* - Prepares the text data for the extraction process. | ||
* | ||
* **9. Pass the formatted text to an LLM for extraction according to the given instruction and schema.** | ||
* - Uses a language model to extract structured data based on instructions. | ||
* | ||
* **10. Handle the extraction response and log the results.** | ||
* - Processes the output from the LLM and logs relevant information. | ||
* | ||
* | ||
* Here is what `domExtract` does at a high level: | ||
* | ||
* **1. Wait for the DOM to settle and start DOM debugging.** | ||
* - Ensures the page is fully loaded and stable before extraction. | ||
* | ||
* **2. Process the DOM in chunks.** | ||
* - The `processDom` function: | ||
* - Divides the page into vertical "chunks" based on viewport height. | ||
* - Picks the next chunk that hasn't been processed yet. | ||
* - Scrolls to that chunk and extracts candidate elements. | ||
* - Returns `outputString` (HTML snippets of candidate elements), | ||
* `selectorMap` (the XPaths of the candidate elements), | ||
* `chunk` (the current chunk index), and `chunks` (the array of all chunk indices). | ||
* - This chunk-based approach ensures that large or lengthy pages can be processed in smaller, manageable sections. | ||
* | ||
* **3. Pass the extracted DOM elements (in `outputString`) to the LLM for structured data extraction.** | ||
* - Uses the instructions, schema, and previously extracted content as context to | ||
* guide the LLM in extracting the structured data. | ||
* | ||
* **4. Check if extraction is complete.** | ||
* - If the extraction is complete (all chunks have been processed or the LLM determines | ||
* that we do not need to continue), return the final result. | ||
* - If not, repeat steps 1-4 with the next chunk until extraction is complete or no more chunks remain. | ||
* | ||
* @remarks | ||
* Each step corresponds to specific code segments, as noted in the comments throughout the code. | ||
*/ | ||
export declare class StagehandExtractHandler { | ||
@@ -35,3 +104,3 @@ private readonly stagehand; | ||
}); | ||
extract<T extends z.AnyZodObject>({ instruction, schema, content, chunksSeen, llmClient, requestId, domSettleTimeoutMs, }: { | ||
extract<T extends z.AnyZodObject>({ instruction, schema, content, chunksSeen, llmClient, requestId, domSettleTimeoutMs, useTextExtract, }: { | ||
instruction: string; | ||
@@ -44,3 +113,6 @@ schema: T; | ||
domSettleTimeoutMs?: number; | ||
useTextExtract?: boolean; | ||
}): Promise<z.infer<T>>; | ||
private textExtract; | ||
private domExtract; | ||
} |
@@ -10,2 +10,3 @@ import { type BrowserContext, type Page } from "@playwright/test"; | ||
context: BrowserContext; | ||
browserbaseSessionID?: string; | ||
private env; | ||
@@ -43,3 +44,3 @@ private apiKey; | ||
act({ action, modelName, modelClientOptions, useVision, variables, domSettleTimeoutMs, }: ActOptions): Promise<ActResult>; | ||
extract<T extends z.AnyZodObject>({ instruction, schema, modelName, modelClientOptions, domSettleTimeoutMs, }: ExtractOptions<T>): Promise<ExtractResult<T>>; | ||
extract<T extends z.AnyZodObject>({ instruction, schema, modelName, modelClientOptions, domSettleTimeoutMs, useTextExtract, }: ExtractOptions<T>): Promise<ExtractResult<T>>; | ||
observe(options?: ObserveOptions): Promise<ObserveResult[]>; | ||
@@ -46,0 +47,0 @@ close(): Promise<void>; |
@@ -8,3 +8,3 @@ import { z } from "zod"; | ||
export declare function act({ action, domElements, steps, llmClient, screenshot, retries, logger, requestId, variables, }: ActParams): Promise<ActResult | null>; | ||
export declare function extract({ instruction, previouslyExtractedContent, domElements, schema, llmClient, chunksSeen, chunksTotal, requestId, }: { | ||
export declare function extract({ instruction, previouslyExtractedContent, domElements, schema, llmClient, chunksSeen, chunksTotal, requestId, isUsingTextExtract, }: { | ||
instruction: string; | ||
@@ -18,2 +18,3 @@ previouslyExtractedContent: object; | ||
requestId: string; | ||
isUsingTextExtract?: boolean; | ||
}): Promise<{ | ||
@@ -20,0 +21,0 @@ metadata: { |
@@ -8,3 +8,3 @@ import OpenAI from "openai"; | ||
export declare const actTools: Array<OpenAI.ChatCompletionTool>; | ||
export declare function buildExtractSystemPrompt(isUsingPrintExtractedDataTool?: boolean): ChatMessage; | ||
export declare function buildExtractSystemPrompt(isUsingPrintExtractedDataTool?: boolean, useTextExtract?: boolean): ChatMessage; | ||
export declare function buildExtractUserPrompt(instruction: string, domElements: string, isUsingPrintExtractedDataTool?: boolean): ChatMessage; | ||
@@ -11,0 +11,0 @@ export declare function buildRefineSystemPrompt(): ChatMessage; |
import { LogLine } from "../types/log"; | ||
import { TextAnnotation } from "../types/textannotation"; | ||
import { z } from "zod"; | ||
export declare function generateId(operation: string): string; | ||
/** | ||
* `formatText` converts a list of text annotations into a formatted text representation. | ||
* Each annotation represents a piece of text at a certain position on a webpage. | ||
* The formatting attempts to reconstruct a textual "screenshot" of the page by: | ||
* - Grouping annotations into lines based on their vertical positions. | ||
* - Adjusting spacing to reflect line gaps. | ||
* - Attempting to preserve relative positions and formatting. | ||
* | ||
* The output is a text block, optionally surrounded by lines of dashes, that aims | ||
* to closely mirror the visual layout of the text on the page. | ||
* | ||
* @param textAnnotations - An array of TextAnnotations describing text and their positions. | ||
* @param pageWidth - The width of the page in pixels, used to normalize positions. | ||
* @returns A string representing the text layout of the page. | ||
*/ | ||
export declare function formatText(textAnnotations: TextAnnotation[], pageWidth: number): string; | ||
/**
 * Converts a structured log line into a single string.
 *
 * NOTE(review): the output format is defined by the implementation, which is
 * not visible in this declaration.
 *
 * @param logLine - The log entry to render.
 * @returns The string form of `logLine`.
 */
export declare function logLineToString(logLine: LogLine): string; | ||
/**
 * Checks `data` against a Zod schema and reports the outcome as a boolean.
 *
 * NOTE(review): presumably returns `true` when `data` conforms to `schema` and
 * `false` otherwise (rather than throwing) — confirm against the implementation.
 *
 * @param schema - Any Zod schema to validate with.
 * @param data - The value to validate; accepted as `unknown`.
 * @returns The validation outcome.
 */
export declare function validateZodSchema(schema: z.ZodTypeAny, data: unknown): boolean; |
@@ -8,2 +8,3 @@ import { Browser, BrowserContext } from "@playwright/test"; | ||
contextPath?: string; | ||
sessionId?: string; | ||
} |
@@ -9,2 +9,3 @@ import { EvalLogger } from "../evals/utils"; | ||
logger: EvalLogger; | ||
useTextExtract: boolean; | ||
}) => Promise<{ | ||
@@ -17,3 +18,3 @@ _success: boolean; | ||
}>; | ||
export declare const EvalCategorySchema: z.ZodEnum<["observe", "act", "combination", "extract", "experimental"]>; | ||
export declare const EvalCategorySchema: z.ZodEnum<["observe", "act", "combination", "extract", "experimental", "text_extract"]>; | ||
export type EvalCategory = z.infer<typeof EvalCategorySchema>; | ||
@@ -20,0 +21,0 @@ export interface EvalInput { |
@@ -23,6 +23,2 @@ import Browserbase from "@browserbasehq/sdk"; | ||
} | ||
export interface InitResult { | ||
debugUrl: string; | ||
sessionUrl: string; | ||
} | ||
export interface InitOptions { | ||
@@ -39,2 +35,3 @@ /** @deprecated Pass this into the Stagehand constructor instead. This will be removed in the next major version. */ | ||
sessionUrl: string; | ||
sessionId: string; | ||
} | ||
@@ -70,2 +67,3 @@ export interface InitFromPageOptions { | ||
domSettleTimeoutMs?: number; | ||
useTextExtract?: boolean; | ||
} | ||
@@ -72,0 +70,0 @@ export type ExtractResult<T extends z.AnyZodObject> = z.infer<T>; |
@@ -356,2 +356,152 @@ (() => { | ||
} | ||
/**
 * Captures the current `<body>` element, including its own tag and all
 * descendants, as an HTML string so it can be handed back to `restoreDOM` later.
 *
 * @returns `document.body.outerHTML` as it is at the time of the call.
 */
function storeDOM() {
  // Serializing already yields an independent string snapshot, so there is no
  // need to deep-clone the whole body first just to read `outerHTML` from the copy.
  const serializedBody = document.body.outerHTML;
  console.log("DOM state stored.");
  return serializedBody;
}
/**
 * Replaces the contents of `document.body` with a previously captured markup
 * string. When no markup is supplied (any falsy value), an error is logged and
 * the page is left untouched.
 *
 * @param storedDOM - Markup string to assign to `document.body.innerHTML`.
 */
function restoreDOM(storedDOM) {
  console.log("Restoring DOM");
  if (!storedDOM) {
    console.error("No DOM state was provided.");
    return;
  }
  document.body.innerHTML = storedDOM;
}
/**
 * Wraps every whitespace-separated token of visible text in the page in its own
 * `<span>` so that each word can be measured individually.
 *
 * - Non-whitespace tokens get the class `stagehand-highlighted-word`.
 * - Whitespace runs get the class `stagehand-space`.
 * - A stylesheet forcing those spans to `inline-block` (or `inline` inside
 *   `code`/`pre`) is injected into `document.head`.
 * - Every `<iframe>` in the document is sent a `{ action: "highlight" }` message.
 *
 * The function is safe to call repeatedly: the stylesheet is injected only once
 * and text that is already wrapped is not wrapped again.
 */
function createTextBoundingBoxes() {
  // The stylesheet lives in <head>, which a body-only restore never resets, so
  // tag it with an id and reuse it instead of appending a duplicate on every call.
  const styleId = "stagehand-text-bounding-box-styles";
  if (!document.getElementById(styleId)) {
    const style = document.createElement("style");
    style.id = styleId;
    document.head.appendChild(style);
    if (style.sheet) {
      style.sheet.insertRule(
        `
.stagehand-highlighted-word, .stagehand-space {
border: 0px solid orange;
display: inline-block !important;
visibility: visible;
}
`,
        0
      );
      style.sheet.insertRule(
        `
code .stagehand-highlighted-word, code .stagehand-space,
pre .stagehand-highlighted-word, pre .stagehand-space {
white-space: pre-wrap;
display: inline !important;
}
`,
        1
      );
    }
  }

  const skippedTags = ["SCRIPT", "STYLE", "IFRAME", "INPUT", "TEXTAREA"];
  // Stagehand's own debug overlays, plus spans produced by an earlier run of this
  // function (re-wrapping those would nest spans and duplicate every word).
  const skippedAncestors =
    ".stagehand-nav, .stagehand-marker, .stagehand-highlighted-word, .stagehand-space";

  function applyHighlighting(root) {
    root.querySelectorAll("body *").forEach((element) => {
      if (element.closest(skippedAncestors) || skippedTags.includes(element.tagName)) {
        return;
      }
      const parentIsCode = element.tagName === "CODE";
      // Snapshot the child list: the loop below replaces text nodes in place.
      Array.from(element.childNodes).forEach((node) => {
        if (node.nodeType !== Node.TEXT_NODE) {
          return;
        }
        const rawText = node.textContent || "";
        if (rawText.trim().length === 0 || !node.parentNode) {
          return;
        }
        const fragment = document.createDocumentFragment();
        // Normalise non-breaking spaces, then split while keeping the whitespace
        // runs (capture group) so spacing is preserved as its own spans.
        rawText
          .replace(/\u00A0/g, " ")
          .split(/(\s+)/)
          .forEach((token) => {
            // Splitting yields "" at the edges when the text starts or ends with
            // whitespace; an empty span carries no information, so skip it.
            if (token === "") {
              return;
            }
            const span = document.createElement("span");
            span.textContent = token;
            if (parentIsCode) {
              span.style.whiteSpace = "pre-wrap";
              span.style.display = "inline";
            }
            span.className = token.trim().length === 0 ? "stagehand-space" : "stagehand-highlighted-word";
            fragment.appendChild(span);
          });
        if (fragment.childNodes.length > 0) {
          element.insertBefore(fragment, node);
          node.remove();
        }
      });
    });
  }

  applyHighlighting(document);
  document.querySelectorAll("iframe").forEach((iframe) => {
    try {
      iframe.contentWindow?.postMessage({ action: "highlight" }, "*");
    } catch (error) {
      console.error("Error accessing iframe content: ", error);
    }
  });
}
/**
 * Returns page-absolute bounding boxes for the text shown by the element that
 * `xpath` resolves to.
 *
 * Resolution order:
 * 1. If the element contains an `<option>`, a single box is returned holding the
 *    text of the currently selected option (falling back to the option marked
 *    `selected`, then the first option), sized to the element itself. If that
 *    option has no visible text, the result is empty.
 * 2. Otherwise one box per `.stagehand-highlighted-word` span inside the element
 *    is returned, dropping spans that are empty, zero-sized, or positioned
 *    above/left of the page origin. Heights are scaled by 0.75.
 * 3. If no word boxes remain, a single box for the element itself is returned,
 *    using the `placeholder` of an input/textarea or the `alt` of an image as
 *    its text (empty string otherwise).
 *
 * @param xpath - XPath expression evaluated against `document`; the first match is used.
 * @returns An array of `{ text, top, left, width, height }` boxes; empty when the
 *   XPath matches nothing or matches a node that is not an element.
 */
function getElementBoundingBoxes(xpath) {
  const element = document.evaluate(
    xpath,
    document,
    null,
    XPathResult.FIRST_ORDERED_NODE_TYPE,
    null
  ).singleNodeValue;
  // An XPath can resolve to a text or attribute node, which has neither
  // querySelector nor getBoundingClientRect; treat it like "not found".
  if (!element || element.nodeType !== Node.ELEMENT_NODE) return [];

  const isValidText = (text) => text && text.trim().length > 0;
  // Viewport-relative rect -> page-absolute box.
  const toPageBox = (text, rect, heightScale) => ({
    text,
    top: rect.top + window.scrollY,
    left: rect.left + window.scrollX,
    width: rect.width,
    height: rect.height * heightScale
  });

  // `option[selected]` only reflects the default selection from the markup;
  // `option:checked` tracks the option that is selected right now.
  const dropDownElem =
    element.querySelector("option:checked") ||
    element.querySelector("option[selected]") ||
    element.querySelector("option");
  if (dropDownElem) {
    const elemText = dropDownElem.textContent || "";
    if (!isValidText(elemText)) {
      return [];
    }
    return [toPageBox(elemText.trim(), element.getBoundingClientRect(), 1)];
  }

  const boundingBoxes = Array.from(
    element.querySelectorAll(".stagehand-highlighted-word")
  ).map(
    (word) => toPageBox(word.innerText || "", word.getBoundingClientRect(), 0.75)
  ).filter(
    (box) => box.width > 0 && box.height > 0 && box.top >= 0 && box.left >= 0 && isValidText(box.text)
  );
  if (boundingBoxes.length > 0) {
    return boundingBoxes;
  }

  // No measurable words: describe the element itself with whatever stand-in text it offers.
  const tagName = element.tagName.toLowerCase();
  let placeholderText = "";
  if ((tagName === "input" || tagName === "textarea") && element.placeholder) {
    placeholderText = element.placeholder;
  } else if (tagName === "img") {
    placeholderText = element.alt || "";
  }
  return [toPageBox(placeholderText, element.getBoundingClientRect(), 0.75)];
}
window.processDom = processDom; | ||
@@ -361,2 +511,6 @@ window.processAllOfDom = processAllOfDom; | ||
window.scrollToHeight = scrollToHeight; | ||
window.storeDOM = storeDOM; | ||
window.restoreDOM = restoreDOM; | ||
window.createTextBoundingBoxes = createTextBoundingBoxes; | ||
window.getElementBoundingBoxes = getElementBoundingBoxes; | ||
var leafElementDenyList = ["SVG", "IFRAME", "SCRIPT", "STYLE", "LINK"]; | ||
@@ -363,0 +517,0 @@ var interactiveElementTypes = [ |
@@ -1,1 +0,1 @@ | ||
export const scriptContent = "(() => {\n // lib/dom/xpathUtils.ts\n function getParentElement(node) {\n return isElementNode(node) ? node.parentElement : node.parentNode;\n }\n function getCombinations(attributes, size) {\n const results = [];\n function helper(start, combo) {\n if (combo.length === size) {\n results.push([...combo]);\n return;\n }\n for (let i = start; i < attributes.length; i++) {\n combo.push(attributes[i]);\n helper(i + 1, combo);\n combo.pop();\n }\n }\n helper(0, []);\n return results;\n }\n function isXPathFirstResultElement(xpath, target) {\n try {\n const result = document.evaluate(\n xpath,\n document.documentElement,\n null,\n XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,\n null\n );\n return result.snapshotItem(0) === target;\n } catch (error) {\n console.warn(`Invalid XPath expression: ${xpath}`, error);\n return false;\n }\n }\n function escapeXPathString(value) {\n if (value.includes(\"'\")) {\n if (value.includes('\"')) {\n return \"concat(\" + value.split(/('+)/).map((part) => {\n if (part === \"'\") {\n return `\"'\"`;\n } else if (part.startsWith(\"'\") && part.endsWith(\"'\")) {\n return `\"${part}\"`;\n } else {\n return `'${part}'`;\n }\n }).join(\",\") + \")\";\n } else {\n return `\"${value}\"`;\n }\n } else {\n return `'${value}'`;\n }\n }\n async function generateXPathsForElement(element) {\n if (!element) return [];\n const [complexXPath, standardXPath, idBasedXPath] = await Promise.all([\n generateComplexXPath(element),\n generateStandardXPath(element),\n generatedIdBasedXPath(element)\n ]);\n return [standardXPath, ...idBasedXPath ? 
[idBasedXPath] : [], complexXPath];\n }\n async function generateComplexXPath(element) {\n const parts = [];\n let currentElement = element;\n while (currentElement && (isTextNode(currentElement) || isElementNode(currentElement))) {\n if (isElementNode(currentElement)) {\n const el = currentElement;\n let selector = el.tagName.toLowerCase();\n const attributePriority = [\n \"data-qa\",\n \"data-component\",\n \"data-role\",\n \"role\",\n \"aria-role\",\n \"type\",\n \"name\",\n \"aria-label\",\n \"placeholder\",\n \"title\",\n \"alt\"\n ];\n const attributes = attributePriority.map((attr) => {\n let value = el.getAttribute(attr);\n if (attr === \"href-full\" && value) {\n value = el.getAttribute(\"href\");\n }\n return value ? { attr: attr === \"href-full\" ? \"href\" : attr, value } : null;\n }).filter((attr) => attr !== null);\n let uniqueSelector = \"\";\n for (let i = 1; i <= attributes.length; i++) {\n const combinations = getCombinations(attributes, i);\n for (const combo of combinations) {\n const conditions = combo.map((a) => `@${a.attr}=${escapeXPathString(a.value)}`).join(\" and \");\n const xpath2 = `//${selector}[${conditions}]`;\n if (isXPathFirstResultElement(xpath2, el)) {\n uniqueSelector = xpath2;\n break;\n }\n }\n if (uniqueSelector) break;\n }\n if (uniqueSelector) {\n parts.unshift(uniqueSelector.replace(\"//\", \"\"));\n break;\n } else {\n const parent = getParentElement(el);\n if (parent) {\n const siblings = Array.from(parent.children).filter(\n (sibling) => sibling.tagName === el.tagName\n );\n const index = siblings.indexOf(el) + 1;\n selector += siblings.length > 1 ? 
`[${index}]` : \"\";\n }\n parts.unshift(selector);\n }\n }\n currentElement = getParentElement(currentElement);\n }\n const xpath = \"//\" + parts.join(\"/\");\n return xpath;\n }\n async function generateStandardXPath(element) {\n const parts = [];\n while (element && (isTextNode(element) || isElementNode(element))) {\n let index = 0;\n let hasSameTypeSiblings = false;\n const siblings = element.parentElement ? Array.from(element.parentElement.childNodes) : [];\n for (let i = 0; i < siblings.length; i++) {\n const sibling = siblings[i];\n if (sibling.nodeType === element.nodeType && sibling.nodeName === element.nodeName) {\n index = index + 1;\n hasSameTypeSiblings = true;\n if (sibling.isSameNode(element)) {\n break;\n }\n }\n }\n if (element.nodeName !== \"#text\") {\n const tagName = element.nodeName.toLowerCase();\n const pathIndex = hasSameTypeSiblings ? `[${index}]` : \"\";\n parts.unshift(`${tagName}${pathIndex}`);\n }\n element = element.parentElement;\n }\n return parts.length ? 
`/${parts.join(\"/\")}` : \"\";\n }\n async function generatedIdBasedXPath(element) {\n if (isElementNode(element) && element.id) {\n return `//*[@id='${element.id}']`;\n }\n return null;\n }\n\n // lib/dom/utils.ts\n async function waitForDomSettle() {\n return new Promise((resolve) => {\n const createTimeout = () => {\n return setTimeout(() => {\n resolve();\n }, 2e3);\n };\n let timeout = createTimeout();\n const observer = new MutationObserver(() => {\n clearTimeout(timeout);\n timeout = createTimeout();\n });\n observer.observe(window.document.body, { childList: true, subtree: true });\n });\n }\n window.waitForDomSettle = waitForDomSettle;\n function calculateViewportHeight() {\n return Math.ceil(window.innerHeight * 0.75);\n }\n\n // lib/dom/process.ts\n function isElementNode(node) {\n return node.nodeType === Node.ELEMENT_NODE;\n }\n function isTextNode(node) {\n return node.nodeType === Node.TEXT_NODE && Boolean(node.textContent?.trim());\n }\n async function processDom(chunksSeen) {\n const { chunk, chunksArray } = await pickChunk(chunksSeen);\n const { outputString, selectorMap } = await processElements(chunk);\n console.log(\n `Stagehand (Browser Process): Extracted dom elements:\n${outputString}`\n );\n return {\n outputString,\n selectorMap,\n chunk,\n chunks: chunksArray\n };\n }\n async function processAllOfDom() {\n console.log(\"Stagehand (Browser Process): Processing all of DOM\");\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const totalChunks = Math.ceil(documentHeight / viewportHeight);\n let index = 0;\n const results = [];\n for (let chunk = 0; chunk < totalChunks; chunk++) {\n const result = await processElements(chunk, true, index);\n results.push(result);\n index += Object.keys(result.selectorMap).length;\n }\n await scrollToHeight(0);\n const allOutputString = results.map((result) => result.outputString).join(\"\");\n const allSelectorMap = results.reduce(\n (acc, 
result) => ({ ...acc, ...result.selectorMap }),\n {}\n );\n console.log(\n `Stagehand (Browser Process): All dom elements: ${allOutputString}`\n );\n return {\n outputString: allOutputString,\n selectorMap: allSelectorMap\n };\n }\n async function scrollToHeight(height) {\n window.scrollTo({ top: height, left: 0, behavior: \"smooth\" });\n await new Promise((resolve) => {\n let scrollEndTimer;\n const handleScrollEnd = () => {\n clearTimeout(scrollEndTimer);\n scrollEndTimer = window.setTimeout(() => {\n window.removeEventListener(\"scroll\", handleScrollEnd);\n resolve();\n }, 100);\n };\n window.addEventListener(\"scroll\", handleScrollEnd, { passive: true });\n handleScrollEnd();\n });\n }\n var xpathCache = /* @__PURE__ */ new Map();\n async function processElements(chunk, scrollToChunk = true, indexOffset = 0) {\n console.time(\"processElements:total\");\n const viewportHeight = calculateViewportHeight();\n const chunkHeight = viewportHeight * chunk;\n const maxScrollTop = document.documentElement.scrollHeight - viewportHeight;\n const offsetTop = Math.min(chunkHeight, maxScrollTop);\n if (scrollToChunk) {\n console.time(\"processElements:scroll\");\n await scrollToHeight(offsetTop);\n console.timeEnd(\"processElements:scroll\");\n }\n const candidateElements = [];\n const DOMQueue = [...document.body.childNodes];\n console.log(\"Stagehand (Browser Process): Generating candidate elements\");\n console.time(\"processElements:findCandidates\");\n while (DOMQueue.length > 0) {\n const element = DOMQueue.pop();\n let shouldAddElement = false;\n if (element && isElementNode(element)) {\n const childrenCount = element.childNodes.length;\n for (let i = childrenCount - 1; i >= 0; i--) {\n const child = element.childNodes[i];\n DOMQueue.push(child);\n }\n if (isInteractiveElement(element)) {\n if (isActive(element) && isVisible(element)) {\n shouldAddElement = true;\n }\n }\n if (isLeafElement(element)) {\n if (isActive(element) && isVisible(element)) {\n 
shouldAddElement = true;\n }\n }\n }\n if (element && isTextNode(element) && isTextVisible(element)) {\n shouldAddElement = true;\n }\n if (shouldAddElement) {\n candidateElements.push(element);\n }\n }\n console.timeEnd(\"processElements:findCandidates\");\n const selectorMap = {};\n let outputString = \"\";\n console.log(\n `Stagehand (Browser Process): Processing candidate elements: ${candidateElements.length}`\n );\n console.time(\"processElements:processCandidates\");\n console.time(\"processElements:generateXPaths\");\n const xpathLists = await Promise.all(\n candidateElements.map(async (element) => {\n if (xpathCache.has(element)) {\n return xpathCache.get(element);\n }\n const xpaths = await generateXPathsForElement(element);\n xpathCache.set(element, xpaths);\n return xpaths;\n })\n );\n console.timeEnd(\"processElements:generateXPaths\");\n candidateElements.forEach((element, index) => {\n const xpaths = xpathLists[index];\n let elementOutput = \"\";\n if (isTextNode(element)) {\n const textContent = element.textContent?.trim();\n if (textContent) {\n elementOutput += `${index + indexOffset}:${textContent}\n`;\n }\n } else if (isElementNode(element)) {\n const tagName = element.tagName.toLowerCase();\n const attributes = collectEssentialAttributes(element);\n const openingTag = `<${tagName}${attributes ? 
\" \" + attributes : \"\"}>`;\n const closingTag = `</${tagName}>`;\n const textContent = element.textContent?.trim() || \"\";\n elementOutput += `${index + indexOffset}:${openingTag}${textContent}${closingTag}\n`;\n }\n outputString += elementOutput;\n selectorMap[index + indexOffset] = xpaths;\n });\n console.timeEnd(\"processElements:processCandidates\");\n console.timeEnd(\"processElements:total\");\n return {\n outputString,\n selectorMap\n };\n }\n function collectEssentialAttributes(element) {\n const essentialAttributes = [\n \"id\",\n \"class\",\n \"href\",\n \"src\",\n \"aria-label\",\n \"aria-name\",\n \"aria-role\",\n \"aria-description\",\n \"aria-expanded\",\n \"aria-haspopup\",\n \"type\",\n \"value\"\n ];\n const attrs = essentialAttributes.map((attr) => {\n const value = element.getAttribute(attr);\n return value ? `${attr}=\"${value}\"` : \"\";\n }).filter((attr) => attr !== \"\");\n Array.from(element.attributes).forEach((attr) => {\n if (attr.name.startsWith(\"data-\")) {\n attrs.push(`${attr.name}=\"${attr.value}\"`);\n }\n });\n return attrs.join(\" \");\n }\n window.processDom = processDom;\n window.processAllOfDom = processAllOfDom;\n window.processElements = processElements;\n window.scrollToHeight = scrollToHeight;\n var leafElementDenyList = [\"SVG\", \"IFRAME\", \"SCRIPT\", \"STYLE\", \"LINK\"];\n var interactiveElementTypes = [\n \"A\",\n \"BUTTON\",\n \"DETAILS\",\n \"EMBED\",\n \"INPUT\",\n \"LABEL\",\n \"MENU\",\n \"MENUITEM\",\n \"OBJECT\",\n \"SELECT\",\n \"TEXTAREA\",\n \"SUMMARY\"\n ];\n var interactiveRoles = [\n \"button\",\n \"menu\",\n \"menuitem\",\n \"link\",\n \"checkbox\",\n \"radio\",\n \"slider\",\n \"tab\",\n \"tabpanel\",\n \"textbox\",\n \"combobox\",\n \"grid\",\n \"listbox\",\n \"option\",\n \"progressbar\",\n \"scrollbar\",\n \"searchbox\",\n \"switch\",\n \"tree\",\n \"treeitem\",\n \"spinbutton\",\n \"tooltip\"\n ];\n var interactiveAriaRoles = [\"menu\", \"menuitem\", \"button\"];\n var isVisible = (element) => 
{\n const rect = element.getBoundingClientRect();\n if (rect.width === 0 || rect.height === 0 || rect.top < 0 || rect.top > window.innerHeight) {\n return false;\n }\n if (!isTopElement(element, rect)) {\n return false;\n }\n const visible = element.checkVisibility({\n checkOpacity: true,\n checkVisibilityCSS: true\n });\n return visible;\n };\n var isTextVisible = (element) => {\n const range = document.createRange();\n range.selectNodeContents(element);\n const rect = range.getBoundingClientRect();\n if (rect.width === 0 || rect.height === 0 || rect.top < 0 || rect.top > window.innerHeight) {\n return false;\n }\n const parent = element.parentElement;\n if (!parent) {\n return false;\n }\n if (!isTopElement(parent, rect)) {\n return false;\n }\n const visible = parent.checkVisibility({\n checkOpacity: true,\n checkVisibilityCSS: true\n });\n return visible;\n };\n function isTopElement(elem, rect) {\n const points = [\n { x: rect.left + rect.width * 0.25, y: rect.top + rect.height * 0.25 },\n { x: rect.left + rect.width * 0.75, y: rect.top + rect.height * 0.25 },\n { x: rect.left + rect.width * 0.25, y: rect.top + rect.height * 0.75 },\n { x: rect.left + rect.width * 0.75, y: rect.top + rect.height * 0.75 },\n { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 }\n ];\n return points.some((point) => {\n const topEl = document.elementFromPoint(point.x, point.y);\n let current = topEl;\n while (current && current !== document.body) {\n if (current.isSameNode(elem)) {\n return true;\n }\n current = current.parentElement;\n }\n return false;\n });\n }\n var isActive = (element) => {\n if (element.hasAttribute(\"disabled\") || element.hasAttribute(\"hidden\") || element.getAttribute(\"aria-disabled\") === \"true\") {\n return false;\n }\n return true;\n };\n var isInteractiveElement = (element) => {\n const elementType = element.tagName;\n const elementRole = element.getAttribute(\"role\");\n const elementAriaRole = element.getAttribute(\"aria-role\");\n 
return elementType && interactiveElementTypes.includes(elementType) || elementRole && interactiveRoles.includes(elementRole) || elementAriaRole && interactiveAriaRoles.includes(elementAriaRole);\n };\n var isLeafElement = (element) => {\n if (element.textContent === \"\") {\n return false;\n }\n if (element.childNodes.length === 0) {\n return !leafElementDenyList.includes(element.tagName);\n }\n if (element.childNodes.length === 1 && isTextNode(element.childNodes[0])) {\n return true;\n }\n return false;\n };\n async function pickChunk(chunksSeen) {\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const chunks = Math.ceil(documentHeight / viewportHeight);\n const chunksArray = Array.from({ length: chunks }, (_, i) => i);\n const chunksRemaining = chunksArray.filter((chunk2) => {\n return !chunksSeen.includes(chunk2);\n });\n const currentScrollPosition = window.scrollY;\n const closestChunk = chunksRemaining.reduce((closest, current) => {\n const currentChunkTop = viewportHeight * current;\n const closestChunkTop = viewportHeight * closest;\n return Math.abs(currentScrollPosition - currentChunkTop) < Math.abs(currentScrollPosition - closestChunkTop) ? 
current : closest;\n }, chunksRemaining[0]);\n const chunk = closestChunk;\n if (chunk === void 0) {\n throw new Error(`No chunks remaining to check: ${chunksRemaining}`);\n }\n return {\n chunk,\n chunksArray\n };\n }\n\n // lib/dom/debug.ts\n async function debugDom() {\n window.chunkNumber = 0;\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n }\n function multiSelectorMapToSelectorMap(multiSelectorMap) {\n return Object.fromEntries(\n Object.entries(multiSelectorMap).map(([key, selectors]) => [\n Number(key),\n selectors[0]\n ])\n );\n }\n function drawChunk(selectorMap) {\n if (!window.showChunks) return;\n cleanupMarkers();\n Object.values(selectorMap).forEach((selector) => {\n const element = document.evaluate(\n selector,\n document,\n null,\n XPathResult.FIRST_ORDERED_NODE_TYPE,\n null\n ).singleNodeValue;\n if (element) {\n let rect;\n if (element.nodeType === Node.ELEMENT_NODE) {\n rect = element.getBoundingClientRect();\n } else {\n const range = document.createRange();\n range.selectNodeContents(element);\n rect = range.getBoundingClientRect();\n }\n const color = \"grey\";\n const overlay = document.createElement(\"div\");\n overlay.style.position = \"absolute\";\n overlay.style.left = `${rect.left + window.scrollX}px`;\n overlay.style.top = `${rect.top + window.scrollY}px`;\n overlay.style.padding = \"2px\";\n overlay.style.width = `${rect.width}px`;\n overlay.style.height = `${rect.height}px`;\n overlay.style.backgroundColor = color;\n overlay.className = \"stagehand-marker\";\n overlay.style.opacity = \"0.3\";\n overlay.style.zIndex = \"1000000000\";\n overlay.style.border = \"1px solid\";\n overlay.style.pointerEvents = \"none\";\n document.body.appendChild(overlay);\n }\n });\n }\n async function cleanupDebug() {\n cleanupMarkers();\n cleanupNav();\n }\n function cleanupMarkers() 
{\n const markers = document.querySelectorAll(\".stagehand-marker\");\n markers.forEach((marker) => {\n marker.remove();\n });\n }\n function cleanupNav() {\n const stagehandNavElements = document.querySelectorAll(\".stagehand-nav\");\n stagehandNavElements.forEach((element) => {\n element.remove();\n });\n }\n function setupChunkNav() {\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const totalChunks = Math.ceil(documentHeight / viewportHeight);\n if (window.chunkNumber > 0) {\n const prevChunkButton = document.createElement(\"button\");\n prevChunkButton.className = \"stagehand-nav\";\n prevChunkButton.textContent = \"Previous\";\n prevChunkButton.style.marginLeft = \"50px\";\n prevChunkButton.style.position = \"fixed\";\n prevChunkButton.style.bottom = \"10px\";\n prevChunkButton.style.left = \"50%\";\n prevChunkButton.style.transform = \"translateX(-50%)\";\n prevChunkButton.style.zIndex = \"1000000000\";\n prevChunkButton.onclick = async () => {\n cleanupMarkers();\n cleanupNav();\n window.chunkNumber -= 1;\n window.scrollTo(0, window.chunkNumber * viewportHeight);\n await window.waitForDomSettle();\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n };\n document.body.appendChild(prevChunkButton);\n }\n if (totalChunks > window.chunkNumber) {\n const nextChunkButton = document.createElement(\"button\");\n nextChunkButton.className = \"stagehand-nav\";\n nextChunkButton.textContent = \"Next\";\n nextChunkButton.style.marginRight = \"50px\";\n nextChunkButton.style.position = \"fixed\";\n nextChunkButton.style.bottom = \"10px\";\n nextChunkButton.style.right = \"50%\";\n nextChunkButton.style.transform = \"translateX(50%)\";\n nextChunkButton.style.zIndex = \"1000000000\";\n nextChunkButton.onclick = async () => {\n 
cleanupMarkers();\n cleanupNav();\n window.chunkNumber += 1;\n window.scrollTo(0, window.chunkNumber * viewportHeight);\n await window.waitForDomSettle();\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n };\n document.body.appendChild(nextChunkButton);\n }\n }\n window.debugDom = debugDom;\n window.cleanupDebug = cleanupDebug;\n})();\n"; | ||
export const scriptContent = "(() => {\n // lib/dom/xpathUtils.ts\n function getParentElement(node) {\n return isElementNode(node) ? node.parentElement : node.parentNode;\n }\n function getCombinations(attributes, size) {\n const results = [];\n function helper(start, combo) {\n if (combo.length === size) {\n results.push([...combo]);\n return;\n }\n for (let i = start; i < attributes.length; i++) {\n combo.push(attributes[i]);\n helper(i + 1, combo);\n combo.pop();\n }\n }\n helper(0, []);\n return results;\n }\n function isXPathFirstResultElement(xpath, target) {\n try {\n const result = document.evaluate(\n xpath,\n document.documentElement,\n null,\n XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,\n null\n );\n return result.snapshotItem(0) === target;\n } catch (error) {\n console.warn(`Invalid XPath expression: ${xpath}`, error);\n return false;\n }\n }\n function escapeXPathString(value) {\n if (value.includes(\"'\")) {\n if (value.includes('\"')) {\n return \"concat(\" + value.split(/('+)/).map((part) => {\n if (part === \"'\") {\n return `\"'\"`;\n } else if (part.startsWith(\"'\") && part.endsWith(\"'\")) {\n return `\"${part}\"`;\n } else {\n return `'${part}'`;\n }\n }).join(\",\") + \")\";\n } else {\n return `\"${value}\"`;\n }\n } else {\n return `'${value}'`;\n }\n }\n async function generateXPathsForElement(element) {\n if (!element) return [];\n const [complexXPath, standardXPath, idBasedXPath] = await Promise.all([\n generateComplexXPath(element),\n generateStandardXPath(element),\n generatedIdBasedXPath(element)\n ]);\n return [standardXPath, ...idBasedXPath ? 
[idBasedXPath] : [], complexXPath];\n }\n async function generateComplexXPath(element) {\n const parts = [];\n let currentElement = element;\n while (currentElement && (isTextNode(currentElement) || isElementNode(currentElement))) {\n if (isElementNode(currentElement)) {\n const el = currentElement;\n let selector = el.tagName.toLowerCase();\n const attributePriority = [\n \"data-qa\",\n \"data-component\",\n \"data-role\",\n \"role\",\n \"aria-role\",\n \"type\",\n \"name\",\n \"aria-label\",\n \"placeholder\",\n \"title\",\n \"alt\"\n ];\n const attributes = attributePriority.map((attr) => {\n let value = el.getAttribute(attr);\n if (attr === \"href-full\" && value) {\n value = el.getAttribute(\"href\");\n }\n return value ? { attr: attr === \"href-full\" ? \"href\" : attr, value } : null;\n }).filter((attr) => attr !== null);\n let uniqueSelector = \"\";\n for (let i = 1; i <= attributes.length; i++) {\n const combinations = getCombinations(attributes, i);\n for (const combo of combinations) {\n const conditions = combo.map((a) => `@${a.attr}=${escapeXPathString(a.value)}`).join(\" and \");\n const xpath2 = `//${selector}[${conditions}]`;\n if (isXPathFirstResultElement(xpath2, el)) {\n uniqueSelector = xpath2;\n break;\n }\n }\n if (uniqueSelector) break;\n }\n if (uniqueSelector) {\n parts.unshift(uniqueSelector.replace(\"//\", \"\"));\n break;\n } else {\n const parent = getParentElement(el);\n if (parent) {\n const siblings = Array.from(parent.children).filter(\n (sibling) => sibling.tagName === el.tagName\n );\n const index = siblings.indexOf(el) + 1;\n selector += siblings.length > 1 ? 
`[${index}]` : \"\";\n }\n parts.unshift(selector);\n }\n }\n currentElement = getParentElement(currentElement);\n }\n const xpath = \"//\" + parts.join(\"/\");\n return xpath;\n }\n async function generateStandardXPath(element) {\n const parts = [];\n while (element && (isTextNode(element) || isElementNode(element))) {\n let index = 0;\n let hasSameTypeSiblings = false;\n const siblings = element.parentElement ? Array.from(element.parentElement.childNodes) : [];\n for (let i = 0; i < siblings.length; i++) {\n const sibling = siblings[i];\n if (sibling.nodeType === element.nodeType && sibling.nodeName === element.nodeName) {\n index = index + 1;\n hasSameTypeSiblings = true;\n if (sibling.isSameNode(element)) {\n break;\n }\n }\n }\n if (element.nodeName !== \"#text\") {\n const tagName = element.nodeName.toLowerCase();\n const pathIndex = hasSameTypeSiblings ? `[${index}]` : \"\";\n parts.unshift(`${tagName}${pathIndex}`);\n }\n element = element.parentElement;\n }\n return parts.length ? 
`/${parts.join(\"/\")}` : \"\";\n }\n async function generatedIdBasedXPath(element) {\n if (isElementNode(element) && element.id) {\n return `//*[@id='${element.id}']`;\n }\n return null;\n }\n\n // lib/dom/utils.ts\n async function waitForDomSettle() {\n return new Promise((resolve) => {\n const createTimeout = () => {\n return setTimeout(() => {\n resolve();\n }, 2e3);\n };\n let timeout = createTimeout();\n const observer = new MutationObserver(() => {\n clearTimeout(timeout);\n timeout = createTimeout();\n });\n observer.observe(window.document.body, { childList: true, subtree: true });\n });\n }\n window.waitForDomSettle = waitForDomSettle;\n function calculateViewportHeight() {\n return Math.ceil(window.innerHeight * 0.75);\n }\n\n // lib/dom/process.ts\n function isElementNode(node) {\n return node.nodeType === Node.ELEMENT_NODE;\n }\n function isTextNode(node) {\n return node.nodeType === Node.TEXT_NODE && Boolean(node.textContent?.trim());\n }\n async function processDom(chunksSeen) {\n const { chunk, chunksArray } = await pickChunk(chunksSeen);\n const { outputString, selectorMap } = await processElements(chunk);\n console.log(\n `Stagehand (Browser Process): Extracted dom elements:\n${outputString}`\n );\n return {\n outputString,\n selectorMap,\n chunk,\n chunks: chunksArray\n };\n }\n async function processAllOfDom() {\n console.log(\"Stagehand (Browser Process): Processing all of DOM\");\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const totalChunks = Math.ceil(documentHeight / viewportHeight);\n let index = 0;\n const results = [];\n for (let chunk = 0; chunk < totalChunks; chunk++) {\n const result = await processElements(chunk, true, index);\n results.push(result);\n index += Object.keys(result.selectorMap).length;\n }\n await scrollToHeight(0);\n const allOutputString = results.map((result) => result.outputString).join(\"\");\n const allSelectorMap = results.reduce(\n (acc, 
result) => ({ ...acc, ...result.selectorMap }),\n {}\n );\n console.log(\n `Stagehand (Browser Process): All dom elements: ${allOutputString}`\n );\n return {\n outputString: allOutputString,\n selectorMap: allSelectorMap\n };\n }\n async function scrollToHeight(height) {\n window.scrollTo({ top: height, left: 0, behavior: \"smooth\" });\n await new Promise((resolve) => {\n let scrollEndTimer;\n const handleScrollEnd = () => {\n clearTimeout(scrollEndTimer);\n scrollEndTimer = window.setTimeout(() => {\n window.removeEventListener(\"scroll\", handleScrollEnd);\n resolve();\n }, 100);\n };\n window.addEventListener(\"scroll\", handleScrollEnd, { passive: true });\n handleScrollEnd();\n });\n }\n var xpathCache = /* @__PURE__ */ new Map();\n async function processElements(chunk, scrollToChunk = true, indexOffset = 0) {\n console.time(\"processElements:total\");\n const viewportHeight = calculateViewportHeight();\n const chunkHeight = viewportHeight * chunk;\n const maxScrollTop = document.documentElement.scrollHeight - viewportHeight;\n const offsetTop = Math.min(chunkHeight, maxScrollTop);\n if (scrollToChunk) {\n console.time(\"processElements:scroll\");\n await scrollToHeight(offsetTop);\n console.timeEnd(\"processElements:scroll\");\n }\n const candidateElements = [];\n const DOMQueue = [...document.body.childNodes];\n console.log(\"Stagehand (Browser Process): Generating candidate elements\");\n console.time(\"processElements:findCandidates\");\n while (DOMQueue.length > 0) {\n const element = DOMQueue.pop();\n let shouldAddElement = false;\n if (element && isElementNode(element)) {\n const childrenCount = element.childNodes.length;\n for (let i = childrenCount - 1; i >= 0; i--) {\n const child = element.childNodes[i];\n DOMQueue.push(child);\n }\n if (isInteractiveElement(element)) {\n if (isActive(element) && isVisible(element)) {\n shouldAddElement = true;\n }\n }\n if (isLeafElement(element)) {\n if (isActive(element) && isVisible(element)) {\n 
shouldAddElement = true;\n }\n }\n }\n if (element && isTextNode(element) && isTextVisible(element)) {\n shouldAddElement = true;\n }\n if (shouldAddElement) {\n candidateElements.push(element);\n }\n }\n console.timeEnd(\"processElements:findCandidates\");\n const selectorMap = {};\n let outputString = \"\";\n console.log(\n `Stagehand (Browser Process): Processing candidate elements: ${candidateElements.length}`\n );\n console.time(\"processElements:processCandidates\");\n console.time(\"processElements:generateXPaths\");\n const xpathLists = await Promise.all(\n candidateElements.map(async (element) => {\n if (xpathCache.has(element)) {\n return xpathCache.get(element);\n }\n const xpaths = await generateXPathsForElement(element);\n xpathCache.set(element, xpaths);\n return xpaths;\n })\n );\n console.timeEnd(\"processElements:generateXPaths\");\n candidateElements.forEach((element, index) => {\n const xpaths = xpathLists[index];\n let elementOutput = \"\";\n if (isTextNode(element)) {\n const textContent = element.textContent?.trim();\n if (textContent) {\n elementOutput += `${index + indexOffset}:${textContent}\n`;\n }\n } else if (isElementNode(element)) {\n const tagName = element.tagName.toLowerCase();\n const attributes = collectEssentialAttributes(element);\n const openingTag = `<${tagName}${attributes ? 
\" \" + attributes : \"\"}>`;\n const closingTag = `</${tagName}>`;\n const textContent = element.textContent?.trim() || \"\";\n elementOutput += `${index + indexOffset}:${openingTag}${textContent}${closingTag}\n`;\n }\n outputString += elementOutput;\n selectorMap[index + indexOffset] = xpaths;\n });\n console.timeEnd(\"processElements:processCandidates\");\n console.timeEnd(\"processElements:total\");\n return {\n outputString,\n selectorMap\n };\n }\n function collectEssentialAttributes(element) {\n const essentialAttributes = [\n \"id\",\n \"class\",\n \"href\",\n \"src\",\n \"aria-label\",\n \"aria-name\",\n \"aria-role\",\n \"aria-description\",\n \"aria-expanded\",\n \"aria-haspopup\",\n \"type\",\n \"value\"\n ];\n const attrs = essentialAttributes.map((attr) => {\n const value = element.getAttribute(attr);\n return value ? `${attr}=\"${value}\"` : \"\";\n }).filter((attr) => attr !== \"\");\n Array.from(element.attributes).forEach((attr) => {\n if (attr.name.startsWith(\"data-\")) {\n attrs.push(`${attr.name}=\"${attr.value}\"`);\n }\n });\n return attrs.join(\" \");\n }\n function storeDOM() {\n const originalDOM = document.body.cloneNode(true);\n console.log(\"DOM state stored.\");\n return originalDOM.outerHTML;\n }\n function restoreDOM(storedDOM) {\n console.log(\"Restoring DOM\");\n if (storedDOM) {\n document.body.innerHTML = storedDOM;\n } else {\n console.error(\"No DOM state was provided.\");\n }\n }\n function createTextBoundingBoxes() {\n const style = document.createElement(\"style\");\n document.head.appendChild(style);\n if (style.sheet) {\n style.sheet.insertRule(\n `\n .stagehand-highlighted-word, .stagehand-space {\n border: 0px solid orange;\n display: inline-block !important;\n visibility: visible;\n }\n `,\n 0\n );\n style.sheet.insertRule(\n `\n code .stagehand-highlighted-word, code .stagehand-space,\n pre .stagehand-highlighted-word, pre .stagehand-space {\n white-space: pre-wrap;\n display: inline !important;\n }\n `,\n 1\n );\n 
}\n function applyHighlighting(root) {\n root.querySelectorAll(\"body *\").forEach((element) => {\n if (element.closest(\".stagehand-nav, .stagehand-marker\")) {\n return;\n }\n if ([\"SCRIPT\", \"STYLE\", \"IFRAME\", \"INPUT\", \"TEXTAREA\"].includes(\n element.tagName\n )) {\n return;\n }\n const childNodes = Array.from(element.childNodes);\n childNodes.forEach((node) => {\n if (node.nodeType === 3 && node.textContent?.trim().length > 0) {\n const textContent = node.textContent.replace(/\\u00A0/g, \" \");\n const tokens = textContent.split(/(\\s+)/g);\n const fragment = document.createDocumentFragment();\n const parentIsCode = element.tagName === \"CODE\";\n tokens.forEach((token) => {\n const span = document.createElement(\"span\");\n span.textContent = token;\n if (parentIsCode) {\n span.style.whiteSpace = \"pre-wrap\";\n span.style.display = \"inline\";\n }\n span.className = token.trim().length === 0 ? \"stagehand-space\" : \"stagehand-highlighted-word\";\n fragment.appendChild(span);\n });\n if (fragment.childNodes.length > 0 && node.parentNode) {\n element.insertBefore(fragment, node);\n node.remove();\n }\n }\n });\n });\n }\n applyHighlighting(document);\n document.querySelectorAll(\"iframe\").forEach((iframe) => {\n try {\n iframe.contentWindow?.postMessage({ action: \"highlight\" }, \"*\");\n } catch (error) {\n console.error(\"Error accessing iframe content: \", error);\n }\n });\n }\n function getElementBoundingBoxes(xpath) {\n const element = document.evaluate(\n xpath,\n document,\n null,\n XPathResult.FIRST_ORDERED_NODE_TYPE,\n null\n ).singleNodeValue;\n if (!element) return [];\n const isValidText = (text) => text && text.trim().length > 0;\n let dropDownElem = element.querySelector(\"option[selected]\");\n if (!dropDownElem) {\n dropDownElem = element.querySelector(\"option\");\n }\n if (dropDownElem) {\n const elemText = dropDownElem.textContent || \"\";\n if (isValidText(elemText)) {\n const parentRect = element.getBoundingClientRect();\n 
return [\n {\n text: elemText.trim(),\n top: parentRect.top + window.scrollY,\n left: parentRect.left + window.scrollX,\n width: parentRect.width,\n height: parentRect.height\n }\n ];\n } else {\n return [];\n }\n }\n let placeholderText = \"\";\n if ((element.tagName.toLowerCase() === \"input\" || element.tagName.toLowerCase() === \"textarea\") && element.placeholder) {\n placeholderText = element.placeholder;\n } else if (element.tagName.toLowerCase() === \"a\") {\n placeholderText = \"\";\n } else if (element.tagName.toLowerCase() === \"img\") {\n placeholderText = element.alt || \"\";\n }\n const words = element.querySelectorAll(\n \".stagehand-highlighted-word\"\n );\n const boundingBoxes = Array.from(words).map((word) => {\n const rect = word.getBoundingClientRect();\n return {\n text: word.innerText || \"\",\n top: rect.top + window.scrollY,\n left: rect.left + window.scrollX,\n width: rect.width,\n height: rect.height * 0.75\n };\n }).filter(\n (box) => box.width > 0 && box.height > 0 && box.top >= 0 && box.left >= 0 && isValidText(box.text)\n );\n if (boundingBoxes.length === 0) {\n const elementRect = element.getBoundingClientRect();\n return [\n {\n text: placeholderText,\n top: elementRect.top + window.scrollY,\n left: elementRect.left + window.scrollX,\n width: elementRect.width,\n height: elementRect.height * 0.75\n }\n ];\n }\n return boundingBoxes;\n }\n window.processDom = processDom;\n window.processAllOfDom = processAllOfDom;\n window.processElements = processElements;\n window.scrollToHeight = scrollToHeight;\n window.storeDOM = storeDOM;\n window.restoreDOM = restoreDOM;\n window.createTextBoundingBoxes = createTextBoundingBoxes;\n window.getElementBoundingBoxes = getElementBoundingBoxes;\n var leafElementDenyList = [\"SVG\", \"IFRAME\", \"SCRIPT\", \"STYLE\", \"LINK\"];\n var interactiveElementTypes = [\n \"A\",\n \"BUTTON\",\n \"DETAILS\",\n \"EMBED\",\n \"INPUT\",\n \"LABEL\",\n \"MENU\",\n \"MENUITEM\",\n \"OBJECT\",\n \"SELECT\",\n 
\"TEXTAREA\",\n \"SUMMARY\"\n ];\n var interactiveRoles = [\n \"button\",\n \"menu\",\n \"menuitem\",\n \"link\",\n \"checkbox\",\n \"radio\",\n \"slider\",\n \"tab\",\n \"tabpanel\",\n \"textbox\",\n \"combobox\",\n \"grid\",\n \"listbox\",\n \"option\",\n \"progressbar\",\n \"scrollbar\",\n \"searchbox\",\n \"switch\",\n \"tree\",\n \"treeitem\",\n \"spinbutton\",\n \"tooltip\"\n ];\n var interactiveAriaRoles = [\"menu\", \"menuitem\", \"button\"];\n var isVisible = (element) => {\n const rect = element.getBoundingClientRect();\n if (rect.width === 0 || rect.height === 0 || rect.top < 0 || rect.top > window.innerHeight) {\n return false;\n }\n if (!isTopElement(element, rect)) {\n return false;\n }\n const visible = element.checkVisibility({\n checkOpacity: true,\n checkVisibilityCSS: true\n });\n return visible;\n };\n var isTextVisible = (element) => {\n const range = document.createRange();\n range.selectNodeContents(element);\n const rect = range.getBoundingClientRect();\n if (rect.width === 0 || rect.height === 0 || rect.top < 0 || rect.top > window.innerHeight) {\n return false;\n }\n const parent = element.parentElement;\n if (!parent) {\n return false;\n }\n if (!isTopElement(parent, rect)) {\n return false;\n }\n const visible = parent.checkVisibility({\n checkOpacity: true,\n checkVisibilityCSS: true\n });\n return visible;\n };\n function isTopElement(elem, rect) {\n const points = [\n { x: rect.left + rect.width * 0.25, y: rect.top + rect.height * 0.25 },\n { x: rect.left + rect.width * 0.75, y: rect.top + rect.height * 0.25 },\n { x: rect.left + rect.width * 0.25, y: rect.top + rect.height * 0.75 },\n { x: rect.left + rect.width * 0.75, y: rect.top + rect.height * 0.75 },\n { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 }\n ];\n return points.some((point) => {\n const topEl = document.elementFromPoint(point.x, point.y);\n let current = topEl;\n while (current && current !== document.body) {\n if (current.isSameNode(elem)) {\n return 
true;\n }\n current = current.parentElement;\n }\n return false;\n });\n }\n var isActive = (element) => {\n if (element.hasAttribute(\"disabled\") || element.hasAttribute(\"hidden\") || element.getAttribute(\"aria-disabled\") === \"true\") {\n return false;\n }\n return true;\n };\n var isInteractiveElement = (element) => {\n const elementType = element.tagName;\n const elementRole = element.getAttribute(\"role\");\n const elementAriaRole = element.getAttribute(\"aria-role\");\n return elementType && interactiveElementTypes.includes(elementType) || elementRole && interactiveRoles.includes(elementRole) || elementAriaRole && interactiveAriaRoles.includes(elementAriaRole);\n };\n var isLeafElement = (element) => {\n if (element.textContent === \"\") {\n return false;\n }\n if (element.childNodes.length === 0) {\n return !leafElementDenyList.includes(element.tagName);\n }\n if (element.childNodes.length === 1 && isTextNode(element.childNodes[0])) {\n return true;\n }\n return false;\n };\n async function pickChunk(chunksSeen) {\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const chunks = Math.ceil(documentHeight / viewportHeight);\n const chunksArray = Array.from({ length: chunks }, (_, i) => i);\n const chunksRemaining = chunksArray.filter((chunk2) => {\n return !chunksSeen.includes(chunk2);\n });\n const currentScrollPosition = window.scrollY;\n const closestChunk = chunksRemaining.reduce((closest, current) => {\n const currentChunkTop = viewportHeight * current;\n const closestChunkTop = viewportHeight * closest;\n return Math.abs(currentScrollPosition - currentChunkTop) < Math.abs(currentScrollPosition - closestChunkTop) ? 
current : closest;\n }, chunksRemaining[0]);\n const chunk = closestChunk;\n if (chunk === void 0) {\n throw new Error(`No chunks remaining to check: ${chunksRemaining}`);\n }\n return {\n chunk,\n chunksArray\n };\n }\n\n // lib/dom/debug.ts\n async function debugDom() {\n window.chunkNumber = 0;\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n }\n function multiSelectorMapToSelectorMap(multiSelectorMap) {\n return Object.fromEntries(\n Object.entries(multiSelectorMap).map(([key, selectors]) => [\n Number(key),\n selectors[0]\n ])\n );\n }\n function drawChunk(selectorMap) {\n if (!window.showChunks) return;\n cleanupMarkers();\n Object.values(selectorMap).forEach((selector) => {\n const element = document.evaluate(\n selector,\n document,\n null,\n XPathResult.FIRST_ORDERED_NODE_TYPE,\n null\n ).singleNodeValue;\n if (element) {\n let rect;\n if (element.nodeType === Node.ELEMENT_NODE) {\n rect = element.getBoundingClientRect();\n } else {\n const range = document.createRange();\n range.selectNodeContents(element);\n rect = range.getBoundingClientRect();\n }\n const color = \"grey\";\n const overlay = document.createElement(\"div\");\n overlay.style.position = \"absolute\";\n overlay.style.left = `${rect.left + window.scrollX}px`;\n overlay.style.top = `${rect.top + window.scrollY}px`;\n overlay.style.padding = \"2px\";\n overlay.style.width = `${rect.width}px`;\n overlay.style.height = `${rect.height}px`;\n overlay.style.backgroundColor = color;\n overlay.className = \"stagehand-marker\";\n overlay.style.opacity = \"0.3\";\n overlay.style.zIndex = \"1000000000\";\n overlay.style.border = \"1px solid\";\n overlay.style.pointerEvents = \"none\";\n document.body.appendChild(overlay);\n }\n });\n }\n async function cleanupDebug() {\n cleanupMarkers();\n cleanupNav();\n }\n function cleanupMarkers() 
{\n const markers = document.querySelectorAll(\".stagehand-marker\");\n markers.forEach((marker) => {\n marker.remove();\n });\n }\n function cleanupNav() {\n const stagehandNavElements = document.querySelectorAll(\".stagehand-nav\");\n stagehandNavElements.forEach((element) => {\n element.remove();\n });\n }\n function setupChunkNav() {\n const viewportHeight = calculateViewportHeight();\n const documentHeight = document.documentElement.scrollHeight;\n const totalChunks = Math.ceil(documentHeight / viewportHeight);\n if (window.chunkNumber > 0) {\n const prevChunkButton = document.createElement(\"button\");\n prevChunkButton.className = \"stagehand-nav\";\n prevChunkButton.textContent = \"Previous\";\n prevChunkButton.style.marginLeft = \"50px\";\n prevChunkButton.style.position = \"fixed\";\n prevChunkButton.style.bottom = \"10px\";\n prevChunkButton.style.left = \"50%\";\n prevChunkButton.style.transform = \"translateX(-50%)\";\n prevChunkButton.style.zIndex = \"1000000000\";\n prevChunkButton.onclick = async () => {\n cleanupMarkers();\n cleanupNav();\n window.chunkNumber -= 1;\n window.scrollTo(0, window.chunkNumber * viewportHeight);\n await window.waitForDomSettle();\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n };\n document.body.appendChild(prevChunkButton);\n }\n if (totalChunks > window.chunkNumber) {\n const nextChunkButton = document.createElement(\"button\");\n nextChunkButton.className = \"stagehand-nav\";\n nextChunkButton.textContent = \"Next\";\n nextChunkButton.style.marginRight = \"50px\";\n nextChunkButton.style.position = \"fixed\";\n nextChunkButton.style.bottom = \"10px\";\n nextChunkButton.style.right = \"50%\";\n nextChunkButton.style.transform = \"translateX(50%)\";\n nextChunkButton.style.zIndex = \"1000000000\";\n nextChunkButton.onclick = async () => {\n 
cleanupMarkers();\n cleanupNav();\n window.chunkNumber += 1;\n window.scrollTo(0, window.chunkNumber * viewportHeight);\n await window.waitForDomSettle();\n const { selectorMap: multiSelectorMap } = await window.processElements(\n window.chunkNumber\n );\n const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);\n drawChunk(selectorMap);\n setupChunkNav();\n };\n document.body.appendChild(nextChunkButton);\n }\n }\n window.debugDom = debugDom;\n window.cleanupDebug = cleanupDebug;\n})();\n"; |
@@ -27,3 +27,13 @@ export {}; | ||
__PW_inspect?: unknown; | ||
storeDOM: () => string; | ||
restoreDOM: (storedDOM: string) => void; | ||
createTextBoundingBoxes: () => void; | ||
getElementBoundingBoxes: (xpath: string) => Array<{ | ||
text: string; | ||
top: number; | ||
left: number; | ||
width: number; | ||
height: number; | ||
}>; | ||
} | ||
} |
@@ -243,2 +243,195 @@ import { generateXPathsForElement as generateXPaths } from "./xpathUtils"; | ||
/**
 * Captures the current `<body>` as an HTML string so it can later be handed
 * to `restoreDOM`.
 *
 * The markup is serialised straight from the live `<body>`; a deep clone is
 * not needed because the returned string is already an independent snapshot.
 *
 * @returns The `outerHTML` of `document.body` at the time of the call.
 */
export function storeDOM(): string {
  const serializedBody = document.body.outerHTML;
  console.log("DOM state stored.");
  return serializedBody;
}
/**
 * Replaces the contents of `document.body` with previously stored markup.
 *
 * When `storedDOM` is empty nothing is written; an error is logged instead.
 *
 * @param storedDOM - HTML string to assign to `document.body.innerHTML`.
 */
export function restoreDOM(storedDOM: string): void {
  console.log("Restoring DOM");

  // Guard: without a snapshot there is nothing to restore.
  if (!storedDOM) {
    console.error("No DOM state was provided.");
    return;
  }

  document.body.innerHTML = storedDOM;
}
/**
 * Wraps every whitespace-delimited token of visible text in its own `<span>`
 * so that each word can be measured individually.
 *
 * Words get the class `stagehand-highlighted-word`, whitespace runs get
 * `stagehand-space`. A small stylesheet that styles both classes is injected
 * into `document.head` once; repeated calls reuse it instead of appending a
 * new `<style>` element every time.
 *
 * This mutates the page in place and does not undo the wrapping itself.
 */
export function createTextBoundingBoxes(): void {
  const STYLE_ELEMENT_ID = "stagehand-text-bounding-box-styles";
  const TEXT_NODE_TYPE = 3; // numeric value of Node.TEXT_NODE
  const SKIPPED_TAGS = ["SCRIPT", "STYLE", "IFRAME", "INPUT", "TEXTAREA"];

  // Inject the helper stylesheet only once. The style element lives in <head>,
  // so replacing the body's markup does not remove it between calls.
  if (!document.getElementById(STYLE_ELEMENT_ID)) {
    const style = document.createElement("style");
    style.id = STYLE_ELEMENT_ID;
    document.head.appendChild(style);
    if (style.sheet) {
      style.sheet.insertRule(
        `
        .stagehand-highlighted-word, .stagehand-space {
          border: 0px solid orange;
          display: inline-block !important;
          visibility: visible;
        }
        `,
        0,
      );
      style.sheet.insertRule(
        `
        code .stagehand-highlighted-word, code .stagehand-space,
        pre .stagehand-highlighted-word, pre .stagehand-space {
          white-space: pre-wrap;
          display: inline !important;
        }
        `,
        1,
      );
    }
  }

  /** Replaces each non-blank direct text node under `root` with per-token spans. */
  function applyHighlighting(root: Document | HTMLElement): void {
    root.querySelectorAll("body *").forEach((element) => {
      // Leave Stagehand's own debug overlays untouched.
      if (element.closest(".stagehand-nav, .stagehand-marker")) {
        return;
      }
      if (SKIPPED_TAGS.includes(element.tagName)) {
        return;
      }

      const parentIsCode = element.tagName === "CODE";

      // Snapshot the child list: it is mutated while we iterate.
      Array.from(element.childNodes).forEach((node) => {
        const rawText = node.textContent ?? "";
        if (node.nodeType !== TEXT_NODE_TYPE || rawText.trim().length === 0) {
          return;
        }

        // Treat non-breaking spaces as ordinary spaces, split while keeping
        // the whitespace runs, and drop the empty strings that split() emits
        // when the text starts or ends with whitespace.
        const tokens = rawText
          .replace(/\u00A0/g, " ")
          .split(/(\s+)/g)
          .filter((token) => token.length > 0);

        const fragment = document.createDocumentFragment();
        tokens.forEach((token) => {
          const span = document.createElement("span");
          span.textContent = token;
          if (parentIsCode) {
            // Special handling for <code> tags
            span.style.whiteSpace = "pre-wrap";
            span.style.display = "inline";
          }
          span.className =
            token.trim().length === 0
              ? "stagehand-space"
              : "stagehand-highlighted-word";
          fragment.appendChild(span);
        });

        if (fragment.childNodes.length > 0 && node.parentNode) {
          element.insertBefore(fragment, node);
          node.remove();
        }
      });
    });
  }

  applyHighlighting(document);

  // Ask every iframe to highlight itself. Whatever handles this message
  // inside the frame (if anything) is not visible from this function.
  document.querySelectorAll("iframe").forEach((iframe) => {
    try {
      iframe.contentWindow?.postMessage({ action: "highlight" }, "*");
    } catch (error) {
      console.error("Error accessing iframe content: ", error);
    }
  });
}
/**
 * Returns page-relative bounding boxes for the text of the element that
 * `xpath` resolves to.
 *
 * Resolution order:
 * 1. If the XPath matches nothing, or matches a non-element node, the result
 *    is `[]`.
 * 2. If the element is (or contains) a dropdown option, a single box covering
 *    the whole element is returned with the option's text. For a `<select>`
 *    the currently selected option is preferred; otherwise the first
 *    `option[selected]`, then the first `option`. A blank option text yields
 *    `[]`.
 * 3. Otherwise one box is returned per `.stagehand-highlighted-word`
 *    descendant that has a non-zero size, non-negative page position and
 *    non-blank text.
 * 4. If no such word qualifies, a single box for the element itself is
 *    returned, labelled with the `placeholder` (input/textarea), the `alt`
 *    (img), or an empty string.
 *
 * Coordinates are viewport rects offset by the current scroll position.
 *
 * @param xpath - XPath expression evaluated against `document`.
 * @returns The list of text boxes as described above.
 */
export function getElementBoundingBoxes(xpath: string): Array<{
  text: string;
  top: number;
  left: number;
  width: number;
  height: number;
}> {
  const ELEMENT_NODE_TYPE = 1; // numeric value of Node.ELEMENT_NODE
  // Word and fallback boxes are reported at 75% of their measured height.
  // NOTE(review): the rationale for 0.75 is not visible here — confirm.
  const TEXT_HEIGHT_SCALE = 0.75;

  const node = document.evaluate(
    xpath,
    document,
    null,
    XPathResult.FIRST_ORDERED_NODE_TYPE,
    null,
  ).singleNodeValue;
  // An XPath may resolve to a text or attribute node, which has none of the
  // element APIs used below.
  if (!node || node.nodeType !== ELEMENT_NODE_TYPE) return [];
  const element = node as HTMLElement;
  const tagName = element.tagName.toLowerCase();

  const isValidText = (text: string): boolean => text.trim().length > 0;

  /** Converts a viewport rect into a page-relative text box. */
  const toPageBox = (text: string, rect: DOMRect, heightScale = 1) => ({
    text,
    top: rect.top + window.scrollY,
    left: rect.left + window.scrollX,
    width: rect.width,
    height: rect.height * heightScale,
  });

  // The `selected` attribute only reflects the default choice, so for a
  // <select> ask the control for its current selection first.
  const currentOption: Element | null =
    tagName === "select"
      ? ((element as HTMLSelectElement).selectedOptions?.[0] ?? null)
      : null;
  const dropDownElem =
    currentOption ??
    element.querySelector("option[selected]") ??
    element.querySelector("option");

  if (dropDownElem) {
    const optionText = dropDownElem.textContent || "";
    return isValidText(optionText)
      ? [toPageBox(optionText.trim(), element.getBoundingClientRect())]
      : [];
  }

  // Label used for the fallback box when the element has no measurable words.
  let placeholderText = "";
  if (tagName === "input" || tagName === "textarea") {
    placeholderText = (element as HTMLInputElement).placeholder || "";
  } else if (tagName === "img") {
    placeholderText = (element as HTMLImageElement).alt || "";
  }

  const words = element.querySelectorAll(
    ".stagehand-highlighted-word",
  ) as NodeListOf<HTMLElement>;

  const boundingBoxes = Array.from(words)
    .map((word) =>
      toPageBox(
        word.innerText || "",
        word.getBoundingClientRect(),
        TEXT_HEIGHT_SCALE,
      ),
    )
    .filter(
      (box) =>
        box.width > 0 &&
        box.height > 0 &&
        box.top >= 0 &&
        box.left >= 0 &&
        isValidText(box.text),
    );

  if (boundingBoxes.length === 0) {
    return [
      toPageBox(
        placeholderText,
        element.getBoundingClientRect(),
        TEXT_HEIGHT_SCALE,
      ),
    ];
  }

  return boundingBoxes;
}
// Publish the DOM helpers as properties of `window`.
// NOTE(review): presumably so they can be called from inside the page context — confirm against callers.
window.processDom = processDom; | ||
@@ -248,2 +441,6 @@ window.processAllOfDom = processAllOfDom; | ||
window.scrollToHeight = scrollToHeight; | ||
// Snapshot/restore and text-bounding-box helpers.
window.storeDOM = storeDOM; | ||
window.restoreDOM = restoreDOM; | ||
window.createTextBoundingBoxes = createTextBoundingBoxes; | ||
window.getElementBoundingBoxes = getElementBoundingBoxes; | ||
@@ -250,0 +447,0 @@ const leafElementDenyList = ["SVG", "IFRAME", "SCRIPT", "STYLE", "LINK"]; |
@@ -5,5 +5,79 @@ import { LLMProvider } from "../llm/LLMProvider"; | ||
import { LogLine } from "../../types/log"; | ||
import { TextAnnotation } from "../../types/textannotation"; | ||
import { extract } from "../inference"; | ||
import { LLMClient } from "../llm/LLMClient"; | ||
import { formatText } from "../utils"; | ||
const PROXIMITY_THRESHOLD = 15; | ||
/** | ||
* The `StagehandExtractHandler` class is responsible for extracting structured data from a webpage. | ||
* It provides two approaches: `textExtract` and `domExtract`. `domExtract` is used unless `useTextExtract` is set to `true`. | ||
* | ||
* Here is what `textExtract` does at a high level: | ||
* | ||
* **1. Wait for the DOM to settle and start DOM debugging.** | ||
* - Ensures the page is fully loaded and stable before extraction. | ||
* | ||
* **2. Store the original DOM before any mutations.** | ||
* - Preserves the initial state of the DOM to restore later. | ||
* - We do this because creating spans around every word in the DOM (see step 4) | ||
* becomes very difficult to revert. Text nodes can be finicky, and directly | ||
* removing the added spans often corrupts the structure of the DOM. | ||
* | ||
* **3. Process the DOM to generate a selector map of candidate elements.** | ||
* - Identifies potential elements that contain the data to extract. | ||
* | ||
* **4. Create text bounding boxes around every word in the webpage.** | ||
* - Wraps words in spans so that their bounding boxes can be used to | ||
* determine their positions on the text-rendered-webpage. | ||
* | ||
* **5. Collect all text annotations (with positions and dimensions) from each of the candidate elements.** | ||
* - Gathers text and positional data for each word. | ||
* | ||
* **6. Group annotations by text and deduplicate them based on proximity.** | ||
* - There is no guarantee that the text annotations are unique (candidate elements can be nested). | ||
* - Thus, we must remove duplicate words that are close to each other on the page. | ||
* | ||
* **7. Restore the original DOM after mutations.** | ||
* - Returns the DOM to its original state after processing. | ||
* | ||
* **8. Format the deduplicated annotations into a text representation.** | ||
* - Prepares the text data for the extraction process. | ||
* | ||
* **9. Pass the formatted text to an LLM for extraction according to the given instruction and schema.** | ||
* - Uses a language model to extract structured data based on instructions. | ||
* | ||
* **10. Handle the extraction response and log the results.** | ||
* - Processes the output from the LLM and logs relevant information. | ||
* | ||
* | ||
* Here is what `domExtract` does at a high level: | ||
* | ||
* **1. Wait for the DOM to settle and start DOM debugging.** | ||
* - Ensures the page is fully loaded and stable before extraction. | ||
* | ||
* **2. Process the DOM in chunks.** | ||
* - The `processDom` function: | ||
* - Divides the page into vertical "chunks" based on viewport height. | ||
* - Picks the next chunk that hasn't been processed yet. | ||
* - Scrolls to that chunk and extracts candidate elements. | ||
* - Returns `outputString` (HTML snippets of candidate elements), | ||
* `selectorMap` (the XPaths of the candidate elements), | ||
* `chunk` (the current chunk index), and `chunks` (the array of all chunk indices). | ||
* - This chunk-based approach ensures that large or lengthy pages can be processed in smaller, manageable sections. | ||
* | ||
* **3. Pass the extracted DOM elements (in `outputString`) to the LLM for structured data extraction.** | ||
* - Uses the instructions, schema, and previously extracted content as context to | ||
* guide the LLM in extracting the structured data. | ||
* | ||
* **4. Check if extraction is complete.** | ||
* - If the extraction is complete (all chunks have been processed or the LLM determines | ||
* that we do not need to continue), return the final result. | ||
* - If not, repeat steps 1-4 with the next chunk until extraction is complete or no more chunks remain. | ||
* | ||
* @remarks | ||
* Each step corresponds to specific code segments, as noted in the comments throughout the code. | ||
*/ | ||
export class StagehandExtractHandler { | ||
@@ -64,2 +138,3 @@ private readonly stagehand: Stagehand; | ||
domSettleTimeoutMs, | ||
useTextExtract = false, | ||
}: { | ||
@@ -73,3 +148,41 @@ instruction: string; | ||
domSettleTimeoutMs?: number; | ||
useTextExtract?: boolean; | ||
}): Promise<z.infer<T>> { | ||
if (useTextExtract) { | ||
return this.textExtract({ | ||
instruction, | ||
schema, | ||
content, | ||
llmClient, | ||
requestId, | ||
domSettleTimeoutMs, | ||
}); | ||
} else { | ||
return this.domExtract({ | ||
instruction, | ||
schema, | ||
content, | ||
chunksSeen, | ||
llmClient, | ||
requestId, | ||
domSettleTimeoutMs, | ||
}); | ||
} | ||
} | ||
private async textExtract<T extends z.AnyZodObject>({ | ||
instruction, | ||
schema, | ||
content = {}, | ||
llmClient, | ||
requestId, | ||
domSettleTimeoutMs, | ||
}: { | ||
instruction: string; | ||
schema: T; | ||
content?: z.infer<T>; | ||
llmClient: LLMClient; | ||
requestId?: string; | ||
domSettleTimeoutMs?: number; | ||
}): Promise<z.infer<T>> { | ||
this.logger({ | ||
@@ -87,4 +200,228 @@ category: "extraction", | ||
// **1:** Wait for the DOM to settle and start DOM debugging | ||
await this.waitForSettledDom(domSettleTimeoutMs); | ||
await this.startDomDebug(); | ||
// **2:** Store the original DOM before any mutations | ||
// we need to store the original DOM here because calling createTextBoundingBoxes() | ||
// will mutate the DOM by adding spans around every word | ||
const originalDOM = await this.stagehand.page.evaluate(() => | ||
window.storeDOM(), | ||
); | ||
// **3:** Process the DOM to generate a selector map of candidate elements | ||
const { selectorMap }: { selectorMap: Record<number, string[]> } = | ||
await this.stagehand.page.evaluate(() => window.processAllOfDom()); | ||
this.logger({ | ||
category: "extraction", | ||
message: `received output from processAllOfDom. selectorMap has ${Object.keys(selectorMap).length} entries`, | ||
level: 1, | ||
}); | ||
// **4:** Create text bounding boxes around every word in the webpage | ||
// calling createTextBoundingBoxes() will create a span around every word on the | ||
// webpage. The bounding boxes of these spans will be used to determine their | ||
// positions in the text rendered webpage | ||
await this.stagehand.page.evaluate(() => window.createTextBoundingBoxes()); | ||
const pageWidth = await this.stagehand.page.evaluate( | ||
() => window.innerWidth, | ||
); | ||
const pageHeight = await this.stagehand.page.evaluate( | ||
() => window.innerHeight, | ||
); | ||
// **5:** Collect all text annotations (with positions and dimensions) from the candidate elements | ||
// allAnnotations will store all the TextAnnotations BEFORE deduplication | ||
const allAnnotations: TextAnnotation[] = []; | ||
// here we will loop through all the xpaths in the selectorMap, | ||
// and get the bounding boxes for each one. These are xpaths to "candidate elements" | ||
for (const xpaths of Object.values(selectorMap)) { | ||
const xpath = xpaths[0]; | ||
// boundingBoxes is an array because there may be multiple bounding boxes within a single element | ||
// (since each bounding box is around a single word) | ||
const boundingBoxes: Array<{ | ||
text: string; | ||
left: number; | ||
top: number; | ||
width: number; | ||
height: number; | ||
}> = await this.stagehand.page.evaluate( | ||
(xpath) => window.getElementBoundingBoxes(xpath), | ||
xpath, | ||
); | ||
for (const box of boundingBoxes) { | ||
const bottom_left = { | ||
x: box.left, | ||
y: box.top + box.height, | ||
}; | ||
const bottom_left_normalized = { | ||
x: box.left / pageWidth, | ||
y: (box.top + box.height) / pageHeight, | ||
}; | ||
const annotation: TextAnnotation = { | ||
text: box.text, | ||
bottom_left, | ||
bottom_left_normalized, | ||
width: box.width, | ||
height: box.height, | ||
}; | ||
allAnnotations.push(annotation); | ||
} | ||
} | ||
// **6:** Group annotations by text and deduplicate them based on proximity | ||
const annotationsGroupedByText = new Map<string, TextAnnotation[]>(); | ||
for (const annotation of allAnnotations) { | ||
if (!annotationsGroupedByText.has(annotation.text)) { | ||
annotationsGroupedByText.set(annotation.text, []); | ||
} | ||
annotationsGroupedByText.get(annotation.text)!.push(annotation); | ||
} | ||
const deduplicatedTextAnnotations: TextAnnotation[] = []; | ||
// here, we deduplicate annotations per text group | ||
for (const [text, annotations] of annotationsGroupedByText.entries()) { | ||
for (const annotation of annotations) { | ||
// check if this annotation is close to any existing deduplicated annotation | ||
const isDuplicate = deduplicatedTextAnnotations.some( | ||
(existingAnnotation) => { | ||
if (existingAnnotation.text !== text) return false; | ||
const dx = | ||
existingAnnotation.bottom_left.x - annotation.bottom_left.x; | ||
const dy = | ||
existingAnnotation.bottom_left.y - annotation.bottom_left.y; | ||
const distance = Math.hypot(dx, dy); | ||
// the annotation is a duplicate if it has the same text and its bottom_left | ||
// position is within the PROXIMITY_THRESHOLD of an existing annotation. | ||
// we calculate the Euclidean distance between the two bottom_left points, | ||
// and if the distance is less than PROXIMITY_THRESHOLD, | ||
// the annotation is considered a duplicate. | ||
return distance < PROXIMITY_THRESHOLD; | ||
}, | ||
); | ||
if (!isDuplicate) { | ||
deduplicatedTextAnnotations.push(annotation); | ||
} | ||
} | ||
} | ||
// **7:** Restore the original DOM after mutations | ||
await this.stagehand.page.evaluate( | ||
(dom) => window.restoreDOM(dom), | ||
originalDOM, | ||
); | ||
// **8:** Format the deduplicated annotations into a text representation | ||
const formattedText = formatText(deduplicatedTextAnnotations, pageWidth); | ||
// **9:** Pass the formatted text to an LLM for extraction according to the given instruction and schema | ||
const extractionResponse = await extract({ | ||
instruction, | ||
previouslyExtractedContent: content, | ||
domElements: formattedText, | ||
schema, | ||
chunksSeen: 1, | ||
chunksTotal: 1, | ||
llmClient, | ||
requestId, | ||
}); | ||
const { | ||
metadata: { completed }, | ||
...output | ||
} = extractionResponse; | ||
await this.cleanupDomDebug(); | ||
// **10:** Handle the extraction response and log the results | ||
this.logger({ | ||
category: "extraction", | ||
message: "received extraction response", | ||
auxiliary: { | ||
extraction_response: { | ||
value: JSON.stringify(extractionResponse), | ||
type: "object", | ||
}, | ||
}, | ||
}); | ||
if (completed) { | ||
this.logger({ | ||
category: "extraction", | ||
message: "extraction completed successfully", | ||
level: 1, | ||
auxiliary: { | ||
extraction_response: { | ||
value: JSON.stringify(extractionResponse), | ||
type: "object", | ||
}, | ||
}, | ||
}); | ||
} else { | ||
this.logger({ | ||
category: "extraction", | ||
message: "extraction incomplete after processing all data", | ||
level: 1, | ||
auxiliary: { | ||
extraction_response: { | ||
value: JSON.stringify(extractionResponse), | ||
type: "object", | ||
}, | ||
}, | ||
}); | ||
} | ||
return output; | ||
} | ||
private async domExtract<T extends z.AnyZodObject>({ | ||
instruction, | ||
schema, | ||
content = {}, | ||
chunksSeen = [], | ||
llmClient, | ||
requestId, | ||
domSettleTimeoutMs, | ||
}: { | ||
instruction: string; | ||
schema: T; | ||
content?: z.infer<T>; | ||
chunksSeen?: Array<number>; | ||
llmClient: LLMClient; | ||
requestId?: string; | ||
domSettleTimeoutMs?: number; | ||
}): Promise<z.infer<T>> { | ||
this.logger({ | ||
category: "extraction", | ||
message: "starting extraction using old approach", | ||
level: 1, | ||
auxiliary: { | ||
instruction: { | ||
value: instruction, | ||
type: "string", | ||
}, | ||
}, | ||
}); | ||
// **1:** Wait for the DOM to settle and start DOM debugging | ||
// This ensures the page is stable before extracting any data. | ||
await this.waitForSettledDom(domSettleTimeoutMs); | ||
await this.startDomDebug(); | ||
// **2:** Call processDom() to handle chunk-based extraction | ||
// processDom determines which chunk of the page to process next. | ||
// It will: | ||
// - Identify all chunks (vertical segments of the page), | ||
// - Pick the next unprocessed chunk, | ||
// - Scroll to that chunk's region, | ||
// - Extract candidate elements and their text, | ||
// - Return the extracted text (outputString), a selectorMap (for referencing elements), | ||
// the current chunk index, and the full list of chunks. | ||
const { outputString, chunk, chunks } = await this.stagehand.page.evaluate( | ||
@@ -114,2 +451,5 @@ (chunksSeen?: number[]) => window.processDom(chunksSeen ?? []), | ||
// **3:** Pass the list of candidate HTML snippets to the LLM | ||
// The LLM uses the provided instruction and schema to parse and extract | ||
// structured data. | ||
const extractionResponse = await extract({ | ||
@@ -124,2 +464,3 @@ instruction, | ||
requestId, | ||
isUsingTextExtract: false, | ||
}); | ||
@@ -131,2 +472,3 @@ | ||
} = extractionResponse; | ||
await this.cleanupDomDebug(); | ||
@@ -145,4 +487,8 @@ | ||
// Mark the current chunk as processed by adding it to chunksSeen | ||
chunksSeen.push(chunk); | ||
// **4:** Check if extraction is complete | ||
// If the LLM deems the extraction complete or we've processed all chunks, return the final result. | ||
// Otherwise, call domExtract again for the next chunk. | ||
if (completed || chunksSeen.length === chunks.length) { | ||
@@ -173,3 +519,5 @@ this.logger({ | ||
await this.waitForSettledDom(domSettleTimeoutMs); | ||
return this.extract({ | ||
// Recursively continue with the next chunk | ||
return this.domExtract({ | ||
instruction, | ||
@@ -176,0 +524,0 @@ schema, |
@@ -198,3 +198,3 @@ import { Browserbase } from "@browserbasehq/sdk"; | ||
return { browser, context, debugUrl, sessionUrl }; | ||
return { browser, context, debugUrl, sessionUrl, sessionId }; | ||
} else { | ||
@@ -311,2 +311,4 @@ logger({ | ||
public context: BrowserContext; | ||
public browserbaseSessionID?: string; | ||
private env: "LOCAL" | "BROWSERBASE"; | ||
@@ -382,19 +384,21 @@ private apiKey: string | undefined; | ||
} | ||
const { context, debugUrl, sessionUrl, contextPath } = await getBrowser( | ||
this.apiKey, | ||
this.projectId, | ||
this.env, | ||
this.headless, | ||
this.logger, | ||
this.browserbaseSessionCreateParams, | ||
this.browserbaseResumeSessionID, | ||
).catch((e) => { | ||
console.error("Error in init:", e); | ||
const br: BrowserResult = { | ||
context: undefined, | ||
debugUrl: undefined, | ||
sessionUrl: undefined, | ||
}; | ||
return br; | ||
}); | ||
const { context, debugUrl, sessionUrl, contextPath, sessionId } = | ||
await getBrowser( | ||
this.apiKey, | ||
this.projectId, | ||
this.env, | ||
this.headless, | ||
this.logger, | ||
this.browserbaseSessionCreateParams, | ||
this.browserbaseResumeSessionID, | ||
).catch((e) => { | ||
console.error("Error in init:", e); | ||
const br: BrowserResult = { | ||
context: undefined, | ||
debugUrl: undefined, | ||
sessionUrl: undefined, | ||
sessionId: undefined, | ||
}; | ||
return br; | ||
}); | ||
this.contextPath = contextPath; | ||
@@ -461,4 +465,5 @@ this.context = context; | ||
}); | ||
this.browserbaseSessionID = sessionId; | ||
return { debugUrl, sessionUrl }; | ||
return { debugUrl, sessionUrl, sessionId }; | ||
} | ||
@@ -772,2 +777,3 @@ | ||
domSettleTimeoutMs, | ||
useTextExtract, | ||
}: ExtractOptions<T>): Promise<ExtractResult<T>> { | ||
@@ -810,2 +816,3 @@ if (!this.extractHandler) { | ||
domSettleTimeoutMs, | ||
useTextExtract, | ||
}) | ||
@@ -812,0 +819,0 @@ .catch((e) => { |
@@ -163,2 +163,3 @@ import { | ||
requestId, | ||
isUsingTextExtract, | ||
}: { | ||
@@ -173,2 +174,3 @@ instruction: string; | ||
requestId: string; | ||
isUsingTextExtract?: boolean; | ||
}) { | ||
@@ -181,3 +183,3 @@ type ExtractionResponse = z.infer<typeof schema>; | ||
messages: [ | ||
buildExtractSystemPrompt(isUsingAnthropic), | ||
buildExtractSystemPrompt(isUsingAnthropic, isUsingTextExtract), | ||
buildExtractUserPrompt(instruction, domElements, isUsingAnthropic), | ||
@@ -184,0 +186,0 @@ ], |
@@ -225,3 +225,3 @@ import Anthropic, { ClientOptions } from "@anthropic-ai/sdk"; | ||
model: this.modelName, | ||
max_tokens: options.maxTokens || 1500, | ||
max_tokens: options.maxTokens || 8192, | ||
messages: formattedMessages, | ||
@@ -228,0 +228,0 @@ tools: anthropicTools, |
@@ -204,19 +204,43 @@ import OpenAI from "openai"; | ||
// extract | ||
const extractSystemPrompt = `You are extracting content on behalf of a user. You will be given: | ||
export function buildExtractSystemPrompt( | ||
isUsingPrintExtractedDataTool: boolean = false, | ||
useTextExtract: boolean = true, | ||
): ChatMessage { | ||
const baseContent = `You are extracting content on behalf of a user. | ||
If a user asks you to extract a 'list' of information, or 'all' information, | ||
YOU MUST EXTRACT ALL OF THE INFORMATION THAT THE USER REQUESTS. | ||
You will be given: | ||
1. An instruction | ||
2. A list of DOM elements to extract from | ||
2. `; | ||
Print the exact text from the DOM elements with all symbols, characters, and endlines as is. | ||
const contentDetail = useTextExtract | ||
? `A text representation of a webpage to extract information from.` | ||
: `A list of DOM elements to extract from.`; | ||
const instructions = ` | ||
Print the exact text from the ${ | ||
useTextExtract ? "text-rendered webpage" : "DOM elements" | ||
} with all symbols, characters, and endlines as is. | ||
Print null or an empty string if no new information is found. | ||
`; | ||
`.trim(); | ||
export function buildExtractSystemPrompt( | ||
isUsingPrintExtractedDataTool: boolean = false, | ||
): ChatMessage { | ||
let content = extractSystemPrompt.replace(/\s+/g, " "); | ||
if (isUsingPrintExtractedDataTool) { | ||
content += ` | ||
const toolInstructions = isUsingPrintExtractedDataTool | ||
? ` | ||
ONLY print the content using the print_extracted_data tool provided. | ||
ONLY print the content using the print_extracted_data tool provided.`; | ||
} | ||
ONLY print the content using the print_extracted_data tool provided. | ||
`.trim() | ||
: ""; | ||
const additionalInstructions = useTextExtract | ||
? `Once you are given the text-rendered webpage, | ||
you must thoroughly and meticulously analyze it. Be very careful to ensure that you | ||
do not miss any important information.` | ||
: ""; | ||
const content = | ||
`${baseContent}${contentDetail}\n\n${instructions}\n${toolInstructions}${ | ||
additionalInstructions ? `\n\n${additionalInstructions}` : "" | ||
}`.replace(/\s+/g, " "); | ||
return { | ||
@@ -223,0 +247,0 @@ role: "system", |
352
lib/utils.ts
import crypto from "crypto"; | ||
import { LogLine } from "../types/log"; | ||
import { TextAnnotation } from "../types/textannotation"; | ||
import { z } from "zod"; | ||
@@ -9,2 +10,353 @@ | ||
/**
 * `formatText` converts a list of text annotations into a formatted text representation.
 * Each annotation represents a piece of text at a certain position on a webpage.
 * The formatting attempts to reconstruct a textual "screenshot" of the page by:
 *  - Grouping annotations into lines based on their vertical positions.
 *  - Inserting blank lines where the vertical gap between lines is unusually large.
 *  - Placing each piece of text at a column proportional to its normalized x-position.
 *
 * The output is a text block that is always surrounded by a line of dashes above and
 * below it (each `canvasWidth` characters long).
 *
 * @param textAnnotations - An array of TextAnnotations describing text and their positions.
 *   The array itself is not reordered; a sorted copy is used internally.
 * @param pageWidth - The width of the page in pixels, used to convert pixel lengths into
 *   the same normalized units as `bottom_left_normalized.x`.
 * @returns A string representing the text layout of the page.
 */
export function formatText(
  textAnnotations: TextAnnotation[],
  pageWidth: number,
): string {
  // **1:** Estimate the average character width in pixels by examining the text annotations.
  // If no reliable measurement is found (the estimate is 0), default to 10 pixels per character.
  const charWidth = estimateCharacterWidth(textAnnotations) || 10;

  // **2:** Create a copy of textAnnotations and sort them by their vertical position (y-coordinate),
  // so that annotations with the smallest y come first.
  const sortedAnnotations = [...textAnnotations].sort(
    (a, b) => a.bottom_left.y - b.bottom_left.y,
  );

  // **3:** Group annotations into lines. Two annotations are on the same line if their
  // y-coordinates differ by less than epsilon from the y of the annotation that opened the line.
  // Because the annotations are sorted by y, only the most recently opened line can possibly
  // match, so a single linear pass is enough and the lines come out already ordered by y.
  const epsilon = 0.0001;
  const lines: { y: number; annotations: TextAnnotation[] }[] = [];
  for (const annotation of sortedAnnotations) {
    const currentLine = lines[lines.length - 1];
    if (
      currentLine !== undefined &&
      Math.abs(currentLine.y - annotation.bottom_left.y) < epsilon
    ) {
      currentLine.annotations.push(annotation);
    } else {
      lines.push({ y: annotation.bottom_left.y, annotations: [annotation] });
    }
  }

  // **4:** For each line, group words together and track the maximum normalized end
  // position (maxNormalizedEndX). This determines the canvas width needed for all text.
  let maxNormalizedEndX = 0;
  const finalLines: TextAnnotation[][] = [];
  for (const line of lines) {
    const lineAnnotations = line.annotations;

    // **5:** Sort annotations in the current line by their horizontal position (x-coordinate),
    // ensuring left-to-right ordering.
    lineAnnotations.sort((a, b) => a.bottom_left.x - b.bottom_left.x);

    // **6:** Group nearby annotations into word clusters, forming phrases.
    const groupedLineAnnotations = groupWordsInSentence(lineAnnotations);

    // **7:** Determine how far to the right the text in this line extends, normalized by page width.
    for (const ann of groupedLineAnnotations) {
      const textLengthInPx = ann.text.length * charWidth;
      const normalizedTextLength = textLengthInPx / pageWidth;
      const endX = ann.bottom_left_normalized.x + normalizedTextLength;
      if (endX > maxNormalizedEndX) {
        maxNormalizedEndX = endX;
      }
    }

    // **8:** Save the processed line for later rendering.
    finalLines.push(groupedLineAnnotations);
  }

  // **9:** Determine the canvas width in characters. We scale according to maxNormalizedEndX
  // and page width, then add a 20-character buffer.
  let canvasWidth = Math.ceil(maxNormalizedEndX * (pageWidth / charWidth)) + 20;
  canvasWidth = Math.max(canvasWidth, 1);

  // **10:** Compute a reference y for each line: the smallest bottom-left y among its groups.
  const lineBaselines = finalLines.map((line) =>
    Math.min(...line.map((a) => a.bottom_left.y)),
  );

  // **11:** Compute vertical gaps between consecutive lines to determine line spacing.
  const verticalGaps: number[] = [];
  for (let i = 1; i < lineBaselines.length; i++) {
    verticalGaps.push(lineBaselines[i] - lineBaselines[i - 1]);
  }

  // **12:** Estimate what a "normal" line spacing is by taking the median of all vertical gaps.
  const normalLineSpacing = verticalGaps.length > 0 ? median(verticalGaps) : 0;

  // **13:** Create a 2D character canvas onto which we "print" text lines. Rows are added
  // on demand, each initialized with spaces.
  const canvas: string[][] = [];

  // **14:** lineIndex is the current canvas row. Start at -1 so the first line lands on row 0.
  let lineIndex = -1;

  // **15:** Iterate over each line of processed text.
  for (let i = 0; i < finalLines.length; i++) {
    if (i > 0) {
      // **16:** For lines after the first, insert extra blank rows when the gap to the previous
      // line is more than 1.2x the normal spacing, proportional to gap / normal spacing.
      const gap = lineBaselines[i] - lineBaselines[i - 1];
      let extraLines = 0;
      if (normalLineSpacing > 0 && gap > 1.2 * normalLineSpacing) {
        extraLines = Math.max(Math.round(gap / normalLineSpacing) - 1, 0);
      }
      for (let e = 0; e < extraLines; e++) {
        lineIndex++;
        ensureLineExists(canvas, lineIndex, canvasWidth);
      }
    }

    // **17:** Advance to the row for the current line of text.
    lineIndex++;
    ensureLineExists(canvas, lineIndex, canvasWidth);

    // **18:** Place the annotations for the current line at their horizontal positions.
    for (const annotation of finalLines[i]) {
      const text = annotation.text;

      // **19:** Calculate the starting column from the normalized x-coordinate.
      const startXInChars = Math.round(
        annotation.bottom_left_normalized.x * canvasWidth,
      );

      // **20:** Write each character, skipping any column outside the canvas. The lower
      // bound matters: a negative column would otherwise attach a stray non-index property
      // to the row array instead of being ignored.
      for (let j = 0; j < text.length; j++) {
        const xPos = startXInChars + j;
        if (xPos >= 0 && xPos < canvasWidth) {
          canvas[lineIndex][xPos] = text[j];
        }
      }
    }
  }

  // **21:** Trim trailing whitespace from each row, join the rows, and trim trailing
  // whitespace from the whole text.
  let pageText = canvas
    .map((row) => row.join("").trimEnd())
    .join("\n")
    .trimEnd();

  // **22:** Surround the page text with lines of dashes to clearly delineate the text block.
  pageText =
    "-".repeat(canvasWidth) + "\n" + pageText + "\n" + "-".repeat(canvasWidth);

  // **23:** Return the fully formatted text.
  return pageText;
}
/**
 * `ensureLineExists` grows the canvas, if necessary, so that `canvas[lineIndex]` exists.
 * Rows that are already present are left untouched; each appended row is a fresh array
 * of `width` single-space strings.
 *
 * @param canvas - The 2D character canvas array; extended in place.
 * @param lineIndex - The row index that must exist once this function returns.
 * @param width - The number of cells in each newly created row.
 */
function ensureLineExists(
  canvas: string[][],
  lineIndex: number,
  width: number,
): void {
  // append blank rows one at a time until the requested index is covered
  for (let rowCount = canvas.length; rowCount <= lineIndex; rowCount++) {
    const blankRow = new Array<string>(width);
    blankRow.fill(" ");
    canvas.push(blankRow);
  }
}
/**
 * `estimateCharacterWidth` estimates a typical character width (in the same units as
 * `annotation.width`) from a collection of text annotations.
 * For every annotation with non-empty text it computes `width / text.length`, and the
 * median of those ratios is returned. Annotations with empty text are ignored.
 *
 * @param textAnnotations - An array of text annotations with text and width fields.
 * @returns The median per-character width, or 0 if no annotation has non-empty text.
 */
function estimateCharacterWidth(textAnnotations: TextAnnotation[]): number {
  // one width-per-character sample per annotation that actually contains text
  const perCharacterWidths = textAnnotations
    .filter(({ text }) => text.length > 0)
    .map(({ text, width }) => width / text.length);

  // median() yields 0 for an empty sample list
  return median(perCharacterWidths);
}
/**
 * `groupWordsInSentence` groups annotations within a single line into logical phrases.
 * Walking the annotations in order, an annotation joins the current group when:
 *  1. its height differs from the height of the group's first annotation by at most 4, and
 *  2. its left edge is no further right than the previous annotation's right edge plus a
 *     tolerance of two average character widths of that previous annotation.
 * Otherwise the current group is finalized and a new group is started.
 *
 * The annotations are expected in left-to-right order; `formatText` sorts each line by x
 * before calling this function.
 *
 * @param lineAnnotations - An array of annotations from a single line of text.
 * @returns An array of grouped annotations, where each represents one concatenated piece of text.
 */
function groupWordsInSentence(
  lineAnnotations: TextAnnotation[],
): TextAnnotation[] {
  // number of average character widths allowed as a gap between neighbouring words
  const padding = 2;

  const groupedAnnotations: TextAnnotation[] = [];
  let currentGroup: TextAnnotation[] = [];

  for (const annotation of lineAnnotations) {
    // if the current group is empty, start a new group with this annotation
    if (currentGroup.length === 0) {
      currentGroup.push(annotation);
      continue;
    }

    const lastAnn = currentGroup[currentGroup.length - 1];

    // Horizontal tolerance derived from the previous annotation's average character width.
    // An annotation with empty text has no measurable character width: dividing by its
    // zero length would give Infinity (or NaN) and make every following annotation look
    // adjacent, so no extra tolerance is granted in that case.
    const lastLength = lastAnn.text.length;
    const allowedGap =
      lastLength > 0 ? (lastAnn.width / lastLength) * padding : 0;

    const isWithinHorizontalRange =
      annotation.bottom_left.x <=
      lastAnn.bottom_left.x + lastAnn.width + allowedGap;
    const hasSimilarHeight =
      Math.abs(annotation.height - currentGroup[0].height) <= 4;

    if (hasSimilarHeight && isWithinHorizontalRange) {
      // close enough and of similar height: extend the current group
      currentGroup.push(annotation);
    } else {
      // otherwise finalize the current group (never empty at this point)
      // and start a new one with this annotation
      groupedAnnotations.push(createGroupedAnnotation(currentGroup));
      currentGroup = [annotation];
    }
  }

  // after processing all annotations, finalize the remaining group, if any
  if (currentGroup.length > 0) {
    groupedAnnotations.push(createGroupedAnnotation(currentGroup));
  }

  return groupedAnnotations;
}
/**
 * `createGroupedAnnotation` merges a group of annotations into a single annotation.
 * The texts are concatenated with single spaces, except that a piece consisting solely of
 * one of the listed punctuation marks is appended without a preceding space. If the
 * combined text contains an ASCII letter or digit and the median height of the group is
 * greater than 25, the text is wrapped in `**` to mark it as emphasized.
 *
 * @param group - The annotations to merge; the first one supplies the position and height.
 * @returns A new TextAnnotation with the combined text, the first annotation's position
 *   (absolute and normalized) and height, and the sum of all widths in the group.
 */
function createGroupedAnnotation(group: TextAnnotation[]): TextAnnotation {
  // pieces that attach directly to the preceding text, without a separating space
  const attachedPunctuation = [
    ".",
    ",",
    '"',
    "'",
    ":",
    ";",
    "!",
    "?",
    "{",
    "}",
    "’",
    "”",
  ];

  // build the combined text left to right
  const combinedText = group.reduce((soFar, { text: piece }) => {
    const needsSpace = soFar !== "" && !attachedPunctuation.includes(piece);
    return needsSpace ? soFar + " " + piece : soFar + piece;
  }, "");

  // emphasis heuristic: the text holds at least one alphanumeric character and is tall enough
  const containsAlphanumeric = /[a-zA-Z0-9]/.test(combinedText);
  const typicalHeight = median(group.map(({ height }) => height));
  const renderedText =
    containsAlphanumeric && typicalHeight > 25
      ? "**" + combinedText + "**"
      : combinedText;

  // the first annotation anchors the merged annotation's position and height
  const anchor = group[0];
  const bottomLeft = { x: anchor.bottom_left.x, y: anchor.bottom_left.y };
  const bottomLeftNormalized = {
    x: anchor.bottom_left_normalized.x,
    y: anchor.bottom_left_normalized.y,
  };

  // total width is the sum of the individual widths
  let totalWidth = 0;
  for (const member of group) {
    totalWidth += member.width;
  }

  return {
    text: renderedText,
    bottom_left: bottomLeft,
    bottom_left_normalized: bottomLeftNormalized,
    width: totalWidth,
    height: anchor.height,
  };
}
/**
 * Computes the median of a list of numbers without modifying the input array.
 *
 * @param values - The numbers to summarise; may be empty and need not be sorted.
 * @returns 0 for an empty list, the middle value for an odd-length list, and the
 * mean of the two middle values for an even-length list.
 */
function median(values: number[]): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }

  // sort a copy numerically so the caller's array keeps its order
  const ascending = Array.from(values);
  ascending.sort((lhs, rhs) => lhs - rhs);

  const upperMiddle = Math.floor(count / 2);
  const hasSingleMiddle = count % 2 !== 0;
  return hasSingleMiddle
    ? ascending[upperMiddle]
    : (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
}
export function logLineToString(logLine: LogLine): string { | ||
@@ -11,0 +363,0 @@ try { |
{ | ||
"name": "@browserbasehq/stagehand", | ||
"version": "1.7.0-alpha-b902192bc7ff8eb02c85150c1fe6f89c2a95b211", | ||
"version": "1.7.0-alpha-ba4ec335a5323648c6016cc480300fd58868311a", | ||
"description": "An AI web browsing framework focused on simplicity and extensibility.", | ||
@@ -5,0 +5,0 @@ "main": "./dist/index.js", |
@@ -201,2 +201,3 @@ <div id="toc" align="center"> | ||
- `sessionUrl`: a `string` representing the session URL. This is only available when using a Browserbase browser. | ||
- `sessionId`: a `string` representing the session ID. This is only available when using a Browserbase browser. | ||
@@ -263,2 +264,3 @@ - **Example:** | ||
- `domSettleTimeoutMs`: (optional) timeout in milliseconds for waiting for the DOM to settle | ||
- `useTextExtract`: (optional) a `boolean` to determine if text-based extraction should be used. Defaults to `false` | ||
@@ -265,0 +267,0 @@ - **Returns:** |
Sorry, the diff of this file is too big to display
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
549420
119
13962
560