@zumer/snapdom-plugins
Advanced tools
+1
-1
| { | ||
| "name": "@zumer/snapdom-plugins", | ||
| "version": "1.1.0", | ||
| "version": "1.2.0", | ||
| "description": "Official plugins for SnapDOM", | ||
@@ -5,0 +5,0 @@ "type": "module", |
+62
-39
@@ -22,2 +22,7 @@ /** | ||
| * @param {boolean} [options.includeCoords=true] - Include bbox in the prompt text | ||
| * @param {string[]} [options.include] - Which fields to return. Default | ||
| * ['elements', 'prompt']. For vision-dependent tasks (chart content, layout QA, | ||
| * canvas) pass ['image', 'elements', 'prompt'] or add 'image' to the array. For | ||
| * text-only agent prompts pass ['prompt'] (cheapest — skips canvas draw entirely). | ||
| * Accepted values: 'image', 'elements', 'prompt'. | ||
| * @returns {Object} SnapDOM plugin | ||
@@ -62,2 +67,8 @@ */ | ||
| // Default omits 'image'. Benchmarking showed the text + JSON map is enough | ||
| // to answer most UI-inspection questions and uses ~14× fewer tokens. Pass | ||
| // `include: ['image', 'elements', 'prompt']` explicitly when the task truly | ||
| // depends on vision (charts, canvas content, layout QA). | ||
| const DEFAULT_INCLUDE = ['elements', 'prompt']; | ||
| export function promptExport(options = {}) { | ||
@@ -74,2 +85,3 @@ const { | ||
| includeCoords = true, | ||
| include = DEFAULT_INCLUDE, | ||
| } = options; | ||
@@ -96,9 +108,13 @@ | ||
| const meta = ctx.__promptMetadata; | ||
| const wantSet = new Set(opts.include || include || DEFAULT_INCLUDE); | ||
| const wantImage = wantSet.has('image'); | ||
| const wantElements = wantSet.has('elements'); | ||
| const wantPrompt = wantSet.has('prompt'); | ||
| if (!meta || !meta.elements.length) { | ||
| return { | ||
| image: ctx.export.url, | ||
| elements: [], | ||
| dimensions: { width: 0, height: 0 }, | ||
| prompt: '', | ||
| }; | ||
| const empty = { dimensions: { width: 0, height: 0 } }; | ||
| if (wantImage) empty.image = ctx.export.url; | ||
| if (wantElements) empty.elements = []; | ||
| if (wantPrompt) empty.prompt = ''; | ||
| return empty; | ||
| } | ||
@@ -112,28 +128,36 @@ | ||
| const img = new Image(); | ||
| img.src = ctx.export.url; | ||
| await new Promise((res, rej) => { | ||
| img.onload = res; | ||
| img.onerror = rej; | ||
| }); | ||
| // Only load + rasterize the SVG when the caller actually wants the | ||
| // image. Skipping saves the img decode + canvas draw + toDataURL — | ||
| // the most expensive steps of this export. | ||
| let w, h, dataURL; | ||
| if (wantImage) { | ||
| const img = new Image(); | ||
| img.src = ctx.export.url; | ||
| await new Promise((res, rej) => { img.onload = res; img.onerror = rej; }); | ||
| const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1; | ||
| w = Math.round(img.naturalWidth * ratio); | ||
| h = Math.round(img.naturalHeight * ratio); | ||
| const canvas = document.createElement('canvas'); | ||
| canvas.width = w; | ||
| canvas.height = h; | ||
| canvas.getContext('2d').drawImage(img, 0, 0, w, h); | ||
| const mime = | ||
| format === 'jpg' || format === 'jpeg' ? 'image/jpeg' | ||
| : format === 'webp' ? 'image/webp' | ||
| : 'image/png'; | ||
| dataURL = canvas.toDataURL(mime, quality); | ||
| } else { | ||
| // No image — scale bboxes to the same target width the image would | ||
| // have used, so downstream callers can still render the map over | ||
| // a separately-rendered screenshot at the same scale. | ||
| const sourceW = meta.dimensions.width || 1; | ||
| const ratio = sourceW > maxWidth ? maxWidth / sourceW : 1; | ||
| w = Math.round(sourceW * ratio); | ||
| h = Math.round(meta.dimensions.height * ratio); | ||
| } | ||
| const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1; | ||
| const w = Math.round(img.naturalWidth * ratio); | ||
| const h = Math.round(img.naturalHeight * ratio); | ||
| const canvas = document.createElement('canvas'); | ||
| canvas.width = w; | ||
| canvas.height = h; | ||
| canvas.getContext('2d').drawImage(img, 0, 0, w, h); | ||
| const mime = | ||
| format === 'jpg' || format === 'jpeg' ? 'image/jpeg' | ||
| : format === 'webp' ? 'image/webp' | ||
| : 'image/png'; | ||
| const dataURL = canvas.toDataURL(mime, quality); | ||
| const sx = w / (meta.dimensions.width || 1); | ||
| const sy = h / (meta.dimensions.height || 1); | ||
| const elements = meta.elements.map((el) => ({ | ||
| const scaledElements = meta.elements.map((el) => ({ | ||
| ...el, | ||
@@ -148,15 +172,14 @@ bbox: { | ||
| // In compact + annotate, badges already encode positions on the | ||
| // image itself, so repeating coords in the prompt text is noise. | ||
| const emitCoords = mode === 'verbose' ? true : (withCoords && !annotate); | ||
| const prompt = mode === 'verbose' | ||
| ? formatPromptVerbose(elements, { width: w, height: h }, emitCoords) | ||
| : formatPromptCompact(elements, { width: w, height: h }, emitCoords); | ||
| const promptText = (wantPrompt) | ||
| ? (mode === 'verbose' | ||
| ? formatPromptVerbose(scaledElements, { width: w, height: h }, emitCoords) | ||
| : formatPromptCompact(scaledElements, { width: w, height: h }, emitCoords)) | ||
| : null; | ||
| return { | ||
| image: dataURL, | ||
| elements, | ||
| dimensions: { width: w, height: h }, | ||
| prompt, | ||
| }; | ||
| const out = { dimensions: { width: w, height: h } }; | ||
| if (wantImage) out.image = dataURL; | ||
| if (wantElements) out.elements = scaledElements; | ||
| if (wantPrompt) out.prompt = promptText; | ||
| return out; | ||
| }, | ||
@@ -163,0 +186,0 @@ }; |
+41
-10
@@ -175,3 +175,3 @@ # @zumer/snapdom-plugins | ||
| Adds a `toPrompt()` export method that returns an LLM-ready package: an annotated screenshot, a structured element map with bounding boxes, and a pre-formatted text description. Useful for vision-language models, browser-agent pipelines, visual QA, and any workflow that pairs a capture with structured metadata. | ||
| Adds a `toPrompt()` export method that returns an LLM-ready package: a structured element map with bounding boxes, a pre-formatted prompt text, and (optionally) an annotated screenshot. Tuned for vision-language models, browser-agent pipelines, visual QA, and any workflow that pairs a capture with structured metadata. | ||
@@ -182,13 +182,23 @@ ```js | ||
| const result = await snapdom(el, { plugins: [promptExport()] }); | ||
| const { image, elements, dimensions, prompt } = await result.toPrompt(); | ||
| // Default: no image, just the structured map + prompt text (skips the expensive image render) | ||
| const { elements, prompt, dimensions } = await result.toPrompt(); | ||
| ``` | ||
| The returned object: | ||
| To also include the annotated image (for tasks that truly depend on vision): | ||
| ```js | ||
| const result = await snapdom(el, { | ||
| plugins: [promptExport({ include: ['image', 'elements', 'prompt'] })] | ||
| }); | ||
| const { image, elements, prompt, dimensions } = await result.toPrompt(); | ||
| ``` | ||
| The returned object (`image`, `elements`, and `prompt` are present only if requested via `include`; `dimensions` is always returned): | ||
| | Field | Type | Description | | ||
| |-------|------|-------------| | ||
| | `image` | `string` | Data URL of the (optionally annotated) screenshot | | ||
| | `elements` | `Array` | One entry per detected element: `{ id, tag, type, text, bbox, attributes }` | | ||
| | `dimensions` | `{width, height}` | Image dimensions after `maxImageWidth` scaling | | ||
| | `elements` | `Array` | One entry per detected element: `{ id, tag, type, name, text, bbox, attributes, state?, styles?, covered? }` | | ||
| | `prompt` | `string` | Pre-formatted text describing interactive + semantic elements | | ||
| | `image` | `string` | Data URL of the (optionally annotated) screenshot — **only when `include` contains `'image'`** | | ||
| | `dimensions` | `{width, height}` | Width and height after `maxImageWidth` scaling — always present, even when `image` is omitted | ||
@@ -201,5 +211,13 @@ `elements` is split into two `type`s: | ||
| Each interactive entry also carries: | ||
| - `name` — the computed accessible name (aria-label → labelledby → alt → title → labels[0] → textContent) | ||
| - `state` — runtime state: `{ checked, disabled, focus, open, value, selectedText }` (only keys that apply) | ||
| - `styles` — visually-meaningful computed props filtered to drop defaults | ||
| - `covered: true` when another element is painted on top of the bbox center (e.g. the element sits behind a modal, so a click at that point would not reach it) | ||
| ```js | ||
| // Example — feed a vision-capable LLM | ||
| const { image, elements } = await result.toPrompt({ maxImageWidth: 1024 }); | ||
| const { image, elements } = await result.toPrompt({ | ||
| include: ['image', 'elements', 'prompt'] | ||
| }); | ||
@@ -213,4 +231,7 @@ // image is a data URL → pass as image input | ||
| |--------|------|---------|-------------| | ||
| | `annotate` | `boolean` | `true` | Overlay numbered badges on interactive elements | | ||
| | `imageFormat` | `'png' \| 'jpg' \| 'webp'` | `'png'` | Output image format | | ||
| | `include` | `string[]` | `['elements', 'prompt']` | Fields to return. Add `'image'` for tasks that need vision (chart content, layout QA, canvas). Use `['prompt']` for the cheapest text-only mode. | | ||
| | `annotate` | `boolean` | `true` | Overlay numbered badges on interactive elements (only affects the image when included) | | ||
| | `promptMode` | `'compact' \| 'verbose'` | `'compact'` | Prompt text verbosity. Compact omits coords from the text whenever `annotate` is enabled (badges are assumed to mark positions on the image); verbose always includes them. | ||
| | `includeCoords` | `boolean` | `true` | Include bbox in the prompt text | | ||
| | `imageFormat` | `'png' \| 'jpg' \| 'webp'` | `'png'` | Output image format (only used when `image` is included) | | ||
| | `imageQuality` | `number` | `0.8` | Quality for lossy formats (0–1) | | ||
@@ -226,4 +247,14 @@ | `maxImageWidth` | `number` | `1024` | Max width in px; downscales and rescales bboxes if larger | | ||
| Both per-call options (`opts.imageFormat`, etc.) and constructor options are supported; per-call wins. | ||
| Both per-call options (`opts.include`, `opts.imageFormat`, etc.) and constructor options are supported; per-call wins. | ||
| The image is the most expensive part of `toPrompt()` to produce (canvas draw + data-URL serialization), so the default skips it. Add `'image'` to `include` when the task actually uses vision: | ||
| ```js | ||
| // Vision-dependent task (chart content, layout QA, visual diff) | ||
| await result.toPrompt({ include: ['image', 'elements', 'prompt'] }); | ||
| // Pure structured agent loop (cheapest) | ||
| await result.toPrompt({ include: ['prompt'] }); | ||
| ``` | ||
| --- | ||
@@ -230,0 +261,0 @@ |
47909
8.34%922
2.67%295
11.74%