Big News: Socket raises $60M Series C at a $1B valuation to secure software supply chains for AI-driven development.Announcement
Sign In

@zumer/snapdom-plugins

Package Overview
Dependencies
Maintainers
1
Versions
8
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@zumer/snapdom-plugins - npm Package Compare versions

Comparing version
1.2.0
to
2.1.0
+376
agent-map.js
/**
* agentMap – Official SnapDOM Plugin
*
* Produces a Set-of-Mark package for visual agents: an annotated screenshot
* with numbered badges on interactive elements, plus a compact JSON map from
* badge index → role / name / bbox / state. Designed for one-call capture
* on the client side — visual agents, computer-use harnesses, dataset
* generation for vision training, visual QA.
*
* Usage:
* import { agentMap } from '@zumer/snapdom-plugins/agent-map';
* const result = await snapdom(el, { plugins: [agentMap()] });
* const { image, map, dimensions } = await result.toAgentMap();
*
* // model reply: "click element 2" → map[2].b gives [x, y, w, h]
*
* @param {Object} [options]
* @param {'annotated'|'raw'|false} [options.image='annotated'] Image output
* mode. 'annotated' draws numbered badges on the rendered image, 'raw'
* returns the image without badges, false skips image generation (cheapest).
* @param {'minimal'|'full'} [options.fields='minimal'] Per-entry shape.
* 'minimal' returns {i, n, r, b, s?}. 'full' adds {t (text), a (attrs)}.
* @param {boolean} [options.semantic=false] Include non-interactive semantic
* elements (headings, paragraphs, nav, main, landmarks). Off by default —
* agents typically only act on interactive.
* @param {number} [options.maxImageWidth=1024] Downscale target for the image.
* @param {'png'|'jpg'|'webp'} [options.imageFormat='png'] Image format.
* @param {number} [options.imageQuality=0.8] Quality for lossy formats.
* @param {string} [options.interactiveSelector] CSS selector (default below).
* @param {string} [options.semanticSelector] CSS selector (default below).
* @param {Object} [options.labelStyle={}] Override badge styles.
* @returns {Object} SnapDOM plugin
*/
// Roles that mark an element as interactive when set via the `role` attribute.
const INTERACTIVE_ROLES = [
  'button', 'link', 'tab', 'menuitem', 'checkbox',
  'radio', 'switch', 'slider', 'combobox', 'textbox',
];

// Default selector for elements an agent can act on: native controls,
// ARIA-role widgets, focusable nodes, <summary> and editable regions.
const DEFAULT_INTERACTIVE = [
  'a[href]',
  'button',
  'input',
  'select',
  'textarea',
  ...INTERACTIVE_ROLES.map((role) => `[role="${role}"]`),
  '[tabindex]:not([tabindex="-1"])',
  'summary',
  '[contenteditable="true"]',
].join(', ');

// Default selector for structural (non-interactive) elements; only queried
// when the `semantic` option is enabled.
const DEFAULT_SEMANTIC = [
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'nav', 'main', 'article', 'section', 'header', 'footer',
  'figcaption', 'blockquote', 'legend', 'p',
].join(', ');
/**
 * Builds the agent-map plugin object (option reference: see the file header
 * comment). The plugin has two phases:
 *  - `afterClone` extracts the element map from the live element and, in
 *    'annotated' mode, draws the numbered badges onto the clone;
 *  - the `agentMap` export rasterizes the rendered capture (unless the image
 *    is disabled) and rescales every bbox to the output size.
 */
export function agentMap(options = {}) {
  const {
    image = 'annotated',
    fields = 'minimal',
    semantic = false,
    maxImageWidth = 1024,
    imageFormat = 'png',
    imageQuality = 0.8,
    interactiveSelector = DEFAULT_INTERACTIVE,
    semanticSelector = DEFAULT_SEMANTIC,
    labelStyle = {},
  } = options;
  return {
    name: 'agent-map',
    afterClone(ctx) {
      const meta = extractMap(
        ctx.element,
        interactiveSelector,
        semantic ? semanticSelector : null,
        fields
      );
      // snapdom's export ctx is a fresh spread of ctx.options, so we stash on
      // both for the agentMap() call below to find it.
      ctx.__agentMapMeta = meta;
      if (ctx.options) ctx.options.__agentMapMeta = meta;
      // Badges are baked into the clone here, so only the constructor-level
      // `image` option decides whether they are drawn.
      if (image === 'annotated') {
        addAnnotations(ctx.clone, meta.map, labelStyle);
      }
    },
    defineExports() {
      return {
        /**
         * @param {Object} ctx Export context; reads `ctx.__agentMapMeta` and `ctx.export.url`.
         * @param {Object} [opts] Per-call overrides: image, imageFormat, imageQuality, maxImageWidth.
         * @returns {Promise<{dimensions: {width: number, height: number}, map: Array, image?: string}>}
         */
        agentMap: async (ctx, opts = {}) => {
          const meta = ctx.__agentMapMeta;
          const wantImage = opts.image !== undefined ? opts.image : image;
          // Nothing was mapped: hand back the unprocessed capture URL as-is.
          if (!meta || !meta.map.length) {
            const out = { dimensions: { width: 0, height: 0 }, map: [] };
            if (wantImage) out.image = ctx.export.url;
            return out;
          }
          const format = opts.imageFormat || imageFormat;
          // 0 is a valid quality for lossy formats, so `||` must not be used
          // here: it would silently replace a per-call 0 with the default.
          const quality = Number.isFinite(opts.imageQuality) ? opts.imageQuality : imageQuality;
          const maxWidth = opts.maxImageWidth || maxImageWidth;
          // Scale dimensions — whether we rasterize or not, bboxes get resized
          // to the target output size so callers can overlay them on the image.
          let w, h, dataURL;
          if (wantImage) {
            const img = new Image();
            // Attach the handlers before assigning src so a load/error event
            // can never be missed.
            const loaded = new Promise((res, rej) => { img.onload = res; img.onerror = rej; });
            img.src = ctx.export.url;
            await loaded;
            const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
            w = Math.round(img.naturalWidth * ratio);
            h = Math.round(img.naturalHeight * ratio);
            const canvas = document.createElement('canvas');
            canvas.width = w;
            canvas.height = h;
            canvas.getContext('2d').drawImage(img, 0, 0, w, h);
            const mime =
              format === 'jpg' || format === 'jpeg' ? 'image/jpeg'
                : format === 'webp' ? 'image/webp'
                  : 'image/png';
            dataURL = canvas.toDataURL(mime, quality);
          } else {
            // No image: derive the output size from the measured element size.
            const sourceW = meta.dimensions.width || 1;
            const ratio = sourceW > maxWidth ? maxWidth / sourceW : 1;
            w = Math.round(sourceW * ratio);
            h = Math.round(meta.dimensions.height * ratio);
          }
          // Per-axis factors from measured element size to output size.
          const sx = w / (meta.dimensions.width || 1);
          const sy = h / (meta.dimensions.height || 1);
          const scaledMap = meta.map.map((e) => ({
            ...e,
            b: [
              Math.round(e.b[0] * sx),
              Math.round(e.b[1] * sy),
              Math.round(e.b[2] * sx),
              Math.round(e.b[3] * sy),
            ],
          }));
          const out = { dimensions: { width: w, height: h }, map: scaledMap };
          if (wantImage) out.image = dataURL;
          return out;
        },
      };
    },
  };
}
/* ── Role derivation ────────────────────────────── */
/**
 * Resolves the role string reported for an element.
 *
 * A non-empty `role` attribute is returned verbatim. Otherwise the role is
 * derived from the tag name (and, for <input>, its type). Tags with no
 * mapping fall back to the lowercased tag name itself.
 *
 * @param {Element} el
 * @returns {string}
 */
function deriveRole(el) {
  const declared = el.getAttribute('role');
  if (declared) return declared;

  const tag = el.tagName.toLowerCase();
  const type = (el.type || '').toLowerCase();

  switch (tag) {
    case 'button':
    case 'summary':
      return 'button';
    case 'a':
      // An anchor without href is not a link; it reports its tag name.
      return el.hasAttribute('href') ? 'link' : tag;
    case 'input':
      switch (type) {
        case 'checkbox':
        case 'radio':
        case 'file':
          return type;
        case 'range':
          return 'slider';
        case 'submit':
        case 'button':
        case 'reset':
        case 'image':
          return 'button';
        default:
          return 'textbox';
      }
    case 'select':
      return 'combobox';
    case 'textarea':
      return 'textbox';
    case 'details':
      return 'group';
    case 'nav':
      return 'navigation';
    case 'main':
      return 'main';
    case 'header':
      return 'banner';
    case 'footer':
      return 'contentinfo';
    case 'article':
      return 'article';
    case 'section':
      return 'region';
    case 'p':
      return 'paragraph';
    case 'img':
      return 'image';
    default:
      return /^h[1-6]$/.test(tag) ? 'heading' : tag;
  }
}
/* ── Accessible name ────────────────────────────── */
/**
 * Computes a short human-readable name for an element.
 *
 * Sources are tried in order and the first non-empty one wins:
 * aria-label → aria-labelledby targets → alt (IMG and image inputs only) →
 * title → first associated <label> → the element's own text content
 * (whitespace-collapsed and cut to 60 characters). Returns '' if all are empty.
 *
 * @param {Element} el
 * @returns {string}
 */
function accessibleName(el) {
  // Attribute value with surrounding whitespace removed; '' when absent.
  const trimmedAttr = (attr) => {
    const raw = el.getAttribute(attr);
    return raw ? raw.trim() : '';
  };

  const fromAriaLabel = trimmedAttr('aria-label');
  if (fromAriaLabel) return fromAriaLabel;

  const idRefs = el.getAttribute('aria-labelledby');
  if (idRefs) {
    // Resolve ids against the element's own root when it supports lookups,
    // otherwise against the global document.
    const scope = el.getRootNode();
    const pieces = [];
    for (const id of idRefs.trim().split(/\s+/)) {
      const target =
        scope && typeof scope.getElementById === 'function'
          ? scope.getElementById(id)
          : document.getElementById(id);
      const piece = target ? (target.textContent || '').trim() : '';
      if (piece) pieces.push(piece);
    }
    if (pieces.length) return pieces.join(' ');
  }

  const isImage = el.tagName === 'IMG';
  const isImageInput = el.tagName === 'INPUT' && (el.type || '').toLowerCase() === 'image';
  if (isImage || isImageInput) {
    const fromAlt = trimmedAttr('alt');
    if (fromAlt) return fromAlt;
  }

  const fromTitle = trimmedAttr('title');
  if (fromTitle) return fromTitle;

  if (el.labels && el.labels[0]) {
    const fromLabel = (el.labels[0].textContent || '').trim();
    if (fromLabel) return fromLabel;
  }

  const ownText = (el.textContent || '').replace(/\s+/g, ' ').trim();
  if (!ownText) return '';
  return ownText.length > 60 ? ownText.slice(0, 59) + '…' : ownText;
}
/* ── State extraction ───────────────────────────── */
/**
 * Builds the `s` (state) object. Only meaningful states — never default
 * values that add no signal for an agent. Critically, aria-expanded and
 * aria-pressed are included for BOTH values (true and false) because
 * "pressed: false" on a toggle is meaningful information.
 *
 * `checked` combines the native `:checked` pseudo-class with `aria-checked`,
 * so custom widgets (e.g. `<div role="checkbox" aria-checked="true">`), which
 * never match `:checked`, are not misreported as unchecked. An
 * `aria-checked="mixed"` widget reports `checked: 'mixed'`.
 *
 * @param {Element} el Element to inspect.
 * @param {string} role Role previously derived for `el`.
 * @param {{left: number, top: number, width: number, height: number}|null} rect
 *   Viewport rect of `el`; used only for the occlusion probe.
 * @returns {Object|null} State object, or null when no key applies.
 */
function deriveState(el, role, rect) {
  const s = {};
  const ariaChecked = el.getAttribute('aria-checked');
  try {
    if (el.matches(':checked') || ariaChecked === 'true') s.checked = true;
    else if (ariaChecked === 'mixed') s.checked = 'mixed';
    else if (role === 'checkbox' || role === 'radio' || ariaChecked === 'false') {
      // include checked:false for form groups where an agent needs to
      // know "unchecked" is a valid state distinct from "not a checkbox".
      s.checked = false;
    }
    if (el.matches(':disabled')) s.disabled = true;
    if (el.matches(':focus')) s.focus = true;
  } catch { /* exotic nodes */ }
  const expanded = el.getAttribute('aria-expanded');
  if (expanded === 'true') s.expanded = true;
  else if (expanded === 'false') s.expanded = false;
  const pressed = el.getAttribute('aria-pressed');
  if (pressed === 'true') s.pressed = true;
  else if (pressed === 'false') s.pressed = false;
  const selected = el.getAttribute('aria-selected');
  if (selected === 'true') s.selected = true;
  // selected:false is only reported for roles where selection is expected.
  else if (selected === 'false' && (role === 'tab' || role === 'option')) s.selected = false;
  if (el.tagName === 'INPUT') {
    const type = (el.type || 'text').toLowerCase();
    // Checkbox/radio/button-like inputs carry no useful `value` signal.
    // NOTE(review): type="password" is not excluded, so the plaintext value
    // is emitted into the map — confirm this is acceptable for consumers.
    if (type !== 'checkbox' && type !== 'radio' && type !== 'submit' && type !== 'button' && type !== 'reset' && el.value) {
      s.value = el.value;
    }
  } else if (el.tagName === 'TEXTAREA') {
    if (el.value) s.value = el.value;
  } else if (el.tagName === 'SELECT') {
    s.value = el.value;
    const opt = el.options && el.options[el.selectedIndex];
    if (opt) s.selectedText = opt.text || '';
  } else if (el.tagName === 'DETAILS') {
    s.open = !!el.open;
  }
  // Covered — element visually occluded by something else at its center.
  if (rect && rect.width && rect.height) {
    const cx = rect.left + rect.width / 2;
    const cy = rect.top + rect.height / 2;
    if (cx >= 0 && cy >= 0) {
      const doc = el.ownerDocument || document;
      if (doc.elementFromPoint) {
        const top = doc.elementFromPoint(cx, cy);
        // A hit on the element itself or on one of its descendants is not occlusion.
        if (top && top !== el && !(el.contains && el.contains(top))) {
          s.covered = true;
        }
      }
    }
  }
  return Object.keys(s).length ? s : null;
}
/* ── Map extraction ─────────────────────────────── */
/**
 * Walks `element`'s descendants and builds the ordered entry list.
 *
 * Interactive matches come first. When `semanticSelector` is given, semantic
 * matches follow, skipping nodes already recorded as interactive. Each entry's
 * index equals its position in the returned `map`. Nodes for which
 * `buildEntry` returns null are left out and do not consume an index.
 *
 * @param {Element} element Capture root.
 * @param {string} interactiveSelector
 * @param {string|null} semanticSelector Falsy to skip the semantic pass.
 * @param {'minimal'|'full'} fields Forwarded to `buildEntry`.
 * @returns {{map: Array, dimensions: {width: number, height: number}}}
 */
function extractMap(element, interactiveSelector, semanticSelector, fields) {
  const rootRect = element.getBoundingClientRect();
  const entries = [];
  const recordedInteractive = new Set();

  // Shared pass: the next index is always the current list length.
  const collect = (selector, kind) => {
    for (const node of element.querySelectorAll(selector)) {
      if (kind === 'semantic' && recordedInteractive.has(node)) continue;
      const entry = buildEntry(node, rootRect, entries.length, fields, kind);
      if (!entry) continue;
      entries.push(entry);
      if (kind === 'interactive') recordedInteractive.add(node);
    }
  };

  collect(interactiveSelector, 'interactive');
  if (semanticSelector) collect(semanticSelector, 'semantic');

  const { width, height } = rootRect;
  return { map: entries, dimensions: { width, height } };
}
/**
 * Builds one map entry for `el`, or returns null when the element has
 * neither width nor height after rounding.
 *
 * Always emitted: `i` (index), `n` (name), `r` (role) and `b`
 * ([x, y, w, h] relative to `rootRect`, rounded). Interactive entries also
 * get `s` when a state object exists. With `fields === 'full'` the entry
 * may carry `t` (text, when it differs from the name; cut to 160 chars) and
 * `a` (a whitelist of non-empty attributes).
 *
 * @param {Element} el
 * @param {{left: number, top: number}} rootRect Rect of the capture root.
 * @param {number} i Index to assign.
 * @param {'minimal'|'full'} fields
 * @param {'interactive'|'semantic'} kind
 * @returns {Object|null}
 */
function buildEntry(el, rootRect, i, fields, kind) {
  const rect = el.getBoundingClientRect();
  const box = [
    rect.left - rootRect.left,
    rect.top - rootRect.top,
    rect.width,
    rect.height,
  ].map((value) => Math.round(value));

  const isCollapsed = box[2] <= 0 && box[3] <= 0;
  if (isCollapsed) return null;

  const role = deriveRole(el);
  const name = accessibleName(el);
  const entry = { i, n: name, r: role, b: box };

  if (kind === 'interactive') {
    const state = deriveState(el, role, rect);
    if (state) entry.s = state;
  }

  if (fields !== 'full') return entry;

  const text = (el.textContent || '').replace(/\s+/g, ' ').trim();
  if (text && text !== name) {
    entry.t = text.length > 160 ? text.slice(0, 159) + '…' : text;
  }

  // Keep only attributes that are present, non-empty and not the literal "false".
  const wanted = ['href', 'type', 'name', 'placeholder', 'alt', 'title', 'role', 'aria-label'];
  const attrs = {};
  wanted.forEach((attr) => {
    const value = el.getAttribute(attr);
    if (value && value !== 'false') attrs[attr] = value;
  });
  if (Object.keys(attrs).length) entry.a = attrs;

  return entry;
}
/* ── Annotations ────────────────────────────────── */
/**
 * Appends a full-size, click-through overlay to `clone` containing one
 * numbered badge per entry, centered on the entry's bbox. Entries flagged
 * `isSemanticOnly` get no badge. Does nothing when no entry qualifies.
 *
 * Side effects: sets `clone.style.position` to 'relative' and appends the
 * overlay as the clone's last child.
 *
 * @param {HTMLElement} clone Cloned capture root.
 * @param {Array<{i: number, b: number[]}>} entries Map entries (unscaled bboxes).
 * @param {Object} customStyle Style overrides applied last to every badge.
 */
function addAnnotations(clone, entries, customStyle) {
  const badgeEntries = entries.filter((entry) => !entry.isSemanticOnly);
  if (badgeEntries.length === 0) return;

  const layer = document.createElement('div');
  layer.setAttribute('data-snap-agent-overlay', 'true');
  Object.assign(layer.style, {
    position: 'absolute',
    top: '0',
    left: '0',
    width: '100%',
    height: '100%',
    pointerEvents: 'none',
    zIndex: '2147483647',
    overflow: 'visible',
  });

  // Badge style for a bbox: anchored at the box center, then pulled back by
  // half its own size so the badge itself is centered there.
  const badgeStyleFor = ([x, y, w, h]) => ({
    position: 'absolute',
    left: (x + w / 2) + 'px',
    top: (y + h / 2) + 'px',
    transform: 'translate(-50%, -50%)',
    minWidth: '18px',
    height: '18px',
    lineHeight: '18px',
    fontSize: '11px',
    fontWeight: '700',
    fontFamily: 'system-ui, -apple-system, sans-serif',
    color: '#fff',
    backgroundColor: 'rgba(220, 38, 38, 0.92)',
    borderRadius: '9px',
    textAlign: 'center',
    padding: '0 4px',
    boxSizing: 'border-box',
    boxShadow: '0 1px 3px rgba(0,0,0,0.3)',
    ...customStyle,
  });

  badgeEntries.forEach((entry) => {
    const marker = document.createElement('span');
    marker.textContent = String(entry.i);
    Object.assign(marker.style, badgeStyleFor(entry.b));
    layer.appendChild(marker);
  });

  clone.style.position = 'relative';
  clone.appendChild(layer);
}
+33
-17

@@ -12,9 +12,17 @@ /**

function isDrawElementImageAvailable() {
/**
* The WICG canvas-place-element spec evolved: Chrome ~130+ exposes drawElement(),
* earlier flagged builds shipped drawElementImage(). Detect either.
* @returns {'drawElement'|'drawElementImage'|null}
*/
function detectDrawApi() {
try {
const c = document.createElement('canvas')
const ctx = c.getContext('2d')
return ctx && typeof ctx.drawElementImage === 'function'
if (!ctx) return null
if (typeof ctx.drawElement === 'function') return 'drawElement'
if (typeof ctx.drawElementImage === 'function') return 'drawElementImage'
return null
} catch {
return false
return null
}

@@ -27,5 +35,6 @@ }

export function htmlInCanvasPlugin() {
const available = isDrawElementImageAvailable()
const drawApi = detectDrawApi()
const available = !!drawApi
if (!available) {
console.warn('[snapdom] html-in-canvas plugin: drawElementImage not available. Enable chrome://flags/#canvas-draw-element')
console.warn('[snapdom] html-in-canvas plugin: drawElement / drawElementImage not available. Enable chrome://flags/#canvas-draw-element')
}

@@ -97,23 +106,30 @@

const container = document.createElement('div')
container.id = 'snapdom-html-in-canvas-temp'
container.style.cssText = 'position:fixed;left:-9999px;top:0;visibility:hidden;'
container.appendChild(canvas)
document.body.appendChild(container)
// Append directly to body, taken out of flow with position:fixed + z-index:-1
// so it sits behind the page's content (covered by body/main backgrounds)
// while still being painted. visibility:hidden / opacity:0 / left:-9999px
// skip the paint pass and trigger "No cached paint record".
canvas.style.cssText = 'position:fixed;top:0;left:0;z-index:-1;'
document.body.appendChild(canvas)
try {
await new Promise(r => requestAnimationFrame(r))
canvas.getBoundingClientRect()
await new Promise(r => requestAnimationFrame(() => requestAnimationFrame(r)))
const ctx2d = canvas.getContext('2d')
if (!ctx2d || typeof ctx2d.drawElementImage !== 'function') {
throw new Error('drawElementImage not available')
const fn = ctx2d && (ctx2d[drawApi] || ctx2d.drawElement || ctx2d.drawElementImage)
if (typeof fn !== 'function') {
throw new Error('drawElement / drawElementImage not available on this canvas context')
}
ctx2d.save()
ctx2d.scale(dpr * scale, dpr * scale)
ctx2d.drawElementImage(wrapper, 0, 0, width, height)
fn.call(ctx2d, wrapper, 0, 0, width, height)
ctx2d.restore()
return canvas
} catch (e) {
if (e && /paint record/i.test(e.message || '')) {
throw new Error('Browser had no paint record for the element. Make sure the document is fully loaded and visible before calling html-in-canvas (drawElement requires a real paint pass).')
}
throw e
} finally {
try {
document.body.removeChild(container)
} catch {}
try { canvas.remove() } catch {}
}

@@ -120,0 +136,0 @@ }

@@ -15,3 +15,3 @@ /**

export { pdfImage } from './pdf-image.js';
export { promptExport } from './prompt-export.js';
export { agentMap } from './agent-map.js';
// export { htmlInCanvas } from './html-in-canvas.js';
{
"name": "@zumer/snapdom-plugins",
"version": "1.2.0",
"version": "2.1.0",
"description": "Official plugins for SnapDOM",

@@ -15,3 +15,3 @@ "type": "module",

"./pdf-image": "./pdf-image.js",
"./prompt-export": "./prompt-export.js"
"./agent-map": "./agent-map.js"
},

@@ -18,0 +18,0 @@ "files": [

+44
-60

@@ -173,83 +173,67 @@ # @zumer/snapdom-plugins

### `prompt-export`
### `agent-map`
Adds a `toPrompt()` export method that returns an LLM-ready package: a structured element map with bounding boxes, a pre-formatted prompt text, and (optionally) an annotated screenshot. Tuned for vision-language models, browser-agent pipelines, visual QA, and any workflow that pairs a capture with structured metadata.
Produces a Set-of-Mark package for **visual agents**: an annotated screenshot with numbered badges on interactive elements, plus a compact JSON map from badge index → role / accessible name / bbox / state. One call, fully client-side.
```js
import { promptExport } from '@zumer/snapdom-plugins/prompt-export';
import { agentMap } from '@zumer/snapdom-plugins/agent-map';
const result = await snapdom(el, { plugins: [promptExport()] });
// Default: no image, just the structured map + prompt text (cheapest)
const { elements, prompt, dimensions } = await result.toPrompt();
```
const result = await snapdom(el, { plugins: [agentMap()] });
const { image, map, dimensions } = await result.toAgentMap();
To also include the annotated image (for tasks that truly depend on vision):
```js
const result = await snapdom(el, {
plugins: [promptExport({ include: ['image', 'elements', 'prompt'] })]
});
const { image, elements, prompt, dimensions } = await result.toPrompt();
// image: data URL of the screenshot with numbered red badges overlaid
// map: [{ i, n, r, b, s? }, …] — index, name, role, bbox, state
// Agent says "click element 2" → map[2].b gives [x, y, w, h]
```
The returned object (fields present only if requested via `include`):
Map entry shape (default `fields: 'minimal'`):
| Field | Type | Description |
|-------|------|-------------|
| `elements` | `Array` | One entry per detected element: `{ id, tag, type, name, text, bbox, attributes, state?, styles?, covered? }` |
| `prompt` | `string` | Pre-formatted text describing interactive + semantic elements |
| `image` | `string` | Data URL of the (optionally annotated) screenshot — **only when `include` contains `'image'`** |
| `dimensions` | `{width, height}` | Scaled dimensions (always present) |
| Key | Type | Description |
|-----|------|-------------|
| `i` | `number` | Index matching the badge drawn on the image |
| `n` | `string` | Accessible name (aria-label → labelledby → alt → title → labels → textContent; only the textContent fallback is truncated to 60 chars) |
| `r` | `string` | ARIA-style role (`button`, `link`, `checkbox`, `radio`, `textbox`, `combobox`, `slider`, `heading`, …) — derived from `role` attribute or implicit role of the element |
| `b` | `[x, y, w, h]` | Bounding box in pixels, scaled against `maxImageWidth` |
| `s` | `object?` | State: included only when at least one key is meaningful — `checked`, `disabled`, `focus`, `expanded`, `pressed`, `selected`, `value`, `open`, `selectedText`, `covered` |
`elements` is split into two `type`s:
- `'interactive'` — buttons, links, inputs, `[role]`/`[tabindex]` targets. These get numbered badges overlaid on the screenshot when `annotate` is on.
- `'semantic'` — headings, paragraphs, `<nav>`, `<main>`, images with `alt`, table cells, etc. Structural context, not overlaid.
Example map for a checkout form:
Each `bbox` is in pixel coordinates of the returned image (scaled against `maxImageWidth`).
Each interactive entry also carries:
- `name` — the computed accessible name (aria-label → labelledby → alt → title → labels[0] → textContent)
- `state` — runtime state: `{ checked, disabled, focus, open, value, selectedText }` (only keys that apply)
- `styles` — visually-meaningful computed props filtered to drop defaults
- `covered: true` when another element is painted on top of the bbox center (an agent won't click through a modal)
```js
// Example — feed a vision-capable LLM
const { image, elements } = await result.toPrompt({
include: ['image', 'elements', 'prompt']
});
// image is a data URL → pass as image input
// elements is JSON → pass as structured context alongside the image
// "Click element [3]" → look up elements[3].bbox for real coordinates
[
{ i: 0, n: 'Email', r: 'textbox', b: [28, 80, 280, 34], s: { value: 'ada@example.com' } },
{ i: 1, n: 'Send product updates', r: 'checkbox', b: [28, 134, 13, 13], s: { checked: true } },
{ i: 2, n: 'Apply coupon', r: 'button', b: [28, 176, 114, 38], s: { expanded: false } },
{ i: 3, n: 'Remove coupon', r: 'button', b: [150, 176, 140, 38], s: { disabled: true } },
{ i: 4, n: 'Pay $53.90', r: 'button', b: [28, 220, 97, 38] }
]
```
#### Options
| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `include` | `string[]` | `['elements', 'prompt']` | Fields to return. Add `'image'` for tasks that need vision (chart content, layout QA, canvas). Use `['prompt']` for the cheapest text-only mode. |
| `annotate` | `boolean` | `true` | Overlay numbered badges on interactive elements (only affects the image when included) |
| `promptMode` | `'compact' \| 'verbose'` | `'compact'` | Prompt text verbosity. Compact omits coords when badges are on the image. |
| `includeCoords` | `boolean` | `true` | Include bbox in the prompt text |
| `imageFormat` | `'png' \| 'jpg' \| 'webp'` | `'png'` | Output image format (only used when `image` is included) |
| `imageQuality` | `number` | `0.8` | Quality for lossy formats (0–1) |
| `maxImageWidth` | `number` | `1024` | Max width in px; downscales and rescales bboxes if larger |
| `interactiveSelector` | `string` | see below | CSS selector for the interactive element set |
| `semanticSelector` | `string` | see below | CSS selector for the semantic element set |
| `labelStyle` | `object` | `{}` | Override styles for the numbered badges (`position`, `color`, `backgroundColor`, etc.) |
| `image` | `'annotated' \| 'raw' \| false` | `'annotated'` | `'annotated'` overlays numbered badges; `'raw'` skips badges; `false` skips image generation entirely (no canvas draw, no toDataURL — cheapest path). |
| `fields` | `'minimal' \| 'full'` | `'minimal'` | `'full'` adds `t` (raw text content) and `a` (meaningful attributes) per entry. |
| `semantic` | `boolean` | `false` | Include non-interactive structural elements (headings, paragraphs, landmarks). Off by default — agents act on interactive. |
| `maxImageWidth` | `number` | `1024` | Downscale target for the image; bboxes rescale to match. |
| `imageFormat` | `'png' \| 'jpg' \| 'webp'` | `'png'` | Image format (only used when image is rendered). |
| `imageQuality` | `number` | `0.8` | Quality for lossy formats. |
| `interactiveSelector` | `string` | see below | CSS selector for interactive elements. |
| `semanticSelector` | `string` | see below | CSS selector for semantic elements (used when `semantic: true`). |
| `labelStyle` | `object` | `{}` | Override badge styles. |
Defaults:
- **interactive**: `a[href], button, input, select, textarea, [role="button"|"link"|"tab"|"menuitem"|"checkbox"|"radio"], [tabindex]:not([tabindex="-1"]), summary, [contenteditable="true"]`
- **semantic**: `h1–h6, p, li, img[alt], nav, main, article, section, header, footer, label, td, th, figcaption, blockquote, legend`
- **interactive**: `a[href], button, input, select, textarea, [role="button"|"link"|"tab"|"menuitem"|"checkbox"|"radio"|"switch"|"slider"|"combobox"|"textbox"], [tabindex]:not([tabindex="-1"]), summary, [contenteditable="true"]`
- **semantic**: `h1–h6, nav, main, article, section, header, footer, figcaption, blockquote, legend, p`
Both per-call options (`opts.include`, `opts.imageFormat`, etc.) and constructor options are supported; per-call wins.
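Per-call options (`image`, `imageFormat`, `imageQuality`, `maxImageWidth`) override constructor options (e.g. `result.toAgentMap({ image: false })`). Badges are drawn during capture, so a per-call `image` value can skip the image but cannot add or remove badges; `fields`, `semantic`, the selectors and `labelStyle` can only be set in the constructor.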
The image is the most expensive part of `toPrompt()` to produce (canvas draw + data-URL serialization), so the default skips it. Add `'image'` to `include` when the task actually uses vision:
#### When to use
```js
// Vision-dependent task (chart content, layout QA, visual diff)
await result.toPrompt({ include: ['image', 'elements', 'prompt'] });
- Visual agents using Set-of-Mark prompting — one call gives you both the labelled image and the coordinate lookup table.
- Computer-use / browser-agent harnesses that need click coordinates for a vision model's output.
- Visual QA with an LLM judge — compare before/after captures with structured element identity.
- Dataset generation for vision-LLM fine-tuning — (image, map) pairs.
// Pure structured agent loop (cheapest)
await result.toPrompt({ include: ['prompt'] });
```
Because it runs entirely in the browser, it works in contexts where Playwright / Puppeteer can't: Chrome extensions, SaaS web apps capturing the user's own page, Electron apps capturing their own window.

@@ -256,0 +240,0 @@ ---

/**
* promptExport – Official SnapDOM Plugin
* Produces an LLM-ready package: annotated screenshot + structured element
* map + prompt text. Tuned for vision-capable LLMs reading the image + map
* together (Set-of-Mark pattern).
*
* Usage:
* import { promptExport } from '@zumer/snapdom-plugins/prompt-export';
* const result = await snapdom(el, { plugins: [promptExport()] });
* const { image, elements, dimensions, prompt } = await result.toPrompt();
*
* @param {Object} [options]
* @param {boolean} [options.annotate=true] - Overlay numbered badges on interactive elements
* @param {string} [options.imageFormat='png'] - Output image format ('png'|'jpg'|'webp')
* @param {number} [options.imageQuality=0.8] - Quality for lossy formats (0..1)
* @param {number} [options.maxImageWidth=1024] - Max width in px (downscales if larger)
* @param {string} [options.interactiveSelector] - Custom CSS selector for interactive elements
* @param {string} [options.semanticSelector] - Custom CSS selector for semantic elements
* @param {Object} [options.labelStyle={}] - Override styles for annotation badges
* @param {'compact'|'verbose'} [options.promptMode='compact'] - Prompt text verbosity
* @param {boolean} [options.includeCoords=true] - Include bbox in the prompt text
* @param {string[]} [options.include] - Which fields to return. Default
* ['elements', 'prompt']. For vision-dependent tasks (chart content, layout QA,
* canvas) pass ['image', 'elements', 'prompt'] or add 'image' to the array. For
* text-only agent prompts pass ['prompt'] (cheapest — skips canvas draw entirely).
* Accepted values: 'image', 'elements', 'prompt'.
* @returns {Object} SnapDOM plugin
*/
// Default selector for elements an agent can act on.
const DEFAULT_INTERACTIVE = [
  'a[href]',
  'button',
  'input',
  'select',
  'textarea',
  ...['button', 'link', 'tab', 'menuitem', 'checkbox', 'radio'].map((role) => `[role="${role}"]`),
  '[tabindex]:not([tabindex="-1"])',
  'summary',
  '[contenteditable="true"]',
].join(', ');

// Default selector for structural, non-interactive elements.
const DEFAULT_SEMANTIC = [
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'p', 'li', 'img[alt]',
  'nav', 'main', 'article', 'section', 'header', 'footer',
  'label', 'td', 'th',
  'figcaption', 'blockquote', 'legend',
].join(', ');

// Attribute names of interest when describing an element.
const COLLECTED_ATTRS = [
  'role',
  'aria-label',
  'aria-expanded',
  'aria-checked',
  'aria-disabled',
  'alt',
  'href',
  'placeholder',
  'name',
  'type',
  'value',
  'title',
  'disabled',
];

// Computed-style properties considered visually meaningful.
const VISUAL_FIELDS = [
  'display',
  'visibility',
  'opacity',
  'color',
  'backgroundColor',
  'fontSize',
  'fontWeight',
  'cursor',
  'overflow',
];
// Common computed-style values that carry no information. Keeping the
// `styles` object small is the difference between a useful LLM input and
// token bloat on every element.
const VISUAL_SKIP = new Set(['initial', 'normal', 'visible', 'auto', 'static', '0']);

/**
 * Tells whether a computed-style value is an uninformative default that
 * should be left out of an element's `styles` object.
 *
 * @param {string} prop Camel-cased style property name (e.g. 'fontWeight').
 * @param {string} value Computed value for `prop`.
 * @returns {boolean} true when the value should be dropped.
 */
function isDefaultStyleValue(prop, value) {
  if (!value) return true;
  // opacity "0" means fully transparent — the opposite of a default. It must
  // be checked before the generic skip list, whose '0' entry would hide it.
  if (prop === 'opacity' && value === '0') return false;
  if (VISUAL_SKIP.has(value)) return true;
  if (prop === 'cursor' && value === 'none') return true;
  if (prop === 'color' && value === 'rgb(0, 0, 0)') return true;
  if (prop === 'backgroundColor' && value === 'rgba(0, 0, 0, 0)') return true;
  if (prop === 'fontWeight' && (value === '400' || value === 'normal')) return true;
  if (prop === 'opacity' && value === '1') return true;
  return false;
}
// Default omits 'image'. Benchmarking showed the text + JSON map is enough
// to answer most UI-inspection questions and uses ~14× fewer tokens. Pass
// `include: ['image', 'elements', 'prompt']` explicitly when the task truly
// depends on vision (charts, canvas content, layout QA).
const DEFAULT_INCLUDE = ['elements', 'prompt'];

/**
 * Builds the prompt-export plugin object (option reference: see the file
 * header comment). `afterClone` extracts element metadata and optionally
 * annotates the clone; the `prompt` export returns only the fields requested
 * through `include`, with every bbox rescaled to the output size.
 */
export function promptExport(options = {}) {
  const {
    annotate = true,
    imageFormat = 'png',
    imageQuality = 0.8,
    maxImageWidth = 1024,
    interactiveSelector = DEFAULT_INTERACTIVE,
    semanticSelector = DEFAULT_SEMANTIC,
    labelStyle = {},
    promptMode = 'compact',
    includeCoords = true,
    include = DEFAULT_INCLUDE,
  } = options;
  return {
    name: 'prompt-export',
    afterClone(ctx) {
      const meta = extractMetadata(ctx.element, interactiveSelector, semanticSelector);
      // snapdom spreads a fresh ctx for the export phase from ctx.options,
      // so write to both so the prompt() call below can read it.
      ctx.__promptMetadata = meta;
      if (ctx.options) ctx.options.__promptMetadata = meta;
      if (annotate) {
        addAnnotations(ctx.clone, meta.elements, labelStyle);
      }
    },
    defineExports() {
      return {
        /**
         * @param {Object} ctx Export context; reads `ctx.__promptMetadata` and `ctx.export.url`.
         * @param {Object} [opts] Per-call overrides: include, imageFormat,
         *   imageQuality, maxImageWidth, promptMode, includeCoords.
         * @returns {Promise<Object>} `dimensions` plus the requested fields.
         */
        prompt: async (ctx, opts = {}) => {
          const meta = ctx.__promptMetadata;
          const wantSet = new Set(opts.include || include || DEFAULT_INCLUDE);
          const wantImage = wantSet.has('image');
          const wantElements = wantSet.has('elements');
          const wantPrompt = wantSet.has('prompt');
          // Nothing was detected: return empty placeholders for each field.
          if (!meta || !meta.elements.length) {
            const empty = { dimensions: { width: 0, height: 0 } };
            if (wantImage) empty.image = ctx.export.url;
            if (wantElements) empty.elements = [];
            if (wantPrompt) empty.prompt = '';
            return empty;
          }
          const format = opts.imageFormat || imageFormat;
          // 0 is a valid quality for lossy formats, so `||` must not be used
          // here: it would silently replace a per-call 0 with the default.
          const quality = Number.isFinite(opts.imageQuality) ? opts.imageQuality : imageQuality;
          const maxWidth = opts.maxImageWidth || maxImageWidth;
          const mode = opts.promptMode || promptMode;
          const withCoords = opts.includeCoords !== undefined ? opts.includeCoords : includeCoords;
          // Only load + rasterize the SVG when the caller actually wants the
          // image. Skipping saves the img decode + canvas draw + toDataURL —
          // the most expensive steps of this export.
          let w, h, dataURL;
          if (wantImage) {
            const img = new Image();
            // Attach the handlers before assigning src so a load/error event
            // can never be missed.
            const loaded = new Promise((res, rej) => { img.onload = res; img.onerror = rej; });
            img.src = ctx.export.url;
            await loaded;
            const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
            w = Math.round(img.naturalWidth * ratio);
            h = Math.round(img.naturalHeight * ratio);
            const canvas = document.createElement('canvas');
            canvas.width = w;
            canvas.height = h;
            canvas.getContext('2d').drawImage(img, 0, 0, w, h);
            const mime =
              format === 'jpg' || format === 'jpeg' ? 'image/jpeg'
                : format === 'webp' ? 'image/webp'
                  : 'image/png';
            dataURL = canvas.toDataURL(mime, quality);
          } else {
            // No image — scale bboxes to the same target width the image would
            // have used, so downstream callers can still render the map over
            // a separately-rendered screenshot at the same scale.
            const sourceW = meta.dimensions.width || 1;
            const ratio = sourceW > maxWidth ? maxWidth / sourceW : 1;
            w = Math.round(sourceW * ratio);
            h = Math.round(meta.dimensions.height * ratio);
          }
          // Per-axis factors from measured element size to output size.
          const sx = w / (meta.dimensions.width || 1);
          const sy = h / (meta.dimensions.height || 1);
          const scaledElements = meta.elements.map((el) => ({
            ...el,
            bbox: {
              x: Math.round(el.bbox.x * sx),
              y: Math.round(el.bbox.y * sy),
              width: Math.round(el.bbox.width * sx),
              height: Math.round(el.bbox.height * sy),
            },
          }));
          // Verbose prompts always carry coordinates; compact prompts carry
          // them only when requested and no badges were drawn.
          const emitCoords = mode === 'verbose' ? true : (withCoords && !annotate);
          const promptText = (wantPrompt)
            ? (mode === 'verbose'
              ? formatPromptVerbose(scaledElements, { width: w, height: h }, emitCoords)
              : formatPromptCompact(scaledElements, { width: w, height: h }, emitCoords))
            : null;
          const out = { dimensions: { width: w, height: h } };
          if (wantImage) out.image = dataURL;
          if (wantElements) out.elements = scaledElements;
          if (wantPrompt) out.prompt = promptText;
          return out;
        },
      };
    },
  };
}
/* ── Accessible name ──────────────────────────── */
/**
 * Shortens `str` to at most `max` characters, replacing the tail with a
 * single ellipsis character when it does not fit. Falsy input yields ''.
 *
 * @param {string} str
 * @param {number} max
 * @returns {string}
 */
function truncate(str, max) {
  if (!str) return '';
  const fits = str.length <= max;
  return fits ? str : str.slice(0, max - 1) + '…';
}
/**
 * Computes a simplified accessible name for an element — the label an LLM
 * agent reads first to identify it.
 *
 * Sources are tried in order and the first non-empty one wins:
 * aria-label → aria-labelledby targets → alt (IMG only) → title →
 * first associated <label> → the element's trimmed text content, shortened
 * to 40 characters via `truncate`. Returns '' when every source is empty.
 *
 * @param {Element} el
 * @returns {string}
 */
function accessibleName(el) {
  // Attribute value with surrounding whitespace removed; '' when absent.
  const trimmedAttr = (attr) => {
    const raw = el.getAttribute(attr);
    return raw ? raw.trim() : '';
  };

  const fromAriaLabel = trimmedAttr('aria-label');
  if (fromAriaLabel) return fromAriaLabel;

  const idRefs = el.getAttribute('aria-labelledby');
  if (idRefs) {
    // Resolve ids against the element's own root when it supports lookups,
    // otherwise against the global document.
    const scope = el.getRootNode();
    const pieces = [];
    for (const id of idRefs.trim().split(/\s+/)) {
      const target =
        scope && typeof scope.getElementById === 'function'
          ? scope.getElementById(id)
          : document.getElementById(id);
      const piece = target ? (target.textContent || '').trim() : '';
      if (piece) pieces.push(piece);
    }
    if (pieces.length) return pieces.join(' ');
  }

  if (el.tagName === 'IMG') {
    const fromAlt = trimmedAttr('alt');
    if (fromAlt) return fromAlt;
  }

  const fromTitle = trimmedAttr('title');
  if (fromTitle) return fromTitle;

  if (el.labels && el.labels[0]) {
    const fromLabel = (el.labels[0].textContent || '').trim();
    if (fromLabel) return fromLabel;
  }

  const ownText = (el.textContent || '').trim();
  if (!ownText) return '';
  return truncate(ownText, 40);
}
/* ── DOM state ────────────────────────────────── */
/**
 * Capture real runtime state (not just ARIA attributes). This is what
 * separates a useful map from a static screenshot annotation: an agent can
 * see "checkbox is unchecked / select has value=X / details is closed".
 *
 * Keys that may appear on the result:
 *   - `checked`, `disabled`, `focus`: `true` when the element matches the
 *     corresponding pseudo-class
 *   - `value`: current value of an <input> (except checkbox, radio and
 *     password), a <textarea>, or a <select>
 *   - `selectedText`: text of the currently selected <option>
 *   - `open`: open flag of <details>, or `true` for any other element that
 *     carries an `open` attribute
 *
 * @param {Element} el Element to inspect.
 * @returns {Object|null} The state object, or null when nothing was recorded.
 */
function computeState(el) {
  const state = {};
  // Probe each pseudo-class on its own so that one selector failing on an
  // exotic node does not hide the others.
  const matches = (selector) => {
    try {
      return el.matches(selector);
    } catch {
      return false;
    }
  };
  if (matches(':checked')) state.checked = true;
  if (matches(':disabled')) state.disabled = true;
  if (matches(':focus')) state.focus = true;

  const tag = el.tagName;
  if (tag === 'INPUT') {
    const type = (el.type || 'text').toLowerCase();
    // Checkbox/radio: `value` defaults to "on" when no explicit value attr
    // is set — noise; the meaningful signal is `checked`.
    // Password: the typed secret must never be copied into a map that is
    // shipped to a model or stored in a dataset.
    const skipValue = type === 'checkbox' || type === 'radio' || type === 'password';
    if (!skipValue && el.value) state.value = el.value;
  } else if (tag === 'TEXTAREA') {
    if (el.value) state.value = el.value;
  } else if (tag === 'SELECT') {
    state.value = el.value;
    const opt = el.options && el.options[el.selectedIndex];
    if (opt) state.selectedText = opt.text || '';
  } else if (tag === 'DETAILS') {
    state.open = !!el.open;
  } else if (el.hasAttribute && el.hasAttribute('open')) {
    state.open = true;
  }
  return Object.keys(state).length ? state : null;
}
/* ── Visual styles ────────────────────────────── */
/**
 * Collect the visually-meaningful computed styles of an element.
 *
 * Every property listed in `VISUAL_FIELDS` is read from the element's
 * computed style; properties that `isDefaultStyleValue` reports as default
 * are left out. Meant to be called with the live element rather than a
 * detached clone, so the values reflect what is actually rendered.
 *
 * @param {Element} el Element whose computed style is read.
 * @returns {Object|null} Map of property → value, or null when the computed
 *   style is unavailable or every inspected property is a default.
 */
function computeVisualStyles(el) {
  let computed = null;
  try {
    computed = getComputedStyle(el);
  } catch {
    return null;
  }
  if (!computed) return null;

  const kept = {};
  let keptCount = 0;
  for (const field of VISUAL_FIELDS) {
    const value = computed[field];
    if (!isDefaultStyleValue(field, value)) {
      kept[field] = value;
      keptCount += 1;
    }
  }
  return keptCount > 0 ? kept : null;
}
/* ── Covered detection ────────────────────────── */
/**
 * Report whether some other element is painted over the center point of
 * `el`. An agent that knows a button sits under a modal won't try to click it.
 *
 * The check hit-tests the center of `rect` with `elementFromPoint` on the
 * element's owner document (falling back to the global document). Hitting the
 * element itself or one of its descendants does not count as covered.
 *
 * @param {Element} el Element being tested.
 * @param {{left: number, top: number, width: number, height: number}} rect
 *   Bounding rect of `el`.
 * @returns {boolean} true only when the hit test lands on an element outside
 *   `el`; false for zero-sized rects, negative center coordinates, a missing
 *   `elementFromPoint`, or an empty hit result.
 */
function isCovered(el, rect) {
  const { left, top, width, height } = rect;
  if (!width || !height) return false;

  const centerX = left + width / 2;
  const centerY = top + height / 2;
  if (centerX < 0 || centerY < 0) return false;

  const owner = el.ownerDocument || document;
  if (!owner.elementFromPoint) return false;

  const hit = owner.elementFromPoint(centerX, centerY);
  const missedOrSelf = !hit || hit === el;
  if (missedOrSelf) return false;

  const hitIsInside = el.contains && el.contains(hit);
  return !hitIsInside;
}
/* ── Metadata extraction ──────────────────────── */
/**
 * Walk the descendants of `element` and build one map entry per matching
 * element: first everything matching `interactiveSelector` (type
 * 'interactive'), then everything matching `semanticSelector` that was not
 * already recorded (type 'semantic'). Ids are assigned sequentially from 0 in
 * that order; elements for which `buildEntry` returns null get no id.
 *
 * @param {Element} element Capture root. Only its descendants are scanned.
 * @param {string} interactiveSelector CSS selector for interactive elements.
 * @param {string} semanticSelector CSS selector for semantic elements. An
 *   empty or missing selector skips that pass.
 * @returns {{elements: Object[], dimensions: {width: number, height: number}}}
 *   Collected entries plus the size of the root's bounding rect.
 */
function extractMetadata(element, interactiveSelector, semanticSelector) {
  const rootRect = element.getBoundingClientRect();
  const elements = [];
  const tracked = new Set();

  const collect = (selector, type) => {
    // querySelectorAll('') throws a SyntaxError; an absent selector simply
    // means "nothing of this kind was requested".
    if (!selector) return;
    for (const el of element.querySelectorAll(selector)) {
      if (tracked.has(el)) continue;
      // The next id is always the number of entries emitted so far.
      const entry = buildEntry(el, rootRect, elements.length, type);
      if (!entry) continue;
      elements.push(entry);
      tracked.add(el);
    }
  };

  collect(interactiveSelector, 'interactive');
  collect(semanticSelector, 'semantic');

  return {
    elements,
    dimensions: { width: rootRect.width, height: rootRect.height },
  };
}
/**
 * Build the map entry for a single element.
 *
 * @param {Element} el Element to describe.
 * @param {{left: number, top: number}} rootRect Bounding rect of the capture
 *   root; the entry's bbox is expressed relative to its top-left corner.
 * @param {number} id Index assigned to this entry.
 * @param {'interactive'|'semantic'} type Entry category. Interactive entries
 *   additionally get `state` and `covered`, and a longer text limit.
 * @returns {Object|null} `{id, tag, type, name, text, bbox, attributes}` plus
 *   optional `styles`, `state` and `covered`; null when the rounded bbox has
 *   neither width nor height.
 */
function buildEntry(el, rootRect, id, type) {
  const rect = el.getBoundingClientRect();
  const bbox = {
    x: Math.round(rect.left - rootRect.left),
    y: Math.round(rect.top - rootRect.top),
    width: Math.round(rect.width),
    height: Math.round(rect.height),
  };
  if (bbox.width <= 0 && bbox.height <= 0) return null;

  const tag = el.tagName.toLowerCase();
  const maxText = type === 'interactive' ? 200 : 120;
  const text = (el.textContent || '').trim().slice(0, maxText);

  // Keep only attributes that carry information: missing, empty and
  // explicit "false" values are dropped.
  const attributes = {};
  for (const attr of COLLECTED_ATTRS) {
    const val = el.getAttribute(attr);
    if (val == null || val === '' || val === 'false') continue;
    attributes[attr] = val;
  }

  // For <img>, replace the (potentially long, cache-busted) src with just the
  // filename — "logo.svg" is more useful to an LLM than the full URL.
  if (tag === 'img' && el.src) {
    try {
      const base = (el.ownerDocument && el.ownerDocument.location)
        ? el.ownerDocument.location.href
        : (typeof location !== 'undefined' ? location.href : undefined);
      const url = base ? new URL(el.src, base) : new URL(el.src);
      if (url.protocol === 'data:') {
        // A data: URI has no filename, and its base64 payload may contain
        // '/', so the "last path segment" would be an arbitrary (and possibly
        // huge) slice of image bytes. Keep only the media type.
        const mediaType = url.pathname.split(/[;,]/)[0];
        attributes.src = mediaType ? `data:${mediaType}` : 'data:';
      } else {
        attributes.src = url.pathname.split('/').pop() || el.src;
      }
    } catch {
      // Unparseable URL: keep the raw value rather than losing it.
      attributes.src = el.src;
    }
  }

  const entry = {
    id,
    tag,
    type,
    name: accessibleName(el),
    text,
    bbox,
    attributes,
  };

  const styles = computeVisualStyles(el);
  if (styles) entry.styles = styles;

  if (type === 'interactive') {
    const state = computeState(el);
    if (state) {
      // Drop state.value when it just echoes attributes.value — no
      // divergence between the initial HTML attribute and the current
      // property, nothing for the LLM to learn from the repeat.
      if (state.value !== undefined && state.value === attributes.value) {
        delete state.value;
      }
      if (Object.keys(state).length) entry.state = state;
    }
    if (isCovered(el, rect)) entry.covered = true;
  }
  return entry;
}
/* ── Visual annotations ───────────────────────── */
/**
 * Draw a numbered badge over every interactive element of the clone.
 *
 * A full-size, click-through overlay <div> is appended to `clone`, holding one
 * <span> badge per interactive entry, centered on that entry's bbox and
 * showing its id. The clone is switched to `position: relative` so the
 * overlay's absolute positioning is resolved against it. When there are no
 * interactive entries nothing is created or modified.
 *
 * @param {HTMLElement} clone Cloned root that receives the overlay.
 * @param {Object[]} elements Map entries; only `type === 'interactive'` ones
 *   are annotated.
 * @param {Object} [customStyle] Style properties merged over the default
 *   badge style (custom keys win).
 * @returns {void}
 */
function addAnnotations(clone, elements, customStyle) {
  const targets = elements.filter(({ type }) => type === 'interactive');
  if (targets.length === 0) return;

  const layer = document.createElement('div');
  layer.setAttribute('data-snap-prompt-overlay', 'true');
  const layerStyle = {
    position: 'absolute',
    top: '0',
    left: '0',
    width: '100%',
    height: '100%',
    pointerEvents: 'none',
    zIndex: '2147483647',
    overflow: 'visible',
  };
  Object.assign(layer.style, layerStyle);

  targets.forEach(({ id, bbox }) => {
    const label = String(id);
    const marker = document.createElement('span');
    marker.textContent = label;
    marker.setAttribute('data-snap-prompt-label', label);

    // Anchor at the bbox center; the translate pulls the badge back by half
    // of its own size so it ends up centered rather than hanging off the
    // anchor's bottom-right.
    const centerX = bbox.x + bbox.width / 2;
    const centerY = bbox.y + bbox.height / 2;
    const markerStyle = {
      position: 'absolute',
      left: `${centerX}px`,
      top: `${centerY}px`,
      transform: 'translate(-50%, -50%)',
      minWidth: '18px',
      height: '18px',
      lineHeight: '18px',
      fontSize: '11px',
      fontWeight: '700',
      fontFamily: 'system-ui, -apple-system, sans-serif',
      color: '#fff',
      backgroundColor: 'rgba(220, 38, 38, 0.92)',
      borderRadius: '9px',
      textAlign: 'center',
      padding: '0 4px',
      boxSizing: 'border-box',
      boxShadow: '0 1px 3px rgba(0,0,0,0.3)',
      ...customStyle,
    };
    Object.assign(marker.style, markerStyle);
    layer.appendChild(marker);
  });

  clone.style.position = 'relative';
  clone.appendChild(layer);
}
/* ── Prompt text formatters ───────────────────── */
/**
 * Render a state object as a compact inline suffix for a prompt line.
 *
 * Keys whose value is exactly `true` are printed as bare flags, in key order;
 * every other key follows as `key=<JSON value>`.
 *
 * @param {Object|null|undefined} state State object of an entry.
 * @returns {string} e.g. ` {checked, value="abc"}` (with a leading space), or
 *   '' when there is nothing to show.
 */
function stateToStr(state) {
  if (!state) return '';

  const entries = Object.entries(state);
  const flagNames = entries
    .filter(([, value]) => value === true)
    .map(([key]) => key);
  const assignments = entries
    .filter(([, value]) => value !== true)
    .map(([key, value]) => `${key}=${JSON.stringify(value)}`);

  const parts = flagNames.concat(assignments);
  return parts.length === 0 ? '' : ` {${parts.join(', ')}}`;
}
/**
 * Render a bbox as an inline prompt suffix: ` (x,y width×height)`.
 *
 * @param {{x: number, y: number, width: number, height: number}} bbox
 * @returns {string} The formatted box, with a leading space.
 */
function coordsToStr(bbox) {
  const { x, y, width, height } = bbox;
  const origin = `${x},${y}`;
  const size = `${width}×${height}`;
  return ` (${origin} ${size})`;
}
/**
 * Build the compact text prompt: a one-line header followed by an
 * "Interactive:" list and a "Semantic:" list, one element per line.
 *
 * Interactive lines read `[id] tag "name" {state} (covered) (coords)`, where
 * every part after the tag is optional; semantic lines read `[id] tag "name"`.
 * Names come from page content, so they are quoted with JSON string escaping:
 * an embedded quote or line break cannot terminate the name early or forge an
 * extra entry line. Names without such characters are printed unchanged.
 *
 * @param {Object[]} elements Map entries (`type` is 'interactive' or 'semantic').
 * @param {{width: number, height: number}} dimensions Screenshot size in px.
 * @param {boolean} withCoords Append each interactive element's bbox.
 * @returns {string} The prompt text, lines joined with '\n'.
 */
function formatPromptCompact(elements, dimensions, withCoords) {
  const lines = [`Screenshot (${dimensions.width}×${dimensions.height}px).`];
  const quoteName = (name) => (name ? ` ${JSON.stringify(truncate(name, 60))}` : '');
  const interactive = elements.filter((e) => e.type === 'interactive');
  const semantic = elements.filter((e) => e.type === 'semantic');

  if (interactive.length) {
    lines.push('', 'Interactive:');
    for (const el of interactive) {
      const name = quoteName(el.name);
      const st = stateToStr(el.state);
      const cov = el.covered ? ' (covered)' : '';
      const coords = withCoords ? coordsToStr(el.bbox) : '';
      lines.push(` [${el.id}] ${el.tag}${name}${st}${cov}${coords}`);
    }
  }

  if (semantic.length) {
    lines.push('', 'Semantic:');
    for (const el of semantic) {
      lines.push(` [${el.id}] ${el.tag}${quoteName(el.name)}`);
    }
  }
  return lines.join('\n');
}
/**
 * Build the verbose text prompt: a header line, then an
 * "Interactive elements:" section and a "Semantic structure:" section, each
 * followed by a blank line.
 *
 * Interactive lines read
 * `[id] <tag> "name" (coords) attr="value"... {state} (covered)`; semantic
 * lines read `[id] <tag> "name"` followed by `alt` and `role` when present.
 * Names and attribute values come from page content, so they are quoted with
 * JSON string escaping: an embedded quote or line break cannot end the value
 * early or forge an extra entry line. Values without such characters are
 * printed unchanged.
 *
 * @param {Object[]} elements Map entries; each must carry an `attributes` object.
 * @param {{width: number, height: number}} dimensions Screenshot size in px.
 * @param {boolean} withCoords Include each interactive element's bbox.
 * @returns {string} The prompt text, lines joined with '\n'.
 */
function formatPromptVerbose(elements, dimensions, withCoords) {
  const lines = [
    `Screenshot of a web page (${dimensions.width}×${dimensions.height}px).`,
    '',
  ];
  const quote = (value) => JSON.stringify(String(value));
  const quoteName = (name) => (name ? ` ${quote(truncate(name, 80))}` : '');
  const interactive = elements.filter((e) => e.type === 'interactive');
  const semantic = elements.filter((e) => e.type === 'semantic');

  if (interactive.length) {
    lines.push('Interactive elements:');
    for (const el of interactive) {
      const name = quoteName(el.name);
      const attrParts = Object.entries(el.attributes).map(([k, v]) => `${k}=${quote(v)}`);
      const attrs = attrParts.length ? ` ${attrParts.join(' ')}` : '';
      const pos = withCoords ? coordsToStr(el.bbox) : '';
      const st = stateToStr(el.state);
      const cov = el.covered ? ' (covered)' : '';
      lines.push(` [${el.id}] <${el.tag}>${name}${pos}${attrs}${st}${cov}`);
    }
    lines.push('');
  }

  if (semantic.length) {
    lines.push('Semantic structure:');
    for (const el of semantic) {
      const name = quoteName(el.name);
      // Only the two attributes that describe meaning are worth the tokens.
      const attrParts = [];
      if (el.attributes.alt) attrParts.push(`alt=${quote(el.attributes.alt)}`);
      if (el.attributes.role) attrParts.push(`role=${quote(el.attributes.role)}`);
      const attrs = attrParts.length ? ` ${attrParts.join(' ')}` : '';
      lines.push(` [${el.id}] <${el.tag}>${name}${attrs}`);
    }
    lines.push('');
  }
  return lines.join('\n');
}