@zumer/snapdom-plugins - npm Package Compare versions

Comparing version

1.2.0

2.1.0

+376

agent-map.js

		/**
		* agentMap – Official SnapDOM Plugin
		*
		* Produces a Set-of-Mark package for visual agents: an annotated screenshot
		* with numbered badges on interactive elements, plus a compact JSON map from
		* badge index → role / name / bbox / state. Designed for one-call capture
		* on the client side — visual agents, computer-use harnesses, dataset
		* generation for vision training, visual QA.
		*
		* Usage:
		* import { agentMap } from '@zumer/snapdom-plugins/agent-map';
		* const result = await snapdom(el, { plugins: [agentMap()] });
		* const { image, map, dimensions } = await result.toAgentMap();
		*
		* // model reply: "click element 2" → map[2].b gives [x, y, w, h]
		*
		* @param {Object} [options]
		* @param {'annotated'|'raw'|false} [options.image='annotated'] Image output
		* mode. 'annotated' draws numbered badges on the rendered image, 'raw'
		* returns the image without badges, false skips image generation (cheapest).
		* @param {'minimal'|'full'} [options.fields='minimal'] Per-entry shape.
		* 'minimal' returns {i, n, r, b, s?}. 'full' adds {t (text), a (attrs)}.
		* @param {boolean} [options.semantic=false] Include non-interactive semantic
		* elements (headings, paragraphs, nav, main, landmarks). Off by default —
		* agents typically only act on interactive.
		* @param {number} [options.maxImageWidth=1024] Downscale target for the image.
		* @param {'png'|'jpg'|'webp'} [options.imageFormat='png'] Image format.
		* @param {number} [options.imageQuality=0.8] Quality for lossy formats.
		* @param {string} [options.interactiveSelector] CSS selector (default below).
		* @param {string} [options.semanticSelector] CSS selector (default below).
		* @param {Object} [options.labelStyle={}] Override badge styles.
		* @returns {Object} SnapDOM plugin
		*/

		// Default selector for actionable elements: links and native form controls,
		// ARIA widget roles, focusable nodes, <summary>, and editable regions.
		const DEFAULT_INTERACTIVE = [
		  'a[href]',
		  'button',
		  'input',
		  'select',
		  'textarea',
		  '[role="button"]',
		  '[role="link"]',
		  '[role="tab"]',
		  '[role="menuitem"]',
		  '[role="checkbox"]',
		  '[role="radio"]',
		  '[role="switch"]',
		  '[role="slider"]',
		  '[role="combobox"]',
		  '[role="textbox"]',
		  '[tabindex]:not([tabindex="-1"])',
		  'summary',
		  '[contenteditable="true"]',
		].join(', ');

		// Default selector for structural elements, used only when the `semantic`
		// option is enabled.
		const DEFAULT_SEMANTIC = [
		  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		  'nav', 'main', 'article', 'section', 'header', 'footer',
		  'figcaption', 'blockquote', 'legend', 'p',
		].join(', ');

		export function agentMap(options = {}) {
		const {
		image = 'annotated',
		fields = 'minimal',
		semantic = false,
		maxImageWidth = 1024,
		imageFormat = 'png',
		imageQuality = 0.8,
		interactiveSelector = DEFAULT_INTERACTIVE,
		semanticSelector = DEFAULT_SEMANTIC,
		labelStyle = {},
		} = options;

		return {
		name: 'agent-map',

		afterClone(ctx) {
		const meta = extractMap(
		ctx.element,
		interactiveSelector,
		semantic ? semanticSelector : null,
		fields
		);
		// snapdom's export ctx is a fresh spread of ctx.options, so we stash on
		// both for the agentMap() call below to find it.
		ctx.__agentMapMeta = meta;
		if (ctx.options) ctx.options.__agentMapMeta = meta;

		if (image === 'annotated') {
		addAnnotations(ctx.clone, meta.map, labelStyle);
		}
		},

		defineExports() {
		return {
		agentMap: async (ctx, opts = {}) => {
		const meta = ctx.__agentMapMeta;
		const wantImage = opts.image !== undefined ? opts.image : image;

		if (!meta \|\| !meta.map.length) {
		const out = { dimensions: { width: 0, height: 0 }, map: [] };
		if (wantImage) out.image = ctx.export.url;
		return out;
		}

		const format = opts.imageFormat \|\| imageFormat;
		const quality = opts.imageQuality \|\| imageQuality;
		const maxWidth = opts.maxImageWidth \|\| maxImageWidth;

		// Scale dimensions — whether we rasterize or not, bboxes get resized
		// to the target output size so callers can overlay them on the image.
		let w, h, dataURL;
		if (wantImage) {
		const img = new Image();
		img.src = ctx.export.url;
		await new Promise((res, rej) => { img.onload = res; img.onerror = rej; });
		const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
		w = Math.round(img.naturalWidth * ratio);
		h = Math.round(img.naturalHeight * ratio);
		const canvas = document.createElement('canvas');
		canvas.width = w;
		canvas.height = h;
		canvas.getContext('2d').drawImage(img, 0, 0, w, h);
		const mime =
		format === 'jpg' \|\| format === 'jpeg' ? 'image/jpeg'
		: format === 'webp' ? 'image/webp'
		: 'image/png';
		dataURL = canvas.toDataURL(mime, quality);
		} else {
		const sourceW = meta.dimensions.width \|\| 1;
		const ratio = sourceW > maxWidth ? maxWidth / sourceW : 1;
		w = Math.round(sourceW * ratio);
		h = Math.round(meta.dimensions.height * ratio);
		}

		const sx = w / (meta.dimensions.width \|\| 1);
		const sy = h / (meta.dimensions.height \|\| 1);

		const scaledMap = meta.map.map(e => {
		const scaled = { ...e, b: [
		Math.round(e.b[0] * sx),
		Math.round(e.b[1] * sy),
		Math.round(e.b[2] * sx),
		Math.round(e.b[3] * sy),
		] };
		return scaled;
		});

		const out = { dimensions: { width: w, height: h }, map: scaledMap };
		if (wantImage) out.image = dataURL;
		return out;
		},
		};
		},
		};
		}

		/* ── Role derivation ────────────────────────────── */

		// Roles reported for tags whose role does not depend on any attribute.
		const IMPLICIT_TAG_ROLES = new Map([
		  ['button', 'button'],
		  ['select', 'combobox'],
		  ['textarea', 'textbox'],
		  ['summary', 'button'],
		  ['details', 'group'],
		  ['nav', 'navigation'],
		  ['main', 'main'],
		  ['header', 'banner'],
		  ['footer', 'contentinfo'],
		  ['article', 'article'],
		  ['section', 'region'],
		  ['p', 'paragraph'],
		  ['img', 'image'],
		]);

		// Roles for <input> by `type`; any type not listed is reported as 'textbox'.
		const INPUT_TYPE_ROLES = new Map([
		  ['checkbox', 'checkbox'],
		  ['radio', 'radio'],
		  ['range', 'slider'],
		  ['file', 'file'],
		  ['submit', 'button'],
		  ['button', 'button'],
		  ['reset', 'button'],
		  ['image', 'button'],
		]);

		/**
		 * Returns the role string reported for an element.
		 *
		 * A non-empty `role` attribute wins and is returned verbatim. Otherwise the
		 * role is derived from the tag (and, for <input>, its type). Elements with
		 * no mapping — including <a> without href — report their lowercase tag name.
		 *
		 * @param {Element} el
		 * @returns {string}
		 */
		function deriveRole(el) {
		  const explicit = el.getAttribute('role');
		  if (explicit) return explicit;

		  const tag = el.tagName.toLowerCase();

		  if (tag === 'a') return el.hasAttribute('href') ? 'link' : tag;

		  if (tag === 'input') {
		    // `type` is only read for inputs, so a non-string `type` property on
		    // some other node cannot make this function throw.
		    const type = (el.type || '').toLowerCase();
		    return INPUT_TYPE_ROLES.get(type) || 'textbox';
		  }

		  if (/^h[1-6]$/.test(tag)) return 'heading';

		  return IMPLICIT_TAG_ROLES.get(tag) || tag;
		}

		/* ── Accessible name ────────────────────────────── */

		/**
		 * Computes a short human-readable name for an element using a simplified
		 * precedence: aria-label → aria-labelledby → alt (images and image inputs)
		 * → title → first associated <label> → the element's own text.
		 *
		 * Only the final text fallback is whitespace-collapsed and truncated
		 * (59 characters plus an ellipsis when longer than 60).
		 *
		 * @param {Element} el
		 * @returns {string} The name, or '' when none of the sources yields text.
		 */
		function accessibleName(el) {
		  const ariaLabel = el.getAttribute('aria-label');
		  if (ariaLabel && ariaLabel.trim()) return ariaLabel.trim();

		  const labelledBy = el.getAttribute('aria-labelledby');
		  if (labelledBy) {
		    // Resolve ids against the element's own root when that root supports
		    // getElementById; otherwise fall back to the global document.
		    const root = el.getRootNode();
		    const getById = (id) =>
		      root && typeof root.getElementById === 'function'
		        ? root.getElementById(id)
		        : document.getElementById(id);
		    const parts = labelledBy.trim().split(/\s+/)
		      .map((id) => {
		        const ref = getById(id);
		        return ref ? (ref.textContent || '').trim() : '';
		      })
		      .filter(Boolean);
		    if (parts.length) return parts.join(' ');
		  }

		  const isImageLike =
		    el.tagName === 'IMG' ||
		    (el.tagName === 'INPUT' && (el.type || '').toLowerCase() === 'image');
		  if (isImageLike) {
		    const alt = el.getAttribute('alt');
		    if (alt && alt.trim()) return alt.trim();
		  }

		  const title = el.getAttribute('title');
		  if (title && title.trim()) return title.trim();

		  if (el.labels && el.labels[0]) {
		    const labelText = (el.labels[0].textContent || '').trim();
		    if (labelText) return labelText;
		  }

		  const text = (el.textContent || '').replace(/\s+/g, ' ').trim();
		  if (text) return text.length > 60 ? text.slice(0, 59) + '…' : text;
		  return '';
		}

		/* ── State extraction ───────────────────────────── */

		// <input> types whose `value` is never copied into the state object. Toggle
		// and button types carry no user-entered text, and a password's plaintext
		// must not end up in a map that is typically forwarded to a model.
		const INPUT_TYPES_WITHOUT_VALUE = new Set([
		  'checkbox', 'radio', 'submit', 'button', 'reset', 'password',
		]);

		/**
		 * Builds the `s` (state) object. Only meaningful states — never default
		 * values that add no signal for an agent. Critically, aria-expanded and
		 * aria-pressed are included for BOTH values (true and false) because
		 * "pressed: false" on a toggle is meaningful information.
		 *
		 * @param {Element} el Element to inspect.
		 * @param {string} role Role previously derived for `el`.
		 * @param {{left: number, top: number, width: number, height: number}|null} rect
		 *   Bounding rect of `el`, used for the occlusion probe; falsy or zero-sized
		 *   rects skip the probe.
		 * @returns {Object|null} State object, or null when no key applies.
		 */
		function deriveState(el, role, rect) {
		  const s = {};

		  try {
		    if (el.matches(':checked')) s.checked = true;
		    else if (role === 'checkbox' || role === 'radio') {
		      // include checked:false for form groups where an agent needs to
		      // know "unchecked" is a valid state distinct from "not a checkbox".
		      s.checked = false;
		    }
		    if (el.matches(':disabled')) s.disabled = true;
		    if (el.matches(':focus')) s.focus = true;
		  } catch {
		    // Deliberate best-effort: if matches() throws on an exotic node, the
		    // pseudo-class states are simply omitted.
		  }

		  const expanded = el.getAttribute('aria-expanded');
		  if (expanded === 'true') s.expanded = true;
		  else if (expanded === 'false') s.expanded = false;

		  const pressed = el.getAttribute('aria-pressed');
		  if (pressed === 'true') s.pressed = true;
		  else if (pressed === 'false') s.pressed = false;

		  // selected:false is only reported for roles where selection is a real
		  // concept (tab / option).
		  const selected = el.getAttribute('aria-selected');
		  if (selected === 'true') s.selected = true;
		  else if (selected === 'false' && (role === 'tab' || role === 'option')) s.selected = false;

		  if (el.tagName === 'INPUT') {
		    const type = (el.type || 'text').toLowerCase();
		    if (!INPUT_TYPES_WITHOUT_VALUE.has(type) && el.value) {
		      s.value = el.value;
		    }
		  } else if (el.tagName === 'TEXTAREA') {
		    if (el.value) s.value = el.value;
		  } else if (el.tagName === 'SELECT') {
		    s.value = el.value;
		    const opt = el.options && el.options[el.selectedIndex];
		    if (opt) s.selectedText = opt.text || '';
		  } else if (el.tagName === 'DETAILS') {
		    s.open = !!el.open;
		  }

		  // Covered — element visually occluded by something else at its center.
		  if (rect && rect.width && rect.height) {
		    const cx = rect.left + rect.width / 2;
		    const cy = rect.top + rect.height / 2;
		    if (cx >= 0 && cy >= 0) {
		      const doc = el.ownerDocument || document;
		      if (doc.elementFromPoint) {
		        const top = doc.elementFromPoint(cx, cy);
		        // A hit on the element itself or one of its descendants is not
		        // occlusion; a null hit is treated as "not covered".
		        if (top && top !== el && !(el.contains && el.contains(top))) {
		          s.covered = true;
		        }
		      }
		    }
		  }

		  return Object.keys(s).length ? s : null;
		}

		/* ── Map extraction ─────────────────────────────── */

		/**
		 * Walks the element's descendants and builds the element map.
		 *
		 * Interactive matches are processed first, then (when a semantic selector is
		 * given) semantic matches that did not already produce an interactive entry.
		 * Indices are assigned sequentially in that order; elements for which
		 * buildEntry returns null get no index.
		 *
		 * @param {Element} element Root element being captured.
		 * @param {string} interactiveSelector
		 * @param {string|null} semanticSelector Falsy to skip the semantic pass.
		 * @param {'minimal'|'full'} fields Forwarded to buildEntry.
		 * @returns {{map: Object[], dimensions: {width: number, height: number}}}
		 */
		function extractMap(element, interactiveSelector, semanticSelector, fields) {
		  const rootRect = element.getBoundingClientRect();
		  const entries = [];
		  const interactiveNodes = new Set();

		  Array.from(element.querySelectorAll(interactiveSelector)).forEach((node) => {
		    // entries.length doubles as the next free index.
		    const built = buildEntry(node, rootRect, entries.length, fields, 'interactive');
		    if (!built) return;
		    entries.push(built);
		    interactiveNodes.add(node);
		  });

		  if (semanticSelector) {
		    Array.from(element.querySelectorAll(semanticSelector))
		      .filter((node) => !interactiveNodes.has(node))
		      .forEach((node) => {
		        const built = buildEntry(node, rootRect, entries.length, fields, 'semantic');
		        if (built) entries.push(built);
		      });
		  }

		  const { width, height } = rootRect;
		  return { map: entries, dimensions: { width, height } };
		}

		// Attributes copied into `a` when fields === 'full'.
		const FULL_ENTRY_ATTRS = [
		  'href', 'type', 'name', 'placeholder', 'alt', 'title', 'role', 'aria-label',
		];

		/**
		 * Builds one map entry for an element, or returns null for an element whose
		 * rounded width and height are both zero or less.
		 *
		 * @param {Element} el
		 * @param {{left: number, top: number}} rootRect Bounding rect of the capture
		 *   root; the entry's bbox is expressed relative to its top-left corner.
		 * @param {number} i Index assigned to the entry.
		 * @param {'minimal'|'full'} fields 'full' adds `t` (text) and `a` (attrs).
		 * @param {'interactive'|'semantic'} kind State (`s`) is only derived for
		 *   'interactive' entries.
		 * @returns {{i: number, n: string, r: string, b: number[], s?: Object, t?: string, a?: Object}|null}
		 */
		function buildEntry(el, rootRect, i, fields, kind) {
		  const rect = el.getBoundingClientRect();
		  const b = [
		    Math.round(rect.left - rootRect.left),
		    Math.round(rect.top - rootRect.top),
		    Math.round(rect.width),
		    Math.round(rect.height),
		  ];
		  // NOTE(review): this only drops elements with BOTH dimensions collapsed; an
		  // element with a single zero dimension is still emitted. Confirm whether
		  // `||` was intended before changing it.
		  if (b[2] <= 0 && b[3] <= 0) return null;

		  const role = deriveRole(el);
		  const n = accessibleName(el);

		  const entry = { i, n, r: role, b };

		  if (kind === 'interactive') {
		    // deriveState receives the un-rounded viewport rect for its hit test.
		    const s = deriveState(el, role, rect);
		    if (s) entry.s = s;
		  }

		  if (fields === 'full') {
		    // `t` is omitted when it would merely repeat the accessible name.
		    const t = (el.textContent || '').replace(/\s+/g, ' ').trim();
		    if (t && t !== n) entry.t = t.length > 160 ? t.slice(0, 159) + '…' : t;

		    const a = {};
		    for (const name of FULL_ENTRY_ATTRS) {
		      const v = el.getAttribute(name);
		      // Skip missing/empty attributes and the literal string 'false'.
		      if (v && v !== 'false') a[name] = v;
		    }
		    if (Object.keys(a).length) entry.a = a;
		  }

		  return entry;
		}

		/* ── Annotations ────────────────────────────────── */

		/**
		 * Appends a full-size, click-through overlay to the clone containing one
		 * numbered badge per entry, centred on that entry's bounding box. Also sets
		 * the clone's `position` to 'relative' so the overlay is positioned
		 * against it.
		 *
		 * @param {HTMLElement} clone Cloned root that will be rendered.
		 * @param {Object[]} entries Map entries (`i` and `b` are read).
		 * @param {Object} customStyle Style overrides applied last to every badge.
		 */
		function addAnnotations(clone, entries, customStyle) {
		  // NOTE(review): nothing visible in this file sets `isSemanticOnly`, so this
		  // filter appears to keep every entry — confirm whether semantic entries
		  // are meant to receive badges.
		  const badgeTargets = entries.filter((entry) => !entry.isSemanticOnly);
		  if (badgeTargets.length === 0) return;

		  const layer = document.createElement('div');
		  layer.setAttribute('data-snap-agent-overlay', 'true');
		  Object.assign(layer.style, {
		    position: 'absolute',
		    top: '0',
		    left: '0',
		    width: '100%',
		    height: '100%',
		    pointerEvents: 'none',
		    zIndex: '2147483647',
		    overflow: 'visible',
		  });

		  badgeTargets.forEach((entry) => {
		    const box = entry.b;
		    const centerX = box[0] + box[2] / 2;
		    const centerY = box[1] + box[3] / 2;

		    const marker = document.createElement('span');
		    marker.textContent = String(entry.i);
		    Object.assign(marker.style, {
		      position: 'absolute',
		      left: `${centerX}px`,
		      top: `${centerY}px`,
		      transform: 'translate(-50%, -50%)',
		      minWidth: '18px',
		      height: '18px',
		      lineHeight: '18px',
		      fontSize: '11px',
		      fontWeight: '700',
		      fontFamily: 'system-ui, -apple-system, sans-serif',
		      color: '#fff',
		      backgroundColor: 'rgba(220, 38, 38, 0.92)',
		      borderRadius: '9px',
		      textAlign: 'center',
		      padding: '0 4px',
		      boxSizing: 'border-box',
		      boxShadow: '0 1px 3px rgba(0,0,0,0.3)',
		      ...customStyle,
		    });
		    layer.appendChild(marker);
		  });

		  clone.style.position = 'relative';
		  clone.appendChild(layer);
		}

+33

-17

html-in-canvas.js

		@@ -12,9 +12,17 @@ /**

		function isDrawElementImageAvailable() {
		/**
		* The WICG canvas-place-element spec evolved: Chrome ~130+ exposes drawElement(),
		* earlier flagged builds shipped drawElementImage(). Detect either.
		* @returns {'drawElement'\|'drawElementImage'\|null}
		*/
		function detectDrawApi() {
		try {
		const c = document.createElement('canvas')
		const ctx = c.getContext('2d')
		return ctx && typeof ctx.drawElementImage === 'function'
		if (!ctx) return null
		if (typeof ctx.drawElement === 'function') return 'drawElement'
		if (typeof ctx.drawElementImage === 'function') return 'drawElementImage'
		return null
		} catch {
		return false
		return null
		}
		@@ -27,5 +35,6 @@ }
		export function htmlInCanvasPlugin() {
		const available = isDrawElementImageAvailable()
		const drawApi = detectDrawApi()
		const available = !!drawApi
		if (!available) {
		console.warn('[snapdom] html-in-canvas plugin: drawElementImage not available. Enable chrome://flags/#canvas-draw-element')
		console.warn('[snapdom] html-in-canvas plugin: drawElement / drawElementImage not available. Enable chrome://flags/#canvas-draw-element')
		}
		@@ -97,23 +106,30 @@

		const container = document.createElement('div')
		container.id = 'snapdom-html-in-canvas-temp'
		container.style.cssText = 'position:fixed;left:-9999px;top:0;visibility:hidden;'
		container.appendChild(canvas)
		document.body.appendChild(container)
		// Append directly to body, taken out of flow with position:fixed + z-index:-1
		// so it sits behind the page's content (covered by body/main backgrounds)
		// while still being painted. visibility:hidden / opacity:0 / left:-9999px
		// skip the paint pass and trigger "No cached paint record".
		canvas.style.cssText = 'position:fixed;top:0;left:0;z-index:-1;'
		document.body.appendChild(canvas)

		try {
		await new Promise(r => requestAnimationFrame(r))
		canvas.getBoundingClientRect()
		await new Promise(r => requestAnimationFrame(() => requestAnimationFrame(r)))

		const ctx2d = canvas.getContext('2d')
		if (!ctx2d \|\| typeof ctx2d.drawElementImage !== 'function') {
		throw new Error('drawElementImage not available')
		const fn = ctx2d && (ctx2d[drawApi] \|\| ctx2d.drawElement \|\| ctx2d.drawElementImage)
		if (typeof fn !== 'function') {
		throw new Error('drawElement / drawElementImage not available on this canvas context')
		}
		ctx2d.save()
		ctx2d.scale(dpr * scale, dpr * scale)
		ctx2d.drawElementImage(wrapper, 0, 0, width, height)
		fn.call(ctx2d, wrapper, 0, 0, width, height)
		ctx2d.restore()
		return canvas
		} catch (e) {
		if (e && /paint record/i.test(e.message \|\| '')) {
		throw new Error('Browser had no paint record for the element. Make sure the document is fully loaded and visible before calling html-in-canvas (drawElement requires a real paint pass).')
		}
		throw e
		} finally {
		try {
		document.body.removeChild(container)
		} catch {}
		try { canvas.remove() } catch {}
		}
		@@ -120,0 +136,0 @@ }

+1

-1

index.js

		@@ -15,3 +15,3 @@ /**
		export { pdfImage } from './pdf-image.js';
		export { promptExport } from './prompt-export.js';
		export { agentMap } from './agent-map.js';
		// export { htmlInCanvas } from './html-in-canvas.js';

+2

-2

package.json

		{
		"name": "@zumer/snapdom-plugins",
		"version": "1.2.0",
		"version": "2.1.0",
		"description": "Official plugins for SnapDOM",
		@@ -15,3 +15,3 @@ "type": "module",
		"./pdf-image": "./pdf-image.js",
		"./prompt-export": "./prompt-export.js"
		"./agent-map": "./agent-map.js"
		},
		@@ -18,0 +18,0 @@ "files": [

+44

-60

README.md

		@@ -173,83 +173,67 @@ # @zumer/snapdom-plugins

		### `prompt-export`
		### `agent-map`

		Adds a `toPrompt()` export method that returns an LLM-ready package: a structured element map with bounding boxes, a pre-formatted prompt text, and (optionally) an annotated screenshot. Tuned for vision-language models, browser-agent pipelines, visual QA, and any workflow that pairs a capture with structured metadata.
		Produces a Set-of-Mark package for visual agents: an annotated screenshot with numbered badges on interactive elements, plus a compact JSON map from badge index → role / accessible name / bbox / state. One call, fully client-side.

		```js
		import { promptExport } from '@zumer/snapdom-plugins/prompt-export';
		import { agentMap } from '@zumer/snapdom-plugins/agent-map';

		const result = await snapdom(el, { plugins: [promptExport()] });
		// Default: no image, just the structured map + prompt text (cheapest)
		const { elements, prompt, dimensions } = await result.toPrompt();
		```
		const result = await snapdom(el, { plugins: [agentMap()] });
		const { image, map, dimensions } = await result.toAgentMap();

		To also include the annotated image (for tasks that truly depend on vision):

		```js
		const result = await snapdom(el, {
		plugins: [promptExport({ include: ['image', 'elements', 'prompt'] })]
		});
		const { image, elements, prompt, dimensions } = await result.toPrompt();
		// image: data URL of the screenshot with numbered red badges overlaid
		// map: [{ i, n, r, b, s? }, …] — index, name, role, bbox, state
		// Agent says "click element 2" → map[2].b gives [x, y, w, h]
		```

		The returned object (fields present only if requested via `include`):
		Map entry shape (default `fields: 'minimal'`):

		\| Field \| Type \| Description \|
		\|-------\|------\|-------------\|
		\| `elements` \| `Array` \| One entry per detected element: `{ id, tag, type, name, text, bbox, attributes, state?, styles?, covered? }` \|
		\| `prompt` \| `string` \| Pre-formatted text describing interactive + semantic elements \|
		\| `image` \| `string` \| Data URL of the (optionally annotated) screenshot — only when `include` contains `'image'` \|
		\| `dimensions` \| `{width, height}` \| Scaled dimensions (always present) \|
		\| Key \| Type \| Description \|
		\|-----\|------\|-------------\|
		\| `i` \| `number` \| Index matching the badge drawn on the image \|
		\| `n` \| `string` \| Accessible name (aria-label → labelledby → alt → title → labels → textContent; only the textContent fallback is truncated to 60 chars) \|
		\| `r` \| `string` \| ARIA-style role (`button`, `link`, `checkbox`, `radio`, `textbox`, `combobox`, `slider`, `heading`, …) — derived from `role` attribute or implicit role of the element \|
		\| `b` \| `[x, y, w, h]` \| Bounding box in pixels, scaled against `maxImageWidth` \|
		\| `s` \| `object?` \| State: included only when at least one key is meaningful — `checked`, `disabled`, `focus`, `expanded`, `pressed`, `selected`, `value`, `open`, `selectedText`, `covered` \|

		`elements` is split into two `type`s:
		- `'interactive'` — buttons, links, inputs, `[role]`/`[tabindex]` targets. These get numbered badges overlaid on the screenshot when `annotate` is on.
		- `'semantic'` — headings, paragraphs, `<nav>`, `<main>`, images with `alt`, table cells, etc. Structural context, not overlaid.
		Example map for a checkout form:

		Each `bbox` is in pixel coordinates of the returned image (scaled against `maxImageWidth`).

		Each interactive entry also carries:
		- `name` — the computed accessible name (aria-label → labelledby → alt → title → labels[0] → textContent)
		- `state` — runtime state: `{ checked, disabled, focus, open, value, selectedText }` (only keys that apply)
		- `styles` — visually-meaningful computed props filtered to drop defaults
		- `covered: true` when another element is painted on top of the bbox center (an agent won't click through a modal)

		```js
		// Example — feed a vision-capable LLM
		const { image, elements } = await result.toPrompt({
		include: ['image', 'elements', 'prompt']
		});

		// image is a data URL → pass as image input
		// elements is JSON → pass as structured context alongside the image
		// "Click element [3]" → look up elements[3].bbox for real coordinates
		[
		{ i: 0, n: 'Email', r: 'textbox', b: [28, 80, 280, 34], s: { value: 'ada@example.com' } },
		{ i: 1, n: 'Send product updates', r: 'checkbox', b: [28, 134, 13, 13], s: { checked: true } },
		{ i: 2, n: 'Apply coupon', r: 'button', b: [28, 176, 114, 38], s: { expanded: false } },
		{ i: 3, n: 'Remove coupon', r: 'button', b: [150, 176, 140, 38], s: { disabled: true } },
		{ i: 4, n: 'Pay $53.90', r: 'button', b: [28, 220, 97, 38] }
		]
		```

		#### Options

		\| Option \| Type \| Default \| Description \|
		\|--------\|------\|---------\|-------------\|
		\| `include` \| `string[]` \| `['elements', 'prompt']` \| Fields to return. Add `'image'` for tasks that need vision (chart content, layout QA, canvas). Use `['prompt']` for the cheapest text-only mode. \|
		\| `annotate` \| `boolean` \| `true` \| Overlay numbered badges on interactive elements (only affects the image when included) \|
		\| `promptMode` \| `'compact' \\| 'verbose'` \| `'compact'` \| Prompt text verbosity. Compact omits coords when badges are on the image. \|
		\| `includeCoords` \| `boolean` \| `true` \| Include bbox in the prompt text \|
		\| `imageFormat` \| `'png' \\| 'jpg' \\| 'webp'` \| `'png'` \| Output image format (only used when `image` is included) \|
		\| `imageQuality` \| `number` \| `0.8` \| Quality for lossy formats (0–1) \|
		\| `maxImageWidth` \| `number` \| `1024` \| Max width in px; downscales and rescales bboxes if larger \|
		\| `interactiveSelector` \| `string` \| see below \| CSS selector for the interactive element set \|
		\| `semanticSelector` \| `string` \| see below \| CSS selector for the semantic element set \|
		\| `labelStyle` \| `object` \| `{}` \| Override styles for the numbered badges (`position`, `color`, `backgroundColor`, etc.) \|
		\| `image` \| `'annotated' \\| 'raw' \\| false` \| `'annotated'` \| `'annotated'` overlays numbered badges; `'raw'` skips badges; `false` skips image generation entirely (no canvas draw, no toDataURL — cheapest path). \|
		\| `fields` \| `'minimal' \\| 'full'` \| `'minimal'` \| `'full'` adds `t` (raw text content) and `a` (meaningful attributes) per entry. \|
		\| `semantic` \| `boolean` \| `false` \| Include non-interactive structural elements (headings, paragraphs, landmarks). Off by default — agents act on interactive. \|
		\| `maxImageWidth` \| `number` \| `1024` \| Downscale target for the image; bboxes rescale to match. \|
		\| `imageFormat` \| `'png' \\| 'jpg' \\| 'webp'` \| `'png'` \| Image format (only used when image is rendered). \|
		\| `imageQuality` \| `number` \| `0.8` \| Quality for lossy formats. \|
		\| `interactiveSelector` \| `string` \| see below \| CSS selector for interactive elements. \|
		\| `semanticSelector` \| `string` \| see below \| CSS selector for semantic elements (used when `semantic: true`). \|
		\| `labelStyle` \| `object` \| `{}` \| Override badge styles. \|

		Defaults:
		- interactive: `a[href], button, input, select, textarea, [role="button"\|"link"\|"tab"\|"menuitem"\|"checkbox"\|"radio"], [tabindex]:not([tabindex="-1"]), summary, [contenteditable="true"]`
		- semantic: `h1–h6, p, li, img[alt], nav, main, article, section, header, footer, label, td, th, figcaption, blockquote, legend`
		- interactive: `a[href], button, input, select, textarea, [role="button"\|"link"\|"tab"\|"menuitem"\|"checkbox"\|"radio"\|"switch"\|"slider"\|"combobox"\|"textbox"], [tabindex]:not([tabindex="-1"]), summary, [contenteditable="true"]`
		- semantic: `h1–h6, nav, main, article, section, header, footer, figcaption, blockquote, legend, p`

		Both per-call options (`opts.include`, `opts.imageFormat`, etc.) and constructor options are supported; per-call wins.
		Per-call options override constructor options (e.g. `result.toAgentMap({ image: false })`).

		The image is the most expensive part of `toPrompt()` to produce (canvas draw + data-URL serialization), so the default skips it. Add `'image'` to `include` when the task actually uses vision:
		#### When to use

		```js
		// Vision-dependent task (chart content, layout QA, visual diff)
		await result.toPrompt({ include: ['image', 'elements', 'prompt'] });
		- Visual agents using Set-of-Mark prompting — one call gives you both the labelled image and the coordinate lookup table.
		- Computer-use / browser-agent harnesses that need click coordinates for a vision model's output.
		- Visual QA with an LLM judge — compare before/after captures with structured element identity.
		- Dataset generation for vision-LLM fine-tuning — (image, map) pairs.

		// Pure structured agent loop (cheapest)
		await result.toPrompt({ include: ['prompt'] });
		```
		Because it runs entirely in the browser, it works in contexts where Playwright / Puppeteer can't: Chrome extensions, SaaS web apps capturing the user's own page, Electron apps capturing their own window.

		@@ -256,0 +240,0 @@ ---

-550

prompt-export.js

		/**
		* promptExport – Official SnapDOM Plugin
		* Produces an LLM-ready package: annotated screenshot + structured element
		* map + prompt text. Tuned for vision-capable LLMs reading the image + map
		* together (Set-of-Mark pattern).
		*
		* Usage:
		* import { promptExport } from '@zumer/snapdom-plugins/prompt-export';
		* const result = await snapdom(el, { plugins: [promptExport()] });
		* const { image, elements, dimensions, prompt } = await result.toPrompt();
		*
		* @param {Object} [options]
		* @param {boolean} [options.annotate=true] - Overlay numbered badges on interactive elements
		* @param {string} [options.imageFormat='png'] - Output image format ('png'|'jpg'|'webp')
		* @param {number} [options.imageQuality=0.8] - Quality for lossy formats (0..1)
		* @param {number} [options.maxImageWidth=1024] - Max width in px (downscales if larger)
		* @param {string} [options.interactiveSelector] - Custom CSS selector for interactive elements
		* @param {string} [options.semanticSelector] - Custom CSS selector for semantic elements
		* @param {Object} [options.labelStyle={}] - Override styles for annotation badges
		* @param {'compact'|'verbose'} [options.promptMode='compact'] - Prompt text verbosity
		* @param {boolean} [options.includeCoords=true] - Include bbox in the prompt text
		* @param {string[]} [options.include] - Which fields to return. Default
		* ['elements', 'prompt']. For vision-dependent tasks (chart content, layout QA,
		* canvas) pass ['image', 'elements', 'prompt'] or add 'image' to the array. For
		* text-only agent prompts pass ['prompt'] (cheapest — skips canvas draw entirely).
		* Accepted values: 'image', 'elements', 'prompt'.
		* @returns {Object} SnapDOM plugin
		*/

		// Default selector for actionable elements: links and native form controls,
		// a handful of ARIA widget roles, focusable nodes, <summary>, and editable
		// regions.
		const DEFAULT_INTERACTIVE = [
		  'a[href]',
		  'button',
		  'input',
		  'select',
		  'textarea',
		  '[role="button"]',
		  '[role="link"]',
		  '[role="tab"]',
		  '[role="menuitem"]',
		  '[role="checkbox"]',
		  '[role="radio"]',
		  '[tabindex]:not([tabindex="-1"])',
		  'summary',
		  '[contenteditable="true"]',
		].join(', ');

		// Default selector for structural (non-interactive) elements.
		const DEFAULT_SEMANTIC = [
		  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		  'p', 'li', 'img[alt]',
		  'nav', 'main', 'article', 'section', 'header', 'footer',
		  'label', 'td', 'th',
		  'figcaption', 'blockquote', 'legend',
		].join(', ');

		// Attribute names listed for collection.
		const COLLECTED_ATTRS = [
		  'role',
		  'aria-label',
		  'aria-expanded',
		  'aria-checked',
		  'aria-disabled',
		  'alt',
		  'href',
		  'placeholder',
		  'name',
		  'type',
		  'value',
		  'title',
		  'disabled',
		];

		// Style property names (camelCase) listed as visually meaningful.
		const VISUAL_FIELDS = [
		  'display',
		  'visibility',
		  'opacity',
		  'color',
		  'backgroundColor',
		  'fontSize',
		  'fontWeight',
		  'cursor',
		  'overflow',
		];

		// Common computed-style values that carry no information. Keeping the
		// `styles` object small is the difference between a useful LLM input and
		// token bloat on every element.
		const VISUAL_SKIP = new Set(['initial', 'normal', 'visible', 'auto', 'static', '0']);

		// Additional per-property values treated as uninformative defaults.
		const VISUAL_PROP_DEFAULTS = new Map([
		  ['cursor', ['none']],
		  ['color', ['rgb(0, 0, 0)']],
		  ['backgroundColor', ['rgba(0, 0, 0, 0)']],
		  ['fontWeight', ['400', 'normal']],
		  ['opacity', ['1']],
		]);

		/**
		 * Reports whether a style value should be dropped as uninformative.
		 *
		 * @param {string} prop Style property name (camelCase).
		 * @param {string} value Style value.
		 * @returns {boolean} true for empty values, values in VISUAL_SKIP, and the
		 *   per-property defaults listed above.
		 */
		function isDefaultStyleValue(prop, value) {
		  if (!value) return true;
		  if (VISUAL_SKIP.has(value)) return true;
		  const defaults = VISUAL_PROP_DEFAULTS.get(prop);
		  return defaults !== undefined && defaults.includes(value);
		}

		// Default omits 'image'. Benchmarking showed the text + JSON map is enough
		// to answer most UI-inspection questions and uses ~14× fewer tokens. Pass
		// `include: ['image', 'elements', 'prompt']` explicitly when the task truly
		// depends on vision (charts, canvas content, layout QA).
		const DEFAULT_INCLUDE = ['elements', 'prompt'];

		/**
		 * Creates the prompt-export plugin object. Constructor options set the
		 * defaults; `include`, `imageFormat`, `imageQuality`, `maxImageWidth`,
		 * `promptMode` and `includeCoords` can be overridden per call.
		 *
		 * @param {Object} [options] Plugin options (see the file header for details).
		 * @returns {{name: string, afterClone: Function, defineExports: Function}}
		 */
		export function promptExport(options = {}) {
		  const {
		    annotate = true,
		    imageFormat = 'png',
		    imageQuality = 0.8,
		    maxImageWidth = 1024,
		    interactiveSelector = DEFAULT_INTERACTIVE,
		    semanticSelector = DEFAULT_SEMANTIC,
		    labelStyle = {},
		    promptMode = 'compact',
		    includeCoords = true,
		    include = DEFAULT_INCLUDE,
		  } = options;

		  // Translates the user-facing format name into the MIME type passed to
		  // canvas.toDataURL; anything unrecognised falls back to PNG.
		  const mimeFor = (format) => {
		    if (format === 'jpg' || format === 'jpeg') return 'image/jpeg';
		    if (format === 'webp') return 'image/webp';
		    return 'image/png';
		  };

		  return {
		    name: 'prompt-export',

		    /**
		     * Extracts element metadata from the live element and, when `annotate`
		     * is on, appends the badge overlay to the clone.
		     */
		    afterClone(ctx) {
		      const meta = extractMetadata(ctx.element, interactiveSelector, semanticSelector);
		      // snapdom spreads a fresh ctx for the export phase from ctx.options,
		      // so write to both so the prompt() call below can read it.
		      ctx.__promptMetadata = meta;
		      if (ctx.options) ctx.options.__promptMetadata = meta;

		      if (annotate) {
		        addAnnotations(ctx.clone, meta.elements, labelStyle);
		      }
		    },

		    defineExports() {
		      return {
		        /**
		         * Resolves to `{ dimensions, image?, elements?, prompt? }`; the
		         * optional fields are present only when listed in `include`.
		         */
		        prompt: async (ctx, opts = {}) => {
		          const meta = ctx.__promptMetadata;
		          const wantSet = new Set(opts.include || include || DEFAULT_INCLUDE);
		          const wantImage = wantSet.has('image');
		          const wantElements = wantSet.has('elements');
		          const wantPrompt = wantSet.has('prompt');

		          // Nothing was extracted: return empty values for the requested
		          // fields (the image, if requested, is the capture URL as-is).
		          if (!meta || !meta.elements.length) {
		            const empty = { dimensions: { width: 0, height: 0 } };
		            if (wantImage) empty.image = ctx.export.url;
		            if (wantElements) empty.elements = [];
		            if (wantPrompt) empty.prompt = '';
		            return empty;
		          }

		          const format = opts.imageFormat || imageFormat;
		          // Null check instead of `||` so an explicit quality of 0 is honoured.
		          const quality = opts.imageQuality != null ? opts.imageQuality : imageQuality;
		          // `||` is intentional here: a width of 0 is not a usable target.
		          const maxWidth = opts.maxImageWidth || maxImageWidth;
		          const mode = opts.promptMode || promptMode;
		          const withCoords = opts.includeCoords !== undefined ? opts.includeCoords : includeCoords;

		          // Only load + rasterize the SVG when the caller actually wants the
		          // image. Skipping saves the img decode + canvas draw + toDataURL —
		          // the most expensive steps of this export.
		          let w;
		          let h;
		          let dataURL;
		          if (wantImage) {
		            const img = new Image();
		            // Attach handlers before assigning src so the load/error event
		            // cannot be missed.
		            const loaded = new Promise((resolve, reject) => {
		              img.onload = resolve;
		              img.onerror = reject;
		            });
		            img.src = ctx.export.url;
		            await loaded;

		            const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
		            w = Math.round(img.naturalWidth * ratio);
		            h = Math.round(img.naturalHeight * ratio);

		            const canvas = document.createElement('canvas');
		            canvas.width = w;
		            canvas.height = h;
		            const ctx2d = canvas.getContext('2d');
		            if (!ctx2d) {
		              throw new Error('[snapdom] prompt-export plugin: 2D canvas context not available');
		            }
		            ctx2d.drawImage(img, 0, 0, w, h);
		            dataURL = canvas.toDataURL(mimeFor(format), quality);
		          } else {
		            // No image — scale bboxes to the same target width the image would
		            // have used, so downstream callers can still render the map over
		            // a separately-rendered screenshot at the same scale.
		            const sourceW = meta.dimensions.width || 1;
		            const ratio = sourceW > maxWidth ? maxWidth / sourceW : 1;
		            w = Math.round(sourceW * ratio);
		            h = Math.round(meta.dimensions.height * ratio);
		          }

		          // `|| 1` guards against division by zero for a zero-sized root.
		          const sx = w / (meta.dimensions.width || 1);
		          const sy = h / (meta.dimensions.height || 1);

		          const scaledElements = meta.elements.map((el) => ({
		            ...el,
		            bbox: {
		              x: Math.round(el.bbox.x * sx),
		              y: Math.round(el.bbox.y * sy),
		              width: Math.round(el.bbox.width * sx),
		              height: Math.round(el.bbox.height * sy),
		            },
		          }));

		          // Verbose mode always prints coordinates; compact mode prints them
		          // only when requested and badges are not drawn on the image.
		          const emitCoords = mode === 'verbose' ? true : (withCoords && !annotate);
		          let promptText = null;
		          if (wantPrompt) {
		            const formatter = mode === 'verbose' ? formatPromptVerbose : formatPromptCompact;
		            promptText = formatter(scaledElements, { width: w, height: h }, emitCoords);
		          }

		          const out = { dimensions: { width: w, height: h } };
		          if (wantImage) out.image = dataURL;
		          if (wantElements) out.elements = scaledElements;
		          if (wantPrompt) out.prompt = promptText;
		          return out;
		        },
		      };
		    },
		  };
		}

		/* ── Accessible name ──────────────────────────── */

		/**
		 * Shorten a string to at most `max` characters, ending with an ellipsis
		 * when anything was cut off.
		 *
		 * @param {string} str Text to shorten; falsy input yields ''.
		 * @param {number} max Maximum length of the result (ellipsis included).
		 * @returns {string} The original string when it fits, otherwise the first
		 *   `max - 1` characters followed by '…'. Returns '' when `max` is below 1,
		 *   since no character budget is available.
		 */
		function truncate(str, max) {
			if (!str) return '';
			if (str.length <= max) return str;
			// Without this guard `slice(0, max - 1)` would receive a negative end
			// index and return almost the whole string instead of shortening it.
			if (max < 1) return '';
			return str.slice(0, max - 1) + '…';
		}

		/**
		 * Compute the element's accessible name using a simplified lookup order
		 * (not the full WAI-ARIA algorithm). The first non-empty source wins:
		 *
		 *   1. `aria-label`
		 *   2. `aria-labelledby` (text of each referenced element, space-joined)
		 *   3. `alt` (only for <img>)
		 *   4. `title`
		 *   5. text of the first associated <label> (`el.labels[0]`)
		 *   6. the element's own text content, truncated to 40 characters
		 *
		 * @param {Element} el Element to name.
		 * @returns {string} The trimmed name, or '' when no source provides one.
		 */
		function accessibleName(el) {
			const ariaLabel = el.getAttribute('aria-label');
			if (ariaLabel && ariaLabel.trim()) return ariaLabel.trim();

			const labelledBy = el.getAttribute('aria-labelledby');
			if (labelledBy) {
				// Resolve ids against the element's own root when that root supports
				// getElementById; otherwise fall back to the global document.
				const root = el.getRootNode();
				const getById = (id) =>
					root && typeof root.getElementById === 'function'
						? root.getElementById(id)
						: document.getElementById(id);
				const parts = labelledBy.trim().split(/\s+/)
					.filter(Boolean) // a whitespace-only attribute yields one empty token
					.map((id) => {
						const ref = getById(id);
						return ref ? (ref.textContent || '').trim() : '';
					})
					.filter(Boolean);
				if (parts.length) return parts.join(' ');
			}

			if (el.tagName === 'IMG') {
				const alt = el.getAttribute('alt');
				if (alt && alt.trim()) return alt.trim();
			}

			const title = el.getAttribute('title');
			if (title && title.trim()) return title.trim();

			if (el.labels && el.labels[0]) {
				const t = (el.labels[0].textContent || '').trim();
				if (t) return t;
			}

			const text = (el.textContent || '').trim();
			return text ? truncate(text, 40) : '';
		}

		/* ── DOM state ────────────────────────────────── */

		/**
		 * Capture real runtime state (not just ARIA attributes). This is what
		 * separates a useful map from a static screenshot annotation: an agent can
		 * see "checkbox is unchecked / select has value=X / details is closed".
		 *
		 * @param {Element} el Element to inspect.
		 * @returns {Object|null} Object with any of `checked`, `disabled`, `focus`,
		 *   `value`, `selectedText`, `open`; or null when nothing was recorded.
		 */
		function computeState(el) {
			const state = {};

			// Probe each pseudo-class on its own so that one selector failing on an
			// exotic node does not prevent the remaining ones from being recorded.
			const pseudoFlags = [
				[':checked', 'checked'],
				[':disabled', 'disabled'],
				[':focus', 'focus'],
			];
			for (const [selector, key] of pseudoFlags) {
				try {
					if (el.matches(selector)) state[key] = true;
				} catch { /* some selectors may fail on exotic nodes */ }
			}

			const tag = el.tagName;
			if (tag === 'INPUT') {
				// Checkbox/radio: `value` defaults to "on" when no explicit value attr
				// is set — noise. The meaningful signal is `checked`. Skip value here.
				const type = (el.type || 'text').toLowerCase();
				if (type !== 'checkbox' && type !== 'radio' && el.value) {
					state.value = el.value;
				}
			} else if (tag === 'TEXTAREA') {
				if (el.value) state.value = el.value;
			} else if (tag === 'SELECT') {
				// Always recorded for <select>, even when the value is empty.
				state.value = el.value;
				const opt = el.options && el.options[el.selectedIndex];
				if (opt) state.selectedText = opt.text || '';
			} else if (tag === 'DETAILS') {
				state.open = !!el.open;
			} else if (el.hasAttribute && el.hasAttribute('open')) {
				// Any other element carrying an `open` attribute.
				state.open = true;
			}

			return Object.keys(state).length ? state : null;
		}

		/* ── Visual styles ────────────────────────────── */

		/**
		 * Collect the computed values of the properties listed in VISUAL_FIELDS,
		 * leaving out every value that `isDefaultStyleValue` reports as a default.
		 *
		 * Per the original author's note, this is meant to be called with the live
		 * element rather than the clone, so the values match what the capture shows.
		 *
		 * @param {Element} el Element whose computed style is read.
		 * @returns {Object|null} Property → value map, or null when the computed
		 *   style is unavailable or every inspected value is a default.
		 */
		function computeVisualStyles(el) {
			let computed;
			try {
				computed = getComputedStyle(el);
			} catch {
				// Style lookup failed for this node; report "no styles".
				return null;
			}
			if (!computed) return null;

			const picked = {};
			for (const field of VISUAL_FIELDS) {
				const value = computed[field];
				if (!isDefaultStyleValue(field, value)) {
					picked[field] = value;
				}
			}

			const hasAny = Object.keys(picked).length > 0;
			return hasAny ? picked : null;
		}

		/* ── Covered detection ────────────────────────── */

		/**
		 * True when another element is painted on top of the center of this one.
		 * An agent that knows a button is covered by a modal won't try to click it.
		 *
		 * @param {Element} el Element being tested.
		 * @param {{left: number, top: number, width: number, height: number}} rect
		 *   The element's bounding rectangle, in the coordinate space expected by
		 *   `elementFromPoint`.
		 * @returns {boolean} false when the rect has a zero dimension, its center
		 *   has a negative coordinate, hit-testing is unavailable or hits nothing,
		 *   or the hit is the element itself or one of its descendants; true
		 *   otherwise.
		 */
		function isCovered(el, rect) {
			if (!rect.width || !rect.height) return false;
			const cx = rect.left + rect.width / 2;
			const cy = rect.top + rect.height / 2;
			if (cx < 0 || cy < 0) return false;
			const doc = el.ownerDocument || document;
			if (!doc.elementFromPoint) return false;
			const top = doc.elementFromPoint(cx, cy);
			if (!top || top === el) return false;
			// A descendant at the center is part of the element, not a cover.
			if (el.contains && el.contains(top)) return false;
			return true;
		}

		/* ── Metadata extraction ──────────────────────── */

		/**
		 * Walk the subtree of `element` and build one metadata entry per matched
		 * node: interactive matches first, then semantic matches that were not
		 * already recorded as interactive. Ids are assigned sequentially from 0 in
		 * that order; nodes for which `buildEntry` returns nothing consume no id.
		 *
		 * @param {Element} element Root of the capture.
		 * @param {string} interactiveSelector Selector for interactive nodes.
		 * @param {string} semanticSelector Selector for semantic nodes.
		 * @returns {{elements: Object[], dimensions: {width: number, height: number}}}
		 *   Collected entries plus the root's bounding-box size.
		 */
		function extractMetadata(element, interactiveSelector, semanticSelector) {
			const rootRect = element.getBoundingClientRect();
			const collected = [];
			const recordedInteractive = new Set();

			// Pass 1 — interactive nodes. The next id always equals the number of
			// entries collected so far.
			const interactiveNodes = element.querySelectorAll(interactiveSelector);
			for (const node of interactiveNodes) {
				const entry = buildEntry(node, rootRect, collected.length, 'interactive');
				if (!entry) continue;
				collected.push(entry);
				recordedInteractive.add(node);
			}

			// Pass 2 — semantic nodes, skipping anything recorded in pass 1.
			const semanticNodes = element.querySelectorAll(semanticSelector);
			for (const node of semanticNodes) {
				if (recordedInteractive.has(node)) continue;
				const entry = buildEntry(node, rootRect, collected.length, 'semantic');
				if (!entry) continue;
				collected.push(entry);
			}

			const { width, height } = rootRect;
			return { elements: collected, dimensions: { width, height } };
		}

		/**
		 * Build the metadata entry for a single element.
		 *
		 * @param {Element} el Element to describe.
		 * @param {{left: number, top: number}} rootRect Bounding rect of the
		 *   capture root; the entry's bbox is expressed relative to it.
		 * @param {number} id Identifier stored on the entry.
		 * @param {'interactive'|'semantic'} type Entry category.
		 * @returns {Object|null} `{id, tag, type, name, text, bbox, attributes}`
		 *   plus optional `styles`, `state` and `covered`; null when the element's
		 *   rounded width and height are both zero or less.
		 */
		function buildEntry(el, rootRect, id, type) {
			const rect = el.getBoundingClientRect();
			const bbox = {
				x: Math.round(rect.left - rootRect.left),
				y: Math.round(rect.top - rootRect.top),
				width: Math.round(rect.width),
				height: Math.round(rect.height),
			};

			// Only elements with neither width nor height are dropped.
			if (bbox.width <= 0 && bbox.height <= 0) return null;

			const tag = el.tagName.toLowerCase();
			const maxText = type === 'interactive' ? 200 : 120;
			const text = (el.textContent || '').trim().slice(0, maxText);

			// Keep only attributes that carry information: absent, empty and the
			// literal "false" are all skipped.
			const attributes = {};
			for (const attr of COLLECTED_ATTRS) {
				const val = el.getAttribute(attr);
				if (val == null || val === '' || val === 'false') continue;
				attributes[attr] = val;
			}

			// For <img>, replace the (potentially long, cache-busted) src with just the
			// filename — "logo.svg" is more useful to an LLM than the full URL.
			if (tag === 'img' && el.src) {
				try {
					const base = (el.ownerDocument && el.ownerDocument.location)
						? el.ownerDocument.location.href
						: (typeof location !== 'undefined' ? location.href : undefined);
					const url = base ? new URL(el.src, base) : new URL(el.src);
					attributes.src = url.pathname.split('/').pop() || el.src;
				} catch {
					// Unparseable URL: keep the raw value rather than losing it.
					attributes.src = el.src;
				}
			}

			const entry = {
				id,
				tag,
				type,
				name: accessibleName(el),
				text,
				bbox,
				attributes,
			};

			const styles = computeVisualStyles(el);
			if (styles) entry.styles = styles;

			// Runtime state and occlusion are only recorded for interactive entries.
			if (type === 'interactive') {
				const state = computeState(el);
				if (state) {
					// Drop state.value when it just echoes attributes.value — no
					// divergence between the initial HTML attribute and the current
					// property, nothing for the LLM to learn from the repeat.
					if (state.value !== undefined && state.value === attributes.value) {
						delete state.value;
					}
					if (Object.keys(state).length) entry.state = state;
				}
				if (isCovered(el, rect)) entry.covered = true;
			}

			return entry;
		}

		/* ── Visual annotations ───────────────────────── */

		/**
		 * Append a full-size overlay to `clone` holding one numbered badge per
		 * interactive entry, centred on that entry's bbox. Does nothing when the
		 * list contains no interactive entry; otherwise the clone's inline
		 * `position` is set to 'relative' so the overlay is positioned against it.
		 *
		 * @param {HTMLElement} clone Cloned root that receives the overlay.
		 * @param {Object[]} elements Metadata entries (`id`, `type`, `bbox`).
		 * @param {Object} [customStyle] Style overrides applied last to each badge.
		 * @returns {void}
		 */
		function addAnnotations(clone, elements, customStyle) {
			const targets = [];
			for (const entry of elements) {
				if (entry.type === 'interactive') targets.push(entry);
			}
			if (targets.length === 0) return;

			const layer = document.createElement('div');
			layer.setAttribute('data-snap-prompt-overlay', 'true');
			Object.assign(layer.style, {
				position: 'absolute',
				top: '0',
				left: '0',
				width: '100%',
				height: '100%',
				pointerEvents: 'none',
				zIndex: '2147483647',
				overflow: 'visible',
			});

			targets.forEach((entry) => {
				const label = String(entry.id);
				const { x, y, width, height } = entry.bbox;
				// Anchor at the bbox centre; the translate(-50%, -50%) transform
				// then centres the badge itself on that point.
				const centerX = x + width / 2;
				const centerY = y + height / 2;

				const marker = document.createElement('span');
				marker.textContent = label;
				marker.setAttribute('data-snap-prompt-label', label);
				Object.assign(marker.style, {
					position: 'absolute',
					left: `${centerX}px`,
					top: `${centerY}px`,
					transform: 'translate(-50%, -50%)',
					minWidth: '18px',
					height: '18px',
					lineHeight: '18px',
					fontSize: '11px',
					fontWeight: '700',
					fontFamily: 'system-ui, -apple-system, sans-serif',
					color: '#fff',
					backgroundColor: 'rgba(220, 38, 38, 0.92)',
					borderRadius: '9px',
					textAlign: 'center',
					padding: '0 4px',
					boxSizing: 'border-box',
					boxShadow: '0 1px 3px rgba(0,0,0,0.3)',
					...customStyle,
				});
				layer.appendChild(marker);
			});

			clone.style.position = 'relative';
			clone.appendChild(layer);
		}

		/* ── Prompt text formatters ───────────────────── */

		/**
		 * Render a state object as a brace-wrapped suffix for a prompt line.
		 * Keys whose value is exactly `true` are printed as bare flags, all other
		 * keys as `key=<JSON value>`; flags come first.
		 *
		 * @param {Object|null|undefined} state State object, if any.
		 * @returns {string} Suffix with a leading space (e.g. ` {checked, value="a"}`),
		 *   or '' when there is nothing to print.
		 */
		function stateToStr(state) {
			if (!state) return '';

			const flagNames = [];
			const keyValues = [];
			Object.entries(state).forEach(([key, value]) => {
				if (value === true) {
					flagNames.push(key);
				} else {
					keyValues.push(`${key}=${JSON.stringify(value)}`);
				}
			});

			const rendered = flagNames.concat(keyValues);
			return rendered.length === 0 ? '' : ` {${rendered.join(', ')}}`;
		}

		/**
		 * Render a bounding box as a prompt-line suffix of the form
		 * ` (x,y width×height)`, including the leading space.
		 *
		 * @param {{x: number, y: number, width: number, height: number}} bbox
		 * @returns {string}
		 */
		function coordsToStr(bbox) {
			const { x, y, width, height } = bbox;
			return ` (${x},${y} ${width}×${height})`;
		}

		/**
		 * Produce the compact prompt text: a one-line header followed by an
		 * "Interactive:" list and a "Semantic:" list (each omitted when empty).
		 * Interactive lines carry name, state, covered marker and optional
		 * coordinates; semantic lines carry only tag and name. Names are cut to
		 * 60 characters.
		 *
		 * @param {Object[]} elements Metadata entries.
		 * @param {{width: number, height: number}} dimensions Screenshot size.
		 * @param {boolean} withCoords Whether to append bbox coordinates.
		 * @returns {string} Newline-joined prompt text.
		 */
		function formatPromptCompact(elements, dimensions, withCoords) {
			const quotedName = (entry) =>
				(entry.name ? ` "${truncate(entry.name, 60)}"` : '');

			const interactiveEntries = elements.filter((entry) => entry.type === 'interactive');
			const semanticEntries = elements.filter((entry) => entry.type === 'semantic');

			const output = [`Screenshot (${dimensions.width}×${dimensions.height}px).`];

			if (interactiveEntries.length > 0) {
				output.push('');
				output.push('Interactive:');
				interactiveEntries.forEach((entry) => {
					const label = quotedName(entry);
					const stateSuffix = stateToStr(entry.state);
					const coveredSuffix = entry.covered ? ' (covered)' : '';
					const coordSuffix = withCoords ? coordsToStr(entry.bbox) : '';
					output.push(` [${entry.id}] ${entry.tag}${label}${stateSuffix}${coveredSuffix}${coordSuffix}`);
				});
			}

			if (semanticEntries.length > 0) {
				output.push('');
				output.push('Semantic:');
				semanticEntries.forEach((entry) => {
					output.push(` [${entry.id}] ${entry.tag}${quotedName(entry)}`);
				});
			}

			return output.join('\n');
		}

		/**
		 * Produce the verbose prompt text: a header line, then an
		 * "Interactive elements:" section listing every collected attribute, and a
		 * "Semantic structure:" section listing only `alt` and `role`. Each
		 * section is omitted when empty and is followed by a blank line. Names
		 * are cut to 80 characters.
		 *
		 * @param {Object[]} elements Metadata entries (each with `attributes`).
		 * @param {{width: number, height: number}} dimensions Screenshot size.
		 * @param {boolean} withCoords Whether interactive lines include coordinates.
		 * @returns {string} Newline-joined prompt text (ends with a newline).
		 */
		function formatPromptVerbose(elements, dimensions, withCoords) {
			const quotedName = (entry) =>
				(entry.name ? ` "${truncate(entry.name, 80)}"` : '');
			const joinAttrs = (parts) => (parts.length ? ' ' + parts.join(' ') : '');

			const interactiveEntries = elements.filter((entry) => entry.type === 'interactive');
			const semanticEntries = elements.filter((entry) => entry.type === 'semantic');

			const output = [];
			output.push(`Screenshot of a web page (${dimensions.width}×${dimensions.height}px).`);
			output.push('');

			if (interactiveEntries.length > 0) {
				output.push('Interactive elements:');
				interactiveEntries.forEach((entry) => {
					const label = quotedName(entry);
					const attrSuffix = joinAttrs(
						Object.entries(entry.attributes).map(([key, value]) => `${key}="${value}"`),
					);
					const coordSuffix = withCoords ? coordsToStr(entry.bbox) : '';
					const stateSuffix = stateToStr(entry.state);
					const coveredSuffix = entry.covered ? ' (covered)' : '';
					output.push(` [${entry.id}] <${entry.tag}>${label}${coordSuffix}${attrSuffix}${stateSuffix}${coveredSuffix}`);
				});
				output.push('');
			}

			if (semanticEntries.length > 0) {
				output.push('Semantic structure:');
				semanticEntries.forEach((entry) => {
					const label = quotedName(entry);
					// Only alt and role are surfaced for semantic entries.
					const shown = [];
					if (entry.attributes.alt) shown.push(`alt="${entry.attributes.alt}"`);
					if (entry.attributes.role) shown.push(`role="${entry.attributes.role}"`);
					output.push(` [${entry.id}] <${entry.tag}>${label}${joinAttrs(shown)}`);
				});
				output.push('');
			}

			return output.join('\n');
		}

@zumer/snapdom-plugins - npm Package Compare versions

New alerts

Worsened metrics