@zumer/snapdom-plugins - npm Package Compare versions

Comparing version

1.0.3

1.1.0

+1

-1

package.json

		{
		"name": "@zumer/snapdom-plugins",
		"version": "1.0.3",
		"version": "1.1.0",
		"description": "Official plugins for SnapDOM",
		@@ -5,0 +5,0 @@ "type": "module",

+293

-47

prompt-export.js

		/**
		* promptExport – Official SnapDOM Plugin
		* Produces an LLM-ready package: annotated screenshot + structured element map + prompt text.
		* Produces an LLM-ready package: annotated screenshot + structured element
		* map + prompt text. Tuned for vision-capable LLMs reading the image + map
		* together (Set-of-Mark pattern).
		*
		@@ -12,8 +14,10 @@ * Usage:
		* @param {boolean} [options.annotate=true] - Overlay numbered badges on interactive elements
		* @param {string} [options.imageFormat='png'] - Output image format ('png'\|'jpg'\|'webp')
		* @param {number} [options.imageQuality=0.8] - Quality for lossy formats (0..1)
		* @param {number} [options.maxImageWidth=1024] - Max width in px (downscales if larger)
		* @param {string} [options.interactiveSelector] - Custom CSS selector for interactive elements
		* @param {string} [options.semanticSelector] - Custom CSS selector for semantic elements
		* @param {Object} [options.labelStyle={}] - Override styles for annotation badges
		* @param {string} [options.imageFormat='png'] - Output image format ('png'\|'jpg'\|'webp')
		* @param {number} [options.imageQuality=0.8] - Quality for lossy formats (0..1)
		* @param {number} [options.maxImageWidth=1024] - Max width in px (downscales if larger)
		* @param {string} [options.interactiveSelector] - Custom CSS selector for interactive elements
		* @param {string} [options.semanticSelector] - Custom CSS selector for semantic elements
		* @param {Object} [options.labelStyle={}] - Override styles for annotation badges
		* @param {'compact'\|'verbose'} [options.promptMode='compact'] - Prompt text verbosity
		* @param {boolean} [options.includeCoords=true] - Include bbox in the prompt text
		* @returns {Object} SnapDOM plugin
		@@ -31,2 +35,29 @@ */

		// Attribute names of interest: ARIA, image, link and form-control attributes.
		const COLLECTED_ATTRS = [
		  'role',
		  'aria-label',
		  'aria-expanded',
		  'aria-checked',
		  'aria-disabled',
		  'alt',
		  'href',
		  'placeholder',
		  'name',
		  'type',
		  'value',
		  'title',
		  'disabled',
		];

		// Computed-style properties inspected when building an element's `styles`.
		const VISUAL_FIELDS = [
		  'display',
		  'visibility',
		  'opacity',
		  'color',
		  'backgroundColor',
		  'fontSize',
		  'fontWeight',
		  'cursor',
		  'overflow',
		];

		// Computed-style values treated as carrying no information for any
		// property. Keeping the `styles` object small is the difference between a
		// useful LLM input and token bloat on every element.
		const VISUAL_SKIP = new Set(['initial', 'normal', 'visible', 'auto', 'static', '0']);

		/**
		 * Decide whether a computed-style value should be left out of an
		 * element's `styles` object.
		 *
		 * @param {string} prop - Style property name in camelCase (e.g. 'fontWeight').
		 * @param {string} value - Computed value for that property.
		 * @returns {boolean} true when the value is empty or considered a default.
		 */
		function isDefaultStyleValue(prop, value) {
		  if (!value) return true;
		  // `opacity: 0` means the element is fully transparent, which is exactly
		  // the kind of fact the map should report. Check it before the generic
		  // skip list, whose '0' entry would otherwise discard it.
		  if (prop === 'opacity' && value === '0') return false;
		  if (VISUAL_SKIP.has(value)) return true;
		  if (prop === 'cursor' && value === 'none') return true;
		  if (prop === 'color' && value === 'rgb(0, 0, 0)') return true;
		  if (prop === 'backgroundColor' && value === 'rgba(0, 0, 0, 0)') return true;
		  if (prop === 'fontWeight' && (value === '400' || value === 'normal')) return true;
		  if (prop === 'opacity' && value === '1') return true;
		  return false;
		}

		export function promptExport(options = {}) {
		@@ -41,2 +72,4 @@ const {
		labelStyle = {},
		promptMode = 'compact',
		includeCoords = true,
		} = options;
		@@ -49,6 +82,4 @@
		const meta = extractMetadata(ctx.element, interactiveSelector, semanticSelector);
		// The ctx passed to afterClone is NOT the same object passed to the
		// prompt() export later — snapdom spreads from ctx.options there. Stash
		// on both so (a) standalone tests that pass a minimal ctx still see it
		// and (b) the prompt() call can read it through the shared options ref.
		// snapdom spreads a fresh ctx for the export phase from ctx.options,
		// so write to both so the prompt() call below can read it.
		ctx.__promptMetadata = meta;
		@@ -78,2 +109,4 @@ if (ctx.options) ctx.options.__promptMetadata = meta;
		const maxWidth = opts.maxImageWidth \|\| maxImageWidth;
		const mode = opts.promptMode \|\| promptMode;
		const withCoords = opts.includeCoords !== undefined ? opts.includeCoords : includeCoords;

		@@ -87,4 +120,3 @@ const img = new Image();

		const ratio =
		img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
		const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
		const w = Math.round(img.naturalWidth * ratio);
		@@ -96,11 +128,8 @@ const h = Math.round(img.naturalHeight * ratio);
		canvas.height = h;
		const c2d = canvas.getContext('2d');
		c2d.drawImage(img, 0, 0, w, h);
		canvas.getContext('2d').drawImage(img, 0, 0, w, h);

		const mime =
		format === 'jpg' \|\| format === 'jpeg'
		? 'image/jpeg'
		: format === 'webp'
		? 'image/webp'
		: 'image/png';
		format === 'jpg' \|\| format === 'jpeg' ? 'image/jpeg'
		: format === 'webp' ? 'image/webp'
		: 'image/png';
		const dataURL = canvas.toDataURL(mime, quality);
		@@ -121,2 +150,9 @@

		// In compact + annotate, badges already encode positions on the
		// image itself, so repeating coords in the prompt text is noise.
		const emitCoords = mode === 'verbose' ? true : (withCoords && !annotate);
		const prompt = mode === 'verbose'
		? formatPromptVerbose(elements, { width: w, height: h }, emitCoords)
		: formatPromptCompact(elements, { width: w, height: h }, emitCoords);

		return {
		@@ -126,3 +162,3 @@ image: dataURL,
		dimensions: { width: w, height: h },
		prompt: formatPromptText(elements, { width: w, height: h }),
		prompt,
		};
		@@ -135,2 +171,130 @@ },

		/* ── Accessible name ──────────────────────────── */

		/**
		 * Cap a string at `max` characters, ending it with "…" when it is cut.
		 *
		 * @param {string} str - Text to shorten; any falsy value yields ''.
		 * @param {number} max - Maximum length of the result, ellipsis included.
		 * @returns {string} The original text when it fits, otherwise a shortened copy.
		 */
		function truncate(str, max) {
		  if (!str) {
		    return '';
		  }
		  return str.length <= max ? str : `${str.slice(0, max - 1)}…`;
		}

		/**
		 * Compute the element's accessible name following a simplified WAI-ARIA order.
		 * This is what an LLM agent reads first to know what the element IS.
		 *
		 * Sources are tried in this order and the first non-empty one wins:
		 *   1. `aria-label`
		 *   2. text of the elements referenced by `aria-labelledby`
		 *   3. `alt` (only for IMG elements)
		 *   4. `title`
		 *   5. text of the first associated label (`el.labels[0]`)
		 *   6. the element's own text content, truncated to 40 characters
		 *
		 * @param {Element} el - Element to name.
		 * @returns {string} The trimmed name, or '' when no source provides one.
		 */
		function accessibleName(el) {
		  const ariaLabel = el.getAttribute('aria-label');
		  if (ariaLabel && ariaLabel.trim()) return ariaLabel.trim();

		  const labelledBy = el.getAttribute('aria-labelledby');
		  if (labelledBy) {
		    // Resolve ids against the element's own root when that root supports
		    // id lookup; otherwise fall back to the global document.
		    const root = el.getRootNode();
		    const getById = (id) =>
		      root && typeof root.getElementById === 'function'
		        ? root.getElementById(id)
		        : document.getElementById(id);
		    const parts = labelledBy.trim().split(/\s+/)
		      .map((id) => {
		        const ref = getById(id);
		        return ref ? (ref.textContent || '').trim() : '';
		      })
		      .filter(Boolean); // ignore ids that resolve to nothing or to empty text
		    if (parts.length) return parts.join(' ');
		  }

		  if (el.tagName === 'IMG') {
		    const alt = el.getAttribute('alt');
		    if (alt && alt.trim()) return alt.trim();
		  }

		  const title = el.getAttribute('title');
		  if (title && title.trim()) return title.trim();

		  if (el.labels && el.labels[0]) {
		    const t = (el.labels[0].textContent || '').trim();
		    if (t) return t;
		  }

		  const text = (el.textContent || '').trim();
		  return text ? truncate(text, 40) : '';
		}

		/* ── DOM state ────────────────────────────────── */

		/**
		 * Capture real runtime state (not just ARIA attributes). This is what
		 * separates a useful map from a static screenshot annotation: an agent can
		 * see "checkbox is unchecked / select has value=X / details is closed".
		 *
		 * Reported keys, when applicable: `checked`, `disabled`, `focus` (from the
		 * matching pseudo-classes), `value` (text-like inputs, textarea, select),
		 * `selectedText` (select) and `open` (details, or any element carrying an
		 * `open` attribute).
		 *
		 * @param {Element} el - Live element to inspect.
		 * @returns {Object|null} State object, or null when nothing was captured.
		 */
		function computeState(el) {
		  const state = {};
		  try {
		    if (el.matches(':checked')) state.checked = true;
		    if (el.matches(':disabled')) state.disabled = true;
		    if (el.matches(':focus')) state.focus = true;
		  } catch { /* best effort: some selectors may fail on exotic nodes */ }

		  const tag = el.tagName;
		  if (tag === 'INPUT') {
		    const type = (el.type || 'text').toLowerCase();
		    // Checkbox/radio: `value` defaults to "on" when no explicit value attr
		    // is set, which is noise; the meaningful signal is `checked`.
		    // Password: the typed secret must never be copied into a map that is
		    // rendered into prompt text.
		    const skipValue = type === 'checkbox' || type === 'radio' || type === 'password';
		    if (!skipValue && el.value) {
		      state.value = el.value;
		    }
		  } else if (tag === 'TEXTAREA') {
		    if (el.value) state.value = el.value;
		  } else if (tag === 'SELECT') {
		    // A select always reports its value, even when it is the empty string.
		    state.value = el.value;
		    const opt = el.options && el.options[el.selectedIndex];
		    if (opt) state.selectedText = opt.text || '';
		  } else if (tag === 'DETAILS') {
		    state.open = !!el.open;
		  } else if (el.hasAttribute && el.hasAttribute('open')) {
		    state.open = true;
		  }

		  return Object.keys(state).length ? state : null;
		}

		/* ── Visual styles ────────────────────────────── */

		/**
		 * Collect the non-default values of the VISUAL_FIELDS properties from an
		 * element's computed style. The element passed in is expected to be the
		 * live one: the clone uses class-based styling, so its inline style is
		 * empty and a detached `getComputedStyle(cloneNode)` returns initial
		 * values. The original's computed style is what snapdom used to build the
		 * capture, so the result matches what the screenshot shows.
		 *
		 * @param {Element} el - Element whose computed style is read.
		 * @returns {Object|null} Map of property to value, or null when the style
		 *   cannot be read or every inspected value is a default.
		 */
		function computeVisualStyles(el) {
		  let computed;
		  try {
		    computed = getComputedStyle(el);
		  } catch {
		    return null;
		  }
		  if (!computed) {
		    return null;
		  }

		  const picked = {};
		  VISUAL_FIELDS.forEach((field) => {
		    const value = computed[field];
		    if (!isDefaultStyleValue(field, value)) {
		      picked[field] = value;
		    }
		  });

		  if (Object.keys(picked).length === 0) {
		    return null;
		  }
		  return picked;
		}

		/* ── Covered detection ────────────────────────── */

		/**
		 * True when another element is painted on top of the center of this one.
		 * An agent that knows a button is covered by a modal won't try to click it.
		 *
		 * The check hit-tests the center point of `rect` with
		 * `document.elementFromPoint`; the element is considered covered when the
		 * hit is neither the element itself nor one of its descendants.
		 *
		 * @param {Element} el - Element being tested.
		 * @param {{left: number, top: number, width: number, height: number}} rect -
		 *   The element's box, in the coordinate space `elementFromPoint` expects.
		 * @returns {boolean} false when the box is empty, its center has a negative
		 *   coordinate, hit-testing is unavailable, or nothing else is hit.
		 */
		function isCovered(el, rect) {
		  if (!rect.width || !rect.height) return false;
		  const cx = rect.left + rect.width / 2;
		  const cy = rect.top + rect.height / 2;
		  if (cx < 0 || cy < 0) return false;
		  const doc = el.ownerDocument || document;
		  if (!doc.elementFromPoint) return false;
		  const top = doc.elementFromPoint(cx, cy);
		  // No hit at all, or the hit is the element itself: not covered.
		  if (!top || top === el) return false;
		  // Hitting a descendant still counts as hitting the element.
		  if (el.contains && el.contains(top)) return false;
		  return true;
		}

		/* ── Metadata extraction ──────────────────────── */
		@@ -169,7 +333,2 @@

		const COLLECTED_ATTRS = [
		'role', 'aria-label', 'aria-expanded', 'aria-checked', 'aria-disabled',
		'alt', 'href', 'placeholder', 'name', 'type', 'value', 'title', 'disabled',
		];

		function buildEntry(el, rootRect, id, type) {
		@@ -187,3 +346,4 @@ const rect = el.getBoundingClientRect();
		const tag = el.tagName.toLowerCase();
		const text = (el.textContent \|\| '').trim().slice(0, 200);
		const maxText = type === 'interactive' ? 200 : 120;
		const text = (el.textContent \|\| '').trim().slice(0, maxText);

		@@ -193,6 +353,48 @@ const attributes = {};
		const val = el.getAttribute(attr);
		if (val != null) attributes[attr] = val;
		if (val == null \|\| val === '' \|\| val === 'false') continue;
		attributes[attr] = val;
		}

		return { id, tag, type, text, bbox, attributes };
		// For <img>, replace the (potentially long, cache-busted) src with just the
		// filename — "logo.svg" is more useful to an LLM than the full URL.
		if (tag === 'img' && el.src) {
		try {
		const base = (el.ownerDocument && el.ownerDocument.location)
		? el.ownerDocument.location.href
		: (typeof location !== 'undefined' ? location.href : undefined);
		const url = base ? new URL(el.src, base) : new URL(el.src);
		attributes.src = url.pathname.split('/').pop() \|\| el.src;
		} catch {
		attributes.src = el.src;
		}
		}

		const entry = {
		id,
		tag,
		type,
		name: accessibleName(el),
		text,
		bbox,
		attributes,
		};

		const styles = computeVisualStyles(el);
		if (styles) entry.styles = styles;

		if (type === 'interactive') {
		const state = computeState(el);
		if (state) {
		// Drop state.value when it just echoes attributes.value — no
		// divergence between the initial HTML attribute and the current
		// property, nothing for the LLM to learn from the repeat.
		if (state.value !== undefined && state.value === attributes.value) {
		delete state.value;
		}
		if (Object.keys(state).length) entry.state = state;
		}
		if (isCovered(el, rect)) entry.covered = true;
		}

		return entry;
		}
		@@ -223,6 +425,10 @@
		badge.setAttribute('data-snap-prompt-label', String(el.id));
		// Center the badge on the element's bbox, not on its top-left corner:
		// `translate(-50%, -50%)` offsets from the anchor point.
		const cx = el.bbox.x + el.bbox.width / 2;
		const cy = el.bbox.y + el.bbox.height / 2;
		Object.assign(badge.style, {
		position: 'absolute',
		left: `${el.bbox.x}px`,
		top: `${el.bbox.y}px`,
		left: `${cx}px`,
		top: `${cy}px`,
		transform: 'translate(-50%, -50%)',
		@@ -251,7 +457,52 @@ minWidth: '18px',

		/* ── Prompt text formatter ────────────────────── */
		/* ── Prompt text formatters ───────────────────── */

		function formatPromptText(elements, dimensions) {
		/**
		 * Render a state object as a compact ` {flag, key=value}` suffix.
		 * Entries whose value is exactly `true` are written as bare flags and come
		 * first; every other entry is written as `key=<JSON value>`.
		 *
		 * @param {Object|null|undefined} state - State map of an element.
		 * @returns {string} The suffix (with a leading space), or '' when there is
		 *   no state to show.
		 */
		function stateToStr(state) {
		  if (!state) {
		    return '';
		  }
		  const bare = [];
		  const keyed = [];
		  Object.entries(state).forEach(([key, value]) => {
		    if (value === true) {
		      bare.push(key);
		    } else {
		      keyed.push(`${key}=${JSON.stringify(value)}`);
		    }
		  });
		  const parts = bare.concat(keyed);
		  return parts.length === 0 ? '' : ` {${parts.join(', ')}}`;
		}

		/**
		 * Render a bounding box as a ` (x,y width×height)` suffix.
		 *
		 * @param {{x: number, y: number, width: number, height: number}} bbox
		 * @returns {string} The formatted box, with a leading space.
		 */
		function coordsToStr(bbox) {
		  const { x, y, width, height } = bbox;
		  const origin = `${x},${y}`;
		  const size = `${width}×${height}`;
		  return ` (${origin} ${size})`;
		}

		/**
		 * Build the compact prompt text: a one-line screenshot header followed by
		 * an "Interactive:" section and a "Semantic:" section, each listing one
		 * element per line and each omitted when it would be empty.
		 *
		 * @param {Array<Object>} elements - Element entries (`id`, `tag`, `type`,
		 *   `name`, and for interactive ones `state`, `covered`, `bbox`).
		 * @param {{width: number, height: number}} dimensions - Screenshot size in px.
		 * @param {boolean} withCoords - Append each interactive element's bbox.
		 * @returns {string} Lines joined with '\n'.
		 */
		function formatPromptCompact(elements, dimensions, withCoords) {
		  const ofType = (kind) => elements.filter((entry) => entry.type === kind);
		  const quotedName = (entry) => (entry.name ? ` "${truncate(entry.name, 60)}"` : '');

		  const out = [`Screenshot (${dimensions.width}×${dimensions.height}px).`];
		  const interactiveEntries = ofType('interactive');
		  const semanticEntries = ofType('semantic');

		  if (interactiveEntries.length > 0) {
		    out.push('', 'Interactive:');
		    interactiveEntries.forEach((entry) => {
		      const label = quotedName(entry);
		      const stateText = stateToStr(entry.state);
		      const coveredText = entry.covered ? ' (covered)' : '';
		      const coordText = withCoords ? coordsToStr(entry.bbox) : '';
		      out.push(` [${entry.id}] ${entry.tag}${label}${stateText}${coveredText}${coordText}`);
		    });
		  }

		  if (semanticEntries.length > 0) {
		    out.push('', 'Semantic:');
		    semanticEntries.forEach((entry) => {
		      out.push(` [${entry.id}] ${entry.tag}${quotedName(entry)}`);
		    });
		  }

		  return out.join('\n');
		}

		function formatPromptVerbose(elements, dimensions, withCoords) {
		const lines = [
		`Screenshot of a web page (${dimensions.width}\u00d7${dimensions.height}px).`,
		`Screenshot of a web page (${dimensions.width}×${dimensions.height}px).`,
		'',
		@@ -266,9 +517,9 @@ ];
		for (const el of interactive) {
		const attrParts = Object.entries(el.attributes).map(
		([k, v]) => `${k}="${v}"`
		);
		const text = el.text ? ` "${truncate(el.text, 60)}"` : '';
		const pos = `(${el.bbox.x},${el.bbox.y} ${el.bbox.width}\u00d7${el.bbox.height})`;
		const name = el.name ? ` "${truncate(el.name, 80)}"` : '';
		const attrParts = Object.entries(el.attributes).map(([k, v]) => `${k}="${v}"`);
		const attrs = attrParts.length ? ' ' + attrParts.join(' ') : '';
		lines.push(` [${el.id}] <${el.tag}>${text} ${pos}${attrs}`);
		const pos = withCoords ? coordsToStr(el.bbox) : '';
		const st = stateToStr(el.state);
		const cov = el.covered ? ' (covered)' : '';
		lines.push(` [${el.id}] <${el.tag}>${name}${pos}${attrs}${st}${cov}`);
		}
		@@ -281,3 +532,3 @@ lines.push('');
		for (const el of semantic) {
		const text = el.text ? ` "${truncate(el.text, 80)}"` : '';
		const name = el.name ? ` "${truncate(el.name, 80)}"` : '';
		const attrParts = [];
		@@ -287,3 +538,3 @@ if (el.attributes.alt) attrParts.push(`alt="${el.attributes.alt}"`);
		const attrs = attrParts.length ? ' ' + attrParts.join(' ') : '';
		lines.push(` [${el.id}] <${el.tag}>${text}${attrs}`);
		lines.push(` [${el.id}] <${el.tag}>${name}${attrs}`);
		}
		@@ -295,6 +546,1 @@ lines.push('');
		}

		/**
		 * Cap a string at `max` characters, ending it with an ellipsis when cut.
		 * `str` must be a string: its length is read without a null check.
		 *
		 * @param {string} str - Text to shorten.
		 * @param {number} max - Maximum length of the result, ellipsis included.
		 * @returns {string} The original text when it fits, otherwise a shortened copy.
		 */
		function truncate(str, max) {
		  const fits = str.length <= max;
		  return fits ? str : str.slice(0, max - 1) + '\u2026';
		}

@zumer/snapdom-plugins - npm Package Compare versions

Improved metrics