@zumer/snapdom-plugins
Advanced tools
+1
-1
| { | ||
| "name": "@zumer/snapdom-plugins", | ||
| "version": "1.0.3", | ||
| "version": "1.1.0", | ||
| "description": "Official plugins for SnapDOM", | ||
@@ -5,0 +5,0 @@ "type": "module", |
+293
-47
| /** | ||
| * promptExport – Official SnapDOM Plugin | ||
| * Produces an LLM-ready package: annotated screenshot + structured element map + prompt text. | ||
| * Produces an LLM-ready package: annotated screenshot + structured element | ||
| * map + prompt text. Tuned for vision-capable LLMs reading the image + map | ||
| * together (Set-of-Mark pattern). | ||
| * | ||
@@ -12,8 +14,10 @@ * Usage: | ||
| * @param {boolean} [options.annotate=true] - Overlay numbered badges on interactive elements | ||
| * @param {string} [options.imageFormat='png'] - Output image format ('png'|'jpg'|'webp') | ||
| * @param {number} [options.imageQuality=0.8] - Quality for lossy formats (0..1) | ||
| * @param {number} [options.maxImageWidth=1024] - Max width in px (downscales if larger) | ||
| * @param {string} [options.interactiveSelector] - Custom CSS selector for interactive elements | ||
| * @param {string} [options.semanticSelector] - Custom CSS selector for semantic elements | ||
| * @param {Object} [options.labelStyle={}] - Override styles for annotation badges | ||
| * @param {string} [options.imageFormat='png'] - Output image format ('png'|'jpg'|'webp') | ||
| * @param {number} [options.imageQuality=0.8] - Quality for lossy formats (0..1) | ||
| * @param {number} [options.maxImageWidth=1024] - Max width in px (downscales if larger) | ||
| * @param {string} [options.interactiveSelector] - Custom CSS selector for interactive elements | ||
| * @param {string} [options.semanticSelector] - Custom CSS selector for semantic elements | ||
| * @param {Object} [options.labelStyle={}] - Override styles for annotation badges | ||
| * @param {'compact'|'verbose'} [options.promptMode='compact'] - Prompt text verbosity | ||
| * @param {boolean} [options.includeCoords=true] - Include bbox in the prompt text | ||
| * @returns {Object} SnapDOM plugin | ||
@@ -31,2 +35,29 @@ */ | ||
// HTML / ARIA attribute names of interest when describing an element.
const COLLECTED_ATTRS = [
  // ARIA role and states
  'role',
  'aria-label',
  'aria-expanded',
  'aria-checked',
  'aria-disabled',
  // Native descriptive attributes
  'alt',
  'href',
  'placeholder',
  'name',
  'type',
  'value',
  'title',
  'disabled',
];

// Computed-style properties (camelCase, as exposed on a CSSStyleDeclaration)
// that computeVisualStyles() reads for each element.
const VISUAL_FIELDS = [
  'display',
  'visibility',
  'opacity',
  'color',
  'backgroundColor',
  'fontSize',
  'fontWeight',
  'cursor',
  'overflow',
];
// Common computed-style values that carry no information. Keeping the
// `styles` object small is the difference between a useful LLM input and
// token bloat on every element.
const VISUAL_SKIP = new Set(['initial', 'normal', 'visible', 'auto', 'static', '0']);

/**
 * Decide whether a computed-style value is uninformative and can be left out
 * of an element's `styles` object.
 *
 * @param {string} prop - camelCase style property name (e.g. 'backgroundColor')
 * @param {string} value - computed value for that property
 * @returns {boolean} true when the value should be omitted
 */
function isDefaultStyleValue(prop, value) {
  if (!value) return true;
  // `opacity: 0` means "fully transparent" — the strongest visibility signal
  // there is. It must be judged before the generic skip list, which contains
  // '0' and would otherwise silently drop it.
  if (prop === 'opacity' && value === '0') return false;
  if (VISUAL_SKIP.has(value)) return true;
  // NOTE(review): `cursor: none` hides the pointer and is not the CSS default
  // ('auto'); kept as "uninformative" to preserve existing output — confirm.
  if (prop === 'cursor' && value === 'none') return true;
  if (prop === 'color' && value === 'rgb(0, 0, 0)') return true;
  if (prop === 'backgroundColor' && value === 'rgba(0, 0, 0, 0)') return true;
  if (prop === 'fontWeight' && (value === '400' || value === 'normal')) return true;
  if (prop === 'opacity' && value === '1') return true;
  return false;
}
| export function promptExport(options = {}) { | ||
@@ -41,2 +72,4 @@ const { | ||
| labelStyle = {}, | ||
| promptMode = 'compact', | ||
| includeCoords = true, | ||
| } = options; | ||
@@ -49,6 +82,4 @@ | ||
| const meta = extractMetadata(ctx.element, interactiveSelector, semanticSelector); | ||
| // The ctx passed to afterClone is NOT the same object passed to the | ||
| // prompt() export later — snapdom spreads from ctx.options there. Stash | ||
| // on both so (a) standalone tests that pass a minimal ctx still see it | ||
| // and (b) the prompt() call can read it through the shared options ref. | ||
| // snapdom spreads a fresh ctx for the export phase from ctx.options, | ||
| // so write to both so the prompt() call below can read it. | ||
| ctx.__promptMetadata = meta; | ||
@@ -78,2 +109,4 @@ if (ctx.options) ctx.options.__promptMetadata = meta; | ||
| const maxWidth = opts.maxImageWidth || maxImageWidth; | ||
| const mode = opts.promptMode || promptMode; | ||
| const withCoords = opts.includeCoords !== undefined ? opts.includeCoords : includeCoords; | ||
@@ -87,4 +120,3 @@ const img = new Image(); | ||
| const ratio = | ||
| img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1; | ||
| const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1; | ||
| const w = Math.round(img.naturalWidth * ratio); | ||
@@ -96,11 +128,8 @@ const h = Math.round(img.naturalHeight * ratio); | ||
| canvas.height = h; | ||
| const c2d = canvas.getContext('2d'); | ||
| c2d.drawImage(img, 0, 0, w, h); | ||
| canvas.getContext('2d').drawImage(img, 0, 0, w, h); | ||
| const mime = | ||
| format === 'jpg' || format === 'jpeg' | ||
| ? 'image/jpeg' | ||
| : format === 'webp' | ||
| ? 'image/webp' | ||
| : 'image/png'; | ||
| format === 'jpg' || format === 'jpeg' ? 'image/jpeg' | ||
| : format === 'webp' ? 'image/webp' | ||
| : 'image/png'; | ||
| const dataURL = canvas.toDataURL(mime, quality); | ||
@@ -121,2 +150,9 @@ | ||
| // In compact + annotate, badges already encode positions on the | ||
| // image itself, so repeating coords in the prompt text is noise. | ||
| const emitCoords = mode === 'verbose' ? true : (withCoords && !annotate); | ||
| const prompt = mode === 'verbose' | ||
| ? formatPromptVerbose(elements, { width: w, height: h }, emitCoords) | ||
| : formatPromptCompact(elements, { width: w, height: h }, emitCoords); | ||
| return { | ||
@@ -126,3 +162,3 @@ image: dataURL, | ||
| dimensions: { width: w, height: h }, | ||
| prompt: formatPromptText(elements, { width: w, height: h }), | ||
| prompt, | ||
| }; | ||
@@ -135,2 +171,130 @@ }, | ||
| /* ── Accessible name ──────────────────────────── */ | ||
/**
 * Shorten a string to at most `max` UTF-16 code units, ending it with an
 * ellipsis when anything was cut.
 *
 * @param {string|null|undefined} str - text to shorten; falsy input yields ''
 * @param {number} max - maximum length of the result, ellipsis included
 * @returns {string} `str` unchanged when it already fits, otherwise a prefix
 *   followed by '…'; '' when `max` leaves no room for any character
 */
function truncate(str, max) {
  if (!str) return '';
  if (str.length <= max) return str;
  // With max < 1 there is no room even for the ellipsis; without this guard
  // `slice(0, max - 1)` turns into a negative-index slice and returns almost
  // the whole string.
  if (!(max >= 1)) return '';
  let head = str.slice(0, max - 1);
  // Never end on a lone high surrogate: cutting an astral character (emoji,
  // etc.) in half would emit an invalid code unit into the prompt text.
  const last = head.charCodeAt(head.length - 1);
  if (last >= 0xd800 && last <= 0xdbff) head = head.slice(0, -1);
  return head + '…';
}
/**
 * Compute the element's accessible name following a simplified version of the
 * W3C accessible-name order. This is what an LLM agent reads first to know
 * what the element IS.
 *
 * Lookup order (first non-empty wins):
 *   1. `aria-labelledby` — text of the referenced elements, space-joined
 *   2. `aria-label`
 *   3. `alt` (only for IMG)
 *   4. first associated `<label>` (via `el.labels`)
 *   5. `title`
 *   6. the element's own text content, truncated to 40 characters
 *
 * Runs of whitespace are collapsed to a single space so a name never carries
 * newlines or source indentation into the line-oriented prompt text.
 *
 * @param {Element} el - element to name
 * @returns {string} the accessible name, or '' when none can be derived
 */
function accessibleName(el) {
  const clean = (s) => (s || '').replace(/\s+/g, ' ').trim();

  // aria-labelledby takes precedence over aria-label.
  const labelledBy = el.getAttribute('aria-labelledby');
  if (labelledBy && labelledBy.trim()) {
    // Resolve ids against the element's own root first so references inside
    // a shadow tree work; fall back to the global document when the root has
    // no getElementById (e.g. a detached subtree).
    const root = typeof el.getRootNode === 'function' ? el.getRootNode() : null;
    const getById = (id) => {
      if (root && typeof root.getElementById === 'function') {
        return root.getElementById(id);
      }
      return typeof document !== 'undefined' ? document.getElementById(id) : null;
    };
    const parts = labelledBy.trim().split(/\s+/)
      .map((id) => {
        const ref = getById(id);
        return ref ? clean(ref.textContent) : '';
      })
      .filter(Boolean);
    if (parts.length) return parts.join(' ');
  }

  const ariaLabel = clean(el.getAttribute('aria-label'));
  if (ariaLabel) return ariaLabel;

  if (el.tagName === 'IMG') {
    const alt = clean(el.getAttribute('alt'));
    if (alt) return alt;
  }

  // A real <label> names the control; `title` is only a tooltip fallback.
  if (el.labels && el.labels[0]) {
    const labelText = clean(el.labels[0].textContent);
    if (labelText) return labelText;
  }

  const title = clean(el.getAttribute('title'));
  if (title) return title;

  const text = clean(el.textContent);
  return text ? truncate(text, 40) : '';
}
| /* ── DOM state ────────────────────────────────── */ | ||
/**
 * Capture real runtime state (not just ARIA attributes). This is what
 * separates a useful map from a static screenshot annotation: an agent can
 * see "checkbox is unchecked / select has value=X / details is closed".
 *
 * Reported keys:
 *   - `checked`, `disabled`, `focus`: true when the matching pseudo-class applies
 *   - `value`: current value of INPUT (except checkbox/radio) and TEXTAREA
 *     when non-empty, and of SELECT always; password inputs report the fixed
 *     marker '[redacted]' instead of the real value
 *   - `selectedText`: text of the selected option of a SELECT
 *   - `open`: boolean for DETAILS; true for any other element carrying an
 *     `open` attribute
 *
 * @param {Element} el - live element to inspect
 * @returns {Object|null} state object, or null when nothing was captured
 */
function computeState(el) {
  const state = {};

  // Each pseudo-class is probed on its own so that one failing selector
  // cannot hide the others.
  const pseudoFlags = [
    [':checked', 'checked'],
    [':disabled', 'disabled'],
    [':focus', 'focus'],
  ];
  for (const [selector, key] of pseudoFlags) {
    try {
      if (el.matches(selector)) state[key] = true;
    } catch { /* some selectors may fail on exotic nodes */ }
  }

  const tag = el.tagName;
  if (tag === 'INPUT') {
    const type = (el.type || 'text').toLowerCase();
    if (type === 'password') {
      // Never ship a typed secret into prompt text; a fixed marker still tells
      // the agent that the field is filled without leaking content or length.
      if (el.value) state.value = '[redacted]';
    } else if (type !== 'checkbox' && type !== 'radio' && el.value) {
      // Checkbox/radio: `value` defaults to "on" when no explicit value attr
      // is set — noise. The meaningful signal is `checked`, so value is
      // skipped for those types.
      state.value = el.value;
    }
  } else if (tag === 'TEXTAREA') {
    if (el.value) state.value = el.value;
  } else if (tag === 'SELECT') {
    state.value = el.value;
    const opt = el.options && el.options[el.selectedIndex];
    if (opt) state.selectedText = opt.text || '';
  } else if (tag === 'DETAILS') {
    state.open = !!el.open;
  } else if (el.hasAttribute && el.hasAttribute('open')) {
    state.open = true;
  }

  return Object.keys(state).length ? state : null;
}
| /* ── Visual styles ────────────────────────────── */ | ||
/**
 * Collect the informative subset of an element's computed styles.
 *
 * Styles are read from the element passed in via `getComputedStyle`; per the
 * original author's note this should be the live element, because the clone
 * is styled through classes (empty inline style) and a detached clone would
 * only report initial values. Every property listed in VISUAL_FIELDS is
 * looked up, and values judged uninformative by isDefaultStyleValue() are
 * dropped.
 *
 * @param {Element} el - element whose computed style is inspected
 * @returns {Object|null} map of property name to computed value, or null when
 *   the style cannot be obtained or nothing informative remains
 */
function computeVisualStyles(el) {
  let computed;
  try {
    computed = getComputedStyle(el);
  } catch {
    return null;
  }
  if (!computed) return null;

  const picked = {};
  let kept = 0;
  VISUAL_FIELDS.forEach((field) => {
    const value = computed[field];
    if (!isDefaultStyleValue(field, value)) {
      picked[field] = value;
      kept += 1;
    }
  });
  return kept > 0 ? picked : null;
}
| /* ── Covered detection ────────────────────────── */ | ||
/**
 * True when another element is painted on top of the center of this one.
 * An agent that knows a button is covered by a modal won't try to click it.
 *
 * The hit test starts at the owning document and then descends into open
 * shadow roots, because `document.elementFromPoint` only ever reports the
 * shadow host for content rendered inside a shadow tree. Without that step
 * every element living in a shadow tree would look "covered" by its own host.
 *
 * @param {Element} el - element being tested
 * @param {{left:number, top:number, width:number, height:number}} rect -
 *   the element's bounding rect in viewport coordinates
 * @returns {boolean} true only when a different, unrelated element is hit at
 *   the rect's center; false when that cannot be determined
 */
function isCovered(el, rect) {
  if (!rect.width || !rect.height) return false;
  const cx = rect.left + rect.width / 2;
  const cy = rect.top + rect.height / 2;
  if (cx < 0 || cy < 0) return false;

  const doc = el.ownerDocument || (typeof document !== 'undefined' ? document : null);
  if (!doc || !doc.elementFromPoint) return false;

  // Descend through open shadow roots to the innermost element at the point.
  let top = doc.elementFromPoint(cx, cy);
  while (
    top && top !== el &&
    top.shadowRoot && typeof top.shadowRoot.elementFromPoint === 'function'
  ) {
    const inner = top.shadowRoot.elementFromPoint(cx, cy);
    if (!inner || inner === top) break;
    top = inner;
  }

  if (!top || top === el) return false;
  if (el.contains && el.contains(top)) return false;

  // If the hit is a shadow host that (transitively) hosts `el` and could not
  // be descended into (closed root, or no inner hit reported), there is no
  // evidence of an overlay — do not report the element as covered.
  let node = el;
  while (node && typeof node.getRootNode === 'function') {
    const root = node.getRootNode();
    const host = root && root.host;
    if (!host) break;
    if (host === top) return false;
    node = host;
  }
  return true;
}
| /* ── Metadata extraction ──────────────────────── */ | ||
@@ -169,7 +333,2 @@ | ||
| const COLLECTED_ATTRS = [ | ||
| 'role', 'aria-label', 'aria-expanded', 'aria-checked', 'aria-disabled', | ||
| 'alt', 'href', 'placeholder', 'name', 'type', 'value', 'title', 'disabled', | ||
| ]; | ||
| function buildEntry(el, rootRect, id, type) { | ||
@@ -187,3 +346,4 @@ const rect = el.getBoundingClientRect(); | ||
| const tag = el.tagName.toLowerCase(); | ||
| const text = (el.textContent || '').trim().slice(0, 200); | ||
| const maxText = type === 'interactive' ? 200 : 120; | ||
| const text = (el.textContent || '').trim().slice(0, maxText); | ||
@@ -193,6 +353,48 @@ const attributes = {}; | ||
| const val = el.getAttribute(attr); | ||
| if (val != null) attributes[attr] = val; | ||
| if (val == null || val === '' || val === 'false') continue; | ||
| attributes[attr] = val; | ||
| } | ||
| return { id, tag, type, text, bbox, attributes }; | ||
| // For <img>, replace the (potentially long, cache-busted) src with just the | ||
| // filename — "logo.svg" is more useful to an LLM than the full URL. | ||
| if (tag === 'img' && el.src) { | ||
| try { | ||
| const base = (el.ownerDocument && el.ownerDocument.location) | ||
| ? el.ownerDocument.location.href | ||
| : (typeof location !== 'undefined' ? location.href : undefined); | ||
| const url = base ? new URL(el.src, base) : new URL(el.src); | ||
| attributes.src = url.pathname.split('/').pop() || el.src; | ||
| } catch { | ||
| attributes.src = el.src; | ||
| } | ||
| } | ||
| const entry = { | ||
| id, | ||
| tag, | ||
| type, | ||
| name: accessibleName(el), | ||
| text, | ||
| bbox, | ||
| attributes, | ||
| }; | ||
| const styles = computeVisualStyles(el); | ||
| if (styles) entry.styles = styles; | ||
| if (type === 'interactive') { | ||
| const state = computeState(el); | ||
| if (state) { | ||
| // Drop state.value when it just echoes attributes.value — no | ||
| // divergence between the initial HTML attribute and the current | ||
| // property, nothing for the LLM to learn from the repeat. | ||
| if (state.value !== undefined && state.value === attributes.value) { | ||
| delete state.value; | ||
| } | ||
| if (Object.keys(state).length) entry.state = state; | ||
| } | ||
| if (isCovered(el, rect)) entry.covered = true; | ||
| } | ||
| return entry; | ||
| } | ||
@@ -223,6 +425,10 @@ | ||
| badge.setAttribute('data-snap-prompt-label', String(el.id)); | ||
| // Center the badge on the element's bbox, not on its top-left corner: | ||
| // `translate(-50%, -50%)` offsets from the anchor point. | ||
| const cx = el.bbox.x + el.bbox.width / 2; | ||
| const cy = el.bbox.y + el.bbox.height / 2; | ||
| Object.assign(badge.style, { | ||
| position: 'absolute', | ||
| left: `${el.bbox.x}px`, | ||
| top: `${el.bbox.y}px`, | ||
| left: `${cx}px`, | ||
| top: `${cy}px`, | ||
| transform: 'translate(-50%, -50%)', | ||
@@ -251,7 +457,52 @@ minWidth: '18px', | ||
| /* ── Prompt text formatter ────────────────────── */ | ||
| /* ── Prompt text formatters ───────────────────── */ | ||
| function formatPromptText(elements, dimensions) { | ||
/**
 * Render a state object as a compact ` {flag, key=value}` suffix.
 *
 * Keys whose value is exactly `true` are printed as bare flags and come
 * first; every other key is printed as `key=<JSON value>` afterwards.
 *
 * @param {Object|null|undefined} state - state map, if any
 * @returns {string} the suffix with a leading space, or '' when there is
 *   nothing to print
 */
function stateToStr(state) {
  if (!state) return '';

  const entries = Object.entries(state);
  const bare = entries
    .filter(([, value]) => value === true)
    .map(([key]) => key);
  const keyed = entries
    .filter(([, value]) => value !== true)
    .map(([key, value]) => `${key}=${JSON.stringify(value)}`);

  const parts = bare.concat(keyed);
  return parts.length > 0 ? ` {${parts.join(', ')}}` : '';
}
/**
 * Render a bounding box as a ` (x,y width×height)` suffix.
 *
 * @param {{x:number, y:number, width:number, height:number}} bbox
 * @returns {string} the formatted box with a leading space
 */
function coordsToStr(bbox) {
  const { x, y, width, height } = bbox;
  const origin = `${x},${y}`;
  const size = `${width}×${height}`;
  return ` (${origin} ${size})`;
}
/**
 * Build the compact prompt text: a header line with the screenshot size,
 * then one line per interactive element and one line per semantic element.
 *
 * Interactive lines carry the quoted name (cut to 60 chars), the state
 * suffix, a ` (covered)` marker and, when requested, the bbox. Semantic lines
 * carry only id, tag and quoted name. A section is omitted when it has no
 * elements.
 *
 * @param {Array<Object>} elements - entries with `id`, `tag`, `type`, `name`
 *   and, for interactive ones, optional `state`, `covered` and `bbox`
 * @param {{width:number, height:number}} dimensions - screenshot size in px
 * @param {boolean} withCoords - append the bbox to interactive lines
 * @returns {string} newline-joined prompt text
 */
function formatPromptCompact(elements, dimensions, withCoords) {
  const out = [`Screenshot (${dimensions.width}×${dimensions.height}px).`];
  const ofType = (kind) => elements.filter((entry) => entry.type === kind);
  const quotedName = (entry) => (entry.name ? ` "${truncate(entry.name, 60)}"` : '');

  const interactive = ofType('interactive');
  const semantic = ofType('semantic');

  if (interactive.length > 0) {
    out.push('', 'Interactive:');
    interactive.forEach((entry) => {
      const label = quotedName(entry);
      const stateSuffix = stateToStr(entry.state);
      const coveredSuffix = entry.covered ? ' (covered)' : '';
      const coordSuffix = withCoords ? coordsToStr(entry.bbox) : '';
      out.push(` [${entry.id}] ${entry.tag}${label}${stateSuffix}${coveredSuffix}${coordSuffix}`);
    });
  }

  if (semantic.length > 0) {
    out.push('', 'Semantic:');
    semantic.forEach((entry) => {
      out.push(` [${entry.id}] ${entry.tag}${quotedName(entry)}`);
    });
  }

  return out.join('\n');
}
| function formatPromptVerbose(elements, dimensions, withCoords) { | ||
| const lines = [ | ||
| `Screenshot of a web page (${dimensions.width}\u00d7${dimensions.height}px).`, | ||
| `Screenshot of a web page (${dimensions.width}×${dimensions.height}px).`, | ||
| '', | ||
@@ -266,9 +517,9 @@ ]; | ||
| for (const el of interactive) { | ||
| const attrParts = Object.entries(el.attributes).map( | ||
| ([k, v]) => `${k}="${v}"` | ||
| ); | ||
| const text = el.text ? ` "${truncate(el.text, 60)}"` : ''; | ||
| const pos = `(${el.bbox.x},${el.bbox.y} ${el.bbox.width}\u00d7${el.bbox.height})`; | ||
| const name = el.name ? ` "${truncate(el.name, 80)}"` : ''; | ||
| const attrParts = Object.entries(el.attributes).map(([k, v]) => `${k}="${v}"`); | ||
| const attrs = attrParts.length ? ' ' + attrParts.join(' ') : ''; | ||
| lines.push(` [${el.id}] <${el.tag}>${text} ${pos}${attrs}`); | ||
| const pos = withCoords ? coordsToStr(el.bbox) : ''; | ||
| const st = stateToStr(el.state); | ||
| const cov = el.covered ? ' (covered)' : ''; | ||
| lines.push(` [${el.id}] <${el.tag}>${name}${pos}${attrs}${st}${cov}`); | ||
| } | ||
@@ -281,3 +532,3 @@ lines.push(''); | ||
| for (const el of semantic) { | ||
| const text = el.text ? ` "${truncate(el.text, 80)}"` : ''; | ||
| const name = el.name ? ` "${truncate(el.name, 80)}"` : ''; | ||
| const attrParts = []; | ||
@@ -287,3 +538,3 @@ if (el.attributes.alt) attrParts.push(`alt="${el.attributes.alt}"`); | ||
| const attrs = attrParts.length ? ' ' + attrParts.join(' ') : ''; | ||
| lines.push(` [${el.id}] <${el.tag}>${text}${attrs}`); | ||
| lines.push(` [${el.id}] <${el.tag}>${name}${attrs}`); | ||
| } | ||
@@ -295,6 +546,1 @@ lines.push(''); | ||
| } | ||
| function truncate(str, max) { | ||
| if (str.length <= max) return str; | ||
| return str.slice(0, max - 1) + '\u2026'; | ||
| } |
44220
25%898
31.86%