@zumer/snapdom-plugins - npm Package Compare versions

Comparing version

1.1.0

1.2.0

+1

-1

package.json

		{
		"name": "@zumer/snapdom-plugins",
		"version": "1.1.0",
		"version": "1.2.0",
		"description": "Official plugins for SnapDOM",
		@@ -5,0 +5,0 @@ "type": "module",

+62

-39

prompt-export.js

		@@ -22,2 +22,7 @@ /**
		* @param {boolean} [options.includeCoords=true] - Include bbox in the prompt text
		* @param {string[]} [options.include] - Which fields to return. Default
		* ['elements', 'prompt']. For vision-dependent tasks (chart content, layout QA,
		* canvas) pass ['image', 'elements', 'prompt'] or add 'image' to the array. For
		* text-only agent prompts pass ['prompt'] (cheapest — skips canvas draw entirely).
		* Accepted values: 'image', 'elements', 'prompt'.
		* @returns {Object} SnapDOM plugin
		@@ -62,2 +67,8 @@ */

		// Fields returned by the prompt export when no `include` option is given
		// (used as the fallback for both the constructor option and the per-call
		// option).
		// Default omits 'image'. Benchmarking showed the text + JSON map is enough
		// to answer most UI-inspection questions and uses ~14× fewer tokens. Pass
		// `include: ['image', 'elements', 'prompt']` explicitly when the task truly
		// depends on vision (charts, canvas content, layout QA).
		const DEFAULT_INCLUDE = ['elements', 'prompt'];

		export function promptExport(options = {}) {
		@@ -74,2 +85,3 @@ const {
		includeCoords = true,
		include = DEFAULT_INCLUDE,
		} = options;
		@@ -96,9 +108,13 @@
		const meta = ctx.__promptMetadata;
		const wantSet = new Set(opts.include \|\| include \|\| DEFAULT_INCLUDE);
		const wantImage = wantSet.has('image');
		const wantElements = wantSet.has('elements');
		const wantPrompt = wantSet.has('prompt');

		if (!meta \|\| !meta.elements.length) {
		return {
		image: ctx.export.url,
		elements: [],
		dimensions: { width: 0, height: 0 },
		prompt: '',
		};
		const empty = { dimensions: { width: 0, height: 0 } };
		if (wantImage) empty.image = ctx.export.url;
		if (wantElements) empty.elements = [];
		if (wantPrompt) empty.prompt = '';
		return empty;
		}
		@@ -112,28 +128,36 @@

		const img = new Image();
		img.src = ctx.export.url;
		await new Promise((res, rej) => {
		img.onload = res;
		img.onerror = rej;
		});
		// Only load + rasterize the SVG when the caller actually wants the
		// image. Skipping saves the img decode + canvas draw + toDataURL —
		// the most expensive steps of this export.
		let w, h, dataURL;
		if (wantImage) {
		const img = new Image();
		img.src = ctx.export.url;
		await new Promise((res, rej) => { img.onload = res; img.onerror = rej; });
		const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
		w = Math.round(img.naturalWidth * ratio);
		h = Math.round(img.naturalHeight * ratio);
		const canvas = document.createElement('canvas');
		canvas.width = w;
		canvas.height = h;
		canvas.getContext('2d').drawImage(img, 0, 0, w, h);
		const mime =
		format === 'jpg' \|\| format === 'jpeg' ? 'image/jpeg'
		: format === 'webp' ? 'image/webp'
		: 'image/png';
		dataURL = canvas.toDataURL(mime, quality);
		} else {
		// No image — scale bboxes to the same target width the image would
		// have used, so downstream callers can still render the map over
		// a separately-rendered screenshot at the same scale.
		const sourceW = meta.dimensions.width \|\| 1;
		const ratio = sourceW > maxWidth ? maxWidth / sourceW : 1;
		w = Math.round(sourceW * ratio);
		h = Math.round(meta.dimensions.height * ratio);
		}

		const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
		const w = Math.round(img.naturalWidth * ratio);
		const h = Math.round(img.naturalHeight * ratio);

		const canvas = document.createElement('canvas');
		canvas.width = w;
		canvas.height = h;
		canvas.getContext('2d').drawImage(img, 0, 0, w, h);

		const mime =
		format === 'jpg' \|\| format === 'jpeg' ? 'image/jpeg'
		: format === 'webp' ? 'image/webp'
		: 'image/png';
		const dataURL = canvas.toDataURL(mime, quality);

		const sx = w / (meta.dimensions.width \|\| 1);
		const sy = h / (meta.dimensions.height \|\| 1);

		const elements = meta.elements.map((el) => ({
		const scaledElements = meta.elements.map((el) => ({
		...el,
		@@ -148,15 +172,14 @@ bbox: {

		// In compact + annotate, badges already encode positions on the
		// image itself, so repeating coords in the prompt text is noise.
		const emitCoords = mode === 'verbose' ? true : (withCoords && !annotate);
		const prompt = mode === 'verbose'
		? formatPromptVerbose(elements, { width: w, height: h }, emitCoords)
		: formatPromptCompact(elements, { width: w, height: h }, emitCoords);
		const promptText = (wantPrompt)
		? (mode === 'verbose'
		? formatPromptVerbose(scaledElements, { width: w, height: h }, emitCoords)
		: formatPromptCompact(scaledElements, { width: w, height: h }, emitCoords))
		: null;

		return {
		image: dataURL,
		elements,
		dimensions: { width: w, height: h },
		prompt,
		};
		const out = { dimensions: { width: w, height: h } };
		if (wantImage) out.image = dataURL;
		if (wantElements) out.elements = scaledElements;
		if (wantPrompt) out.prompt = promptText;
		return out;
		},
		@@ -163,0 +186,0 @@ };

+41

-10

README.md

		@@ -175,3 +175,3 @@ # @zumer/snapdom-plugins

		Adds a `toPrompt()` export method that returns an LLM-ready package: an annotated screenshot, a structured element map with bounding boxes, and a pre-formatted text description. Useful for vision-language models, browser-agent pipelines, visual QA, and any workflow that pairs a capture with structured metadata.
		Adds a `toPrompt()` export method that returns an LLM-ready package: a structured element map with bounding boxes, a pre-formatted prompt text, and (optionally) an annotated screenshot. Tuned for vision-language models, browser-agent pipelines, visual QA, and any workflow that pairs a capture with structured metadata.

		@@ -182,13 +182,23 @@ ```js
		const result = await snapdom(el, { plugins: [promptExport()] });
		const { image, elements, dimensions, prompt } = await result.toPrompt();
		// Default: no image, just the structured map + prompt text (cheapest)
		const { elements, prompt, dimensions } = await result.toPrompt();
		```

		The returned object:
		To also include the annotated image (for tasks that truly depend on vision):

		```js
		const result = await snapdom(el, {
		plugins: [promptExport({ include: ['image', 'elements', 'prompt'] })]
		});
		const { image, elements, prompt, dimensions } = await result.toPrompt();
		```

		The returned object (`image`, `elements`, and `prompt` are present only if requested via `include`; `dimensions` is always returned):

		| Field | Type | Description |
		|-------|------|-------------|
		| `image` | `string` | Data URL of the (optionally annotated) screenshot |
		| `elements` | `Array` | One entry per detected element: `{ id, tag, type, text, bbox, attributes }` |
		| `dimensions` | `{width, height}` | Image dimensions after `maxImageWidth` scaling |
		| `elements` | `Array` | One entry per detected element: `{ id, tag, type, name, text, bbox, attributes, state?, styles?, covered? }` |
		| `prompt` | `string` | Pre-formatted text describing interactive + semantic elements |
		| `image` | `string` | Data URL of the (optionally annotated) screenshot — only when `include` contains `'image'` |
		| `dimensions` | `{width, height}` | Scaled dimensions (always present) |

		@@ -201,5 +211,13 @@ `elements` is split into two `type`s:

		Each interactive entry also carries:
		- `name` — the computed accessible name (aria-label → labelledby → alt → title → labels[0] → textContent)
		- `state` — runtime state: `{ checked, disabled, focus, open, value, selectedText }` (only keys that apply)
		- `styles` — visually-meaningful computed props filtered to drop defaults
		- `covered: true` when another element is painted on top of the bbox center (so an agent knows a click there would land on the covering element, e.g. a modal, rather than on this one)

		```js
		// Example — feed a vision-capable LLM
		const { image, elements } = await result.toPrompt({ maxImageWidth: 1024 });
		const { image, elements } = await result.toPrompt({
		include: ['image', 'elements', 'prompt']
		});

		@@ -213,4 +231,7 @@ // image is a data URL → pass as image input
		|--------|------|---------|-------------|
		| `annotate` | `boolean` | `true` | Overlay numbered badges on interactive elements |
		| `imageFormat` | `'png' \| 'jpg' \| 'webp'` | `'png'` | Output image format |
		| `include` | `string[]` | `['elements', 'prompt']` | Fields to return. Add `'image'` for tasks that need vision (chart content, layout QA, canvas). Use `['prompt']` for the cheapest text-only mode. |
		| `annotate` | `boolean` | `true` | Overlay numbered badges on interactive elements (only affects the image when included) |
		| `promptMode` | `'compact' \| 'verbose'` | `'compact'` | Prompt text verbosity. Compact omits coords when badges are on the image. |
		| `includeCoords` | `boolean` | `true` | Include bbox in the prompt text |
		| `imageFormat` | `'png' \| 'jpg' \| 'webp'` | `'png'` | Output image format (only used when `image` is included) |
		| `imageQuality` | `number` | `0.8` | Quality for lossy formats (0–1) |
		@@ -226,4 +247,14 @@ | `maxImageWidth` | `number` | `1024` | Max width in px; downscales and rescales bboxes if larger |

		Both per-call options (`opts.imageFormat`, etc.) and constructor options are supported; per-call wins.
		Both per-call options (`opts.include`, `opts.imageFormat`, etc.) and constructor options are supported; per-call wins.

		The image is the most expensive part of `toPrompt()` to produce (canvas draw + data-URL serialization), so the default skips it. Add `'image'` to `include` when the task actually uses vision:

		```js
		// Vision-dependent task (chart content, layout QA, visual diff)
		await result.toPrompt({ include: ['image', 'elements', 'prompt'] });

		// Pure structured agent loop (cheapest)
		await result.toPrompt({ include: ['prompt'] });
		```

		---
		@@ -230,0 +261,0 @@

@zumer/snapdom-plugins - npm Package Compare versions

Improved metrics