Big News: Socket raises $60M Series C at a $1B valuation to secure software supply chains for AI-driven development. Announcement
Sign In

@zumer/snapdom-plugins

Package Overview
Dependencies
Maintainers
1
Versions
8
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@zumer/snapdom-plugins - npm Package Compare versions

Comparing version
1.1.0
to
1.2.0
+1
-1
package.json
{
"name": "@zumer/snapdom-plugins",
"version": "1.1.0",
"version": "1.2.0",
"description": "Official plugins for SnapDOM",

@@ -5,0 +5,0 @@ "type": "module",

@@ -22,2 +22,7 @@ /**

* @param {boolean} [options.includeCoords=true] - Include bbox in the prompt text
* @param {string[]} [options.include] - Which fields to return. Default
* ['elements', 'prompt']. For vision-dependent tasks (chart content, layout QA,
* canvas) pass ['image', 'elements', 'prompt'] or add 'image' to the array. For
* text-only agent prompts pass ['prompt'] (cheapest — skips canvas draw entirely).
* Accepted values: 'image', 'elements', 'prompt'.
* @returns {Object} SnapDOM plugin

@@ -62,2 +67,8 @@ */

// Default omits 'image'. Benchmarking showed the text + JSON map is enough
// to answer most UI-inspection questions and uses ~14× fewer tokens. Pass
// `include: ['image', 'elements', 'prompt']` explicitly when the task truly
// depends on vision (charts, canvas content, layout QA).
const DEFAULT_INCLUDE = ['elements', 'prompt'];
export function promptExport(options = {}) {

@@ -74,2 +85,3 @@ const {

includeCoords = true,
include = DEFAULT_INCLUDE,
} = options;

@@ -96,9 +108,13 @@

const meta = ctx.__promptMetadata;
const wantSet = new Set(opts.include || include || DEFAULT_INCLUDE);
const wantImage = wantSet.has('image');
const wantElements = wantSet.has('elements');
const wantPrompt = wantSet.has('prompt');
if (!meta || !meta.elements.length) {
return {
image: ctx.export.url,
elements: [],
dimensions: { width: 0, height: 0 },
prompt: '',
};
const empty = { dimensions: { width: 0, height: 0 } };
if (wantImage) empty.image = ctx.export.url;
if (wantElements) empty.elements = [];
if (wantPrompt) empty.prompt = '';
return empty;
}

@@ -112,28 +128,36 @@

const img = new Image();
img.src = ctx.export.url;
await new Promise((res, rej) => {
img.onload = res;
img.onerror = rej;
});
// Only load + rasterize the SVG when the caller actually wants the
// image. Skipping saves the img decode + canvas draw + toDataURL —
// the most expensive steps of this export.
let w, h, dataURL;
if (wantImage) {
const img = new Image();
img.src = ctx.export.url;
await new Promise((res, rej) => { img.onload = res; img.onerror = rej; });
const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
w = Math.round(img.naturalWidth * ratio);
h = Math.round(img.naturalHeight * ratio);
const canvas = document.createElement('canvas');
canvas.width = w;
canvas.height = h;
canvas.getContext('2d').drawImage(img, 0, 0, w, h);
const mime =
format === 'jpg' || format === 'jpeg' ? 'image/jpeg'
: format === 'webp' ? 'image/webp'
: 'image/png';
dataURL = canvas.toDataURL(mime, quality);
} else {
// No image — scale bboxes to the same target width the image would
// have used, so downstream callers can still render the map over
// a separately-rendered screenshot at the same scale.
const sourceW = meta.dimensions.width || 1;
const ratio = sourceW > maxWidth ? maxWidth / sourceW : 1;
w = Math.round(sourceW * ratio);
h = Math.round(meta.dimensions.height * ratio);
}
const ratio = img.naturalWidth > maxWidth ? maxWidth / img.naturalWidth : 1;
const w = Math.round(img.naturalWidth * ratio);
const h = Math.round(img.naturalHeight * ratio);
const canvas = document.createElement('canvas');
canvas.width = w;
canvas.height = h;
canvas.getContext('2d').drawImage(img, 0, 0, w, h);
const mime =
format === 'jpg' || format === 'jpeg' ? 'image/jpeg'
: format === 'webp' ? 'image/webp'
: 'image/png';
const dataURL = canvas.toDataURL(mime, quality);
const sx = w / (meta.dimensions.width || 1);
const sy = h / (meta.dimensions.height || 1);
const elements = meta.elements.map((el) => ({
const scaledElements = meta.elements.map((el) => ({
...el,

@@ -148,15 +172,14 @@ bbox: {

// In compact + annotate, badges already encode positions on the
// image itself, so repeating coords in the prompt text is noise.
const emitCoords = mode === 'verbose' ? true : (withCoords && !annotate);
const prompt = mode === 'verbose'
? formatPromptVerbose(elements, { width: w, height: h }, emitCoords)
: formatPromptCompact(elements, { width: w, height: h }, emitCoords);
const promptText = (wantPrompt)
? (mode === 'verbose'
? formatPromptVerbose(scaledElements, { width: w, height: h }, emitCoords)
: formatPromptCompact(scaledElements, { width: w, height: h }, emitCoords))
: null;
return {
image: dataURL,
elements,
dimensions: { width: w, height: h },
prompt,
};
const out = { dimensions: { width: w, height: h } };
if (wantImage) out.image = dataURL;
if (wantElements) out.elements = scaledElements;
if (wantPrompt) out.prompt = promptText;
return out;
},

@@ -163,0 +186,0 @@ };

+41
-10

@@ -175,3 +175,3 @@ # @zumer/snapdom-plugins

Adds a `toPrompt()` export method that returns an LLM-ready package: an annotated screenshot, a structured element map with bounding boxes, and a pre-formatted text description. Useful for vision-language models, browser-agent pipelines, visual QA, and any workflow that pairs a capture with structured metadata.
Adds a `toPrompt()` export method that returns an LLM-ready package: a structured element map with bounding boxes, a pre-formatted prompt text, and (optionally) an annotated screenshot. Tuned for vision-language models, browser-agent pipelines, visual QA, and any workflow that pairs a capture with structured metadata.

@@ -182,13 +182,23 @@ ```js

const result = await snapdom(el, { plugins: [promptExport()] });
const { image, elements, dimensions, prompt } = await result.toPrompt();
// Default: no image, just the structured map + prompt text (skips the costly image render)
const { elements, prompt, dimensions } = await result.toPrompt();
```
The returned object:
To also include the annotated image (for tasks that truly depend on vision):
```js
const result = await snapdom(el, {
plugins: [promptExport({ include: ['image', 'elements', 'prompt'] })]
});
const { image, elements, prompt, dimensions } = await result.toPrompt();
```
The returned object (`image`, `elements`, and `prompt` are present only if requested via `include`; `dimensions` is always returned):
| Field | Type | Description |
|-------|------|-------------|
| `image` | `string` | Data URL of the (optionally annotated) screenshot |
| `elements` | `Array` | One entry per detected element: `{ id, tag, type, text, bbox, attributes }` |
| `dimensions` | `{width, height}` | Image dimensions after `maxImageWidth` scaling |
| `elements` | `Array` | One entry per detected element: `{ id, tag, type, name, text, bbox, attributes, state?, styles?, covered? }` |
| `prompt` | `string` | Pre-formatted text describing interactive + semantic elements |
| `image` | `string` | Data URL of the (optionally annotated) screenshot — **only when `include` contains `'image'`** |
| `dimensions` | `{width, height}` | Dimensions after `maxImageWidth` scaling (always present; `{ width: 0, height: 0 }` when no elements are detected) |

@@ -201,5 +211,13 @@ `elements` is split into two `type`s:

Each interactive entry also carries:
- `name` — the computed accessible name (aria-label → labelledby → alt → title → labels[0] → textContent)
- `state` — runtime state: `{ checked, disabled, focus, open, value, selectedText }` (only keys that apply)
- `styles` — visually-meaningful computed props filtered to drop defaults
- `covered: true` when another element is painted on top of the bbox center (an agent won't click through a modal)
```js
// Example — feed a vision-capable LLM
const { image, elements } = await result.toPrompt({ maxImageWidth: 1024 });
const { image, elements } = await result.toPrompt({
include: ['image', 'elements', 'prompt']
});

@@ -213,4 +231,7 @@ // image is a data URL → pass as image input

|--------|------|---------|-------------|
| `annotate` | `boolean` | `true` | Overlay numbered badges on interactive elements |
| `imageFormat` | `'png' \| 'jpg' \| 'webp'` | `'png'` | Output image format |
| `include` | `string[]` | `['elements', 'prompt']` | Fields to return. Add `'image'` for tasks that need vision (chart content, layout QA, canvas). Use `['prompt']` for the cheapest text-only mode. |
| `annotate` | `boolean` | `true` | Overlay numbered badges on interactive elements (only affects the image when included) |
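| `promptMode` | `'compact' \| 'verbose'` | `'compact'` | Prompt text verbosity. Verbose always includes coords; compact omits them whenever `annotate` is on (the badges are assumed to encode positions), even if the image itself is not included. |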
| `includeCoords` | `boolean` | `true` | Include bbox in the prompt text |
| `imageFormat` | `'png' \| 'jpg' \| 'webp'` | `'png'` | Output image format (only used when `image` is included) |
| `imageQuality` | `number` | `0.8` | Quality for lossy formats (0–1) |

@@ -226,4 +247,14 @@ | `maxImageWidth` | `number` | `1024` | Max width in px; downscales and rescales bboxes if larger |

Both per-call options (`opts.imageFormat`, etc.) and constructor options are supported; per-call wins.
Both per-call options (`opts.include`, `opts.imageFormat`, etc.) and constructor options are supported; per-call wins.
The image is the most expensive part of `toPrompt()` to produce (canvas draw + data-URL serialization), so the default skips it. Add `'image'` to `include` when the task actually uses vision:
```js
// Vision-dependent task (chart content, layout QA, visual diff)
await result.toPrompt({ include: ['image', 'elements', 'prompt'] });
// Pure structured agent loop (cheapest)
await result.toPrompt({ include: ['prompt'] });
```
---

@@ -230,0 +261,0 @@