@browserbasehq/stagehand - npm Package Compare versions

Comparing version 1.9.0-alpha-db2ef5997664e81b1dfb5ca992392362f2d3bab1 to 1.9.0

dist/evals/deterministic/bb.playwright.config.d.ts


dist/examples/external_clients/ollama.d.ts
import { type ClientOptions } from "openai";
import type { ChatCompletion } from "openai/resources/chat";
import type { LLMCache } from "../../lib/cache/LLMCache";
import { type ChatCompletionOptions, LLMClient } from "../../lib/llm/LLMClient";
import type { LogLine } from "../../types/log";
import type { ChatCompletion } from "openai/resources/chat";
import { CreateChatCompletionOptions, LLMClient } from "../../lib/llm/LLMClient";
export declare class OllamaClient extends LLMClient {

@@ -10,7 +9,11 @@ type: "ollama";

private cache;
logger: (message: LogLine) => void;
private enableCaching;
clientOptions: ClientOptions;
constructor(logger: (message: LogLine) => void, enableCaching: boolean, cache: LLMCache | undefined, modelName: "llama3.2", clientOptions?: ClientOptions);
createChatCompletion<T = ChatCompletion>(options: ChatCompletionOptions, retries?: number): Promise<T>;
constructor({ enableCaching, cache, modelName, clientOptions, }: {
enableCaching?: boolean;
cache?: LLMCache;
modelName?: string;
clientOptions?: ClientOptions;
});
createChatCompletion<T = ChatCompletion>({ options, retries, logger, }: CreateChatCompletionOptions): Promise<T>;
}
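The hunk above captures the shape of the 1.9.0 client API: constructors now take a single options object, and `createChatCompletion` receives a `CreateChatCompletionOptions` bundle with the logger passed per call. Below is a hedged usage sketch, not package documentation — the import paths into `dist/` are assumptions, and only members visible in this comparison are used.

```typescript
// Hedged sketch; import paths are assumptions based on the compiled file layout.
import { OllamaClient } from "@browserbasehq/stagehand/dist/examples/external_clients/ollama";
import type { LogLine } from "@browserbasehq/stagehand/dist/types/log";

const logger = (line: LogLine) => console.log(`[${line.category}] ${line.message}`);

// 1.9.0-alpha: new OllamaClient(logger, false, undefined, "llama3.2")
// 1.9.0: a single options object; the logger now travels with each request instead.
const ollama = new OllamaClient({ modelName: "llama3.2" });

const completion = await ollama.createChatCompletion({
  options: {
    messages: [{ role: "user", content: "Say hello" }],
    requestId: "example-request-1", // requestId is required by ChatCompletionOptions
  },
  logger,
  retries: 3,
});
```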


dist/index.d.ts

@@ -6,3 +6,2 @@ import { z, ZodType } from 'zod';

import { ClientOptions as ClientOptions$1 } from 'openai';
import { ChatCompletionTool, ChatCompletionToolChoiceOption, ChatCompletion } from 'openai/resources';

@@ -27,30 +26,2 @@ type LogLine = {

type ClientOptions = ClientOptions$1 | ClientOptions$2;
type ToolCall = ChatCompletionTool;
type AnthropicTransformedResponse = {
id: string;
object: string;
created: number;
model: string;
choices: {
index: number;
message: {
role: string;
content: string | null;
tool_calls: {
id: string;
type: string;
function: {
name: string;
arguments: string;
};
}[];
};
finish_reason: string;
}[];
usage: {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
};
};
interface AnthropicJsonSchemaObject {

@@ -67,2 +38,9 @@ definitions?: {

interface LLMTool {
type: "function";
name: string;
description: string;
parameters: Record<string, unknown>;
}
interface ChatMessage {

@@ -98,8 +76,39 @@ role: "system" | "user" | "assistant";

};
tools?: ToolCall[];
tool_choice?: "auto" | ChatCompletionToolChoiceOption;
tools?: LLMTool[];
tool_choice?: "auto" | "none" | "required";
maxTokens?: number;
requestId: string;
}
type LLMResponse = AnthropicTransformedResponse | ChatCompletion;
type LLMResponse = {
id: string;
object: string;
created: number;
model: string;
choices: {
index: number;
message: {
role: string;
content: string | null;
tool_calls: {
id: string;
type: string;
function: {
name: string;
arguments: string;
};
}[];
};
finish_reason: string;
}[];
usage: {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
};
};
interface CreateChatCompletionOptions {
options: ChatCompletionOptions;
logger: (message: LogLine) => void;
retries?: number;
}
declare abstract class LLMClient {

@@ -111,7 +120,3 @@ type: "openai" | "anthropic" | string;

constructor(modelName: AvailableModel);
abstract createChatCompletion<T = LLMResponse>(options: ChatCompletionOptions): Promise<T>;
abstract logger: (message: {
category?: string;
message: string;
}) => void;
abstract createChatCompletion<T = LLMResponse>(options: CreateChatCompletionOptions): Promise<T>;
}
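For authors of custom clients, the practical effect of this hunk is that `LLMClient` subclasses no longer carry a per-instance `logger` and must implement the new `createChatCompletion` signature. A hedged sketch follows — the type import path and any abstract members not visible in this comparison are assumptions.

```typescript
// Hedged sketch of the new abstract signature; not a drop-in implementation.
import { LLMClient } from "@browserbasehq/stagehand";
import type {
  CreateChatCompletionOptions,
  LLMResponse,
} from "@browserbasehq/stagehand/dist/lib/llm/LLMClient"; // path assumed

class MyClient extends LLMClient {
  type = "custom"; // "openai" | "anthropic" | string

  // 1.9.0-alpha: createChatCompletion(options: ChatCompletionOptions): Promise<T>
  // 1.9.0:       createChatCompletion({ options, logger, retries }): Promise<T>
  async createChatCompletion<T = LLMResponse>({
    options,
    logger,
    retries,
  }: CreateChatCompletionOptions): Promise<T> {
    // The logger is now supplied per request rather than stored on the client.
    logger({ category: "custom", message: `handling request ${options.requestId}` });
    // Retry count (if any) arrives alongside the options; handling is up to the implementer.
    throw new Error("wire this up to your model provider"); // placeholder body
  }
}
```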

@@ -280,2 +285,2 @@

export { type ActOptions, type ActResult, type AnthropicJsonSchemaObject, type AnthropicTransformedResponse, type AvailableModel, AvailableModelSchema, type Browser, type BrowserContext, type BrowserResult, type ClientOptions, type ConstructorParams, type ExtractOptions, type ExtractResult, type GotoOptions, type InitFromPageOptions, type InitFromPageResult, type InitOptions, type InitResult, type LogLine, type ModelProvider, type ObserveOptions, type ObserveResult, type Page, PlaywrightCommandException, PlaywrightCommandMethodNotSupportedException, Stagehand, type ToolCall };
export { type ActOptions, type ActResult, type AnthropicJsonSchemaObject, type AvailableModel, AvailableModelSchema, type Browser, type BrowserContext, type BrowserResult, type ClientOptions, type ConstructorParams, type ExtractOptions, type ExtractResult, type GotoOptions, type InitFromPageOptions, type InitFromPageResult, type InitOptions, type InitResult, LLMClient, type LogLine, type ModelProvider, type ObserveOptions, type ObserveResult, type Page, PlaywrightCommandException, PlaywrightCommandMethodNotSupportedException, Stagehand };

@@ -55,1 +55,2 @@ import { z } from "zod";

export * from "../types/page";
export { LLMClient } from "./llm/LLMClient";
import { z } from "zod";
import { ActCommandParams, ActCommandResult } from "../types/act";
import { VerifyActCompletionParams } from "../types/inference";
import { LogLine } from "../types/log";
import { LLMClient } from "./llm/LLMClient";
import { VerifyActCompletionParams } from "../types/inference";
import { ActCommandParams, ActCommandResult } from "../types/act";
export declare function verifyActCompletion({ goal, steps, llmClient, screenshot, domElements, logger, requestId, }: VerifyActCompletionParams): Promise<boolean>;
export declare function fillInVariables(text: string, variables: Record<string, string>): string;
export declare function act({ action, domElements, steps, llmClient, screenshot, retries, logger, requestId, variables, }: ActCommandParams): Promise<ActCommandResult | null>;
export declare function extract({ instruction, previouslyExtractedContent, domElements, schema, llmClient, chunksSeen, chunksTotal, requestId, isUsingTextExtract, }: {
export declare function extract({ instruction, previouslyExtractedContent, domElements, schema, llmClient, chunksSeen, chunksTotal, requestId, logger, isUsingTextExtract, }: {
instruction: string;

@@ -18,2 +19,3 @@ previouslyExtractedContent: object;

isUsingTextExtract?: boolean;
logger: (message: LogLine) => void;
}): Promise<{

@@ -25,3 +27,3 @@ metadata: {

}>;
export declare function observe({ instruction, domElements, llmClient, image, requestId, }: {
export declare function observe({ instruction, domElements, llmClient, image, requestId, logger, }: {
instruction: string;

@@ -32,2 +34,3 @@ domElements: string;

requestId: string;
logger: (message: LogLine) => void;
}): Promise<{

@@ -39,6 +42,1 @@ elements: {

}>;
export declare function ask({ question, llmClient, requestId, }: {
question: string;
llmClient: LLMClient;
requestId: string;
}): Promise<string>;
import { ClientOptions } from "@anthropic-ai/sdk";
import { LogLine } from "../../types/log";
import { AnthropicTransformedResponse, AvailableModel } from "../../types/model";
import { AvailableModel } from "../../types/model";
import { LLMCache } from "../cache/LLMCache";
import { ChatCompletionOptions, LLMClient } from "./LLMClient";
import { CreateChatCompletionOptions, LLMClient, LLMResponse } from "./LLMClient";
export declare class AnthropicClient extends LLMClient {

@@ -10,9 +10,12 @@ type: "anthropic";

private cache;
logger: (message: LogLine) => void;
private enableCaching;
clientOptions: ClientOptions;
constructor(logger: (message: LogLine) => void, enableCaching: boolean, cache: LLMCache | undefined, modelName: AvailableModel, clientOptions?: ClientOptions);
createChatCompletion<T = AnthropicTransformedResponse>(options: ChatCompletionOptions & {
retries?: number;
}): Promise<T>;
constructor({ enableCaching, cache, modelName, clientOptions, }: {
logger: (message: LogLine) => void;
enableCaching?: boolean;
cache?: LLMCache;
modelName: AvailableModel;
clientOptions?: ClientOptions;
});
createChatCompletion<T = LLMResponse>({ options, retries, logger, }: CreateChatCompletionOptions): Promise<T>;
}

@@ -1,4 +0,5 @@

import { ChatCompletion, ChatCompletionToolChoiceOption } from "openai/resources";
import { ZodType } from "zod";
import { AnthropicTransformedResponse, AvailableModel, ClientOptions, ToolCall } from "../../types/model";
import { LLMTool } from "../../types/llm";
import { AvailableModel, ClientOptions } from "../../types/model";
import { LogLine } from "../../types/log";
export interface ChatMessage {

@@ -36,8 +37,39 @@ role: "system" | "user" | "assistant";

};
tools?: ToolCall[];
tool_choice?: "auto" | ChatCompletionToolChoiceOption;
tools?: LLMTool[];
tool_choice?: "auto" | "none" | "required";
maxTokens?: number;
requestId: string;
}
export type LLMResponse = AnthropicTransformedResponse | ChatCompletion;
export type LLMResponse = {
id: string;
object: string;
created: number;
model: string;
choices: {
index: number;
message: {
role: string;
content: string | null;
tool_calls: {
id: string;
type: string;
function: {
name: string;
arguments: string;
};
}[];
};
finish_reason: string;
}[];
usage: {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
};
};
export interface CreateChatCompletionOptions {
options: ChatCompletionOptions;
logger: (message: LogLine) => void;
retries?: number;
}
export declare abstract class LLMClient {

@@ -49,7 +81,3 @@ type: "openai" | "anthropic" | string;

constructor(modelName: AvailableModel);
abstract createChatCompletion<T = LLMResponse>(options: ChatCompletionOptions): Promise<T>;
abstract logger: (message: {
category?: string;
message: string;
}) => void;
abstract createChatCompletion<T = LLMResponse>(options: CreateChatCompletionOptions): Promise<T>;
}
import { ClientOptions } from "openai";
import { ChatCompletion } from "openai/resources/chat";
import { LogLine } from "../../types/log";
import { AvailableModel } from "../../types/model";
import { LLMCache } from "../cache/LLMCache";
import { ChatCompletionOptions, LLMClient } from "./LLMClient";
import { CreateChatCompletionOptions, LLMClient, LLMResponse } from "./LLMClient";
export declare class OpenAIClient extends LLMClient {

@@ -11,7 +10,12 @@ type: "openai";

private cache;
logger: (message: LogLine) => void;
private enableCaching;
clientOptions: ClientOptions;
constructor(logger: (message: LogLine) => void, enableCaching: boolean, cache: LLMCache | undefined, modelName: AvailableModel, clientOptions?: ClientOptions);
createChatCompletion<T = ChatCompletion>(optionsInitial: ChatCompletionOptions, retries?: number): Promise<T>;
constructor({ enableCaching, cache, modelName, clientOptions, }: {
logger: (message: LogLine) => void;
enableCaching?: boolean;
cache?: LLMCache;
modelName: AvailableModel;
clientOptions?: ClientOptions;
});
createChatCompletion<T = LLMResponse>({ options: optionsInitial, logger, retries, }: CreateChatCompletionOptions): Promise<T>;
}

@@ -1,2 +0,2 @@

import OpenAI from "openai";
import { LLMTool } from "../types/llm";
import { ChatMessage } from "./llm/LLMClient";

@@ -7,3 +7,3 @@ export declare function buildVerifyActCompletionSystemPrompt(): ChatMessage;

export declare function buildActUserPrompt(action: string, steps: string, domElements: string, variables?: Record<string, string>): ChatMessage;
export declare const actTools: Array<OpenAI.ChatCompletionTool>;
export declare const actTools: LLMTool[];
export declare function buildExtractSystemPrompt(isUsingPrintExtractedDataTool?: boolean, useTextExtract?: boolean): ChatMessage;

@@ -17,3 +17,1 @@ export declare function buildExtractUserPrompt(instruction: string, domElements: string, isUsingPrintExtractedDataTool?: boolean): ChatMessage;

export declare function buildObserveUserMessage(instruction: string, domElements: string): ChatMessage;
export declare function buildAskSystemPrompt(): ChatMessage;
export declare function buildAskUserPrompt(question: string): ChatMessage;
import type { ClientOptions as AnthropicClientOptions } from "@anthropic-ai/sdk";
import type { ClientOptions as OpenAIClientOptions } from "openai";
import { ChatCompletionTool as OpenAITool } from "openai/resources";
import { z } from "zod";

@@ -9,30 +8,2 @@ export declare const AvailableModelSchema: z.ZodEnum<["gpt-4o", "gpt-4o-mini", "gpt-4o-2024-08-06", "claude-3-5-sonnet-latest", "claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "o1-mini", "o1-preview"]>;

export type ClientOptions = OpenAIClientOptions | AnthropicClientOptions;
export type ToolCall = OpenAITool;
export type AnthropicTransformedResponse = {
id: string;
object: string;
created: number;
model: string;
choices: {
index: number;
message: {
role: string;
content: string | null;
tool_calls: {
id: string;
type: string;
function: {
name: string;
arguments: string;
};
}[];
};
finish_reason: string;
}[];
usage: {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
};
};
export interface AnthropicJsonSchemaObject {

@@ -39,0 +10,0 @@ definitions?: {

@@ -1381,3 +1381,2 @@ import { Locator, Page } from "@playwright/test";

}).catch((error) => {
console.log("error verifying action completion", error);
this.logger({

@@ -1393,2 +1392,6 @@ category: "action",

},
trace: {
value: error.stack,
type: "string",
},
},

@@ -1395,0 +1398,0 @@ });

@@ -309,2 +309,3 @@ import { z } from "zod";

requestId,
logger: this.logger,
});

@@ -438,2 +439,3 @@

isUsingTextExtract: false,
logger: this.logger,
});

@@ -440,0 +442,0 @@

@@ -123,2 +123,3 @@ import { LogLine } from "../../types/log";

requestId,
logger: this.logger,
});

@@ -125,0 +126,0 @@

@@ -362,8 +362,16 @@ import { Browserbase } from "@browserbasehq/sdk";

this.debugDom = debugDom ?? false;
this.llmClient =
llmClient ||
this.llmProvider.getClient(
modelName ?? DEFAULT_MODEL_NAME,
modelClientOptions,
);
if (llmClient) {
this.llmClient = llmClient;
} else {
try {
// try to set a default LLM client
this.llmClient = this.llmProvider.getClient(
modelName ?? DEFAULT_MODEL_NAME,
modelClientOptions,
);
} catch {
this.llmClient = undefined;
}
}
this.domSettleTimeoutMs = domSettleTimeoutMs ?? 30_000;
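The net effect of this hunk is that constructing `Stagehand` no longer fails outright when no default model key is available; an explicitly supplied `llmClient` still takes precedence. A hedged sketch of both paths (the `OpenAIClient` import path and the environment lookup are assumptions):

```typescript
// Hedged sketch of the two constructor paths after this change.
import { Stagehand } from "@browserbasehq/stagehand";
import { OpenAIClient } from "@browserbasehq/stagehand/dist/lib/llm/OpenAIClient"; // path assumed

// Explicit client: the default-client lookup is skipped entirely.
const explicit = new Stagehand({
  env: "LOCAL",
  llmClient: new OpenAIClient({
    logger: (line) => console.log(line.message),
    modelName: "gpt-4o",
    clientOptions: { apiKey: process.env.OPENAI_API_KEY },
  }),
});

// No client and no usable key: llmClient is now left undefined instead of the
// constructor throwing; act/extract/observe will error later at call time.
const fallback = new Stagehand({ env: "LOCAL" });
```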

@@ -618,1 +626,2 @@ this.headless = headless ?? false;

export * from "../types/page";
export { LLMClient } from "./llm/LLMClient";

@@ -0,26 +1,25 @@

import { z } from "zod";
import { ActCommandParams, ActCommandResult } from "../types/act";
import { VerifyActCompletionParams } from "../types/inference";
import { LogLine } from "../types/log";
import {
AnnotatedScreenshotText,
ChatMessage,
LLMClient,
} from "./llm/LLMClient";
import {
actTools,
buildActSystemPrompt,
buildActUserPrompt,
buildAskSystemPrompt,
buildExtractSystemPrompt,
buildExtractUserPrompt,
buildMetadataPrompt,
buildMetadataSystemPrompt,
buildObserveSystemPrompt,
buildObserveUserMessage,
buildAskUserPrompt,
buildRefineSystemPrompt,
buildRefineUserPrompt,
buildVerifyActCompletionSystemPrompt,
buildVerifyActCompletionUserPrompt,
buildRefineSystemPrompt,
buildRefineUserPrompt,
buildMetadataSystemPrompt,
buildMetadataPrompt,
} from "./prompt";
import { z } from "zod";
import {
AnnotatedScreenshotText,
ChatMessage,
LLMClient,
} from "./llm/LLMClient";
import { VerifyActCompletionParams } from "../types/inference";
import { ActCommandParams, ActCommandResult } from "../types/act";

@@ -43,21 +42,24 @@ export async function verifyActCompletion({

const response = await llmClient.createChatCompletion<VerificationResponse>({
messages: [
buildVerifyActCompletionSystemPrompt(),
buildVerifyActCompletionUserPrompt(goal, steps, domElements),
],
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
image: screenshot
? {
buffer: screenshot,
description: "This is a screenshot of the whole visible page.",
}
: undefined,
response_model: {
name: "Verification",
schema: verificationSchema,
options: {
messages: [
buildVerifyActCompletionSystemPrompt(),
buildVerifyActCompletionUserPrompt(goal, steps, domElements),
],
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
image: screenshot
? {
buffer: screenshot,
description: "This is a screenshot of the whole visible page.",
}
: undefined,
response_model: {
name: "Verification",
schema: verificationSchema,
},
requestId,
},
requestId,
logger,
});

@@ -113,13 +115,16 @@

const response = await llmClient.createChatCompletion({
messages,
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
tool_choice: "auto" as const,
tools: actTools,
image: screenshot
? { buffer: screenshot, description: AnnotatedScreenshotText }
: undefined,
requestId,
options: {
messages,
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
tool_choice: "auto" as const,
tools: actTools,
image: screenshot
? { buffer: screenshot, description: AnnotatedScreenshotText }
: undefined,
requestId,
},
logger,
});

@@ -165,2 +170,3 @@

requestId,
logger,
isUsingTextExtract,

@@ -177,36 +183,18 @@ }: {

isUsingTextExtract?: boolean;
logger: (message: LogLine) => void;
}) {
type ExtractionResponse = z.infer<typeof schema>;
type MetadataResponse = z.infer<typeof metadataSchema>;
// TODO: antipattern
const isUsingAnthropic = llmClient.type === "anthropic";
const extractionResponse = await llmClient.createChatCompletion({
messages: [
buildExtractSystemPrompt(isUsingAnthropic, isUsingTextExtract),
buildExtractUserPrompt(instruction, domElements, isUsingAnthropic),
],
response_model: {
schema: schema,
name: "Extraction",
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
});
const refinedResponse =
await llmClient.createChatCompletion<ExtractionResponse>({
options: {
messages: [
buildRefineSystemPrompt(),
buildRefineUserPrompt(
instruction,
previouslyExtractedContent,
extractionResponse,
),
buildExtractSystemPrompt(isUsingAnthropic, isUsingTextExtract),
buildExtractUserPrompt(instruction, domElements, isUsingAnthropic),
],
response_model: {
schema: schema,
name: "RefinedExtraction",
name: "Extraction",
},

@@ -218,2 +206,28 @@ temperature: 0.1,

requestId,
},
logger,
});
const refinedResponse =
await llmClient.createChatCompletion<ExtractionResponse>({
options: {
messages: [
buildRefineSystemPrompt(),
buildRefineUserPrompt(
instruction,
previouslyExtractedContent,
extractionResponse,
),
],
response_model: {
schema: schema,
name: "RefinedExtraction",
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
},
logger,
});

@@ -236,20 +250,23 @@

await llmClient.createChatCompletion<MetadataResponse>({
messages: [
buildMetadataSystemPrompt(),
buildMetadataPrompt(
instruction,
refinedResponse,
chunksSeen,
chunksTotal,
),
],
response_model: {
name: "Metadata",
schema: metadataSchema,
options: {
messages: [
buildMetadataSystemPrompt(),
buildMetadataPrompt(
instruction,
refinedResponse,
chunksSeen,
chunksTotal,
),
],
response_model: {
name: "Metadata",
schema: metadataSchema,
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
logger,
});

@@ -269,2 +286,3 @@

requestId,
logger,
}: {

@@ -276,2 +294,3 @@ instruction: string;

requestId: string;
logger: (message: LogLine) => void;
}): Promise<{

@@ -299,18 +318,21 @@ elements: { elementId: number; description: string }[];

await llmClient.createChatCompletion<ObserveResponse>({
messages: [
buildObserveSystemPrompt(),
buildObserveUserMessage(instruction, domElements),
],
image: image
? { buffer: image, description: AnnotatedScreenshotText }
: undefined,
response_model: {
schema: observeSchema,
name: "Observation",
options: {
messages: [
buildObserveSystemPrompt(),
buildObserveUserMessage(instruction, domElements),
],
image: image
? { buffer: image, description: AnnotatedScreenshotText }
: undefined,
response_model: {
schema: observeSchema,
name: "Observation",
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
logger,
});

@@ -328,23 +350,1 @@

}
export async function ask({
question,
llmClient,
requestId,
}: {
question: string;
llmClient: LLMClient;
requestId: string;
}) {
const response = await llmClient.createChatCompletion({
messages: [buildAskSystemPrompt(), buildAskUserPrompt(question)],
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
});
// The parsing is now handled in the LLM clients
return response.choices[0].message.content;
}

@@ -10,9 +10,9 @@ import Anthropic, { ClientOptions } from "@anthropic-ai/sdk";

import { LogLine } from "../../types/log";
import { AnthropicJsonSchemaObject, AvailableModel } from "../../types/model";
import { LLMCache } from "../cache/LLMCache";
import {
AnthropicJsonSchemaObject,
AnthropicTransformedResponse,
AvailableModel,
} from "../../types/model";
import { LLMCache } from "../cache/LLMCache";
import { ChatCompletionOptions, LLMClient } from "./LLMClient";
CreateChatCompletionOptions,
LLMClient,
LLMResponse,
} from "./LLMClient";

@@ -23,16 +23,19 @@ export class AnthropicClient extends LLMClient {

private cache: LLMCache | undefined;
public logger: (message: LogLine) => void;
private enableCaching: boolean;
public clientOptions: ClientOptions;
constructor(
logger: (message: LogLine) => void,
constructor({
enableCaching = false,
cache: LLMCache | undefined,
modelName: AvailableModel,
clientOptions?: ClientOptions,
) {
cache,
modelName,
clientOptions,
}: {
logger: (message: LogLine) => void;
enableCaching?: boolean;
cache?: LLMCache;
modelName: AvailableModel;
clientOptions?: ClientOptions;
}) {
super(modelName);
this.client = new Anthropic(clientOptions);
this.logger = logger;
this.cache = cache;

@@ -44,9 +47,11 @@ this.enableCaching = enableCaching;

async createChatCompletion<T = AnthropicTransformedResponse>(
options: ChatCompletionOptions & { retries?: number },
): Promise<T> {
async createChatCompletion<T = LLMResponse>({
options,
retries,
logger,
}: CreateChatCompletionOptions): Promise<T> {
const optionsWithoutImage = { ...options };
delete optionsWithoutImage.image;
this.logger({
logger({
category: "anthropic",

@@ -70,3 +75,3 @@ message: "creating chat completion",

tools: options.tools,
retries: options.retries,
retries: retries,
};

@@ -80,3 +85,3 @@

if (cachedResponse) {
this.logger({
logger({
category: "llm_cache",

@@ -102,3 +107,3 @@ message: "LLM cache hit - returning cached response",

} else {
this.logger({
logger({
category: "llm_cache",

@@ -194,13 +199,11 @@ message: "LLM cache miss - no cached response found",

let anthropicTools: Tool[] = options.tools?.map((tool) => {
if (tool.type === "function") {
return {
name: tool.function.name,
description: tool.function.description,
input_schema: {
type: "object",
properties: tool.function.parameters.properties,
required: tool.function.parameters.required,
},
};
}
return {
name: tool.name,
description: tool.description,
input_schema: {
type: "object",
properties: tool.parameters.properties,
required: tool.parameters.required,
},
};
});

@@ -241,3 +244,3 @@

this.logger({
logger({
category: "anthropic",

@@ -258,3 +261,3 @@ message: "response",

const transformedResponse: AnthropicTransformedResponse = {
const transformedResponse: LLMResponse = {
id: response.id,

@@ -293,3 +296,3 @@ object: "chat.completion",

this.logger({
logger({
category: "anthropic",

@@ -320,9 +323,10 @@ message: "transformed response",

} else {
if (!options.retries || options.retries < 5) {
if (!retries || retries < 5) {
return this.createChatCompletion({
...options,
retries: (options.retries ?? 0) + 1,
options,
logger,
retries: (retries ?? 0) + 1,
});
}
this.logger({
logger({
category: "anthropic",

@@ -346,3 +350,3 @@ message: "error creating chat completion",

this.cache.set(cacheOptions, transformedResponse, options.requestId);
this.logger({
logger({
category: "anthropic",

@@ -349,0 +353,0 @@ message: "cached response",

@@ -1,12 +0,5 @@

import {
ChatCompletion,
ChatCompletionToolChoiceOption,
} from "openai/resources";
import { ZodType } from "zod";
import {
AnthropicTransformedResponse,
AvailableModel,
ClientOptions,
ToolCall,
} from "../../types/model";
import { LLMTool } from "../../types/llm";
import { AvailableModel, ClientOptions } from "../../types/model";
import { LogLine } from "../../types/log";

@@ -59,4 +52,4 @@ export interface ChatMessage {

};
tools?: ToolCall[];
tool_choice?: "auto" | ChatCompletionToolChoiceOption;
tools?: LLMTool[];
tool_choice?: "auto" | "none" | "required";
maxTokens?: number;

@@ -66,4 +59,36 @@ requestId: string;

export type LLMResponse = AnthropicTransformedResponse | ChatCompletion;
export type LLMResponse = {
id: string;
object: string;
created: number;
model: string;
choices: {
index: number;
message: {
role: string;
content: string | null;
tool_calls: {
id: string;
type: string;
function: {
name: string;
arguments: string;
};
}[];
};
finish_reason: string;
}[];
usage: {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
};
};
export interface CreateChatCompletionOptions {
options: ChatCompletionOptions;
logger: (message: LogLine) => void;
retries?: number;
}
export abstract class LLMClient {

@@ -81,5 +106,4 @@ public type: "openai" | "anthropic" | string;

abstract createChatCompletion<T = LLMResponse>(
options: ChatCompletionOptions,
options: CreateChatCompletionOptions,
): Promise<T>;
abstract logger: (message: { category?: string; message: string }) => void;
}

@@ -64,17 +64,17 @@ import { LogLine } from "../../types/log";

case "openai":
return new OpenAIClient(
this.logger,
this.enableCaching,
this.cache,
return new OpenAIClient({
logger: this.logger,
enableCaching: this.enableCaching,
cache: this.cache,
modelName,
clientOptions,
);
});
case "anthropic":
return new AnthropicClient(
this.logger,
this.enableCaching,
this.cache,
return new AnthropicClient({
logger: this.logger,
enableCaching: this.enableCaching,
cache: this.cache,
modelName,
clientOptions,
);
});
default:

@@ -81,0 +81,0 @@ throw new Error(`Unsupported provider: ${provider}`);

import OpenAI, { ClientOptions } from "openai";
import { zodResponseFormat } from "openai/helpers/zod";
import {
ChatCompletion,
ChatCompletionAssistantMessageParam,

@@ -18,3 +17,9 @@ ChatCompletionContentPartImage,

import { validateZodSchema } from "../utils";
import { ChatCompletionOptions, ChatMessage, LLMClient } from "./LLMClient";
import {
ChatCompletionOptions,
ChatMessage,
CreateChatCompletionOptions,
LLMClient,
LLMResponse,
} from "./LLMClient";

@@ -25,17 +30,20 @@ export class OpenAIClient extends LLMClient {

private cache: LLMCache | undefined;
public logger: (message: LogLine) => void;
private enableCaching: boolean;
public clientOptions: ClientOptions;
constructor(
logger: (message: LogLine) => void,
constructor({
enableCaching = false,
cache: LLMCache | undefined,
modelName: AvailableModel,
clientOptions?: ClientOptions,
) {
cache,
modelName,
clientOptions,
}: {
logger: (message: LogLine) => void;
enableCaching?: boolean;
cache?: LLMCache;
modelName: AvailableModel;
clientOptions?: ClientOptions;
}) {
super(modelName);
this.clientOptions = clientOptions;
this.client = new OpenAI(clientOptions);
this.logger = logger;
this.cache = cache;

@@ -46,6 +54,7 @@ this.enableCaching = enableCaching;

async createChatCompletion<T = ChatCompletion>(
optionsInitial: ChatCompletionOptions,
retries: number = 3,
): Promise<T> {
async createChatCompletion<T = LLMResponse>({
options: optionsInitial,
logger,
retries = 3,
}: CreateChatCompletionOptions): Promise<T> {
let options: Partial<ChatCompletionOptions> = optionsInitial;

@@ -115,3 +124,3 @@

this.logger({
logger({
category: "openai",

@@ -152,3 +161,3 @@ message: "creating chat completion",

if (cachedResponse) {
this.logger({
logger({
category: "llm_cache",

@@ -170,3 +179,3 @@ message: "LLM cache hit - returning cached response",

} else {
this.logger({
logger({
category: "llm_cache",

@@ -219,3 +228,3 @@ message: "LLM cache miss - no cached response found",

} catch (error) {
this.logger({
logger({
category: "openai",

@@ -228,6 +237,7 @@ message: "Failed to parse response model schema",

// as-casting to account for o1 models not supporting all options
return this.createChatCompletion(
options as ChatCompletionOptions,
retries - 1,
);
return this.createChatCompletion({
options: options as ChatCompletionOptions,
logger,
retries: retries - 1,
});
}

@@ -253,3 +263,3 @@

this.logger({
logger({
category: "openai",

@@ -331,3 +341,10 @@ message: "creating chat completion",

stream: false,
tools: options.tools?.filter((tool) => "function" in tool), // ensure only OpenAI tools are used
tools: options.tools?.map((tool) => ({
function: {
name: tool.name,
description: tool.description,
parameters: tool.parameters,
},
type: "function",
})),
};

@@ -354,3 +371,3 @@

} catch (error) {
this.logger({
logger({
category: "openai",

@@ -373,6 +390,7 @@ message: "Failed to parse tool call response",

// as-casting to account for o1 models not supporting all options
return this.createChatCompletion(
options as ChatCompletionOptions,
retries - 1,
);
return this.createChatCompletion({
options: options as ChatCompletionOptions,
logger,
retries: retries - 1,
});
}

@@ -384,3 +402,3 @@

this.logger({
logger({
category: "openai",

@@ -408,6 +426,7 @@ message: "response",

// as-casting to account for o1 models not supporting all options
return this.createChatCompletion(
options as ChatCompletionOptions,
retries - 1,
);
return this.createChatCompletion({
options: options as ChatCompletionOptions,
logger,
retries: retries - 1,
});
}

@@ -432,3 +451,3 @@

if (this.enableCaching) {
this.logger({
logger({
category: "llm_cache",

@@ -435,0 +454,0 @@ message: "caching response",

@@ -1,2 +0,2 @@

import OpenAI from "openai";
import { LLMTool } from "../types/llm";
import { ChatMessage } from "./llm/LLMClient";

@@ -138,45 +138,42 @@

export const actTools: Array<OpenAI.ChatCompletionTool> = [
export const actTools: LLMTool[] = [
{
type: "function",
function: {
name: "doAction",
description:
"execute the next playwright step that directly accomplishes the goal",
parameters: {
type: "object",
required: ["method", "element", "args", "step", "completed"],
properties: {
method: {
name: "doAction",
description:
"execute the next playwright step that directly accomplishes the goal",
parameters: {
type: "object",
required: ["method", "element", "args", "step", "completed"],
properties: {
method: {
type: "string",
description: "The playwright function to call.",
},
element: {
type: "number",
description: "The element number to act on",
},
args: {
type: "array",
description: "The required arguments",
items: {
type: "string",
description: "The playwright function to call.",
description: "The argument to pass to the function",
},
element: {
type: "number",
description: "The element number to act on",
},
args: {
type: "array",
description: "The required arguments",
items: {
type: "string",
description: "The argument to pass to the function",
},
},
step: {
type: "string",
description:
"human readable description of the step that is taken in the past tense. Please be very detailed.",
},
why: {
type: "string",
description:
"why is this step taken? how does it advance the goal?",
},
completed: {
type: "boolean",
description:
"true if the goal should be accomplished after this step",
},
},
step: {
type: "string",
description:
"human readable description of the step that is taken in the past tense. Please be very detailed.",
},
why: {
type: "string",
description: "why is this step taken? how does it advance the goal?",
},
completed: {
type: "boolean",
description:
"true if the goal should be accomplished after this step",
},
},

@@ -187,13 +184,11 @@ },

type: "function",
function: {
name: "skipSection",
description:
"skips this area of the webpage because the current goal cannot be accomplished here",
parameters: {
type: "object",
properties: {
reason: {
type: "string",
description: "reason that no action is taken",
},
name: "skipSection",
description:
"skips this area of the webpage because the current goal cannot be accomplished here",
parameters: {
type: "object",
properties: {
reason: {
type: "string",
description: "reason that no action is taken",
},

@@ -361,19 +356,1 @@ },

}
// ask
const askSystemPrompt = `
you are a simple question answering assistent given the user's question. respond with only the answer.
`;
export function buildAskSystemPrompt(): ChatMessage {
return {
role: "system",
content: askSystemPrompt,
};
}
export function buildAskUserPrompt(question: string): ChatMessage {
return {
role: "user",
content: `question: ${question}`,
};
}

@@ -59,22 +59,24 @@ import type {

this.intContext = context;
this.actHandler = new StagehandActHandler({
verbose: this.stagehand.verbose,
llmProvider: this.stagehand.llmProvider,
enableCaching: this.stagehand.enableCaching,
logger: this.stagehand.logger,
stagehandPage: this,
stagehandContext: this.intContext,
llmClient: llmClient,
});
this.extractHandler = new StagehandExtractHandler({
stagehand: this.stagehand,
logger: this.stagehand.logger,
stagehandPage: this,
});
this.observeHandler = new StagehandObserveHandler({
stagehand: this.stagehand,
logger: this.stagehand.logger,
stagehandPage: this,
});
this.llmClient = llmClient;
if (this.llmClient) {
this.actHandler = new StagehandActHandler({
verbose: this.stagehand.verbose,
llmProvider: this.stagehand.llmProvider,
enableCaching: this.stagehand.enableCaching,
logger: this.stagehand.logger,
stagehandPage: this,
stagehandContext: this.intContext,
llmClient: llmClient,
});
this.extractHandler = new StagehandExtractHandler({
stagehand: this.stagehand,
logger: this.stagehand.logger,
stagehandPage: this,
});
this.observeHandler = new StagehandObserveHandler({
stagehand: this.stagehand,
logger: this.stagehand.logger,
stagehandPage: this,
});
}
}

@@ -102,20 +104,28 @@

if (prop === "act") {
return async (options: ActOptions) => {
return this.act(options);
};
if (this.llmClient) {
if (prop === "act") {
return async (options: ActOptions) => {
return this.act(options);
};
}
if (prop === "extract") {
return async (options: ExtractOptions<z.AnyZodObject>) => {
return this.extract(options);
};
}
if (prop === "observe") {
return async (options: ObserveOptions) => {
return this.observe(options);
};
}
} else {
if (prop === "act" || prop === "extract" || prop === "observe") {
return () => {
throw new Error(
"No LLM API key or LLM Client configured. An LLM API key or a custom LLM Client is required to use act, extract, or observe.",
);
};
}
}
if (prop === "extract") {
return async (options: ExtractOptions<z.AnyZodObject>) => {
return this.extract(options);
};
}
if (prop === "observe") {
return async (options: ObserveOptions) => {
return this.observe(options);
};
}
if (prop === "on") {

@@ -122,0 +132,0 @@ return (event: string, listener: (param: unknown) => void) => {
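In practice, the proxy change above means a `Stagehand` page stays usable as a plain Playwright page without an LLM, while the AI methods fail loudly. A hedged sketch of the behavior implied by the guard (assumes a local env with no model key configured):

```typescript
// Hedged sketch: behavior implied by the guard above, not documented API.
import { Stagehand } from "@browserbasehq/stagehand";

const stagehand = new Stagehand({ env: "LOCAL" }); // no LLM key, no custom client
await stagehand.init();

await stagehand.page.goto("https://example.com"); // plain Playwright calls pass through

try {
  await stagehand.page.act({ action: "click the first link" });
} catch (err) {
  // Expected: "No LLM API key or LLM Client configured. An LLM API key or a
  // custom LLM Client is required to use act, extract, or observe."
  console.error(err);
}

await stagehand.close();
```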

{
"name": "@browserbasehq/stagehand",
"version": "1.9.0-alpha-db2ef5997664e81b1dfb5ca992392362f2d3bab1",
"version": "1.9.0",
"description": "An AI web browsing framework focused on simplicity and extensibility.",

@@ -14,2 +14,3 @@ "main": "./dist/index.js",

"external-client": "npm run build-dom-scripts && tsx examples/external_client.ts",
"ai-sdk-client": "npm run build-dom-scripts && tsx examples/ai_sdk_example.ts",
"format": "prettier --write .",

@@ -21,3 +22,4 @@ "prettier": "prettier --check .",

"evals": "npm run build-dom-scripts && tsx evals/index.eval.ts",
"e2e": "npm run build-dom-scripts && cd evals/deterministic && npx playwright test",
"e2e": "npm run build-dom-scripts && cd evals/deterministic && npx playwright test --config=e2e.playwright.config.ts",
"e2e:bb": "npm run build-dom-scripts && cd evals/deterministic && npx playwright test --config=bb.playwright.config.ts",
"build-dom-scripts": "tsx lib/dom/genDomScripts.ts",

@@ -39,2 +41,4 @@ "build-types": "tsc --emitDeclarationOnly --outDir dist",

"devDependencies": {
"@ai-sdk/google": "^1.0.13",
"@ai-sdk/openai": "^1.0.14",
"@changesets/changelog-github": "^0.5.0",

@@ -49,2 +53,3 @@ "@changesets/cli": "^2.27.9",

"adm-zip": "^0.5.16",
"ai": "^4.0.26",
"autoevals": "^0.0.64",

@@ -51,0 +56,0 @@ "braintrust": "^0.0.171",

<div id="toc" align="center">
<ul style="list-style: none">
<summary>
<h1> 🤘 Stagehand </h1>
</summary>
<a href="https://stagehand.dev">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://stagehand.dev/logo-dark.svg" />
<img alt="Stagehand" src="https://stagehand.dev/logo-light.svg" />
</picture>
</a>
</ul>

@@ -10,9 +13,25 @@ </div>

<p align="center">
An AI web browsing framework focused on simplicity and extensibility.</em>
An AI web browsing framework focused on simplicity and extensibility.<br>
<a href="https://docs.stagehand.dev">Read the Docs</a>
</p>
<p align="center">
<a href="https://www.npmjs.com/package/@browserbasehq/stagehand"><img alt="NPM" src="https://img.shields.io/npm/v/@browserbasehq/stagehand.svg" /></a>
<a href="https://github.com/browserbase/stagehand/blob/main/license"><img alt="MIT License" src="https://img.shields.io/badge/license-MIT-blue" /></a>
<a href="https://join.slack.com/t/stagehand-dev/shared_invite/zt-2tdncfgkk-fF8y5U0uJzR2y2_M9c9OJA"><img alt="Slack Community" src="https://img.shields.io/badge/slack-Join%20our%20community-FEC89A.svg?logo=slack" /></a>
<a href="https://www.npmjs.com/package/@browserbasehq/stagehand">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://stagehand.dev/api/assets/npm?mode=dark" />
<img alt="NPM" src="https://stagehand.dev/api/assets/npm?mode=light" />
</picture>
</a>
<a href="https://github.com/browserbase/stagehand/tree/main?tab=MIT-1-ov-file#MIT-1-ov-file">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://stagehand.dev/api/assets/license?mode=dark" />
<img alt="MIT License" src="https://stagehand.dev/api/assets/license?mode=light" />
</picture>
</a>
<a href="https://join.slack.com/t/stagehand-dev/shared_invite/zt-2tdncfgkk-fF8y5U0uJzR2y2_M9c9OJA">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://stagehand.dev/api/assets/slack?mode=dark" />
<img alt="Slack Community" src="https://stagehand.dev/api/assets/slack?mode=light" />
</picture>
</a>
</p>

@@ -22,541 +41,64 @@

- [Intro](#intro)
- [Getting Started](#getting-started)
- [API Reference](#api-reference)
- [act()](#act)
- [extract()](#extract)
- [observe()](#observe)
- [close()](#close)
- [Model Support](#model-support)
- [How It Works](#how-it-works)
- [Stagehand vs Playwright](#stagehand-vs-playwright)
- [Prompting Tips](#prompting-tips)
- [Roadmap](#roadmap)
- [Contributing](#contributing)
- [Acknowledgements](#acknowledgements)
- [License](#license)
Stagehand is the easiest way to build browser automations. It is fully compatible with [Playwright](https://playwright.dev/), offering three simple AI APIs (`act`, `extract`, and `observe`) on top of the base Playwright `Page` class that provide the building blocks for web automation via natural language. It also makes Playwright more accessible to non-technical users and less vulnerable to minor changes in the UI/DOM.
> [!NOTE]
> `Stagehand` is currently available as an early release, and we're actively seeking feedback from the community. Please join our [Slack community](https://join.slack.com/t/stagehand-dev/shared_invite/zt-2tdncfgkk-fF8y5U0uJzR2y2_M9c9OJA) to stay updated on the latest developments and provide feedback.
Anything that can be done in a browser can be done with Stagehand. Consider:
## Intro
1. Go to Hacker News and extract the top stories of the day
1. Log into Amazon, search for AirPods, and buy the most relevant product
1. Go to ESPN, search for Steph Curry, and get stats for his last 10 games
Stagehand is the AI-powered successor to [Playwright](https://github.com/microsoft/playwright), offering three simple APIs (`act`, `extract`, and `observe`) that provide the building blocks for natural language driven web automation.
Stagehand makes it easier to write durable, performant browser automation code. When used with [Browserbase](https://browserbase.com/), it offers unparalleled debugging tools like session replay and step-by-step debugging.
The goal of Stagehand is to provide a lightweight, configurable framework, without overly complex abstractions, as well as modular support for different models and model providers. It's not going to order you a pizza, but it will help you reliably automate the web.
> [!NOTE]
> `Stagehand` is currently available as an early release, and we're actively seeking feedback from the community. Please join our [Slack community](https://join.slack.com/t/stagehand-dev/shared_invite/zt-2tdncfgkk-fF8y5U0uJzR2y2_M9c9OJA) to stay updated on the latest developments and provide feedback.
Each Stagehand function takes in an atomic instruction, such as `act("click the login button")` or `extract("find the red shoes")`, generates the appropriate Playwright code to accomplish that instruction, and executes it.
## Documentation
Instructions should be atomic to increase reliability, and step planning should be handled by the higher level agent. You can use `observe()` to get a suggested list of actions that can be taken on the current page, and then use those to ground your step planning prompts.
Visit [docs.stagehand.dev](https://docs.stagehand.dev) to view the full documentation.
Stagehand is [open source](#license) and maintained by the [Browserbase](https://browserbase.com) team. We believe that by enabling more developers to build reliable web automations, we'll expand the market of developers who benefit from our headless browser infrastructure. This is the framework that we wished we had while tinkering on our own applications, and we're excited to share it with you.
## Getting Started
### 1. Install the Stagehand package
### Quickstart
We also install zod to power typed extraction.
To create a new Stagehand project configured to our default settings, run:
```bash
npm install @browserbasehq/stagehand zod
npx create-browser-app --example quickstart
```
### 2. Configure your model provider
Read our [Quickstart Guide](https://docs.stagehand.dev/get_started/quickstart) in the docs for more information.
You'll need to provide your API Key for the model provider you'd like to use. The default model provider is OpenAI, but you can also use Anthropic or others. More information on supported models can be found in the [API Reference](#api-reference).
You can also add Stagehand to an existing Typescript project by running:
Ensure that an OpenAI API Key or Anthropic API key is accessible in your local environment.
```bash
npm install @browserbasehq/stagehand zod
npx playwright install # if running locally
```
export OPENAI_API_KEY=sk-...
export ANTHROPIC_API_KEY=sk-...
```
### 3. Create a Stagehand Instance
### Build and Run from Source
If you plan to run the browser locally, you'll also need to install Playwright's browser dependencies.
```bash
npm exec playwright install
git clone https://github.com/browserbase/stagehand.git
cd stagehand
npm install
npx playwright install
npm run example # run the blank script at ./examples/example.ts
```
Then you can create a Stagehand instance like so:
Stagehand is best when you have an API key for an LLM provider and Browserbase credentials. To add these to your project, run:
```javascript
import { Stagehand } from "@browserbasehq/stagehand";
import { z } from "zod";
const stagehand = new Stagehand({
env: "LOCAL",
});
```
If you plan to run the browser remotely, you'll need to set a Browserbase API Key and Project ID.
```bash
export BROWSERBASE_API_KEY=...
export BROWSERBASE_PROJECT_ID=...
cp .env.example .env
nano .env # Edit the .env file to add API keys
```
```javascript
import { Stagehand } from "@browserbasehq/stagehand";
import { z } from "zod";
const stagehand = new Stagehand({
env: "BROWSERBASE",
enableCaching: true,
});
```
### 4. Run your first automation
```javascript
await stagehand.init();
const page = stagehand.page;
await page.goto("https://github.com/browserbase/stagehand");
await page.act({ action: "click on the contributors" });
const contributor = await page.extract({
instruction: "extract the top contributor",
schema: z.object({
username: z.string(),
url: z.string(),
}),
});
await stagehand.close();
console.log(`Our favorite contributor is ${contributor.username}`);
```
This simple snippet will open a browser, navigate to the Stagehand repo, and log the top contributor.
## API Reference
### `Stagehand()`
This constructor is used to create an instance of Stagehand.
- **Arguments:**
- `env`: `'LOCAL'` or `'BROWSERBASE'`. Defaults to `'BROWSERBASE'`.
- `modelName`: (optional) an `AvailableModel` string to specify the default model to use.
- `modelClientOptions`: (optional) configuration options for the model client.
- `enableCaching`: a `boolean` that enables caching of LLM responses. When set to `true`, the LLM requests will be cached on disk and reused for identical requests. Defaults to `false`.
- `headless`: a `boolean` that determines if the browser runs in headless mode. Defaults to `false`. When the env is set to `BROWSERBASE`, this will be ignored.
- `domSettleTimeoutMs`: an `integer` that specifies the timeout in milliseconds for waiting for the DOM to settle. Defaults to 30000 (30 seconds).
- `apiKey`: (optional) your Browserbase API key. Defaults to `BROWSERBASE_API_KEY` environment variable.
- `projectId`: (optional) your Browserbase project ID. Defaults to `BROWSERBASE_PROJECT_ID` environment variable.
- `browserbaseSessionCreateParams`: configuration options for creating new Browserbase sessions.
- `browserbaseSessionID`: ID of an existing live Browserbase session. Overrides `browserbaseSessionCreateParams`.
- `logger`: a function that handles log messages. Useful for custom logging implementations.
- `verbose`: an `integer` that enables several levels of logging during automation:
- `0`: limited to no logging
- `1`: SDK-level logging
- `2`: LLM-client level logging (most granular)
- `debugDom`: a `boolean` that draws bounding boxes around elements presented to the LLM during automation.
- `llmClient`: (optional) a custom `LLMClient` implementation.
- **Returns:**
- An instance of the `Stagehand` class configured with the specified options.
- **Example:**
```javascript
// Basic usage
const stagehand = new Stagehand();
// Custom configuration
const stagehand = new Stagehand({
env: "LOCAL",
verbose: 1,
headless: true,
enableCaching: true,
logger: (logLine) => {
console.log(`[${logLine.category}] ${logLine.message}`);
},
});
// Resume existing Browserbase session
const stagehand = new Stagehand({
env: "BROWSERBASE",
browserbaseSessionID: "existing-session-id",
});
```
### Methods
#### `init()`
`init()` asynchronously initializes the Stagehand instance. It should be called before any other methods.
> [!WARNING]
> Passing parameters to `init()` is deprecated and will be removed in the next major version. Use the constructor options instead.
- **Arguments:**
- `modelName`: (**deprecated**, optional) an `AvailableModel` string to specify the model to use. This will be used for all other methods unless overridden.
- `modelClientOptions`: (**deprecated**, optional) configuration options for the model client
- `domSettleTimeoutMs`: (**deprecated**, optional) timeout in milliseconds for waiting for the DOM to settle
- **Returns:**
- A `Promise` that resolves to an object containing:
- `debugUrl`: a `string` representing the URL for live debugging. This is only available when using a Browserbase browser.
- `sessionUrl`: a `string` representing the session URL. This is only available when using a Browserbase browser.
- `sessionId`: a `string` representing the session ID. This is only available when using a Browserbase browser.
- **Example:**
```javascript
await stagehand.init();
```
#### `act()`
`act()` allows Stagehand to interact with a web page. Provide an `action` like `"search for 'x'"`, or `"select the cheapest flight presented"` (small atomic goals perform the best).
> [!WARNING]
> `act()` on the Stagehand instance is deprecated and will be removed in the next major version. Use `stagehand.page.act()` instead.
- **Arguments:**
- `action`: a `string` describing the action to perform
- `modelName`: (optional) an `AvailableModel` string to specify the model to use
- `modelClientOptions`: (optional) configuration options for the model client
- `useVision`: (optional) a `boolean` or `"fallback"` to determine if vision-based processing should be used. Defaults to `"fallback"`
- `variables`: (optional) a `Record<string, string>` of variables to use in the action. Variables in the action string are referenced using `%variable_name%`
- `domSettleTimeoutMs`: (optional) timeout in milliseconds for waiting for the DOM to settle
- **Returns:**
- A `Promise` that resolves to an object containing:
- `success`: a `boolean` indicating if the action was completed successfully.
- `message`: a `string` providing details about the action's execution.
- `action`: a `string` describing the action performed.
- **Example:**
```javascript
// Basic usage
await stagehand.page.act({ action: "click on add to cart" });
// Using variables
await stagehand.page.act({
action: "enter %username% into the username field",
variables: {
username: "john.doe@example.com",
},
});
// Multiple variables
await stagehand.page.act({
action: "fill in the form with %username% and %password%",
variables: {
username: "john.doe",
password: "secretpass123",
},
});
```
#### `extract()`
`extract()` grabs structured text from the current page using [zod](https://github.com/colinhacks/zod). Given instructions and `schema`, you will receive structured data. Unlike some extraction libraries, stagehand can extract any information on a page, not just the main article contents.
> [!WARNING]
> `extract()` on the Stagehand instance is deprecated and will be removed in the next major version. Use `stagehand.page.extract()` instead.
- **Arguments:**
- `instruction`: a `string` providing instructions for extraction
- `schema`: a `z.AnyZodObject` defining the structure of the data to extract
- `modelName`: (optional) an `AvailableModel` string to specify the model to use
- `modelClientOptions`: (optional) configuration options for the model client
- `domSettleTimeoutMs`: (optional) timeout in milliseconds for waiting for the DOM to settle
- `useTextExtract`: (optional) a `boolean` to determine if text-based extraction should be used. Defaults to `false`
- **Returns:**
- A `Promise` that resolves to the structured data as defined by the provided `schema`.
- **Example:**
```javascript
const price = await stagehand.page.extract({
instruction: "extract the price of the item",
schema: z.object({
price: z.number(),
}),
});
```
#### `observe()`
> [!WARNING]
> `observe()` on the Stagehand instance is deprecated and will be removed in the next major version. Use `stagehand.page.observe()` instead.
> [!NOTE]
> `observe()` currently only evaluates the first chunk in the page.
`observe()` is used to get a list of actions that can be taken on the current page. It's useful for adding context to your planning step, or if you're unsure of what page you're on.
If you are looking for a specific element, you can also pass in an instruction to observe via: `observe({ instruction: "{your instruction}"})`.
- **Arguments:**
- `instruction`: (optional) a `string` providing instructions for the observation. Defaults to "Find actions that can be performed on this page."
- `modelName`: (optional) an `AvailableModel` string to specify the model to use
- `modelClientOptions`: (optional) configuration options for the model client
- `useVision`: (optional) a `boolean` to determine if vision-based processing should be used. Defaults to `false`
- `domSettleTimeoutMs`: (optional) timeout in milliseconds for waiting for the DOM to settle
- **Returns:**
- A `Promise` that resolves to an array of objects containing:
- `selector`: a `string` representing the element selector
- `description`: a `string` describing the possible action
- **Example:**
```javascript
const actions = await stagehand.page.observe();
```
#### `close()`
`close()` is a cleanup method to remove the temporary files created by Stagehand. It's highly recommended that you call this when you're done with your automation.
- **Example:**
```javascript
await stagehand.close();
```
#### `page` and `context`
`page` and `context` are instances of Playwright's `Page` and `BrowserContext` respectively. Use these methods to interact with the Playwright instance that Stagehand is using. Most commonly, you'll use `page.goto()` to navigate to a URL.
- **Example:**
```javascript
await stagehand.page.goto("https://github.com/browserbase/stagehand");
```
### `log()`
`log()` is used to print a message to the browser console. These messages will be persisted in the Browserbase session logs, and can be used to debug sessions after they've completed.
Make sure the log level is above the verbose level you set when initializing the Stagehand instance.
- **Example:**
```javascript
stagehand.log("Hello, world!");
```
## Model Support
Stagehand leverages a generic LLM client architecture to support various language models from different providers. This design allows for flexibility, enabling the integration of new models with minimal changes to the core system. Different models work better for different tasks, so you can choose the model that best suits your needs.
#### Currently Supported Models
Stagehand currently supports the following models from OpenAI and Anthropic:
- **OpenAI Models:**
- `gpt-4o`
- `gpt-4o-mini`
- `gpt-4o-2024-08-06`
- **Anthropic Models:**
- `claude-3-5-sonnet-latest`
- `claude-3-5-sonnet-20240620`
- `claude-3-5-sonnet-20241022`
These models can be specified when initializing the `Stagehand` instance or when calling methods like `act()` and `extract()`.
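For example, a minimal sketch showing both (model names taken from the lists above; the per-call override follows the `act()` argument list in the API Reference):

```javascript
// Instance-level default model
const stagehand = new Stagehand({
  env: "LOCAL",
  modelName: "claude-3-5-sonnet-latest",
});
await stagehand.init();

// Per-call override
await stagehand.page.act({
  action: "click the login button",
  modelName: "gpt-4o-mini",
});
```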
## How It Works
The SDK has two major phases:
1. Processing the DOM (including chunking - _see below_).
2. Taking LLM powered actions based on the current state of the DOM.
### DOM processing
Stagehand uses a combination of techniques to prepare the DOM.
The DOM Processing steps look as follows:
1. Via Playwright, inject a script into the DOM accessible by the SDK that can run processing.
2. Crawl the DOM and create a list of candidate elements.
- Candidate elements are either leaf elements (DOM elements that contain actual user facing substance), or are interactive elements.
- Interactive elements are determined by a combination of roles and HTML tags.
3. Candidate elements that are not active, visible, or at the top of the DOM are discarded.
- The LLM should only receive elements it can faithfully act on on behalf of the agent/user.
4. For each candidate element, an xPath is generated. This guarantees that if this element is picked by the LLM, we'll be able to reliably target it.
5. Return both the list of candidate elements, as well as the map of elements to xPath selectors across the browser back to the SDK, to be analyzed by the LLM.
#### Chunking
While LLMs will continue to increase context window length and reduce latency, giving any reasoning system less stuff to think about should make it more reliable. As a result, DOM processing is done in chunks in order to keep the context small per inference call. To chunk, the SDK assigns each candidate element to the chunk corresponding to the section of the viewport in which that element starts. In the future, padding will be added to ensure that an individual chunk does not lack relevant context. See this diagram for how it looks:
![](./docs/media/chunks.png)
### Vision
The `act()` and `observe()` methods can take a `useVision` flag. If this is set to `true`, the LLM will be provided with an annotated screenshot of the current page to identify which elements to act on. This is useful for complex DOMs that the LLM has a hard time reasoning about, even after processing and chunking. By default, this flag is set to `"fallback"`, which means that if the LLM fails to successfully identify a single element, Stagehand will retry the attempt using vision.
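For example, a small sketch opting into vision explicitly (assumes a `stagehand` instance as in the snippets above):

```javascript
// Force vision-based element selection instead of the default "fallback"
await stagehand.page.act({
  action: "click the blue 'Add to cart' button",
  useVision: true,
});
```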
### LLM analysis
Now we have a list of candidate elements and a way to select them. We can present those elements with additional context to the LLM for extraction or action. While untested on a large scale, presenting a "numbered list of elements" guides the model to not treat the context as a full DOM, but as a list of related but independent elements to operate on.
In the case of action, we ask the LLM to write a playwright method in order to do the correct thing. In our limited testing, playwright syntax is much more effective than relying on built-in JavaScript APIs, possibly due to tokenization.
Lastly, we use the LLM to write future instructions to itself to help manage its progress and goals when operating across chunks.
### Stagehand vs Playwright
Below is an example of how to extract a list of companies from the AI Grant website using both Stagehand and Playwright.
![](./docs/media/stagehand-playwright.png)
## Prompting Tips
Prompting Stagehand is more literal and atomic than in other higher-level frameworks, including agentic frameworks. Here are some guidelines to help you craft effective prompts:
### Do:
- **Use specific and concise actions**
```javascript
await stagehand.page.act({ action: "click the login button" });
const productInfo = await stagehand.page.extract({
instruction: "find the red shoes",
schema: z.object({
productName: z.string(),
price: z.number(),
}),
});
```
- **Break down complex tasks into smaller, atomic steps**
Instead of combining actions:
```javascript
// Avoid this
await stagehand.page.act({ action: "log in and purchase the first item" });
```
Split them into individual steps:
```javascript
await stagehand.page.act({ action: "click the login button" });
// ...additional steps to log in...
await stagehand.page.act({ action: "click on the first item" });
await stagehand.page.act({ action: "click the purchase button" });
```
- **Use `observe()` to get actionable suggestions from the current page**
```javascript
const actions = await stagehand.page.observe();
console.log("Possible actions:", actions);
```
### Don't:
- **Use broad or ambiguous instructions**
```javascript
// Too vague
await stagehand.page.act({ action: "find something interesting on the page" });
```
- **Combine multiple actions into one instruction**
```javascript
// Avoid combining actions
await stagehand.page.act({ action: "fill out the form and submit it" });
```
- **Expect Stagehand to perform high-level planning or reasoning**
```javascript
// Outside Stagehand's scope
await stagehand.page.act({ action: "book the cheapest flight available" });
```
By following these guidelines, you'll increase the reliability and effectiveness of your web automations with Stagehand. Remember, Stagehand excels at executing precise, well-defined actions, so keeping your instructions atomic will lead to the best outcomes.
We leave agentic behavior to higher-level agentic systems that can use Stagehand as a tool.
## Roadmap
At a high level, we're focused on improving reliability, speed, and cost in that order of priority.
You can see the roadmap [here](./ROADMAP.md). Looking to contribute? Read on!
## Contributing
> [!NOTE]
> We highly value contributions to Stagehand! For questions or support, please join our [Slack community](https://join.slack.com/t/stagehand-dev/shared_invite/zt-2tdncfgkk-fF8y5U0uJzR2y2_M9c9OJA).

At a high level, we're focused on improving reliability, speed, and cost in that order of priority. If you're interested in contributing, we strongly recommend reaching out to [Anirudh Kamath](https://x.com/kamathematic) or [Paul Klein](https://x.com/pk_iv) in our [Slack community](https://join.slack.com/t/stagehand-dev/shared_invite/zt-2tdncfgkk-fF8y5U0uJzR2y2_M9c9OJA) before starting, to ensure that your contribution aligns with our goals. For more information, please see our [Contributing Guide](https://docs.stagehand.dev/contributions/contributing).

First, clone the repo
```bash
git clone git@github.com:browserbase/stagehand.git
```
Then install dependencies
```bash
npm install
```
Ensure you have the `.env` file as documented above in the Getting Started section.
Then, run the example script with `npm run example`.
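If you're starting from scratch, a minimal example script looks roughly like this (illustrative; check the repo's actual example file for the current version):

```javascript
import { Stagehand } from "@browserbasehq/stagehand";
import { z } from "zod";

const stagehand = new Stagehand({ env: "LOCAL" });
await stagehand.init();

await stagehand.page.goto("https://www.example.com");
await stagehand.page.act({ action: "click the more information link" });

const { title } = await stagehand.page.extract({
  instruction: "extract the page title",
  schema: z.object({ title: z.string() }),
});
console.log(title);

await stagehand.close();
```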
### Development tips
A good development loop is:
1. Try things in the example file
2. Use that to make changes to the SDK
3. Write evals that help validate your changes
4. Make sure you don't break existing evals!
5. Open a PR and get it reviewed by the team.
### Running evals
You'll need a Braintrust API key to run evals:
```.env
BRAINTRUST_API_KEY=""
```
After that, you can run all evals at once with `npm run evals`, or run a single eval with `npm run evals -- your_eval_name`.
### Adding new evals
Running all evals can take some time. We have a convenience script `example.ts` where you can develop your new single eval before adding it to the set of all evals.
You can run `npm run example` to execute and iterate on the eval you are currently developing.
#### Adding a New Model
To add a new model to Stagehand, follow these steps:
1. **Define the Model**: Add the new model name to the `AvailableModel` type in the `LLMProvider.ts` file. This ensures that the model is recognized by the system.
2. **Map the Model to a Provider**: Update the `modelToProviderMap` in the `LLMProvider` class to associate the new model with its corresponding provider. This mapping is crucial for determining which client to use.
3. **Implement the Client**: If the new model requires a new client, implement a class that adheres to the `LLMClient` interface (see the sketch after this list). This class should define all necessary methods, such as `createChatCompletion`.
4. **Update the `getClient` Method**: Modify the `getClient` method in the `LLMProvider` class to return an instance of the new client when the new model is requested.
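As a sketch of step 3 (the import path, provider name, and exact constructor/method signatures here are illustrative; check the actual `LLMClient` base class before implementing):

```javascript
// Illustrative sketch of a new client for a hypothetical provider.
import { LLMClient } from "../lib/llm/LLMClient";

export class MyProviderClient extends LLMClient {
  constructor({ modelName, clientOptions }) {
    super(modelName); // base-class constructor arguments may differ
    this.type = "myprovider"; // hypothetical provider identifier
    this.clientOptions = clientOptions;
  }

  async createChatCompletion({ options, retries, logger }) {
    // 1. Translate options (messages, tools, temperature, ...) into the
    //    provider's request format.
    // 2. Call the provider's SDK or HTTP API.
    // 3. Map the provider's response back into the response shape the rest
    //    of Stagehand expects (choices, tool calls, usage, ...).
    throw new Error("MyProviderClient.createChatCompletion is not implemented");
  }
}
```

Once the client exists, `getClient` in `LLMProvider` just needs to return `new MyProviderClient(...)` when one of the new model names is requested.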
### Building the SDK
Stagehand uses [tsup](https://github.com/egoist/tsup) to build the SDK and vanilla [esbuild](https://esbuild.github.io/) to build the scripts that run in the DOM.
1. Run `npm run build`
2. Run `npm pack` to get a tarball for distribution
## Acknowledgements


We'd like to thank the following people for their contributions to Stagehand:
- [Jeremy Press](https://x.com/jeremypress) wrote the original MVP of Stagehand and continues to be an ally to the project.
- [Navid Pour](https://github.com/navidpour) is heavily responsible for the current architecture of Stagehand and the `act` API.
- [Sean McGuire](https://github.com/seanmcguire12) is a major contributor to the project and has been a great help with improving the `extract` API and getting evals to a high level.
- [Filip Michalsky](https://github.com/filip-michalsky) has been doing a lot of work on building out integrations like [Langchain](https://js.langchain.com/docs/integrations/tools/stagehand/) and [Claude MCP](https://github.com/browserbase/mcp-server-browserbase), generally improving the repository, and unblocking users.
- [Sameel Arif](https://github.com/sameelarif) is a major contributor to the project, especially around improving the developer experience.

## License

Copyright 2025 Browserbase, Inc.
