autoevals - npm Package Compare versions

Comparing version 0.0.123 to 0.0.124
jsdist/index.d.ts
import { Scorer, ScorerArgs, Score } from '@braintrust/core';
export { Score, Scorer, ScorerArgs } from '@braintrust/core';
import { ChatCompletion, ChatCompletionMessageParam, ChatCompletionTool, ChatCompletionToolChoiceOption } from 'openai/resources';
import { ChatCompletionMessageParam, ChatCompletionTool, ChatCompletionToolChoiceOption, ChatCompletion } from 'openai/resources';
import { OpenAI } from 'openai';
import { z } from 'zod';

@@ -25,14 +26,31 @@

}
interface OpenAIAuth {
type OpenAIAuth = {
/** @deprecated Use the `client` option instead */
openAiApiKey?: string;
/** @deprecated Use the `client` option instead */
openAiOrganizationId?: string;
/** @deprecated Use the `client` option instead */
openAiBaseUrl?: string;
/** @deprecated Use the `client` option instead */
openAiDefaultHeaders?: Record<string, string>;
/** @deprecated Use the `client` option instead */
openAiDangerouslyAllowBrowser?: boolean;
/**
If present, use [Azure OpenAI Service](https://learn.microsoft.com/en-us/azure/ai-services/openai/)
instead of OpenAI.
*/
/** @deprecated Use the `client` option instead */
azureOpenAi?: AzureOpenAiAuth;
}
client?: never;
} | {
client: OpenAI;
/** @deprecated Use the `client` option instead */
openAiApiKey?: never;
/** @deprecated Use the `client` option instead */
openAiOrganizationId?: never;
/** @deprecated Use the `client` option instead */
openAiBaseUrl?: never;
/** @deprecated Use the `client` option instead */
openAiDefaultHeaders?: never;
/** @deprecated Use the `client` option instead */
openAiDangerouslyAllowBrowser?: never;
/** @deprecated Use the `client` option instead */
azureOpenAi?: never;
};
interface AzureOpenAiAuth {

@@ -45,3 +63,7 @@ apiKey: string;

var __inherited_braintrust_wrap_openai: ((openai: any) => any) | undefined;
var __client: OpenAI | undefined;
}
declare const init: ({ client }?: {
client?: OpenAI;
}) => void;
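
The hunks above replace the single `OpenAIAuth` interface with a union type: callers either keep the deprecated per-field credentials or pass a shared `client`, but not both, and the new `init` function registers that client globally. A minimal usage sketch, assuming the standard `openai` constructor and mirroring the evaluator call shown later in the README:

```typescript
import { OpenAI } from "openai";
import { init, Factuality } from "autoevals";

// Register one client for all evaluators instead of the deprecated
// openAiApiKey / openAiBaseUrl options.
init({ client: new OpenAI({ apiKey: process.env.OPENAI_API_KEY }) });

(async () => {
  const result = await Factuality({
    input: "Which country has the highest population?",
    output: "People's Republic of China",
    expected: "China",
  });
  console.log(`Factuality score: ${result.score}`);
})();
```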

@@ -68,9 +90,14 @@ declare const modelGradedSpecSchema: z.ZodObject<{

type ModelGradedSpec = z.infer<typeof modelGradedSpecSchema>;
declare const templates: Record<"battle" | "closed_q_a" | "factuality" | "humor" | "possible" | "security" | "sql" | "summary" | "translation", {
prompt: string;
choice_scores: Record<string, number>;
model?: string | undefined;
use_cot?: boolean | undefined;
temperature?: number | undefined;
}>;
declare const templateStrings: {
readonly battle: string;
readonly closed_q_a: string;
readonly factuality: string;
readonly humor: string;
readonly possible: string;
readonly security: string;
readonly sql: string;
readonly summary: string;
readonly translation: string;
};
declare const templates: Record<keyof typeof templateStrings, ModelGradedSpec>;
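
The template map keeps the same keys but is now typed against `templateStrings`, with each entry a full `ModelGradedSpec`. A small sketch of inspecting a built-in spec (field names are taken directly from the schema above):

```typescript
import { templates } from "autoevals";

// Each built-in spec exposes its grading prompt and choice-to-score mapping.
const factuality = templates.factuality;
console.log(factuality.prompt);
console.log(factuality.choice_scores);
```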

@@ -138,3 +165,3 @@ interface ScorerWithPartial<Output, Extra> extends Scorer<Output, Extra> {

output: string;
expected?: string | undefined;
expected?: string;
}>>;

@@ -304,2 +331,2 @@ /**

export { AnswerCorrectness, AnswerRelevancy, AnswerSimilarity, Battle, ClosedQA, ContextEntityRecall, ContextPrecision, ContextRecall, ContextRelevancy, DEFAULT_MODEL, EmbeddingSimilarity, Evaluators, ExactMatch, Factuality, Faithfulness, Humor, JSONDiff, type LLMArgs, type LLMClassifierArgs, LLMClassifierFromSpec, LLMClassifierFromSpecFile, LLMClassifierFromTemplate, Levenshtein, LevenshteinScorer, ListContains, type ModelGradedSpec, Moderation, NumericDiff, OpenAIClassifier, type OpenAIClassifierArgs, Possible, type ScorerWithPartial, Security, Sql, Summary, Translation, ValidJSON, buildClassificationTools, makePartial, modelGradedSpecSchema, normalizeValue, templates };
export { AnswerCorrectness, AnswerRelevancy, AnswerSimilarity, Battle, ClosedQA, ContextEntityRecall, ContextPrecision, ContextRecall, ContextRelevancy, DEFAULT_MODEL, EmbeddingSimilarity, Evaluators, ExactMatch, Factuality, Faithfulness, Humor, JSONDiff, type LLMArgs, type LLMClassifierArgs, LLMClassifierFromSpec, LLMClassifierFromSpecFile, LLMClassifierFromTemplate, Levenshtein, LevenshteinScorer, ListContains, type ModelGradedSpec, Moderation, NumericDiff, OpenAIClassifier, type OpenAIClassifierArgs, Possible, type ScorerWithPartial, Security, Sql, Summary, Translation, ValidJSON, buildClassificationTools, init, makePartial, modelGradedSpecSchema, normalizeValue, templates };
{
"name": "autoevals",
"version": "0.0.123",
"version": "0.0.124",
"description": "Universal library for evaluating AI models",

@@ -43,4 +43,5 @@ "repository": {

"jest-text-transformer": "^1.0.4",
"msw": "^2.7.3",
"ts-jest": "^29.1.1",
"tsup": "^8.0.1",
"tsup": "^8.4.0",
"tsx": "^3.14.0",

@@ -52,3 +53,3 @@ "typedoc": "^0.25.4",

"dependencies": {
"@braintrust/core": "0.0.81",
"@braintrust/core": "0.0.82",
"ajv": "^8.13.0",

@@ -55,0 +56,0 @@ "compute-cosine-similarity": "^1.1.0",

@@ -7,3 +7,3 @@ # Autoevals

- LLM-as-a-Judge
- LLM-as-a-judge
- Heuristic (e.g. Levenshtein distance)

@@ -22,73 +22,31 @@ - Statistical (e.g. BLEU)

## Installation
<div className="hidden">
Autoevals is distributed as a [Python library on PyPI](https://pypi.org/project/autoevals/) and
[Node.js library on NPM](https://www.npmjs.com/package/autoevals).
### Requirements
```bash
npm install autoevals
```
- Python 3.9 or higher
- Compatible with both OpenAI Python SDK v0.x and v1.x
## Example
</div>
Use Autoevals to model-grade an example LLM completion using the [factuality prompt](templates/factuality.yaml).
By default, Autoevals uses your `OPENAI_API_KEY` environment variable to authenticate with OpenAI's API.
## Installation
#### Use with other AI providers through the AI proxy
<div className="tabs">
Autoevals will look for an `OPENAI_BASE_URL` environment variable to use as the base for requests to an OpenAI compatible API. If `OPENAI_BASE_URL` is not set, it will default to the [AI proxy](https://www.braintrust.dev/docs/guides/proxy). This provides numerous benefits like simplified access to many AI providers, reduced costs with automatic request caching, and increased observability when you enable logging to Braintrust. The proxy is free to use, even if you don't have a Braintrust account.
### TypeScript
If you have a Braintrust account, you can set the `BRAINTUST_API_KEY` environment variable instead of `OPENAI_API_KEY` to unlock additional features like logging and monitoring. Additionally, you can route requests to [supported AI providers and models](https://www.braintrust.dev/docs/guides/proxy#supported-models) or custom models you have configured in Braintrust.
```python
# NOTE: ensure BRAINTRUST_API_KEY is set in your environment and OPENAI_API_KEY is not set
from autoevals.llm import *
# Create an LLM-based evaluator using the Claude 3.5 Sonnet model from Anthropic
evaluator = Factuality(model="claude-3-5-sonnet-latest")
# Evaluate an example LLM completion
input = "Which country has the highest population?"
output = "People's Republic of China"
expected = "China"
result = evaluator(output, expected, input=input)
# The evaluator returns a score from [0,1] and includes the raw outputs from the evaluator
print(f"Factuality score: {result.score}")
print(f"Factuality metadata: {result.metadata['rationale']}")
```bash
npm install autoevals
```
#### Custom Client
## Getting started
If you need to use a different OpenAI compatible API or require custom behavior, you can initialize the library with a custom client.
Use Autoevals to model-grade an example LLM completion using the [Factuality prompt](templates/factuality.yaml).
By default, Autoevals uses your `OPENAI_API_KEY` environment variable to authenticate with OpenAI's API.
```python
import openai
from autoevals import init
from autoevals.oai import LLMClient
<div className="tabs">
openai_client = openai.OpenAI(base_url="https://api.openai.com/v1/")
### TypeScript
class CustomClient(LLMClient):
openai=openai_client # you can also pass in openai module and we will instantiate it for you
embed = openai.embeddings.create
moderation = openai.moderations.create
RateLimitError = openai.RateLimitError
def complete(self, **kwargs):
# make adjustments as needed
return self.openai.chat.completions.create(**kwargs)
# Autoevals will now use your custom client
client = init(client=CustomClient)
```
If you only need to use a custom client for a specific evaluator, you can pass in the client to the evaluator.
```python
evaluator = Factuality(client=CustomClient)
```
```javascript
```typescript
import { Factuality } from "autoevals";

@@ -103,13 +61,27 @@

console.log(`Factuality score: ${result.score}`);
console.log(`Factuality metadata: ${result.metadata.rationale}`);
console.log(`Factuality metadata: ${result.metadata?.rationale}`);
})();
```
#### Use with other AI providers through the AI proxy
</div>
Autoevals will look for an `OPENAI_BASE_URL` environment variable to use as the base for requests to an OpenAI compatible API. If `OPENAI_BASE_URL` is not set, it will default to the [AI proxy](https://www.braintrust.dev/docs/guides/proxy). This provides numerous benefits like simplified access to many AI providers, reduced costs with automatic request caching, and increased observability when you enable logging to Braintrust. The proxy is free to use, even if you don't have a Braintrust account.
## Using other AI providers
If you have a Braintrust account, you can set the `BRAINTUST_API_KEY` environment variable instead of `OPENAI_API_KEY` to unlock additional features like logging and monitoring. Additionally, you can route requests to [supported AI providers and models](https://www.braintrust.dev/docs/guides/proxy#supported-models) or custom models you have configured in Braintrust.
When you use Autoevals, it will look for an `OPENAI_BASE_URL` environment variable to use as the base for requests to an OpenAI compatible API. If `OPENAI_BASE_URL` is not set, it will default to the [AI proxy](https://www.braintrust.dev/docs/guides/proxy).
```javascript
If you choose to use the proxy, you'll also get:
- Simplified access to many AI providers
- Reduced costs with automatic request caching
- Increased observability when you enable logging to Braintrust
The proxy is free to use, even if you don't have a Braintrust account.
If you have a Braintrust account, you can optionally set the `BRAINTRUST_API_KEY` environment variable instead of `OPENAI_API_KEY` to unlock additional features like logging and monitoring. You can also route requests to [supported AI providers and models](https://www.braintrust.dev/docs/guides/proxy#supported-models) or custom models you have configured in Braintrust.
<div className="tabs">
### TypeScript
```typescript
// NOTE: ensure BRAINTRUST_API_KEY is set in your environment and OPENAI_API_KEY is not set
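// Illustrative sketch -- overriding the evaluator model to route the request
// through the proxy; the model name below is only an example.
import { Factuality } from "autoevals";

(async () => {
  const result = await Factuality({
    model: "claude-3-5-sonnet-latest",
    input: "Which country has the highest population?",
    output: "People's Republic of China",
    expected: "China",
  });
  console.log(`Factuality score: ${result.score}`);
})();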

@@ -137,9 +109,84 @@ import { Factuality } from "autoevals";

## Using Braintrust with Autoevals
</div>
Once you grade an output using Autoevals, it's convenient to use [Braintrust](https://www.braintrust.dev/docs/libs/python) to log and compare your evaluation results.
## Custom client configuration
Create a file named `example.eval.js` (it must end with `.eval.js` or `.eval.js`):
There are two ways you can configure a custom client when you need to use a different OpenAI compatible API:
```javascript
1. **Global configuration**: Initialize a client that will be used by all evaluators
2. **Instance configuration**: Configure a client for a specific evaluator
### Global configuration
Set up a client that all your evaluators will use:
<div className="tabs">
#### TypeScript
```typescript
import OpenAI from "openai";
import { init, Factuality } from "autoevals";
const client = new OpenAI({
baseURL: "https://api.openai.com/v1/",
});
init({ client });
(async () => {
const result = await Factuality({
input: "What is the speed of light in a vacuum?",
output: "The speed of light in a vacuum is 299,792,458 meters per second.",
expected:
"The speed of light in a vacuum is approximately 300,000 kilometers per second (or precisely 299,792,458 meters per second).",
});
console.log("Factuality Score:", result);
})();
```
</div>
### Instance configuration
Configure a client for a specific evaluator instance:
<div className="tabs">
#### TypeScript
```typescript
import OpenAI from "openai";
import { Factuality } from "autoevals";
(async () => {
const customClient = new OpenAI({
baseURL: "https://custom-api.example.com/v1/",
});
const result = await Factuality({
client: customClient,
output: "Paris is the capital of France",
expected:
"Paris is the capital of France and has a population of over 2 million",
input: "Tell me about Paris",
});
console.log(result);
})();
```
</div>
## Using Braintrust with Autoevals (optional)
Once you grade an output using Autoevals, you can optionally use [Braintrust](https://www.braintrust.dev/docs/libs/python) to log and compare your evaluation results. This integration is completely optional and not required for using Autoevals.
<div className="tabs">
### TypeScript
Create a file named `example.eval.js` (it must take the form `*.eval.[ts|tsx|js|jsx]`):
```typescript
import { Eval } from "braintrust";
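// Illustrative sketch of a minimal Eval definition wiring Factuality in as a
// scorer; the project name and data below are placeholders.
import { Factuality } from "autoevals";

Eval("Autoevals example", {
  data: () => [
    { input: "Which country has the highest population?", expected: "China" },
  ],
  task: async () => "People's Republic of China",
  scores: [Factuality],
});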

@@ -166,8 +213,8 @@ import { Factuality } from "autoevals";

## Supported Evaluation Methods
## Supported evaluation methods
### LLM-as-a-Judge
### LLM-as-a-judge evaluations
- Battle
- ClosedQA
- Closed QA
- Humor

@@ -182,3 +229,3 @@ - Factuality

### RAG
### RAG evaluations

@@ -188,10 +235,9 @@ - Context precision

- Context recall
- Context entities recall
- Faithfullness
- Answer relevance
- Answer semantic similarity
- Context entity recall
- Faithfulness
- Answer relevancy
- Answer similarity
- Answer correctness
- Aspect critique
### Composite
### Composite evaluations

@@ -201,8 +247,7 @@ - Semantic list contains

### Embeddings
### Embedding evaluations
- Embedding similarity
- BERTScore
### Heuristic
### Heuristic evaluations

@@ -213,15 +258,12 @@ - Levenshtein distance

- JSON diff
- Jaccard distance
### Statistical
## Custom evaluation prompts
- BLEU
- ROUGE
- METEOR
Autoevals supports custom evaluation prompts for model-graded evaluation. To use them, simply pass in a prompt and scoring mechanism:
## Custom Evaluation Prompts
<div className="tabs">
Autoevals supports custom evaluation prompts for model-graded evaluation. To use them, simply pass in a prompt and scoring mechanism:
### TypeScript
```javascript
```typescript
import { LLMClassifierFromTemplate } from "autoevals";

@@ -242,11 +284,8 @@

const evaluator =
LLMClassifierFromTemplate <
{ input: string } >
{
name: "TitleQuality",
promptTemplate,
choiceScores,
useCoT: true,
};
const evaluator = LLMClassifierFromTemplate<{ input: string }>({
name: "TitleQuality",
promptTemplate,
choiceScores,
useCoT: true,
});

@@ -266,2 +305,4 @@ const input = `As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client,

</div>
## Creating custom scorers

@@ -272,3 +313,7 @@

```javascript
<div className="tabs">
### TypeScript
```typescript
import { Score } from "autoevals";
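// Illustrative sketch of a hand-rolled scorer -- it only needs to return a
// Score-shaped object ({ name, score }); the banana check is a placeholder.
const bananaScorer = (args: { output: string; expected?: string }): Score => ({
  name: "banana_scorer",
  score: args.output.includes("banana") ? 1 : 0,
});

const result = bananaScorer({ output: "I like bananas" });
console.log(`Banana score: ${result.score}`);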

@@ -298,2 +343,4 @@

</div>
## Why does this library exist?

@@ -308,4 +355,18 @@

<div className="hidden">
## Documentation
The full docs are available [here](https://www.braintrust.dev/docs/reference/autoevals).
The full docs are available [for your reference](https://www.braintrust.dev/docs/reference/autoevals).
## Contributing
We welcome contributions!
To install the development dependencies, run `make develop`, and run `source env.sh` to activate the environment. Make a `.env` file from the `.env.example` file and set the environment variables. Run `direnv allow` to load the environment variables.
To run the tests, run `pytest` from the root directory.
Send a PR and we'll review it! We'll take care of versioning and releasing.
</div>
