autoevals
Comparing version 0.0.75 to 0.0.76
@@ -1,2 +0,2 @@
-import { ScorerArgs, Score, Scorer } from '@braintrust/core';
+import { Scorer, ScorerArgs, Score } from '@braintrust/core';
export { Score, Scorer, ScorerArgs } from '@braintrust/core';
@@ -40,2 +40,9 @@ import { ChatCompletion, ChatCompletionMessageParam, ChatCompletionCreateParams } from 'openai/resources';
+interface ScorerWithPartial<Output, Extra> extends Scorer<Output, Extra> {
+partial: <T extends keyof Extra>(args: {
+[K in T]: Extra[K];
+}) => Scorer<Output, Omit<Extra, T> & Partial<Pick<Extra, T>>>;
+}
+declare function makePartial<Output, Extra>(fn: Scorer<Output, Extra>, name?: string): ScorerWithPartial<Output, Extra>;
type LLMArgs = {
@@ -96,3 +103,3 @@ maxTokens?: number;
*/
-declare const Battle: Scorer<any, LLMClassifierArgs<{
+declare const Battle: ScorerWithPartial<string, LLMClassifierArgs<{
instructions: string;
@@ -104,3 +111,3 @@ }>>;
*/
-declare const ClosedQA: Scorer<any, LLMClassifierArgs<{
+declare const ClosedQA: ScorerWithPartial<string, LLMClassifierArgs<{
input: string;
@@ -112,10 +119,10 @@ criteria: any;
*/
-declare const Humor: Scorer<any, LLMClassifierArgs<{}>>;
+declare const Humor: ScorerWithPartial<string, LLMClassifierArgs<{}>>;
/**
* Test whether an output is factual, compared to an original (`expected`) value.
*/
-declare const Factuality: Scorer<any, LLMClassifierArgs<{
+declare const Factuality: ScorerWithPartial<string, LLMClassifierArgs<{
input: string;
output: string;
-expected?: string | undefined;
+expected?: string;
}>>;
@@ -125,3 +132,3 @@ /**
*/
-declare const Possible: Scorer<any, LLMClassifierArgs<{
+declare const Possible: ScorerWithPartial<string, LLMClassifierArgs<{
input: string;
@@ -132,7 +139,7 @@ }>>;
*/
-declare const Security: Scorer<any, LLMClassifierArgs<{}>>;
+declare const Security: ScorerWithPartial<string, LLMClassifierArgs<{}>>;
/**
* Test whether a SQL query is semantically the same as a reference (output) query.
*/
-declare const Sql: Scorer<any, LLMClassifierArgs<{
+declare const Sql: ScorerWithPartial<string, LLMClassifierArgs<{
input: string;
@@ -143,3 +150,3 @@ }>>;
*/
-declare const Summary: Scorer<any, LLMClassifierArgs<{
+declare const Summary: ScorerWithPartial<string, LLMClassifierArgs<{
input: string;
@@ -151,3 +158,3 @@ }>>;
*/
-declare const Translation: Scorer<any, LLMClassifierArgs<{
+declare const Translation: ScorerWithPartial<string, LLMClassifierArgs<{
language: string;
@@ -160,4 +167,4 @@ input: string;
*/
-declare const Levenshtein: Scorer<string, {}>;
-declare const LevenshteinScorer: Scorer<string, {}>;
+declare const Levenshtein: ScorerWithPartial<string, {}>;
+declare const LevenshteinScorer: ScorerWithPartial<string, {}>;
/**
@@ -173,3 +180,3 @@ * A scorer that uses cosine similarity to compare two strings.
*/
-declare const EmbeddingSimilarity: Scorer<string, {
+declare const EmbeddingSimilarity: ScorerWithPartial<string, {
prefix?: string;
@@ -185,3 +192,3 @@ expectedMin?: number;
*/
-declare const ListContains: Scorer<string[], {
+declare const ListContains: ScorerWithPartial<string[], {
pairwiseScorer?: Scorer<string, {}>;
@@ -201,3 +208,3 @@ allowExtraEntities?: boolean;
*/
-declare const Moderation: Scorer<string, {
+declare const Moderation: ScorerWithPartial<string, {
threshold?: number;
@@ -209,3 +216,3 @@ } & OpenAIAuth>;
*/
-declare const NumericDiff: Scorer<number, {}>;
+declare const NumericDiff: ScorerWithPartial<number, {}>;
@@ -216,3 +223,3 @@ /**
*/
-declare const JSONDiff: Scorer<any, {
+declare const JSONDiff: ScorerWithPartial<any, {
stringScorer?: Scorer<string, {}>;
@@ -225,3 +232,3 @@ numberScorer?: Scorer<number, {}>;
*/
-declare const ValidJSON: Scorer<string, {
+declare const ValidJSON: ScorerWithPartial<string, {
schema?: any;
@@ -239,12 +246,12 @@ }>;
*/
-declare const ContextEntityRecall: Scorer<string, RagasArgs & {
+declare const ContextEntityRecall: ScorerWithPartial<string, RagasArgs & {
pairwiseScorer?: Scorer<string, {}>;
}>;
-declare const ContextRelevancy: Scorer<string, RagasArgs>;
-declare const ContextRecall: Scorer<string, RagasArgs>;
-declare const ContextPrecision: Scorer<string, RagasArgs>;
+declare const ContextRelevancy: ScorerWithPartial<string, RagasArgs>;
+declare const ContextRecall: ScorerWithPartial<string, RagasArgs>;
+declare const ContextPrecision: ScorerWithPartial<string, RagasArgs>;
/**
* Measures factual consistency of the generated answer with the given context.
*/
-declare const Faithfulness: Scorer<string, RagasArgs>;
+declare const Faithfulness: ScorerWithPartial<string, RagasArgs>;
/**
@@ -254,3 +261,3 @@ * Scores the relevancy of the generated answer to the given question.
*/
-declare const AnswerRelevancy: Scorer<string, RagasArgs & {
+declare const AnswerRelevancy: ScorerWithPartial<string, RagasArgs & {
strictness?: number;
@@ -261,3 +268,3 @@ }>;
*/
-declare const AnswerSimilarity: Scorer<string, RagasArgs>;
+declare const AnswerSimilarity: ScorerWithPartial<string, RagasArgs>;
/**
@@ -267,3 +274,3 @@ * Measures answer correctness compared to ground truth using a weighted
*/
-declare const AnswerCorrectness: Scorer<string, RagasArgs & {
+declare const AnswerCorrectness: ScorerWithPartial<string, RagasArgs & {
factualityWeight?: number;
@@ -274,7 +281,11 @@ answerSimilarityWeight?: number;
+interface AutoevalMethod {
+method: ScorerWithPartial<any, any>;
+description: string;
+}
declare const Evaluators: {
label: string;
-methods: Scorer<any, any>[];
+methods: AutoevalMethod[];
}[];
-export { AnswerCorrectness, AnswerRelevancy, AnswerSimilarity, Battle, ClosedQA, ContextEntityRecall, ContextPrecision, ContextRecall, ContextRelevancy, EmbeddingSimilarity, Evaluators, Factuality, Faithfulness, Humor, JSONDiff, type LLMArgs, type LLMClassifierArgs, LLMClassifierFromSpec, LLMClassifierFromSpecFile, LLMClassifierFromTemplate, Levenshtein, LevenshteinScorer, ListContains, type ModelGradedSpec, Moderation, NumericDiff, OpenAIClassifier, type OpenAIClassifierArgs, Possible, Security, Sql, Summary, Translation, ValidJSON, buildClassificationFunctions, templates };
+export { AnswerCorrectness, AnswerRelevancy, AnswerSimilarity, Battle, ClosedQA, ContextEntityRecall, ContextPrecision, ContextRecall, ContextRelevancy, EmbeddingSimilarity, Evaluators, Factuality, Faithfulness, Humor, JSONDiff, type LLMArgs, type LLMClassifierArgs, LLMClassifierFromSpec, LLMClassifierFromSpecFile, LLMClassifierFromTemplate, Levenshtein, LevenshteinScorer, ListContains, type ModelGradedSpec, Moderation, NumericDiff, OpenAIClassifier, type OpenAIClassifierArgs, Possible, type ScorerWithPartial, Security, Sql, Summary, Translation, ValidJSON, buildClassificationFunctions, makePartial, templates };
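The headline change in the declarations above is the new `ScorerWithPartial` interface: every built-in scorer now exposes a `partial` method that pre-binds a subset of its extra arguments and returns a plain `Scorer` in which those keys become optional. A minimal sketch of how that surface could be used, inferred only from the signatures in this diff; the model name, the sample data, and the assumption that an OpenAI API key is available in the environment are illustrative, not taken from the release itself:

```typescript
import { Factuality } from "autoevals";

// Pre-bind the grading model. Per the `partial` signature, the result is
// Scorer<Output, Omit<Extra, "model"> & Partial<Pick<Extra, "model">>>,
// so `model` no longer needs to be passed at call time.
const FactualityWithModel = Factuality.partial({ model: "gpt-4" }); // model name is illustrative

async function main() {
  // Assumes OPENAI_API_KEY is set so the LLM-based classifier can run.
  const result = await FactualityWithModel({
    input: "Which country has the highest population?",
    output: "People's Republic of China",
    expected: "China",
  });
  console.log(result.score); // the numeric Score value (possibly null) from @braintrust/core
}

main().catch(console.error);
```

The newly exported `makePartial` appears to be the helper that attaches this `partial` method to an ordinary `Scorer` function, which is how the built-ins above get their `ScorerWithPartial` type, and the new `AutoevalMethod` entries in `Evaluators` pair each scorer with a human-readable description.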
package.json
{
"name": "autoevals",
-"version": "0.0.75",
+"version": "0.0.76",
"description": "Universal library for evaluating AI models",
@@ -46,3 +46,3 @@ "main": "./jsdist/index.js",
"dependencies": {
-"@braintrust/core": "0.0.44",
+"@braintrust/core": "0.0.45",
"ajv": "^8.13.0",
@@ -49,0 +49,0 @@ "compute-cosine-similarity": "^1.1.0",
README.md
@@ -11,3 +11,3 @@ # Autoevals
-Autoevals is developed by the team at [BrainTrust](https://braintrustdata.com/).
+Autoevals is developed by the team at [Braintrust](https://braintrust.dev/).
@@ -52,3 +52,3 @@ Autoevals uses model-graded evaluation for a variety of subjective tasks including fact checking,
-Once you grade an output using Autoevals, it's convenient to use [BrainTrust](https://www.braintrustdata.com/docs/libs/python) to log and compare your evaluation results.
+Once you grade an output using Autoevals, it's convenient to use [Braintrust](https://www.braintrust.dev/docs/libs/python) to log and compare your evaluation results.
@@ -94,3 +94,3 @@ Create a file named `example.eval.js` (it must end with `.eval.js` or `.eval.js`):
-### RAGAS
+### RAG
@@ -203,2 +203,2 @@ - Context precision
-The full docs are available [here](https://www.braintrustdata.com/docs/autoevals/overview).
+The full docs are available [here](https://www.braintrust.dev/docs/reference/autoevals).
Diffs of the package's remaining files are not shown: two are in a format the viewer does not support, and one is too large to display.
+ Added @braintrust/core@0.0.45 (transitive)
- Removed @braintrust/core@0.0.44 (transitive)
Updated @braintrust/core@0.0.45