autoevals
Advanced tools
Comparing version 0.0.57 to 0.0.58
@@ -177,5 +177,20 @@ import { ScorerArgs, Score, Scorer } from '@braintrust/core'; | ||
pairwiseScorer?: Scorer<string, {}>; | ||
allowExtraEntities?: boolean; | ||
}>; | ||
/** | ||
* A scorer that uses OpenAI's moderation API to determine if AI response contains ANY flagged content. | ||
* | ||
* @param args | ||
* @param args.threshold Optional. Threshold to use to determine whether content has exceeded threshold. By | ||
* default, it uses OpenAI's default. (Using `flagged` from the response payload.) | ||
* @param args.categories Optional. Specific categories to look for. If not set, all categories will | ||
* be considered. | ||
* @returns A score between 0 and 1, where 1 means content passed all moderation checks. | ||
*/ | ||
declare const Moderation: Scorer<string, { | ||
threshold?: number; | ||
} & OpenAIAuth>; | ||
/** | ||
* A simple scorer that compares numbers by normalizing their difference. | ||
@@ -199,2 +214,2 @@ */ | ||
export { Battle, ClosedQA, EmbeddingSimilarity, Evaluators, Factuality, Humor, JSONDiff, type LLMClassifierArgs, LLMClassifierFromSpec, LLMClassifierFromSpecFile, LLMClassifierFromTemplate, Levenshtein, LevenshteinScorer, ListContains, type ModelGradedSpec, NumericDiff, OpenAIClassifier, type OpenAIClassifierArgs, Possible, Security, Sql, Summary, Translation, buildClassificationFunctions, templates }; | ||
export { Battle, ClosedQA, EmbeddingSimilarity, Evaluators, Factuality, Humor, JSONDiff, type LLMArgs, type LLMClassifierArgs, LLMClassifierFromSpec, LLMClassifierFromSpecFile, LLMClassifierFromTemplate, Levenshtein, LevenshteinScorer, ListContains, type ModelGradedSpec, Moderation, NumericDiff, OpenAIClassifier, type OpenAIClassifierArgs, Possible, Security, Sql, Summary, Translation, buildClassificationFunctions, templates }; |
@@ -46,2 +46,3 @@ "use strict"; | ||
ListContains: () => ListContains, | ||
Moderation: () => Moderation, | ||
NumericDiff: () => NumericDiff, | ||
@@ -496,3 +497,3 @@ OpenAIClassifier: () => OpenAIClassifier, | ||
var ListContains = async (args) => { | ||
const { output, expected } = args; | ||
const { output, expected, allowExtraEntities } = args; | ||
if (expected === void 0) { | ||
@@ -542,3 +543,4 @@ throw new Error("ListContains requires an expected value"); | ||
).filter((pair) => pair !== null); | ||
const avgScore = pairs.reduce((acc, pair) => acc + pair.score, 0) / Math.max(output.length, expected.length); | ||
const denominator = allowExtraEntities ? expected.length : Math.max(output.length, expected.length); | ||
const avgScore = pairs.reduce((acc, pair) => acc + pair.score, 0) / denominator; | ||
return { | ||
@@ -553,2 +555,40 @@ name: "ListContains", | ||
// js/moderation.ts
var MODERATION_NAME = "Moderation";
/**
 * Map a single OpenAI moderation result onto a pass/fail score.
 *
 * With no threshold, defer to OpenAI's own `flagged` verdict.
 * With a threshold, fail (0) when any category score exceeds it,
 * otherwise pass (1).
 */
function computeScore(result, threshold) {
  if (threshold === void 0) {
    return result.flagged ? 0 : 1;
  }
  const exceeded = Object.values(result.category_scores).some(
    (score) => score > threshold
  );
  return exceeded ? 0 : 1;
}
/**
 * Scorer that runs `args.output` through OpenAI's moderation endpoint and
 * converts the first moderation result into a 0/1 score via `computeScore`.
 * An optional `args.threshold` replaces OpenAI's built-in `flagged` verdict
 * with a per-category score cutoff.
 */
var Moderation = async (args) => {
  // `??` mirrors the original `!= null` check: null and undefined both
  // collapse to undefined so computeScore takes the `flagged` path.
  const threshold = args.threshold ?? void 0;
  const client = buildOpenAIClient(args);
  const response = await client.moderations.create({
    input: args.output
  });
  const [result] = response.results;
  return {
    name: MODERATION_NAME,
    score: computeScore(result, threshold),
    metadata: {
      threshold,
      category_scores: result.category_scores || void 0
    }
  };
};
// Report a stable scorer name ("Moderation") rather than the bundler's
// variable name; `configurable: true` leaves the property overridable.
Object.defineProperty(Moderation, "name", {
  value: MODERATION_NAME,
  configurable: true
});
// js/number.ts | ||
@@ -677,2 +717,3 @@ var NumericDiff = (args) => { | ||
ListContains, | ||
Moderation, | ||
NumericDiff, | ||
@@ -679,0 +720,0 @@ OpenAIClassifier, |
{ | ||
"name": "autoevals", | ||
"version": "0.0.57", | ||
"version": "0.0.58", | ||
"description": "Universal library for evaluating AI models", | ||
@@ -52,4 +52,6 @@ "main": "./jsdist/index.js", | ||
"mustache": "^4.2.0", | ||
"openai": "4.23.0" | ||
"openai": "4.23.0", | ||
"zod": "^3.22.4", | ||
"zod-to-json-schema": "^3.22.5" | ||
} | ||
} |
@@ -85,2 +85,3 @@ # Autoevals | ||
- Factuality | ||
- Moderation | ||
- Security | ||
@@ -92,2 +93,18 @@ - Summarization | ||
### RAGAS | ||
- Context precision | ||
- Context relevancy | ||
- Context recall | ||
- Context entities recall | ||
- [ ] Faithfulness | ||
- [ ] Answer relevance | ||
- [ ] Answer semantic similarity | ||
- [ ] Answer correctness | ||
- [ ] Aspect critique | ||
### Composite | ||
- Semantic list contains | ||
### Embeddings | ||
@@ -94,0 +111,0 @@ |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
68782
1559
199
9
+ Added zod@^3.22.4
+ Added zod-to-json-schema@^3.22.5
+ Added zod-to-json-schema@3.24.1 (transitive)