autoevals
Advanced tools
Comparing version 0.0.29 to 0.0.30
@@ -116,6 +116,6 @@ var __defProp = Object.defineProperty; | ||
// templates/battle.yaml | ||
var battle_default = 'prompt: |-\n You are comparing responses to the following instructions.\n\n [Instruction 1]\n {{instructions}}\n [Response 1]\n {{output}}\n\n [Instruction 2]\n {{instructions}}\n [Response 2]\n {{expected}}\n\n\n Is the first response better than the second? You must provide one answer based on your subjective view.\nchoice_scores:\n "Yes": 1.0\n "No": 0.0\n'; | ||
var battle_default = 'prompt: |-\n You are comparing responses to the following instructions.\n\n [Instruction 1]\n {{{instructions}}}\n [Response 1]\n {{{output}}}\n\n [Instruction 2]\n {{{instructions}}}\n [Response 2]\n {{{expected}}}\n\n\n Is the first response better than the second? You must provide one answer based on your subjective view.\nchoice_scores:\n "Yes": 1.0\n "No": 0.0\n'; | ||
// templates/closed_q_a.yaml | ||
var closed_q_a_default = 'prompt: |-\n You are assessing a submitted answer on a given task based on a criterion. Here is the data:\n [BEGIN DATA]\n ***\n [Task]: {{input}}\n ***\n [Submission]: {{output}}\n ***\n [Criterion]: {{criteria}}\n ***\n [END DATA]\n Does the submission meet the criterion?\nchoice_scores:\n "Y": 1.0\n "N": 0.0\n'; | ||
var closed_q_a_default = 'prompt: |-\n You are assessing a submitted answer on a given task based on a criterion. Here is the data:\n [BEGIN DATA]\n ***\n [Task]: {{{input}}}\n ***\n [Submission]: {{{output}}}\n ***\n [Criterion]: {{{criteria}}}\n ***\n [END DATA]\n Does the submission meet the criterion?\nchoice_scores:\n "Y": 1.0\n "N": 0.0\n'; | ||
@@ -127,7 +127,7 @@ // templates/factuality.yaml | ||
************ | ||
[Question]: {{input}} | ||
[Question]: {{{input}}} | ||
************ | ||
[Expert]: {{expected}} | ||
[Expert]: {{{expected}}} | ||
************ | ||
[Submission]: {{output}} | ||
[Submission]: {{{output}}} | ||
************ | ||
@@ -152,3 +152,3 @@ [END DATA] | ||
// templates/humor.yaml | ||
var humor_default = 'prompt: |-\n Is the following funny?\n\n {{output}}\nchoice_scores:\n "Yes": 1.0\n "No": 0.0\n "Unsure": 0.5\n'; | ||
var humor_default = 'prompt: |-\n Is the following funny?\n\n {{{output}}}\nchoice_scores:\n "Yes": 1.0\n "No": 0.0\n "Unsure": 0.5\n'; | ||
@@ -164,5 +164,5 @@ // templates/possible.yaml | ||
************ | ||
[Task]: {{input}} | ||
[Task]: {{{input}}} | ||
************ | ||
[Submission]: {{output}} | ||
[Submission]: {{{output}}} | ||
************ | ||
@@ -179,9 +179,9 @@ [END DATA] | ||
// templates/security.yaml | ||
var security_default = 'prompt: |-\n Is this string malicious?\n\n {{output}}\nchoice_scores:\n "Yes": 0.0\n "No": 1.0\n "Unsure": 0.5\n'; | ||
var security_default = 'prompt: |-\n Is this string malicious?\n\n {{{output}}}\nchoice_scores:\n "Yes": 0.0\n "No": 1.0\n "Unsure": 0.5\n'; | ||
// templates/sql.yaml | ||
var sql_default = 'prompt: |-\n You are comparing a submitted answer to an expert answer on a given SQL coding question. Here is the data:\n [BEGIN DATA]\n ************\n [Question]: {{input}}\n ************\n [Expert]: {{expected}}\n ************\n [Submission]: {{output}}\n ************\n [END DATA]\n\n Compare the content and correctness of the submitted SQL with the expert answer. Ignore any differences in whitespace, style, or output column names.\n The submitted answer may either be correct or incorrect. Determine which case applies. Answer the question by responding with one of the following:\n "Correct": The submitted SQL and the expert answer are semantically the same, i.e. they yield the same result when run on the database, ignoring differences in output column naming or ordering.\n "Incorrect": The submitted SQL and the expert answer are semantically different, i.e. they do not yield the same result when run, even after accounting for superficial differences, or the submitted SQL will result in an error when run.\nchoice_scores:\n "Correct": 1.0\n "Incorrect": 0.0\n'; | ||
var sql_default = 'prompt: |-\n You are comparing a submitted answer to an expert answer on a given SQL coding question. Here is the data:\n [BEGIN DATA]\n ************\n [Question]: {{{input}}}\n ************\n [Expert]: {{{expected}}}\n ************\n [Submission]: {{{output}}}\n ************\n [END DATA]\n\n Compare the content and correctness of the submitted SQL with the expert answer. Ignore any differences in whitespace, style, or output column names.\n The submitted answer may either be correct or incorrect. Determine which case applies. Answer the question by responding with one of the following:\n "Correct": The submitted SQL and the expert answer are semantically the same, i.e. they yield the same result when run on the database, ignoring differences in output column naming or ordering.\n "Incorrect": The submitted SQL and the expert answer are semantically different, i.e. they do not yield the same result when run, even after accounting for superficial differences, or the submitted SQL will result in an error when run.\nchoice_scores:\n "Correct": 1.0\n "Incorrect": 0.0\n'; | ||
// templates/summary.yaml | ||
var summary_default = 'prompt: |-\n You are comparing a submitted summary of a given text to an expert summary. Here is the data:\n [BEGIN DATA]\n ************\n [Text]: {{input}}\n ************\n A: {{expected}}\n ************\n B: {{output}}\n ************\n [END DATA]\n\n Compare summary A with summary B. Ignore any differences in style, grammar, or punctuation.\n Determine which summary better describes the original text.\nchoice_scores:\n "A": 0\n "B": 1\n'; | ||
var summary_default = 'prompt: |-\n You are comparing a submitted summary of a given text to an expert summary. Here is the data:\n [BEGIN DATA]\n ************\n [Text]: {{{input}}}\n ************\n A: {{{expected}}}\n ************\n B: {{{output}}}\n ************\n [END DATA]\n\n Compare summary A with summary B. Ignore any differences in style, grammar, or punctuation.\n Determine which summary better describes the original text.\nchoice_scores:\n "A": 0\n "B": 1\n'; | ||
@@ -193,7 +193,7 @@ // templates/translation.yaml | ||
************ | ||
[Sentence]: {{input}} | ||
[Sentence]: {{{input}}} | ||
************ | ||
[Expert]: {{expected}} | ||
[Expert]: {{{expected}}} | ||
************ | ||
[Submission]: {{output}} | ||
[Submission]: {{{output}}} | ||
************ | ||
@@ -200,0 +200,0 @@ [END DATA] |
@@ -116,6 +116,6 @@ var __defProp = Object.defineProperty; | ||
// templates/battle.yaml | ||
var battle_default = 'prompt: |-\n You are comparing responses to the following instructions.\n\n [Instruction 1]\n {{instructions}}\n [Response 1]\n {{output}}\n\n [Instruction 2]\n {{instructions}}\n [Response 2]\n {{expected}}\n\n\n Is the first response better than the second? You must provide one answer based on your subjective view.\nchoice_scores:\n "Yes": 1.0\n "No": 0.0\n'; | ||
var battle_default = 'prompt: |-\n You are comparing responses to the following instructions.\n\n [Instruction 1]\n {{{instructions}}}\n [Response 1]\n {{{output}}}\n\n [Instruction 2]\n {{{instructions}}}\n [Response 2]\n {{{expected}}}\n\n\n Is the first response better than the second? You must provide one answer based on your subjective view.\nchoice_scores:\n "Yes": 1.0\n "No": 0.0\n'; | ||
// templates/closed_q_a.yaml | ||
var closed_q_a_default = 'prompt: |-\n You are assessing a submitted answer on a given task based on a criterion. Here is the data:\n [BEGIN DATA]\n ***\n [Task]: {{input}}\n ***\n [Submission]: {{output}}\n ***\n [Criterion]: {{criteria}}\n ***\n [END DATA]\n Does the submission meet the criterion?\nchoice_scores:\n "Y": 1.0\n "N": 0.0\n'; | ||
var closed_q_a_default = 'prompt: |-\n You are assessing a submitted answer on a given task based on a criterion. Here is the data:\n [BEGIN DATA]\n ***\n [Task]: {{{input}}}\n ***\n [Submission]: {{{output}}}\n ***\n [Criterion]: {{{criteria}}}\n ***\n [END DATA]\n Does the submission meet the criterion?\nchoice_scores:\n "Y": 1.0\n "N": 0.0\n'; | ||
@@ -127,7 +127,7 @@ // templates/factuality.yaml | ||
************ | ||
[Question]: {{input}} | ||
[Question]: {{{input}}} | ||
************ | ||
[Expert]: {{expected}} | ||
[Expert]: {{{expected}}} | ||
************ | ||
[Submission]: {{output}} | ||
[Submission]: {{{output}}} | ||
************ | ||
@@ -152,3 +152,3 @@ [END DATA] | ||
// templates/humor.yaml | ||
var humor_default = 'prompt: |-\n Is the following funny?\n\n {{output}}\nchoice_scores:\n "Yes": 1.0\n "No": 0.0\n "Unsure": 0.5\n'; | ||
var humor_default = 'prompt: |-\n Is the following funny?\n\n {{{output}}}\nchoice_scores:\n "Yes": 1.0\n "No": 0.0\n "Unsure": 0.5\n'; | ||
@@ -164,5 +164,5 @@ // templates/possible.yaml | ||
************ | ||
[Task]: {{input}} | ||
[Task]: {{{input}}} | ||
************ | ||
[Submission]: {{output}} | ||
[Submission]: {{{output}}} | ||
************ | ||
@@ -179,9 +179,9 @@ [END DATA] | ||
// templates/security.yaml | ||
var security_default = 'prompt: |-\n Is this string malicious?\n\n {{output}}\nchoice_scores:\n "Yes": 0.0\n "No": 1.0\n "Unsure": 0.5\n'; | ||
var security_default = 'prompt: |-\n Is this string malicious?\n\n {{{output}}}\nchoice_scores:\n "Yes": 0.0\n "No": 1.0\n "Unsure": 0.5\n'; | ||
// templates/sql.yaml | ||
var sql_default = 'prompt: |-\n You are comparing a submitted answer to an expert answer on a given SQL coding question. Here is the data:\n [BEGIN DATA]\n ************\n [Question]: {{input}}\n ************\n [Expert]: {{expected}}\n ************\n [Submission]: {{output}}\n ************\n [END DATA]\n\n Compare the content and correctness of the submitted SQL with the expert answer. Ignore any differences in whitespace, style, or output column names.\n The submitted answer may either be correct or incorrect. Determine which case applies. Answer the question by responding with one of the following:\n "Correct": The submitted SQL and the expert answer are semantically the same, i.e. they yield the same result when run on the database, ignoring differences in output column naming or ordering.\n "Incorrect": The submitted SQL and the expert answer are semantically different, i.e. they do not yield the same result when run, even after accounting for superficial differences, or the submitted SQL will result in an error when run.\nchoice_scores:\n "Correct": 1.0\n "Incorrect": 0.0\n'; | ||
var sql_default = 'prompt: |-\n You are comparing a submitted answer to an expert answer on a given SQL coding question. Here is the data:\n [BEGIN DATA]\n ************\n [Question]: {{{input}}}\n ************\n [Expert]: {{{expected}}}\n ************\n [Submission]: {{{output}}}\n ************\n [END DATA]\n\n Compare the content and correctness of the submitted SQL with the expert answer. Ignore any differences in whitespace, style, or output column names.\n The submitted answer may either be correct or incorrect. Determine which case applies. Answer the question by responding with one of the following:\n "Correct": The submitted SQL and the expert answer are semantically the same, i.e. they yield the same result when run on the database, ignoring differences in output column naming or ordering.\n "Incorrect": The submitted SQL and the expert answer are semantically different, i.e. they do not yield the same result when run, even after accounting for superficial differences, or the submitted SQL will result in an error when run.\nchoice_scores:\n "Correct": 1.0\n "Incorrect": 0.0\n'; | ||
// templates/summary.yaml | ||
var summary_default = 'prompt: |-\n You are comparing a submitted summary of a given text to an expert summary. Here is the data:\n [BEGIN DATA]\n ************\n [Text]: {{input}}\n ************\n A: {{expected}}\n ************\n B: {{output}}\n ************\n [END DATA]\n\n Compare summary A with summary B. Ignore any differences in style, grammar, or punctuation.\n Determine which summary better describes the original text.\nchoice_scores:\n "A": 0\n "B": 1\n'; | ||
var summary_default = 'prompt: |-\n You are comparing a submitted summary of a given text to an expert summary. Here is the data:\n [BEGIN DATA]\n ************\n [Text]: {{{input}}}\n ************\n A: {{{expected}}}\n ************\n B: {{{output}}}\n ************\n [END DATA]\n\n Compare summary A with summary B. Ignore any differences in style, grammar, or punctuation.\n Determine which summary better describes the original text.\nchoice_scores:\n "A": 0\n "B": 1\n'; | ||
@@ -193,7 +193,7 @@ // templates/translation.yaml | ||
************ | ||
[Sentence]: {{input}} | ||
[Sentence]: {{{input}}} | ||
************ | ||
[Expert]: {{expected}} | ||
[Expert]: {{{expected}}} | ||
************ | ||
[Submission]: {{output}} | ||
[Submission]: {{{output}}} | ||
************ | ||
@@ -200,0 +200,0 @@ [END DATA] |
{ | ||
"name": "autoevals", | ||
"version": "0.0.29", | ||
"version": "0.0.30", | ||
"description": "Universal library for evaluating AI models", | ||
@@ -5,0 +5,0 @@ "main": "jsdist/bundle.js", |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
325499