A robust, production-ready framework for statistically rigorous evaluation of language models, implementing the methodology described in "A Statistical Approach to Model Evaluations" (2024).
pip3 install -U evalops
import os
from dotenv import load_dotenv
from swarm_models import OpenAIChat
from swarms import Agent
from evalops import StatisticalModelEvaluator
load_dotenv()
# Get the OpenAI API key from the environment variable
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")
# Create instances of the OpenAIChat class with different models
model_gpt4 = OpenAIChat(
    openai_api_key=api_key, model_name="gpt-4o", temperature=0.1
)
model_gpt4o_mini = OpenAIChat(
    openai_api_key=api_key, model_name="gpt-4o-mini", temperature=0.1
)
# Initialize a general knowledge agent for each model
agent_gpt4 = Agent(
    agent_name="General-Knowledge-Agent-GPT4o",
    system_prompt="You are a helpful assistant that answers general knowledge questions accurately and concisely.",
    llm=model_gpt4,
    max_loops=1,
    dynamic_temperature_enabled=True,
    saved_state_path="general_agent_gpt4o.json",
    user_name="swarms_corp",
    context_length=200000,
    return_step_meta=False,
    output_type="string",
)
agent_gpt4o_mini = Agent(
    agent_name="General-Knowledge-Agent-GPT4o-mini",
    system_prompt="You are a helpful assistant that answers general knowledge questions accurately and concisely.",
    llm=model_gpt4o_mini,
    max_loops=1,
    dynamic_temperature_enabled=True,
    saved_state_path="general_agent_gpt4o_mini.json",
    user_name="swarms_corp",
    context_length=200000,
    return_step_meta=False,
    output_type="string",
)
evaluator = StatisticalModelEvaluator(cache_dir="./eval_cache")
# General knowledge test cases
general_questions = [
"What is the capital of France?",
"Who wrote Romeo and Juliet?",
"What is the largest planet in our solar system?",
"What is the chemical symbol for gold?",
"Who painted the Mona Lisa?",
]
general_answers = [
"Paris",
"William Shakespeare",
"Jupiter",
"Au",
"Leonardo da Vinci",
]
# Evaluate both agents on the general knowledge questions
result_gpt4 = evaluator.evaluate_model(
    model=agent_gpt4,
    questions=general_questions,
    correct_answers=general_answers,
    num_samples=5,
)
result_gpt4o_mini = evaluator.evaluate_model(
    model=agent_gpt4o_mini,
    questions=general_questions,
    correct_answers=general_answers,
    num_samples=5,
)
# Compare model performance
comparison = evaluator.compare_models(result_gpt4, result_gpt4o_mini)
# Print results
print(f"GPT-4o Mean Score: {result_gpt4.mean_score:.3f}")
print(f"GPT-4o-mini Mean Score: {result_gpt4o_mini.mean_score:.3f}")
print(f"Significant Difference: {comparison['significant_difference']}")
print(f"P-value: {comparison['p_value']:.3f}")
class MyLanguageModel:
    def run(self, task: str) -> str:
        # Your model implementation
        return "model response"
evaluator = StatisticalModelEvaluator(
    cache_dir="./eval_cache",
    log_level="INFO",
    random_seed=42
)
# Prepare your evaluation data
questions = ["Question 1", "Question 2", ...]
answers = ["Answer 1", "Answer 2", ...]
# Run evaluation
result = evaluator.evaluate_model(
    model=MyLanguageModel(),
    questions=questions,
    correct_answers=answers,
    num_samples=3,        # Number of times to sample each question
    batch_size=32,        # Batch size for parallel processing
    cache_key="model_v1"  # Optional caching key
)
# Access results
print(f"Mean Score: {result.mean_score:.3f}")
print(f"95% CI: [{result.ci_lower:.3f}, {result.ci_upper:.3f}]")
# For questions that are grouped (e.g., multiple questions about the same passage)
cluster_ids = ["passage1", "passage1", "passage2", "passage2", ...]
result = evaluator.evaluate_model(
    model=MyLanguageModel(),
    questions=questions,
    correct_answers=answers,
    cluster_ids=cluster_ids
)
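Passing cluster_ids lets the evaluator account for the fact that questions drawn from the same passage are correlated. As a rough illustration of the idea (not necessarily the library's exact implementation), a cluster-aware standard error can be computed by averaging scores within each cluster first and then taking the standard error across cluster means:
# Hedged sketch of a cluster-aware standard error of the mean; evalops'
# internals may differ, this only illustrates why cluster_ids matter.
from collections import defaultdict
from statistics import mean, stdev

def clustered_sem(scores, cluster_ids):
    by_cluster = defaultdict(list)
    for score, cid in zip(scores, cluster_ids):
        by_cluster[cid].append(score)
    cluster_means = [mean(v) for v in by_cluster.values()]
    return stdev(cluster_means) / (len(cluster_means) ** 0.5)

print(clustered_sem([1.0, 0.0, 1.0, 1.0], ["passage1", "passage1", "passage2", "passage2"]))  # 0.25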
# Evaluate two models
result_a = evaluator.evaluate_model(model=ModelA(), ...)
result_b = evaluator.evaluate_model(model=ModelB(), ...)
# Compare results
comparison = evaluator.compare_models(result_a, result_b)
print(f"Mean Difference: {comparison['mean_difference']:.3f}")
print(f"P-value: {comparison['p_value']:.4f}")
print(f"Significant Difference: {comparison['significant_difference']}")
required_samples = evaluator.calculate_required_samples(
    effect_size=0.05,       # Minimum difference to detect
    baseline_variance=0.1,  # Estimated variance in scores
    power=0.8,              # Desired statistical power
    alpha=0.05              # Significance level
)
print(f"Required number of samples: {required_samples}")
import os
from dotenv import load_dotenv
from swarm_models import OpenAIChat
from swarms import Agent
from evalops import StatisticalModelEvaluator
from evalops.huggingface_loader import EvalDatasetLoader
load_dotenv()
# Get the OpenAI API key from the environment variable
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")
# Create instance of OpenAIChat
model_gpt4 = OpenAIChat(
    openai_api_key=api_key, model_name="gpt-4o", temperature=0.1
)
# Initialize a general knowledge agent
agent = Agent(
    agent_name="General-Knowledge-Agent",
    system_prompt="You are a helpful assistant that answers general knowledge questions accurately and concisely.",
    llm=model_gpt4,
    max_loops=1,
    dynamic_temperature_enabled=True,
    saved_state_path="general_agent.json",
    user_name="swarms_corp",
    context_length=200000,
    return_step_meta=False,
    output_type="string",
)
evaluator = StatisticalModelEvaluator(cache_dir="./eval_cache")
# Initialize the dataset loader
eval_loader = EvalDatasetLoader(cache_dir="./eval_cache")
# Load a common evaluation dataset
questions, answers = eval_loader.load_dataset(
    dataset_name="truthful_qa",
    subset="multiple_choice",
    split="validation",
    answer_key="best_question",
)
# Use the loaded questions and answers with your evaluator
result_gpt4 = evaluator.evaluate_model(
    model=agent,
    questions=questions,
    correct_answers=answers,
    num_samples=5,
)
# Print results
print(result_gpt4)
eval is a simple function that wraps the evaluator class and makes it easy to use.
import os
from dotenv import load_dotenv
from swarm_models import OpenAIChat
from swarms import Agent
from evalops.wrapper import eval
load_dotenv()
# Get the OpenAI API key from the environment variable
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")
# Create instance of OpenAIChat
model_gpt4 = OpenAIChat(
    openai_api_key=api_key, model_name="gpt-4o", temperature=0.1
)
# Initialize a general knowledge agent
agent = Agent(
    agent_name="General-Knowledge-Agent",
    system_prompt="You are a helpful assistant that answers general knowledge questions accurately and concisely.",
    llm=model_gpt4,
    max_loops=1,
    dynamic_temperature_enabled=True,
    saved_state_path="general_agent.json",
    user_name="swarms_corp",
    context_length=200000,
    return_step_meta=False,
    output_type="string",
)
# General knowledge test cases
general_questions = [
"What is the capital of France?",
"Who wrote Romeo and Juliet?",
"What is the largest planet in our solar system?",
"What is the chemical symbol for gold?",
"Who painted the Mona Lisa?",
]
# Answers
general_answers = [
"Paris",
"William Shakespeare",
"Jupiter",
"Au",
"Leonardo da Vinci",
]
print(eval(
    questions=general_questions,
    answers=general_answers,
    agent=agent,
    samples=2,
))
| Parameter | Description | Default |
|---|---|---|
| cache_dir | Directory for caching results | None |
| log_level | Logging verbosity ("DEBUG", "INFO", etc.) | "INFO" |
| random_seed | Seed for reproducibility | None |
| batch_size | Batch size for parallel processing | 32 |
| num_samples | Samples per question | 1 |
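Taken together, the constructor-level and call-level options above combine roughly as follows. This is a sketch that reuses MyLanguageModel, questions, and answers from the earlier example; argument placement mirrors the snippets in this README:
# Sketch: constructor-level options go on StatisticalModelEvaluator,
# per-call options go on evaluate_model, as shown in the examples above.
from evalops import StatisticalModelEvaluator

evaluator = StatisticalModelEvaluator(
    cache_dir="./eval_cache",  # Directory for caching results (default: None)
    log_level="INFO",          # Logging verbosity (default: "INFO")
    random_seed=42,            # Seed for reproducibility (default: None)
)
result = evaluator.evaluate_model(
    model=MyLanguageModel(),
    questions=questions,
    correct_answers=answers,
    num_samples=1,   # Samples per question (default: 1)
    batch_size=32,   # Batch size for parallel processing (default: 32)
)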
@dataclass
class EvalResult:
    mean_score: float        # Average score across questions
    sem: float               # Standard error of the mean
    ci_lower: float          # Lower bound of 95% CI
    ci_upper: float          # Upper bound of 95% CI
    raw_scores: List[float]  # Individual question scores
    metadata: Dict           # Additional evaluation metadata
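The confidence bounds relate to the standard error in the usual way. Assuming a normal approximation for the 95% interval (the library may instead use a t-distribution or bootstrap), they are roughly:
# Rough relationship between the reported fields under a normal approximation;
# evalops' exact interval construction may differ.
approx_ci_lower = result.mean_score - 1.96 * result.sem
approx_ci_upper = result.mean_score + 1.96 * result.sem
print(f"Approximate 95% CI: [{approx_ci_lower:.3f}, {approx_ci_upper:.3f}]")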
{
    "mean_difference": float,       # Difference between means
    "correlation": float,           # Score correlation
    "t_statistic": float,           # T-test statistic
    "p_value": float,               # Statistical significance
    "significant_difference": bool  # True if p < 0.05
}
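The t_statistic and p_value fields suggest a paired comparison over per-question scores. The snippet below is a sketch of that kind of test with SciPy; the exact test compare_models runs is an assumption and may differ:
# Hedged sketch: a paired t-test over per-question scores is one way to produce
# the fields above; evalops' compare_models may use a different procedure.
from scipy import stats

res = stats.ttest_rel(result_a.raw_scores, result_b.raw_scores)
mean_a = sum(result_a.raw_scores) / len(result_a.raw_scores)
mean_b = sum(result_b.raw_scores) / len(result_b.raw_scores)
comparison_sketch = {
    "mean_difference": mean_a - mean_b,
    "t_statistic": res.statistic,
    "p_value": res.pvalue,
    "significant_difference": res.pvalue < 0.05,
}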
We welcome contributions! Please see our Contributing Guidelines for details.
git checkout -b feature/AmazingFeature
git commit -m 'Add some AmazingFeature'
git push origin feature/AmazingFeature
This project is licensed under the MIT License - see the LICENSE file for details.