# Prompt flow evaluators

## Introduction
Evaluators are custom or prebuilt promptflow flows that are designed to measure the quality of the outputs from language models.
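
For custom code-based evaluators, the contract is small: any callable that accepts the fields it needs and returns a dictionary of metric values can be used as an evaluator. A minimal sketch follows; the class name and metric key are illustrative, not part of the SDK:

```python
# Illustrative only: a custom evaluator is any callable that takes the
# inputs it needs as keyword arguments and returns a dict of metric values.
class AnswerLengthEvaluator:
    def __call__(self, *, answer: str, **kwargs):
        return {"answer_length": len(answer)}
```
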
## Usage
Users can create evaluator runs on the local machine as shown in the example below:
```python
import os
from pprint import pprint

from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import RelevanceEvaluator
from promptflow.evals.evaluators.content_safety import ViolenceEvaluator


# Custom code-based evaluator: any callable returning a dict of metrics.
def answer_length(answer, **kwargs):
    return {"value": len(answer)}


if __name__ == "__main__":
    # Initialize the Azure OpenAI connection used by the AI-assisted evaluators.
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        api_key=os.environ.get("AZURE_OPENAI_KEY"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    )

    # Run the built-in relevance evaluator on a single input row.
    relevance_eval = RelevanceEvaluator(model_config)
    relevance_score = relevance_eval(
        answer="The Alpine Explorer Tent is the most waterproof.",
        context="From our product list,"
        " the Alpine Explorer Tent is the most waterproof."
        " The Adventure Dining Table has higher weight.",
        question="Which tent is the most waterproof?",
    )
    pprint(relevance_score)

    # Content safety evaluators require an Azure AI project scope.
    project_scope = {
        "subscription_id": "e0fd569c-e34a-4249-8c24-e8d723c7f054",
        "resource_group_name": "rg-test",
        "project_name": "project-test",
    }
    violence_eval = ViolenceEvaluator(project_scope)
    violence_score = violence_eval(question="What is the capital of France?", answer="Paris.")
    pprint(violence_score)

    # Run the custom code-based evaluator on a single answer.
    pprint(answer_length("The Alpine Explorer Tent is the most waterproof."))

    # Combine multiple evaluators over a dataset with the evaluate API.
    result = evaluate(
        data="evaluate_test_data.jsonl",
        evaluators={
            "answer_length": answer_length,
            "violence": violence_eval,
        },
    )
    pprint(result)
```
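
The `data` argument points to a JSONL file in which each line is a JSON object whose fields match the evaluators' input names (`question`, `answer`, and so on). A hypothetical `evaluate_test_data.jsonl` for the evaluators above could be generated like this; the rows and field values are purely illustrative:

```python
import json

# Hypothetical contents of evaluate_test_data.jsonl; the rows are
# illustrative only. Field names must match the evaluators' inputs.
rows = [
    {
        "question": "Which tent is the most waterproof?",
        "answer": "The Alpine Explorer Tent is the most waterproof.",
    },
    {"question": "What is the capital of France?", "answer": "Paris."},
]

with open("evaluate_test_data.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```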