
create eval framework for praisonaiagents, mini... #967

@MervinPraison

Description


@claude

create eval framework for praisonaiagents, minimal code implementation on client side, keep it as minimal as possible

💡 Proposed PraisonAI Agents Implementation

1. Accuracy Evaluation

```python
from praisonaiagents import Agent
from praisonaiagents.eval import AccuracyEval

# Basic usage
agent = Agent(
    name="Analyst",
    role="Data Analyst",
    llm="gpt-4"
)

# Simple accuracy check
eval = AccuracyEval(
    agent=agent,
    input="What is the capital of France?",
    expected_output="Paris"
)

result = eval.run()
print(f"Accuracy: {result.score}/10")
```
Advanced accuracy evaluation:

```python
from praisonaiagents.eval import AccuracyEval, EvalCriteria

# Multi-criteria evaluation
eval = AccuracyEval(
    agent=agent,
    test_cases=[
        {
            "input": "Summarize the Q1 report",
            "expected_output": "Q1 showed 15% growth...",
            "weight": 2.0  # Higher importance
        },
        {
            "input": "What are the key risks?",
            "expected_output": "Supply chain, market volatility..."
        }
    ],
    criteria=EvalCriteria(
        factual_accuracy=0.4,  # 40% weight
        completeness=0.3,      # 30% weight
        relevance=0.3          # 30% weight
    ),
    evaluator_llm="gpt-4",   # Model used as the evaluator
    iterations=5,            # Repeat runs for statistical reliability
    save_results="eval_results.json"
)

# Run with detailed output
result = eval.run(verbose=True)

# Access statistics
print(f"Average: {result.avg_score:.2f}")
print(f"Std Dev: {result.std_dev:.2f}")
print(f"Confidence: {result.confidence_interval}")
```

2. Reliability Testing

```python
from praisonaiagents.eval import ReliabilityEval

# Test if agent uses expected tools
eval = ReliabilityEval(
    agent=agent,
    test_scenarios=[
        {
            "input": "Search weather and create report",
            "expected_tools": ["web_search", "create_file"],
            "required_order": True  # Tools must be called in order
        },
        {
            "input": "Analyze CSV data",
            "expected_tools": ["read_csv", "analyze_data"],
            "allow_additional": True  # Other tools allowed
        }
    ]
)

results = eval.run()
for scenario in results.scenarios:
    print(f"Scenario: {scenario.name} - {scenario.status}")
    if scenario.failed_tools:
        print(f"  Failed: {scenario.failed_tools}")
```

3. Performance Evaluation

```python
from praisonaiagents.eval import PerformanceEval

# Benchmark agent performance
eval = PerformanceEval(
    agent=agent,
    benchmark_queries=[
        "Simple question",
        "Complex analysis task",
        "Multi-step reasoning"
    ],
    metrics={
        "runtime": True,
        "memory": True,
        "tokens": True,  # Token usage tracking
        "ttft": True     # Time to first token
    },
    iterations=50,
    warmup=5
)

result = eval.run()

# Detailed performance report
result.print_report()
# Outputs a table with avg, p50, p95, p99 for each metric

# Compare agents
comparison = PerformanceEval.compare(
    agents=[agent1, agent2, agent3],
    benchmark_suite="standard",  # Predefined benchmarks
    export_format="html"         # Visual comparison report
)
```
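
A rough sketch of the timing half of such a benchmark using only the standard library; token and memory tracking would need provider hooks, so only runtime percentiles are shown, and `run_agent` is an assumed callable wrapper around the agent:

```python
# Hypothetical runtime benchmark loop behind PerformanceEval (sketch only).
import time
import statistics
from typing import Callable

def benchmark(run_agent: Callable[[str], str],
              queries: list[str],
              iterations: int = 50,
              warmup: int = 5) -> dict[str, float]:
    timings = []
    for i in range(warmup + iterations):
        for q in queries:
            start = time.perf_counter()
            run_agent(q)
            if i >= warmup:  # discard warmup passes
                timings.append(time.perf_counter() - start)
    q100 = statistics.quantiles(timings, n=100)  # 99 percentile cut points
    return {
        "avg": statistics.mean(timings),
        "p50": q100[49],
        "p95": q100[94],
        "p99": q100[98],
    }
```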

4. Automated Test Suite

```python
from praisonaiagents.eval import EvalSuite, TestCase

# Define comprehensive test suite
suite = EvalSuite(
    name="Agent Quality Assurance",
    agents=[agent],
    test_cases=[
        TestCase(
            name="Basic Math",
            input="What is 15 * 23?",
            expected_output="345",
            eval_type="accuracy",
            tags=["math", "simple"]
        ),
        TestCase(
            name="Tool Usage",
            input="Search and summarize AI news",
            expected_tools=["web_search", "summarize"],
            eval_type="reliability"
        ),
        TestCase(
            name="Performance Baseline",
            input="Standard benchmark query",
            max_runtime=2.0,  # seconds
            max_memory=100,   # MB
            eval_type="performance"
        )
    ],
    # Automation features
    schedule="0 2 * * *",  # Run daily at 2 AM
    alerts={
        "email": "team@example.com",
        "threshold": 0.8  # Alert if score < 80%
    },
    export_results="s3://bucket/eval-results/"
)

# Run full suite
results = suite.run()

# CI/CD integration
if not results.passed:
    raise EvalFailure(f"Quality gate failed: {results.summary}")

# Generate report
suite.generate_report(
    format="html",
    include_graphs=True,
    compare_with="last_week"
)
```
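
To make the pass/fail gate concrete, here is a minimal sketch of how suite results could be aggregated client-side; `TestResult`, `SuiteResult`, and their fields are illustrative assumptions rather than a final API:

```python
# Hypothetical result aggregation for the proposed EvalSuite quality gate.
from dataclasses import dataclass

@dataclass
class TestResult:
    name: str
    score: float   # normalised to 0-1
    passed: bool

@dataclass
class SuiteResult:
    results: list[TestResult]

    @property
    def passed(self) -> bool:
        return all(r.passed for r in self.results)

    @property
    def summary(self) -> str:
        avg = sum(r.score for r in self.results) / len(self.results)
        failed = [r.name for r in self.results if not r.passed]
        return f"avg={avg:.2f}, failed={failed or 'none'}"

gate = SuiteResult([TestResult("Basic Math", 0.95, True),
                    TestResult("Tool Usage", 0.70, False)])
print(gate.passed)   # -> False, so a CI job can fail the build
print(gate.summary)
```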

5. Integration with Existing PraisonAI Features

```python
from praisonaiagents import Agent, Process, Task
from praisonaiagents.eval import EvalSuite
from praisonaiagents.memory import Memory
from praisonaiagents.tools import Tools

# Evaluation-aware agent with memory
agent = Agent(
    name="EvalAgent",
    llm="gpt-4",
    memory=Memory(
        provider="rag",
        quality_threshold=0.8
    ),
    tools=Tools(["web_search", "calculator"]),
    # Built-in evaluation
    eval_config={
        "track_accuracy": True,
        "sample_rate": 0.1,  # Evaluate 10% of runs
        "baseline": "eval_baseline.json"
    }
)

# Process with automatic evaluation
process = Process(
    agents=[agent],
    tasks=[task1, task2],
    # Enable evaluation mode
    eval_mode=True,
    eval_criteria={
        "min_accuracy": 0.85,
        "max_runtime": 5.0
    }
)

# Run with evaluation
result = process.start()

# Access evaluation metrics
print(f"Process accuracy: {result.eval_metrics.accuracy}")
print(f"Task performances: {result.eval_metrics.task_times}")

# Save evaluation data for analysis
result.eval_metrics.export("process_eval.json")
```
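
A small sketch of what the `sample_rate` option could do internally: score only a random fraction of production runs so evaluation cost stays low. The helper name and signature are hypothetical:

```python
# Hypothetical sample_rate behaviour: evaluate ~10% of runs (sketch only).
import random
from typing import Callable, Optional

def maybe_evaluate(response: str,
                   score_fn: Callable[[str], float],
                   sample_rate: float = 0.1) -> Optional[float]:
    """Score roughly `sample_rate` of runs; return None for unsampled runs."""
    if random.random() < sample_rate:
        return score_fn(response)
    return None
```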
