Description
Create an eval framework for praisonaiagents with a minimal code implementation on the client side; keep it as minimal as possible.
## 💡 Proposed PraisonAI Agents Implementation
### 1. Accuracy Evaluation

```python
from praisonaiagents import Agent
from praisonaiagents.eval import AccuracyEval

# Basic usage
agent = Agent(
    name="Analyst",
    role="Data Analyst",
    llm="gpt-4"
)

# Simple accuracy check
eval = AccuracyEval(
    agent=agent,
    input="What is the capital of France?",
    expected_output="Paris"
)

result = eval.run()
print(f"Accuracy: {result.score}/10")
```
Advanced accuracy evaluation:

```python
from praisonaiagents.eval import AccuracyEval, EvalCriteria

# Multi-criteria evaluation
eval = AccuracyEval(
    agent=agent,
    test_cases=[
        {
            "input": "Summarize the Q1 report",
            "expected_output": "Q1 showed 15% growth...",
            "weight": 2.0  # Higher importance
        },
        {
            "input": "What are the key risks?",
            "expected_output": "Supply chain, market volatility..."
        }
    ],
    criteria=EvalCriteria(
        factual_accuracy=0.4,  # 40% weight
        completeness=0.3,      # 30% weight
        relevance=0.3          # 30% weight
    ),
    evaluator_llm="gpt-4",  # Model for evaluation
    iterations=5,           # Statistical reliability
    save_results="eval_results.json"
)

# Run with detailed output
result = eval.run(verbose=True)

# Access statistics
print(f"Average: {result.avg_score:.2f}")
print(f"Std Dev: {result.std_dev:.2f}")
print(f"Confidence: {result.confidence_interval}")
```
### 2. Reliability Testing

```python
from praisonaiagents.eval import ReliabilityEval

# Test if agent uses expected tools
eval = ReliabilityEval(
    agent=agent,
    test_scenarios=[
        {
            "input": "Search weather and create report",
            "expected_tools": ["web_search", "create_file"],
            "required_order": True  # Tools must be called in order
        },
        {
            "input": "Analyze CSV data",
            "expected_tools": ["read_csv", "analyze_data"],
            "allow_additional": True  # Other tools allowed
        }
    ]
)

results = eval.run()
for scenario in results.scenarios:
    print(f"Scenario: {scenario.name} - {scenario.status}")
    if scenario.failed_tools:
        print(f"  Failed: {scenario.failed_tools}")
```
### 3. Performance Evaluation

```python
from praisonaiagents.eval import PerformanceEval

# Benchmark agent performance
eval = PerformanceEval(
    agent=agent,
    benchmark_queries=[
        "Simple question",
        "Complex analysis task",
        "Multi-step reasoning"
    ],
    metrics={
        "runtime": True,
        "memory": True,
        "tokens": True,  # Token usage tracking
        "ttft": True     # Time to first token
    },
    iterations=50,
    warmup=5
)

result = eval.run()

# Detailed performance report
result.print_report()
# Outputs table with avg, p50, p95, p99 for each metric

# Compare agents
comparison = PerformanceEval.compare(
    agents=[agent1, agent2, agent3],
    benchmark_suite="standard",  # Predefined benchmarks
    export_format="html"         # Visual comparison report
)
```
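For the performance piece, a minimal client-side version only needs wall-clock timing plus percentile aggregation; memory, token, and TTFT tracking can be layered on later. A sketch, with `run_agent` again a hypothetical callable:

```python
import statistics
import time
from typing import Callable

def benchmark(run_agent: Callable[[str], str], queries: list[str],
              iterations: int = 50, warmup: int = 5) -> dict:
    """Measure runtime per call and report avg / p50 / p95 / p99 in seconds."""
    for _ in range(warmup):              # warmup runs are not recorded
        run_agent(queries[0])
    runtimes = []
    for _ in range(iterations):
        for q in queries:
            start = time.perf_counter()
            run_agent(q)
            runtimes.append(time.perf_counter() - start)
    # quantiles(n=100) returns the 1st..99th percentile cut points
    pct = statistics.quantiles(runtimes, n=100)
    return {
        "avg": statistics.mean(runtimes),
        "p50": statistics.median(runtimes),
        "p95": pct[94],
        "p99": pct[98],
    }
```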
### 4. Automated Test Suite

```python
from praisonaiagents.eval import EvalSuite, TestCase

# Define comprehensive test suite
suite = EvalSuite(
    name="Agent Quality Assurance",
    agents=[agent],
    test_cases=[
        TestCase(
            name="Basic Math",
            input="What is 15 * 23?",
            expected_output="345",
            eval_type="accuracy",
            tags=["math", "simple"]
        ),
        TestCase(
            name="Tool Usage",
            input="Search and summarize AI news",
            expected_tools=["web_search", "summarize"],
            eval_type="reliability"
        ),
        TestCase(
            name="Performance Baseline",
            input="Standard benchmark query",
            max_runtime=2.0,  # seconds
            max_memory=100,   # MB
            eval_type="performance"
        )
    ],
    # Automation features
    schedule="0 2 * * *",  # Run daily at 2 AM
    alerts={
        "email": "team@example.com",
        "threshold": 0.8   # Alert if score < 80%
    },
    export_results="s3://bucket/eval-results/"
)

# Run full suite
results = suite.run()

# CI/CD integration
if not results.passed:
    raise EvalFailure(f"Quality gate failed: {results.summary}")

# Generate report
suite.generate_report(
    format="html",
    include_graphs=True,
    compare_with="last_week"
)
```
### 5. Integration with Existing PraisonAI Features

```python
from praisonaiagents import Agent, Process, Task
from praisonaiagents.eval import EvalSuite
from praisonaiagents.memory import Memory
from praisonaiagents.tools import Tools

# Evaluation-aware agent with memory
agent = Agent(
    name="EvalAgent",
    llm="gpt-4",
    memory=Memory(
        provider="rag",
        quality_threshold=0.8
    ),
    tools=Tools(["web_search", "calculator"]),
    # Built-in evaluation
    eval_config={
        "track_accuracy": True,
        "sample_rate": 0.1,  # Evaluate 10% of runs
        "baseline": "eval_baseline.json"
    }
)

# Process with automatic evaluation
process = Process(
    agents=[agent],
    tasks=[task1, task2],
    # Enable evaluation mode
    eval_mode=True,
    eval_criteria={
        "min_accuracy": 0.85,
        "max_runtime": 5.0
    }
)

# Run with evaluation
result = process.start()

# Access evaluation metrics
print(f"Process accuracy: {result.eval_metrics.accuracy}")
print(f"Task performances: {result.eval_metrics.task_times}")

# Save evaluation data for analysis
result.eval_metrics.export("process_eval.json")
```
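For the `sample_rate` option in the integration section, the client-side hook can be a random gate around the normal agent call. Illustrative only; `run_agent` and `evaluate` are hypothetical stand-ins rather than existing Agent or Process APIs:

```python
import random
from typing import Callable, Optional

def run_with_sampled_eval(run_agent: Callable[[str], str],
                          evaluate: Callable[[str, str], float],
                          query: str,
                          sample_rate: float = 0.1) -> tuple[str, Optional[float]]:
    """Run the agent normally; score roughly `sample_rate` of calls with the evaluator."""
    answer = run_agent(query)
    score = None
    if random.random() < sample_rate:   # evaluate ~10% of runs by default
        score = evaluate(query, answer)
    return answer, score
```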