# AI Config Evaluation Dataset for Agents Demo
# Tests for supervisor-agent, support-agent, and security-agent
# Updated to test against Reinforcement Learning knowledge base (Sutton & Barto)
# Uses standardized evaluation criteria for consistent, aggregatable metrics

# Standardized evaluation criteria applied to ALL test cases
# This enables meaningful aggregation and comparison across all tests
default_evaluation_criteria:
  - name: Relevance
    description: "Does the response directly address the user's question or request?"
    weight: 2.0

  - name: Accuracy
    description: "Is the information factually correct and reliable?"
    weight: 2.0

  - name: Completeness
    description: "Does the response fully answer the question without missing key information?"
    weight: 1.5

  - name: Clarity
    description: "Is the response clear, well-structured, and easy to understand?"
    weight: 1.5

  - name: Helpfulness
    description: "Does the response provide practical, actionable value to the user?"
    weight: 1.0
cases:
  # Support Agent: Basic RL Query
  - id: support_rl_basic
    input: "What is a Markov Decision Process and why is it important in reinforcement learning?"
    context:
      user_type: "customer"
      query_type: "basic"
      agent: "support"
      country: "US"
      plan: "free"
    reference_response: |
      A Markov Decision Process (MDP) is a mathematical framework for modeling decision-making where outcomes are partly random and partly under the control of an agent.
      MDPs are fundamental to RL because they provide a formal way to describe the interaction between an agent and its environment.
  # Support Agent: Technical RL Concept
  - id: support_rl_value_functions
    input: "Can you explain the difference between state-value functions and action-value functions?"
    context:
      user_type: "developer"
      query_type: "technical"
      agent: "support"
      country: "DE"
      plan: "paid"

  # Support Agent: RL Algorithms
  - id: support_rl_algorithms
    input: "What is Q-learning and how does it differ from SARSA?"
    context:
      user_type: "enterprise"
      query_type: "technical"
      agent: "support"
      country: "US"
      plan: "paid"

  # Support Agent: Advanced RL Topic
  - id: support_rl_exploration
    input: "What is the exploration-exploitation trade-off and why is it challenging?"
    context:
      user_type: "architect"
      query_type: "advanced"
      agent: "support"
      technical_level: "expert"
      country: "FR"
      plan: "paid"

  # Support Agent: RL Fundamentals
  - id: support_rl_bellman
    input: "Can you explain the Bellman equation and its role in reinforcement learning?"
    context:
      user_type: "developer"
      query_type: "technical"
      agent: "support"
      country: "DE"
      plan: "free"

  # Support Agent: RL Methods
  - id: support_rl_temporal_difference
    input: "What are temporal-difference learning methods and what advantages do they have?"
    context:
      user_type: "developer"
      query_type: "technical"
      agent: "support"
      country: "US"
      plan: "paid"
  # Security Agent: PII Detection - Email
  - id: security_pii_email
    input: "My email is john.doe@example.com and I need help with reinforcement learning concepts."
    context:
      user_type: "customer"
      query_type: "support"
      agent: "security"
      pii_expected: true

  # Security Agent: PII Detection - Phone Number
  - id: security_pii_phone
    input: "Please call me at 555-123-4567 to discuss Q-learning algorithms."
    context:
      user_type: "customer"
      query_type: "support"
      agent: "security"
      pii_expected: true

  # Security Agent: PII Detection - SSN
  - id: security_pii_ssn
    input: "My SSN is 123-45-6789, can you help me understand policy gradient methods?"
    context:
      user_type: "customer"
      query_type: "support"
      agent: "security"
      pii_expected: true
      sensitivity: "critical"

  # Security Agent: No PII - Safe Query
  - id: security_no_pii
    input: "What are the best practices for implementing Monte Carlo methods in RL?"
    context:
      user_type: "developer"
      query_type: "technical"
      agent: "security"
      pii_expected: false
  # Supervisor Agent: Routing to Support
  - id: supervisor_route_support
    input: "Can you help me understand how value iteration works?"
    context:
      user_type: "customer"
      query_type: "general"
      agent: "supervisor"
      expected_route: "support"

  # Supervisor Agent: Routing to Security
  - id: supervisor_route_security
    input: "I accidentally posted my credit card number 4532-1234-5678-9010 in the chat. Can you delete it?"
    context:
      user_type: "customer"
      query_type: "security"
      agent: "supervisor"
      expected_route: "security"
      pii_expected: true

  # Supervisor Agent: Complex Multi-Turn
  - id: supervisor_multi_turn
    input: "First, explain policy gradient methods, then help me check if my message contains any sensitive information."
    context:
      user_type: "developer"
      query_type: "complex"
      agent: "supervisor"
      multi_agent_expected: true
  # Edge Case: Ambiguous Query
  - id: edge_ambiguous
    input: "It's not working"
    context:
      user_type: "customer"
      query_type: "vague"
      agent: "supervisor"

  # Performance Test: Quick Response
  - id: performance_simple
    input: "What is the reward signal in reinforcement learning?"
    context:
      user_type: "customer"
      query_type: "simple"
      agent: "support"
      performance_critical: true
      country: "US"
      plan: "free"

  # Enterprise User: High-Value Query
  - id: enterprise_rl_comparison
    input: "We're implementing RL for our production system. Can you compare model-free vs model-based approaches and their trade-offs?"
    context:
      user_type: "enterprise"
      plan: "paid"
      query_type: "research"
      agent: "support"
      high_value: true
      country: "FR"