@@ -209,8 +209,10 @@ def _run_tracked_task(
209209 if model :
210210 agent_kwargs ["llm" ] = model
211211
212- if not verbose :
213- agent_kwargs ["output" ] = "silent"
212+ if verbose :
213+ pass # Use default full output
214+ else :
215+ agent_kwargs ["output" ] = "status" # Real-time status like Agent(output="status")
214216
215217 agent = Agent (** agent_kwargs )
216218
@@ -419,10 +421,28 @@ def _print_summary(result: TrackerResult) -> None:
419421 console .print (f" • { gap } " )
420422
421423
422- @app .callback (invoke_without_command = True )
423- def tracker_main (
424- ctx : typer .Context ,
425- task : Optional [str ] = typer .Argument (None , help = "Task for the agent to complete" ),
@app.callback()
def tracker_main(ctx: typer.Context):
    """Autonomous agent tracking with step-by-step analysis.

    Run tasks with full execution tracing and optional quality judging.

    Commands:

        praisonai tracker run "Search for Python best practices"

        praisonai tracker judge "What is 2+2?" --expected "4"

        praisonai tracker tools

        praisonai tracker batch tasks.json
    """
    # Intentionally empty: without invoke_without_command, typer prints the
    # group help when no subcommand is given.  The docstring is a valid
    # function body, so the original trailing `pass` was dead code and is
    # removed.  `ctx` stays in the signature for interface compatibility.
441+
442+
443+ @app .command (name = "run" )
444+ def tracker_run (
445+ task : str = typer .Argument (..., help = "Task for the agent to complete" ),
426446 max_iterations : int = typer .Option (20 , "--max-iterations" , "-n" , help = "Maximum iterations (default: 20)" ),
427447 model : Optional [str ] = typer .Option (None , "--model" , "-m" , help = "LLM model to use" ),
428448 tools : Optional [str ] = typer .Option (None , "--tools" , "-t" , help = "Comma-separated tool names to use" ),
@@ -432,31 +452,32 @@ def tracker_main(
432452):
433453 """Run an agent with step-by-step tracking.
434454
435- The tracker runs an agent in autonomous mode and records every step,
436- tool call, and decision. At the end, it displays a summary table
437- showing all steps taken and any gaps identified.
438-
439455 Examples:
440456
441- praisonai tracker "Search for Python best practices and summarize"
457+ praisonai tracker run "Search for Python best practices and summarize"
442458
443- praisonai tracker "Read config.yaml and explain its structure" -v
459+ praisonai tracker run "Read config.yaml and explain its structure" -v
444460
445- praisonai tracker "Find trending AI news" --tools search_web,web_crawl
461+ praisonai tracker run "Find trending AI news" --tools search_web,web_crawl
446462 """
447- if ctx .invoked_subcommand is not None :
448- return
449-
450- if not task :
451- typer .echo (ctx .get_help ())
452- return
453-
463+ _run_and_display (task , max_iterations , model , tools , extended , verbose , live )
464+
465+
466+ def _run_and_display (
467+ task : str ,
468+ max_iterations : int = 20 ,
469+ model : Optional [str ] = None ,
470+ tools : Optional [str ] = None ,
471+ extended : bool = False ,
472+ verbose : bool = False ,
473+ live : bool = True ,
474+ ) -> TrackerResult :
475+ """Shared logic: run a tracked task and display results."""
454476 # Resolve tools
455477 tool_names = AUTONOMY_DEFAULT_TOOLS .copy ()
456478 if extended :
457479 tool_names .extend (EXTENDED_TOOLS )
458480 if tools :
459- # Override with user-specified tools
460481 tool_names = [t .strip () for t in tools .split ("," )]
461482
462483 resolved_tools = _get_tools (tool_names )
@@ -488,6 +509,8 @@ def step_callback(step: TrackedStep):
488509 _print_step_table (result .steps )
489510 console .print ("\n " )
490511 _print_summary (result )
512+
513+ return result
491514
492515
493516@app .command (name = "batch" )
@@ -617,3 +640,171 @@ def tracker_tools():
617640
618641 console .print ("\n [dim]Use --tools to specify custom tools, e.g.:[/dim]" )
619642 console .print ("[dim] praisonai tracker 'task' --tools search_web,read_file[/dim]" )
643+
644+
# ============================================================================
# JUDGE FEATURE
# ============================================================================

# Fallback evaluation rubric for `tracker judge`: used whenever the user
# does not pass --criteria (see tracker_judge, `criteria or DEFAULT_JUDGE_CRITERIA`).
# The text is sent verbatim to the LLM judge alongside the execution trace.
DEFAULT_JUDGE_CRITERIA = """Evaluate this autonomous agent execution trace:
1. Task Completion: Did the agent fully complete the assigned task?
2. Tool Selection: Were appropriate tools chosen for each step?
3. Efficiency: Was the task completed with minimal unnecessary steps?
4. Error Handling: Were errors handled gracefully without crashing?
5. Output Quality: Is the final output accurate and useful?"""
655+
656+
def _format_trace_for_judge(result: TrackerResult) -> str:
    """Render a TrackerResult as a plain-text execution trace for the judge LLM.

    The trace lists the task, completion status, timing/tool totals, every
    recorded step (with input/output summaries and any error), and the
    identified gaps, joined into a single newline-separated string.
    """
    status = "success" if result.success else "failed"
    tools_used = ", ".join(result.tools_used) if result.tools_used else "None"

    parts = [
        f"Task: \"{result.task}\"",
        f"Completion: {status} (reason: {result.completion_reason})",
        f"Duration: {result.total_duration:.1f}s | Steps: {result.total_steps} | Tools Used: {tools_used}",
        "",
    ]

    for step in result.steps:
        outcome = "✅" if step.success else "❌"
        parts.append(
            f"Step {step.step_number}: [{step.action_type}] {step.action_name} "
            f"({step.duration_seconds:.1f}s) {outcome}"
        )
        parts.append(f"  Input: {step.input_summary}")
        parts.append(f"  Output: {step.output_summary}")
        if step.error:
            parts.append(f"  Error: {step.error}")
        parts.append("")

    parts.append(
        f"Gaps: {'; '.join(result.gaps_identified)}"
        if result.gaps_identified
        else "Gaps: None"
    )

    return "\n".join(parts)
681+
682+
def _print_judge_verdict(judge_result, threshold: float) -> None:
    """Print the judge verdict with rich formatting.

    Args:
        judge_result: Object exposing optional ``score``, ``reasoning`` and
            ``suggestions`` attributes; missing or falsy attributes fall back
            to ``0`` / ``''`` / ``[]``.
        threshold: Minimum score (inclusive) that counts as a PASS.
    """
    score = getattr(judge_result, 'score', 0) or 0
    passed = score >= threshold
    reasoning = getattr(judge_result, 'reasoning', '') or ''
    suggestions = getattr(judge_result, 'suggestions', []) or []

    # Score bar.  Clamp the fill to [0, bar_len]: an out-of-range score
    # (< 0 or > 10) would otherwise yield a negative repeat count or an
    # overlong bar.
    bar_len = 20
    filled = max(0, min(bar_len, int(score / 10 * bar_len)))
    bar = "█" * filled + "░" * (bar_len - filled)

    color = "green" if passed else ("yellow" if score >= 5 else "red")
    icon = "✅" if passed else "❌"

    console.print(Panel(
        f"""
[bold]{icon} Score: [{color}]{score:.1f}/10[/{color}][/bold] [{color}]{bar}[/{color}]
[bold]Threshold:[/bold] {threshold} | [bold]Verdict:[/bold] [{'green' if passed else 'red'}]{'PASS' if passed else 'FAIL'}[/{'green' if passed else 'red'}]

[bold]Reasoning:[/bold]
{reasoning}
""",
        title="⚖️ Judge Verdict",
        border_style=color,
    ))

    if suggestions:
        console.print("[bold yellow]💡 Suggestions:[/bold yellow]")
        for s in suggestions:
            console.print(f"  • {s}")
714+
715+
@app.command(name="judge")
def tracker_judge(
    task: str = typer.Argument(..., help="Task to execute and judge"),
    criteria: Optional[str] = typer.Option(None, "--criteria", "-c", help="Custom evaluation criteria"),
    expected: Optional[str] = typer.Option(None, "--expected", "-e", help="Expected output for accuracy evaluation"),
    threshold: float = typer.Option(7.0, "--threshold", help="Pass/fail score threshold (1-10)"),
    max_iterations: int = typer.Option(20, "--max-iterations", "-n", help="Maximum iterations (default: 20)"),
    model: Optional[str] = typer.Option(None, "--model", "-m", help="LLM model to use"),
    judge_model: Optional[str] = typer.Option(None, "--judge-model", help="LLM model for judge (default: same as agent)"),
    tools: Optional[str] = typer.Option(None, "--tools", "-t", help="Comma-separated tool names to use"),
    extended: bool = typer.Option(False, "--extended", help="Include extended tools (may require API keys)"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show verbose output"),
):
    """Run a task and judge the execution quality.

    Executes the task with step tracking, then evaluates the execution
    trace using an LLM judge. Reports a score (1-10), pass/fail verdict,
    reasoning, and improvement suggestions.

    Examples:

        praisonai tracker judge "Calculate fibonacci(10) using execute_code"

        praisonai tracker judge "Search for AI news" --criteria "Must use search_web"

        praisonai tracker judge "What is 2+2?" --expected "4" --threshold 8.0
    """
    # Resolve tools: defaults, optionally plus the extended set; an explicit
    # --tools list overrides both.
    tool_names = AUTONOMY_DEFAULT_TOOLS.copy()
    if extended:
        tool_names.extend(EXTENDED_TOOLS)
    if tools:
        tool_names = [t.strip() for t in tools.split(",")]

    resolved_tools = _get_tools(tool_names)

    # Plain strings below: the originals carried f-prefixes with no
    # placeholders (ruff F541), which are dropped here.
    console.print("\n[bold cyan]⚖️ Agent Tracker + Judge[/bold cyan]")
    console.print(f"[dim]Task: {_summarize_text(task, 70)}[/dim]")
    console.print(f"[dim]Tools: {len(resolved_tools)} loaded | Threshold: {threshold}[/dim]\n")

    # Phase 1: run the task, echoing each tracked step as it completes.
    console.print("[bold]Phase 1: Executing task...[/bold]\n")

    def step_callback(step: TrackedStep):
        # One-line progress entry per step.
        status = "✅" if step.success else "❌"
        console.print(f"  [{step.step_number}] {status} {step.action_type}: {step.action_name} ({step.duration_seconds:.2f}s)")

    result = _run_tracked_task(
        task=task,
        tools=resolved_tools,
        max_iterations=max_iterations,
        model=model,
        verbose=verbose,
        step_callback=step_callback,
    )

    # Print step table + summary.
    console.print("\n")
    _print_step_table(result.steps)
    console.print("\n")
    _print_summary(result)

    # Phase 2: score the recorded trace with an LLM judge.
    console.print("\n[bold]Phase 2: Judging execution...[/bold]\n")

    try:
        from praisonaiagents.eval import Judge

        trace_text = _format_trace_for_judge(result)

        # Judge model preference: explicit --judge-model, else the agent's
        # model, else whatever Judge defaults to.
        judge_kwargs = {}
        if judge_model:
            judge_kwargs["model"] = judge_model
        elif model:
            judge_kwargs["model"] = model

        judge = Judge(threshold=threshold, **judge_kwargs)

        judge_result = judge.run(
            output=trace_text,
            criteria=criteria or DEFAULT_JUDGE_CRITERIA,
            expected=expected,
            input=task,
        )

        _print_judge_verdict(judge_result, threshold)

    except ImportError:
        console.print("[red]Error: praisonaiagents.eval not available[/red]")
        console.print("[dim]Install with: pip install praisonaiagents[/dim]")
    except Exception as e:
        # CLI boundary: surface any judge failure without crashing the command.
        console.print(f"[red]Judge error: {e}[/red]")
810+
0 commit comments