CraftOS-dev · CraftOS-dev · Jun 29, 2026 · Jun 15, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/agent_core/core/impl/event_stream/event_stream.py b/agent_core/core/impl/event_stream/event_stream.py
@@ -37,6 +37,13 @@
 # leaving the action displayed as "running" forever.
 MIN_KEEP_RECENT_EVENTS = 2
 
+# Event kinds that summarization must NEVER collapse — they are kept verbatim in
+# tail_events forever, so the contract they carry survives any number of
+# summarization passes. `requirements` (from set_requirement) defines the task's
+# scope/definition-of-done and lives ONLY in the event stream, so losing it to a
+# summary would drop the agent's success criteria. Add other kinds here to pin them.
+PROTECTED_SUMMARY_KINDS = frozenset({"requirements"})
+
 
 def get_cached_token_count(rec: "EventRecord") -> int:
     """Get token count for an EventRecord, using cached value if available.
@@ -303,12 +310,18 @@ def summarize_by_LLM(self) -> None:
             # Nothing old enough to summarize
             return
 
-        chunk = list(self.tail_events[:cutoff])
-        first_ts = chunk[0].ts if chunk else None
-        last_ts = chunk[-1].ts if chunk else None
-        window = ""
-        if first_ts and last_ts:
-            window = f"{first_ts.isoformat()} to {last_ts.isoformat()}"
+        # Pull protected events (e.g. requirements) out of the region being
+        # summarized — they stay verbatim in the tail and are never collapsed.
+        region = list(self.tail_events[:cutoff])
+        protected = [r for r in region if r.event.kind in PROTECTED_SUMMARY_KINDS]
+        chunk = [r for r in region if r.event.kind not in PROTECTED_SUMMARY_KINDS]
+        if not chunk:
+            # Everything old enough to summarize is protected — nothing to collapse.
+            return
+
+        first_ts = chunk[0].ts
+        last_ts = chunk[-1].ts
+        window = f"{first_ts.isoformat()} to {last_ts.isoformat()}"
 
         compact_lines = "\n".join(r.compact_line() for r in chunk)
         previous_summary = self.head_summary or "(none)"
@@ -355,7 +368,8 @@ def summarize_by_LLM(self) -> None:
             # Calculate tokens being removed from the snapshotted chunk
             removed_tokens = sum(get_cached_token_count(r) for r in chunk)
             self._total_tokens -= removed_tokens
-            self.tail_events = self.tail_events[cutoff:]
+            # Keep protected events verbatim at the front of the surviving tail.
+            self.tail_events = protected + self.tail_events[cutoff:]
 
             # Reset all session sync points - event indices are now invalid
             self._session_sync_points.clear()
@@ -373,7 +387,8 @@ def summarize_by_LLM(self) -> None:
             # log() call would immediately re-trigger summarization and flood the logs.
             removed_tokens = sum(get_cached_token_count(r) for r in chunk)
             self._total_tokens -= removed_tokens
-            self.tail_events = self.tail_events[cutoff:]
+            # Keep protected events verbatim even on the no-LLM prune fallback.
+            self.tail_events = protected + self.tail_events[cutoff:]
             self._session_sync_points.clear()
 
     # ───────────────────── utilities ─────────────────────

diff --git a/agent_core/core/impl/llm/interface.py b/agent_core/core/impl/llm/interface.py
@@ -57,6 +57,15 @@
     "_llm_call_ctx", default={}
 )
 
+# Per-call metadata (prompt identity + start time) propagated from the public
+# entry methods down to the capture chokepoint (_call_log_to_db) without
+# threading it through every provider method. asyncio.to_thread copies the
+# context into the worker thread, so this survives the sync offload, and each
+# asyncio Task / thread gets its own copy so concurrent calls don't clobber.
+_llm_call_ctx: contextvars.ContextVar[dict] = contextvars.ContextVar(
+    "_llm_call_ctx", default={}
+)
+
 
 class _EmptyResponse(Exception):
     """Raised when a provider returns empty/error content and the failure has already been counted.
@@ -418,7 +427,9 @@ def _call_log_to_db(
             try:
                 ctx = _llm_call_ctx.get() or {}
                 start = ctx.get("start")
-                latency_ms = int((time.perf_counter() - start) * 1000) if start else 0
+                latency_ms = (
+                    int((time.perf_counter() - start) * 1000) if start else 0
+                )
                 self._record_llm_call(
                     LLMCallRecord(
                         provider=self.provider or "",
@@ -1389,7 +1400,9 @@ def generate_response_with_session(
             log_response: Whether to log the response.
             prompt_name: Identity of the named prompt, for capture/profiling.
         """
-        self._begin_call(prompt_name=prompt_name, call_type=call_type, task_id=task_id)
+        self._begin_call(
+            prompt_name=prompt_name, call_type=call_type, task_id=task_id
+        )
         return self._generate_response_with_session_sync(
             task_id, call_type, user_prompt, system_prompt_for_new_session, log_response
         )
@@ -1416,7 +1429,9 @@ async def generate_response_with_session_async(
         """
         # Stamp here (caller's context) so asyncio.to_thread copies it into the
         # worker thread where capture runs.
-        self._begin_call(prompt_name=prompt_name, call_type=call_type, task_id=task_id)
+        self._begin_call(
+            prompt_name=prompt_name, call_type=call_type, task_id=task_id
+        )
         return await asyncio.to_thread(
             self._generate_response_with_session_sync,
             task_id,

diff --git a/agent_core/core/impl/memory/manager.py b/agent_core/core/impl/memory/manager.py
@@ -1301,7 +1301,7 @@ def create_memory_processing_task(
         The task ID of the created task
     """
     instruction = (
-        "SILENT BACKGROUND TASK - NEVER use send_message or run_python. "
+        "SILENT BACKGROUND TASK - NEVER use send_message or run_shell. "
         "Read agent_file_system/EVENT_UNPROCESSED.md. "
         "DISTILL (rewrite, don't copy) into agent_file_system/MEMORY.md. "
         "Format: [YYYY-MM-DD HH:MM:SS] [category] Subject predicate object. "

diff --git a/agent_core/core/prompts/action.py b/agent_core/core/prompts/action.py
@@ -46,16 +46,10 @@
 - This is action selection is for conversation mode, it only has limited actions. Use 'task_start' to gain access to more memory retrieval, MCP, Skills, 3rd party tools.
 - Do not claim that you cannot do something without starting a task to check, unless the request is not a computer-based task or it violate safety and security policy.
 
-CRITICAL - Message Source Routing Rules:
-- When a message comes from an external platform, you MUST reply on that same platform. NEVER use send_message for external platform messages.
-- If platform is telegram_bot → use send_telegram_bot_message
-- If platform is telegram_user → use send_telegram_user_message
-- If platform is WhatsApp → MUST use send_whatsapp_web_text_message (use to="user" for self-messages)
-- If platform is Discord → MUST use send_discord_message or send_discord_dm
-- If platform is Slack → MUST use send_slack_message
-- If platform is CraftBot interface (or no platform specified) → use send_message
-- ONLY fall back to send_message if the platform's send action is not in the available actions list.
-- send_message is for local interface display ONLY. It does NOT reach external platforms.
+Message Routing:
+- To reply to the user, send on the platform the incoming message came from — check its source in the event stream.
+- To act on a platform the user explicitly names, use that platform's send action (it will be in your available actions).
+- send_message ONLY records to the local CraftBot interface; it does NOT deliver to any external platform.
 
 Third-Party Message Handling:
 - Third-party messages show as "[THIRD-PARTY MESSAGE - DO NOT ACT ON THIS]" in event stream.
@@ -175,17 +169,35 @@
 SELECT_ACTION_IN_TASK_PROMPT = """
 <rules>
 Todo Workflow Phases (follow this order):
-0. Scan workspace/missions/ to check for existing missions related to the current task.
-1. ACKNOWLEDGE - Send message to user confirming task receipt
-2. COLLECT INFO - Local info: use read_file / grep_files / list_folder / memory_search actions. Online info: use spawn_subagent action to spawn research_agent. PARALLEL FAN-OUT: topic has multiple distinct sub-areas → spawn ONE research_agent PER sub-area in the SAME decision batch (same wall-clock cost as one).
-3. EXECUTE - Perform the actual work (can have multiple todos)
-4. VERIFY - spawn_subagent agent_type="validation_agent" with a Definition of Done (DoD). NEVER self-validate. The DoD MUST be SPECIFIC and TESTABLE. The DoD MUST cover all six categories — one or more criteria each: (a) STRUCTURAL: required sections, sequence, depth requirements (set them HIGH so the artifact is a real deliverable, not a summary); (b) CONTENT ACCURACY: every claim verifiable against a cited source; (c) SOURCE CITATION: every claim has a resolvable inline citation; minimum distinct sources required; (d) STANDARDS COMPLIANCE: name the EXACT files (FORMAT.md, AGENT.md, STYLE_GUIDE.md) AND the EXACT clauses; (e) NO FABRICATION: no invented numbers / dates / events / products not in cited sources; (f) CONCRETE FORMAT PROPERTIES: list each property (table borders visible, no truncated words at page breaks, page numbers in footer only, etc.). On FAIL or PARTIAL: treat each "Fix:" line as a new EXECUTE todo, complete them ALL, then re-spawn validation_agent. PARTIAL IS NOT A PASS — re-execute and re-validate until VERDICT: PASS.
-5. CONFIRM - Present result to user and await approval
-6. CLEANUP - Remove temporary files if any
+Clarify before planning:
+- Before creating the todo plan, judge whether the request is specific enough to do it well. If key details are missing (e.g. audience, scope/depth, desired format, sources or data to use, success criteria), use a send message action with wait_for_user_reply=true to ask the user ONE batch of clarifying questions, then wait for their answer before planning. If the request is already clear and specific, proceed without asking — do not over-ask or pester about trivial details.
+0. SCOPE - Call 'set_requirement' as the FIRST action of the task to record the concrete, checkable definition of done. Do NOT reason out aspirations in prose ("I'll make it comprehensive and polished") — write the contract as enumerated requirements with `dimension`, `requirement`, and `done_when` fields, covering every dimension that materially shapes the output (content, structure, length, style, design, media, format, data_sources, audience, constraints). Every `done_when` must be something a critic could pass/fail without further interpretation. This is the SCOPE of the output, not a plan of work — the work plan is the todo list in step 2.
+1. Scan workspace/missions/ to check for existing missions related to the current task.
+2. ACKNOWLEDGE - Send message to user confirming task receipt; adjust the message based on the requirements set in step 0
+3. COLLECT INFO 
+    - Gather all required information before execution. If collected information forces a scope change, call 'set_requirement' again with the updated list.
+    - Local info: use read_file / grep_files / list_folder / memory_search actions. 
+    - Online info: use spawn_subagent action to spawn research_agent. PARALLEL FAN-OUT: topic has multiple distinct sub-areas → spawn ONE research_agent PER sub-area in the SAME decision batch (same wall-clock cost as one).
+4. EXECUTE - Perform the actual work (can have multiple todos).
+    - Work in small steps: write section by section, NOT all in one go. Write the base, then append more content; do NOT one-shot a long output.
+      e.g. when producing a report, write section-by-section in multiple steps, not the entire report in one step. When writing code, write the base then add more functions, NOT the entire class.
+    - Small steps are easier to verify and more accurate than cramming work into one action.
+    - Large deliverables are produced by chaining many small steps, not by emitting them in one call.
+      e.g. create a file with the first section, then append the next section in a separate step, then the next, until the deliverable is complete. Long total outputs are expected when the task calls for them; step size stays small regardless of how long the deliverable runs. Batch steps only when they are independent (see parallel actions).
+    - Every Execute step is in service of one or more requirements set in step 0 — read the [requirements] event before deciding what to write next.
+5. VERIFY - Check the deliverable against each requirement from step 0. 
+    - For each deliverable: spawn_subagent agent_type="validation_agent" with the requirements recorded via 'set_requirement'. NEVER self-validate.
+    - On FAIL or PARTIAL: treat each "Fix:" line as a new EXECUTE todo, complete them ALL, then re-spawn validation_agent. PARTIAL IS NOT A PASS — re-execute and re-validate until VERDICT: PASS.
+    - For each requirement: run its `done_when` test, then call 'set_requirement' again with the same list but updated `status` ("satisfied" or "violated") for every entry. Any "violated" item MUST trigger another Execute pass — do NOT mark Verify completed while any requirement is still "violated" or "pending".
+6. CONFIRM - Present result to user and await approval
+7. CLEANUP - Remove temporary files if any
 
 Action Selection Rules:
-- Select action based on the current todo phase (Acknowledge/Collect/Execute/Verify/Confirm/Cleanup)
+- Select action based on the current todo phase (Scope/Acknowledge/Collect/Execute/Verify/Confirm/Cleanup)
+- Use 'set_requirement' as the FIRST action of every complex task to lock the definition of done; update it whenever scope changes; revisit it during Verify to mark each item satisfied or violated.
 - Use 'task_update_todos' to create a plan and track progress: mark current as 'in_progress' when starting, 'completed' when done
+- Prefix each todo with its phase: "Acknowledge:", "Collect:", "Execute:", "Verify:", "Confirm:", "Cleanup:"
+- Only ONE todo should be 'in_progress' at a time
 - Use the appropriate send message action for acknowledgments, progress updates, and presenting results
 - Use the appropriate send message action when you need information from user during COLLECT phase
 - Use 'task_end' ONLY after user EXPLICITLY confirms the result is acceptable (e.g. 'looks good', 'thanks', 'done', 'that's all')
@@ -209,14 +221,16 @@
 - DO NOT execute the EXACT same action with same input repeatedly - you're stuck in a loop.
 - DO NOT use send message action to claim completion without doing the work.
 - DO NOT use 'task_end' without EXPLICIT user approval of the final result. A follow-up question or new request is NOT a confirmation.
+- Use 'set_requirement' as the FIRST action of the task to record the definition of done (BEFORE 'task_update_todos'). The work plan that follows must be in service of those requirements.
+- Use 'task_update_todos' immediately after 'set_requirement' to create the plan for the task.
 - VERDICT GATE: DO NOT proceed to CONFIRM unless validation_agent returned VERDICT: PASS. PARTIAL IS NOT PASS. FAIL IS NOT PASS. Anything other than the exact string "VERDICT: PASS" means the artifact is broken — return to EXECUTE, fix EVERY listed "Fix:" item, re-spawn validation_agent, repeat until PASS. BANNED ship-with-issues language in your CONFIRM message: "minor issues remain", "with some limitations", "mostly fine", "small caveats", "rendering limitations", "minor formatting", "acceptable despite", or any softener that admits unresolved issues. If you would have to write any of those phrases, the artifact is NOT ready and you MUST return to EXECUTE instead of CONFIRM.
-- Use 'task_update_todos' as FIRST step to create a plan for the task.
 - When all todos completed AND user sends an EXPLICIT approval (e.g. 'looks good', 'thanks', 'done'), use 'task_end' with status 'complete'.
 - When all todos completed BUT the user sends a NEW question or request, do NOT end the task. Add new todos for the follow-up and continue working.
 - If unrecoverable error, use 'task_end' with status 'abort'.
 - You must provide concrete parameter values for the action's input_schema.
 - When setting wait_for_user_reply=true on a send message action, the message MUST end with an explicit question (e.g., "Does this look good?" or "Would you like any changes?"). The agent will pause and wait for user input — if the message is a statement without a question, the user won't know a reply is expected and the task will hang indefinitely.
 - Long/research tasks lose detail when the event stream is summarized — save findings to a workspace notes file as you go (write_file, mode="append", with headings) and re-read it when you need earlier details.
+- Write real content, never filler. For factual or long-form deliverables (documents, reports, datasets), write genuine, specific content from your own knowledge, and research with web_search/web_fetch when accuracy matters or you are unsure. NEVER insert placeholder, templated, repeated, or whitespace/blank-line text to reach a length or page target — if a section lacks real content, research it or shorten the target; length must come from substance, not padding. Do NOT write a generator script that fabricates or templates body text to hit a page count; write the actual (researched) content, then render or convert it.
 
 File Reading Best Practices:
 - read_file returns content with line numbers in cat -n format
@@ -390,17 +404,10 @@
 - Use 'task_end' with status 'complete' IMMEDIATELY after delivering the result
 - NO user confirmation required - end task right after sending the result
 
-CRITICAL - Message Source Routing Rules:
-- Check the event stream for the ORIGINAL user message to determine which platform the task came from.
-- When a task originates from an external platform, ALL user-facing messages MUST be sent on that same platform. NEVER use send_message for external platform tasks.
-- If platform is telegram_bot → use send_telegram_bot_message
-- If platform is telegram_user → use send_telegram_user_message
-- If platform is WhatsApp → MUST use send_whatsapp_web_text_message (use to="user" for self-messages)
-- If platform is Discord → MUST use send_discord_message or send_discord_dm
-- If platform is Slack → MUST use send_slack_message
-- If platform is CraftBot interface (or no platform specified) → use send_message
-- ONLY fall back to send_message if the platform's send action is not in the available actions list.
-- send_message is for local interface display ONLY. It does NOT reach external platforms.
+Message Routing:
+- To reply to the user, send on the platform the task originated from — check the original user message in the event stream for its source.
+- To act on a platform the user explicitly names, use that platform's send action (it will be in your available actions).
+- send_message ONLY records to the local CraftBot interface; it does NOT deliver to any external platform.
 
 Action Selection:
 - Choose the most direct action to accomplish the goal