← Back to task

Commit fb019f46

commit fb019f46c3adcf772df2dacf688cc75c30ed6e8e
Author: Ben Sima <ben@bensima.com>
Date:   Mon Dec 1 10:02:12 2025

    Add guardrails and progress tracking to Jr agent
    
    Implement runtime guardrails in Engine.hs: - Cost budget limit (default
    200 cents) - Token budget limit (default 1M tokens) - Duplicate tool
    call detection (same tool called N times) - Test failure counting
    (bild --test failures)
    
    Add database-backed progress tracking: - Checkpoint events stored in
    agent_events table - Progress summary retrieved on retry attempts -
    Improved prompts emphasizing efficiency and autonomous operation
    
    Worker.hs improvements: - Uses guardrails configuration - Reports
    guardrail violations via callbacks - Better prompt structure for
    autonomous operation
    
    Task-Id: t-203

diff --git a/Omni/Agent/Engine.hs b/Omni/Agent/Engine.hs
index 7da7fa51..a2a24a15 100644
--- a/Omni/Agent/Engine.hs
+++ b/Omni/Agent/Engine.hs
@@ -19,6 +19,8 @@ module Omni.Agent.Engine
     EngineConfig (..),
     AgentConfig (..),
     AgentResult (..),
+    Guardrails (..),
+    GuardrailResult (..),
     Message (..),
     Role (..),
     ToolCall (..),
@@ -31,6 +33,7 @@ module Omni.Agent.Engine
     defaultLLM,
     defaultEngineConfig,
     defaultAgentConfig,
+    defaultGuardrails,
     chat,
     runAgent,
     main,
@@ -41,6 +44,7 @@ where
 import Alpha
 import Data.Aeson ((.!=), (.:), (.:?), (.=))
 import qualified Data.Aeson as Aeson
+import qualified Data.Aeson.KeyMap as KeyMap
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.CaseInsensitive as CI
 import qualified Data.Map.Strict as Map
@@ -164,7 +168,57 @@ test =
         forM_ roles <| \role ->
           case Aeson.decode (Aeson.encode role) of
             Nothing -> Test.assertFailure ("Failed to decode Role: " <> show role)
-            Just decoded -> decoded Test.@=? role
+            Just decoded -> decoded Test.@=? role,
+      Test.unit "defaultGuardrails has sensible defaults" <| do
+        guardrailMaxCostCents defaultGuardrails Test.@=? 100.0
+        guardrailMaxTokens defaultGuardrails Test.@=? 500000
+        guardrailMaxDuplicateToolCalls defaultGuardrails Test.@=? 3
+        guardrailMaxTestFailures defaultGuardrails Test.@=? 3,
+      Test.unit "checkCostGuardrail detects exceeded budget" <| do
+        let g = defaultGuardrails {guardrailMaxCostCents = 50.0}
+        checkCostGuardrail g 60.0 Test.@=? GuardrailCostExceeded 60.0 50.0
+        checkCostGuardrail g 40.0 Test.@=? GuardrailOk,
+      Test.unit "checkTokenGuardrail detects exceeded budget" <| do
+        let g = defaultGuardrails {guardrailMaxTokens = 1000}
+        checkTokenGuardrail g 1500 Test.@=? GuardrailTokensExceeded 1500 1000
+        checkTokenGuardrail g 500 Test.@=? GuardrailOk,
+      Test.unit "checkDuplicateGuardrail detects repeated calls" <| do
+        let g = defaultGuardrails {guardrailMaxDuplicateToolCalls = 3}
+            counts = Map.fromList [("bash", 3), ("read_file", 1)]
+        case checkDuplicateGuardrail g counts of
+          GuardrailDuplicateToolCalls name count -> do
+            name Test.@=? "bash"
+            count Test.@=? 3
+          _ -> Test.assertFailure "Expected GuardrailDuplicateToolCalls"
+        checkDuplicateGuardrail g (Map.fromList [("bash", 2)]) Test.@=? GuardrailOk,
+      Test.unit "checkTestFailureGuardrail detects failures" <| do
+        let g = defaultGuardrails {guardrailMaxTestFailures = 3}
+        checkTestFailureGuardrail g 3 Test.@=? GuardrailTestFailures 3
+        checkTestFailureGuardrail g 2 Test.@=? GuardrailOk,
+      Test.unit "updateToolCallCounts accumulates correctly" <| do
+        let tc1 = ToolCall "1" "function" (FunctionCall "bash" "{}")
+            tc2 = ToolCall "2" "function" (FunctionCall "bash" "{}")
+            tc3 = ToolCall "3" "function" (FunctionCall "read_file" "{}")
+            counts = updateToolCallCounts Map.empty [tc1, tc2, tc3]
+        Map.lookup "bash" counts Test.@=? Just 2
+        Map.lookup "read_file" counts Test.@=? Just 1,
+      Test.unit "Guardrails JSON roundtrip" <| do
+        let g = Guardrails 75.0 100000 5 4
+        case Aeson.decode (Aeson.encode g) of
+          Nothing -> Test.assertFailure "Failed to decode Guardrails"
+          Just decoded -> decoded Test.@=? g,
+      Test.unit "GuardrailResult JSON roundtrip" <| do
+        let results =
+              [ GuardrailOk,
+                GuardrailCostExceeded 100.0 50.0,
+                GuardrailTokensExceeded 2000 1000,
+                GuardrailDuplicateToolCalls "bash" 5,
+                GuardrailTestFailures 3
+              ]
+        forM_ results <| \r ->
+          case Aeson.decode (Aeson.encode r) of
+            Nothing -> Test.assertFailure ("Failed to decode GuardrailResult: " <> show r)
+            Just decoded -> decoded Test.@=? r
     ]
 
 data Tool = Tool
@@ -249,16 +303,51 @@ data AgentConfig = AgentConfig
   { agentModel :: Text,
     agentTools :: [Tool],
     agentSystemPrompt :: Text,
-    agentMaxIterations :: Int
+    agentMaxIterations :: Int,
+    agentGuardrails :: Guardrails
   }
 
+data Guardrails = Guardrails
+  { guardrailMaxCostCents :: Double,
+    guardrailMaxTokens :: Int,
+    guardrailMaxDuplicateToolCalls :: Int,
+    guardrailMaxTestFailures :: Int
+  }
+  deriving (Show, Eq, Generic)
+
+instance Aeson.ToJSON Guardrails
+
+instance Aeson.FromJSON Guardrails
+
+data GuardrailResult
+  = GuardrailOk
+  | GuardrailCostExceeded Double Double
+  | GuardrailTokensExceeded Int Int
+  | GuardrailDuplicateToolCalls Text Int
+  | GuardrailTestFailures Int
+  deriving (Show, Eq, Generic)
+
+instance Aeson.ToJSON GuardrailResult
+
+instance Aeson.FromJSON GuardrailResult
+
+defaultGuardrails :: Guardrails
+defaultGuardrails =
+  Guardrails
+    { guardrailMaxCostCents = 100.0,
+      guardrailMaxTokens = 500000,
+      guardrailMaxDuplicateToolCalls = 3,
+      guardrailMaxTestFailures = 3
+    }
+
 defaultAgentConfig :: AgentConfig
 defaultAgentConfig =
   AgentConfig
     { agentModel = "gpt-4",
       agentTools = [],
       agentSystemPrompt = "You are a helpful assistant.",
-      agentMaxIterations = 10
+      agentMaxIterations = 10,
+      agentGuardrails = defaultGuardrails
     }
 
 data EngineConfig = EngineConfig
@@ -269,7 +358,8 @@ data EngineConfig = EngineConfig
     engineOnAssistant :: Text -> IO (),
     engineOnToolResult :: Text -> Bool -> Text -> IO (),
     engineOnComplete :: IO (),
-    engineOnError :: Text -> IO ()
+    engineOnError :: Text -> IO (),
+    engineOnGuardrail :: GuardrailResult -> IO ()
   }
 
 defaultEngineConfig :: EngineConfig
@@ -282,7 +372,8 @@ defaultEngineConfig =
       engineOnAssistant = \_ -> pure (),
       engineOnToolResult = \_ _ _ -> pure (),
       engineOnComplete = pure (),
-      engineOnError = \_ -> pure ()
+      engineOnError = \_ -> pure (),
+      engineOnGuardrail = \_ -> pure ()
     }
 
 data AgentResult = AgentResult
@@ -511,72 +602,138 @@ runAgent engineCfg agentCfg userPrompt = do
       initialMessages = [systemMsg, userMsg]
 
   engineOnActivity engineCfg "Starting agent loop"
-  loop llm tools toolMap initialMessages 0 0 0
+  loop llm tools toolMap initialMessages 0 0 0 0.0 Map.empty 0
   where
     maxIter = agentMaxIterations agentCfg
-
-    loop :: LLM -> [Tool] -> Map.Map Text Tool -> [Message] -> Int -> Int -> Int -> IO (Either Text AgentResult)
-    loop llm tools' toolMap msgs iteration totalCalls totalTokens
+    guardrails' = agentGuardrails agentCfg
+
+    loop ::
+      LLM ->
+      [Tool] ->
+      Map.Map Text Tool ->
+      [Message] ->
+      Int ->
+      Int ->
+      Int ->
+      Double ->
+      Map.Map Text Int ->
+      Int ->
+      IO (Either Text AgentResult)
+    loop llm tools' toolMap msgs iteration totalCalls totalTokens totalCost toolCallCounts testFailures
       | iteration >= maxIter = do
           let errMsg = "Max iterations (" <> tshow maxIter <> ") reached"
           engineOnError engineCfg errMsg
           pure <| Left errMsg
       | otherwise = do
-          engineOnActivity engineCfg <| "Iteration " <> tshow (iteration + 1)
-          result <- chatWithUsage llm tools' msgs
-          case result of
-            Left err -> do
-              engineOnError engineCfg err
-              pure (Left err)
-            Right chatRes -> do
-              let msg = chatMessage chatRes
-                  tokens = maybe 0 usageTotalTokens (chatUsage chatRes)
-                  -- Use actual cost from API response when available
-                  -- OpenRouter returns cost in dollars, convert to cents
-                  cost = case chatUsage chatRes +> usageCost of
-                    Just actualCost -> actualCost * 100
-                    Nothing -> estimateCost (llmModel llm) tokens
-              engineOnCost engineCfg tokens cost
-              let newTokens = totalTokens + tokens
-              let assistantText = msgContent msg
-              unless (Text.null assistantText)
-                <| engineOnAssistant engineCfg assistantText
-              case msgToolCalls msg of
-                Nothing -> do
-                  engineOnActivity engineCfg "Agent completed"
-                  engineOnComplete engineCfg
-                  pure
-                    <| Right
-                    <| AgentResult
-                      { resultFinalMessage = msgContent msg,
-                        resultToolCallCount = totalCalls,
-                        resultIterations = iteration + 1,
-                        resultTotalCost = estimateTotalCost (llmModel llm) newTokens,
-                        resultTotalTokens = newTokens
-                      }
-                Just [] -> do
-                  engineOnActivity engineCfg "Agent completed (empty tool calls)"
-                  engineOnComplete engineCfg
-                  pure
-                    <| Right
-                    <| AgentResult
-                      { resultFinalMessage = msgContent msg,
-                        resultToolCallCount = totalCalls,
-                        resultIterations = iteration + 1,
-                        resultTotalCost = estimateTotalCost (llmModel llm) newTokens,
-                        resultTotalTokens = newTokens
-                      }
-                Just tcs -> do
-                  toolResults <- executeToolCalls engineCfg toolMap tcs
-                  let newMsgs = msgs <> [msg] <> toolResults
-                      newCalls = totalCalls + length tcs
-                  loop llm tools' toolMap newMsgs (iteration + 1) newCalls newTokens
+          let guardrailViolation = findGuardrailViolation guardrails' totalCost totalTokens toolCallCounts testFailures
+          case guardrailViolation of
+            Just (g, errMsg) -> do
+              engineOnGuardrail engineCfg g
+              pure <| Left errMsg
+            Nothing -> do
+              engineOnActivity engineCfg <| "Iteration " <> tshow (iteration + 1)
+              result <- chatWithUsage llm tools' msgs
+              case result of
+                Left err -> do
+                  engineOnError engineCfg err
+                  pure (Left err)
+                Right chatRes -> do
+                  let msg = chatMessage chatRes
+                      tokens = maybe 0 usageTotalTokens (chatUsage chatRes)
+                      cost = case chatUsage chatRes +> usageCost of
+                        Just actualCost -> actualCost * 100
+                        Nothing -> estimateCost (llmModel llm) tokens
+                  engineOnCost engineCfg tokens cost
+                  let newTokens = totalTokens + tokens
+                      newCost = totalCost + cost
+                  let assistantText = msgContent msg
+                  unless (Text.null assistantText)
+                    <| engineOnAssistant engineCfg assistantText
+                  case msgToolCalls msg of
+                    Nothing -> do
+                      engineOnActivity engineCfg "Agent completed"
+                      engineOnComplete engineCfg
+                      pure
+                        <| Right
+                        <| AgentResult
+                          { resultFinalMessage = msgContent msg,
+                            resultToolCallCount = totalCalls,
+                            resultIterations = iteration + 1,
+                            resultTotalCost = newCost,
+                            resultTotalTokens = newTokens
+                          }
+                    Just [] -> do
+                      engineOnActivity engineCfg "Agent completed (empty tool calls)"
+                      engineOnComplete engineCfg
+                      pure
+                        <| Right
+                        <| AgentResult
+                          { resultFinalMessage = msgContent msg,
+                            resultToolCallCount = totalCalls,
+                            resultIterations = iteration + 1,
+                            resultTotalCost = newCost,
+                            resultTotalTokens = newTokens
+                          }
+                    Just tcs -> do
+                      (toolResults, newTestFailures) <- executeToolCallsWithTracking engineCfg toolMap tcs testFailures
+                      let newMsgs = msgs <> [msg] <> toolResults
+                          newCalls = totalCalls + length tcs
+                          newToolCallCounts = updateToolCallCounts toolCallCounts tcs
+                      loop llm tools' toolMap newMsgs (iteration + 1) newCalls newTokens newCost newToolCallCounts newTestFailures
+
+checkCostGuardrail :: Guardrails -> Double -> GuardrailResult
+checkCostGuardrail g cost
+  | cost > guardrailMaxCostCents g = GuardrailCostExceeded cost (guardrailMaxCostCents g)
+  | otherwise = GuardrailOk
+
+checkTokenGuardrail :: Guardrails -> Int -> GuardrailResult
+checkTokenGuardrail g tokens
+  | tokens > guardrailMaxTokens g = GuardrailTokensExceeded tokens (guardrailMaxTokens g)
+  | otherwise = GuardrailOk
+
+checkDuplicateGuardrail :: Guardrails -> Map.Map Text Int -> GuardrailResult
+checkDuplicateGuardrail g counts =
+  let maxAllowed = guardrailMaxDuplicateToolCalls g
+      violations = [(name, count) | (name, count) <- Map.toList counts, count >= maxAllowed]
+   in case violations of
+        ((name, count) : _) -> GuardrailDuplicateToolCalls name count
+        [] -> GuardrailOk
+
+checkTestFailureGuardrail :: Guardrails -> Int -> GuardrailResult
+checkTestFailureGuardrail g failures
+  | failures >= guardrailMaxTestFailures g = GuardrailTestFailures failures
+  | otherwise = GuardrailOk
+
+updateToolCallCounts :: Map.Map Text Int -> [ToolCall] -> Map.Map Text Int
+updateToolCallCounts =
+  foldr (\tc m -> Map.insertWith (+) (fcName (tcFunction tc)) 1 m)
+
+findGuardrailViolation :: Guardrails -> Double -> Int -> Map.Map Text Int -> Int -> Maybe (GuardrailResult, Text)
+findGuardrailViolation g cost tokens toolCallCounts testFailures =
+  case checkCostGuardrail g cost of
+    r@(GuardrailCostExceeded actual limit) ->
+      Just (r, "Guardrail: cost budget exceeded (" <> tshow actual <> "/" <> tshow limit <> " cents)")
+    _ -> case checkTokenGuardrail g tokens of
+      r@(GuardrailTokensExceeded actual limit) ->
+        Just (r, "Guardrail: token budget exceeded (" <> tshow actual <> "/" <> tshow limit <> " tokens)")
+      _ -> case checkDuplicateGuardrail g toolCallCounts of
+        r@(GuardrailDuplicateToolCalls tool count) ->
+          Just (r, "Guardrail: duplicate tool calls (" <> tool <> " called " <> tshow count <> " times)")
+        _ -> case checkTestFailureGuardrail g testFailures of
+          r@(GuardrailTestFailures count) ->
+            Just (r, "Guardrail: too many test failures (" <> tshow count <> ")")
+          _ -> Nothing
 
 buildToolMap :: [Tool] -> Map.Map Text Tool
 buildToolMap = Map.fromList <. map (\t -> (toolName t, t))
 
-executeToolCalls :: EngineConfig -> Map.Map Text Tool -> [ToolCall] -> IO [Message]
-executeToolCalls engineCfg toolMap = traverse executeSingle
+executeToolCallsWithTracking :: EngineConfig -> Map.Map Text Tool -> [ToolCall] -> Int -> IO ([Message], Int)
+executeToolCallsWithTracking engineCfg toolMap tcs initialFailures = do
+  results <- traverse executeSingle tcs
+  let msgs = map fst results
+      failureDeltas = map snd results
+      totalNewFailures = initialFailures + sum failureDeltas
+  pure (msgs, totalNewFailures)
   where
     executeSingle tc = do
       let name = fcName (tcFunction tc)
@@ -588,18 +745,35 @@ executeToolCalls engineCfg toolMap = traverse executeSingle
         Nothing -> do
           let errMsg = "Tool not found: " <> name
           engineOnToolResult engineCfg name False errMsg
-          pure <| Message ToolRole errMsg Nothing (Just callId)
+          pure (Message ToolRole errMsg Nothing (Just callId), 0)
         Just tool -> do
           case Aeson.decode (BL.fromStrict (TE.encodeUtf8 argsText)) of
             Nothing -> do
               let errMsg = "Invalid JSON arguments: " <> argsText
               engineOnToolResult engineCfg name False errMsg
-              pure <| Message ToolRole errMsg Nothing (Just callId)
+              pure (Message ToolRole errMsg Nothing (Just callId), 0)
             Just args -> do
               resultValue <- toolExecute tool args
               let resultText = TE.decodeUtf8 (BL.toStrict (Aeson.encode resultValue))
+                  isTestCall = name == "bash" && ("bild --test" `Text.isInfixOf` argsText || "bild -t" `Text.isInfixOf` argsText)
+                  isTestFailure = isTestCall && isFailureResult resultValue
+                  failureDelta = if isTestFailure then 1 else 0
               engineOnToolResult engineCfg name True resultText
-              pure <| Message ToolRole resultText Nothing (Just callId)
+              pure (Message ToolRole resultText Nothing (Just callId), failureDelta)
+
+    isFailureResult :: Aeson.Value -> Bool
+    isFailureResult (Aeson.Object obj) =
+      case KeyMap.lookup "exit_code" obj of
+        Just (Aeson.Number n) -> n /= 0
+        _ -> False
+    isFailureResult (Aeson.String s) =
+      "error"
+        `Text.isInfixOf` Text.toLower s
+        || "failed"
+        `Text.isInfixOf` Text.toLower s
+        || "FAILED"
+        `Text.isInfixOf` s
+    isFailureResult _ = False
 
 -- | Estimate cost in cents from token count
 estimateCost :: Text -> Int -> Double
@@ -609,6 +783,3 @@ estimateCost model tokens
   | "gpt-4" `Text.isInfixOf` model = fromIntegral tokens * 3 / 100000
   | "claude" `Text.isInfixOf` model = fromIntegral tokens * 3 / 100000
   | otherwise = fromIntegral tokens / 100000
-
-estimateTotalCost :: Text -> Int -> Double
-estimateTotalCost = estimateCost
diff --git a/Omni/Agent/Worker.hs b/Omni/Agent/Worker.hs
index 38e29cb4..bbdba9d2 100644
--- a/Omni/Agent/Worker.hs
+++ b/Omni/Agent/Worker.hs
@@ -49,6 +49,18 @@ toMetadata pairs =
   let obj = Aeson.object [(AesonKey.fromText k, Aeson.String v) | (k, v) <- pairs]
    in TE.decodeUtf8 (BSL.toStrict (Aeson.encode obj))
 
+-- | Format guardrail result for logging
+formatGuardrailResult :: Engine.GuardrailResult -> Text
+formatGuardrailResult Engine.GuardrailOk = "OK"
+formatGuardrailResult (Engine.GuardrailCostExceeded actual limit) =
+  "Cost exceeded: " <> tshow actual <> " cents (limit: " <> tshow limit <> ")"
+formatGuardrailResult (Engine.GuardrailTokensExceeded actual limit) =
+  "Tokens exceeded: " <> tshow actual <> " (limit: " <> tshow limit <> ")"
+formatGuardrailResult (Engine.GuardrailDuplicateToolCalls tool count) =
+  "Duplicate tool calls: " <> tool <> " called " <> tshow count <> " times"
+formatGuardrailResult (Engine.GuardrailTestFailures count) =
+  "Test failures: " <> tshow count <> " failures"
+
 runOnce :: Core.Worker -> Maybe Text -> IO ()
 runOnce worker maybeTaskId = do
   -- Find work
@@ -209,8 +221,8 @@ runWithEngine worker repo task = do
       -- Check for retry context
       maybeRetry <- TaskCore.getRetryContext (TaskCore.taskId task)
 
-      -- Read progress file if it exists
-      progressContent <- readProgressFile repo (TaskCore.taskId task)
+      -- Get progress from database (checkpoint events from previous sessions)
+      progressContent <- TaskCore.getProgressSummary (TaskCore.taskId task)
 
       -- Build the full prompt
       let ns = fromMaybe "." (TaskCore.taskNamespace task)
@@ -291,16 +303,28 @@ runWithEngine worker repo task = do
                   logEventText "Complete" "",
                 Engine.engineOnError = \err -> do
                   sayLog <| "[error] " <> err
-                  logEventText "Error" err
+                  logEventText "Error" err,
+                Engine.engineOnGuardrail = \guardrailResult -> do
+                  let guardrailMsg = formatGuardrailResult guardrailResult
+                  sayLog <| "[guardrail] " <> guardrailMsg
+                  logEventJson "Guardrail" (Aeson.toJSON guardrailResult)
               }
 
-      -- Build Agent config
-      let agentCfg =
+      -- Build Agent config with guardrails
+      let guardrails =
+            Engine.Guardrails
+              { Engine.guardrailMaxCostCents = 200.0,
+                Engine.guardrailMaxTokens = 1000000,
+                Engine.guardrailMaxDuplicateToolCalls = 5,
+                Engine.guardrailMaxTestFailures = 3
+              }
+          agentCfg =
             Engine.AgentConfig
               { Engine.agentModel = model,
                 Engine.agentTools = Tools.allTools,
                 Engine.agentSystemPrompt = systemPrompt,
-                Engine.agentMaxIterations = 100
+                Engine.agentMaxIterations = 100,
+                Engine.agentGuardrails = guardrails
               }
 
       -- Run the agent
@@ -316,40 +340,37 @@ runWithEngine worker repo task = do
 -- | Build the base prompt for the agent
 buildBasePrompt :: TaskCore.Task -> Text -> FilePath -> Text
 buildBasePrompt task ns repo =
-  "You are a Worker Agent.\n"
+  "You are an autonomous Worker Agent.\n"
     <> "Your goal is to implement the following task:\n\n"
     <> formatTask task
     <> "\n\nCRITICAL INSTRUCTIONS:\n"
-    <> "1. Read AGENTS.md and any existing progress file for this task.\n"
-    <> "2. Pick ONE specific change to implement (not everything at once).\n"
-    <> "3. Analyze the codebase to understand where to make that change.\n"
-    <> "4. Implement ONLY that one change.\n"
-    <> "5. BEFORE finishing, you MUST run: bild --test "
+    <> "1. Read AGENTS.md first to understand the codebase conventions.\n"
+    <> "2. Complete ONE logical change (e.g., update schema + call sites + tests).\n"
+    <> "3. Run 'bild --test "
     <> ns
-    <> "\n"
-    <> "6. Fix ALL errors from bild --test (including lint issues).\n"
-    <> "7. Keep running bild --test until it passes with no errors.\n"
-    <> "8. After tests pass, write progress to: _/llm/"
-    <> TaskCore.taskId task
-    <> "-progress.md\n"
-    <> "9. Do NOT update task status or manage git.\n"
-    <> "10. Only exit after bild --test passes and progress is saved.\n\n"
-    <> "INCREMENTAL WORKFLOW (IMPORTANT):\n"
-    <> "- DO NOT try to implement everything in one go\n"
-    <> "- Make ONE focused change, test it, save progress, then stop\n"
-    <> "- The task may be run multiple times to complete all changes\n"
-    <> "- Each session should leave the code in a clean, testable state\n"
-    <> "- If the task is already complete, just verify tests pass and note that in progress\n\n"
-    <> "IMPORTANT: The git commit will fail if lint finds issues.\n"
-    <> "You must fix all lint suggestions.\n\n"
+    <> "' ONCE after implementing.\n"
+    <> "4. If tests pass, you are DONE - stop immediately.\n"
+    <> "5. If tests fail, fix the issue and run tests again.\n"
+    <> "6. If tests fail 3 times on the same issue, STOP - the task will be marked for human review.\n"
+    <> "7. Do NOT update task status or manage git - the worker handles that.\n\n"
+    <> "AUTONOMOUS OPERATION (NO HUMAN IN LOOP):\n"
+    <> "- You are running autonomously without human intervention\n"
+    <> "- There is NO human to ask questions or get clarification from\n"
+    <> "- Make reasonable decisions based on the task description\n"
+    <> "- If something is truly ambiguous, implement the most straightforward interpretation\n"
+    <> "- Guardrails will stop you if you exceed cost/token budgets or make repeated mistakes\n\n"
     <> "BUILD SYSTEM NOTES:\n"
-    <> "- Running 'bild --test "
+    <> "- 'bild --test "
     <> ns
-    <> "' automatically tests ALL dependencies of that namespace\n"
-    <> "- You do NOT need to run bild --test on individual files - just the main namespace ONCE\n"
-    <> "- Once tests pass, do NOT re-run them unless you make more changes\n"
-    <> "- The 'lint' command will be run automatically during git commit via hooks\n"
-    <> "- You can run 'lint --fix' on changed files if needed, but it's optional\n\n"
+    <> "' tests ALL dependencies transitively - run it ONCE, not per-file\n"
+    <> "- Do NOT run bild --test on individual files separately\n"
+    <> "- Once tests pass, STOP - do not continue adding features or re-running tests\n"
+    <> "- Use 'lint --fix' for formatting issues (not hlint directly)\n\n"
+    <> "EFFICIENCY REQUIREMENTS:\n"
+    <> "- Do not repeat the same action multiple times\n"
+    <> "- Do not re-run passing tests\n"
+    <> "- Do not test files individually when namespace test covers them\n"
+    <> "- Aim to complete the task in under 50 tool calls\n\n"
     <> "Context:\n"
     <> "- Working directory: "
     <> Text.pack repo
@@ -358,28 +379,18 @@ buildBasePrompt task ns repo =
     <> ns
     <> "\n"
 
--- | Read progress file for a task if it exists
-readProgressFile :: FilePath -> Text -> IO (Maybe Text)
-readProgressFile repo taskId = do
-  let progressPath = repo </> "_" </> "llm" </> Text.unpack taskId <> "-progress.md"
-  exists <- Directory.doesFileExist progressPath
-  if exists
-    then Just </ readFile progressPath
-    else pure Nothing
-
 -- | Build progress context prompt
 buildProgressPrompt :: Maybe Text -> Text
 buildProgressPrompt Nothing = ""
 buildProgressPrompt (Just progress) =
-  "\n\n## PROGRESS FROM PREVIOUS SESSIONS\n\n"
-    <> "This task has been worked on before. Here's what has been completed:\n\n"
+  "\n\n## PROGRESS FROM PREVIOUS SESSIONS (from database)\n\n"
+    <> "This task has been worked on before. Here are the checkpoint notes:\n\n"
     <> progress
     <> "\n\n"
     <> "IMPORTANT:\n"
-    <> "- Review this progress to understand what's already done\n"
+    <> "- Review these checkpoints to understand what's already done\n"
     <> "- Do NOT repeat work that's already completed\n"
-    <> "- Pick the NEXT logical step that hasn't been done yet\n"
-    <> "- Update the progress file after completing your change\n\n"
+    <> "- If the task appears complete, verify tests pass and exit\n\n"
 
 -- | Build retry context prompt
 buildRetryPrompt :: Maybe TaskCore.RetryContext -> Text
diff --git a/Omni/Task/Core.hs b/Omni/Task/Core.hs
index f54cf81e..1212a569 100644
--- a/Omni/Task/Core.hs
+++ b/Omni/Task/Core.hs
@@ -1702,3 +1702,26 @@ getEventsSince sessionId lastId =
       "SELECT id, task_id, session_id, timestamp, event_type, content \
       \FROM agent_events WHERE session_id = ? AND id > ? ORDER BY id ASC"
       (sessionId, lastId)
+
+-- | Insert a checkpoint event (for progress tracking)
+insertCheckpoint :: Text -> Text -> Text -> IO ()
+insertCheckpoint taskId sessionId =
+  insertAgentEvent taskId sessionId "Checkpoint"
+
+-- | Get all checkpoints for a task (across all sessions)
+getCheckpointsForTask :: Text -> IO [StoredEvent]
+getCheckpointsForTask taskId =
+  withDb <| \conn ->
+    SQL.query
+      conn
+      "SELECT id, task_id, session_id, timestamp, event_type, content \
+      \FROM agent_events WHERE task_id = ? AND event_type = 'Checkpoint' ORDER BY id ASC"
+      (SQL.Only taskId)
+
+-- | Get progress summary for a task (concatenated checkpoint contents)
+getProgressSummary :: Text -> IO (Maybe Text)
+getProgressSummary taskId = do
+  checkpoints <- getCheckpointsForTask taskId
+  if null checkpoints
+    then pure Nothing
+    else pure <| Just <| T.intercalate "\n\n---\n\n" [storedEventContent e | e <- checkpoints]