← Back to task

Commit 40b93224

commit 40b93224adfa973dc1734c11d09e17c14a1b10a2
Author: Coder Agent <coder@agents.omni>
Date:   Wed Feb 11 17:22:51 2026

    Omni/Ide: add per-task retry circuit breaker to dev loop
    
    - add --max-retries with default 5 (0 disables limit)
    - record failed retry attempts in task comments with role+patchset tags
    - make retry accounting patchset-aware with legacy patchset-1 fallback
    - open circuit when retries exhausted and annotate task once
    - prevent hot-poll when --task-id points to exhausted task
    - add exponential backoff between failed dev retries (capped)
    - document max-retries and retry behavior in workflow docs
    
    Task-Id: t-587.2

diff --git a/Omni/Ide/DEV_REVIEW_RELEASE.md b/Omni/Ide/DEV_REVIEW_RELEASE.md
index 4eb1e046..135a440c 100644
--- a/Omni/Ide/DEV_REVIEW_RELEASE.md
+++ b/Omni/Ide/DEV_REVIEW_RELEASE.md
@@ -44,6 +44,7 @@ Optional flags:
 - `--root _/worktrees/my-flow`
 - `--parent t-587` (scope to one epic)
 - `--task-id t-587.1` (scope to one task)
+- `--max-retries 5` (open the circuit after 5 failed attempts per patchset; `0` disables the limit)
 - `--no-auto-stash` (disable automatic dirty-workspace recovery)
 - `--once`
 - `--dry-run`
@@ -71,6 +72,11 @@ Omni/Ide/dev-review-release.sh cleanup-branches --apply
 When dev produces a new task-branch commit SHA, it automatically increments
 `patchset_count` via `task patchset <id> --increment`.
 
+Retry accounting and circuit-breaker behavior are patchset-aware:
+- Failed attempts are tracked per role and patchset.
+- Reaching `--max-retries` failed attempts opens the circuit for that patchset and skips new runs.
+- Dev retries use exponential backoff between failed attempts (the loop interval doubles with each attempt, capped at 600 seconds).
+
 Expected lifecycle:
 
 `open -> in-progress -> review -> approved -> done`
diff --git a/Omni/Ide/README.md b/Omni/Ide/README.md
index 2fb49236..c52041e7 100644
--- a/Omni/Ide/README.md
+++ b/Omni/Ide/README.md
@@ -76,6 +76,9 @@ Omni/Ide/dev-review-release.sh loop --role dev --task-id t-587.1
 
 # Disable dirty-workspace auto-stash recovery
 Omni/Ide/dev-review-release.sh loop --role dev --no-auto-stash
+
+# Cap failed retries per patchset (dev loop circuit breaker)
+Omni/Ide/dev-review-release.sh loop --role dev --max-retries 5
 ```
 
 The convention for all programs in the omnirepo is to run their tests if the first argument is `test`. So for example:
diff --git a/Omni/Ide/dev-review-release.sh b/Omni/Ide/dev-review-release.sh
index 32057617..8ed0c537 100755
--- a/Omni/Ide/dev-review-release.sh
+++ b/Omni/Ide/dev-review-release.sh
@@ -10,6 +10,7 @@ DEFAULT_INTERVAL_SECONDS=20
 DEFAULT_TIMEOUT_SECONDS=1800
 DEFAULT_MAX_ITER=80
 DEFAULT_MAX_COST_CENTS=300
+DEFAULT_MAX_RETRIES=5
 
 usage() {
   cat <<'EOF'
@@ -44,6 +45,7 @@ Loop options:
   --timeout SEC     agentd timeout per run (default: 1800)
   --max-iter N      agentd max iterations per run (default: 80)
   --max-cost CENTS  agentd max cost cents per run (default: 300)
+  --max-retries N   Max failed attempts per task patchset before skipping (default: 5, 0 = unlimited)
   --no-auto-stash   Disable auto-stashing dirty workspace state before retry
   --once            Process at most one task, then exit
   --dry-run         Print what would run, do not invoke agentd
@@ -248,6 +250,157 @@ auto_stash_workspace() {
   return 0
 }
 
+task_patchset_count() {
+  local tid="$1"
+  task show "$tid" --json | jq -r '.taskPatchsetCount // .patchset_count // 0'
+}
+
+get_retry_count() {
+  local tid="$1"
+  local role="$2"
+  local patchset="$3"
+
+  local comments_json
+  if ! comments_json="$(task show "$tid" --json 2>/dev/null)"; then
+    echo "0"
+    return
+  fi
+
+  local patchset_pattern="Automation \\($role\\) patchset $patchset attempt ([0-9]+)/[0-9]+ failed"
+  local legacy_pattern="Automation \\($role\\) attempt ([0-9]+)/[0-9]+ failed"
+  local max_attempt=0
+
+  while IFS= read -r comment_text; do
+    if [[ "$comment_text" =~ $patchset_pattern ]]; then
+      local attempt="${BASH_REMATCH[1]}"
+      if [[ "$attempt" -gt "$max_attempt" ]]; then
+        max_attempt="$attempt"
+      fi
+      continue
+    fi
+
+    # Backward-compat for existing patchset-1 comments emitted before patchset tagging.
+    if [[ "$patchset" -eq 1 && "$comment_text" =~ $legacy_pattern ]]; then
+      local legacy_attempt="${BASH_REMATCH[1]}"
+      if [[ "$legacy_attempt" -gt "$max_attempt" ]]; then
+        max_attempt="$legacy_attempt"
+      fi
+    fi
+  done < <(jq -r '.taskComments[]?.commentText // empty' <<<"$comments_json" 2>/dev/null)
+
+  echo "$max_attempt"
+}
+
+retry_exceeded_comment_exists() {
+  local tid="$1"
+  local role="$2"
+  local patchset="$3"
+
+  local comments_json
+  if ! comments_json="$(task show "$tid" --json 2>/dev/null)"; then
+    return 1
+  fi
+
+  local marker="Automation ($role) patchset $patchset exceeded max retries"
+  jq -r '.taskComments[]?.commentText // empty' <<<"$comments_json" | grep -Fq "$marker"
+}
+
+task_exceeded_retries() {
+  local tid="$1"
+  local role="$2"
+  local patchset="$3"
+  local max_retries="$4"
+
+  if [[ "$max_retries" -eq 0 ]]; then
+    return 1
+  fi
+
+  local retry_count
+  retry_count="$(get_retry_count "$tid" "$role" "$patchset")"
+
+  if [[ "$retry_count" -ge "$max_retries" ]]; then
+    return 0
+  fi
+
+  return 1
+}
+
+retry_backoff_remaining_seconds() {
+  local tid="$1"
+  local role="$2"
+  local patchset="$3"
+  local base_interval="$4"
+
+  local comments_json
+  if ! comments_json="$(task show "$tid" --json 2>/dev/null)"; then
+    echo "0"
+    return
+  fi
+
+  local failure_pattern="Automation \\($role\\) patchset $patchset attempt ([0-9]+)/[0-9]+ failed"
+  local latest_attempt=0
+  local latest_ts=""
+
+  while IFS=$'\t' read -r comment_ts comment_text; do
+    if [[ "$comment_text" =~ $failure_pattern ]]; then
+      local attempt="${BASH_REMATCH[1]}"
+      if [[ "$attempt" -ge "$latest_attempt" ]]; then
+        latest_attempt="$attempt"
+        latest_ts="$comment_ts"
+      fi
+    fi
+  done < <(jq -r '.taskComments[]? | [.commentCreatedAt, .commentText] | @tsv' <<<"$comments_json" 2>/dev/null)
+
+  if [[ "$latest_attempt" -le 0 || -z "$latest_ts" ]]; then
+    echo "0"
+    return
+  fi
+
+  local delay=$((base_interval * (2 ** (latest_attempt - 1))))
+  if [[ "$delay" -gt 600 ]]; then
+    delay=600
+  fi
+
+  local now ts_epoch elapsed remaining
+  now="$(date +%s)"
+  ts_epoch="$(date -d "$latest_ts" +%s 2>/dev/null || echo 0)"
+  elapsed=$((now - ts_epoch))
+  remaining=$((delay - elapsed))
+
+  if [[ "$remaining" -gt 0 ]]; then
+    echo "$remaining"
+  else
+    echo "0"
+  fi
+}
+
+record_retry_attempt() {
+  local tid="$1"
+  local role="$2"
+  local patchset="$3"
+  local run_name="$4"
+  local max_retries="$5"
+  local failed="$6"
+
+  if [[ "$max_retries" -eq 0 || "$failed" != "true" ]]; then
+    return 0
+  fi
+
+  local retry_count
+  retry_count="$(get_retry_count "$tid" "$role" "$patchset")"
+  local attempt=$((retry_count + 1))
+
+  local comment_msg
+  comment_msg="Automation ($role) patchset $patchset attempt $attempt/$max_retries failed for run $run_name."
+
+  if [[ "$attempt" -ge "$max_retries" ]]; then
+    comment_msg="$comment_msg Automation ($role) patchset $patchset exceeded max retries, needs human attention."
+    log "Task $tid patchset $patchset exceeded max retries ($attempt/$max_retries), opening circuit"
+  fi
+
+  task comment "$tid" "$comment_msg" --json >/dev/null || true
+}
+
 prepare_workspace_for_task() {
   local role="$1"
   local workspace="$2"
@@ -337,6 +490,8 @@ run_single_task() {
   local task_filter="$9"
   local parent_filter="${10}"
   local auto_stash_dirty="${11}"
+  local max_retries="${12}"
+  local interval_seconds="${13}"
 
   local tid
   tid="$(select_next_task "$role" "$task_filter" "$parent_filter")"
@@ -348,6 +503,31 @@ run_single_task() {
 
   log "Picked task $tid for role=$role"
 
+  local patchset_count
+  patchset_count="$(task_patchset_count "$tid")"
+
+  # Circuit breaker: skip tasks whose current patchset already exhausted retries.
+  if task_exceeded_retries "$tid" "$role" "$patchset_count" "$max_retries"; then
+    log "Task $tid patchset $patchset_count exceeded max retries ($max_retries), skipping"
+    if ! retry_exceeded_comment_exists "$tid" "$role" "$patchset_count"; then
+      task comment "$tid" "Automation ($role) patchset $patchset_count exceeded max retries, needs human attention." --json >/dev/null || true
+    fi
+    if [[ -n "$task_filter" && "$task_filter" == "$tid" ]]; then
+      return 3
+    fi
+    return 2
+  fi
+
+  # Exponential backoff between failed retries (dev role only).
+  if [[ "$role" == "dev" ]]; then
+    local backoff_remaining
+    backoff_remaining="$(retry_backoff_remaining_seconds "$tid" "$role" "$patchset_count" "$interval_seconds")"
+    if [[ "$backoff_remaining" -gt 0 ]]; then
+      log "Task $tid patchset $patchset_count in retry backoff (${backoff_remaining}s remaining), skipping"
+      return 2
+    fi
+  fi
+
   local run_name
   run_name="${role}-${tid}-$(date +%Y%m%d-%H%M%S)"
 
@@ -424,7 +604,7 @@ run_single_task() {
 
   if [[ $rc -ne 0 ]]; then
     log "Run failed for $tid (run=$run_name)"
-    task comment "$tid" "Automation ($role) failed in run $run_name; inspect agentd logs/status." --json >/dev/null || true
+    record_retry_attempt "$tid" "$role" "$patchset_count" "$run_name" "$max_retries" "true"
     return $rc
   fi
 
@@ -547,6 +727,7 @@ loop_cmd() {
   local timeout="$DEFAULT_TIMEOUT_SECONDS"
   local max_iter="$DEFAULT_MAX_ITER"
   local max_cost="$DEFAULT_MAX_COST_CENTS"
+  local max_retries="$DEFAULT_MAX_RETRIES"
   local auto_stash_dirty="true"
   local once="false"
   local dry_run="false"
@@ -593,6 +774,10 @@ loop_cmd() {
         max_cost="$2"
         shift 2
         ;;
+      --max-retries)
+        max_retries="$2"
+        shift 2
+        ;;
       --no-auto-stash)
         auto_stash_dirty="false"
         shift
@@ -634,11 +819,11 @@ loop_cmd() {
   fi
 
   log "Starting $role loop"
-  log "workspace=$workspace base=$base_branch interval=${interval}s dry_run=$dry_run auto_stash_dirty=$auto_stash_dirty task_filter=${task_filter:-<none>} parent_filter=${parent_filter:-<none>}"
+  log "workspace=$workspace base=$base_branch interval=${interval}s dry_run=$dry_run auto_stash_dirty=$auto_stash_dirty max_retries=$max_retries task_filter=${task_filter:-<none>} parent_filter=${parent_filter:-<none>}"
 
   while true; do
     set +e
-    run_single_task "$role" "$workspace" "$base_branch" "$provider" "$timeout" "$max_iter" "$max_cost" "$dry_run" "$task_filter" "$parent_filter" "$auto_stash_dirty"
+    run_single_task "$role" "$workspace" "$base_branch" "$provider" "$timeout" "$max_iter" "$max_cost" "$dry_run" "$task_filter" "$parent_filter" "$auto_stash_dirty" "$max_retries" "$interval"
     rc=$?
     set -e
 
@@ -646,6 +831,11 @@ loop_cmd() {
       exit 0
     fi
 
+    if [[ $rc -eq 3 ]]; then
+      log "Filtered task exceeded retry limit; exiting loop to avoid hot-poll"
+      exit 0
+    fi
+
     if [[ $rc -eq 2 ]]; then
       sleep "$interval"
       continue