commit 40b93224adfa973dc1734c11d09e17c14a1b10a2
Author: Coder Agent <coder@agents.omni>
Date: Wed Feb 11 17:22:51 2026
Omni/Ide: add per-task retry circuit breaker to dev loop
- add --max-retries with default 5 (0 disables limit)
- record failed retry attempts in task comments with role+patchset tags
- make retry accounting patchset-aware with legacy patchset-1 fallback
- open circuit when retries exhausted and annotate task once
- prevent hot-poll when --task-id points to exhausted task
- add exponential backoff between failed dev retries (capped)
- document max-retries and retry behavior in workflow docs
Task-Id: t-587.2
diff --git a/Omni/Ide/DEV_REVIEW_RELEASE.md b/Omni/Ide/DEV_REVIEW_RELEASE.md
index 4eb1e046..135a440c 100644
--- a/Omni/Ide/DEV_REVIEW_RELEASE.md
+++ b/Omni/Ide/DEV_REVIEW_RELEASE.md
@@ -44,6 +44,7 @@ Optional flags:
- `--root _/worktrees/my-flow`
- `--parent t-587` (scope to one epic)
- `--task-id t-587.1` (scope to one task)
+- `--max-retries 5` (open the circuit breaker after N failed attempts per patchset; default 5, `0` disables the limit)
- `--no-auto-stash` (disable automatic dirty-workspace recovery)
- `--once`
- `--dry-run`
@@ -71,6 +72,11 @@ Omni/Ide/dev-review-release.sh cleanup-branches --apply
When dev produces a new task-branch commit SHA, it automatically increments
`patchset_count` via `task patchset <id> --increment`.
+Retry accounting and circuit-breaker behavior are patchset-aware:
+- Failed attempts are tracked per role and patchset.
+- Once failed attempts for a patchset reach `--max-retries`, the circuit opens for that patchset and new runs are skipped.
+- Dev retries use exponential backoff between failed attempts: the wait starts at the loop interval and doubles with each further failed attempt, capped at 600 seconds.
+
Expected lifecycle:
`open -> in-progress -> review -> approved -> done`
diff --git a/Omni/Ide/README.md b/Omni/Ide/README.md
index 2fb49236..c52041e7 100644
--- a/Omni/Ide/README.md
+++ b/Omni/Ide/README.md
@@ -76,6 +76,9 @@ Omni/Ide/dev-review-release.sh loop --role dev --task-id t-587.1
# Disable dirty-workspace auto-stash recovery
Omni/Ide/dev-review-release.sh loop --role dev --no-auto-stash
+
+# Cap failed retries per patchset (dev loop circuit breaker)
+Omni/Ide/dev-review-release.sh loop --role dev --max-retries 5
```
The convention for all programs in the omnirepo is to run their tests if the first argument is `test`. So for example:
diff --git a/Omni/Ide/dev-review-release.sh b/Omni/Ide/dev-review-release.sh
index 32057617..8ed0c537 100755
--- a/Omni/Ide/dev-review-release.sh
+++ b/Omni/Ide/dev-review-release.sh
@@ -10,6 +10,7 @@ DEFAULT_INTERVAL_SECONDS=20
DEFAULT_TIMEOUT_SECONDS=1800
DEFAULT_MAX_ITER=80
DEFAULT_MAX_COST_CENTS=300
+DEFAULT_MAX_RETRIES=5
usage() {
cat <<'EOF'
@@ -44,6 +45,7 @@ Loop options:
--timeout SEC agentd timeout per run (default: 1800)
--max-iter N agentd max iterations per run (default: 80)
--max-cost CENTS agentd max cost cents per run (default: 300)
+ --max-retries N Max failed attempts per task patchset before skipping (default: 5, 0 = unlimited)
--no-auto-stash Disable auto-stashing dirty workspace state before retry
--once Process at most one task, then exit
--dry-run Print what would run, do not invoke agentd
@@ -248,6 +250,157 @@ auto_stash_workspace() {
return 0
}
+task_patchset_count() {
+ local tid="$1"
+ task show "$tid" --json | jq -r '.taskPatchsetCount // .patchset_count // 0'
+}
+
+get_retry_count() {
+ local tid="$1"
+ local role="$2"
+ local patchset="$3"
+
+ local comments_json
+ if ! comments_json="$(task show "$tid" --json 2>/dev/null)"; then
+ echo "0"
+ return
+ fi
+
+ local patchset_pattern="Automation \\($role\\) patchset $patchset attempt ([0-9]+)/[0-9]+ failed"
+ local legacy_pattern="Automation \\($role\\) attempt ([0-9]+)/[0-9]+ failed"
+ local max_attempt=0
+
+ while IFS= read -r comment_text; do
+ if [[ "$comment_text" =~ $patchset_pattern ]]; then
+ local attempt="${BASH_REMATCH[1]}"
+ if [[ "$attempt" -gt "$max_attempt" ]]; then
+ max_attempt="$attempt"
+ fi
+ continue
+ fi
+
+ # Backward-compat for existing patchset-1 comments emitted before patchset tagging.
+ if [[ "$patchset" -eq 1 && "$comment_text" =~ $legacy_pattern ]]; then
+ local legacy_attempt="${BASH_REMATCH[1]}"
+ if [[ "$legacy_attempt" -gt "$max_attempt" ]]; then
+ max_attempt="$legacy_attempt"
+ fi
+ fi
+ done < <(jq -r '.taskComments[]?.commentText // empty' <<<"$comments_json" 2>/dev/null)
+
+ echo "$max_attempt"
+}
+
+retry_exceeded_comment_exists() {
+ local tid="$1"
+ local role="$2"
+ local patchset="$3"
+
+ local comments_json
+ if ! comments_json="$(task show "$tid" --json 2>/dev/null)"; then
+ return 1
+ fi
+
+ local marker="Automation ($role) patchset $patchset exceeded max retries"
+ jq -r '.taskComments[]?.commentText // empty' <<<"$comments_json" | grep -Fq "$marker"
+}
+
+task_exceeded_retries() {
+ local tid="$1"
+ local role="$2"
+ local patchset="$3"
+ local max_retries="$4"
+
+ if [[ "$max_retries" -eq 0 ]]; then
+ return 1
+ fi
+
+ local retry_count
+ retry_count="$(get_retry_count "$tid" "$role" "$patchset")"
+
+ if [[ "$retry_count" -ge "$max_retries" ]]; then
+ return 0
+ fi
+
+ return 1
+}
+
+# Print how many seconds of retry backoff remain for a task patchset.
+#
+# The backoff is derived from the most recent patchset-tagged failure comment:
+# it starts at the base interval, doubles for each further failed attempt,
+# and is capped at 600 seconds.
+#
+# Arguments:
+#   $1 - task id
+#   $2 - role
+#   $3 - patchset number
+#   $4 - base interval in seconds
+# Outputs: remaining seconds on stdout; 0 when the task cannot be read, no
+#          tagged failure comment exists, or the backoff has already elapsed.
+retry_backoff_remaining_seconds() {
+  local tid="$1"
+  local role="$2"
+  local patchset="$3"
+  local base_interval="$4"
+
+  local comments_json
+  if ! comments_json="$(task show "$tid" --json 2>/dev/null)"; then
+    echo "0"
+    return
+  fi
+
+  local failure_pattern="Automation \\($role\\) patchset $patchset attempt ([0-9]+)/[0-9]+ failed"
+  local latest_attempt=0
+  local latest_ts=""
+
+  # Each input line is "<created-at>\t<text>"; keep the timestamp of the
+  # highest-numbered failed attempt (the last one wins on ties).
+  while IFS=$'\t' read -r comment_ts comment_text; do
+    if [[ "$comment_text" =~ $failure_pattern ]]; then
+      local attempt="${BASH_REMATCH[1]}"
+      if [[ "$attempt" -ge "$latest_attempt" ]]; then
+        latest_attempt="$attempt"
+        latest_ts="$comment_ts"
+      fi
+    fi
+  done < <(jq -r '.taskComments[]? | [.commentCreatedAt, .commentText] | @tsv' <<<"$comments_json" 2>/dev/null)
+
+  if [[ "$latest_attempt" -le 0 || -z "$latest_ts" ]]; then
+    echo "0"
+    return
+  fi
+
+  # Double the base interval once per failed attempt after the first, stopping
+  # as soon as the cap is reached. Computing base * 2 ** (attempt - 1) directly
+  # wraps around in shell integer arithmetic for large attempt numbers, which
+  # yields a bogus (possibly negative) delay that slips past the cap and
+  # silently disables the backoff.
+  local max_delay=600
+  local delay="$base_interval"
+  local step
+  for ((step = 1; step < latest_attempt; step++)); do
+    if [[ "$delay" -le 0 || "$delay" -ge "$max_delay" ]]; then
+      break
+    fi
+    delay=$((delay * 2))
+  done
+  if [[ "$delay" -gt "$max_delay" ]]; then
+    delay="$max_delay"
+  fi
+
+  local now ts_epoch elapsed remaining
+  now="$(date +%s)"
+  # An unparseable timestamp falls back to epoch 0, which makes the backoff
+  # look long expired (fail-open).
+  # NOTE(review): `date -d` is the GNU form — confirm the target platforms.
+  ts_epoch="$(date -d "$latest_ts" +%s 2>/dev/null || echo 0)"
+  elapsed=$((now - ts_epoch))
+  remaining=$((delay - elapsed))
+
+  if [[ "$remaining" -gt 0 ]]; then
+    echo "$remaining"
+  else
+    echo "0"
+  fi
+}
+
+record_retry_attempt() {
+ local tid="$1"
+ local role="$2"
+ local patchset="$3"
+ local run_name="$4"
+ local max_retries="$5"
+ local failed="$6"
+
+ if [[ "$max_retries" -eq 0 || "$failed" != "true" ]]; then
+ return 0
+ fi
+
+ local retry_count
+ retry_count="$(get_retry_count "$tid" "$role" "$patchset")"
+ local attempt=$((retry_count + 1))
+
+ local comment_msg
+ comment_msg="Automation ($role) patchset $patchset attempt $attempt/$max_retries failed for run $run_name."
+
+ if [[ "$attempt" -ge "$max_retries" ]]; then
+ comment_msg="$comment_msg Automation ($role) patchset $patchset exceeded max retries, needs human attention."
+ log "Task $tid patchset $patchset exceeded max retries ($attempt/$max_retries), opening circuit"
+ fi
+
+ task comment "$tid" "$comment_msg" --json >/dev/null || true
+}
+
prepare_workspace_for_task() {
local role="$1"
local workspace="$2"
@@ -337,6 +490,8 @@ run_single_task() {
local task_filter="$9"
local parent_filter="${10}"
local auto_stash_dirty="${11}"
+ local max_retries="${12}"
+ local interval_seconds="${13}"
local tid
tid="$(select_next_task "$role" "$task_filter" "$parent_filter")"
@@ -348,6 +503,31 @@ run_single_task() {
log "Picked task $tid for role=$role"
+ local patchset_count
+ patchset_count="$(task_patchset_count "$tid")"
+
+ # Circuit breaker: skip tasks whose current patchset already exhausted retries.
+ if task_exceeded_retries "$tid" "$role" "$patchset_count" "$max_retries"; then
+ log "Task $tid patchset $patchset_count exceeded max retries ($max_retries), skipping"
+ if ! retry_exceeded_comment_exists "$tid" "$role" "$patchset_count"; then
+ task comment "$tid" "Automation ($role) patchset $patchset_count exceeded max retries, needs human attention." --json >/dev/null || true
+ fi
+ if [[ -n "$task_filter" && "$task_filter" == "$tid" ]]; then
+ return 3
+ fi
+ return 2
+ fi
+
+ # Exponential backoff between failed retries (dev role only).
+ if [[ "$role" == "dev" ]]; then
+ local backoff_remaining
+ backoff_remaining="$(retry_backoff_remaining_seconds "$tid" "$role" "$patchset_count" "$interval_seconds")"
+ if [[ "$backoff_remaining" -gt 0 ]]; then
+ log "Task $tid patchset $patchset_count in retry backoff (${backoff_remaining}s remaining), skipping"
+ return 2
+ fi
+ fi
+
local run_name
run_name="${role}-${tid}-$(date +%Y%m%d-%H%M%S)"
@@ -424,7 +604,7 @@ run_single_task() {
if [[ $rc -ne 0 ]]; then
log "Run failed for $tid (run=$run_name)"
- task comment "$tid" "Automation ($role) failed in run $run_name; inspect agentd logs/status." --json >/dev/null || true
+ record_retry_attempt "$tid" "$role" "$patchset_count" "$run_name" "$max_retries" "true"
return $rc
fi
@@ -547,6 +727,7 @@ loop_cmd() {
local timeout="$DEFAULT_TIMEOUT_SECONDS"
local max_iter="$DEFAULT_MAX_ITER"
local max_cost="$DEFAULT_MAX_COST_CENTS"
+ local max_retries="$DEFAULT_MAX_RETRIES"
local auto_stash_dirty="true"
local once="false"
local dry_run="false"
@@ -593,6 +774,10 @@ loop_cmd() {
max_cost="$2"
shift 2
;;
+ --max-retries)
+ max_retries="$2"
+ shift 2
+ ;;
--no-auto-stash)
auto_stash_dirty="false"
shift
@@ -634,11 +819,11 @@ loop_cmd() {
fi
log "Starting $role loop"
- log "workspace=$workspace base=$base_branch interval=${interval}s dry_run=$dry_run auto_stash_dirty=$auto_stash_dirty task_filter=${task_filter:-<none>} parent_filter=${parent_filter:-<none>}"
+ log "workspace=$workspace base=$base_branch interval=${interval}s dry_run=$dry_run auto_stash_dirty=$auto_stash_dirty max_retries=$max_retries task_filter=${task_filter:-<none>} parent_filter=${parent_filter:-<none>}"
while true; do
set +e
- run_single_task "$role" "$workspace" "$base_branch" "$provider" "$timeout" "$max_iter" "$max_cost" "$dry_run" "$task_filter" "$parent_filter" "$auto_stash_dirty"
+ run_single_task "$role" "$workspace" "$base_branch" "$provider" "$timeout" "$max_iter" "$max_cost" "$dry_run" "$task_filter" "$parent_filter" "$auto_stash_dirty" "$max_retries" "$interval"
rc=$?
set -e
@@ -646,6 +831,11 @@ loop_cmd() {
exit 0
fi
+ if [[ $rc -eq 3 ]]; then
+ log "Filtered task exceeded retry limit; exiting loop to avoid hot-poll"
+ exit 0
+ fi
+
if [[ $rc -eq 2 ]]; then
sleep "$interval"
continue