← Back to task

Commit 72e886a2

commit 72e886a2a5f38b80b5dd099bd2c613fdf7c39e29
Author: Ben Sima <ben@bensima.com>
Date:   Thu Jan 1 00:35:46 2026

    Add retry logic for transient API errors (t-304)
    
    - Add RetryConfig with max attempts, base delay, and max delay cap
    - Add retryWithBackoff helper with exponential backoff
    - Wrap chatWithUsage in retry logic
    - Retry on 429, 500, 502, 503, 504 status codes
    - Retry on timeout/rate limit/overloaded error messages
    - Default: 3 attempts, 1s base delay, 30s max delay
    
    Task-Id: t-304

diff --git a/Omni/Agent/Engine.hs b/Omni/Agent/Engine.hs
index 90edcac9..d9940cf2 100644
--- a/Omni/Agent/Engine.hs
+++ b/Omni/Agent/Engine.hs
@@ -593,8 +593,48 @@ data ChatResult = ChatResult
   }
   deriving (Show, Eq)
 
+-- | Retry configuration for transient API errors, consumed by 'retryWithBackoff'
+data RetryConfig = RetryConfig
+  { retryMaxAttempts :: Int,    -- ^ Maximum number of attempts, counting the first try (3 means at most 2 retries)
+    retryBaseDelayMs :: Int,    -- ^ Delay before the first retry, in milliseconds; doubles on each later retry
+    retryMaxDelayMs :: Int      -- ^ Cap applied to every computed delay, in milliseconds
+  }
+  deriving (Show, Eq)
+
+-- | Default retry policy: up to 3 attempts, 1s before the first retry, never more than 30s per wait
+defaultRetryConfig :: RetryConfig
+defaultRetryConfig = RetryConfig {retryMaxAttempts = 3, retryBaseDelayMs = 1000, retryMaxDelayMs = 30000}
+
+-- | Check if an error message indicates a transient/retryable error: either a
+-- retryable HTTP status code (429, 500, 502, 503, 504) appearing as a complete number,
+-- so that e.g. @5000@ is not mistaken for a 500, or a timeout/rate-limit/overload message.
+isRetryableError :: Text -> Bool
+isRetryableError err =
+  any (`elem` ["429", "500", "502", "503", "504"]) (Text.split (\c -> c < '0' || c > '9') err)
+    || any (`Text.isInfixOf` Text.toLower err) ["timeout", "rate limit", "overloaded"]
+
+-- | Retry an IO action with exponential backoff while its 'Left' passes 'isRetryableError' and attempts remain
+retryWithBackoff :: RetryConfig -> IO (Either Text a) -> IO (Either Text a)
+retryWithBackoff cfg action = go 1
+  where
+    -- Delay after failed attempt number @attempt@, doubling each time up to the cap. Done in Integer with a
+    -- clamped exponent because Int @2 ^ n@ wraps for large n, which would silently skip the delay.
+    delayMsFor :: Int -> Int
+    delayMsFor attempt = fromInteger (min (toInteger (retryMaxDelayMs cfg)) (toInteger (retryBaseDelayMs cfg) * 2 ^ min 62 (attempt - 1)))
+    go attempt = do
+      result <- action
+      case result of
+        Right _ -> pure result
+        Left err
+          | attempt < retryMaxAttempts cfg && isRetryableError err -> threadDelay (delayMsFor attempt * 1000) >> go (attempt + 1)
+          | otherwise -> pure (Left err) -- Non-retryable error, or attempts exhausted (the action always runs once)
+
 chatWithUsage :: LLM -> [Tool] -> [Message] -> IO (Either Text ChatResult)
-chatWithUsage llm tools messages = do
+chatWithUsage llm tools messages = retryWithBackoff defaultRetryConfig (chatWithUsageOnce llm tools messages) -- every retry re-runs the whole single-attempt action
+
+-- | Single attempt at chat completion (no retry)
+chatWithUsageOnce :: LLM -> [Tool] -> [Message] -> IO (Either Text ChatResult)
+chatWithUsageOnce llm tools messages = do
   let url = Text.unpack (llmBaseUrl llm) <> "/chat/completions"
   req0 <- HTTP.parseRequest url
   let toolApis = [encodeToolForApi t | not (null tools), t <- tools]