commit 72e886a2a5f38b80b5dd099bd2c613fdf7c39e29
Author: Ben Sima <ben@bensima.com>
Date: Thu Jan 1 00:35:46 2026
Add retry logic for transient API errors (t-304)
- Add RetryConfig with max attempts, base delay, and max delay cap
- Add retryWithBackoff helper with exponential backoff
- Wrap chatWithUsage in retry logic
- Retry on 429, 500, 502, 503, 504 status codes
- Retry on timeout/rate limit/overloaded error messages
- Default: 3 attempts, 1s base delay, 30s max delay
Task-Id: t-304
diff --git a/Omni/Agent/Engine.hs b/Omni/Agent/Engine.hs
index 90edcac9..d9940cf2 100644
--- a/Omni/Agent/Engine.hs
+++ b/Omni/Agent/Engine.hs
@@ -593,8 +593,48 @@ data ChatResult = ChatResult
}
deriving (Show, Eq)
+-- | Limits that govern how transient API errors are retried.
+data RetryConfig = RetryConfig
+  { -- | Maximum number of attempts (including first try)
+    retryMaxAttempts :: Int,
+    -- | Initial delay in milliseconds
+    retryBaseDelayMs :: Int,
+    -- | Maximum delay cap in milliseconds
+    retryMaxDelayMs :: Int
+  }
+  deriving (Show, Eq)
+
+-- | Default retry settings: three attempts in total, a 1 second initial
+-- delay, and no single delay longer than 30 seconds.
+defaultRetryConfig :: RetryConfig
+defaultRetryConfig =
+  RetryConfig
+    { retryMaxAttempts = 3,
+      retryBaseDelayMs = 1000,
+      retryMaxDelayMs = 30000
+    }
+
+-- NOTE: There is no separate HTTP status-code predicate; retryable status
+-- codes are recognised from the error text by 'isRetryableError'.
+
+-- | Check if an error message indicates a transient/retryable error.
+--
+-- A message is retryable when it mentions one of the HTTP status codes 429,
+-- 500, 502, 503 or 504, or when it contains (case-insensitively) "timeout",
+-- "rate limit" or "overloaded". A status code only counts when it is not
+-- directly adjacent to another digit, so unrelated numbers such as "5000" or
+-- "14290" are not mistaken for a status.
+isRetryableError :: Text -> Bool
+isRetryableError err =
+  any mentionsStatus ["429", "500", "502", "503", "504"]
+    || any (`Text.isInfixOf` Text.toLower err) ["timeout", "rate limit", "overloaded"]
+  where
+    -- True when at least one occurrence of the code stands alone as a number.
+    mentionsStatus code = any (standsAlone code) (Text.breakOnAll code err)
+
+    -- 'Text.breakOnAll' yields (text before the match, match plus the rest),
+    -- so the neighbours are the last char of the first component and the
+    -- char right after the code in the second.
+    standsAlone code (before, matchAndRest) =
+      not (Text.any isAsciiDigit (Text.takeEnd 1 before))
+        && not (Text.any isAsciiDigit (Text.take 1 (Text.drop (Text.length code) matchAndRest)))
+
+    isAsciiDigit c = c >= '0' && c <= '9'
+
+-- | Run an IO action, re-running it with exponential backoff on transient errors.
+--
+-- The action is always run at least once. A 'Left' result is retried only
+-- while fewer than 'retryMaxAttempts' attempts have been made and
+-- 'isRetryableError' accepts the message; otherwise the most recent 'Left' is
+-- returned unchanged. The wait before retry @k@ is
+-- @retryBaseDelayMs * 2 ^ (k - 1)@ milliseconds, capped at 'retryMaxDelayMs'.
+--
+-- The delay is carried through the loop and doubled with saturation instead
+-- of being recomputed with @2 ^ n@: that product overflows 'Int' for large
+-- attempt counts and would collapse the backoff to a zero or negative delay.
+retryWithBackoff :: RetryConfig -> IO (Either Text a) -> IO (Either Text a)
+retryWithBackoff cfg action = go 1 firstDelayMs
+  where
+    -- Negative settings are treated as "no delay".
+    capMs = max 0 (retryMaxDelayMs cfg)
+    firstDelayMs = min capMs (max 0 (retryBaseDelayMs cfg))
+
+    -- Same value as @min capMs (d * 2)@, but cannot overflow.
+    nextDelayMs d
+      | d > capMs `div` 2 = capMs
+      | otherwise = d * 2
+
+    -- Largest millisecond count that still fits in 'Int' as microseconds.
+    maxSleepMs = maxBound `div` 1000
+
+    go attempt delayMs = do
+      result <- action
+      case result of
+        Right _ -> pure result
+        Left err
+          | attempt < retryMaxAttempts cfg && isRetryableError err -> do
+              -- threadDelay takes microseconds.
+              threadDelay (min maxSleepMs delayMs * 1000)
+              go (attempt + 1) (nextDelayMs delayMs)
+          -- Out of attempts, or the error is not transient.
+          | otherwise -> pure result
+
+-- | Run 'chatWithUsageOnce' under 'retryWithBackoff' with 'defaultRetryConfig'.
chatWithUsage :: LLM -> [Tool] -> [Message] -> IO (Either Text ChatResult)
-chatWithUsage llm tools messages = do
+chatWithUsage llm tools messages = retryWithBackoff defaultRetryConfig singleAttempt
+  where
+    -- One un-retried request; the retry helper re-runs it on each attempt.
+    singleAttempt = chatWithUsageOnce llm tools messages
+
+-- | Single attempt at chat completion (no retry)
+chatWithUsageOnce :: LLM -> [Tool] -> [Message] -> IO (Either Text ChatResult)
+chatWithUsageOnce llm tools messages = do
let url = Text.unpack (llmBaseUrl llm) <> "/chat/completions"
req0 <- HTTP.parseRequest url
let toolApis = [encodeToolForApi t | not (null tools), t <- tools]