commit 548cbd5fe5e52086e7ec897b3d476b9a6a48a387
Author: Ben Sima <ben@bensima.com>
Date: Thu Jan 1 15:06:40 2026
Fix orchestrator async thread silent death (t-330)
The orchestrator async thread could die silently without notifying
the user or logging the error. This happened because:
1. Exceptions before the try block weren't caught
2. No finally block to ensure cleanup
3. Async exceptions could kill the thread silently
Fixed by:
1. Wrapping entire async body in Exception.finally for cleanup
2. Moving exception handling to catch ALL exceptions
3. Separated runOrchestratorThread for cleaner structure
4. Added logging on both success and failure paths
5. Cleanup (unregisterOrchestrator) now runs regardless of outcome
Now if the orchestrator crashes for any reason:
- User gets a Telegram notification
- Error is logged to journalctl
- Orchestrator is unregistered from the active registry
Task-Id: t-330
diff --git a/Omni/Agent/Telegram/Orchestrator.hs b/Omni/Agent/Telegram/Orchestrator.hs
index 78a7158d..3b651239 100644
--- a/Omni/Agent/Telegram/Orchestrator.hs
+++ b/Omni/Agent/Telegram/Orchestrator.hs
@@ -64,6 +64,7 @@ import qualified Control.Concurrent.STM as STM
import qualified Data.Map.Strict as Map
import qualified Data.Time as Time
import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exception
-- | Configuration for an orchestrator run
data OrchestratorConfig = OrchestratorConfig
@@ -208,59 +209,86 @@ formatPhase tid maxIter = \case
"❌ " <> tid <> " failed: " <> Text.take 200 err
-- | Spawn orchestrator as async thread, returns immediately
+--
+-- The async thread is wrapped in comprehensive exception handling:
+-- 1. finally block ensures cleanup (unregister) always happens
+-- 2. try/catch around entire body catches all synchronous exceptions
+-- 3. Errors are logged AND sent to Telegram so user is always notified
spawnOrchestrator :: Types.TelegramConfig -> OrchestratorConfig -> IO (Async.Async ())
spawnOrchestrator tgCfg cfg = do
putText <| "spawnOrchestrator: About to spawn async for " <> orchTaskId cfg
hFlush stdout
- -- Look up task title for status display
+ -- Look up task title for status display (do this BEFORE spawning)
allTasks <- Task.loadTasks
let mTask = Task.findTask (orchTaskId cfg) allTasks
taskTitle = maybe "Unknown task" Task.taskTitle mTask
+ chatId = orchChatId cfg
+ tid = orchTaskId cfg
Async.async <| do
- putText <| "Orchestrator async thread started for " <> orchTaskId cfg
+ -- Wrap ENTIRE body in exception handler + finally for cleanup
+ let cleanup = do
+ putText <| "Orchestrator cleanup running for " <> tid
+ hFlush stdout
+ unregisterOrchestrator chatId
+
+ Exception.finally (runOrchestratorThread tgCfg cfg taskTitle) cleanup
+
+-- | Inner orchestrator thread logic, separated for cleaner exception handling
+runOrchestratorThread :: Types.TelegramConfig -> OrchestratorConfig -> Text -> IO ()
+runOrchestratorThread tgCfg cfg taskTitle = do
+ let chatId = orchChatId cfg
+ tid = orchTaskId cfg
+ maxIter = orchMaxIterations cfg
+
+ -- Catch ANY exception and report it
+ result <- try @SomeException <| do
+ putText <| "Orchestrator async thread started for " <> tid
hFlush stdout
-- Register this orchestrator
now <- Time.getCurrentTime
let status = OrchestratorStatus
- { osTaskId = orchTaskId cfg,
+ { osTaskId = tid,
osTaskTitle = taskTitle,
- osChatId = orchChatId cfg,
+ osChatId = chatId,
osPhase = PhaseStarting,
osIteration = 0,
- osMaxIterations = orchMaxIterations cfg,
+ osMaxIterations = maxIter,
osStartedAt = now,
osCurrentProcess = Nothing
}
registerOrchestrator status
-- Create initial status message
- mMsgId <- createStatusMessage tgCfg (orchChatId cfg) (orchThreadId cfg)
- (formatPhase (orchTaskId cfg) (orchMaxIterations cfg) PhaseStarting)
+ mMsgId <- createStatusMessage tgCfg chatId (orchThreadId cfg)
+ (formatPhase tid maxIter PhaseStarting)
putText <| "Orchestrator: createStatusMessage returned " <> tshow mMsgId
hFlush stdout
case mMsgId of
Nothing -> do
-- Failed to create status message, send error
- _ <- sendMessage tgCfg (orchChatId cfg) (orchThreadId cfg)
- ("❌ Failed to start orchestrator for " <> orchTaskId cfg <> ": couldn't create status message")
+ _ <- sendMessage tgCfg chatId (orchThreadId cfg)
+ ("❌ Failed to start orchestrator for " <> tid <> ": couldn't create status message")
pure ()
Just msgId -> do
- -- Run orchestrator with status updates
- result <- try @SomeException (runOrchestrator tgCfg cfg msgId)
- case result of
- Left err -> do
- -- Orchestrator crashed - unregister and report error
- unregisterOrchestrator (orchChatId cfg)
- updateStatusMessage tgCfg (orchChatId cfg) msgId
- (formatPhase (orchTaskId cfg) (orchMaxIterations cfg) (PhaseFailed (tshow err)))
- _ <- sendMessage tgCfg (orchChatId cfg) (orchThreadId cfg)
- ("❌ " <> orchTaskId cfg <> " orchestrator crashed: " <> Text.take 200 (tshow err))
- pure ()
- Right _ -> pure ()
+ -- Run the main orchestrator loop
+ runOrchestrator tgCfg cfg msgId
+
+ -- Handle any exception that occurred
+ case result of
+ Left err -> do
+ putText <| "Orchestrator " <> tid <> " crashed with exception: " <> tshow err
+ hFlush stdout
+ -- Try to notify user (this might also fail, but at least we try)
+ _ <- try @SomeException <| sendMessage tgCfg chatId (orchThreadId cfg)
+ ("❌ " <> tid <> " orchestrator crashed: " <> Text.take 200 (tshow err))
+ pure ()
+ Right () -> do
+ putText <| "Orchestrator " <> tid <> " completed normally"
+ hFlush stdout
-- | Run the orchestrator loop
runOrchestrator :: Types.TelegramConfig -> OrchestratorConfig -> Int -> IO ()