← Back to task

Commit 548cbd5f

commit 548cbd5fe5e52086e7ec897b3d476b9a6a48a387
Author: Ben Sima <ben@bensima.com>
Date:   Thu Jan 1 15:06:40 2026

    Fix orchestrator async thread silent death (t-330)
    
    The orchestrator async thread could die silently without notifying
    the user or logging the error. This happened because:
    1. Exceptions before the try block weren't caught
    2. No finally block to ensure cleanup
    3. Async exceptions could kill the thread silently
    
    Fixed by:
    1. Wrapping entire async body in Exception.finally for cleanup
    2. Moving exception handling to catch ALL exceptions
    3. Separated runOrchestratorThread for cleaner structure
    4. Added logging on both success and failure paths
    5. Cleanup (unregisterOrchestrator) now runs regardless of outcome
    
    Now if the orchestrator crashes for any reason:
    - User gets a Telegram notification
    - Error is logged to journalctl
    - Orchestrator is unregistered from the active registry
    
    Task-Id: t-330

diff --git a/Omni/Agent/Telegram/Orchestrator.hs b/Omni/Agent/Telegram/Orchestrator.hs
index 78a7158d..3b651239 100644
--- a/Omni/Agent/Telegram/Orchestrator.hs
+++ b/Omni/Agent/Telegram/Orchestrator.hs
@@ -64,6 +64,7 @@ import qualified Control.Concurrent.STM as STM
 import qualified Data.Map.Strict as Map
 import qualified Data.Time as Time
 import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exception
 
 -- | Configuration for an orchestrator run
 data OrchestratorConfig = OrchestratorConfig
@@ -208,59 +209,86 @@ formatPhase tid maxIter = \case
     "❌ " <> tid <> " failed: " <> Text.take 200 err
 
 -- | Spawn orchestrator as async thread, returns immediately
+--
+-- The async thread is wrapped in comprehensive exception handling:
+-- 1. finally block ensures cleanup (unregister) always happens
+-- 2. try/catch around entire body catches all synchronous exceptions
+-- 3. Errors are logged AND sent to Telegram so user is always notified
 spawnOrchestrator :: Types.TelegramConfig -> OrchestratorConfig -> IO (Async.Async ())
 spawnOrchestrator tgCfg cfg = do
   putText <| "spawnOrchestrator: About to spawn async for " <> orchTaskId cfg
   hFlush stdout
   
-  -- Look up task title for status display
+  -- Look up task title for status display (do this BEFORE spawning)
   allTasks <- Task.loadTasks
   let mTask = Task.findTask (orchTaskId cfg) allTasks
       taskTitle = maybe "Unknown task" Task.taskTitle mTask
+      chatId = orchChatId cfg
+      tid = orchTaskId cfg
   
   Async.async <| do
-    putText <| "Orchestrator async thread started for " <> orchTaskId cfg
+    -- Wrap ENTIRE body in exception handler + finally for cleanup
+    let cleanup = do
+          putText <| "Orchestrator cleanup running for " <> tid
+          hFlush stdout
+          unregisterOrchestrator chatId
+    
+    Exception.finally (runOrchestratorThread tgCfg cfg taskTitle) cleanup
+
+-- | Inner orchestrator thread logic, separated for cleaner exception handling
+runOrchestratorThread :: Types.TelegramConfig -> OrchestratorConfig -> Text -> IO ()
+runOrchestratorThread tgCfg cfg taskTitle = do
+  let chatId = orchChatId cfg
+      tid = orchTaskId cfg
+      maxIter = orchMaxIterations cfg
+  
+  -- Catch ANY exception and report it
+  result <- try @SomeException <| do
+    putText <| "Orchestrator async thread started for " <> tid
     hFlush stdout
     
     -- Register this orchestrator
     now <- Time.getCurrentTime
     let status = OrchestratorStatus
-          { osTaskId = orchTaskId cfg,
+          { osTaskId = tid,
             osTaskTitle = taskTitle,
-            osChatId = orchChatId cfg,
+            osChatId = chatId,
             osPhase = PhaseStarting,
             osIteration = 0,
-            osMaxIterations = orchMaxIterations cfg,
+            osMaxIterations = maxIter,
             osStartedAt = now,
             osCurrentProcess = Nothing
           }
     registerOrchestrator status
     
     -- Create initial status message
-    mMsgId <- createStatusMessage tgCfg (orchChatId cfg) (orchThreadId cfg)
-      (formatPhase (orchTaskId cfg) (orchMaxIterations cfg) PhaseStarting)
+    mMsgId <- createStatusMessage tgCfg chatId (orchThreadId cfg)
+      (formatPhase tid maxIter PhaseStarting)
     putText <| "Orchestrator: createStatusMessage returned " <> tshow mMsgId
     hFlush stdout
     
     case mMsgId of
       Nothing -> do
         -- Failed to create status message, send error
-        _ <- sendMessage tgCfg (orchChatId cfg) (orchThreadId cfg)
-          ("❌ Failed to start orchestrator for " <> orchTaskId cfg <> ": couldn't create status message")
+        _ <- sendMessage tgCfg chatId (orchThreadId cfg)
+          ("❌ Failed to start orchestrator for " <> tid <> ": couldn't create status message")
         pure ()
       Just msgId -> do
-        -- Run orchestrator with status updates
-        result <- try @SomeException (runOrchestrator tgCfg cfg msgId)
-        case result of
-          Left err -> do
-            -- Orchestrator crashed - unregister and report error
-            unregisterOrchestrator (orchChatId cfg)
-            updateStatusMessage tgCfg (orchChatId cfg) msgId
-              (formatPhase (orchTaskId cfg) (orchMaxIterations cfg) (PhaseFailed (tshow err)))
-            _ <- sendMessage tgCfg (orchChatId cfg) (orchThreadId cfg)
-              ("❌ " <> orchTaskId cfg <> " orchestrator crashed: " <> Text.take 200 (tshow err))
-            pure ()
-          Right _ -> pure ()
+        -- Run the main orchestrator loop
+        runOrchestrator tgCfg cfg msgId
+  
+  -- Handle any exception that occurred
+  case result of
+    Left err -> do
+      putText <| "Orchestrator " <> tid <> " crashed with exception: " <> tshow err
+      hFlush stdout
+      -- Try to notify user (this might also fail, but at least we try)
+      _ <- try @SomeException <| sendMessage tgCfg chatId (orchThreadId cfg)
+        ("❌ " <> tid <> " orchestrator crashed: " <> Text.take 200 (tshow err))
+      pure ()
+    Right () -> do
+      putText <| "Orchestrator " <> tid <> " completed normally"
+      hFlush stdout
 
 -- | Run the orchestrator loop
 runOrchestrator :: Types.TelegramConfig -> OrchestratorConfig -> Int -> IO ()