← Back to task

Commit b697c457

commit b697c457c5a67317b3ef17dd8a4bfac1aee7b814
Author: Coder Agent <coder@agents.omni>
Date:   Tue Feb 17 12:07:39 2026

    agentd: fix log handle close, webhook validation, graceful shutdown
    
    t-562: Log handle close on pi crash
    - readPiOutput/readPiStderr now track log handle health via IORef
    - After first write failure, stop all future log writes (no more
      hundreds of 'handle is closed' errors in journalctl)
    - Also stop reading on non-EOF pipe errors (broken pipe = dead process)
    
    t-563: Webhook URL validation
    - New validateWebhookUrl function checks http:// or https:// prefix
    - spawnHandler returns 400 for invalid webhook URLs at spawn time
    - spawnPiAgent also validates and records failure if invalid
    - Prevents confusing InvalidUrlException later in agent lifecycle
    
    t-564: Graceful SIGTERM shutdown
    - Signal handler uses MVar to prevent double-shutdown races
    - Calls gracefulShutdown then exitSuccess (previously Warp kept running)
    - gracefulShutdown wraps stopAgentProcess in try/catch per agent
    - Closes SQLite connection on shutdown
    - Agents get quit command → 1s grace → SIGTERM sequence
    
    Task-Id: t-562
    Task-Id: t-563
    Task-Id: t-564

diff --git a/Omni/Agentd/Daemon.hs b/Omni/Agentd/Daemon.hs
index 12e28c3e..8f572a05 100644
--- a/Omni/Agentd/Daemon.hs
+++ b/Omni/Agentd/Daemon.hs
@@ -661,6 +661,13 @@ resolveWorkspace config req runId = case spawnWorkspace req of
             }
       )
 
+-- | Validate that a webhook URL is a full http/https URL (t-563).
+validateWebhookUrl :: Maybe Text -> Either Text (Maybe Text)
+validateWebhookUrl Nothing = Right Nothing
+validateWebhookUrl (Just url)
+  | Text.isPrefixOf "http://" url || Text.isPrefixOf "https://" url = Right (Just url)
+  | otherwise = Left ("Invalid webhook URL: must start with http:// or https://, got: " <> Text.take 100 url)
+
 -- | Spawn a pi agent directly (no pi_rpc.py wrapper)
 spawnPiAgent :: DaemonState -> SpawnRequest -> IO Text
 spawnPiAgent state req = do
@@ -670,8 +677,29 @@ spawnPiAgent state req = do
       logRoot = dcLogRoot config
       model = fromMaybe defaultModel (spawnModel req)
       piPath = dcPiPath config
-      webhookUrl = spawnWebhook req
 
+  -- Validate webhook URL before proceeding (t-563)
+  case validateWebhookUrl (spawnWebhook req) of
+    Left err -> do
+      insertAgent conn runId (spawnPrompt req) "" Nothing
+      now <- Time.getCurrentTime
+      updateAgentCompleted conn runId now (Just err) Nothing Nothing
+      pure runId
+    Right webhookUrl -> spawnPiAgentInner state req runId conn config logRoot model piPath webhookUrl
+
+-- | Inner spawn logic after validation
+spawnPiAgentInner ::
+  DaemonState ->
+  SpawnRequest ->
+  Text ->
+  SQL.Connection ->
+  DaemonConfig ->
+  FilePath ->
+  Text ->
+  FilePath ->
+  Maybe Text ->
+  IO Text
+spawnPiAgentInner state req runId conn config logRoot model piPath webhookUrl = do
   -- Create log directory for this run
   let runDir = logRoot </> Text.unpack runId
   Dir.createDirectoryIfMissing True runDir
@@ -862,34 +890,39 @@ updateWorkspaceSizeFromAgent conn runId = do
     sizeBytes <- calculateWorkspaceSize (Text.unpack (wrPath workspace))
     updateWorkspaceSize conn runId sizeBytes
 
--- | Read pi output, update status, log events, send webhooks
+-- | Read pi output, update status, log events, send webhooks.
+-- Uses an IORef to track whether the log handle is still usable,
+-- so we stop writing after the handle closes (t-562).
 readPiOutput :: DaemonState -> Text -> IO.Handle -> IO.Handle -> TVar AgentStatus -> Maybe Text -> IO ()
-readPiOutput state runId stdoutH logH statusVar webhookUrl =
-  loop Nothing 0
+readPiOutput state runId stdoutH logH statusVar webhookUrl = do
+  logOk <- IORef.newIORef True
+  loop logOk Nothing 0
   where
-    loop :: Maybe Text -> Int -> IO ()
-    loop lastSummary totalCostCents = do
+    loop :: IORef.IORef Bool -> Maybe Text -> Int -> IO ()
+    loop logOk lastSummary totalCostCents = do
       readResult <- Exception.try @Exception.IOException (BS.hGetLine stdoutH)
       case readResult of
         Left err
           | IOError.isEOFError err -> pure ()
           | otherwise -> do
               TextIO.hPutStrLn IO.stderr <| "Failed to read pi output: " <> tshow err
-              loop lastSummary totalCostCents
+              pure () -- Stop reading on non-EOF errors (pipe broken = process dead)
         Right lineBytes -> do
           let line = TextEncoding.decodeUtf8With TextEncodingError.lenientDecode lineBytes
-          -- Log to file
-          logResult <-
-            Exception.try @SomeException <| do
-              TextIO.hPutStrLn logH line
-              IO.hFlush logH
-          case logResult of
-            Left err -> TextIO.hPutStrLn IO.stderr <| "Failed to write pi log: " <> tshow err
-            Right () -> pure ()
+          -- Log to file (skip if handle already known to be closed)
+          canLog <- IORef.readIORef logOk
+          when canLog <| do
+            logResult <-
+              Exception.try @SomeException <| do
+                TextIO.hPutStrLn logH line
+                IO.hFlush logH
+            case logResult of
+              Left _err -> IORef.writeIORef logOk False -- Stop future log writes
+              Right () -> pure ()
 
           -- Parse and process event
           case Aeson.decode (BL.fromStrict (encodeUtf8 line)) :: Maybe Aeson.Value of
-            Nothing -> loop lastSummary totalCostCents
+            Nothing -> loop logOk lastSummary totalCostCents
             Just event -> do
               processResult <-
                 Exception.try @SomeException
@@ -897,30 +930,36 @@ readPiOutput state runId stdoutH logH statusVar webhookUrl =
               case processResult of
                 Left err -> do
                   TextIO.hPutStrLn IO.stderr <| "Failed to process pi event: " <> tshow err
-                  loop lastSummary totalCostCents
-                Right (newSummary, newCost) -> loop newSummary newCost
+                  loop logOk lastSummary totalCostCents
+                Right (newSummary, newCost) -> loop logOk newSummary newCost
 
+-- | Read pi stderr and write to log file.
+-- Stops writing to log if the handle is closed (t-562).
 readPiStderr :: IO.Handle -> IO.Handle -> IO ()
-readPiStderr logH stderrH = loop
+readPiStderr logH stderrH = do
+  logOk <- IORef.newIORef True
+  loop logOk
   where
-    loop = do
+    loop logOk = do
       readResult <- Exception.try @Exception.IOException (BS.hGetLine stderrH)
       case readResult of
         Left err
           | IOError.isEOFError err -> pure ()
           | otherwise -> do
               TextIO.hPutStrLn IO.stderr <| "Failed to read pi stderr: " <> tshow err
-              loop
+              pure () -- Stop on non-EOF errors
         Right lineBytes -> do
           let line = TextEncoding.decodeUtf8With TextEncodingError.lenientDecode lineBytes
-          logResult <-
-            Exception.try @SomeException <| do
-              TextIO.hPutStrLn logH line
-              IO.hFlush logH
-          case logResult of
-            Left err -> TextIO.hPutStrLn IO.stderr <| "Failed to write pi log: " <> tshow err
-            Right () -> pure ()
-          loop
+          canLog <- IORef.readIORef logOk
+          when canLog <| do
+            logResult <-
+              Exception.try @SomeException <| do
+                TextIO.hPutStrLn logH line
+                IO.hFlush logH
+            case logResult of
+              Left _err -> IORef.writeIORef logOk False
+              Right () -> pure ()
+          loop logOk
 
 -- | Process a single pi event
 processEvent ::
@@ -1183,8 +1222,12 @@ server state =
 
 spawnHandler :: DaemonState -> SpawnRequest -> Handler SpawnResponse
 spawnHandler state req = do
-  runId <- liftIO <| spawnPiAgent state req
-  pure <| SpawnResponse runId "spawned"
+  -- Validate webhook URL upfront (t-563)
+  case validateWebhookUrl (spawnWebhook req) of
+    Left err -> throwError err400 {errBody = BL.fromStrict (encodeUtf8 err)}
+    Right _ -> do
+      runId <- liftIO <| spawnPiAgent state req
+      pure <| SpawnResponse runId "spawned"
 
 listHandler :: DaemonState -> Handler [AgentInfo]
 listHandler state = do
@@ -1246,17 +1289,17 @@ runDaemon config = do
             dsDbConn = conn
           }
 
-  -- Handle SIGTERM/SIGINT gracefully
-  _ <-
-    Signals.installHandler
-      Signals.sigTERM
-      (Signals.Catch (gracefulShutdown state))
-      Nothing
-  _ <-
-    Signals.installHandler
-      Signals.sigINT
-      (Signals.Catch (gracefulShutdown state))
-      Nothing
+  -- Shutdown signal (t-564): MVar to coordinate graceful exit
+  shutdownSignal <- MVar.newEmptyMVar
+
+  -- Handle SIGTERM/SIGINT gracefully (t-564)
+  let onSignal = do
+        alreadyShutting <- not </ MVar.tryPutMVar shutdownSignal ()
+        unless alreadyShutting <| do
+          gracefulShutdown state
+          Exit.exitSuccess
+  _ <- Signals.installHandler Signals.sigTERM (Signals.Catch onSignal) Nothing
+  _ <- Signals.installHandler Signals.sigINT (Signals.Catch onSignal) Nothing
 
   -- Start server
   putText <| "Starting agentd daemon on port " <> tshow (dcPort config)
@@ -1268,15 +1311,23 @@ runDaemon config = do
   where
     takeDirectory = Text.unpack <. Text.intercalate "/" <. List.init <. Text.splitOn "/" <. Text.pack
 
--- | Graceful shutdown: stop all running agents
+-- | Graceful shutdown: stop all running agents, close handles (t-564)
 gracefulShutdown :: DaemonState -> IO ()
 gracefulShutdown state = do
   putText "Shutting down agentd daemon..."
   running <- STM.atomically <| STM.readTVar (dsRunning state)
+  -- Stop all agents with a timeout
   forM_ (Map.keys running) <| \runId -> do
     putText <| "  Stopping agent: " <> runId
-    _ <- stopAgentProcess state runId
-    pure ()
+    stopResult <- try @SomeException <| stopAgentProcess state runId
+    case stopResult of
+      Left err -> putText <| "  Warning: failed to stop " <> runId <> ": " <> tshow err
+      Right _ -> pure ()
+  -- Close database connection
+  closeResult <- try @SomeException <| SQL.close (dsDbConn state)
+  case closeResult of
+    Left err -> putText <| "  Warning: failed to close DB: " <> tshow err
+    Right _ -> pure ()
   putText "Shutdown complete."
 
 -- * Tests