Add checkpoint/resume support to the Op system.
Long-running agent tasks (coding, research) can fail partway through. Checkpointing allows saving state at key points and resuming from there instead of starting over.
Read Omni/Agent/ARCHITECTURE.md for full design rationale.
-- | A saved snapshot of a run: the trace recorded so far plus the serialized
-- state, together with the trace id, sequence number and name that
-- 'checkpointPath' uses to place the checkpoint on disk.
data Checkpoint = Checkpoint
{ cpName :: Text -- human-readable checkpoint name
, cpTraceId :: TraceId -- trace this checkpoint belongs to
, cpSequence :: Int -- position in the trace
, cpTimestamp :: UTCTime -- presumably when the checkpoint was taken (confirm)
, cpTrace :: Trace -- events up to this point
, cpState :: ByteString -- serialized state
, cpStateType :: Text -- type name of the serialized state, for deserialization
}
deriving (Show, Eq, Generic)
-- The JSON instances below rely on the Generic defaults.
-- NOTE(review): aeson itself ships no ToJSON/FromJSON instances for
-- ByteString, so these only compile if the project supplies them. Confirm,
-- or store the state in a JSON-friendly form (e.g. base64-encoded Text).
instance ToJSON Checkpoint
instance FromJSON Checkpoint
-- | Save a checkpoint to a file.
--
-- NOTE(review): the 'FilePath' is presumably the checkpoint file itself
-- (e.g. a path built by 'checkpointPath'), not the workspace root. Confirm.
saveCheckpoint :: FilePath -> Checkpoint -> IO ()
-- | Load a checkpoint from a file. The result type reports failure as a
-- 'Text' message in @Left@.
loadCheckpoint :: FilePath -> IO (Either Text Checkpoint)
-- | List the checkpoints recorded for a trace.
--
-- NOTE(review): the 'FilePath' is presumably the workspace root under which
-- 'checkpointPath' places the per-trace directory. Confirm. The order of the
-- returned list is not specified here.
listCheckpoints :: FilePath -> TraceId -> IO [Checkpoint]
-- | Build the on-disk location of a checkpoint file.
--
-- Directory structure:
--
-- > {workspace}/_/checkpoints/{trace-id}/
-- >   checkpoint-{sequence}-{name}.json
--
-- Arguments are the workspace root, the trace id, the checkpoint's sequence
-- number and its human-readable name.
--
-- Path separators in the name are replaced with @_@ so that a checkpoint
-- name can never add directory components or escape the trace's directory.
checkpointPath :: FilePath -> TraceId -> Int -> Text -> FilePath
checkpointPath workspace traceId sequenceNo name =
  workspace </> "_" </> "checkpoints" </> unTraceId traceId </> fileName
  where
    -- Built separately so the file name does not depend on the relative
    -- precedence of '<>' and '</>'.
    fileName = "checkpoint-" <> show sequenceNo <> "-" <> safeName <> ".json"
    safeName = Text.unpack (Text.map replaceSeparator name)
    replaceSeparator c
      | c == '/' || c == '\\' = '_'
      | otherwise = c
In Sequential.hs and Parallel.hs:
Free (Checkpoint name k) -> do
  -- Emit the checkpoint event and record it in the trace.
  -- <timestamp> is a placeholder: capture the current time once and use
  -- that same value here and for cpTimestamp below.
  let event = EventCheckpoint name <timestamp>
  onEvent config event
  let trace' = appendEvent event trace
  -- Serialize the current state.
  stateBytes <- serializeState state
  -- NOTE(review): the record constructor below shares its name with the Op
  -- constructor matched above; one of them presumably needs a qualified
  -- import in the interpreter modules. Confirm.
  let cp =
        Checkpoint
          { cpName = name
          , cpTraceId = currentTraceId
          , cpSequence = length (traceEvents trace')
          , cpTimestamp = <timestamp>
          , cpTrace = trace'
          , cpState = stateBytes
            -- cpStateType is Text, so render the TypeRep
            -- (needs a Typeable constraint on the state type).
          , cpStateType = tshow (typeOf state)
          }
  -- saveCheckpoint takes the checkpoint file, not the workspace root, so
  -- derive the file path from the workspace, trace id, sequence and name.
  saveCheckpoint (checkpointPath workspace currentTraceId (cpSequence cp) name) cp
  -- Continue with the rest of the program, using the extended trace.
  interpret config state trace' k
-- | Resume execution from a checkpoint.
--
-- Decodes the state stored in the checkpoint and hands it, together with the
-- checkpoint's trace and sequence number, to 'resumeFrom'. If decoding fails
-- the result is @Left@ with the decoder's message prefixed by
-- "Failed to deserialize checkpoint state: ".
--
-- The third argument is the original program. Locating the point inside it
-- at which to continue is the hard part and is delegated to 'resumeFrom'.
resume
  :: (FromJSON s, Typeable s)
  => SeqConfig
  -> Checkpoint
  -> Op s a -- the original program
  -> IO (Either Text (a, Trace, s))
resume config cp program =
  either decodeFailed continueWith (deserializeState (cpState cp))
  where
    decodeFailed err =
      pure (Left ("Failed to deserialize checkpoint state: " <> err))
    -- The continuation trace starts from the checkpoint's own trace.
    continueWith restored =
      resumeFrom config restored (cpTrace cp) (cpSequence cp) program
-- | Fast-forward a program to the checkpoint position (design sketch).
--
-- Intended behaviour: run the program but skip actual execution until the
-- checkpoint is reached. The body below does not do that yet; see the notes.
resumeFrom
:: SeqConfig -> s -> Trace -> Int -> Op s a
-> IO (Either Text (a, Trace, s))
resumeFrom config state trace targetSeq program = do
-- Options considered:
--   1. Replay trace events to reconstruct state.
--   2. Store the "remaining program" in the checkpoint (hard to serialize).
--   3. Re-run the program with a mock provider until the checkpoint.
-- Option 1 looks simplest because the checkpoint already carries the state
-- and the trace, so in principle execution can just continue from there.
-- NOTE(review): the call below uses neither `trace` nor `targetSeq`; it runs
-- the whole program again, starting from the restored state.
runSequential config state program
-- Because the call above re-runs from the beginning, checkpoints must sit at
-- explicit boundaries in the program, for example:
--   setup >> checkpoint "ready" >> work >> checkpoint "done" >> finish
-- Resuming from "ready" then means skipping "setup", which requires the
-- program itself to be structured for resumability.
...
-- | Where a resumable program should start: from the beginning ('Fresh') or
-- after the named checkpoint.
--
-- 'Eq' is derived because 'resumableProgram' compares resume points. No
-- 'Ord' instance is provided on purpose: a derived ordering would compare
-- checkpoint names alphabetically and would sort 'Fresh' after every
-- 'ResumePoint', which is not program order.
data ResumePoint = ResumePoint Text | Fresh
  deriving (Show, Eq)

-- | A program structured for checkpointing.
--
-- Each phase lists explicitly the resume points from which it still has to
-- run; a phase whose checkpoint has already been passed is skipped.
--
-- * 'Fresh'                     : setup, work, finish
-- * @ResumePoint "after-setup"@ : work, finish
-- * @ResumePoint "after-work"@  : finish
resumableProgram :: ResumePoint -> Op s a
resumableProgram resumePoint = do
  -- Phase 1: only needed when starting from scratch.
  when (resumePoint == Fresh) $ do
    setupStuff
    checkpoint "after-setup"
  -- Phase 2: needed on a fresh run or when resuming right after setup.
  when (resumePoint `elem` [Fresh, ResumePoint "after-setup"]) $ do
    doWork
    checkpoint "after-work"
  -- Phase 3: always runs.
  finish
-- Alternative: use labels.

-- | Phases of a program, declared in execution order so that the derived
-- 'Ord' instance can answer "is this phase before that checkpoint?".
data ProgramState = Setup | Working | Finishing
  deriving (Show, Eq, Ord)

-- | Record a checkpoint named after the given phase.
-- The name is the phase rendered with 'tshow', which is why 'ProgramState'
-- derives 'Show'.
checkpoint' :: ProgramState -> Op s ()
checkpoint' phase = checkpoint (tshow phase)
-- On resume, skip phases before the checkpoint
-- In agentd or agent CLI:
-- agentd resume <checkpoint-file>
-- agentd list-checkpoints <trace-id>
-- agentd checkpoint-info <checkpoint-file>
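1. Continuation serialization - Free monad continuations are functions, so they cannot be serialized; a checkpoint cannot store the "remaining program".
2. State serialization - The state type needs serialization instances (`resume` requires `FromJSON s`, and saving needs the matching encoder).
3. Provider state - The provider might have internal state (token counts, etc.) that is not captured in the checkpoint.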