Create the Omni/Agent/Eval.hs module for running agent evaluations to prevent regressions.
Whenever we change prompts or tools, or switch models, we need to verify that the agent still performs correctly. Evals provide automated testing of agent behavior.
-- | One evaluation scenario: what to send to the agent, which tools it gets,
-- and the behavior expected of it.
data EvalCase = EvalCase
  { evalId :: Text
  , evalName :: Text
  , evalDescription :: Text
  , evalPrompt :: Text
    -- ^ User prompt to send.
  , evalTools :: [Engine.Tool]
    -- ^ Tools available.
  , evalExpectedBehavior :: ExpectedBehavior
  , evalTimeout :: Maybe Int
    -- ^ Seconds, default 300.
  }
-- | Expected behavior of the agent for an 'EvalCase'.
data ExpectedBehavior
  = -- | Output contains this text.
    ContainsText Text
  | -- | Output matches regex.
    MatchesRegex Text
  | -- | This file was created.
    FileCreated FilePath
  | -- | File contains text.
    FileContains FilePath Text
  | -- | Agent completed without error.
    --
    -- NOTE(review): shares its name with @System.Exit.ExitSuccess@; if that
    -- module is imported unqualified here the constructor becomes ambiguous
    -- -- confirm against the module's imports.
    ExitSuccess
  | -- | Custom validation function.
    CustomCheck (AgentResult -> IO Bool)
-- | Recorded outcome of an evaluation: pass/fail flag, duration, cost, output
-- and optional error.
data EvalResult = EvalResult
  { evalResultId :: Text
  , evalResultPassed :: Bool
  , evalResultDuration :: Double
    -- ^ Seconds.
  , evalResultCost :: Double
    -- ^ Cents.
  , evalResultOutput :: Text
    -- ^ Final agent message.
  , evalResultError :: Maybe Text
    -- ^ Error if failed.
  }
-- | Run an 'EvalCase' under the given 'EngineConfig', yielding an 'EvalResult'.
--
-- NOTE(review): only the signature is given here; no binding is visible.
runEval :: EngineConfig -> EvalCase -> IO EvalResult
-- | Run a list of 'EvalCase's under the given 'EngineConfig'.
--
-- NOTE(review): presumably yields one 'EvalResult' per case, in input order
-- -- confirm once implemented. Only the signature is given here.
runEvalSuite :: EngineConfig -> [EvalCase] -> IO [EvalResult]
-- | Pretty print results.
--
-- NOTE(review): only the signature is given here; the output format is not
-- specified.
printEvalResults :: [EvalResult] -> IO ()
-- | Evaluation cases for the coder agent, all run with @coderTools@.
--
-- Every case sets 'evalDescription': the field is declared on 'EvalCase', and
-- a record built without it triggers a missing-field warning and fails at
-- runtime as soon as the field is forced.
coderEvalSuite :: [EvalCase]
coderEvalSuite =
  [ EvalCase
      { evalId = "create-file"
      , evalName = "Create a simple file"
      , evalDescription = "Agent creates a new file with the requested contents"
      , evalPrompt = "Create a file at /tmp/eval-test.txt containing 'hello world'"
      , evalTools = coderTools
      , evalExpectedBehavior = FileContains "/tmp/eval-test.txt" "hello world"
      , evalTimeout = Just 60
      }
  , EvalCase
      { evalId = "edit-file"
      , evalName = "Edit existing file"
      , evalDescription = "Agent edits the contents of an existing file"
        -- NOTE(review): the prompt refers to the file the "create-file" case
        -- writes; this presumably relies on that case having run first --
        -- confirm the suite order is guaranteed, or seed the file beforehand.
      , evalPrompt = "Change 'hello' to 'goodbye' in /tmp/eval-test.txt"
      , evalTools = coderTools
      , evalExpectedBehavior = FileContains "/tmp/eval-test.txt" "goodbye"
      , evalTimeout = Just 60
      }
  , EvalCase
      { evalId = "search-codebase"
      , evalName = "Search and report"
      , evalDescription = "Agent searches the codebase and reports a file count"
      , evalPrompt = "How many Haskell files are in Omni/Agent/?"
      , evalTools = coderTools
        -- Placeholder count: replace "8" with whatever the count actually is.
        -- NOTE(review): a bare substring check on "8" would also accept
        -- output such as "18"; consider a stricter expectation.
      , evalExpectedBehavior = ContainsText "8"
      , evalTimeout = Just 120
      }
  ]
In Omni/Jr.hs, add the `eval` command:
jr eval [--suite=NAME] [--case=ID] Run agent evaluations