Download the CUAD dataset and establish a single-agent baseline for contract clause extraction.
We're testing whether a swarm of agents with STM shared memory can outperform a single agent on tasks that exhaust context windows. Contract review is ideal because:

- Contracts are long: even a modest batch blows past a single agent's context window.
- CUAD ships expert annotations for 41 clause types, so we have ground truth to score against.
- The work decomposes naturally (per contract, per clause type), which is exactly what a swarm should exploit.
```bash
# Clone or download
cd Omni/Agent/Experiments
git clone https://github.com/TheAtticusProject/cuad cuad-data
# Or download from HuggingFace: https://huggingface.co/datasets/cuad
```
```haskell
-- Omni/Agent/Experiments/CUAD.hs

data Contract = Contract
  { contractId          :: Text
  , contractText        :: Text
  , contractAnnotations :: [Annotation]
  }

data Annotation = Annotation
  { annClauseType :: ClauseType
  , annText       :: Text
  , annStart      :: Int
  , annEnd        :: Int
  }

data ClauseType
  = Indemnification
  | LiabilityLimitation
  | TerminationForConvenience
  | ChangeOfControl
  | NonCompete
  | AuditRights
  | Insurance
  | MostFavoredNation
  | Confidentiality
  | Exclusivity
  -- ... (41 total; start with the 10 most common)
  deriving (Show, Eq, Ord, Enum, Bounded)

-- Load N contracts from CUAD
loadContracts :: Int -> IO [Contract]

-- Get ground truth for a contract
getGroundTruth :: Contract -> [Annotation]
```
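Neither loader is implemented above. Here's a minimal sketch of both, assuming CUAD's SQuAD-style `CUAD_v1.json` layout (a top-level `data` array of documents, each with a `title` and `paragraphs` holding a `context` plus `qas`, where every question names a clause category and every answer carries `text` and `answer_start`, and each contract is a single paragraph). The intermediate `Cuad*` records and `parseClauseType` are hypothetical names, and the category matching is only stubbed:

```haskell
{-# LANGUAGE OverloadedStrings #-}

import Data.Aeson
import qualified Data.Text as Text

-- Hypothetical: map a CUAD question/category string onto our ClauseType.
parseClauseType :: Text -> Maybe ClauseType
parseClauseType q
  | "Indemnification" `Text.isInfixOf` q = Just Indemnification
  | "Non-Compete"     `Text.isInfixOf` q = Just NonCompete
  -- ... remaining categories elided
  | otherwise                            = Nothing

-- Intermediate records mirroring the SQuAD-style JSON.
newtype CuadFile = CuadFile { cfData :: [CuadDoc] }
data CuadDoc    = CuadDoc    { cdTitle :: Text, cdParagraphs :: [CuadPara] }
data CuadPara   = CuadPara   { cpContext :: Text, cpQas :: [CuadQA] }
data CuadQA     = CuadQA     { qaQuestion :: Text, qaAnswers :: [CuadAnswer] }
data CuadAnswer = CuadAnswer { caText :: Text, caStart :: Int }

instance FromJSON CuadFile where
  parseJSON = withObject "CuadFile" $ \o -> CuadFile <$> o .: "data"
instance FromJSON CuadDoc where
  parseJSON = withObject "CuadDoc" $ \o ->
    CuadDoc <$> o .: "title" <*> o .: "paragraphs"
instance FromJSON CuadPara where
  parseJSON = withObject "CuadPara" $ \o ->
    CuadPara <$> o .: "context" <*> o .: "qas"
instance FromJSON CuadQA where
  parseJSON = withObject "CuadQA" $ \o ->
    CuadQA <$> o .: "question" <*> o .: "answers"
instance FromJSON CuadAnswer where
  parseJSON = withObject "CuadAnswer" $ \o ->
    CuadAnswer <$> o .: "text" <*> o .: "answer_start"

-- One Contract per document. With one paragraph per contract,
-- paragraph-local offsets double as document offsets.
toContract :: CuadDoc -> Contract
toContract doc = Contract
  { contractId          = cdTitle doc
  , contractText        = Text.concat (map cpContext (cdParagraphs doc))
  , contractAnnotations =
      [ Annotation ct (caText a) (caStart a)
                   (caStart a + Text.length (caText a))
      | p  <- cdParagraphs doc
      , qa <- cpQas p
      , Just ct <- [parseClauseType (qaQuestion qa)]
      , a  <- qaAnswers qa
      ]
  }

loadContracts :: Int -> IO [Contract]
loadContracts n = do
  parsed <- eitherDecodeFileStrict "cuad-data/CUAD_v1.json"
  case parsed of
    Left err -> fail ("CUAD parse error: " <> err)
    Right cf -> pure (take n (map toContract (cfData cf)))

getGroundTruth :: Contract -> [Annotation]
getGroundTruth = contractAnnotations
```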
Look at CUAD stats and pick the 10 most commonly occurring clause types. These will be our extraction targets.
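A quick way to get those counts from the loaded data itself; a small sketch that relies on the `Ord` instance derived on `ClauseType`:

```haskell
import Data.List (sortOn)
import qualified Data.Map.Strict as Map
import Data.Ord (Down (..))

-- Annotation counts per clause type across a sample, most common first.
clauseFrequencies :: [Contract] -> [(ClauseType, Int)]
clauseFrequencies contracts =
  sortOn (Down . snd) . Map.toList $
    Map.fromListWith (+)
      [ (annClauseType a, 1 :: Int)
      | c <- contracts
      , a <- contractAnnotations c
      ]
```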
```haskell
-- Omni/Agent/Experiments/ContractReview.hs

reviewContract :: Provider -> Contract -> IO [Finding]
reviewContract provider contract = do
  -- Use code-only approach: think + execute
  code   <- think provider (reviewPrompt contract)
  result <- execute sandbox code
  parseFindings result

data Finding = Finding
  { findingClauseType :: ClauseType
  , findingText       :: Text
  , findingConfidence :: Double
  , findingNotes      :: Text
  }
```
```haskell
reviewPrompt :: Contract -> Text
reviewPrompt contract = Text.unlines
  [ "Review this contract and extract all instances of:"
  , "- Indemnification clauses"
  , "- Liability limitations"
  , "- Termination for convenience"
  , "- Change of control provisions"
  , "- Non-compete clauses"
  , "[etc for top 10 types]"
  , ""
  , "Contract text:"
  , contractText contract
  , ""
  , "Output JSON: [{\"clause_type\": ..., \"text\": ..., \"notes\": ...}, ...]"
  ]
```
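`parseFindings` only has to decode that JSON array back into `Finding`s. A sketch, assuming `execute` hands back the program's stdout as `Text`, reusing the hypothetical `parseClauseType` from the loader sketch, and defaulting confidence to 1.0 since the prompt never asks the model for one:

```haskell
import Data.Aeson
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Text as Text
import Data.Text.Encoding (encodeUtf8)

instance FromJSON Finding where
  parseJSON = withObject "Finding" $ \o -> do
    raw <- o .: "clause_type"
    ct  <- case parseClauseType raw of
             Just t  -> pure t
             Nothing -> fail ("unknown clause type: " <> Text.unpack raw)
    txt   <- o .: "text"
    notes <- o .:? "notes" .!= ""
    -- Confidence defaulted; the prompt doesn't request one.
    pure (Finding ct txt 1.0 notes)

parseFindings :: Text -> IO [Finding]
parseFindings out =
  case eitherDecode (LBS.fromStrict (encodeUtf8 out)) of
    Left err -> fail ("could not parse findings: " <> err)
    Right fs -> pure fs
```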
```haskell
data EvalResult = EvalResult
  { precision        :: Double  -- correct findings / total findings
  , recall           :: Double  -- correct findings / total ground truth
  , f1               :: Double
  , perClauseMetrics :: Map ClauseType (Double, Double, Double)  -- (P, R, F1)
  }

evaluate :: [Finding] -> [Annotation] -> EvalResult
evaluate findings groundTruth = ...

-- Match finding to annotation (fuzzy text match)
findingMatches :: Finding -> Annotation -> Bool
findingMatches f a =
  findingClauseType f == annClauseType a &&
  textOverlap (findingText f) (annText a) > 0.5
```
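One possible filling-in of `evaluate`, using token-level Jaccard similarity for `textOverlap`. Scoring here is lenient any-match rather than one-to-one assignment: a finding counts as correct if it matches any annotation, and an annotation counts as recalled if any finding matches it.

```haskell
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Data.Text as Text

-- Token-level Jaccard similarity in [0, 1].
textOverlap :: Text -> Text -> Double
textOverlap a b
  | Set.null (Set.union ta tb) = 0
  | otherwise =
      fromIntegral (Set.size (Set.intersection ta tb))
        / fromIntegral (Set.size (Set.union ta tb))
  where
    ta = Set.fromList (Text.words (Text.toLower a))
    tb = Set.fromList (Text.words (Text.toLower b))

evaluate :: [Finding] -> [Annotation] -> EvalResult
evaluate findings groundTruth = EvalResult p r f perClause
  where
    (p, r, f) = prf findings groundTruth
    perClause = Map.fromList
      [ ( ct
        , prf (filter ((== ct) . findingClauseType) findings)
              (filter ((== ct) . annClauseType) groundTruth)
        )
      | ct <- [minBound .. maxBound]
      ]

-- Precision, recall, F1 for one pool of findings vs. annotations.
prf :: [Finding] -> [Annotation] -> (Double, Double, Double)
prf fs gs = (p, r, f)
  where
    correct  = length (filter (\fd -> any (findingMatches fd) gs) fs)
    recalled = length (filter (\an -> any (`findingMatches` an) fs) gs)
    p = ratio correct (length fs)
    r = ratio recalled (length gs)
    f = if p + r == 0 then 0 else 2 * p * r / (p + r)
    ratio n d = if d == 0 then 0 else fromIntegral n / fromIntegral d
```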
```haskell
runBaseline :: IO ()
runBaseline = do
  provider <- getProvider
  -- Scale experiments
  for_ [5, 10, 20, 50] $ \n -> do
    contracts <- loadContracts n
    putStrLn $ "=== Single agent on " <> show n <> " contracts ==="
    (results, time) <- timed $ do
      -- Sequential: review one at a time
      traverse (reviewContract provider) contracts
    -- Evaluate against ground truth
    let findings    = concat results
        groundTruth = concatMap getGroundTruth contracts
        eval        = evaluate findings groundTruth
    putStrLn $ "Time:      " <> show time
    putStrLn $ "Precision: " <> show (precision eval)
    putStrLn $ "Recall:    " <> show (recall eval)
    putStrLn $ "F1:        " <> show (f1 eval)
    -- Track where it breaks down
    putStrLn $ "Findings:  " <> show (length findings)
    putStrLn $ "Expected:  " <> show (length groundTruth)
```
| Contracts | Expected Behavior |
|-----------|-------------------|
| 5 | Should work well |
| 10 | Might start degrading |
| 20 | Likely missing clauses, context pressure |
| 50 | Fails or hallucinates significantly |