Replace estimated token counts with actual API-reported usage.
The code-only spike currently hard-codes a token estimate instead of measuring usage:
-- Token tracking not available in simple chat - estimate 500 per call
tokens = 500
This is inaccurate. We need real token counts so the benchmark reports actual usage and cost rather than a fixed guess.
Provider.hs already has a Usage type:
data Usage = Usage
{ usagePromptTokens :: Int
, usageCompletionTokens :: Int
, usageTotalTokens :: Int
, usageCost :: Maybe Double
}
And chatWithUsage returns it inside a ChatResult:
chatWithUsage :: Provider -> [ToolApi] -> [Message] -> IO (Either Text ChatResult)
data ChatResult = ChatResult
{ chatMessage :: Message
, chatUsage :: Maybe Usage
}
But CodeOnly.hs uses plain Provider.chat, which doesn't expose usage.
-- Before
think :: Provider -> Text -> Text -> IO (Text, Int)
think provider task context = do
result <- Provider.chat provider [] messages
-- estimates tokens = 500
-- After
think :: Provider -> Text -> Text -> IO (Text, Usage)
think provider task context = do
result <- Provider.chatWithUsage provider [] messages
case result of
Left err -> pure (err, emptyUsage)
Right chatRes ->
let code = extractCode (Provider.msgContent (Provider.chatMessage chatRes))
usage = fromMaybe emptyUsage (Provider.chatUsage chatRes)
in pure (code, usage)
codeOnlyAgent :: ExperimentConfig -> Provider -> Text -> IO RunResult
codeOnlyAgent config provider task = do
(output, iterations, totalUsage, history, mError) <- loop emptyUsage ...
pure RunResult
{ rrTokensUsed = usageTotalTokens totalUsage
, rrCostCents = fromMaybe (estimateCost ...) (usageCost totalUsage) * 100
...
}
data RunResult = RunResult
{ ...
, rrPromptTokens :: Int
, rrCompletionTokens :: Int
, rrTotalTokens :: Int
, rrApiReportedCost :: Maybe Double -- from API if available
, rrEstimatedCost :: Double -- our estimate as fallback
...
}
The benchmark output should show actual API-reported costs, not estimates.