← Back to task

Commit 5646f168

commit 5646f1682fa061b9c64479b8799c803a12ea545c
Author: Coder Agent <coder@agents.omni>
Date:   Thu Feb 19 14:05:15 2026

    Newsreader: fetch full text for truncated feed entries
    
    Detect likely truncated RSS snippets and automatically try full-page extraction. Add polite crawling in the extractor with per-host rate limiting and robots.txt checks, while falling back to feed snippets when extraction fails.
    
    Task-Id: t-643

diff --git a/Omni/Newsreader/Extractor.hs b/Omni/Newsreader/Extractor.hs
index d9705ad5..170ee2e8 100644
--- a/Omni/Newsreader/Extractor.hs
+++ b/Omni/Newsreader/Extractor.hs
@@ -7,6 +7,10 @@
 -- Fetches full article content from URLs and extracts clean text.
 -- Uses TagSoup for HTML parsing and simple heuristics for content extraction.
 --
+-- Includes polite crawling behavior:
+-- - Per-host rate limiting
+-- - robots.txt checks (with in-memory caching)
+--
 -- : dep req
 -- : dep tagsoup
 -- : dep bytestring
@@ -23,11 +27,16 @@ module Omni.Newsreader.Extractor
 where
 
 import Alpha
+import Control.Concurrent (threadDelay)
+import Control.Concurrent.MVar (MVar, modifyMVar, modifyMVar_, newMVar, withMVar)
 import qualified Data.ByteString.Lazy as LBS
 import qualified Data.List as List
+import qualified Data.Map.Strict as Map
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as TE
+import Data.Time (UTCTime, diffUTCTime, getCurrentTime)
 import qualified Network.HTTP.Req as Req
+import System.IO.Unsafe (unsafePerformIO)
 import qualified Text.HTML.TagSoup as Tag
 import qualified Text.URI as URI
 
@@ -40,64 +49,373 @@ data ExtractResult
   | ExtractFailure Text -- Error message
   deriving (Show, Eq)
 
+-- | Minimum delay between HTTP requests to the same host (1 second).
+-- All durations in this module are in microseconds, the unit expected by
+-- both 'threadDelay' and req's 'responseTimeout'.
+perHostRateLimitMicros :: Int
+perHostRateLimitMicros = 1000000
+
+-- | Timeout for article/page fetches (30s).
+articleFetchTimeoutMicros :: Int
+articleFetchTimeoutMicros = 30000000
+
+-- | Timeout for robots.txt fetches (10s).
+robotsFetchTimeoutMicros :: Int
+robotsFetchTimeoutMicros = 10000000
+
+-- | User-Agent header value sent with all crawler requests.
+userAgent :: Text
+userAgent = "Omni-Newsreader/1.0"
+
+-- | Lowercase User-Agent token used for robots matching.
+robotsUserAgent :: Text
+robotsUserAgent = T.toLower userAgent
+
+-- | Family token used by many robots files: the product name without the
+-- "/1.0" version suffix (i.e. "omni-newsreader").
+robotsUserAgentFamily :: Text
+robotsUserAgentFamily = T.takeWhile (/= '/') robotsUserAgent
+
+-- | Parsed URL components used for crawl policy checks.
+-- The path excludes query string and fragment (see 'parseUrlParts').
+data UrlParts = UrlParts
+  { urlPartsScheme :: Text,
+    urlPartsHost :: Text,
+    urlPartsPath :: Text
+  }
+  deriving (Show, Eq)
+
+-- | Robots policy rules: bare path-prefix rules, with the agent grouping
+-- already resolved by 'parseRobotsRules'.
+data RobotsRules = RobotsRules
+  { robotsAllowRules :: [Text],
+    robotsDisallowRules :: [Text]
+  }
+  deriving (Show, Eq)
+
+-- | No rules at all: every path is allowed.
+emptyRobotsRules :: RobotsRules
+emptyRobotsRules = RobotsRules [] []
+
+-- | A single "Disallow: /" rule: every path is blocked.
+disallowAllRobotsRules :: RobotsRules
+disallowAllRobotsRules = RobotsRules [] ["/"]
+
+-- | Concatenate two rule sets (older rules listed first).
+appendRobotsRules :: RobotsRules -> RobotsRules -> RobotsRules
+appendRobotsRules older newer =
+  RobotsRules
+    { robotsAllowRules = robotsAllowRules older <> robotsAllowRules newer,
+      robotsDisallowRules = robotsDisallowRules older <> robotsDisallowRules newer
+    }
+
+-- | Global per-host last-request timestamps for polite rate-limiting.
+-- Standard top-level-MVar idiom: 'unsafePerformIO' is justified here
+-- because 'newMVar' only allocates, and the NOINLINE pragma guarantees a
+-- single shared cell rather than one per use site.
+{-# NOINLINE hostRateLimitState #-}
+hostRateLimitState :: MVar (Map.Map Text UTCTime)
+hostRateLimitState = unsafePerformIO <| newMVar Map.empty
+
+-- | Cached robots rules per origin (scheme + host), held for the life of
+-- the process. Same top-level-MVar idiom as 'hostRateLimitState'.
+{-# NOINLINE robotsRulesCache #-}
+robotsRulesCache :: MVar (Map.Map Text RobotsRules)
+robotsRulesCache = unsafePerformIO <| newMVar Map.empty
+
 -- | Extract article content from a URL.
--- Fetches the page, parses HTML, and extracts main content.
+--
+-- Steps:
+-- 1. Parse URL to host/path
+-- 2. Check robots.txt policy
+-- 3. Rate-limit requests per host
+-- 4. Fetch article HTML
+-- 5. Extract title + main content
 extractArticle :: Text -> IO ExtractResult
-extractArticle url = do
-  result <-
-    try <| Req.runReq Req.defaultHttpConfig <| do
-      case parseUrl url of
-        Left err -> panic <| "Invalid URL: " <> err
-        Right reqUrl -> do
-          response <-
-            Req.req
-              Req.GET
-              reqUrl
-              Req.NoReqBody
-              Req.lbsResponse
-              ( Req.header "User-Agent" "Omni-Newsreader/1.0"
-                  <> Req.responseTimeout 30000000 -- 30 seconds
-              )
-          let statusCode = Req.responseStatusCode response
-          pure
-            <| if statusCode >= 200 && statusCode < 300
-              then
-                let content = Req.responseBody response
-                    html = TE.decodeUtf8 <| LBS.toStrict content
-                    extracted = extractMainContent html
-                    title = extractTitle html
-                 in ExtractSuccess extracted title
-              else ExtractFailure <| "HTTP " <> show statusCode
-  case result of
-    Right extractResult -> pure extractResult
-    Left (e :: SomeException) -> pure <| ExtractFailure (show e)
+extractArticle rawUrl = do
+  let url = T.strip rawUrl
+  case parseUrlParts url of
+    Nothing -> pure <| ExtractFailure <| "Could not parse URL: " <> url
+    Just urlParts -> do
+      allowed <- isAllowedByRobots urlParts
+      if not allowed
+        then pure <| ExtractFailure "Blocked by robots.txt"
+        else do
+          fetchResult <- fetchUrlWithPolicy urlParts url articleFetchTimeoutMicros
+          case fetchResult of
+            Left err -> pure <| ExtractFailure err
+            Right (statusCode, content)
+              | statusCode >= 200 && statusCode < 300 ->
+                  -- Lenient decoding: pages frequently lie about their
+                  -- encoding; never crash on invalid UTF-8 bytes.
+                  let html = TE.decodeUtf8Lenient <| LBS.toStrict content
+                      extracted = extractMainContent html
+                      title = extractTitle html
+                   in pure <| ExtractSuccess extracted title
+              | otherwise -> pure <| ExtractFailure <| "HTTP " <> show statusCode
+
+-- | Parse a URL into scheme, host, and path for crawler policy checks.
+--
+-- The host is lowercased: hostnames are case-insensitive (RFC 3986
+-- §3.2.2), and the host is used as a Map key for both the per-host rate
+-- limiter and the robots cache, so "Example.com" and "example.com" must
+-- collapse to one entry. The path excludes query string and fragment.
+parseUrlParts :: Text -> Maybe UrlParts
+parseUrlParts rawUrl
+  | "https://" `T.isPrefixOf` lowered = parseWithScheme "https" 8
+  | "http://" `T.isPrefixOf` lowered = parseWithScheme "http" 7
+  | otherwise = Nothing
+  where
+    lowered = T.toLower rawUrl
+
+    parseWithScheme :: Text -> Int -> Maybe UrlParts
+    parseWithScheme scheme prefixLen =
+      let rest = T.drop prefixLen rawUrl
+          (authority, remainder0) = T.break isPathStart rest
+          hostWithUser = T.takeWhile (\c -> c /= '?' && c /= '#') authority
+          -- Lowercase so policy-state Map keys are case-insensitive.
+          host = T.toLower <| dropUserInfo hostWithUser
+          remainder =
+            if T.null remainder0
+              then "/"
+              else case T.head remainder0 of
+                '/' -> remainder0
+                '?' -> "/" <> remainder0
+                '#' -> "/"
+                -- Defensive: T.break on isPathStart means remainder0 can
+                -- only start with '/', '?', or '#'.
+                _ -> "/" <> remainder0
+          pathNoFragment = fst <| T.breakOn "#" remainder
+          pathNoQuery = fst <| T.breakOn "?" pathNoFragment
+          path = normalizePath pathNoQuery
+       in if T.null host then Nothing else Just (UrlParts scheme host path)
+
+    isPathStart c = c == '/' || c == '?' || c == '#'
+
+    -- Strip any "user:pass@" userinfo prefix from the authority.
+    dropUserInfo authority =
+      case lastMaybe <| T.splitOn "@" authority of
+        Nothing -> authority
+        Just auth -> auth
+
+-- | Fetch URL with per-host rate limiting.
+--
+-- NOTE(review): robots.txt is checked by the caller ('extractArticle'),
+-- not here; any future caller other than the robots fetcher itself should
+-- gate on 'isAllowedByRobots' first.
+fetchUrlWithPolicy :: UrlParts -> Text -> Int -> IO (Either Text (Int, LBS.ByteString))
+fetchUrlWithPolicy urlParts url timeoutMicros = do
+  waitForHostRateLimit (urlPartsHost urlParts)
+  fetchUrl url timeoutMicros
 
--- | Parse a URL into req's Url type.
-parseUrl :: Text -> Either Text (Req.Url 'Req.Https)
-parseUrl url =
+-- | Perform a simple HTTP GET, supporting both HTTPS and HTTP URLs.
+--
+-- Returns Right (status, body) on success; any exception thrown by req
+-- (network errors, timeouts, and — if req's default config rejects
+-- non-2xx responses, TODO confirm — bad statuses) comes back as Left.
+-- The final 'panic' is unreachable in practice: callers pre-validate the
+-- scheme via 'parseUrlParts' and 'URI.mkURI' succeeded above.
+fetchUrl :: Text -> Int -> IO (Either Text (Int, LBS.ByteString))
+fetchUrl url timeoutMicros =
   case URI.mkURI url of
-    Nothing -> Left <| "Could not parse URL: " <> url
-    Just uri -> case Req.useHttpsURI uri of
-      Just (reqUrl, _) -> Right reqUrl
-      Nothing -> Left <| "Could not parse URL: " <> url
+    Nothing -> pure <| Left <| "Could not parse URL: " <> url
+    Just uri -> do
+      result <-
+        try <| Req.runReq Req.defaultHttpConfig <| do
+          case Req.useHttpsURI uri of
+            Just (reqUrl, reqOpts) -> do
+              response <-
+                Req.req
+                  Req.GET
+                  reqUrl
+                  Req.NoReqBody
+                  Req.lbsResponse
+                  (requestOptions reqOpts timeoutMicros)
+              pure (Req.responseStatusCode response, Req.responseBody response)
+            Nothing ->
+              -- Not an https URL; retry interpretation as plain http.
+              case Req.useHttpURI uri of
+                Just (reqUrl, reqOpts) -> do
+                  response <-
+                    Req.req
+                      Req.GET
+                      reqUrl
+                      Req.NoReqBody
+                      Req.lbsResponse
+                      (requestOptions reqOpts timeoutMicros)
+                  pure (Req.responseStatusCode response, Req.responseBody response)
+                Nothing -> panic <| "Could not parse URL: " <> url
+      case result of
+        Right response -> pure <| Right response
+        Left (e :: SomeException) -> pure <| Left (show e)
+
+-- | Shared request options: identifying User-Agent plus response timeout
+-- (microseconds), layered on top of any options derived from the URI.
+requestOptions :: Req.Option scheme -> Int -> Req.Option scheme
+requestOptions reqOpts timeoutMicros =
+  reqOpts
+    <> Req.header "User-Agent" userAgent
+    <> Req.responseTimeout timeoutMicros
+
+-- | Enforce minimum delay between requests to a host.
+--
+-- The shared timestamp map is locked only long enough to read and update
+-- this host's entry; the sleep itself happens outside the lock. The
+-- original version slept inside 'modifyMVar_', which serialized ALL
+-- hosts behind whichever host was currently waiting. The timestamp is
+-- recorded before sleeping, so concurrent callers for the same host are
+-- spaced out from the scheduling point.
+waitForHostRateLimit :: Text -> IO ()
+waitForHostRateLimit host = do
+  pendingWaitMicros <- modifyMVar hostRateLimitState <| \state -> do
+    now <- getCurrentTime
+    let waitMicros =
+          case Map.lookup host state of
+            Nothing -> 0
+            Just lastRequestAt ->
+              let elapsedMicros = floor ((realToFrac <| diffUTCTime now lastRequestAt) * (1000000 :: Double))
+               in max 0 (perHostRateLimitMicros - elapsedMicros)
+    pure (Map.insert host now state, waitMicros)
+  when (pendingWaitMicros > 0) <| threadDelay pendingWaitMicros
+
+-- | Check robots.txt policy for a URL path.
+--
+-- Rules are fetched once per origin and cached in 'robotsRulesCache'.
+-- NOTE(review): the cache lookup and insert are separate MVar
+-- transactions, so two concurrent callers for an uncached origin may
+-- both fetch robots.txt; both store equivalent rules, so the race is
+-- benign (at worst one redundant fetch).
+isAllowedByRobots :: UrlParts -> IO Bool
+isAllowedByRobots urlParts = do
+  let origin = urlPartsOrigin urlParts
+  cached <- withMVar robotsRulesCache <| pure <. Map.lookup origin
+  rules <-
+    case cached of
+      Just r -> pure r
+      Nothing -> do
+        fetched <- fetchRobotsRules urlParts
+        modifyMVar_ robotsRulesCache <| pure <. Map.insert origin fetched
+        pure fetched
+  pure <| isPathAllowedByRobots rules (urlPartsPath urlParts)
+
+-- | Fetch and parse robots.txt for a given origin.
+--
+-- Failure policy:
+-- - fetch error: assume no rules (allow) rather than stalling ingestion;
+-- - 2xx: parse the body;
+-- - 401/403/429: the server is actively refusing us, so disallow the
+--   whole site;
+-- - other statuses (404, 5xx, ...): treat as no rules.
+-- NOTE(review): if req's default config throws on non-2xx statuses,
+-- those cases surface as Left (allow) instead of hitting the status
+-- guards below — confirm 'Req.defaultHttpConfig' behavior.
+fetchRobotsRules :: UrlParts -> IO RobotsRules
+fetchRobotsRules urlParts = do
+  let robotsUrl = urlPartsOrigin urlParts <> "/robots.txt"
+  result <- fetchUrlWithPolicy urlParts robotsUrl robotsFetchTimeoutMicros
+  case result of
+    Left _ -> pure emptyRobotsRules
+    Right (statusCode, body)
+      | statusCode >= 200 && statusCode < 300 ->
+          pure <| parseRobotsRules <| TE.decodeUtf8Lenient <| LBS.toStrict body
+      | statusCode == 401 || statusCode == 403 || statusCode == 429 -> pure disallowAllRobotsRules
+      | otherwise -> pure emptyRobotsRules
+
+-- | Parse robots.txt into rules applicable to Omni-Newsreader.
+--
+-- The fold state is (rules per agent, active agent group, rule-seen
+-- flag). Consecutive User-agent lines form one group that shares the
+-- rules which follow; once a rule line has been seen, the next
+-- User-agent line starts a fresh group. An empty line (after comment
+-- stripping) ends the current group. If any group specifically matches
+-- our user agent, those groups' rules are merged and used; otherwise the
+-- "*" group applies; otherwise everything is allowed.
+parseRobotsRules :: Text -> RobotsRules
+parseRobotsRules robotsText =
+  let (rulesByAgent, _, _) = foldl' step (Map.empty, [], False) (T.lines robotsText)
+      specificRules =
+        [rules | (agent, rules) <- Map.toList rulesByAgent, matchesSpecificUserAgent agent]
+   in case specificRules of
+        [] -> fromMaybe emptyRobotsRules <| Map.lookup "*" rulesByAgent
+        _ -> foldl' appendRobotsRules emptyRobotsRules specificRules
+  where
+    step :: (Map.Map Text RobotsRules, [Text], Bool) -> Text -> (Map.Map Text RobotsRules, [Text], Bool)
+    step (rulesByAgent, activeAgents, hasRules) rawLine =
+      let line = sanitizeRobotsLine rawLine
+       in if T.null line
+            -- Blank (or comment-only) line ends the current agent group.
+            then (rulesByAgent, [], False)
+            else
+              case parseRobotsDirective line of
+                Just ("user-agent", agentRaw) ->
+                  let agent = T.toLower <| T.strip agentRaw
+                   in if T.null agent
+                        then (rulesByAgent, activeAgents, hasRules)
+                        else
+                          -- After a rule line, a User-agent line starts a
+                          -- new group; otherwise it joins the current one.
+                          let nextAgents = if hasRules then [agent] else activeAgents <> [agent]
+                           in (rulesByAgent, nextAgents, False)
+                Just ("allow", ruleRaw) ->
+                  (addAllowRule activeAgents (T.strip ruleRaw) rulesByAgent, activeAgents, True)
+                Just ("disallow", ruleRaw) ->
+                  (addDisallowRule activeAgents (T.strip ruleRaw) rulesByAgent, activeAgents, True)
+                -- Unknown directives (Crawl-delay, Sitemap, ...) ignored.
+                _ -> (rulesByAgent, activeAgents, hasRules)
+
+    addAllowRule :: [Text] -> Text -> Map.Map Text RobotsRules -> Map.Map Text RobotsRules
+    addAllowRule agents ruleText =
+      addRule agents ruleText <| \rule ->
+        emptyRobotsRules {robotsAllowRules = [rule]}
+
+    addDisallowRule :: [Text] -> Text -> Map.Map Text RobotsRules -> Map.Map Text RobotsRules
+    addDisallowRule agents ruleText =
+      addRule agents ruleText <| \rule ->
+        emptyRobotsRules {robotsDisallowRules = [rule]}
+
+    -- Record one rule for every agent in the active group. Empty rule
+    -- values (e.g. a bare "Disallow:") are dropped, which matches the
+    -- convention that an empty Disallow means "allow everything".
+    addRule :: [Text] -> Text -> (Text -> RobotsRules) -> Map.Map Text RobotsRules -> Map.Map Text RobotsRules
+    addRule agents ruleText toRule rulesByAgent
+      | null agents || T.null ruleText = rulesByAgent
+      | otherwise =
+          foldl'
+            ( \acc agent ->
+                Map.insertWith
+                  (\newer older -> appendRobotsRules older newer)
+                  agent
+                  (toRule ruleText)
+                  acc
+            )
+            rulesByAgent
+            agents
+
+-- | Drop any '#' comment from a robots.txt line, then trim whitespace.
+sanitizeRobotsLine :: Text -> Text
+sanitizeRobotsLine rawLine =
+  let (beforeComment, _comment) = T.breakOn "#" rawLine
+   in T.strip beforeComment
+
+-- | Split a robots line into a lowercased directive key and its value.
+-- Lines without a ':' separator yield Nothing.
+parseRobotsDirective :: Text -> Maybe (Text, Text)
+parseRobotsDirective line =
+  case T.breakOn ":" line of
+    (_, "") -> Nothing
+    (key, colonAndValue) ->
+      Just (T.toLower (T.strip key), T.strip (T.drop 1 colonAndValue))
+
+-- | Whether a robots user-agent token matches Omni-Newsreader.
+--
+-- Matches when the robots token is a prefix of our lowercased UA (the
+-- conventional robots matching direction, e.g. "omni-newsreader" or
+-- "omni"), or when our family token is a prefix of the robots token
+-- (e.g. "omni-newsreader-beta"). "*" is handled separately by the
+-- caller ('parseRobotsRules') and is excluded here.
+-- NOTE(review): very short tokens like "o" also prefix-match our UA;
+-- assumed acceptable since matching extra groups is the cautious
+-- failure mode for a polite crawler — confirm this is intended.
+matchesSpecificUserAgent :: Text -> Bool
+matchesSpecificUserAgent agent =
+  let normalized = T.toLower <| T.strip agent
+   in not (T.null normalized)
+        && normalized /= "*"
+        && ( normalized `T.isPrefixOf` robotsUserAgent
+               || normalized `T.isPrefixOf` robotsUserAgentFamily
+               || robotsUserAgentFamily `T.isPrefixOf` normalized
+           )
+
+-- | Apply robots allow/disallow rules using longest-match precedence.
+--
+-- No matching rule means the path is allowed. When both an allow and a
+-- disallow rule match, the longer rule wins; a tie goes to allow.
+isPathAllowedByRobots :: RobotsRules -> Text -> Bool
+isPathAllowedByRobots rules rawPath =
+  let path = normalizePath rawPath
+      longestAllow = bestMatchLength path (robotsAllowRules rules)
+      longestDisallow = bestMatchLength path (robotsDisallowRules rules)
+   in case longestDisallow of
+        Nothing -> True
+        Just disallowLen -> maybe False (>= disallowLen) longestAllow
+
+-- | Length of the longest rule matching the path, if any rule matches.
+bestMatchLength :: Text -> [Text] -> Maybe Int
+bestMatchLength path rules =
+  case filter (pathMatchesRule path) rules of
+    [] -> Nothing
+    matching -> Just (maximum (map T.length matching))
+
+-- | Very small subset of robots matching.
+--
+-- Supports:
+-- - Prefix matches
+-- - Empty rule ignored
+-- - '$' end-anchor (exact match), only for rules without '*'
+-- - '*' treated as "prefix before wildcard"
+--
+-- When a rule contains both '*' and a trailing '$' (e.g. "/*.pdf$"),
+-- exact-matching the pre-wildcard prefix would be wrong — it would only
+-- ever match that literal prefix — so such rules fall back to prefix
+-- matching on the part before the wildcard.
+pathMatchesRule :: Text -> Text -> Bool
+pathMatchesRule rawPath rawRule =
+  let rule0 = T.strip rawRule
+   in if T.null rule0
+        then False
+        else
+          let hasEndAnchor = "$" `T.isSuffixOf` rule0
+              noAnchor = if hasEndAnchor then T.dropEnd 1 rule0 else rule0
+              hasWildcard = "*" `T.isInfixOf` noAnchor
+              noWildcard = fst <| T.breakOn "*" noAnchor
+              rule = normalizePath noWildcard
+              path = normalizePath rawPath
+           in if hasEndAnchor && not hasWildcard
+                then path == rule
+                else rule `T.isPrefixOf` path
+
+-- | Ensure a path is non-empty and begins with a leading '/'.
+normalizePath :: Text -> Text
+normalizePath path =
+  case T.uncons path of
+    Nothing -> "/"
+    Just ('/', _) -> path
+    Just _ -> "/" <> path
+
+-- | Rebuild the "scheme://host" origin for a parsed URL.
+urlPartsOrigin :: UrlParts -> Text
+urlPartsOrigin parts = mconcat [urlPartsScheme parts, "://", urlPartsHost parts]
 
 -- | Extract title from HTML using simple heuristics.
 -- Looks for <title> tag first, then <h1>.
 extractTitle :: Text -> Maybe Text
 extractTitle html =
   let tags = Tag.parseTags html
-      titleTag = List.find isTitle tags
-      h1Tag = List.find isH1 tags
-   in case titleTag of
-        Just (Tag.TagText txt) -> Just <| T.strip txt
-        _ -> case h1Tag of
-          Just (Tag.TagText txt) -> Just <| T.strip txt
-          _ -> Nothing
-  where
-    isTitle (Tag.TagOpen "title" _) = True
-    isTitle _ = False
-    isH1 (Tag.TagOpen "h1" _) = True
-    isH1 _ = False
+   in extractTextFromTag "title" tags <|> extractTextFromTag "h1" tags
+
+-- | Text content of the first non-empty occurrence of a tag.
+--
+-- Collects 'TagText' nodes up to the matching close tag (nested child
+-- tags contribute their text too, since only the named close tag stops
+-- the scan). If that yields only whitespace, keeps scanning for a later
+-- occurrence of the same tag. Tag-name comparison is case-insensitive.
+extractTextFromTag :: Text -> [Tag.Tag Text] -> Maybe Text
+extractTextFromTag _ [] = Nothing
+extractTextFromTag name (tag : rest) =
+  case tag of
+    Tag.TagOpen tagName _
+      | T.toLower tagName == T.toLower name ->
+          let inside = takeWhile (not <. Tag.isTagCloseName tagName) rest
+              txt = cleanText <| T.unwords [t | Tag.TagText t <- inside]
+           in if T.null txt
+                then extractTextFromTag name rest
+                else Just txt
+    _ -> extractTextFromTag name rest
 
 -- | Extract main content from HTML using simple heuristics.
 -- This is a basic implementation - could be improved with readability algorithms.
@@ -202,3 +520,8 @@ cleanText txt =
   let noTabs = T.replace "\t" " " txt
       noExtraSpaces = T.unwords <| T.words noTabs
    in T.strip noExtraSpaces
+
+-- | Total version of 'last': Nothing on an empty list.
+lastMaybe :: [a] -> Maybe a
+lastMaybe = foldl' (\_ x -> Just x) Nothing
diff --git a/Omni/Newsreader/Ingest.hs b/Omni/Newsreader/Ingest.hs
index 1c7a865d..ede9bdd5 100644
--- a/Omni/Newsreader/Ingest.hs
+++ b/Omni/Newsreader/Ingest.hs
@@ -168,35 +168,108 @@ data EntryResult
 -- | Process a single feed entry.
 processEntry :: SQL.Connection -> Feed.FeedId -> Fetcher.FeedEntry -> IO EntryResult
 processEntry conn feedId entry = do
-  let url = Fetcher.entryLink entry
+  let url = T.strip (Fetcher.entryLink entry)
   let title = Fetcher.entryTitle entry
+  let feedContent = Fetcher.entryContent entry
 
   -- Check if article already exists
   exists <- Article.articleExists conn feedId url
   if exists
     then pure ArticleSkipped
     else do
-      -- Use feed content (usually HTML from RSS/Atom) when available.
-      -- Only fetch the full article URL when feed content is very short.
-      let feedContent = Fetcher.entryContent entry
-      if T.length feedContent > 100
+      if shouldFetchFullArticle entry
         then do
-          -- Feed has substantial content — use it directly (preserves HTML)
-          storeArticleWithContent conn feedId url title feedContent (Fetcher.entryPublished entry)
-          pure ArticleNew
-        else do
-          -- Feed content is empty/short — try extracting from URL
+          -- Feed appears truncated. Try fetching and extracting the full article.
           extractResult <- Extractor.extractArticle url
           case extractResult of
             Extractor.ExtractFailure err -> do
-              putText <| "Failed to extract article " <> url <> ": " <> err
+              putText <| "Failed to extract full article " <> url <> ": " <> err
               storeArticleWithContent conn feedId url title feedContent (Fetcher.entryPublished entry)
               pure ArticleNew
             Extractor.ExtractSuccess content extractedTitle -> do
-              let finalTitle = fromMaybe title extractedTitle
-              let finalContent = if T.null content then feedContent else content
+              let finalTitle = fromMaybe title (extractedTitle >>= nonEmptyText)
+              let finalContent = preferExtractedContent feedContent content
               storeArticleWithContent conn feedId url finalTitle finalContent (Fetcher.entryPublished entry)
               pure ArticleNew
+        else do
+          -- Feed appears to contain full content already.
+          storeArticleWithContent conn feedId url title feedContent (Fetcher.entryPublished entry)
+          pure ArticleNew
+
+-- | Heuristic threshold for considering feed content "short", measured
+-- in characters of the HTML-stripped snippet (see
+-- 'shouldFetchFullArticle').
+truncatedSnippetThreshold :: Int
+truncatedSnippetThreshold = 500
+
+-- | Common patterns used by feeds when truncating article bodies.
+-- Matched case-insensitively as substrings of the snippet.
+readMorePatterns :: [Text]
+readMorePatterns =
+  [ "read more",
+    "continue reading",
+    "continue...",
+    "continue…",
+    "full story",
+    "keep reading",
+    "view article",
+    "[...]",
+    "[…]"
+  ]
+
+-- | Determine whether we should fetch the linked page for full text.
+--
+-- Truncation heuristics:
+-- - content snippet (HTML stripped) is shorter than
+--   'truncatedSnippetThreshold' characters, OR
+-- - snippet includes "read more" style language.
+--
+-- We only attempt extraction when the entry has an HTTP(S) link.
+shouldFetchFullArticle :: Fetcher.FeedEntry -> Bool
+shouldFetchFullArticle entry =
+  let url = Fetcher.entryLink entry
+      rawContent = Fetcher.entryContent entry
+      plainSnippet = normalizeSnippet rawContent
+      plainLen = T.length plainSnippet
+      shortSnippet = plainLen < truncatedSnippetThreshold
+      -- Check both the raw HTML and the stripped text: truncation
+      -- markers can live in link text or in surrounding markup.
+      hasReadMore = containsReadMorePattern rawContent || containsReadMorePattern plainSnippet
+   in hasFetchableLink url && (shortSnippet || hasReadMore)
+
+-- | Keep whichever content is likely better for reading.
+--
+-- The extracted page text wins when it is non-empty (after trimming) and
+-- at least as long as the HTML-stripped feed snippet; otherwise the feed
+-- snippet is kept.
+preferExtractedContent :: Text -> Text -> Text
+preferExtractedContent feedSnippet extracted
+  | T.null trimmedExtracted = feedSnippet
+  | T.length trimmedExtracted >= snippetLen = extracted
+  | otherwise = feedSnippet
+  where
+    trimmedExtracted = T.strip extracted
+    snippetLen = T.length <| normalizeSnippet feedSnippet
+
+-- | Normalize a feed snippet for heuristic checks: strip HTML tags, trim
+-- whitespace, lowercase (assumes '<.' composes right-to-left like '.',
+-- so stripHtml runs first — confirm against Alpha).
+normalizeSnippet :: Text -> Text
+normalizeSnippet = T.toLower <. T.strip <. Extractor.stripHtml
+
+-- | Whether content contains common truncation markers: a known
+-- "read more" phrase anywhere, or a trailing ellipsis.
+containsReadMorePattern :: Text -> Bool
+containsReadMorePattern txt = hasMarker || endsTruncated
+  where
+    lowered = T.toLower txt
+    trimmed = T.strip lowered
+    hasMarker = any (`T.isInfixOf` lowered) readMorePatterns
+    endsTruncated = "..." `T.isSuffixOf` trimmed || "…" `T.isSuffixOf` trimmed
+
+-- | Ensure a URL looks fetchable over HTTP(S).
+hasFetchableLink :: Text -> Bool
+hasFetchableLink rawUrl = any (`T.isPrefixOf` normalized) ["https://", "http://"]
+  where
+    normalized = T.toLower (T.strip rawUrl)
+
+-- | Strip surrounding whitespace; Nothing when nothing remains.
+nonEmptyText :: Text -> Maybe Text
+nonEmptyText txt
+  | T.null trimmed = Nothing
+  | otherwise = Just trimmed
+  where
+    trimmed = T.strip txt
 
 -- | Store article with content.
 storeArticleWithContent ::