commit 5646f1682fa061b9c64479b8799c803a12ea545c
Author: Coder Agent <coder@agents.omni>
Date: Thu Feb 19 14:05:15 2026
Newsreader: fetch full text for truncated feed entries
Detect likely truncated RSS snippets and automatically try full-page extraction. Add polite crawling in the extractor with per-host rate limiting and robots.txt checks, while falling back to feed snippets when extraction fails.
Task-Id: t-643
diff --git a/Omni/Newsreader/Extractor.hs b/Omni/Newsreader/Extractor.hs
index d9705ad5..170ee2e8 100644
--- a/Omni/Newsreader/Extractor.hs
+++ b/Omni/Newsreader/Extractor.hs
@@ -7,6 +7,10 @@
-- Fetches full article content from URLs and extracts clean text.
-- Uses TagSoup for HTML parsing and simple heuristics for content extraction.
--
+-- Includes polite crawling behavior:
+-- - Per-host rate limiting
+-- - robots.txt checks (with in-memory caching)
+--
-- : dep req
-- : dep tagsoup
-- : dep bytestring
@@ -23,11 +27,16 @@ module Omni.Newsreader.Extractor
where
import Alpha
+import Control.Concurrent (threadDelay)
+import Control.Concurrent.MVar (MVar, modifyMVar_, newMVar, withMVar)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.List as List
+import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
+import Data.Time (UTCTime, diffUTCTime, getCurrentTime)
import qualified Network.HTTP.Req as Req
+import System.IO.Unsafe (unsafePerformIO)
import qualified Text.HTML.TagSoup as Tag
import qualified Text.URI as URI
@@ -40,64 +49,373 @@ data ExtractResult
| ExtractFailure Text -- Error message
deriving (Show, Eq)
+-- | Per-host minimum delay between HTTP requests.
+perHostRateLimitMicros :: Int
+perHostRateLimitMicros = 1000000
+
+-- | Timeout for article/page fetches (30s).
+articleFetchTimeoutMicros :: Int
+articleFetchTimeoutMicros = 30000000
+
+-- | Timeout for robots.txt fetches (10s).
+robotsFetchTimeoutMicros :: Int
+robotsFetchTimeoutMicros = 10000000
+
+-- | User-Agent used for all crawler requests.
+userAgent :: Text
+userAgent = "Omni-Newsreader/1.0"
+
+-- | Lowercase User-Agent token used for robots matching.
+robotsUserAgent :: Text
+robotsUserAgent = T.toLower userAgent
+
+-- | Family token used by many robots files (without version suffix).
+robotsUserAgentFamily :: Text
+robotsUserAgentFamily = T.takeWhile (/= '/') robotsUserAgent
+
+-- | Parsed URL components used for crawl policy checks.
+data UrlParts = UrlParts
+ { urlPartsScheme :: Text,
+ urlPartsHost :: Text,
+ urlPartsPath :: Text
+ }
+ deriving (Show, Eq)
+
+-- | Robots policy rules.
+data RobotsRules = RobotsRules
+ { robotsAllowRules :: [Text],
+ robotsDisallowRules :: [Text]
+ }
+ deriving (Show, Eq)
+
+emptyRobotsRules :: RobotsRules
+emptyRobotsRules = RobotsRules [] []
+
+disallowAllRobotsRules :: RobotsRules
+disallowAllRobotsRules = RobotsRules [] ["/"]
+
+appendRobotsRules :: RobotsRules -> RobotsRules -> RobotsRules
+appendRobotsRules older newer =
+ RobotsRules
+ { robotsAllowRules = robotsAllowRules older <> robotsAllowRules newer,
+ robotsDisallowRules = robotsDisallowRules older <> robotsDisallowRules newer
+ }
+
+-- | Global per-host last-request timestamps for polite rate-limiting.
+{-# NOINLINE hostRateLimitState #-}
+hostRateLimitState :: MVar (Map.Map Text UTCTime)
+hostRateLimitState = unsafePerformIO <| newMVar Map.empty
+
+-- | Cached robots rules per origin (scheme + host).
+{-# NOINLINE robotsRulesCache #-}
+robotsRulesCache :: MVar (Map.Map Text RobotsRules)
+robotsRulesCache = unsafePerformIO <| newMVar Map.empty
+
-- | Extract article content from a URL.
--- Fetches the page, parses HTML, and extracts main content.
+--
+-- Steps:
+-- 1. Parse URL to host/path
+-- 2. Check robots.txt policy
+-- 3. Rate-limit requests per host
+-- 4. Fetch article HTML
+-- 5. Extract title + main content
extractArticle :: Text -> IO ExtractResult
-extractArticle url = do
- result <-
- try <| Req.runReq Req.defaultHttpConfig <| do
- case parseUrl url of
- Left err -> panic <| "Invalid URL: " <> err
- Right reqUrl -> do
- response <-
- Req.req
- Req.GET
- reqUrl
- Req.NoReqBody
- Req.lbsResponse
- ( Req.header "User-Agent" "Omni-Newsreader/1.0"
- <> Req.responseTimeout 30000000 -- 30 seconds
- )
- let statusCode = Req.responseStatusCode response
- pure
- <| if statusCode >= 200 && statusCode < 300
- then
- let content = Req.responseBody response
- html = TE.decodeUtf8 <| LBS.toStrict content
- extracted = extractMainContent html
- title = extractTitle html
- in ExtractSuccess extracted title
- else ExtractFailure <| "HTTP " <> show statusCode
- case result of
- Right extractResult -> pure extractResult
- Left (e :: SomeException) -> pure <| ExtractFailure (show e)
+extractArticle rawUrl = do
+ let url = T.strip rawUrl
+ case parseUrlParts url of
+ Nothing -> pure <| ExtractFailure <| "Could not parse URL: " <> url
+ Just urlParts -> do
+ allowed <- isAllowedByRobots urlParts
+ if not allowed
+ then pure <| ExtractFailure "Blocked by robots.txt"
+ else do
+ fetchResult <- fetchUrlWithPolicy urlParts url articleFetchTimeoutMicros
+ case fetchResult of
+ Left err -> pure <| ExtractFailure err
+ Right (statusCode, content)
+ | statusCode >= 200 && statusCode < 300 ->
+ let html = TE.decodeUtf8Lenient <| LBS.toStrict content
+ extracted = extractMainContent html
+ title = extractTitle html
+ in pure <| ExtractSuccess extracted title
+ | otherwise -> pure <| ExtractFailure <| "HTTP " <> show statusCode
+
+-- | Parse a URL into scheme, host, and path for crawler policy checks.
+parseUrlParts :: Text -> Maybe UrlParts
+parseUrlParts rawUrl
+ | "https://" `T.isPrefixOf` lowered = parseWithScheme "https" 8
+ | "http://" `T.isPrefixOf` lowered = parseWithScheme "http" 7
+ | otherwise = Nothing
+ where
+ lowered = T.toLower rawUrl
+
+ parseWithScheme :: Text -> Int -> Maybe UrlParts
+ parseWithScheme scheme prefixLen =
+ let rest = T.drop prefixLen rawUrl
+ (authority, remainder0) = T.break isPathStart rest
+ hostWithUser = T.takeWhile (\c -> c /= '?' && c /= '#') authority
+ host = dropUserInfo hostWithUser
+ remainder =
+ if T.null remainder0
+ then "/"
+ else case T.head remainder0 of
+ '/' -> remainder0
+ '?' -> "/" <> remainder0
+ '#' -> "/"
+ _ -> "/" <> remainder0
+ pathNoFragment = fst <| T.breakOn "#" remainder
+ pathNoQuery = fst <| T.breakOn "?" pathNoFragment
+ path = normalizePath pathNoQuery
+ in if T.null host then Nothing else Just (UrlParts scheme host path)
+
+ isPathStart c = c == '/' || c == '?' || c == '#'
+
+ dropUserInfo authority =
+ case lastMaybe <| T.splitOn "@" authority of
+ Nothing -> authority
+ Just auth -> auth
+
+-- | Fetch URL with per-host rate limiting.
+fetchUrlWithPolicy :: UrlParts -> Text -> Int -> IO (Either Text (Int, LBS.ByteString))
+fetchUrlWithPolicy urlParts url timeoutMicros = do
+ waitForHostRateLimit (urlPartsHost urlParts)
+ fetchUrl url timeoutMicros
--- | Parse a URL into req's Url type.
-parseUrl :: Text -> Either Text (Req.Url 'Req.Https)
-parseUrl url =
+-- | Perform a simple HTTP GET, supporting both HTTPS and HTTP URLs.
+fetchUrl :: Text -> Int -> IO (Either Text (Int, LBS.ByteString))
+fetchUrl url timeoutMicros =
case URI.mkURI url of
- Nothing -> Left <| "Could not parse URL: " <> url
- Just uri -> case Req.useHttpsURI uri of
- Just (reqUrl, _) -> Right reqUrl
- Nothing -> Left <| "Could not parse URL: " <> url
+ Nothing -> pure <| Left <| "Could not parse URL: " <> url
+ Just uri -> do
+ result <-
+ try <| Req.runReq Req.defaultHttpConfig <| do
+ case Req.useHttpsURI uri of
+ Just (reqUrl, reqOpts) -> do
+ response <-
+ Req.req
+ Req.GET
+ reqUrl
+ Req.NoReqBody
+ Req.lbsResponse
+ (requestOptions reqOpts timeoutMicros)
+ pure (Req.responseStatusCode response, Req.responseBody response)
+ Nothing ->
+ case Req.useHttpURI uri of
+ Just (reqUrl, reqOpts) -> do
+ response <-
+ Req.req
+ Req.GET
+ reqUrl
+ Req.NoReqBody
+ Req.lbsResponse
+ (requestOptions reqOpts timeoutMicros)
+ pure (Req.responseStatusCode response, Req.responseBody response)
+ Nothing -> panic <| "Could not parse URL: " <> url
+ case result of
+ Right response -> pure <| Right response
+ Left (e :: SomeException) -> pure <| Left (show e)
+
+-- | Shared request headers/options.
+requestOptions reqOpts timeoutMicros =
+ reqOpts
+ <> Req.header "User-Agent" userAgent
+ <> Req.responseTimeout timeoutMicros
+
+-- | Enforce minimum delay between requests to a host. NOTE(review): the threadDelay runs while holding the single global MVar, so a wait for one host blocks requests to all hosts; consider per-host locks if crawl volume grows.
+waitForHostRateLimit :: Text -> IO ()
+waitForHostRateLimit host =
+ modifyMVar_ hostRateLimitState <| \state -> do
+ now <- getCurrentTime
+ let waitMicros =
+ case Map.lookup host state of
+ Nothing -> 0
+ Just lastRequestAt ->
+ let elapsedMicros = floor ((realToFrac <| diffUTCTime now lastRequestAt) * (1000000 :: Double))
+ in max 0 (perHostRateLimitMicros - elapsedMicros)
+ when (waitMicros > 0) <| threadDelay waitMicros
+ updatedNow <- getCurrentTime
+ pure <| Map.insert host updatedNow state
+
+-- | Check robots.txt policy for a URL path. NOTE(review): the cache check and insert are not atomic, so concurrent first requests to an origin may fetch robots.txt twice (last write wins) — benign, but confirm acceptable.
+isAllowedByRobots :: UrlParts -> IO Bool
+isAllowedByRobots urlParts = do
+ let origin = urlPartsOrigin urlParts
+ cached <- withMVar robotsRulesCache <| pure <. Map.lookup origin
+ rules <-
+ case cached of
+ Just r -> pure r
+ Nothing -> do
+ fetched <- fetchRobotsRules urlParts
+ modifyMVar_ robotsRulesCache <| pure <. Map.insert origin fetched
+ pure fetched
+ pure <| isPathAllowedByRobots rules (urlPartsPath urlParts)
+
+-- | Fetch and parse robots.txt for a given origin.
+fetchRobotsRules :: UrlParts -> IO RobotsRules
+fetchRobotsRules urlParts = do
+ let robotsUrl = urlPartsOrigin urlParts <> "/robots.txt"
+ result <- fetchUrlWithPolicy urlParts robotsUrl robotsFetchTimeoutMicros
+ case result of
+ Left _ -> pure emptyRobotsRules
+ Right (statusCode, body)
+ | statusCode >= 200 && statusCode < 300 ->
+ pure <| parseRobotsRules <| TE.decodeUtf8Lenient <| LBS.toStrict body
+ | statusCode == 401 || statusCode == 403 || statusCode == 429 -> pure disallowAllRobotsRules
+ | otherwise -> pure emptyRobotsRules
+
+-- | Parse robots.txt into rules applicable to Omni-Newsreader.
+parseRobotsRules :: Text -> RobotsRules
+parseRobotsRules robotsText =
+ let (rulesByAgent, _, _) = foldl' step (Map.empty, [], False) (T.lines robotsText)
+ specificRules =
+ [rules | (agent, rules) <- Map.toList rulesByAgent, matchesSpecificUserAgent agent]
+ in case specificRules of
+ [] -> fromMaybe emptyRobotsRules <| Map.lookup "*" rulesByAgent
+ _ -> foldl' appendRobotsRules emptyRobotsRules specificRules
+ where
+ step :: (Map.Map Text RobotsRules, [Text], Bool) -> Text -> (Map.Map Text RobotsRules, [Text], Bool)
+ step (rulesByAgent, activeAgents, hasRules) rawLine =
+ let line = sanitizeRobotsLine rawLine
+ in if T.null line
+ then (rulesByAgent, [], False)
+ else
+ case parseRobotsDirective line of
+ Just ("user-agent", agentRaw) ->
+ let agent = T.toLower <| T.strip agentRaw
+ in if T.null agent
+ then (rulesByAgent, activeAgents, hasRules)
+ else
+ let nextAgents = if hasRules then [agent] else activeAgents <> [agent]
+ in (rulesByAgent, nextAgents, False)
+ Just ("allow", ruleRaw) ->
+ (addAllowRule activeAgents (T.strip ruleRaw) rulesByAgent, activeAgents, True)
+ Just ("disallow", ruleRaw) ->
+ (addDisallowRule activeAgents (T.strip ruleRaw) rulesByAgent, activeAgents, True)
+ _ -> (rulesByAgent, activeAgents, hasRules)
+
+ addAllowRule :: [Text] -> Text -> Map.Map Text RobotsRules -> Map.Map Text RobotsRules
+ addAllowRule agents ruleText =
+ addRule agents ruleText <| \rule ->
+ emptyRobotsRules {robotsAllowRules = [rule]}
+
+ addDisallowRule :: [Text] -> Text -> Map.Map Text RobotsRules -> Map.Map Text RobotsRules
+ addDisallowRule agents ruleText =
+ addRule agents ruleText <| \rule ->
+ emptyRobotsRules {robotsDisallowRules = [rule]}
+
+ addRule :: [Text] -> Text -> (Text -> RobotsRules) -> Map.Map Text RobotsRules -> Map.Map Text RobotsRules
+ addRule agents ruleText toRule rulesByAgent
+ | null agents || T.null ruleText = rulesByAgent
+ | otherwise =
+ foldl'
+ ( \acc agent ->
+ Map.insertWith
+ (\newer older -> appendRobotsRules older newer)
+ agent
+ (toRule ruleText)
+ acc
+ )
+ rulesByAgent
+ agents
+
+-- | Remove comments and trim whitespace from a robots.txt line.
+sanitizeRobotsLine :: Text -> Text
+sanitizeRobotsLine line =
+ T.strip <| fst <| T.breakOn "#" line
+
+-- | Parse a single robots directive line.
+parseRobotsDirective :: Text -> Maybe (Text, Text)
+parseRobotsDirective line =
+ let (key, rest) = T.breakOn ":" line
+ in if T.null rest
+ then Nothing
+ else Just (T.toLower <| T.strip key, T.strip <| T.drop 1 rest)
+
+-- | Whether a robots user-agent token matches Omni-Newsreader. NOTE(review): prefix matching is over-permissive — any token that is a prefix of "omni-newsreader" (even a single letter) matches; verify this is intended.
+matchesSpecificUserAgent :: Text -> Bool
+matchesSpecificUserAgent agent =
+ let normalized = T.toLower <| T.strip agent
+ in not (T.null normalized)
+ && normalized /= "*"
+ && ( normalized `T.isPrefixOf` robotsUserAgent
+ || normalized `T.isPrefixOf` robotsUserAgentFamily
+ || robotsUserAgentFamily `T.isPrefixOf` normalized
+ )
+
+-- | Apply robots allow/disallow rules using longest-match precedence.
+--
+-- If no rule matches, path is allowed.
+-- If both allow and disallow match, the longest rule wins.
+-- If lengths tie, allow wins.
+isPathAllowedByRobots :: RobotsRules -> Text -> Bool
+isPathAllowedByRobots rules rawPath =
+ let path = normalizePath rawPath
+ bestAllow = bestMatchLength path (robotsAllowRules rules)
+ bestDisallow = bestMatchLength path (robotsDisallowRules rules)
+ in case (bestAllow, bestDisallow) of
+ (Nothing, Nothing) -> True
+ (Just _, Nothing) -> True
+ (Nothing, Just _) -> False
+ (Just allowLen, Just disallowLen) -> allowLen >= disallowLen
+
+bestMatchLength :: Text -> [Text] -> Maybe Int
+bestMatchLength path rules =
+ let lengths = [T.length rule | rule <- rules, pathMatchesRule path rule]
+ in case lengths of
+ [] -> Nothing
+ _ -> Just <| maximum lengths
+
+-- | Very small subset of robots matching.
+--
+-- Supports:
+-- - Prefix matches
+-- - Empty rule ignored
+-- - '$' end-anchor
+-- - '*' treated as "prefix before wildcard"
+pathMatchesRule :: Text -> Text -> Bool
+pathMatchesRule rawPath rawRule =
+ let rule0 = T.strip rawRule
+ in if T.null rule0
+ then False
+ else
+ let hasEndAnchor = "$" `T.isSuffixOf` rule0
+ noAnchor = if hasEndAnchor then T.dropEnd 1 rule0 else rule0
+ noWildcard = fst <| T.breakOn "*" noAnchor
+ rule = normalizePath noWildcard
+ path = normalizePath rawPath
+ in if hasEndAnchor
+ then path == rule
+ else rule `T.isPrefixOf` path
+
+normalizePath :: Text -> Text
+normalizePath path
+ | T.null path = "/"
+ | "/" `T.isPrefixOf` path = path
+ | otherwise = "/" <> path
+
+urlPartsOrigin :: UrlParts -> Text
+urlPartsOrigin urlParts =
+ urlPartsScheme urlParts <> "://" <> urlPartsHost urlParts
-- | Extract title from HTML using simple heuristics.
-- Looks for <title> tag first, then <h1>.
extractTitle :: Text -> Maybe Text
extractTitle html =
let tags = Tag.parseTags html
- titleTag = List.find isTitle tags
- h1Tag = List.find isH1 tags
- in case titleTag of
- Just (Tag.TagText txt) -> Just <| T.strip txt
- _ -> case h1Tag of
- Just (Tag.TagText txt) -> Just <| T.strip txt
- _ -> Nothing
- where
- isTitle (Tag.TagOpen "title" _) = True
- isTitle _ = False
- isH1 (Tag.TagOpen "h1" _) = True
- isH1 _ = False
+ in extractTextFromTag "title" tags <|> extractTextFromTag "h1" tags
+
+extractTextFromTag :: Text -> [Tag.Tag Text] -> Maybe Text
+extractTextFromTag _ [] = Nothing
+extractTextFromTag name (tag : rest) =
+ case tag of
+ Tag.TagOpen tagName _
+ | T.toLower tagName == T.toLower name ->
+ let inside = takeWhile (not <. Tag.isTagCloseName tagName) rest
+ txt = cleanText <| T.unwords [t | Tag.TagText t <- inside]
+ in if T.null txt
+ then extractTextFromTag name rest
+ else Just txt
+ _ -> extractTextFromTag name rest
-- | Extract main content from HTML using simple heuristics.
-- This is a basic implementation - could be improved with readability algorithms.
@@ -202,3 +520,8 @@ cleanText txt =
let noTabs = T.replace "\t" " " txt
noExtraSpaces = T.unwords <| T.words noTabs
in T.strip noExtraSpaces
+
+lastMaybe :: [a] -> Maybe a
+lastMaybe [] = Nothing
+lastMaybe [x] = Just x
+lastMaybe (_ : xs) = lastMaybe xs
diff --git a/Omni/Newsreader/Ingest.hs b/Omni/Newsreader/Ingest.hs
index 1c7a865d..ede9bdd5 100644
--- a/Omni/Newsreader/Ingest.hs
+++ b/Omni/Newsreader/Ingest.hs
@@ -168,35 +168,108 @@ data EntryResult
-- | Process a single feed entry.
processEntry :: SQL.Connection -> Feed.FeedId -> Fetcher.FeedEntry -> IO EntryResult
processEntry conn feedId entry = do
- let url = Fetcher.entryLink entry
+ let url = T.strip (Fetcher.entryLink entry)
let title = Fetcher.entryTitle entry
+ let feedContent = Fetcher.entryContent entry
-- Check if article already exists
exists <- Article.articleExists conn feedId url
if exists
then pure ArticleSkipped
else do
- -- Use feed content (usually HTML from RSS/Atom) when available.
- -- Only fetch the full article URL when feed content is very short.
- let feedContent = Fetcher.entryContent entry
- if T.length feedContent > 100
+ if shouldFetchFullArticle entry
then do
- -- Feed has substantial content — use it directly (preserves HTML)
- storeArticleWithContent conn feedId url title feedContent (Fetcher.entryPublished entry)
- pure ArticleNew
- else do
- -- Feed content is empty/short — try extracting from URL
+ -- Feed appears truncated. Try fetching and extracting the full article.
extractResult <- Extractor.extractArticle url
case extractResult of
Extractor.ExtractFailure err -> do
- putText <| "Failed to extract article " <> url <> ": " <> err
+ putText <| "Failed to extract full article " <> url <> ": " <> err
storeArticleWithContent conn feedId url title feedContent (Fetcher.entryPublished entry)
pure ArticleNew
Extractor.ExtractSuccess content extractedTitle -> do
- let finalTitle = fromMaybe title extractedTitle
- let finalContent = if T.null content then feedContent else content
+ let finalTitle = fromMaybe title (extractedTitle >>= nonEmptyText)
+ let finalContent = preferExtractedContent feedContent content
storeArticleWithContent conn feedId url finalTitle finalContent (Fetcher.entryPublished entry)
pure ArticleNew
+ else do
+ -- Feed appears to contain full content already.
+ storeArticleWithContent conn feedId url title feedContent (Fetcher.entryPublished entry)
+ pure ArticleNew
+
+-- | Heuristic threshold for considering feed content "short".
+truncatedSnippetThreshold :: Int
+truncatedSnippetThreshold = 500
+
+-- | Common patterns used by feeds when truncating article bodies.
+readMorePatterns :: [Text]
+readMorePatterns =
+ [ "read more",
+ "continue reading",
+ "continue...",
+ "continue…",
+ "full story",
+ "keep reading",
+ "view article",
+ "[...]",
+ "[…]"
+ ]
+
+-- | Determine whether we should fetch the linked page for full text.
+--
+-- Truncation heuristics:
+-- - content snippet is shorter than threshold, OR
+-- - snippet includes "read more" style language.
+--
+-- We only attempt extraction when the entry has an HTTP(S) link.
+shouldFetchFullArticle :: Fetcher.FeedEntry -> Bool
+shouldFetchFullArticle entry =
+ let url = Fetcher.entryLink entry
+ rawContent = Fetcher.entryContent entry
+ plainSnippet = normalizeSnippet rawContent
+ plainLen = T.length plainSnippet
+ shortSnippet = plainLen < truncatedSnippetThreshold
+ hasReadMore = containsReadMorePattern rawContent || containsReadMorePattern plainSnippet
+ in hasFetchableLink url && (shortSnippet || hasReadMore)
+
+-- | Keep whichever content is likely better for reading.
+--
+-- Prefer extracted content when it is non-empty and at least as substantial
+-- as the feed snippet; otherwise fall back to the feed snippet.
+preferExtractedContent :: Text -> Text -> Text
+preferExtractedContent feedSnippet extracted =
+ let feedLen = T.length <| normalizeSnippet feedSnippet
+ extractedTrimmed = T.strip extracted
+ extractedLen = T.length extractedTrimmed
+ in if extractedLen == 0
+ then feedSnippet
+ else
+ if extractedLen >= feedLen
+ then extracted
+ else feedSnippet
+
+-- | Normalize a feed snippet for heuristic checks.
+normalizeSnippet :: Text -> Text
+normalizeSnippet = T.toLower <. T.strip <. Extractor.stripHtml
+
+-- | Whether content contains common truncation markers.
+containsReadMorePattern :: Text -> Bool
+containsReadMorePattern txt =
+ let lowered = T.toLower txt
+ stripped = T.strip lowered
+ in any (`T.isInfixOf` lowered) readMorePatterns
+ || "..." `T.isSuffixOf` stripped
+ || "…" `T.isSuffixOf` stripped
+
+-- | Ensure a URL looks fetchable over HTTP(S).
+hasFetchableLink :: Text -> Bool
+hasFetchableLink rawUrl =
+ let url = T.toLower <| T.strip rawUrl
+ in "https://" `T.isPrefixOf` url || "http://" `T.isPrefixOf` url
+
+nonEmptyText :: Text -> Maybe Text
+nonEmptyText txt =
+ let trimmed = T.strip txt
+ in if T.null trimmed then Nothing else Just trimmed
-- | Store article with content.
storeArticleWithContent ::