src/Hakyll/Web/Html.hs
--------------------------------------------------------------------------------
-- | Provides utilities to manipulate HTML pages
module Hakyll.Web.Html
( -- * Generic
withTags
-- * Headers
, demoteHeaders
-- * Url manipulation
, getUrls
, withUrls
, toUrl
, toSiteRoot
, isExternal
-- * Stripping/escaping
, stripTags
, escapeHtml
) where
--------------------------------------------------------------------------------
import Data.Char (digitToInt, intToDigit,
isDigit, toLower)
import Data.List (isPrefixOf)
import qualified Data.Set as S
import System.FilePath.Posix (joinPath, splitPath,
takeDirectory)
import Text.Blaze.Html (toHtml)
import Text.Blaze.Html.Renderer.String (renderHtml)
import qualified Text.HTML.TagSoup as TS
import Network.URI (isUnreserved, escapeURIString)
--------------------------------------------------------------------------------
-- | Map over all tags in the document
withTags :: (TS.Tag String -> TS.Tag String) -> String -> String
withTags f = renderTags' . map f . parseTags'
--------------------------------------------------------------------------------
-- | Map every @h1@ to an @h2@, @h2@ to @h3@, etc.
demoteHeaders :: String -> String
demoteHeaders = withTags $ \tag -> case tag of
TS.TagOpen t a -> TS.TagOpen (demote t) a
TS.TagClose t -> TS.TagClose (demote t)
t -> t
where
demote t@['h', n]
| isDigit n = ['h', intToDigit (min 6 $ digitToInt n + 1)]
| otherwise = t
demote t = t
--------------------------------------------------------------------------------
isUrlAttribute :: String -> Bool
isUrlAttribute = (`elem` ["src", "href", "data", "poster"])
--------------------------------------------------------------------------------
getUrls :: [TS.Tag String] -> [String]
getUrls tags = [v | TS.TagOpen _ as <- tags, (k, v) <- as, isUrlAttribute k]
--------------------------------------------------------------------------------
-- | Apply a function to each URL on a webpage
withUrls :: (String -> String) -> String -> String
withUrls f = withTags tag
where
tag (TS.TagOpen s a) = TS.TagOpen s $ map attr a
tag x = x
attr (k, v) = (k, if isUrlAttribute k then f v else v)
--------------------------------------------------------------------------------
-- | Customized TagSoup renderer. The default TagSoup renderer escape CSS
-- within style tags, and doesn't properly minimize.
renderTags' :: [TS.Tag String] -> String
renderTags' = TS.renderTagsOptions TS.RenderOptions
{ TS.optRawTag = (`elem` ["script", "style"]) . map toLower
, TS.optMinimize = (`S.member` minimize) . map toLower
, TS.optEscape = id
}
where
-- A list of elements which must be minimized
minimize = S.fromList
[ "area", "br", "col", "embed", "hr", "img", "input", "meta", "link"
, "param"
]
--------------------------------------------------------------------------------
-- | Customized TagSoup parser: do not decode any entities.
parseTags' :: String -> [TS.Tag String]
parseTags' = TS.parseTagsOptions (TS.parseOptions :: TS.ParseOptions String)
{ TS.optEntityData = \(str, b) -> [TS.TagText $ "&" ++ str ++ [';' | b]]
, TS.optEntityAttrib = \(str, b) -> ("&" ++ str ++ [';' | b], [])
}
--------------------------------------------------------------------------------
-- | Convert a filepath to an URL starting from the site root
--
-- Example:
--
-- > toUrl "foo/bar.html"
--
-- Result:
--
-- > "/foo/bar.html"
--
-- This also sanitizes the URL, e.g. converting spaces into '%20'
toUrl :: FilePath -> String
toUrl url = case url of
('/' : xs) -> '/' : sanitize xs
xs -> '/' : sanitize xs
where
-- Everything but unreserved characters should be escaped as we are
-- sanitising the path therefore reserved characters which have a
-- meaning in URI does not appear. Special casing for `/`, because it has
-- a special meaning in FilePath as well as in URI.
sanitize = escapeURIString (\c -> c == '/' || isUnreserved c)
--------------------------------------------------------------------------------
-- | Get the relative url to the site root, for a given (absolute) url
toSiteRoot :: String -> String
toSiteRoot = emptyException . joinPath . map parent
. filter relevant . splitPath . takeDirectory
where
parent = const ".."
emptyException [] = "."
emptyException x = x
relevant "." = False
relevant "/" = False
relevant "./" = False
relevant _ = True
--------------------------------------------------------------------------------
-- | Check if an URL links to an external HTTP(S) source
isExternal :: String -> Bool
isExternal url = any (flip isPrefixOf url) ["http://", "https://", "//"]
--------------------------------------------------------------------------------
-- | Strip all HTML tags from a string
--
-- Example:
--
-- > stripTags "<p>foo</p>"
--
-- Result:
--
-- > "foo"
--
-- This also works for incomplete tags
--
-- Example:
--
-- > stripTags "<p>foo</p"
--
-- Result:
--
-- > "foo"
stripTags :: String -> String
stripTags [] = []
stripTags ('<' : xs) = stripTags $ drop 1 $ dropWhile (/= '>') xs
stripTags (x : xs) = x : stripTags xs
--------------------------------------------------------------------------------
-- | HTML-escape a string
--
-- Example:
--
-- > escapeHtml "Me & Dean"
--
-- Result:
--
-- > "Me & Dean"
escapeHtml :: String -> String
escapeHtml = renderHtml . toHtml