1
{-# LANGUAGE OverloadedStrings #-}
3
Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu>
5
This program is free software; you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published by
7
the Free Software Foundation; either version 2 of the License, or
8
(at your option) any later version.
10
This program is distributed in the hope that it will be useful,
11
but WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
GNU General Public License for more details.
15
You should have received a copy of the GNU General Public License
16
along with this program; if not, write to the Free Software
17
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
Module : Text.Pandoc.SelfContained
22
Copyright : Copyright (C) 2011 John MacFarlane
23
License : GNU GPL, version 2 or above
25
Maintainer : John MacFarlane <jgm@berkeley.edu>
27
Portability : portable
29
Functions for converting an HTML file into one that can be viewed
30
offline, by incorporating linked images, CSS, and scripts into
31
the HTML using data URIs.
33
module Text.Pandoc.SelfContained ( makeSelfContained ) where
34
import Text.HTML.TagSoup
35
import Network.URI (isAbsoluteURI, parseURI, escapeURIString)
37
import Data.ByteString.Base64
38
import qualified Data.ByteString.Char8 as B
39
import Data.ByteString (ByteString)
40
import Data.ByteString.UTF8 (toString, fromString)
41
import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>))
42
import Data.Char (toLower, isAscii, isAlphaNum)
43
import Codec.Compression.GZip as Gzip
44
import qualified Data.ByteString.Lazy as L
45
import Text.Pandoc.Shared (findDataFile)
46
import Text.Pandoc.MIME (getMimeType)
47
import System.Directory (doesFileExist)
49
getItem :: Maybe FilePath -> String -> IO (ByteString, Maybe String)
54
let mime = case takeExtension f of
55
".gz" -> getMimeType $ dropExtension f
57
exists <- doesFileExist f
63
res <- findDataFile userdata f
64
exists' <- doesFileExist res
67
cont <- B.readFile res
69
else error $ "Could not find `" ++ f ++ "'"
71
-- TODO - have this return mime type too - then it can work for google
73
openURL :: String -> IO (ByteString, Maybe String)
74
openURL u = getBodyAndMimeType =<< simpleHTTP (getReq u)
75
where getReq v = case parseURI v of
76
Nothing -> error $ "Could not parse URI: " ++ v
77
Just u' -> mkRequest GET u'
78
getBodyAndMimeType (Left err) = fail (show err)
79
getBodyAndMimeType (Right r) = return (rspBody r, findHeader HdrContentType r)
82
isOk c = isAscii c && isAlphaNum c
84
convertTag :: Maybe FilePath -> Tag String -> IO (Tag String)
85
convertTag userdata t@(TagOpen "img" as) =
86
case fromAttrib "src" t of
89
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
90
let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw)
91
return $ TagOpen "img" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
92
convertTag userdata t@(TagOpen "video" as) =
93
case fromAttrib "src" t of
96
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
97
let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw)
98
return $ TagOpen "video" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
99
convertTag userdata t@(TagOpen "script" as) =
100
case fromAttrib "src" t of
103
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
104
let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
105
return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
106
convertTag userdata t@(TagOpen "link" as) =
107
case fromAttrib "href" t of
110
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
111
let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
112
return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"])
113
convertTag _ t = return t
115
cssURLs :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString
116
cssURLs userdata d orig =
117
case B.breakSubstring "url(" orig of
118
(x,y) | B.null y -> return orig
120
let (u,v) = B.breakSubstring ")" $ B.drop 4 y
123
"\"" -> B.takeWhile (/='"') $ B.drop 1 u
125
(raw, mime) <- getRaw userdata "" (d </> url)
126
rest <- cssURLs userdata d v
127
let enc = "data:" `B.append` fromString mime `B.append`
128
";base64," `B.append` (encode raw)
129
return $ x `B.append` "url(" `B.append` enc `B.append` rest
131
getRaw :: Maybe FilePath -> String -> String -> IO (ByteString, String)
132
getRaw userdata mimetype src = do
133
let ext = map toLower $ takeExtension src
134
(raw, respMime) <- getItem userdata src
135
let raw' = if ext == ".gz"
136
then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks
139
let mime = case (mimetype, respMime) of
140
("",Nothing) -> error
141
$ "Could not determine mime type for `" ++ src ++ "'"
144
result <- if mime == "text/css"
145
then cssURLs userdata (takeDirectory src) raw'
147
return (result, mime)
149
-- | Convert HTML into self-contained HTML, incorporating images,
150
-- scripts, and CSS using data: URIs. Items specified using absolute
151
-- URLs will be downloaded; those specified using relative URLs will
152
-- be sought first relative to the working directory, then relative
153
-- to the user data directory (if the first parameter is 'Just'
154
-- a directory), and finally relative to pandoc's default data
156
makeSelfContained :: Maybe FilePath -> String -> IO String
157
makeSelfContained userdata inp = do
158
let tags = parseTags inp
159
out' <- mapM (convertTag userdata) tags
160
return $ renderTagsOptions renderOptions{ optMinimize = (\t -> t == "br"
161
|| t == "img" || t == "meta" || t == "link" ) } out'