2
* Copyright (C) 2011 Google Inc. All rights reserved.
4
* Redistribution and use in source and binary forms, with or without
5
* modification, are permitted provided that the following conditions are
8
* * Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* * Redistributions in binary form must reproduce the above
11
* copyright notice, this list of conditions and the following disclaimer
12
* in the documentation and/or other materials provided with the
14
* * Neither the name of Google Inc. nor the names of its
15
* contributors may be used to endorse or promote products derived from
16
* this software without specific prior written permission.
18
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
#include "MHTMLArchive.h"
38
#include "MHTMLParser.h"
39
#include "MIMETypeRegistry.h"
41
#include "PageSerializer.h"
42
#include "QuotedPrintable.h"
43
#include "SchemeRegistry.h"
44
#include "SharedBuffer.h"
46
#include <wtf/CryptographicallyRandomNumber.h>
47
#include <wtf/DateMath.h>
48
#include <wtf/GregorianDateTime.h>
49
#include <wtf/StdLibExtras.h>
50
#include <wtf/text/Base64.h>
51
#include <wtf/text/StringBuilder.h>
61
// Names of the transfer encodings this file can emit. One of them is written after
// "Content-Transfer-Encoding: " in the header of every generated MHTML part, and the
// same constants are compared with strcmp() to select how the part's payload is encoded.
const char* const quotedPrintable = "quoted-printable";
62
const char* const base64 = "base64";
63
const char* const binary = "binary";
65
// Builds the multipart boundary string used to separate MHTML parts.
// The result is the fixed prefix "----=_NextPart_000_" followed by hexadecimal digits
// derived from 10 cryptographically random bytes, with '_' and '.' separators mixed in.
// Returns the boundary as a String (without any leading "--" or trailing CRLF).
static String generateRandomBoundary()
67
// Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0).
68
const size_t randomValuesLength = 10;
69
char randomValues[randomValuesLength];
70
cryptographicallyRandomValues(&randomValues, randomValuesLength);
71
StringBuilder stringBuilder;
72
stringBuilder.append("----=_NextPart_000_");
73
for (size_t i = 0; i < randomValuesLength; ++i) {
75
// NOTE(review): the '_' and '.' separators are presumably emitted only at specific byte
// positions so that the digit groups resemble the example above (4, 8 and 8 hex digits) — confirm.
stringBuilder.append('_');
77
stringBuilder.append('.');
78
// Each random byte contributes two hex digits: the lower nibble first, then the upper nibble.
stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i]));
79
stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i]));
81
return stringBuilder.toString();
84
// Produces a sanitized copy of |text| for use in an ASCII-only header (it is applied to the
// page title that goes into the Subject header).
// Characters accepted by isASCIIPrintable() are copied through; '?' is the substitute character.
// Returns the newly built String; |text| itself is not modified.
static String replaceNonPrintableCharacters(const String& text)
86
StringBuilder stringBuilder;
87
for (size_t i = 0; i < text.length(); ++i) {
88
if (isASCIIPrintable(text[i]))
89
stringBuilder.append(text[i]);
91
// NOTE(review): '?' is presumably appended only for characters that failed the
// isASCIIPrintable() test above — confirm.
stringBuilder.append('?');
93
return stringBuilder.toString();
96
// Default constructor. Instances are handed out through MHTMLArchive::create().
MHTMLArchive::MHTMLArchive()
100
// Destructor. Calls clearAllSubframeArchives() so that the links between frame archives
// (every frame archive is registered on the others when an MHTML file is loaded) are torn down.
MHTMLArchive::~MHTMLArchive()
102
// Because all frames know about each other we need to perform a deep clearing of the archives graph.
103
clearAllSubframeArchives();
106
// Factory for an MHTMLArchive that is not backed by parsed data.
// Returns a newly allocated archive whose initial reference is adopted by the returned PassRefPtr.
PassRefPtr<MHTMLArchive> MHTMLArchive::create()
108
return adoptRef(new MHTMLArchive);
111
// Factory that builds an archive by parsing MHTML content.
//   url  - location the MHTML content was loaded from; its scheme must be one that
//          SchemeRegistry treats as local.
//   data - buffer holding the raw MHTML content, handed to MHTMLParser.
// Returns the main archive produced by the parser, or 0 when the MHTML file is invalid.
// NOTE(review): a non-local |url| is presumably rejected with a null result as well — confirm.
PassRefPtr<MHTMLArchive> MHTMLArchive::create(const KURL& url, SharedBuffer* data)
113
// For security reasons we only load MHTML pages from local URLs.
114
if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol()))
117
MHTMLParser parser(data);
118
RefPtr<MHTMLArchive> mainArchive = parser.parseArchive();
120
return 0; // Invalid MHTML file.
122
// Since MHTML is a flat format, we need to make all frames aware of all resources.
123
for (size_t i = 0; i < parser.frameCount(); ++i) {
124
RefPtr<MHTMLArchive> archive = parser.frameAt(i);
125
// The subframe loop starts at index 1, so frame 0 is never registered as a subframe.
// NOTE(review): presumably frame 0 is the main frame, and a frame is not added to itself — confirm.
for (size_t j = 1; j < parser.frameCount(); ++j) {
127
archive->addSubframeArchive(parser.frameAt(j));
129
// Every frame archive receives every subresource found by the parser.
for (size_t j = 0; j < parser.subResourceCount(); ++j)
130
archive->addSubresource(parser.subResourceAt(j));
132
return mainArchive.release();
135
// Serializes |page| to MHTML with binary encoding disabled (forwards with useBinaryEncoding = false),
// so the transfer encoding of each part is chosen from the resource's MIME type.
// Returns the buffer produced by the two-argument overload.
PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page)
137
return generateMHTMLData(page, false);
140
// Serializes |page| to MHTML with every part written using the "binary" transfer encoding
// (forwards with useBinaryEncoding = true).
// Returns the buffer produced by the two-argument overload.
PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLDataUsingBinaryEncoding(Page* page)
142
return generateMHTMLData(page, true);
145
// Serializes |page| into an MHTML (multipart/related) document.
//
// The generated data consists of:
//   1. Top-level headers: From, Subject (sanitized page title), Date, MIME-Version and a
//      multipart/related Content-Type carrying the main document's MIME type and a random boundary.
//   2. One part per resource collected by PageSerializer, each introduced by "--<boundary>" and
//      carrying Content-Type, Content-Transfer-Encoding and Content-Location headers followed by
//      the encoded payload.
//   3. The closing "--<boundary>--" line.
//
//   page              - page to serialize; page->mainFrame()->document() is dereferenced directly.
//                       NOTE(review): callers presumably guarantee these are non-null — confirm.
//   useBinaryEncoding - when true every part uses the "binary" transfer encoding; otherwise the
//                       encoding is picked per resource from its MIME type.
// Returns a SharedBuffer holding the complete MHTML data.
PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page, bool useBinaryEncoding)
147
Vector<PageSerializer::Resource> resources;
148
PageSerializer pageSerializer(&resources);
149
pageSerializer.serialize(page);
151
String boundary = generateRandomBoundary();
152
// Despite its name, this delimiter ("--<boundary>\r\n") is written at the start of each part.
String endOfResourceBoundary = makeString("--", boundary, "\r\n");
154
GregorianDateTime now;
155
now.setToCurrentLocalTime();
156
// NOTE(review): utcOffset() / 60 presumably converts an offset in seconds to minutes — confirm.
String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60);
158
// Top-level message headers.
StringBuilder stringBuilder;
159
stringBuilder.append("From: <Saved by WebKit>\r\n");
160
stringBuilder.append("Subject: ");
161
// We replace non ASCII characters with '?' characters to match IE's behavior.
162
stringBuilder.append(replaceNonPrintableCharacters(page->mainFrame()->document()->title()));
163
stringBuilder.append("\r\nDate: ");
164
stringBuilder.append(dateString);
165
stringBuilder.append("\r\nMIME-Version: 1.0\r\n");
166
stringBuilder.append("Content-Type: multipart/related;\r\n");
167
stringBuilder.append("\ttype=\"");
168
stringBuilder.append(page->mainFrame()->document()->suggestedMIMEType());
169
stringBuilder.append("\";\r\n");
170
stringBuilder.append("\tboundary=\"");
171
stringBuilder.append(boundary);
172
// The empty line ("\r\n\r\n") ends the header section.
stringBuilder.append("\"\r\n\r\n");
174
// We use utf8() below instead of ascii() because ascii() replaces CRLFs with ?? (only ASCII characters have been appended so far).
175
ASSERT(stringBuilder.toString().containsOnlyASCII());
176
CString asciiString = stringBuilder.toString().utf8();
177
RefPtr<SharedBuffer> mhtmlData = SharedBuffer::create();
178
mhtmlData->append(asciiString.data(), asciiString.length());
180
// Emit one MIME part per serialized resource.
for (size_t i = 0; i < resources.size(); ++i) {
181
const PageSerializer::Resource& resource = resources[i];
183
// The builder is reused for the headers of each part.
stringBuilder.clear();
184
stringBuilder.append(endOfResourceBoundary);
185
stringBuilder.append("Content-Type: ");
186
stringBuilder.append(resource.mimeType);
188
// Pick the transfer encoding: binary when requested, quoted-printable for supported
// JavaScript / non-image MIME types, base64 for the remaining resources.
const char* contentEncoding = 0;
189
if (useBinaryEncoding)
190
contentEncoding = binary;
191
else if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType))
192
contentEncoding = quotedPrintable;
194
contentEncoding = base64;
196
stringBuilder.append("\r\nContent-Transfer-Encoding: ");
197
stringBuilder.append(contentEncoding);
198
stringBuilder.append("\r\nContent-Location: ");
199
stringBuilder.append(resource.url);
200
stringBuilder.append("\r\n\r\n");
202
asciiString = stringBuilder.toString().utf8();
203
mhtmlData->append(asciiString.data(), asciiString.length());
205
// Binary parts: the raw resource bytes are appended chunk by chunk as handed out by getSomeData().
if (!strcmp(contentEncoding, binary)) {
208
while (size_t length = resource.data->getSomeData(data, position)) {
209
mhtmlData->append(data, length);
213
// FIXME: ideally we would encode the content as a stream without having to fetch it all.
214
const char* data = resource.data->data();
215
size_t dataLength = resource.data->size();
216
Vector<char> encodedData;
217
// Quoted-printable parts: the encoded payload is followed by a single CRLF.
if (!strcmp(contentEncoding, quotedPrintable)) {
218
quotedPrintableEncode(data, dataLength, encodedData);
219
mhtmlData->append(encodedData.data(), encodedData.size());
220
mhtmlData->append("\r\n", 2);
222
ASSERT(!strcmp(contentEncoding, base64));
223
// We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs.
224
base64Encode(data, dataLength, encodedData);
225
// The base64 text is written in lines of at most maximumLineLength characters, each ended by CRLF.
const size_t maximumLineLength = 76;
227
size_t encodedDataLength = encodedData.size();
229
size_t lineLength = std::min(encodedDataLength - index, maximumLineLength);
230
mhtmlData->append(encodedData.data() + index, lineLength);
231
mhtmlData->append("\r\n", 2);
232
index += maximumLineLength;
233
} while (index < encodedDataLength);
238
// Closing delimiter ("--<boundary>--") that terminates the multipart body.
asciiString = makeString("--", boundary, "--\r\n").utf8();
239
mhtmlData->append(asciiString.data(), asciiString.length());
241
return mhtmlData.release();