2
* Copyright (C) 2004, 2007, 2008, 2011, 2012 Apple Inc. All rights reserved.
3
* Copyright (C) 2012 Research In Motion Limited. All rights reserved.
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
14
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
#include "DecodeEscapeSequences.h"
31
#include "PlatformMemoryInstrumentation.h"
32
#include "TextEncoding.h"
34
#include <wtf/HashMap.h>
36
#include <wtf/HexNumber.h>
38
#include <wtf/MemoryInstrumentationString.h>
39
#include <wtf/StdLibExtras.h>
40
#include <wtf/text/CString.h>
41
#include <wtf/text/StringBuilder.h>
42
#include <wtf/text/StringHash.h>
45
#include <unicode/uidna.h>
46
#elif USE(QT4_UNICODE)
48
#elif USE(GLIB_UNICODE)
50
#include <wtf/gobject/GOwnPtr.h>
53
// FIXME: This file makes too much use of the + operator on String.
54
// We either have to optimize that operator so it doesn't involve
55
// so many allocations, or change this to use StringBuffer instead.
62
// Byte buffer type used in this file while building and parsing URL strings.
// NOTE(review): the 512 is presumably the Vector's inline capacity — confirm against wtf/Vector.h.
typedef Vector<char, 512> CharBuffer;
63
// UChar counterpart of CharBuffer.
typedef Vector<UChar, 512> UCharBuffer;
65
// Largest numeric value KURL::port() accepts from the port text.
static const unsigned maximumValidPortNumber = 0xFFFE;
66
// Value KURL::port() returns when the port text fails strict parsing or
// exceeds maximumValidPortNumber.
static const unsigned invalidPortNumber = 0xFFFF;
68
// Compares |character| against an ASCII lowercase letter, ignoring case.
// |lowercaseLetter| must already be lowercase (asserted). OR-ing 0x20 into
// |character| sets the bit that separates ASCII upper case from lower case.
static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
70
ASSERT(isASCIILower(lowercaseLetter));
71
return (character | 0x20) == lowercaseLetter;
74
#if !USE(GOOGLEURL) && !USE(WTFURL)
76
// Scheme names and default-port digit strings. They are spelled as char lists
// without a terminating NUL, so each array's extent equals its text length;
// the equal() templates in this file take them by reference-to-array and use
// that extent as the comparison length.
static const char wsScheme[] = {'w', 's'};
77
static const char ftpScheme[] = {'f', 't', 'p'};
78
static const char ftpPort[] = {'2', '1'};
79
static const char wssScheme[] = {'w', 's', 's'};
80
static const char fileScheme[] = {'f', 'i', 'l', 'e'};
81
static const char httpScheme[] = {'h', 't', 't', 'p'};
82
static const char httpPort[] = {'8', '0'};
83
static const char httpsScheme[] = {'h', 't', 't', 'p', 's'};
84
static const char httpsPort[] = {'4', '4', '3'};
85
static const char gopherScheme[] = {'g', 'o', 'p', 'h', 'e', 'r'};
86
static const char gopherPort[] = {'7', '0'};
88
// char overload: compares |character| against an ASCII lowercase letter,
// ignoring case. |lowercaseLetter| must already be lowercase (asserted).
static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
90
ASSERT(isASCIILower(lowercaseLetter));
91
// Setting bit 0x20 folds an ASCII uppercase letter onto its lowercase form.
return (character | 0x20) == lowercaseLetter;
94
// Bit flags naming the syntactic roles a byte may play inside a URL. The flags
// are OR-ed together per byte in characterClassTable and tested by the
// is*Char() helpers further down.
// NOTE(review): some enumerators used by the table (SchemeChar, IPv6Char,
// BadChar) are not visible among the lines below — confirm against the full file.
enum URLCharacterClasses {
96
SchemeFirstChar = 1 << 0,
98
// ( alpha | digit | "+" | "-" | "." )
101
// mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
102
// unreserved = alphanum | mark
103
// ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
104
UserInfoChar = 1 << 2,
106
// alnum | "." | "-" | "%"
107
// The above is what the specification says, but we are lenient to
108
// match existing practice and also allow:
110
HostnameChar = 1 << 3,
112
// hexdigit | ":" | "%"
115
// "#" | "?" | "/" | nul
116
PathSegmentEndChar = 1 << 5,
118
// not allowed in path
122
static const unsigned char characterClassTable[256] = {
123
/* 0 nul */ PathSegmentEndChar, /* 1 soh */ BadChar,
124
/* 2 stx */ BadChar, /* 3 etx */ BadChar,
125
/* 4 eot */ BadChar, /* 5 enq */ BadChar, /* 6 ack */ BadChar, /* 7 bel */ BadChar,
126
/* 8 bs */ BadChar, /* 9 ht */ BadChar, /* 10 nl */ BadChar, /* 11 vt */ BadChar,
127
/* 12 np */ BadChar, /* 13 cr */ BadChar, /* 14 so */ BadChar, /* 15 si */ BadChar,
128
/* 16 dle */ BadChar, /* 17 dc1 */ BadChar, /* 18 dc2 */ BadChar, /* 19 dc3 */ BadChar,
129
/* 20 dc4 */ BadChar, /* 21 nak */ BadChar, /* 22 syn */ BadChar, /* 23 etb */ BadChar,
130
/* 24 can */ BadChar, /* 25 em */ BadChar, /* 26 sub */ BadChar, /* 27 esc */ BadChar,
131
/* 28 fs */ BadChar, /* 29 gs */ BadChar, /* 30 rs */ BadChar, /* 31 us */ BadChar,
132
/* 32 sp */ BadChar, /* 33 ! */ UserInfoChar,
133
/* 34 " */ BadChar, /* 35 # */ PathSegmentEndChar | BadChar,
134
/* 36 $ */ UserInfoChar, /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
135
/* 38 & */ UserInfoChar, /* 39 ' */ UserInfoChar,
136
/* 40 ( */ UserInfoChar, /* 41 ) */ UserInfoChar,
137
/* 42 * */ UserInfoChar, /* 43 + */ SchemeChar | UserInfoChar,
138
/* 44 , */ UserInfoChar,
139
/* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
140
/* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
141
/* 47 / */ PathSegmentEndChar,
142
/* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
143
/* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
144
/* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
145
/* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
146
/* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
147
/* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
148
/* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
149
/* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
150
/* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
151
/* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
152
/* 58 : */ UserInfoChar | IPv6Char, /* 59 ; */ UserInfoChar,
153
/* 60 < */ BadChar, /* 61 = */ UserInfoChar,
154
/* 62 > */ BadChar, /* 63 ? */ PathSegmentEndChar | BadChar,
156
/* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
157
/* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
158
/* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
159
/* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
160
/* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
161
/* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
162
/* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
163
/* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
164
/* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
165
/* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
166
/* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
167
/* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
168
/* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
169
/* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
170
/* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
171
/* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
172
/* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
173
/* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
174
/* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
175
/* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
176
/* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
177
/* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
178
/* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
179
/* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
180
/* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
181
/* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
183
/* 92 \ */ 0, /* 93 ] */ 0,
185
/* 95 _ */ UserInfoChar | HostnameChar,
187
/* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
188
/* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
189
/* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
190
/* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
191
/* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
192
/* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
193
/* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
194
/* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
195
/* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
196
/* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
197
/* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
198
/* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
199
/* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
200
/* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
201
/* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
202
/* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
203
/* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
204
/* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
205
/* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
206
/* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
207
/* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
208
/* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
209
/* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
210
/* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
211
/* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
212
/* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
214
/* 124 | */ 0, /* 125 } */ 0, /* 126 ~ */ UserInfoChar, /* 127 del */ BadChar,
215
/* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
216
/* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
217
/* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
218
/* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
219
/* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
220
/* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
221
/* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
222
/* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
223
/* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
224
/* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
225
/* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
226
/* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
227
/* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
228
/* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
229
/* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
230
/* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
231
/* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
232
/* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
233
/* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
234
/* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
235
/* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
236
/* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
237
/* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
238
/* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
239
/* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
240
/* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
241
/* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
242
/* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
243
/* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
244
/* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
245
/* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
246
/* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
249
// Forward declaration; the definition appears later in this file.
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
250
// Forward declaration. KURL::init() calls this with the relative string, the
// requested encoding and the CharBuffer that receives the result.
static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& output);
251
// Forward declaration. KURL::init() applies this to relative strings that
// contain a backslash and are not javascript: or data: URLs.
static String substituteBackslashes(const String&);
253
// True when |c| has the SchemeFirstChar bit set in characterClassTable.
static inline bool isSchemeFirstChar(char c)
{
    // Index through unsigned char so the lookup stays inside the 256-entry table.
    const unsigned char tableIndex = static_cast<unsigned char>(c);
    return characterClassTable[tableIndex] & SchemeFirstChar;
}
254
// UChar overload: code units above 0xff lie outside the table and never match.
static inline bool isSchemeFirstChar(UChar c)
{
    if (c > 0xff)
        return false;
    return characterClassTable[c] & SchemeFirstChar;
}
255
// True when |c| has the SchemeChar bit set in characterClassTable.
static inline bool isSchemeChar(char c)
{
    // Index through unsigned char so the lookup stays inside the 256-entry table.
    const unsigned char tableIndex = static_cast<unsigned char>(c);
    return characterClassTable[tableIndex] & SchemeChar;
}
256
// UChar overload: code units above 0xff lie outside the table and never match.
static inline bool isSchemeChar(UChar c)
{
    if (c > 0xff)
        return false;
    return characterClassTable[c] & SchemeChar;
}
257
// True when |c| has the UserInfoChar bit set in characterClassTable.
static inline bool isUserInfoChar(unsigned char c)
{
    const unsigned char classBits = characterClassTable[c];
    return classBits & UserInfoChar;
}
258
// True when |c| has the HostnameChar bit set in characterClassTable.
static inline bool isHostnameChar(unsigned char c)
{
    const unsigned char classBits = characterClassTable[c];
    return classBits & HostnameChar;
}
259
// True when |c| has the IPv6Char bit set in characterClassTable.
static inline bool isIPv6Char(unsigned char c)
{
    const unsigned char classBits = characterClassTable[c];
    return classBits & IPv6Char;
}
260
// True when |c| has the PathSegmentEndChar bit set in characterClassTable.
static inline bool isPathSegmentEndChar(char c)
{
    // Index through unsigned char so the lookup stays inside the 256-entry table.
    const unsigned char tableIndex = static_cast<unsigned char>(c);
    return characterClassTable[tableIndex] & PathSegmentEndChar;
}
261
// UChar overload: code units above 0xff lie outside the table and never match.
static inline bool isPathSegmentEndChar(UChar c)
{
    if (c > 0xff)
        return false;
    return characterClassTable[c] & PathSegmentEndChar;
}
262
// True when |c| has the BadChar bit set in characterClassTable.
static inline bool isBadChar(unsigned char c)
{
    const unsigned char classBits = characterClassTable[c];
    return classBits & BadChar;
}
264
// Compares one character of a URL's scheme with the expected scheme character,
// ignoring ASCII case. The assertions require |character| to be a scheme
// character, and |schemeCharacter| to have bit 0x20 set and to be either a
// lowercase letter or a non-uppercase scheme character.
static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
266
ASSERT(isSchemeChar(character));
267
ASSERT(schemeCharacter & 0x20);
268
ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
269
// Because |schemeCharacter| already has 0x20 set, forcing that bit on in
// |character| makes the comparison case-insensitive for letters.
return (character | 0x20) == schemeCharacter;
272
// Copies the source to the destination, assuming all the source characters are
273
// ASCII. The destination buffer must be large enough. Null characters are allowed
274
// in the source string, and no attempt is made to null-terminate the result.
275
static void copyASCII(const String& string, char* dest)
277
// NOTE(review): the statement guarded by this test and the condition that
// selects between the two copy paths below are not visible here — confirm.
if (string.isEmpty())
281
// Copy straight from the string's characters8() data.
memcpy(dest, string.characters8(), string.length());
283
// Copy from the characters16() data, narrowing each UChar to a char.
const UChar* src = string.characters16();
284
size_t length = string.length();
285
for (size_t i = 0; i < length; i++)
286
dest[i] = static_cast<char>(src[i]);
290
// Fills |buffer| with the characters of |base| followed by |len| bytes of
// |rel| and a terminating NUL. |buffer| is resized to exactly
// base.length() + len + 1 elements, so any previous contents are overwritten.
static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer)
292
buffer.resize(base.length() + len + 1);
293
copyASCII(base, buffer.data());
294
memcpy(buffer.data() + base.length(), rel, len);
295
// The last element is the extra slot reserved for the terminator.
buffer[buffer.size() - 1] = '\0';
298
// FIXME: Move to WTFString.h eventually.
299
// Returns the index of the first occurrence in string |s| of any of the characters
300
// in |toFind|. |toFind| should be a null-terminated string, all characters up
301
// to the null will be searched. Returns int if not found.
302
static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind)
304
// Examine positions startPos .. sLen - 1 of |s| in order.
for (int i = startPos; i < sLen; i++) {
305
// Restart at the head of |toFind| for every position in |s|.
const char* cur = toFind;
307
// NOTE(review): the loop over |toFind| and the return statements are on
// lines not visible here.
if (s[i] == *(cur++))
314
// Asserts that |url| contains only ASCII and is either empty or starts with a
// character allowed as the first character of a scheme. ASSERT_UNUSED is used
// because |url| is referenced only inside the assertions.
static inline void checkEncodedString(const String& url)
316
ASSERT_UNUSED(url, url.containsOnlyASCII());
317
ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0]));
320
// Member-scope wrapper that forwards to the free function WebCore::protocolIs().
inline bool KURL::protocolIs(const String& string, const char* protocol)
322
return WebCore::protocolIs(string, protocol);
325
// Resets parse state. The visible statements clear m_protocolIsInHTTPFamily
// and zero m_pathAfterLastSlash.
// NOTE(review): further assignments sit on lines not visible here.
void KURL::invalidate()
328
m_protocolIsInHTTPFamily = false;
336
m_pathAfterLastSlash = 0;
341
// Constructs a KURL from |url|; the assertion checks that the stored m_string
// ended up identical to the input.
// NOTE(review): the ParsedURLStringTag name suggests callers promise |url| is
// already in parsed form — confirm against the header.
KURL::KURL(ParsedURLStringTag, const String& url)
344
ASSERT(url == m_string);
347
// Resolves |relative| against |base|, passing UTF-8 as the encoding to init().
KURL::KURL(const KURL& base, const String& relative)
349
init(base, relative, UTF8Encoding());
352
// Resolves |relative| against |base|. The encoding handed to init() is
// encoding.encodingForFormSubmission(), not |encoding| itself.
KURL::KURL(const KURL& base, const String& relative, const TextEncoding& encoding)
354
// For UTF-{7,16,32}, we want to use UTF-8 for the query part as
355
// we do when submitting a form. A form with GET method
356
// has its contents added to a URL as query params and it makes sense
358
init(base, relative, encoding.encodingForFormSubmission());
361
// Predicate KURL::init() uses to strip characters from both ends of the input.
// NOTE(review): the comparison itself is on a line not visible here.
static bool shouldTrimFromURL(unsigned char c)
363
// Browsers ignore leading/trailing whitespace and control
364
// characters from URLs. Note that c is an *unsigned* char here
365
// so this comparison should only catch control characters.
369
// Resolves |relative| against |base| and parses the result into this object.
// The visible flow is: optionally replace backslashes, convert the reference
// to a char buffer, trim both ends, decide whether the reference is absolute,
// then build the combined string in parseBuffer according to the reference's
// form and hand it to parse().
// NOTE(review): many lines of this function (declarations of |len|, |str| and
// |p|, several branch conditions, else/return statements) are not visible
// here; the comments below describe only the statements that are.
void KURL::init(const KURL& base, const String& relative, const TextEncoding& encoding)
371
// Allow resolutions with a null or empty base URL, but not with any other invalid one.
372
// FIXME: Is this a good rule?
373
if (!base.m_isValid && !base.isEmpty()) {
379
// For compatibility with Win IE, treat backslashes as if they were slashes,
380
// as long as we're not dealing with javascript: or data: URLs.
381
String rel = relative;
382
if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data")))
383
rel = substituteBackslashes(rel);
385
bool allASCII = rel.containsOnlyASCII();
386
CharBuffer strBuffer;
391
// Direct copy of |rel| into strBuffer, sized with one extra element.
// NOTE(review): presumably taken when allASCII is true — the branch is not visible.
strBuffer.resize(len + 1);
392
copyASCII(rel, strBuffer.data());
394
str = strBuffer.data();
396
// Otherwise the reference is converted through encodeRelativeString() using |encoding|.
encodeRelativeString(rel, encoding, strBuffer);
397
str = strBuffer.data();
401
// Get rid of leading whitespace and control characters.
402
while (len && shouldTrimFromURL(*str)) {
407
// Get rid of trailing whitespace and control characters.
408
while (len && shouldTrimFromURL(str[len - 1]))
411
// According to the RFC, the reference should be interpreted as an
412
// absolute URI if possible, using the "leftmost, longest"
413
// algorithm. If the URI reference is absolute it will have a
414
// scheme, meaning that it will have a colon before the first
415
// non-scheme element.
416
bool absolute = false;
418
if (isSchemeFirstChar(*p)) {
420
while (isSchemeChar(*p)) {
424
// A reference whose scheme equals the base's scheme (case-insensitively), is not
// followed by '/', and whose base is hierarchical gets special handling.
// NOTE(review): the guarded statements are not visible here.
if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical())
431
CharBuffer parseBuffer;
434
parse(str, &relative);
436
// If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid
437
// unless the relative URL is a single fragment.
438
if (!base.isHierarchical()) {
440
appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
441
parse(parseBuffer.data(), &relative);
451
// The reference is empty, so this is a reference to the same document with any fragment identifier removed.
453
removeFragmentIdentifier();
456
// must be fragment-only reference
457
appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
458
parse(parseBuffer.data(), &relative);
462
// query-only reference, special case needed for non-URL results
463
appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer);
464
parse(parseBuffer.data(), &relative);
468
// must be net-path or absolute-path reference
471
// Keep the base through the character after the scheme, then append the reference.
appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer);
472
parse(parseBuffer.data(), &relative);
475
// Keep the base through the end of the port, then append the reference.
appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer);
476
parse(parseBuffer.data(), &relative);
481
// must be relative-path reference
483
// Base part plus relative part plus one possible slash added in between plus terminating \0 byte.
484
const size_t bufferSize = base.m_pathEnd + 1 + len + 1;
485
parseBuffer.resize(bufferSize);
487
char* bufferPos = parseBuffer.data();
488
char* bufferStart = bufferPos;
490
// first copy everything before the path from the base
491
CharBuffer baseStringBuffer(base.m_string.length());
492
copyASCII(base.m_string, baseStringBuffer.data());
493
const char* baseString = baseStringBuffer.data();
494
const char* baseStringStart = baseString;
495
const char* pathStart = baseStringStart + base.m_portEnd;
496
while (baseStringStart < pathStart)
497
*bufferPos++ = *baseStringStart++;
498
// Marks where the path begins in the output; ".." handling below uses it as a bound.
char* bufferPathStart = bufferPos;
500
// now copy the base path
501
const char* baseStringEnd = baseString + base.m_pathEnd;
503
// go back to the last slash
504
while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/')
507
if (baseStringEnd == baseStringStart) {
508
// no path in base, add a path separator if necessary
509
if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#')
512
// Copy the base path up to its last slash, collapsing "." and ".." segments.
bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart);
515
const char* relStringStart = str;
516
const char* relStringPos = relStringStart;
518
// Walk the path portion of the reference, stopping at a query or fragment delimiter.
while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') {
519
if (relStringPos[0] == '.' && bufferPos[-1] == '/') {
520
if (isPathSegmentEndChar(relStringPos[1])) {
521
// skip over "." segment
523
if (relStringPos[0] == '/')
526
} else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) {
527
// skip over ".." segment and rewind the last segment
528
// the RFC leaves it up to the app to decide what to do with excess
529
// ".." segments - we choose to drop them since some web content
532
if (relStringPos[0] == '/')
534
if (bufferPos > bufferPathStart + 1)
536
while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/')
542
*bufferPos = *relStringPos;
547
// all done with the path work, now copy any remainder
548
// of the relative reference; this will also add a null terminator
549
// NOTE(review): strncpy writes a terminator only when the source is shorter
// than the count; the ASSERT below checks the result fits within parseBuffer.
strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart));
551
parse(parseBuffer.data(), &relative);
553
ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size());
560
// Returns a copy of this URL whose m_string has been replaced by its isolatedCopy().
// NOTE(review): the declaration of |result| and the return are not visible here.
KURL KURL::copy() const
563
result.m_string = result.m_string.isolatedCopy();
567
// True when the path range [m_portEnd, m_pathEnd) is non-empty.
bool KURL::hasPath() const
569
return m_pathEnd != m_portEnd;
572
// Returns a substring of m_string ending at the last path character.
// NOTE(review): the early exit before |end| is computed and the statements
// guarded by the two tests below are not visible here.
String KURL::lastPathComponent() const
577
unsigned end = m_pathEnd - 1;
578
// Detects a path that ends in '/'.
if (m_string[end] == '/')
581
// Search backwards from |end| for the slash that precedes the component.
size_t start = m_string.reverseFind('/', end);
582
// A slash located before m_portEnd is not part of the path.
if (start < static_cast<unsigned>(m_portEnd))
586
// Inclusive range start..end.
return m_string.substring(start, end - start + 1);
589
// Returns the first m_schemeEnd characters of the URL string (the scheme).
String KURL::protocol() const
591
return m_string.left(m_schemeEnd);
594
// Returns the host, i.e. the text from hostStart() to m_hostEnd, with URL
// escape sequences decoded.
String KURL::host() const
596
int start = hostStart();
597
return decodeURLEscapeSequences(m_string.substring(start, m_hostEnd - start));
600
// Returns the port parsed from the text between m_hostEnd + 1 and m_portEnd.
// Returns invalidPortNumber when strict parsing fails or the value exceeds
// maximumValidPortNumber.
unsigned short KURL::port() const
602
// We return a port of 0 if there is no port specified. This can happen in two situations:
603
// 1) The URL contains no colon after the host name and before the path component of the URL.
604
// 2) The URL contains a colon but there's no port number before the path component of the URL begins.
605
if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1)
608
const UChar* stringData = m_string.characters();
610
// Skip the character at m_hostEnd and parse the remaining port text.
unsigned number = charactersToUIntStrict(stringData + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
611
if (!ok || number > maximumValidPortNumber)
612
return invalidPortNumber;
616
// Returns the password: the text between m_userEnd + 1 and m_passwordEnd with
// URL escape sequences decoded.
// NOTE(review): the value returned when the test below succeeds is not visible here.
String KURL::pass() const
618
// Equal offsets mean there is no password text.
if (m_passwordEnd == m_userEnd)
621
return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
624
// Returns the user name, i.e. the range [m_userStart, m_userEnd), with URL
// escape sequences decoded.
String KURL::user() const
626
return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
629
// Returns the fragment text between m_queryEnd + 1 and m_fragmentEnd, without
// decoding escape sequences.
// NOTE(review): the value returned when there is no fragment is not visible here.
String KURL::fragmentIdentifier() const
631
if (m_fragmentEnd == m_queryEnd)
634
// The + 1 skips the character at m_queryEnd that introduces the fragment.
return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
637
// True when any text lies between m_queryEnd and m_fragmentEnd.
bool KURL::hasFragmentIdentifier() const
639
return m_fragmentEnd != m_queryEnd;
642
// Returns the URL string truncated at m_pathAfterLastSlash.
String KURL::baseAsString() const
644
return m_string.left(m_pathAfterLastSlash);
649
// Variant taking an unnamed (unused) parameter.
// NOTE(review): its body and the surrounding preprocessor conditionals are not
// visible here; presumably this is the no-op used when assertions are disabled — confirm.
static inline void assertProtocolIsGood(const char*)
655
// Asserts that characters of |protocol| are printable ASCII (above space,
// below 0x7F) and not uppercase letters.
// NOTE(review): the loop that advances |p| is not visible here.
static void assertProtocolIsGood(const char* protocol)
657
const char* p = protocol;
659
ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
666
// Returns whether this URL's scheme equals |protocol|, comparing character by
// character without building a temporary string. |protocol| must pass
// assertProtocolIsGood() and must not be "javascript" (asserted).
bool KURL::protocolIs(const char* protocol) const
668
assertProtocolIsGood(protocol);
670
// JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid.
671
// The free function protocolIsJavaScript() should be used instead.
672
ASSERT(!equalIgnoringCase(protocol, String("javascript")));
677
// Do the comparison without making a new string object.
678
for (int i = 0; i < m_schemeEnd; ++i) {
679
// Detects |protocol| ending early or a character mismatch.
if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i]))
682
return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
685
// Returns the query text between m_pathEnd + 1 and m_queryEnd.
// NOTE(review): the value returned when there is no query is not visible here.
String KURL::query() const
687
if (m_queryEnd == m_pathEnd)
690
// The + 1 skips the character at m_pathEnd that introduces the query.
return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
693
// Returns the path, i.e. the range [m_portEnd, m_pathEnd) of the URL string.
String KURL::path() const
695
return m_string.substring(m_portEnd, m_pathEnd - m_portEnd);
698
// Replaces the scheme with the part of |s| that precedes its first ':' and
// re-parses the URL.
// NOTE(review): the return statements and the condition choosing between the
// two parse() calls are not visible here.
bool KURL::setProtocol(const String& s)
700
// Firefox and IE remove everything after the first ':'.
701
size_t separatorPosition = s.find(':');
702
String newProtocol = s.substring(0, separatorPosition);
704
if (!isValidProtocol(newProtocol))
708
// Prefix the whole existing string with the new scheme and a ':'.
parse(newProtocol + ":" + m_string);
712
// Swap the scheme, keeping everything from m_schemeEnd onwards.
parse(newProtocol + m_string.substring(m_schemeEnd));
716
// Replaces the host with |s| and re-parses the rebuilt string. Text before
// hostStart() and from m_hostEnd onwards is kept.
void KURL::setHost(const String& s)
721
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
722
// and to avoid changing more than just the host.
724
// "//" is inserted when the user part starts directly after the scheme's ':'.
bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
726
parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd));
729
// Rebuilds the URL without the text between m_hostEnd and m_portEnd and re-parses it.
// NOTE(review): the statement guarded by the test below is not visible here.
void KURL::removePort()
731
// Equal offsets mean there is no port text.
if (m_hostEnd == m_portEnd)
733
parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
736
// Replaces the port with the decimal form of |i| and re-parses the URL.
void KURL::setPort(unsigned short i)
741
// With no existing port text a ':' has to be inserted after the host.
bool colonNeeded = m_portEnd == m_hostEnd;
742
// When port text already exists, keep the character at m_hostEnd and replace what follows.
int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
744
parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd));
747
// Replaces the range from hostStart() to m_portEnd with |hostAndPort| and
// re-parses the rebuilt string.
void KURL::setHostAndPort(const String& hostAndPort)
752
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
753
// and to avoid changing more than just host and port.
755
// "//" is inserted when the user part starts directly after the scheme's ':'.
bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
757
parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd));
760
// Sets or removes the user name and re-parses the URL when the string changes.
// NOTE(review): the declarations of |end| and |u| and the statements guarded by
// several of the tests below are not visible here.
void KURL::setUser(const String& user)
765
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
766
// and to avoid changing more than just the user login.
769
if (!user.isEmpty()) {
771
if (m_userStart == m_schemeEnd + 1)
773
// Add '@' if we didn't have one before.
774
if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
776
// Splice |u| in place of the text between m_userStart and |end|.
parse(m_string.left(m_userStart) + u + m_string.substring(end));
778
// Remove '@' if we now have neither user nor password.
779
if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
781
// We don't want to parse in the extremely common case where we are not going to make a change.
782
if (m_userStart != end)
783
parse(m_string.left(m_userStart) + m_string.substring(end));
787
// Sets or removes the password and re-parses the URL when the string changes.
// NOTE(review): the statements guarded by several of the tests below are not
// visible here.
void KURL::setPass(const String& password)
792
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
793
// and to avoid changing more than just the user password.
795
int end = m_passwordEnd;
796
if (!password.isEmpty()) {
797
// The new text carries its own ':' prefix and '@' suffix.
String p = ":" + password + "@";
798
if (m_userEnd == m_schemeEnd + 1)
800
// Eat the existing '@' since we are going to add our own.
801
if (end != m_hostEnd && m_string[end] == '@')
803
// Splice |p| in place of the text between m_userEnd and |end|.
parse(m_string.left(m_userEnd) + p + m_string.substring(end));
805
// Remove '@' if we now have neither user nor password.
806
if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
808
// We don't want to parse in the extremely common case where we are not going to make a change.
809
if (m_userEnd != end)
810
parse(m_string.left(m_userEnd) + m_string.substring(end));
814
// Replaces everything after m_queryEnd with '#' followed by |s|, then re-parses.
// |s| is appended as given, without escaping.
void KURL::setFragmentIdentifier(const String& s)
819
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
820
parse(m_string.left(m_queryEnd) + "#" + s);
823
// Truncates the URL string at m_queryEnd, dropping any fragment, and re-parses.
void KURL::removeFragmentIdentifier()
827
parse(m_string.left(m_queryEnd));
830
// Replaces the text between m_pathEnd and m_queryEnd with |query| and
// re-parses. Text from m_queryEnd onwards is kept.
void KURL::setQuery(const String& query)
835
// FIXME: '#' and non-ASCII characters must be encoded and escaped.
836
// Usually, the query is encoded using document encoding, not UTF-8, but we don't have
837
// access to the document in this function.
838
// A non-null |query| that does not already begin with '?' gets one prepended.
if ((query.isEmpty() || query[0] != '?') && !query.isNull())
839
parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
841
// Here |query| is inserted as given.
parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));
845
// Replaces the text between m_portEnd and m_pathEnd with the escape-encoded
// path and re-parses.
// NOTE(review): the declaration of |path| and the statement guarded by the
// test below are not visible here; presumably |path| starts as a copy of |s|.
void KURL::setPath(const String& s)
850
// FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
851
// may be inadvertently affected.
853
// Detects a path that is empty or lacks a leading '/'.
if (path.isEmpty() || path[0] != '/')
856
parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd));
859
// Decodes URL escape sequences in |string|, passing UTF-8 as the encoding.
String decodeURLEscapeSequences(const String& string)
861
return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
864
// Decodes URL escape sequences in |string|, passing the caller-supplied |encoding|.
String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
866
return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
869
// Writes the escaped form of |c| at |buffer|, which is taken by reference.
// Caution: This function does not bounds check; the caller must guarantee
// there is room for the output.
870
static void appendEscapedChar(char*& buffer, unsigned char c)
873
// NOTE(review): a statement before this call is not visible here.
placeByteAsHex(c, buffer);
876
// Processes |length| bytes starting at |strStart|, routing some of them
// through appendEscapedChar().
// NOTE(review): the declaration of |p|, the outer condition that selects bytes
// for escaping, and the plain-copy statements are not visible here.
static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length)
880
const char* str = strStart;
881
const char* strEnd = strStart + length;
882
while (str < strEnd) {
883
unsigned char c = *str++;
885
// '%' and '?' are handled separately from the escaping branch below.
if (c == '%' || c == '?')
887
// Tab, LF and CR are excluded from escaping.
else if (c != 0x09 && c != 0x0a && c != 0x0d)
888
appendEscapedChar(p, c);
896
// Processes |length| bytes starting at |strStart|: tab, LF and CR are singled
// out for stripping, and bytes below 0x20 or at/above 127 are escaped via
// appendEscapedChar().
// NOTE(review): the declaration of |p|, the statement that skips stripped
// bytes, and the plain-copy branch are not visible here.
static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length)
900
const char* str = strStart;
901
const char* strEnd = strStart + length;
902
while (str < strEnd) {
903
unsigned char c = *str++;
904
// Strip CR, LF and Tab from fragments, per:
905
// https://bugs.webkit.org/show_bug.cgi?id=8770
906
if (c == 0x09 || c == 0x0a || c == 0x0d)
909
// Chrome and IE allow non-ascii characters in fragments, however doing
910
// so would hit an ASSERT in checkEncodedString, so for now we don't.
911
if (c < 0x20 || c >= 127) {
912
appendEscapedChar(p, c);
921
// copy a path, accounting for "." and ".." segments
922
// Copies the path src[srcStart, srcEnd) to |dst| while handling "." and ".."
// segments, and returns the number of bytes written to |dst|.
// NOTE(review): the pointer increments and several guarded statements are not
// visible here.
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd)
924
// Remember where output began; used as the rewind bound and for the return value.
char* bufferPathStart = dst;
926
// empty path is a special case, and need not have a leading slash
927
if (srcStart != srcEnd) {
928
const char* baseStringStart = src + srcStart;
929
const char* baseStringEnd = src + srcEnd;
930
const char* baseStringPos = baseStringStart;
932
// this code is unprepared for paths that do not begin with a
933
// slash and we should always have one in the source string
934
ASSERT(baseStringPos[0] == '/');
936
// copy the leading slash into the destination
937
*dst = *baseStringPos;
941
while (baseStringPos < baseStringEnd) {
942
// A '.' counts as the start of a dot segment only directly after a '/' in the output.
if (baseStringPos[0] == '.' && dst[-1] == '/') {
943
if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) {
944
// skip over "." segment
947
} else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' ||
948
baseStringPos + 2 == baseStringEnd)) {
949
// skip over ".." segment and rewind the last segment
950
// the RFC leaves it up to the app to decide what to do with excess
951
// ".." segments - we choose to drop them since some web content
954
if (dst > bufferPathStart + 1)
956
// Rewind the output to just after the previous '/'.
while (dst > bufferPathStart && dst[-1] != '/')
962
*dst = *baseStringPos;
968
return dst - bufferPathStart;
971
// Scans the zero-terminated string for a '.' that directly follows a '/' or
// another '.'. `pc` holds the previously examined byte.
// NOTE(review): presumably returns true on the first such match and false
// otherwise, and advances `pc` on each iteration -- confirm.
static inline bool hasSlashDotOrDotDot(const char* str)
973
const unsigned char* p = reinterpret_cast<const unsigned char*>(str);
976
unsigned char pc = *p;
977
// The loop stops at the first zero byte after the initial character.
while (unsigned char c = *++p) {
978
if (c == '.' && (pc == '/' || pc == '.'))
985
// Parses `string` by copying it into a zero-terminated char buffer and
// delegating to parse(const char*, const String*). The original String is
// passed along as well.
void KURL::parse(const String& string)
987
checkEncodedString(string);
989
// One extra slot for the terminating '\0'.
CharBuffer buffer(string.length() + 1);
990
copyASCII(string, buffer.data());
991
buffer[string.length()] = '\0';
992
parse(buffer.data(), &string);
995
// Compares the first `length` characters of `a` against the fixed-size array
// `b`; `length` is deduced from the array type.
// NOTE(review): the per-character comparison is presumably `a[i] != b[i]` with
// an early `return false` -- confirm.
template<size_t length>
996
static inline bool equal(const char* a, const char (&b)[length])
998
for (size_t i = 0; i < length; ++i) {
1005
// Length-checked overload: true only when lengthA equals the extent of the
// array stringB and the array overload of equal() also matches.
// NOTE(review): lengthB is the full array extent, so any terminator stored in
// the array would count towards it -- confirm how the constant arrays are declared.
template<size_t lengthB>
1006
static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
1008
return lengthA == lengthB && equal(stringA, stringB);
1011
// Reports whether the textual `port` is the default port for `scheme`.
// The ws and http schemes are compared against httpPort, wss and https against
// httpsPort, ftp against ftpPort and gopher against gopherPort.
// List of default schemes is taken from google-url:
1012
// http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120
1013
static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength)
1015
// This switch is theoretically a performance optimization. It came over when
1016
// the code was moved from google-url, but may be removed later.
1017
// NOTE(review): the case labels are presumably the scheme lengths (2..6) -- confirm.
switch (schemeLength) {
1019
return equal(scheme, wsScheme) && equal(port, portLength, httpPort);
1021
if (equal(scheme, ftpScheme))
1022
return equal(port, portLength, ftpPort);
1023
if (equal(scheme, wssScheme))
1024
return equal(port, portLength, httpsPort);
1027
return equal(scheme, httpScheme) && equal(port, portLength, httpPort);
1029
return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort);
1031
return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort);
1036
// True when the character that ended the user info is '@' and the host/port
// range is empty (hostStart == portEnd).
static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userEndChar)
1038
return userEndChar == '@' && hostStart == portEnd;
1041
// Reports whether `scheme` is one of ws, ftp, wss, http, https or gopher.
// NOTE(review): the case labels are presumably the scheme lengths, with a
// `return false` fall-back -- confirm.
static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength)
1043
switch (schemeLength) {
1045
return equal(scheme, wsScheme);
1047
return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1049
return equal(scheme, httpScheme);
1051
return equal(scheme, httpsScheme);
1053
return equal(scheme, gopherScheme);
1058
// Reports whether `scheme` is one of ws, ftp, wss, http, file, https or gopher.
// KURL::parse uses the result to decide whether to lower-case the host name.
// NOTE(review): the case labels are presumably the scheme lengths, with a
// `return false` fall-back -- confirm.
static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength)
1060
switch (schemeLength) {
1062
return equal(scheme, wsScheme);
1064
return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1066
return equal(scheme, httpScheme) || equal(scheme, fileScheme);
1068
return equal(scheme, httpsScheme);
1070
return equal(scheme, gopherScheme);
1075
// Parses the zero-terminated `url` in two phases:
//   1. scan the input and record the start/end offsets of scheme, user,
//      password, host, port, path, query and fragment;
//   2. assemble a canonical copy in a local buffer, recording the m_*End
//      offsets into that copy, and store the result in m_string.
// On the early-exit paths m_string is set to *originalString when
// originalString is non-null, otherwise to `url`.
// NOTE(review): the early-exit paths presumably also mark the URL invalid and
// return -- confirm.
void KURL::parse(const char* url, const String* originalString)
1077
if (!url || url[0] == '\0') {
1078
// valid URL must be non-empty
1079
m_string = originalString ? *originalString : url;
1084
if (!isSchemeFirstChar(url[0])) {
1085
// scheme must start with an alphabetic character
1086
m_string = originalString ? *originalString : url;
1092
// Advance over the scheme characters.
while (isSchemeChar(url[schemeEnd]))
1095
// The scheme must be followed by ':'.
if (url[schemeEnd] != ':') {
1096
m_string = originalString ? *originalString : url;
1101
int userStart = schemeEnd + 1;
1110
// One slash after the colon makes the URL hierarchical; two slashes
// introduce a possible authority.
bool hierarchical = url[schemeEnd + 1] == '/';
1111
bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/';
1113
// Case-insensitive test for the four-letter scheme "file".
bool isFile = schemeEnd == 4
1114
&& isLetterMatchIgnoringCase(url[0], 'f')
1115
&& isLetterMatchIgnoringCase(url[1], 'i')
1116
&& isLetterMatchIgnoringCase(url[2], 'l')
1117
&& isLetterMatchIgnoringCase(url[3], 'e');
1119
#if PLATFORM(BLACKBERRY)
1120
// Parse local: urls the same as file: urls.
1122
isFile = schemeEnd == 5
1123
&& isLetterMatchIgnoringCase(url[0], 'l')
1124
&& isLetterMatchIgnoringCase(url[1], 'o')
1125
&& isLetterMatchIgnoringCase(url[2], 'c')
1126
&& isLetterMatchIgnoringCase(url[3], 'a')
1127
&& isLetterMatchIgnoringCase(url[4], 'l');
1130
// "http:" or "https:", case-insensitively.
m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h')
1131
&& isLetterMatchIgnoringCase(url[1], 't')
1132
&& isLetterMatchIgnoringCase(url[2], 't')
1133
&& isLetterMatchIgnoringCase(url[3], 'p')
1134
&& (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
1136
if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) {
1137
// The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
1138
// Attempt to find an authority.
1139
// FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
1145
userEnd = userStart;
1148
// Scan the candidate user info, noting the first ':' (user/password separator).
while (isUserInfoChar(url[userEnd])) {
1149
if (url[userEnd] == ':' && colonPos == 0)
1154
if (url[userEnd] == '@') {
1155
// actual end of the userinfo, start on the host
1156
if (colonPos != 0) {
1157
passwordEnd = userEnd;
1159
passwordStart = colonPos + 1;
1161
passwordStart = passwordEnd = userEnd;
1163
hostStart = passwordEnd + 1;
1164
} else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
1165
// hit the end of the authority, must have been no user
1166
// or looks like an IPv6 hostname
1167
// either way, try to parse it as a hostname
1168
userEnd = userStart;
1169
passwordStart = passwordEnd = userEnd;
1170
hostStart = userStart;
1172
// invalid character
1173
m_string = originalString ? *originalString : url;
1178
hostEnd = hostStart;
1181
// A leading '[' selects the bracketed IPv6 form of the host.
if (url[hostEnd] == '[') {
1183
while (isIPv6Char(url[hostEnd]))
1185
if (url[hostEnd] == ']')
1188
// invalid character
1189
m_string = originalString ? *originalString : url;
1194
while (isHostnameChar(url[hostEnd]))
1198
if (url[hostEnd] == ':') {
1199
portStart = portEnd = hostEnd + 1;
1201
// possible start of port
1202
portEnd = portStart;
1203
while (isASCIIDigit(url[portEnd]))
1206
portStart = portEnd = hostEnd;
1208
if (!isPathSegmentEndChar(url[portEnd])) {
1209
// invalid character
1210
m_string = originalString ? *originalString : url;
1215
if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[userEnd])) {
1216
// in this circumstance, act as if there is an erroneous hostname containing an '@'
1217
userEnd = userStart;
1218
hostStart = userEnd;
1221
if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) {
1222
// No authority found, which means that this is not a net_path, but rather an abs_path whose first two
1223
// path segments are empty. For file, http and https only, an empty authority is allowed.
1225
userEnd = userStart;
1226
passwordStart = userEnd;
1227
passwordEnd = passwordStart;
1228
hostStart = passwordEnd;
1229
hostEnd = hostStart;
1230
portStart = hostEnd;
1234
// the part after the scheme must be an opaque_part or an abs_path
1235
userEnd = userStart;
1236
passwordStart = passwordEnd = userEnd;
1237
hostStart = hostEnd = passwordEnd;
1238
portStart = portEnd = hostEnd;
1241
// The path runs from the end of the authority up to '?', '#' or the terminator.
int pathStart = portEnd;
1242
int pathEnd = pathStart;
1243
while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
1246
// The query, if present, starts at '?' and runs up to '#' or the terminator.
int queryStart = pathEnd;
1247
int queryEnd = queryStart;
1248
if (url[queryStart] == '?') {
1249
while (url[queryEnd] && url[queryEnd] != '#')
1253
// The fragment, if present, starts at '#' and runs to the terminator.
int fragmentStart = queryEnd;
1254
int fragmentEnd = fragmentStart;
1255
if (url[fragmentStart] == '#') {
1257
fragmentEnd = fragmentStart;
1258
while (url[fragmentEnd])
1262
// assemble it all, remembering the real ranges
1264
// NOTE(review): sized at 3x the input plus one, presumably because escaping
// can expand one byte to three ("%XX") -- confirm.
Vector<char, 4096> buffer(fragmentEnd * 3 + 1);
1266
char *p = buffer.data();
1267
const char *strPtr = url;
1269
// copy in the scheme
1270
const char *schemeEndPtr = url + schemeEnd;
1271
while (strPtr < schemeEndPtr)
1272
*p++ = toASCIILower(*strPtr++);
1273
m_schemeEnd = p - buffer.data();
1275
// True when the whole authority (userStart..portEnd) is exactly "localhost",
// compared case-insensitively.
bool hostIsLocalHost = portEnd - userStart == 9
1276
&& isLetterMatchIgnoringCase(url[userStart], 'l')
1277
&& isLetterMatchIgnoringCase(url[userStart+1], 'o')
1278
&& isLetterMatchIgnoringCase(url[userStart+2], 'c')
1279
&& isLetterMatchIgnoringCase(url[userStart+3], 'a')
1280
&& isLetterMatchIgnoringCase(url[userStart+4], 'l')
1281
&& isLetterMatchIgnoringCase(url[userStart+5], 'h')
1282
&& isLetterMatchIgnoringCase(url[userStart+6], 'o')
1283
&& isLetterMatchIgnoringCase(url[userStart+7], 's')
1284
&& isLetterMatchIgnoringCase(url[userStart+8], 't');
1286
// File URLs need a host part unless it is just file:// or file://localhost
1287
bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);
1289
bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || portStart != portEnd;
1291
// add ":" after scheme
1294
// if we have at least one authority part or a file URL - add "//" and authority
1295
if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
1299
m_userStart = p - buffer.data();
1302
// Copy in the user name.
strPtr = url + userStart;
1303
const char* userEndPtr = url + userEnd;
1304
while (strPtr < userEndPtr)
1306
m_userEnd = p - buffer.data();
1308
// copy in the password
1309
if (passwordEnd != passwordStart) {
1311
strPtr = url + passwordStart;
1312
const char* passwordEndPtr = url + passwordEnd;
1313
while (strPtr < passwordEndPtr)
1316
m_passwordEnd = p - buffer.data();
1318
// If we had any user info, add "@"
1319
if (p - buffer.data() != m_userStart)
1322
// copy in the host, except in the case of a file URL with authority="localhost"
1323
if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
1324
strPtr = url + hostStart;
1325
const char* hostEndPtr = url + hostEnd;
1326
// The scheme was already lower-cased into buffer, so it can be matched there.
if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) {
1327
while (strPtr < hostEndPtr)
1328
*p++ = toASCIILower(*strPtr++);
1330
while (strPtr < hostEndPtr)
1334
m_hostEnd = p - buffer.data();
1336
// Copy in the port if the URL has one (and it's not default).
1337
if (hostEnd != portStart) {
1338
const char* portStr = url + portStart;
1339
size_t portLength = portEnd - portStart;
1340
if (portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd)) {
1342
const char* portEndPtr = url + portEnd;
1343
while (portStr < portEndPtr)
1347
m_portEnd = p - buffer.data();
1350
ASSERT(degenerateFilePath);
1354
// No authority is emitted: every authority offset collapses to the current position.
m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();
1357
// For canonicalization, ensure we have a '/' for no path.
1358
// Do this only for URL with protocol file, http or https.
1359
if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart)
1362
// add path, escaping bad characters
1364
escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart);
1365
// Skip the dot-removal copy when the URL has no '.' after '/' or '.'.
else if (!hasSlashDotOrDotDot(url))
1366
appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
1368
CharBuffer pathBuffer(pathEnd - pathStart + 1);
1369
size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
1370
appendEscapingBadChars(p, pathBuffer.data(), length);
1373
m_pathEnd = p - buffer.data();
1375
// Find the position after the last slash in the path, or
1376
// the position before the path if there are no slashes in it.
1378
for (i = m_pathEnd; i > m_portEnd; --i) {
1379
if (buffer[i - 1] == '/')
1382
m_pathAfterLastSlash = i;
1384
// add query, escaping bad characters
1385
appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
1386
m_queryEnd = p - buffer.data();
1388
// add fragment, escaping bad characters
1389
if (fragmentEnd != queryEnd) {
1391
escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart);
1393
m_fragmentEnd = p - buffer.data();
1395
// Sanity checks: the write cursor must not have run past the buffer.
ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1396
ASSERT(buffer.size() > 0);
1398
// If we didn't end up actually changing the original string and
1399
// it was already in a String, reuse it to avoid extra allocation.
1400
if (originalString && *originalString == buffer)
1401
m_string = *originalString;
1403
m_string = String(buffer.data(), m_fragmentEnd);
1408
// Compares two URLs up to the end of the query, so the fragment takes no part
// in the comparison. The m_queryEnd offsets are compared first, then the
// strings character by character up to that offset.
// NOTE(review): presumably returns false on the first difference and true
// otherwise -- confirm.
bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
1410
if (a.m_queryEnd != b.m_queryEnd)
1412
unsigned queryLength = a.m_queryEnd;
1413
for (unsigned i = 0; i < queryLength; ++i)
1414
if (a.string()[i] != b.string()[i])
1419
// Compares scheme, host and port of two URLs. The scheme and host lengths are
// checked first, then the characters themselves, then port().
// NOTE(review): presumably returns false on the first mismatch and true
// otherwise -- confirm.
bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b)
1421
if (a.m_schemeEnd != b.m_schemeEnd)
1424
int hostStartA = a.hostStart();
1425
int hostLengthA = a.hostEnd() - hostStartA;
1426
int hostStartB = b.hostStart();
1427
int hostLengthB = b.hostEnd() - b.hostStart();
1428
if (hostLengthA != hostLengthB)
1432
// Check the scheme
for (int i = 0; i < a.m_schemeEnd; ++i)
1433
if (a.string()[i] != b.string()[i])
1437
// And the host; the two hosts may sit at different offsets in their strings.
for (int i = 0; i < hostLengthA; ++i)
1438
if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
1441
if (a.port() != b.port())
1447
// Converts the string to UTF-8, then walks it byte by byte into a char buffer,
// passing bytes to appendEscapedChar, and returns the buffer contents as a String.
// NOTE(review): presumably only "bad" bytes are escaped and the rest are copied
// verbatim -- confirm.
String encodeWithURLEscapeSequences(const String& notEncodedString)
1449
CString asUTF8 = notEncodedString.utf8();
1451
// NOTE(review): 3x sizing presumably covers the worst case of every byte
// becoming "%XX" -- confirm.
CharBuffer buffer(asUTF8.length() * 3 + 1);
1452
char* p = buffer.data();
1454
const char* str = asUTF8.data();
1455
const char* strEnd = str + asUTF8.length();
1456
while (str < strEnd) {
1457
unsigned char c = *str++;
1459
appendEscapedChar(p, c);
1464
ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1466
return String(buffer.data(), p - buffer.data());
1469
// Appends the punycoded hostname identified by the given string and length to
1470
// the output buffer. The result will not be null terminated.
// Host names that are all ASCII, or longer than the local conversion limit,
// are appended unchanged. The conversion backend is chosen at compile time
// (ICU, Qt4 or GLib).
1471
static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen)
1473
// Needs to be big enough to hold an IDN-encoded name.
1474
// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
1475
const unsigned hostnameBufferLength = 2048;
1477
if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) {
1478
buffer.append(str, strLen);
1482
#if USE(ICU_UNICODE)
1483
UChar hostnameBuffer[hostnameBufferLength];
1484
UErrorCode error = U_ZERO_ERROR;
1485
int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer,
1486
hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
1487
// Nothing is appended here unless ICU reports exactly U_ZERO_ERROR.
if (error == U_ZERO_ERROR)
1488
buffer.append(hostnameBuffer, numCharactersConverted);
1489
#elif USE(QT4_UNICODE)
1490
QByteArray result = QUrl::toAce(String(str, strLen));
1491
buffer.append(result.constData(), result.length());
1492
#elif USE(GLIB_UNICODE)
1493
// GLib works on UTF-8, so convert from UTF-16 first.
GOwnPtr<gchar> utf8Hostname;
1494
GOwnPtr<GError> utf8Err;
1495
utf8Hostname.set(g_utf16_to_utf8(str, strLen, 0, 0, &utf8Err.outPtr()));
1499
GOwnPtr<gchar> encodedHostname;
1500
encodedHostname.set(g_hostname_to_ascii(utf8Hostname.get()));
1501
if (!encodedHostname)
1504
buffer.append(encodedHostname.get(), strlen(encodedHostname.get()));
1508
// Collects the [start, end) ranges of the host names found in a mailto: URL
// and appends each as a pair to nameRanges.
static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector<pair<int, int> >& nameRanges)
1510
// In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character.
1511
// Skip quoted strings so that characters in them don't confuse us.
1512
// When we find a '?' character, we are past the part of the URL that contains host names.
1518
// Find start of host name or of quoted string.
1519
int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?");
1520
if (hostnameOrStringStart == -1)
1522
UChar c = str[hostnameOrStringStart];
1523
p = hostnameOrStringStart + 1;
1529
// Find end of host name.
1530
int hostnameStart = p;
1531
int hostnameEnd = findFirstOf(str, strLen, p, ">,?");
1533
// No terminator found: the host name runs to the end of the string.
if (hostnameEnd == -1) {
1534
hostnameEnd = strLen;
1541
nameRanges.append(make_pair(hostnameStart, hostnameEnd));
1546
// Skip quoted string.
1549
int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\");
1550
if (escapedCharacterOrStringEnd == -1)
1553
c = str[escapedCharacterOrStringEnd];
1554
p = escapedCharacterOrStringEnd + 1;
1556
// If we are at the end of the string, then break from the string loop back to the host name loop.
1560
// Skip escaped character.
1571
// Locates the host name in a hierarchical URL and writes its bounds to
// startOffset and endOffset.
// NOTE(review): presumably returns false when no "scheme://" prefix is found
// and true once the offsets are set -- confirm.
static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset)
1573
// Find the host name in a hierarchical URL.
1574
// It comes after a "://" sequence, with scheme characters preceding, and
1575
// this should be the first colon in the string.
1576
// It ends with the end of the string or a ":" or a path segment ending character.
1577
// If there is a "@" character, the host part is just the part after the "@".
1578
int separator = findFirstOf(str, strLen, 0, ":");
1579
if (separator == -1 || separator + 2 >= strLen ||
1580
str[separator + 1] != '/' || str[separator + 2] != '/')
1583
// Check that all characters before the :// are valid scheme characters.
1584
if (!isSchemeFirstChar(str[0]))
1586
for (int i = 1; i < separator; ++i) {
1587
if (!isSchemeChar(str[i]))
1591
// Start after the separator.
1592
int authorityStart = separator + 3;
1594
// Find terminating character.
1595
int hostnameEnd = strLen;
1596
for (int i = authorityStart; i < strLen; ++i) {
1598
// A zero character is excluded explicitly and does not terminate the host name here.
if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) {
1604
// Find "@" for the start of the host name.
1605
int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@");
1607
// An '@' beyond the end of the host name is not a user info separator.
if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd)
1608
hostnameStart = authorityStart;
1610
hostnameStart = userInfoTerminator + 1;
1612
startOffset = hostnameStart;
1613
endOffset = hostnameEnd;
1617
// Converts all hostnames found in the given input to punycode, preserving the
1618
// rest of the URL unchanged. The output will NOT be null-terminated.
// mailto: URLs may contain several host names; any other URL is searched for
// one host name using the hierarchical form.
1619
static void encodeHostnames(const String& str, UCharBuffer& output)
1623
if (protocolIs(str, "mailto")) {
1624
Vector<pair<int, int> > hostnameRanges;
1625
findHostnamesInMailToURL(str.characters(), str.length(), hostnameRanges);
1626
int n = hostnameRanges.size();
1628
// For each range: copy the text before the host name, then the encoded host name.
// NOTE(review): `p` is presumably the end of the previous range, starting at 0 -- confirm.
for (int i = 0; i < n; ++i) {
1629
const pair<int, int>& r = hostnameRanges[i];
1630
output.append(&str.characters()[p], r.first - p);
1631
appendEncodedHostname(output, &str.characters()[r.first], r.second - r.first);
1634
// This will copy either everything after the last hostname, or the
1635
// whole thing if there is no hostname.
1636
output.append(&str.characters()[p], str.length() - p);
1638
int hostStart, hostEnd;
1639
if (findHostnameInHierarchicalURL(str.characters(), str.length(), hostStart, hostEnd)) {
1640
output.append(str.characters(), hostStart); // Before hostname.
1641
appendEncodedHostname(output, &str.characters()[hostStart], hostEnd - hostStart);
1642
output.append(&str.characters()[hostEnd], str.length() - hostEnd); // After hostname.
1644
// No hostname to encode, return the input.
1645
output.append(str.characters(), str.length());
1650
// Encodes the relative URL string `rel` into `output` as zero-terminated bytes.
// Host names are punycoded first. The part before the first '#' or '?' is
// encoded as UTF-8 and the remainder with `encoding`; when no split point is
// used, the whole string is encoded as UTF-8.
static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output)
1653
// NOTE(review): `s` is presumably a local UCharBuffer declared just above -- confirm.
encodeHostnames(rel, s);
1655
TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme.
1658
// The split is only looked for when the encoding is valid and not UTF-8, and
// the URL is not mailto:, data: or javascript:.
if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) {
1659
// Find the first instance of either # or ?, keep pathEnd at -1 otherwise.
1660
pathEnd = findFirstOf(s.data(), s.size(), 0, "#?");
1663
if (pathEnd == -1) {
1664
CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables);
1665
output.resize(decoded.length());
1666
memcpy(output.data(), decoded.data(), decoded.length());
1668
CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables);
1669
// Unencodable characters in URLs are represented by converting
1670
// them to XML entities and escaping non-alphanumeric characters.
1671
CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables);
1673
// Concatenate the two encoded pieces into output.
output.resize(pathDecoded.length() + otherDecoded.length());
1674
memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
1675
memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length());
1677
output.append('\0'); // null-terminate the output.
1680
// Returns a copy of `string` in which every backslash before the path end is
// replaced by '/'; the text from the path end onwards is appended unchanged.
// The path end is the query start, or the string length if there is no '?'.
// NOTE(review): when '#' comes before any '?', pathEnd is presumably set to
// hashPos -- confirm.
static String substituteBackslashes(const String& string)
1682
size_t questionPos = string.find('?');
1683
size_t hashPos = string.find('#');
1686
if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos))
1688
else if (questionPos != notFound)
1689
pathEnd = questionPos;
1691
pathEnd = string.length();
1693
return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd);
1696
// A URL is treated as hierarchical when the character right after the scheme's
// ':' is '/'.
// NOTE(review): presumably guarded by an early return for invalid URLs -- confirm.
bool KURL::isHierarchical() const
1700
ASSERT(m_string[m_schemeEnd] == ':');
1701
return m_string[m_schemeEnd + 1] == '/';
1704
// Resizes `buffer` to the length of m_string and copies the string into it via
// copyASCII. No terminator is appended here.
void KURL::copyToBuffer(CharBuffer& buffer) const
1706
// FIXME: This throws away the high bytes of all the characters in the string!
1707
// That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
1708
buffer.resize(m_string.length());
1709
copyASCII(m_string, buffer.data());
1712
// Reports whether `url` starts with `protocol` followed by ':'. Letters are
// matched with isLetterMatchIgnoringCase.
// NOTE(review): the ':' check presumably runs once protocol[i] reaches its
// terminator, and a letter mismatch presumably returns false -- confirm.
bool protocolIs(const String& url, const char* protocol)
1714
// Do the comparison without making a new string object.
1715
assertProtocolIsGood(protocol);
1716
for (int i = 0; ; ++i) {
1718
return url[i] == ':';
1719
if (!isLetterMatchIgnoringCase(url[i], protocol[i]))
1724
// Validates a protocol (scheme) name: it must be non-empty, start with a
// scheme-first character, and contain only scheme characters after that.
// NOTE(review): each failed check presumably returns false, with true at the
// end -- confirm.
bool isValidProtocol(const String& protocol)
1726
// RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
1727
if (protocol.isEmpty())
1729
if (!isSchemeFirstChar(protocol[0]))
1731
unsigned protocolLength = protocol.length();
1732
// Index 0 was checked above, so start at 1.
for (unsigned i = 1; i < protocolLength; i++) {
1733
if (!isSchemeChar(protocol[i]))
1740
// Writes the URL string, converted to UTF-8, to stdout followed by a newline.
void KURL::print() const
1742
printf("%s\n", m_string.utf8().data());
1746
#endif // !USE(GOOGLEURL) && !USE(WTFURL)
1748
// Returns the string form of a copy of this URL with the user and password set
// to empty Strings and the fragment identifier removed. This object is not
// modified.
String KURL::strippedForUseAsReferrer() const
1750
KURL referrer(*this);
1751
referrer.setUser(String());
1752
referrer.setPass(String());
1753
referrer.removeFragmentIdentifier();
1754
return referrer.string();
1757
// True when the URL's protocol is "file"; no other scheme counts as local here.
bool KURL::isLocalFile() const
1759
// Including feed here might be a bad idea since drag and drop uses this check
1760
// and including feed would allow feeds to potentially let someone's blog
1761
// read the contents of the clipboard on a drag, even without a drop.
1762
// Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
1763
return protocolIs("file");
1766
// Convenience wrapper: true when protocolIs(url, "javascript") holds.
bool protocolIsJavaScript(const String& url)
1768
return protocolIs(url, "javascript");
1771
// Returns a reference to a function-local static KURL constructed from
// "about:blank" with the ParsedURLString tag.
const KURL& blankURL()
1773
DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank"));
1774
return staticBlankURL;
1777
// True for any URL whose protocol is "about"; the rest of the URL is not
// examined, so this is not limited to "about:blank".
bool KURL::isBlankURL() const
1779
return protocolIs("about");
1782
// Reports whether `port` is the default port registered for `protocol`.
// Known protocols: http (80), https (443), ftp (21), ftps (990). The map is
// keyed with CaseFoldingHash.
// NOTE(review): an empty protocol presumably returns false -- confirm.
bool isDefaultPortForProtocol(unsigned short port, const String& protocol)
1784
if (protocol.isEmpty())
1787
typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap;
1788
DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ());
1789
// Fill the static map the first time through.
if (defaultPorts.isEmpty()) {
1790
defaultPorts.set("http", 80);
1791
defaultPorts.set("https", 443);
1792
defaultPorts.set("ftp", 21);
1793
defaultPorts.set("ftps", 990);
1795
return defaultPorts.get(protocol) == port;
1798
// Decides whether the port of `url` may be used, by looking it up in a sorted
// list of blocked ports. Exceptions are made for FTP on ports 21 and 22, for
// file URLs, and (on BlackBerry) for local URLs.
// NOTE(review): each visible condition presumably returns true, with a final
// `return false` for blocked ports -- confirm.
bool portAllowed(const KURL& url)
1800
unsigned short port = url.port();
1802
// Since most URLs don't have a port, return early for the "no port" case.
1806
// This blocked port list matches the port blocking that Mozilla implements.
1807
// See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
1808
static const unsigned short blockedPortList[] = {
1842
135, // loc-srv / epmap
1848
512, // print / exec
1865
3659, // apple-sasl / PasswordServer [Apple addition]
1868
6665, // Alternate IRC [Apple addition]
1869
6666, // Alternate IRC [Apple addition]
1870
6667, // Standard IRC [Apple addition]
1871
6668, // Alternate IRC [Apple addition]
1872
6669, // Alternate IRC [Apple addition]
1873
invalidPortNumber, // Used to block all invalid port numbers
1875
const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList);
1878
// The port list must be sorted for binary_search to work.
1879
// The ordering is checked once, with ASSERT, on the first call.
static bool checkedPortList = false;
1880
if (!checkedPortList) {
1881
for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p)
1882
ASSERT(*p < *(p + 1));
1883
checkedPortList = true;
1887
// If the port is not in the blocked port list, allow it.
1888
if (!binary_search(blockedPortList, blockedPortListEnd, port))
1891
// Allow ports 21 and 22 for FTP URLs, as Mozilla does.
1892
if ((port == 21 || port == 22) && url.protocolIs("ftp"))
1895
// Allow any port number in a file URL, since the port number is ignored.
1896
if (url.protocolIs("file"))
1899
#if PLATFORM(BLACKBERRY)
1900
if (url.protocolIs("local"))
1907
// Extracts the MIME type from a data: URL. The type is the text between
// offset 5 (the length of "data:") and the first ';', or the first ',' if
// there is no ';'.
// NOTE(review): the substring is presumably taken only when index > 5, and an
// empty string is presumably returned when neither separator is found -- confirm.
String mimeTypeFromDataURL(const String& url)
1909
ASSERT(protocolIs(url, "data"));
1910
size_t index = url.find(';');
1911
if (index == notFound)
1912
index = url.find(',');
1913
if (index != notFound) {
1915
return url.substring(5, index - 5);
1916
return "text/plain"; // Data URLs with no MIME type are considered text/plain.
1921
void KURL::reportMemoryUsage(MemoryObjectInfo* memoryObjectInfo) const
1923
MemoryClassInfo info(memoryObjectInfo, this);
1925
info.addMember(m_url);
1927
info.addMember(m_urlImpl);
1928
#else // !USE(GOOGLEURL)
1929
info.addMember(m_string);