2
namespace TSEP\Component\Indexer\Helper;
5
* This file has been modified from its original form
6
* The License and comments were not altered
10
* Edited by Nitin Kr. Gupta, publicmind.in
14
* Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
15
* All rights reserved.
17
* Redistribution and use in source and binary forms, with or without
18
* modification, are permitted provided that the following conditions
21
* * Redistributions of source code must retain the above copyright
22
* notice, this list of conditions and the following disclaimer.
24
* * Redistributions in binary form must reproduce the above
25
* copyright notice, this list of conditions and the following
26
* disclaimer in the documentation and/or other materials provided
27
* with the distribution.
29
* * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
30
* the names of its contributors may be used to endorse or promote
31
* products derived from this software without specific prior
34
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
37
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
38
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
39
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
40
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
41
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
42
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
43
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
44
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
49
* This is a BSD License approved by the Open Source Initiative (OSI).
50
* See: http://www.opensource.org/licenses/bsd-license.php
/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" (reference resolution)
 * algorithm from the RFC3986 specification for URLs.
 *
 * Paths are handled as UTF-8 byte strings; because "/" is a single
 * ASCII byte that never occurs inside a multi-byte UTF-8 sequence,
 * the byte-wise string functions used here are multi-byte safe.
 *
 * @param string $baseUrl     the absolute base URL.
 * @param string $relativeUrl the relative URL to convert.
 *
 * @return string|false An absolute URL that combines parts of the base
 *                      and relative URLs, or FALSE if the base URL is
 *                      not absolute (no scheme or no host) or if either
 *                      URL cannot be parsed.
 */
public static function urlToAbsolute( $baseUrl, $relativeUrl )
{
    // If relative URL has a scheme, clean path and return.
    $r = self::splitUrl( $relativeUrl );
    if ( $r === false )
        return false;
    if ( !is_array( $r ) )
        $r = array();          // defensive: treat "nothing parsed" as no parts
    if ( !empty( $r['scheme'] ) )
    {
        if ( isset( $r['path'] ) && $r['path'] !== '' && $r['path'][0] === '/' )
            $r['path'] = self::urlRemoveDotSegments( $r['path'] );
        return self::joinURL( $r );
    }

    // Make sure the base URL is absolute.
    $b = self::splitUrl( $baseUrl );
    if ( $b === false || empty( $b['scheme'] ) || empty( $b['host'] ) )
        return false;
    $r['scheme'] = $b['scheme'];

    // If relative URL has an authority, clean path and return.
    if ( isset( $r['host'] ) )
    {
        if ( isset( $r['path'] ) && $r['path'] !== '' )
            $r['path'] = self::urlRemoveDotSegments( $r['path'] );
        return self::joinURL( $r );
    }
    // No authority in the relative URL: make sure no stray authority
    // parts survive before the base authority is copied in.
    unset( $r['port'] );
    unset( $r['user'] );
    unset( $r['pass'] );

    // Copy base authority.
    $r['host'] = $b['host'];
    if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
    if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
    if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];

    // If relative URL has no path, use base path.  A strict comparison
    // is used (not empty()) so that a relative path of "0" is kept.
    if ( !isset( $r['path'] ) || $r['path'] === '' )
    {
        if ( isset( $b['path'] ) && $b['path'] !== '' )
            $r['path'] = $b['path'];
        if ( !isset( $r['query'] ) && isset( $b['query'] ) )
            $r['query'] = $b['query'];
        return self::joinURL( $r );
    }

    // If relative URL path doesn't start with /, merge with base path:
    // keep everything in the base path before its last "/" and append
    // the relative path.  A base URL without a path contributes "".
    if ( $r['path'][0] !== '/' )
    {
        $basePath  = isset( $b['path'] ) ? $b['path'] : '';
        $lastSlash = strrpos( $basePath, '/' );
        $baseDir   = ( $lastSlash === false ) ? '' : substr( $basePath, 0, $lastSlash );
        $r['path'] = $baseDir . '/' . $r['path'];
    }
    $r['path'] = self::urlRemoveDotSegments( $r['path'] );
    return self::joinURL( $r );
}
/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function implements the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs, with one long-standing
 * simplification that is kept for compatibility: empty segments
 * (doubled slashes) are collapsed.
 *
 * The path is split on the ASCII "/" byte, which never occurs inside
 * a multi-byte UTF-8 sequence, so multi-byte characters are preserved
 * without relying on the mbstring extension.
 *
 * @param string $path the path to filter.
 *
 * @return string The filtered path with "." and ".." removed.  A path
 *                that ends in "/", or whose last segment is "." or
 *                "..", keeps a trailing "/" (it names a directory).
 */
protected static function urlRemoveDotSegments( $path )
{
    $path = (string) $path;
    if ( $path === '' )
        return '';

    $inSegs  = explode( '/', $path );
    $outSegs = array();
    foreach ( $inSegs as $seg )
    {
        if ( $seg === '' || $seg === '.' )
            continue;
        if ( $seg === '..' )
            array_pop( $outSegs );      // climbing above the root is ignored
        else
            $outSegs[] = $seg;
    }
    $outPath = implode( '/', $outSegs );
    if ( $path[0] === '/' )
        $outPath = '/' . $outPath;

    // Decide whether the result names a directory.  $path is non-empty,
    // so an empty last segment means the path ended with "/".
    $lastSeg       = $inSegs[count( $inSegs ) - 1];
    $endsWithSlash = ( $lastSeg === '' );
    $endsWithDot   = ( $lastSeg === '.' || $lastSeg === '..' );
    if ( $outPath !== '/' &&
        ( $endsWithSlash || ( $endsWithDot && $outPath !== '' ) ) )
        $outPath .= '/';
    return $outPath;
}
/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *   URI-reference = URI
 *                 / relative-ref
 *   URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *   relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 *   hier-part     = "//" authority path-abempty
 *                 / (other path forms)
 *   relative-part = "//" authority path-abempty
 *                 / (other path forms)
 *   authority     = [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *   scheme     The name of a method used to interpret the rest of
 *              the URL.  Examples: "http", "https", "mailto", "file".
 *
 *   authority  The name of the authority governing the URL's name
 *              space.  Examples: "example.com", "user@example.com",
 *              "example.com:80", "user:password@example.com:80".
 *              The authority may include a host name, port number,
 *              user name, and password.  The host may be a name, an
 *              IPv4 numeric address, or an IPv6 numeric address.
 *
 *   path       The hierarchical path to the URL's resource.
 *              Examples: "/index.htm", "/scripts/page.php".
 *
 *   query      The data for a query.  Examples: "?search=google.com".
 *
 *   fragment   The name of a secondary resource relative to that named
 *              by the path.  Examples: "#section1", "#header".
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *   "scheme"    The scheme, such as "http" (lower-cased).
 *   "host"      The host name, IPv4, or IPv6 address (IPv6 without
 *               the surrounding brackets).
 *   "port"      The port number.
 *   "user"      The user name.
 *   "pass"      The user password.
 *   "path"      The path, such as a file path for "http".
 *   "query"     The query (without the leading "?").
 *   "fragment"  The fragment (without the leading "#").
 *
 * One or more of these may not be present, depending upon the URL.
 *
 * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 * "path", "query", and "fragment" may have percent-encoded characters
 * decoded.  The "scheme" and "port" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been split into its parts.
 *
 * @param string $url    the URL to parse.
 * @param bool   $decode an optional boolean flag selecting whether
 *                       to decode percent encoding or not.
 *                       Default = FALSE.
 *
 * @return array|false the associative array of URL parts (possibly
 *                     empty), or FALSE if the URL is too malformed to
 *                     recognize any parts.
 */
protected static function splitUrl( $url, $decode=FALSE)
{
    // Character sets from RFC3986.
    $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar        = $xunressub . ':@%';

    // Scheme from RFC3986.  The hyphen is escaped so that "+-." is not
    // read as a character range (which would also admit ",").
    $xscheme       = '([a-zA-Z][a-zA-Z\d+\-.]*)';

    // User info (user + password) from RFC3986.
    $xuserinfo     = '(([' . $xunressub . '%]*)' .
                     '(:([' . $xunressub . ':%]*))?)';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6         = '(\[([a-fA-F\d.:]+)\])';

    // Host name from RFC1035.  Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    // The hyphen must be escaped: an unescaped "\d-." is rejected by
    // PCRE2 (PHP 7.3 and later) as an invalid range.
    $xhost_name    = '([a-zA-Z\d\-.%]+)';

    // Authority from RFC3986.  Skip IP future.
    $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport         = '(\d*)';
    $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                     '?(:' . $xport . ')?)';

    // Path from RFC3986.  Blend absolute & relative for efficiency.
    $xslash_seg    = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
                     '|' . $xpath_rel . ')';

    // Query and fragment from RFC3986.
    $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

    // URL.
    $xurl          = '^(' . $xscheme . ':)?' . $xapath . '?' .
                     '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

    // Split the URL into components.
    if ( !preg_match( '!' . $xurl . '!', (string) $url, $m ) )
        return false;

    // preg_match() omits trailing groups that did not participate in the
    // match; pad so every group index (0..30) exists.  Components are
    // then tested against '' rather than with empty(), so that a
    // component equal to "0" is not mistaken for a missing one.
    $m += array_fill( 0, 31, '' );

    $parts      = array();
    $hostIsName = false;

    if ( $m[2] !== '' )         $parts['scheme']   = strtolower( $m[2] );

    // Group 7 is "userinfo@"; group 9 is the user name (may be empty).
    if ( $m[7] !== '' )         $parts['user']     = $m[9];
    // Group 10 is ":password"; group 11 is the password (may be empty).
    if ( $m[10] !== '' )        $parts['pass']     = $m[11];

    if ( $m[13] !== '' )                    // host name
    {
        $parts['host'] = $m[13];
        $hostIsName    = true;
    }
    else if ( $m[14] !== '' )   $parts['host']     = $m[14];   // IPv4
    else if ( $m[16] !== '' )   $parts['host']     = $m[16];   // IPv6, no brackets
    else if ( $m[5] !== '' )    $parts['host']     = '';       // "//" with empty host
    if ( $m[17] !== '' )        $parts['port']     = $m[18];

    if ( $m[19] !== '' )        $parts['path']     = $m[19];   // path after authority
    else if ( $m[21] !== '' )   $parts['path']     = $m[21];   // absolute path
    else if ( $m[25] !== '' )   $parts['path']     = $m[25];   // relative path

    if ( $m[27] !== '' )        $parts['query']    = $m[28];
    if ( $m[29] !== '' )        $parts['fragment'] = $m[30];

    if ( !$decode )
        return $parts;

    foreach ( array( 'user', 'pass', 'path', 'query', 'fragment' ) as $key )
    {
        if ( isset( $parts[$key] ) )
            $parts[$key] = rawurldecode( $parts[$key] );
    }
    // Only host names are decoded; numeric addresses are left alone.
    if ( $hostIsName )
        $parts['host'] = rawurldecode( $parts['host'] );
    return $parts;
}
/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *   "scheme"    The scheme, such as "http".
 *   "host"      The host name, IPv4, or IPv6 address.
 *   "port"      The port number.
 *   "user"      The user name.
 *   "pass"      The user password.
 *   "path"      The path, such as a file path for "http".
 *   "query"     The query.
 *   "fragment"  The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present, and "pass" is only used when a "user" is present.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (if a host name, not an
 * IP address), "path", "query", and "fragment" components.  The "scheme"
 * and "port" are never encoded.  Slashes in the "path" are kept as
 * segment separators and are not encoded.
 *
 * @param array $parts  an associative array of strings containing the
 *                      individual parts of a URL.
 * @param bool  $encode an optional boolean flag selecting whether
 *                      to do percent encoding or not.  Default = FALSE.
 *
 * @return string The assembled URL string.  The string is an absolute
 *                URL if a scheme is supplied, and a relative URL if not.
 *                An empty string is returned if the $parts array does
 *                not contain any of the needed values.
 */
protected static function joinURL( $parts, $encode=FALSE)
{
    if ( $encode )
    {
        if ( isset( $parts['user'] ) )
            $parts['user']     = rawurlencode( $parts['user'] );
        if ( isset( $parts['pass'] ) )
            $parts['pass']     = rawurlencode( $parts['pass'] );
        // Encode host names only.  Numeric hosts (IPv4, or IPv6 with or
        // without brackets) are left alone; the whole host must match.
        if ( isset( $parts['host'] ) &&
            !preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'] ) )
            $parts['host']     = rawurlencode( $parts['host'] );
        if ( isset( $parts['path'] ) && $parts['path'] !== '' )
            $parts['path']     = preg_replace( '!%2F!ui', '/',
                rawurlencode( $parts['path'] ) );
        if ( isset( $parts['query'] ) )
            $parts['query']    = rawurlencode( $parts['query'] );
        if ( isset( $parts['fragment'] ) )
            $parts['fragment'] = rawurlencode( $parts['fragment'] );
    }

    // Strict test (not empty()) so that a path of "0" is kept.
    $hasPath = isset( $parts['path'] ) && $parts['path'] !== '';

    $url = '';
    if ( !empty( $parts['scheme'] ) )
        $url .= $parts['scheme'] . ':';
    if ( isset( $parts['host'] ) )
    {
        $url .= '//';
        if ( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if ( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
            $url .= '[' . $parts['host'] . ']';     // IPv6
        else
            $url .= $parts['host'];                 // IPv4 or name
        if ( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        // After an authority the path must begin with "/".
        if ( $hasPath && $parts['path'][0] !== '/' )
            $url .= '/';
    }
    if ( $hasPath )
        $url .= $parts['path'];
    if ( isset( $parts['query'] ) )
        $url .= '?' . $parts['query'];
    if ( isset( $parts['fragment'] ) )
        $url .= '#' . $parts['fragment'];
    return $url;
}