~canonical-sysadmins/wordpress/4.7.2

« back to all changes in this revision

Viewing changes to wp-includes/Requests/IDNAEncoder.php

  • Committer: Barry Price
  • Date: 2016-08-17 04:50:12 UTC
  • mfrom: (1.1.18 upstream)
  • Revision ID: barry.price@canonical.com-20160817045012-qfui81zhqnqv2ba9
Merge WP4.6 from upstream

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
<?php
 
2
 
 
3
/**
 
4
 * IDNA URL encoder
 
5
 *
 
6
 * Note: Not fully compliant, as nameprep does nothing yet.
 
7
 *
 
8
 * @package Requests
 
9
 * @subpackage Utilities
 
10
 * @see https://tools.ietf.org/html/rfc3490 IDNA specification
 
11
 * @see https://tools.ietf.org/html/rfc3492 Punycode/Bootstrap specification
 
12
 */
 
13
class Requests_IDNAEncoder {
    /**
     * ACE prefix used for IDNA
     *
     * @see https://tools.ietf.org/html/rfc3490#section-5
     * @var string
     */
    const ACE_PREFIX = 'xn--';

    /**
     * Exclusive upper bound on the byte length of a single label
     *
     * RFC 3490 section 4.1 step 8 allows 1 to 63 code points per label, so a
     * label is rejected as soon as its length reaches this value.
     *
     * @see https://tools.ietf.org/html/rfc3490#section-4.1
     * @var int
     */
    const MAX_LENGTH = 64;

    /**#@+
     * Bootstrap constant for Punycode
     *
     * @see https://tools.ietf.org/html/rfc3492#section-5
     * @var int
     */
    const BOOTSTRAP_BASE         = 36;
    const BOOTSTRAP_TMIN         = 1;
    const BOOTSTRAP_TMAX         = 26;
    const BOOTSTRAP_SKEW         = 38;
    const BOOTSTRAP_DAMP         = 700;
    const BOOTSTRAP_INITIAL_BIAS = 72;
    const BOOTSTRAP_INITIAL_N    = 128;
    /**#@-*/

    /**
     * Encode a hostname using Punycode
     *
     * The hostname is split on "." and every label is converted on its own
     * with {@see Requests_IDNAEncoder::to_ascii()}; labels that are already
     * ASCII (including empty ones) are returned unchanged.
     *
     * @throws Requests_Exception Propagated from to_ascii() for any label
     *
     * @param string $string Hostname
     * @return string Punycode-encoded hostname
     */
    public static function encode($string) {
        $labels = explode('.', $string);

        // Assign by key rather than by reference so that no dangling
        // reference to the last element outlives the loop.
        foreach ($labels as $index => $label) {
            $labels[$index] = self::to_ascii($label);
        }

        return implode('.', $labels);
    }

    /**
     * Convert a UTF-8 string to an ASCII string using Punycode
     *
     * Follows the ToASCII operation of RFC 3490 section 4.1, with nameprep
     * currently being a no-op and UseSTD3ASCIIRules treated as false.
     *
     * @throws Requests_Exception Provided string longer than 63 ASCII characters (`idna.provided_too_long`)
     * @throws Requests_Exception Prepared string longer than 63 ASCII characters (`idna.prepared_too_long`)
     * @throws Requests_Exception Provided string already begins with xn-- (`idna.provided_is_prefixed`)
     * @throws Requests_Exception Encoded string longer than 63 ASCII characters (`idna.encoded_too_long`)
     * @throws Requests_Exception Invalid UTF-8 input (`idna.invalidcodepoint`), raised while Punycode-encoding
     *
     * @param string $string ASCII or UTF-8 string (max length 63 characters)
     * @return string ASCII string
     */
    public static function to_ascii($string) {
        // Step 1: Check if the string is already ASCII
        if (self::is_ascii($string)) {
            // Nothing to convert: skip straight to the step 8 length check
            if (strlen($string) < self::MAX_LENGTH) {
                return $string;
            }

            throw new Requests_Exception('Provided string is too long', 'idna.provided_too_long', $string);
        }

        // Step 2: nameprep
        $string = self::nameprep($string);

        // Step 3: UseSTD3ASCIIRules is false, continue
        // Step 4: Check if it's ASCII now
        if (self::is_ascii($string)) {
            // Skip to the step 8 length check
            if (strlen($string) < self::MAX_LENGTH) {
                return $string;
            }

            throw new Requests_Exception('Prepared string is too long', 'idna.prepared_too_long', $string);
        }

        // Step 5: Check ACE prefix
        if (strpos($string, self::ACE_PREFIX) === 0) {
            throw new Requests_Exception('Provided string begins with ACE prefix', 'idna.provided_is_prefixed', $string);
        }

        // Step 6: Encode with Punycode
        $string = self::punycode_encode($string);

        // Step 7: Prepend ACE prefix
        $string = self::ACE_PREFIX . $string;

        // Step 8: Check size
        if (strlen($string) < self::MAX_LENGTH) {
            return $string;
        }

        throw new Requests_Exception('Encoded string is too long', 'idna.encoded_too_long', $string);
    }

    /**
     * Check whether a given string contains only ASCII characters
     *
     * @internal (Testing found regex was the fastest implementation)
     *
     * @param string $string
     * @return bool Is the string ASCII-only? (True unless a byte above 0x7F is matched)
     */
    protected static function is_ascii($string) {
        return (preg_match('/(?:[^\x00-\x7F])/', $string) !== 1);
    }

    /**
     * Prepare a string for use as an IDNA name
     *
     * Currently a placeholder that returns its input unchanged.
     *
     * @todo Implement this based on RFC 3491 and the newer 5891
     * @param string $string
     * @return string Prepared string
     */
    protected static function nameprep($string) {
        return $string;
    }

    /**
     * Convert a UTF-8 string to a UCS-4 codepoint array
     *
     * Based on Requests_IRI::replace_invalid_with_pct_encoding()
     *
     * Unlike a lenient decoder, any malformed, truncated, overlong or
     * disallowed sequence aborts the conversion with an exception.
     *
     * @throws Requests_Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
     * @param string $input
     * @return array Unicode code points
     */
    protected static function utf8_to_codepoints($input) {
        $codepoints = array();

        // Get number of bytes
        $strlen = strlen($input);

        for ($position = 0; $position < $strlen; $position++) {
            $value = ord($input[$position]);

            // One byte sequence (high bit clear):
            if ((~$value & 0x80) === 0x80) {
                $character = $value;
                $length    = 1;
                $remaining = 0;
            }
            // Two byte sequence:
            elseif (($value & 0xE0) === 0xC0) {
                $character = ($value & 0x1F) << 6;
                $length    = 2;
                $remaining = 1;
            }
            // Three byte sequence:
            elseif (($value & 0xF0) === 0xE0) {
                $character = ($value & 0x0F) << 12;
                $length    = 3;
                $remaining = 2;
            }
            // Four byte sequence:
            elseif (($value & 0xF8) === 0xF0) {
                $character = ($value & 0x07) << 18;
                $length    = 4;
                $remaining = 3;
            }
            // Invalid lead byte:
            else {
                throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
            }

            if ($remaining > 0) {
                // The sequence announced by the lead byte runs past the end of the input
                if ($position + $length > $strlen) {
                    throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
                }

                for ($position++; $remaining > 0; $position++) {
                    $value = ord($input[$position]);

                    // Every trailing byte must be a continuation byte (10xxxxxx)
                    if (($value & 0xC0) !== 0x80) {
                        throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
                    }

                    $character |= ($value & 0x3F) << (--$remaining * 6);
                }

                // Step back onto the last consumed byte; the outer loop advances again
                $position--;
            }

            if (
                // Non-shortest form sequences are invalid
                   $length > 1 && $character <= 0x7F
                || $length > 2 && $character <= 0x7FF
                || $length > 3 && $character <= 0xFFFF
                // Outside of range of ucschar codepoints
                // Noncharacters
                || ($character & 0xFFFE) === 0xFFFE
                || $character >= 0xFDD0 && $character <= 0xFDEF
                || (
                    // Everything else not in ucschar
                       $character > 0xD7FF && $character < 0xF900
                    || $character < 0x20
                    || $character > 0x7E && $character < 0xA0
                    || $character > 0xEFFFD
                )
            ) {
                throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
            }

            $codepoints[] = $character;
        }

        return $codepoints;
    }

    /**
     * RFC3492-compliant encoder
     *
     * @internal Pseudo-code from Section 6.3 is quoted in the comments next to the relevant code
     * @throws Requests_Exception On character outside of the domain (never happens with Punycode) (`idna.character_outside_domain`)
     * @throws Requests_Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
     *
     * @param string $input UTF-8 encoded string to encode
     * @return string Punycode-encoded string
     */
    public static function punycode_encode($input) {
        $output = '';
        // let n = initial_n
        $n = self::BOOTSTRAP_INITIAL_N;
        // let delta = 0
        $delta = 0;
        // let bias = initial_bias
        $bias = self::BOOTSTRAP_INITIAL_BIAS;
        // let h = b = the number of basic code points in the input
        $h = 0; // counted in the loop below, then copied to $b
        // copy them to the output in order
        $codepoints = self::utf8_to_codepoints($input);
        $total      = count($codepoints);
        $extended   = array();

        foreach ($codepoints as $char) {
            if ($char < 128) {
                // Character is valid ASCII
                // TODO: this should also check if it's valid for a URL
                $output .= chr($char);
                $h++;
            }
            // Check if the character is non-ASCII, but below initial n
            // This never occurs for Punycode, so ignore in coverage
            // @codeCoverageIgnoreStart
            elseif ($char < $n) {
                throw new Requests_Exception('Invalid character', 'idna.character_outside_domain', $char);
            }
            // @codeCoverageIgnoreEnd
            else {
                // Used as a set: each distinct non-basic code point is recorded once
                $extended[$char] = true;
            }
        }

        // Distinct non-basic code points in ascending order, consumed one per outer iteration
        $extended = array_keys($extended);
        sort($extended);
        $b = $h;

        // [copy them] followed by a delimiter if b > 0
        if (strlen($output) > 0) {
            $output .= '-';
        }

        // {if the input contains a non-basic code point < n then fail}
        // while h < length(input) do begin
        while ($h < $total) {
            // let m = the minimum code point >= n in the input
            $m = array_shift($extended);
            // let delta = delta + (m - n) * (h + 1), fail on overflow
            $delta += ($m - $n) * ($h + 1);
            // let n = m
            $n = $m;

            // for each code point c in the input (in order) do begin
            for ($num = 0; $num < $total; $num++) {
                $c = $codepoints[$num];

                // if c < n then increment delta, fail on overflow
                if ($c < $n) {
                    $delta++;
                }
                // if c == n then begin
                elseif ($c === $n) {
                    // let q = delta
                    $q = $delta;

                    // for k = base to infinity in steps of base do begin
                    for ($k = self::BOOTSTRAP_BASE; ; $k += self::BOOTSTRAP_BASE) {
                        // let t = tmin if k <= bias {+ tmin}, or
                        //         tmax if k >= bias + tmax, or k - bias otherwise
                        if ($k <= ($bias + self::BOOTSTRAP_TMIN)) {
                            $t = self::BOOTSTRAP_TMIN;
                        }
                        elseif ($k >= ($bias + self::BOOTSTRAP_TMAX)) {
                            $t = self::BOOTSTRAP_TMAX;
                        }
                        else {
                            $t = $k - $bias;
                        }

                        // if q < t then break
                        if ($q < $t) {
                            break;
                        }

                        // output the code point for digit t + ((q - t) mod (base - t))
                        $digit   = (int) ($t + (($q - $t) % (self::BOOTSTRAP_BASE - $t)));
                        $output .= self::digit_to_char($digit);
                        // let q = (q - t) div (base - t)
                        // floor() yields a float; cast so $q stays an integer
                        $q = (int) floor(($q - $t) / (self::BOOTSTRAP_BASE - $t));
                    }

                    // output the code point for digit q
                    $output .= self::digit_to_char($q);
                    // let bias = adapt(delta, h + 1, test h equals b?)
                    $bias = self::adapt($delta, $h + 1, $h === $b);
                    // let delta = 0
                    $delta = 0;
                    // increment h
                    $h++;
                }
            }

            // increment delta and n
            $delta++;
            $n++;
        }

        return $output;
    }

    /**
     * Convert a digit to its respective character
     *
     * Digits 0-25 map to "a"-"z" and digits 26-35 map to "0"-"9".
     *
     * @see https://tools.ietf.org/html/rfc3492#section-5
     * @throws Requests_Exception On invalid digit (`idna.invalid_digit`)
     *
     * @param int $digit Digit in the range 0-35
     * @return string Single character corresponding to digit
     */
    protected static function digit_to_char($digit) {
        // @codeCoverageIgnoreStart
        // As far as I know, this never happens, but still good to be sure.
        if ($digit < 0 || $digit > 35) {
            throw new Requests_Exception(sprintf('Invalid digit %d', $digit), 'idna.invalid_digit', $digit);
        }
        // @codeCoverageIgnoreEnd
        $digits = 'abcdefghijklmnopqrstuvwxyz0123456789';
        return substr($digits, $digit, 1);
    }

    /**
     * Adapt the bias
     *
     * All divisions are integer divisions ("div" in the RFC pseudo-code);
     * each floor() result is cast back to int so the returned bias is a
     * genuine integer, as documented.
     *
     * @see https://tools.ietf.org/html/rfc3492#section-6.1
     * @param int $delta
     * @param int $numpoints
     * @param bool $firsttime
     * @return int New bias
     */
    protected static function adapt($delta, $numpoints, $firsttime) {
        // function adapt(delta,numpoints,firsttime):
        // if firsttime then let delta = delta div damp
        if ($firsttime) {
            $delta = (int) floor($delta / self::BOOTSTRAP_DAMP);
        }
        // else let delta = delta div 2
        else {
            $delta = (int) floor($delta / 2);
        }

        // let delta = delta + (delta div numpoints)
        $delta += (int) floor($delta / $numpoints);
        // let k = 0
        $k = 0;
        // while delta > ((base - tmin) * tmax) div 2 do begin
        $max = (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN) * self::BOOTSTRAP_TMAX) / 2);
        while ($delta > $max) {
            // let delta = delta div (base - tmin)
            $delta = (int) floor($delta / (self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN));
            // let k = k + base
            $k += self::BOOTSTRAP_BASE;
        }

        // return k + (((base - tmin + 1) * delta) div (delta + skew))
        return $k + (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN + 1) * $delta) / ($delta + self::BOOTSTRAP_SKEW));
    }
}
 
 
b'\\ No newline at end of file'