3
HTMLPurifier_ConfigSchema::define(
4
'Core', 'Encoding', 'utf-8', 'istring',
5
'If for some reason you are unable to convert all webpages to UTF-8, '.
6
'you can use this directive as a stop-gap compatibility change to '.
7
'let HTML Purifier deal with non UTF-8 input. This technique has '.
8
'notable deficiencies: absolutely no characters outside of the selected '.
9
'character encoding will be preserved, not even the ones that have '.
10
'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
11
'that automatically resolves all entities), making it pretty useless '.
12
'for anything except the most I18N-blind applications, although '.
13
'%Core.EscapeNonASCIICharacters offers fixes this trouble with '.
14
'another tradeoff. This directive '.
15
'only accepts ISO-8859-1 if iconv is not enabled.'
18
HTMLPurifier_ConfigSchema::define(
19
'Core', 'EscapeNonASCIICharacters', false, 'bool',
20
'This directive overcomes a deficiency in %Core.Encoding by blindly '.
21
'converting all non-ASCII characters into decimal numeric entities before '.
22
'converting it to its native encoding. This means that even '.
23
'characters that can be expressed in the non-UTF-8 encoding will '.
24
'be entity-ized, which can be a real downer for encodings like Big5. '.
25
'It also assumes that the ASCII repetoire is available, although '.
26
'this is the case for almost all encodings. Anyway, use UTF-8! This '.
27
'directive has been available since 1.4.0.'
30
if ( !function_exists('iconv') ) {
31
// only encodings with native PHP support
32
HTMLPurifier_ConfigSchema::defineAllowedValues(
33
'Core', 'Encoding', array(
38
HTMLPurifier_ConfigSchema::defineValueAliases(
39
'Core', 'Encoding', array(
40
'iso8859-1' => 'iso-8859-1'
45
HTMLPurifier_ConfigSchema::define(
46
'Test', 'ForceNoIconv', false, 'bool',
47
'When set to true, HTMLPurifier_Encoder will act as if iconv does not '.
48
'exist and use only pure PHP implementations.'
52
4
* A UTF-8 specific character encoder that handles cleaning and transforming.
53
5
* @note All functions in this class should be static.
55
7
class HTMLPurifier_Encoder
59
11
* Constructor throws fatal error if you attempt to instantiate class
61
function HTMLPurifier_Encoder() {
13
private function __construct() {
62
14
trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
66
18
* Error-handler that mutes errors, alternative to shut-up operator.
68
// No-op error handler: the empty body simply discards any error passed to it.
// Other methods of this class register it through set_error_handler().
function muteErrorHandler() {}
20
// Deliberately empty: this callback is registered via
// set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')) around
// iconv calls, so errors raised while it is installed are silently dropped.
// Callers are expected to pair it with restore_error_handler().
public static function muteErrorHandler() {}
72
23
* Cleans a UTF-8 string for well-formedness and SGML validity
74
25
* It will parse according to UTF-8 and return a valid UTF8 string, with
75
26
* non-SGML codepoints excluded.
78
28
* @note Just for reference, the non-SGML code points are 0 to 31 and
79
29
* 127 to 159, inclusive. However, we allow code points 9, 10
80
30
* and 13, which are the tab, line feed and carriage return
81
31
* respectively. 128 and above the code points map to multibyte
82
32
* UTF-8 representations.
84
34
* @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
85
35
* hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
86
36
* LGPL license. Notes on what changed are inside, but in general,
104
54
if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
108
58
$mState = 0; // cached expected number of octets after the current octet
109
59
// until the beginning of the next UTF8 character sequence
110
60
$mUcs4 = 0; // cached Unicode character
111
61
$mBytes = 1; // cached expected number of octets in the current sequence
113
63
// original code involved an $out that was an array of Unicode
114
64
// codepoints. Instead of having to convert back into UTF-8, we've
115
65
// decided to directly append valid UTF-8 characters onto a string
116
66
// $out once they're done. $char accumulates raw bytes, while $mUcs4
117
67
// turns into the Unicode code point, so there's some redundancy.
122
72
$len = strlen($str);
123
73
for($i = 0; $i < $len; $i++) {
124
74
$in = ord($str{$i});
125
75
$char .= $str[$i]; // append byte to char
126
76
if (0 == $mState) {
127
// When mState is zero we expect either a US-ASCII character
77
// When mState is zero we expect either a US-ASCII character
128
78
// or a multi-octet sequence.
129
79
if (0 == (0x80 & ($in))) {
130
80
// US-ASCII, pass straight through.
131
if (($in <= 31 || $in == 127) &&
81
if (($in <= 31 || $in == 127) &&
132
82
!($in == 9 || $in == 13 || $in == 10) // save \r\t\n
134
84
// control characters, remove
299
248
$z = (($code >> 12) & 63) | 128;
300
249
$w = (($code >> 18) & 7) | 240;
304
253
// set up the actual character
306
255
if($w) $ret .= chr($w);
307
256
if($z) $ret .= chr($z);
308
257
if($y) $ret .= chr($y);
315
264
* Converts a string to UTF-8 based on configuration.
318
function convertToUTF8($str, $config, &$context) {
319
$encoding = $config->get('Core', 'Encoding');
266
public static function convertToUTF8($str, $config, $context) {
267
$encoding = $config->get('Core.Encoding');
320
268
if ($encoding === 'utf-8') return $str;
321
269
static $iconv = null;
322
270
if ($iconv === null) $iconv = function_exists('iconv');
323
271
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
324
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
272
if ($iconv && !$config->get('Test.ForceNoIconv')) {
325
273
$str = iconv($encoding, 'utf-8//IGNORE', $str);
274
if ($str === false) {
275
// $encoding is not a valid encoding
276
restore_error_handler();
277
trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
326
280
// If the string is mangled by Shift_JIS or a similar encoding
327
281
// that doesn't support all of ASCII, convert the naughty
328
282
// characters to their true byte-wise ASCII/UTF-8 equivalents.
334
288
restore_error_handler();
337
trigger_error('Encoding not supported', E_USER_ERROR);
291
trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
341
295
* Converts a string from UTF-8 based on configuration.
343
296
* @note Currently, this is a lossy conversion, with inexpressible
344
297
* characters being omitted.
346
function convertFromUTF8($str, $config, &$context) {
347
$encoding = $config->get('Core', 'Encoding');
299
public static function convertFromUTF8($str, $config, $context) {
300
$encoding = $config->get('Core.Encoding');
348
301
if ($encoding === 'utf-8') return $str;
349
302
static $iconv = null;
350
303
if ($iconv === null) $iconv = function_exists('iconv');
351
if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
304
if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
352
305
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
354
307
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
355
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
308
if ($iconv && !$config->get('Test.ForceNoIconv')) {
356
309
// Undo our previous fix in convertToUTF8, otherwise iconv will barf
357
310
$ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
358
311
if (!$escape && !empty($ascii_fix)) {
425
377
* This expensive function tests whether or not a given character
426
378
* encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
427
379
* fail this test, and require special processing. Variable width
428
380
* encodings shouldn't ever fail.
430
382
* @param string $encoding Encoding name to test, as per iconv format
431
383
* @param bool $bypass Whether or not to bypass the precompiled arrays.
432
384
* @return array|false Map of UTF-8 characters to their corresponding ASCII,
433
385
* which can be used to "undo" any overzealous iconv action.
435
function testEncodingSupportsASCII($encoding, $bypass = false) {
387
public static function testEncodingSupportsASCII($encoding, $bypass = false) {
436
388
static $encodings = array();
438
390
if (isset($encodings[$encoding])) return $encodings[$encoding];
449
401
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
450
402
if (iconv('UTF-8', $encoding, 'a') === false) return false;
451
403
for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
453
if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
404
$c = chr($i); // UTF-8 char
405
$r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
408
// This line is needed for iconv implementations that do not
409
// omit characters that do not exist in the target character set
410
($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
454
412
// Reverse engineer: what's the UTF-8 equiv of this byte
455
413
// sequence? This assumes that there's no variable width
456
414
// encoding that doesn't support ASCII.