projects.mako.cc - scuttle/blob - includes/utf8/utils/bad.php

   1 <?php
   2 /**
   3 * @version $Id: bad.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
   4 * Tools for locating / replacing bad bytes in UTF-8 strings
   5 * The Original Code is Mozilla Communicator client code.
   6 * The Initial Developer of the Original Code is
   7 * Netscape Communications Corporation.
   8 * Portions created by the Initial Developer are Copyright (C) 1998
   9 * the Initial Developer. All Rights Reserved.
  10 * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  11 * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  12 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  13 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  14 * @see http://hsivonen.iki.fi/php-utf8/
  15 * @package utf8
  16 * @subpackage bad
  17 * @see utf8_is_valid
  18 */
  19
  20 //--------------------------------------------------------------------
  21 /**
  22 * Locates the first bad byte in a UTF-8 string returning it's
  23 * byte index in the string
  24 * PCRE Pattern to locate bad bytes in a UTF-8 string
  25 * Comes from W3 FAQ: Multilingual Forms
  26 * Note: modified to include full ASCII range including control chars
  27 * @see http://www.w3.org/International/questions/qa-forms-utf-8
  28 * @param string
  29 * @return mixed integer byte index or FALSE if no bad found
  30 * @package utf8
  31 * @subpackage bad
  32 */
  33 function utf8_bad_find($str) {
  34     $UTF8_BAD =
  35     '([\x00-\x7F]'.                          # ASCII (including control chars)
  36     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
  37     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
  38     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
  39     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
  40     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
  41     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
  42     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
  43     '|(.{1}))';                              # invalid byte
  44     $pos = 0;
  45     $badList = array();
  46     while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  47         $bytes = strlen($matches[0]);
  48         if ( isset($matches[2])) {
  49             return $pos;
  50         }
  51         $pos += $bytes;
  52         $str = substr($str,$bytes);
  53     }
  54     return FALSE;
  55 }
  56
  57 //--------------------------------------------------------------------
  58 /**
  59 * Locates all bad bytes in a UTF-8 string and returns a list of their
  60 * byte index in the string
  61 * PCRE Pattern to locate bad bytes in a UTF-8 string
  62 * Comes from W3 FAQ: Multilingual Forms
  63 * Note: modified to include full ASCII range including control chars
  64 * @see http://www.w3.org/International/questions/qa-forms-utf-8
  65 * @param string
  66 * @return mixed array of integers or FALSE if no bad found
  67 * @package utf8
  68 * @subpackage bad
  69 */
  70 function utf8_bad_findall($str) {
  71     $UTF8_BAD =
  72     '([\x00-\x7F]'.                          # ASCII (including control chars)
  73     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
  74     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
  75     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
  76     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
  77     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
  78     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
  79     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
  80     '|(.{1}))';                              # invalid byte
  81     $pos = 0;
  82     $badList = array();
  83     while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  84         $bytes = strlen($matches[0]);
  85         if ( isset($matches[2])) {
  86             $badList[] = $pos;
  87         }
  88         $pos += $bytes;
  89         $str = substr($str,$bytes);
  90     }
  91     if ( count($badList) > 0 ) {
  92         return $badList;
  93     }
  94     return FALSE;
  95 }
  96
  97 //--------------------------------------------------------------------
  98 /**
  99 * Strips out any bad bytes from a UTF-8 string and returns the rest
 100 * PCRE Pattern to locate bad bytes in a UTF-8 string
 101 * Comes from W3 FAQ: Multilingual Forms
 102 * Note: modified to include full ASCII range including control chars
 103 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 104 * @param string
 105 * @return string
 106 * @package utf8
 107 * @subpackage bad
 108 */
 109 function utf8_bad_strip($str) {
 110     $UTF8_BAD =
 111     '([\x00-\x7F]'.                          # ASCII (including control chars)
 112     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 113     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 114     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 115     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 116     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 117     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 118     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 119     '|(.{1}))';                              # invalid byte
 120     ob_start();
 121     while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 122         if ( !isset($matches[2])) {
 123             echo $matches[0];
 124         }
 125         $str = substr($str,strlen($matches[0]));
 126     }
 127     $result = ob_get_contents();
 128     ob_end_clean();
 129     return $result;
 130 }
 131
 132 //--------------------------------------------------------------------
 133 /**
 134 * Replace bad bytes with an alternative character - ASCII character
 135 * recommended is replacement char
 136 * PCRE Pattern to locate bad bytes in a UTF-8 string
 137 * Comes from W3 FAQ: Multilingual Forms
 138 * Note: modified to include full ASCII range including control chars
 139 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 140 * @param string to search
 141 * @param string to replace bad bytes with (defaults to '?') - use ASCII
 142 * @return string
 143 * @package utf8
 144 * @subpackage bad
 145 */
 146 function utf8_bad_replace($str, $replace = '?') {
 147     $UTF8_BAD =
 148     '([\x00-\x7F]'.                          # ASCII (including control chars)
 149     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 150     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 151     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 152     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 153     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 154     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 155     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 156     '|(.{1}))';                              # invalid byte
 157     ob_start();
 158     while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 159         if ( !isset($matches[2])) {
 160             echo $matches[0];
 161         } else {
 162             echo $replace;
 163         }
 164         $str = substr($str,strlen($matches[0]));
 165     }
 166     $result = ob_get_contents();
 167     ob_end_clean();
 168     return $result;
 169 }
 170
 171 //--------------------------------------------------------------------
 172 /**
 173 * Return code from utf8_bad_identify() when a five octet sequence is detected.
 174 * Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
 175 * do not represent a useful character
 176 * @see utf8_bad_identify
 177 * @package utf8
 178 * @subpackage bad
 179 */
 180 define('UTF8_BAD_5OCTET',1);
 181
 182 /**
 183 * Return code from utf8_bad_identify() when a six octet sequence is detected.
 184 * Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
 185 * do not represent a useful character
 186 * @see utf8_bad_identify
 187 * @package utf8
 188 * @subpackage bad
 189 */
 190 define('UTF8_BAD_6OCTET',2);
 191
 192 /**
 193 * Return code from utf8_bad_identify().
 194 * Invalid octet for use as start of multi-byte UTF-8 sequence
 195 * @see utf8_bad_identify
 196 * @package utf8
 197 * @subpackage bad
 198 */
 199 define('UTF8_BAD_SEQID',3);
 200
 201 /**
 202 * Return code from utf8_bad_identify().
 203 * From Unicode 3.1, non-shortest form is illegal
 204 * @see utf8_bad_identify
 205 * @package utf8
 206 * @subpackage bad
 207 */
 208 define('UTF8_BAD_NONSHORT',4);
 209
 210 /**
 211 * Return code from utf8_bad_identify().
 212 * From Unicode 3.2, surrogate characters are illegal
 213 * @see utf8_bad_identify
 214 * @package utf8
 215 * @subpackage bad
 216 */
 217 define('UTF8_BAD_SURROGATE',5);
 218
 219 /**
 220 * Return code from utf8_bad_identify().
 221 * Codepoints outside the Unicode range are illegal
 222 * @see utf8_bad_identify
 223 * @package utf8
 224 * @subpackage bad
 225 */
 226 define('UTF8_BAD_UNIOUTRANGE',6);
 227
 228 /**
 229 * Return code from utf8_bad_identify().
 230 * Incomplete multi-octet sequence
 231 * Note: this is kind of a "catch-all"
 232 * @see utf8_bad_identify
 233 * @package utf8
 234 * @subpackage bad
 235 */
 236 define('UTF8_BAD_SEQINCOMPLETE',7);
 237
 238 //--------------------------------------------------------------------
 239 /**
 240 * Reports on the type of bad byte found in a UTF-8 string. Returns a
 241 * status code on the first bad byte found
 242 * @author <hsivonen@iki.fi>
 243 * @param string UTF-8 encoded string
 244 * @return mixed integer constant describing problem or FALSE if valid UTF-8
 245 * @see utf8_bad_explain
 246 * @see http://hsivonen.iki.fi/php-utf8/
 247 * @package utf8
 248 * @subpackage bad
 249 */
 250 function utf8_bad_identify($str, &$i) {
 251
 252     $mState = 0;     // cached expected number of octets after the current octet
 253                      // until the beginning of the next UTF8 character sequence
 254     $mUcs4  = 0;     // cached Unicode character
 255     $mBytes = 1;     // cached expected number of octets in the current sequence
 256
 257     $len = strlen($str);
 258
 259     for($i = 0; $i < $len; $i++) {
 260
 261         $in = ord($str{$i});
 262
 263         if ( $mState == 0) {
 264
 265             // When mState is zero we expect either a US-ASCII character or a
 266             // multi-octet sequence.
 267             if (0 == (0x80 & ($in))) {
 268                 // US-ASCII, pass straight through.
 269                 $mBytes = 1;
 270
 271             } else if (0xC0 == (0xE0 & ($in))) {
 272                 // First octet of 2 octet sequence
 273                 $mUcs4 = ($in);
 274                 $mUcs4 = ($mUcs4 & 0x1F) << 6;
 275                 $mState = 1;
 276                 $mBytes = 2;
 277
 278             } else if (0xE0 == (0xF0 & ($in))) {
 279                 // First octet of 3 octet sequence
 280                 $mUcs4 = ($in);
 281                 $mUcs4 = ($mUcs4 & 0x0F) << 12;
 282                 $mState = 2;
 283                 $mBytes = 3;
 284
 285             } else if (0xF0 == (0xF8 & ($in))) {
 286                 // First octet of 4 octet sequence
 287                 $mUcs4 = ($in);
 288                 $mUcs4 = ($mUcs4 & 0x07) << 18;
 289                 $mState = 3;
 290                 $mBytes = 4;
 291
 292             } else if (0xF8 == (0xFC & ($in))) {
 293
 294                 /* First octet of 5 octet sequence.
 295                 *
 296                 * This is illegal because the encoded codepoint must be either
 297                 * (a) not the shortest form or
 298                 * (b) outside the Unicode range of 0-0x10FFFF.
 299                 */
 300
 301                 return UTF8_BAD_5OCTET;
 302
 303             } else if (0xFC == (0xFE & ($in))) {
 304
 305                 // First octet of 6 octet sequence, see comments for 5 octet sequence.
 306                 return UTF8_BAD_6OCTET;
 307
 308             } else {
 309                 // Current octet is neither in the US-ASCII range nor a legal first
 310                 // octet of a multi-octet sequence.
 311                 return UTF8_BAD_SEQID;
 312
 313             }
 314
 315         } else {
 316
 317             // When mState is non-zero, we expect a continuation of the multi-octet
 318             // sequence
 319             if (0x80 == (0xC0 & ($in))) {
 320
 321                 // Legal continuation.
 322                 $shift = ($mState - 1) * 6;
 323                 $tmp = $in;
 324                 $tmp = ($tmp & 0x0000003F) << $shift;
 325                 $mUcs4 |= $tmp;
 326
 327                 /**
 328                 * End of the multi-octet sequence. mUcs4 now contains the final
 329                 * Unicode codepoint to be output
 330                 */
 331                 if (0 == --$mState) {
 332
 333                     // From Unicode 3.1, non-shortest form is illegal
 334                     if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 335                         ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 336                         ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
 337                         return UTF8_BAD_NONSHORT;
 338
 339                     // From Unicode 3.2, surrogate characters are illegal
 340                     } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
 341                         return UTF8_BAD_SURROGATE;
 342
 343                     // Codepoints outside the Unicode range are illegal
 344                     } else if ($mUcs4 > 0x10FFFF) {
 345                         return UTF8_BAD_UNIOUTRANGE;
 346                     }
 347
 348                     //initialize UTF8 cache
 349                     $mState = 0;
 350                     $mUcs4  = 0;
 351                     $mBytes = 1;
 352                 }
 353
 354             } else {
 355                 // ((0xC0 & (*in) != 0x80) && (mState != 0))
 356                 // Incomplete multi-octet sequence.
 357                 $i--;
 358                 return UTF8_BAD_SEQINCOMPLETE;
 359             }
 360         }
 361     }
 362
 363     if ( $mState != 0 ) {
 364         // Incomplete multi-octet sequence.
 365         $i--;
 366         return UTF8_BAD_SEQINCOMPLETE;
 367     }
 368
 369     // No bad octets found
 370     $i = NULL;
 371     return FALSE;
 372 }
 373
 374 //--------------------------------------------------------------------
 375 /**
 376 * Takes a return code from utf8_bad_identify() are returns a message
 377 * (in English) explaining what the problem is.
 378 * @param int return code from utf8_bad_identify
 379 * @return mixed string message or FALSE if return code unknown
 380 * @see utf8_bad_identify
 381 * @package utf8
 382 * @subpackage bad
 383 */
 384 function utf8_bad_explain($code) {
 385
 386     switch ($code) {
 387
 388         case UTF8_BAD_5OCTET:
 389             return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
 390         break;
 391
 392         case UTF8_BAD_6OCTET:
 393             return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
 394         break;
 395
 396         case UTF8_BAD_SEQID:
 397             return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
 398         break;
 399
 400         case UTF8_BAD_NONSHORT:
 401             return 'From Unicode 3.1, non-shortest form is illegal';
 402         break;
 403
 404         case UTF8_BAD_SURROGATE:
 405             return 'From Unicode 3.2, surrogate characters are illegal';
 406         break;
 407
 408         case UTF8_BAD_UNIOUTRANGE:
 409             return 'Codepoints outside the Unicode range are illegal';
 410         break;
 411
 412         case UTF8_BAD_SEQINCOMPLETE:
 413             return 'Incomplete multi-octet sequence';
 414         break;
 415
 416     }
 417
 418     trigger_error('Unknown error code: '.$code,E_USER_WARNING);
 419     return FALSE;
 420
 421 }