projects.mako.cc - scuttle/blob - includes/utf8/utils/validation.php

   1 <?php
   2 /**
   3 * @version $Id: validation.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
   4 * Tools for validating that a UTF-8 string is well formed.
   5 * The Original Code is Mozilla Communicator client code.
   6 * The Initial Developer of the Original Code is
   7 * Netscape Communications Corporation.
   8 * Portions created by the Initial Developer are Copyright (C) 1998
   9 * the Initial Developer. All Rights Reserved.
  10 * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  11 * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  12 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  13 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  14 * @see http://hsivonen.iki.fi/php-utf8/
  15 * @package utf8
  16 * @subpackage validation
  17 */
  18
  19 //--------------------------------------------------------------------
  20 /**
  21 * Tests a string as to whether it's valid UTF-8 and supported by the
  22 * Unicode standard
  23 * Note: this function has been modified to simple return true or false
  24 * @author <hsivonen@iki.fi>
  25 * @param string UTF-8 encoded string
  26 * @return boolean true if valid
  27 * @see http://hsivonen.iki.fi/php-utf8/
  28 * @see utf8_compliant
  29 * @package utf8
  30 * @subpackage validation
  31 */
  32 function utf8_is_valid($str) {
  33
  34     $mState = 0;     // cached expected number of octets after the current octet
  35                      // until the beginning of the next UTF8 character sequence
  36     $mUcs4  = 0;     // cached Unicode character
  37     $mBytes = 1;     // cached expected number of octets in the current sequence
  38
  39     $len = strlen($str);
  40
  41     for($i = 0; $i < $len; $i++) {
  42
  43         $in = ord($str{$i});
  44
  45         if ( $mState == 0) {
  46
  47             // When mState is zero we expect either a US-ASCII character or a
  48             // multi-octet sequence.
  49             if (0 == (0x80 & ($in))) {
  50                 // US-ASCII, pass straight through.
  51                 $mBytes = 1;
  52
  53             } else if (0xC0 == (0xE0 & ($in))) {
  54                 // First octet of 2 octet sequence
  55                 $mUcs4 = ($in);
  56                 $mUcs4 = ($mUcs4 & 0x1F) << 6;
  57                 $mState = 1;
  58                 $mBytes = 2;
  59
  60             } else if (0xE0 == (0xF0 & ($in))) {
  61                 // First octet of 3 octet sequence
  62                 $mUcs4 = ($in);
  63                 $mUcs4 = ($mUcs4 & 0x0F) << 12;
  64                 $mState = 2;
  65                 $mBytes = 3;
  66
  67             } else if (0xF0 == (0xF8 & ($in))) {
  68                 // First octet of 4 octet sequence
  69                 $mUcs4 = ($in);
  70                 $mUcs4 = ($mUcs4 & 0x07) << 18;
  71                 $mState = 3;
  72                 $mBytes = 4;
  73
  74             } else if (0xF8 == (0xFC & ($in))) {
  75                 /* First octet of 5 octet sequence.
  76                 *
  77                 * This is illegal because the encoded codepoint must be either
  78                 * (a) not the shortest form or
  79                 * (b) outside the Unicode range of 0-0x10FFFF.
  80                 * Rather than trying to resynchronize, we will carry on until the end
  81                 * of the sequence and let the later error handling code catch it.
  82                 */
  83                 $mUcs4 = ($in);
  84                 $mUcs4 = ($mUcs4 & 0x03) << 24;
  85                 $mState = 4;
  86                 $mBytes = 5;
  87
  88             } else if (0xFC == (0xFE & ($in))) {
  89                 // First octet of 6 octet sequence, see comments for 5 octet sequence.
  90                 $mUcs4 = ($in);
  91                 $mUcs4 = ($mUcs4 & 1) << 30;
  92                 $mState = 5;
  93                 $mBytes = 6;
  94
  95             } else {
  96                 /* Current octet is neither in the US-ASCII range nor a legal first
  97                  * octet of a multi-octet sequence.
  98                  */
  99                 return FALSE;
 100
 101             }
 102
 103         } else {
 104
 105             // When mState is non-zero, we expect a continuation of the multi-octet
 106             // sequence
 107             if (0x80 == (0xC0 & ($in))) {
 108
 109                 // Legal continuation.
 110                 $shift = ($mState - 1) * 6;
 111                 $tmp = $in;
 112                 $tmp = ($tmp & 0x0000003F) << $shift;
 113                 $mUcs4 |= $tmp;
 114
 115                 /**
 116                 * End of the multi-octet sequence. mUcs4 now contains the final
 117                 * Unicode codepoint to be output
 118                 */
 119                 if (0 == --$mState) {
 120
 121                     /*
 122                     * Check for illegal sequences and codepoints.
 123                     */
 124                     // From Unicode 3.1, non-shortest form is illegal
 125                     if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 126                         ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 127                         ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
 128                         (4 < $mBytes) ||
 129                         // From Unicode 3.2, surrogate characters are illegal
 130                         (($mUcs4 & 0xFFFFF800) == 0xD800) ||
 131                         // Codepoints outside the Unicode range are illegal
 132                         ($mUcs4 > 0x10FFFF)) {
 133
 134                         return FALSE;
 135
 136                     }
 137
 138                     //initialize UTF8 cache
 139                     $mState = 0;
 140                     $mUcs4  = 0;
 141                     $mBytes = 1;
 142                 }
 143
 144             } else {
 145                 /**
 146                 *((0xC0 & (*in) != 0x80) && (mState != 0))
 147                 * Incomplete multi-octet sequence.
 148                 */
 149
 150                 return FALSE;
 151             }
 152         }
 153     }
 154     return TRUE;
 155 }
 156
 157 //--------------------------------------------------------------------
 158 /**
 159 * Tests whether a string complies as UTF-8. This will be much
 160 * faster than utf8_is_valid but will pass five and six octet
 161 * UTF-8 sequences, which are not supported by Unicode and
 162 * so cannot be displayed correctly in a browser. In other words
 163 * it is not as strict as utf8_is_valid but it's faster. If you use
 164 * is to validate user input, you place yourself at the risk that
 165 * attackers will be able to inject 5 and 6 byte sequences (which
 166 * may or may not be a significant risk, depending on what you are
 167 * are doing)
 168 * @see utf8_is_valid
 169 * @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
 170 * @param string UTF-8 string to check
 171 * @return boolean TRUE if string is valid UTF-8
 172 * @package utf8
 173 * @subpackage validation
 174 */
 175 function utf8_compliant($str) {
 176     if ( strlen($str) == 0 ) {
 177         return TRUE;
 178     }
 179     // If even just the first character can be matched, when the /u
 180     // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
 181     // invalid, nothing at all will match, even if the string contains
 182     // some valid sequences
 183     return (preg_match('/^.{1}/us',$str,$ar) == 1);
 184 }
 185