projects.mako.cc - scuttle/blob - includes/utf8/utils/position.php

   1 <?php
   2 /**
   3 * Locate a byte index given a UTF-8 character index
   4 * @version $Id: position.php,v 1.1 2006/10/01 00:01:31 harryf Exp $
   5 * @package utf8
   6 * @subpackage position
   7 */
   8
   9 //--------------------------------------------------------------------
  10 /**
  11 * Given a string and a character index in the string, in
  12 * terms of the UTF-8 character position, returns the byte
  13 * index of that character. Can be useful when you want to
  14 * PHP's native string functions but we warned, locating
  15 * the byte can be expensive
  16 * Takes variable number of parameters - first must be
  17 * the search string then 1 to n UTF-8 character positions
  18 * to obtain byte indexes for - it is more efficient to search
  19 * the string for multiple characters at once, than make
  20 * repeated calls to this function
  21 *
  22 * @author Chris Smith<chris@jalakai.co.uk>
  23 * @param string string to locate index in
  24 * @param int (n times)
  25 * @return mixed - int if only one input int, array if more
  26 * @return boolean TRUE if it's all ASCII
  27 * @package utf8
  28 * @subpackage position
  29 */
  30 function utf8_byte_position() {
  31
  32     $args = func_get_args();
  33     $str =& array_shift($args);
  34     if (!is_string($str)) return false;
  35
  36     $result = array();
  37
  38     // trivial byte index, character offset pair
  39     $prev = array(0,0);
  40
  41     // use a short piece of str to estimate bytes per character
  42     // $i (& $j) -> byte indexes into $str
  43     $i = utf8_locate_next_chr($str, 300);
  44
  45     // $c -> character offset into $str
  46     $c = strlen(utf8_decode(substr($str,0,$i)));
  47
  48     // deal with arguments from lowest to highest
  49     sort($args);
  50
  51     foreach ($args as $offset) {
  52         // sanity checks FIXME
  53
  54         // 0 is an easy check
  55         if ($offset == 0) { $result[] = 0; continue; }
  56
  57         // ensure no endless looping
  58         $safety_valve = 50;
  59
  60         do {
  61
  62             if ( ($c - $prev[1]) == 0 ) {
  63                 // Hack: gone past end of string
  64                 $error = 0;
  65                 $i = strlen($str);
  66                 break;
  67             }
  68
  69             $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));
  70
  71             // correct to utf8 character boundary
  72             $j = utf8_locate_next_chr($str, $j);
  73
  74             // save the index, offset for use next iteration
  75             $prev = array($i,$c);
  76
  77             if ($j > $i) {
  78                 // determine new character offset
  79                 $c += strlen(utf8_decode(substr($str,$i,$j-$i)));
  80             } else {
  81                 // ditto
  82                 $c -= strlen(utf8_decode(substr($str,$j,$i-$j)));
  83             }
  84
  85             $error = abs($c-$offset);
  86
  87             // ready for next time around
  88             $i = $j;
  89
  90         // from 7 it is faster to iterate over the string
  91         } while ( ($error > 7) && --$safety_valve) ;
  92
  93         if ($error && $error <= 7) {
  94
  95             if ($c < $offset) {
  96                 // move up
  97                 while ($error--) { $i = utf8_locate_next_chr($str,++$i); }
  98             } else {
  99                 // move down
 100                 while ($error--) { $i = utf8_locate_current_chr($str,--$i); }
 101             }
 102
 103             // ready for next arg
 104             $c = $offset;
 105         }
 106         $result[] = $i;
 107     }
 108
 109     if ( count($result) == 1 ) {
 110         return $result[0];
 111     }
 112
 113     return $result;
 114 }
 115
 116 //--------------------------------------------------------------------
 117 /**
 118 * Given a string and any byte index, returns the byte index
 119 * of the start of the current UTF-8 character, relative to supplied
 120 * position. If the current character begins at the same place as the
 121 * supplied byte index, that byte index will be returned. Otherwise
 122 * this function will step backwards, looking for the index where
 123 * curent UTF-8 character begins
 124 * @author Chris Smith<chris@jalakai.co.uk>
 125 * @param string
 126 * @param int byte index in the string
 127 * @return int byte index of start of next UTF-8 character
 128 * @package utf8
 129 * @subpackage position
 130 */
 131 function utf8_locate_current_chr( &$str, $idx ) {
 132
 133     if ($idx <= 0) return 0;
 134
 135     $limit = strlen($str);
 136     if ($idx >= $limit) return $limit;
 137
 138     // Binary value for any byte after the first in a multi-byte UTF-8 character
 139     // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
 140     // of byte - assuming well formed UTF-8
 141     while ($idx && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx--;
 142
 143     return $idx;
 144 }
 145
 146 //--------------------------------------------------------------------
 147 /**
 148 * Given a string and any byte index, returns the byte index
 149 * of the start of the next UTF-8 character, relative to supplied
 150 * position. If the next character begins at the same place as the
 151 * supplied byte index, that byte index will be returned.
 152 * @author Chris Smith<chris@jalakai.co.uk>
 153 * @param string
 154 * @param int byte index in the string
 155 * @return int byte index of start of next UTF-8 character
 156 * @package utf8
 157 * @subpackage position
 158 */
 159 function utf8_locate_next_chr( &$str, $idx ) {
 160
 161     if ($idx <= 0) return 0;
 162
 163     $limit = strlen($str);
 164     if ($idx >= $limit) return $limit;
 165
 166     // Binary value for any byte after the first in a multi-byte UTF-8 character
 167     // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
 168     // of byte - assuming well formed UTF-8
 169     while (($idx < $limit) && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx++;
 170
 171     return $idx;
 172 }
 173