// Origin: includes/utf8/utils/position.php

//--------------------------------------------------------------------
/**
* Translates UTF-8 character offsets into byte indexes of a string.
*
* The first argument is the string; every further argument is a character
* offset into it. Offsets are sorted ascending before they are processed,
* so an array result follows the sorted order, not the argument order.
*
* Each byte index is estimated by linear interpolation between two known
* (byte index, character offset) pairs; once the estimate is within
* 7 characters of the target it is corrected by stepping one character
* at a time. Offsets beyond the end of the string resolve to strlen($str).
* Well formed UTF-8 is assumed.
*
* @param string string to locate index in
* @param int (n times) character offset
* @return mixed - int if exactly one offset was given, array otherwise
*                 (empty array if no offset was given), FALSE if the
*                 first argument is not a string
* @see utf8_locate_next_chr
* @see utf8_locate_current_chr
* @package utf8
* @subpackage position
*/
function utf8_byte_position() {

    $args = func_get_args();

    // array_shift() returns a value, not a reference: assign by value
    $str = array_shift($args);
    if (!is_string($str)) return false;

    $length = strlen($str);
    $result = array();

    // trivial byte index, character offset pair
    $prev = array(0,0);

    // use a short piece of str to estimate bytes per character
    // $i (& $j) -> byte indexes into $str
    $i = utf8_locate_next_chr($str, 300);

    // $c -> character offset into $str
    $c = _utf8_position_count_chars(substr($str,0,$i));

    // deal with arguments from lowest to highest
    sort($args);

    foreach ($args as $offset) {
        // sanity checks FIXME

        // 0 is an easy check
        if ($offset == 0) { $result[] = 0; continue; }

        // ensure no endless looping
        $safety_valve = 50;

        do {

            if ( ($c - $prev[1]) == 0 ) {
                // Both reference points share one character offset, so
                // there is no slope to interpolate with.
                if ( $c > 0 && $i > 0 && !($i >= $length && $offset >= $c) ) {
                    // Still inside the string (the previous target was
                    // simply hit exactly): fall back to the origin as
                    // the second reference point and keep searching.
                    $prev = array(0,0);
                } else {
                    // Hack: gone past end of string
                    $error = 0;
                    $i = $length;
                    break;
                }
            }

            // linear estimate of the byte index, truncated to whole bytes
            $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));

            // correct to utf8 character boundary
            $j = utf8_locate_next_chr($str, $j);

            // save the index, offset for use next iteration
            $prev = array($i,$c);

            if ($j > $i) {
                // determine new character offset
                $c += _utf8_position_count_chars(substr($str,$i,$j-$i));
            } else {
                // ditto
                $c -= _utf8_position_count_chars(substr($str,$j,$i-$j));
            }

            $error = abs($c-$offset);

            // ready for next time around
            $i = $j;

        // from 7 it is faster to iterate over the string
        } while ( ($error > 7) && --$safety_valve) ;

        if ($error && $error <= 7) {

            if ($c < $offset) {
                // move up
                while ($error--) { $i = utf8_locate_next_chr($str,++$i); }
            } else {
                // move down
                while ($error--) { $i = utf8_locate_current_chr($str,--$i); }
            }

            // ready for next arg
            $c = $offset;
        }
        $result[] = $i;
    }

    if ( count($result) == 1 ) {
        return $result[0];
    }

    return $result;
}

//--------------------------------------------------------------------
/**
* Counts the characters in a piece of well formed UTF-8 by counting the
* bytes that are not continuation bytes (10xxxxxx). Helper for
* utf8_byte_position(); it avoids utf8_decode(), which newer PHP versions
* deprecate.
* @param string bytes starting and ending on character boundaries
* @return int number of characters
* @package utf8
* @subpackage position
*/
function _utf8_position_count_chars( $bytes ) {

    // Strip every continuation byte; what is left is one byte per character.
    // The casts cover substr() returning FALSE and preg_replace() returning NULL.
    return strlen((string) preg_replace('/[\x80-\xBF]+/', '', (string) $bytes));
}

//--------------------------------------------------------------------
/**
* Given a string and any byte index, returns the byte index
* of the start of the current UTF-8 character, relative to supplied
* position. If the current character begins at the same place as the
* supplied byte index, that byte index will be returned. Otherwise
* this function will step backwards, looking for the index where
* current UTF-8 character begins.
* An index <= 0 yields 0; an index at or past the end of the string
* yields strlen($str).
* @author Chris Smith
* @param string
* @param int byte index in the string
* @return int byte index of start of current UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_current_chr( &$str, $idx ) {

    if ($idx <= 0) return 0;

    $limit = strlen($str);
    if ($idx >= $limit) return $limit;

    // Binary value for any byte after the first in a multi-byte UTF-8 character
    // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
    // of byte - assuming well formed UTF-8
    while ($idx && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx--;

    return $idx;
}

//--------------------------------------------------------------------
/**
* Given a string and any byte index, returns the byte index
* of the start of the next UTF-8 character, relative to supplied
* position. If the next character begins at the same place as the
* supplied byte index, that byte index will be returned.
* An index <= 0 yields 0; an index at or past the end of the string
* yields strlen($str).
* @author Chris Smith
* @param string
* @param int byte index in the string
* @return int byte index of start of next UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_next_chr( &$str, $idx ) {

    if ($idx <= 0) return 0;

    $limit = strlen($str);
    if ($idx >= $limit) return $limit;

    // Binary value for any byte after the first in a multi-byte UTF-8 character
    // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
    // of byte - assuming well formed UTF-8
    while (($idx < $limit) && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx++;

    return $idx;
}