projects.mako.cc - scuttle/blob - includes/utf8/docs/phputf8.pod

   1 # $Id: phputf8.pod,v 1.7 2006/09/30 23:38:19 harryf Exp $
   2
   3 =head1 NAME
   4
   5 phputf8 - Tools for working with UTF-8 in PHP
   6
   7 =head1 SYNOPSIS
   8
   9     require_once '/path/to/utf8/utf8.php';
  10     require_once UTF8 . '/utils/validation.php';
  11     require_once UTF8 . '/utils/ascii.php';
  12
  13     # Check the UTF-8 is well formed
  14     if ( !utf8_is_valid($_POST['somecontent']) ) {
  15
  16         require_once UTF8 . '/utils/bad.php';
  17         trigger_error('Bad UTF-8 detected. Cleaning', E_USER_NOTICE);
  18
  19         # Strip out bad sequences - replace with ? character
  20         $_POST['somecontent'] = utf8_bad_replace($_POST['somecontent']);
  21
  22     }
  23
  24     # This works fine with UTF-8
  25     $_POST['somecontent'] = ltrim($_POST['somecontent']);
  26
  27     # If it contains only ascii chars, use native str fns for speed...
  28     if ( utf8_is_ascii($_POST['somecontent']) ) {
  29
  30         $endOfFirstWord = strpos($_POST['somecontent'],' ');
  31         $firstword = substr($_POST['somecontent'],0,$endOfFirstWord);
  32         $firstword = strtoupper($firstword);
  33         $therest = substr($_POST['somecontent'],$endOfFirstWord);
  34
  35     } else {
  36
  37         # It contains multibyte sequences - use the slower but safe
  38         $endOfFirstWord = utf8_strpos($_POST['somecontent'],' ');
  39         $firstword = utf8_substr($_POST['somecontent'],0,$endOfFirstWord);
  40         $firstword = utf8_strtoupper($firstword);
  41         $therest = utf8_substr($_POST['somecontent'],$endOfFirstWord);
  42
  43     }
  44
  45     # htmlspecialchars is also safe for use with UTF-8
  46     header("Content-Type: text/html; charset=utf-8");
  47     echo "<pre>";
  48     echo "<strong>".htmlspecialchars($firstword)."</strong>";
  49     echo htmlspecialchars($therest);
  50     echo "</pre>";
  51
  52
  53 =head1 DESCRIPTION
  54
  55 phputf8 does a few things for you;
  56
  57 =over
  58
  59 =item * Provides UTF-8 aware versions of PHP's string functions
  60
  61 All of these functions are prefixed with C<utf8_>. Six of these functions
  62 are loaded "on the fly", depending on whether you have the mbstring
  63 extension available. The rest build on top of those six.
  64
  65 See L</"String Functions">.
  66
  67 =item * Detection of bad UTF-8 sequences
  68
  69 The file C<UTF8 . '/utils/validation.php'> contains functions for testing
  70 strings for bad UTF-8 sequences. Note that other functions in the library
  71 assume valid UTF-8.
  72
  73 See L</"UTF-8 Validation and Cleaning">
  74
  75 =item * Cleaning of bad UTF-8 sequences
  76
  77 Functions for stripping or replacing bad sequences are available in
  78 C<UTF8 . '/utils/bad.php'>
  79
  80 See L</"UTF-8 Validation and Cleaning">
  81
  82 =item * Detecting pure ASCII & stripping non-ASCII
  83
  84 The file C<UTF8 . '/utils/ascii.php'> contains utilities to detect
  85 whether a UTF-8 string contains just ASCII characters (allowing
  86 you to use PHP's faster, native, string functions) and also stripping
  87 everything non-ASCII from a string
  88
  89 See L</"Performance and Optimization">
  90
  91 =item * Basic transliteration
  92
  93 The file C<UTF8 . '/utils/specials.php'> contains basic transliteration
  94 functionality (L<http://en.wikipedia.org/wiki/Transliteration>) - not
  95 much but enough to convert common European, non-ascii characters to
  96 a reasonable ASCII equivalent. You might use these when preparing a
  97 string for use as a filename, after which you strip all other non-ascii
  98 characters using the ASCII utilities.
  99
 100 Further transliteration is provided in the C<utf8_to_ascii> package
 101 at L<http://sourceforge.net/projects/phputf8>. Much more powerful
 102 functionality is provided by the pecl transliteration extension -
 103 L<http://derickrethans.nl/translit.php> and
 104 L<http://pecl.php.net/package/translit>.
 105
 106 See L</"Transliteration">
 107
 108 =back
 109
 110 =head1 String Functions
 111
 112 There are seven essential functions provided by phputf8, which are
 113 required by many of the other functions. These are all loaded
 114 when you include the main C<utf8.php> script e.g.
 115
 116     require_once '/path/to/utf8/utf8.php';
 117
 118 Six of these functions depend on whether the mbstring extension is
 119 installed (see L<http://www.php.net/mbstring>) - if it is available,
 120 the following functions will be wrappers around the equivalent
 121 mb_string functions;
 122
 123 =over
 124
 125 =item * C<utf8_strlen>
 126
 127 =item * C<utf8_strpos>
 128
 129 =item * C<utf8_strrpos>
 130
 131 =item * C<utf8_substr>
 132
 133 =item * C<utf8_strtolower>
 134
 135 =item * C<utf8_strtoupper>
 136
 137 =back
 138
 139 B<Note:> phputf8 cannot support mbstring function overloading;
 140 it relies in some cases on PHP's native string functions
 141 counting characters as bytes.
 142
 143 The seventh function is C<utf8_substr_replace>, which is
 144 implemented independent of mbstring (mbstring doesn't
 145 provide it).
 146
 147 B<Important Note> - if you do not load C<utf8.php> and you wish
 148 to use the mbstring implementations, you need to set the mbstring
 149 encoding to UTF-8 yourself - see L<http://www.php.net/mb_internal_encoding>.
 150
 151 =head2 Further string functions
 152
 153 All other string functions must be included on demand. They are
 154 available directly under the C<UTF8> directory with filenames
 155 corresponding to the equivalent PHP string functions, but still
 156 with the function prefix C<utf8_>.
 157
 158 For example, to load the strrev implementation;
 159
 160     # Load the main script
 161     require_once '/path/to/utf8/utf8.php';
 162
 163     # Load the UTF-8 aware strrev implementation
 164     require_once UTF8 . '/strrev.php';
 165     print utf8_strrev('Iñtërnâtiônàlizætiøn')."\n";
 166
 167 All string implementations are found in the C<UTF8> directory.
 168 For documentation for each function, see the phpdocs
 169 L<http://phputf8.sourceforge.net/api>.
 170
 171 B<TODO> Some of the functions, such as C<utf8_strcspn> take
 172 arguments like 'start' and 'length', requiring values in terms
 173 of I<characters> not bytes - i.e. return values from functions
 174 like C<utf8_strlen> and C<utf8_strpos>. Additional implementations
 175 would be useful which take byte indexes instead of character
 176 positions - this would allow further advantage to be taken of
 177 UTF-8's design and more use of PHP's native functions for performance.
 178
 179 =head1 UTF-8 Validation and Cleaning
 180
 181 It's important to understand that multi-byte UTF-8 characters can be
 182 badly formed. UTF-8 has rules regarding multi-byte characters and those
 183 rules can be broken. Some possible reasons why a sequence of bytes
 184 might be badly formed UTF-8;
 185
 186 =over
 187
 188 =item It's a different character encoding
 189
 190 For example, 8 bit characters in ISO-8859-1 would be badly formed UTF-8.
 191 That said, characters declared as ISO-8859-1 but still within the ASCII-7
 192 range would still be valid UTF-8.
 193
 194 =item It's a corrupted UTF-8 string
 195
 196 Something has mangled the UTF-8 string (PHP's native strrev function,
 197 for example, would do this).
 198
 199 =item Someone is injecting badly formed UTF-8 input deliberately.
 200
 201 They might be attempting to "break" your RSS feed, for example.
 202
 203 =back
 204
 205 With that in mind, the functions provided in C<./utils/validation.php>
 206 and C<./utils/bad.php> are intended to help guard against such problems.
 207
 208 =head2 Validation
 209
 210 There are two functions in C<./utils/validation.php>, one "strict"
 211 and the other slightly more relaxed.
 212
 213 The strict version is C<utf8_is_valid> - as well as checking each
 214 sequence, byte-by-byte, it also regards sequences which are not
 215 part of the Unicode standard as being invalid (UTF-8 allows for
 216 5 and 6 byte sequences but these have no meaning in Unicode, and will
 217 result in browsers displaying "junk" characters e.g. the ? character).
 218
 219 The second function C<utf8_compliant> relies on the behaviour of
 220 PHP's PCRE extension, to spot invalid UTF-8 sequences. This
 221 function will pass 5 and 6 byte sequences but also performs
 222 much better than C<utf8_is_valid>.
 223
 224 Both are simple to use;
 225
 226     require_once UTF8 . '/utils/validation.php';
 227     if ( utf8_is_valid($str) ) {
 228         print "Its valid\n";
 229     }
 230     if ( utf8_compliant($str) ) {
 231         print "Its compliant\n";
 232     }
 233
 234
 235 =head2 Cleaning UTF-8
 236
 237 If you detect a UTF-8 encoded string contains badly formed
 238 sequences, functions in C<./utils/bad.php> can help. Be warned
 239 that performance on large strings will be an issue.
 240
 241 It provides the following functions;
 242
 243 =over
 244
 245 =item * C<utf8_bad_find>
 246
 247 Locates the I<first> bad byte in a UTF-8 string, returning its
 248 I<byte> (not character) position in the string. You might use this
 249 for iterative cleaning or analysis of a UTF-8 string for example;
 250
 251     require_once UTF8 . '/utils/validation.php';
 252     require_once UTF8 . '/utils/bad.php';
 253
 254     $clean = '';
 255     while ( FALSE !== ( $badIndex = utf8_bad_find($str) ) ) {
 256         print "Bad byte found at $badIndex\n";
 257         $clean .= substr($str,0,$badIndex);
 258         $str = substr($str,$badIndex+1);
 259     }
 260     $clean .= $str;
 261
 262 =item * C<utf8_bad_findall>
 263
 264 The same as C<utf8_bad_find> but searches the complete string and
 265 returns the index of all bad bytes found in an array
 266
 267 =item * C<utf8_bad_strip>
 268
 269 Removes all bad bytes from a UTF-8 string, returning the cleaned string
 270
 271 =item * C<utf8_bad_replace>
 272
 273 Removes all bad bytes from a UTF-8 string and replaces them with some
 274 other character (default is ?)
 275
 276 =item * C<utf8_bad_identify> and C<utf8_bad_explain>
 277
 278 Together these two functions attempt to provide a reason why a
 279 particular byte is not valid UTF-8. Perhaps you might use these
 280 when logging errors.
 281
 282 =back
 283
 284 =head2 Warning on ASCII Control Characters
 285
 286 The above functions for validating and cleaning UTF-8 strings
 287 all regard ASCII control characters as being valid and
 288 acceptable. But ASCII control chars are not acceptable in XML
 289 documents - use the C<utf8_strip_ascii_ctrl> function in
 290 C<./utils/ascii.php> (available v0.3+), which will remove
 291 all ASCII control characters that are illegal in XML.
 292
 293 See L<http://hsivonen.iki.fi/producing-xml/#controlchar>.
 294
 295 =head2 Strategy
 296
 297 Because validation and cleaning UTF-8 strings comes with a pretty high
 298 cost, in terms of performance, you should be aiming to do this once
 299 only, at the point where you receive some input (e.g. a submitted form)
 300 before going on to using the rest of the string functions in this library.
 301
 302 You should also be aware that validation and cleaning is your job -
 303 the utf8_* string functions I<assume> they are being given well formed
 304 UTF-8 to process, because the performance overhead of checking, every
 305 time you called C<utf8_strlen>, for example, would be very high.
 306
 307 =head1 Performance and Optimization
 308
 309 The first thing you I<shouldn't> be attempting to do is replace all use of PHP's
 310 native string functions with functions from this library. Doing so will have
 311 a dramatic (and bad) effect on your code's performance. It also misses opportunities
 312 you may have to continue using PHP's native string functions.
 313
 314 There are two main areas to consider, when working out how to support UTF-8
 315 with this library and achieve optimal performance.
 316
 317 =head2 When data is 99% ASCII
 318
 319 First, if the majority of the data your application will be processing is
 320 written in English, most of the time you will be able to use PHP's native
 321 string functions, only using the utf8_* string functions when you encounter
 322 multibyte characters. This has already been implied above in the example
 323 in the L</"SYNOPSIS">. Most characters used in English fall within the
 324 ASCII-7 range and ASCII characters in UTF-8 are no different to normal
 325 ASCII characters.
 326
 327 So check whether a string is 100% ASCII first, and if so, use PHP's native
 328 string functions on it.
 329
 330     require_once '/path/to/utf8/utf8.php';
 331     require_once UTF8 . '/utils/ascii.php';
 332
 333     if ( utf8_is_ascii($string) ) {
 334         # use native PHP string functions
 335     } else {
 336         # use utf8_* string functions
 337     }
 338
 339 =head2 Exploiting UTF-8's design
 340
 341 Second, you may be able to exploit UTF-8's design to your advantage,
 342 depending on what I<exactly> you are doing to a string. This road
 343 requires more effort and a good understanding of UTF-8's design.
 344
 345 As a starting point, you really need to examine the range table
 346 shown on Wikipedia's page on UTF-8 L<http://en.wikipedia.org/wiki/UTF-8>.
 347
 348 Some key points about UTF-8's design;
 349
 350 =over
 351
 352 =item UTF-8 is a superset of ASCII
 353
 354 In other words ASCII-7 characters are encoded in exactly the same
 355 way as normal. These characters are those shown in the I<first>
 356 table at L<http://www.lookuptables.com/> - the first 128 characters.
 357
 358 Note that the second table shown at L<http://www.lookuptables.com/>
 359 "Extended ASCII characters" are not ASCII-7 characters and I<are>
 360 encoded differently in UTF-8 (probably using 2 bytes). Those
 361 characters seem to be ISO-8859-1 - occasionally you will see
 362 people saying UTF-8 is backwards compatible with ISO-8859-1 - this
 363 is I<wrong>.
 364
 365 One specific example which illustrates this;
 366
 367     $new_utf8_str = strstr('Iñtërnâtiônàlizætiøn','l');
 368
 369 Using the "needle" character 'l' (in the ASCII-7 range), this
 370 example works without any problems, the variable C<$new_utf8_str>
 371 being assigned the value 'lizætiøn', even though the haystack
 372 string contains multibyte characters.
 373
 374 Actually this example leads into the next point...
 375
 376 =item Every character sequence is unique in UTF-8
 377
 378 Assuming that a UTF-8 encoded string is well formed, any sequence
 379 in that string representing a single character (be it a single
 380 byte ASCII character or a multi byte character) cannot be mistaken
 381 for a subsequence of a larger multi byte sequence.
 382
 383 That means all of the following examples work;
 384
 385     # Pop off a piece of a string using multi-byte character
 386     $new_utf8_str = strstr('Iñtërnâtiônàlizætiøn','ô');
 387
 388     # Explode string using multibyte character
 389     $array = explode('ô','Iñtërnâtiônàlizætiøn');
 390
 391     # Using byte index instead of character index...
 392     $haystack = 'Iñtërnâtiônàlizætiøn';
 393     $needle = 'ô';
 394     $pos = strpos($haystack, $needle);
 395     print "Position in bytes is $pos<br>";
 396     $substr = substr($haystack, 0, $pos);
 397     print "Substr: $substr<br>";
 398
 399
 400 =back
 401
 402 Put those together and often you will be able to use existing code
 403 with little or no modification.
 404
 405 Often you will be able to continue working in bytes instead of
 406 logical characters (as the last example above shows).
 407
 408 There are some functions which you I<will> always need to replace,
 409 for example C<strtoupper>. You should be able to get some idea of
 410 which these functions are by looking at
 411 L<http://www.phpwact.org/php/i18n/utf-8>.
 412
 413
 414 =head1 Transliteration
 415
 416 Sometimes you will need to be able to remove all multi-byte
 417 characters from a UTF-8 string and use only ASCII. Some
 418 possible reasons why;
 419
 420 =over
 421
 422 =item Interfaces to systems with no support for UTF-8
 423
 424 An application might be accessing data from your application
 425 but lack support for UTF-8. You may need to remove all non-
 426 ASCII-7 characters for it.
 427
 428 =item Filenames
 429
 430 Although most modern operating systems support Unicode, not
 431 all applications running under that OS may do so and you may
 432 be exposing yourself to security issues by allowing multi
 433 byte characters in filenames.
 434
 435 =item Urls
 436
 437 Similar issues to filenames - most modern browsers support
 438 the use of UTF-8 in URLs but doing so may not be a smart
 439 idea e.g. potential for phishing via the use of similar
 440 looking (to humans) characters.
 441
 442 =item Primary Keys / Identifiers
 443
 444 It is probably unwise to allow multi-byte UTF-8 characters into
 445 certain critical "fields" in your application, such as a username.
 446 Someone might be able to register a user with a similar looking
 447 name to an admin user - consider "admin" vs. "admın" < hard to
 448 spot the difference (note the ı character in the second example).
 449
 450 =back
 451
 452 =head2 Stripping multi byte characters
 453
 454 To simply remove all multibyte characters, the C<./utils/ascii.php>
 455 collection of functions can help e.g.;
 456
 457     require_once '/path/to/utf8/utf8.php';
 458     require_once UTF8 . '/utils/ascii.php';
 459     $str = "admın";
 460     print utf8_strip_non_ascii($str); // prints "admn"
 461
 462 Not also the C<utf8_strip_non_ascii_ctrl> function which also -
 463 strips out ASCII control codes - see
 464 L</"Warning on ASCII Control Characters"> for information on that
 465 topic.
 466
 467 =head2 Transliteration Utilities
 468
 469 Now simply throwing out characters is not kind to users. An
 470 alternative is transliteration, where you try to replace multi
 471 byte characters with equivalent ASCII characters that a human
 472 would understand. For example "Zürich" could be converted to
 473 "Zuerich", the multi byte "ü" character being replaced by "ue".
 474
 475 See L<http://en.wikipedia.org/wiki/Transliteration> for a
 476 general introduction to transliteration.
 477
 478 The main phputf8 package contains a single function in
 479 the C<./utils/ascii.php> script that does some (basic)
 480 replacements of accented characters common in languages
 481 like French. After using this function, you should still
 482 strip out all remaining multi-byte characters. For
 483 example;
 484
 485     require_once '/path/to/utf8/utf8.php';
 486     require_once UTF8 . '/utils/ascii.php';
 487
 488     $filename = utf8_accents_to_ascii($filename);
 489     $filename = utf8_strip_non_ascii($filename);
 490
 491 This will at least preserve I<some> characters in an
 492 ASCII form that will be understandable by users.
 493
 494 Further and much more powerful transliteration
 495 capabilities are provided in the separate utf8_to_ascii
 496 package distributed at L<http://sourceforge.net/projects/phputf8>.
 497 Because it is a port of Perl's L<Text::Unidecode> package
 498 to PHP, it is distributed under the same license.
 499
 500 A quick intro to utf8_to_ascii can be found at
 501 L<http://www.sitepoint.com/blogs/2006/03/03/us-ascii-transliterations-of-unicode-text/>
 502
 503 Be warned that utf8_to_ascii does have limitations and a better
 504 choice, if you have rights to install it in your environment, is
 505 Derick Rethans' transliteration extension:
 506 L<http://pecl.php.net/package/translit>.
 507
 508
 509 =head1 SEE ALSO
 510
 511 L<http://www.phpwact.org/php/i18n/charsets>,
 512 L<http://www.phpwact.org/php/i18n/utf-8>
 513 L<http://wiki.silverorange.com/UTF-8_Notes>
 514 L<http://svn.wikimedia.org/viewvc/mediawiki/trunk/phase3/includes/normal/> - Unicode normalization in PHP
 515 L<http://www.webtuesday.ch/_media/meetings/utf-8_survival.pdf>