]> projects.mako.cc - scuttle/blob - includes/utf8/utils/patterns.php
Tagged 0.7.5
[scuttle] / includes / utf8 / utils / patterns.php
1 <?php
2 /**
3 * PCRE Regular expressions for UTF-8. Note this file is not actually used by
4 * the rest of the library but these regular expressions can be useful to have
5 * available.
6 * @version $Id: patterns.php,v 1.1 2006/02/25 14:20:02 harryf Exp $
7 * @see http://www.w3.org/International/questions/qa-forms-utf-8
8 * @package utf8
9 * @subpackage patterns
10 */
11
12 //--------------------------------------------------------------------
/**
 * PCRE pattern that matches only when an entire string is well-formed UTF-8.
 *
 * Derived from the W3C "Multilingual Forms" FAQ expression, widened so that
 * the single-byte branch accepts the full ASCII range, control characters
 * included. The empty string is accepted (zero repetitions).
 *
 * The pattern is stored without delimiters; wrap it before use, e.g.
 * preg_match('/' . $UTF8_VALID . '/', $str). The branches are written as raw
 * byte ranges, so it is meant to be applied without the "u" modifier.
 *
 * Anchoring uses \A and \z instead of ^ and $ so that the whole subject is
 * checked even when the caller adds the "m" (multiline) modifier; with ^/$
 * a single valid line was enough to report success.
 *
 * The repetition is possessive (*+): each lead byte selects exactly one
 * branch of fixed length, so there is never an alternative parse to retry
 * and a failing subject is rejected without unwinding every iteration.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
$UTF8_VALID = '\A('.
    '[\x00-\x7F]'.                          // ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.              // non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.          // 3-byte, excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   // straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.          // 3-byte, excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       // planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.           // planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       // plane 16
    ')*+\z';
31
32 //--------------------------------------------------------------------
/**
 * PCRE pattern that matches one well-formed UTF-8 character.
 *
 * Derived from the W3C "Multilingual Forms" FAQ expression, widened so that
 * the single-byte branch accepts the full ASCII range, control characters
 * included.
 *
 * Each branch keeps its own capture group, so the group that is set tells
 * which form matched:
 *   1 = ASCII, 2 = 2-byte, 3 = 3-byte with lead E0, 4 = straight 3-byte,
 *   5 = 3-byte with lead ED, 6 = 4-byte with lead F0 (planes 1-3),
 *   7 = 4-byte with lead F1-F3 (planes 4-15), 8 = 4-byte with lead F4
 *   (plane 16).
 *
 * The alternation is enclosed in a non-capturing group. That leaves the
 * numbering above untouched, but lets callers attach anchors or quantifiers
 * to the pattern as a whole; without the wrapper those would bind only to
 * the first or last branch.
 *
 * The pattern is stored without delimiters; wrap it before use, e.g.
 * preg_match_all('/' . $UTF8_MATCH . '/', $str, $matches). The branches are
 * written as raw byte ranges, so it is meant to be applied without the "u"
 * modifier.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
$UTF8_MATCH = '(?:'.
    '([\x00-\x7F])'.                          // ASCII (including control chars)
    '|([\xC2-\xDF][\x80-\xBF])'.              // non-overlong 2-byte
    '|(\xE0[\xA0-\xBF][\x80-\xBF])'.          // 3-byte, excluding overlongs
    '|([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})'.   // straight 3-byte
    '|(\xED[\x80-\x9F][\x80-\xBF])'.          // 3-byte, excluding surrogates
    '|(\xF0[\x90-\xBF][\x80-\xBF]{2})'.       // planes 1-3
    '|([\xF1-\xF3][\x80-\xBF]{3})'.           // planes 4-15
    '|(\xF4[\x80-\x8F][\x80-\xBF]{2})'.       // plane 16
    ')';
50
51 //--------------------------------------------------------------------
/**
 * PCRE pattern for locating bytes that are not part of well-formed UTF-8.
 *
 * Derived from the W3C "Multilingual Forms" FAQ expression, widened so that
 * the single-byte branch accepts the full ASCII range, control characters
 * included.
 *
 * Every match consumes either one well-formed character or one stray byte,
 * which keeps a repeated scan (preg_match_all, preg_replace_callback, ...)
 * aligned on character boundaries:
 *   group 1 = whatever was consumed (valid character or bad byte)
 *   group 2 = set only when the consumed unit is an invalid byte
 *
 * The catch-all branch is the explicit class [\x80-\xFF] rather than ".".
 * Any ASCII byte is already taken by the first branch, so only high bytes
 * can reach the catch-all; naming them directly means the result no longer
 * depends on which bytes "." treats as newlines (0x85 is one of them under
 * an "any newline" PCRE convention).
 *
 * The pattern is stored without delimiters; wrap it before use, e.g.
 * preg_match_all('/' . $UTF8_BAD . '/', $str, $matches). The branches are
 * written as raw byte ranges, so it is meant to be applied without the "u"
 * modifier.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
$UTF8_BAD =
    '([\x00-\x7F]'.                          // ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               // non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // 3-byte, excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           // 3-byte, excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            // planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // plane 16
    '|([\x80-\xFF]))';                       // invalid byte

Benjamin Mako Hill || Want to submit a patch?