3 * PCRE Regular expressions for UTF-8. Note this file is not actually used by
4 * the rest of the library but these regular expressions can be useful to have
6 * @version $Id: patterns.php,v 1.1 2006/02/25 14:20:02 harryf Exp $
7 * @see http://www.w3.org/International/questions/qa-forms-utf-8
12 //--------------------------------------------------------------------
14 * PCRE Pattern to check a UTF-8 string is valid
15 * Comes from W3 FAQ: Multilingual Forms
16 * Note: modified to include full ASCII range including control chars
17 * @see http://www.w3.org/International/questions/qa-forms-utf-8
19 * @subpackage patterns
22 '[\x00-\x7F]'. # 1 byte: full ASCII range, control chars included
23 '|[\xC2-\xDF][\x80-\xBF]'. # 2 bytes: lead starts at C2, so overlong C0/C1 forms never match
24 '|\xE0[\xA0-\xBF][\x80-\xBF]'. # 3 bytes, lead E0: 2nd byte starts at A0 to exclude overlongs
25 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # 3 bytes, straight: any two continuation bytes (E0 and ED have their own rules)
26 '|\xED[\x80-\x9F][\x80-\xBF]'. # 3 bytes, lead ED: 2nd byte capped at 9F to exclude surrogates
27 '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # 4 bytes, lead F0: planes 1-3; 2nd byte starts at 90 to exclude overlongs
28 '|[\xF1-\xF3][\x80-\xBF]{3}'. # 4 bytes, straight: planes 4-15
29 '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # 4 bytes, lead F4: plane 16; 2nd byte capped at 8F (nothing above U+10FFFF)
32 //--------------------------------------------------------------------
34 * PCRE Pattern to match single UTF-8 characters
35 * Comes from W3 FAQ: Multilingual Forms
36 * Note: modified to include full ASCII range including control chars
37 * @see http://www.w3.org/International/questions/qa-forms-utf-8
39 * @subpackage patterns
42 '([\x00-\x7F])'. # own capture group - 1 byte: full ASCII range, control chars included
43 '|([\xC2-\xDF][\x80-\xBF])'. # own capture group - 2 bytes: lead starts at C2, so overlong C0/C1 forms never match
44 '|(\xE0[\xA0-\xBF][\x80-\xBF])'. # own capture group - 3 bytes, lead E0: 2nd byte starts at A0 to exclude overlongs
45 '|([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})'. # own capture group - 3 bytes, straight: any two continuation bytes
46 '|(\xED[\x80-\x9F][\x80-\xBF])'. # own capture group - 3 bytes, lead ED: 2nd byte capped at 9F to exclude surrogates
47 '|(\xF0[\x90-\xBF][\x80-\xBF]{2})'. # own capture group - 4 bytes, lead F0: planes 1-3; 2nd byte starts at 90 to exclude overlongs
48 '|([\xF1-\xF3][\x80-\xBF]{3})'. # own capture group - 4 bytes, straight: planes 4-15
49 '|(\xF4[\x80-\x8F][\x80-\xBF]{2})'; # own capture group - 4 bytes, lead F4: plane 16; 2nd byte capped at 8F (nothing above U+10FFFF)
51 //--------------------------------------------------------------------
53 * PCRE Pattern to locate bad bytes in a UTF-8 string
54 * Comes from W3 FAQ: Multilingual Forms
55 * Note: modified to include full ASCII range including control chars
56 * @see http://www.w3.org/International/questions/qa-forms-utf-8
58 * @subpackage patterns
61 '([\x00-\x7F]'. # opens the outer group; 1 byte: full ASCII range, control chars included
62 '|[\xC2-\xDF][\x80-\xBF]'. # 2 bytes: lead starts at C2, so overlong C0/C1 forms never match
63 '|\xE0[\xA0-\xBF][\x80-\xBF]'. # 3 bytes, lead E0: 2nd byte starts at A0 to exclude overlongs
64 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # 3 bytes, straight: any two continuation bytes
65 '|\xED[\x80-\x9F][\x80-\xBF]'. # 3 bytes, lead ED: 2nd byte capped at 9F to exclude surrogates
66 '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # 4 bytes, lead F0: planes 1-3; 2nd byte starts at 90 to exclude overlongs
67 '|[\xF1-\xF3][\x80-\xBF]{3}'. # 4 bytes, straight: planes 4-15
68 '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # 4 bytes, lead F4: plane 16; 2nd byte capped at 8F (nothing above U+10FFFF)
69 '|(.{1}))'; # last resort: inner group captures one byte that no valid sequence above consumed (the invalid byte), then closes the outer group; '.' skipping newline without the s modifier is harmless since newline is ASCII and matched by the first alternative