X-Git-Url: https://projects.mako.cc/source/scuttle/blobdiff_plain/bce919af7b49bbd06223f79b8c37a53a3d263ff0..c7f63c8b9b12efd7b3c10b9f80cda06eaf32068f:/includes/utf8/utils/ascii.php

diff --git a/includes/utf8/utils/ascii.php b/includes/utf8/utils/ascii.php
new file mode 100644
index 0000000..cb05297
--- /dev/null
+++ b/includes/utf8/utils/ascii.php
@@ -0,0 +1,220 @@
+<?php
+/**
+* Tools to help with ASCII in UTF-8
+* @version $Id: ascii.php,v 1.5 2006/10/16 20:38:12 harryf Exp $
+* @package utf8
+* @subpackage ascii
+*/
+
+//--------------------------------------------------------------------
+/**
+* Reports whether every byte of a string lies in the 7bit ASCII range
+* (0x00 - 0x7F). Useful as a cheap pre-check so that the faster native
+* PHP string functions can be used when no UTF-8 handling is needed e.g.;
+*
+* <code>
+* if ( utf8_is_ascii($someString) ) {
+*     // Plain ASCII - the native PHP version is safe
+*     $someString = strtolower($someString);
+* } else {
+*     $someString = utf8_strtolower($someString);
+* }
+* </code>
+*
+* @param string
+* @return boolean TRUE if no byte above 0x7F was found
+* @package utf8
+* @subpackage ascii
+* @see utf8_is_ascii_ctrl
+*/
+function utf8_is_ascii($str) {
+    // One byte above 0x7F is enough to disqualify the string
+    $highByteFound = preg_match('/(?:[^\x00-\x7F])/',$str);
+    return !( $highByteFound === 1 );
+}
+
+//--------------------------------------------------------------------
+/**
+* Tests whether a string holds nothing but "printable" 7bit ASCII: tab,
+* line feed, carriage return and the bytes 0x20 - 0x7E. Other device control
+* codes (second table at http://www.w3schools.com/tags/ref_ascii.asp) and
+* any byte above 0x7E fail the test.
+*
+* @param string
+* @return boolean TRUE if only such bytes occur; FALSE otherwise or if empty
+* @package utf8
+* @subpackage ascii
+* @see utf8_is_ascii
+*/
+function utf8_is_ascii_ctrl($str) {
+    if ( strlen($str) < 1 ) {
+        return FALSE; // an empty string never qualifies
+    }
+    // Any byte outside tab, LF, CR and 0x20-0x7E fails the test
+    return !( preg_match('/[^\x09\x0A\x0D\x20-\x7E]/',$str) === 1 );
+}
+
+//--------------------------------------------------------------------
+/**
+* Strip out all non-7bit ASCII bytes
+* If you need to transmit a string to a system which you know can only
+* support 7bit ASCII, you could use this function. Every byte above 0x7F
+* is dropped, so each multi-byte UTF-8 character vanishes completely; the
+* remaining ASCII bytes keep their original order.
+*
+* The work is done by a single preg_replace() call. No output buffering
+* is involved, so the function is safe to call while output is being
+* generated or from inside an output buffer callback.
+*
+* @param string
+* @return string with non ASCII bytes removed
+* @package utf8
+* @subpackage ascii
+* @see utf8_strip_non_ascii_ctrl
+*/
+function utf8_strip_non_ascii($str) {
+    // Remove each run of bytes above 0x7F in one pass
+    $stripped = preg_replace('/[^\x00-\x7F]+/S', '', $str);
+    // preg_replace() gives NULL only if the PCRE engine fails; keep
+    // the "always returns a string" contract in that case
+    return is_null($stripped) ? '' : $stripped;
+}
+
+//--------------------------------------------------------------------
+/**
+* Strip out device control codes in the ASCII range
+* which are not permitted in XML. Note that this leaves
+* multi-byte characters untouched - it only removes device
+* control codes. The bytes removed are 0x00 - 0x08, 0x0B, 0x0C,
+* 0x0E - 0x1F and 0x7F; tab, line feed and carriage return are kept.
+*
+* Implemented as a single preg_replace() call rather than an
+* output-buffered loop, so it has no effect on output buffering.
+*
+* @see http://hsivonen.iki.fi/producing-xml/#controlchar
+* @param string
+* @return string control codes removed
+* @package utf8
+* @subpackage ascii
+*/
+function utf8_strip_ascii_ctrl($str) {
+    // Delete each run of disallowed control bytes in one pass
+    $stripped = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S', '', $str);
+    // preg_replace() gives NULL only if the PCRE engine fails; keep
+    // the "always returns a string" contract in that case
+    return is_null($stripped) ? '' : $stripped;
+}
+
+//--------------------------------------------------------------------
+/**
+* Strip out all non 7bit ASCII bytes and ASCII device control codes.
+* Only tab, line feed, carriage return and the bytes 0x20 - 0x7E
+* survive; everything else is deleted.
+* For a list of ASCII device control codes see the 2nd table here:
+* http://www.w3schools.com/tags/ref_ascii.asp
+*
+* Implemented as a single preg_replace() call rather than an
+* output-buffered loop, so it has no effect on output buffering.
+*
+* @param string
+* @return string with non ASCII bytes and control codes removed
+* @package utf8
+* @subpackage ascii
+* @see utf8_strip_non_ascii
+* @see utf8_is_ascii_ctrl
+*/
+function utf8_strip_non_ascii_ctrl($str) {
+    // Delete each run of bytes outside the permitted set in one pass
+    $stripped = preg_replace('/[^\x09\x0A\x0D\x20-\x7E]+/S', '', $str);
+    // preg_replace() gives NULL only if the PCRE engine fails; keep
+    // the "always returns a string" contract in that case
+    return is_null($stripped) ? '' : $stripped;
+}
+
+//---------------------------------------------------------------
+/**
+* Replace accented UTF-8 characters by unaccented ASCII-7 "equivalents".
+* The purpose of this function is to replace characters commonly found in Latin
+* alphabets with something more or less equivalent from the ASCII range. This can
+* be useful for converting a UTF-8 string to something ready for a filename, for
+* example. Following the use of this function, you would probably also pass the
+* string through utf8_strip_non_ascii to clean out any other non-ASCII chars.
+* Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
+* letters. Default is to deaccent both cases ($case = 0)
+*
+* For a more complete implementation of transliteration, see the utf8_to_ascii
+* package from the phputf8 project: http://prdownloads.sourceforge.net/phputf8
+*
+* NOTE: the lookup tables are UTF-8 literals - this file must be saved as UTF-8.
+*
+* @param string UTF-8 string
+* @param int (optional) -1 lowercase only, +1 uppercase only, 0 both cases
+* @return string accented chars replaced with ascii equivalents
+* @author Andreas Gohr <andi@splitbrain.org>
+* @package utf8
+* @subpackage ascii
+*/
+function utf8_accents_to_ascii( $str, $case=0 ){
+    // Lookup tables are built on first use and cached between calls
+    static $UTF8_LOWER_ACCENTS = NULL;
+    static $UTF8_UPPER_ACCENTS = NULL;
+
+    if($case <= 0){
+        // Lower case letters ($case of -1 or 0)
+        if ( is_null($UTF8_LOWER_ACCENTS) ) {
+            $UTF8_LOWER_ACCENTS = array(
+  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
+  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
+  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
+  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
+  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
+  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
+  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
+  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
+  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
+  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
+  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
+  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
+  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
+  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
+  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
+            );
+        }
+
+        $str = str_replace(
+                array_keys($UTF8_LOWER_ACCENTS),
+                array_values($UTF8_LOWER_ACCENTS),
+                $str
+            );
+    }
+    // Upper case letters ($case of +1 or 0)
+    if($case >= 0){
+        if ( is_null($UTF8_UPPER_ACCENTS) ) {
+            $UTF8_UPPER_ACCENTS = array(
+  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
+  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
+  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
+  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
+  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
+  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
+  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
+  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
+  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
+  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
+  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
+  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
+  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
+  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
+  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
+            );
+        }
+        $str = str_replace(
+                array_keys($UTF8_UPPER_ACCENTS),
+                array_values($UTF8_UPPER_ACCENTS),
+                $str
+            );
+    }
+
+    return $str;
+
+}