[ Index ]

PHP Cross Reference of Limb3

title

Body

[close]

/i18n/src/charset/ -> lmbUTF8BaseDriver.class.php (source)

   1  <?php
   2  /*
   3   * Limb PHP Framework
   4   *
   5   * @link http://limb-project.com 
   6   * @copyright  Copyright &copy; 2004-2007 BIT(http://bit-creative.com)
   7   * @license    LGPL http://www.gnu.org/copyleft/lesser.html 
   8   */
   9  
  10  // This class is based on Harry Fuecks' phputf8 library code(http://sourceforge.net/projects/phputf8)

  11  // and original ideas taken from http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php
  12  
  13  /**

  14   * class lmbUTF8BaseDriver.

  15   *

  16   * @package i18n

  17   * @version $Id: lmbUTF8BaseDriver.class.php 5945 2007-06-06 08:31:43Z pachanga $

  18   */
  19  class lmbUTF8BaseDriver {
  20      /**

  21      * URL-Encode a filename to allow unicodecharacters

  22      *

  23      * Slashes are not encoded

  24      *

  25      * When the second parameter is true the string will

  26      * be encoded only if non ASCII characters are detected -

  27      * This makes it safe to run it multiple times on the

  28      * same string (default is true)

  29      *

  30      * @author Andreas Gohr <andi@splitbrain.org>

  31      * @see urlencode

  32      */
  33      function UTF8EncodeFN($file, $safe = true) {
  34          if ($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#', $file))
  35              return $file;
  36  
  37          $file = urlencode($file);
  38          $file = str_replace('%2F', '/', $file);
  39          return $file;
  40      }
  41  
  42      /**

  43      * URL-Decode a filename

  44      *

  45      * This is just a wrapper around urldecode

  46      *

  47      * @author Andreas Gohr <andi@splitbrain.org>

  48      * @see urldecode

  49      */
  50      function UTF8DecodeFN($file) {
  51          $file = urldecode($file);
  52          return $file;
  53      }
  54  
  55      /**

  56      * Checks if a string contains 7bit ASCII only

  57      *

  58      * @author Andreas Gohr <andi@splitbrain.org>

  59      */
  60      function isASCII($str) {
  61          for($i = 0; $i < strlen($str); $i++)
  62          if (ord($str{$i}) > 127) return false;
  63  
  64          return true;
  65      }
  66  
  67      /**

  68      * Strips all highbyte chars

  69      *

  70      * Returns a pure ASCII7 string

  71      *

  72      * @author Andreas Gohr <andi@splitbrain.org>

  73      */
  74      function UTF8Strip($str) {
  75          $ascii = '';
  76          for($i = 0; $i < strlen($str); $i++) {
  77              if (ord($str{$i}) < 128)
  78                  $ascii .= $str{$i};
  79          }
  80          return $ascii;
  81      }
  82  
  83      /**

  84      * Tries to detect if a string is in utf8 encoding

  85      *

  86      * @author <bmorel@ssi.fr>

  87      * @link http://www.php.net/manual/en/function.utf8-encode.php

  88      */
  89      function UTF8Check($str) {
  90          for($i = 0; $i < strlen($str); $i++) {
  91              if (ord($str[$i]) < 0x80) continue; # 0bbbbbbb

  92              elseif ((ord($str[$i]) &0xE0) == 0xC0) $n = 1; # 110bbbbb

  93              elseif ((ord($str[$i]) &0xF0) == 0xE0) $n = 2; # 1110bbbb

  94              elseif ((ord($str[$i]) &0xF8) == 0xF0) $n = 3; # 11110bbb

  95              elseif ((ord($str[$i]) &0xFC) == 0xF8) $n = 4; # 111110bb

  96              elseif ((ord($str[$i]) &0xFE) == 0xFC) $n = 5; # 1111110b

  97              else return false; # Does not match any model

  98  
  99              // n bytes matching 10bbbbbb follow ?

 100              for($j = 0; $j < $n; $j++) {
 101                  if ((++$i == strlen($str)) || ((ord($str[$i]) &0xC0) != 0x80))
 102                      return false;
 103              }
 104          }
 105          return true;
 106      }
 107  
 108      /**

 109      * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents

 110      *

 111      * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)

 112      * letters. Default is to deaccent both cases ($case = 0)

 113      *

 114      * @author Andreas Gohr <andi@splitbrain.org>

 115      */
 116      function UTF8Deaccent($string, $case = 0) {
 117          if ($case <= 0) {
 118              global $UTF8_LOWER_ACCENTS;
 119              $string = str_replace(array_keys($UTF8_LOWER_ACCENTS), array_values($UTF8_LOWER_ACCENTS), $string);
 120          }
 121          if ($case >= 0) {
 122              global $UTF8_UPPER_ACCENTS;
 123              $string = str_replace(array_keys($UTF8_UPPER_ACCENTS), array_values($UTF8_UPPER_ACCENTS), $string);
 124          }
 125          return $string;
 126      }
 127  
 128      /**

 129      * Removes special characters (nonalphanumeric) from a UTF-8 string

 130      *

 131      * Be sure to specify all specialchars you give in $repl in $keep, too

 132      * or it won't work.

 133      *

 134      * This function adds the controlchars 0x00 to 0x19 to the array of

 135      * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)

 136      *

 137      * @author Andreas Gohr <andi@splitbrain.org>

 138      * @param string $string The UTF8 string to strip of special chars

 139      * @param string $repl Replace special with this string

 140      * @param string $keep Special chars to keep (in UTF8)

 141      */
 142      function UTF8StripSpecials($string, $repl = '', $keep = '') {
 143          global $UTF8_SPECIAL_CHARS;
 144          if ($keep != '')
 145              $specials = array_diff($UTF8_SPECIAL_CHARS, $this->toUnicode($keep));
 146          else
 147              $specials = $UTF8_SPECIAL_CHARS;
 148  
 149          $specials = $this->toUTF8($specials);
 150          $specials = preg_quote($specials, '/');
 151  
 152          return preg_replace('/[\x00-\x19' . $specials . ']/u', $repl, $string);
 153      }
 154  
 155      /**

 156      * UTF8 aware replacement for strlen()

 157      *

 158      * utf8_decode() converts characters that are not in ISO-8859-1

 159      * to '?', which, for the purpose of counting, is alright - It's

 160      * even faster than mb_strlen.

 161      *

 162      * @author <chernyshevsky at hotmail dot com>

 163      * @see strlen

 164      * @see utf8_decode

 165      */
 166      function _strlen($string) {
 167          return strlen(utf8_decode($string));
 168      }
 169  
 170      /**

 171      * UTF8 aware replacement for substr()

 172      *

 173      * @todo Handle negative positions etc.

 174      * @author Harry Fuecks <hfuecks@gmail.com>

 175      * @see substr

 176      */
 177      function _substr($str, $start, $length=null) {
 178          $start = (int)$start;
 179          if (!is_null($length)) $length = (int)$length;
 180  
 181          $strlen = $this->_strlen($str);
 182  
 183          if (!is_null($length) && abs($length) > $strlen)
 184              $length = ($length > 0) ? $strlen : -1 * $strlen;
 185  
 186          if ($start < 0)
 187              $start = $strlen + $start;
 188  
 189          if ($length < 0)
 190              $length = $strlen + $length - $start;
 191  
 192          if (is_null($length) || $length >= $strlen)
 193              $length = '*';
 194          else
 195              $length = '{0,' . $length . '}';
 196  
 197          $pattern = '/^.{' . $start . '}(.' . $length . ')/us';
 198          preg_match($pattern, $str, $matches);
 199  
 200          if (isset($matches[1]))
 201              return $matches[1];
 202  
 203          return false;
 204      }
 205  
 206      /**

 207      * UTF8 aware replacement for strrepalce()

 208      *

 209      * @todo support PHP5 count (fourth arg)

 210      * @author Harry Fuecks <hfuecks@gmail.com>

 211      * @see str_replace();

 212      */
 213      function _str_replace($s, $r, $str) {
 214          if (!is_array($s)) {
 215              $s = '!' . preg_quote($s, '!') . '!u';
 216          } else {
 217              foreach ($s as $k => $v)
 218              $s[$k] = '!' . preg_quote($v) . '!u';
 219          }
 220          return preg_replace($s, $r, $str);
 221      }
 222  
 223      /**

 224      * UTF8 aware replacement for ltrim()

 225      *

 226      * @author Andreas Gohr <andi@splitbrain.org>

 227      * @see ltrim

 228      * @return string

 229      */
 230      function _ltrim($str, $charlist = '') {
 231          if ($charlist == '')
 232              return ltrim($str);
 233  
 234          $chars = preg_split('//u', $charlist, -1, PREG_SPLIT_NO_EMPTY);
 235          $regex = '(' . implode('|', array_map('preg_quote', $chars)) . ')';
 236  
 237          return preg_replace('/^' . $regex . '+/u', '', $str);
 238      }
 239  
 240      /**

 241      * UTF8 aware replacement for ltrim()

 242      *

 243      * @author Andreas Gohr <andi@splitbrain.org>

 244      * @see rtrim

 245      * @return string

 246      */
 247      function _rtrim($str, $charlist = '') {
 248          if ($charlist == '')
 249              return rtrim($str);
 250  
 251          $chars = preg_split('//u', $charlist, -1, PREG_SPLIT_NO_EMPTY);
 252          $regex = '(' . implode('|', array_map('preg_quote', $chars)) . ')';
 253  
 254          return preg_replace('/' . $regex . '+$/u', '', $str);
 255      }
 256  
 257      /**

 258      * UTF8 aware replacement for trim()

 259      *

 260      * @author Andreas Gohr <andi@splitbrain.org>

 261      * @see trim

 262      * @return string

 263      */
 264      function _trim($str, $charlist = '') {
 265          if ($charlist == '')
 266              return trim($str);
 267  
 268          return $this->_ltrim($this->_rtrim($str, $charlist), $charlist);
 269      }
 270  
 271      /**

 272      * This is a unicode aware replacement for strtolower()

 273      *

 274      * @author Andreas Gohr <andi@splitbrain.org>

 275      * @see strtolower

 276      */
 277      function _strtolower($string) {
 278          global $UTF8_UPPER_TO_LOWER;
 279          $uni = $this->toUnicode($string);
 280          for($i = 0; $i < count($uni); $i++) {
 281              if (isset($UTF8_UPPER_TO_LOWER[$uni[$i]]))
 282                  $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
 283          }
 284          return $this->toUTF8($uni);
 285      }
 286  
 287      /**

 288      * This is a unicode aware replacement for strtoupper()

 289      *

 290      * @author Andreas Gohr <andi@splitbrain.org>

 291      * @see strtoupper

 292      */
 293      function _strtoupper($string) {
 294          global $UTF8_LOWER_TO_UPPER;
 295          $uni = $this->toUnicode($string);
 296          for($i = 0; $i < count($uni); $i++) {
 297              if (isset($UTF8_LOWER_TO_UPPER[$uni[$i]]))
 298                  $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
 299          }
 300          return $this->toUTF8($uni);
 301      }
 302  
 303      /**

 304      * This is an UTF8 aware replacement for strpos

 305      *

 306      * @author Harry Fuecks <hfuecks@gmail.com>

 307      * @see strpos

 308      */
 309      function _strpos($haystack, $needle, $offset=false) {
 310          if ($offset === false) {
 311              $ar = explode($needle, $haystack);
 312              if (count($ar) > 1)
 313                  return $this->_strlen($ar[0]);
 314  
 315              return false;
 316          } else {
 317              if (!is_int($offset)) {
 318                  trigger_error('Offset must be an integer', E_USER_WARNING);
 319                  return false;
 320              }
 321  
 322              $haystack = $this->_substr($haystack, $offset);
 323  
 324              if (false !== ($pos = $this->_strpos($haystack, $needle)))
 325                  return $pos + $offset;
 326  
 327              return false;
 328          }
 329      }
 330  
 331      /**

 332      * This is an UTF-8 aware alternative to strrpos

 333      *

 334      * Find position of last occurrence of a char in a string

 335      * Note: This will get alot slower if offset is used

 336      * @author Harry Fuecks <hfuecks@gmail.com>

 337      */
 338      function _strrpos($str, $needle, $offset=false) {
 339          if ($offset === false) {
 340              $ar = explode($needle, $str);
 341              if ( count($ar) > 1 ) {
 342                  // Pop off the end of the string where the last match was made

 343                  array_pop($ar);
 344                  $str = join($needle,$ar);
 345                  return $this->_strlen($str);
 346              }
 347              return false;
 348          } else {
 349              if ( !is_int($offset) ) {
 350                  trigger_error('_strrpos: Offset must be an integer', E_USER_ERROR);
 351                  return false;
 352              }
 353              $str = $this->_substr($str, $offset);
 354              if ( false !== ( $pos = $this->_strrpos($str, $needle) ) ) {
 355                  return $pos + $offset;
 356              }
 357              return false;
 358          }
 359      }
 360  
 361      /*

 362      * This is UTF-8 aware alternative to ucfirst

 363      *

 364      * Make a string's first character uppercase

 365      * @author Harry Fuecks <hfuecks@gmail.com>

 366      */
 367      function _ucfirst($str) {
 368          //the regex below doesn't work :(

 369          //preg_match('/^(\w{1})(.*)$/us', $str, $matches);

 370          preg_match('/^(.)(.*)$/us', $str, $matches);
 371  
 372          if ( isset($matches[1]) && isset($matches[2]) ) {
 373              return $this->_strtoupper($matches[1]) . $matches[2];
 374          } else {
 375              return $str;
 376          }
 377      }
 378  
 379      /*

 380      * UTF-8 aware alternative to strcasecmp

 381      * A case insensivite string comparison

 382      *

 383      * @author Harry Fuecks <hfuecks@gmail.com>

 384      */
 385      function _strcasecmp($strX, $strY) {
 386          return strcmp($this->_strtolower($strX),
 387                        $this->_strtolower($strY));
 388      }
 389  
 390      /**

 391      * UTF-8 aware alternative to substr_count

 392      *

 393      */
 394      function _substr_count($haystack, $needle) {
 395          if(preg_match_all('/(' . preg_quote($needle) . ')/u', $haystack, $matches)) {
 396              return sizeof($matches[1]);
 397          }
 398          return 0;
 399      }
 400  
 401      /**

 402      * UTF-8 aware alternative to str_split

 403      * Convert a string to an array

 404      *

 405      * @author Harry Fuecks <hfuecks@gmail.com>

 406      */
 407      function _str_split($str, $split_len=1) {
 408          $split_len = (int)$split_len;
 409          if ( !preg_match('/^[0-9]+$/',$split_len) || $split_len < 1 ) {
 410              return false;
 411          }
 412  
 413          $len = $this->_strlen($str);
 414          if ( $len <= $split_len ) {
 415              return array($str);
 416          }
 417  
 418          preg_match_all('/.{'.$split_len.'}|[^\x00]{1,'.$split_len.'}$/us', $str, $ar);
 419          return $ar[0];
 420      }
 421  
 422      /*

 423      * This is UTF-8 aware alternative to preg_match

 424      */
 425      function _preg_match($pattern, $subject, &$matches, $flags=null, $offset=null) {
 426          if(!is_null($flags) && !is_null($offset)) {
 427              return preg_match($pattern . 'u', $subject, $matches, $flags, $offset);
 428          } elseif (is_null($flags) && !is_null($offset)) {
 429              return preg_match($pattern .'u', $subject, $matches, $flags);
 430          } else {
 431              return preg_match($pattern . 'u', $subject, $matches);
 432          }
 433      }
 434  
 435      /*

 436      * This is UTF-8 aware alternative to preg_match_all

 437      */
 438      function _preg_match_all($pattern, $subject, &$matches, $flags=null, $offset=null) {
 439          if(!is_null($flags) && !is_null($offset)) {
 440              return preg_match_all($pattern . 'u', $subject, $matches, $flags, $offset);
 441          } elseif (is_null($flags) && !is_null($offset)) {
 442              return preg_match_all($pattern .'u', $subject, $matches, $flags);
 443          } else {
 444              return preg_match_all($pattern . 'u', $subject, $matches);
 445          }
 446      }
 447  
 448      /*

 449      * This is UTF-8 aware alternative to preg_replace

 450      */
 451      function _preg_replace($pattern, $replacement, $subject, $limit=null) {
 452          if(!is_null($limit)) {
 453              return preg_replace($pattern .'u', $replacement, $subject, $limit);
 454          } else {
 455              return preg_replace($pattern .'u', $replacement, $subject);
 456          }
 457      }
 458  
 459      /*

 460      * This is UTF-8 aware alternative to _preg_replace_callback

 461      */
 462      function _preg_replace_callback($pattern, $callback, $subject, $limit=null) {
 463          if(!is_null($limit)) {
 464              return preg_replace_callback($pattern .'u', $callback, $subject, $limit);
 465          } else {
 466              return preg_replace_callback($pattern .'u', $callback, $subject);
 467          }
 468      }
 469  
 470      /*

 471      * This is UTF-8 aware alternative to preg_split

 472      */
 473      function _preg_split($pattern, $subject, $limit=null, $flags=null) {
 474          if(!is_null($limit) && !is_null($flags)) {
 475              return preg_split($pattern . 'u', $subject, $limit, $flags);
 476          } elseif (is_null($flags) && !is_null($limit)) {
 477              return preg_split($pattern .'u', $subject, $limit);
 478          } else {
 479              return preg_split($pattern . 'u', $subject);
 480          }
 481      }
 482  
 483      /**

 484      * This function returns any UTF-8 encoded text as a list of

 485      * Unicode values:

 486      *

 487      * @author Scott Michael Reynen <scott@randomchaos.com>

 488      * @link http://www.randomchaos.com/document.php?source=php_and_unicode

 489