Creating repository for dokuwiki modifications for sudaraka.org
[sudaraka-org:dokuwiki-mods.git] / inc / utf8.php
1 <?php
2 /**
3  * UTF8 helper functions
4  *
5  * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6  * @author     Andreas Gohr <andi@splitbrain.org>
7  */
8
/**
 * Decide once whether the mbstring extension is used by this library
 *
 * UTF8_MBSTRING may be predefined by the including script. Otherwise it is
 * set to 1 when mb_substr() exists and UTF8_NOMBSTRING has not been defined,
 * and to 0 in every other case.
 */
if(!defined('UTF8_MBSTRING')){
    $utf8_mb_usable = function_exists('mb_substr') && !defined('UTF8_NOMBSTRING');
    define('UTF8_MBSTRING', $utf8_mb_usable ? 1 : 0);
    unset($utf8_mb_usable);
}

// when mbstring is in use, make UTF-8 its internal encoding
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
21
if(!function_exists('utf8_isASCII')){
    /**
     * Tell whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str
     * @return bool   false as soon as one byte outside 0x00-0x7F is found
     */
    function utf8_isASCII($str){
        // preg_match() yields 1 only when a non-ASCII byte was located
        $foundHighByte = preg_match('/(?:[^\x00-\x7F])/', $str);
        return $foundHighByte !== 1;
    }
}
32
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is removed, so multibyte
     * characters vanish completely and a pure ASCII7 string is returned.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        // no 'u' modifier on purpose: the pattern works on raw bytes
        return preg_replace('/[\x80-\xFF]/', '', (string) $str);
    }
}
52
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the byte structure is inspected: each lead byte has to be followed
     * by the number of 10bbbbbb continuation bytes it announces. Lead bytes
     * announcing up to five continuation bytes are accepted.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str
     * @return bool   true when the whole string follows the UTF-8 byte layout
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);
            $pos++;

            // 0bbbbbbb - plain ASCII, nothing has to follow
            if ($lead < 0x80) {
                continue;
            }

            if (($lead & 0xE0) == 0xC0) {        // 110bbbbb
                $follow = 1;
            } elseif (($lead & 0xF0) == 0xE0) {  // 1110bbbb
                $follow = 2;
            } elseif (($lead & 0xF8) == 0xF0) {  // 11110bbb
                $follow = 3;
            } elseif (($lead & 0xFC) == 0xF8) {  // 111110bb
                $follow = 4;
            } elseif (($lead & 0xFE) == 0xFC) {  // 1111110b
                $follow = 5;
            } else {
                return false;                    // matches no lead byte model
            }

            // the announced number of 10bbbbbb bytes has to be present
            while ($follow > 0) {
                if ($pos == $total) return false;
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
                $pos++;
                $follow--;
            }
        }
        return true;
    }
}
80
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * Every UTF-8 character consists of exactly one byte outside the range
     * 0x80-0xBF plus zero or more continuation bytes inside it, so dropping
     * the continuation bytes leaves one byte per character.
     *
     * This avoids utf8_decode(), which is deprecated as of PHP 8.2. For
     * malformed input the count is only an approximation.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        // no 'u' modifier on purpose: the pattern works on raw bytes
        return strlen(preg_replace('/[\x80-\xBF]+/', '', $string));
    }
}
97
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Returns the part of a string selected by a character offset and an
     * optional character length. With mbstring enabled the call is handed to
     * mb_substr(); otherwise the substring is cut out with a PCRE pattern.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string  $str
     * @param integer $offset number of UTF-8 characters offset (from left)
     * @param integer $length (optional) length in UTF-8 characters from offset
     * @return mixed string or false if failure
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            // the length is only passed on when the caller supplied one
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        /*
         * Notes on the fallback:
         *
         * - it relies on pcre patterns with the 'u' flag
         * - pcre only accepts repetition counts below 65536, so larger
         *   offsets/lengths are expressed as repeated groups of 65535
         *   characters plus a remainder
         * - an empty string is returned where nothing can be matched
         * - counting the characters of the string is comparatively expensive
         *   and therefore only done when needed; a positive offset without a
         *   length gets along without it
         */

        // cast once to avoid repeated notices/warnings further down
        $str    = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $chars = null;    // character count of $str, filled lazily

        // turn a negative offset into a positive one (tail anchored patterns are slow)
        if ($offset < 0) {
            $chars  = strlen(utf8_decode($str));
            $offset = max(0, $chars + $offset);
        }

        // non-capturing group that skips $offset characters
        if ($offset > 0) {
            $blocks = (int)($offset/65535);
            $rest   = $offset%65535;

            $skip = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
            $offset_pattern = '^(?:'.$skip.'.{'.$rest.'})';
        } else {
            $offset_pattern = '^';    // nothing to skip, just anchor
        }

        // capturing group for the wanted part
        $length_pattern = '';
        if ($length === null) {
            $length_pattern = '(.*)$';    // everything up to the end
        } else {
            if ($chars === null) $chars = strlen(utf8_decode($str));
            if ($offset > $chars) return '';

            if ($length > 0) {
                // never ask for more characters than are left
                $length = min($chars-$offset, $length);

                $blocks = (int)($length/65535);
                $rest   = $length%65535;

                $take = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
                $length_pattern = '('.$take.'.{'.$rest.'})';

            } elseif ($length < 0) {
                if ($length < ($offset - $chars)) return '';

                $blocks = (int)((-$length)/65535);
                $rest   = (-$length)%65535;

                // capture everything except -$length characters anchored at the end
                $drop = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
                $length_pattern = '(.*)(?:'.$drop.'.{'.$rest.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
201
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * The result is the first $start characters of $string (nothing when
     * $start is zero or negative), followed by $replacement, followed by
     * whatever utf8_substr() returns for offset $start+$length.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string
     * @param  string $replacement
     * @param  int    $start  character offset where the replacement begins
     * @param  int    $length number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
        $tail = utf8_substr($string, $start+$length);

        return $head.$replacement.$tail;
    }
}
217
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the call is passed to ltrim(). Otherwise every
     * character of $charlist is stripped from the start of $str; the list is
     * taken literally, ranges are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist characters to strip
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class; '^' is included so a
        // list starting with it does not negate (or break) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
235
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the call is passed to rtrim(). Otherwise every
     * character of $charlist is stripped from the end of $str; the list is
     * taken literally, ranges are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist characters to strip
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class; '^' is included so a
        // list starting with it does not negate (or break) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        // D: let '$' match at the very end only, not before a final newline
        return preg_replace('/['.$charlist.']+$/uD','',$str);
    }
}
253
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * An empty charlist is served by trim(); any other charlist is stripped
     * from the right end first and from the left end afterwards.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist characters to strip
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $str = utf8_rtrim($str,$charlist);
            return utf8_ltrim($str,$charlist);
        }

        return trim($str);
    }
}
268
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * mb_strtolower() does the work when mbstring is enabled; without it the
     * string is translated through the global $UTF8_UPPER_TO_LOWER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
286
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * mb_strtoupper() does the work when mbstring is enabled; without it the
     * string is translated through the global $UTF8_LOWER_TO_UPPER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
304
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if ($chars == 0) return '';
        if ($chars == 1) return utf8_strtoupper($str);

        // split into the first character and everything after it
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $first = $matches[1];
        $rest  = $matches[2];

        return utf8_strtoupper($first).$rest;
    }
}
326
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * Words are separated by form feeds, horizontal tabs, vertical tabs,
     * linefeeds, carriage returns and spaces.
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // the separator characters, written for use inside a character class
        $ws = '\x0c\x09\x0b\x0a\x0d\x20';

        // group 2: leading separators, group 3: first character of the word
        $pattern = '/(^|(['.$ws.']+))([^'.$ws.']{1})[^'.$ws.']*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $separators = $matches[2];
        $capital    = utf8_strtoupper($matches[3]);
        $word       = ltrim($matches[0]);

        // swap the first character of the word for its uppercase version
        return $separators . utf8_substr_replace($word,$capital,0,1);
    }
}
363
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @param  int    $case -1 deaccents lower case letters only, 1 upper case
     *                      letters only, 0 (default) both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
385
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched, anything else is translated
     * through the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            $string = strtr($string,$UTF8_ROMANIZATION);
        }

        return $string;
    }
}
399
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2
     * string; the bytes 0x00 to 0x19 are always stripped in addition. The
     * quoted special chars are computed once and reused by later calls.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional is inserted as given, the caller has to escape it
        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';

        return preg_replace($pattern,$repl,$string);
    }
}
425
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is located with the byte based strpos() and the byte
     * position is converted into a character position afterwards. A hit in
     * front of the requested character offset causes another search further
     * into the string.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching at
     * @return integer character position of the needle, false when not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // bytes to add to $offset for the next byte based search
        $comp = 0;

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);
            if ($pos === false) {
                return false;
            }

            // character position of the byte position just found
            $length = utf8_strlen(substr($haystack, 0, $pos));

            // found too early: remember how many bytes the characters in
            // front of the hit need beyond one byte each
            if ($length < $offset) {
                $comp = $pos - $length;
            }
        } while ($length < $offset);

        return $length;
    }
}
456
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are kept as plain characters, those below 0x100
     * become decimal entities and all others hexadecimal entities.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $html = '';

        foreach (utf8_to_unicode($str) as $codepoint) {
            if ($codepoint >= 0x100) {
                $html .= '&#x'.dechex($codepoint).';';
            } elseif ($codepoint >= 0x80) {
                $html .= '&#'.$codepoint.';';
            } else {
                $html .= chr($codepoint);
            }
        }

        return $html;
    }
}
478
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric entities (&#..; and &#x..;) are converted to
     * their code point. When $entities is anything but null, named entities
     * such as &amp; or &lt; are handed to the entity decoder as well. All
     * entities are replaced in one pass over the string, so the result of a
     * replacement is not decoded a second time, which matters for input like
     * "&amp;#38;&#38;amp;#38;".
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // one decoder instance is shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback(
                '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                array(&$decoder, 'decode'),
                $str
            );
        }

        // numeric entities only
        return preg_replace_callback(
            '/(&#([Xx])?([0-9A-Za-z]+);)/m',
            'utf8_decode_numeric',
            $str
        );
    }
}
510
if(!function_exists('utf8_decode_numeric')){
    /**
     * Callback turning one matched numeric entity into its UTF-8 character
     *
     * @param  array $ent match array; index 2 holds 'x' or 'X' for hexadecimal
     *                    entities, index 3 the digits
     * @return string UTF-8 encoded character
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');

        if ($isHex) {
            $cp = hexdec($ent[3]);
        } else {
            $cp = intval($ent[3]);
        }

        return unicode_to_utf8(array($cp));
    }
}
525
if(!class_exists('utf8_entity_decoder')){
    /**
     * Decodes numeric and named HTML entities into UTF-8 characters
     *
     * Meant to be used as callback object for preg_replace_callback(), see
     * the decode() method.
     */
    class utf8_entity_decoder {
        /** @var array named entity (e.g. '&eacute;') => UTF-8 character */
        var $table;

        /**
         * Build the lookup table for named entities
         *
         * The translation table is requested in UTF-8 explicitly, so its
         * characters can be used as they are and the result does not depend
         * on the default charset of the running PHP version.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Old style constructor, kept for code calling it by name
         *
         * PHP 8 no longer treats a method named like the class as constructor.
         */
        function utf8_entity_decoder() {
            $this->__construct();
        }

        /**
         * Convert a single byte character to UTF-8
         *
         * No longer used for building the table, kept for backward
         * compatibility.
         *
         * @param  string $c character whose first byte is taken as code point
         * @return string
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Callback decoding one matched entity
         *
         * @param  array $ent match array: index 0 is the whole entity, index 1
         *                    is '#' for numeric entities
         * @return string the decoded character, or the entity itself when it
         *                is not known
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            }
            if (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            }
            return $ent[0];
        }
    }
}
548
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false on the first
     * invalid octet (illegal lead byte, broken continuation, non-shortest
     * form, surrogate, code point above 0x10FFFF) and raises a PHP error at
     * level E_USER_WARNING. Without $strict, illegal lead and continuation
     * bytes are skipped silently and illegal code points are passed through
     * into the result.
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str    UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // Unicode character collected so far
        $mBytes = 1;     // expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the curly brace offset syntax is gone in PHP 8
            $in = ord($str[$i]);

            if ($mState == 0) {

                // When mState is zero we expect either a US-ASCII character or
                // the first octet of a multi-octet sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & $in)) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the
                // multi-octet sequence
                if (0x80 == (0xC0 & $in)) {

                    // Legal continuation: add its six payload bits
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /*
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        // reset the state for the next character
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /*
                     * ((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
722
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
     * level E_USER_WARNING. Without $strict such values are skipped.
     *
     * The string is built by plain concatenation, so no output buffer is
     * left open when the function bails out early.
     *
     * @param  array   $arr    unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                // out of range
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0x007f) {
                // ASCII range (including control chars)
                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                // 2 byte sequence
                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                // Byte order mark: nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                // 3 byte sequence
                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                // 4 byte sequence
                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
815
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when mbstring is enabled. The fallback
     * decodes the string with utf8_to_unicode() and writes each code point as
     * a big endian 16 bit unit; code points from U+10000 to U+10FFFF are
     * written as a surrogate pair instead of being cut down to 16 bits.
     *
     * @param  string  $str UTF-8 encoded string (taken by reference)
     * @param  boolean $bom prepend the big endian byte order mark?
     * @return string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0xFFFF && $cp <= 0x10FFFF){
                // split into high and low surrogate, 10 bits each
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
833
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is read as a list of big endian 16 bit units. A high
     * surrogate directly followed by a low surrogate is combined into the
     * code point it stands for; the resulting list is handed to
     * unicode_to_utf8().
     *
     * @param  string $str UTF-16BE encoded string (taken by reference)
     * @return string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return '';

        $units = array_values($units);   // unpack() numbers its result from 1
        $count = count($units);
        $uni   = array();

        for($i = 0; $i < $count; $i++){
            $cp = $units[$i];

            $isPair = $cp >= 0xD800 && $cp <= 0xDBFF
                   && $i + 1 < $count
                   && $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF;

            if($isPair){
                $cp = 0x10000 + (($cp & 0x3FF) << 10) + ($units[$i+1] & 0x3FF);
                $i++;   // the low surrogate has been consumed
            }

            $uni[] = $cp;
        }

        return unicode_to_utf8($uni);
    }
}
845
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is scanned once from left to right with an anchored match
     * at a moving offset, so the remainder is not copied for every character.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str     string to search
     * @param string $replace string to replace bad bytes with (defaults to
     *                        '', i.e. bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          // ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               // non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           // excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            // planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // plane 16
         '|(.{1}))';                              // invalid byte

        $str    = (string) $str;
        $len    = strlen($str);
        $pos    = 0;
        $result = '';

        // A: anchor each match at $pos; every byte is covered by one alternative
        while ($pos < $len && preg_match('/'.$UTF8_BAD.'/SA', $str, $matches, 0, $pos)) {
            // group 2 only takes part in the match for an invalid byte
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $pos += strlen($matches[0]);
        }

        return $result;
    }
}
887
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index in a UTF-8 string onto a character boundary
     *
     * Indexes at or below zero yield 0, indexes at or beyond the byte length
     * yield the byte length. Any other index is shifted while it points at a
     * continuation byte (bit pattern 10xxxxxx).
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction to search for boundary,
     *                           false = up (current character)
     *                           true = down (next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {
        if ($i <= 0) {
            return 0;
        }

        $length = strlen($str);
        if ($i >= $length) {
            return $length;
        }

        // +1 walks towards the next character, -1 back to the start of the current one
        $step = $next ? 1 : -1;
        while ($i > 0 && $i < $length) {
            if ((ord($str[$i]) & 0xC0) != 0x80) {
                break; // not a continuation byte: boundary found
            }
            $i += $step;
        }

        return $i;
    }
}
918
919 // only needed if no mb_string available
920 if(!UTF8_MBSTRING){
921     /**
922      * UTF-8 Case lookup table
923      *
924      * This lookup table maps lower case letters to their corresponding
925      * upper case letters in UTF-8
926      *
927      * @author Andreas Gohr <andi@splitbrain.org>
928      */
929     global $UTF8_LOWER_TO_UPPER;
930     if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
931             "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
932             "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
933             "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
934             "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
935             "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
936             "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
937             "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
938             "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
939             "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
940             "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
941             "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
942             "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
943             "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
944             "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
945             "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
946             "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
947             "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
948             "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
949             "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
950             "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
951             "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
952             "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
953             "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
954             "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
955             "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
956             "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
957             "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
958             "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
959             "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
960             "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
961             "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
962             "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
963             "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
964             "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
965             "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
966             "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
967             "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
968             "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
969             "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
970             "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
971             "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
972             "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
973             "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
974             "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
975             "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
976             "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
977             "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
978             "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
979             "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
980             "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
981             "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
982             "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
983             "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
984             "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
985             "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
986             "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
987             "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
988             "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
989             "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
990             "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
991             "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
992             "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
993             "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
994             "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
995             "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
996             "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
997             "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
998                 );
999
1000     /**
1001      * UTF-8 Case lookup table
1002      *
1003      * This lookup table maps upper case letters to their corresponding
1004      * lower case letters in UTF-8
1005      *
1006      * @author Andreas Gohr <andi@splitbrain.org>
1007      */
1008     global $UTF8_UPPER_TO_LOWER;
1009     if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1010             "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1011             "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1012             "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1013             "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1014             "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1015             "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1016             "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1017             "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1018             "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1019             "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1020             "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1021             "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1022             "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1023             "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1024             "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1025             "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1026             "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1027             "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1028             "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1029             "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1030             "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1031             "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1032             "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1033             "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1034             "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1035             "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1036             "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1037             "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1038             "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1039             "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1040             "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1041             "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1042             "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1043             "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1044             "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1045             "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1046             "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1047             "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1048             "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1049             "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1050             "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1051             "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1052             "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1053             "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1054             "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1055             "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1056             "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1057             "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1058             "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1059             "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1060             "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1061             "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1062             "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1063             "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1064             "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1065             "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1066             "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1067             "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1068             "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1069             "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1070             "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1071             "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1072             "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1073             "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1074             "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1075             "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1076             "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1077                 );
1078 }; // end of case lookup tables
1079
1080 /**
1081  * UTF-8 lookup table for lower case accented letters
1082  *
1083  * This lookup table defines ASCII-7 replacements for accented characters.
1084  * These are lower case letters only.
1085  *
1086  * @author Andreas Gohr <andi@splitbrain.org>
1087  * @see    utf8_deaccent()
1088  */
1089 global $UTF8_LOWER_ACCENTS;
1090 if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1091   'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1092   'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1093   'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1094   'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1095   'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1096   'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1097   'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1098   'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1099   'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1100   'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1101   'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1102   'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1103   'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1104   'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1105   'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1106 );
1107
1108 /**
1109  * UTF-8 lookup table for upper case accented letters
1110  *
1111  * This lookup table defines ASCII-7 replacements for accented characters.
1112  * These are upper case letters only.
1113  *
1114  * @author Andreas Gohr <andi@splitbrain.org>
1115  * @see    utf8_deaccent()
1116  */
1117 global $UTF8_UPPER_ACCENTS;
1118 if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1119   'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1120   'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1121   'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1122   'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1123   'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1124   'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1125   'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1126   'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1127   'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1128   'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1129   'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1130   'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1131   'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1132   'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1133   'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1134 );
1135
1136 /**
1137  * UTF-8 array of common special characters
1138  *
1139  * This array should contain all special characters (not a letter or digit)
1140  * defined in the various local charsets - it's not a complete list of non-alphanum
1141  * characters in UTF-8. It's not perfect but should match most cases of special
1142  * chars.
1143  *
1144  * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1145  * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
1146  *
1147  * @author Andreas Gohr <andi@splitbrain.org>
1148  * @see    utf8_stripspecials()
1149  */
1150 global $UTF8_SPECIAL_CHARS;
1151 if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
1152   0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1153   0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
1154           0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
1155   0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1156   0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1157   0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1158   0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1159   0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1160   0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1161   0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1162   0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1163   0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1164   0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1165   0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1166   0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1167   0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1168   0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1169   0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1170   0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1171   0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1172   0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1173   0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1174   0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1175   0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1176   0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1177   0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1178   0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1179   0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1180   0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1181   0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1182   0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1183   0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1184   0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1185   0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1186   0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1187   0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1188   0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1189   0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1190   0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1191   0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1192   0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1193   0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1194   0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1195   0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1196   0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1197   0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1198   0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1199   0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1200   0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1201   0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1202   0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1203   0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1204   0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1205           0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1206   0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
1207   0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1208   0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1209   0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1210   0xffeb, 0xffec, 0xffed, 0xffee,
1211   0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
1212   0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
1213   0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
1214   0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
1215   0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf,
1216 );
1217
1218 // utf8 version of above data
1219 global $UTF8_SPECIAL_CHARS2;
1220 if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1221     "\x1A".'\e\1c\1d\1e\1f !"#$%&\'()+,/;<=>?@[\]^`{|}~\7f\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f\90\91\92\93\94\95�'.
1222     '�\97\98\99\9a\9b\9c\9d\9e\9f ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1223     '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1224     '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1225     '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1226     '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1227     '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1228     '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1229     '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1230     '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1231     '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1232     '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1233     '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1234     '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1235     '➷➸➹➺➻➼➽➾'.
1236     ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1237     '�'.
1238     '�ﹼﹽ'.
1239     '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1240     '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1241     '𝛼𝛽𝛾𝛿𝜀𝜁𝜂𝜃𝜄𝜅𝜆𝜇𝜈𝜉𝜊𝜋𝜌𝜍𝜎𝜏𝜐𝜑𝜒𝜓𝜔𝜕𝜖𝜗𝜘𝜙𝜚𝜛'.
1242     '   ⁠';
1243
1244 /**
1245  * Romanization lookup table
1246  *
1247  * This lookup table provides a way to transform strings written in a language
1248  * different from the ones based upon latin letters into plain ASCII.
1249  *
1250  * Please note: this is not a scientific transliteration table. It only works
1251  * one-way from non-latin to ASCII and it works by simple character replacement
1252  * only. Specialities of each language are not supported.
1253  *
1254  * @author Andreas Gohr <andi@splitbrain.org>
1255  * @author Vitaly Blokhin <vitinfo@vitn.com>
1256  * @link   http://www.uconv.com/translit.htm
1257  * @author Bisqwit <bisqwit@iki.fi>
1258  * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1259  * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1260  * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1261  * @link   http://www.btranslations.com/resources/romanization/korean.asp
1262  * @author Arthit Suriyawongkul <arthit@gmail.com>
1263  * @author Denis Scheither <amorphis@uni-bremen.de>
1264  * @author Eivind Morland <eivind.morland@gmail.com>
1265  */
1266 global $UTF8_ROMANIZATION;
1267 if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1268   // scandinavian - differs from what we do in deaccent
1269   'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1270
1271   //russian cyrillic
1272   'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1273   'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1274   'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1275   'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1276   'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1277   'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1278   'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1279   'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1280   'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1281   // Ukrainian cyrillic
1282   'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1283   // Georgian
1284   'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1285   'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1286   'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1287   'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1288   'ჰ'=>'xh',
1289   //Sanskrit
1290   'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1291   'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1292   'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1293   'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1294   'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1295   'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1296   'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1297   //Sanskrit diacritics
1298   'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
1299   'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
1300   'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
1301   'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
1302   //Hebrew
1303   'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1304   'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1305   'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1306   'ש'=>'sh','ת'=>'t',
1307   //Arabic
1308   'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1309   'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1310   'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1311   'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1312
1313   // Japanese characters  (last update: 2008-05-09)
1314
1315   // Japanese hiragana
1316
1317   // 3 character syllables, っ doubles the consonant after
1318   'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1319   'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1320   'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1321   'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1322   // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1323   'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1324   'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1325   'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1326   'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1327   'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1328   'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1329
1330   // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1331   'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1332   'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1333
1334    // 2 character syllables - normal
1335   'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1336   'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1337   'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1338   'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1339   'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1340   'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1341   'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1342   'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1343   'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1344   'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1345   'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1346   'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1347   'うぇ'=>'we','うぃ'=>'wi',
1348   'いぇ'=>'ye',
1349
1350   // 2 character syllables, っ doubles the consonant after
1351   'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1352   'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1353   'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1354   'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1355   'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1356   'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1357   'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1358   'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1359   'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1360   'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1361   'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1362
1363   // 1 character syllables
1364   'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1365   'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1366   'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1367   'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1368   'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1369   'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1370   'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1371   'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1372   'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1373   'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1374   'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1375   'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1376   'わ'=>'wa','を'=>'wo',
1377   'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1378   'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1379   // old characters
1380   'ゑ'=>'we','ゐ'=>'wi',
1381
1382   //  convert what's left (probably only kicks in when something's missing above)
1383   // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1384   // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1385
1386   // never seen one of those (disabled for the moment)
1387   // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1388   // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1389   // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1390   // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1391   // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1392   // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1393   // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1394   // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1395   // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1396   // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1397   // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1398   // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1399   // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1400   // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1401
1402   // 'spare' characters from other romanization systems
1403   // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1404   // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1405   // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1406   // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1407   //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1408   //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1409   //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1410   //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1411   //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1412   //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1413
1414
1415   // Japanese katakana
1416
1417   // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with a macron, but we don't want that in our URLs)
1418   'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1419   'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1420   'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1421   'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1422   'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1423   'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1424   'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1425   'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1426   'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1427   'ッティー'=>'ttii',
1428   'ッヂィー'=>'ddii',
1429
1430   // 3 character syllables - doubled vowels
1431   'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1432   'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1433   'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1434   'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1435   'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1436   'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1437   'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1438   'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1439   'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1440   'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1441   'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1442   'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1443   'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1444   'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1445   'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1446   'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1447   'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1448   'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1449   'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1450   'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1451   'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1452   'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1453   'ウェー'=>'wee','ウィー'=>'wii',
1454   'イェー'=>'yee',
1455   'ティー'=>'tii',
1456   'ヂィー'=>'dii',
1457
1458   // 3 character syllables - doubled consonants
1459   'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1460   'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1461   'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1462   'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1463   'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1464   'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1465   'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1466   'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1467   'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1468   'ッティ'=>'tti',
1469   'ッヂィ'=>'ddi',
1470
1471   // 3 character syllables - doubled vowel and consonants
1472   'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1473   'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1474   'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1475   'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1476   'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1477   'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1478   'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1479   'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1480   'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1481   'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1482   'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1483
1484   // 2 character syllables - normal
1485   'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1486   // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1487   'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1488   'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1489   'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1490   'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1491   'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1492   'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1493   'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1494   'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1495   'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1496   'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1497   'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1498   'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1499   'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1500   'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1501   // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1502   'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1503   'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1504   'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1505   'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1506   'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1507   'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1508   'ウェ'=>'we','ウィ'=>'wi',
1509   'イェ'=>'ye',
1510   'ティ'=>'ti',
1511   'ヂィ'=>'di',
1512
1513   // 2 character syllables - doubled vowel
1514   'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1515   'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1516   'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1517   'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1518   'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1519   'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1520   'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1521   'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1522   'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1523   'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1524   'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1525   'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1526   'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1527   'ワー'=>'waa','ヲー'=>'woo',
1528   'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1529   'ヵー'=>'kaa','ヶー'=>'kee',
1530   // old characters
1531   'ヱー'=>'wee','ヰー'=>'wii',
1532
1533   // separate katakana 'n'
1534   'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1535   'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1536
1537   // 2 character syllables - doubled consonants
1538   'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1539   'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1540   'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1541   'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1542   'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1543   'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1544   'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1545   'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1546   'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1547   'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1548   'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1549
1550   // 1 character syllables
1551   'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1552   'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1553   'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1554   'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1555   'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1556   'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1557   'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1558   'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1559   'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1560   'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1561   'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1562   'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1563   'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1564   'ワ'=>'wa','ヲ'=>'wo',
1565   'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1566   'ヵ'=>'ka','ヶ'=>'ke',
1567   // old characters
1568   'ヱ'=>'we','ヰ'=>'wi',
1569
1570   //  convert what's left (probably only kicks in when something's missing above)
1571   'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1572   'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1573
1574   // special characters
1575   '・'=>'_','、'=>'_',
1576   'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1577
1578   // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1579   // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1580   //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1581   // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1582   // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1583   //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1584   //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1585   // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1586   // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1587   //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1588   //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1589   //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1590
1591   // "Greeklish"
1592   'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1593   'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1594
1595   // Thai
1596   'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1597   'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1598   'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1599   'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1600   'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1601   'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1602   'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1603   'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1604   'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1605   'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1606   'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1607   'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1608   'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1609   '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1610   '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1611   'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1612   '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1613   '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1614
1615   // Korean
1616   'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1617   'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1618   'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1619   'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1620   'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1621   'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1622 );
1623
1624