Merged updates from DokuWiki 38
[sudaraka-org:dokuwiki-mods.git] / inc / utf8.php
1 <?php
2 /**
3  * UTF8 helper functions
4  *
5  * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6  * @author     Andreas Gohr <andi@splitbrain.org>
7  */
8
/**
 * Decide whether the mbstring extension is used by the helpers below
 *
 * UTF8_MBSTRING is left alone when it was defined earlier. Otherwise it
 * becomes 1 when mb_substr() exists and UTF8_NOMBSTRING is not defined,
 * and 0 in every other case.
 */
if(!defined('UTF8_MBSTRING')){
    define(
        'UTF8_MBSTRING',
        (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
    );
}

// make the mb_* functions work on UTF-8 whenever they are going to be used
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
21
if(!function_exists('utf8_isASCII')){
    /**
     * Tells whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str the string to inspect
     * @return bool   false when at least one byte outside 0x00-0x7F was found
     */
    function utf8_isASCII($str){
        // the pattern looks for a single byte that is NOT plain ASCII
        $foundHighByte = preg_match('/(?:[^\x00-\x7F])/', $str);
        return ($foundHighByte !== 1);
    }
}
32
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is dropped, so the result is
     * a pure ASCII7 string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str the string to clean
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        $ascii = '';
        $len = strlen($str);
        for($i=0; $i<$len; $i++){
            // square brackets are used on purpose: the curly brace offset
            // syntax is deprecated since PHP 7.4 and no longer parses on PHP 8
            if(ord($str[$i]) < 128){
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }
}
52
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Walks over the bytes and checks that every lead byte is followed by
     * the number of continuation bytes (10bbbbbb) its bit pattern announces.
     * Lead bytes announcing up to five continuation bytes are accepted.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str the string to check
     * @return bool   true when all sequences are structurally complete
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            if ($lead < 0x80) {
                // 0bbbbbbb - single byte character
                $pos++;
                continue;
            }

            // how many continuation bytes does the lead byte ask for?
            if (($lead & 0xE0) == 0xC0) {
                $expected = 1; // 110bbbbb
            } elseif (($lead & 0xF0) == 0xE0) {
                $expected = 2; // 1110bbbb
            } elseif (($lead & 0xF8) == 0xF0) {
                $expected = 3; // 11110bbb
            } elseif (($lead & 0xFC) == 0xF8) {
                $expected = 4; // 111110bb
            } elseif (($lead & 0xFE) == 0xFC) {
                $expected = 5; // 1111110b
            } else {
                return false;  // matches no known lead byte pattern
            }

            // each of the following bytes has to look like 10bbbbbb
            while ($expected-- > 0) {
                $pos++;
                if ($pos == $total) return false;
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
            }

            $pos++;
        }

        return true;
    }
}
80
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both forward and backward slashes are treated as directory
     * separators. The suffix is compared byte for byte.
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $path = trim($path,'\\/');

        // position of the last separator; after the trim above a separator
        // can never sit at index 0, so 0 safely stands for "none found"
        $slash     = strrpos($path, '/');
        $backslash = strrpos($path, '\\');
        $rpos = max(
            ($slash === false) ? 0 : $slash,
            ($backslash === false) ? 0 : $backslash
        );
        if($rpos) $path = substr($path, $rpos+1);

        $suflen = strlen($suffix);
        // strict comparison: a loose == compares numeric looking strings by
        // value, e.g. '100' == '1e2', and would cut off a non-matching suffix
        if($suflen && (substr($path, -$suflen) === $suffix)){
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }
}
106
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * The string is run through utf8_decode() first. Characters that do
     * not exist in ISO-8859-1 come out as a single '?', so counting the
     * bytes of the decoded string yields the number of characters.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        $singleByte = utf8_decode($string);
        return strlen($singleByte);
    }
}
123
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * With mbstring in use (UTF8_MBSTRING) the work is handed to mb_substr().
     * Otherwise a PCRE pattern with the 'u' flag is assembled: a non-captured
     * group skips $offset characters and a captured group takes the wanted
     * part. Counts are split into chunks of 65535 characters, as the original
     * notes state that PCRE only supports repetitions below 65536.
     *
     * Like mb_substr() an empty string is returned where substr() might
     * return false (e.g. offset beyond the end of the string).
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return mixed string or false if failure
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        // normalise the parameter types once, up front
        $str    = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // nothing to extract at all
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        // counting characters is comparatively expensive, so it is only
        // done when a negative offset or an explicit length requires it
        $charCount = null;

        // turn a negative offset into the equivalent positive one
        if ($offset < 0) {
            $charCount = strlen(utf8_decode($str));
            $offset = $charCount + $offset;
            if ($offset < 0) $offset = 0;
        }

        // part 1: skip $offset characters without capturing them
        $skipPattern = '^';
        if ($offset > 0) {
            $chunks = (int)($offset/65535);
            $rest   = $offset%65535;

            $repeat = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';
            $skipPattern = '^(?:'.$repeat.'.{'.$rest.'})';
        }

        // part 2: capture the wanted characters
        $takePattern = '';
        if ($length === null) {
            // everything up to the end of the string
            $takePattern = '(.*)$';
        } else {
            if ($charCount === null) $charCount = strlen(utf8_decode($str));
            if ($offset > $charCount) return '';

            if ($length > 0) {
                // never ask for more characters than are left
                $length = min($charCount-$offset, $length);

                $chunks = (int)($length/65535);
                $rest   = $length%65535;

                $repeat = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';
                $takePattern = '('.$repeat.'.{'.$rest.'})';

            } else if ($length < 0) {
                if ($length < ($offset - $charCount)) return '';

                $omit   = -$length;
                $chunks = (int)($omit/65535);
                $rest   = $omit%65535;

                // capture everything except the last -$length characters,
                // which are matched by a group anchored at the string end
                $repeat = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';
                $takePattern = '(.*)(?:'.$repeat.'.{'.$rest.'})$';
            }
        }

        if (!preg_match('#'.$skipPattern.$takePattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
227
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * The result is put together from the characters in front of $start,
     * the replacement and the characters from $start+$length onwards.
     * With the default length of 0 the replacement is simply inserted.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      UTF-8 encoded string to work on
     * @param  string $replacement text to put in
     * @param  int    $start       character offset where the replacement begins
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        // only a positive start leaves something in front of the replacement
        $before = ($start > 0) ? utf8_substr($string, 0, $start) : '';
        $after  = utf8_substr($string, $start+$length);

        return $before . $replacement . $after;
    }
}
243
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the native ltrim() is used. Otherwise every
     * character of the charlist is taken literally (no ranges) and all
     * leading occurrences of those characters are removed.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        //quote charlist for use in a characterclass
        //the caret has to be quoted as well: as first char of the list it
        //would otherwise negate the class or make the pattern invalid
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
263
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the native rtrim() is used. Otherwise every
     * character of the charlist is taken literally (no ranges) and all
     * trailing occurrences of those characters are removed.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        //quote charlist for use in a characterclass
        //the caret has to be quoted as well: as first char of the list it
        //would otherwise negate the class or make the pattern invalid
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        //the D modifier makes $ match at the very end only; without it a
        //final newline would be skipped and the chars in front of it removed
        return preg_replace('/['.$charlist.']+$/uD','',$str);
    }
}
283
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * An empty charlist falls back to the native trim(). Any other
     * charlist is applied to the right side first, then to the left side.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $rightTrimmed = utf8_rtrim($str,$charlist);
            return utf8_ltrim($rightTrimmed,$charlist);
        }

        return trim($str);
    }
}
300
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * mb_strtolower() does the work when mbstring is in use. Without it
     * the global $UTF8_UPPER_TO_LOWER translation table is applied.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
318
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * mb_strtoupper() does the work when mbstring is in use. Without it
     * the global $UTF8_LOWER_TO_UPPER translation table is applied.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
336
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str UTF-8 encoded string
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if ($chars == 0) return '';
        if ($chars == 1) return utf8_strtoupper($str);

        // split into the first character and everything after it
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $first = $matches[1];
        $tail  = $matches[2];

        return utf8_strtoupper($first).$tail;
    }
}
358
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param  string $str UTF-8 encoded string
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches form feeds, horizontal
        // tabs, vertical tabs, linefeeds, carriage returns and spaces.
        // A word is a run of characters that are none of these.
        // group 2 = whitespace in front of the word, group 3 = its first char
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];

        // Cut the remainder of the word by the byte lengths of the captured
        // groups. ltrim() must not be used for this: its default character
        // set lacks the form feed the pattern treats as whitespace and
        // contains the NUL byte the pattern treats as part of a word.
        $rest = (string)substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
395
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS
     * and $UTF8_UPPER_ACCENTS.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @param  int    $case   -1 deaccents lower case letters only, 1 upper
     *                        case letters only, 0 (default) both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
417
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is handed back untouched, everything else is
     * translated with the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        //nothing to do
        return $string;
    }
}
431
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The character class used for the replacement is assembled from the
     * $additional chars, the byte range 0x00 to 0x19 and the quoted content
     * of the global $UTF8_SPECIAL_CHARS2. The quoted specials are computed
     * on the first call only and kept in a static variable afterwards.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        $charclass = '['.$additional.'\x00-\x19'.$specials.']';
        return preg_replace('/'.$charclass.'/u',$repl,$string);
    }
}
457
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched with the byte based strpos(). The byte position
     * found is converted into a character position by counting the characters
     * in front of it. A hit in front of the wanted character offset is
     * discarded and the search is repeated further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack string to search in
     * @param  string  $needle   string to search for
     * @param  integer $offset   character offset to start searching at
     * @return integer|false character position of the needle, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // number of bytes the search start has to be moved further right
        $shift = 0;

        do {
            $bytePos = strpos($haystack, $needle, $offset + $shift);
            if ($bytePos === false) {
                return false;
            }

            // character position = number of characters in front of the hit
            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            if ($charPos < $offset) {
                $shift = $bytePos - $charPos;
            }
        } while ($charPos < $offset);

        return $charPos;
    }
}
488
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are kept as plain characters, code points
     * below 0x100 become decimal entities and everything else becomes a
     * hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $pieces = array();

        foreach (utf8_to_unicode($str) as $codepoint) {
            if ($codepoint >= 0x100) {
                $pieces[] = '&#x'.dechex($codepoint).';';
            } elseif ($codepoint >= 0x80) {
                $pieces[] = '&#'.$codepoint.';';
            } else {
                $pieces[] = chr($codepoint);
            }
        }

        return implode('', $pieces);
    }
}
510
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default ($entities is null) only numeric entities (&#..; and
     * &#x..;) are converted, using utf8_decode_numeric(). When any other
     * value is passed, e.g. HTML_ENTITIES, named entities such as &amp; or
     * &lt; are handled in the same pass by a utf8_entity_decoder instance.
     * Doing both in one pass avoids decoding the result of a previous
     * decoding step a second time.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // the decoder is created once and shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback(
                '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                array(&$decoder, 'decode'),
                $str
            );
        }

        // numeric entities only
        return preg_replace_callback(
            '/(&#([Xx])?([0-9A-Za-z]+);)/m',
            'utf8_decode_numeric',
            $str
        );
    }
}
542
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects the match array produced by the patterns in utf8_unhtml():
     * index 2 holds 'x' or 'X' for hexadecimal entities, index 3 the digits.
     *
     * @param  array $ent match of a numeric entity
     * @return string
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');

        if ($isHex) {
            $codepoint = hexdec($ent[3]);
        } else {
            $codepoint = intval($ent[3]);
        }

        return unicode_to_utf8(array($codepoint));
    }
}
563
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps named entities (e.g. '&auml;') to their UTF-8 character */
        public $table;

        /**
         * Initializes the decoding tables
         *
         * Since PHP 5.4 get_html_translation_table() returns UTF-8 encoded
         * characters by default. Running those through ord() only looks at
         * their first byte and yields wrong characters. Where the encoding
         * parameter is available (PHP 5.3.4+) the table is therefore
         * requested in UTF-8 and used as is. Older versions return single
         * byte characters which are converted through makeutf8().
         */
        public function __construct() {
            if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
                $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
                $this->table = array_flip($table);
            } else {
                $table = get_html_translation_table(HTML_ENTITIES);
                $table = array_flip($table);
                $this->table = array_map(array(&$this,'makeutf8'), $table);
            }
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Converts a single byte character to UTF-8, using its byte value
         * as code point.
         *
         * @param $c string
         * @return mixed
         */
        public function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * Expects the match array of the pattern used in utf8_unhtml():
         * index 0 is the full entity, index 1 is '#' for numeric entities.
         * Entities that are not known are returned unchanged.
         *
         * @param $ent array match of an entity
         * @return string
         */
        public function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
607
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence (illegal lead byte, broken
     * continuation, non-shortest form, surrogate, value above 0x10FFFF)
     * and raises a PHP error at level E_USER_WARNING.
     *
     * Without $strict nothing is reported: bytes that fit nowhere are
     * skipped and completed sequences are emitted as decoded, even if
     * they are not legal.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets are used on purpose: the curly brace offset
            // syntax is deprecated since PHP 7.4 and no longer parses on PHP 8
            $in = ord($str[$i]);

            if ($mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & $in)) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }
                // not strict: the illegal octet is silently skipped

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & $in)) {

                    // Legal continuation: add its 6 payload bits at the right position
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
                // not strict: the unexpected octet is skipped, the sequence stays open
            }
        }
        return $out;
    }
}
781
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING.
     * Without $strict such values are skipped silently.
     *
     * The result is built by plain string concatenation. No output buffer
     * is involved, so bailing out early in strict mode cannot leave a
     * buffer open.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                # negative or beyond the last code point: out of range
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)
                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence
                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence
                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (everything up to 0x10ffff)
                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
874
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * With mb_string the conversion is delegated to mb_convert_encoding().
     * Without it the string is decoded with utf8_to_unicode() and every code
     * point is packed as a big endian 16 bit unit; code points above U+FFFF
     * are written as a surrogate pair.
     *
     * @param string $str  UTF-8 string (taken by reference, not modified)
     * @param bool   $bom  prepend the big endian byte order mark (FE FF)?
     * @return string      UTF-16BE encoded bytes
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        // nothing to append when decoding did not hand back a list of code points
        if(!is_array($uni)) return $out;

        foreach($uni as $cp){
            if($cp > 0xFFFF){
                // supplementary plane: split into high + low surrogate,
                // a single pack('n') would silently truncate to 16 bit
                $cp  -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
892
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big endian 16 bit units. A high surrogate that is
     * directly followed by a low surrogate is combined into the code point it
     * encodes; everything else is passed on unchanged. The resulting code
     * points are then encoded by unicode_to_utf8() in non-strict mode.
     *
     * @param string $str  UTF-16BE encoded bytes (taken by reference, not modified)
     * @return string      UTF-8 string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return '';

        $uni  = array();
        $high = null; // pending high surrogate waiting for its partner
        foreach($units as $unit){
            if($high !== null){
                if($unit >= 0xDC00 && $unit <= 0xDFFF){
                    // valid pair -> one supplementary plane code point
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = null;
                    continue;
                }
                // unpaired high surrogate: hand it on as is
                $uni[] = $high;
                $high  = null;
            }

            if($unit >= 0xD800 && $unit <= 0xDBFF){
                $high = $unit;
            }else{
                $uni[] = $unit;
            }
        }
        // input ended right after a high surrogate
        if($high !== null) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
904
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is consumed from left to right: each step matches either one
     * well-formed UTF-8 sequence (copied to the result) or, failing that, one
     * single byte (swapped for $replace).
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e.
     *                        bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        $result = '';
        $len    = strlen($str);
        $pos    = 0;

        // Walk through the string with a byte offset instead of chopping the
        // head off with substr() on every round, which copied the remaining
        // string once per character.
        while ($pos < $len && preg_match('/'.$UTF8_BAD.'/S', $str, $matches, 0, $pos)) {
            if ( !isset($matches[2])) {
                // one of the well-formed alternatives matched
                $result .= $matches[0];
            } else {
                // only the catch-all group matched: a bad byte
                $result .= $replace;
            }
            $pos += strlen($matches[0]);
        }

        return $result;
    }
}
946
if(!function_exists('utf8_correctIdx')){
    /**
     * Snap a byte index onto a UTF-8 character boundary
     *
     * Bytes of the form 10xxxxxx are continuation bytes. The index is moved
     * off such bytes until it rests on a byte that is not a continuation byte
     * or until it reaches either end of the string.
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction to search for boundary,
     *                           false = backwards (start of the current character)
     *                           true  = forwards (start of the next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        // clamp indexes that lie outside of the string
        if ($i <= 0) return 0;

        $size = strlen($str);
        if ($i >= $size) return $size;

        if ($next) {
            // walk forwards past continuation bytes
            for (; $i < $size; $i++) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
            return $i;
        }

        // walk backwards to the lead byte, stopping at index 0 at the latest
        for (; $i; $i--) {
            if ((ord($str[$i]) & 0xC0) != 0x80) break;
        }
        return $i;
    }
}
977
978 // only needed if no mb_string available
979 if(!UTF8_MBSTRING){
980     /**
981      * UTF-8 Case lookup table
982      *
     * This lookup table maps lower case letters to their corresponding
     * upper case letters in UTF-8
985      *
986      * @author Andreas Gohr <andi@splitbrain.org>
987      */
988     global $UTF8_LOWER_TO_UPPER;
989     if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
990             "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
991             "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
992             "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
993             "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
994             "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
995             "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
996             "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
997             "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
998             "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
999             "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
1000             "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1001             "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1002             "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1003             "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1004             "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1005             "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1006             "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1007             "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1008             "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1009             "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1010             "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1011             "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1012             "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1013             "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1014             "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1015             "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1016             "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1017             "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1018             "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1019             "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1020             "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1021             "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1022             "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1023             "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1024             "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1025             "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1026             "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1027             "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1028             "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1029             "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1030             "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1031             "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1032             "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1033             "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1034             "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1035             "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1036             "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1037             "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1038             "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1039             "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1040             "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1041             "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1042             "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1043             "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1044             "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1045             "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1046             "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1047             "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1048             "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1049             "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1050             "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1051             "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1052             "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1053             "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1054             "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1055             "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1056             "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1057                 );
1058
1059     /**
1060      * UTF-8 Case lookup table
1061      *
     * This lookup table maps upper case letters to their corresponding
     * lower case letters in UTF-8
1064      *
1065      * @author Andreas Gohr <andi@splitbrain.org>
1066      */
1067     global $UTF8_UPPER_TO_LOWER;
1068     if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1069             "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1070             "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1071             "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1072             "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1073             "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1074             "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1075             "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1076             "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1077             "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1078             "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1079             "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1080             "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1081             "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1082             "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1083             "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1084             "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1085             "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1086             "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1087             "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1088             "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1089             "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1090             "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1091             "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1092             "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1093             "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1094             "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1095             "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1096             "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1097             "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1098             "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1099             "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1100             "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1101             "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1102             "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1103             "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1104             "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1105             "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1106             "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1107             "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1108             "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1109             "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1110             "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1111             "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1112             "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1113             "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1114             "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1115             "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1116             "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1117             "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1118             "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1119             "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1120             "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1121             "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1122             "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1123             "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1124             "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1125             "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1126             "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1127             "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1128             "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1129             "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1130             "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1131             "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1132             "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1133             "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1134             "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1135             "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1136                 );
1137 }; // end of case lookup tables
1138
1139 /**
1140  * UTF-8 lookup table for lower case accented letters
1141  *
 * This lookup table maps accented characters to replacements from the ASCII-7
 * range. Only lower case letters are covered.
1144  *
1145  * @author Andreas Gohr <andi@splitbrain.org>
1146  * @see    utf8_deaccent()
1147  */
1148 global $UTF8_LOWER_ACCENTS;
1149 if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1150   'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1151   'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1152   'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1153   'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1154   'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1155   'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1156   'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1157   'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1158   'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1159   'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1160   'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1161   'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1162   'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1163   'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1164   'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1165 );
1166
1167 /**
1168  * UTF-8 lookup table for upper case accented letters
1169  *
 * This lookup table maps accented characters to replacements from the ASCII-7
 * range. Only upper case letters are covered.
1172  *
1173  * @author Andreas Gohr <andi@splitbrain.org>
1174  * @see    utf8_deaccent()
1175  */
1176 global $UTF8_UPPER_ACCENTS;
1177 if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1178   'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1179   'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1180   'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1181   'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1182   'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1183   'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1184   'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1185   'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1186   'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1187   'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1188   'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1189   'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1190   'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1191   'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1192   'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1193 );
1194
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode code point (not a UTF-8 byte sequence).
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // figure space, narrow no-break space, word joiner, zero width no-break
  // space (BOM) as code points; the no-break space 0x00a0 is already listed
  // above. This row used to hold the UTF-8 byte sequences of these characters
  // (0xc2a0, 0xe28087, ...), which are not the code points they stand for.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1276
1277 // utf8 version of above data
1278 global $UTF8_SPECIAL_CHARS2;
1279 if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1280     "\x1A".'\e\1c\1d\1e\1f !"#$%&\'()+,/;<=>?@[\]^`{|}~\7f\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f\90\91\92\93\94\95�'.
1281     '�\97\98\99\9a\9b\9c\9d\9e\9f ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1282     '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1283     '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1284     '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1285     '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1286     '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1287     '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1288     '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1289     '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1290     '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1291     '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1292     '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1293     '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1294     '➷➸➹➺➻➼➽➾'.
1295     ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1296     '�'.
1297     '�ﹼﹽ'.
1298     '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1299     '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1300     '𝛼𝛽𝛾𝛿𝜀𝜁𝜂𝜃𝜄𝜅𝜆𝜇𝜈𝜉𝜊𝜋𝜌𝜍𝜎𝜏𝜐𝜑𝜒𝜓𝜔𝜕𝜖𝜗𝜘𝜙𝜚𝜛'.
1301     '   ⁠';
1302
1303 /**
1304  * Romanization lookup table
1305  *
 * This lookup table provides a way to transform strings written in a language
 * that is not based on latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
1312  *
1313  * @author Andreas Gohr <andi@splitbrain.org>
1314  * @author Vitaly Blokhin <vitinfo@vitn.com>
1315  * @link   http://www.uconv.com/translit.htm
1316  * @author Bisqwit <bisqwit@iki.fi>
1317  * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1318  * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1319  * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1320  * @link   http://www.btranslations.com/resources/romanization/korean.asp
1321  * @author Arthit Suriyawongkul <arthit@gmail.com>
1322  * @author Denis Scheither <amorphis@uni-bremen.de>
1323  * @author Eivind Morland <eivind.morland@gmail.com>
1324  */
1325 global $UTF8_ROMANIZATION;
1326 if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1327   // scandinavian - differs from what we do in deaccent
1328   'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1329
1330   //russian cyrillic
1331   'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1332   'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1333   'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1334   'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1335   'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1336   'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1337   'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1338   'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1339   'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1340   // Ukrainian cyrillic
1341   'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1342   // Georgian
1343   'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1344   'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1345   'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1346   'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1347   'ჰ'=>'xh',
1348   //Sanskrit
1349   'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1350   'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1351   'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1352   'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1353   'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1354   'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1355   'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1356   //Sanskrit diacritics
1357   'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
1358   'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
1359   'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
1360   'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
1361   //Hebrew
1362   'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1363   'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1364   'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1365   'ש'=>'sh','ת'=>'t',
1366   //Arabic
1367   'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1368   'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1369   'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1370   'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1371
1372   // Japanese characters  (last update: 2008-05-09)
1373
1374   // Japanese hiragana
1375
1376   // 3 character syllables, っ doubles the consonant after
1377   'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1378   'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1379   'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1380   'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1381   // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1382   'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1383   'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1384   'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1385   'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1386   'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1387   'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1388
1389   // seperate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1390   'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1391   'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1392
1393    // 2 character syllables - normal
1394   'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1395   'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1396   'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1397   'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1398   'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1399   'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1400   'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1401   'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1402   'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1403   'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1404   'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1405   'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1406   'うぇ'=>'we','うぃ'=>'wi',
1407   'いぇ'=>'ye',
1408
1409   // 2 character syllables, っ doubles the consonant after
1410   'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1411   'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1412   'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1413   'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1414   'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1415   'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1416   'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1417   'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1418   'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1419   'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1420   'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1421
1422   // 1 character syllabels
1423   'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1424   'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1425   'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1426   'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1427   'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1428   'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1429   'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1430   'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1431   'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1432   'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1433   'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1434   'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1435   'わ'=>'wa','を'=>'wo',
1436   'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1437   'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1438   // old characters
1439   'ゑ'=>'we','ゐ'=>'wi',
1440
1441   //  convert what's left (probably only kicks in when something's missing above)
1442   // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1443   // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1444
1445   // never seen one of those (disabled for the moment)
1446   // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1447   // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1448   // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1449   // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1450   // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1451   // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1452   // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1453   // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1454   // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1455   // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1456   // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1457   // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1458   // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1459   // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1460
1461   // 'spare' characters from other romanization systems
1462   // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1463   // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1464   // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1465   // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1466   //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1467   //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1468   //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1469   //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1470   //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1471   //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1472
1473
1474   // Japanese katakana
1475
1476   // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usualy written with macron, but we don't want that in our URLs)
1477   'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1478   'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1479   'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1480   'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1481   'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1482   'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1483   'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1484   'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1485   'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1486   'ッティー'=>'ttii',
1487   'ッヂィー'=>'ddii',
1488
1489   // 3 character syllables - doubled vowels
1490   'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1491   'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1492   'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1493   'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1494   'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1495   'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1496   'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1497   'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1498   'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1499   'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1500   'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1501   'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1502   'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1503   'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1504   'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1505   'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1506   'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1507   'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1508   'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1509   'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1510   'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1511   'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1512   'ウェー'=>'wee','ウィー'=>'wii',
1513   'イェー'=>'yee',
1514   'ティー'=>'tii',
1515   'ヂィー'=>'dii',
1516
1517   // 3 character syllables - doubled consonants
1518   'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1519   'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1520   'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1521   'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1522   'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1523   'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1524   'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1525   'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1526   'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1527   'ッティ'=>'tti',
1528   'ッヂィ'=>'ddi',
1529
1530   // 3 character syllables - doubled vowel and consonants
1531   'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1532   'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1533   'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1534   'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1535   'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1536   'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1537   'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1538   'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1539   'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1540   'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1541   'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1542
1543   // 2 character syllables - normal
1544   'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1545   // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1546   'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1547   'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1548   'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1549   'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1550   'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1551   'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1552   'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1553   'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1554   'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1555   'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1556   'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1557   'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1558   'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1559   'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1560   // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1561   'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1562   'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1563   'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1564   'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1565   'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1566   'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1567   'ウェ'=>'we','ウィ'=>'wi',
1568   'イェ'=>'ye',
1569   'ティ'=>'ti',
1570   'ヂィ'=>'di',
1571
1572   // 2 character syllables - doubled vocal
1573   'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1574   'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1575   'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1576   'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1577   'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1578   'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1579   'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1580   'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1581   'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1582   'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1583   'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1584   'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1585   'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1586   'ワー'=>'waa','ヲー'=>'woo',
1587   'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1588   'ヵー'=>'kaa','ヶー'=>'kee',
1589   // old characters
1590   'ヱー'=>'wee','ヰー'=>'wii',
1591
1592   // seperate katakana 'n'
1593   'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1594   'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1595
1596   // 2 character syllables - doubled consonants
1597   'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1598   'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1599   'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1600   'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1601   'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1602   'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1603   'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1604   'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1605   'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1606   'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1607   'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1608
1609   // 1 character syllables
1610   'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1611   'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1612   'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1613   'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1614   'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1615   'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1616   'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1617   'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1618   'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1619   'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1620   'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1621   'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1622   'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1623   'ワ'=>'wa','ヲ'=>'wo',
1624   'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1625   'ヵ'=>'ka','ヶ'=>'ke',
1626   // old characters
1627   'ヱ'=>'we','ヰ'=>'wi',
1628
1629   //  convert what's left (probably only kicks in when something's missing above)
1630   'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1631   'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1632
1633   // special characters
1634   '・'=>'_','、'=>'_',
1635   'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1636
1637   // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1638   // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1639   //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1640   // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1641   // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1642   //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1643   //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1644   // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1645   // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1646   //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1647   //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1648   //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1649
1650   // "Greeklish"
1651   'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1652   'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1653
1654   // Thai
1655   'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1656   'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1657   'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1658   'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1659   'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1660   'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1661   'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1662   'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1663   'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1664   'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1665   'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1666   'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1667   'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1668   '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1669   '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1670   'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1671   '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1672   '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1673
1674   // Korean
1675   'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1676   'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1677   'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1678   'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1679   'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1680   'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1681 );
1682
1683