diff --git a/README.md b/README.md index 3426137..38a1a8a 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,28 @@ forceutf8 ========= -PHP Class Encoding featuring popular Encoding::toUTF8() function --formerly known as forceUTF8()-- that fixes mixed encoded strings. +PHP Class Encoding featuring popular \ForceUTF8\Encoding::toUTF8() function --formerly known as forceUTF8()-- that fixes mixed encoded strings. Description =========== If you apply the PHP function utf8_encode() to an already-UTF8 string it will return a garbled UTF8 string. -This class addresses this issue and provides a handy static function called Encoding::toUTF8(). +This class addresses this issue and provides a handy static function called \ForceUTF8\Encoding::toUTF8(). -You dont need to know what the encoding of your strings is. It can be Latin1 (iso 8859-1), Windows-1252 or UTF8, or the string can have a mix of them. Encoding::toUTF8() will convert everything to UTF8. +You don't need to know what the encoding of your strings is. It can be Latin1 (ISO 8859-1), Windows-1252 or UTF8, or the string can have a mix of them. \ForceUTF8\Encoding::toUTF8() will convert everything to UTF8. Sometimes you have to deal with services that are unreliable in terms of encoding, possibly mixing UTF8 and Latin1 in the same string. Update: -I've included another function, Encoding::fixUTF8(), wich will fix the double (or multiple) encoded UTF8 string that looks garbled. +I've included another function, \ForceUTF8\Encoding::fixUTF8(), which will fix the double (or multiple) encoded UTF8 string that looks garbled. 
Usage: ====== + use \ForceUTF8\Encoding; + $utf8_string = Encoding::toUTF8($utf8_or_latin1_or_mixed_string); $latin1_string = Encoding::toLatin1($utf8_or_latin1_or_mixed_string); @@ -31,10 +33,12 @@ also: Examples: - echo Encoding::fixUTF8("Fédération Camerounaise de Football"); - echo Encoding::fixUTF8("FÃédÃération Camerounaise de Football"); - echo Encoding::fixUTF8("FÃÃédÃÃération Camerounaise de Football"); - echo Encoding::fixUTF8("FÃÃÃédÃÃÃération Camerounaise de Football"); + use \ForceUTF8\Encoding; + + echo Encoding::fixUTF8("Fédération Camerounaise de Football\n"); + echo Encoding::fixUTF8("Fédération Camerounaise de Football\n"); + echo Encoding::fixUTF8("Fédération Camerounaise de Football\n"); + echo Encoding::fixUTF8("Fédération Camerounaise de Football\n"); will output: @@ -42,3 +46,58 @@ will output: Fédération Camerounaise de Football Fédération Camerounaise de Football Fédération Camerounaise de Football + +Options: +======== +By default, `Encoding::fixUTF8` will use the `Encoding::WITHOUT_ICONV` flag, signalling that iconv should not be used to fix garbled UTF8 strings. + +This class also provides options for iconv processing, such as `Encoding::ICONV_TRANSLIT` and `Encoding::ICONV_IGNORE` to enable these flags when the iconv class is utilized. The functionality of such flags are documented in the [PHP iconv documentation](http://php.net/manual/en/function.iconv.php). 
+ +Examples: + + use \ForceUTF8\Encoding; + + $str = "Fédération Camerounaise—de—Football\n"; // Uses U+2014 which is invalid ISO8859-1 but exists in Win1252 + echo Encoding::fixUTF8($str); // Will break U+2014 + echo Encoding::fixUTF8($str, Encoding::ICONV_IGNORE); // Will preserve U+2014 + echo Encoding::fixUTF8($str, Encoding::ICONV_TRANSLIT); // Will preserve U+2014 + +will output: + + Fédération Camerounaise?de?Football + Fédération Camerounaise—de—Football + Fédération Camerounaise—de—Football + +while: + + use \ForceUTF8\Encoding; + + $str = "čęėįšųūž"; // Uses several characters not present in ISO8859-1 / Win1252 + echo Encoding::fixUTF8($str); // Will break invalid characters + echo Encoding::fixUTF8($str, Encoding::ICONV_IGNORE); // Will remove invalid characters, keep those present in Win1252 + echo Encoding::fixUTF8($str, Encoding::ICONV_TRANSLIT); // Will transliterate invalid characters, keep those present in Win1252 + +will output: + + ???????? + šž + ceeišuuž + + +Install via composer: +===================== +Edit your composer.json file to include the following: + +```json +{ + "require": { + "neitanod/forceutf8": "~2.0" + } +} +``` + +Tips: +===== +You can tip me with Bitcoin if you want. 
:) + +1Awfu4TZpy99H7Pyzt1mooxU1aP2mJVdHP diff --git a/composer.json b/composer.json index 7f3fc0d..3584731 100644 --- a/composer.json +++ b/composer.json @@ -1,11 +1,18 @@ { "name": "neitanod/forceutf8", "homepage": "https://github.com/neitanod/forceutf8", + "license": "BSD-3-Clause", "type": "library", "description": "PHP Class Encoding featuring popular Encoding::toUTF8() function --formerly known as forceUTF8()-- that fixes mixed encoded strings.", "require": { - "php": ">=5.2.0" + "php": ">=5.3.0" }, + "authors": [ + { + "name": "Sebastián Grignoli", + "email": "grignoli@gmail.com" + } + ], "autoload": { "psr-0": { "ForceUTF8\\": "src/" diff --git a/resources/wallet.jpg b/resources/wallet.jpg new file mode 100644 index 0000000..ba15cbe Binary files /dev/null and b/resources/wallet.jpg differ diff --git a/src/ForceUTF8/Encoding.php b/src/ForceUTF8/Encoding.php index 219bc25..2031592 100644 --- a/src/ForceUTF8/Encoding.php +++ b/src/ForceUTF8/Encoding.php @@ -29,9 +29,9 @@ */ /** - * @author "Sebastián Grignoli" + * @author "Sebastián Grignoli" * @package Encoding - * @version 1.2 + * @version 2.0 * @link https://github.com/neitanod/forceutf8 * @example https://github.com/neitanod/forceutf8 * @license Revised BSD @@ -40,7 +40,11 @@ namespace ForceUTF8; class Encoding { - + + const ICONV_TRANSLIT = "TRANSLIT"; + const ICONV_IGNORE = "IGNORE"; + const WITHOUT_ICONV = ""; + protected static $win1252ToUtf8 = array( 128 => "\xe2\x82\xac", @@ -75,10 +79,10 @@ class Encoding { 158 => "\xc5\xbe", 159 => "\xc5\xb8" ); - + protected static $brokenUtf8ToUtf8 = array( "\xc2\x80" => "\xe2\x82\xac", - + "\xc2\x82" => "\xe2\x80\x9a", "\xc2\x83" => "\xc6\x92", "\xc2\x84" => "\xe2\x80\x9e", @@ -90,10 +94,10 @@ class Encoding { "\xc2\x8a" => "\xc5\xa0", "\xc2\x8b" => "\xe2\x80\xb9", "\xc2\x8c" => "\xc5\x92", - + "\xc2\x8e" => "\xc5\xbd", - - + + "\xc2\x91" => "\xe2\x80\x98", "\xc2\x92" => "\xe2\x80\x99", "\xc2\x93" => "\xe2\x80\x9c", @@ -106,14 +110,14 @@ class Encoding { 
"\xc2\x9a" => "\xc5\xa1", "\xc2\x9b" => "\xe2\x80\xba", "\xc2\x9c" => "\xc5\x93", - + "\xc2\x9e" => "\xc5\xbe", "\xc2\x9f" => "\xc5\xb8" ); - + protected static $utf8ToWin1252 = array( "\xe2\x82\xac" => "\x80", - + "\xe2\x80\x9a" => "\x82", "\xc6\x92" => "\x83", "\xe2\x80\x9e" => "\x84", @@ -125,10 +129,10 @@ class Encoding { "\xc5\xa0" => "\x8a", "\xe2\x80\xb9" => "\x8b", "\xc5\x92" => "\x8c", - + "\xc5\xbd" => "\x8e", - - + + "\xe2\x80\x98" => "\x91", "\xe2\x80\x99" => "\x92", "\xe2\x80\x9c" => "\x93", @@ -141,17 +145,17 @@ class Encoding { "\xc5\xa1" => "\x9a", "\xe2\x80\xba" => "\x9b", "\xc5\x93" => "\x9c", - + "\xc5\xbe" => "\x9e", "\xc5\xb8" => "\x9f" ); static function toUTF8($text){ /** - * Function Encoding::toUTF8 + * Function \ForceUTF8\Encoding::toUTF8 * * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8. - * + * * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1. * * It may fail to convert characters to UTF-8 if they fall into one of these scenarios: @@ -160,7 +164,7 @@ static function toUTF8($text){ * are followed by any of these: ("group B") * ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿ * For example: %ABREPRESENT%C9%BB. «REPRESENTÉ» - * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB) + * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB) * is also a valid unicode character, and will be left unchanged. * * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B, @@ -179,123 +183,134 @@ static function toUTF8($text){ $text[$k] = self::toUTF8($v); } return $text; - } elseif(is_string($text)) { - - $max = strlen($text); - $buf = ""; - for($i = 0; $i < $max; $i++){ - $c1 = $text{$i}; - if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already - $c2 = $i+1 >= $max? "\x00" : $text{$i+1}; - $c3 = $i+2 >= $max? "\x00" : $text{$i+2}; - $c4 = $i+3 >= $max? 
"\x00" : $text{$i+3}; - if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2; - $i++; - } else { //not valid UTF8. Convert it. - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = ($c1 & "\x3f") | "\x80"; - $buf .= $cc1 . $cc2; - } - } elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2 . $c3; - $i = $i + 2; - } else { //not valid UTF8. Convert it. - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = ($c1 & "\x3f") | "\x80"; - $buf .= $cc1 . $cc2; - } - } elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2 . $c3; - $i = $i + 2; - } else { //not valid UTF8. Convert it. - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = ($c1 & "\x3f") | "\x80"; - $buf .= $cc1 . $cc2; - } - } else { //doesn't look like UTF8, but should be converted - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = (($c1 & "\x3f") | "\x80"); - $buf .= $cc1 . $cc2; - } - } elseif(($c1 & "\xc0") == "\x80"){ // needs conversion - if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases - $buf .= self::$win1252ToUtf8[ord($c1)]; - } else { - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = (($c1 & "\x3f") | "\x80"); - $buf .= $cc1 . $cc2; - } - } else { // it doesn't need convesion - $buf .= $c1; - } - } - return $buf; - } else { + } + + if(!is_string($text)) { return $text; } + + $max = self::strlen($text); + + $buf = ""; + for($i = 0; $i < $max; $i++){ + $c1 = $text[$i]; + if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already + $c2 = $i+1 >= $max? "\x00" : $text[$i+1]; + $c3 = $i+2 >= $max? "\x00" : $text[$i+2]; + $c4 = $i+3 >= $max? 
"\x00" : $text[$i+3]; + if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8 + if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2; + $i++; + } else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8 + if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2 . $c3; + $i = $i + 2; + } else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8 + if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2 . $c3 . $c4; + $i = $i + 3; + } else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } else { //doesn't look like UTF8, but should be converted + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = (($c1 & "\x3f") | "\x80"); + $buf .= $cc1 . $cc2; + } + } elseif(($c1 & "\xc0") === "\x80"){ // needs conversion + if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases + $buf .= self::$win1252ToUtf8[ord($c1)]; + } else { + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = (($c1 & "\x3f") | "\x80"); + $buf .= $cc1 . 
$cc2; + } + } else { // it doesn't need conversion + $buf .= $c1; + } + } + return $buf; } - static function toWin1252($text) { + static function toWin1252($text, $option = self::WITHOUT_ICONV) { if(is_array($text)) { foreach($text as $k => $v) { - $text[$k] = self::toWin1252($v); + $text[$k] = self::toWin1252($v, $option); } return $text; } elseif(is_string($text)) { - return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text))); + return static::utf8_decode($text, $option); } else { return $text; } } - static function toISO8859($text) { - return self::toWin1252($text); + static function toISO8859($text, $option = self::WITHOUT_ICONV) { + return self::toWin1252($text, $option); } - static function toLatin1($text) { - return self::toWin1252($text); + static function toLatin1($text, $option = self::WITHOUT_ICONV) { + return self::toWin1252($text, $option); } - static function fixUTF8($text){ + static function fixUTF8($text, $option = self::WITHOUT_ICONV){ if(is_array($text)) { foreach($text as $k => $v) { - $text[$k] = self::fixUTF8($v); + $text[$k] = self::fixUTF8($v, $option); } return $text; } + if(!is_string($text)) { + return $text; + } + $last = ""; while($last <> $text){ $last = $text; - $text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text))); + $text = self::toUTF8(static::utf8_decode($text, $option)); } - $text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text))); + $text = self::toUTF8(static::utf8_decode($text, $option)); return $text; } - + static function UTF8FixWin1252Chars($text){ - // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1 + // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1 // (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it. 
// See: http://en.wikipedia.org/wiki/Windows-1252 - + return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text); } - + static function removeBOM($str=""){ - if(substr($str, 0,3) == pack("CCC",0xef,0xbb,0xbf)) { + if(substr($str, 0,3) === pack("CCC",0xef,0xbb,0xbf)) { $str=substr($str, 3); } return $str; } - + + protected static function strlen($text){ + return (function_exists('mb_strlen') && ((int) ini_get('mbstring.func_overload')) & 2) ? + mb_strlen($text,'8bit') : strlen($text); + } + public static function normalizeEncoding($encodingLabel) { $encoding = strtoupper($encodingLabel); - $enc = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding); + $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding); $equivalences = array( 'ISO88591' => 'ISO-8859-1', 'ISO8859' => 'ISO-8859-1', @@ -307,19 +322,30 @@ public static function normalizeEncoding($encodingLabel) 'WIN1252' => 'ISO-8859-1', 'WINDOWS1252' => 'ISO-8859-1' ); - + if(empty($equivalences[$encoding])){ return 'UTF-8'; } - + return $equivalences[$encoding]; } public static function encode($encodingLabel, $text) { $encodingLabel = self::normalizeEncoding($encodingLabel); - if($encodingLabel == 'UTF-8') return Encoding::toUTF8($text); - if($encodingLabel == 'ISO-8859-1') return Encoding::toLatin1($text); + if($encodingLabel === 'ISO-8859-1') return self::toLatin1($text); + return self::toUTF8($text); } + protected static function utf8_decode($text, $option = self::WITHOUT_ICONV) + { + if ($option == self::WITHOUT_ICONV || !function_exists('iconv')) { + $o = utf8_decode( + str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)) + ); + } else { + $o = iconv("UTF-8", "Windows-1252" . ($option === self::ICONV_TRANSLIT ? '//TRANSLIT' : ($option === self::ICONV_IGNORE ? 
'//IGNORE' : '')), $text); + } + return $o; + } } diff --git a/test/ForceUTF8Test.php b/test/ForceUTF8Test.php new file mode 100644 index 0000000..02ec687 --- /dev/null +++ b/test/ForceUTF8Test.php @@ -0,0 +1,101 @@ + FAILED\n"; + static::$failed++; + } + + private static function passed($test_name){ + static::character("."); + static::$passed++; + } + + private static function character($char){ + echo $char; + static::$last_echoed = 'char'; + } + + private static function line($msg){ + if(static::$last_echoed == 'char') echo "\n"; + echo $msg."\n"; + static::$last_echoed = 'line'; + } + } + diff --git a/test/data/russian.txt b/test/data/russian.txt new file mode 100644 index 0000000..1c618ad --- /dev/null +++ b/test/data/russian.txt @@ -0,0 +1 @@ +hello žš, привет diff --git a/test/data/test1.txt b/test/data/test1.txt new file mode 100644 index 0000000..771829e --- /dev/null +++ b/test/data/test1.txt @@ -0,0 +1 @@ +Hírek diff --git a/test/data/test1Latin.txt b/test/data/test1Latin.txt new file mode 100644 index 0000000..0aa69d6 --- /dev/null +++ b/test/data/test1Latin.txt @@ -0,0 +1 @@ +Hrek