diff --git a/README.md b/README.md
index 3426137..38a1a8a 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,28 @@
forceutf8
=========
-PHP Class Encoding featuring popular Encoding::toUTF8() function --formerly known as forceUTF8()-- that fixes mixed encoded strings.
+PHP Class Encoding featuring popular \ForceUTF8\Encoding::toUTF8() function --formerly known as forceUTF8()-- that fixes mixed encoded strings.
Description
===========
If you apply the PHP function utf8_encode() to an already-UTF8 string it will return a garbled UTF8 string.
-This class addresses this issue and provides a handy static function called Encoding::toUTF8().
+This class addresses this issue and provides a handy static function called \ForceUTF8\Encoding::toUTF8().
-You dont need to know what the encoding of your strings is. It can be Latin1 (iso 8859-1), Windows-1252 or UTF8, or the string can have a mix of them. Encoding::toUTF8() will convert everything to UTF8.
+You don't need to know what the encoding of your strings is. It can be Latin1 (ISO 8859-1), Windows-1252 or UTF8, or the string can have a mix of them. \ForceUTF8\Encoding::toUTF8() will convert everything to UTF8.
Sometimes you have to deal with services that are unreliable in terms of encoding, possibly mixing UTF8 and Latin1 in the same string.
Update:
-I've included another function, Encoding::fixUTF8(), wich will fix the double (or multiple) encoded UTF8 string that looks garbled.
+I've included another function, \ForceUTF8\Encoding::fixUTF8(), which will fix the double (or multiple) encoded UTF8 string that looks garbled.
Usage:
======
+ use \ForceUTF8\Encoding;
+
$utf8_string = Encoding::toUTF8($utf8_or_latin1_or_mixed_string);
$latin1_string = Encoding::toLatin1($utf8_or_latin1_or_mixed_string);
@@ -31,10 +33,12 @@ also:
Examples:
- echo Encoding::fixUTF8("Fédération Camerounaise de Football");
- echo Encoding::fixUTF8("FÃédÃération Camerounaise de Football");
- echo Encoding::fixUTF8("FÃÃédÃÃération Camerounaise de Football");
- echo Encoding::fixUTF8("FÃÃÃédÃÃÃération Camerounaise de Football");
+ use \ForceUTF8\Encoding;
+
+ echo Encoding::fixUTF8("Fédération Camerounaise de Football\n");
+ echo Encoding::fixUTF8("FÃ©dÃ©ration Camerounaise de Football\n");
+ echo Encoding::fixUTF8("FÃƒÂ©dÃƒÂ©ration Camerounaise de Football\n");
+ echo Encoding::fixUTF8("FÃƒÆ’Ã‚Â©dÃƒÆ’Ã‚Â©ration Camerounaise de Football\n");
will output:
@@ -42,3 +46,58 @@ will output:
Fédération Camerounaise de Football
Fédération Camerounaise de Football
Fédération Camerounaise de Football
+
+Options:
+========
+By default, `Encoding::fixUTF8` will use the `Encoding::WITHOUT_ICONV` flag, signalling that iconv should not be used to fix garbled UTF8 strings.
+
+This class also provides options for iconv processing, such as `Encoding::ICONV_TRANSLIT` and `Encoding::ICONV_IGNORE`, which enable the corresponding flags when iconv is used. The behaviour of these flags is documented in the [PHP iconv documentation](http://php.net/manual/en/function.iconv.php).
+
+Examples:
+
+ use \ForceUTF8\Encoding;
+
+ $str = "Fédération Camerounaise—de—Football\n"; // Uses U+2014 which is invalid ISO8859-1 but exists in Win1252
+ echo Encoding::fixUTF8($str); // Will break U+2014
+ echo Encoding::fixUTF8($str, Encoding::ICONV_IGNORE); // Will preserve U+2014
+ echo Encoding::fixUTF8($str, Encoding::ICONV_TRANSLIT); // Will preserve U+2014
+
+will output:
+
+ Fédération Camerounaise?de?Football
+ Fédération Camerounaise—de—Football
+ Fédération Camerounaise—de—Football
+
+while:
+
+ use \ForceUTF8\Encoding;
+
+ $str = "čęėįšųūž"; // Uses several characters not present in ISO8859-1 / Win1252
+ echo Encoding::fixUTF8($str); // Will break invalid characters
+ echo Encoding::fixUTF8($str, Encoding::ICONV_IGNORE); // Will remove invalid characters, keep those present in Win1252
+ echo Encoding::fixUTF8($str, Encoding::ICONV_TRANSLIT); // Will transliterate invalid characters, keep those present in Win1252
+
+will output:
+
+ ????????
+ šž
+ ceeišuuž
+
+
+Install via composer:
+=====================
+Edit your composer.json file to include the following:
+
+```json
+{
+ "require": {
+ "neitanod/forceutf8": "~2.0"
+ }
+}
+```
+
+Tips:
+=====
+You can tip me with Bitcoin if you want. :)
+
+
diff --git a/composer.json b/composer.json
index 7f3fc0d..3584731 100644
--- a/composer.json
+++ b/composer.json
@@ -1,11 +1,18 @@
{
"name": "neitanod/forceutf8",
"homepage": "https://github.com/neitanod/forceutf8",
+ "license": "BSD-3-Clause",
"type": "library",
"description": "PHP Class Encoding featuring popular Encoding::toUTF8() function --formerly known as forceUTF8()-- that fixes mixed encoded strings.",
"require": {
- "php": ">=5.2.0"
+ "php": ">=5.3.0"
},
+ "authors": [
+ {
+ "name": "Sebastián Grignoli",
+ "email": "grignoli@gmail.com"
+ }
+ ],
"autoload": {
"psr-0": {
"ForceUTF8\\": "src/"
diff --git a/resources/wallet.jpg b/resources/wallet.jpg
new file mode 100644
index 0000000..ba15cbe
Binary files /dev/null and b/resources/wallet.jpg differ
diff --git a/src/ForceUTF8/Encoding.php b/src/ForceUTF8/Encoding.php
index 219bc25..2031592 100644
--- a/src/ForceUTF8/Encoding.php
+++ b/src/ForceUTF8/Encoding.php
@@ -29,9 +29,9 @@
*/
/**
- * @author "Sebastián Grignoli"
+ * @author "Sebastián Grignoli"
* @package Encoding
- * @version 1.2
+ * @version 2.0
* @link https://github.com/neitanod/forceutf8
* @example https://github.com/neitanod/forceutf8
* @license Revised BSD
@@ -40,7 +40,11 @@
namespace ForceUTF8;
class Encoding {
-
+
+ const ICONV_TRANSLIT = "TRANSLIT";
+ const ICONV_IGNORE = "IGNORE";
+ const WITHOUT_ICONV = "";
+
protected static $win1252ToUtf8 = array(
128 => "\xe2\x82\xac",
@@ -75,10 +79,10 @@ class Encoding {
158 => "\xc5\xbe",
159 => "\xc5\xb8"
);
-
+
protected static $brokenUtf8ToUtf8 = array(
"\xc2\x80" => "\xe2\x82\xac",
-
+
"\xc2\x82" => "\xe2\x80\x9a",
"\xc2\x83" => "\xc6\x92",
"\xc2\x84" => "\xe2\x80\x9e",
@@ -90,10 +94,10 @@ class Encoding {
"\xc2\x8a" => "\xc5\xa0",
"\xc2\x8b" => "\xe2\x80\xb9",
"\xc2\x8c" => "\xc5\x92",
-
+
"\xc2\x8e" => "\xc5\xbd",
-
-
+
+
"\xc2\x91" => "\xe2\x80\x98",
"\xc2\x92" => "\xe2\x80\x99",
"\xc2\x93" => "\xe2\x80\x9c",
@@ -106,14 +110,14 @@ class Encoding {
"\xc2\x9a" => "\xc5\xa1",
"\xc2\x9b" => "\xe2\x80\xba",
"\xc2\x9c" => "\xc5\x93",
-
+
"\xc2\x9e" => "\xc5\xbe",
"\xc2\x9f" => "\xc5\xb8"
);
-
+
protected static $utf8ToWin1252 = array(
"\xe2\x82\xac" => "\x80",
-
+
"\xe2\x80\x9a" => "\x82",
"\xc6\x92" => "\x83",
"\xe2\x80\x9e" => "\x84",
@@ -125,10 +129,10 @@ class Encoding {
"\xc5\xa0" => "\x8a",
"\xe2\x80\xb9" => "\x8b",
"\xc5\x92" => "\x8c",
-
+
"\xc5\xbd" => "\x8e",
-
-
+
+
"\xe2\x80\x98" => "\x91",
"\xe2\x80\x99" => "\x92",
"\xe2\x80\x9c" => "\x93",
@@ -141,17 +145,17 @@ class Encoding {
"\xc5\xa1" => "\x9a",
"\xe2\x80\xba" => "\x9b",
"\xc5\x93" => "\x9c",
-
+
"\xc5\xbe" => "\x9e",
"\xc5\xb8" => "\x9f"
);
static function toUTF8($text){
/**
- * Function Encoding::toUTF8
+ * Function \ForceUTF8\Encoding::toUTF8
*
* This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
- *
+ *
* It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
*
* It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
@@ -160,7 +164,7 @@ static function toUTF8($text){
* are followed by any of these: ("group B")
* ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
* For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
- * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
+ * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
* is also a valid unicode character, and will be left unchanged.
*
* 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,
@@ -179,123 +183,134 @@ static function toUTF8($text){
$text[$k] = self::toUTF8($v);
}
return $text;
- } elseif(is_string($text)) {
-
- $max = strlen($text);
- $buf = "";
- for($i = 0; $i < $max; $i++){
- $c1 = $text{$i};
- if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already
- $c2 = $i+1 >= $max? "\x00" : $text{$i+1};
- $c3 = $i+2 >= $max? "\x00" : $text{$i+2};
- $c4 = $i+3 >= $max? "\x00" : $text{$i+3};
- if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8
- if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already
- $buf .= $c1 . $c2;
- $i++;
- } else { //not valid UTF8. Convert it.
- $cc1 = (chr(ord($c1) / 64) | "\xc0");
- $cc2 = ($c1 & "\x3f") | "\x80";
- $buf .= $cc1 . $cc2;
- }
- } elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8
- if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already
- $buf .= $c1 . $c2 . $c3;
- $i = $i + 2;
- } else { //not valid UTF8. Convert it.
- $cc1 = (chr(ord($c1) / 64) | "\xc0");
- $cc2 = ($c1 & "\x3f") | "\x80";
- $buf .= $cc1 . $cc2;
- }
- } elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8
- if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already
- $buf .= $c1 . $c2 . $c3;
- $i = $i + 2;
- } else { //not valid UTF8. Convert it.
- $cc1 = (chr(ord($c1) / 64) | "\xc0");
- $cc2 = ($c1 & "\x3f") | "\x80";
- $buf .= $cc1 . $cc2;
- }
- } else { //doesn't look like UTF8, but should be converted
- $cc1 = (chr(ord($c1) / 64) | "\xc0");
- $cc2 = (($c1 & "\x3f") | "\x80");
- $buf .= $cc1 . $cc2;
- }
- } elseif(($c1 & "\xc0") == "\x80"){ // needs conversion
- if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases
- $buf .= self::$win1252ToUtf8[ord($c1)];
- } else {
- $cc1 = (chr(ord($c1) / 64) | "\xc0");
- $cc2 = (($c1 & "\x3f") | "\x80");
- $buf .= $cc1 . $cc2;
- }
- } else { // it doesn't need convesion
- $buf .= $c1;
- }
- }
- return $buf;
- } else {
+ }
+
+ if(!is_string($text)) {
return $text;
}
+
+ $max = self::strlen($text);
+
+ $buf = "";
+ for($i = 0; $i < $max; $i++){
+ $c1 = $text[$i];
+ if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already
+ $c2 = $i+1 >= $max? "\x00" : $text[$i+1];
+ $c3 = $i+2 >= $max? "\x00" : $text[$i+2];
+ $c4 = $i+3 >= $max? "\x00" : $text[$i+3];
+ if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8
+ if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already
+ $buf .= $c1 . $c2;
+ $i++;
+ } else { //not valid UTF8. Convert it.
+ $cc1 = (chr(ord($c1) / 64) | "\xc0");
+ $cc2 = ($c1 & "\x3f") | "\x80";
+ $buf .= $cc1 . $cc2;
+ }
+ } elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8
+ if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already
+ $buf .= $c1 . $c2 . $c3;
+ $i = $i + 2;
+ } else { //not valid UTF8. Convert it.
+ $cc1 = (chr(ord($c1) / 64) | "\xc0");
+ $cc2 = ($c1 & "\x3f") | "\x80";
+ $buf .= $cc1 . $cc2;
+ }
+ } elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8
+ if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already
+ $buf .= $c1 . $c2 . $c3 . $c4;
+ $i = $i + 3;
+ } else { //not valid UTF8. Convert it.
+ $cc1 = (chr(ord($c1) / 64) | "\xc0");
+ $cc2 = ($c1 & "\x3f") | "\x80";
+ $buf .= $cc1 . $cc2;
+ }
+ } else { //doesn't look like UTF8, but should be converted
+ $cc1 = (chr(ord($c1) / 64) | "\xc0");
+ $cc2 = (($c1 & "\x3f") | "\x80");
+ $buf .= $cc1 . $cc2;
+ }
+ } elseif(($c1 & "\xc0") === "\x80"){ // needs conversion
+ if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases
+ $buf .= self::$win1252ToUtf8[ord($c1)];
+ } else {
+ $cc1 = (chr(ord($c1) / 64) | "\xc0");
+ $cc2 = (($c1 & "\x3f") | "\x80");
+ $buf .= $cc1 . $cc2;
+ }
+ } else { // it doesn't need conversion
+ $buf .= $c1;
+ }
+ }
+ return $buf;
}
- static function toWin1252($text) {
+ static function toWin1252($text, $option = self::WITHOUT_ICONV) {
if(is_array($text)) {
foreach($text as $k => $v) {
- $text[$k] = self::toWin1252($v);
+ $text[$k] = self::toWin1252($v, $option);
}
return $text;
} elseif(is_string($text)) {
- return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)));
+ return static::utf8_decode($text, $option);
} else {
return $text;
}
}
- static function toISO8859($text) {
- return self::toWin1252($text);
+ static function toISO8859($text, $option = self::WITHOUT_ICONV) {
+ return self::toWin1252($text, $option);
}
- static function toLatin1($text) {
- return self::toWin1252($text);
+ static function toLatin1($text, $option = self::WITHOUT_ICONV) {
+ return self::toWin1252($text, $option);
}
- static function fixUTF8($text){
+ static function fixUTF8($text, $option = self::WITHOUT_ICONV){
if(is_array($text)) {
foreach($text as $k => $v) {
- $text[$k] = self::fixUTF8($v);
+ $text[$k] = self::fixUTF8($v, $option);
}
return $text;
}
+ if(!is_string($text)) {
+ return $text;
+ }
+
$last = "";
while($last <> $text){
$last = $text;
- $text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
+ $text = self::toUTF8(static::utf8_decode($text, $option));
}
- $text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
+ $text = self::toUTF8(static::utf8_decode($text, $option));
return $text;
}
-
+
static function UTF8FixWin1252Chars($text){
- // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
+ // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
// (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
// See: http://en.wikipedia.org/wiki/Windows-1252
-
+
return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
}
-
+
static function removeBOM($str=""){
- if(substr($str, 0,3) == pack("CCC",0xef,0xbb,0xbf)) {
+ if(substr($str, 0,3) === pack("CCC",0xef,0xbb,0xbf)) {
$str=substr($str, 3);
}
return $str;
}
-
+
+ /**
+  * Byte length of $text.
+  *
+  * When mb_strlen() exists and bit 2 of the "mbstring.func_overload" ini
+  * setting is set, the length is taken with mb_strlen() in '8bit' mode;
+  * otherwise plain strlen() is used.
+  */
+ protected static function strlen($text){
+   $overloadFlags = (int) ini_get('mbstring.func_overload');
+   if(function_exists('mb_strlen') && ($overloadFlags & 2)){
+     return mb_strlen($text,'8bit');
+   }
+   return strlen($text);
+ }
+
public static function normalizeEncoding($encodingLabel)
{
$encoding = strtoupper($encodingLabel);
- $enc = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
+ $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
$equivalences = array(
'ISO88591' => 'ISO-8859-1',
'ISO8859' => 'ISO-8859-1',
@@ -307,19 +322,30 @@ public static function normalizeEncoding($encodingLabel)
'WIN1252' => 'ISO-8859-1',
'WINDOWS1252' => 'ISO-8859-1'
);
-
+
if(empty($equivalences[$encoding])){
return 'UTF-8';
}
-
+
return $equivalences[$encoding];
}
public static function encode($encodingLabel, $text)
{
$encodingLabel = self::normalizeEncoding($encodingLabel);
- if($encodingLabel == 'UTF-8') return Encoding::toUTF8($text);
- if($encodingLabel == 'ISO-8859-1') return Encoding::toLatin1($text);
+ if($encodingLabel === 'ISO-8859-1') return self::toLatin1($text);
+ return self::toUTF8($text);
}
+ /**
+  * Convert UTF-8 text to Windows-1252 bytes, optionally through iconv.
+  *
+  * The input is normalised with self::toUTF8() on both paths, so the iconv
+  * path accepts the same Latin1 / Windows-1252 / mixed input as the built-in
+  * path instead of failing on bytes that are not valid UTF-8.
+  *
+  * @param string $text   Text to convert.
+  * @param string $option self::WITHOUT_ICONV (default) for the built-in path;
+  *                       self::ICONV_TRANSLIT or self::ICONV_IGNORE to use
+  *                       iconv with the "//TRANSLIT" / "//IGNORE" suffix.
+  * @return string Converted text.
+  */
+ protected static function utf8_decode($text, $option = self::WITHOUT_ICONV)
+ {
+   $utf8 = self::toUTF8($text);
+
+   // Loose comparison kept on purpose: null/false select the built-in path, as before.
+   if ($option != self::WITHOUT_ICONV && function_exists('iconv')) {
+     $suffix = '';
+     if ($option === self::ICONV_TRANSLIT) {
+       $suffix = '//TRANSLIT';
+     } elseif ($option === self::ICONV_IGNORE) {
+       $suffix = '//IGNORE';
+     }
+
+     $converted = iconv("UTF-8", "Windows-1252" . $suffix, $utf8);
+     if ($converted !== false) {
+       return $converted;
+     }
+     // iconv() signals failure with false; fall through to the built-in
+     // conversion rather than handing false back to callers.
+   }
+
+   // NOTE(review): utf8_decode() is deprecated in recent PHP releases — confirm
+   // the supported PHP range before relying on it long term.
+   return utf8_decode(
+     str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $utf8)
+   );
+ }
}
diff --git a/test/ForceUTF8Test.php b/test/ForceUTF8Test.php
new file mode 100644
index 0000000..02ec687
--- /dev/null
+++ b/test/ForceUTF8Test.php
@@ -0,0 +1,101 @@
+ FAILED\n";
+ static::$failed++;
+ }
+
+ /**
+  * Record a passing test: bump the pass counter and print a "." marker.
+  * $test_name is accepted but not used by this method.
+  */
+ private static function passed($test_name){
+   ++static::$passed;
+   static::character(".");
+ }
+
+ /**
+  * Print $char with no trailing newline and remember that the last thing
+  * echoed was a single character.
+  */
+ private static function character($char){
+   static::$last_echoed = 'char';
+   echo $char;
+ }
+
+ /**
+  * Print $msg on a line of its own. If the previous output was a single
+  * character, a newline is emitted first so the message starts a fresh line.
+  */
+ private static function line($msg){
+   $afterCharacter = (static::$last_echoed == 'char');
+   if($afterCharacter){
+     echo "\n";
+   }
+   echo $msg."\n";
+   static::$last_echoed = 'line';
+ }
+ }
+
diff --git a/test/data/russian.txt b/test/data/russian.txt
new file mode 100644
index 0000000..1c618ad
--- /dev/null
+++ b/test/data/russian.txt
@@ -0,0 +1 @@
+hello žš, привет
diff --git a/test/data/test1.txt b/test/data/test1.txt
new file mode 100644
index 0000000..771829e
--- /dev/null
+++ b/test/data/test1.txt
@@ -0,0 +1 @@
+Hírek
diff --git a/test/data/test1Latin.txt b/test/data/test1Latin.txt
new file mode 100644
index 0000000..0aa69d6
--- /dev/null
+++ b/test/data/test1Latin.txt
@@ -0,0 +1 @@
+Hrek