Rev 17 | Only display areas with differences | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 17 | Rev 52 | ||
---|---|---|---|
1 | <?php |
1 | <?php |
2 | 2 | ||
3 | /* |
3 | /* |
4 | * XML Encoding Utilities |
4 | * XML Encoding Utilities |
5 | * Copyright 2011-2021 Daniel Marschall, ViaThinkSoft |
5 | * Copyright 2011-2021 Daniel Marschall, ViaThinkSoft |
6 | * Version 1.8 (2021-11-24) |
6 | * Version 1.8 (2021-11-24) |
7 | * |
7 | * |
8 | * Licensed under the Apache License, Version 2.0 (the "License"); |
8 | * Licensed under the Apache License, Version 2.0 (the "License"); |
9 | * you may not use this file except in compliance with the License. |
9 | * you may not use this file except in compliance with the License. |
10 | * You may obtain a copy of the License at |
10 | * You may obtain a copy of the License at |
11 | * |
11 | * |
12 | * http://www.apache.org/licenses/LICENSE-2.0 |
12 | * http://www.apache.org/licenses/LICENSE-2.0 |
13 | * |
13 | * |
14 | * Unless required by applicable law or agreed to in writing, software |
14 | * Unless required by applicable law or agreed to in writing, software |
15 | * distributed under the License is distributed on an "AS IS" BASIS, |
15 | * distributed under the License is distributed on an "AS IS" BASIS, |
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
17 | * See the License for the specific language governing permissions and |
17 | * See the License for the specific language governing permissions and |
18 | * limitations under the License. |
18 | * limitations under the License. |
19 | */ |
19 | */ |
20 | 20 | ||
21 | // http://www.viathinksoft.de/?page=codelib&showid=89 |
21 | // http://www.viathinksoft.de/?page=codelib&showid=89 |
22 | 22 | ||
// Unicode-proof htmlentities.
// Returns 'normal' chars as chars and weirdos as numeric html entities.
// Source: http://www.php.net/manual/en/function.htmlentities.php#107985 ; modified
// Modified by Daniel Marschall, ViaThinkSoft
/**
 * Encodes a string so that only plain printable ASCII remains; everything
 * else becomes a decimal numeric entity (&#NNN;).
 *
 * Encoded are: multi-byte characters, control characters (< 32), DEL and
 * above (> 126), the characters " # $ % & ' (34..39) and < = > (60..62).
 *
 * @param string $str               Input text. If mb_detect_encoding() does not
 *                                  report UTF-8, it is passed through
 *                                  mb_convert_encoding($str, 'UTF-8').
 * @param bool   $allow_html        If true, the markup characters < > = " ' &
 *                                  are left as literal characters.
 * @param bool   $encode_linebreaks If false, LF/CR are not emitted as entities:
 *                                  they become "<br />" when $allow_html is
 *                                  true, otherwise they stay "\n" / "\r".
 * @return string The encoded string ('' if the input cannot be split into
 *                UTF-8 characters).
 */
function htmlentities_numeric($str, $allow_html=false, $encode_linebreaks=false) {
    // Convert $str to UTF-8 if it is not already
    if (mb_detect_encoding($str, "auto", true) != 'UTF-8') {
        $str = mb_convert_encoding($str, 'UTF-8');
    }

    // Existing entities are intentionally NOT decoded beforehand
    // (DM 24.08.2016: removed because of OIDplus 1.0 XML export).

    // Split into single UTF-8 characters. An empty pattern with
    // PREG_SPLIT_NO_EMPTY is used instead of look-arounds on ^ and $,
    // because "$" also matches in front of a trailing newline, which glued
    // the last character and the newline into one element.
    $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    if ($chars === false) {
        return '';
    }

    // Entities which are turned back into literal markup if HTML is allowed
    $markup = array(
        '&#60;' => '<',
        '&#62;' => '>',
        '&#61;' => '=',
        '&#34;' => '"',
        '&#39;' => '\'',
        '&#38;' => '&', // DM 24.08.2016 Re-added because OIDplus 1.0 XML export
    );

    // Replacement for line break entities if they shall not stay encoded
    $linebreaks = $allow_html
        ? array('&#10;' => "<br />", '&#13;' => "<br />")
        : array('&#10;' => "\n", '&#13;' => "\r");

    // Whole Unicode range, so that characters above U+FFFF are encoded, too
    $convmap = array(0x0, 0x10ffff, 0, 0x1fffff);

    $str2 = '';
    foreach ($chars as $c) {
        $o = ord($c);
        if (
            (strlen($c) > 1) ||      /* multi-byte [unicode] */
            ($o < 32 || $o > 126) || /* <- control / latin weirdos -> */
            ($o > 33 && $o < 40) ||  /* quotes + ampersand */
            ($o > 59 && $o < 63)     /* html */
        ) {
            // convert to numeric entity
            $c = mb_encode_numericentity($c, $convmap, 'UTF-8');

            if ($allow_html && isset($markup[$c])) {
                $c = $markup[$c];
            } else if (!$encode_linebreaks && isset($linebreaks[$c])) {
                $c = $linebreaks[$c];
            }
        }
        $str2 .= $c;
    }
    return $str2;
}
75 | 75 | ||
/**
 * Returns the Unicode code point of the UTF-8 character that starts at byte
 * offset $index of $c.
 *
 * http://de.php.net/manual/en/function.ord.php#78032 ; modified
 *
 * @param string $c      UTF-8 encoded string.
 * @param int    $index  Byte offset of the first byte of the character.
 * @param int    $bytes  (out) Number of bytes the character occupies;
 *                       0 if no character could be decoded.
 * @return int|false The code point, or false if $index is beyond the string
 *                   or the bytes at $index are not a well-formed UTF-8
 *                   sequence (truncated, bad continuation byte, overlong
 *                   form, surrogate, or above U+10FFFF).
 */
function ordUTF8($c, $index = 0, &$bytes = null) {
    $len = strlen($c);
    $bytes = 0;

    if ($index >= $len) {
        return false;
    }

    $lead = ord($c[$index]);

    // Plain ASCII
    if ($lead <= 0x7F) {
        $bytes = 1;
        return $lead;
    }

    // 0x80..0xBF are continuation bytes, 0xC0/0xC1 can only start overlong
    // forms, and 0xF5..0xFF would encode values beyond U+10FFFF.
    if ($lead < 0xC2 || $lead > 0xF4) {
        return false;
    }

    // Sequence length and the smallest code point that legitimately needs it
    if ($lead <= 0xDF) {
        $need = 2;
        $min = 0x80;
    } else if ($lead <= 0xEF) {
        $need = 3;
        $min = 0x800;
    } else {
        $need = 4;
        $min = 0x10000;
    }

    if ($index + $need > $len) {
        return false; // truncated sequence
    }

    // Payload bits of the lead byte: 5, 4 or 3 bits for 2, 3 or 4 byte forms
    $code = $lead & (0xFF >> ($need + 1));
    for ($i = 1; $i < $need; $i++) {
        $trail = ord($c[$index + $i]);
        if (($trail & 0xC0) !== 0x80) {
            return false; // not a continuation byte (10xxxxxx)
        }
        $code = ($code << 6) | ($trail & 0x3F);
    }

    if ($code < $min || $code > 0x10FFFF || ($code >= 0xD800 && $code <= 0xDFFF)) {
        return false; // overlong form, out of range, or UTF-16 surrogate
    }

    $bytes = $need;
    return $code;
}
109 | 109 | ||
/**
 * Converts a UTF-16 string that starts with a byte order mark to UTF-8.
 *
 * http://betamode.de/2008/09/08/php-utf-16-zu-utf-8-konvertieren/
 * http://www.moddular.org/log/utf16-to-utf8
 *
 * Strings without a UTF-16 BOM (FE FF = big endian, FF FE = little endian)
 * are returned unchanged. The BOM itself is not part of the result.
 *
 * Surrogate pairs are combined into a single 4-byte UTF-8 sequence. An
 * unpaired surrogate or a dangling trailing byte is emitted as U+FFFD.
 * As in the original implementation, U+0000 is written as the two bytes
 * C0 80, so the result never contains a NUL byte.
 *
 * @param string $str Input bytes.
 * @return string UTF-8 text, or $str itself if no BOM is present.
 */
function utf16_to_utf8($str) {
    if (strlen($str) < 2) {
        return $str; // too short to carry a BOM
    }

    $bom = substr($str, 0, 2);
    if ($bom === "\xFE\xFF") {
        $be = true;
    } else if ($bom === "\xFF\xFE") {
        $be = false;
    } else {
        return $str;
    }

    $str = (string)substr($str, 2);
    $len = strlen($str);
    if ($len === 0) {
        return '';
    }

    // Decode all complete 16 bit code units at once
    // ('n' = big endian, 'v' = little endian); a dangling byte is ignored here.
    $units = unpack($be ? 'n*' : 'v*', $str);
    $units = ($units === false) ? array() : array_values($units);
    $count = count($units);

    $dec = '';
    for ($i = 0; $i < $count; $i++) {
        $c = $units[$i];

        if ($c >= 0xD800 && $c <= 0xDBFF) {
            // High surrogate: must be followed by a low surrogate
            if ($i + 1 < $count && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF) {
                $c = 0x10000 + (($c - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++;
            } else {
                $c = 0xFFFD;
            }
        } else if ($c >= 0xDC00 && $c <= 0xDFFF) {
            $c = 0xFFFD; // low surrogate without preceding high surrogate
        }

        if ($c >= 0x0001 && $c <= 0x007F) {
            $dec .= chr($c);
        } else if ($c > 0xFFFF) {
            $dec .= chr(0xF0 | (($c >> 18) & 0x07));
            $dec .= chr(0x80 | (($c >> 12) & 0x3F));
            $dec .= chr(0x80 | (($c >>  6) & 0x3F));
            $dec .= chr(0x80 | (($c >>  0) & 0x3F));
        } else if ($c > 0x07FF) {
            $dec .= chr(0xE0 | (($c >> 12) & 0x0F));
            $dec .= chr(0x80 | (($c >>  6) & 0x3F));
            $dec .= chr(0x80 | (($c >>  0) & 0x3F));
        } else {
            // U+0080..U+07FF, and U+0000 (deliberately written as C0 80)
            $dec .= chr(0xC0 | (($c >>  6) & 0x1F));
            $dec .= chr(0x80 | (($c >>  0) & 0x3F));
        }
    }

    if ($len % 2 !== 0) {
        $dec .= "\xEF\xBF\xBD"; // incomplete code unit at the end of the input
    }

    return $dec;
}
141 | } |
142 | 142 | ||
/**
 * Replaces named HTML entities by their characters (via decodeNamedEntities)
 * and then writes every non-ASCII character as a hexadecimal numeric entity
 * (via mb_htmlentities).
 *
 * @param string $str Input text. If it is not valid UTF-8, it is treated as
 *                    ISO-8859-1 and converted to UTF-8 first.
 * @return string The text with numeric instead of named entities.
 */
function html_named_to_numeric_entities($str) {
    // Convert only if required: an unconditional conversion from the internal
    // encoding would damage input which is already UTF-8 whenever the internal
    // encoding is something else. The explicit ISO-8859-1 source does what
    // utf8_encode() did, without using that deprecated function.
    if (!mb_detect_encoding($str, 'UTF-8', true)) {
        $str = mb_convert_encoding($str, 'UTF-8', 'ISO-8859-1');
    }
    return mb_htmlentities(decodeNamedEntities($str));
}
147 | 147 | ||
if (!function_exists('decodeNamedEntities')) {
    /**
     * Replaces the named HTML 4.01 entities which XML 1.0 does not define
     * (e.g. &nbsp; or &auml;) by their UTF-8 characters.
     *
     * Entities which are also present in the XML 1 translation table
     * (&amp; &lt; &gt; &quot;) are left untouched.
     *
     * https://stackoverflow.com/questions/20406599/how-to-encode-for-entity-igrave-not-defined-error-in-xml-feed
     *
     * @param string $string Text which may contain named HTML entities.
     * @return string The text with those entities replaced.
     */
    function decodeNamedEntities($string) {
        // Map "entity => character"; built once and kept for later calls
        static $entities = NULL;
        if (NULL === $entities) {
            $entities = array_flip(
                array_diff(
                    get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8'),
                    get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_XML1, 'UTF-8')
                )
            );
        }
        // strtr() replaces all pairs in one pass over the string
        return strtr($string, $entities);
    }
}
163 | 163 | ||
if (!function_exists('mb_convert_encoding')) {
    // https://riptutorial.com/php/example/15633/converting-unicode-characters-to-their-numeric-value-and-or-html-entities-using-php
    /**
     * Minimal replacement for mb_convert_encoding(), based on iconv(); only
     * defined if the native function is not available.
     *
     * @param string      $str           Text to convert.
     * @param string      $to_encoding   Target encoding.
     * @param string|null $from_encoding Source encoding. If NULL, the internal
     *                                   encoding is used.
     * @return string|false Result of iconv().
     */
    function mb_convert_encoding($str, $to_encoding, $from_encoding = NULL) {
        if ($from_encoding === NULL) {
            if (function_exists('mb_internal_encoding')) {
                $from_encoding = mb_internal_encoding();
            } else {
                // mb_internal_encoding() is usually missing as well when this
                // replacement is active, so use the configured default charset.
                $from_encoding = ini_get('default_charset');
                if (!is_string($from_encoding) || $from_encoding === '') {
                    $from_encoding = 'UTF-8';
                }
            }
        }
        return iconv($from_encoding, $to_encoding, $str);
    }
}
170 | 170 | ||
if (!function_exists('mb_ord')) {
    // https://riptutorial.com/php/example/15633/converting-unicode-characters-to-their-numeric-value-and-or-html-entities-using-php
    /**
     * Replacement for mb_ord(); only defined if the native function is not
     * available. Returns the code point of the first character of $char.
     *
     * @param string $char     Text in the given encoding.
     * @param string $encoding Encoding of $char. 'UCS-4BE' input is read
     *                         directly; a 2 or 3 byte 'UCS-4BE' input is read
     *                         as one 16 bit big endian value, as before.
     * @return int|false Code point, or false if nothing can be decoded.
     */
    function mb_ord($char, $encoding = 'UTF-8') {
        if ($encoding !== 'UCS-4BE') {
            $char = mb_convert_encoding($char, 'UCS-4BE', $encoding);
        }

        if (!is_string($char) || strlen($char) < 2) {
            return false; // empty input or failed conversion
        }

        // Only the first character is evaluated: 4 bytes, big endian.
        $unpacked = (strlen($char) >= 4) ? unpack('N', $char) : unpack('n', $char);
        return ($unpacked === false) ? false : $unpacked[1];
    }
}
182 | 182 | ||
if (!function_exists('mb_htmlentities')) {
    // https://riptutorial.com/php/example/15633/converting-unicode-characters-to-their-numeric-value-and-or-html-entities-using-php
    // modified
    /**
     * Replaces every character from U+0080 upwards by a numeric entity.
     * ASCII characters (including & < > and quotes) are not changed.
     *
     * @param string $string   UTF-8 text (the pattern uses the /u modifier).
     * @param bool   $hex      true: hexadecimal entities (&#xE4;),
     *                         false: decimal entities (&#228;).
     * @param string $encoding If 'UTF-8' (case-insensitive), ordUTF8() is used
     *                         to obtain the code point, otherwise mb_ord().
     * @return string|null Converted text; NULL if preg_replace_callback()
     *                     fails, e.g. because $string is not valid UTF-8.
     */
    function mb_htmlentities($string, $hex = true, $encoding = 'UTF-8') {
        $format = $hex ? '&#x%X;' : '&#%d;';
        $use_ordUTF8 = (strtoupper($encoding) == 'UTF-8');

        return preg_replace_callback('/[\x{80}-\x{10FFFF}]/u', function ($match) use ($format, $use_ordUTF8) {
            // A match of a /u pattern is always UTF-8, so mb_ord() must be
            // told so instead of falling back to its own default encoding.
            $ord = $use_ordUTF8 ? ordUTF8($match[0]) : mb_ord($match[0], 'UTF-8');
            return sprintf($format, $ord);
        }, $string);
    }
}
193 | 193 |