Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
1 | daniel-mar | 1 | unit ZMUTF819; |
2 | |||
3 | (* |
||
4 | ZMUTF19.pas - Some UTF8/16 utility functions |
||
5 | Copyright (C) 2009, 2010 by Russell J. Peters, Roger Aelbrecht, |
||
6 | Eric W. Engler and Chris Vleghert. |
||
7 | |||
8 | This file is part of TZipMaster Version 1.9. |
||
9 | |||
10 | TZipMaster is free software: you can redistribute it and/or modify |
||
11 | it under the terms of the GNU Lesser General Public License as published by |
||
12 | the Free Software Foundation, either version 3 of the License, or |
||
13 | (at your option) any later version. |
||
14 | |||
15 | TZipMaster is distributed in the hope that it will be useful, |
||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
18 | GNU Lesser General Public License for more details. |
||
19 | |||
20 | You should have received a copy of the GNU Lesser General Public License |
||
21 | along with TZipMaster. If not, see <http://www.gnu.org/licenses/>. |
||
22 | |||
23 | contact: problems@delphizip.org (include ZipMaster in the subject). |
||
24 | updates: http://www.delphizip.org |
||
25 | DelphiZip maillist subscribe at http://www.freelists.org/list/delphizip |
||
26 | |||
27 | modified 2009-06-25 |
||
28 | ---------------------------------------------------------------------------*) |
||
29 | |||
30 | {$INCLUDE '.\ZipVers19.inc'} |
||
31 | |||
32 | {$IFDEF VERD6up} |
||
33 | {$WARN UNIT_PLATFORM OFF} |
||
34 | {$WARN SYMBOL_PLATFORM OFF} |
||
35 | {$ENDIF} |
||
36 | |||
37 | interface |
||
38 | |||
39 | uses |
||
40 | SysUtils, Windows, Classes, ZipMstr19; |
||
41 | |||
42 | // convert to/from UTF8 characters |
||
43 | function UTF8ToStr(const astr: UTF8String): String; |
||
44 | function StrToUTF8(const ustr: string): UTF8String; |
||
45 | function StrToUTFEx(const astr: AnsiString; cp: cardinal = 0; len: integer = |
||
46 | -1): TZMString; |
||
47 | function UTF8ToWide(const astr: UTF8String; len: integer = -1): TZMWideString; |
||
48 | function PWideToUTF8(const pwstr: PWideChar; len: integer = -1): UTF8String; |
||
49 | function WideToUTF8(const astr: TZMWideString; len: integer = -1): UTF8String; |
||
50 | |||
51 | // test for valid UTF8 character(s) > 0 _ some, 0 _ none, < 0 _ invalid |
||
52 | function ValidUTF8(pstr: PAnsiChar; len: integer): integer; overload; |
||
53 | function ValidUTF8(const str: AnsiString; len: integer = -1): integer; overload; |
||
54 | {$IFDEF UNICODE} |
||
55 | //overload; |
||
56 | function ValidUTF8(const str: UTF8String; len: integer = -1): integer; overload; |
||
57 | {$ENDIF} |
||
58 | |||
59 | // convert to UTF8 (if needed) |
||
60 | function AsUTF8Str(const zstr: TZMString): UTF8String; |
||
61 | |||
62 | function UTF8SeqLen(c: AnsiChar): integer; |
||
63 | function IsUTF8Trail(c: AnsiChar): boolean; |
||
64 | |||
65 | function PUTF8ToStr(const raw: PAnsiChar; len: integer): string; |
||
66 | function PUTF8ToWideStr(const raw: PAnsiChar; len: integer): TZMWideString; |
||
67 | |||
68 | function StrToWideEx(const astr: AnsiString; cp: cardinal; len: integer): |
||
69 | TZMWideString; |
||
70 | |||
71 | // convert to Ansi/OEM escaping unsupported characters |
||
72 | function WideToSafe(const wstr: TZMWideString; ToOEM: boolean): AnsiString; |
||
73 | |||
74 | // test all characters are supported |
||
75 | function WideIsSafe(const wstr: TZMWideString; ToOEM: boolean): Boolean; |
||
76 | {$IFNDEF UNICODE} |
||
77 | function UTF8ToSafe(const ustr: AnsiString; ToOEM: boolean): AnsiString; |
||
78 | function PUTF8ToSafe(const raw: PAnsiChar; len: integer): AnsiString; |
||
79 | function UTF8IsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean; |
||
80 | // only applicable when converting to OEM |
||
81 | function AnsiIsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean; |
||
82 | {$ENDIF} |
||
83 | // -------------------------- ------------ ------------------------- |
||
84 | implementation |
||
85 | |||
86 | uses ZMCompat19, ZMUtils19; |
||
87 | |||
88 | |||
89 | function PUTF8ToWideStr(const raw: PAnsiChar; len: integer): TZMWideString; |
||
90 | const |
||
91 | MB_ERR_INVALID_CHARS = $00000008; // error for invalid chars |
||
92 | var |
||
93 | wcnt: integer; |
||
94 | flg: cardinal; |
||
95 | p: pAnsiChar; |
||
96 | rlen: Integer; |
||
97 | begin |
||
98 | Result := ''; |
||
99 | if (raw = nil) or (len = 0) then |
||
100 | exit; |
||
101 | rlen := Len; |
||
102 | if len < 0 then |
||
103 | begin |
||
104 | len := -1; |
||
105 | p := raw; |
||
106 | rlen := 0; |
||
107 | while p^ <> #0 do |
||
108 | begin |
||
109 | inc(p); |
||
110 | Inc(rlen); |
||
111 | end; |
||
112 | end; |
||
113 | rlen := rlen * 2; |
||
114 | {$IFDEF UNICODE} |
||
115 | flg := MB_ERR_INVALID_CHARS; |
||
116 | {$ELSE} |
||
117 | if Win32MajorVersion > 4 then |
||
118 | flg := MB_ERR_INVALID_CHARS |
||
119 | else |
||
120 | flg := 0; |
||
121 | {$ENDIF} |
||
122 | SetLength(Result, rlen); // plenty of room |
||
123 | wcnt := MultiByteToWideChar(CP_UTF8, flg, raw, len, |
||
124 | PWideChar(Result), rlen); |
||
125 | if wcnt = 0 then // try again assuming Ansi |
||
126 | wcnt := MultiByteToWideChar(0, flg, raw, len, |
||
127 | PWideChar(Result), rlen); |
||
128 | if (wcnt > 0) and (len = -1) then |
||
129 | dec(wcnt); // don't want end null |
||
130 | SetLength(Result, wcnt); |
||
131 | end; |
||
132 | |||
133 | function PUTF8ToStr(const raw: PAnsiChar; len: integer): string; |
||
134 | begin |
||
135 | Result := PUTF8ToWideStr(raw, len); |
||
136 | end; |
||
137 | |||
138 | function PWideToUTF8(const pwstr: PWideChar; len: integer = -1): UTF8String; |
||
139 | var |
||
140 | cnt: integer; |
||
141 | begin |
||
142 | Result := ''; |
||
143 | if len < 0 then |
||
144 | len := -1; |
||
145 | if len = 0 then |
||
146 | exit; |
||
147 | cnt := WideCharToMultiByte(CP_UTF8, 0, pwstr, len, nil, 0, nil, nil); |
||
148 | if cnt > 0 then |
||
149 | begin |
||
150 | SetLength(Result, cnt); |
||
151 | cnt := WideCharToMultiByte(CP_UTF8, 0, pwstr, len, |
||
152 | PAnsiChar(Result), cnt, nil, nil); |
||
153 | if cnt < 1 then |
||
154 | Result := ''; // oops - something went wrong |
||
155 | if (len = -1) and (Result[cnt] = #0) then |
||
156 | SetLength(Result, cnt - 1); // remove trailing nul |
||
157 | end//; |
||
158 | else |
||
159 | RaiseLastOSError; |
||
160 | end; |
||
161 | |||
162 | function WideToUTF8(const astr: TZMWideString; len: integer = -1): UTF8String; |
||
163 | begin |
||
164 | if len < 0 then |
||
165 | len := Length(astr); |
||
166 | Result := PWideToUTF8(@astr[1], len); |
||
167 | end; |
||
168 | |||
169 | function UTF8ToWide(const astr: UTF8String; len: integer = -1): TZMWideString; |
||
170 | begin |
||
171 | Result := ''; |
||
172 | if len < 0 then |
||
173 | len := Length(astr); |
||
174 | Result := PUTF8ToWideStr(PAnsiChar(astr), len); |
||
175 | end; |
||
176 | |||
177 | function UTF8ToStr(const astr: UTF8String): String; |
||
178 | begin |
||
179 | Result := PUTF8ToStr(PAnsiChar(astr), Length(astr)); |
||
180 | end; |
||
181 | |||
182 | function StrToUTF8(const ustr: string): UTF8String; |
||
183 | var |
||
184 | wtemp: TZMWideString; |
||
185 | begin |
||
186 | wtemp := ustr; |
||
187 | Result := WideToUTF8(wtemp, -1); |
||
188 | end; |
||
189 | |||
190 | function StrToUTFEx(const astr: AnsiString; cp: cardinal = 0; len: integer = |
||
191 | -1): TZMString; |
||
192 | var |
||
193 | ws: TZMWideString; |
||
194 | begin |
||
195 | ws := StrToWideEx(astr, cp, len); |
||
196 | {$IFDEF UNICODE} |
||
197 | Result := ws; |
||
198 | {$ELSE} |
||
199 | Result := StrToUTF8(ws); |
||
200 | {$ENDIF} |
||
201 | end; |
||
202 | |||
203 | function IsUTF8Trail(c: AnsiChar): boolean; |
||
204 | begin |
||
205 | Result := (Ord(c) and $C0) = $80; |
||
206 | end; |
||
207 | |||
208 | function UTF8SeqLen(c: AnsiChar): integer; |
||
209 | var |
||
210 | u8: cardinal; |
||
211 | begin |
||
212 | Result := 1; |
||
213 | u8 := ord(c); |
||
214 | if u8 >= $80 then |
||
215 | begin |
||
216 | if (u8 and $FE) = $FC then |
||
217 | Result := 6 |
||
218 | else |
||
219 | if (u8 and $FC) = $F8 then |
||
220 | Result := 5 |
||
221 | else |
||
222 | if (u8 and $F8) = $F0 then |
||
223 | Result := 4 |
||
224 | else |
||
225 | if (u8 and $F0) = $E0 then |
||
226 | Result := 3 |
||
227 | else |
||
228 | if (u8 and $E0) = $C0 then |
||
229 | Result := 2 |
||
230 | else |
||
231 | Result := -1; // trailing byte - invalid |
||
232 | end; |
||
233 | end; |
||
234 | |||
235 | // test for valid UTF8 character(s) > 0 _ some, 0 _ none, < 0 _ invalid |
||
236 | function ValidUTF8(const str: AnsiString; len: integer): integer; |
||
237 | var |
||
238 | i, j, ul: integer; |
||
239 | begin |
||
240 | if len < 0 then |
||
241 | len := Length(str); |
||
242 | Result := 0; |
||
243 | i := 1; |
||
244 | while (i <= len) do |
||
245 | begin |
||
246 | ul := UTF8SeqLen(str[i]); |
||
247 | inc(i); |
||
248 | if ul <> 1 then |
||
249 | begin |
||
250 | if (ul < 1) or ((i + ul -2) > len) then |
||
251 | begin |
||
252 | Result := -1; // invalid |
||
253 | break; |
||
254 | end; |
||
255 | // first in seq |
||
256 | for j := 0 to ul -2 do |
||
257 | begin |
||
258 | if (ord(str[i]) and $C0) <> $80 then |
||
259 | begin |
||
260 | result := -1; |
||
261 | break; |
||
262 | end; |
||
263 | inc(i); |
||
264 | end; |
||
265 | if Result >= 0 then |
||
266 | inc(Result) // was valid so count it |
||
267 | else |
||
268 | break; |
||
269 | end; |
||
270 | end; |
||
271 | end; |
||
272 | |||
273 | // test for valid UTF8 character(s) > 0 _ some, 0 _ none, < 0 _ invalid |
||
274 | function ValidUTF8(pstr: PAnsiChar; len: integer): integer; |
||
275 | var |
||
276 | // i, |
||
277 | j, ul: integer; |
||
278 | begin |
||
279 | // if len < 0 then |
||
280 | // len := Length(str); |
||
281 | Result := 0; |
||
282 | // i := 1; |
||
283 | while (len > 0) do |
||
284 | begin |
||
285 | ul := UTF8SeqLen(pstr^); |
||
286 | inc(pstr); |
||
287 | Dec(len); |
||
288 | if ul <> 1 then |
||
289 | begin |
||
290 | // if (ul < 1) or ((i + ul -2) > len) then |
||
291 | if (ul < 1) or (( ul -1) > len) then |
||
292 | begin |
||
293 | Result := -1; // invalid |
||
294 | break; |
||
295 | end; |
||
296 | // first in seq |
||
297 | for j := 0 to ul -2 do |
||
298 | begin |
||
299 | if (ord(pstr^) and $C0) <> $80 then |
||
300 | begin |
||
301 | result := -1; |
||
302 | break; |
||
303 | end; |
||
304 | inc(pstr); |
||
305 | Dec(len); |
||
306 | end; |
||
307 | if Result >= 0 then |
||
308 | inc(Result) // was valid so count it |
||
309 | else |
||
310 | break; |
||
311 | end; |
||
312 | end; |
||
313 | end; |
||
314 | |||
315 | {$IFDEF UNICODE} |
||
316 | function ValidUTF8(const str: UTF8String; len: integer = -1): integer; |
||
317 | var |
||
318 | i, j, ul: integer; |
||
319 | begin |
||
320 | if len < 0 then |
||
321 | len := Length(str); |
||
322 | Result := 0; |
||
323 | i := 1; |
||
324 | while (i <= len) do |
||
325 | begin |
||
326 | ul := UTF8SeqLen(str[i]); |
||
327 | inc(i); |
||
328 | if ul <> 1 then |
||
329 | begin |
||
330 | if (ul < 1) or ((i + ul -2) > len) then |
||
331 | begin |
||
332 | Result := -1; // invalid |
||
333 | break; |
||
334 | end; |
||
335 | // first in seq |
||
336 | for j := 0 to ul -2 do |
||
337 | begin |
||
338 | if (ord(str[i]) and $C0) <> $80 then |
||
339 | begin |
||
340 | result := -1; |
||
341 | break; |
||
342 | end; |
||
343 | inc(i); |
||
344 | end; |
||
345 | if Result >= 0 then |
||
346 | inc(Result) // was valid so count it |
||
347 | else |
||
348 | break; |
||
349 | end; |
||
350 | end; |
||
351 | end; |
||
352 | {$ENDIF} |
||
353 | |||
354 | |||
355 | function AsUTF8Str(const zstr: TZMString): UTF8String; |
||
356 | begin |
||
357 | {$IFDEF UNICODE} |
||
358 | Result := UTF8String(zstr); |
||
359 | {$ELSE} |
||
360 | if ValidUTF8(zstr, -1) < 0 then |
||
361 | Result := StrToUTF8(zstr) |
||
362 | else |
||
363 | Result := zstr; |
||
364 | {$ENDIF} |
||
365 | end; |
||
366 | |||
367 | |||
368 | function StrToWideEx(const astr: AnsiString; cp: cardinal; len: integer): |
||
369 | TZMWideString; |
||
370 | var |
||
371 | cnt: integer; |
||
372 | s: AnsiString; |
||
373 | wcnt: integer; |
||
374 | begin |
||
375 | Result := ''; |
||
376 | if len < 0 then |
||
377 | len := Length(astr); |
||
378 | if len = 0 then |
||
379 | exit; |
||
380 | wcnt := MultiByteToWideChar(cp, 0, PAnsiChar(astr), len, nil, 0); |
||
381 | if wcnt > 0 then |
||
382 | begin |
||
383 | SetLength(Result, wcnt); |
||
384 | cnt := MultiByteToWideChar(cp, 0, PAnsiChar(astr), len, |
||
385 | pWideChar(Result), wcnt); |
||
386 | if cnt < 1 then |
||
387 | Result := ''; // oops - something went wrong |
||
388 | end |
||
389 | else |
||
390 | // RaiseLastOSError; |
||
391 | begin |
||
392 | s := astr; // assume it is Ansi |
||
393 | if (len > 0) and (len < length(astr)) then |
||
394 | SetLength(s, len); |
||
395 | Result := String(s); |
||
396 | end; |
||
397 | end; |
||
398 | |||
399 | // convert to MultiByte escaping unsupported characters |
||
400 | function WideToSafe(const wstr: TZMWideString; ToOEM: boolean): AnsiString; |
||
401 | {$IFNDEF UNICODE} |
||
402 | const WC_NO_BEST_FIT_CHARS = $00000400; |
||
403 | {$endif} |
||
404 | |||
405 | var |
||
406 | Bad: Bool; |
||
407 | c: AnsiChar; |
||
408 | cnt: Integer; |
||
409 | i: Integer; |
||
410 | pa: PAnsiChar; |
||
411 | tmp: AnsiString; |
||
412 | subst: array [0..1] of AnsiChar; |
||
413 | toCP: cardinal; |
||
414 | wc: WideChar; |
||
415 | wlen: Integer; |
||
416 | begin |
||
417 | Result := ''; |
||
418 | if wstr <> '' then |
||
419 | begin |
||
420 | if ToOEM then |
||
421 | toCP := CP_OEMCP |
||
422 | else |
||
423 | toCP := CP_ACP; |
||
424 | subst[0] := #$1B; // substitute char - escape |
||
425 | subst[1] := #0; |
||
426 | cnt := WideCharToMultiByte(ToCP, WC_NO_BEST_FIT_CHARS, PWideChar(wstr), |
||
427 | Length(wstr), nil, 0, PAnsiChar(@subst), @Bad); |
||
428 | if cnt > 0 then |
||
429 | begin |
||
430 | SetLength(Result, cnt); |
||
431 | cnt := WideCharToMultiByte(ToCP, WC_NO_BEST_FIT_CHARS, PWideChar(wstr), |
||
432 | Length(wstr), PAnsiChar(Result), cnt, PAnsiChar(@subst), @Bad); |
||
433 | if cnt < 1 then |
||
434 | Result := ''; // oops - something went wrong |
||
435 | end; |
||
436 | if Bad then |
||
437 | begin |
||
438 | tmp := Result; |
||
439 | Result := ''; |
||
440 | pa := PAnsiChar(tmp); |
||
441 | i := 1; |
||
442 | wc := #0; |
||
443 | wlen := Length(wstr); |
||
444 | while (pa^ <> #0) do |
||
445 | begin |
||
446 | c := pa^; |
||
447 | if i < wlen then |
||
448 | begin |
||
449 | wc := wstr[i]; |
||
450 | inc(i); |
||
451 | end; |
||
452 | if c = #$1B then |
||
453 | Result := Result + '#$' + AnsiString(IntToHex(Ord(wc), 4)) |
||
454 | else |
||
455 | Result := Result + c; |
||
456 | pa := CharNextExA(toCP, pa, 0); |
||
457 | end; |
||
458 | end; |
||
459 | end; |
||
460 | end; |
||
461 | |||
462 | {$IFNDEF UNICODE} |
||
463 | function UTF8ToSafe(const ustr: AnsiString; ToOEM: boolean): AnsiString; |
||
464 | begin |
||
465 | Result := WideToSafe(UTF8ToWide(ustr), ToOEM); |
||
466 | end; |
||
467 | {$ENDIF} |
||
468 | |||
469 | {$IFNDEF UNICODE} |
||
470 | function PUTF8ToSafe(const raw: PAnsiChar; len: integer): AnsiString; |
||
471 | begin |
||
472 | Result := WideToSafe(PUTF8ToWideStr(raw, len), false); |
||
473 | end; |
||
474 | {$ENDIF} |
||
475 | |||
476 | // test all characters are supported |
||
477 | function WideIsSafe(const wstr: TZMWideString; ToOEM: boolean): Boolean; |
||
478 | var |
||
479 | Bad: Bool; |
||
480 | cnt: Integer; |
||
481 | toCP: cardinal; |
||
482 | begin |
||
483 | Result := true; |
||
484 | if ToOEM then |
||
485 | toCP := CP_OEMCP |
||
486 | else |
||
487 | toCP := CP_ACP; |
||
488 | if wstr <> '' then |
||
489 | begin |
||
490 | cnt := WideCharToMultiByte(toCP, 0, PWideChar(wstr), Length(wstr), |
||
491 | nil, 0, nil, @Bad); |
||
492 | Result := (not Bad) and (cnt > 0); |
||
493 | end; |
||
494 | end; |
||
495 | |||
496 | {$IFNDEF UNICODE} |
||
497 | function UTF8IsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean; |
||
498 | begin |
||
499 | Result := WideIsSafe(UTF8ToWide(ustr), ToOEM); |
||
500 | end; |
||
501 | {$ENDIF} |
||
502 | |||
503 | {$IFNDEF UNICODE} |
||
504 | // only applicable when converting to OEM |
||
505 | function AnsiIsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean; |
||
506 | begin |
||
507 | Result := True; |
||
508 | if ToOEM then |
||
509 | Result := WideIsSafe(WideString(ustr), ToOEM); |
||
510 | end; |
||
511 | {$ENDIF} |
||
512 | |||
513 | |||
514 | end. |
||
515 |