Subversion Repositories autosfx

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
1 daniel-mar 1
unit ZMUTF819;
2
 
3
(*
4
  ZMUTF19.pas - Some UTF8/16 utility functions
5
    Copyright (C) 2009, 2010  by Russell J. Peters, Roger Aelbrecht,
6
      Eric W. Engler and Chris Vleghert.
7
 
8
        This file is part of TZipMaster Version 1.9.
9
 
10
    TZipMaster is free software: you can redistribute it and/or modify
11
    it under the terms of the GNU Lesser General Public License as published by
12
    the Free Software Foundation, either version 3 of the License, or
13
    (at your option) any later version.
14
 
15
    TZipMaster is distributed in the hope that it will be useful,
16
    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
    GNU Lesser General Public License for more details.
19
 
20
    You should have received a copy of the GNU Lesser General Public License
21
    along with TZipMaster.  If not, see <http://www.gnu.org/licenses/>.
22
 
23
    contact: problems@delphizip.org (include ZipMaster in the subject).
24
    updates: http://www.delphizip.org
25
    DelphiZip maillist subscribe at http://www.freelists.org/list/delphizip
26
 
27
  modified 2009-06-25
28
---------------------------------------------------------------------------*)
29
 
30
{$INCLUDE '.\ZipVers19.inc'}
31
 
32
{$IFDEF VERD6up}
33
{$WARN UNIT_PLATFORM OFF}
34
{$WARN SYMBOL_PLATFORM OFF}
35
{$ENDIF}
36
 
37
interface
38
 
39
uses
40
  SysUtils, Windows, Classes, ZipMstr19;
41
 
42
// convert to/from UTF8 characters
43
function UTF8ToStr(const astr: UTF8String): String;
44
function StrToUTF8(const ustr: string): UTF8String;
45
function StrToUTFEx(const astr: AnsiString; cp: cardinal = 0; len: integer =
46
    -1): TZMString;
47
function UTF8ToWide(const astr: UTF8String; len: integer = -1): TZMWideString;
48
function PWideToUTF8(const pwstr: PWideChar; len: integer = -1): UTF8String;
49
function WideToUTF8(const astr: TZMWideString; len: integer = -1): UTF8String;
50
 
51
// test for valid UTF8 character(s)  > 0 _ some, 0 _ none, < 0 _ invalid
52
function ValidUTF8(pstr: PAnsiChar; len: integer): integer; overload;
53
function ValidUTF8(const str: AnsiString; len: integer = -1): integer; overload;
54
{$IFDEF UNICODE}
55
//overload;
56
function ValidUTF8(const str: UTF8String; len: integer = -1): integer; overload;
57
{$ENDIF}
58
 
59
// convert to UTF8 (if needed)
60
function AsUTF8Str(const zstr: TZMString): UTF8String;
61
 
62
function UTF8SeqLen(c: AnsiChar): integer;
63
function IsUTF8Trail(c: AnsiChar): boolean;
64
 
65
function PUTF8ToStr(const raw: PAnsiChar; len: integer): string;
66
function PUTF8ToWideStr(const raw: PAnsiChar; len: integer): TZMWideString;
67
 
68
function StrToWideEx(const astr: AnsiString; cp: cardinal; len: integer):
69
    TZMWideString;
70
 
71
// convert to Ansi/OEM escaping unsupported characters
72
function WideToSafe(const wstr: TZMWideString; ToOEM: boolean): AnsiString;
73
 
74
// test all characters are supported
75
function WideIsSafe(const wstr: TZMWideString; ToOEM: boolean): Boolean;
76
{$IFNDEF UNICODE}
77
function UTF8ToSafe(const ustr: AnsiString; ToOEM: boolean): AnsiString;
78
function PUTF8ToSafe(const raw: PAnsiChar; len: integer): AnsiString;
79
function UTF8IsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
80
// only applicable when converting to OEM
81
function AnsiIsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
82
{$ENDIF}
83
// -------------------------- ------------ -------------------------
84
implementation
85
 
86
uses ZMCompat19, ZMUtils19;
87
 
88
 
89
function PUTF8ToWideStr(const raw: PAnsiChar; len: integer): TZMWideString;
90
const
91
  MB_ERR_INVALID_CHARS = $00000008; // error for invalid chars
92
var
93
  wcnt:  integer;
94
  flg: cardinal;
95
  p: pAnsiChar;
96
  rlen: Integer;
97
begin
98
  Result := '';
99
  if (raw = nil) or (len = 0) then
100
    exit;
101
  rlen := Len;
102
  if len < 0 then
103
  begin
104
    len := -1;
105
    p := raw;
106
    rlen := 0;
107
    while p^ <> #0 do
108
    begin
109
      inc(p);
110
      Inc(rlen);
111
    end;
112
  end;
113
  rlen := rlen * 2;
114
{$IFDEF UNICODE}
115
  flg := MB_ERR_INVALID_CHARS;
116
{$ELSE}
117
  if Win32MajorVersion > 4 then
118
    flg := MB_ERR_INVALID_CHARS
119
  else
120
    flg := 0;
121
{$ENDIF}
122
  SetLength(Result, rlen); // plenty of room
123
  wcnt := MultiByteToWideChar(CP_UTF8, flg, raw, len,
124
            PWideChar(Result), rlen);
125
  if wcnt = 0 then    // try again assuming Ansi
126
    wcnt := MultiByteToWideChar(0, flg, raw, len,
127
            PWideChar(Result), rlen);
128
  if (wcnt > 0) and (len = -1) then
129
    dec(wcnt);  // don't want end null
130
  SetLength(Result, wcnt);
131
end;
132
 
133
function PUTF8ToStr(const raw: PAnsiChar; len: integer): string;
134
begin
135
  Result := PUTF8ToWideStr(raw, len);
136
end;
137
 
138
function PWideToUTF8(const pwstr: PWideChar; len: integer = -1): UTF8String;
139
var
140
  cnt:   integer;
141
begin
142
  Result := '';
143
  if len < 0 then
144
    len := -1;
145
  if len = 0 then
146
    exit;
147
  cnt := WideCharToMultiByte(CP_UTF8, 0, pwstr, len, nil, 0, nil, nil);
148
  if cnt > 0 then
149
  begin
150
    SetLength(Result, cnt);
151
    cnt := WideCharToMultiByte(CP_UTF8, 0, pwstr, len,
152
      PAnsiChar(Result), cnt, nil, nil);
153
    if cnt < 1 then
154
      Result := '';  // oops - something went wrong
155
    if (len = -1) and (Result[cnt] = #0) then
156
      SetLength(Result, cnt - 1); // remove trailing nul
157
  end//;
158
  else
159
    RaiseLastOSError;
160
end;
161
 
162
function WideToUTF8(const astr: TZMWideString; len: integer = -1): UTF8String;
163
begin
164
  if len < 0 then
165
    len := Length(astr);
166
  Result := PWideToUTF8(@astr[1], len);
167
end;
168
 
169
function UTF8ToWide(const astr: UTF8String; len: integer = -1): TZMWideString;
170
begin
171
  Result := '';
172
  if len < 0 then
173
    len := Length(astr);
174
  Result := PUTF8ToWideStr(PAnsiChar(astr), len);
175
end;
176
 
177
function UTF8ToStr(const astr: UTF8String): String;
178
begin
179
  Result := PUTF8ToStr(PAnsiChar(astr), Length(astr));
180
end;
181
 
182
function StrToUTF8(const ustr: string): UTF8String;  
183
var
184
  wtemp: TZMWideString;
185
begin
186
  wtemp := ustr;
187
  Result := WideToUTF8(wtemp, -1);
188
end;
189
 
190
function StrToUTFEx(const astr: AnsiString; cp: cardinal = 0; len: integer =
191
    -1): TZMString;
192
var
193
  ws: TZMWideString;
194
begin
195
  ws := StrToWideEx(astr, cp, len);
196
{$IFDEF UNICODE}
197
  Result := ws;
198
{$ELSE}
199
  Result := StrToUTF8(ws);
200
{$ENDIF}
201
end;
202
 
203
function IsUTF8Trail(c: AnsiChar): boolean;
204
begin
205
  Result := (Ord(c) and $C0) = $80;
206
end;
207
 
208
function UTF8SeqLen(c: AnsiChar): integer;
209
var
210
  u8: cardinal;
211
begin
212
  Result := 1;
213
  u8 := ord(c);
214
  if u8 >= $80 then
215
  begin
216
    if (u8 and $FE) = $FC then
217
      Result := 6
218
    else
219
    if (u8 and $FC) = $F8 then
220
      Result := 5
221
    else
222
    if (u8 and $F8) = $F0 then
223
      Result := 4
224
    else
225
    if (u8 and $F0) = $E0 then
226
      Result := 3
227
    else
228
    if (u8 and $E0) = $C0 then
229
      Result := 2
230
    else
231
      Result := -1;  // trailing byte - invalid
232
  end;
233
end;
234
 
235
// test for valid UTF8 character(s)  > 0 _ some, 0 _ none, < 0 _ invalid
236
function ValidUTF8(const str: AnsiString; len: integer): integer;
237
var
238
  i, j, ul: integer;
239
begin
240
  if len < 0 then
241
    len := Length(str);
242
  Result := 0;
243
  i := 1;
244
  while (i <= len) do
245
  begin
246
    ul := UTF8SeqLen(str[i]);
247
    inc(i);
248
    if ul <> 1 then
249
    begin
250
      if (ul < 1) or ((i + ul -2) > len) then
251
      begin
252
        Result := -1;  // invalid
253
        break;
254
      end;
255
      // first in seq
256
      for j := 0 to ul -2  do
257
      begin
258
        if (ord(str[i]) and $C0) <> $80 then
259
        begin
260
          result := -1;
261
          break;
262
        end;
263
        inc(i);
264
      end;
265
      if Result >= 0 then
266
        inc(Result)   // was valid so count it
267
      else
268
        break;
269
    end;
270
  end;
271
end;
272
 
273
// test for valid UTF8 character(s)  > 0 _ some, 0 _ none, < 0 _ invalid
274
function ValidUTF8(pstr: PAnsiChar; len: integer): integer;
275
var
276
//  i,
277
  j, ul: integer;
278
begin
279
//  if len < 0 then
280
//    len := Length(str);
281
  Result := 0;
282
//  i := 1;
283
  while (len > 0) do
284
  begin
285
    ul := UTF8SeqLen(pstr^);
286
    inc(pstr);
287
    Dec(len);
288
    if ul <> 1 then
289
    begin
290
//      if (ul < 1) or ((i + ul -2) > len) then
291
      if (ul < 1) or (( ul -1) > len) then
292
      begin
293
        Result := -1;  // invalid
294
        break;
295
      end;
296
      // first in seq
297
      for j := 0 to ul -2  do
298
      begin
299
        if (ord(pstr^) and $C0) <> $80 then
300
        begin
301
          result := -1;
302
          break;
303
        end;
304
        inc(pstr);
305
        Dec(len);
306
      end;
307
      if Result >= 0 then
308
        inc(Result)   // was valid so count it
309
      else
310
        break;
311
    end;
312
  end;
313
end;
314
 
315
{$IFDEF UNICODE}
316
function ValidUTF8(const str: UTF8String; len: integer = -1): integer;
317
var
318
  i, j, ul: integer;
319
begin
320
  if len < 0 then
321
    len := Length(str);
322
  Result := 0;
323
  i := 1;
324
  while (i <= len) do
325
  begin
326
    ul := UTF8SeqLen(str[i]);
327
    inc(i);
328
    if ul <> 1 then
329
    begin
330
      if (ul < 1) or ((i + ul -2) > len) then
331
      begin
332
        Result := -1;  // invalid
333
        break;
334
      end;
335
      // first in seq
336
      for j := 0 to ul -2  do
337
      begin
338
        if (ord(str[i]) and $C0) <> $80 then
339
        begin
340
          result := -1;
341
          break;
342
        end;
343
        inc(i);
344
      end;
345
      if Result >= 0 then
346
        inc(Result)   // was valid so count it
347
      else
348
        break;
349
    end;
350
  end;
351
end;
352
{$ENDIF}
353
 
354
 
355
function AsUTF8Str(const zstr: TZMString): UTF8String;
356
begin
357
{$IFDEF UNICODE}
358
    Result := UTF8String(zstr);
359
{$ELSE}
360
  if ValidUTF8(zstr, -1) < 0 then
361
    Result := StrToUTF8(zstr)
362
  else
363
    Result := zstr;
364
{$ENDIF}
365
end;
366
 
367
 
368
function StrToWideEx(const astr: AnsiString; cp: cardinal; len: integer):
369
    TZMWideString;
370
var
371
  cnt: integer;
372
  s: AnsiString;
373
  wcnt: integer;
374
begin
375
  Result := '';
376
  if len < 0 then
377
    len := Length(astr);
378
  if len = 0 then
379
    exit;
380
  wcnt := MultiByteToWideChar(cp, 0, PAnsiChar(astr), len, nil, 0);
381
  if wcnt > 0 then
382
  begin
383
    SetLength(Result, wcnt);
384
    cnt := MultiByteToWideChar(cp, 0, PAnsiChar(astr), len,
385
      pWideChar(Result), wcnt);
386
    if cnt < 1 then
387
      Result := '';  // oops - something went wrong
388
  end
389
  else
390
//    RaiseLastOSError;
391
  begin
392
    s := astr;   // assume it is Ansi
393
    if (len > 0) and (len < length(astr)) then
394
      SetLength(s, len);
395
    Result := String(s);
396
  end;
397
end;
398
 
399
// convert to MultiByte escaping unsupported characters
400
function WideToSafe(const wstr: TZMWideString; ToOEM: boolean): AnsiString;
401
{$IFNDEF UNICODE}
402
 const WC_NO_BEST_FIT_CHARS = $00000400;
403
{$endif}
404
 
405
var
406
  Bad: Bool;
407
  c: AnsiChar;
408
  cnt: Integer;
409
  i: Integer;
410
  pa: PAnsiChar;
411
  tmp: AnsiString;
412
  subst: array [0..1] of AnsiChar;
413
  toCP: cardinal;
414
  wc: WideChar;
415
  wlen: Integer;
416
begin
417
  Result := '';
418
  if wstr <> '' then
419
  begin
420
    if ToOEM then
421
      toCP := CP_OEMCP
422
    else
423
      toCP := CP_ACP;
424
    subst[0] := #$1B;   // substitute char - escape
425
    subst[1] := #0;
426
    cnt := WideCharToMultiByte(ToCP, WC_NO_BEST_FIT_CHARS, PWideChar(wstr),
427
              Length(wstr), nil, 0, PAnsiChar(@subst), @Bad);
428
    if cnt > 0 then
429
    begin
430
      SetLength(Result, cnt);
431
      cnt := WideCharToMultiByte(ToCP, WC_NO_BEST_FIT_CHARS, PWideChar(wstr),
432
        Length(wstr), PAnsiChar(Result), cnt, PAnsiChar(@subst), @Bad);
433
      if cnt < 1 then
434
        Result := '';  // oops - something went wrong
435
    end;
436
    if Bad then
437
    begin
438
      tmp := Result;
439
      Result := '';
440
      pa := PAnsiChar(tmp);
441
      i := 1;
442
      wc := #0;
443
      wlen := Length(wstr);
444
      while (pa^ <> #0) do
445
      begin
446
        c := pa^;
447
        if i < wlen then
448
        begin
449
          wc := wstr[i];
450
          inc(i);
451
        end;
452
        if c = #$1B then
453
          Result := Result + '#$' + AnsiString(IntToHex(Ord(wc), 4))
454
        else
455
          Result := Result + c;
456
        pa := CharNextExA(toCP, pa, 0);
457
      end;
458
    end;
459
  end;
460
end;
461
 
462
{$IFNDEF UNICODE}
463
function UTF8ToSafe(const ustr: AnsiString; ToOEM: boolean): AnsiString;
464
begin
465
  Result := WideToSafe(UTF8ToWide(ustr), ToOEM);
466
end;
467
{$ENDIF}
468
 
469
{$IFNDEF UNICODE}
470
function PUTF8ToSafe(const raw: PAnsiChar; len: integer): AnsiString;
471
begin
472
  Result := WideToSafe(PUTF8ToWideStr(raw, len), false);
473
end;
474
{$ENDIF}
475
 
476
// test all characters are supported
477
function WideIsSafe(const wstr: TZMWideString; ToOEM: boolean): Boolean;
478
var
479
  Bad: Bool;
480
  cnt: Integer;
481
  toCP: cardinal;
482
begin
483
  Result := true;
484
  if ToOEM then
485
    toCP := CP_OEMCP
486
  else
487
    toCP := CP_ACP;
488
  if wstr <> '' then
489
  begin
490
    cnt := WideCharToMultiByte(toCP, 0, PWideChar(wstr), Length(wstr),
491
              nil, 0, nil, @Bad);
492
    Result := (not Bad) and (cnt > 0);
493
  end;
494
end;
495
 
496
{$IFNDEF UNICODE}
497
function UTF8IsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
498
begin
499
  Result := WideIsSafe(UTF8ToWide(ustr), ToOEM);
500
end;
501
{$ENDIF}
502
 
503
{$IFNDEF UNICODE}
504
// only applicable when converting to OEM
505
function AnsiIsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
506
begin
507
  Result := True;
508
  if ToOEM then
509
    Result := WideIsSafe(WideString(ustr), ToOEM);
510
end;
511
{$ENDIF}
512
 
513
 
514
end.
515