Subversion Repositories autosfx

Rev

Blame | Last modification | View Log | RSS feed

  1. unit ZMUTF819;
  2.  
  3. (*
  4.   ZMUTF19.pas - Some UTF8/16 utility functions
  5.     Copyright (C) 2009, 2010  by Russell J. Peters, Roger Aelbrecht,
  6.       Eric W. Engler and Chris Vleghert.
  7.  
  8.         This file is part of TZipMaster Version 1.9.
  9.  
  10.     TZipMaster is free software: you can redistribute it and/or modify
  11.     it under the terms of the GNU Lesser General Public License as published by
  12.     the Free Software Foundation, either version 3 of the License, or
  13.     (at your option) any later version.
  14.  
  15.     TZipMaster is distributed in the hope that it will be useful,
  16.     but WITHOUT ANY WARRANTY; without even the implied warranty of
  17.     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18.     GNU Lesser General Public License for more details.
  19.  
  20.     You should have received a copy of the GNU Lesser General Public License
  21.     along with TZipMaster.  If not, see <http://www.gnu.org/licenses/>.
  22.  
  23.     contact: problems@delphizip.org (include ZipMaster in the subject).
  24.     updates: http://www.delphizip.org
  25.     DelphiZip maillist subscribe at http://www.freelists.org/list/delphizip
  26.  
  27.   modified 2009-06-25
  28. ---------------------------------------------------------------------------*)
  29.  
  30. {$INCLUDE '.\ZipVers19.inc'}
  31.  
  32. {$IFDEF VERD6up}
  33. {$WARN UNIT_PLATFORM OFF}
  34. {$WARN SYMBOL_PLATFORM OFF}
  35. {$ENDIF}
  36.  
  37. interface
  38.  
  39. uses
  40.   SysUtils, Windows, Classes, ZipMstr19;
  41.  
  42. // convert to/from UTF8 characters
  43. function UTF8ToStr(const astr: UTF8String): String;
  44. function StrToUTF8(const ustr: string): UTF8String;
  45. function StrToUTFEx(const astr: AnsiString; cp: cardinal = 0; len: integer =
  46.     -1): TZMString;
  47. function UTF8ToWide(const astr: UTF8String; len: integer = -1): TZMWideString;
  48. function PWideToUTF8(const pwstr: PWideChar; len: integer = -1): UTF8String;
  49. function WideToUTF8(const astr: TZMWideString; len: integer = -1): UTF8String;
  50.  
  51. // test for valid UTF8 character(s)  > 0 _ some, 0 _ none, < 0 _ invalid
  52. function ValidUTF8(pstr: PAnsiChar; len: integer): integer; overload;
  53. function ValidUTF8(const str: AnsiString; len: integer = -1): integer; overload;
  54. {$IFDEF UNICODE}
  55. //overload;
  56. function ValidUTF8(const str: UTF8String; len: integer = -1): integer; overload;
  57. {$ENDIF}
  58.  
  59. // convert to UTF8 (if needed)
  60. function AsUTF8Str(const zstr: TZMString): UTF8String;
  61.  
  62. function UTF8SeqLen(c: AnsiChar): integer;
  63. function IsUTF8Trail(c: AnsiChar): boolean;
  64.  
  65. function PUTF8ToStr(const raw: PAnsiChar; len: integer): string;
  66. function PUTF8ToWideStr(const raw: PAnsiChar; len: integer): TZMWideString;
  67.  
  68. function StrToWideEx(const astr: AnsiString; cp: cardinal; len: integer):
  69.     TZMWideString;
  70.  
  71. // convert to Ansi/OEM escaping unsupported characters
  72. function WideToSafe(const wstr: TZMWideString; ToOEM: boolean): AnsiString;
  73.  
  74. // test all characters are supported
  75. function WideIsSafe(const wstr: TZMWideString; ToOEM: boolean): Boolean;
  76. {$IFNDEF UNICODE}
  77. function UTF8ToSafe(const ustr: AnsiString; ToOEM: boolean): AnsiString;
  78. function PUTF8ToSafe(const raw: PAnsiChar; len: integer): AnsiString;
  79. function UTF8IsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
  80. // only applicable when converting to OEM
  81. function AnsiIsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
  82. {$ENDIF}
  83. // -------------------------- ------------ -------------------------
  84. implementation
  85.  
  86. uses ZMCompat19, ZMUtils19;
  87.  
  88.  
  89. function PUTF8ToWideStr(const raw: PAnsiChar; len: integer): TZMWideString;
  90. const
  91.   MB_ERR_INVALID_CHARS = $00000008; // error for invalid chars
  92. var
  93.   wcnt:  integer;
  94.   flg: cardinal;
  95.   p: pAnsiChar;
  96.   rlen: Integer;
  97. begin
  98.   Result := '';
  99.   if (raw = nil) or (len = 0) then
  100.     exit;
  101.   rlen := Len;
  102.   if len < 0 then
  103.   begin
  104.     len := -1;
  105.     p := raw;
  106.     rlen := 0;
  107.     while p^ <> #0 do
  108.     begin
  109.       inc(p);
  110.       Inc(rlen);
  111.     end;
  112.   end;
  113.   rlen := rlen * 2;
  114. {$IFDEF UNICODE}
  115.   flg := MB_ERR_INVALID_CHARS;
  116. {$ELSE}
  117.   if Win32MajorVersion > 4 then
  118.     flg := MB_ERR_INVALID_CHARS
  119.   else
  120.     flg := 0;
  121. {$ENDIF}
  122.   SetLength(Result, rlen); // plenty of room
  123.   wcnt := MultiByteToWideChar(CP_UTF8, flg, raw, len,
  124.             PWideChar(Result), rlen);
  125.   if wcnt = 0 then    // try again assuming Ansi
  126.     wcnt := MultiByteToWideChar(0, flg, raw, len,
  127.             PWideChar(Result), rlen);
  128.   if (wcnt > 0) and (len = -1) then
  129.     dec(wcnt);  // don't want end null
  130.   SetLength(Result, wcnt);
  131. end;
  132.  
  133. function PUTF8ToStr(const raw: PAnsiChar; len: integer): string;
  134. begin
  135.   Result := PUTF8ToWideStr(raw, len);
  136. end;
  137.  
  138. function PWideToUTF8(const pwstr: PWideChar; len: integer = -1): UTF8String;
  139. var
  140.   cnt:   integer;
  141. begin
  142.   Result := '';
  143.   if len < 0 then
  144.     len := -1;
  145.   if len = 0 then
  146.     exit;
  147.   cnt := WideCharToMultiByte(CP_UTF8, 0, pwstr, len, nil, 0, nil, nil);
  148.   if cnt > 0 then
  149.   begin
  150.     SetLength(Result, cnt);
  151.     cnt := WideCharToMultiByte(CP_UTF8, 0, pwstr, len,
  152.       PAnsiChar(Result), cnt, nil, nil);
  153.     if cnt < 1 then
  154.       Result := '';  // oops - something went wrong
  155.     if (len = -1) and (Result[cnt] = #0) then
  156.       SetLength(Result, cnt - 1); // remove trailing nul
  157.   end//;
  158.   else
  159.     RaiseLastOSError;
  160. end;
  161.  
  162. function WideToUTF8(const astr: TZMWideString; len: integer = -1): UTF8String;
  163. begin
  164.   if len < 0 then
  165.     len := Length(astr);
  166.   Result := PWideToUTF8(@astr[1], len);
  167. end;
  168.  
  169. function UTF8ToWide(const astr: UTF8String; len: integer = -1): TZMWideString;
  170. begin
  171.   Result := '';
  172.   if len < 0 then
  173.     len := Length(astr);
  174.   Result := PUTF8ToWideStr(PAnsiChar(astr), len);
  175. end;
  176.  
  177. function UTF8ToStr(const astr: UTF8String): String;
  178. begin
  179.   Result := PUTF8ToStr(PAnsiChar(astr), Length(astr));
  180. end;
  181.                                                  
  182. function StrToUTF8(const ustr: string): UTF8String;  
  183. var
  184.   wtemp: TZMWideString;
  185. begin
  186.   wtemp := ustr;
  187.   Result := WideToUTF8(wtemp, -1);
  188. end;
  189.  
  190. function StrToUTFEx(const astr: AnsiString; cp: cardinal = 0; len: integer =
  191.     -1): TZMString;
  192. var
  193.   ws: TZMWideString;
  194. begin
  195.   ws := StrToWideEx(astr, cp, len);
  196. {$IFDEF UNICODE}
  197.   Result := ws;
  198. {$ELSE}
  199.   Result := StrToUTF8(ws);
  200. {$ENDIF}
  201. end;
  202.  
  203. function IsUTF8Trail(c: AnsiChar): boolean;
  204. begin
  205.   Result := (Ord(c) and $C0) = $80;
  206. end;
  207.  
  208. function UTF8SeqLen(c: AnsiChar): integer;
  209. var
  210.   u8: cardinal;
  211. begin
  212.   Result := 1;
  213.   u8 := ord(c);
  214.   if u8 >= $80 then
  215.   begin
  216.     if (u8 and $FE) = $FC then
  217.       Result := 6
  218.     else
  219.     if (u8 and $FC) = $F8 then
  220.       Result := 5
  221.     else
  222.     if (u8 and $F8) = $F0 then
  223.       Result := 4
  224.     else
  225.     if (u8 and $F0) = $E0 then
  226.       Result := 3
  227.     else
  228.     if (u8 and $E0) = $C0 then
  229.       Result := 2
  230.     else
  231.       Result := -1;  // trailing byte - invalid
  232.   end;
  233. end;
  234.  
  235. // test for valid UTF8 character(s)  > 0 _ some, 0 _ none, < 0 _ invalid
  236. function ValidUTF8(const str: AnsiString; len: integer): integer;
  237. var
  238.   i, j, ul: integer;
  239. begin
  240.   if len < 0 then
  241.     len := Length(str);
  242.   Result := 0;
  243.   i := 1;
  244.   while (i <= len) do
  245.   begin
  246.     ul := UTF8SeqLen(str[i]);
  247.     inc(i);
  248.     if ul <> 1 then
  249.     begin
  250.       if (ul < 1) or ((i + ul -2) > len) then
  251.       begin
  252.         Result := -1;  // invalid
  253.         break;
  254.       end;
  255.       // first in seq
  256.       for j := 0 to ul -2  do
  257.       begin
  258.         if (ord(str[i]) and $C0) <> $80 then
  259.         begin
  260.           result := -1;
  261.           break;
  262.         end;
  263.         inc(i);
  264.       end;
  265.       if Result >= 0 then
  266.         inc(Result)   // was valid so count it
  267.       else
  268.         break;
  269.     end;
  270.   end;
  271. end;
  272.  
  273. // test for valid UTF8 character(s)  > 0 _ some, 0 _ none, < 0 _ invalid
  274. function ValidUTF8(pstr: PAnsiChar; len: integer): integer;
  275. var
  276. //  i,
  277.   j, ul: integer;
  278. begin
  279. //  if len < 0 then
  280. //    len := Length(str);
  281.   Result := 0;
  282. //  i := 1;
  283.   while (len > 0) do
  284.   begin
  285.     ul := UTF8SeqLen(pstr^);
  286.     inc(pstr);
  287.     Dec(len);
  288.     if ul <> 1 then
  289.     begin
  290. //      if (ul < 1) or ((i + ul -2) > len) then
  291.       if (ul < 1) or (( ul -1) > len) then
  292.       begin
  293.         Result := -1;  // invalid
  294.         break;
  295.       end;
  296.       // first in seq
  297.       for j := 0 to ul -2  do
  298.       begin
  299.         if (ord(pstr^) and $C0) <> $80 then
  300.         begin
  301.           result := -1;
  302.           break;
  303.         end;
  304.         inc(pstr);
  305.         Dec(len);
  306.       end;
  307.       if Result >= 0 then
  308.         inc(Result)   // was valid so count it
  309.       else
  310.         break;
  311.     end;
  312.   end;
  313. end;
  314.  
  315. {$IFDEF UNICODE}
  316. function ValidUTF8(const str: UTF8String; len: integer = -1): integer;
  317. var
  318.   i, j, ul: integer;
  319. begin
  320.   if len < 0 then
  321.     len := Length(str);
  322.   Result := 0;
  323.   i := 1;
  324.   while (i <= len) do
  325.   begin
  326.     ul := UTF8SeqLen(str[i]);
  327.     inc(i);
  328.     if ul <> 1 then
  329.     begin
  330.       if (ul < 1) or ((i + ul -2) > len) then
  331.       begin
  332.         Result := -1;  // invalid
  333.         break;
  334.       end;
  335.       // first in seq
  336.       for j := 0 to ul -2  do
  337.       begin
  338.         if (ord(str[i]) and $C0) <> $80 then
  339.         begin
  340.           result := -1;
  341.           break;
  342.         end;
  343.         inc(i);
  344.       end;
  345.       if Result >= 0 then
  346.         inc(Result)   // was valid so count it
  347.       else
  348.         break;
  349.     end;
  350.   end;
  351. end;
  352. {$ENDIF}
  353.  
  354.  
  355. function AsUTF8Str(const zstr: TZMString): UTF8String;
  356. begin
  357. {$IFDEF UNICODE}
  358.     Result := UTF8String(zstr);
  359. {$ELSE}
  360.   if ValidUTF8(zstr, -1) < 0 then
  361.     Result := StrToUTF8(zstr)
  362.   else
  363.     Result := zstr;
  364. {$ENDIF}
  365. end;
  366.  
  367.  
  368. function StrToWideEx(const astr: AnsiString; cp: cardinal; len: integer):
  369.     TZMWideString;
  370. var
  371.   cnt: integer;
  372.   s: AnsiString;
  373.   wcnt: integer;
  374. begin
  375.   Result := '';
  376.   if len < 0 then
  377.     len := Length(astr);
  378.   if len = 0 then
  379.     exit;
  380.   wcnt := MultiByteToWideChar(cp, 0, PAnsiChar(astr), len, nil, 0);
  381.   if wcnt > 0 then
  382.   begin
  383.     SetLength(Result, wcnt);
  384.     cnt := MultiByteToWideChar(cp, 0, PAnsiChar(astr), len,
  385.       pWideChar(Result), wcnt);
  386.     if cnt < 1 then
  387.       Result := '';  // oops - something went wrong
  388.   end
  389.   else
  390. //    RaiseLastOSError;
  391.   begin
  392.     s := astr;   // assume it is Ansi
  393.     if (len > 0) and (len < length(astr)) then
  394.       SetLength(s, len);
  395.     Result := String(s);
  396.   end;
  397. end;
  398.  
  399. // convert to MultiByte escaping unsupported characters
  400. function WideToSafe(const wstr: TZMWideString; ToOEM: boolean): AnsiString;
  401. {$IFNDEF UNICODE}
  402.  const WC_NO_BEST_FIT_CHARS = $00000400;
  403. {$endif}
  404.  
  405. var
  406.   Bad: Bool;
  407.   c: AnsiChar;
  408.   cnt: Integer;
  409.   i: Integer;
  410.   pa: PAnsiChar;
  411.   tmp: AnsiString;
  412.   subst: array [0..1] of AnsiChar;
  413.   toCP: cardinal;
  414.   wc: WideChar;
  415.   wlen: Integer;
  416. begin
  417.   Result := '';
  418.   if wstr <> '' then
  419.   begin
  420.     if ToOEM then
  421.       toCP := CP_OEMCP
  422.     else
  423.       toCP := CP_ACP;
  424.     subst[0] := #$1B;   // substitute char - escape
  425.     subst[1] := #0;
  426.     cnt := WideCharToMultiByte(ToCP, WC_NO_BEST_FIT_CHARS, PWideChar(wstr),
  427.               Length(wstr), nil, 0, PAnsiChar(@subst), @Bad);
  428.     if cnt > 0 then
  429.     begin
  430.       SetLength(Result, cnt);
  431.       cnt := WideCharToMultiByte(ToCP, WC_NO_BEST_FIT_CHARS, PWideChar(wstr),
  432.         Length(wstr), PAnsiChar(Result), cnt, PAnsiChar(@subst), @Bad);
  433.       if cnt < 1 then
  434.         Result := '';  // oops - something went wrong
  435.     end;
  436.     if Bad then
  437.     begin
  438.       tmp := Result;
  439.       Result := '';
  440.       pa := PAnsiChar(tmp);
  441.       i := 1;
  442.       wc := #0;
  443.       wlen := Length(wstr);
  444.       while (pa^ <> #0) do
  445.       begin
  446.         c := pa^;
  447.         if i < wlen then
  448.         begin
  449.           wc := wstr[i];
  450.           inc(i);
  451.         end;
  452.         if c = #$1B then
  453.           Result := Result + '#$' + AnsiString(IntToHex(Ord(wc), 4))
  454.         else
  455.           Result := Result + c;
  456.         pa := CharNextExA(toCP, pa, 0);
  457.       end;
  458.     end;
  459.   end;
  460. end;
  461.  
  462. {$IFNDEF UNICODE}
  463. function UTF8ToSafe(const ustr: AnsiString; ToOEM: boolean): AnsiString;
  464. begin
  465.   Result := WideToSafe(UTF8ToWide(ustr), ToOEM);
  466. end;
  467. {$ENDIF}
  468.  
  469. {$IFNDEF UNICODE}
  470. function PUTF8ToSafe(const raw: PAnsiChar; len: integer): AnsiString;
  471. begin
  472.   Result := WideToSafe(PUTF8ToWideStr(raw, len), false);
  473. end;
  474. {$ENDIF}
  475.  
  476. // test all characters are supported
  477. function WideIsSafe(const wstr: TZMWideString; ToOEM: boolean): Boolean;
  478. var
  479.   Bad: Bool;
  480.   cnt: Integer;
  481.   toCP: cardinal;
  482. begin
  483.   Result := true;
  484.   if ToOEM then
  485.     toCP := CP_OEMCP
  486.   else
  487.     toCP := CP_ACP;
  488.   if wstr <> '' then
  489.   begin
  490.     cnt := WideCharToMultiByte(toCP, 0, PWideChar(wstr), Length(wstr),
  491.               nil, 0, nil, @Bad);
  492.     Result := (not Bad) and (cnt > 0);
  493.   end;
  494. end;
  495.  
  496. {$IFNDEF UNICODE}
  497. function UTF8IsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
  498. begin
  499.   Result := WideIsSafe(UTF8ToWide(ustr), ToOEM);
  500. end;
  501. {$ENDIF}
  502.  
  503. {$IFNDEF UNICODE}
  504. // only applicable when converting to OEM
  505. function AnsiIsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
  506. begin
  507.   Result := True;
  508.   if ToOEM then
  509.     Result := WideIsSafe(WideString(ustr), ToOEM);
  510. end;
  511. {$ENDIF}
  512.  
  513.  
  514. end.
  515.  
  516.