WebSVN – autosfx – Blame – /zmstr1900102/ZMUTF819.pas

Rev	Author	Line No.	Line
1	daniel-mar	1	unit ZMUTF819;
		2
		3	(*
		4	ZMUTF19.pas - Some UTF8/16 utility functions
		5	Copyright (C) 2009, 2010 by Russell J. Peters, Roger Aelbrecht,
		6	Eric W. Engler and Chris Vleghert.
		7
		8	This file is part of TZipMaster Version 1.9.
		9
		10	TZipMaster is free software: you can redistribute it and/or modify
		11	it under the terms of the GNU Lesser General Public License as published by
		12	the Free Software Foundation, either version 3 of the License, or
		13	(at your option) any later version.
		14
		15	TZipMaster is distributed in the hope that it will be useful,
		16	but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		18	GNU Lesser General Public License for more details.
		19
		20	You should have received a copy of the GNU Lesser General Public License
		21	along with TZipMaster. If not, see <http://www.gnu.org/licenses/>.
		22
		23	contact: problems@delphizip.org (include ZipMaster in the subject).
		24	updates: http://www.delphizip.org
		25	DelphiZip maillist subscribe at http://www.freelists.org/list/delphizip
		26
		27	modified 2009-06-25
		28	---------------------------------------------------------------------------*)
		29
		30	{$INCLUDE '.\ZipVers19.inc'}
		31
		32	{$IFDEF VERD6up}
		33	{$WARN UNIT_PLATFORM OFF}
		34	{$WARN SYMBOL_PLATFORM OFF}
		35	{$ENDIF}
		36
		37	interface
		38
		39	uses
		40	SysUtils, Windows, Classes, ZipMstr19;
		41
		42	// convert to/from UTF8 characters
		43	function UTF8ToStr(const astr: UTF8String): String;
		44	function StrToUTF8(const ustr: string): UTF8String;
		45	function StrToUTFEx(const astr: AnsiString; cp: cardinal = 0; len: integer =
		46	-1): TZMString;
		47	function UTF8ToWide(const astr: UTF8String; len: integer = -1): TZMWideString;
		48	function PWideToUTF8(const pwstr: PWideChar; len: integer = -1): UTF8String;
		49	function WideToUTF8(const astr: TZMWideString; len: integer = -1): UTF8String;
		50
		51	// test for valid UTF8 character(s) > 0 _ some, 0 _ none, < 0 _ invalid
		52	function ValidUTF8(pstr: PAnsiChar; len: integer): integer; overload;
		53	function ValidUTF8(const str: AnsiString; len: integer = -1): integer; overload;
		54	{$IFDEF UNICODE}
		55	//overload;
		56	function ValidUTF8(const str: UTF8String; len: integer = -1): integer; overload;
		57	{$ENDIF}
		58
		59	// convert to UTF8 (if needed)
		60	function AsUTF8Str(const zstr: TZMString): UTF8String;
		61
		62	function UTF8SeqLen(c: AnsiChar): integer;
		63	function IsUTF8Trail(c: AnsiChar): boolean;
		64
		65	function PUTF8ToStr(const raw: PAnsiChar; len: integer): string;
		66	function PUTF8ToWideStr(const raw: PAnsiChar; len: integer): TZMWideString;
		67
		68	function StrToWideEx(const astr: AnsiString; cp: cardinal; len: integer):
		69	TZMWideString;
		70
		71	// convert to Ansi/OEM escaping unsupported characters
		72	function WideToSafe(const wstr: TZMWideString; ToOEM: boolean): AnsiString;
		73
		74	// test all characters are supported
		75	function WideIsSafe(const wstr: TZMWideString; ToOEM: boolean): Boolean;
		76	{$IFNDEF UNICODE}
		77	function UTF8ToSafe(const ustr: AnsiString; ToOEM: boolean): AnsiString;
		78	function PUTF8ToSafe(const raw: PAnsiChar; len: integer): AnsiString;
		79	function UTF8IsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
		80	// only applicable when converting to OEM
		81	function AnsiIsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
		82	{$ENDIF}
		83	// -------------------------- ------------ -------------------------
		84	implementation
		85
		86	uses ZMCompat19, ZMUtils19;
		87
		88
		89	function PUTF8ToWideStr(const raw: PAnsiChar; len: integer): TZMWideString;
		90	const
		91	MB_ERR_INVALID_CHARS = $00000008; // error for invalid chars
		92	var
		93	wcnt: integer;
		94	flg: cardinal;
		95	p: pAnsiChar;
		96	rlen: Integer;
		97	begin
		98	Result := '';
		99	if (raw = nil) or (len = 0) then
		100	exit;
		101	rlen := Len;
		102	if len < 0 then
		103	begin
		104	len := -1;
		105	p := raw;
		106	rlen := 0;
		107	while p^ <> #0 do
		108	begin
		109	inc(p);
		110	Inc(rlen);
		111	end;
		112	end;
		113	rlen := rlen * 2;
		114	{$IFDEF UNICODE}
		115	flg := MB_ERR_INVALID_CHARS;
		116	{$ELSE}
		117	if Win32MajorVersion > 4 then
		118	flg := MB_ERR_INVALID_CHARS
		119	else
		120	flg := 0;
		121	{$ENDIF}
		122	SetLength(Result, rlen); // plenty of room
		123	wcnt := MultiByteToWideChar(CP_UTF8, flg, raw, len,
		124	PWideChar(Result), rlen);
		125	if wcnt = 0 then // try again assuming Ansi
		126	wcnt := MultiByteToWideChar(0, flg, raw, len,
		127	PWideChar(Result), rlen);
		128	if (wcnt > 0) and (len = -1) then
		129	dec(wcnt); // don't want end null
		130	SetLength(Result, wcnt);
		131	end;
		132
		133	function PUTF8ToStr(const raw: PAnsiChar; len: integer): string;
		134	begin
		135	Result := PUTF8ToWideStr(raw, len);
		136	end;
		137
		138	function PWideToUTF8(const pwstr: PWideChar; len: integer = -1): UTF8String;
		139	var
		140	cnt: integer;
		141	begin
		142	Result := '';
		143	if len < 0 then
		144	len := -1;
		145	if len = 0 then
		146	exit;
		147	cnt := WideCharToMultiByte(CP_UTF8, 0, pwstr, len, nil, 0, nil, nil);
		148	if cnt > 0 then
		149	begin
		150	SetLength(Result, cnt);
		151	cnt := WideCharToMultiByte(CP_UTF8, 0, pwstr, len,
		152	PAnsiChar(Result), cnt, nil, nil);
		153	if cnt < 1 then
		154	Result := ''; // oops - something went wrong
		155	if (len = -1) and (Result[cnt] = #0) then
		156	SetLength(Result, cnt - 1); // remove trailing nul
		157	end//;
		158	else
		159	RaiseLastOSError;
		160	end;
		161
		162	function WideToUTF8(const astr: TZMWideString; len: integer = -1): UTF8String;
		163	begin
		164	if len < 0 then
		165	len := Length(astr);
		166	Result := PWideToUTF8(@astr[1], len);
		167	end;
		168
		169	function UTF8ToWide(const astr: UTF8String; len: integer = -1): TZMWideString;
		170	begin
		171	Result := '';
		172	if len < 0 then
		173	len := Length(astr);
		174	Result := PUTF8ToWideStr(PAnsiChar(astr), len);
		175	end;
		176
		177	function UTF8ToStr(const astr: UTF8String): String;
		178	begin
		179	Result := PUTF8ToStr(PAnsiChar(astr), Length(astr));
		180	end;
		181
		182	function StrToUTF8(const ustr: string): UTF8String;
		183	var
		184	wtemp: TZMWideString;
		185	begin
		186	wtemp := ustr;
		187	Result := WideToUTF8(wtemp, -1);
		188	end;
		189
		190	function StrToUTFEx(const astr: AnsiString; cp: cardinal = 0; len: integer =
		191	-1): TZMString;
		192	var
		193	ws: TZMWideString;
		194	begin
		195	ws := StrToWideEx(astr, cp, len);
		196	{$IFDEF UNICODE}
		197	Result := ws;
		198	{$ELSE}
		199	Result := StrToUTF8(ws);
		200	{$ENDIF}
		201	end;
		202
		203	function IsUTF8Trail(c: AnsiChar): boolean;
		204	begin
		205	Result := (Ord(c) and $C0) = $80;
		206	end;
		207
		208	function UTF8SeqLen(c: AnsiChar): integer;
		209	var
		210	u8: cardinal;
		211	begin
		212	Result := 1;
		213	u8 := ord(c);
		214	if u8 >= $80 then
		215	begin
		216	if (u8 and $FE) = $FC then
		217	Result := 6
		218	else
		219	if (u8 and $FC) = $F8 then
		220	Result := 5
		221	else
		222	if (u8 and $F8) = $F0 then
		223	Result := 4
		224	else
		225	if (u8 and $F0) = $E0 then
		226	Result := 3
		227	else
		228	if (u8 and $E0) = $C0 then
		229	Result := 2
		230	else
		231	Result := -1; // trailing byte - invalid
		232	end;
		233	end;
		234
		235	// test for valid UTF8 character(s) > 0 _ some, 0 _ none, < 0 _ invalid
		236	function ValidUTF8(const str: AnsiString; len: integer): integer;
		237	var
		238	i, j, ul: integer;
		239	begin
		240	if len < 0 then
		241	len := Length(str);
		242	Result := 0;
		243	i := 1;
		244	while (i <= len) do
		245	begin
		246	ul := UTF8SeqLen(str[i]);
		247	inc(i);
		248	if ul <> 1 then
		249	begin
		250	if (ul < 1) or ((i + ul -2) > len) then
		251	begin
		252	Result := -1; // invalid
		253	break;
		254	end;
		255	// first in seq
		256	for j := 0 to ul -2 do
		257	begin
		258	if (ord(str[i]) and $C0) <> $80 then
		259	begin
		260	result := -1;
		261	break;
		262	end;
		263	inc(i);
		264	end;
		265	if Result >= 0 then
		266	inc(Result) // was valid so count it
		267	else
		268	break;
		269	end;
		270	end;
		271	end;
		272
		273	// test for valid UTF8 character(s) > 0 _ some, 0 _ none, < 0 _ invalid
		274	function ValidUTF8(pstr: PAnsiChar; len: integer): integer;
		275	var
		276	// i,
		277	j, ul: integer;
		278	begin
		279	// if len < 0 then
		280	// len := Length(str);
		281	Result := 0;
		282	// i := 1;
		283	while (len > 0) do
		284	begin
		285	ul := UTF8SeqLen(pstr^);
		286	inc(pstr);
		287	Dec(len);
		288	if ul <> 1 then
		289	begin
		290	// if (ul < 1) or ((i + ul -2) > len) then
		291	if (ul < 1) or (( ul -1) > len) then
		292	begin
		293	Result := -1; // invalid
		294	break;
		295	end;
		296	// first in seq
		297	for j := 0 to ul -2 do
		298	begin
		299	if (ord(pstr^) and $C0) <> $80 then
		300	begin
		301	result := -1;
		302	break;
		303	end;
		304	inc(pstr);
		305	Dec(len);
		306	end;
		307	if Result >= 0 then
		308	inc(Result) // was valid so count it
		309	else
		310	break;
		311	end;
		312	end;
		313	end;
		314
		315	{$IFDEF UNICODE}
		316	function ValidUTF8(const str: UTF8String; len: integer = -1): integer;
		317	var
		318	i, j, ul: integer;
		319	begin
		320	if len < 0 then
		321	len := Length(str);
		322	Result := 0;
		323	i := 1;
		324	while (i <= len) do
		325	begin
		326	ul := UTF8SeqLen(str[i]);
		327	inc(i);
		328	if ul <> 1 then
		329	begin
		330	if (ul < 1) or ((i + ul -2) > len) then
		331	begin
		332	Result := -1; // invalid
		333	break;
		334	end;
		335	// first in seq
		336	for j := 0 to ul -2 do
		337	begin
		338	if (ord(str[i]) and $C0) <> $80 then
		339	begin
		340	result := -1;
		341	break;
		342	end;
		343	inc(i);
		344	end;
		345	if Result >= 0 then
		346	inc(Result) // was valid so count it
		347	else
		348	break;
		349	end;
		350	end;
		351	end;
		352	{$ENDIF}
		353
		354
		355	function AsUTF8Str(const zstr: TZMString): UTF8String;
		356	begin
		357	{$IFDEF UNICODE}
		358	Result := UTF8String(zstr);
		359	{$ELSE}
		360	if ValidUTF8(zstr, -1) < 0 then
		361	Result := StrToUTF8(zstr)
		362	else
		363	Result := zstr;
		364	{$ENDIF}
		365	end;
		366
		367
		368	function StrToWideEx(const astr: AnsiString; cp: cardinal; len: integer):
		369	TZMWideString;
		370	var
		371	cnt: integer;
		372	s: AnsiString;
		373	wcnt: integer;
		374	begin
		375	Result := '';
		376	if len < 0 then
		377	len := Length(astr);
		378	if len = 0 then
		379	exit;
		380	wcnt := MultiByteToWideChar(cp, 0, PAnsiChar(astr), len, nil, 0);
		381	if wcnt > 0 then
		382	begin
		383	SetLength(Result, wcnt);
		384	cnt := MultiByteToWideChar(cp, 0, PAnsiChar(astr), len,
		385	pWideChar(Result), wcnt);
		386	if cnt < 1 then
		387	Result := ''; // oops - something went wrong
		388	end
		389	else
		390	// RaiseLastOSError;
		391	begin
		392	s := astr; // assume it is Ansi
		393	if (len > 0) and (len < length(astr)) then
		394	SetLength(s, len);
		395	Result := String(s);
		396	end;
		397	end;
		398
		399	// convert to MultiByte escaping unsupported characters
		400	function WideToSafe(const wstr: TZMWideString; ToOEM: boolean): AnsiString;
		401	{$IFNDEF UNICODE}
		402	const WC_NO_BEST_FIT_CHARS = $00000400;
		403	{$endif}
		404
		405	var
		406	Bad: Bool;
		407	c: AnsiChar;
		408	cnt: Integer;
		409	i: Integer;
		410	pa: PAnsiChar;
		411	tmp: AnsiString;
		412	subst: array [0..1] of AnsiChar;
		413	toCP: cardinal;
		414	wc: WideChar;
		415	wlen: Integer;
		416	begin
		417	Result := '';
		418	if wstr <> '' then
		419	begin
		420	if ToOEM then
		421	toCP := CP_OEMCP
		422	else
		423	toCP := CP_ACP;
		424	subst[0] := #$1B; // substitute char - escape
		425	subst[1] := #0;
		426	cnt := WideCharToMultiByte(ToCP, WC_NO_BEST_FIT_CHARS, PWideChar(wstr),
		427	Length(wstr), nil, 0, PAnsiChar(@subst), @Bad);
		428	if cnt > 0 then
		429	begin
		430	SetLength(Result, cnt);
		431	cnt := WideCharToMultiByte(ToCP, WC_NO_BEST_FIT_CHARS, PWideChar(wstr),
		432	Length(wstr), PAnsiChar(Result), cnt, PAnsiChar(@subst), @Bad);
		433	if cnt < 1 then
		434	Result := ''; // oops - something went wrong
		435	end;
		436	if Bad then
		437	begin
		438	tmp := Result;
		439	Result := '';
		440	pa := PAnsiChar(tmp);
		441	i := 1;
		442	wc := #0;
		443	wlen := Length(wstr);
		444	while (pa^ <> #0) do
		445	begin
		446	c := pa^;
		447	if i < wlen then
		448	begin
		449	wc := wstr[i];
		450	inc(i);
		451	end;
		452	if c = #$1B then
		453	Result := Result + '#$' + AnsiString(IntToHex(Ord(wc), 4))
		454	else
		455	Result := Result + c;
		456	pa := CharNextExA(toCP, pa, 0);
		457	end;
		458	end;
		459	end;
		460	end;
		461
		462	{$IFNDEF UNICODE}
		463	function UTF8ToSafe(const ustr: AnsiString; ToOEM: boolean): AnsiString;
		464	begin
		465	Result := WideToSafe(UTF8ToWide(ustr), ToOEM);
		466	end;
		467	{$ENDIF}
		468
		469	{$IFNDEF UNICODE}
		470	function PUTF8ToSafe(const raw: PAnsiChar; len: integer): AnsiString;
		471	begin
		472	Result := WideToSafe(PUTF8ToWideStr(raw, len), false);
		473	end;
		474	{$ENDIF}
		475
		476	// test all characters are supported
		477	function WideIsSafe(const wstr: TZMWideString; ToOEM: boolean): Boolean;
		478	var
		479	Bad: Bool;
		480	cnt: Integer;
		481	toCP: cardinal;
		482	begin
		483	Result := true;
		484	if ToOEM then
		485	toCP := CP_OEMCP
		486	else
		487	toCP := CP_ACP;
		488	if wstr <> '' then
		489	begin
		490	cnt := WideCharToMultiByte(toCP, 0, PWideChar(wstr), Length(wstr),
		491	nil, 0, nil, @Bad);
		492	Result := (not Bad) and (cnt > 0);
		493	end;
		494	end;
		495
		496	{$IFNDEF UNICODE}
		497	function UTF8IsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
		498	begin
		499	Result := WideIsSafe(UTF8ToWide(ustr), ToOEM);
		500	end;
		501	{$ENDIF}
		502
		503	{$IFNDEF UNICODE}
		504	// only applicable when converting to OEM
		505	function AnsiIsSafe(const ustr: AnsiString; ToOEM: boolean): Boolean;
		506	begin
		507	Result := True;
		508	if ToOEM then
		509	Result := WideIsSafe(WideString(ustr), ToOEM);
		510	end;
		511	{$ENDIF}
		512
		513
		514	end.
		515

Subversion Repositories autosfx

autosfx/zmstr1900102/ZMUTF819.pas @ 7 – Rev 1