Commit b26b5a8f by Robert Dewar Committed by Arnaud Charlet

g-byorma.adb, [...]: New files.

2007-12-06  Robert Dewar  <dewar@adacore.com>

	* g-byorma.adb, g-byorma.ads, g-decstr.adb, g-decstr.ads,
	g-deutst.ads, g-encstr.adb, g-encstr.ads, g-enutst.ads: New files.

	* scn.adb: Implement BOM recognition

From-SVN: r130849
parent 150bbaff
------------------------------------------------------------------------------
-- --
-- GNAT COMPILER COMPONENTS --
-- --
-- G N A T . B Y T E _ O R D E R _ M A R K --
-- --
-- B o d y --
-- --
-- Copyright (C) 2006-2007, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
-- Boston, MA 02110-1301, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
package body GNAT.Byte_Order_Mark is
--------------
-- Read_BOM --
--------------
procedure Read_BOM
(Str : String;
Len : out Natural;
BOM : out BOM_Kind;
XML_Support : Boolean := False)
is
begin
-- UTF-16 (big-endian)
if Str'Length >= 2
and then Str (Str'First) = Character'Val (16#FE#)
and then Str (Str'First + 1) = Character'Val (16#FF#)
then
Len := 2;
BOM := UTF16_BE;
-- UTF-16 (little-endian)
elsif Str'Length >= 2
and then Str (Str'First) = Character'Val (16#FF#)
and then Str (Str'First + 1) = Character'Val (16#FE#)
then
Len := 2;
BOM := UTF16_LE;
-- UTF-32 (big-endian)
elsif Str'Length >= 4
and then Str (Str'First) = Character'Val (16#00#)
and then Str (Str'First + 1) = Character'Val (16#00#)
and then Str (Str'First + 2) = Character'Val (16#FE#)
and then Str (Str'First + 3) = Character'Val (16#FF#)
then
Len := 4;
BOM := UTF32_BE;
-- UTF-32 (little-endian)
elsif Str'Length >= 4
and then Str (Str'First) = Character'Val (16#FF#)
and then Str (Str'First + 1) = Character'Val (16#FE#)
and then Str (Str'First + 2) = Character'Val (16#00#)
and then Str (Str'First + 3) = Character'Val (16#00#)
then
Len := 4;
BOM := UTF32_LE;
-- UTF-8 (endian-independent)
elsif Str'Length >= 3
and then Str (Str'First) = Character'Val (16#EF#)
and then Str (Str'First + 1) = Character'Val (16#BB#)
and then Str (Str'First + 2) = Character'Val (16#BF#)
then
Len := 3;
BOM := UTF8_All;
-- UCS-4 (big-endian) XML only
elsif XML_Support
and then Str'Length >= 4
and then Str (Str'First) = Character'Val (16#00#)
and then Str (Str'First + 1) = Character'Val (16#00#)
and then Str (Str'First + 2) = Character'Val (16#00#)
and then Str (Str'First + 3) = Character'Val (16#3C#)
then
Len := 0;
BOM := UCS4_BE;
-- UCS-4 (little-endian) XML case
elsif XML_Support
and then Str'Length >= 4
and then Str (Str'First) = Character'Val (16#3C#)
and then Str (Str'First + 1) = Character'Val (16#00#)
and then Str (Str'First + 2) = Character'Val (16#00#)
and then Str (Str'First + 3) = Character'Val (16#00#)
then
Len := 0;
BOM := UCS4_LE;
-- UCS-4 (unusual byte order 2143) XML case
elsif XML_Support
and then Str'Length >= 4
and then Str (Str'First) = Character'Val (16#00#)
and then Str (Str'First + 1) = Character'Val (16#00#)
and then Str (Str'First + 2) = Character'Val (16#3C#)
and then Str (Str'First + 3) = Character'Val (16#00#)
then
Len := 0;
BOM := UCS4_2143;
-- UCS-4 (unusual byte order 3412) XML case
elsif XML_Support
and then Str'Length >= 4
and then Str (Str'First) = Character'Val (16#00#)
and then Str (Str'First + 1) = Character'Val (16#3C#)
and then Str (Str'First + 2) = Character'Val (16#00#)
and then Str (Str'First + 3) = Character'Val (16#00#)
then
Len := 0;
BOM := UCS4_3412;
-- UTF-16 (big-endian) XML case
elsif XML_Support
and then Str'Length >= 4
and then Str (Str'First) = Character'Val (16#00#)
and then Str (Str'First + 1) = Character'Val (16#3C#)
and then Str (Str'First + 2) = Character'Val (16#00#)
and then Str (Str'First + 3) = Character'Val (16#3F#)
then
Len := 0;
BOM := UTF16_BE;
-- UTF-32 (little-endian) XML case
elsif XML_Support
and then Str'Length >= 4
and then Str (Str'First) = Character'Val (16#3C#)
and then Str (Str'First + 1) = Character'Val (16#00#)
and then Str (Str'First + 2) = Character'Val (16#3F#)
and then Str (Str'First + 3) = Character'Val (16#00#)
then
Len := 0;
BOM := UTF16_LE;
-- Unrecognized special encodings XML only
elsif XML_Support
and then Str'Length >= 4
and then Str (Str'First) = Character'Val (16#3C#)
and then Str (Str'First + 1) = Character'Val (16#3F#)
and then Str (Str'First + 2) = Character'Val (16#78#)
and then Str (Str'First + 3) = Character'Val (16#6D#)
then
-- Utf8, ASCII, some part of ISO8859, Shift-JIS, EUC,...
Len := 0;
BOM := Unknown;
-- No BOM recognized
else
Len := 0;
BOM := Unknown;
end if;
end Read_BOM;
end GNAT.Byte_Order_Mark;
------------------------------------------------------------------------------
-- --
-- GNAT COMPILER COMPONENTS --
-- --
-- G N A T . B Y T E _ O R D E R _ M A R K --
-- --
-- S p e c --
-- --
-- Copyright (C) 2006-2007, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
-- Boston, MA 02110-1301, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
-- This package provides a procedure for reading and interpreting the BOM
-- (byte order mark) used to publish the encoding method for a string (for
-- example, a UTF-8 encoded file in windows will start with the appropriate
-- BOM sequence to signal UTF-8 encoding.
-- There are two cases
-- Case 1. UTF encodings for Unicode files
-- Here the convention is to have the first character of the file be a
-- non-breaking zero width space character (16#0000_FEFF#). For the UTF
-- encodings, the representation of this character can be used to uniquely
-- determine the encoding. Furthermore, the possibility of any confusion
-- with unencoded files is minimal, since for example the UTF-8 encoding
-- of this character looks like the sequence:
-- LC_I_Diaeresis
-- Right_Angle_Quotation
-- Fraction_One_Half
-- which is so unlikely to occur legitimately in normal use that it can
-- safely be ignored in most cases (for example, no legitimate Ada source
-- file could start with this sequence of characters).
-- Case 2. Specialized XML encodings
-- The XML standard defines a number of other possible encodings and also
-- defines standardized sequences for marking these encodings. This package
-- can also optionally handle these XML defined BOM sequences. These XML
-- cases depend on the first character of the XML file being < so that the
-- encoding of this character can be recognized.
pragma Warnings (Off);
pragma Compiler_Unit;
pragma Warnings (On);
package GNAT.Byte_Order_Mark is
type BOM_Kind is
(UTF8_All, -- UTF8-encoding
UTF16_LE, -- UTF16 little-endian encoding
UTF16_BE, -- UTF16 big-endian encoding
UTF32_LE, -- UTF32 little-endian encoding
UTF32_BE, -- UTF32 big-endian encoding
-- The following cases are for XML only
UCS4_BE, -- UCS-4, big endian machine (1234 order)
UCS4_LE, -- UCS-4, little endian machine (4321 order)
UCS4_2143, -- UCS-4, unusual byte order (2143 order)
UCS4_3412, -- UCS-4, unusual byte order (3412 order)
-- Value returned if no BOM recognized
Unknown); -- Unknown, assumed to be ASCII compatible
procedure Read_BOM
(Str : String;
Len : out Natural;
BOM : out BOM_Kind;
XML_Support : Boolean := False);
-- This is the routine to read the BOM from the start of the given string
-- Str. On return BOM is set to the appropriate BOM_Kind and Len is set to
-- its length. The caller will typically skip the first Len characters in
-- the string to ignore the BOM sequence. The special XML possibilities are
-- recognized only if flag XML_Support is set to True. Note that for the
-- XML cases, Len is always set to zero on return (not to the length of the
-- relevant sequence) since in the XML cases, the sequence recognized is
-- for the first real character in the file (<) which is not to be skipped.
end GNAT.Byte_Order_Mark;
------------------------------------------------------------------------------
-- --
-- GNAT RUN-TIME COMPONENTS --
-- --
-- G N A T . D E C O D E _ S T R I N G --
-- --
-- S p e c --
-- --
-- Copyright (C) 2007, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
-- Boston, MA 02110-1301, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
-- This package provides a utility routine for converting from an encoded
-- string to a corresponding Wide_String or Wide_Wide_String value.
with Interfaces; use Interfaces;
with System.WCh_Cnv; use System.WCh_Cnv;
with System.WCh_Con; use System.WCh_Con;
package body GNAT.Decode_String is
-----------------------
-- Local Subprograms --
-----------------------
procedure Bad;
pragma No_Return (Bad);
-- Raise error for bad encoding
procedure Past_End;
pragma No_Return (Past_End);
-- Raise error for off end of string
---------
-- Bad --
---------
procedure Bad is
begin
raise Constraint_Error with
"bad encoding or character out of range";
end Bad;
---------------------------
-- Decode_Wide_Character --
---------------------------
procedure Decode_Wide_Character
(Input : String;
Ptr : in out Natural;
Result : out Wide_Character)
is
Char : Wide_Wide_Character;
begin
Decode_Wide_Wide_Character (Input, Ptr, Char);
if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
Bad;
else
Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
end if;
end Decode_Wide_Character;
------------------------
-- Decode_Wide_String --
------------------------
function Decode_Wide_String (S : String) return Wide_String is
Result : Wide_String (1 .. S'Length);
Length : Natural;
begin
Decode_Wide_String (S, Result, Length);
return Result (1 .. Length);
end Decode_Wide_String;
procedure Decode_Wide_String
(S : String;
Result : out Wide_String;
Length : out Natural)
is
Ptr : Natural;
begin
Ptr := S'First;
Length := 0;
while Ptr <= S'Last loop
if Length >= Result'Last then
Past_End;
end if;
Length := Length + 1;
Decode_Wide_Character (S, Ptr, Result (Length));
end loop;
end Decode_Wide_String;
--------------------------------
-- Decode_Wide_Wide_Character --
--------------------------------
procedure Decode_Wide_Wide_Character
(Input : String;
Ptr : in out Natural;
Result : out Wide_Wide_Character)
is
C : Character;
function In_Char return Character;
pragma Inline (In_Char);
-- Function to get one input character
-------------
-- In_Char --
-------------
function In_Char return Character is
begin
if Ptr <= Input'Last then
Ptr := Ptr + 1;
return Input (Ptr - 1);
else
Past_End;
end if;
end In_Char;
-- Start of processing for Decode_Wide_Wide_Character
begin
C := In_Char;
-- Special fast processing for UTF-8 case
if Encoding_Method = WCEM_UTF8 then
UTF8 : declare
U : Unsigned_32;
W : Unsigned_32;
procedure Get_UTF_Byte;
pragma Inline (Get_UTF_Byte);
-- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
-- Reads a byte, and raises CE if the first two bits are not 10.
-- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
------------------
-- Get_UTF_Byte --
------------------
procedure Get_UTF_Byte is
begin
U := Unsigned_32 (Character'Pos (In_Char));
if (U and 2#11000000#) /= 2#10_000000# then
Bad;
end if;
W := Shift_Left (W, 6) or (U and 2#00111111#);
end Get_UTF_Byte;
-- Start of processing for UTF8 case
begin
-- Note: for details of UTF8 encoding see RFC 3629
U := Unsigned_32 (Character'Pos (C));
-- 16#00_0000#-16#00_007F#: 0xxxxxxx
if (U and 2#10000000#) = 2#00000000# then
Result := Wide_Wide_Character'Val (Character'Pos (C));
-- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
elsif (U and 2#11100000#) = 2#110_00000# then
W := U and 2#00011111#;
Get_UTF_Byte;
Result := Wide_Wide_Character'Val (W);
-- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
elsif (U and 2#11110000#) = 2#1110_0000# then
W := U and 2#00001111#;
Get_UTF_Byte;
Get_UTF_Byte;
Result := Wide_Wide_Character'Val (W);
-- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
elsif (U and 2#11111000#) = 2#11110_000# then
W := U and 2#00000111#;
for K in 1 .. 3 loop
Get_UTF_Byte;
end loop;
Result := Wide_Wide_Character'Val (W);
-- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx
elsif (U and 2#11111100#) = 2#111110_00# then
W := U and 2#00000011#;
for K in 1 .. 4 loop
Get_UTF_Byte;
end loop;
Result := Wide_Wide_Character'Val (W);
-- All other cases are invalid, note that this includes:
-- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx 10xxxxxx
-- since Wide_Wide_Character does not include code values
-- greater than 16#03FF_FFFF#.
else
Bad;
end if;
end UTF8;
-- All encoding functions other than UTF-8
else
Non_UTF8 : declare
function Char_Sequence_To_UTF is
new Char_Sequence_To_UTF_32 (In_Char);
begin
-- For brackets, must test for specific case of [ not followed by
-- quotation, where we must not call Char_Sequence_To_UTF, but
-- instead just return the bracket unchanged.
if Encoding_Method = WCEM_Brackets
and then C = '['
and then (Ptr > Input'Last or else Input (Ptr) /= '"')
then
Result := '[';
-- All other cases including [" with Brackets
else
Result :=
Wide_Wide_Character'Val
(Char_Sequence_To_UTF (C, Encoding_Method));
end if;
end Non_UTF8;
end if;
end Decode_Wide_Wide_Character;
-----------------------------
-- Decode_Wide_Wide_String --
-----------------------------
function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
Result : Wide_Wide_String (1 .. S'Length);
Length : Natural;
begin
Decode_Wide_Wide_String (S, Result, Length);
return Result (1 .. Length);
end Decode_Wide_Wide_String;
procedure Decode_Wide_Wide_String
(S : String;
Result : out Wide_Wide_String;
Length : out Natural)
is
Ptr : Natural;
begin
Ptr := S'First;
Length := 0;
while Ptr <= S'Last loop
if Length >= Result'Last then
Past_End;
end if;
Length := Length + 1;
Decode_Wide_Wide_Character (S, Ptr, Result (Length));
end loop;
end Decode_Wide_Wide_String;
-------------------------
-- Next_Wide_Character --
-------------------------
procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
begin
if Ptr < Input'First then
Past_End;
end if;
-- Special efficient encoding for UTF-8 case
if Encoding_Method = WCEM_UTF8 then
UTF8 : declare
U : Unsigned_32;
procedure Getc;
pragma Inline (Getc);
-- Gets the character at Input (Ptr) and returns code in U as
-- Unsigned_32 value. On return Ptr is bumped past the character.
procedure Skip_UTF_Byte;
pragma Inline (Skip_UTF_Byte);
-- Skips past one encoded byte which must be 2#10xxxxxx#
----------
-- Getc --
----------
procedure Getc is
begin
if Ptr > Input'Last then
Past_End;
else
U := Unsigned_32 (Character'Pos (Input (Ptr)));
Ptr := Ptr + 1;
end if;
end Getc;
-------------------
-- Skip_UTF_Byte --
-------------------
procedure Skip_UTF_Byte is
begin
Getc;
if (U and 2#11000000#) /= 2#10_000000# then
Bad;
end if;
end Skip_UTF_Byte;
-- Start of processing for UTF-8 case
begin
-- 16#00_0000#-16#00_007F#: 0xxxxxxx
Getc;
if (U and 2#10000000#) = 2#00000000# then
return;
-- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
elsif (U and 2#11100000#) = 2#110_00000# then
Skip_UTF_Byte;
-- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
elsif (U and 2#11110000#) = 2#1110_0000# then
Skip_UTF_Byte;
Skip_UTF_Byte;
-- Any other code is invalid, note that this includes:
-- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx
-- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx 10xxxxxx
-- since Wide_Character does not allow codes > 16#FFFF#
else
Bad;
end if;
end UTF8;
-- Non-UTF-8 cass
else
declare
Discard : Wide_Character;
begin
Decode_Wide_Character (Input, Ptr, Discard);
end;
end if;
end Next_Wide_Character;
------------------------------
-- Next_Wide_Wide_Character --
------------------------------
procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
begin
-- Special efficient encoding for UTF-8 case
if Encoding_Method = WCEM_UTF8 then
UTF8 : declare
U : Unsigned_32;
procedure Getc;
pragma Inline (Getc);
-- Gets the character at Input (Ptr) and returns code in U as
-- Unsigned_32 value. On return Ptr is bumped past the character.
procedure Skip_UTF_Byte;
pragma Inline (Skip_UTF_Byte);
-- Skips past one encoded byte which must be 2#10xxxxxx#
----------
-- Getc --
----------
procedure Getc is
begin
if Ptr > Input'Last then
Past_End;
else
U := Unsigned_32 (Character'Pos (Input (Ptr)));
Ptr := Ptr + 1;
end if;
end Getc;
-------------------
-- Skip_UTF_Byte --
-------------------
procedure Skip_UTF_Byte is
begin
Getc;
if (U and 2#11000000#) /= 2#10_000000# then
Bad;
end if;
end Skip_UTF_Byte;
-- Start of processing for UTF-8 case
begin
if Ptr < Input'First then
Past_End;
end if;
-- 16#00_0000#-16#00_007F#: 0xxxxxxx
Getc;
if (U and 2#10000000#) = 2#00000000# then
null;
-- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
elsif (U and 2#11100000#) = 2#110_00000# then
Skip_UTF_Byte;
-- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
elsif (U and 2#11110000#) = 2#1110_0000# then
Skip_UTF_Byte;
Skip_UTF_Byte;
-- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
elsif (U and 2#11111000#) = 2#11110_000# then
for K in 1 .. 3 loop
Skip_UTF_Byte;
end loop;
-- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx
elsif (U and 2#11111100#) = 2#111110_00# then
for K in 1 .. 4 loop
Skip_UTF_Byte;
end loop;
-- Any other code is invalid, note that this includes:
-- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx 10xxxxxx
-- since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
else
Bad;
end if;
end UTF8;
-- Non-UTF-8 cass
else
declare
Discard : Wide_Wide_Character;
begin
Decode_Wide_Wide_Character (Input, Ptr, Discard);
end;
end if;
end Next_Wide_Wide_Character;
--------------
-- Past_End --
--------------
procedure Past_End is
begin
raise Constraint_Error with "past end of string";
end Past_End;
-------------------------
-- Prev_Wide_Character --
-------------------------
procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
begin
if Ptr > Input'Last + 1 then
Past_End;
end if;
-- Special efficient encoding for UTF-8 case
if Encoding_Method = WCEM_UTF8 then
UTF8 : declare
U : Unsigned_32;
procedure Getc;
pragma Inline (Getc);
-- Gets the character at Input (Ptr - 1) and returns code in U as
-- Unsigned_32 value. On return Ptr is decremented by one.
procedure Skip_UTF_Byte;
pragma Inline (Skip_UTF_Byte);
-- Checks that U is 2#10xxxxxx# and then calls Get
----------
-- Getc --
----------
procedure Getc is
begin
if Ptr <= Input'First then
Past_End;
else
Ptr := Ptr - 1;
U := Unsigned_32 (Character'Pos (Input (Ptr)));
end if;
end Getc;
-------------------
-- Skip_UTF_Byte --
-------------------
procedure Skip_UTF_Byte is
begin
if (U and 2#11000000#) = 2#10_000000# then
Getc;
else
Bad;
end if;
end Skip_UTF_Byte;
-- Start of processing for UTF-8 case
begin
-- 16#00_0000#-16#00_007F#: 0xxxxxxx
Getc;
if (U and 2#10000000#) = 2#00000000# then
return;
-- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
else
Skip_UTF_Byte;
if (U and 2#11100000#) = 2#110_00000# then
return;
-- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
else
Skip_UTF_Byte;
if (U and 2#11110000#) = 2#1110_0000# then
return;
-- Any other code is invalid, note that this includes:
-- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
-- 10xxxxxx
-- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
-- 10xxxxxx 10xxxxxx
-- 10xxxxxx
-- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
-- 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx
-- since Wide_Character does not allow codes > 16#FFFF#
else
Bad;
end if;
end if;
end if;
end UTF8;
-- Special efficient encoding for brackets case
elsif Encoding_Method = WCEM_Brackets then
Brackets : declare
P : Natural;
S : Natural;
begin
-- See if we have "] at end positions
if Ptr > Input'First + 1
and then Input (Ptr - 1) = ']'
and then Input (Ptr - 2) = '"'
then
P := Ptr - 2;
-- Loop back looking for [" at start
while P >= Ptr - 10 loop
if P <= Input'First + 1 then
Bad;
elsif Input (P - 1) = '"'
and then Input (P - 2) = '['
then
-- Found ["..."], scan forward to check it
S := P - 2;
P := S;
Next_Wide_Character (Input, P);
-- OK if at original pointer, else error
if P = Ptr then
Ptr := S;
return;
else
Bad;
end if;
end if;
P := P - 1;
end loop;
-- Falling through loop means more than 8 chars between the
-- enclosing brackets (or simply a missing left bracket)
Bad;
-- Here if no bracket sequence present
else
if Ptr = Input'First then
Past_End;
else
Ptr := Ptr - 1;
end if;
end if;
end Brackets;
-- Non-UTF-8/Brackets. These are the inefficient cases where we have to
-- go to the start of the string and skip forwards till Ptr matches.
else
Non_UTF_Brackets : declare
Discard : Wide_Character;
PtrS : Natural;
PtrP : Natural;
begin
PtrS := Input'First;
if Ptr <= PtrS then
Past_End;
end if;
loop
PtrP := PtrS;
Decode_Wide_Character (Input, PtrS, Discard);
if PtrS = Ptr then
Ptr := PtrP;
return;
elsif PtrS > Ptr then
Bad;
end if;
end loop;
exception
when Constraint_Error =>
Bad;
end Non_UTF_Brackets;
end if;
end Prev_Wide_Character;
------------------------------
-- Prev_Wide_Wide_Character --
------------------------------
procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
begin
if Ptr > Input'Last + 1 then
Past_End;
end if;
-- Special efficient encoding for UTF-8 case
if Encoding_Method = WCEM_UTF8 then
UTF8 : declare
U : Unsigned_32;
procedure Getc;
pragma Inline (Getc);
-- Gets the character at Input (Ptr - 1) and returns code in U as
-- Unsigned_32 value. On return Ptr is decremented by one.
procedure Skip_UTF_Byte;
pragma Inline (Skip_UTF_Byte);
-- Checks that U is 2#10xxxxxx# and then calls Get
----------
-- Getc --
----------
procedure Getc is
begin
if Ptr <= Input'First then
Past_End;
else
Ptr := Ptr - 1;
U := Unsigned_32 (Character'Pos (Input (Ptr)));
end if;
end Getc;
-------------------
-- Skip_UTF_Byte --
-------------------
procedure Skip_UTF_Byte is
begin
if (U and 2#11000000#) = 2#10_000000# then
Getc;
else
Bad;
end if;
end Skip_UTF_Byte;
-- Start of processing for UTF-8 case
begin
-- 16#00_0000#-16#00_007F#: 0xxxxxxx
Getc;
if (U and 2#10000000#) = 2#00000000# then
return;
-- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
else
Skip_UTF_Byte;
if (U and 2#11100000#) = 2#110_00000# then
return;
-- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
else
Skip_UTF_Byte;
if (U and 2#11110000#) = 2#1110_0000# then
return;
-- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
-- 10xxxxxx
else
Skip_UTF_Byte;
if (U and 2#11111000#) = 2#11110_000# then
return;
-- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
-- 10xxxxxx 10xxxxxx
-- 10xxxxxx
else
Skip_UTF_Byte;
if (U and 2#11111100#) = 2#111110_00# then
return;
-- Any other code is invalid, note that this includes:
-- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
-- 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx
-- since Wide_Wide_Character does not allow codes
-- greater than 16#03FF_FFFF#
else
Bad;
end if;
end if;
end if;
end if;
end if;
end UTF8;
-- Special efficient encoding for brackets case
elsif Encoding_Method = WCEM_Brackets then
Brackets : declare
P : Natural;
S : Natural;
begin
-- See if we have "] at end positions
if Ptr > Input'First + 1
and then Input (Ptr - 1) = ']'
and then Input (Ptr - 2) = '"'
then
P := Ptr - 2;
-- Loop back looking for [" at start
while P >= Ptr - 10 loop
if P <= Input'First + 1 then
Bad;
elsif Input (P - 1) = '"'
and then Input (P - 2) = '['
then
-- Found ["..."], scan forward to check it
S := P - 2;
P := S;
Next_Wide_Wide_Character (Input, P);
-- OK if at original pointer, else error
if P = Ptr then
Ptr := S;
return;
else
Bad;
end if;
end if;
P := P - 1;
end loop;
-- Falling through loop means more than 8 chars between the
-- enclosing brackets (or simply a missing left bracket)
Bad;
-- Here if no bracket sequence present
else
if Ptr = Input'First then
Past_End;
else
Ptr := Ptr - 1;
end if;
end if;
end Brackets;
-- Non-UTF-8/Brackets. These are the inefficient cases where we have to
-- go to the start of the string and skip forwards till Ptr matches.
else
Non_UTF8_Brackets : declare
Discard : Wide_Wide_Character;
PtrS : Natural;
PtrP : Natural;
begin
PtrS := Input'First;
if Ptr <= PtrS then
Past_End;
end if;
loop
PtrP := PtrS;
Decode_Wide_Wide_Character (Input, PtrS, Discard);
if PtrS = Ptr then
Ptr := PtrP;
return;
elsif PtrS > Ptr then
Bad;
end if;
end loop;
exception
when Constraint_Error =>
Bad;
end Non_UTF8_Brackets;
end if;
end Prev_Wide_Wide_Character;
--------------------------
-- Validate_Wide_String --
--------------------------
function Validate_Wide_String (S : String) return Boolean is
Ptr : Natural;
begin
Ptr := S'First;
while Ptr <= S'Last loop
Next_Wide_Character (S, Ptr);
end loop;
return True;
exception
when Constraint_Error =>
return False;
end Validate_Wide_String;
-------------------------------
-- Validate_Wide_Wide_String --
-------------------------------
function Validate_Wide_Wide_String (S : String) return Boolean is
Ptr : Natural;
begin
Ptr := S'First;
while Ptr <= S'Last loop
Next_Wide_Wide_Character (S, Ptr);
end loop;
return True;
exception
when Constraint_Error =>
return False;
end Validate_Wide_Wide_String;
end GNAT.Decode_String;
------------------------------------------------------------------------------
-- --
-- GNAT RUN-TIME COMPONENTS --
-- --
-- G N A T . D E C O D E _ S T R I N G --
-- --
-- S p e c --
-- --
-- Copyright (C) 2007, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
-- Boston, MA 02110-1301, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
-- This generic package provides utility routines for converting from an
-- encoded string to a corresponding Wide_String or Wide_Wide_String value
-- using a specified encoding convention, which is supplied as the generic
-- parameter. UTF-8 is handled especially efficiently, and if the encoding
-- method is known at compile time to be WCEM_UTF8, then the instantiation
-- is specialized to handle only the UTF-8 case and exclude code for the
-- other encoding methods. The package also provides positioning routines
-- for skipping encoded characters in either direction, and for validating
-- strings for correct encodings.
-- Note: this package is only about decoding sequences of 8-bit characters
-- into corresponding 16-bit Wide_String or 32-bit Wide_Wide_String values.
-- It knows nothing at all about the character encodings being used for the
-- resulting Wide_Character and Wide_Wide_Character values. Most often this
-- will be Unicode/ISO-10646 as specified by the Ada RM, but this package
-- does not make any assumptions about the character coding. See also the
-- packages Ada.Wide_[Wide_]Characters.Unicode for unicode specific functions.
-- Note on the use of brackets encoding (WCEM_Brackets). The brackets encoding
-- method is ambiguous in the context of this package, since there is no way
-- to tell if ["1234"] is eight unencoded characters or one encoded character.
-- In the context of Ada sources, any sequence starting [" must be the start
-- of an encoding (since that sequence is not valid in Ada source otherwise).
-- The routines in this package use the same approach. If the input string
-- contains the sequence [" then this is assumed to be the start of a brackets
-- encoding sequence, and if it does not match the syntax, an error is raised.
-- In the case of the Prev functions, a sequence ending with "] is assumed to
-- be a valid brackets sequence, and an error is raised if it is not.
with System.WCh_Con;
generic
Encoding_Method : System.WCh_Con.WC_Encoding_Method;
package GNAT.Decode_String is
pragma Pure;
function Decode_Wide_String (S : String) return Wide_String;
pragma Inline (Decode_Wide_String);
-- Decode the given String, which is encoded using the indicated coding
-- method, returning the corresponding decoded Wide_String value. If S
-- contains a character code that cannot be represented with the given
-- encoding, then Constraint_Error is raised.
procedure Decode_Wide_String
(S : String;
Result : out Wide_String;
Length : out Natural);
-- Similar to the above function except that the result is stored in the
-- given Wide_String variable Result, starting at Result (Result'First). On
-- return, Length is set to the number of characters stored in Result. The
-- caller must ensure that Result is long enough (an easy choice is to set
-- the length equal to the S'Length, since decoding can never increase the
-- string length). If the length of Result is insufficient Constraint_Error
-- will be raised.
function Decode_Wide_Wide_String (S : String) return Wide_Wide_String;
pragma Inline (Decode_Wide_Wide_String);
-- Same as above function but for Wide_Wide_String output
procedure Decode_Wide_Wide_String
(S : String;
Result : out Wide_Wide_String;
Length : out Natural);
-- Same as above procedure, but for Wide_Wide_String output
function Validate_Wide_String (S : String) return Boolean;
-- This function inspects the string S to determine if it contains only
-- valid encodings corresponding to Wide_Character values using the
-- given encoding. If a call to Decode_Wide_String (S) would return
-- without raising Constraint_Error, then Validate_Wide_String will
-- return True. If the call would have raised Constraint_Error, then
-- Validate_Wide_String will return False.
function Validate_Wide_Wide_String (S : String) return Boolean;
-- Similar to Validate_Wide_String, except that it succeeds if the string
-- contains only encodings corresponding to Wide_Wide_Character values.
procedure Decode_Wide_Character
(Input : String;
Ptr : in out Natural;
Result : out Wide_Character);
pragma Inline (Decode_Wide_Character);
-- This is a lower level procedure that decodes a single character using
-- the given encoding method. The encoded character is stored in Input,
-- starting at Input (Ptr). The resulting output character is stored in
-- Result, and on return Ptr is updated past the input character or
-- encoding sequence. Constraint_Error will be raised if the input has
-- has a character that cannot be represented using the given encoding,
-- or if Ptr is outside the bounds of the Input string.
procedure Decode_Wide_Wide_Character
(Input : String;
Ptr : in out Natural;
Result : out Wide_Wide_Character);
-- Same as above procedure but with Wide_Wide_Character input
procedure Next_Wide_Character (Input : String; Ptr : in out Natural);
-- This procedure examines the input string starting at Input (Ptr), and
-- advances Ptr past one character in the encoded string, so that on return
-- Ptr points to the next encoded character. Constraint_Error is raised if
-- an invalid encoding is encountered, or the end of the string is reached
-- or if Ptr is less than String'First on entry, or if the character
-- skipped is not a valid Wide_Character code. This call may be more
-- efficient than calling Decode_Wide_Character and discarding the result.
procedure Prev_Wide_Character (Input : String; Ptr : in out Natural);
-- This procedure is similar to Next_Encoded_Character except that it moves
-- backwards in the string, so that on return, Ptr is set to point to the
-- previous encoded character. Constraint_Error is raised if the start of
-- the string is encountered. It is valid for Ptr to be one past the end
-- of the string for this call (in which case on return it will point to
-- the last encoded character).
--
-- Note: it is not generally possible to do this function efficiently with
-- all encodings, the current implementation is only efficient for the case
-- of UTF-8 (Encoding_Method = WCEM_UTF8) and Brackets (Encoding_Method =
-- WCEM_Brackets). For all other encodings, we work by starting at the
-- beginning of the string and moving forward till Ptr is reached, which
-- is correct but slow.
procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural);
-- Similar to Next_Wide_Character except that codes skipped must be valid
-- Wide_Wide_Character codes.
procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural);
-- Similar to Prev_Wide_Character except that codes skipped must be valid
-- Wide_Wide_Character codes.
end GNAT.Decode_String;
------------------------------------------------------------------------------
-- --
-- GNAT RUN-TIME COMPONENTS --
-- --
-- G N A T . D E C O D E _ U T F 8 _ S T R I N G --
-- --
-- S p e c --
-- --
-- Copyright (C) 2007, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
-- Boston, MA 02110-1301, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
-- This package provides a pre-instantiation of GNAT.Decode_String for the
-- common case of UTF-8 encoding. As noted in the documentation of that
-- package, this UTF-8 instantiation is efficient and specialized so that
-- it has only the code for the UTF-8 case. See g-decstr.ads for full
-- documentation on this package.
with GNAT.Decode_String;
with System.WCh_Con;
package GNAT.Decode_UTF8_String is
new GNAT.Decode_String (System.WCh_Con.WCEM_UTF8);
------------------------------------------------------------------------------
-- --
-- GNAT RUN-TIME COMPONENTS --
-- --
-- G N A T . E N C O D E _ S T R I N G --
-- --
-- B o d y --
-- --
-- Copyright (C) 2007, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
-- Boston, MA 02110-1301, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
with Interfaces; use Interfaces;
with System.WCh_Con; use System.WCh_Con;
with System.WCh_Cnv; use System.WCh_Cnv;
package body GNAT.Encode_String is
-----------------------
-- Local Subprograms --
-----------------------
procedure Bad;
pragma No_Return (Bad);
-- Raise error for bad character code
procedure Past_End;
pragma No_Return (Past_End);
-- Raise error for off end of string
---------
-- Bad --
---------
procedure Bad is
begin
raise Constraint_Error with
"character cannot be encoded with given Encoding_Method";
end Bad;
------------------------
-- Encode_Wide_String --
------------------------
function Encode_Wide_String (S : Wide_String) return String is
Long : constant Natural := WC_Longest_Sequences (Encoding_Method);
Result : String (1 .. S'Length * Long);
Length : Natural;
begin
Encode_Wide_String (S, Result, Length);
return Result (1 .. Length);
end Encode_Wide_String;
procedure Encode_Wide_String
(S : Wide_String;
Result : out String;
Length : out Natural)
is
Ptr : Natural;
begin
Ptr := S'First;
for J in S'Range loop
Encode_Wide_Character (S (J), Result, Ptr);
end loop;
Length := Ptr - S'First;
end Encode_Wide_String;
-----------------------------
-- Encode_Wide_Wide_String --
-----------------------------
function Encode_Wide_Wide_String (S : Wide_Wide_String) return String is
Long : constant Natural := WC_Longest_Sequences (Encoding_Method);
Result : String (1 .. S'Length * Long);
Length : Natural;
begin
Encode_Wide_Wide_String (S, Result, Length);
return Result (1 .. Length);
end Encode_Wide_Wide_String;
procedure Encode_Wide_Wide_String
(S : Wide_Wide_String;
Result : out String;
Length : out Natural)
is
Ptr : Natural;
begin
Ptr := S'First;
for J in S'Range loop
Encode_Wide_Wide_Character (S (J), Result, Ptr);
end loop;
Length := Ptr - S'First;
end Encode_Wide_Wide_String;
---------------------------
-- Encode_Wide_Character --
---------------------------
procedure Encode_Wide_Character
(Char : Wide_Character;
Result : in out String;
Ptr : in out Natural)
is
begin
Encode_Wide_Wide_Character
(Wide_Wide_Character'Val (Wide_Character'Pos (Char)), Result, Ptr);
exception
when Constraint_Error =>
Bad;
end Encode_Wide_Character;
--------------------------------
-- Encode_Wide_Wide_Character --
--------------------------------
procedure Encode_Wide_Wide_Character
(Char : Wide_Wide_Character;
Result : in out String;
Ptr : in out Natural)
is
U : Unsigned_32;
procedure Out_Char (C : Character);
pragma Inline (Out_Char);
-- Procedure to store one character for instantiation below
--------------
-- Out_Char --
--------------
procedure Out_Char (C : Character) is
begin
if Ptr > Result'Last then
Past_End;
else
Result (Ptr) := C;
Ptr := Ptr + 1;
end if;
end Out_Char;
-- Start of processing for Encode_Wide_Wide_Character;
begin
-- Efficient code for UTF-8 case
if Encoding_Method = WCEM_UTF8 then
-- Note: for details of UTF8 encoding see RFC 3629
U := Unsigned_32 (Wide_Wide_Character'Pos (Char));
-- 16#00_0000#-16#00_007F#: 0xxxxxxx
if U <= 16#00_007F# then
Out_Char (Character'Val (U));
-- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
elsif U <= 16#00_07FF# then
Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6)));
Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
-- 16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
elsif U <= 16#00_FFFF# then
Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12)));
Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
and 2#00111111#)));
Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
-- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
elsif U <= 16#10_FFFF# then
Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18)));
Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
and 2#00111111#)));
Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
and 2#00111111#)));
Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
-- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx
elsif U <= 16#03FF_FFFF# then
Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24)));
Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
and 2#00111111#)));
Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
and 2#00111111#)));
Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
and 2#00111111#)));
Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
-- All other cases are invalid character codes, not this includes:
-- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx 10xxxxxx
-- since Wide_Wide_Character values cannot exceed 16#3F_FFFF#
else
Bad;
end if;
-- All encoding methods other than UTF-8
else
Non_UTF8 : declare
procedure UTF_32_To_String is
new UTF_32_To_Char_Sequence (Out_Char);
-- Instantiate conversion procedure with above Out_Char routine
begin
UTF_32_To_String
(UTF_32_Code (Wide_Wide_Character'Pos (Char)), Encoding_Method);
exception
when Constraint_Error =>
Bad;
end Non_UTF8;
end if;
end Encode_Wide_Wide_Character;
--------------
-- Past_End --
--------------
procedure Past_End is
begin
raise Constraint_Error with "past end of string";
end Past_End;
end GNAT.Encode_String;
------------------------------------------------------------------------------
-- --
-- GNAT RUN-TIME COMPONENTS --
-- --
-- G N A T . E N C O D E _ S T R I N G --
-- --
-- S p e c --
-- --
-- Copyright (C) 2007, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
-- Boston, MA 02110-1301, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
-- This generic package provides utility routines for converting from
-- Wide_String or Wide_Wide_String to encoded String using a specified
-- encoding convention, which is supplied as the generic parameter. If
-- this parameter is a known at compile time constant (e.g. a constant
-- definned in System.WCh_Con), the instantiation is specialized so that
-- it applies only to this specified coding.
-- Note: this package is only about encoding sequences of 16- or 32-bit
-- characters into a sequence of 8-bit codes. It knows nothing at all about
-- the character encodings being used for the input Wide_Character and
-- Wide_Wide_Character values, although some of the encoding methods (notably
-- JIS and EUC) have built in assumptions about the range of possible input
-- code values. Most often the input will be Unicode/ISO-10646 as specified by
-- the Ada RM, but this package does not make any assumptions about the
-- character coding, and in the case of UTF-8 all possible code values can be
-- encoded. See also the packages Ada.Wide_[Wide_]Characters.Unicode for
-- unicode specific functions.
-- Note on brackets encoding (WCEM_Brackets). On input, upper half characters
-- can be represented as ["hh"] but the routines in this package will only use
-- brackets encodings for codes higher than 16#FF#, so upper half characters
-- will be output as single Character values.
with System.WCh_Con;
generic
Encoding_Method : System.WCh_Con.WC_Encoding_Method;
package GNAT.Encode_String is
pragma Pure;
function Encode_Wide_String (S : Wide_String) return String;
pragma Inline (Encode_Wide_String);
-- Encode the given Wide_String, returning a String encoded using the
-- given encoding method. Constraint_Error will be raised if the encoding
-- method cannot accomodate the input data.
procedure Encode_Wide_String
(S : Wide_String;
Result : out String;
Length : out Natural);
-- Encode the given Wide_String, storing the encoded string in Result,
-- with Length being set to the length of the encoded string. The caller
-- must ensure that Result is long enough (see useful constants defined
-- in System.WCh_Con: WC_Longest_Sequence, WC_Longest_Sequences). If the
-- length of Result is insufficient Constraint_Error will be raised.
-- Constraint_Error will also be raised if the encoding method cannot
-- accomodate the input data.
function Encode_Wide_Wide_String (S : Wide_Wide_String) return String;
pragma Inline (Encode_Wide_Wide_String);
-- Same as above function but for Wide_Wide_String input
procedure Encode_Wide_Wide_String
(S : Wide_Wide_String;
Result : out String;
Length : out Natural);
-- Same as above procedure, but for Wide_Wide_String input
procedure Encode_Wide_Character
(Char : Wide_Character;
Result : in out String;
Ptr : in out Natural);
pragma Inline (Encode_Wide_Character);
-- This is a lower level procedure that encodes the single character Char.
-- The output is stored in Result starting at Result (Ptr), and Ptr is
-- updated past the stored value. Constraint_Error is raised if Result
-- is not long enough to accomodate the result, or if the encoding method
-- specified does not accomodate the input character value, or if Ptr is
-- outside the bounds of the Result string.
procedure Encode_Wide_Wide_Character
(Char : Wide_Wide_Character;
Result : in out String;
Ptr : in out Natural);
-- Same as above procedure but with Wide_Wide_Character input
end GNAT.Encode_String;
------------------------------------------------------------------------------
-- --
-- GNAT RUN-TIME COMPONENTS --
-- --
-- G N A T . E N C O D E _ U T F 8 _ S T R I N G --
-- --
-- S p e c --
-- --
-- Copyright (C) 2007, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
-- Boston, MA 02110-1301, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
-- This package provides a pre-instantiation of GNAT.Encode_String for the
-- common case of UTF-8 encoding. As noted in the documentation of that
-- package, this UTF-8 instantiation is efficient and specialized so that
-- it has only the code for the UTF-8 case. See g-encstr.ads for full
-- documentation on this package.
with GNAT.Encode_String;
with System.WCh_Con;
package GNAT.Encode_UTF8_String is
new GNAT.Encode_String (System.WCh_Con.WCEM_UTF8);
......@@ -28,6 +28,7 @@ with Csets; use Csets;
with Hostparm; use Hostparm;
with Namet; use Namet;
with Opt; use Opt;
with Output; use Output;
with Restrict; use Restrict;
with Rident; use Rident;
with Scans; use Scans;
......@@ -35,6 +36,10 @@ with Sinfo; use Sinfo;
with Sinput; use Sinput;
with Uintp; use Uintp;
with GNAT.Byte_Order_Mark; use GNAT.Byte_Order_Mark;
with System.WCh_Con; use System.WCh_Con;
package body Scn is
use ASCII;
......@@ -266,6 +271,42 @@ package body Scn is
Set_License (Current_Source_File, Determine_License);
end if;
-- Check for BOM
declare
BOM : BOM_Kind;
Len : Natural;
Tst : String (1 .. 5);
begin
for J in 1 .. 5 loop
Tst (J) := Source (Scan_Ptr + Source_Ptr (J) - 1);
end loop;
Read_BOM (Tst, Len, BOM, False);
case BOM is
when UTF8_All =>
Scan_Ptr := Scan_Ptr + Source_Ptr (Len);
Wide_Character_Encoding_Method := WCEM_UTF8;
Upper_Half_Encoding := True;
when UTF16_LE | UTF16_BE =>
Write_Line ("UTF-16 encoding format not recognized");
raise Unrecoverable_Error;
when UTF32_LE | UTF32_BE =>
Write_Line ("UTF-32 encoding format not recognized");
raise Unrecoverable_Error;
when Unknown =>
null;
when others =>
raise Program_Error;
end case;
end;
-- Because of the License stuff above, Scng.Initialize_Scanner cannot
-- call Scan. Scan initial token (note this initializes Prev_Token,
-- Prev_Token_Ptr).
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment