Commit b917101e by Robert Dewar Committed by Arnaud Charlet

a-textio.adb, [...]: Extensive changes to private part for wide character encoding

2007-12-06  Robert Dewar  <dewar@adacore.com>

	* a-textio.adb, a-textio.ads: 
	Extensive changes to private part for wide character encoding

	* a-witeio.ads, a-witeio.adb, a-ztexio.ads, a-ztexio.adb
	(Look_Ahead): Fix mishandling of encoded sequences
	Move declaration of Wch_Con to private part (should not be visible)

	* ali.adb (Scan_ALI): Set default encoding method to brackets instead of
	UTF-8. Probably this is never used, but if it is, brackets is
	clearly correct.

	* bindgen.adb (Get_WC_Encoding): New procedure to properly handle
	setting wide character encoding for no main program case and when
	encoding is specified using -W?
	Initialize stack limit of environment task if stack limit method of
	stack checking is enabled.
	(Gen_Adainit_Ada): Use Get_WC_Encoding to output encoding method
	(Gen_Adainit_C): Use Get_WC_Encoding to output encoding method
	(Get_Main_Unit_Name): New function.
	(Gen_Adainit_Ada): Add call to main program for .NET when needed.
	(Gen_Output_File): Set Bind_Main_Program to True for .NET

	* bindusg.adb: Add line for -Wx switch

	* s-wchcon.adb, s-wchcon.ads: (Is_Start_Of_Encoding): New function
	Add comments
	Add new useful constant WC_Longest_Sequences

	* switch-b.adb: Clean up handling of -Wx switch
	For -gnatWx, set Wide_Character_Encoding_Method_Specified

	* switch-c.adb: -gnatg activates warning on assertion errors
	For -gnatWx, set Wide_Character_Encoding_Method_Specified

	* s-wchcon.adb: (Is_Start_Of_Encoding): New function

From-SVN: r130817
parent a6e8413c
......@@ -45,6 +45,7 @@ with Ada.IO_Exceptions;
with Ada.Streams;
with System;
with System.File_Control_Block;
with System.WCh_Con;
package Ada.Text_IO is
pragma Elaborate_Body;
......@@ -334,6 +335,11 @@ private
-- Text_IO File Control Block --
--------------------------------
Default_WCEM : System.WCh_Con.WC_Encoding_Method :=
System.WCh_Con.WCEM_UTF8;
-- This gets modified during initialization (see body) using
-- the default value established in the call to Set_Globals.
package FCB renames System.File_Control_Block;
type Text_AFCB;
......@@ -366,6 +372,31 @@ private
-- after a LM-PM sequence when logically we are before the LM-PM. This
-- flag can only be set if Before_LM is also set.
WC_Method : System.WCh_Con.WC_Encoding_Method := Default_WCEM;
-- Encoding method to be used for this file. Text_IO does not deal with
-- wide characters, but it does deal with upper half characters in the
-- range 16#80#-16#FF# which may need encoding, e.g. in UTF-8 mode.
Before_Upper_Half_Character : Boolean := False;
-- This flag is set to indicate that an encoded upper half character has
-- been read by Text_IO.Look_Ahead. If it is set to True, then it means
-- that the stream is logically positioned before the character but is
-- physically positioned after it. The character involved must be in
-- the range 16#80#-16#FF#, i.e. if the flag is set, then we know the
-- next character has a code greater than 16#7F#, and the value of this
-- character is saved in Saved_Upper_Half_Character.
Saved_Upper_Half_Character : Character;
-- This field is valid only if Before_Upper_Half_Character is set. It
-- contains an upper-half character read by Look_Ahead. If Look_Ahead
-- reads a character in the range 16#00# to 16#7F#, then it can use
-- ungetc to put it back, but ungetc cannot be called more than once,
-- so for characters above this range, we don't try to back up the
-- file. Instead we save the character in this field and set the flag
-- Before_Upper_Half_Character to True to indicate that we are logically
-- positioned before this character even though the stream is physically
-- positioned after it.
end record;
function AFCB_Allocate (Control_Block : Text_AFCB) return FCB.AFCB_Ptr;
......
......@@ -31,7 +31,6 @@
-- --
------------------------------------------------------------------------------
with Ada.Exceptions; use Ada.Exceptions;
with Ada.Streams; use Ada.Streams;
with Interfaces.C_Streams; use Interfaces.C_Streams;
......@@ -76,9 +75,9 @@ package body Ada.Wide_Text_IO is
-- done in Get_Immediate mode (i.e. without waiting for a line return).
procedure Set_WCEM (File : in out File_Type);
-- Called by Open and Create to set the wide character encoding method
-- for the file, processing a WCEM form parameter if one is present.
-- File is IN OUT because it may be closed in case of an error.
-- Called by Open and Create to set the wide character encoding method for
-- the file, processing a WCEM form parameter if one is present. File is
-- IN OUT because it may be closed in case of an error.
-------------------
-- AFCB_Allocate --
......@@ -249,7 +248,6 @@ package body Ada.Wide_Text_IO is
return False;
elsif File.Before_LM then
if File.Before_LM_PM then
return Nextc (File) = EOF;
end if;
......@@ -420,6 +418,8 @@ package body Ada.Wide_Text_IO is
File.Before_Wide_Character := False;
Item := File.Saved_Wide_Character;
-- Ada.Text_IO checks Before_LM_PM here, shouldn't we do the same???
else
Get_Character (File, C);
Item := Get_Wide_Char (C, File);
......@@ -555,6 +555,8 @@ package body Ada.Wide_Text_IO is
Item := Wide_Character'Val (LM);
else
-- Shouldn't we use getc_immediate_nowait here, like Text_IO???
ch := Getc_Immed (File);
if ch = EOF then
......@@ -749,7 +751,7 @@ package body Ada.Wide_Text_IO is
end if;
end In_Char;
-- Start of processing for In_Char
-- Start of processing for Get_Wide_Char
begin
return WC_In (C, File.WC_Method);
......@@ -904,7 +906,7 @@ package body Ada.Wide_Text_IO is
End_Of_Line := True;
Item := Wide_Character'Val (0);
-- If we are before a wide character, just return it (this happens
-- If we are before a wide character, just return it (this can happen
-- if there are two calls to Look_Ahead in a row).
elsif File.Before_Wide_Character then
......@@ -924,19 +926,21 @@ package body Ada.Wide_Text_IO is
Ungetc (ch, File);
Item := Wide_Character'Val (0);
-- If the character is in the range 16#0000# to 16#007F# it stands
-- for itself and occupies a single byte, so we can unget it with
-- Case where character obtained does not represent the start of an
-- encoded sequence so it stands for itself and we can unget it with
-- no difficulty.
elsif ch <= 16#0080# then
elsif not Is_Start_Of_Encoding
(Character'Val (ch), File.WC_Method)
then
End_Of_Line := False;
Ungetc (ch, File);
Item := Wide_Character'Val (ch);
-- For a character above this range, we read the character, using
-- the Get_Wide_Char routine. It may well occupy more than one byte
-- so we can't put it back with ungetc. Instead we save it in the
-- control block, setting a flag that everyone interested in reading
-- For the start of an encoding, we read the character using the
-- Get_Wide_Char routine. It will occupy more than one byte so we
-- can't put it back with ungetc. Instead we save it in the control
-- block, setting a flag that everyone interested in reading
-- characters must test before reading the stream.
else
......@@ -1552,7 +1556,7 @@ package body Ada.Wide_Text_IO is
end if;
Close (File);
Raise_Exception (Use_Error'Identity, "invalid WCEM form parameter");
raise Use_Error with "invalid WCEM form parameter";
end if;
end Set_WCEM;
......@@ -1638,7 +1642,6 @@ package body Ada.Wide_Text_IO is
Ungetc (ch, File);
end if;
end if;
end loop;
File.Before_Wide_Character := False;
......
......@@ -6,7 +6,7 @@
-- --
-- S p e c --
-- --
-- Copyright (C) 1992-2006, Free Software Foundation, Inc. --
-- Copyright (C) 1992-2007, Free Software Foundation, Inc. --
-- --
-- This specification is derived from the Ada Reference Manual for use with --
-- GNAT. The copyright notice above, and the license provisions that follow --
......@@ -50,8 +50,6 @@ with System.WCh_Con;
package Ada.Wide_Text_IO is
package WCh_Con renames System.WCh_Con;
type File_Type is limited private;
type File_Mode is (In_File, Out_File, Append_File);
......@@ -303,6 +301,8 @@ package Ada.Wide_Text_IO is
Layout_Error : exception renames IO_Exceptions.Layout_Error;
private
package WCh_Con renames System.WCh_Con;
-----------------------------------
-- Handling of Format Characters --
-----------------------------------
......
......@@ -31,7 +31,6 @@
-- --
------------------------------------------------------------------------------
with Ada.Exceptions; use Ada.Exceptions;
with Ada.Streams; use Ada.Streams;
with Interfaces.C_Streams; use Interfaces.C_Streams;
......@@ -76,9 +75,9 @@ package body Ada.Wide_Wide_Text_IO is
-- are done in Get_Immediate mode (i.e. without waiting for a line return).
procedure Set_WCEM (File : in out File_Type);
-- Called by Open and Create to set the wide character encoding method
-- for the file, processing a WCEM form parameter if one is present.
-- File is IN OUT because it may be closed in case of an error.
-- Called by Open and Create to set the wide character encoding method for
-- the file, processing a WCEM form parameter if one is present. File is
-- IN OUT because it may be closed in case of an error.
-------------------
-- AFCB_Allocate --
......@@ -249,7 +248,6 @@ package body Ada.Wide_Wide_Text_IO is
return False;
elsif File.Before_LM then
if File.Before_LM_PM then
return Nextc (File) = EOF;
end if;
......@@ -420,6 +418,8 @@ package body Ada.Wide_Wide_Text_IO is
File.Before_Wide_Wide_Character := False;
Item := File.Saved_Wide_Wide_Character;
-- Ada.Text_IO checks Before_LM_PM here, shouldn't we do the same???
else
Get_Character (File, C);
Item := Get_Wide_Wide_Char (C, File);
......@@ -555,6 +555,8 @@ package body Ada.Wide_Wide_Text_IO is
Item := Wide_Wide_Character'Val (LM);
else
-- Shouldn't we use getc_immediate_nowait here, like Text_IO???
ch := Getc_Immed (File);
if ch = EOF then
......@@ -904,7 +906,7 @@ package body Ada.Wide_Wide_Text_IO is
End_Of_Line := True;
Item := Wide_Wide_Character'Val (0);
-- If we are before a wide character, just return it (this happens
-- If we are before a wide character, just return it (this can happen
-- if there are two calls to Look_Ahead in a row).
elsif File.Before_Wide_Wide_Character then
......@@ -924,20 +926,22 @@ package body Ada.Wide_Wide_Text_IO is
Ungetc (ch, File);
Item := Wide_Wide_Character'Val (0);
-- If the character is in the range 16#0000# to 16#007F# it stands
-- for itself and occupies a single byte, so we can unget it with
-- Case where character obtained does not represent the start of an
-- encoded sequence so it stands for itself and we can unget it with
-- no difficulty.
elsif ch <= 16#0080# then
elsif not Is_Start_Of_Encoding
(Character'Val (ch), File.WC_Method)
then
End_Of_Line := False;
Ungetc (ch, File);
Item := Wide_Wide_Character'Val (ch);
-- For a character above this range, we read the character, using
-- the Get_Wide_Wide_Char routine. It may well occupy more than one
-- byte so we can't put it back with ungetc. Instead we save it in
-- the control block, setting a flag that everyone interested in
-- reading characters must test before reading the stream.
-- For the start of an encoding, we read the character using the
-- Get_Wide_Wide_Char routine. It will occupy more than one byte so
-- we can't put it back with ungetc. Instead we save it in the
-- control block, setting a flag that everyone interested in reading
-- characters must test before reading the stream.
else
Item := Get_Wide_Wide_Char (Character'Val (ch), File);
......@@ -1552,7 +1556,7 @@ package body Ada.Wide_Wide_Text_IO is
end if;
Close (File);
Raise_Exception (Use_Error'Identity, "invalid WCEM form parameter");
raise Use_Error with "invalid WCEM form parameter";
end if;
end Set_WCEM;
......@@ -1638,7 +1642,6 @@ package body Ada.Wide_Wide_Text_IO is
Ungetc (ch, File);
end if;
end if;
end loop;
File.Before_Wide_Wide_Character := False;
......
......@@ -6,7 +6,7 @@
-- --
-- S p e c --
-- --
-- Copyright (C) 1992-2006, Free Software Foundation, Inc. --
-- Copyright (C) 1992-2007, Free Software Foundation, Inc. --
-- --
-- This specification is derived from the Ada Reference Manual for use with --
-- GNAT. The copyright notice above, and the license provisions that follow --
......@@ -50,8 +50,6 @@ with System.WCh_Con;
package Ada.Wide_Wide_Text_IO is
package WCh_Con renames System.WCh_Con;
type File_Type is limited private;
type File_Mode is (In_File, Out_File, Append_File);
......@@ -303,6 +301,8 @@ package Ada.Wide_Wide_Text_IO is
Layout_Error : exception renames IO_Exceptions.Layout_Error;
private
package WCh_Con renames System.WCh_Con;
-----------------------------------
-- Handling of Format Characters --
-----------------------------------
......
......@@ -824,7 +824,7 @@ package body ALI is
Sfile => No_File,
Task_Dispatching_Policy => ' ',
Time_Slice_Value => -1,
WC_Encoding => '8',
WC_Encoding => 'b',
Unit_Exception_Table => False,
Ver => (others => ' '),
Ver_Len => 0,
......@@ -930,13 +930,23 @@ package body ALI is
else
Checkc (' ');
Name_Len := 0;
-- Scan out argument
Name_Len := 0;
while not At_Eol loop
Name_Len := Name_Len + 1;
Name_Buffer (Name_Len) := Getc;
end loop;
-- If -fstack-check, record that it occurred
if Name_Buffer (1 .. Name_Len) = "-fstack-check" then
Stack_Check_Switch_Set := True;
end if;
-- Store the argument
Args.Increment_Last;
Args.Table (Args.Last) := new String'(Name_Buffer (1 .. Name_Len));
......
......@@ -26,6 +26,8 @@
with Osint; use Osint;
with Output; use Output;
with System.WCh_Con; use System.WCh_Con;
package body Bindusg is
Already_Displayed : Boolean := False;
......@@ -222,11 +224,27 @@ package body Bindusg is
Write_Line (" -v Verbose mode. Error messages, " &
"header, summary output to stdout");
-- Lines for -w switch
-- Line for -w switch
Write_Line (" -wx Warning mode. (x=s/e for " &
"suppress/treat as error)");
-- Line for -W switch
Write_Str (" -W? Wide character encoding method (");
for J in WC_Encoding_Method loop
Write_Char (WC_Encoding_Letters (J));
if J = WC_Encoding_Method'Last then
Write_Char (')');
else
Write_Char ('/');
end if;
end loop;
Write_Eol;
-- Line for -x switch
Write_Line (" -x Exclude source files (check object " &
......
......@@ -71,4 +71,18 @@ package body System.WCh_Con is
end if;
end Get_WC_Encoding_Method;
--------------------------
-- Is_Start_Of_Encoding --
--------------------------
function Is_Start_Of_Encoding
(C : Character;
EM : WC_Encoding_Method) return Boolean
is
begin
return (EM in WC_Upper_Half_Encoding_Method
and then Character'Pos (C) >= 16#80#)
or else (EM in WC_ESC_Encoding_Method and then C = ASCII.ESC);
end Is_Start_Of_Encoding;
end System.WCh_Con;
......@@ -106,8 +106,8 @@ package System.WCh_Con is
-- sequence ESC a b c d (five characters, where abcd are ASCII hex
-- characters, using upper case for letters). This method is easy
-- to deal with in external environments that do not support wide
-- characters, and covers the whole BMP. This is the default encoding
-- method.
-- characters, and covers the whole 16-bit BMP. Codes larger than
-- 16#FFFF# are not representable using this encoding method.
WCEM_Upper : constant WC_Encoding_Method := 2;
-- The wide character with encoding 16#abcd#, where the upper bit is on
......@@ -115,7 +115,8 @@ package System.WCh_Con is
-- 16#cd#. The second byte may never be a format control character, but
-- is not required to be in the upper half. This method can be also used
-- for shift-JIS or EUC where the internal coding matches the external
-- coding.
-- coding. Codes larger than 16#FFFF# are not representable using this
-- encoding method.
WCEM_Shift_JIS : constant WC_Encoding_Method := 3;
-- A wide character is represented by a two character sequence 16#ab#
......@@ -123,19 +124,21 @@ package System.WCh_Con is
-- as described above. The internal character code is the corresponding
-- JIS character according to the standard algorithm for Shift-JIS
-- conversion. See the body of package System.JIS_Conversions for
-- further details.
-- further details. Codes larger than 16#FFFF are not representable
-- using this encoding method.
WCEM_EUC : constant WC_Encoding_Method := 4;
-- A wide character is represented by a two character sequence 16#ab# and
-- 16#cd#, with both characters being in the upper half set. The internal
-- character code is the corresponding JIS character according to the EUC
-- encoding algorithm. See the body of package System.JIS_Conversions for
-- further details.
-- further details. Codes larger than 16#FFFF# are not representable using
-- this encoding method.
WCEM_UTF8 : constant WC_Encoding_Method := 5;
-- An ISO 10646-1 BMP/Unicode wide character is represented in
-- UCS Transformation Format 8 (UTF-8) as defined in Annex R of ISO
-- 10646-1/Am.2. Depending on the character value, a Unicode character
-- An ISO 10646-1 BMP/Unicode wide character is represented in UCS
-- Transformation Format 8 (UTF-8), as defined in Annex R of ISO
-- 10646-1/Am.2. Depending on the character value, a Unicode character
-- is represented as the one to six byte sequence.
--
-- 16#0000_0000#-16#0000_007f#: 2#0xxxxxxx#
......@@ -151,7 +154,8 @@ package System.WCh_Con is
-- where the xxx bits correspond to the left-padded bits of the
-- 16-bit character value. Note that all lower half ASCII characters
-- are represented as ASCII bytes and all upper half characters and
-- other wide characters are represented as sequences of upper-half.
-- other wide characters are represented as sequences of upper-half. This
-- encoding method can represent the entire range of Wide_Wide_Character.
WCEM_Brackets : constant WC_Encoding_Method := 6;
-- A wide character is represented using one of the following sequences:
......@@ -161,7 +165,10 @@ package System.WCh_Con is
-- ["xxxxxx"]
-- ["xxxxxxxx"]
--
-- where xx are hexadecimal digits representing the character code.
-- where xx are hexadecimal digits representing the character code. This
-- encoding method can represent the entire range of Wide_Wide_Character
-- but in the general case results in ambiguous representations (there is
-- no ambiguity in Ada sources, since the above sequences are illegal Ada).
WC_Encoding_Letters : constant array (WC_Encoding_Method) of Character :=
(WCEM_Hex => 'h',
......@@ -183,10 +190,20 @@ package System.WCh_Con is
-- Encoding methods using an upper half character (16#80#..16#FF) at
-- the start of the sequence.
WC_Longest_Sequence : constant := 10;
WC_Longest_Sequence : constant := 12;
-- The longest number of characters that can be used for a wide character
-- or wide wide character sequence for any of the active encoding methods.
WC_Longest_Sequences : constant array (WC_Encoding_Method) of Natural :=
(WCEM_Hex => 5,
WCEM_Upper => 2,
WCEM_Shift_JIS => 2,
WCEM_EUC => 2,
WCEM_UTF8 => 6,
WCEM_Brackets => 12);
-- The longest number of characters that can be used for a wide character
-- or wide wide character sequence using the given encoding method.
function Get_WC_Encoding_Method (C : Character) return WC_Encoding_Method;
-- Given a character C, returns corresponding encoding method (see array
-- WC_Encoding_Letters above). Raises Constraint_Error if not in list.
......@@ -196,4 +213,12 @@ package System.WCh_Con is
-- utf8, brackets, return the corresponding encoding method. Raises
-- Constraint_Error if not in list.
function Is_Start_Of_Encoding
(C : Character;
EM : WC_Encoding_Method) return Boolean;
pragma Inline (Is_Start_Of_Encoding);
-- Returns True if the Character C is the start of a multi-character
-- encoding sequence for the given encoding method EM. If EM is set to
-- WCEM_Brackets, this function always returns False.
end System.WCh_Con;
......@@ -417,21 +417,21 @@ package body Switch.B is
-- Processing for W switch
when 'W' =>
if Ptr = Max then
Bad_Switch (Switch_Chars);
end if;
Ptr := Ptr + 1;
for J in WC_Encoding_Method loop
if Switch_Chars (Ptr) = WC_Encoding_Letters (J) then
Wide_Character_Encoding_Method := J;
exit;
if Ptr > Max then
Bad_Switch (Switch_Chars);
end if;
elsif J = WC_Encoding_Method'Last then
begin
Wide_Character_Encoding_Method :=
Get_WC_Encoding_Method (Switch_Chars (Ptr));
exception
when Constraint_Error =>
Bad_Switch (Switch_Chars);
end if;
end loop;
end;
Wide_Character_Encoding_Method_Specified := True;
Upper_Half_Encoding :=
Wide_Character_Encoding_Method in
......
......@@ -479,6 +479,7 @@ package body Switch.C is
Constant_Condition_Warnings := True;
Implementation_Unit_Warnings := True;
Ineffective_Inline_Warnings := True;
Warn_On_Assertion_Failure := True;
Warn_On_Assumed_Low_Bound := True;
Warn_On_Bad_Fixed_Value := True;
Warn_On_Constant := True;
......@@ -833,9 +834,11 @@ package body Switch.C is
Bad_Switch ("-gnatW" & Switch_Chars (Ptr .. Max));
end;
Wide_Character_Encoding_Method_Specified := True;
Upper_Half_Encoding :=
Wide_Character_Encoding_Method in
WC_Upper_Half_Encoding_Method;
WC_Upper_Half_Encoding_Method;
Ptr := Ptr + 1;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment