Makefile.in (LIBCPP_OBJS): Add cppcharset.o.

* Makefile.in (LIBCPP_OBJS): Add cppcharset.o. (cppcharset.o): New target. * c-lex.c (is_extended_char): Move to cppcharset.c. (utf8_extend_token): Delete. * cppcharset.c: New file. * cpphash.h (_cpp_valid_ucn): New. * cpplex.c (lex_identifier): Update prototype. (continues_identifier_p): Rename forms_identifier_p. Handle UCN escapes. (maybe_read_ucs): Rename maybe_read_ucn. Update to use code in cppcharset.c. (lex_number, lex_identifier, cpp_parse_escape): Update. (_cpp_lex_direct): Update to handle UCNs. (cpp_avoid_paste): Don't paste to form a UCN. testsuite: * ucs.c: Update diagnostic messages. From-SVN: r65845

Makefile.in (LIBCPP_OBJS): Add cppcharset.o.
* Makefile.in (LIBCPP_OBJS): Add cppcharset.o. (cppcharset.o): New target. * c-lex.c (is_extended_char): Move to cppcharset.c. (utf8_extend_token): Delete. * cppcharset.c: New file. * cpphash.h (_cpp_valid_ucn): New. * cpplex.c (lex_identifier): Update prototype. (continues_identifier_p): Rename forms_identifier_p. Handle UCN escapes. (maybe_read_ucs): Rename maybe_read_ucn. Update to use code in cppcharset.c. (lex_number, lex_identifier, cpp_parse_escape): Update. (_cpp_lex_direct): Update to handle UCNs. (cpp_avoid_paste): Don't paste to form a UCN. testsuite: * ucs.c: Update diagnostic messages. From-SVN: r65845
1613e52b · Neil Booth · Neil Booth · 0a45ec5c · 1613e52b · 1613e52b
Commit 1613e52b authored Apr 20, 2003 by Neil Booth Committed by Neil Booth Apr 20, 2003
Showing with 101 additions and 413 deletions

gcc/ChangeLog
+17 -0

gcc/Makefile.in
+2 -1

gcc/c-lex.c
+0 -309

gcc/cppcharset.c
+0 -0

gcc/cpphash.h
+4 -0

gcc/cpplex.c
+70 -99

gcc/testsuite/ChangeLog
+4 -0

gcc/testsuite/gcc.dg/cpp/ucs.c
+4 -4

No files found.
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2003-04-20  Neil Booth  <neil@daikokuya.co.uk>
+
+	* Makefile.in (LIBCPP_OBJS): Add cppcharset.o.
+	(cppcharset.o): New target.
+	* c-lex.c (is_extended_char): Move to cppcharset.c.
+	(utf8_extend_token): Delete.
+	* cppcharset.c: New file.
+	* cpphash.h (_cpp_valid_ucn): New.
+	* cpplex.c (lex_identifier): Update prototype.
+	(continues_identifier_p): Rename forms_identifier_p.  Handle UCN
+	escapes.
+	(maybe_read_ucs): Rename maybe_read_ucn.  Update to use code
+	in cppcharset.c.
+	(lex_number, lex_identifier, cpp_parse_escape): Update.
+	(_cpp_lex_direct): Update to handle UCNs.
+	(cpp_avoid_paste): Don't paste to form a UCN.
+
 2003-04-19  Roger Sayle  <roger@eyesopen.com>

 	* builtins.c (expand_builtin):  Don't expand a pure or const

--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -2320,7 +2320,7 @@ PREPROCESSOR_DEFINES = \
  @TARGET_SYSTEM_ROOT_DEFINE@

 LIBCPP_OBJS =	cpplib.o cpplex.o cppmacro.o cppexp.o cppfiles.o cpptrad.o \
-		cpphash.o cpperror.o cppinit.o \
+		cpphash.o cpperror.o cppinit.o cppcharset.o \
 		hashtable.o line-map.o mkdeps.o mbchar.o cpppch.o

 LIBCPP_DEPS =	$(CPPLIB_H) cpphash.h line-map.h hashtable.h intl.h \
@@ -2333,6 +2333,7 @@ libcpp.a: $(LIBCPP_OBJS)
 	$(AR) $(AR_FLAGS) libcpp.a $(LIBCPP_OBJS)
 	-$(RANLIB) libcpp.a

+cppcharset.o: cppcharset.c $(LIBCPP_DEPS)
 cpperror.o: cpperror.c $(LIBCPP_DEPS)
 cppexp.o:   cppexp.c   $(LIBCPP_DEPS)
 cpplex.o:   cpplex.c   $(LIBCPP_DEPS) mbchar.h

--- a/gcc/c-lex.c
+++ b/gcc/c-lex.c
@@ -323,315 +323,6 @@ cb_undef (pfile, line, node)
 			 (const char *) NODE_NAME (node));
 }

-#if 0 /* not yet */
-/* Returns nonzero if C is a universal-character-name.  Give an error if it
-   is not one which may appear in an identifier, as per [extendid].
-
-   Note that extended character support in identifiers has not yet been
-   implemented.  It is my personal opinion that this is not a desirable
-   feature.  Portable code cannot count on support for more than the basic
-   identifier character set.  */
-
-static inline int
-is_extended_char (c)
-     int c;
-{
-#ifdef TARGET_EBCDIC
-  return 0;
-#else
-  /* ASCII.  */
-  if (c < 0x7f)
-    return 0;
-
-  /* None of the valid chars are outside the Basic Multilingual Plane (the
-     low 16 bits).  */
-  if (c > 0xffff)
-    {
-      error ("universal-character-name '\\U%08x' not valid in identifier", c);
-      return 1;
-    }
-  
-  /* Latin */
-  if ((c >= 0x00c0 && c <= 0x00d6)
-      || (c >= 0x00d8 && c <= 0x00f6)
-      || (c >= 0x00f8 && c <= 0x01f5)
-      || (c >= 0x01fa && c <= 0x0217)
-      || (c >= 0x0250 && c <= 0x02a8)
-      || (c >= 0x1e00 && c <= 0x1e9a)
-      || (c >= 0x1ea0 && c <= 0x1ef9))
-    return 1;
-
-  /* Greek */
-  if ((c == 0x0384)
-      || (c >= 0x0388 && c <= 0x038a)
-      || (c == 0x038c)
-      || (c >= 0x038e && c <= 0x03a1)
-      || (c >= 0x03a3 && c <= 0x03ce)
-      || (c >= 0x03d0 && c <= 0x03d6)
-      || (c == 0x03da)
-      || (c == 0x03dc)
-      || (c == 0x03de)
-      || (c == 0x03e0)
-      || (c >= 0x03e2 && c <= 0x03f3)
-      || (c >= 0x1f00 && c <= 0x1f15)
-      || (c >= 0x1f18 && c <= 0x1f1d)
-      || (c >= 0x1f20 && c <= 0x1f45)
-      || (c >= 0x1f48 && c <= 0x1f4d)
-      || (c >= 0x1f50 && c <= 0x1f57)
-      || (c == 0x1f59)
-      || (c == 0x1f5b)
-      || (c == 0x1f5d)
-      || (c >= 0x1f5f && c <= 0x1f7d)
-      || (c >= 0x1f80 && c <= 0x1fb4)
-      || (c >= 0x1fb6 && c <= 0x1fbc)
-      || (c >= 0x1fc2 && c <= 0x1fc4)
-      || (c >= 0x1fc6 && c <= 0x1fcc)
-      || (c >= 0x1fd0 && c <= 0x1fd3)
-      || (c >= 0x1fd6 && c <= 0x1fdb)
-      || (c >= 0x1fe0 && c <= 0x1fec)
-      || (c >= 0x1ff2 && c <= 0x1ff4)
-      || (c >= 0x1ff6 && c <= 0x1ffc))
-    return 1;
-
-  /* Cyrillic */
-  if ((c >= 0x0401 && c <= 0x040d)
-      || (c >= 0x040f && c <= 0x044f)
-      || (c >= 0x0451 && c <= 0x045c)
-      || (c >= 0x045e && c <= 0x0481)
-      || (c >= 0x0490 && c <= 0x04c4)
-      || (c >= 0x04c7 && c <= 0x04c8)
-      || (c >= 0x04cb && c <= 0x04cc)
-      || (c >= 0x04d0 && c <= 0x04eb)
-      || (c >= 0x04ee && c <= 0x04f5)
-      || (c >= 0x04f8 && c <= 0x04f9))
-    return 1;
-
-  /* Armenian */
-  if ((c >= 0x0531 && c <= 0x0556)
-      || (c >= 0x0561 && c <= 0x0587))
-    return 1;
-
-  /* Hebrew */
-  if ((c >= 0x05d0 && c <= 0x05ea)
-      || (c >= 0x05f0 && c <= 0x05f4))
-    return 1;
-
-  /* Arabic */
-  if ((c >= 0x0621 && c <= 0x063a)
-      || (c >= 0x0640 && c <= 0x0652)
-      || (c >= 0x0670 && c <= 0x06b7)
-      || (c >= 0x06ba && c <= 0x06be)
-      || (c >= 0x06c0 && c <= 0x06ce)
-      || (c >= 0x06e5 && c <= 0x06e7))
-    return 1;
-
-  /* Devanagari */
-  if ((c >= 0x0905 && c <= 0x0939)
-      || (c >= 0x0958 && c <= 0x0962))
-    return 1;
-
-  /* Bengali */
-  if ((c >= 0x0985 && c <= 0x098c)
-      || (c >= 0x098f && c <= 0x0990)
-      || (c >= 0x0993 && c <= 0x09a8)
-      || (c >= 0x09aa && c <= 0x09b0)
-      || (c == 0x09b2)
-      || (c >= 0x09b6 && c <= 0x09b9)
-      || (c >= 0x09dc && c <= 0x09dd)
-      || (c >= 0x09df && c <= 0x09e1)
-      || (c >= 0x09f0 && c <= 0x09f1))
-    return 1;
-
-  /* Gurmukhi */
-  if ((c >= 0x0a05 && c <= 0x0a0a)
-      || (c >= 0x0a0f && c <= 0x0a10)
-      || (c >= 0x0a13 && c <= 0x0a28)
-      || (c >= 0x0a2a && c <= 0x0a30)
-      || (c >= 0x0a32 && c <= 0x0a33)
-      || (c >= 0x0a35 && c <= 0x0a36)
-      || (c >= 0x0a38 && c <= 0x0a39)
-      || (c >= 0x0a59 && c <= 0x0a5c)
-      || (c == 0x0a5e))
-    return 1;
-
-  /* Gujarati */
-  if ((c >= 0x0a85 && c <= 0x0a8b)
-      || (c == 0x0a8d)
-      || (c >= 0x0a8f && c <= 0x0a91)
-      || (c >= 0x0a93 && c <= 0x0aa8)
-      || (c >= 0x0aaa && c <= 0x0ab0)
-      || (c >= 0x0ab2 && c <= 0x0ab3)
-      || (c >= 0x0ab5 && c <= 0x0ab9)
-      || (c == 0x0ae0))
-    return 1;
-
-  /* Oriya */
-  if ((c >= 0x0b05 && c <= 0x0b0c)
-      || (c >= 0x0b0f && c <= 0x0b10)
-      || (c >= 0x0b13 && c <= 0x0b28)
-      || (c >= 0x0b2a && c <= 0x0b30)
-      || (c >= 0x0b32 && c <= 0x0b33)
-      || (c >= 0x0b36 && c <= 0x0b39)
-      || (c >= 0x0b5c && c <= 0x0b5d)
-      || (c >= 0x0b5f && c <= 0x0b61))
-    return 1;
-
-  /* Tamil */
-  if ((c >= 0x0b85 && c <= 0x0b8a)
-      || (c >= 0x0b8e && c <= 0x0b90)
-      || (c >= 0x0b92 && c <= 0x0b95)
-      || (c >= 0x0b99 && c <= 0x0b9a)
-      || (c == 0x0b9c)
-      || (c >= 0x0b9e && c <= 0x0b9f)
-      || (c >= 0x0ba3 && c <= 0x0ba4)
-      || (c >= 0x0ba8 && c <= 0x0baa)
-      || (c >= 0x0bae && c <= 0x0bb5)
-      || (c >= 0x0bb7 && c <= 0x0bb9))
-    return 1;
-
-  /* Telugu */
-  if ((c >= 0x0c05 && c <= 0x0c0c)
-      || (c >= 0x0c0e && c <= 0x0c10)
-      || (c >= 0x0c12 && c <= 0x0c28)
-      || (c >= 0x0c2a && c <= 0x0c33)
-      || (c >= 0x0c35 && c <= 0x0c39)
-      || (c >= 0x0c60 && c <= 0x0c61))
-    return 1;
-
-  /* Kannada */
-  if ((c >= 0x0c85 && c <= 0x0c8c)
-      || (c >= 0x0c8e && c <= 0x0c90)
-      || (c >= 0x0c92 && c <= 0x0ca8)
-      || (c >= 0x0caa && c <= 0x0cb3)
-      || (c >= 0x0cb5 && c <= 0x0cb9)
-      || (c >= 0x0ce0 && c <= 0x0ce1))
-    return 1;
-
-  /* Malayalam */
-  if ((c >= 0x0d05 && c <= 0x0d0c)
-      || (c >= 0x0d0e && c <= 0x0d10)
-      || (c >= 0x0d12 && c <= 0x0d28)
-      || (c >= 0x0d2a && c <= 0x0d39)
-      || (c >= 0x0d60 && c <= 0x0d61))
-    return 1;
-
-  /* Thai */
-  if ((c >= 0x0e01 && c <= 0x0e30)
-      || (c >= 0x0e32 && c <= 0x0e33)
-      || (c >= 0x0e40 && c <= 0x0e46)
-      || (c >= 0x0e4f && c <= 0x0e5b))
-    return 1;
-
-  /* Lao */
-  if ((c >= 0x0e81 && c <= 0x0e82)
-      || (c == 0x0e84)
-      || (c == 0x0e87)
-      || (c == 0x0e88)
-      || (c == 0x0e8a)
-      || (c == 0x0e0d)
-      || (c >= 0x0e94 && c <= 0x0e97)
-      || (c >= 0x0e99 && c <= 0x0e9f)
-      || (c >= 0x0ea1 && c <= 0x0ea3)
-      || (c == 0x0ea5)
-      || (c == 0x0ea7)
-      || (c == 0x0eaa)
-      || (c == 0x0eab)
-      || (c >= 0x0ead && c <= 0x0eb0)
-      || (c == 0x0eb2)
-      || (c == 0x0eb3)
-      || (c == 0x0ebd)
-      || (c >= 0x0ec0 && c <= 0x0ec4)
-      || (c == 0x0ec6))
-    return 1;
-
-  /* Georgian */
-  if ((c >= 0x10a0 && c <= 0x10c5)
-      || (c >= 0x10d0 && c <= 0x10f6))
-    return 1;
-
-  /* Hiragana */
-  if ((c >= 0x3041 && c <= 0x3094)
-      || (c >= 0x309b && c <= 0x309e))
-    return 1;
-
-  /* Katakana */
-  if ((c >= 0x30a1 && c <= 0x30fe))
-    return 1;
-
-  /* Bopomofo */
-  if ((c >= 0x3105 && c <= 0x312c))
-    return 1;
-
-  /* Hangul */
-  if ((c >= 0x1100 && c <= 0x1159)
-      || (c >= 0x1161 && c <= 0x11a2)
-      || (c >= 0x11a8 && c <= 0x11f9))
-    return 1;
-
-  /* CJK Unified Ideographs */
-  if ((c >= 0xf900 && c <= 0xfa2d)
-      || (c >= 0xfb1f && c <= 0xfb36)
-      || (c >= 0xfb38 && c <= 0xfb3c)
-      || (c == 0xfb3e)
-      || (c >= 0xfb40 && c <= 0xfb41)
-      || (c >= 0xfb42 && c <= 0xfb44)
-      || (c >= 0xfb46 && c <= 0xfbb1)
-      || (c >= 0xfbd3 && c <= 0xfd3f)
-      || (c >= 0xfd50 && c <= 0xfd8f)
-      || (c >= 0xfd92 && c <= 0xfdc7)
-      || (c >= 0xfdf0 && c <= 0xfdfb)
-      || (c >= 0xfe70 && c <= 0xfe72)
-      || (c == 0xfe74)
-      || (c >= 0xfe76 && c <= 0xfefc)
-      || (c >= 0xff21 && c <= 0xff3a)
-      || (c >= 0xff41 && c <= 0xff5a)
-      || (c >= 0xff66 && c <= 0xffbe)
-      || (c >= 0xffc2 && c <= 0xffc7)
-      || (c >= 0xffca && c <= 0xffcf)
-      || (c >= 0xffd2 && c <= 0xffd7)
-      || (c >= 0xffda && c <= 0xffdc)
-      || (c >= 0x4e00 && c <= 0x9fa5))
-    return 1;
-
-  error ("universal-character-name '\\u%04x' not valid in identifier", c);
-  return 1;
-#endif
-}
-
-/* Add the UTF-8 representation of C to the token_buffer.  */
-
-static void
-utf8_extend_token (c)
-     int c;
-{
-  int shift, mask;
-
-  if      (c <= 0x0000007f)
-    {
-      extend_token (c);
-      return;
-    }
-  else if (c <= 0x000007ff)
-    shift = 6, mask = 0xc0;
-  else if (c <= 0x0000ffff)
-    shift = 12, mask = 0xe0;
-  else if (c <= 0x001fffff)
-    shift = 18, mask = 0xf0;
-  else if (c <= 0x03ffffff)
-    shift = 24, mask = 0xf8;
-  else
-    shift = 30, mask = 0xfc;
-
-  extend_token (mask | (c >> shift));
-  do
-    {
-      shift -= 6;
-      extend_token ((unsigned char) (0x80 | (c >> shift)));
-    }
-  while (shift);
-}
-#endif
-
 int
 c_lex (value)
     tree *value;

--- a/gcc/cppcharset.c
+++ b/gcc/cppcharset.c
--- a/gcc/cpphash.h
+++ b/gcc/cpphash.h
@@ -555,6 +555,10 @@ extern bool _cpp_expansions_different_trad PARAMS ((const cpp_macro *,
 extern uchar *_cpp_copy_replacement_text PARAMS ((const cpp_macro *, uchar *));
 extern size_t _cpp_replacement_text_len PARAMS ((const cpp_macro *));

+/* In cppcharset.c.  */
+cppchar_t _cpp_valid_ucn PARAMS ((cpp_reader *, const uchar **,
+				  int identifer_p));
+
 /* Utility routines and macros.  */
 #define DSC(str) (const uchar *)str, sizeof str - 1
 #define xnew(T)		(T *) xmalloc (sizeof(T))

--- a/gcc/cpplex.c
+++ b/gcc/cpplex.c
@@ -59,15 +59,14 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
 static void add_line_note PARAMS ((cpp_buffer *, const uchar *, unsigned int));
 static int skip_line_comment PARAMS ((cpp_reader *));
 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
-static cpp_hashnode *lex_identifier PARAMS ((cpp_reader *));
+static cpp_hashnode *lex_identifier PARAMS ((cpp_reader *, const uchar *));
 static void lex_number PARAMS ((cpp_reader *, cpp_string *));
-static bool continues_identifier_p PARAMS ((cpp_reader *));
+static bool forms_identifier_p PARAMS ((cpp_reader *, int));
 static void lex_string PARAMS ((cpp_reader *, cpp_token *));
 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
 				  cppchar_t));
 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
-static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
-				   const unsigned char *, cppchar_t *));
+static cppchar_t maybe_read_ucn PARAMS ((cpp_reader *, const uchar **));
 static tokenrun *next_tokenrun PARAMS ((tokenrun *));

 static unsigned int hex_digit_value PARAMS ((unsigned int));
@@ -361,33 +360,53 @@ name_p (pfile, string)
 }

 /* Returns TRUE if the sequence starting at buffer->cur is valid in
-   an identifier.  */
+   an identifier.  FIRST is TRUE if this starts an identifier.  */
 static bool
-continues_identifier_p (pfile)
+forms_identifier_p (pfile, first)
     cpp_reader *pfile;
+     int first;
 {
-  if (*pfile->buffer->cur != '$' || !CPP_OPTION (pfile, dollars_in_ident))
+  cpp_buffer *buffer = pfile->buffer;
+
+  if (*buffer->cur == '$')
+    {
+      if (!CPP_OPTION (pfile, dollars_in_ident))
 	return false;

-  if (CPP_PEDANTIC (pfile) && !pfile->state.skipping && !pfile->warned_dollar)
+      buffer->cur++;
+      if (CPP_PEDANTIC (pfile)
+	  && !pfile->state.skipping
+	  && !pfile->warned_dollar)
 	{
 	  pfile->warned_dollar = true;
 	  cpp_error (pfile, DL_PEDWARN, "'$' in identifier or number");
 	}
-  pfile->buffer->cur++;

      return true;
+    }
+
+  /* Is this a syntactically valid UCN?  */
+  if (0 && *buffer->cur == '\\'
+      && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+    {
+      buffer->cur += 2;
+      if (_cpp_valid_ucn (pfile, &buffer->cur, 1 + !first))
+	return true;
+      buffer->cur -= 2;
+    }
+
+  return false;
 }

 /* Lex an identifier starting at BASE.  */
 static cpp_hashnode *
-lex_identifier (pfile)
+lex_identifier (pfile, base)
     cpp_reader *pfile;
+     const uchar *base;
 {
  cpp_hashnode *result;
-  const uchar *cur, *base;
+  const uchar *cur;

-  base = pfile->buffer->cur - 1;
  do
    {
      cur = pfile->buffer->cur;
@@ -398,7 +417,7 @@ lex_identifier (pfile)

      pfile->buffer->cur = cur;
    }
-  while (continues_identifier_p (pfile));
+  while (forms_identifier_p (pfile, false));

  result = (cpp_hashnode *)
    ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
@@ -444,7 +463,7 @@ lex_number (pfile, number)

      pfile->buffer->cur = cur;
    }
-  while (continues_identifier_p (pfile));
+  while (forms_identifier_p (pfile, false));

  number->len = cur - base;
  dest = _cpp_unaligned_alloc (pfile, number->len + 1);
@@ -803,7 +822,6 @@ _cpp_lex_direct (pfile)
 	}
      /* Fall through.  */

-    start_ident:
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
@@ -816,7 +834,7 @@ _cpp_lex_direct (pfile)
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
-      result->val.node = lex_identifier (pfile);
+      result->val.node = lex_identifier (pfile, buffer->cur - 1);

      /* Convert named operators to their proper types.  */
      if (result->val.node->flags & NODE_OPERATOR)
@@ -1044,15 +1062,24 @@ _cpp_lex_direct (pfile)
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
-      if (CPP_OPTION (pfile, dollars_in_ident))
-	goto start_ident;
-      /* Fall through...  */
+    case '\\':
+      {
+	const uchar *base = --buffer->cur;
+
+	if (forms_identifier_p (pfile, true))
+	  {
+	    result->type = CPP_NAME;
+	    result->val.node = lex_identifier (pfile, base);
+	    break;
+	  }
+	buffer->cur++;

      default:
 	result->type = CPP_OTHER;
 	result->val.c = c;
 	break;
      }
+    }

  return result;
 }
@@ -1321,9 +1348,11 @@ cpp_avoid_paste (pfile, token1, token2)
 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
    case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
 				|| c == '.' || c == '+' || c == '-');
-    case CPP_OTHER:	return (CPP_OPTION (pfile, objc)
+				      /* UCNs */
+    case CPP_OTHER:	return ((token1->val.c == '\\' && b == CPP_NAME)
+				|| (CPP_OPTION (pfile, objc)
 				    && token1->val.c == '@'
-				&& (b == CPP_NAME || b == CPP_STRING));
+				    && (b == CPP_NAME || b == CPP_STRING)));
    default:		break;
    }

@@ -1363,93 +1392,31 @@ hex_digit_value (c)
    abort ();
 }

-/* Parse a '\uNNNN' or '\UNNNNNNNN' sequence.  Returns 1 to indicate
-   failure if cpplib is not parsing C++ or C99.  Such failure is
-   silent, and no variables are updated.  Otherwise returns 0, and
-   warns if -Wtraditional.
-
-   [lex.charset]: The character designated by the universal character
-   name \UNNNNNNNN is that character whose character short name in
-   ISO/IEC 10646 is NNNNNNNN; the character designated by the
-   universal character name \uNNNN is that character whose character
-   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
-   for a universal character name is less than 0x20 or in the range
-   0x7F-0x9F (inclusive), or if the universal character name
-   designates a character in the basic source character set, then the
-   program is ill-formed.
-
-   We assume that wchar_t is Unicode, so we don't need to do any
-   mapping.  Is this ever wrong?
-
-   PC points to the 'u' or 'U', PSTR points to the byte after PC,
-   LIMIT is the end of the string or charconst.  PSTR is updated to
-   point after the UCS on return, and the UCS is written into PC.  */
-
-static int
-maybe_read_ucs (pfile, pstr, limit, pc)
+/* Read a possible universal character name starting at *PSTR.  */
+static cppchar_t
+maybe_read_ucn (pfile, pstr)
     cpp_reader *pfile;
-     const unsigned char **pstr;
-     const unsigned char *limit;
-     cppchar_t *pc;
+     const uchar **pstr;
 {
-  const unsigned char *p = *pstr;
-  unsigned int code = 0;
-  unsigned int c = *pc, length;
-
-  /* Only attempt to interpret a UCS for C++ and C99.  */
-  if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
-    return 1;
+  cppchar_t result, c = (*pstr)[-1];

+  result = _cpp_valid_ucn (pfile, pstr, false);
+  if (result)
+    {
      if (CPP_WTRADITIONAL (pfile))
 	cpp_error (pfile, DL_WARNING,
-	       "the meaning of '\\%c' is different in traditional C", c);
+		   "the meaning of '\\%c' is different in traditional C",
+		   (int) c);

-  length = (c == 'u' ? 4: 8);
-
-  if ((size_t) (limit - p) < length)
-    {
-      cpp_error (pfile, DL_ERROR, "incomplete universal-character-name");
-      /* Skip to the end to avoid more diagnostics.  */
-      p = limit;
-    }
-  else
-    {
-      for (; length; length--, p++)
-	{
-	  c = *p;
-	  if (ISXDIGIT (c))
-	    code = (code << 4) + hex_digit_value (c);
-	  else
+      if (CPP_OPTION (pfile, EBCDIC))
 	{
 	  cpp_error (pfile, DL_ERROR,
-			 "non-hex digit '%c' in universal-character-name", c);
-	      /* We shouldn't skip in case there are multibyte chars.  */
-	      break;
-	    }
+		     "universal character with an EBCDIC target");
+	  result = 0x3f;  /* EBCDIC invalid character */
 	}
    }

-  if (CPP_OPTION (pfile, EBCDIC))
-    {
-      cpp_error (pfile, DL_ERROR, "universal-character-name on EBCDIC target");
-      code = 0x3f;  /* EBCDIC invalid character */
-    }
-  /* True extended characters are OK.  */
-  else if (code >= 0xa0
-	   && !(code & 0x80000000)
-	   && !(code >= 0xD800 && code <= 0xDFFF))
-    ;
-  /* The standard permits $, @ and ` to be specified as UCNs.  We use
-     hex escapes so that this also works with EBCDIC hosts.  */
-  else if (code == 0x24 || code == 0x40 || code == 0x60)
-    ;
-  /* Don't give another error if one occurred above.  */
-  else if (length == 0)
-    cpp_error (pfile, DL_ERROR, "universal-character-name out of range");
-
-  *pstr = p;
-  *pc = code;
-  return 0;
+  return result;
 }

 /* Returns the value of an escape sequence, truncated to the correct
@@ -1470,7 +1437,7 @@ cpp_parse_escape (pfile, pstr, limit, wide)

  int unknown = 0;
  const unsigned char *str = *pstr, *charconsts;
-  cppchar_t c, mask;
+  cppchar_t c, ucn, mask;
  unsigned int width;

  if (CPP_OPTION (pfile, EBCDIC))
@@ -1519,7 +1486,11 @@ cpp_parse_escape (pfile, pstr, limit, wide)
      break;

    case 'u': case 'U':
-      unknown = maybe_read_ucs (pfile, &str, limit, &c);
+      ucn = maybe_read_ucn (pfile, &str);
+      if (ucn)
+	c = ucn;
+      else
+	unknown = true;
      break;

    case 'x':

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
+2003-04-20  Neil Booth  <neil@daikokuya.co.uk>
+
+	* gcc.dg/cpp/ucs.c: Update diagnostic messages.
+
 2003-04-19  Neil Booth  <neil@daikokuya.co.uk>

 	* gcc.dg/cpp/truefalse.cpp: New test.

--- a/gcc/testsuite/gcc.dg/cpp/ucs.c
+++ b/gcc/testsuite/gcc.dg/cpp/ucs.c
@@ -51,7 +51,7 @@ void foo ()
  c = L'\ubad';		/* { dg-error "incomplete" "incompete UCN 1" } */
  c = L"\U1234"[0];	/* { dg-error "incomplete" "incompete UCN 2" } */

-  c = L'\u000x';	/* { dg-error "non-hex" "non-hex digit in UCN" } */
+  c = L'\u000x';	/* { dg-error "incomplete" "non-hex digit in UCN" } */
  /* If sizeof(HOST_WIDE_INT) > sizeof(wchar_t), we can get a multi-character
     constant warning even for wide characters.  */
  /* { dg-warning "too long|multi-character" "" { target *-*-* } 54 } */
@@ -61,7 +61,7 @@ void foo ()
  c = '\u00a0';		/* { dg-bogus "invalid" "00a0 is a valid UCN" } */
  c = '\U00000060';	/* { dg-bogus "invalid" "0060 is a valid UCN" } */

-  c = '\u0025';		/* { dg-error "range" "0025 is an invalid UCN" } */
-  c = L"\uD800"[0];	/* { dg-error "range" "D800 is an invalid UCN" } */
-  c = L'\U0000DFFF';	/* { dg-error "range" "DFFF is an invalid UCN" } */
+  c = '\u0025';		/* { dg-error "not a valid" "0025 invalid UCN" } */
+  c = L"\uD800"[0];	/* { dg-error "not a valid" "D800 invalid UCN" } */
+  c = L'\U0000DFFF';	/* { dg-error "not a valid" "DFFF invalid UCN" } */
 }