lex.c (java_new_lexer): Initialize new fields.

* lex.c (java_new_lexer): Initialize new fields. Work around broken iconv() implementations.
(java_read_char): Swap bytes if required. Use fallback decoder if required.
(byteswap_init, need_byteswap): New globals.
(java_destroy_lexer): Only close iconv handle if it is in use.
* lex.h (java_lexer): New fields read_anything, byte_swap, use_fallback. Made out_buffer unsigned.
From-SVN: r37063

lex.c (java_new_lexer): Initialize new fields.
* lex.c (java_new_lexer): Initialize new fields. Work around broken iconv() implementations.
(java_read_char): Swap bytes if required. Use fallback decoder if required.
(byteswap_init, need_byteswap): New globals.
(java_destroy_lexer): Only close iconv handle if it is in use.
* lex.h (java_lexer): New fields read_anything, byte_swap, use_fallback. Made out_buffer unsigned.
From-SVN: r37063
07b5e470 · Tom Tromey · Tom Tromey · 081b49f1 · 07b5e470 · 07b5e470
Commit 07b5e470 authored Oct 26, 2000 by Tom Tromey Committed by Tom Tromey Oct 26, 2000
Show whitespace changes
Inline Side-by-side

Showing with 121 additions and 12 deletions

gcc/java/ChangeLog
+12 -0

gcc/java/lex.c
+98 -11

gcc/java/lex.h
+11 -1

No files found.
--- a/gcc/java/ChangeLog
+++ b/gcc/java/ChangeLog
+2000-10-24  Tom Tromey  <tromey@cygnus.com>
+	* lex.c (java_new_lexer): Initialize new fields.  Work around
+	broken iconv() implementations.
+	(java_read_char): Swap bytes if required.  Use fallback decoder if
+	required.
+	(byteswap_init, need_byteswap): New globals.
+	(java_destroy_lexer): Only close iconv handle if it is in use.
+	* lex.h (java_lexer): New fields read_anything, byte_swap,
+	use_fallback.
+	Made out_buffer unsigned.
 2000-10-24  Alexandre Petit-Bianco  <apbianco@cygnus.com>
 	* parse.y (register_incomplete_type): Include JDEP_FIELD as a case

--- a/gcc/java/lex.c
+++ b/gcc/java/lex.c
@@ -59,6 +59,15 @@ static void java_unget_unicode PARAMS ((void));
 static unicode_t java_sneak_unicode PARAMS ((void));
 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
+/* This is nonzero if we have initialized `need_byteswap'.  */
+static int byteswap_init = 0;
+/* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
+   big-endian order -- not native endian order.  We handle this by
+   doing a conversion once at startup and seeing what happens.  This
+   flag holds the results of this determination.  */
+static int need_byteswap = 0;
 void
 java_init_lex (finput, encoding)
     FILE *finput;
@@ -208,19 +217,66 @@ java_new_lexer (finput, encoding)
 #ifdef HAVE_ICONV
  lex->handle = iconv_open ("UCS-2", encoding);
-  if (lex->handle == (iconv_t) -1)
+  if (lex->handle != (iconv_t) -1)
    {
-      /* FIXME: we should give a nice error based on errno here.  */
-      enc_error = 1;
-    }
      lex->first = -1;
      lex->last = -1;
      lex->out_first = -1;
      lex->out_last = -1;
-#else /* HAVE_ICONV */
+      lex->read_anything = 0;
+      lex->use_fallback = 0;
+      /* Work around broken iconv() implementations by doing checking at
+	 runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
+	 then all UCS-2 encoders will be broken.  Perhaps not a valid
+	 assumption.  */
+      if (! byteswap_init)
+	{
+	  iconv_t handle;
+	  byteswap_init = 1;
+	  handle = iconv_open ("UCS-2", "UTF-8");
+	  if (handle != (iconv_t) -1)
+	    {
+	      unicode_t result;
+	      unsigned char in[3];
+	      char *inp, *outp;
+	      size_t inc, outc, r;
+	      /* This is the UTF-8 encoding of \ufeff.  */
+	      in[0] = 0xef;
+	      in[1] = 0xbb;
+	      in[2] = 0xbf;
+	      inp = in;
+	      inc = 3;
+	      outp = (char *) &result;
+	      outc = 2;
+	      r = iconv (handle, (const char **) &inp, &inc, &outp, &outc);
+	      /* Conversion must be complete for us to use the result.  */
+	      if (r != (size_t) -1 && inc == 0 && outc == 0)
+		need_byteswap = (result != 0xfeff);
+	    }
+	}
+      lex->byte_swap = need_byteswap;
+    }
+  else
+#endif /* HAVE_ICONV */
+    {
+      /* If iconv failed, use the internal decoder if the default
+	 encoding was requested.  This code is used on platforms where
+	 iconv() exists but is insufficient for our needs.  For
+	 instance, on Solaris 2.5 iconv() cannot handle UTF-8 or UCS-2.  */
      if (strcmp (encoding, DEFAULT_ENCODING))
 	enc_error = 1;
+#ifdef HAVE_ICONV
+      else
+	lex->use_fallback = 1;
 #endif /* HAVE_ICONV */
+    }
  if (enc_error)
    fatal ("unknown encoding: `%s'", encoding);
@@ -233,6 +289,7 @@ java_destroy_lexer (lex)
     java_lexer *lex;
 {
 #ifdef HAVE_ICONV
+  if (! lex->use_fallback)
    iconv_close (lex->handle);
 #endif
  free (lex);
@@ -250,6 +307,7 @@ java_read_char (lex)
    }
 #ifdef HAVE_ICONV
+  if (! lex->use_fallback)
    {
      size_t ir, inbytesleft, in_save, out_count, out_save;
      char *inp, *outp;
@@ -299,6 +357,37 @@ java_read_char (lex)
 	      outp = &lex->out_buffer[lex->out_last];
 	      ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
 			  &outp, &out_count);
+	      /* If we haven't read any bytes, then look to see if we
+		 have read a BOM.  */
+	      if (! lex->read_anything && out_save - out_count >= 2)
+		{
+		  unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
+		  if (uc == 0xfeff)
+		    {
+		      lex->byte_swap = 0;
+		      lex->out_first += 2;
+		    }
+		  else if (uc == 0xfffe)
+		    {
+		      lex->byte_swap = 1;
+		      lex->out_first += 2;
+		    }
+		  lex->read_anything = 1;
+		}
+	      if (lex->byte_swap)
+		{
+		  unsigned int i;
+		  for (i = 0; i < out_save - out_count; i += 2)
+		    {
+		      char t = lex->out_buffer[lex->out_last + i];
+		      lex->out_buffer[lex->out_last + i]
+			= lex->out_buffer[lex->out_last + i + 1];
+		      lex->out_buffer[lex->out_last + i + 1] = t;
+		    }
+		}
 	      lex->first += in_save - inbytesleft;
 	      lex->out_last += out_save - out_count;
@@ -338,14 +427,13 @@ java_read_char (lex)
 	  return UEOF;
 	}
-    /* Success.  We assume that UCS-2 is big-endian.  This appears to
+      /* Success.  */
-       be an ok assumption.  */
+      result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
-    result = ((((unsigned char) lex->out_buffer[lex->out_first]) << 8)
-	      | (unsigned char) lex->out_buffer[lex->out_first + 1]);
      lex->out_first += 2;
      return result;
    }
-#else /* HAVE_ICONV */
+  else
+#endif /* HAVE_ICONV */
    {
      int c, c1, c2;
      c = getc (lex->finput);
@@ -383,7 +471,6 @@ java_read_char (lex)
 	  java_lex_error ("malformed UTF-8 character", 0);
 	}
    }
-#endif /* HAVE_ICONV */
  /* We only get here on error.  */
  return UEOF;

--- a/gcc/java/lex.h
+++ b/gcc/java/lex.h
@@ -115,6 +115,16 @@ typedef struct java_lexer
  unicode_t unget_value;
 #ifdef HAVE_ICONV
+  /* Nonzero if we've read any bytes.  We only recognize the
+     byte-order-marker (BOM) as the first word.  */
+  int read_anything : 1;
+  /* Nonzero if we have to byte swap.  */
+  int byte_swap : 1;
+  /* Nonzero if we're using the fallback decoder.  */
+  int use_fallback : 1;
  /* The handle for the iconv converter we're using.  */
  iconv_t handle;
@@ -132,7 +142,7 @@ typedef struct java_lexer
  /* This is a buffer of characters already converted by iconv.  We
     use `char' here because we're assuming that iconv() converts to
     big-endian UCS-2, and then we convert it ourselves.  */
-  char out_buffer[1024];
+  unsigned char out_buffer[1024];
  /* Index of first valid output character.  -1 if no valid
     characters.  */