Commit 07b5e470 authored by Tom Tromey, committed by Tom Tromey

lex.c (java_new_lexer): Initialize new fields.

	* lex.c (java_new_lexer): Initialize new fields.  Work around
	broken iconv() implementations.
	(java_read_char): Swap bytes if required.  Use fallback decoder if
	required.
	(byteswap_init, need_byteswap): New globals.
	(java_destroy_lexer): Only close iconv handle if it is in use.
	* lex.h (java_lexer): New fields read_anything, byte_swap,
	use_fallback.
	Made out_buffer unsigned.

From-SVN: r37063
parent 081b49f1
2000-10-24 Tom Tromey <tromey@cygnus.com>
* lex.c (java_new_lexer): Initialize new fields. Work around
broken iconv() implementations.
(java_read_char): Swap bytes if required. Use fallback decoder if
required.
(byteswap_init, need_byteswap): New globals.
(java_destroy_lexer): Only close iconv handle if it is in use.
* lex.h (java_lexer): New fields read_anything, byte_swap,
use_fallback.
Made out_buffer unsigned.
2000-10-24 Alexandre Petit-Bianco <apbianco@cygnus.com> 2000-10-24 Alexandre Petit-Bianco <apbianco@cygnus.com>
* parse.y (register_incomplete_type): Include JDEP_FIELD as a case * parse.y (register_incomplete_type): Include JDEP_FIELD as a case
......
...@@ -59,6 +59,15 @@ static void java_unget_unicode PARAMS ((void)); ...@@ -59,6 +59,15 @@ static void java_unget_unicode PARAMS ((void));
static unicode_t java_sneak_unicode PARAMS ((void)); static unicode_t java_sneak_unicode PARAMS ((void));
java_lexer *java_new_lexer PARAMS ((FILE *, const char *)); java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
/* This is nonzero if we have initialized `need_byteswap'. It is set
just before the iconv() byte-order probe is attempted, whether or not
the probe succeeds, so the probe is attempted at most once. */
static int byteswap_init = 0;
/* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
big-endian order -- not native endian order. We handle this by
converting the UTF-8 encoding of \ufeff once, the first time a lexer
is created with a usable iconv handle, and seeing what happens. This
flag holds the results of this determination: nonzero means the
converted value did not read back as 0xfeff, so the two bytes of each
output character must be swapped. It stays zero if the probe could
not be opened or did not complete. Each new lexer copies this value
into its own byte_swap field. */
static int need_byteswap = 0;
void void
java_init_lex (finput, encoding) java_init_lex (finput, encoding)
FILE *finput; FILE *finput;
...@@ -208,19 +217,66 @@ java_new_lexer (finput, encoding) ...@@ -208,19 +217,66 @@ java_new_lexer (finput, encoding)
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
lex->handle = iconv_open ("UCS-2", encoding); lex->handle = iconv_open ("UCS-2", encoding);
if (lex->handle == (iconv_t) -1) if (lex->handle != (iconv_t) -1)
{ {
/* FIXME: we should give a nice error based on errno here. */
enc_error = 1;
}
lex->first = -1; lex->first = -1;
lex->last = -1; lex->last = -1;
lex->out_first = -1; lex->out_first = -1;
lex->out_last = -1; lex->out_last = -1;
#else /* HAVE_ICONV */ lex->read_anything = 0;
lex->use_fallback = 0;
/* Work around broken iconv() implementations by doing checking at
runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
then all UCS-2 encoders will be broken. Perhaps not a valid
assumption. */
if (! byteswap_init)
{
iconv_t handle;
byteswap_init = 1;
handle = iconv_open ("UCS-2", "UTF-8");
if (handle != (iconv_t) -1)
{
unicode_t result;
unsigned char in[3];
char *inp, *outp;
size_t inc, outc, r;
/* This is the UTF-8 encoding of \ufeff. */
in[0] = 0xef;
in[1] = 0xbb;
in[2] = 0xbf;
inp = in;
inc = 3;
outp = (char *) &result;
outc = 2;
r = iconv (handle, (const char **) &inp, &inc, &outp, &outc);
/* Conversion must be complete for us to use the result. */
if (r != (size_t) -1 && inc == 0 && outc == 0)
need_byteswap = (result != 0xfeff);
}
}
lex->byte_swap = need_byteswap;
}
else
#endif /* HAVE_ICONV */
{
/* If iconv failed, use the internal decoder if the default
encoding was requested. This code is used on platforms where
iconv() exists but is insufficient for our needs. For
instance, on Solaris 2.5 iconv() cannot handle UTF-8 or UCS-2. */
if (strcmp (encoding, DEFAULT_ENCODING)) if (strcmp (encoding, DEFAULT_ENCODING))
enc_error = 1; enc_error = 1;
#ifdef HAVE_ICONV
else
lex->use_fallback = 1;
#endif /* HAVE_ICONV */ #endif /* HAVE_ICONV */
}
if (enc_error) if (enc_error)
fatal ("unknown encoding: `%s'", encoding); fatal ("unknown encoding: `%s'", encoding);
...@@ -233,6 +289,7 @@ java_destroy_lexer (lex) ...@@ -233,6 +289,7 @@ java_destroy_lexer (lex)
java_lexer *lex; java_lexer *lex;
{ {
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
if (! lex->use_fallback)
iconv_close (lex->handle); iconv_close (lex->handle);
#endif #endif
free (lex); free (lex);
...@@ -250,6 +307,7 @@ java_read_char (lex) ...@@ -250,6 +307,7 @@ java_read_char (lex)
} }
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
if (! lex->use_fallback)
{ {
size_t ir, inbytesleft, in_save, out_count, out_save; size_t ir, inbytesleft, in_save, out_count, out_save;
char *inp, *outp; char *inp, *outp;
...@@ -299,6 +357,37 @@ java_read_char (lex) ...@@ -299,6 +357,37 @@ java_read_char (lex)
outp = &lex->out_buffer[lex->out_last]; outp = &lex->out_buffer[lex->out_last];
ir = iconv (lex->handle, (const char **) &inp, &inbytesleft, ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
&outp, &out_count); &outp, &out_count);
/* If we haven't read any bytes, then look to see if we
have read a BOM. */
if (! lex->read_anything && out_save - out_count >= 2)
{
unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
if (uc == 0xfeff)
{
lex->byte_swap = 0;
lex->out_first += 2;
}
else if (uc == 0xfffe)
{
lex->byte_swap = 1;
lex->out_first += 2;
}
lex->read_anything = 1;
}
if (lex->byte_swap)
{
unsigned int i;
for (i = 0; i < out_save - out_count; i += 2)
{
char t = lex->out_buffer[lex->out_last + i];
lex->out_buffer[lex->out_last + i]
= lex->out_buffer[lex->out_last + i + 1];
lex->out_buffer[lex->out_last + i + 1] = t;
}
}
lex->first += in_save - inbytesleft; lex->first += in_save - inbytesleft;
lex->out_last += out_save - out_count; lex->out_last += out_save - out_count;
...@@ -338,14 +427,13 @@ java_read_char (lex) ...@@ -338,14 +427,13 @@ java_read_char (lex)
return UEOF; return UEOF;
} }
/* Success. We assume that UCS-2 is big-endian. This appears to /* Success. */
be an ok assumption. */ result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
result = ((((unsigned char) lex->out_buffer[lex->out_first]) << 8)
| (unsigned char) lex->out_buffer[lex->out_first + 1]);
lex->out_first += 2; lex->out_first += 2;
return result; return result;
} }
#else /* HAVE_ICONV */ else
#endif /* HAVE_ICONV */
{ {
int c, c1, c2; int c, c1, c2;
c = getc (lex->finput); c = getc (lex->finput);
...@@ -383,7 +471,6 @@ java_read_char (lex) ...@@ -383,7 +471,6 @@ java_read_char (lex)
java_lex_error ("malformed UTF-8 character", 0); java_lex_error ("malformed UTF-8 character", 0);
} }
} }
#endif /* HAVE_ICONV */
/* We only get here on error. */ /* We only get here on error. */
return UEOF; return UEOF;
......
...@@ -115,6 +115,16 @@ typedef struct java_lexer ...@@ -115,6 +115,16 @@ typedef struct java_lexer
unicode_t unget_value; unicode_t unget_value;
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
/* Nonzero if we've read any bytes. We only recognize the
byte-order-marker (BOM) as the first word. */
int read_anything : 1;
/* Nonzero if we have to byte swap. */
int byte_swap : 1;
/* Nonzero if we're using the fallback decoder. */
int use_fallback : 1;
/* The handle for the iconv converter we're using. */ /* The handle for the iconv converter we're using. */
iconv_t handle; iconv_t handle;
...@@ -132,7 +142,7 @@ typedef struct java_lexer ...@@ -132,7 +142,7 @@ typedef struct java_lexer
/* This is a buffer of characters already converted by iconv. We /* This is a buffer of characters already converted by iconv. We
use `char' here because we're assuming that iconv() converts to use `char' here because we're assuming that iconv() converts to
big-endian UCS-2, and then we convert it ourselves. */ big-endian UCS-2, and then we convert it ourselves. */
char out_buffer[1024]; unsigned char out_buffer[1024];
/* Index of first valid output character. -1 if no valid /* Index of first valid output character. -1 if no valid
characters. */ characters. */
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment