Commit 07b5e470, authored and committed by Tom Tromey

lex.c (java_new_lexer): Initialize new fields.

	* lex.c (java_new_lexer): Initialize new fields.  Work around
	broken iconv() implementations.
	(java_read_char): Swap bytes if required.  Use fallback decoder if
	required.
	(byteswap_init, need_byteswap): New globals.
	(java_destroy_lexer): Only close iconv handle if it is in use.
	* lex.h (java_lexer): New fields read_anything, byte_swap,
	use_fallback.
	Made out_buffer unsigned.

From-SVN: r37063
parent 081b49f1
2000-10-24 Tom Tromey <tromey@cygnus.com>
* lex.c (java_new_lexer): Initialize new fields. Work around
broken iconv() implementations.
(java_read_char): Swap bytes if required. Use fallback decoder if
required.
(byteswap_init, need_byteswap): New globals.
(java_destroy_lexer): Only close iconv handle if it is in use.
* lex.h (java_lexer): New fields read_anything, byte_swap,
use_fallback.
Made out_buffer unsigned.
2000-10-24 Alexandre Petit-Bianco <apbianco@cygnus.com> 2000-10-24 Alexandre Petit-Bianco <apbianco@cygnus.com>
* parse.y (register_incomplete_type): Include JDEP_FIELD as a case * parse.y (register_incomplete_type): Include JDEP_FIELD as a case
......
...@@ -59,6 +59,15 @@ static void java_unget_unicode PARAMS ((void)); ...@@ -59,6 +59,15 @@ static void java_unget_unicode PARAMS ((void));
static unicode_t java_sneak_unicode PARAMS ((void)); static unicode_t java_sneak_unicode PARAMS ((void));
java_lexer *java_new_lexer PARAMS ((FILE *, const char *)); java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
/* This is nonzero if we have initialized `need_byteswap'. */
static int byteswap_init = 0;
/* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
big-endian order -- not native endian order. We handle this by
doing a conversion once at startup and seeing what happens. This
flag holds the results of this determination. */
static int need_byteswap = 0;
void void
java_init_lex (finput, encoding) java_init_lex (finput, encoding)
FILE *finput; FILE *finput;
...@@ -208,19 +217,66 @@ java_new_lexer (finput, encoding) ...@@ -208,19 +217,66 @@ java_new_lexer (finput, encoding)
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
lex->handle = iconv_open ("UCS-2", encoding); lex->handle = iconv_open ("UCS-2", encoding);
if (lex->handle == (iconv_t) -1) if (lex->handle != (iconv_t) -1)
{ {
/* FIXME: we should give a nice error based on errno here. */ lex->first = -1;
enc_error = 1; lex->last = -1;
lex->out_first = -1;
lex->out_last = -1;
lex->read_anything = 0;
lex->use_fallback = 0;
/* Work around broken iconv() implementations by doing checking at
runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
then all UCS-2 encoders will be broken. Perhaps not a valid
assumption. */
if (! byteswap_init)
{
iconv_t handle;
byteswap_init = 1;
handle = iconv_open ("UCS-2", "UTF-8");
if (handle != (iconv_t) -1)
{
unicode_t result;
unsigned char in[3];
char *inp, *outp;
size_t inc, outc, r;
/* This is the UTF-8 encoding of \ufeff. */
in[0] = 0xef;
in[1] = 0xbb;
in[2] = 0xbf;
inp = in;
inc = 3;
outp = (char *) &result;
outc = 2;
r = iconv (handle, (const char **) &inp, &inc, &outp, &outc);
/* Conversion must be complete for us to use the result. */
if (r != (size_t) -1 && inc == 0 && outc == 0)
need_byteswap = (result != 0xfeff);
}
}
lex->byte_swap = need_byteswap;
} }
lex->first = -1; else
lex->last = -1;
lex->out_first = -1;
lex->out_last = -1;
#else /* HAVE_ICONV */
if (strcmp (encoding, DEFAULT_ENCODING))
enc_error = 1;
#endif /* HAVE_ICONV */ #endif /* HAVE_ICONV */
{
/* If iconv failed, use the internal decoder if the default
encoding was requested. This code is used on platforms where
iconv() exists but is insufficient for our needs. For
instance, on Solaris 2.5 iconv() cannot handle UTF-8 or UCS-2. */
if (strcmp (encoding, DEFAULT_ENCODING))
enc_error = 1;
#ifdef HAVE_ICONV
else
lex->use_fallback = 1;
#endif /* HAVE_ICONV */
}
if (enc_error) if (enc_error)
fatal ("unknown encoding: `%s'", encoding); fatal ("unknown encoding: `%s'", encoding);
...@@ -233,7 +289,8 @@ java_destroy_lexer (lex) ...@@ -233,7 +289,8 @@ java_destroy_lexer (lex)
java_lexer *lex; java_lexer *lex;
{ {
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
iconv_close (lex->handle); if (! lex->use_fallback)
iconv_close (lex->handle);
#endif #endif
free (lex); free (lex);
} }
...@@ -250,140 +307,170 @@ java_read_char (lex) ...@@ -250,140 +307,170 @@ java_read_char (lex)
} }
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
{ if (! lex->use_fallback)
size_t ir, inbytesleft, in_save, out_count, out_save; {
char *inp, *outp; size_t ir, inbytesleft, in_save, out_count, out_save;
unicode_t result; char *inp, *outp;
unicode_t result;
/* If there is data which has already been converted, use it. */ /* If there is data which has already been converted, use it. */
if (lex->out_first == -1 || lex->out_first >= lex->out_last) if (lex->out_first == -1 || lex->out_first >= lex->out_last)
{ {
lex->out_first = 0; lex->out_first = 0;
lex->out_last = 0; lex->out_last = 0;
while (1) while (1)
{ {
/* See if we need to read more data. If FIRST == 0 then /* See if we need to read more data. If FIRST == 0 then
the previous conversion attempt ended in the middle of the previous conversion attempt ended in the middle of
a character at the end of the buffer. Otherwise we a character at the end of the buffer. Otherwise we
only have to read if the buffer is empty. */ only have to read if the buffer is empty. */
if (lex->first == 0 || lex->first >= lex->last) if (lex->first == 0 || lex->first >= lex->last)
{ {
int r; int r;
if (lex->first >= lex->last) if (lex->first >= lex->last)
{ {
lex->first = 0; lex->first = 0;
lex->last = 0; lex->last = 0;
} }
if (feof (lex->finput)) if (feof (lex->finput))
return UEOF;
r = fread (&lex->buffer[lex->last], 1,
sizeof (lex->buffer) - lex->last,
lex->finput);
lex->last += r;
}
inbytesleft = lex->last - lex->first;
out_count = sizeof (lex->out_buffer) - lex->out_last;
if (inbytesleft == 0)
{
/* We've tried to read and there is nothing left. */
return UEOF; return UEOF;
r = fread (&lex->buffer[lex->last], 1, }
sizeof (lex->buffer) - lex->last,
lex->finput);
lex->last += r;
}
inbytesleft = lex->last - lex->first; in_save = inbytesleft;
out_count = sizeof (lex->out_buffer) - lex->out_last; out_save = out_count;
inp = &lex->buffer[lex->first];
outp = &lex->out_buffer[lex->out_last];
ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
&outp, &out_count);
if (inbytesleft == 0) /* If we haven't read any bytes, then look to see if we
{ have read a BOM. */
/* We've tried to read and there is nothing left. */ if (! lex->read_anything && out_save - out_count >= 2)
return UEOF; {
} unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
if (uc == 0xfeff)
{
lex->byte_swap = 0;
lex->out_first += 2;
}
else if (uc == 0xfffe)
{
lex->byte_swap = 1;
lex->out_first += 2;
}
lex->read_anything = 1;
}
in_save = inbytesleft; if (lex->byte_swap)
out_save = out_count; {
inp = &lex->buffer[lex->first]; unsigned int i;
outp = &lex->out_buffer[lex->out_last]; for (i = 0; i < out_save - out_count; i += 2)
ir = iconv (lex->handle, (const char **) &inp, &inbytesleft, {
&outp, &out_count); char t = lex->out_buffer[lex->out_last + i];
lex->first += in_save - inbytesleft; lex->out_buffer[lex->out_last + i]
lex->out_last += out_save - out_count; = lex->out_buffer[lex->out_last + i + 1];
lex->out_buffer[lex->out_last + i + 1] = t;
/* If we converted anything at all, move along. */ }
if (out_count != out_save) }
break;
if (ir == (size_t) -1) lex->first += in_save - inbytesleft;
{ lex->out_last += out_save - out_count;
if (errno == EINVAL)
{
/* This is ok. This means that the end of our buffer
is in the middle of a character sequence. We just
move the valid part of the buffer to the beginning
to force a read. */
/* We use bcopy() because it should work for
overlapping strings. Use memmove() instead... */
bcopy (&lex->buffer[lex->first], &lex->buffer[0],
lex->last - lex->first);
lex->last -= lex->first;
lex->first = 0;
}
else
{
/* A more serious error. */
java_lex_error ("unrecognized character in input stream",
0);
return UEOF;
}
}
}
}
if (lex->out_first == -1 || lex->out_first >= lex->out_last) /* If we converted anything at all, move along. */
{ if (out_count != out_save)
/* Don't have any data. */ break;
return UEOF;
}
/* Success. We assume that UCS-2 is big-endian. This appears to if (ir == (size_t) -1)
be an ok assumption. */ {
result = ((((unsigned char) lex->out_buffer[lex->out_first]) << 8) if (errno == EINVAL)
| (unsigned char) lex->out_buffer[lex->out_first + 1]); {
lex->out_first += 2; /* This is ok. This means that the end of our buffer
return result; is in the middle of a character sequence. We just
} move the valid part of the buffer to the beginning
#else /* HAVE_ICONV */ to force a read. */
{ /* We use bcopy() because it should work for
int c, c1, c2; overlapping strings. Use memmove() instead... */
c = getc (lex->finput); bcopy (&lex->buffer[lex->first], &lex->buffer[0],
lex->last - lex->first);
if (c < 128) lex->last -= lex->first;
return (unicode_t)c; lex->first = 0;
if (c == EOF) }
return UEOF; else
else {
{ /* A more serious error. */
if ((c & 0xe0) == 0xc0) java_lex_error ("unrecognized character in input stream",
{ 0);
c1 = getc (lex->finput); return UEOF;
if ((c1 & 0xc0) == 0x80) }
return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f)); }
c = c1; }
} }
else if ((c & 0xf0) == 0xe0)
{
c1 = getc (lex->finput);
if ((c1 & 0xc0) == 0x80)
{
c2 = getc (lex->finput);
if ((c2 & 0xc0) == 0x80)
return (unicode_t)(((c & 0xf) << 12) +
(( c1 & 0x3f) << 6) + (c2 & 0x3f));
else
c = c2;
}
else
c = c1;
}
/* We simply don't support invalid characters. */ if (lex->out_first == -1 || lex->out_first >= lex->out_last)
java_lex_error ("malformed UTF-8 character", 0); {
} /* Don't have any data. */
} return UEOF;
}
/* Success. */
result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
lex->out_first += 2;
return result;
}
else
#endif /* HAVE_ICONV */ #endif /* HAVE_ICONV */
{
int c, c1, c2;
c = getc (lex->finput);
if (c < 128)
return (unicode_t)c;
if (c == EOF)
return UEOF;
else
{
if ((c & 0xe0) == 0xc0)
{
c1 = getc (lex->finput);
if ((c1 & 0xc0) == 0x80)
return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
c = c1;
}
else if ((c & 0xf0) == 0xe0)
{
c1 = getc (lex->finput);
if ((c1 & 0xc0) == 0x80)
{
c2 = getc (lex->finput);
if ((c2 & 0xc0) == 0x80)
return (unicode_t)(((c & 0xf) << 12) +
(( c1 & 0x3f) << 6) + (c2 & 0x3f));
else
c = c2;
}
else
c = c1;
}
/* We simply don't support invalid characters. */
java_lex_error ("malformed UTF-8 character", 0);
}
}
/* We only get here on error. */ /* We only get here on error. */
return UEOF; return UEOF;
......
...@@ -115,6 +115,16 @@ typedef struct java_lexer ...@@ -115,6 +115,16 @@ typedef struct java_lexer
unicode_t unget_value; unicode_t unget_value;
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
/* Nonzero if we've read any bytes. We only recognize the
byte-order-marker (BOM) as the first word. */
int read_anything : 1;
/* Nonzero if we have to byte swap. */
int byte_swap : 1;
/* Nonzero if we're using the fallback decoder. */
int use_fallback : 1;
/* The handle for the iconv converter we're using. */ /* The handle for the iconv converter we're using. */
iconv_t handle; iconv_t handle;
...@@ -132,7 +142,7 @@ typedef struct java_lexer ...@@ -132,7 +142,7 @@ typedef struct java_lexer
/* This is a buffer of characters already converted by iconv. We /* This is a buffer of characters already converted by iconv. We
use `char' here because we're assuming that iconv() converts to use `char' here because we're assuming that iconv() converts to
big-endian UCS-2, and then we convert it ourselves. */ big-endian UCS-2, and then we convert it ourselves. */
char out_buffer[1024]; unsigned char out_buffer[1024];
/* Index of first valid output character. -1 if no valid /* Index of first valid output character. -1 if no valid
characters. */ characters. */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment