Commit 07b5e470 by Tom Tromey Committed by Tom Tromey

lex.c (java_new_lexer): Initialize new fields.

	* lex.c (java_new_lexer): Initialize new fields.  Work around
	broken iconv() implementations.
	(java_read_char): Swap bytes if required.  Use fallback decoder if
	required.
	(byteswap_init, need_byteswap): New globals.
	(java_destroy_lexer): Only close iconv handle if it is in use.
	* lex.h (java_lexer): New fields read_anything, byte_swap,
	use_fallback.
	Made out_buffer unsigned.

From-SVN: r37063
parent 081b49f1
2000-10-24 Tom Tromey <tromey@cygnus.com>
* lex.c (java_new_lexer): Initialize new fields. Work around
broken iconv() implementations.
(java_read_char): Swap bytes if required. Use fallback decoder if
required.
(byteswap_init, need_byteswap): New globals.
(java_destroy_lexer): Only close iconv handle if it is in use.
* lex.h (java_lexer): New fields read_anything, byte_swap,
use_fallback.
Made out_buffer unsigned.
2000-10-24 Alexandre Petit-Bianco <apbianco@cygnus.com>
* parse.y (register_incomplete_type): Include JDEP_FIELD as a case
......
......@@ -59,6 +59,15 @@ static void java_unget_unicode PARAMS ((void));
static unicode_t java_sneak_unicode PARAMS ((void));
java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
/* This is nonzero if we have initialized `need_byteswap'. */
static int byteswap_init = 0;
/* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
big-endian order -- not native endian order. We handle this by
doing a test conversion the first time a lexer is created and seeing
what happens. This flag holds the result of that determination. */
static int need_byteswap = 0;
void
java_init_lex (finput, encoding)
FILE *finput;
......@@ -208,19 +217,66 @@ java_new_lexer (finput, encoding)
#ifdef HAVE_ICONV
lex->handle = iconv_open ("UCS-2", encoding);
if (lex->handle == (iconv_t) -1)
if (lex->handle != (iconv_t) -1)
{
/* FIXME: we should give a nice error based on errno here. */
enc_error = 1;
lex->first = -1;
lex->last = -1;
lex->out_first = -1;
lex->out_last = -1;
lex->read_anything = 0;
lex->use_fallback = 0;
/* Work around broken iconv() implementations by checking at
runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
then all UCS-2 encoders will be broken. This is perhaps not a
valid assumption. */
if (! byteswap_init)
{
iconv_t handle;
byteswap_init = 1;
handle = iconv_open ("UCS-2", "UTF-8");
if (handle != (iconv_t) -1)
{
unicode_t result;
unsigned char in[3];
char *inp, *outp;
size_t inc, outc, r;
/* This is the UTF-8 encoding of \ufeff. */
in[0] = 0xef;
in[1] = 0xbb;
in[2] = 0xbf;
inp = in;
inc = 3;
outp = (char *) &result;
outc = 2;
r = iconv (handle, (const char **) &inp, &inc, &outp, &outc);
/* Conversion must be complete for us to use the result. */
if (r != (size_t) -1 && inc == 0 && outc == 0)
need_byteswap = (result != 0xfeff);
}
}
lex->byte_swap = need_byteswap;
}
lex->first = -1;
lex->last = -1;
lex->out_first = -1;
lex->out_last = -1;
#else /* HAVE_ICONV */
if (strcmp (encoding, DEFAULT_ENCODING))
enc_error = 1;
else
#endif /* HAVE_ICONV */
{
/* If iconv failed, use the internal decoder if the default
encoding was requested. This code is used on platforms where
iconv() exists but is insufficient for our needs. For
instance, on Solaris 2.5 iconv() cannot handle UTF-8 or UCS-2. */
if (strcmp (encoding, DEFAULT_ENCODING))
enc_error = 1;
#ifdef HAVE_ICONV
else
lex->use_fallback = 1;
#endif /* HAVE_ICONV */
}
if (enc_error)
fatal ("unknown encoding: `%s'", encoding);
......@@ -233,7 +289,8 @@ java_destroy_lexer (lex)
java_lexer *lex;
{
#ifdef HAVE_ICONV
iconv_close (lex->handle);
if (! lex->use_fallback)
iconv_close (lex->handle);
#endif
free (lex);
}
......@@ -250,140 +307,170 @@ java_read_char (lex)
}
#ifdef HAVE_ICONV
{
size_t ir, inbytesleft, in_save, out_count, out_save;
char *inp, *outp;
unicode_t result;
if (! lex->use_fallback)
{
size_t ir, inbytesleft, in_save, out_count, out_save;
char *inp, *outp;
unicode_t result;
/* If there is data which has already been converted, use it. */
if (lex->out_first == -1 || lex->out_first >= lex->out_last)
{
lex->out_first = 0;
lex->out_last = 0;
/* If there is data which has already been converted, use it. */
if (lex->out_first == -1 || lex->out_first >= lex->out_last)
{
lex->out_first = 0;
lex->out_last = 0;
while (1)
{
/* See if we need to read more data. If FIRST == 0 then
the previous conversion attempt ended in the middle of
a character at the end of the buffer. Otherwise we
only have to read if the buffer is empty. */
if (lex->first == 0 || lex->first >= lex->last)
{
int r;
if (lex->first >= lex->last)
{
lex->first = 0;
lex->last = 0;
}
if (feof (lex->finput))
while (1)
{
/* See if we need to read more data. If FIRST == 0 then
the previous conversion attempt ended in the middle of
a character at the end of the buffer. Otherwise we
only have to read if the buffer is empty. */
if (lex->first == 0 || lex->first >= lex->last)
{
int r;
if (lex->first >= lex->last)
{
lex->first = 0;
lex->last = 0;
}
if (feof (lex->finput))
return UEOF;
r = fread (&lex->buffer[lex->last], 1,
sizeof (lex->buffer) - lex->last,
lex->finput);
lex->last += r;
}
inbytesleft = lex->last - lex->first;
out_count = sizeof (lex->out_buffer) - lex->out_last;
if (inbytesleft == 0)
{
/* We've tried to read and there is nothing left. */
return UEOF;
r = fread (&lex->buffer[lex->last], 1,
sizeof (lex->buffer) - lex->last,
lex->finput);
lex->last += r;
}
}
inbytesleft = lex->last - lex->first;
out_count = sizeof (lex->out_buffer) - lex->out_last;
in_save = inbytesleft;
out_save = out_count;
inp = &lex->buffer[lex->first];
outp = &lex->out_buffer[lex->out_last];
ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
&outp, &out_count);
if (inbytesleft == 0)
{
/* We've tried to read and there is nothing left. */
return UEOF;
}
/* If this is the first data we have converted, check whether it
starts with a byte-order marker (BOM). */
if (! lex->read_anything && out_save - out_count >= 2)
{
unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
if (uc == 0xfeff)
{
lex->byte_swap = 0;
lex->out_first += 2;
}
else if (uc == 0xfffe)
{
lex->byte_swap = 1;
lex->out_first += 2;
}
lex->read_anything = 1;
}
in_save = inbytesleft;
out_save = out_count;
inp = &lex->buffer[lex->first];
outp = &lex->out_buffer[lex->out_last];
ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
&outp, &out_count);
lex->first += in_save - inbytesleft;
lex->out_last += out_save - out_count;
/* If we converted anything at all, move along. */
if (out_count != out_save)
break;
if (lex->byte_swap)
{
unsigned int i;
for (i = 0; i < out_save - out_count; i += 2)
{
char t = lex->out_buffer[lex->out_last + i];
lex->out_buffer[lex->out_last + i]
= lex->out_buffer[lex->out_last + i + 1];
lex->out_buffer[lex->out_last + i + 1] = t;
}
}
if (ir == (size_t) -1)
{
if (errno == EINVAL)
{
/* This is ok. This means that the end of our buffer
is in the middle of a character sequence. We just
move the valid part of the buffer to the beginning
to force a read. */
/* We use bcopy() because it should work for
overlapping strings. Use memmove() instead... */
bcopy (&lex->buffer[lex->first], &lex->buffer[0],
lex->last - lex->first);
lex->last -= lex->first;
lex->first = 0;
}
else
{
/* A more serious error. */
java_lex_error ("unrecognized character in input stream",
0);
return UEOF;
}
}
}
}
lex->first += in_save - inbytesleft;
lex->out_last += out_save - out_count;
if (lex->out_first == -1 || lex->out_first >= lex->out_last)
{
/* Don't have any data. */
return UEOF;
}
/* If we converted anything at all, move along. */
if (out_count != out_save)
break;
/* Success. We assume that UCS-2 is big-endian. This appears to
be an ok assumption. */
result = ((((unsigned char) lex->out_buffer[lex->out_first]) << 8)
| (unsigned char) lex->out_buffer[lex->out_first + 1]);
lex->out_first += 2;
return result;
}
#else /* HAVE_ICONV */
{
int c, c1, c2;
c = getc (lex->finput);
if (c < 128)
return (unicode_t)c;
if (c == EOF)
return UEOF;
else
{
if ((c & 0xe0) == 0xc0)
{
c1 = getc (lex->finput);
if ((c1 & 0xc0) == 0x80)
return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
c = c1;
}
else if ((c & 0xf0) == 0xe0)
{
c1 = getc (lex->finput);
if ((c1 & 0xc0) == 0x80)
{
c2 = getc (lex->finput);
if ((c2 & 0xc0) == 0x80)
return (unicode_t)(((c & 0xf) << 12) +
(( c1 & 0x3f) << 6) + (c2 & 0x3f));
else
c = c2;
}
else
c = c1;
}
if (ir == (size_t) -1)
{
if (errno == EINVAL)
{
/* This is ok. This means that the end of our buffer
is in the middle of a character sequence. We just
move the valid part of the buffer to the beginning
to force a read. */
/* We use bcopy() because it should work for
overlapping strings. Use memmove() instead... */
bcopy (&lex->buffer[lex->first], &lex->buffer[0],
lex->last - lex->first);
lex->last -= lex->first;
lex->first = 0;
}
else
{
/* A more serious error. */
java_lex_error ("unrecognized character in input stream",
0);
return UEOF;
}
}
}
}
/* We simply don't support invalid characters. */
java_lex_error ("malformed UTF-8 character", 0);
}
}
if (lex->out_first == -1 || lex->out_first >= lex->out_last)
{
/* Don't have any data. */
return UEOF;
}
/* Success. */
result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
lex->out_first += 2;
return result;
}
else
#endif /* HAVE_ICONV */
{
int c, c1, c2;
c = getc (lex->finput);
if (c < 128)
return (unicode_t)c;
if (c == EOF)
return UEOF;
else
{
if ((c & 0xe0) == 0xc0)
{
c1 = getc (lex->finput);
if ((c1 & 0xc0) == 0x80)
return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
c = c1;
}
else if ((c & 0xf0) == 0xe0)
{
c1 = getc (lex->finput);
if ((c1 & 0xc0) == 0x80)
{
c2 = getc (lex->finput);
if ((c2 & 0xc0) == 0x80)
return (unicode_t)(((c & 0xf) << 12) +
(( c1 & 0x3f) << 6) + (c2 & 0x3f));
else
c = c2;
}
else
c = c1;
}
/* We simply don't support invalid characters. */
java_lex_error ("malformed UTF-8 character", 0);
}
}
/* We only get here on error. */
return UEOF;
......
......@@ -115,6 +115,16 @@ typedef struct java_lexer
unicode_t unget_value;
#ifdef HAVE_ICONV
/* Nonzero if we've read any bytes. We only recognize the
byte-order-marker (BOM) as the first word. */
int read_anything : 1;
/* Nonzero if we have to byte swap. */
int byte_swap : 1;
/* Nonzero if we're using the fallback decoder. */
int use_fallback : 1;
/* The handle for the iconv converter we're using. */
iconv_t handle;
......@@ -132,7 +142,7 @@ typedef struct java_lexer
/* This is a buffer of characters already converted by iconv. It
holds the raw UCS-2 bytes that iconv() produced; they are byte
swapped in place when the lexer's byte_swap flag is set. */
char out_buffer[1024];
unsigned char out_buffer[1024];
/* Index of first valid output character. -1 if no valid
characters. */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment