Commit d19cbcb5 by Tom Tromey Committed by Tom Tromey

re GNATS gcj/33 (gcj mangles composed characters)

	Fix for PR gcj/33:
	* jv-scan.c (help): Document --encoding.
	(options): Added `encoding' entry.
	(OPT_ENCODING): New define.
	(main): Handle --encoding.
	Include <langinfo.h> if nl_langinfo exists.
	* lang-options.h: Document --classpath, --CLASSPATH, --main, and
	--encoding.
	* jcf-parse.c: Include <langinfo.h> if we have nl_langinfo.
	(parse_source_file): Correctly call java_init_lex.  Added `finput'
	argument.  Use nl_langinfo to determine default encoding.
	* java-tree.h (current_encoding): Declare.
	* parse.y (java_parser_context_restore_global): Don't restore
	`finput'.
	(java_parser_context_save_global): Don't set `finput' field.
	(java_pop_parser_context): Don't restore `finput'.  Free old lexer
	if required.
	* lang.c (current_encoding): New global.
	(lang_decode_option): Recognize `-fencoding='.
	(finish_parse): Don't close finput.
	* parse.h (struct parser_ctxt): Removed `finput' and
	`unget_utf8_value' fields.  Added `lexer' field.
	(java_init_lex): Fixed declaration.
	* lex.c (java_new_lexer): New function.
	(java_destroy_lexer): Likewise.
	(java_read_char): Added `lex' argument.  Handle iconv case.
	(java_read_unicode): Added `lex' argument.  Count backslashes in
	lexer structure.
	(java_init_lex): Added `finput' and `encoding' arguments.  Set
	`lexer' field in ctxp.
	(BAD_UTF8_VALUE): Removed.
	(java_lex): Handle seeing UEOF in the middle of a string literal.
	* lex.h: Include <iconv.h> if HAVE_ICONV defined.
	(java_lexer): New structure.
	(UNGETC): Removed.
	(GETC): Removed.
	(DEFAULT_ENCODING): New define.
	(java_destroy_lexer): Declare.

From-SVN: r36377
parent ee17a290
2000-09-12 Tom Tromey <tromey@cygnus.com> 2000-09-12 Tom Tromey <tromey@cygnus.com>
Fix for PR gcj/33:
* jv-scan.c (help): Document --encoding.
(options): Added `encoding' entry.
(OPT_ENCODING): New define.
(main): Handle --encoding.
Include <langinfo.h> if nl_langinfo exists.
* lang-options.h: Document --classpath, --CLASSPATH, --main, and
--encoding.
* jcf-parse.c: Include <langinfo.h> if we have nl_langinfo.
(parse_source_file): Correctly call java_init_lex. Added `finput'
argument. Use nl_langinfo to determine default encoding.
* java-tree.h (current_encoding): Declare.
* parse.y (java_parser_context_restore_global): Don't restore
`finput'.
(java_parser_context_save_global): Don't set `finput' field.
(java_pop_parser_context): Don't restore `finput'. Free old lexer
if required.
* lang.c (current_encoding): New global.
(lang_decode_option): Recognize `-fencoding='.
(finish_parse): Don't close finput.
* parse.h (struct parser_ctxt): Removed `finput' and
`unget_utf8_value' fields. Added `lexer' field.
(java_init_lex): Fixed declaration.
* lex.c (java_new_lexer): New function.
(java_destroy_lexer): Likewise.
(java_read_char): Added `lex' argument. Handle iconv case.
(java_read_unicode): Added `lex' argument. Count backslashes in
lexer structure.
(java_init_lex): Added `finput' and `encoding' arguments. Set
`lexer' field in ctxp.
(BAD_UTF8_VALUE): Removed.
(java_lex): Handle seeing UEOF in the middle of a string literal.
* lex.h: Include <iconv.h> if HAVE_ICONV defined.
(java_lexer): New structure.
(UNGETC): Removed.
(GETC): Removed.
(DEFAULT_ENCODING): New define.
(java_destroy_lexer): Declare.
2000-09-12 Tom Tromey <tromey@cygnus.com>
Fix for PR gcj/343: Fix for PR gcj/343:
* lex.c (java_init_lex): Initialize java_io_serializable. * lex.c (java_init_lex): Initialize java_io_serializable.
* parse.y (java_io_serializable): New global. * parse.y (java_io_serializable): New global.
......
...@@ -169,6 +169,9 @@ extern int flag_use_boehm_gc; ...@@ -169,6 +169,9 @@ extern int flag_use_boehm_gc;
object to its synchronization structure. */ object to its synchronization structure. */
extern int flag_hash_synchronization; extern int flag_hash_synchronization;
/* Encoding used for source files. */
extern char *current_encoding;
/* The Java .class file that provides main_class; the main input file. */ /* The Java .class file that provides main_class; the main input file. */
extern struct JCF *current_jcf; extern struct JCF *current_jcf;
......
...@@ -35,6 +35,10 @@ The Free Software Foundation is independent of Sun Microsystems, Inc. */ ...@@ -35,6 +35,10 @@ The Free Software Foundation is independent of Sun Microsystems, Inc. */
#include "toplev.h" #include "toplev.h"
#include "parse.h" #include "parse.h"
#ifdef HAVE_NL_LANGINFO
#include <langinfo.h>
#endif
/* A CONSTANT_Utf8 element is converted to an IDENTIFIER_NODE at parse time. */ /* A CONSTANT_Utf8 element is converted to an IDENTIFIER_NODE at parse time. */
#define JPOOL_UTF(JCF, INDEX) CPOOL_UTF(&(JCF)->cpool, INDEX) #define JPOOL_UTF(JCF, INDEX) CPOOL_UTF(&(JCF)->cpool, INDEX)
#define JPOOL_UTF_LENGTH(JCF, INDEX) IDENTIFIER_LENGTH (JPOOL_UTF (JCF, INDEX)) #define JPOOL_UTF_LENGTH(JCF, INDEX) IDENTIFIER_LENGTH (JPOOL_UTF (JCF, INDEX))
...@@ -83,7 +87,7 @@ static struct JCF main_jcf[1]; ...@@ -83,7 +87,7 @@ static struct JCF main_jcf[1];
static tree give_name_to_class PARAMS ((JCF *jcf, int index)); static tree give_name_to_class PARAMS ((JCF *jcf, int index));
static void parse_zip_file_entries PARAMS ((void)); static void parse_zip_file_entries PARAMS ((void));
static void process_zip_dir PARAMS ((void)); static void process_zip_dir PARAMS ((void));
static void parse_source_file PARAMS ((tree)); static void parse_source_file PARAMS ((tree, FILE *));
static void jcf_parse_source PARAMS ((void)); static void jcf_parse_source PARAMS ((void));
static int jcf_figure_file_type PARAMS ((JCF *)); static int jcf_figure_file_type PARAMS ((JCF *));
static int find_in_current_zip PARAMS ((const char *, struct JCF **)); static int find_in_current_zip PARAMS ((const char *, struct JCF **));
...@@ -564,6 +568,7 @@ static void ...@@ -564,6 +568,7 @@ static void
jcf_parse_source () jcf_parse_source ()
{ {
tree file; tree file;
FILE *finput;
java_parser_context_save_global (); java_parser_context_save_global ();
java_push_parser_context (); java_push_parser_context ();
...@@ -576,7 +581,7 @@ jcf_parse_source () ...@@ -576,7 +581,7 @@ jcf_parse_source ()
if (!(finput = fopen (input_filename, "r"))) if (!(finput = fopen (input_filename, "r")))
fatal ("input file `%s' just disappeared - jcf_parse_source", fatal ("input file `%s' just disappeared - jcf_parse_source",
input_filename); input_filename);
parse_source_file (file); parse_source_file (file, finput);
if (fclose (finput)) if (fclose (finput))
fatal ("can't close input file `%s' stream - jcf_parse_source", fatal ("can't close input file `%s' stream - jcf_parse_source",
input_filename); input_filename);
...@@ -754,8 +759,9 @@ parse_class_file () ...@@ -754,8 +759,9 @@ parse_class_file ()
/* Parse a source file, as pointed by the current value of INPUT_FILENAME. */ /* Parse a source file, as pointed by the current value of INPUT_FILENAME. */
static void static void
parse_source_file (file) parse_source_file (file, finput)
tree file; tree file;
FILE *finput;
{ {
int save_error_count = java_error_count; int save_error_count = java_error_count;
/* Mark the file as parsed */ /* Mark the file as parsed */
...@@ -765,7 +771,21 @@ parse_source_file (file) ...@@ -765,7 +771,21 @@ parse_source_file (file)
lang_init_source (1); /* Error msgs have no method prototypes */ lang_init_source (1); /* Error msgs have no method prototypes */
java_init_lex (); /* Initialize the parser */ /* There's no point in trying to find the current encoding unless we
are going to do something intelligent with it -- hence the test
for iconv. */
#ifdef HAVE_ICONV
#ifdef HAVE_NL_LANGINFO
setlocale (LC_CTYPE, "");
if (current_encoding == NULL)
current_encoding = nl_langinfo (CODESET);
#endif /* HAVE_NL_LANGINFO */
#endif /* HAVE_ICONV */
if (current_encoding == NULL || *current_encoding == '\0')
current_encoding = DEFAULT_ENCODING;
/* Initialize the parser */
java_init_lex (finput, current_encoding);
java_parse_abort_on_error (); java_parse_abort_on_error ();
java_parse (); /* Parse and build partial tree nodes. */ java_parse (); /* Parse and build partial tree nodes. */
...@@ -796,6 +816,7 @@ yyparse () ...@@ -796,6 +816,7 @@ yyparse ()
int several_files = 0; int several_files = 0;
char *list = xstrdup (input_filename), *next; char *list = xstrdup (input_filename), *next;
tree node, current_file_list = NULL_TREE; tree node, current_file_list = NULL_TREE;
FILE *finput;
do do
{ {
...@@ -901,7 +922,7 @@ yyparse () ...@@ -901,7 +922,7 @@ yyparse ()
case JCF_SOURCE: case JCF_SOURCE:
java_push_parser_context (); java_push_parser_context ();
java_parser_context_save_global (); java_parser_context_save_global ();
parse_source_file (name); parse_source_file (name, finput);
java_parser_context_restore_global (); java_parser_context_restore_global ();
java_pop_parser_context (1); java_pop_parser_context (1);
break; break;
......
...@@ -26,6 +26,10 @@ Boston, MA 02111-1307, USA. */ ...@@ -26,6 +26,10 @@ Boston, MA 02111-1307, USA. */
#include "version.h" #include "version.h"
#ifdef HAVE_NL_LANGINFO
#include <langinfo.h>
#endif
#include <getopt.h> #include <getopt.h>
void fatal PARAMS ((const char *s, ...)) ATTRIBUTE_PRINTF_1 ATTRIBUTE_NORETURN; void fatal PARAMS ((const char *s, ...)) ATTRIBUTE_PRINTF_1 ATTRIBUTE_NORETURN;
...@@ -61,6 +65,7 @@ int flag_list_filename = 0; ...@@ -61,6 +65,7 @@ int flag_list_filename = 0;
#define OPT_HELP LONG_OPT (0) #define OPT_HELP LONG_OPT (0)
#define OPT_VERSION LONG_OPT (1) #define OPT_VERSION LONG_OPT (1)
#define OPT_ENCODING LONG_OPT (2)
static struct option options[] = static struct option options[] =
{ {
...@@ -69,6 +74,7 @@ static struct option options[] = ...@@ -69,6 +74,7 @@ static struct option options[] =
{ "print-main", no_argument, &flag_find_main, 1 }, { "print-main", no_argument, &flag_find_main, 1 },
{ "list-filename", no_argument, &flag_list_filename, 1 }, { "list-filename", no_argument, &flag_list_filename, 1 },
{ "list-class", no_argument, &flag_dump_class, 1 }, { "list-class", no_argument, &flag_dump_class, 1 },
{ "encoding", required_argument, NULL, OPT_ENCODING },
{ NULL, no_argument, NULL, 0 } { NULL, no_argument, NULL, 0 }
}; };
...@@ -84,6 +90,7 @@ help () ...@@ -84,6 +90,7 @@ help ()
{ {
printf ("Usage: jv-scan [OPTION]... FILE...\n\n"); printf ("Usage: jv-scan [OPTION]... FILE...\n\n");
printf ("Print useful information read from Java source files.\n\n"); printf ("Print useful information read from Java source files.\n\n");
printf (" --encoding NAME Specify encoding of input file\n");
printf (" --print-main Print name of class containing `main'\n"); printf (" --print-main Print name of class containing `main'\n");
printf (" --list-class List all classes defined in file\n"); printf (" --list-class List all classes defined in file\n");
printf (" --list-filename Print input filename when listing class names\n"); printf (" --list-filename Print input filename when listing class names\n");
...@@ -114,6 +121,7 @@ DEFUN (main, (argc, argv), ...@@ -114,6 +121,7 @@ DEFUN (main, (argc, argv),
{ {
int i = 1; int i = 1;
const char *output_file = NULL; const char *output_file = NULL;
const char *encoding = NULL;
long ft; long ft;
int opt; int opt;
...@@ -144,6 +152,10 @@ DEFUN (main, (argc, argv), ...@@ -144,6 +152,10 @@ DEFUN (main, (argc, argv),
version (); version ();
break; break;
case OPT_ENCODING:
encoding = optarg;
break;
default: default:
usage (); usage ();
break; break;
...@@ -172,7 +184,20 @@ DEFUN (main, (argc, argv), ...@@ -172,7 +184,20 @@ DEFUN (main, (argc, argv),
input_filename = argv [i]; input_filename = argv [i];
if ( (finput = fopen (argv [i], "r")) ) if ( (finput = fopen (argv [i], "r")) )
{ {
java_init_lex (); /* There's no point in trying to find the current encoding
unless we are going to do something intelligent with it
-- hence the test for iconv. */
#ifdef HAVE_ICONV
#ifdef HAVE_NL_LANGINFO
setlocale (LC_CTYPE, "");
if (encoding == NULL)
encoding = nl_langinfo (CODESET);
#endif /* HAVE_NL_LANGINFO */
#endif /* HAVE_ICONV */
if (encoding == NULL || *encoding == '\0')
encoding = DEFAULT_ENCODING;
java_init_lex (finput, encoding);
yyparse (); yyparse ();
if (ftell (out) != ft) if (ftell (out) != ft)
fputc ('\n', out); fputc ('\n', out);
......
...@@ -42,8 +42,10 @@ DEFINE_LANG_NAME ("Java") ...@@ -42,8 +42,10 @@ DEFINE_LANG_NAME ("Java")
{ "-M", "Print dependencies to stdout" }, { "-M", "Print dependencies to stdout" },
{ "-MM", "Print dependencies to stdout" }, { "-MM", "Print dependencies to stdout" },
#endif /* ! USE_CPPLIB */ #endif /* ! USE_CPPLIB */
{ "-fclasspath", "Set class path and suppress system path" }, { "--classpath", "Set class path and suppress system path" },
{ "-fCLASSPATH", "Set class path" }, { "--CLASSPATH", "Set class path" },
{ "--main", "Choose class whose main method should be used" },
{ "--encoding", "Choose input encoding (default is UTF-8)" },
{ "-I", "Add directory to class path" }, { "-I", "Add directory to class path" },
{ "-foutput-class-dir", "Directory where class files should be written" }, { "-foutput-class-dir", "Directory where class files should be written" },
{ "-fuse-divide-subroutine", "" }, { "-fuse-divide-subroutine", "" },
......
...@@ -121,6 +121,9 @@ int flag_hash_synchronization; ...@@ -121,6 +121,9 @@ int flag_hash_synchronization;
JNI, not CNI. */ JNI, not CNI. */
int flag_jni = 0; int flag_jni = 0;
/* The encoding of the source file. */
char *current_encoding = NULL;
/* When non zero, report the now deprecated empty statements. */ /* When non zero, report the now deprecated empty statements. */
int flag_extraneous_semicolon; int flag_extraneous_semicolon;
...@@ -222,6 +225,13 @@ lang_decode_option (argc, argv) ...@@ -222,6 +225,13 @@ lang_decode_option (argc, argv)
return 1; return 1;
} }
#undef ARG #undef ARG
#define ARG "-fencoding="
if (strncmp (p, ARG, sizeof (ARG) - 1) == 0)
{
current_encoding = p + sizeof (ARG) - 1;
return 1;
}
#undef ARG
if (p[0] == '-' && p[1] == 'f') if (p[0] == '-' && p[1] == 'f')
{ {
...@@ -309,7 +319,9 @@ lang_decode_option (argc, argv) ...@@ -309,7 +319,9 @@ lang_decode_option (argc, argv)
return 0; return 0;
} }
/* Global open file. */
FILE *finput; FILE *finput;
const char * const char *
init_parse (filename) init_parse (filename)
const char *filename; const char *filename;
...@@ -362,6 +374,7 @@ init_parse (filename) ...@@ -362,6 +374,7 @@ init_parse (filename)
} }
} }
} }
init_lex (); init_lex ();
return filename; return filename;
...@@ -370,7 +383,6 @@ init_parse (filename) ...@@ -370,7 +383,6 @@ init_parse (filename)
void void
finish_parse () finish_parse ()
{ {
fclose (finput);
jcf_dependency_write (); jcf_dependency_write ();
} }
......
...@@ -24,15 +24,15 @@ of Sun Microsystems, Inc. in the United States and other countries. ...@@ -24,15 +24,15 @@ of Sun Microsystems, Inc. in the United States and other countries.
The Free Software Foundation is independent of Sun Microsystems, Inc. */ The Free Software Foundation is independent of Sun Microsystems, Inc. */
/* It defines java_lex (yylex) that reads a Java ASCII source file /* It defines java_lex (yylex) that reads a Java ASCII source file
possibly containing Unicode escape sequence or utf8 encoded characters possibly containing Unicode escape sequence or utf8 encoded
and returns a token for everything found but comments, white spaces characters and returns a token for everything found but comments,
and line terminators. When necessary, it also fills the java_lval white spaces and line terminators. When necessary, it also fills
(yylval) union. It's implemented to be called by a re-entrant parser the java_lval (yylval) union. It's implemented to be called by a
generated by Bison. re-entrant parser generated by Bison.
The lexical analysis conforms to the Java grammar described in "The The lexical analysis conforms to the Java grammar described in "The
Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele. Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */ Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
#include "keyword.h" #include "keyword.h"
...@@ -55,15 +55,18 @@ static int java_letter_or_digit_p PARAMS ((unicode_t)); ...@@ -55,15 +55,18 @@ static int java_letter_or_digit_p PARAMS ((unicode_t));
static int java_parse_doc_section PARAMS ((unicode_t)); static int java_parse_doc_section PARAMS ((unicode_t));
static void java_parse_end_comment PARAMS ((unicode_t)); static void java_parse_end_comment PARAMS ((unicode_t));
static unicode_t java_get_unicode PARAMS ((void)); static unicode_t java_get_unicode PARAMS ((void));
static unicode_t java_read_unicode PARAMS ((int, int *)); static unicode_t java_read_unicode PARAMS ((java_lexer *, int, int *));
static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int)); static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
static unicode_t java_read_char PARAMS ((void)); static unicode_t java_read_char PARAMS ((java_lexer *));
static void java_allocate_new_line PARAMS ((void)); static void java_allocate_new_line PARAMS ((void));
static void java_unget_unicode PARAMS ((void)); static void java_unget_unicode PARAMS ((void));
static unicode_t java_sneak_unicode PARAMS ((void)); static unicode_t java_sneak_unicode PARAMS ((void));
java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
void void
java_init_lex () java_init_lex (finput, encoding)
FILE *finput;
const char *encoding;
{ {
#ifndef JC1_LITE #ifndef JC1_LITE
int java_lang_imported = 0; int java_lang_imported = 0;
...@@ -114,9 +117,9 @@ java_init_lex () ...@@ -114,9 +117,9 @@ java_init_lex ()
ctxp->lineno = lineno = 0; ctxp->lineno = lineno = 0;
ctxp->p_line = NULL; ctxp->p_line = NULL;
ctxp->c_line = NULL; ctxp->c_line = NULL;
ctxp->unget_utf8_value = 0;
ctxp->minus_seen = 0; ctxp->minus_seen = 0;
ctxp->java_error_flag = 0; ctxp->java_error_flag = 0;
ctxp->lexer = java_new_lexer (finput, encoding);
} }
static char * static char *
...@@ -194,59 +197,180 @@ java_allocate_new_line () ...@@ -194,59 +197,180 @@ java_allocate_new_line ()
ctxp->c_line->white_space_only = 1; ctxp->c_line->white_space_only = 1;
} }
#define BAD_UTF8_VALUE 0xFFFE /* Create a new lexer object. */
java_lexer *
static unicode_t java_new_lexer (finput, encoding)
java_read_char () FILE *finput;
const char *encoding;
{ {
int c; java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
int c1, c2; int enc_error = 0;
lex->finput = finput;
lex->bs_count = 0;
lex->unget_value = 0;
if (ctxp->unget_utf8_value) #ifdef HAVE_ICONV
lex->handle = iconv_open ("UCS-2", encoding);
if (lex->handle == (iconv_t) -1)
{ {
int to_return = ctxp->unget_utf8_value; /* FIXME: we should give a nice error based on errno here. */
ctxp->unget_utf8_value = 0; enc_error = 1;
return (to_return);
} }
lex->first = -1;
lex->last = -1;
#else /* HAVE_ICONV */
if (strcmp (encoding, DEFAULT_ENCODING))
enc_error = 1;
#endif /* HAVE_ICONV */
c = GETC (); if (enc_error)
fatal ("unknown encoding: `%s'", encoding);
if (c < 128) return lex;
return (unicode_t)c; }
if (c == EOF)
return UEOF; void
else java_destroy_lexer (lex)
java_lexer *lex;
{
#ifdef HAVE_ICONV
iconv_close (lex->handle);
#endif
free (lex);
}
static unicode_t
java_read_char (lex)
java_lexer *lex;
{
if (lex->unget_value)
{ {
if ((c & 0xe0) == 0xc0) unicode_t r = lex->unget_value;
{ lex->unget_value = 0;
c1 = GETC (); return r;
if ((c1 & 0xc0) == 0x80)
return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
c = c1;
}
else if ((c & 0xf0) == 0xe0)
{
c1 = GETC ();
if ((c1 & 0xc0) == 0x80)
{
c2 = GETC ();
if ((c2 & 0xc0) == 0x80)
return (unicode_t)(((c & 0xf) << 12) +
(( c1 & 0x3f) << 6) + (c2 & 0x3f));
else
c = c2;
}
else
c = c1;
}
/* We looked for a UTF8 multi-byte sequence (since we saw an initial
byte with the high bit set), but found invalid bytes instead.
If the most recent byte was Ascii (and not EOF), we should
unget it, in case it was a comment terminator or other delimitor. */
if ((c & 0x80) == 0)
UNGETC (c);
return BAD_UTF8_VALUE;
} }
#ifdef HAVE_ICONV
{
char out[2];
size_t ir, inbytesleft, in_save, out_count;
char *inp, *outp;
while (1)
{
/* See if we need to read more data. If FIRST == 0 then the
previous conversion attempt ended in the middle of a
character at the end of the buffer. Otherwise we only have
to read if the buffer is empty. */
if (lex->first == 0 || lex->first >= lex->last)
{
int r;
if (lex->first >= lex->last)
{
lex->first = 0;
lex->last = 0;
}
if (feof (lex->finput))
return UEOF;
r = fread (&lex->buffer[lex->last], 1,
sizeof (lex->buffer) - lex->last,
lex->finput);
lex->last += r;
}
inbytesleft = lex->last - lex->first;
if (inbytesleft == 0)
{
/* We've tried to read and there is nothing left. */
return UEOF;
}
in_save = inbytesleft;
out_count = 2;
inp = &lex->buffer[lex->first];
outp = out;
ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
&outp, &out_count);
lex->first += in_save - inbytesleft;
if (out_count == 0)
{
/* Success. We assume that UCS-2 is big-endian. This
appears to be an ok assumption. */
unicode_t result;
result = (((unsigned char) out[0]) << 8) | (unsigned char) out[1];
return result;
}
if (ir == (size_t) -1)
{
if (errno == EINVAL)
{
/* This is ok. This means that the end of our buffer
is in the middle of a character sequence. We just
move the valid part of the buffer to the beginning
to force a read. */
/* We use bcopy() because it should work for
overlapping strings. Use memmove() instead... */
bcopy (&lex->buffer[lex->first], &lex->buffer[0],
lex->last - lex->first);
lex->last -= lex->first;
lex->first = 0;
}
else
{
/* A more serious error. */
java_lex_error ("unrecognized character in input stream", 0);
return UEOF;
}
}
}
}
#else /* HAVE_ICONV */
{
int c, c1, c2;
c = getc (lex->finput);
if (c < 128)
return (unicode_t)c;
if (c == EOF)
return UEOF;
else
{
if ((c & 0xe0) == 0xc0)
{
c1 = getc (lex->finput);
if ((c1 & 0xc0) == 0x80)
return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
c = c1;
}
else if ((c & 0xf0) == 0xe0)
{
c1 = getc (lex->finput);
if ((c1 & 0xc0) == 0x80)
{
c2 = getc (lex->finput);
if ((c2 & 0xc0) == 0x80)
return (unicode_t)(((c & 0xf) << 12) +
(( c1 & 0x3f) << 6) + (c2 & 0x3f));
else
c = c2;
}
else
c = c1;
}
/* We simply don't support invalid characters. */
java_lex_error ("malformed UTF-8 character", 0);
}
}
#endif /* HAVE_ICONV */
/* We only get here on error. */
return UEOF;
} }
static void static void
...@@ -267,56 +391,54 @@ java_store_unicode (l, c, unicode_escape_p) ...@@ -267,56 +391,54 @@ java_store_unicode (l, c, unicode_escape_p)
} }
static unicode_t static unicode_t
java_read_unicode (term_context, unicode_escape_p) java_read_unicode (lex, term_context, unicode_escape_p)
int term_context; java_lexer *lex;
int *unicode_escape_p; int term_context;
int *unicode_escape_p;
{ {
unicode_t c; unicode_t c;
long i, base;
c = java_read_char (); c = java_read_char (lex);
*unicode_escape_p = 0; *unicode_escape_p = 0;
if (c != '\\') if (c != '\\')
return ((term_context ? c : {
java_lineterminator (c) ? '\n' : (unicode_t)c)); lex->bs_count = 0;
return (term_context ? c : (java_lineterminator (c)
/* Count the number of preceeding '\' */ ? '\n'
for (base = ftell (finput), i = base-2; c == '\\';) : (unicode_t) c));
{
fseek (finput, i--, SEEK_SET);
c = java_read_char (); /* Will fail if reading utf8 stream. FIXME */
} }
fseek (finput, base, SEEK_SET);
if ((base-i-3)%2 == 0) /* If odd number of \ seen */ ++lex->bs_count;
if ((lex->bs_count) % 2 == 1)
{ {
c = java_read_char (); /* Odd number of \ seen. */
c = java_read_char (lex);
if (c == 'u') if (c == 'u')
{ {
unsigned short unicode = 0; unicode_t unicode = 0;
int shift = 12; int shift = 12;
/* Next should be 4 hex digits, otherwise it's an error. /* Next should be 4 hex digits, otherwise it's an error.
The hex value is converted into the unicode, pushed into The hex value is converted into the unicode, pushed into
the Unicode stream. */ the Unicode stream. */
for (shift = 12; shift >= 0; shift -= 4) for (shift = 12; shift >= 0; shift -= 4)
{ {
if ((c = java_read_char ()) == UEOF) if ((c = java_read_char (lex)) == UEOF)
return UEOF; return UEOF;
if (c >= '0' && c <= '9') if (c >= '0' && c <= '9')
unicode |= (unicode_t)((c-'0') << shift); unicode |= (unicode_t)((c-'0') << shift);
else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift); unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift);
else else
java_lex_error java_lex_error ("Non hex digit in Unicode escape sequence", 0);
("Non hex digit in Unicode escape sequence", 0);
} }
*unicode_escape_p = 1; *unicode_escape_p = 1;
return (term_context ? unicode : return (term_context
(java_lineterminator (c) ? '\n' : unicode)); ? unicode : (java_lineterminator (c) ? '\n' : unicode));
} }
ctxp->unget_utf8_value = c; lex->unget_value = c;
} }
return (unicode_t)'\\'; return (unicode_t) '\\';
} }
static unicode_t static unicode_t
...@@ -331,7 +453,7 @@ java_get_unicode () ...@@ -331,7 +453,7 @@ java_get_unicode ()
for (;;) for (;;)
{ {
int unicode_escape_p; int unicode_escape_p;
c = java_read_unicode (0, &unicode_escape_p); c = java_read_unicode (ctxp->lexer, 0, &unicode_escape_p);
java_store_unicode (ctxp->c_line, c, unicode_escape_p); java_store_unicode (ctxp->c_line, c, unicode_escape_p);
if (ctxp->c_line->white_space_only if (ctxp->c_line->white_space_only
&& !JAVA_WHITE_SPACE_P (c) && c!='\n') && !JAVA_WHITE_SPACE_P (c) && c!='\n')
...@@ -354,7 +476,7 @@ java_lineterminator (c) ...@@ -354,7 +476,7 @@ java_lineterminator (c)
else if (c == '\r') /* CR */ else if (c == '\r') /* CR */
{ {
int unicode_escape_p; int unicode_escape_p;
c = java_read_unicode (1, &unicode_escape_p); c = java_read_unicode (ctxp->lexer, 1, &unicode_escape_p);
if (c == '\r') if (c == '\r')
{ {
/* In this case we will have another terminator. For some /* In this case we will have another terminator. For some
...@@ -363,7 +485,7 @@ java_lineterminator (c) ...@@ -363,7 +485,7 @@ java_lineterminator (c)
up in the actual text of the line, causing an error. So up in the actual text of the line, causing an error. So
instead we choose a very low-level method. FIXME: this instead we choose a very low-level method. FIXME: this
is incredibly ugly. */ is incredibly ugly. */
UNGETC (c); ctxp->lexer->unget_value = c;
} }
else if (c != '\n') else if (c != '\n')
{ {
...@@ -939,7 +1061,7 @@ java_lex (java_lval) ...@@ -939,7 +1061,7 @@ java_lex (java_lval)
char *string; char *string;
for (no_error = 1, c = java_get_unicode (); for (no_error = 1, c = java_get_unicode ();
c != '"' && c != '\n'; c = java_get_unicode ()) c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
{ {
if (c == '\\') if (c == '\\')
c = java_parse_escape_sequence (); c = java_parse_escape_sequence ();
......
...@@ -35,6 +35,13 @@ extern int lineno; ...@@ -35,6 +35,13 @@ extern int lineno;
/* A Unicode character, as read from the input file */ /* A Unicode character, as read from the input file */
typedef unsigned short unicode_t; typedef unsigned short unicode_t;
#ifdef HAVE_ICONV
#include <iconv.h>
#endif /* HAVE_ICONV */
/* Default encoding to use if no encoding is specified. */
#define DEFAULT_ENCODING "UTF-8"
/* Debug macro to print-out what we match */ /* Debug macro to print-out what we match */
#ifdef JAVA_LEX_DEBUG #ifdef JAVA_LEX_DEBUG
#ifdef JAVA_LEX_DEBUG_CHAR #ifdef JAVA_LEX_DEBUG_CHAR
...@@ -96,12 +103,38 @@ typedef struct _java_lc { ...@@ -96,12 +103,38 @@ typedef struct _java_lc {
int col; int col;
} java_lc; } java_lc;
typedef struct java_lexer
{
/* The file from which we're reading. */
FILE *finput;
#define JAVA_LINE_MAX 80 /* Number of consecutive backslashes we've read. */
int bs_count;
/* If nonzero, a value that was pushed back. */
unicode_t unget_value;
#ifdef HAVE_ICONV
/* The handle for the iconv converter we're using. */
iconv_t handle;
/* Macro to read and unread bytes */ /* Bytes we've read from the file but have not sent to iconv. */
#define UNGETC(c) ungetc(c, finput) char buffer[1024];
#define GETC() getc(finput)
/* Index of first valid character in buffer, -1 if no valid
characters. */
int first;
/* Index of last valid character in buffer, plus one. -1 if no
valid characters in buffer. */
int last;
#endif /* HAVE_ICONV */
} java_lexer;
/* Destroy a lexer object. */
extern void java_destroy_lexer PARAMS ((java_lexer *));
#define JAVA_LINE_MAX 80
/* Build a location compound integer */ /* Build a location compound integer */
#define BUILD_LOCATION() ((ctxp->elc.line << 12) | (ctxp->elc.col & 0xfff)) #define BUILD_LOCATION() ((ctxp->elc.line << 12) | (ctxp->elc.col & 0xfff))
......
...@@ -728,13 +728,12 @@ typedef struct _jdeplist { ...@@ -728,13 +728,12 @@ typedef struct _jdeplist {
struct parser_ctxt { struct parser_ctxt {
const char *filename; /* Current filename */ const char *filename; /* Current filename */
FILE *finput; /* Current file input stream */
struct parser_ctxt *next; struct parser_ctxt *next;
java_lexer *lexer; /* Current lexer state */
char marker_begining; /* Marker. Should be a sub-struct */ char marker_begining; /* Marker. Should be a sub-struct */
struct java_line *p_line, *c_line; /* Previous and current line */ struct java_line *p_line, *c_line; /* Previous and current line */
java_lc elc; /* Error's line column info */ java_lc elc; /* Error's line column info */
unicode_t unget_utf8_value; /* An unget utf8 value */
int ccb_indent; /* Keep track of {} indent, lexer */ int ccb_indent; /* Keep track of {} indent, lexer */
int first_ccb_indent1; /* First { at ident level 1 */ int first_ccb_indent1; /* First { at ident level 1 */
int last_ccb_indent1; /* Last } at ident level 1 */ int last_ccb_indent1; /* Last } at ident level 1 */
...@@ -928,7 +927,7 @@ extern void reset_report PARAMS ((void)); ...@@ -928,7 +927,7 @@ extern void reset_report PARAMS ((void));
/* Always in use, no matter what you compile */ /* Always in use, no matter what you compile */
void java_push_parser_context PARAMS ((void)); void java_push_parser_context PARAMS ((void));
void java_pop_parser_context PARAMS ((int)); void java_pop_parser_context PARAMS ((int));
void java_init_lex PARAMS ((void)); void java_init_lex PARAMS ((FILE *, const char *));
extern void java_parser_context_save_global PARAMS ((void)); extern void java_parser_context_save_global PARAMS ((void));
extern void java_parser_context_restore_global PARAMS ((void)); extern void java_parser_context_restore_global PARAMS ((void));
int yyparse PARAMS ((void)); int yyparse PARAMS ((void));
......
...@@ -2618,10 +2618,13 @@ java_pop_parser_context (generate) ...@@ -2618,10 +2618,13 @@ java_pop_parser_context (generate)
next->incomplete_class = ctxp->incomplete_class; next->incomplete_class = ctxp->incomplete_class;
next->gclass_list = ctxp->gclass_list; next->gclass_list = ctxp->gclass_list;
lineno = ctxp->lineno; lineno = ctxp->lineno;
finput = ctxp->finput;
current_class = ctxp->current_class; current_class = ctxp->current_class;
} }
/* If the old and new lexers differ, then free the old one. */
if (ctxp->lexer && next && ctxp->lexer != next->lexer)
java_destroy_lexer (ctxp->lexer);
/* Set the single import class file flag to 0 for the current list /* Set the single import class file flag to 0 for the current list
of imported things */ of imported things */
for (current = ctxp->import_list; current; current = TREE_CHAIN (current)) for (current = ctxp->import_list; current; current = TREE_CHAIN (current))
...@@ -2661,7 +2664,6 @@ java_parser_context_save_global () ...@@ -2661,7 +2664,6 @@ java_parser_context_save_global ()
else if (ctxp->saved_data) else if (ctxp->saved_data)
create_new_parser_context (1); create_new_parser_context (1);
ctxp->finput = finput;
ctxp->lineno = lineno; ctxp->lineno = lineno;
ctxp->current_class = current_class; ctxp->current_class = current_class;
ctxp->filename = input_filename; ctxp->filename = input_filename;
...@@ -2675,7 +2677,6 @@ java_parser_context_save_global () ...@@ -2675,7 +2677,6 @@ java_parser_context_save_global ()
void void
java_parser_context_restore_global () java_parser_context_restore_global ()
{ {
finput = ctxp->finput;
lineno = ctxp->lineno; lineno = ctxp->lineno;
current_class = ctxp->current_class; current_class = ctxp->current_class;
input_filename = ctxp->filename; input_filename = ctxp->filename;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment