Fix use-after-free lexing unterminated raw strings (PR preprocessor/78811)

gcc/ChangeLog: PR preprocessor/78680 PR preprocessor/78811 * input.c (struct selftest::lexer_test): Add field m_implicitly_expect_EOF. (selftest::lexer_error_sink): New class. (selftest::lexer_error_sink::s_singleton): New global. (selftest::lexer_test::lexer_test): Initialize new field "m_implicitly_expect_EOF". (selftest::lexer_test::~lexer_test): Conditionalize the check for the EOF token on the new field. (selftest::test_lexer_string_locations_raw_string_unterminated): New function. (selftest::input_c_tests): Call the new test. libcpp/ChangeLog: PR preprocessor/78680 PR preprocessor/78811 * lex.c (_cpp_lex_direct): Only determine the end-location of the token and build a range for non-reserved start locations. Do not do it for EOF tokens. From-SVN: r243721

Fix use-after-free lexing unterminated raw strings (PR preprocessor/78811)
gcc/ChangeLog:
	PR preprocessor/78680
	PR preprocessor/78811
	* input.c (struct selftest::lexer_test): Add field
	m_implicitly_expect_EOF.
	(selftest::lexer_error_sink): New class.
	(selftest::lexer_error_sink::s_singleton): New global.
	(selftest::lexer_test::lexer_test): Initialize new field
	"m_implicitly_expect_EOF".
	(selftest::lexer_test::~lexer_test): Conditionalize the
	check for the EOF token on the new field.
	(selftest::test_lexer_string_locations_raw_string_unterminated):
	New function.
	(selftest::input_c_tests): Call the new test.

libcpp/ChangeLog:
	PR preprocessor/78680
	PR preprocessor/78811
	* lex.c (_cpp_lex_direct): Only determine the end-location of
	the token and build a range for non-reserved start locations.
	Do not do it for EOF tokens.

From-SVN: r243721
a3998c2f · David Malcolm · David Malcolm · a3038e19 · a3998c2f · a3998c2f
Commit a3998c2f authored Dec 15, 2016 by David Malcolm Committed by David Malcolm Dec 15, 2016
Hide whitespace changes
Inline Side-by-side

Showing with 129 additions and 24 deletions

gcc/ChangeLog
+16 -0

gcc/input.c
+84 -5

libcpp/ChangeLog
+8 -0

libcpp/lex.c
+21 -19

No files found.
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2016-12-15  David Malcolm  <dmalcolm@redhat.com>
+
+	PR preprocessor/78680
+	PR preprocessor/78811
+	* input.c (struct selftest::lexer_test): Add field
+	m_implicitly_expect_EOF.
+	(selftest::lexer_error_sink): New class.
+	(selftest::lexer_error_sink::s_singleton): New global.
+	(selftest::lexer_test::lexer_test): Initialize new field
+	"m_implicitly_expect_EOF".
+	(selftest::lexer_test::~lexer_test): Conditionalize the
+	check for the EOF token on the new field.
+	(selftest::test_lexer_string_locations_raw_string_unterminated):
+	New function.
+	(selftest::input_c_tests): Call the new test.
+
 2016-12-15  Wilco Dijkstra  <wdijkstr@arm.com>

 	* config/arm/arm.h (TARGET_BACKTRACE): Use crtl->is_leaf.
--- a/gcc/input.c
+++ b/gcc/input.c
@@ -1985,6 +1985,7 @@ struct lexer_test
  cpp_reader_ptr m_parser;
  temp_source_file m_tempfile;
  string_concat_db m_concats;
+  bool m_implicitly_expect_EOF;
 };

 /* Use an EBCDIC encoding for the execution charset, specifically
@@ -2046,6 +2047,54 @@ class ebcdic_execution_charset : public lexer_test_options

 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;

+/* A lexer_test_options subclass that installs an error callback on the
+   parser under test and records each formatted message in m_errors.  */
+
+class lexer_error_sink : public lexer_test_options
+{
+ public:
+  lexer_error_sink ()
+  {
+    gcc_assert (s_singleton == NULL);  /* Only one live sink at a time.  */
+    s_singleton = this;
+  }
+  ~lexer_error_sink ()
+  {
+    gcc_assert (s_singleton == this);
+    s_singleton = NULL;
+
+    int i;
+    char *str;
+    FOR_EACH_VEC_ELT (m_errors, i, str)
+      free (str);  /* Each string was allocated by xvasprintf in on_error.  */
+  }
+
+  void apply (lexer_test &test) FINAL OVERRIDE
+  {
+    cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
+    callbacks->error = on_error;  /* Route the parser's error callback here.  */
+  }
+
+  static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
+			int level ATTRIBUTE_UNUSED,
+			int reason ATTRIBUTE_UNUSED,
+			rich_location *richloc ATTRIBUTE_UNUSED,
+			const char *msgid, va_list *ap)
+    ATTRIBUTE_FPTR_PRINTF(5,0)
+  {
+    char *msg = xvasprintf (msgid, *ap);  /* Format MSGID with the args in AP.  */
+    s_singleton->m_errors.safe_push (msg);  /* The sink now owns MSG.  */
+    return true;
+  }
+
+  auto_vec<char *> m_errors;  /* Recorded messages; freed by the destructor.  */
+
+ private:
+  static lexer_error_sink *s_singleton;  /* Lets the static on_error find us.  */
+};
+
+lexer_error_sink *lexer_error_sink::s_singleton;
+
 /* Constructor.  Override line_table with a new instance based on CASE_,
   and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
   start parsing the tempfile.  */
@@ -2056,7 +2105,8 @@ lexer_test::lexer_test (const line_table_case &case_, const char *content,
  m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
  /* Create a tempfile and write the text to it.  */
  m_tempfile (SELFTEST_LOCATION, ".c", content),
-  m_concats ()
+  m_concats (),
+  m_implicitly_expect_EOF (true)
 {
  if (options)
    options->apply (*this);
@@ -2069,16 +2119,19 @@ lexer_test::lexer_test (const line_table_case &case_, const char *content,
  ASSERT_NE (fname, NULL);
 }

-/* Destructor.  Verify that the next token in m_parser is EOF.  */
+/* Destructor.  Verify that m_parser is at EOF, unless the test opted out.  */

 lexer_test::~lexer_test ()
 {
  location_t loc;
  const cpp_token *tok;

-  tok = cpp_get_token_with_location (m_parser, &loc);
-  ASSERT_NE (tok, NULL);
-  ASSERT_EQ (tok->type, CPP_EOF);
+  if (m_implicitly_expect_EOF)  /* Initialized to true by the constructor.  */
+    {
+      tok = cpp_get_token_with_location (m_parser, &loc);  /* loc is not examined.  */
+      ASSERT_NE (tok, NULL);
+      ASSERT_EQ (tok->type, CPP_EOF);
+    }
 }

 /* Get the next token from m_parser.  */
@@ -3247,6 +3300,31 @@ test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
 				  "range endpoints are on different lines");
 }

+/* Test that lexing an unterminated raw string yields EOF and one error.  */
+
+static void
+test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
+{
+  const char *content = "R\"ouch()ouCh\" /* etc */";  /* "ouch" vs "ouCh".  */
+
+  lexer_error_sink errors;
+  lexer_test test (case_, content, &errors);
+  test.m_implicitly_expect_EOF = false;  /* EOF is checked explicitly below.  */
+
+  /* Attempt to parse the raw string; the first token should be EOF.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_EOF);
+
+  ASSERT_EQ (1, errors.m_errors.length ());
+  /* We expect the message "unterminated raw string"
+     in the "cpplib" translation domain.
+     It's not clear that dgettext is available on all supported hosts,
+     so this assertion is commented-out for now.
+       ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
+                     errors.m_errors[0]);
+  */
+}
+
 /* Test of lexing char constants.  */

 static void
@@ -3390,6 +3468,7 @@ input_c_tests ()
  for_each_line_table_case (test_lexer_string_locations_long_line);
  for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
  for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
+  for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
  for_each_line_table_case (test_lexer_char_constants);

  test_reading_source_line ();

--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
+2016-12-15  David Malcolm  <dmalcolm@redhat.com>
+
+	PR preprocessor/78680
+	PR preprocessor/78811
+	* lex.c (_cpp_lex_direct): Only determine the end-location of
+	the token and build a range for non-reserved start locations.
+	Do not do it for EOF tokens.
+
 2016-12-12  David Malcolm  <dmalcolm@redhat.com>

 	PR preprocessor/78680

--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -3089,25 +3089,27 @@ _cpp_lex_direct (cpp_reader *pfile)
      break;
    }

-  /* Ensure that any line notes are processed, so that we have the
-     correct physical line/column for the end-point of the token even
-     when a logical line is split via one or more backslashes.  */
-  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
-      && !pfile->overlaid_buffer)
-    _cpp_process_line_notes (pfile, false);
-
-  source_range tok_range;
-  tok_range.m_start = result->src_loc;
-  if (result->src_loc >= RESERVED_LOCATION_COUNT)
-    tok_range.m_finish
-      = linemap_position_for_column (pfile->line_table,
-				     CPP_BUF_COLUMN (buffer, buffer->cur));
-  else
-    tok_range.m_finish = tok_range.m_start;
-
-  result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
-					   result->src_loc,
-					   tok_range, NULL);
+  /* Potentially convert the location of the token to a range.  */
+  if (result->src_loc >= RESERVED_LOCATION_COUNT
+      && result->type != CPP_EOF)
+    {
+      /* Ensure that any line notes are processed, so that we have the
+	 correct physical line/column for the end-point of the token even
+	 when a logical line is split via one or more backslashes.  */
+      if (buffer->cur >= buffer->notes[buffer->cur_note].pos
+	  && !pfile->overlaid_buffer)
+	_cpp_process_line_notes (pfile, false);
+
+      source_range tok_range;
+      tok_range.m_start = result->src_loc;
+      tok_range.m_finish
+	= linemap_position_for_column (pfile->line_table,
+				       CPP_BUF_COLUMN (buffer, buffer->cur));
+
+      result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
+					       result->src_loc,
+					       tok_range, NULL);
+    }

  return result;
 }