lex.c 92.6 KB
Newer Older
1
/* CPP Library - lexical analysis.
   Copyright (C) 2000-2015 Free Software Foundation, Inc.
   Contributed by Per Bothner, 1994-95.
   Based on CCCP program by Paul Rubin, June 1986
   Adapted to ANSI C, Richard Stallman, Jan 1987
   Broken out to separate file, Zack Weinberg, Mar 2000

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
21 22 23 24

#include "config.h"
#include "system.h"
#include "cpplib.h"
25
#include "internal.h"
26

Neil Booth committed
27
/* Spelling category recorded for each token type in the token_spellings
   table.  OP entries of TTYPE_TABLE are always tagged SPELL_OPERATOR;
   TK entries name their own category.  */
enum spell_type
{
  SPELL_OPERATOR = 0,	/* First enumerator, pinned to zero.  */
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};

Neil Booth committed
35
struct token_spelling
36
{
Neil Booth committed
37 38
  enum spell_type category;
  const unsigned char *name;
39 40
};

41
static const unsigned char *const digraph_spellings[] =
42
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
Neil Booth committed
43

44 45
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
46
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
Neil Booth committed
47 48 49 50 51
#undef OP
#undef TK

#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52

53 54 55 56 57
/* Forward declarations of the file-local helpers.  */

/* Line cleaning and line notes.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);

/* Comments and whitespace.  */
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);

/* Strings, literals and names.  */
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
			    unsigned int, enum cpp_ttype);
static int name_p (cpp_reader *, const cpp_string *);

/* Token runs and buffers.  */
static tokenrun *next_tokenrun (tokenrun *);
static _cpp_buff *new_buff (size_t);
66

67

Zack Weinberg committed
68
/* Utility routine:
69

70 71
   Compares, the token TOKEN to the NUL-terminated string STRING.
   TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
Zack Weinberg committed
72
int
73
cpp_ideq (const cpp_token *token, const char *string)
Zack Weinberg committed
74
{
75
  if (token->type != CPP_NAME)
Zack Weinberg committed
76
    return 0;
77

78
  return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79
}
80

81 82
/* Record a note TYPE at byte POS into the current cleaned logical
   line.  */
83
static void
84
add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85
{
86 87 88
  if (buffer->notes_used == buffer->notes_cap)
    {
      buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 90
      buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
                                  buffer->notes_cap);
91
    }
92

93 94 95
  buffer->notes[buffer->notes_used].pos = pos;
  buffer->notes[buffer->notes_used].type = type;
  buffer->notes_used++;
96 97
}

98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265

/* Fast path to find line special characters using optimized character
   scanning algorithms.  Anything complicated falls back to the slow
   path below.  Since this loop is very hot it's worth doing these kinds
   of optimizations.

   One of the paths through the ifdefs should provide 

     const uchar *search_line_fast (const uchar *s, const uchar *end);

   Between S and END, search for \n, \r, \\, ?.  Return a pointer to
   the found character.

   Note that the last character of the buffer is *always* a newline,
   as forced by _cpp_convert_input.  This fact can be used to avoid
   explicitly looking for the end of the buffer.  */

/* Configure gives us an ifdef test.  Make sure the macro always has a
   value so that it can be used in ordinary C conditions below.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: for any other
   size the array bound below evaluates to -1, which is ill-formed.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];

/* Return VAL with its first N bytes (in memory order) cleared to NUL,
   a value that won't match one of the interesting characters.  */

static inline word_type
acc_char_mask_misalign (word_type val, unsigned int n)
{
  const unsigned int shift = n * 8;
  word_type keep = -1;

  /* The first bytes in memory are the most significant ones on a
     big-endian host and the least significant ones otherwise.  */
  keep = WORDS_BIGENDIAN ? keep >> shift : keep << shift;
  return val & keep;
}

/* Return X replicated to all byte positions within WORD_TYPE.  */

static inline word_type
acc_char_replicate (uchar x)
{
  /* Widen before shifting.  X promotes to (signed) int, so shifting it
     left by 24 directly would overflow for X >= 0x80, and the negative
     result would then be sign-extended into a 64-bit WORD_TYPE.  */
  word_type ret = x;

  ret |= ret << 8;
  ret |= ret << 16;
  /* The shift is split in two so that it stays valid C when WORD_TYPE
     is only 32 bits wide and this branch is dead code.  */
  if (sizeof (word_type) == 8)
    ret |= ret << 16 << 16;
  return ret;
}

/* Return non-zero if some byte of VAL is (probably) C.  C is a word
   with the character of interest in every byte position.  On the
   generic path the answer may be a false positive; callers confirm it
   with acc_char_index.  */

static inline word_type
acc_char_cmp (word_type val, word_type c)
{
#if defined(__GNUC__) && defined(__alpha__)
  /* We can get exact results using a compare-bytes instruction.  
     Get (val == c) via (0 >= (val ^ c)).  */
  return __builtin_alpha_cmpbge (0, val ^ c);
#else
  /* Build 0x7efefeff, or 0x7efefefefefefeff for 64-bit words.  The
     shift is split in two so it stays valid when WORD_TYPE is 32 bits
     wide and the branch is dead.  */
  word_type magic = 0x7efefefeU;
  if (sizeof(word_type) == 8)
    magic = (magic << 16 << 16) | 0xfefefefeU;
  magic |= 1;

  /* After the XOR, every byte of VAL that equalled the corresponding
     byte of C is zero, so what remains is a zero-byte test.  */
  val ^= c;
  /* Only the bit positions that MAGIC leaves clear survive the final
     mask; a non-zero result flags a candidate match.  */
  return ((val + magic) ^ ~val) & ~magic;
#endif
}

/* Given that the result CMP of acc_char_cmp is non-zero, return the
   index (in memory order) of the first interesting character within
   VAL.  If this was a false positive, return -1.  */

static inline int
acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
		word_type val ATTRIBUTE_UNUSED)
{
#if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
  /* The cmpbge instruction sets *bits* of the result corresponding to
     matches in the bytes with no false positives.  */
  return __builtin_ctzl (cmp);
#else
  unsigned int i;

  /* ??? It would be nice to force unrolling here,
     and have all of these constants folded.  */
  for (i = 0; i < sizeof (word_type); i++)
    {
      /* Byte I in memory order sits at the opposite end of the word
	 on a big-endian host.  */
      const unsigned int lane
	= WORDS_BIGENDIAN ? sizeof (word_type) - i - 1 : i;

      switch ((val >> lane * 8) & 0xff)
	{
	case '\n':
	case '\r':
	case '\\':
	case '?':
	  return i;

	default:
	  break;
	}
    }

  return -1;
#endif
}

/* A version of the fast scanner using bit fiddling techniques.

   For 32-bit words, one would normally perform 16 comparisons and
   16 branches.  With this algorithm one performs 24 arithmetic
   operations and one branch.  Whether this is faster with a 32-bit
   word size is going to be somewhat system dependent.

   For 64-bit words, we eliminate twice the number of comparisons
   and branches without increasing the number of arithmetic operations.
   It's almost certainly going to be a win with 64-bit word size.

   The scan starts at S and never consults END: it simply runs until
   one of \n, \r, \\ or ? is found, and returns a pointer to it.  */

static const uchar * search_line_acc_char (const uchar *, const uchar *)
  ATTRIBUTE_UNUSED;

static const uchar *
search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const word_type nl = acc_char_replicate ('\n');
  const word_type cr = acc_char_replicate ('\r');
  const word_type bs = acc_char_replicate ('\\');
  const word_type qm = acc_char_replicate ('?');

  const uintptr_t addr = (uintptr_t) s;
  const unsigned int skew = addr & (sizeof (word_type) - 1);
  const word_type *wp = (const word_type *) (addr & -sizeof (word_type));
  word_type w, hit;

  /* Load the aligned word containing S, and blank out any bytes that
     lie before the beginning of the text.  */
  w = *wp;
  if (skew)
    w = acc_char_mask_misalign (w, skew);

  /* Main loop, one word per iteration.  */
  for (;; w = *++wp)
    {
      hit = acc_char_cmp (w, nl);
      hit |= acc_char_cmp (w, cr);
      hit |= acc_char_cmp (w, bs);
      hit |= acc_char_cmp (w, qm);

      if (__builtin_expect (hit != 0, 0))
	{
	  /* A negative index means the quick test was a false
	     positive, so keep scanning.  */
	  int i = acc_char_index (hit, w);
	  if (i >= 0)
	    return (const uchar *) wp + i;
	}
    }
}

266
/* Disable on Solaris 2/x86 until the following problem can be properly
   autoconfed:

   The Solaris 10+ assembler tags objects with the instruction set
   extensions used, so SSE4.2 executables cannot run on machines that
   don't support that extension.  */

273
#if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292

/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?'.  Each
   row is 16 bytes long and the table is 16-byte aligned; the scanners
   below read a row through a vector-typed pointer.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};

/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.  */

/* Scan forward from S, 8 bytes at a time, and return a pointer to the
   first \n, \r, \\ or ? found.  END is not consulted.  */
static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block has
     already been loaded, so enter the loop past the load.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of the later blocks is valid.  */
      mask = -1;

    start:
      /* OR together the per-byte equality results for the four
	 characters, then collect one bit per byte into FOUND.  */
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}

/* A version of the fast scanner using SSE2 vectorized byte compare insns.
   Scans forward from S, 16 bytes at a time, and returns a pointer to
   the first \n, \r, \\ or ? found.  END is not consulted.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded, so enter the loop past the load.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of the later blocks is valid.  */
      mask = -1;

    start:
      /* OR together the per-byte equality results for the four
	 characters, then collect one bit per byte into FOUND.  */
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}

409
#ifdef HAVE_SSE4
410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Scans forward from S and returns a pointer to the first \n, \r, \\
   or ? found.  END is only used to decide whether an unaligned 16-byte
   read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t addr = (uintptr_t) s;
  uintptr_t index;

  /* Unaligned input gets one explicit probe before the aligned loop.  */
  if ((addr & 15) != 0)
    {
      v16qi head;

      /* There are less than 16 bytes left in the buffer, and less
	 than 16 bytes left on the page.  Reading 16 bytes at this
	 point might generate a spurious page fault.  Defer to the
	 SSE2 implementation, which already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
	  && __builtin_expect ((addr & 0xfff) > 0xff0, 0))
	return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
	 memory need not be aligned.  */
      head = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, head, 16, 0);

      /* An index below 16 is a match within the probed bytes.  */
      if (__builtin_expect (index < 16, 0))
	return s + index;

      /* Advance the pointer to an aligned address.  We will re-scan a
	 few bytes, but we no longer need care for reading past the
	 end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((addr + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.  */
  __asm (      "sub $16, %1\n"
	"	.balign 16\n"
	"0:	add $16, %1\n"
	"	%vpcmpestri $0, (%1), %2\n"
	"	jnc 0b"
	: "=&c"(index), "+r"(s)
	: "x"(search), "a"(4), "d"(16));

  return s + index;
}

467 468 469 470 471
#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif

472 473 474 475 476 477 478
/* Check the CPU capabilities.  */

#include "../gcc/config/i386/cpuid.h"

/* Signature shared by all of the search_line_* scanners.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
/* The scanner in use; assigned by init_vectorized_lexer.  */
static search_line_fast_type search_line_fast;

479 480
#define HAVE_init_vectorized_lexer 1
static inline void
481 482 483 484 485 486 487 488 489 490
init_vectorized_lexer (void)
{
  unsigned dummy, ecx = 0, edx = 0;
  search_line_fast_type impl = search_line_acc_char;
  int minimum = 0;

#if defined(__SSE4_2__)
  minimum = 3;
#elif defined(__SSE2__)
  minimum = 2;
491
#elif defined(__SSE__)
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507
  minimum = 1;
#endif

  if (minimum == 3)
    impl = search_line_sse42;
  else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
    {
      if (minimum == 3 || (ecx & bit_SSE4_2))
        impl = search_line_sse42;
      else if (minimum == 2 || (edx & bit_SSE2))
	impl = search_line_sse2;
      else if (minimum == 1 || (edx & bit_SSE))
	impl = search_line_mmx;
    }
  else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
    {
508 509
      if (minimum == 1
	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
510 511 512 513 514 515
	impl = search_line_mmx;
    }

  search_line_fast = impl;
}

516
#elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
517

518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the pre-GCC 5 version.

   Scans forward from S, 16 bytes at a time, and returns a pointer to
   the first \n, \r, \\ or ? found.  END is not consulted.  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', 
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', 
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?', 
    '?', '?', '?', '?', '?', '?', '?', '?', 
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      /* Load directly from S; no alignment is performed first.  */
      data = *((const vc *)s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
	 characters.  We want to exit the loop if any byte in T is non-zero.
	 Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.  */
  s -= 16;

  {
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past
       each all-zero word.  */
    switch (N)
      {
      case 4:
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	/* Fall through to examine the remaining two words.  */
      case 2:
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	/* The loop above only exits on a match, so the last word must
	   hold it.  */
	l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}

#elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)

/* A vection of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.  */
621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', 
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', 
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?', 
    '?', '?', '?', '?', '?', '?', '?', '?', 
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
	 characters.  We want to exit the loop if any byte in T is non-zero.
	 Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
697 698
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  */
    switch (N)
      {
      case 4:
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
      case 2:
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}

734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796
#elif defined (__ARM_NEON__)
#include "arm_neon.h"

/* A version of the fast scanner using NEON vectorized byte compares.
   Scans forward from S, 16 bytes at a time, and returns a pointer to
   the first \n, \r, \\ or ? found.  END is not consulted.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte within an 8-byte half its own distinct bit.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block has
     already been loaded, so enter the loop past the load.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      /* Every byte of the later blocks is valid.  */
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      /* Keep one distinct bit per matching byte, then sum pairwise.
	 The bits never collide, so the sums behave like ORs and leave
	 an 8-bit match mask for each half in the two lanes of N.  */
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);
      
      /* Shift the high-half mask down next to the low-half mask so
	 that lane 0 holds all 16 bits.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n, 
	      vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}

797 798 799 800 801 802 803 804 805
#else

/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */

#define search_line_fast  search_line_acc_char

#endif

806 807 808 809 810 811 812 813 814 815
/* Initialize the lexer if needed.  When a run-time selected vectorized
   scanner is available (HAVE_init_vectorized_lexer is defined), choose
   it now; otherwise there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}

816 817 818
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.  */
void
819
_cpp_clean_line (cpp_reader *pfile)
820
{
821 822 823
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;
824

825 826 827 828
  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
829
  s = buffer->next_line;
830

831
  if (!buffer->from_stage3)
832
    {
833 834
      const uchar *pbackslash = NULL;

835
      /* Fast path.  This is the common case of an un-escaped line with
836 837
	 no trigraphs.  The primary win here is by not writing any
	 data back to memory until we have to.  */
838
      while (1)
839
	{
840 841
	  /* Perform an optimized search for \n, \r, \\, ?.  */
	  s = search_line_fast (s, buffer->rlimit);
842

843 844 845 846 847
	  c = *s;
	  if (c == '\\')
	    {
	      /* Record the location of the backslash and continue.  */
	      pbackslash = s++;
848
	    }
849
	  else if (__builtin_expect (c == '?', 0))
850
	    {
851 852
	      if (__builtin_expect (s[1] == '?', false)
		   && _cpp_trigraph_map[s[2]])
853
		{
854 855 856 857 858 859 860 861 862 863 864 865
		  /* Have a trigraph.  We may or may not have to convert
		     it.  Add a line note regardless, for -Wtrigraphs.  */
		  add_line_note (buffer, s, s[2]);
		  if (CPP_OPTION (pfile, trigraphs))
		    {
		      /* We do, and that means we have to switch to the
		         slow path.  */
		      d = (uchar *) s;
		      *d = _cpp_trigraph_map[s[2]];
		      s += 2;
		      goto slow_path;
		    }
866
		}
867 868
	      /* Not a trigraph.  Continue on fast-path.  */
	      s++;
869
	    }
870 871
	  else
	    break;
872 873
	}

874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903
      /* This must be \r or \n.  We're either done, or we'll be forced
	 to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
	goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
	{
	  s++;
	  if (s == buffer->rlimit)
	    goto done;
	}

      if (__builtin_expect (pbackslash == NULL, true))
	goto done;

      /* Check for escaped newline.  */
      p = d;
      while (is_nvspace (p[-1]))
	p--;
      if (p - 1 != pbackslash)
	goto done;

      /* Have an escaped newline; process it and proceed to
	 the slow path.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;
904

905 906
    slow_path:
      while (1)
907
	{
908 909 910 911 912
	  c = *++s;
	  *++d = c;

	  if (c == '\n' || c == '\r')
	    {
913
	      /* Handle DOS line endings.  */
914 915 916 917 918 919 920 921 922 923 924 925
	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
		s++;
	      if (s == buffer->rlimit)
		break;

	      /* Escaped?  */
	      p = d;
	      while (p != buffer->next_line && is_nvspace (p[-1]))
		p--;
	      if (p == buffer->next_line || p[-1] != '\\')
		break;

926
	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
927 928 929 930 931 932
	      d = p - 2;
	      buffer->next_line = p - 1;
	    }
	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
	    {
	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
933
	      add_line_note (buffer, d, s[2]);
934 935 936 937 938 939
	      if (CPP_OPTION (pfile, trigraphs))
		{
		  *d = _cpp_trigraph_map[s[2]];
		  s += 2;
		}
	    }
940
	}
941
    }
942 943
  else
    {
944
      while (*s != '\n' && *s != '\r')
945 946 947 948 949 950 951
	s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
	s++;
    }
952

953
 done:
954
  *d = '\n';
955 956
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
957
  buffer->next_line = s + 1;
958 959
}

960 961 962
/* Return true if the trigraph indicated by NOTE should be warned
   about in a comment.  */
static bool
963
warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
964 965 966 967 968
{
  const uchar *p;

  /* Within comments we don't warn about trigraphs, unless the
     trigraph forms an escaped newline, as that may change
969
     behavior.  */
970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987
  if (note->type != '/')
    return false;

  /* If -trigraphs, then this was an escaped newline iff the next note
     is coincident.  */
  if (CPP_OPTION (pfile, trigraphs))
    return note[1].pos == note->pos;

  /* Otherwise, see if this forms an escaped newline.  */
  p = note->pos + 3;
  while (is_nvspace (*p))
    p++;

  /* There might have been escaped newlines between the trigraph and the
     newline we found.  Hence the position test.  */
  return (*p == '\n' && p < note[1].pos);
}

988 989 990
/* Process the notes created by add_line_note as far as the current
   location.  */
void
991
_cpp_process_line_notes (cpp_reader *pfile, int in_comment)
992
{
993 994
  cpp_buffer *buffer = pfile->buffer;

995
  for (;;)
Zack Weinberg committed
996
    {
997 998
      _cpp_line_note *note = &buffer->notes[buffer->cur_note];
      unsigned int col;
999

1000 1001
      if (note->pos > buffer->cur)
	break;
1002

1003 1004
      buffer->cur_note++;
      col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1005

1006
      if (note->type == '\\' || note->type == ' ')
1007
	{
1008
	  if (note->type == ' ' && !in_comment)
1009
	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1010
				 "backslash and newline separated by space");
1011

1012
	  if (buffer->next_line > buffer->rlimit)
1013
	    {
1014
	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1015 1016 1017
				   "backslash-newline at end of file");
	      /* Prevent "no newline at end of file" warning.  */
	      buffer->next_line = buffer->rlimit;
1018
	    }
1019 1020

	  buffer->line_base = note->pos;
1021
	  CPP_INCREMENT_LINE (pfile, 0);
1022
	}
1023 1024
      else if (_cpp_trigraph_map[note->type])
	{
1025 1026
	  if (CPP_OPTION (pfile, warn_trigraphs)
	      && (!in_comment || warn_in_comment (pfile, note)))
1027 1028
	    {
	      if (CPP_OPTION (pfile, trigraphs))
1029 1030 1031 1032 1033
		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
                                       pfile->line_table->highest_line, col,
				       "trigraph ??%c converted to %c",
				       note->type,
				       (int) _cpp_trigraph_map[note->type]);
1034
	      else
1035
		{
1036 1037 1038
		  cpp_warning_with_line 
		    (pfile, CPP_W_TRIGRAPHS,
                     pfile->line_table->highest_line, col,
1039 1040 1041
		     "trigraph ??%c ignored, use -trigraphs to enable",
		     note->type);
		}
1042 1043
	    }
	}
1044 1045
      else if (note->type == 0)
	/* Already processed in lex_raw_string.  */;
1046 1047
      else
	abort ();
Zack Weinberg committed
1048
    }
1049 1050
}

1051 1052
/* Skip a C-style block comment.  We find the end of the comment by
   seeing if an asterisk is before every '/' we encounter.  Returns
1053 1054 1055
   nonzero if comment terminated by EOF, zero otherwise.

   Buffer->cur points to the initial asterisk of the comment.  */
1056
bool
1057
_cpp_skip_block_comment (cpp_reader *pfile)
1058
{
Zack Weinberg committed
1059
  cpp_buffer *buffer = pfile->buffer;
1060 1061
  const uchar *cur = buffer->cur;
  uchar c;
1062

1063 1064 1065
  cur++;
  if (*cur == '/')
    cur++;
1066

1067 1068
  for (;;)
    {
1069 1070
      /* People like decorating comments with '*', so check for '/'
	 instead for efficiency.  */
1071 1072
      c = *cur++;

Zack Weinberg committed
1073
      if (c == '/')
1074
	{
1075
	  if (cur[-2] == '*')
1076
	    break;
Zack Weinberg committed
1077

1078
	  /* Warn about potential nested comments, but not if the '/'
1079
	     comes immediately before the true comment delimiter.
Zack Weinberg committed
1080
	     Don't bother to get it right across escaped newlines.  */
1081
	  if (CPP_OPTION (pfile, warn_comments)
1082 1083 1084
	      && cur[0] == '*' && cur[1] != '/')
	    {
	      buffer->cur = cur;
1085 1086 1087 1088
	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
				     pfile->line_table->highest_line,
				     CPP_BUF_COL (buffer),
				     "\"/*\" within comment");
1089
	    }
1090
	}
1091 1092
      else if (c == '\n')
	{
1093
	  unsigned int cols;
1094
	  buffer->cur = cur - 1;
1095 1096 1097 1098
	  _cpp_process_line_notes (pfile, true);
	  if (buffer->next_line >= buffer->rlimit)
	    return true;
	  _cpp_clean_line (pfile);
1099 1100 1101 1102

	  cols = buffer->next_line - buffer->line_base;
	  CPP_INCREMENT_LINE (pfile, cols);

1103
	  cur = buffer->cur;
1104
	}
1105
    }
Zack Weinberg committed
1106

1107
  buffer->cur = cur;
1108
  _cpp_process_line_notes (pfile, true);
1109
  return false;
1110 1111
}

1112
/* Skip a C++ line comment, leaving buffer->cur pointing to the
1113
   terminating newline.  Handles escaped newlines.  Returns nonzero
1114
   if a multiline comment.  */
Zack Weinberg committed
1115
static int
1116
skip_line_comment (cpp_reader *pfile)
1117
{
1118
  cpp_buffer *buffer = pfile->buffer;
1119
  source_location orig_line = pfile->line_table->highest_line;
Zack Weinberg committed
1120

1121 1122
  while (*buffer->cur != '\n')
    buffer->cur++;
1123

1124
  _cpp_process_line_notes (pfile, true);
1125
  return orig_line != pfile->line_table->highest_line;
Zack Weinberg committed
1126
}
1127

1128
/* Skips whitespace, saving the next non-whitespace character.  */
1129
static void
1130
skip_whitespace (cpp_reader *pfile, cppchar_t c)
Zack Weinberg committed
1131 1132
{
  cpp_buffer *buffer = pfile->buffer;
1133
  bool saw_NUL = false;
1134

1135
  do
Zack Weinberg committed
1136
    {
1137
      /* Horizontal space always OK.  */
1138
      if (c == ' ' || c == '\t')
1139 1140
	;
      /* Just \f \v or \0 left.  */
1141
      else if (c == '\0')
1142
	saw_NUL = true;
Neil Booth committed
1143
      else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1144
	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1145 1146 1147
			     CPP_BUF_COL (buffer),
			     "%s in preprocessing directive",
			     c == '\f' ? "form feed" : "vertical tab");
1148 1149

      c = *buffer->cur++;
1150
    }
1151
  /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1152 1153
  while (is_nvspace (c));

1154
  if (saw_NUL)
1155
    cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1156

1157
  buffer->cur--;
Zack Weinberg committed
1158
}
1159

Neil Booth committed
1160 1161 1162
/* See if the characters of a number token are valid in a name (no
   '.', '+' or '-').  */
static int
1163
name_p (cpp_reader *pfile, const cpp_string *string)
Neil Booth committed
1164 1165 1166 1167 1168 1169 1170
{
  unsigned int i;

  for (i = 0; i < string->len; i++)
    if (!is_idchar (string->text[i]))
      return 0;

1171
  return 1;
Neil Booth committed
1172 1173
}

Geoffrey Keating committed
1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185
/* After an identifier or other sequence has been lexed into TOKEN,
   warn if the normalization state S shows it is not in NFC/NFKC and
   the warn_normalize option asks for a warning at that level.
   Nothing is reported while skipping.  */
static void
warn_about_normalization (cpp_reader *pfile, 
			  const cpp_token *token,
			  const struct normalize_state *s)
{
  unsigned char *spelling;
  size_t spelling_len;

  if (CPP_OPTION (pfile, warn_normalize) >= NORMALIZE_STATE_RESULT (s)
      || pfile->state.skipping)
    return;

  /* Make sure that the token is printed using UCNs, even
     if we'd otherwise happily print UTF-8.  */
  spelling = XNEWVEC (unsigned char, cpp_token_len (token));
  spelling_len = cpp_spell_token (pfile, token, spelling, false) - spelling;

  if (NORMALIZE_STATE_RESULT (s) != normalized_C)
    cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
			   "`%.*s' is not in NFC",
			   (int) spelling_len, spelling);
  else
    cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
			   "`%.*s' is not in NFKC",
			   (int) spelling_len, spelling);

  free (spelling);
}

1200
/* Returns TRUE if the sequence starting at buffer->cur is invalid in
1201
   an identifier.  FIRST is TRUE if this starts an identifier.  */
1202
static bool
Geoffrey Keating committed
1203 1204
forms_identifier_p (cpp_reader *pfile, int first,
		    struct normalize_state *state)
1205
{
1206 1207 1208 1209 1210 1211 1212 1213
  cpp_buffer *buffer = pfile->buffer;

  if (*buffer->cur == '$')
    {
      if (!CPP_OPTION (pfile, dollars_in_ident))
	return false;

      buffer->cur++;
1214
      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1215
	{
1216
	  CPP_OPTION (pfile, warn_dollars) = 0;
1217
	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1218 1219 1220 1221
	}

      return true;
    }
1222

1223
  /* Is this a syntactically valid UCN?  */
1224
  if (CPP_OPTION (pfile, extended_identifiers)
1225
      && *buffer->cur == '\\'
1226
      && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1227
    {
1228
      buffer->cur += 2;
Geoffrey Keating committed
1229 1230
      if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
			  state))
1231 1232
	return true;
      buffer->cur -= 2;
1233 1234
    }

1235
  return false;
1236 1237
}

Kai Tietz committed
1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270
/* Helper function to get the cpp_hashnode of the identifier that
   starts at BASE.  The identifier is the first character at BASE plus
   every ISIDNUM character after it; it is looked up in (and, with
   HT_ALLOC, presumably entered into) PFILE's hash table.  The
   diagnostics attached to poisoned identifiers, __VA_ARGS__ and C++
   operator names are issued unless skipping.  */
static cpp_hashnode *
lex_identifier_intern (cpp_reader *pfile, const uchar *base)
{
  cpp_hashnode *node;
  const uchar *end = base;
  unsigned int length;
  unsigned int hash = HT_HASHSTEP (0, *end);

  for (end++; ISIDNUM (*end); end++)
    hash = HT_HASHSTEP (hash, *end);

  length = end - base;
  hash = HT_HASHFINISH (hash, length);
  node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
					    base, length, hash, HT_ALLOC));

  /* Rarely, identifiers require diagnostics when lexed.  */
  if (!__builtin_expect ((node->flags & NODE_DIAGNOSTIC)
			 && !pfile->state.skipping, 0))
    return node;

  /* It is allowed to poison the same identifier twice.  */
  if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
    cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
	       NODE_NAME (node));

  /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
     replacement list of a variadic macro.  */
  if (node == pfile->spec_nodes.n__VA_ARGS__
      && !pfile->state.va_args_ok)
    {
      if (!CPP_OPTION (pfile, cplusplus))
	cpp_error (pfile, CPP_DL_PEDWARN,
		   "__VA_ARGS__ can only appear in the expansion"
		   " of a C99 variadic macro");
      else
	cpp_error (pfile, CPP_DL_PEDWARN,
		   "__VA_ARGS__ can only appear in the expansion"
		   " of a C++11 variadic macro");
    }

  /* For -Wc++-compat, warn about use of C++ named operators.  */
  if (node->flags & NODE_WARN_OPERATOR)
    cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
		 "identifier \"%s\" is a special operator name in C++",
		 NODE_NAME (node));

  return node;
}

/* Get the cpp_hashnode of the identifier spelled by NAME in the
   cpp_reader PFILE, by handing NAME to lex_identifier_intern.
   NOTE(review): that helper looks the name up with HT_ALLOC and
   dereferences the result, so a NULL return is not expected here --
   confirm against ht_lookup_with_hash.  */
cpp_hashnode *
_cpp_lex_identifier (cpp_reader *pfile, const char *name)
{
  return lex_identifier_intern (pfile, (uchar *) name);
}

1302
/* Lex an identifier starting at BUFFER->CUR - 1.  */
1303
static cpp_hashnode *
Geoffrey Keating committed
1304
lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1305
		struct normalize_state *nst, cpp_hashnode **spelling)
1306
{
Neil Booth committed
1307
  cpp_hashnode *result;
1308
  const uchar *cur;
1309 1310
  unsigned int len;
  unsigned int hash = HT_HASHSTEP (0, *base);
1311

1312
  cur = pfile->buffer->cur;
1313
  if (! starts_ucn)
Joseph Myers committed
1314 1315 1316 1317 1318 1319 1320 1321
    {
      while (ISIDNUM (*cur))
	{
	  hash = HT_HASHSTEP (hash, *cur);
	  cur++;
	}
      NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
    }
1322
  pfile->buffer->cur = cur;
Geoffrey Keating committed
1323
  if (starts_ucn || forms_identifier_p (pfile, false, nst))
1324
    {
1325 1326 1327
      /* Slower version for identifiers containing UCNs (or $).  */
      do {
	while (ISIDNUM (*pfile->buffer->cur))
Geoffrey Keating committed
1328
	  {
Joseph Myers committed
1329
	    NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
Geoffrey Keating committed
1330 1331 1332
	    pfile->buffer->cur++;
	  }
      } while (forms_identifier_p (pfile, false, nst));
1333 1334
      result = _cpp_interpret_identifier (pfile, base,
					  pfile->buffer->cur - base);
1335
      *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1336
    }
1337 1338 1339 1340
  else
    {
      len = cur - base;
      hash = HT_HASHFINISH (hash, len);
1341

1342 1343
      result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
						  base, len, hash, HT_ALLOC));
1344
      *spelling = result;
1345
    }
1346

1347
  /* Rarely, identifiers require diagnostics when lexed.  */
1348 1349 1350 1351 1352
  if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
			&& !pfile->state.skipping, 0))
    {
      /* It is allowed to poison the same identifier twice.  */
      if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1353
	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1354 1355 1356 1357 1358 1359
		   NODE_NAME (result));

      /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
	 replacement list of a variadic macro.  */
      if (result == pfile->spec_nodes.n__VA_ARGS__
	  && !pfile->state.va_args_ok)
1360 1361 1362 1363 1364 1365 1366 1367 1368 1369
	{
	  if (CPP_OPTION (pfile, cplusplus))
	    cpp_error (pfile, CPP_DL_PEDWARN,
		       "__VA_ARGS__ can only appear in the expansion"
		       " of a C++11 variadic macro");
	  else
	    cpp_error (pfile, CPP_DL_PEDWARN,
		       "__VA_ARGS__ can only appear in the expansion"
		       " of a C99 variadic macro");
	}
1370 1371 1372

      /* For -Wc++-compat, warn about use of C++ named operators.  */
      if (result->flags & NODE_WARN_OPERATOR)
1373 1374 1375
	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
		     "identifier \"%s\" is a special operator name in C++",
		     NODE_NAME (result));
1376 1377 1378 1379 1380
    }

  return result;
}

1381
/* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1382
static void
Geoffrey Keating committed
1383 1384
lex_number (cpp_reader *pfile, cpp_string *number,
	    struct normalize_state *nst)
1385
{
1386
  const uchar *cur;
1387 1388
  const uchar *base;
  uchar *dest;
1389

1390 1391
  base = pfile->buffer->cur - 1;
  do
Zack Weinberg committed
1392
    {
1393
      cur = pfile->buffer->cur;
1394

1395
      /* N.B. ISIDNUM does not include $.  */
1396 1397
      while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
	     || VALID_SIGN (*cur, cur[-1]))
Geoffrey Keating committed
1398
	{
Joseph Myers committed
1399
	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
Geoffrey Keating committed
1400 1401
	  cur++;
	}
1402

1403
      pfile->buffer->cur = cur;
1404
    }
Geoffrey Keating committed
1405
  while (forms_identifier_p (pfile, false, nst));
Neil Booth committed
1406

1407 1408 1409 1410 1411
  number->len = cur - base;
  dest = _cpp_unaligned_alloc (pfile, number->len + 1);
  memcpy (dest, base, number->len);
  dest[number->len] = '\0';
  number->text = dest;
Neil Booth committed
1412 1413
}

1414 1415
/* Create a token of type TYPE with a literal spelling.  */
static void
1416 1417
create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
		unsigned int len, enum cpp_ttype type)
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427
{
  uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);

  memcpy (dest, base, len);
  dest[len] = '\0';
  token->type = type;
  token->val.str.len = len;
  token->val.str.text = dest;
}

1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456
/* Subroutine of lex_raw_string: append LEN chars from BASE to the
   chain of buffers running from *FIRST_BUFF_P to *LAST_BUFF_P.  The
   chain is started if *FIRST_BUFF_P is NULL and extended if the last
   buffer is too small; both pointers are updated on return.  */

static void
bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
		_cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
{
  _cpp_buff *head = *first_buff_p;
  _cpp_buff *tail = *last_buff_p;

  if (head == NULL)
    head = tail = _cpp_get_buff (pfile, len);
  else
    {
      size_t avail = BUFF_ROOM (tail);

      if (len > avail)
	{
	  /* Fill what is left of the last buffer, then chain on a new
	     one for the remainder.  */
	  memcpy (BUFF_FRONT (tail), base, avail);
	  BUFF_FRONT (tail) += avail;
	  base += avail;
	  len -= avail;
	  tail = _cpp_append_extend_buff (pfile, tail, len);
	}
    }

  memcpy (BUFF_FRONT (tail), base, len);
  BUFF_FRONT (tail) += len;

  *first_buff_p = head;
  *last_buff_p = tail;
}

1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483

/* Returns true if the identifier starting at BASE names a defined
   macro, i.e. it is already in PFILE's hash table with type NT_MACRO.
   Returns false if BASE does not start an identifier.  The lookup
   never inserts.
   This might not work if compile with -save-temps,
   or preprocess separately from compilation.  */

static bool
is_macro (cpp_reader *pfile, const uchar *base)
{
  const uchar *end;
  unsigned int hash;
  cpp_hashnode *node;

  if (!ISIDST (*base))
    return false;

  hash = HT_HASHSTEP (0, *base);
  for (end = base + 1; ISIDNUM (*end); end++)
    hash = HT_HASHSTEP (hash, *end);
  hash = HT_HASHFINISH (hash, end - base);

  node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
					    base, end - base, hash,
					    HT_NO_INSERT));

  return node != NULL && node->type == NT_MACRO;
}


1484
/* Lexes a raw string and stores the result in TOKEN; nothing is
   returned.

   BASE points at the first character of the literal, including any
   leading 'L', 'u', 'U' or 'u8'.  As called from lex_string, CUR
   points just past the 'R', i.e. at the opening double quote.

   The stored string contains the spelling, including double quotes,
   delimiter string, '(' and ')', any leading 'L', 'u', 'U' or 'u8'
   and the 'R' modifier.  TOKEN's type is the string type selected by
   the prefix (turned into a user-defined literal type if a suffix
   follows), CPP_OTHER if the delimiter is invalid or the string is
   unterminated inside a directive, or CPP_EOF if no further line can
   be fetched before the closing delimiter is seen.

   Trigraph and line-splice transformations already applied to the
   buffer are undone between the quotes by replaying the line notes.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.  */

static void
lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
		const uchar *cur)
{
1496 1497 1498 1499 1500 1501
  /* The delimiter between the opening quote and '(' (at most 16
     characters), followed by a '"' once the '(' has been seen.  */
  uchar raw_prefix[17];
  /* Characters gathered while in the prefix and suffix phases.  */
  uchar temp_buffer[18];
  /* BASE as passed in; BASE itself is advanced as text is flushed.  */
  const uchar *orig_base;
  unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
  /* RAW_STR_PREFIX: reading the delimiter up to '('.  RAW_STR: inside
     the string body.  RAW_STR_SUFFIX: a ')' was seen and the text
     after it is being matched against the delimiter plus '"'.  */
  enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
  raw_str_phase phase = RAW_STR_PREFIX;
1502 1503
  enum cpp_ttype type;
  /* Number of bytes appended to the buffer chain so far.  */
  size_t total_len = 0;
1504 1505 1506 1507
  /* Index into temp_buffer during phases other than RAW_STR,
     during RAW_STR phase 17 to tell BUF_APPEND that nothing should
     be appended to temp_buffer.  */
  size_t temp_buffer_len = 0;
1508
  /* Chain of buffers holding the spelling flushed so far; stays NULL
     if the whole literal can be taken straight from the input.  */
  _cpp_buff *first_buff = NULL, *last_buff = NULL;
1509
  /* Offset from ORIG_BASE of the first character after the opening
     double quote.  */
  size_t raw_prefix_start;
1510
  _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1511 1512 1513 1514 1515 1516

  /* The encoding prefix selects the string type.  */
  type = (*base == 'L' ? CPP_WSTRING :
	  *base == 'U' ? CPP_STRING32 :
	  *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
	  : CPP_STRING);

1517 1518 1519 1520 1521
/* Append LEN bytes at STR to the buffer chain and count them in
   total_len.  While temp_buffer is in use (temp_buffer_len < 17),
   short appends (LEN <= 2) that do not come from BASE are also copied
   into temp_buffer, so that restored characters take part in
   delimiter matching.  */
#define BUF_APPEND(STR,LEN)					\
      do {							\
	bufring_append (pfile, (const uchar *)(STR), (LEN),	\
			&first_buff, &last_buff);		\
	total_len += (LEN);					\
1522 1523 1524 1525 1526 1527 1528 1529
	if (__builtin_expect (temp_buffer_len < 17, 0)		\
	    && (const uchar *)(STR) != base			\
	    && (LEN) <= 2)					\
	  {							\
	    memcpy (temp_buffer + temp_buffer_len,		\
		    (const uchar *)(STR), (LEN));		\
	    temp_buffer_len += (LEN);				\
	  }							\
1530 1531
      } while (0);

1532 1533 1534 1535 1536
  orig_base = base;
  /* Step over the opening double quote.  */
  ++cur;
  raw_prefix_start = cur - base;
  for (;;)
    {
1537 1538 1539
      cppchar_t c;

      /* If we previously performed any trigraph or line splicing
	 transformations, undo them in between the opening and closing
	 double quote.  */
1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577
      /* Skip notes that lie before CUR, then handle each note
	 recorded exactly at CUR.  */
      while (note->pos < cur)
	++note;
      for (; note->pos == cur; ++note)
	{
	  switch (note->type)
	    {
	    case '\\':
	    case ' ':
	      /* Restore backslash followed by newline.  */
	      BUF_APPEND (base, cur - base);
	      base = cur;
	      BUF_APPEND ("\\", 1);
	    /* Also entered by goto from the trigraph case below when
	       ??/ stood in for the backslash.  */
	    after_backslash:
	      if (note->type == ' ')
		{
		  /* GNU backslash whitespace newline extension.  FIXME
		     could be any sequence of non-vertical space.  When we
		     can properly restore any such sequence, we should mark
		     this note as handled so _cpp_process_line_notes
		     doesn't warn.  */
		  BUF_APPEND (" ", 1);
		}

	      BUF_APPEND ("\n", 1);
	      break;

	    case 0:
	      /* Already handled.  */
	      break;

	    default:
	      if (_cpp_trigraph_map[note->type])
		{
		  /* Don't warn about this trigraph in
		     _cpp_process_line_notes, since trigraphs show up as
		     trigraphs in raw strings.  */
1578
		  uchar type = note->type;
1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605
		  note->type = 0;

		  if (!CPP_OPTION (pfile, trigraphs))
		    /* If we didn't convert the trigraph in the first
		       place, don't do anything now either.  */
		    break;

		  /* Flush the text before the trigraph and put back
		     its two question marks.  */
		  BUF_APPEND (base, cur - base);
		  base = cur;
		  BUF_APPEND ("??", 2);

		  /* ??/ followed by newline gets two line notes, one for
		     the trigraph and one for the backslash/newline.  */
		  if (type == '/' && note[1].pos == cur)
		    {
		      if (note[1].type != '\\'
			  && note[1].type != ' ')
			abort ();
		      BUF_APPEND ("/", 1);
		      ++note;
		      goto after_backslash;
		    }
		  else
		    {
		      /* Skip the replacement character.  */
		      base = ++cur;
		      BUF_APPEND (&type, 1);
1606 1607
		      /* The third trigraph character is examined as
			 the current character.  */
		      c = type;
		      goto check_c;
1608 1609 1610 1611 1612 1613 1614 1615
		    }
		}
	      else
		abort ();
	      break;
	    }
	}
      c = *cur++;
1616 1617
      /* Record the character for delimiter matching, except in the
	 RAW_STR phase (where temp_buffer_len is 17).  */
      if (__builtin_expect (temp_buffer_len < 17, 0))
	temp_buffer[temp_buffer_len++] = c;
1618

1619 1620
     check_c:
      if (phase == RAW_STR_PREFIX)
1621
	{
1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707
	  /* Move newly gathered characters from temp_buffer into
	     raw_prefix, one at a time, until '(' or an error.  */
	  while (raw_prefix_len < temp_buffer_len)
	    {
	      raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
	      switch (raw_prefix[raw_prefix_len])
		{
		/* These characters are not accepted as part of the
		   delimiter; leave the switch for the checks below.  */
		case ' ': case '(': case ')': case '\\': case '\t':
		case '\v': case '\f': case '\n': default:
		  break;
		/* Basic source charset except the above chars.  */
		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
		case 'y': case 'z':
		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
		case 'Y': case 'Z':
		case '0': case '1': case '2': case '3': case '4': case '5':
		case '6': case '7': case '8': case '9':
		case '_': case '{': case '}': case '#': case '[': case ']':
		case '<': case '>': case '%': case ':': case ';': case '.':
		case '?': case '*': case '+': case '-': case '/': case '^':
		case '&': case '|': case '~': case '!': case '=': case ',':
		case '"': case '\'':
		  /* A valid delimiter character; accept it unless the
		     delimiter already has 16 characters.  */
		  if (raw_prefix_len < 16)
		    {
		      raw_prefix_len++;
		      continue;
		    }
		  break;
		}

	      if (raw_prefix[raw_prefix_len] != '(')
		{
		  /* Bad delimiter: diagnose it, then give back only
		     the characters before the opening quote as a
		     CPP_OTHER token and resume lexing at the quote.  */
		  int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
		  if (raw_prefix_len == 16)
		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
					 col, "raw string delimiter longer "
					      "than 16 characters");
		  else if (raw_prefix[raw_prefix_len] == '\n')
		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
					 col, "invalid new-line in raw "
					      "string delimiter");
		  else
		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
					 col, "invalid character '%c' in "
					      "raw string delimiter",
					 (int) raw_prefix[raw_prefix_len]);
		  pfile->buffer->cur = orig_base + raw_prefix_start - 1;
		  create_literal (pfile, token, orig_base,
				  raw_prefix_start - 1, CPP_OTHER);
		  if (first_buff)
		    _cpp_release_buff (pfile, first_buff);
		  return;
		}
	      /* Found '('.  Store a '"' after the delimiter so that
		 the suffix phase can match delimiter and closing
		 quote together.  */
	      raw_prefix[raw_prefix_len] = '"';
	      phase = RAW_STR;
	      /* Nothing should be appended to temp_buffer during
		 RAW_STR phase.  */
	      temp_buffer_len = 17;
	      break;
	    }
	  continue;
	}
      else if (phase == RAW_STR_SUFFIX)
	{
	  /* Extend the match of the text after ')' against the
	     delimiter and its trailing '"'.  */
	  while (raw_suffix_len <= raw_prefix_len
		 && raw_suffix_len < temp_buffer_len
		 && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
	    raw_suffix_len++;
	  /* Everything up to and including the '"' matched: the
	     string is complete.  */
	  if (raw_suffix_len > raw_prefix_len)
	    break;
	  /* All characters seen so far match; more are needed.  */
	  if (raw_suffix_len == temp_buffer_len)
	    continue;
	  /* Mismatch: back to the body, where the current character
	     is examined again below.  */
	  phase = RAW_STR;
	  /* Nothing should be appended to temp_buffer during
	     RAW_STR phase.  */
	  temp_buffer_len = 17;
	}
      if (c == ')')
	{
	  /* A possible start of the closing delimiter.  */
	  phase = RAW_STR_SUFFIX;
	  raw_suffix_len = 0;
	  temp_buffer_len = 0;
1708 1709 1710 1711
	}
      else if (c == '\n')
	{
	  if (pfile->state.in_directive
1712 1713
	      || (pfile->state.parsing_args
		  && pfile->buffer->next_line >= pfile->buffer->rlimit))
1714 1715 1716 1717 1718 1719 1720 1721
	    {
	      /* The string is not continued onto another line here;
		 leave the newline unread and give up.  */
	      cur--;
	      type = CPP_OTHER;
	      cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
				   "unterminated raw string");
	      break;
	    }

1722
	  /* Flush this line, newline included, before the buffer
	     moves on to the next line.  */
	  BUF_APPEND (base, cur - base);
1723 1724 1725 1726 1727

	  if (pfile->buffer->cur < pfile->buffer->rlimit)
	    CPP_INCREMENT_LINE (pfile, 0);
	  pfile->buffer->need_line = true;

1728 1729
	  pfile->buffer->cur = cur-1;
	  _cpp_process_line_notes (pfile, false);
1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744
	  if (!_cpp_get_fresh_line (pfile))
	    {
	      /* No further line: hand back an EOF token and report
		 the error at the start of the literal.  */
	      source_location src_loc = token->src_loc;
	      token->type = CPP_EOF;
	      /* Tell the compiler the line number of the EOF token.  */
	      token->src_loc = pfile->line_table->highest_line;
	      token->flags = BOL;
	      if (first_buff != NULL)
		_cpp_release_buff (pfile, first_buff);
	      cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
				   "unterminated raw string");
	      return;
	    }

	  /* Continue at the start of the fresh line, with its notes.  */
	  cur = base = pfile->buffer->cur;
1745
	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
1746 1747 1748
	}
    }

1749 1750
  if (CPP_OPTION (pfile, user_literals))
    {
1751 1752 1753 1754 1755
      /* If a string format macro, say from inttypes.h, is placed touching
	 a string literal it could be parsed as a C++11 user-defined string
	 literal thus breaking the program.
	 Try to identify macros with is_macro. A warning is issued. */
      if (is_macro (pfile, cur))
1756
	{
1757
	  /* Raise a warning, but do not consume subsequent tokens.  */
1758
	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1759 1760 1761
	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
				   token->src_loc, 0,
				   "invalid suffix on literal; C++11 requires "
1762
				   "a space between literal and string macro");
1763
	}
1764
      /* Grab user defined literal suffix.  */
1765
      else if (ISIDST (*cur))
1766 1767 1768
	{
	  type = cpp_userdef_string_add_type (type);
	  ++cur;
1769 1770 1771

	  while (ISIDNUM (*cur))
	    ++cur;
1772 1773 1774
	}
    }

1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798
  pfile->buffer->cur = cur;
  /* If nothing was flushed to the buffer chain, the spelling is the
     contiguous input from BASE to CUR.  Otherwise it is the chain's
     contents followed by the unflushed tail from BASE to CUR.  */
  if (first_buff == NULL)
    create_literal (pfile, token, base, cur - base, type);
  else
    {
      uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);

      token->type = type;
      token->val.str.len = total_len + (cur - base);
      token->val.str.text = dest;
      last_buff = first_buff;
      while (last_buff != NULL)
	{
	  memcpy (dest, last_buff->base,
		  BUFF_FRONT (last_buff) - last_buff->base);
	  dest += BUFF_FRONT (last_buff) - last_buff->base;
	  last_buff = last_buff->next;
	}
      _cpp_release_buff (pfile, first_buff);
      memcpy (dest, base, cur - base);
      dest[cur - base] = '\0';
    }
}

1799
/* Lexes a string, character constant, or angle-bracketed header file
1800
   name.  The stored string contains the spelling, including opening
1801 1802 1803 1804
   quote and any leading 'L', 'u', 'U' or 'u8' and optional
   'R' modifier.  It returns the type of the literal, or CPP_OTHER
   if it was not properly terminated, or CPP_LESS for an unterminated
   header name which must be relexed as normal tokens.
1805 1806 1807

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.  */
Zack Weinberg committed
1808
static void
1809
lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1810
{
1811 1812
  bool saw_NUL = false;
  const uchar *cur;
1813
  cppchar_t terminator;
1814 1815 1816 1817
  enum cpp_ttype type;

  cur = base;
  terminator = *cur++;
1818
  if (terminator == 'L' || terminator == 'U')
1819
    terminator = *cur++;
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831
  else if (terminator == 'u')
    {
      terminator = *cur++;
      if (terminator == '8')
	terminator = *cur++;
    }
  if (terminator == 'R')
    {
      lex_raw_string (pfile, token, base, cur);
      return;
    }
  if (terminator == '"')
1832 1833
    type = (*base == 'L' ? CPP_WSTRING :
	    *base == 'U' ? CPP_STRING32 :
1834 1835
	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
			 : CPP_STRING);
1836
  else if (terminator == '\'')
1837 1838 1839
    type = (*base == 'L' ? CPP_WCHAR :
	    *base == 'U' ? CPP_CHAR32 :
	    *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1840 1841
  else
    terminator = '>', type = CPP_HEADER_NAME;
Neil Booth committed
1842

1843
  for (;;)
1844
    {
1845
      cppchar_t c = *cur++;
1846

1847
      /* In #include-style directives, terminators are not escapable.  */
1848 1849 1850
      if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
	cur++;
      else if (c == terminator)
1851
	break;
1852
      else if (c == '\n')
1853
	{
1854
	  cur--;
1855 1856 1857 1858 1859 1860 1861 1862
	  /* Unmatched quotes always yield undefined behavior, but
	     greedy lexing means that what appears to be an unterminated
	     header name may actually be a legitimate sequence of tokens.  */
	  if (terminator == '>')
	    {
	      token->type = CPP_LESS;
	      return;
	    }
1863 1864
	  type = CPP_OTHER;
	  break;
1865
	}
1866 1867
      else if (c == '\0')
	saw_NUL = true;
1868 1869
    }

1870
  if (saw_NUL && !pfile->state.skipping)
1871 1872
    cpp_error (pfile, CPP_DL_WARNING,
	       "null character(s) preserved in literal");
1873

1874 1875 1876 1877
  if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
    cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
	       (int) terminator);

1878 1879
  if (CPP_OPTION (pfile, user_literals))
    {
1880 1881 1882 1883 1884
      /* If a string format macro, say from inttypes.h, is placed touching
	 a string literal it could be parsed as a C++11 user-defined string
	 literal thus breaking the program.
	 Try to identify macros with is_macro. A warning is issued. */
      if (is_macro (pfile, cur))
1885
	{
1886
	  /* Raise a warning, but do not consume subsequent tokens.  */
1887
	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1888 1889 1890
	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
				   token->src_loc, 0,
				   "invalid suffix on literal; C++11 requires "
1891
				   "a space between literal and string macro");
1892
	}
1893
      /* Grab user defined literal suffix.  */
1894
      else if (ISIDST (*cur))
1895 1896 1897 1898
	{
	  type = cpp_userdef_char_add_type (type);
	  type = cpp_userdef_string_add_type (type);
          ++cur;
1899 1900 1901

	  while (ISIDNUM (*cur))
	    ++cur;
1902 1903 1904
	}
    }

1905 1906
  pfile->buffer->cur = cur;
  create_literal (pfile, token, base, cur - base, type);
1907
}
Zack Weinberg committed
1908

1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953
/* Return a pointer to PFILE's comment table.  The client may not make
   any assumption about the ordering of the table.  */
cpp_comment_table *
cpp_get_comments (cpp_reader *pfile)
{
  cpp_comment_table *table = &pfile->comments;

  return table;
}

/* Append the comment spelled by TOKEN to the end of PFILE's comment
   table, together with its source location.  The table starts at 256
   entries and doubles whenever it is full.  */
static void 
store_comment (cpp_reader *pfile, cpp_token *token) 
{
  cpp_comment_table *table = &pfile->comments;
  cpp_comment *entry;
  int len;

  /* First use: create the table.  */
  if (table->allocated == 0)
    {
      table->allocated = 256; 
      table->entries = (cpp_comment *) xmalloc
	(table->allocated * sizeof (cpp_comment));
    }

  /* Table full: double it.  */
  if (table->count == table->allocated)
    {
      table->allocated *= 2;
      table->entries = (cpp_comment *) xrealloc
	(table->entries,
	 table->allocated * sizeof (cpp_comment));
    }

  entry = &table->entries[table->count];
  len = token->val.str.len;

  /* Copy comment.  Note, token may not be NUL-terminated, so the
     terminator is added here.  */
  entry->comment = (char *) xmalloc (sizeof (char) * (len + 1));
  memcpy (entry->comment, token->val.str.text, len);
  entry->comment[len] = '\0';

  /* Set source location.  */
  entry->sloc = token->src_loc;

  /* Increment the count of entries in the comment table.  */
  table->count++;
}

Neil Booth committed
1954
/* The stored comment includes the comment start and any terminator.  */
1955
static void
1956 1957
save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
	      cppchar_t type)
1958
{
Zack Weinberg committed
1959
  unsigned char *buffer;
1960
  unsigned int len, clen, i;
1961

1962
  len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1963

1964 1965
  /* C++ comments probably (not definitely) have moved past a new
     line, which we don't want to save in the comment.  */
1966
  if (is_vspace (pfile->buffer->cur[-1]))
1967
    len--;
1968

1969 1970 1971
  /* If we are currently in a directive or in argument parsing, then
     we need to store all C++ comments as C comments internally, and
     so we need to allocate a little extra space in that case.
1972 1973 1974

     Note that the only time we encounter a directive here is
     when we are saving comments in a "#define".  */
1975 1976
  clen = ((pfile->state.in_directive || pfile->state.parsing_args)
	  && type == '/') ? len + 2 : len;
1977 1978

  buffer = _cpp_unaligned_alloc (pfile, clen);
1979

Zack Weinberg committed
1980
  token->type = CPP_COMMENT;
1981
  token->val.str.len = clen;
1982
  token->val.str.text = buffer;
1983

1984 1985
  buffer[0] = '/';
  memcpy (buffer + 1, from, len - 1);
1986

1987
  /* Finish conversion to a C comment, if necessary.  */
1988
  if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1989 1990 1991 1992
    {
      buffer[1] = '*';
      buffer[clen - 2] = '*';
      buffer[clen - 1] = '/';
1993 1994 1995 1996 1997
      /* As there can be in a C++ comments illegal sequences for C comments
         we need to filter them out.  */
      for (i = 2; i < (clen - 2); i++)
        if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
          buffer[i] = '|';
1998
    }
1999 2000 2001

  /* Finally store this comment for use by clients of libcpp. */
  store_comment (pfile, token);
2002
}
2003

2004 2005
/* Allocate COUNT tokens for RUN.  */
void
2006
_cpp_init_tokenrun (tokenrun *run, unsigned int count)
2007
{
2008
  run->base = XNEWVEC (cpp_token, count);
2009 2010 2011 2012 2013 2014
  run->limit = run->base + count;
  run->next = NULL;
}

/* Returns the token run following RUN.  If there is none yet, a new
   run holding 250 tokens is allocated, linked in after RUN, and
   returned.  */
static tokenrun *
next_tokenrun (tokenrun *run)
{
  tokenrun *succ = run->next;

  if (succ != NULL)
    return succ;

  succ = XNEW (tokenrun);
  succ->prev = run;
  _cpp_init_tokenrun (succ, 250);
  run->next = succ;

  return succ;
}

2027
/* Return the number of not yet processed token in a given
2028 2029
   context.  */
int
2030
_cpp_remaining_tokens_num_in_context (cpp_context *context)
2031 2032
{
  if (context->tokens_kind == TOKENS_KIND_DIRECT)
2033
    return (LAST (context).token - FIRST (context).token);
2034 2035
  else if (context->tokens_kind == TOKENS_KIND_INDIRECT
	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
2036
    return (LAST (context).ptoken - FIRST (context).ptoken);
2037 2038 2039 2040
  else
      abort ();
}

2041 2042
/* Returns the token present at index INDEX in a given context.  If
   INDEX is zero, the next token to be processed is returned.  */
2043
static const cpp_token*
2044
_cpp_token_from_context_at (cpp_context *context, int index)
2045 2046 2047 2048 2049 2050 2051 2052 2053 2054
{
  if (context->tokens_kind == TOKENS_KIND_DIRECT)
    return &(FIRST (context).token[index]);
  else if (context->tokens_kind == TOKENS_KIND_INDIRECT
	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
    return FIRST (context).ptoken[index];
 else
   abort ();
}

2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065
/* Look ahead in the input stream without consuming anything.

   Returns the token INDEX positions ahead of the next token to be
   processed (INDEX zero is the next token).  If the end of input is
   reached first, the CPP_EOF token is returned instead.  In both cases
   every token lexed while peeking is pushed back, so later lexing sees
   the same token sequence.  */
const cpp_token *
cpp_peek_token (cpp_reader *pfile, int index)
{
  cpp_context *context = pfile->context;
  const cpp_token *peektok;
  int count;

  /* First, scan through any pending cpp_context objects.  */
  while (context->prev)
    {
      ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);

      if (index < (int) sz)
	return _cpp_token_from_context_at (context, index);
      index -= (int) sz;
      context = context->prev;
    }

  /* We will have to read some new tokens after all (and do so
     without invalidating preceding tokens).  */
  count = index;
  pfile->keep_tokens++;

  do
    {
      peektok = _cpp_lex_token (pfile);
      if (peektok->type == CPP_EOF)
	{
	  /* Stop here, but still fall through to the clean-up below:
	     returning directly would leave keep_tokens raised and the
	     tokens lexed so far consumed.  Decrement INDEX as the loop
	     condition would have, so COUNT - INDEX is the number of
	     tokens lexed.  */
	  index--;
	  break;
	}
    }
  while (index--);

  /* Push back everything lexed above: COUNT + 1 tokens when the loop
     ran to completion, fewer when CPP_EOF cut it short.  */
  _cpp_backup_tokens_direct (pfile, count - index);
  pfile->keep_tokens--;

  return peektok;
}

2093 2094 2095 2096 2097
/* Allocate a single token that is invalidated at the same time as the
   rest of the tokens on the line.  Has its line and col set to the
   same as the last lexed token, so that diagnostics appear in the
   right place.  */
cpp_token *
2098
_cpp_temp_token (cpp_reader *pfile)
2099 2100
{
  cpp_token *old, *result;
2101 2102
  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2103 2104

  old = pfile->cur_token - 1;
2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124
  /* Any pre-existing lookaheads must not be clobbered.  */
  if (la)
    {
      if (sz <= la)
        {
          tokenrun *next = next_tokenrun (pfile->cur_run);

          if (sz < la)
            memmove (next->base + 1, next->base,
                     (la - sz) * sizeof (cpp_token));

          next->base[0] = pfile->cur_run->limit[-1];
        }

      if (sz > 1)
        memmove (pfile->cur_token + 1, pfile->cur_token,
                 MIN (la, sz - 1) * sizeof (cpp_token));
    }

  if (!sz && pfile->cur_token == pfile->cur_run->limit)
2125 2126 2127 2128 2129 2130
    {
      pfile->cur_run = next_tokenrun (pfile->cur_run);
      pfile->cur_token = pfile->cur_run->base;
    }

  result = pfile->cur_token++;
2131
  result->src_loc = old->src_loc;
2132 2133 2134
  return result;
}

2135 2136
/* Lex a token into RESULT (external interface).  Takes care of issues
   like directive handling, token lookahead, multiple include
2137
   optimization and skipping.  */
2138
const cpp_token *
2139
_cpp_lex_token (cpp_reader *pfile)
2140
{
2141
  cpp_token *result;
2142

2143
  for (;;)
2144
    {
2145
      if (pfile->cur_token == pfile->cur_run->limit)
2146
	{
2147 2148
	  pfile->cur_run = next_tokenrun (pfile->cur_run);
	  pfile->cur_token = pfile->cur_run->base;
2149
	}
2150 2151 2152 2153 2154
      /* We assume that the current token is somewhere in the current
	 run.  */
      if (pfile->cur_token < pfile->cur_run->base
	  || pfile->cur_token >= pfile->cur_run->limit)
	abort ();
2155

2156
      if (pfile->lookaheads)
2157 2158 2159 2160
	{
	  pfile->lookaheads--;
	  result = pfile->cur_token++;
	}
2161
      else
2162
	result = _cpp_lex_direct (pfile);
2163 2164

      if (result->flags & BOL)
2165
	{
2166 2167 2168
	  /* Is this a directive.  If _cpp_handle_directive returns
	     false, it is an assembler #.  */
	  if (result->type == CPP_HASH
2169 2170 2171
	      /* 6.10.3 p 11: Directives in a list of macro arguments
		 gives undefined behavior.  This implementation
		 handles the directive as normal.  */
2172
	      && pfile->state.parsing_args != 1)
2173
	    {
2174
	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2175
		{
2176 2177
		  if (pfile->directive_result.type == CPP_PADDING)
		    continue;
2178 2179 2180
		  result = &pfile->directive_result;
		}
	    }
2181 2182
	  else if (pfile->state.in_deferred_pragma)
	    result = &pfile->directive_result;
2183

2184
	  if (pfile->cb.line_change && !pfile->state.skipping)
2185
	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2186 2187
	}

2188
      /* We don't skip tokens in directives.  */
2189
      if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2190
	break;
2191

2192
      /* Outside a directive, invalidate controlling macros.  At file
2193
	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2194
	 get here and MI optimization works.  */
2195
      pfile->mi_valid = false;
2196 2197 2198

      if (!pfile->state.skipping || result->type == CPP_EOF)
	break;
2199 2200
    }

2201
  return result;
2202 2203
}

2204 2205
/* Returns true if a fresh line has been loaded.  */
bool
2206
_cpp_get_fresh_line (cpp_reader *pfile)
2207
{
2208 2209
  int return_at_eof;

2210 2211 2212
  /* We can't get a new line until we leave the current directive.  */
  if (pfile->state.in_directive)
    return false;
2213

2214
  for (;;)
2215
    {
2216
      cpp_buffer *buffer = pfile->buffer;
2217

2218 2219 2220 2221
      if (!buffer->need_line)
	return true;

      if (buffer->next_line < buffer->rlimit)
2222
	{
2223 2224 2225
	  _cpp_clean_line (pfile);
	  return true;
	}
2226

2227 2228 2229 2230 2231 2232 2233 2234 2235
      /* First, get out of parsing arguments state.  */
      if (pfile->state.parsing_args)
	return false;

      /* End of buffer.  Non-empty files should end in a newline.  */
      if (buffer->buf != buffer->rlimit
	  && buffer->next_line > buffer->rlimit
	  && !buffer->from_stage3)
	{
2236
	  /* Clip to buffer size.  */
2237 2238
	  buffer->next_line = buffer->rlimit;
	}
2239 2240

      return_at_eof = buffer->return_at_eof;
2241
      _cpp_pop_buffer (pfile);
2242
      if (pfile->buffer == NULL || return_at_eof)
2243
	return false;
2244
    }
2245 2246
}

2247 2248 2249 2250 2251 2252 2253 2254
/* Helper for two-character operators.  If the next input character is
   CHAR, consume it and give the token type THEN_TYPE; otherwise leave
   the input alone and give it ELSE_TYPE.  Expects variables named
   `buffer' and `result' to be in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      if (*buffer->cur == CHAR)				\
	{						\
	  buffer->cur++;				\
	  result->type = THEN_TYPE;			\
	}						\
      else						\
	result->type = ELSE_TYPE;			\
    }							\
  while (0)
2255

2256 2257 2258 2259
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
2260
   optimization, directives, skipping etc.  This function is only
2261 2262 2263 2264 2265 2266 2267
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns the location of the lexed token.  */
cpp_token *
2268
_cpp_lex_direct (cpp_reader *pfile)
2269
{
2270
  cppchar_t c;
2271
  cpp_buffer *buffer;
2272
  const unsigned char *comment_start;
2273
  cpp_token *result = pfile->cur_token++;
2274

2275
 fresh_line:
2276
  result->flags = 0;
2277
  buffer = pfile->buffer;
2278
  if (buffer->need_line)
2279
    {
2280 2281 2282 2283 2284 2285 2286 2287
      if (pfile->state.in_deferred_pragma)
	{
	  result->type = CPP_PRAGMA_EOL;
	  pfile->state.in_deferred_pragma = false;
	  if (!pfile->state.pragma_allow_expansion)
	    pfile->state.prevent_expansion--;
	  return result;
	}
2288 2289 2290
      if (!_cpp_get_fresh_line (pfile))
	{
	  result->type = CPP_EOF;
Neil Booth committed
2291 2292 2293
	  if (!pfile->state.in_directive)
	    {
	      /* Tell the compiler the line number of the EOF token.  */
2294
	      result->src_loc = pfile->line_table->highest_line;
Neil Booth committed
2295 2296
	      result->flags = BOL;
	    }
2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308
	  return result;
	}
      if (!pfile->keep_tokens)
	{
	  pfile->cur_run = &pfile->base_run;
	  result = pfile->base_run.base;
	  pfile->cur_token = result + 1;
	}
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
	result->flags |= PREV_WHITE;
    }
2309
  buffer = pfile->buffer;
2310
 update_tokens_line:
2311
  result->src_loc = pfile->line_table->highest_line;
Zack Weinberg committed
2312

2313
 skipped_white:
2314 2315 2316 2317
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
2318
      result->src_loc = pfile->line_table->highest_line;
2319
    }
2320
  c = *buffer->cur++;
2321

2322 2323 2324 2325 2326
  if (pfile->forced_token_location_p)
    result->src_loc = *pfile->forced_token_location_p;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
					  CPP_BUF_COLUMN (buffer, buffer->cur));
2327

2328
  switch (c)
2329
    {
2330 2331
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
2332 2333
      skip_whitespace (pfile, c);
      goto skipped_white;
2334

2335
    case '\n':
2336 2337
      if (buffer->cur < buffer->rlimit)
	CPP_INCREMENT_LINE (pfile, 0);
2338 2339
      buffer->need_line = true;
      goto fresh_line;
2340

2341 2342
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
Geoffrey Keating committed
2343 2344 2345 2346 2347 2348 2349
      {
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	result->type = CPP_NUMBER;
	lex_number (pfile, &result->val.str, &nst);
	warn_about_normalization (pfile, result, &nst);
	break;
      }
2350

2351
    case 'L':
2352 2353
    case 'u':
    case 'U':
2354 2355 2356
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
	 wide strings or raw strings.  */
2357 2358
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2359
	{
2360 2361 2362 2363 2364
	  if ((*buffer->cur == '\'' && c != 'R')
	      || *buffer->cur == '"'
	      || (*buffer->cur == 'R'
		  && c != 'R'
		  && buffer->cur[1] == '"'
2365
		  && CPP_OPTION (pfile, rliterals))
2366 2367 2368
	      || (*buffer->cur == '8'
		  && c == 'u'
		  && (buffer->cur[1] == '"'
2369 2370
		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
			  && CPP_OPTION (pfile, rliterals)))))
2371 2372 2373 2374
	    {
	      lex_string (pfile, result, buffer->cur - 1);
	      break;
	    }
2375
	}
2376
      /* Fall through.  */
2377

2378 2379 2380 2381
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2382
    case 's': case 't':           case 'v': case 'w': case 'x':
2383 2384
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2385
    case 'G': case 'H': case 'I': case 'J': case 'K':
2386
    case 'M': case 'N': case 'O': case 'P': case 'Q':
2387
    case 'S': case 'T':           case 'V': case 'W': case 'X':
2388 2389
    case 'Y': case 'Z':
      result->type = CPP_NAME;
Geoffrey Keating committed
2390 2391
      {
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2392
	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2393 2394
						&nst,
						&result->val.node.spelling);
Geoffrey Keating committed
2395 2396
	warn_about_normalization (pfile, result, &nst);
      }
2397 2398

      /* Convert named operators to their proper types.  */
2399
      if (result->val.node.node->flags & NODE_OPERATOR)
2400 2401
	{
	  result->flags |= NAMED_OP;
2402
	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2403 2404 2405 2406 2407
	}
      break;

    case '\'':
    case '"':
2408
      lex_string (pfile, result, buffer->cur - 1);
2409
      break;
Zack Weinberg committed
2410

2411
    case '/':
2412 2413
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
2414 2415
      c = *buffer->cur;
      
2416 2417
      if (c == '*')
	{
2418
	  if (_cpp_skip_block_comment (pfile))
2419
	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2420
	}
2421
      else if (c == '/' && ! CPP_OPTION (pfile, traditional))
2422
	{
2423 2424 2425
	  /* Don't warn for system headers.  */
	  if (cpp_in_system_header (pfile))
	    ;
2426
	  /* Warn about comments if pedantically GNUC89, and not
2427
	     in system headers.  */
2428 2429 2430
	  else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
		   && CPP_PEDANTIC (pfile)
		   && ! buffer->warned_cplusplus_comments)
Zack Weinberg committed
2431
	    {
2432
	      cpp_error (pfile, CPP_DL_PEDWARN,
2433
			 "C++ style comments are not allowed in ISO C90");
2434
	      cpp_error (pfile, CPP_DL_PEDWARN,
2435
			 "(this will be reported only once per input file)");
2436 2437
	      buffer->warned_cplusplus_comments = 1;
	    }
2438
	  /* Or if specifically desired via -Wc90-c99-compat.  */
2439
	  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
2440
		   && ! CPP_OPTION (pfile, cplusplus)
2441 2442 2443
		   && ! buffer->warned_cplusplus_comments)
	    {
	      cpp_error (pfile, CPP_DL_WARNING,
2444
			 "C++ style comments are incompatible with C90");
2445 2446 2447 2448
	      cpp_error (pfile, CPP_DL_WARNING,
			 "(this will be reported only once per input file)");
	      buffer->warned_cplusplus_comments = 1;
	    }
2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473
	  /* In C89/C94, C++ style comments are forbidden.  */
	  else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
		    || CPP_OPTION (pfile, lang) == CLK_STDC94))
	    {
	      /* But don't be confused about valid code such as
	         - // immediately followed by *,
		 - // in a preprocessing directive,
		 - // in an #if 0 block.  */
	      if (buffer->cur[1] == '*'
		  || pfile->state.in_directive
		  || pfile->state.skipping)
		{
		  result->type = CPP_DIV;
		  break;
		}
	      else if (! buffer->warned_cplusplus_comments)
		{
		  cpp_error (pfile, CPP_DL_ERROR,
			     "C++ style comments are not allowed in ISO C90");
		  cpp_error (pfile, CPP_DL_ERROR,
			     "(this will be reported only once per input "
			     "file)");
		  buffer->warned_cplusplus_comments = 1;
		}
	    }
2474
	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2475
	    cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2476
	}
2477 2478
      else if (c == '=')
	{
2479
	  buffer->cur++;
2480 2481 2482 2483 2484 2485 2486 2487
	  result->type = CPP_DIV_EQ;
	  break;
	}
      else
	{
	  result->type = CPP_DIV;
	  break;
	}
2488

2489 2490 2491
      if (!pfile->state.save_comments)
	{
	  result->flags |= PREV_WHITE;
2492
	  goto update_tokens_line;
2493
	}
2494 2495

      /* Save the comment as a token in its own right.  */
2496
      save_comment (pfile, result, comment_start, c);
2497
      break;
2498 2499 2500 2501

    case '<':
      if (pfile->state.angled_headers)
	{
2502
	  lex_string (pfile, result, buffer->cur - 1);
2503 2504
	  if (result->type != CPP_LESS)
	    break;
2505
	}
2506

2507 2508 2509 2510
      result->type = CPP_LESS;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
2511
	{
2512 2513
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2514
	}
2515
      else if (CPP_OPTION (pfile, digraphs))
2516
	{
2517 2518
	  if (*buffer->cur == ':')
	    {
Paolo Carlini committed
2519 2520 2521 2522 2523
	      /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
		 three characters are <:: and the subsequent character
		 is neither : nor >, the < is treated as a preprocessor
		 token by itself".  */
	      if (CPP_OPTION (pfile, cplusplus)
2524 2525
		  && CPP_OPTION (pfile, lang) != CLK_CXX98
		  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
Paolo Carlini committed
2526 2527 2528 2529
		  && buffer->cur[1] == ':'
		  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
		break;

2530 2531 2532 2533 2534 2535 2536 2537 2538 2539
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_OPEN_SQUARE;
	    }
	  else if (*buffer->cur == '%')
	    {
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_OPEN_BRACE;
	    }
2540
	}
2541 2542 2543
      break;

    case '>':
2544 2545 2546 2547
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
2548
	{
2549 2550 2551
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
	}
2552 2553
      break;

2554
    case '%':
2555 2556 2557 2558
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
2559
	{
2560
	  if (*buffer->cur == ':')
2561
	    {
2562 2563 2564 2565
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_HASH;
	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
2566
		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2567 2568 2569 2570 2571 2572
	    }
	  else if (*buffer->cur == '>')
	    {
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_CLOSE_BRACE;
2573 2574
	    }
	}
2575 2576
      break;

2577
    case '.':
2578
      result->type = CPP_DOT;
2579
      if (ISDIGIT (*buffer->cur))
2580
	{
Geoffrey Keating committed
2581
	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2582
	  result->type = CPP_NUMBER;
Geoffrey Keating committed
2583 2584
	  lex_number (pfile, &result->val.str, &nst);
	  warn_about_normalization (pfile, result, &nst);
2585
	}
2586 2587 2588 2589
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
	buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
	buffer->cur++, result->type = CPP_DOT_STAR;
2590
      break;
2591

2592
    case '+':
2593 2594 2595 2596 2597
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
	buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_PLUS_EQ;
2598
      break;
2599

2600
    case '-':
2601 2602
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
2603
	{
2604
	  buffer->cur++;
2605
	  result->type = CPP_DEREF;
2606 2607
	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
	    buffer->cur++, result->type = CPP_DEREF_STAR;
2608
	}
2609 2610 2611 2612
      else if (*buffer->cur == '-')
	buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_MINUS_EQ;
2613
      break;
2614

2615
    case '&':
2616 2617 2618 2619 2620
      result->type = CPP_AND;
      if (*buffer->cur == '&')
	buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_AND_EQ;
2621
      break;
2622

2623
    case '|':
2624 2625 2626 2627 2628
      result->type = CPP_OR;
      if (*buffer->cur == '|')
	buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_OR_EQ;
2629
      break;
2630

2631
    case ':':
2632 2633 2634 2635
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
	buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2636
	{
2637
	  buffer->cur++;
2638
	  result->flags |= DIGRAPH;
2639 2640
	  result->type = CPP_CLOSE_SQUARE;
	}
2641
      break;
2642

2643 2644 2645 2646
    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2647
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2648

2649
    case '?': result->type = CPP_QUERY; break;
2650 2651 2652 2653 2654 2655 2656 2657 2658 2659
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

2660
      /* @ is a punctuator in Objective-C.  */
2661
    case '@': result->type = CPP_ATSIGN; break;
2662

2663
    case '$':
2664 2665 2666
    case '\\':
      {
	const uchar *base = --buffer->cur;
Geoffrey Keating committed
2667
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2668

Geoffrey Keating committed
2669
	if (forms_identifier_p (pfile, true, &nst))
2670 2671
	  {
	    result->type = CPP_NAME;
2672 2673
	    result->val.node.node = lex_identifier (pfile, base, true, &nst,
						    &result->val.node.spelling);
Geoffrey Keating committed
2674
	    warn_about_normalization (pfile, result, &nst);
2675 2676 2677
	    break;
	  }
	buffer->cur++;
2678
      }
2679

2680
    default:
2681 2682
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
2683
    }
2684 2685

  return result;
2686 2687
}

2688 2689
/* An upper bound on the number of bytes needed to spell TOKEN.
   Does not include preceding whitespace.  */
Neil Booth committed
2690
unsigned int
2691
cpp_token_len (const cpp_token *token)
2692
{
Neil Booth committed
2693
  unsigned int len;
2694

Neil Booth committed
2695
  switch (TOKEN_SPELL (token))
Zack Weinberg committed
2696
    {
2697
    default:		len = 6;				break;
2698
    case SPELL_LITERAL:	len = token->val.str.len;		break;
2699
    case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
Zack Weinberg committed
2700
    }
2701 2702

  return len;
2703 2704
}

2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738
/* Decode the UTF-8 sequence starting at NAME and write it to BUFFER
   as a \U escape with eight lower-case hex digits.  Returns the number
   of bytes read from NAME.  Exactly 10 bytes are always written to
   BUFFER, with no terminating NUL.  Aborts if a continuation byte is
   ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *p = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int seen;
  int shift;

  /* The number of leading 1 bits in the first byte gives the length
     of the whole sequence.  */
  lead = *p;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the first byte, then six more from each
     continuation byte.  */
  code_point = *p & (0x7F >> seq_len);
  for (seen = 1; seen < seq_len; seen++)
    {
      p++;

      /* Ill-formed UTF-8: a continuation byte must look like
	 10xxxxxx.  */
      if ((*p & ~0x3F) != 0x80)
	abort ();

      code_point = (code_point << 6) | (*p & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}

2739 2740 2741 2742 2743 2744 2745
/* Return the spelling of the digraph whose token type is TYPE.  TYPE
   is used as an offset from CPP_FIRST_DIGRAPH into the digraph
   spelling table, with no range check.  */
static const unsigned char *
cpp_digraph2name (enum cpp_ttype type)
{
  const int slot = (int) type - (int) CPP_FIRST_DIGRAPH;

  return digraph_spellings[slot];
}
2746

2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768
/* Write the spelling of the identifier IDENT to BUFFER, with every
   non-ASCII character written as a \U escape.  The buffer must already
   have enough space to hold the spelling.  Returns a pointer to the
   character after the last character written.  */
unsigned char *
_cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
{
  const unsigned char *src = NODE_NAME (ident);
  size_t pos = 0;

  while (pos < NODE_LEN (ident))
    {
      if (src[pos] & ~0x7F)
	{
	  /* Start of a multi-byte sequence: skip all of its bytes and
	     emit the 10-byte escape in their place.  */
	  pos += utf8_to_ucn (buffer, src + pos);
	  buffer += 10;
	}
      else
	{
	  /* Plain ASCII is copied through unchanged.  */
	  *buffer++ = src[pos];
	  pos++;
	}
    }

  return buffer;
}

Zack Weinberg committed
2769
/* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2770
   already contain the enough space to hold the token's spelling.
2771
   Returns a pointer to the character after the last character written.
2772
   FORSTRING is true if this is to be the spelling after translation
2773 2774 2775
   phase 1 (with the original spelling of extended identifiers), false
   if extended identifiers should always be written using UCNs (there is
   no option for always writing them in the internal UTF-8 form).
2776
   FIXME: Would be nice if we didn't need the PFILE argument.  */
Neil Booth committed
2777
unsigned char *
2778
cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2779
		 unsigned char *buffer, bool forstring)
Zack Weinberg committed
2780
{
2781
  switch (TOKEN_SPELL (token))
Zack Weinberg committed
2782 2783 2784 2785 2786
    {
    case SPELL_OPERATOR:
      {
	const unsigned char *spelling;
	unsigned char c;
2787

Zack Weinberg committed
2788
	if (token->flags & DIGRAPH)
2789
	  spelling = cpp_digraph2name (token->type);
2790 2791
	else if (token->flags & NAMED_OP)
	  goto spell_ident;
Zack Weinberg committed
2792
	else
2793
	  spelling = TOKEN_NAME (token);
2794

Zack Weinberg committed
2795 2796 2797 2798
	while ((c = *spelling++) != '\0')
	  *buffer++ = c;
      }
      break;
2799

2800
    spell_ident:
Zack Weinberg committed
2801
    case SPELL_IDENT:
2802 2803
      if (forstring)
	{
2804 2805 2806
	  memcpy (buffer, NODE_NAME (token->val.node.spelling),
		  NODE_LEN (token->val.node.spelling));
	  buffer += NODE_LEN (token->val.node.spelling);
2807 2808
	}
      else
2809
	buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
Zack Weinberg committed
2810
      break;
2811

2812
    case SPELL_LITERAL:
2813 2814 2815 2816
      memcpy (buffer, token->val.str.text, token->val.str.len);
      buffer += token->val.str.len;
      break;

Zack Weinberg committed
2817
    case SPELL_NONE:
2818 2819
      cpp_error (pfile, CPP_DL_ICE,
		 "unspellable token %s", TOKEN_NAME (token));
Zack Weinberg committed
2820 2821
      break;
    }
2822

Zack Weinberg committed
2823 2824
  return buffer;
}
2825

2826 2827
/* Returns TOKEN spelt as a null-terminated string.  The string is
   freed when the reader is destroyed.  Useful for diagnostics.  */
Neil Booth committed
2828
unsigned char *
2829
cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2830 2831
{ 
  unsigned int len = cpp_token_len (token) + 1;
2832
  unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
Zack Weinberg committed
2833

2834
  end = cpp_spell_token (pfile, token, start, false);
Neil Booth committed
2835
  end[0] = '\0';
Zack Weinberg committed
2836

Neil Booth committed
2837 2838
  return start;
}
Zack Weinberg committed
2839

2840 2841 2842
/* Returns a pointer to a string which spells the token defined by
   TYPE and FLAGS.  Used by C front ends, which really should move to
   using cpp_token_as_text.  */
Neil Booth committed
2843
const char *
2844
cpp_type2name (enum cpp_ttype type, unsigned char flags)
Neil Booth committed
2845
{
2846 2847 2848 2849 2850
  if (flags & DIGRAPH)
    return (const char *) cpp_digraph2name (type);
  else if (flags & NAMED_OP)
    return cpp_named_operator2name (type);

Neil Booth committed
2851 2852
  return (const char *) token_spellings[type].name;
}
Zack Weinberg committed
2853

2854 2855 2856
/* Writes the spelling of token to FP, without any preceding space.
   Separated from cpp_spell_token for efficiency - to avoid stdio
   double-buffering.  */
Neil Booth committed
2857
void
2858
cpp_output_token (const cpp_token *token, FILE *fp)
Neil Booth committed
2859 2860
{
  switch (TOKEN_SPELL (token))
Zack Weinberg committed
2861
    {
Neil Booth committed
2862 2863 2864
    case SPELL_OPERATOR:
      {
	const unsigned char *spelling;
2865
	int c;
Zack Weinberg committed
2866

Neil Booth committed
2867
	if (token->flags & DIGRAPH)
2868
	  spelling = cpp_digraph2name (token->type);
Neil Booth committed
2869 2870 2871 2872
	else if (token->flags & NAMED_OP)
	  goto spell_ident;
	else
	  spelling = TOKEN_NAME (token);
Zack Weinberg committed
2873

2874 2875 2876 2877
	c = *spelling;
	do
	  putc (c, fp);
	while ((c = *++spelling) != '\0');
Neil Booth committed
2878 2879
      }
      break;
Zack Weinberg committed
2880

Neil Booth committed
2881 2882
    spell_ident:
    case SPELL_IDENT:
2883 2884
      {
	size_t i;
2885
	const unsigned char * name = NODE_NAME (token->val.node.node);
2886
	
2887
	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2888 2889 2890 2891 2892 2893 2894
	  if (name[i] & ~0x7F)
	    {
	      unsigned char buffer[10];
	      i += utf8_to_ucn (buffer, name + i) - 1;
	      fwrite (buffer, 1, 10, fp);
	    }
	  else
2895
	    fputc (NODE_NAME (token->val.node.node)[i], fp);
2896 2897
      }
      break;
Zack Weinberg committed
2898

2899
    case SPELL_LITERAL:
2900 2901 2902
      fwrite (token->val.str.text, 1, token->val.str.len, fp);
      break;

Neil Booth committed
2903 2904 2905
    case SPELL_NONE:
      /* An error, most probably.  */
      break;
Zack Weinberg committed
2906
    }
Zack Weinberg committed
2907 2908
}

Neil Booth committed
2909 2910
/* Compare two tokens.  */
int
2911
_cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
Zack Weinberg committed
2912
{
Neil Booth committed
2913 2914 2915 2916 2917
  if (a->type == b->type && a->flags == b->flags)
    switch (TOKEN_SPELL (a))
      {
      default:			/* Keep compiler happy.  */
      case SPELL_OPERATOR:
2918
	/* token_no is used to track where multiple consecutive ##
2919
	   tokens were originally located.  */
2920
	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
Neil Booth committed
2921
      case SPELL_NONE:
2922
	return (a->type != CPP_MACRO_ARG
2923 2924
		|| (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
		    && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
Neil Booth committed
2925
      case SPELL_IDENT:
2926 2927
	return (a->val.node.node == b->val.node.node
		&& a->val.node.spelling == b->val.node.spelling);
2928
      case SPELL_LITERAL:
Neil Booth committed
2929 2930 2931 2932
	return (a->val.str.len == b->val.str.len
		&& !memcmp (a->val.str.text, b->val.str.text,
			    a->val.str.len));
      }
Zack Weinberg committed
2933

Zack Weinberg committed
2934 2935 2936
  return 0;
}

Neil Booth committed
2937 2938 2939 2940 2941
/* Returns nonzero if a space should be inserted to avoid an
   accidental token paste for output.  For simplicity, it is
   conservative, and occasionally advises a space where one is not
   needed, e.g. "." and ".2".  */
int
2942 2943
cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
		 const cpp_token *token2)
Zack Weinberg committed
2944
{
Neil Booth committed
2945 2946
  enum cpp_ttype a = token1->type, b = token2->type;
  cppchar_t c;
Zack Weinberg committed
2947

Neil Booth committed
2948 2949 2950 2951
  if (token1->flags & NAMED_OP)
    a = CPP_NAME;
  if (token2->flags & NAMED_OP)
    b = CPP_NAME;
Zack Weinberg committed
2952

Neil Booth committed
2953 2954
  c = EOF;
  if (token2->flags & DIGRAPH)
2955
    c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
Neil Booth committed
2956 2957
  else if (token_spellings[b].category == SPELL_OPERATOR)
    c = token_spellings[b].name[0];
Zack Weinberg committed
2958

Neil Booth committed
2959
  /* Quickly get everything that can paste with an '='.  */
2960
  if ((int) a <= (int) CPP_LAST_EQ && c == '=')
Neil Booth committed
2961
    return 1;
Zack Weinberg committed
2962

Neil Booth committed
2963
  switch (a)
Zack Weinberg committed
2964
    {
2965 2966
    case CPP_GREATER:	return c == '>';
    case CPP_LESS:	return c == '<' || c == '%' || c == ':';
Neil Booth committed
2967 2968 2969 2970 2971 2972 2973 2974
    case CPP_PLUS:	return c == '+';
    case CPP_MINUS:	return c == '-' || c == '>';
    case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
    case CPP_MOD:	return c == ':' || c == '>';
    case CPP_AND:	return c == '&';
    case CPP_OR:	return c == '|';
    case CPP_COLON:	return c == ':' || c == '>';
    case CPP_DEREF:	return c == '*';
2975
    case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
Neil Booth committed
2976 2977 2978 2979 2980 2981 2982
    case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
    case CPP_NAME:	return ((b == CPP_NUMBER
				 && name_p (pfile, &token2->val.str))
				|| b == CPP_NAME
				|| b == CPP_CHAR || b == CPP_STRING); /* L */
    case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
				|| c == '.' || c == '+' || c == '-');
2983
				      /* UCNs */
2984 2985
    case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
				 && b == CPP_NAME)
2986
				|| (CPP_OPTION (pfile, objc)
2987
				    && token1->val.str.text[0] == '@'
2988
				    && (b == CPP_NAME || b == CPP_STRING)));
2989 2990 2991 2992 2993 2994 2995 2996 2997
    case CPP_STRING:
    case CPP_WSTRING:
    case CPP_UTF8STRING:
    case CPP_STRING16:
    case CPP_STRING32:	return (CPP_OPTION (pfile, user_literals)
				&& (b == CPP_NAME
				    || (TOKEN_SPELL (token2) == SPELL_LITERAL
					&& ISIDST (token2->val.str.text[0]))));

Neil Booth committed
2998
    default:		break;
Zack Weinberg committed
2999 3000
    }

Zack Weinberg committed
3001
  return 0;
Zack Weinberg committed
3002 3003
}

Neil Booth committed
3004
/* Output all the remaining tokens on the current line, and a newline
3005 3006
   character, to FP.  Leading whitespace is removed.  If there are
   macros, special token padding is not performed.  */
Zack Weinberg committed
3007
void
3008
cpp_output_line (cpp_reader *pfile, FILE *fp)
Zack Weinberg committed
3009
{
3010
  const cpp_token *token;
3011

3012 3013
  token = cpp_get_token (pfile);
  while (token->type != CPP_EOF)
3014
    {
3015 3016 3017 3018
      cpp_output_token (token, fp);
      token = cpp_get_token (pfile);
      if (token->flags & PREV_WHITE)
	putc (' ', fp);
3019 3020
    }

Neil Booth committed
3021
  putc ('\n', fp);
Zack Weinberg committed
3022
}
Zack Weinberg committed
3023

3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068
/* Build and return a NUL-terminated string holding the spelling of
   all the remaining tokens on the current line.  When DIR_NAME is
   non-null the string starts with "#", DIR_NAME and a space.  The
   result is allocated using xmalloc and must be freed by the
   caller.  */
unsigned char *
cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
{
  const cpp_token *token;
  unsigned int out = 0;
  unsigned int alloced;
  unsigned char *result;

  if (dir_name)
    out = ustrlen (dir_name);
  alloced = 120 + out;
  result = (unsigned char *) xmalloc (alloced);

  /* If DIR_NAME is empty, there are no initial contents.  */
  if (dir_name)
    {
      sprintf ((char *) result, "#%s ", dir_name);
      /* Account for the '#' and the trailing space.  */
      out += 2;
    }

  for (token = cpp_get_token (pfile); token->type != CPP_EOF; )
    {
      /* Include room for a possible space and the terminating nul.  */
      unsigned int needed = out + (cpp_token_len (token) + 2);

      if (needed > alloced)
	{
	  /* Double the buffer, or grow straight to the required size
	     when doubling is still not enough.  */
	  alloced *= 2;
	  if (needed > alloced)
	    alloced = needed;
	  result = (unsigned char *) xrealloc (result, alloced);
	}

      out = cpp_spell_token (pfile, token, &result[out], 0) - result;

      token = cpp_get_token (pfile);
      if (token->flags & PREV_WHITE)
	result[out++] = ' ';
    }

  result[out] = '\0';
  return result;
}

3069 3070 3071 3072 3073 3074 3075
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF's cur
   and limit pointers.

   Every macro parameter is parenthesized in the expansion so that an
   argument containing a lower-precedence operator cannot regroup the
   arithmetic.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif

Neil Booth committed
3084 3085
/* Create a new allocation buffer.  Place the control block at the end
   of the buffer, so that buffer overflows will cause immediate chaos.  */
3086
static _cpp_buff *
3087
new_buff (size_t len)
3088 3089
{
  _cpp_buff *result;
3090
  unsigned char *base;
3091

3092 3093
  if (len < MIN_BUFF_SIZE)
    len = MIN_BUFF_SIZE;
3094
  len = CPP_ALIGN (len);
3095

3096 3097 3098 3099 3100 3101 3102 3103
#ifdef ENABLE_VALGRIND_CHECKING
  /* Valgrind warns about uses of interior pointers, so put _cpp_buff
     struct first.  */
  size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
  base = XNEWVEC (unsigned char, len + slen);
  result = (_cpp_buff *) base;
  base += slen;
#else
3104
  base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3105
  result = (_cpp_buff *) (base + len);
3106
#endif
3107 3108 3109 3110 3111 3112 3113 3114 3115
  result->base = base;
  result->cur = base;
  result->limit = base + len;
  result->next = NULL;
  return result;
}

/* Place a chain of unwanted allocation buffers on the free list.  */
void
3116
_cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127
{
  _cpp_buff *end = buff;

  while (end->next)
    end = end->next;
  end->next = pfile->free_buffs;
  pfile->free_buffs = buff;
}

/* Return a free buffer of size at least MIN_SIZE.  */
_cpp_buff *
3128
_cpp_get_buff (cpp_reader *pfile, size_t min_size)
3129 3130 3131 3132 3133
{
  _cpp_buff *result, **p;

  for (p = &pfile->free_buffs;; p = &(*p)->next)
    {
3134
      size_t size;
3135 3136

      if (*p == NULL)
3137
	return new_buff (min_size);
3138 3139 3140 3141
      result = *p;
      size = result->limit - result->base;
      /* Return a buffer that's big enough, but don't waste one that's
         way too big.  */
3142
      if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3143 3144 3145 3146 3147 3148 3149 3150 3151
	break;
    }

  *p = result->next;
  result->next = NULL;
  result->cur = result->base;
  return result;
}

3152
/* Creates a new buffer with enough space to hold the uncommitted
3153 3154 3155
   remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
   the excess bytes to the new buffer.  Chains the new buffer after
   BUFF, and returns the new buffer.  */
3156
_cpp_buff *
3157
_cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3158
{
3159
  size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3160
  _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3161

3162 3163 3164 3165 3166
  buff->next = new_buff;
  memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
  return new_buff;
}

3167
/* Creates a new buffer with enough space to hold the uncommitted
3168 3169 3170 3171 3172
   remaining bytes of the buffer pointed to by BUFF, and at least
   MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
   Chains the new buffer before the buffer pointed to by BUFF, and
   updates the pointer to point to the new buffer.  */
void
3173
_cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3174 3175 3176 3177 3178 3179 3180 3181
{
  _cpp_buff *new_buff, *old_buff = *pbuff;
  size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);

  new_buff = _cpp_get_buff (pfile, size);
  memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
  new_buff->next = old_buff;
  *pbuff = new_buff;
3182 3183 3184 3185
}

/* Free a chain of buffers starting at BUFF.  */
void
3186
_cpp_free_buff (_cpp_buff *buff)
3187 3188 3189 3190 3191 3192
{
  _cpp_buff *next;

  for (; buff; buff = next)
    {
      next = buff->next;
3193 3194 3195
#ifdef ENABLE_VALGRIND_CHECKING
      free (buff);
#else
3196
      free (buff->base);
3197
#endif
3198 3199
    }
}
Zack Weinberg committed
3200

3201 3202
/* Allocate permanent, unaligned storage of length LEN.  */
unsigned char *
3203
_cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219
{
  _cpp_buff *buff = pfile->u_buff;
  unsigned char *result = buff->cur;

  if (len > (size_t) (buff->limit - result))
    {
      buff = _cpp_get_buff (pfile, len);
      buff->next = pfile->u_buff;
      pfile->u_buff = buff;
      result = buff->cur;
    }

  buff->cur = result + len;
  return result;
}

3220 3221 3222 3223 3224 3225 3226 3227 3228 3229
/* Allocate permanent, unaligned storage of length LEN from a_buff.
   That buffer is used for growing allocations when saving macro
   replacement lists in a #define, and when parsing an answer to an
   assertion in #assert, #unassert or #if (and therefore possibly
   whilst expanding macros).  It therefore must not be used by any
   code that they might call: specifically the lexer and the guts of
   the macro expander.

   All existing other uses clearly fit this restriction: storing
   registered pragmas during initialization.  */
Neil Booth committed
3230
unsigned char *
3231
_cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3232
{
3233 3234
  _cpp_buff *buff = pfile->a_buff;
  unsigned char *result = buff->cur;
3235

3236
  if (len > (size_t) (buff->limit - result))
3237
    {
3238 3239 3240 3241
      buff = _cpp_get_buff (pfile, len);
      buff->next = pfile->a_buff;
      pfile->a_buff = buff;
      result = buff->cur;
3242
    }
Zack Weinberg committed
3243

3244
  buff->cur = result + len;
Neil Booth committed
3245
  return result;
Zack Weinberg committed
3246
}
Geoffrey Keating committed
3247 3248 3249 3250

/* Say which field of TOK is in use.  */

enum cpp_token_fld_kind
3251
cpp_token_val_index (const cpp_token *tok)
Geoffrey Keating committed
3252 3253 3254 3255 3256 3257 3258
{
  switch (TOKEN_SPELL (tok))
    {
    case SPELL_IDENT:
      return CPP_TOKEN_FLD_NODE;
    case SPELL_LITERAL:
      return CPP_TOKEN_FLD_STR;
3259 3260
    case SPELL_OPERATOR:
      if (tok->type == CPP_PASTE)
3261
	return CPP_TOKEN_FLD_TOKEN_NO;
3262 3263
      else
	return CPP_TOKEN_FLD_NONE;
Geoffrey Keating committed
3264 3265 3266 3267 3268
    case SPELL_NONE:
      if (tok->type == CPP_MACRO_ARG)
	return CPP_TOKEN_FLD_ARG_NO;
      else if (tok->type == CPP_PADDING)
	return CPP_TOKEN_FLD_SOURCE;
3269
      else if (tok->type == CPP_PRAGMA)
3270
	return CPP_TOKEN_FLD_PRAGMA;
Geoffrey Keating committed
3271 3272 3273 3274 3275
      /* else fall through */
    default:
      return CPP_TOKEN_FLD_NONE;
    }
}
3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293

/* Record P in reader R as the forced token location.  All tokens
   lexed in R after this call will be forced to have the
   source_location referenced by P, until
   cpp_stop_forcing_token_locations is called for R.  */

void
cpp_force_token_locations (cpp_reader *r, source_location *p)
{
  r->forced_token_location_p = p;
}

/* Clear the forced token location of reader R, so that tokens lexed
   afterwards are assigned their locations naturally again.  */

void
cpp_stop_forcing_token_locations (cpp_reader *r)
{
  r->forced_token_location_p = NULL;
}