/* CPP Library - charsets
   Copyright (C) 1998-2015 Free Software Foundation, Inc.

   Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "cpplib.h"
#include "internal.h"

/* Character set handling for C-family languages.

   Terminological note: In what follows, "charset" or "character set"
   will be taken to mean both an abstract set of characters and an
   encoding for that set.

   The C99 standard discusses two character sets: source and execution.
   The source character set is used for internal processing in translation
   phases 1 through 4; the execution character set is used thereafter.
   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
   of these terms).  Furthermore, the "basic character set" (listed in
   5.2.1p3) is to be encoded in each with values one byte wide, and is
   to appear in the initial shift state.

   It is not explicitly mentioned, but there is also a "wide execution
   character set" used to encode wide character constants and wide
   string literals; this is supposed to be the result of applying the
   standard library function mbstowcs() to an equivalent narrow string
   (6.4.5p5).  However, the behavior of hexadecimal and octal
   \-escapes is at odds with this; they are supposed to be translated
   directly to wchar_t values (6.4.4.4p5,6).

   The source character set is not necessarily the character set used
   to encode physical source files on disk; translation phase 1 converts
   from whatever that encoding is to the source character set.

   The presence of universal character names in C99 (6.4.3 et seq.)
   forces the source character set to be isomorphic to ISO 10646,
   that is, Unicode.  There is no such constraint on the execution
   character set; note also that the conversion from source to
   execution character set does not occur for identifiers (5.1.1.2p1#5).

   For convenience of implementation, the source character set's
   encoding of the basic character set should be identical to the
   execution character set OF THE HOST SYSTEM's encoding of the basic
   character set, and it should not be a state-dependent encoding.

   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
   depending on whether the host is based on ASCII or EBCDIC (see
   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
   Technical Report #16).  With limited exceptions, it relies on the
   system library's iconv() primitive to do charset conversion
   (specified in SUSv2).  */

#if !HAVE_ICONV
/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
   below, which are guarded only by if statements with compile-time
   constant conditions, do not cause link errors.  Each stub fails the
   way the real function would: it sets errno to EINVAL and yields the
   function's error value.  */
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
#define iconv_close(x)   (void)0
#define ICONV_CONST
#endif

/* Select the source character set name and the largest value that can
   possibly be a member of the basic source character set, according
   to the host's basic character set.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif

/* If the host does not define EILSEQ, report ill-formed input
   sequences as EINVAL instead.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif

94
/* This structure is used for a resizable string buffer throughout.  */
95
/* Don't call it strbuf, as that conflicts with unistd.h on systems
96
   such as DYNIX/ptx where unistd.h includes stropts.h.  */
97
struct _cpp_strbuf
98 99 100 101 102 103 104 105
{
  uchar *text;
  size_t asize;
  size_t len;
};

/* This is enough to hold any string that fits on a single 80-column
   line, even if iconv quadruples its size (e.g. conversion from
   ASCII to UTF-32) rounded up to a power of two.  Output buffers are
   grown by this many bytes whenever a converter reports E2BIG.  */
#define OUTBUF_BLOCK_SIZE 256

/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
   logic.  This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
   we need a fallback implementation for them.  To ensure the fallback
   doesn't break due to neglect, it is used on all systems.

   UTF-32 encoding is nice and simple: a four-byte binary number,
   constrained to the range 00000000-7FFFFFFF to avoid questions of
   signedness.  We do have to cope with big- and little-endian
   variants.

   UTF-16 encoding uses two-byte binary numbers, again in big- and
   little-endian variants, for all values in the 00000000-0000FFFF
   range.  Values in the 00010000-0010FFFF range are encoded as pairs
   of two-byte numbers, called "surrogate pairs": given a number S in
   this range, it is mapped to a pair (H, L) as follows:

     H = (S - 0x10000) / 0x400 + 0xD800
     L = (S - 0x10000) % 0x400 + 0xDC00

   Two-byte values in the D800...DFFF range are ill-formed except as a
   component of a surrogate pair.  Even if the encoding within a
   two-byte value is little-endian, the H member of the surrogate pair
   comes first.

   There is no way to encode values in the 00110000-7FFFFFFF range,
   which is not currently a problem as there are no assigned code
   points in that range; however, the author expects that it will
   eventually become necessary to abandon UTF-16 due to this
   limitation.  Note also that, because of these pairs, UTF-16 does
   not meet the requirements of the C standard for a wide character
   encoding (see 3.7.3 and 6.4.4.4p11).

   UTF-8 encoding looks like this:

   value range	       encoded as
   00000000-0000007F   0xxxxxxx
   00000080-000007FF   110xxxxx 10xxxxxx
   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
   never occur.  Note also that any value that can be encoded by a
   given row of the table can also be encoded by all successive rows,
   but this is not done; only the shortest possible encoding for any
   given value is valid.  For instance, the character 07C0 could be
   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
   FC 80 80 80 9F 80.  Only the first is valid.

   An implementation note: the transformation from UTF-16 to UTF-8, or
   vice versa, is easiest done by using UTF-32 as an intermediary.  */

/* Internal primitives which go from an UTF-8 byte stream to native-endian
   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
   operation in several places below.

   one_utf8_to_cppchar decodes the single UTF-8 sequence starting at
   *INBUFP, of which *INBYTESLEFTP bytes are available, and stores the
   decoded value in *CP.  On success it returns 0, advances *INBUFP past
   the sequence and reduces *INBYTESLEFTP accordingly.  On failure none
   of the three is modified and the return value is EINVAL (the input
   ends before the sequence is complete) or EILSEQ (bad lead or
   continuation byte, over-long encoding, or a surrogate value).  */
static inline int
one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
		     cppchar_t *cp)
{
  /* Indexed by sequence length minus one: MASKS selects the payload
     bits of the lead byte, PATNS is the lead byte's fixed prefix.  */
  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

  cppchar_t c;
  const uchar *inbuf = *inbufp;
  size_t nbytes, i;

  if (*inbytesleftp < 1)
    return EINVAL;

  /* Fast path: a single byte below 0x80 stands for itself.  */
  c = *inbuf;
  if (c < 0x80)
    {
      *cp = c;
      *inbytesleftp -= 1;
      *inbufp += 1;
      return 0;
    }

  /* The number of leading 1-bits in the first byte indicates how many
     bytes follow.  */
  for (nbytes = 2; nbytes < 7; nbytes++)
    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
      goto found;
  return EILSEQ;
 found:

  if (*inbytesleftp < nbytes)
    return EINVAL;

  /* Accumulate six payload bits from each continuation byte.  */
  c = (c & masks[nbytes-1]);
  inbuf++;
  for (i = 1; i < nbytes; i++)
    {
      cppchar_t n = *inbuf++;
      if ((n & 0xC0) != 0x80)
	return EILSEQ;
      c = ((c << 6) + (n & 0x3F));
    }

  /* Make sure the shortest possible encoding was used.  */
  if (c <=      0x7F && nbytes > 1) return EILSEQ;
  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;

  /* Make sure the character is valid.  */
  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;

  *cp = c;
  *inbufp = inbuf;
  *inbytesleftp -= nbytes;
  return 0;
}

/* Encode the value C as UTF-8 at *OUTBUFP, which has room for
   *OUTBYTESLEFTP bytes.  On success return 0, advance *OUTBUFP past the
   bytes written and reduce *OUTBYTESLEFTP by their count.  Return
   E2BIG, modifying nothing, if the output does not have enough room.
   Return EILSEQ, modifying nothing, if C exceeds 0x7FFFFFFF, the
   largest value the six-byte form can hold.  */
static inline int
one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
{
  /* Indexed by sequence length minus one: MASKS is the lead byte's
     fixed prefix; LIMITS holds the bits that must be clear in the
     remaining value for it to fit in a lead byte of that length.  */
  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
  size_t nbytes;
  uchar buf[6], *p = &buf[6];
  uchar *outbuf = *outbufp;

  /* A larger value would need a seventh byte: the loop below would
     then index past the end of LIMITS and store before the start of
     BUF.  */
  if (c > 0x7FFFFFFF)
    return EILSEQ;

  /* Build the sequence backwards in BUF, last byte first.  */
  nbytes = 1;
  if (c < 0x80)
    *--p = c;
  else
    {
      do
	{
	  *--p = ((c & 0x3F) | 0x80);
	  c >>= 6;
	  nbytes++;
	}
      while (c >= 0x3F || (c & limits[nbytes-1]));
      *--p = (c | masks[nbytes-1]);
    }

  if (*outbytesleftp < nbytes)
    return E2BIG;

  while (p < &buf[6])
    *outbuf++ = *p++;
  *outbytesleftp -= nbytes;
  *outbufp = outbuf;
  return 0;
}

/* The following four functions transform one character between the two
   encodings named in the function name.  All have the signature
   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
           uchar **outbufp, size_t *outbytesleftp)

   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
   interpreted as a boolean indicating whether big-endian or
   little-endian encoding is to be used for the member of the pair
   that is not UTF-8.

   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
   do for iconv.

   The return value is either 0 for success, or an errno value for
   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */

278 279 280 281 282
static inline int
one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
		   uchar **outbufp, size_t *outbytesleftp)
{
  uchar *outbuf;
283
  cppchar_t s = 0;
284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
  int rval;

  /* Check for space first, since we know exactly how much we need.  */
  if (*outbytesleftp < 4)
    return E2BIG;

  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
  if (rval)
    return rval;

  outbuf = *outbufp;
  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;

  *outbufp += 4;
  *outbytesleftp -= 4;
  return 0;
}

/* Convert one four-byte UTF-32 unit, big-endian if BIGEND is nonzero
   and little-endian otherwise, to UTF-8.  Returns EINVAL if fewer than
   four input bytes remain, EILSEQ if the value is above 0x7FFFFFFF or
   is a surrogate, and otherwise whatever one_cppchar_to_utf8 returns.
   The input position is only advanced on success.  */
static inline int
one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
		   uchar **outbufp, size_t *outbytesleftp)
{
  cppchar_t s;
  int rval;
  const uchar *inbuf;

  if (*inbytesleftp < 4)
    return EINVAL;

  inbuf = *inbufp;

  /* Widen each byte before shifting: a uchar promotes to signed int,
     and shifting a byte >= 0x80 left by 24 would overflow it.  */
  s  = (cppchar_t) inbuf[bigend ? 0 : 3] << 24;
  s += (cppchar_t) inbuf[bigend ? 1 : 2] << 16;
  s += (cppchar_t) inbuf[bigend ? 2 : 1] << 8;
  s += inbuf[bigend ? 3 : 0];

  /* 0x7FFFFFFF itself is the top of the permitted range and is
     accepted, matching one_utf8_to_cppchar.  */
  if (s > 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
    return EILSEQ;

  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
  if (rval)
    return rval;

  *inbufp += 4;
  *inbytesleftp -= 4;
  return 0;
}

/* Convert one UTF-8 sequence to UTF-16: a single two-byte unit for
   values up to 0xFFFF, a surrogate pair for values up to 0x10FFFF.
   Units are stored big-endian if BIGEND is nonzero, little-endian
   otherwise.  Values above 0x10FFFF yield EILSEQ.  On EILSEQ for an
   out-of-range value and on E2BIG the input position is restored, so
   the same character can be retried.  */
static inline int
one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
		   uchar **outbufp, size_t *outbytesleftp)
{
  int rval;
  cppchar_t s = 0;
  const uchar *save_inbuf = *inbufp;
  size_t save_inbytesleft = *inbytesleftp;
  uchar *outbuf = *outbufp;

  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
  if (rval)
    return rval;

  if (s > 0x0010FFFF)
    {
      *inbufp = save_inbuf;
      *inbytesleftp = save_inbytesleft;
      return EILSEQ;
    }

  if (s <= 0xFFFF)
    {
      if (*outbytesleftp < 2)
	{
	  *inbufp = save_inbuf;
	  *inbytesleftp = save_inbytesleft;
	  return E2BIG;
	}
      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;

      *outbufp += 2;
      *outbytesleftp -= 2;
      return 0;
    }
  else
    {
      cppchar_t hi, lo;

      if (*outbytesleftp < 4)
	{
	  *inbufp = save_inbuf;
	  *inbytesleftp = save_inbytesleft;
	  return E2BIG;
	}

      hi = (s - 0x10000) / 0x400 + 0xD800;
      lo = (s - 0x10000) % 0x400 + 0xDC00;

      /* Even if we are little-endian, put the high surrogate first.
	 ??? Matches practice?  */
      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;

      *outbufp += 4;
      *outbytesleftp -= 4;
      return 0;
    }
}

/* Convert one UTF-16 character (a single two-byte unit, or a surrogate
   pair) to UTF-8.  Units are read big-endian if BIGEND is nonzero,
   little-endian otherwise.  Returns EINVAL if the input ends inside
   the character, EILSEQ for an unpaired surrogate, and otherwise
   whatever one_cppchar_to_utf8 returns.  The input position is only
   advanced on success.  */
static inline int
one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
		   uchar **outbufp, size_t *outbytesleftp)
{
  const uchar *src = *inbufp;
  /* Offsets, within one unit, of its high-order and low-order byte.  */
  const int msb = bigend ? 0 : 1;
  const int lsb = bigend ? 1 : 0;
  size_t consumed = 2;
  cppchar_t unit, scalar;
  int status;

  if (*inbytesleftp < 2)
    return EINVAL;

  unit  = src[msb] << 8;
  unit += src[lsb];
  scalar = unit;

  /* Low surrogate without immediately preceding high surrogate is
     invalid.  */
  if (unit >= 0xDC00 && unit <= 0xDFFF)
    return EILSEQ;

  /* High surrogate must have a following low surrogate.  */
  if (unit >= 0xD800 && unit <= 0xDBFF)
    {
      cppchar_t trail;

      if (*inbytesleftp < 4)
	return EINVAL;

      trail  = src[2 + msb] << 8;
      trail += src[2 + lsb];

      if (trail < 0xDC00 || trail > 0xDFFF)
	return EILSEQ;

      scalar = (unit - 0xD800) * 0x400 + (trail - 0xDC00) + 0x10000;
      consumed = 4;
    }

  status = one_cppchar_to_utf8 (scalar, outbufp, outbytesleftp);
  if (status)
    return status;

  /* Success - update the input pointers (one_cppchar_to_utf8 has done
     the output pointers for us).  */
  *inbufp += consumed;
  *inbytesleftp -= consumed;
  return 0;
}

/* Helper routine for the next few functions.  The 'const' on
   one_conversion means that we promise not to modify what function is
451
   pointed to, which lets the inliner see through it.  */
452 453 454 455

static inline bool
conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
					     uchar **, size_t *),
456
		 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487
{
  const uchar *inbuf;
  uchar *outbuf;
  size_t inbytesleft, outbytesleft;
  int rval;

  inbuf = from;
  inbytesleft = flen;
  outbuf = to->text + to->len;
  outbytesleft = to->asize - to->len;

  for (;;)
    {
      do
	rval = one_conversion (cd, &inbuf, &inbytesleft,
			       &outbuf, &outbytesleft);
      while (inbytesleft && !rval);

      if (__builtin_expect (inbytesleft == 0, 1))
	{
	  to->len = to->asize - outbytesleft;
	  return true;
	}
      if (rval != E2BIG)
	{
	  errno = rval;
	  return false;
	}

      outbytesleft += OUTBUF_BLOCK_SIZE;
      to->asize += OUTBUF_BLOCK_SIZE;
488
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
489 490 491
      outbuf = to->text + to->asize - outbytesleft;
    }
}
492

/* These functions convert entire strings between character sets.
   They all have the signature

   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);

   The input string FROM is converted as specified by the function
   name plus the iconv descriptor CD (which may be fake), and the
   result appended to TO.  On any error, false is returned, otherwise true.  */

/* These four use the custom conversion code above.  */
/* UTF-8 to UTF-16; CD is passed through to one_utf8_to_utf16 as its
   BIGEND flag.  */
static bool
convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
		    struct _cpp_strbuf *to)
{
  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
}

/* UTF-8 to UTF-32; CD is passed through to one_utf8_to_utf32 as its
   BIGEND flag.  */
static bool
convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
		    struct _cpp_strbuf *to)
{
  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
}

/* UTF-16 to UTF-8; CD is passed through to one_utf16_to_utf8 as its
   BIGEND flag.  */
static bool
convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
		    struct _cpp_strbuf *to)
{
  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
}

/* UTF-32 to UTF-8; CD is passed through to one_utf32_to_utf8 as its
   BIGEND flag.  */
static bool
convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
		    struct _cpp_strbuf *to)
{
  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
}

/* Identity conversion, used when we have no alternative.  */
static bool
convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
535
		       const uchar *from, size_t flen, struct _cpp_strbuf *to)
536 537 538 539
{
  if (to->len + flen > to->asize)
    {
      to->asize = to->len + flen;
540
      to->asize += to->asize / 4;
541
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
542 543 544 545 546 547 548 549
    }
  memcpy (to->text + to->len, from, flen);
  to->len += flen;
  return true;
}

/* And this one uses the system iconv primitive.  It's a little
   different, since iconv's interface is a little different.  */
#if HAVE_ICONV

/* Enlarge TO by OUTBUF_BLOCK_SIZE bytes and re-derive OUTBUF from the
   (possibly moved) buffer.  Uses the locals TO, OUTBUF and OUTBYTESLEFT
   of convert_using_iconv.  */
#define CONVERT_ICONV_GROW_BUFFER \
  do { \
      outbytesleft += OUTBUF_BLOCK_SIZE; \
      to->asize += OUTBUF_BLOCK_SIZE; \
      to->text = XRESIZEVEC (uchar, to->text, to->asize); \
      outbuf = (char *)to->text + to->asize - outbytesleft; \
  } while (0)

/* Convert the FLEN bytes at FROM with the iconv descriptor CD and
   append the result to TO, growing TO whenever iconv reports E2BIG.
   Returns true once all input has been consumed and the shift state
   has been closed out; returns false on any other iconv failure.  */
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
		     struct _cpp_strbuf *to)
{
  ICONV_CONST char *inbuf;
  char *outbuf;
  size_t inbytesleft, outbytesleft;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  inbuf = (ICONV_CONST char *)from;
  inbytesleft = flen;
  outbuf = (char *)to->text + to->len;
  outbytesleft = to->asize - to->len;

  for (;;)
    {
      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
      if (__builtin_expect (inbytesleft == 0, 1))
	{
	  /* Close out any shift states, returning to the initial state.  */
	  if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
	    {
	      if (errno != E2BIG)
		return false;

	      /* Retry once with more room.  */
	      CONVERT_ICONV_GROW_BUFFER;
	      if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
		return false;
	    }

	  to->len = to->asize - outbytesleft;
	  return true;
	}
      if (errno != E2BIG)
	return false;

      CONVERT_ICONV_GROW_BUFFER;
    }
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif

/* Arrange for the above custom conversion logic to be used automatically
   when conversion between a suitable pair of character sets is requested.  */

/* Call CONVERTER's conversion function with its own descriptor on the
   FLEN bytes at FROM, appending to TO.  The arguments and the whole
   expansion are parenthesized so that any expression may be passed and
   the result used inside a larger expression.  CONVERTER is evaluated
   twice, so it must not have side effects.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))

612
struct cpp_conversion
613 614 615 616 617
{
  const char *pair;
  convert_f func;
  iconv_t fake_cd;
};
618
static const struct cpp_conversion conversion_tab[] = {
619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634
  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
};

/* Subroutine of cpp_init_iconv: initialize and return a
   cset_converter structure for conversion from FROM to TO.  If
   iconv_open() fails, issue an error and return an identity
   converter.  Silently return an identity converter if FROM and TO
   are identical.  */
static struct cset_converter
635 636
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
{
637 638 639
  struct cset_converter ret;
  char *pair;
  size_t i;
640

641 642 643 644
  if (!strcasecmp (to, from))
    {
      ret.func = convert_no_conversion;
      ret.cd = (iconv_t) -1;
645
      ret.width = -1;
646 647 648
      return ret;
    }

649
  pair = (char *) alloca(strlen(to) + strlen(from) + 2);
650

651 652 653 654 655 656 657 658
  strcpy(pair, from);
  strcat(pair, "/");
  strcat(pair, to);
  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
    if (!strcasecmp (pair, conversion_tab[i].pair))
      {
	ret.func = conversion_tab[i].func;
	ret.cd = conversion_tab[i].fake_cd;
659
	ret.width = -1;
660 661
	return ret;
      }
662

663
  /* No custom converter - try iconv.  */
664
  if (HAVE_ICONV)
665
    {
666 667
      ret.func = convert_using_iconv;
      ret.cd = iconv_open (to, from);
668
      ret.width = -1;
669

670 671 672
      if (ret.cd == (iconv_t) -1)
	{
	  if (errno == EINVAL)
673
	    cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
674 675 676
		       "conversion from %s to %s not supported by iconv",
		       from, to);
	  else
677
	    cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
678 679 680 681 682 683

	  ret.func = convert_no_conversion;
	}
    }
  else
    {
684
      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
685 686
		 "no iconv implementation, cannot convert from %s to %s",
		 from, to);
687
      ret.func = convert_no_conversion;
688
      ret.cd = (iconv_t) -1;
689
      ret.width = -1;
690
    }
691
  return ret;
692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708
}

/* If charset conversion is requested, initialize iconv(3) descriptors
   for conversion from the source character set to the execution
   character sets.  If iconv is not present in the C library, and
   conversion is requested, issue an error.

   Sets up five converters in PFILE, all converting from
   SOURCE_CHARSET: narrow, UTF-8, char16, char32 and wide.  The narrow
   and wide targets come from the narrow_charset and wide_charset
   options, with defaults chosen below when those are unset.  */

void
cpp_init_iconv (cpp_reader *pfile)
{
  const char *ncset = CPP_OPTION (pfile, narrow_charset);
  const char *wcset = CPP_OPTION (pfile, wide_charset);
  const char *default_wcset;

  bool be = CPP_OPTION (pfile, bytes_big_endian);

  /* Pick the default wide charset from the wchar precision and the
     target byte order.  */
  if (CPP_OPTION (pfile, wchar_precision) >= 32)
    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
  else if (CPP_OPTION (pfile, wchar_precision) >= 16)
    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
  else
    /* This effectively means that wide strings are not supported,
       so don't do any conversion at all.  */
    default_wcset = SOURCE_CHARSET;

  if (!ncset)
    ncset = SOURCE_CHARSET;
  if (!wcset)
    wcset = default_wcset;

  /* init_iconv_desc leaves WIDTH at -1, so each descriptor's width is
     set right after it is created.  */
  pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
  pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
  pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET);
  pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
  pfile->char16_cset_desc = init_iconv_desc (pfile,
					     be ? "UTF-16BE" : "UTF-16LE",
					     SOURCE_CHARSET);
  pfile->char16_cset_desc.width = 16;
  pfile->char32_cset_desc = init_iconv_desc (pfile,
					     be ? "UTF-32BE" : "UTF-32LE",
					     SOURCE_CHARSET);
  pfile->char32_cset_desc.width = 32;
  pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
  pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
}

738
/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
739 740 741 742 743
void
_cpp_destroy_iconv (cpp_reader *pfile)
{
  if (HAVE_ICONV)
    {
744 745
      if (pfile->narrow_cset_desc.func == convert_using_iconv)
	iconv_close (pfile->narrow_cset_desc.cd);
746 747 748 749 750 751
      if (pfile->utf8_cset_desc.func == convert_using_iconv)
	iconv_close (pfile->utf8_cset_desc.cd);
      if (pfile->char16_cset_desc.func == convert_using_iconv)
	iconv_close (pfile->char16_cset_desc.cd);
      if (pfile->char32_cset_desc.func == convert_using_iconv)
	iconv_close (pfile->char32_cset_desc.cd);
752 753
      if (pfile->wide_cset_desc.func == convert_using_iconv)
	iconv_close (pfile->wide_cset_desc.cd);
754 755 756
    }
}

757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792
/* Utility routine for use by a full compiler.  C is a character taken
   from the *basic* source character set, encoded in the host's
   execution encoding.  Convert it to (the target's) execution
   encoding, and return that value.

   Issues an internal error if C's representation in the narrow
   execution character set fails to be a single-byte value (C99
   5.2.1p3: "The representation of each member of the source and
   execution character sets shall fit in a byte.")  May also issue an
   internal error if C fails to be a member of the basic source
   character set (testing this exactly is too hard, especially when
   the host character set is EBCDIC).  */
cppchar_t
cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
{
  uchar sbuf[1];
  struct _cpp_strbuf tbuf;

  /* This test is merely an approximation, but it suffices to catch
     the most important thing, which is that we don't get handed a
     character outside the unibyte range of the host character set.  */
  if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
    {
      cpp_error (pfile, CPP_DL_ICE,
		 "character 0x%lx is not in the basic source character set\n",
		 (unsigned long)c);
      return 0;
    }

  /* Being a character in the unibyte range of the host character set,
     we can safely splat it into a one-byte buffer and trust that that
     is a well-formed string.  */
  sbuf[0] = c;

  /* This should never need to reallocate, but just in case... */
  tbuf.asize = 1;
793
  tbuf.text = XNEWVEC (uchar, tbuf.asize);
794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813
  tbuf.len = 0;

  if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
    {
      cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
      return 0;
    }
  if (tbuf.len != 1)
    {
      cpp_error (pfile, CPP_DL_ICE,
		 "character 0x%lx is not unibyte in execution character set",
		 (unsigned long)c);
      return 0;
    }
  c = tbuf.text[0];
  free(tbuf.text);
  return c;
}


814 815 816 817 818 819 820 821 822 823 824 825 826

/* Return a mask whose low WIDTH bits are set and whose remaining bits
   are clear.  WIDTH is first clamped to BITS_PER_CPPCHAR_T; a width
   that covers all of size_t yields an all-ones mask, which avoids
   shifting by the full width of the type.  */
static inline size_t
width_to_mask (size_t width)
{
  const size_t size_t_bits = CHAR_BIT * sizeof (size_t);
  size_t clamped = MIN (width, BITS_PER_CPPCHAR_T);

  return (clamped >= size_t_bits
	  ? ~(size_t) 0
	  : ((size_t) 1 << clamped) - 1);
}

/* A large table of unicode character information.  */
enum {
  /* Valid in a C99 identifier?  */
  C99 = 1,
  /* Valid in a C99 identifier, but not as the first character?  */
  N99 = 2,
  /* Valid in a C++ identifier?  */
  CXX = 4,
  /* Valid in a C11/C++11 identifier?  */
  C11 = 8,
  /* Valid in a C11/C++11 identifier, but not as the first character?  */
  N11 = 16,
  /* NFC representation is not valid in an identifier?  */
  CID = 32,
  /* Might be valid NFC form?  */
  NFC = 64,
  /* Might be valid NFKC form?  */
  NKC = 128,
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
  CTX = 256
};

/* One entry of the table of character ranges.  Each entry records only
   the last character of its range.  */
struct ucnrange {
  /* Bitmap of flags above.  */
  unsigned short flags;
  /* Combining class of the character.  */
  unsigned char combine;
  /* Last character in the range described by this entry.  */
  unsigned int end;
};
#include "ucnid.h"

/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
   the start of an identifier, and 0 if C is not valid in an
   identifier.  We assume C has already gone through the checks of
   _cpp_valid_ucn.  Also update NST for C if returning nonzero.  The
   algorithm is a simple binary search on the table defined in
   ucnid.h.  */

static int
Geoffrey Keating committed
867 868
ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
			 struct normalize_state *nst)
869 870
{
  int mn, mx, md;
Joseph Myers committed
871
  unsigned short valid_flags, invalid_start_flags;
872

Joseph Myers committed
873
  if (c > 0x10FFFF)
Geoffrey Keating committed
874 875 876 877 878
    return 0;

  mn = 0;
  mx = ARRAY_SIZE (ucnranges) - 1;
  while (mx != mn)
879 880
    {
      md = (mn + mx) / 2;
Geoffrey Keating committed
881
      if (c <= ucnranges[md].end)
882 883
	mx = md;
      else
Geoffrey Keating committed
884
	mn = md + 1;
885
    }
886

887 888
  /* When -pedantic, we require the character to have been listed by
     the standard for the current language.  Otherwise, we accept the
Joseph Myers committed
889 890 891 892 893 894 895 896 897 898 899 900
     union of the acceptable sets for all supported language versions.  */
  valid_flags = C99 | CXX | C11;
  if (CPP_PEDANTIC (pfile))
    {
      if (CPP_OPTION (pfile, c11_identifiers))
	valid_flags = C11;
      else if (CPP_OPTION (pfile, c99))
	valid_flags = C99;
      else if (CPP_OPTION (pfile, cplusplus))
	valid_flags = CXX;
    }
  if (! (ucnranges[mn].flags & valid_flags))
Geoffrey Keating committed
901
      return 0;
Joseph Myers committed
902 903 904 905 906 907
  if (CPP_OPTION (pfile, c11_identifiers))
    invalid_start_flags = N11;
  else if (CPP_OPTION (pfile, c99))
    invalid_start_flags = N99;
  else
    invalid_start_flags = 0;
908

Geoffrey Keating committed
909 910 911 912 913 914 915 916 917 918 919 920 921 922 923
  /* Update NST.  */
  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
    nst->level = normalized_none;
  else if (ucnranges[mn].flags & CTX)
    {
      bool safe;
      cppchar_t p = nst->previous;

      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
	 and are combined algorithmically from a sequence of the form
	 1100-1112 1161-1175 11A8-11C2
	 (if the third is not present, it is treated as 11A7, which is not
	 really a valid character).
	 Unfortunately, C99 allows (only) the NFC form, but C++ allows
	 only the combining characters.  */
Joseph Myers committed
924
      if (c >= 0x1161 && c <= 0x1175)
Geoffrey Keating committed
925 926 927 928
	safe = p < 0x1100 || p > 0x1112;
      else if (c >= 0x11A8 && c <= 0x11C2)
	safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
      else
Joseph Myers committed
929 930
	safe = check_nfc (pfile, c, p);
      if (!safe)
Geoffrey Keating committed
931
	{
Joseph Myers committed
932 933 934 935
	  if ((c >= 0x1161 && c <= 0x1175) || (c >= 0x11A8 && c <= 0x11C2))
	    nst->level = MAX (nst->level, normalized_identifier_C);
	  else
	    nst->level = normalized_none;
Geoffrey Keating committed
936 937 938 939 940 941 942 943 944 945
	}
    }
  else if (ucnranges[mn].flags & NKC)
    ;
  else if (ucnranges[mn].flags & NFC)
    nst->level = MAX (nst->level, normalized_C);
  else if (ucnranges[mn].flags & CID)
    nst->level = MAX (nst->level, normalized_identifier_C);
  else
    nst->level = normalized_none;
Joseph Myers committed
946 947
  if (ucnranges[mn].combine == 0)
    nst->previous = c;
Geoffrey Keating committed
948 949
  nst->prev_class = ucnranges[mn].combine;

Joseph Myers committed
950 951 952
  /* In C99, UCN digits may not begin identifiers.  In C11 and C++11,
     UCN combining characters may not begin identifiers.  */
  if (ucnranges[mn].flags & invalid_start_flags)
953 954 955 956
    return 2;

  return 1;
}
957 958 959 960 961 962

/* [lex.charset]: The character designated by the universal character
   name \UNNNNNNNN is that character whose character short name in
   ISO/IEC 10646 is NNNNNNNN; the character designated by the
   universal character name \uNNNN is that character whose character
   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
963 964 965 966 967 968 969 970 971 972
   for a universal character name corresponds to a surrogate code point
   (in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
   Additionally, if the hexadecimal value for a universal-character-name
   outside a character or string literal corresponds to a control character
   (in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
   character in the basic source character set, the program is ill-formed.

   C99 6.4.3: A universal character name shall not specify a character
   whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
   or 0060 (`), nor one in the range D800 through DFFF inclusive.
973 974

   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
975 976
   buffer end is delimited by a non-hex digit.  Returns false if the
   UCN has not been consumed, true otherwise.
977

978 979 980
   The value of the UCN, whether valid or invalid, is returned in *CP.
   Diagnostics are emitted for invalid values.  PSTR is updated to point
   one beyond the UCN, or to the syntactically invalid character.
981 982

   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
983
   an identifier, or 2 otherwise.  */
984

985
bool
986
_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
Geoffrey Keating committed
987
		const uchar *limit, int identifier_pos,
988
		struct normalize_state *nst, cppchar_t *cp)
989 990 991 992 993 994 995
{
  cppchar_t result, c;
  unsigned int length;
  const uchar *str = *pstr;
  const uchar *base = str - 2;

  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
996
    cpp_error (pfile, CPP_DL_WARNING,
997
	       "universal character names are only valid in C++ and C99");
998
  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
999 1000 1001
	   && !CPP_OPTION (pfile, cplusplus))
    cpp_error (pfile, CPP_DL_WARNING,
	       "C99's universal character names are incompatible with C90");
1002
  else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
1003 1004 1005
    cpp_warning (pfile, CPP_W_TRADITIONAL,
	         "the meaning of '\\%c' is different in traditional C",
	         (int) str[-1]);
1006 1007 1008 1009 1010 1011

  if (str[-1] == 'u')
    length = 4;
  else if (str[-1] == 'U')
    length = 8;
  else
Geoffrey Keating committed
1012 1013 1014 1015
    {
      cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
      length = 4;
    }
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025

  result = 0;
  do
    {
      c = *str;
      if (!ISXDIGIT (c))
	break;
      str++;
      result = (result << 4) + hex_value (c);
    }
1026
  while (--length && str < limit);
1027

1028 1029 1030 1031
  /* Partial UCNs are not valid in strings, but decompose into
     multiple tokens in identifiers, so we can't give a helpful
     error message in that case.  */
  if (length && identifier_pos)
1032 1033 1034 1035 1036
    {
      *cp = 0;
      return false;
    }

1037 1038
  *pstr = str;
  if (length)
1039
    {
1040 1041
      cpp_error (pfile, CPP_DL_ERROR,
		 "incomplete universal character name %.*s",
1042 1043 1044
		 (int) (str - base), base);
      result = 1;
    }
1045 1046 1047 1048
  /* The C99 standard permits $, @ and ` to be specified as UCNs.  We use
     hex escapes so that this also works with EBCDIC hosts.
     C++0x permits everything below 0xa0 within literals;
     ucn_valid_in_identifier will complain about identifiers.  */
1049
  else if ((result < 0xa0
1050
	    && !CPP_OPTION (pfile, cplusplus)
1051 1052 1053 1054
	    && (result != 0x24 && result != 0x40 && result != 0x60))
	   || (result & 0x80000000)
	   || (result >= 0xD800 && result <= 0xDFFF))
    {
1055 1056
      cpp_error (pfile, CPP_DL_ERROR,
		 "%.*s is not a valid universal character",
1057
		 (int) (str - base), base);
1058
      result = 1;
1059
    }
1060 1061 1062 1063 1064 1065 1066 1067
  else if (identifier_pos && result == 0x24 
	   && CPP_OPTION (pfile, dollars_in_ident))
    {
      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
	{
	  CPP_OPTION (pfile, warn_dollars) = 0;
	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
	}
Joseph Myers committed
1068
      NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
1069
    }
1070 1071
  else if (identifier_pos)
    {
Geoffrey Keating committed
1072
      int validity = ucn_valid_in_identifier (pfile, result, nst);
1073 1074

      if (validity == 0)
1075
	cpp_error (pfile, CPP_DL_ERROR,
1076
		   "universal character %.*s is not valid in an identifier",
1077
		   (int) (str - base), base);
1078
      else if (validity == 2 && identifier_pos == 1)
1079
	cpp_error (pfile, CPP_DL_ERROR,
1080
   "universal character %.*s is not valid at the start of an identifier",
1081
		   (int) (str - base), base);
1082 1083
    }

1084 1085
  *cp = result;
  return true;
1086 1087
}

1088 1089
/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
   it to the execution character set and write the result into TBUF.
1090
   An advanced pointer is returned.  Issues all relevant diagnostics.  */
1091 1092
static const uchar *
convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1093
	     struct _cpp_strbuf *tbuf, struct cset_converter cvt)
1094
{
1095
  cppchar_t ucn;
1096 1097 1098 1099
  uchar buf[6];
  uchar *bufp = buf;
  size_t bytesleft = 6;
  int rval;
Geoffrey Keating committed
1100
  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1101

1102
  from++;  /* Skip u/U.  */
1103
  _cpp_valid_ucn (pfile, &from, limit, 0, &nst, &ucn);
1104

1105 1106
  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
  if (rval)
1107
    {
1108
      errno = rval;
1109 1110
      cpp_errno (pfile, CPP_DL_ERROR,
		 "converting UCN to source character set");
1111
    }
1112
  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1113 1114
    cpp_errno (pfile, CPP_DL_ERROR,
	       "converting UCN to execution character set");
1115 1116 1117

  return from;
}
1118

1119 1120 1121 1122 1123
/* Subroutine of convert_hex and convert_oct.  N is the representation
   in the execution character set of a numeric escape; write it into the
   string buffer TBUF and update the end-of-string pointer therein.  WIDE
   is true if it's a wide string that's being assembled in TBUF.  This
   function issues no diagnostics and never fails.  */
1124 1125
static void
emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
1126
		     struct _cpp_strbuf *tbuf, struct cset_converter cvt)
1127
{
1128 1129 1130
  size_t width = cvt.width;

  if (width != CPP_OPTION (pfile, char_precision))
1131
    {
1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144
      /* We have to render this into the target byte order, which may not
	 be our byte order.  */
      bool bigend = CPP_OPTION (pfile, bytes_big_endian);
      size_t cwidth = CPP_OPTION (pfile, char_precision);
      size_t cmask = width_to_mask (cwidth);
      size_t nbwc = width / cwidth;
      size_t i;
      size_t off = tbuf->len;
      cppchar_t c;

      if (tbuf->len + nbwc > tbuf->asize)
	{
	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1145
	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1146 1147 1148 1149 1150 1151 1152 1153 1154
	}

      for (i = 0; i < nbwc; i++)
	{
	  c = n & cmask;
	  n >>= cwidth;
	  tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
	}
      tbuf->len += nbwc;
1155
    }
1156
  else
1157
    {
1158 1159
      /* Note: this code does not handle the case where the target
	 and host have a different number of bits in a byte.  */
1160 1161 1162
      if (tbuf->len + 1 > tbuf->asize)
	{
	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1163
	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1164 1165
	}
      tbuf->text[tbuf->len++] = n;
1166
    }
1167
}
1168

1169 1170 1171 1172 1173 1174 1175 1176
/* Convert a hexadecimal escape, pointed to by FROM, to the execution
   character set and write it into the string buffer TBUF.  FROM points
   at the 'x'; digits are consumed up to LIMIT.  Returns an
   advanced pointer, and issues diagnostics as necessary.
   No character set translation occurs; this routine always produces the
   execution-set character with numeric value equal to the given hex
   number.  You can, e.g. generate surrogate pairs this way.
   If no hex digit follows the 'x', an error is issued and nothing is
   written to TBUF.  */
static const uchar *
convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
	     struct _cpp_strbuf *tbuf, struct cset_converter cvt)
{
  cppchar_t c, n = 0, overflow = 0;
  int digits_found = 0;
  size_t width = cvt.width;
  size_t mask = width_to_mask (width);

  if (CPP_WTRADITIONAL (pfile))
    cpp_warning (pfile, CPP_W_TRADITIONAL,
		 "the meaning of '\\x' is different in traditional C");

  from++;  /* Skip 'x'.  */
  while (from < limit)
    {
      c = *from;
      if (! hex_p (c))
	break;
      from++;
      /* Becomes nonzero once the top four bits of N are about to be
	 shifted out.  */
      overflow |= n ^ (n << 4 >> 4);
      n = (n << 4) + hex_value (c);
      digits_found = 1;
    }

  if (!digits_found)
    {
      cpp_error (pfile, CPP_DL_ERROR,
		 "\\x used with no following hex digits");
      return from;
    }

  if (overflow | (n != (n & mask)))
    {
      cpp_error (pfile, CPP_DL_PEDWARN,
		 "hex escape sequence out of range");
      n &= mask;
    }

  emit_numeric_escape (pfile, n, tbuf, cvt);

  return from;
}

/* Convert an octal escape, pointed to by FROM, to the execution
   character set and write it into the string buffer TBUF.  FROM points
   at the first octal digit; at most three digits are consumed and the
   scan never passes LIMIT.  Returns an
   advanced pointer, and issues diagnostics as necessary.
   No character set translation occurs; this routine always produces the
   execution-set character with numeric value equal to the given octal
   number.  */
static const uchar *
convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
	     struct _cpp_strbuf *tbuf, struct cset_converter cvt)
{
  size_t count = 0;
  cppchar_t c, n = 0;
  size_t width = cvt.width;
  size_t mask = width_to_mask (width);

  /* Three octal digits give at most 0777, so N needs no separate
     overflow tracking; only the range check against MASK below.  */
  while (from < limit && count++ < 3)
    {
      c = *from;
      if (c < '0' || c > '7')
	break;
      from++;
      n = (n << 3) + c - '0';
    }

  if (n != (n & mask))
    {
      cpp_error (pfile, CPP_DL_PEDWARN,
		 "octal escape sequence out of range");
      n &= mask;
    }

  emit_numeric_escape (pfile, n, tbuf, cvt);

  return from;
}

/* Convert an escape sequence (pointed to by FROM) to its value on
   the target, and to the execution character set.  FROM points at the
   character following the backslash.  Do not scan past
   LIMIT.  Write the converted value into TBUF.  Returns an advanced
   pointer.  Handles all relevant diagnostics.

   UCNs, hex escapes and octal escapes are delegated to convert_ucn,
   convert_hex and convert_oct.  Every other escape is exactly one
   character long: it is mapped to a host character, converted with
   CVT, and FROM + 1 is returned.  An unknown escape is diagnosed and
   the character itself is then converted.  */
static const uchar *
convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
		struct _cpp_strbuf *tbuf, struct cset_converter cvt)
{
  /* Values of \a \b \e \f \n \r \t \v respectively.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
#else
#error "unknown host character set"
#endif

  uchar c;

  c = *from;
  switch (c)
    {
      /* UCNs, hex escapes, and octal escapes are processed separately.  */
    case 'u': case 'U':
      return convert_ucn (pfile, from, limit, tbuf, cvt);

    case 'x':
      return convert_hex (pfile, from, limit, tbuf, cvt);

    case '0':  case '1':  case '2':  case '3':
    case '4':  case '5':  case '6':  case '7':
      return convert_oct (pfile, from, limit, tbuf, cvt);

      /* Various letter escapes.  Get the appropriate host-charset
	 value into C.  */
    case '\\': case '\'': case '"': case '?': break;

    case '(': case '{': case '[': case '%':
      /* '\(', etc, can be used at the beginning of a line in a long
	 string split onto multiple lines with \-newline, to prevent
	 Emacs or other text editors from getting confused.  '\%' can
	 be used to prevent SCCS from mangling printf format strings.  */
      if (CPP_PEDANTIC (pfile))
	goto unknown;
      break;

    case 'b': c = charconsts[1];  break;
    case 'f': c = charconsts[3];  break;
    case 'n': c = charconsts[4];  break;
    case 'r': c = charconsts[5];  break;
    case 't': c = charconsts[6];  break;
    case 'v': c = charconsts[7];  break;

    case 'a':
      if (CPP_WTRADITIONAL (pfile))
	cpp_warning (pfile, CPP_W_TRADITIONAL,
		     "the meaning of '\\a' is different in traditional C");
      c = charconsts[0];
      break;

    case 'e': case 'E':
      if (CPP_PEDANTIC (pfile))
	cpp_error (pfile, CPP_DL_PEDWARN,
		   "non-ISO-standard escape sequence, '\\%c'", (int) c);
      c = charconsts[2];
      break;

    default:
    unknown:
      if (ISGRAPH (c))
	cpp_error (pfile, CPP_DL_PEDWARN,
		   "unknown escape sequence: '\\%c'", (int) c);
      else
	{
	  /* diagnostic.c does not support "%03o".  When it does, this
	     code can use %03o directly in the diagnostic again.  */
	  char buf[32];
	  sprintf (buf, "%03o", (int) c);
	  cpp_error (pfile, CPP_DL_PEDWARN,
		     "unknown escape sequence: '\\%s'", buf);
	}
    }

  /* Now convert what we have to the execution character set.  */
  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
    cpp_errno (pfile, CPP_DL_ERROR,
	       "converting escape sequence to execution character set");

  return from + 1;
}

1349 1350 1351 1352 1353 1354 1355 1356 1357
/* TYPE is a token type.  The return value is the conversion needed to
   convert from source to execution character set for the given type.
   Any token type not listed explicitly gets the narrow converter.  */
static struct cset_converter
converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
{
  switch (type)
    {
    default:
	return pfile->narrow_cset_desc;
    case CPP_UTF8CHAR:
    case CPP_UTF8STRING:
	return pfile->utf8_cset_desc;
    case CPP_CHAR16:
    case CPP_STRING16:
	return pfile->char16_cset_desc;
    case CPP_CHAR32:
    case CPP_STRING32:
	return pfile->char32_cset_desc;
    case CPP_WCHAR:
    case CPP_WSTRING:
	return pfile->wide_cset_desc;
    }
}

1373 1374 1375 1376 1377 1378 1379 1380
/* FROM is an array of cpp_string structures of length COUNT.  These
   are to be converted from the source to the execution character set,
   escape sequences translated, and finally all are to be
   concatenated.  TYPE is the token type of the result and selects the
   converter (narrow, UTF-8, char16, char32 or wide).  The result,
   including a terminating NUL of the appropriate width, is written
   into TO.  Returns true for success, false for failure; on failure
   the work buffer is freed and TO is not modified.  */
bool
cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
		      cpp_string *to,  enum cpp_ttype type)
{
  struct _cpp_strbuf tbuf;
  const uchar *p, *base, *limit;
  size_t i;
  struct cset_converter cvt = converter_for_type (pfile, type);

  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
  tbuf.text = XNEWVEC (uchar, tbuf.asize);
  tbuf.len = 0;

  for (i = 0; i < count; i++)
    {
      /* Skip an encoding prefix: u, u8, L or U.  */
      p = from[i].text;
      if (*p == 'u')
	{
	  if (*++p == '8')
	    p++;
	}
      else if (*p == 'L' || *p == 'U')
	p++;
      if (*p == 'R')
	{
	  const uchar *prefix;

	  /* Skip over 'R"'.  */
	  p += 2;
	  prefix = p;
	  while (*p != '(')
	    p++;
	  p++;
	  /* P - PREFIX is now the delimiter length plus one (for the
	     '('), which together with the closing quote is what has to
	     be dropped from the end.  */
	  limit = from[i].text + from[i].len;
	  if (limit >= p + (p - prefix) + 1)
	    limit -= (p - prefix) + 1;

	  /* Raw strings are all normal characters; these can be fed
	     directly to convert_cset.  */
	  if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
	    goto fail;

	  continue;
	}

      p++; /* Skip leading quote.  */
      limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */

      for (;;)
	{
	  base = p;
	  while (p < limit && *p != '\\')
	    p++;
	  if (p > base)
	    {
	      /* We have a run of normal characters; these can be fed
		 directly to convert_cset.  */
	      if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
		goto fail;
	    }
	  if (p == limit)
	    break;

	  /* P is at a backslash; hand over the character after it.  */
	  p = convert_escape (pfile, p + 1, limit, &tbuf, cvt);
	}
    }
  /* NUL-terminate the 'to' buffer and translate it to a cpp_string
     structure.  */
  emit_numeric_escape (pfile, 0, &tbuf, cvt);
  tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
  to->text = tbuf.text;
  to->len = tbuf.len;
  return true;

 fail:
  cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
  free (tbuf.text);
  return false;
}
1457 1458 1459 1460

/* Subroutine of do_line and do_linemarker.  Convert escape sequences
   in a string, but do not perform character set conversion.  */
bool
1461
cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1462 1463
				  size_t count,	cpp_string *to,
				  enum cpp_ttype type ATTRIBUTE_UNUSED)
1464 1465 1466 1467 1468 1469
{
  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
  bool retval;

  pfile->narrow_cset_desc.func = convert_no_conversion;
  pfile->narrow_cset_desc.cd = (iconv_t) -1;
1470
  pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
1471

1472
  retval = cpp_interpret_string (pfile, from, count, to, CPP_STRING);
1473 1474 1475 1476 1477

  pfile->narrow_cset_desc = save_narrow_cset_desc;
  return retval;
}

1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516

/* Subroutine of cpp_interpret_charconst which performs the conversion
   to a number, for narrow strings.  STR is the string structure returned
   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
   cpp_interpret_charconst: *PCHARS_SEEN receives the number of
   characters (capped at the number that fit in an int) and *UNSIGNEDP
   is nonzero if the constant has unsigned type.  */
static cppchar_t
narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
			 unsigned int *pchars_seen, int *unsignedp)
{
  size_t width = CPP_OPTION (pfile, char_precision);
  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
  size_t mask = width_to_mask (width);
  size_t i;
  cppchar_t result, c;
  bool unsigned_p;

  /* The value of a multi-character character constant, or a
     single-character character constant whose representation in the
     execution character set is more than one byte long, is
     implementation defined.  This implementation defines it to be the
     number formed by interpreting the byte sequence in memory as a
     big-endian binary number.  If overflow occurs, the high bytes are
     lost, and a warning is issued.

     We don't want to process the NUL terminator handed back by
     cpp_interpret_string.  */
  result = 0;
  for (i = 0; i < str.len - 1; i++)
    {
      c = str.text[i] & mask;
      if (width < BITS_PER_CPPCHAR_T)
	result = (result << width) | c;
      else
	result = c;
    }

  if (i > max_chars)
    {
      i = max_chars;
      cpp_error (pfile, CPP_DL_WARNING,
		 "character constant too long for its type");
    }
  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
    cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");

  /* Multichar constants are of type int and therefore signed.  */
  if (i > 1)
    unsigned_p = 0;
  else
    unsigned_p = CPP_OPTION (pfile, unsigned_char);

  /* Truncate the constant to its natural width, and simultaneously
     sign- or zero-extend to the full width of cppchar_t.
     For single-character constants, the value is WIDTH bits wide.
     For multi-character constants, the value is INT_PRECISION bits wide.  */
  if (i > 1)
    width = CPP_OPTION (pfile, int_precision);
  if (width < BITS_PER_CPPCHAR_T)
    {
      mask = ((cppchar_t) 1 << width) - 1;
      /* Do the sign-bit shift in cppchar_t, like the mask above, so
	 that it does not depend on the width of int.  */
      if (unsigned_p || !(result & ((cppchar_t) 1 << (width - 1))))
	result &= mask;
      else
	result |= ~mask;
    }
  *pchars_seen = i;
  *unsignedp = unsigned_p;
  return result;
}
1547

1548 1549 1550
/* Subroutine of cpp_interpret_charconst which performs the conversion
   to a number, for wide strings.  STR is the string structure returned
   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1551
   cpp_interpret_charconst.  TYPE is the token type.  */
1552 1553
static cppchar_t
wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1554 1555
		       unsigned int *pchars_seen, int *unsignedp,
		       enum cpp_ttype type)
1556 1557
{
  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1558
  size_t width = converter_for_type (pfile, type).width;
1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579
  size_t cwidth = CPP_OPTION (pfile, char_precision);
  size_t mask = width_to_mask (width);
  size_t cmask = width_to_mask (cwidth);
  size_t nbwc = width / cwidth;
  size_t off, i;
  cppchar_t result = 0, c;

  /* This is finicky because the string is in the target's byte order,
     which may not be our byte order.  Only the last character, ignoring
     the NUL terminator, is relevant.  */
  off = str.len - (nbwc * 2);
  result = 0;
  for (i = 0; i < nbwc; i++)
    {
      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
      result = (result << cwidth) | (c & cmask);
    }

  /* Wide character constants have type wchar_t, and a single
     character exactly fills a wchar_t, so a multi-character wide
     character constant is guaranteed to overflow.  */
1580
  if (str.len > nbwc * 2)
1581 1582
    cpp_error (pfile, CPP_DL_WARNING,
	       "character constant too long for its type");
1583 1584 1585 1586 1587

  /* Truncate the constant to its natural width, and simultaneously
     sign- or zero-extend to the full width of cppchar_t.  */
  if (width < BITS_PER_CPPCHAR_T)
    {
1588 1589 1590
      if (type == CPP_CHAR16 || type == CPP_CHAR32
	  || CPP_OPTION (pfile, unsigned_wchar)
	  || !(result & (1 << (width - 1))))
1591 1592 1593 1594 1595
	result &= mask;
      else
	result |= ~mask;
    }

1596 1597 1598 1599 1600 1601
  if (type == CPP_CHAR16 || type == CPP_CHAR32
      || CPP_OPTION (pfile, unsigned_wchar))
    *unsignedp = 1;
  else
    *unsignedp = 0;

1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614
  *pchars_seen = 1;
  return result;
}

/* Interpret a (possibly wide) character constant in TOKEN.
   PCHARS_SEEN points to a variable that is filled in with the number
   of characters seen, and UNSIGNEDP to a variable that is set nonzero
   if the result has unsigned type.  Returns the value of the
   constant.  For an empty constant, or if string interpretation
   fails, 0 is returned and *PCHARS_SEEN and *UNSIGNEDP are left
   untouched.  */
cppchar_t
cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
			 unsigned int *pchars_seen, int *unsignedp)
{
  cpp_string str = { 0, 0 };
  bool wide = (token->type != CPP_CHAR && token->type != CPP_UTF8CHAR);
  /* Length of the "u8" prefix, if present.  */
  int u8 = 2 * (int) (token->type == CPP_UTF8CHAR);
  cppchar_t result;

  /* An empty constant will appear as L'', u'', U'', u8'', or '' */
  if (token->val.str.len == (size_t) (2 + wide + u8))
    {
      cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
      return 0;
    }
  else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, token->type))
    return 0;

  if (wide)
    result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
				    token->type);
  else
    result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);

  if (str.text != token->val.str.text)
    free ((void *)str.text);

  return result;
}
1639 1640 1641 1642 1643 1644 1645 1646 1647

/* Convert an identifier denoted by ID and LEN, which might contain
   UCN escapes, to the source character set, either UTF-8 or
   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.
   Returns the hash node found or created for the converted spelling.
   If a UCN cannot be encoded, an error is issued and the lookup uses
   whatever had been converted up to that point.  */
cpp_hashnode *
_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
{
  /* It turns out that a UCN escape always turns into fewer characters
     than the escape itself, so we can allocate a temporary in advance.  */
  uchar * buf = (uchar *) alloca (len + 1);
  uchar * bufp = buf;
  size_t idp;

  for (idp = 0; idp < len; idp++)
    if (id[idp] != '\\')
      *bufp++ = id[idp];
    else
      {
	/* Anything other than 'u' after the backslash is taken to be
	   'U', i.e. eight hex digits.  */
	unsigned length = id[idp+1] == 'u' ? 4 : 8;
	cppchar_t value = 0;
	size_t bufleft = len - (bufp - buf);
	int rval;

	idp += 2;
	while (length && idp < len && ISXDIGIT (id[idp]))
	  {
	    value = (value << 4) + hex_value (id[idp]);
	    idp++;
	    length--;
	  }
	/* Step back so that the loop increment lands on the first
	   character after the escape.  */
	idp--;

	/* Special case for EBCDIC: if the identifier contains
	   a '$' specified using a UCN, translate it to EBCDIC.  */
	if (value == 0x24)
	  {
	    *bufp++ = '$';
	    continue;
	  }

	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
	if (rval)
	  {
	    errno = rval;
	    cpp_errno (pfile, CPP_DL_ERROR,
		       "converting UCN to source character set");
	    break;
	  }
      }

  return CPP_HASHNODE (ht_lookup (pfile->hash_table,
				  buf, bufp - buf, HT_ALLOC));
}

1693 1694 1695 1696
/* Convert an input buffer (containing the complete contents of one
   source file) from INPUT_CHARSET to the source character set.  INPUT
   points to the input buffer, SIZE is its allocated size, and LEN is
   the length of the meaningful data within the buffer.  The
1697 1698 1699 1700 1701 1702 1703 1704 1705 1706
   translated buffer is returned, *ST_SIZE is set to the length of
   the meaningful data within the translated buffer, and *BUFFER_START
   is set to the start of the returned buffer.  *BUFFER_START may
   differ from the return value in the case of a BOM or other ignored
   marker information.

   INPUT is expected to have been allocated with xmalloc.  This
   function will either set *BUFFER_START to INPUT, or free it and set
   *BUFFER_START to a pointer to another xmalloc-allocated block of
   memory.  */
1707
uchar * 
Eric Christopher committed
1708
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1709 1710
		    uchar *input, size_t size, size_t len,
		    const unsigned char **buffer_start, off_t *st_size)
1711
{
Eric Christopher committed
1712 1713
  struct cset_converter input_cset;
  struct _cpp_strbuf to;
1714
  unsigned char *buffer;
1715

Eric Christopher committed
1716 1717 1718 1719 1720 1721 1722 1723 1724 1725
  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
  if (input_cset.func == convert_no_conversion)
    {
      to.text = input;
      to.asize = size;
      to.len = len;
    }
  else
    {
      to.asize = MAX (65536, len);
1726
      to.text = XNEWVEC (uchar, to.asize);
Eric Christopher committed
1727
      to.len = 0;
1728

Eric Christopher committed
1729 1730 1731 1732
      if (!APPLY_CONVERSION (input_cset, input, len, &to))
	cpp_error (pfile, CPP_DL_ERROR,
		   "failure to convert %s to %s",
		   CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1733

Eric Christopher committed
1734 1735
      free (input);
    }
1736

Eric Christopher committed
1737 1738 1739
  /* Clean up the mess.  */
  if (input_cset.func == convert_using_iconv)
    iconv_close (input_cset.cd);
1740

Eric Christopher committed
1741
  /* Resize buffer if we allocated substantially too much, or if we
1742 1743 1744 1745 1746 1747 1748 1749 1750
     haven't enough space for the \n-terminator or following
     15 bytes of padding (used to quiet warnings from valgrind or
     Address Sanitizer, when the optimized lexer accesses aligned
     16-byte memory chunks, including the bytes after the malloced,
     area, and stops lexing on '\n').  */
  if (to.len + 4096 < to.asize || to.len + 16 > to.asize)
    to.text = XRESIZEVEC (uchar, to.text, to.len + 16);

  memset (to.text + to.len, '\0', 16);
1751

1752 1753 1754 1755
  /* If the file is using old-school Mac line endings (\r only),
     terminate with another \r, not an \n, so that we do not mistake
     the \r\n sequence for a single DOS line ending and erroneously
     issue the "No newline at end of file" diagnostic.  */
1756
  if (to.len && to.text[to.len - 1] == '\r')
1757 1758 1759 1760
    to.text[to.len] = '\r';
  else
    to.text[to.len] = '\n';

1761
  buffer = to.text;
Eric Christopher committed
1762
  *st_size = to.len;
1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778
#if HOST_CHARSET == HOST_CHARSET_ASCII
  /* The HOST_CHARSET test just above ensures that the source charset
     is UTF-8.  So, ignore a UTF-8 BOM if we see one.  Note that
     glibc's UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
     BOM -- however, even if it did, we would still need this code due
     to the 'convert_no_conversion' case.  */
  if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
      && to.text[2] == 0xbf)
    {
      *st_size -= 3;
      buffer += 3;
    }
#endif

  *buffer_start = to.text;
  return buffer;
1779 1780
}

1781
/* Decide on the default encoding to assume for input files.  */
Eric Christopher committed
1782 1783
const char *
_cpp_default_encoding (void)
1784
{
Eric Christopher committed
1785 1786
  const char *current_encoding = NULL;

1787 1788 1789 1790 1791 1792 1793 1794 1795
  /* We disable this because the default codeset is 7-bit ASCII on
     most platforms, and this causes conversion failures on every
     file in GCC that happens to have one of the upper 128 characters
     in it -- most likely, as part of the name of a contributor.
     We should definitely recognize in-band markers of file encoding,
     like:
     - the appropriate Unicode byte-order mark (FE FF) to recognize
       UTF16 and UCS4 (in both big-endian and little-endian flavors)
       and UTF8
1796
     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1797 1798 1799 1800
       distinguish ASCII and EBCDIC.
     - now we can parse something like "#pragma GCC encoding <xyz>
       on the first line, or even Emacs/VIM's mode line tags (there's
       a problem here in that VIM uses the last line, and Emacs has
1801
       its more elaborate "local variables" convention).
1802 1803 1804 1805
     - investigate whether Java has another common convention, which
       would be friendly to support.
     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
Eric Christopher committed
1806 1807 1808 1809 1810 1811 1812
  setlocale (LC_CTYPE, "");
  current_encoding = nl_langinfo (CODESET);
#endif
  if (current_encoding == NULL || *current_encoding == '\0')
    current_encoding = SOURCE_CHARSET;

  return current_encoding;
1813
}