PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling

PR libstdc++/79980
* include/bits/locale_conv.h (__do_str_codecvt): Set __count on error path.
* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads for manipulating codecvt_mode values.
(read_utf16_bom): Compare input to BOM constants instead of integral constants that depend on endianness. Take mode parameter by reference and adjust it, to distinguish between no BOM present and UTF-16BE BOM present.
(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
(surrogates): New enumeration type.
(utf16_in, utf16_out): Add surrogates parameter to choose between UTF-16 and UCS2 behaviour.
(utf16_span, ucs2_span): Use std::min not std::max.
(ucs2_out): Use std::min not std::max. Disallow surrogate pairs.
(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.

From-SVN: r246200

PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling
PR libstdc++/79980
* include/bits/locale_conv.h (__do_str_codecvt): Set __count on error path.
* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads for manipulating codecvt_mode values.
(read_utf16_bom): Compare input to BOM constants instead of integral constants that depend on endianness. Take mode parameter by reference and adjust it, to distinguish between no BOM present and UTF-16BE BOM present.
(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
(surrogates): New enumeration type.
(utf16_in, utf16_out): Add surrogates parameter to choose between UTF-16 and UCS2 behaviour.
(utf16_span, ucs2_span): Use std::min not std::max.
(ucs2_out): Use std::min not std::max. Disallow surrogate pairs.
(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.

From-SVN: r246200
bcd682e1 · Jonathan Wakely · Jonathan Wakely · 02e12bda · bcd682e1 · bcd682e1
Commit bcd682e1 authored Mar 16, 2017 by Jonathan Wakely Committed by Jonathan Wakely Mar 16, 2017
5 changed files
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
 2017-03-16  Jonathan Wakely  <jwakely@redhat.com>
+	PR libstdc++/79980
+	* include/bits/locale_conv.h (__do_str_codecvt): Set __count on
+	error path.
+	* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
+	for manipulating codecvt_mode values.
+	(read_utf16_bom): Compare input to BOM constants instead of integral
+	constants that depend on endianness.  Take mode parameter by
+	reference and adjust it, to distinguish between no BOM present and
+	UTF-16BE BOM present.
+	(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
+	(surrogates): New enumeration type.
+	(utf16_in, utf16_out): Add surrogates parameter to choose between
+	UTF-16 and UCS2 behaviour.
+	(utf16_span, ucs2_span): Use std::min not std::max.
+	(ucs2_out): Use std::min not std::max.  Disallow surrogate pairs.
+	(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
+	* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
+	* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.
 	PR libstdc++/79511
 	* src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff
 	as a surrogate pair.

--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	     && (__outstr.size() - __outchars) < __maxlen);
      if (__result == codecvt_base::error)
-	return false;
+	{
+	  __count = __next - __first;
+	  return false;
+	}
      if (__result == codecvt_base::noconv)
 	{

--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -24,13 +24,27 @@
 #include <codecvt>
 #include <cstring>		// std::memcpy, std::memcmp
-#include <bits/stl_algobase.h>	// std::max
+#include <bits/stl_algobase.h>	// std::min
 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
 namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
+  // The standard doesn't define these operators, which is annoying.
+  // Convert a codecvt_mode to its underlying integer type so that flag
+  // values can be combined with the built-in bitwise operators.  The cast
+  // targets the enumeration's own underlying type (the declared return
+  // type) rather than POSIX mode_t, which is an unrelated type that is not
+  // guaranteed to be declared here or to be wide enough.
+  static underlying_type<codecvt_mode>::type
+  to_integer(codecvt_mode m)
+  { return static_cast<underlying_type<codecvt_mode>::type>(m); }
+  static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n) // keep only the bits of m that are also set in n
+  { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
+  static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n) // add the bits set in n to m
+  { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
+  static codecvt_mode operator~(codecvt_mode m) // invert every bit, e.g. to form a mask that clears one flag
+  { return codecvt_mode(~to_integer(m)); }
 namespace
 {
  // Largest code point that fits in a single UTF-16 code unit.
@@ -117,22 +131,26 @@ namespace
      read_bom(from, utf8_bom);
  }
-  // If consume_header is set in mode update from.next to after any BOM.
+  // If consume_header is not set in mode, no effects.
-  // Return little_endian iff the UTF-16LE BOM was present.
+  // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
-  codecvt_mode
+  // - if the UTF-16BE BOM was found unset little_endian in mode, or
-  read_utf16_bom(range<const char16_t>& from, codecvt_mode mode)
+  // - if the UTF-16LE BOM was found set little_endian in mode.
+  void
+  read_utf16_bom(range<const char16_t>& from, codecvt_mode& mode)
  {
    if (mode & consume_header && from.size())
      {
-	if (*from.next == 0xFEFF)
+	if (!memcmp(from.next, utf16_bom, 2))
-	  ++from.next;
+	  {
-	else if (*from.next == 0xFFFE)
+	    ++from.next;
+	    mode &= ~little_endian;
+	  }
+	else if (!memcmp(from.next, utf16le_bom, 2))
 	  {
 	    ++from.next;
-	    return little_endian;
+	    mode |= little_endian;
 	  }
      }
-    return {};
  }
  // Read a codepoint from a UTF-8 multibyte sequence.
@@ -380,8 +398,7 @@ namespace
  ucs4_in(range<const char16_t>& from, range<char32_t>& to,
          unsigned long maxcode = max_code_point, codecvt_mode mode = {})
  {
-    if (read_utf16_bom(from, mode) == little_endian)
+    read_utf16_bom(from, mode);
-      mode = codecvt_mode(mode & little_endian);
    while (from.size() && to.size())
      {
 	const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
@@ -413,11 +430,15 @@ namespace
    return codecvt_base::ok;
  }
-  // utf8 -> utf16
+  // Flag indicating whether to process UTF-16 or UCS2
+  enum class surrogates { allowed, disallowed };
+  // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
  template<typename C>
  codecvt_base::result
  utf16_in(range<const char>& from, range<C>& to,
-           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+	   unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+	   surrogates s = surrogates::allowed)
  {
    read_utf8_bom(from, mode);
    while (from.size() && to.size())
@@ -425,7 +446,12 @@ namespace
 	const char* const first = from.next;
 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
 	if (codepoint == incomplete_mb_character)
-	  return codecvt_base::partial;
+	  {
+	    if (s == surrogates::allowed)
+	      return codecvt_base::partial;
+	    else
+	      return codecvt_base::error; // No surrogates in UCS2
+	  }
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
 	if (!write_utf16_code_point(to, codepoint, mode))
@@ -437,11 +463,12 @@ namespace
    return codecvt_base::ok;
  }
-  // utf16 -> utf8
+  // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
  template<typename C>
  codecvt_base::result
  utf16_out(range<const C>& from, range<char>& to,
-            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+	    unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+	    surrogates s = surrogates::allowed)
  {
    if (!write_utf8_bom(to, mode))
      return codecvt_base::partial;
@@ -451,6 +478,9 @@ namespace
 	int inc = 1;
 	if (is_high_surrogate(c))
 	  {
+	    if (s == surrogates::disallowed)
+	      return codecvt_base::error; // No surrogates in UCS-2
 	    if (from.size() < 2)
 	      return codecvt_base::ok; // stop converting at this point
@@ -492,7 +522,7 @@ namespace
 	++count;
      }
    if (count+1 == max) // take one more character if it fits in a single unit
-      read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode));
+      read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
    return from.next;
  }
@@ -501,7 +531,9 @@ namespace
  ucs2_in(range<const char>& from, range<char16_t>& to,
 	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
  {
-    return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
  }
  // ucs2 -> utf8
@@ -509,7 +541,9 @@ namespace
  ucs2_out(range<const char16_t>& from, range<char>& to,
 	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
  {
-    return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
  }
  // ucs2 -> utf16
@@ -537,14 +571,14 @@ namespace
  ucs2_in(range<const char16_t>& from, range<char16_t>& to,
 	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
  {
-    if (read_utf16_bom(from, mode) == little_endian)
+    read_utf16_bom(from, mode);
-      mode = codecvt_mode(mode & little_endian);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    maxcode = std::min(max_single_utf16_unit, maxcode);
    while (from.size() && to.size())
      {
 	const char32_t c = read_utf16_code_point(from, maxcode, mode);
 	if (c == incomplete_mb_character)
-	  return codecvt_base::partial;
+	  return codecvt_base::error; // UCS-2 only supports single units.
 	if (c > maxcode)
 	  return codecvt_base::error;
 	*to.next++ = c;
@@ -557,9 +591,9 @@ namespace
            char32_t maxcode, codecvt_mode mode)
  {
    range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
+    read_utf16_bom(from, mode);
-      mode = codecvt_mode(mode & little_endian);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    maxcode = std::min(max_single_utf16_unit, maxcode);
    char32_t c = 0;
    while (max-- && c <= maxcode)
      c = read_utf16_code_point(from, maxcode, mode);
@@ -572,7 +606,8 @@ namespace
  {
    range<const char> from{ begin, end };
    read_utf8_bom(from, mode);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
    char32_t c = 0;
    while (max-- && c <= maxcode)
      c = read_utf8_code_point(from, maxcode);
@@ -598,8 +633,7 @@ namespace
            char32_t maxcode = max_code_point, codecvt_mode mode = {})
  {
    range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
+    read_utf16_bom(from, mode);
-      mode = codecvt_mode(mode & little_endian);
    char32_t c = 0;
    while (max-- && c <= maxcode)
      c = read_utf16_code_point(from, maxcode, mode);

--- a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+// { dg-do run { target c++11 } }
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+// PR libstdc++/79980
+constexpr std::codecvt_mode mode(std::codecvt_mode m)
+{ return static_cast<std::codecvt_mode>(m | std::consume_header); }
+template<typename WCh, unsigned long Max = 0x10FFFF,
+	 std::codecvt_mode Mode = std::consume_header>
+  using Conv
+    = std::wstring_convert<std::codecvt_utf16<WCh, Max, mode(Mode)>, WCh>;
+void
+test01() // FE FF header with the default (consume_header only) mode
+{
+  const char src[] = "\xFE\xFF\xAB\xCD"; // FE FF byte-order mark, then bytes AB CD
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd ); // header is consumed; the unit is assembled high byte first
+}
+void
+test02() // FF FE header with the default (consume_header only) mode
+{
+  const char src[] = "\xFF\xFE\xAB\xCD"; // FF FE byte-order mark, then bytes AB CD
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab ); // header switches decoding to low byte first
+}
+void
+test03() // FE FF header while the facet was instantiated with little_endian
+{
+  const char src[] = "\xFE\xFF\xAB\xCD"; // FE FF byte-order mark, then bytes AB CD
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd ); // the header wins over the little_endian template argument
+}
+void
+test04() // FF FE header while the facet was instantiated with little_endian
+{
+  const char src[] = "\xFF\xFE\xAB\xCD"; // FF FE byte-order mark, then bytes AB CD
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab ); // header and template argument agree: low byte first
+}
+void
+test05() // a code unit above the Maxcode template argument must be rejected
+{
+  const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF
+  Conv<char16_t, 0xFF> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+4);
+  VERIFY( result == u"from_bytes failed" ); // the error string replaces the converted output
+  VERIFY( conv.converted() == 2 ); // only the first two bytes (0x0061) were accepted
+}
+void
+test06() // input that ends in the middle of a code unit must be reported as a failure
+{
+  const char src[] = "\0\x61\xAB\xCD";
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+3); // incomplete character
+  VERIFY( result == u"from_bytes failed" );
+  VERIFY( conv.converted() == 2 ); // the complete first unit is still counted
+}
+void
+test07() // surrogate pairs in the char16_t input must make to_bytes fail
+{
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  // ucs2 to utf-16 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff"; // U+1FFFF is stored as a surrogate pair
+  auto out = conv.to_bytes(utf16);
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 ); // "1234" and U+1111 are five single code units
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = conv.to_bytes(utf16.substr(0, utf16.size()-1)); // drops the trailing low surrogate
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 );
+}
+int main()
+{
+  test01();
+  test02();
+  test03();
+  test04();
+  test05();
+  test06();
+  test07();
+}
--- a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+// { dg-do run { target c++11 } }
+#include <codecvt>
+#include <locale>
+#include <string>
+#include <testsuite_hooks.h>
+using std::wstring_convert;
+using std::codecvt_utf8;
+void
+test01() // codecvt_utf8<char16_t> with the default Maxcode, both directions
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char16_t>, char16_t> c("bad", u"BAD");
+  // utf-8 to ucs2 conversion should fail on character outside BMP
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 7 ); // "1234" is 4 bytes and U+1111 is 3 bytes in UTF-8
+  // ucs2 to utf-8 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff"; // U+1FFFF is stored as a surrogate pair
+  auto out = c.to_bytes(utf16);
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 ); // five single code units precede the surrogate pair
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = c.to_bytes(utf16.substr(0, utf16.size()-1)); // drops the trailing low surrogate
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+}
+void
+test02() // codecvt_utf8<char16_t> with Maxcode lowered to 0x1000
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char16_t, 0x1000>, char16_t> c("bad", u"BAD");
+  // utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 4 ); // stops after "1234": U+1111 already exceeds 0x1000
+}
+void
+test03() // codecvt_utf8<char32_t> with Maxcode 0x10000
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char32_t, 0x10000>, char32_t> c("bad", U"BAD");
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 7 ); // "1234" (4 bytes) and U+1111 (3 bytes) pass; U+1FFFF exceeds 0x10000
+}
+void
+test04() // codecvt_utf8<char32_t> with Maxcode lowered to 0x1000
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char32_t, 0x1000>, char32_t> c("bad", U"BAD");
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 4 ); // stops after "1234": U+1111 already exceeds 0x1000
+}
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+  test04();
+}