Commit bcd682e1 by Jonathan Wakely Committed by Jonathan Wakely

PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling

	PR libstdc++/79980
	* include/bits/locale_conv.h (__do_str_codecvt): Set __count on
	error path.
	* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
	for manipulating codecvt_mode values.
	(read_utf16_bom): Compare input to BOM constants instead of integral
	constants that depend on endianness.  Take mode parameter by
	reference and adjust it, to distinguish between no BOM present and
	UTF-16BE BOM present.
	(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
	(surrogates): New enumeration type.
	(utf16_in, utf16_out): Add surrogates parameter to choose between
	UTF-16 and UCS2 behaviour.
	(utf16_span, ucs2_span): Use std::min not std::max.
	(ucs2_out): Use std::min not std::max.  Disallow surrogate pairs.
	(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
	* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
	* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.

From-SVN: r246200
parent 02e12bda
2017-03-16 Jonathan Wakely <jwakely@redhat.com>
PR libstdc++/79980
* include/bits/locale_conv.h (__do_str_codecvt): Set __count on
error path.
* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
for manipulating codecvt_mode values.
(read_utf16_bom): Compare input to BOM constants instead of integral
constants that depend on endianness. Take mode parameter by
reference and adjust it, to distinguish between no BOM present and
UTF-16BE BOM present.
(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
(surrogates): New enumeration type.
(utf16_in, utf16_out): Add surrogates parameter to choose between
UTF-16 and UCS2 behaviour.
(utf16_span, ucs2_span): Use std::min not std::max.
(ucs2_out): Use std::min not std::max. Disallow surrogate pairs.
(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.
PR libstdc++/79511
* src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff
as a surrogate pair.
......
......@@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
&& (__outstr.size() - __outchars) < __maxlen);
if (__result == codecvt_base::error)
return false;
{
__count = __next - __first;
return false;
}
if (__result == codecvt_base::noconv)
{
......
......@@ -24,13 +24,27 @@
#include <codecvt>
#include <cstring> // std::memcpy, std::memcmp
#include <bits/stl_algobase.h> // std::max
#include <bits/stl_algobase.h> // std::min
#ifdef _GLIBCXX_USE_C99_STDINT_TR1
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
// The standard doesn't define these operators, which is annoying.
// Convert a codecvt_mode value to its underlying integer type so that
// bitwise operations can be performed on it.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{
  // Cast to the enumeration's own underlying type (the declared return
  // type).  The POSIX mode_t is unrelated to codecvt_mode and is not
  // guaranteed to be declared, or to have a suitable width, on every target.
  return static_cast<underlying_type<codecvt_mode>::type>(m);
}
// Compound bitwise AND for codecvt_mode: keep in m only the flags that are
// also set in n, and return a reference to m.
static codecvt_mode&
operator&=(codecvt_mode& m, codecvt_mode n)
{
  m = codecvt_mode(to_integer(m) & to_integer(n));
  return m;
}
// Compound bitwise OR for codecvt_mode: add the flags set in n to m, and
// return a reference to m.
static codecvt_mode&
operator|=(codecvt_mode& m, codecvt_mode n)
{
  m = codecvt_mode(to_integer(m) | to_integer(n));
  return m;
}
// Bitwise complement for codecvt_mode, used to build a mask that clears
// a flag (e.g. mode &= ~little_endian).
// NOTE(review): the complemented value has bits set beyond the named
// enumerators; confirm converting it back to codecvt_mode is acceptable.
static codecvt_mode
operator~(codecvt_mode m)
{
  const auto inverted = ~to_integer(m);
  return codecvt_mode(inverted);
}
namespace
{
// Largest code point that fits in a single UTF-16 code unit.
......@@ -117,22 +131,26 @@ namespace
read_bom(from, utf8_bom);
}
// If consume_header is set in mode update from.next to after any BOM.
// Return little_endian iff the UTF-16LE BOM was present.
codecvt_mode
read_utf16_bom(range<const char16_t>& from, codecvt_mode mode)
// If consume_header is not set in mode, no effects.
// Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
// - if the UTF-16BE BOM was found unset little_endian in mode, or
// - if the UTF-16LE BOM was found set little_endian in mode.
void
read_utf16_bom(range<const char16_t>& from, codecvt_mode& mode)
{
if (mode & consume_header && from.size())
{
if (*from.next == 0xFEFF)
++from.next;
else if (*from.next == 0xFFFE)
if (!memcmp(from.next, utf16_bom, 2))
{
++from.next;
mode &= ~little_endian;
}
else if (!memcmp(from.next, utf16le_bom, 2))
{
++from.next;
return little_endian;
mode |= little_endian;
}
}
return {};
}
// Read a codepoint from a UTF-8 multibyte sequence.
......@@ -380,8 +398,7 @@ namespace
ucs4_in(range<const char16_t>& from, range<char32_t>& to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {})
{
if (read_utf16_bom(from, mode) == little_endian)
mode = codecvt_mode(mode & little_endian);
read_utf16_bom(from, mode);
while (from.size() && to.size())
{
const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
......@@ -413,11 +430,15 @@ namespace
return codecvt_base::ok;
}
// utf8 -> utf16
// Flag indicating whether to process UTF-16 or UCS2.
// - allowed:    surrogate pairs are accepted (UTF-16 behaviour).
// - disallowed: a surrogate is treated as a conversion error, because
//               UCS-2 has no surrogate pairs.
enum class surrogates { allowed, disallowed };
// utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
template<typename C>
codecvt_base::result
utf16_in(range<const char>& from, range<C>& to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {})
unsigned long maxcode = max_code_point, codecvt_mode mode = {},
surrogates s = surrogates::allowed)
{
read_utf8_bom(from, mode);
while (from.size() && to.size())
......@@ -425,7 +446,12 @@ namespace
const char* const first = from.next;
const char32_t codepoint = read_utf8_code_point(from, maxcode);
if (codepoint == incomplete_mb_character)
return codecvt_base::partial;
{
if (s == surrogates::allowed)
return codecvt_base::partial;
else
return codecvt_base::error; // No surrogates in UCS2
}
if (codepoint > maxcode)
return codecvt_base::error;
if (!write_utf16_code_point(to, codepoint, mode))
......@@ -437,11 +463,12 @@ namespace
return codecvt_base::ok;
}
// utf16 -> utf8
// utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
template<typename C>
codecvt_base::result
utf16_out(range<const C>& from, range<char>& to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {})
unsigned long maxcode = max_code_point, codecvt_mode mode = {},
surrogates s = surrogates::allowed)
{
if (!write_utf8_bom(to, mode))
return codecvt_base::partial;
......@@ -451,6 +478,9 @@ namespace
int inc = 1;
if (is_high_surrogate(c))
{
if (s == surrogates::disallowed)
return codecvt_base::error; // No surrogates in UCS-2
if (from.size() < 2)
return codecvt_base::ok; // stop converting at this point
......@@ -492,7 +522,7 @@ namespace
++count;
}
if (count+1 == max) // take one more character if it fits in a single unit
read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode));
read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
return from.next;
}
......@@ -501,7 +531,9 @@ namespace
ucs2_in(range<const char>& from, range<char16_t>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode);
// UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::min(max_single_utf16_unit, maxcode);
return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
}
// ucs2 -> utf8
......@@ -509,7 +541,9 @@ namespace
ucs2_out(range<const char16_t>& from, range<char>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode);
// UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::min(max_single_utf16_unit, maxcode);
return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
}
// ucs2 -> utf16
......@@ -537,14 +571,14 @@ namespace
ucs2_in(range<const char16_t>& from, range<char16_t>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
if (read_utf16_bom(from, mode) == little_endian)
mode = codecvt_mode(mode & little_endian);
maxcode = std::max(max_single_utf16_unit, maxcode);
read_utf16_bom(from, mode);
// UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::min(max_single_utf16_unit, maxcode);
while (from.size() && to.size())
{
const char32_t c = read_utf16_code_point(from, maxcode, mode);
if (c == incomplete_mb_character)
return codecvt_base::partial;
return codecvt_base::error; // UCS-2 only supports single units.
if (c > maxcode)
return codecvt_base::error;
*to.next++ = c;
......@@ -557,9 +591,9 @@ namespace
char32_t maxcode, codecvt_mode mode)
{
range<const char16_t> from{ begin, end };
if (read_utf16_bom(from, mode) == little_endian)
mode = codecvt_mode(mode & little_endian);
maxcode = std::max(max_single_utf16_unit, maxcode);
read_utf16_bom(from, mode);
// UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::min(max_single_utf16_unit, maxcode);
char32_t c = 0;
while (max-- && c <= maxcode)
c = read_utf16_code_point(from, maxcode, mode);
......@@ -572,7 +606,8 @@ namespace
{
range<const char> from{ begin, end };
read_utf8_bom(from, mode);
maxcode = std::max(max_single_utf16_unit, maxcode);
// UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::min(max_single_utf16_unit, maxcode);
char32_t c = 0;
while (max-- && c <= maxcode)
c = read_utf8_code_point(from, maxcode);
......@@ -598,8 +633,7 @@ namespace
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
range<const char16_t> from{ begin, end };
if (read_utf16_bom(from, mode) == little_endian)
mode = codecvt_mode(mode & little_endian);
read_utf16_bom(from, mode);
char32_t c = 0;
while (max-- && c <= maxcode)
c = read_utf16_code_point(from, maxcode, mode);
......
// Copyright (C) 2017 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// { dg-do run { target c++11 } }
#include <locale>
#include <codecvt>
#include <testsuite_hooks.h>
// PR libstdc++/79980
// Return m with the std::consume_header flag added, so every converter
// built from the result consumes a leading BOM.
constexpr auto
mode(std::codecvt_mode m) -> std::codecvt_mode
{ return static_cast<std::codecvt_mode>(std::consume_header | m); }
// Shorthand for a wstring_convert over codecvt_utf16 with wide character
// type CharT, maximum code point MaxCode and mode flags BaseMode.  The
// flags are passed through mode(), so consume_header is always set.
template<typename CharT, unsigned long MaxCode = 0x10FFFF,
	 std::codecvt_mode BaseMode = std::consume_header>
  using Conv = std::wstring_convert<
    std::codecvt_utf16<CharT, MaxCode, mode(BaseMode)>, CharT>;
void
test01()
{
const char src[] = "\xFE\xFF\xAB\xCD";
Conv<char16_t> conv;
auto dst = conv.from_bytes(src, src+4);
VERIFY( dst[0] == 0xabcd );
}
void
test02()
{
const char src[] = "\xFF\xFE\xAB\xCD";
Conv<char16_t> conv;
auto dst = conv.from_bytes(src, src+4);
VERIFY( dst[0] == 0xcdab );
}
void
test03()
{
const char src[] = "\xFE\xFF\xAB\xCD";
Conv<char16_t, 0x10FFFF, std::little_endian> conv;
auto dst = conv.from_bytes(src, src+4);
VERIFY( dst[0] == 0xabcd );
}
void
test04()
{
const char src[] = "\xFF\xFE\xAB\xCD";
Conv<char16_t, 0x10FFFF, std::little_endian> conv;
auto dst = conv.from_bytes(src, src+4);
VERIFY( dst[0] == 0xcdab );
}
void
test05()
{
const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF
Conv<char16_t, 0xFF> conv("to_bytes failed", u"from_bytes failed");
std::u16string result = conv.from_bytes(src, src+4);
VERIFY( result == u"from_bytes failed" );
VERIFY( conv.converted() == 2 );
}
void
test06()
{
const char src[] = "\0\x61\xAB\xCD";
Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
std::u16string result = conv.from_bytes(src, src+3); // incomplete character
VERIFY( result == u"from_bytes failed" );
VERIFY( conv.converted() == 2 );
}
void
test07()
{
Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
// ucs2 to utf-16 conversion should fail on invalid ucs2 input:
std::u16string utf16 = u"1234\U00001111\U0001ffff";
auto out = conv.to_bytes(utf16);
VERIFY( out == "to_bytes failed" );
VERIFY( conv.converted() == 5 );
// And should also fail on incomplete surrogate pair (not return partial):
out = conv.to_bytes(utf16.substr(0, utf16.size()-1));
VERIFY( out == "to_bytes failed" );
VERIFY( conv.converted() == 5 );
}
int main()
{
test01();
test02();
test03();
test04();
test05();
test06();
test07();
}
// Copyright (C) 2017 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// { dg-do run { target c++11 } }
#include <codecvt>
#include <locale>
#include <string>
#include <testsuite_hooks.h>
using std::wstring_convert;
using std::codecvt_utf8;
void
test01()
{
std::string src = u8"1234\U00001111\U0001ffff";
wstring_convert<codecvt_utf8<char16_t>, char16_t> c("bad", u"BAD");
// utf-8 to ucs2 conversion should fail on character outside BMP
auto ucs2 = c.from_bytes(src);
VERIFY( ucs2 == u"BAD" );
VERIFY( c.converted() == 7 );
// ucs2 to utf-8 conversion should fail on invalid ucs2 input:
std::u16string utf16 = u"1234\U00001111\U0001ffff";
auto out = c.to_bytes(utf16);
VERIFY( out == "bad" );
VERIFY( c.converted() == 5 );
// And should also fail on incomplete surrogate pair (not return partial):
out = c.to_bytes(utf16.substr(0, utf16.size()-1));
VERIFY( out == "bad" );
VERIFY( c.converted() == 5 );
}
void
test02()
{
std::string src = u8"1234\U00001111\U0001ffff";
wstring_convert<codecvt_utf8<char16_t, 0x1000>, char16_t> c("bad", u"BAD");
// utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000
auto ucs2 = c.from_bytes(src);
VERIFY( ucs2 == u"BAD" );
VERIFY( c.converted() == 4 );
}
void
test03()
{
std::string src = u8"1234\U00001111\U0001ffff";
wstring_convert<codecvt_utf8<char32_t, 0x10000>, char32_t> c("bad", U"BAD");
// utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000
auto ucs4 = c.from_bytes(src);
VERIFY( ucs4 == U"BAD" );
VERIFY( c.converted() == 7 );
}
void
test04()
{
std::string src = u8"1234\U00001111\U0001ffff";
wstring_convert<codecvt_utf8<char32_t, 0x1000>, char32_t> c("bad", U"BAD");
// utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000
auto ucs4 = c.from_bytes(src);
VERIFY( ucs4 == U"BAD" );
VERIFY( c.converted() == 4 );
}
int
main()
{
test01();
test02();
test03();
test04();
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment