Commit bcd682e1, authored and committed by Jonathan Wakely

PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling

	PR libstdc++/79980
	* include/bits/locale_conv.h (__do_str_codecvt): Set __count on
	error path.
	* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
	for manipulating codecvt_mode values.
	(read_utf16_bom): Compare input to BOM constants instead of integral
	constants that depend on endianness.  Take mode parameter by
	reference and adjust it, to distinguish between no BOM present and
	UTF-16BE BOM present.
	(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
	(surrogates): New enumeration type.
	(utf16_in, utf16_out): Add surrogates parameter to choose between
	UTF-16 and UCS2 behaviour.
	(utf16_span, ucs2_span): Use std::min not std::max.
	(ucs2_out): Use std::min not std::max.  Disallow surrogate pairs.
	(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
	* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
	* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.

From-SVN: r246200
parent 02e12bda
2017-03-16 Jonathan Wakely <jwakely@redhat.com>
PR libstdc++/79980
* include/bits/locale_conv.h (__do_str_codecvt): Set __count on
error path.
* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
for manipulating codecvt_mode values.
(read_utf16_bom): Compare input to BOM constants instead of integral
constants that depend on endianness. Take mode parameter by
reference and adjust it, to distinguish between no BOM present and
UTF-16BE BOM present.
(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
(surrogates): New enumeration type.
(utf16_in, utf16_out): Add surrogates parameter to choose between
UTF-16 and UCS2 behaviour.
(utf16_span, ucs2_span): Use std::min not std::max.
(ucs2_out): Use std::min not std::max. Disallow surrogate pairs.
(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.
PR libstdc++/79511
* src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff
as a surrogate pair.
......
...@@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION ...@@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
&& (__outstr.size() - __outchars) < __maxlen); && (__outstr.size() - __outchars) < __maxlen);
if (__result == codecvt_base::error) if (__result == codecvt_base::error)
return false; {
__count = __next - __first;
return false;
}
if (__result == codecvt_base::noconv) if (__result == codecvt_base::noconv)
{ {
......
...@@ -24,13 +24,27 @@ ...@@ -24,13 +24,27 @@
#include <codecvt> #include <codecvt>
#include <cstring> // std::memcpy, std::memcmp #include <cstring> // std::memcpy, std::memcmp
#include <bits/stl_algobase.h> // std::max #include <bits/stl_algobase.h> // std::min
#ifdef _GLIBCXX_USE_C99_STDINT_TR1 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
namespace std _GLIBCXX_VISIBILITY(default) namespace std _GLIBCXX_VISIBILITY(default)
{ {
_GLIBCXX_BEGIN_NAMESPACE_VERSION _GLIBCXX_BEGIN_NAMESPACE_VERSION
// The standard doesn't define these operators, which is annoying.

// Convert a codecvt_mode value to its underlying integer type so that
// bitwise arithmetic can be done on it.  The cast targets exactly the
// declared return type; the POSIX type mode_t is unrelated to codecvt_mode,
// is not declared by the headers included here, and may be narrower.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{ return static_cast<underlying_type<codecvt_mode>::type>(m); }

// Keep in m only the flags that are also set in n.  Returns m.
static codecvt_mode&
operator&=(codecvt_mode& m, codecvt_mode n)
{ return m = codecvt_mode(to_integer(m) & to_integer(n)); }

// Add to m every flag that is set in n.  Returns m.
static codecvt_mode&
operator|=(codecvt_mode& m, codecvt_mode n)
{ return m = codecvt_mode(to_integer(m) | to_integer(n)); }

// Bitwise complement of m, for use as a mask (e.g. m &= ~little_endian).
static codecvt_mode
operator~(codecvt_mode m)
{ return codecvt_mode(~to_integer(m)); }
namespace namespace
{ {
// Largest code point that fits in a single UTF-16 code unit. // Largest code point that fits in a single UTF-16 code unit.
...@@ -117,22 +131,26 @@ namespace ...@@ -117,22 +131,26 @@ namespace
read_bom(from, utf8_bom); read_bom(from, utf8_bom);
} }
// If consume_header is set in mode update from.next to after any BOM. // If consume_header is not set in mode, no effects.
// Return little_endian iff the UTF-16LE BOM was present. // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
codecvt_mode // - if the UTF-16BE BOM was found unset little_endian in mode, or
read_utf16_bom(range<const char16_t>& from, codecvt_mode mode) // - if the UTF-16LE BOM was found set little_endian in mode.
void
read_utf16_bom(range<const char16_t>& from, codecvt_mode& mode)
{ {
if (mode & consume_header && from.size()) if (mode & consume_header && from.size())
{ {
if (*from.next == 0xFEFF) if (!memcmp(from.next, utf16_bom, 2))
++from.next; {
else if (*from.next == 0xFFFE) ++from.next;
mode &= ~little_endian;
}
else if (!memcmp(from.next, utf16le_bom, 2))
{ {
++from.next; ++from.next;
return little_endian; mode |= little_endian;
} }
} }
return {};
} }
// Read a codepoint from a UTF-8 multibyte sequence. // Read a codepoint from a UTF-8 multibyte sequence.
...@@ -380,8 +398,7 @@ namespace ...@@ -380,8 +398,7 @@ namespace
ucs4_in(range<const char16_t>& from, range<char32_t>& to, ucs4_in(range<const char16_t>& from, range<char32_t>& to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {}) unsigned long maxcode = max_code_point, codecvt_mode mode = {})
{ {
if (read_utf16_bom(from, mode) == little_endian) read_utf16_bom(from, mode);
mode = codecvt_mode(mode & little_endian);
while (from.size() && to.size()) while (from.size() && to.size())
{ {
const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
...@@ -413,11 +430,15 @@ namespace ...@@ -413,11 +430,15 @@ namespace
return codecvt_base::ok; return codecvt_base::ok;
} }
// utf8 -> utf16 // Flag indicating whether to process UTF-16 or UCS2
enum class surrogates { allowed, disallowed };
// utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
template<typename C> template<typename C>
codecvt_base::result codecvt_base::result
utf16_in(range<const char>& from, range<C>& to, utf16_in(range<const char>& from, range<C>& to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {}) unsigned long maxcode = max_code_point, codecvt_mode mode = {},
surrogates s = surrogates::allowed)
{ {
read_utf8_bom(from, mode); read_utf8_bom(from, mode);
while (from.size() && to.size()) while (from.size() && to.size())
...@@ -425,7 +446,12 @@ namespace ...@@ -425,7 +446,12 @@ namespace
const char* const first = from.next; const char* const first = from.next;
const char32_t codepoint = read_utf8_code_point(from, maxcode); const char32_t codepoint = read_utf8_code_point(from, maxcode);
if (codepoint == incomplete_mb_character) if (codepoint == incomplete_mb_character)
return codecvt_base::partial; {
if (s == surrogates::allowed)
return codecvt_base::partial;
else
return codecvt_base::error; // No surrogates in UCS2
}
if (codepoint > maxcode) if (codepoint > maxcode)
return codecvt_base::error; return codecvt_base::error;
if (!write_utf16_code_point(to, codepoint, mode)) if (!write_utf16_code_point(to, codepoint, mode))
...@@ -437,11 +463,12 @@ namespace ...@@ -437,11 +463,12 @@ namespace
return codecvt_base::ok; return codecvt_base::ok;
} }
// utf16 -> utf8 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
template<typename C> template<typename C>
codecvt_base::result codecvt_base::result
utf16_out(range<const C>& from, range<char>& to, utf16_out(range<const C>& from, range<char>& to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {}) unsigned long maxcode = max_code_point, codecvt_mode mode = {},
surrogates s = surrogates::allowed)
{ {
if (!write_utf8_bom(to, mode)) if (!write_utf8_bom(to, mode))
return codecvt_base::partial; return codecvt_base::partial;
...@@ -451,6 +478,9 @@ namespace ...@@ -451,6 +478,9 @@ namespace
int inc = 1; int inc = 1;
if (is_high_surrogate(c)) if (is_high_surrogate(c))
{ {
if (s == surrogates::disallowed)
return codecvt_base::error; // No surrogates in UCS-2
if (from.size() < 2) if (from.size() < 2)
return codecvt_base::ok; // stop converting at this point return codecvt_base::ok; // stop converting at this point
...@@ -492,7 +522,7 @@ namespace ...@@ -492,7 +522,7 @@ namespace
++count; ++count;
} }
if (count+1 == max) // take one more character if it fits in a single unit if (count+1 == max) // take one more character if it fits in a single unit
read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode)); read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
return from.next; return from.next;
} }
...@@ -501,7 +531,9 @@ namespace ...@@ -501,7 +531,9 @@ namespace
ucs2_in(range<const char>& from, range<char16_t>& to, ucs2_in(range<const char>& from, range<char16_t>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {}) char32_t maxcode = max_code_point, codecvt_mode mode = {})
{ {
return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode); // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::min(max_single_utf16_unit, maxcode);
return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
} }
// ucs2 -> utf8 // ucs2 -> utf8
...@@ -509,7 +541,9 @@ namespace ...@@ -509,7 +541,9 @@ namespace
ucs2_out(range<const char16_t>& from, range<char>& to, ucs2_out(range<const char16_t>& from, range<char>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {}) char32_t maxcode = max_code_point, codecvt_mode mode = {})
{ {
return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode); // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::min(max_single_utf16_unit, maxcode);
return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
} }
// ucs2 -> utf16 // ucs2 -> utf16
...@@ -537,14 +571,14 @@ namespace ...@@ -537,14 +571,14 @@ namespace
ucs2_in(range<const char16_t>& from, range<char16_t>& to, ucs2_in(range<const char16_t>& from, range<char16_t>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {}) char32_t maxcode = max_code_point, codecvt_mode mode = {})
{ {
if (read_utf16_bom(from, mode) == little_endian) read_utf16_bom(from, mode);
mode = codecvt_mode(mode & little_endian); // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::max(max_single_utf16_unit, maxcode); maxcode = std::min(max_single_utf16_unit, maxcode);
while (from.size() && to.size()) while (from.size() && to.size())
{ {
const char32_t c = read_utf16_code_point(from, maxcode, mode); const char32_t c = read_utf16_code_point(from, maxcode, mode);
if (c == incomplete_mb_character) if (c == incomplete_mb_character)
return codecvt_base::partial; return codecvt_base::error; // UCS-2 only supports single units.
if (c > maxcode) if (c > maxcode)
return codecvt_base::error; return codecvt_base::error;
*to.next++ = c; *to.next++ = c;
...@@ -557,9 +591,9 @@ namespace ...@@ -557,9 +591,9 @@ namespace
char32_t maxcode, codecvt_mode mode) char32_t maxcode, codecvt_mode mode)
{ {
range<const char16_t> from{ begin, end }; range<const char16_t> from{ begin, end };
if (read_utf16_bom(from, mode) == little_endian) read_utf16_bom(from, mode);
mode = codecvt_mode(mode & little_endian); // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::max(max_single_utf16_unit, maxcode); maxcode = std::min(max_single_utf16_unit, maxcode);
char32_t c = 0; char32_t c = 0;
while (max-- && c <= maxcode) while (max-- && c <= maxcode)
c = read_utf16_code_point(from, maxcode, mode); c = read_utf16_code_point(from, maxcode, mode);
...@@ -572,7 +606,8 @@ namespace ...@@ -572,7 +606,8 @@ namespace
{ {
range<const char> from{ begin, end }; range<const char> from{ begin, end };
read_utf8_bom(from, mode); read_utf8_bom(from, mode);
maxcode = std::max(max_single_utf16_unit, maxcode); // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::min(max_single_utf16_unit, maxcode);
char32_t c = 0; char32_t c = 0;
while (max-- && c <= maxcode) while (max-- && c <= maxcode)
c = read_utf8_code_point(from, maxcode); c = read_utf8_code_point(from, maxcode);
...@@ -598,8 +633,7 @@ namespace ...@@ -598,8 +633,7 @@ namespace
char32_t maxcode = max_code_point, codecvt_mode mode = {}) char32_t maxcode = max_code_point, codecvt_mode mode = {})
{ {
range<const char16_t> from{ begin, end }; range<const char16_t> from{ begin, end };
if (read_utf16_bom(from, mode) == little_endian) read_utf16_bom(from, mode);
mode = codecvt_mode(mode & little_endian);
char32_t c = 0; char32_t c = 0;
while (max-- && c <= maxcode) while (max-- && c <= maxcode)
c = read_utf16_code_point(from, maxcode, mode); c = read_utf16_code_point(from, maxcode, mode);
......
// Copyright (C) 2017 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// { dg-do run { target c++11 } }
#include <locale>
#include <codecvt>
#include <testsuite_hooks.h>
// PR libstdc++/79980
// Return the given mode with the consume_header flag added.  Kept to a
// single return statement so it remains a valid C++11 constexpr function.
constexpr std::codecvt_mode mode(std::codecvt_mode m)
{
  return static_cast<std::codecvt_mode>(static_cast<int>(std::consume_header)
					| static_cast<int>(m));
}
// wstring_convert over codecvt_utf16 for the given wide character type and
// maximum code point.  The requested mode is passed through mode(), so
// consume_header is always part of the facet's mode.
template<typename CharT, unsigned long MaxCode = 0x10FFFF,
	 std::codecvt_mode Flags = std::consume_header>
  using Conv
    = std::wstring_convert<std::codecvt_utf16<CharT, MaxCode, mode(Flags)>,
			   CharT>;
void
test01()
{
const char src[] = "\xFE\xFF\xAB\xCD";
Conv<char16_t> conv;
auto dst = conv.from_bytes(src, src+4);
VERIFY( dst[0] == 0xabcd );
}
void
test02()
{
const char src[] = "\xFF\xFE\xAB\xCD";
Conv<char16_t> conv;
auto dst = conv.from_bytes(src, src+4);
VERIFY( dst[0] == 0xcdab );
}
void
test03()
{
const char src[] = "\xFE\xFF\xAB\xCD";
Conv<char16_t, 0x10FFFF, std::little_endian> conv;
auto dst = conv.from_bytes(src, src+4);
VERIFY( dst[0] == 0xabcd );
}
void
test04()
{
const char src[] = "\xFF\xFE\xAB\xCD";
Conv<char16_t, 0x10FFFF, std::little_endian> conv;
auto dst = conv.from_bytes(src, src+4);
VERIFY( dst[0] == 0xcdab );
}
void
test05()
{
const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF
Conv<char16_t, 0xFF> conv("to_bytes failed", u"from_bytes failed");
std::u16string result = conv.from_bytes(src, src+4);
VERIFY( result == u"from_bytes failed" );
VERIFY( conv.converted() == 2 );
}
void
test06()
{
const char src[] = "\0\x61\xAB\xCD";
Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
std::u16string result = conv.from_bytes(src, src+3); // incomplete character
VERIFY( result == u"from_bytes failed" );
VERIFY( conv.converted() == 2 );
}
void
test07()
{
Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
// ucs2 to utf-16 conversion should fail on invalid ucs2 input:
std::u16string utf16 = u"1234\U00001111\U0001ffff";
auto out = conv.to_bytes(utf16);
VERIFY( out == "to_bytes failed" );
VERIFY( conv.converted() == 5 );
// And should also fail on incomplete surrogate pair (not return partial):
out = conv.to_bytes(utf16.substr(0, utf16.size()-1));
VERIFY( out == "to_bytes failed" );
VERIFY( conv.converted() == 5 );
}
// Run every test case in declaration order.
int main()
{
  using test_fn = void (*)();
  const test_fn cases[] = {
    test01, test02, test03, test04, test05, test06, test07
  };
  for (test_fn run : cases)
    run();
}
// Copyright (C) 2017 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// { dg-do run { target c++11 } }
#include <codecvt>
#include <locale>
#include <string>
#include <testsuite_hooks.h>
using std::wstring_convert;
using std::codecvt_utf8;
// UTF-8 <-> UCS-2 through codecvt_utf8<char16_t> with the default maximum
// code point: conversion must fail at the character outside the BMP, in both
// directions, and converted() must report how much input preceded it.
void
test01()
{
  // The cast is a no-op while u8 literals are arrays of char, and keeps the
  // initialization well-formed where they are arrays of char8_t (C++20).
  std::string src = reinterpret_cast<const char*>(u8"1234\U00001111\U0001ffff");
  wstring_convert<codecvt_utf8<char16_t>, char16_t> c("bad", u"BAD");

  // utf-8 to ucs2 conversion should fail on character outside BMP
  auto ucs2 = c.from_bytes(src);
  VERIFY( ucs2 == u"BAD" );
  VERIFY( c.converted() == 7 );

  // ucs2 to utf-8 conversion should fail on invalid ucs2 input:
  std::u16string utf16 = u"1234\U00001111\U0001ffff";
  auto out = c.to_bytes(utf16);
  VERIFY( out == "bad" );
  VERIFY( c.converted() == 5 );

  // And should also fail on incomplete surrogate pair (not return partial):
  out = c.to_bytes(utf16.substr(0, utf16.size()-1));
  VERIFY( out == "bad" );
  VERIFY( c.converted() == 5 );
}
// UTF-8 -> UCS-2 with Maxcode=0x1000: conversion must stop with an error at
// U+1111, after the four single-byte characters that precede it.
void
test02()
{
  // The cast is a no-op while u8 literals are arrays of char, and keeps the
  // initialization well-formed where they are arrays of char8_t (C++20).
  std::string src = reinterpret_cast<const char*>(u8"1234\U00001111\U0001ffff");
  wstring_convert<codecvt_utf8<char16_t, 0x1000>, char16_t> c("bad", u"BAD");

  // utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000
  auto ucs2 = c.from_bytes(src);
  VERIFY( ucs2 == u"BAD" );
  VERIFY( c.converted() == 4 );
}
// UTF-8 -> UCS-4 with Maxcode=0x10000: conversion must stop with an error at
// U+1FFFF, after the seven bytes that encode the preceding characters.
void
test03()
{
  // The cast is a no-op while u8 literals are arrays of char, and keeps the
  // initialization well-formed where they are arrays of char8_t (C++20).
  std::string src = reinterpret_cast<const char*>(u8"1234\U00001111\U0001ffff");
  wstring_convert<codecvt_utf8<char32_t, 0x10000>, char32_t> c("bad", U"BAD");

  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000
  auto ucs4 = c.from_bytes(src);
  VERIFY( ucs4 == U"BAD" );
  VERIFY( c.converted() == 7 );
}
// UTF-8 -> UCS-4 with Maxcode=0x1000: conversion must stop with an error at
// U+1111, after the four single-byte characters that precede it.
void
test04()
{
  // The cast is a no-op while u8 literals are arrays of char, and keeps the
  // initialization well-formed where they are arrays of char8_t (C++20).
  std::string src = reinterpret_cast<const char*>(u8"1234\U00001111\U0001ffff");
  wstring_convert<codecvt_utf8<char32_t, 0x1000>, char32_t> c("bad", U"BAD");

  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000
  auto ucs4 = c.from_bytes(src);
  VERIFY( ucs4 == U"BAD" );
  VERIFY( c.converted() == 4 );
}
// Run every test case in declaration order.
int
main()
{
  using test_fn = void (*)();
  const test_fn cases[] = { test01, test02, test03, test04 };
  for (test_fn run : cases)
    run();
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment