Commit b6584a72 by Jonathan Wakely Committed by Jonathan Wakely

re PR libstdc++/64797 (22_locale/conversions/string/2.cc FAILs)

	PR libstdc++/64797
	* include/bits/locale_conv.h (wstring_convert::_M_conv): Handle
	incomplete multibyte sequences correctly.
	* include/std/codecvt (codecvt_utf8, codecvt_utf16,
	codecvt_utf8_utf16): Limit _Maxcode to maximum Unicode code point.
	* src/c++11/codecvt.cc (invalid_mb_sequence, incomplete_mb_character):
	Define constants.
	(is_high_surrogate, is_low_surrogate, surrogate_pair_to_code_point):
	Define convenience functions.
	(read_utf8_code_point): Return relevant constant to distinguish
	incomplete characters from invalid sequences.
	(read_utf16_code_point): Likewise. Check for invalid sequences.
	(ucs4_in, utf16_in): Use incomplete_mb_character constant.
	(utf16_out): Check for invalid sequences.
	(utf16_span): Fix condition.
	(ucs2_out): Use is_high_surrogate.
	(ucs2_in): Use incomplete_mb_character constant and fix condition.
	* testsuite/22_locale/codecvt/char16_t.cc: Fix whitespace.
	* testsuite/22_locale/conversions/buffer/1.cc: New.
	* testsuite/22_locale/conversions/string/2.cc: Use char16_t and
	char32_t instead of wchar_t.
	* testsuite/22_locale/conversions/string/3.cc: New.

From-SVN: r221189
parent d50a26f2
2015-03-04 Jonathan Wakely <jwakely@redhat.com>
PR libstdc++/64797
* include/bits/locale_conv.h (wstring_convert::_M_conv): Handle
incomplete multibyte sequences correctly.
* include/std/codecvt (codecvt_utf8, codecvt_utf16,
codecvt_utf8_utf16): Limit _Maxcode to maximum Unicode code point.
* src/c++11/codecvt.cc (invalid_mb_sequence, incomplete_mb_character):
Define constants.
(is_high_surrogate, is_low_surrogate, surrogate_pair_to_code_point):
Define convenience functions.
(read_utf8_code_point): Return relevant constant to distinguish
incomplete characters from invalid sequences.
(read_utf16_code_point): Likewise. Check for invalid sequences.
(ucs4_in, utf16_in): Use incomplete_mb_character constant.
(utf16_out): Check for invalid sequences.
(utf16_span): Fix condition.
(ucs2_out): Use is_high_surrogate.
(ucs2_in): Use incomplete_mb_character constant and fix condition.
* testsuite/22_locale/codecvt/char16_t.cc: Fix whitespace.
* testsuite/22_locale/conversions/buffer/1.cc: New.
* testsuite/22_locale/conversions/string/2.cc: Use char16_t and
char32_t instead of wchar_t.
* testsuite/22_locale/conversions/string/3.cc: New.
2015-03-03 Iain Sandoe <iain@codesourcery.com> 2015-03-03 Iain Sandoe <iain@codesourcery.com>
PR libstdc++/64883 PR libstdc++/64883
......
...@@ -198,18 +198,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION ...@@ -198,18 +198,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
auto __outstr = __err ? _OutStr(__err->get_allocator()) : _OutStr(); auto __outstr = __err ? _OutStr(__err->get_allocator()) : _OutStr();
size_t __outchars = 0; size_t __outchars = 0;
auto __next = __first; auto __next = __first;
const auto __maxlen = _M_cvt->max_length();
codecvt_base::result __result; codecvt_base::result __result;
do do
{ {
__outstr.resize(__outstr.size() + (__last - __next)); __outstr.resize(__outstr.size() + (__last - __next) + __maxlen);
auto __outnext = &__outstr.front() + __outchars; auto __outnext = &__outstr.front() + __outchars;
auto const __outlast = &__outstr.back() + 1; auto const __outlast = &__outstr.back() + 1;
__result = ((*_M_cvt).*__memfn)(_M_state, __next, __last, __next, __result = ((*_M_cvt).*__memfn)(_M_state, __next, __last, __next,
__outnext, __outlast, __outnext); __outnext, __outlast, __outnext);
__outchars = __outnext - &__outstr.front(); __outchars = __outnext - &__outstr.front();
} }
while (__result == codecvt_base::partial && __next != __last); while (__result == codecvt_base::partial && __next != __last
&& (__outstr.size() - __outchars) < __maxlen);
__outstr.resize(__outchars); __outstr.resize(__outchars);
_M_count = __next - __first; _M_count = __next - __first;
...@@ -428,7 +430,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION ...@@ -428,7 +430,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return _M_put(__next, __pending); return _M_put(__next, __pending);
if (!_M_put(__outbuf, __outnext - __outbuf)) if (!_M_put(__outbuf, __outnext - __outbuf))
return false; return false;
} }
while (__next != __last && __next != __start); while (__next != __last && __next != __start);
......
...@@ -148,7 +148,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION ...@@ -148,7 +148,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
public: \ public: \
explicit \ explicit \
_NAME(size_t __refs = 0) \ _NAME(size_t __refs = 0) \
: __ ## _NAME ## _base<_ELEM>(_Maxcode, _Mode, __refs) { } \ : __ ## _NAME ## _base<_ELEM>(std::min(_Maxcode, 0x10fffful), \
_Mode, __refs) \
{ } \
} }
template<typename _Elem> class __codecvt_utf8_base; template<typename _Elem> class __codecvt_utf8_base;
......
...@@ -35,8 +35,14 @@ namespace ...@@ -35,8 +35,14 @@ namespace
{ {
// Largest code point that fits in a single UTF-16 code unit. // Largest code point that fits in a single UTF-16 code unit.
const char32_t max_single_utf16_unit = 0xFFFF; const char32_t max_single_utf16_unit = 0xFFFF;
const char32_t max_code_point = 0x10FFFF; const char32_t max_code_point = 0x10FFFF;
// The functions below rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction).
const char32_t incomplete_mb_character = char32_t(-2);
const char32_t invalid_mb_sequence = char32_t(-1);
template<typename Elem> template<typename Elem>
struct range struct range
{ {
...@@ -131,13 +137,13 @@ namespace ...@@ -131,13 +137,13 @@ namespace
// Read a codepoint from a UTF-8 multibyte sequence. // Read a codepoint from a UTF-8 multibyte sequence.
// Updates from.next if the codepoint is not greater than maxcode. // Updates from.next if the codepoint is not greater than maxcode.
// Returns -1 if there is an invalid or incomplete multibyte character. // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
char32_t char32_t
read_utf8_code_point(range<const char>& from, unsigned long maxcode) read_utf8_code_point(range<const char>& from, unsigned long maxcode)
{ {
size_t avail = from.size(); const size_t avail = from.size();
if (avail == 0) if (avail == 0)
return -1; return incomplete_mb_character;
unsigned char c1 = from.next[0]; unsigned char c1 = from.next[0];
// https://en.wikipedia.org/wiki/UTF-8#Sample_code // https://en.wikipedia.org/wiki/UTF-8#Sample_code
if (c1 < 0x80) if (c1 < 0x80)
...@@ -146,14 +152,14 @@ namespace ...@@ -146,14 +152,14 @@ namespace
return c1; return c1;
} }
else if (c1 < 0xC2) // continuation or overlong 2-byte sequence else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
return -1; return invalid_mb_sequence;
else if (c1 < 0xE0) // 2-byte sequence else if (c1 < 0xE0) // 2-byte sequence
{ {
if (avail < 2) if (avail < 2)
return -1; return incomplete_mb_character;
unsigned char c2 = from.next[1]; unsigned char c2 = from.next[1];
if ((c2 & 0xC0) != 0x80) if ((c2 & 0xC0) != 0x80)
return -1; return invalid_mb_sequence;
char32_t c = (c1 << 6) + c2 - 0x3080; char32_t c = (c1 << 6) + c2 - 0x3080;
if (c <= maxcode) if (c <= maxcode)
from.next += 2; from.next += 2;
...@@ -162,15 +168,15 @@ namespace ...@@ -162,15 +168,15 @@ namespace
else if (c1 < 0xF0) // 3-byte sequence else if (c1 < 0xF0) // 3-byte sequence
{ {
if (avail < 3) if (avail < 3)
return -1; return incomplete_mb_character;
unsigned char c2 = from.next[1]; unsigned char c2 = from.next[1];
if ((c2 & 0xC0) != 0x80) if ((c2 & 0xC0) != 0x80)
return -1; return invalid_mb_sequence;
if (c1 == 0xE0 && c2 < 0xA0) // overlong if (c1 == 0xE0 && c2 < 0xA0) // overlong
return -1; return invalid_mb_sequence;
unsigned char c3 = from.next[2]; unsigned char c3 = from.next[2];
if ((c3 & 0xC0) != 0x80) if ((c3 & 0xC0) != 0x80)
return -1; return invalid_mb_sequence;
char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080; char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
if (c <= maxcode) if (c <= maxcode)
from.next += 3; from.next += 3;
...@@ -179,27 +185,27 @@ namespace ...@@ -179,27 +185,27 @@ namespace
else if (c1 < 0xF5) // 4-byte sequence else if (c1 < 0xF5) // 4-byte sequence
{ {
if (avail < 4) if (avail < 4)
return -1; return incomplete_mb_character;
unsigned char c2 = from.next[1]; unsigned char c2 = from.next[1];
if ((c2 & 0xC0) != 0x80) if ((c2 & 0xC0) != 0x80)
return -1; return invalid_mb_sequence;
if (c1 == 0xF0 && c2 < 0x90) // overlong if (c1 == 0xF0 && c2 < 0x90) // overlong
return -1; return invalid_mb_sequence;
if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
return -1; return invalid_mb_sequence;
unsigned char c3 = from.next[2]; unsigned char c3 = from.next[2];
if ((c3 & 0xC0) != 0x80) if ((c3 & 0xC0) != 0x80)
return -1; return invalid_mb_sequence;
unsigned char c4 = from.next[3]; unsigned char c4 = from.next[3];
if ((c4 & 0xC0) != 0x80) if ((c4 & 0xC0) != 0x80)
return -1; return invalid_mb_sequence;
char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080; char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
if (c <= maxcode) if (c <= maxcode)
from.next += 4; from.next += 4;
return c; return c;
} }
else // > U+10FFFF else // > U+10FFFF
return -1; return invalid_mb_sequence;
} }
bool bool
...@@ -250,27 +256,54 @@ namespace ...@@ -250,27 +256,54 @@ namespace
#endif #endif
} }
// Report whether c lies in the high-surrogate (leading) range
// U+D800..U+DBFF.  That range is exactly 1024 values wide and starts on
// a multiple of 1024, so every member shares the same bits above the
// low ten and a single shift-and-compare identifies it.
inline bool
is_high_surrogate(char32_t c)
{
  const char32_t block = c >> 10;
  return block == (0xD800 >> 10);
}
// Report whether c lies in the low-surrogate (trailing) range
// U+DC00..U+DFFF.  That range is exactly 1024 values wide and starts on
// a multiple of 1024, so every member shares the same bits above the
// low ten and a single shift-and-compare identifies it.
inline bool
is_low_surrogate(char32_t c)
{
  const char32_t block = c >> 10;
  return block == (0xDC00 >> 10);
}
// Combine a UTF-16 surrogate pair into the code point it encodes.
// The high surrogate carries the upper ten bits and the low surrogate
// the lower ten bits of (code point - 0x10000).
// No validation is done here: the arguments are used as given, so the
// result is only meaningful when high and low really are a high and a
// low surrogate respectively.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  const char32_t upper_bits = (high - 0xD800) << 10;
  const char32_t lower_bits = low - 0xDC00;
  return 0x10000 + upper_bits + lower_bits;
}
// Read a codepoint from a UTF-16 multibyte sequence. // Read a codepoint from a UTF-16 multibyte sequence.
// The sequence's endianness is indicated by (mode & little_endian). // The sequence's endianness is indicated by (mode & little_endian).
// Updates from.next if the codepoint is not greater than maxcode. // Updates from.next if the codepoint is not greater than maxcode.
// Returns -1 if there is an incomplete multibyte character. // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
char32_t char32_t
read_utf16_code_point(range<const char16_t>& from, unsigned long maxcode, read_utf16_code_point(range<const char16_t>& from, unsigned long maxcode,
codecvt_mode mode) codecvt_mode mode)
{ {
const size_t avail = from.size();
if (avail == 0)
return incomplete_mb_character;
int inc = 1; int inc = 1;
char32_t c = adjust_byte_order(from.next[0], mode); char32_t c = adjust_byte_order(from.next[0], mode);
if (c >= 0xD800 && c <= 0xDBFF) if (is_high_surrogate(c))
{ {
if (from.size() < 2) if (avail < 2)
return -1; return incomplete_mb_character;
const char16_t c2 = adjust_byte_order(from.next[1], mode); const char16_t c2 = adjust_byte_order(from.next[1], mode);
if (c2 >= 0xDC00 && c2 <= 0xDFFF) if (is_low_surrogate(c2))
{ {
c = (c << 10) + c2 - 0x35FDC00; c = surrogate_pair_to_code_point(c, c2);
inc = 2; inc = 2;
} }
else
return invalid_mb_sequence;
} }
else if (is_low_surrogate(c))
return invalid_mb_sequence;
if (c <= maxcode) if (c <= maxcode)
from.next += inc; from.next += inc;
return c; return c;
...@@ -314,8 +347,8 @@ namespace ...@@ -314,8 +347,8 @@ namespace
while (from.size() && to.size()) while (from.size() && to.size())
{ {
const char32_t codepoint = read_utf8_code_point(from, maxcode); const char32_t codepoint = read_utf8_code_point(from, maxcode);
if (codepoint == char32_t(-1)) if (codepoint == incomplete_mb_character)
break; return codecvt_base::partial;
if (codepoint > maxcode) if (codepoint > maxcode)
return codecvt_base::error; return codecvt_base::error;
*to.next++ = codepoint; *to.next++ = codepoint;
...@@ -352,8 +385,8 @@ namespace ...@@ -352,8 +385,8 @@ namespace
while (from.size() && to.size()) while (from.size() && to.size())
{ {
const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
if (codepoint == char32_t(-1)) if (codepoint == incomplete_mb_character)
break; return codecvt_base::partial;
if (codepoint > maxcode) if (codepoint > maxcode)
return codecvt_base::error; return codecvt_base::error;
*to.next++ = codepoint; *to.next++ = codepoint;
...@@ -389,11 +422,9 @@ namespace ...@@ -389,11 +422,9 @@ namespace
read_utf8_bom(from, mode); read_utf8_bom(from, mode);
while (from.size() && to.size()) while (from.size() && to.size())
{ {
const char* first = from.next; const char* const first = from.next;
if ((unsigned char)*first >= 0xF0 && to.size() < 2)
return codecvt_base::partial;
const char32_t codepoint = read_utf8_code_point(from, maxcode); const char32_t codepoint = read_utf8_code_point(from, maxcode);
if (codepoint == char32_t(-1)) if (codepoint == incomplete_mb_character)
return codecvt_base::partial; return codecvt_base::partial;
if (codepoint > maxcode) if (codepoint > maxcode)
return codecvt_base::error; return codecvt_base::error;
...@@ -418,20 +449,22 @@ namespace ...@@ -418,20 +449,22 @@ namespace
{ {
char32_t c = from.next[0]; char32_t c = from.next[0];
int inc = 1; int inc = 1;
if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair if (is_high_surrogate(c))
{ {
if (from.size() < 2) if (from.size() < 2)
return codecvt_base::ok; // stop converting at this point return codecvt_base::ok; // stop converting at this point
const char32_t c2 = from.next[1]; const char32_t c2 = from.next[1];
if (c2 >= 0xDC00 && c2 <= 0xDFFF) if (is_low_surrogate(c2))
{ {
c = surrogate_pair_to_code_point(c, c2);
inc = 2; inc = 2;
c = (c << 10) + c2 - 0x35FDC00;
} }
else else
return codecvt_base::error; return codecvt_base::error;
} }
else if (is_low_surrogate(c))
return codecvt_base::error;
if (c > maxcode) if (c > maxcode)
return codecvt_base::error; return codecvt_base::error;
if (!write_utf8_code_point(to, c)) if (!write_utf8_code_point(to, c))
...@@ -452,8 +485,8 @@ namespace ...@@ -452,8 +485,8 @@ namespace
while (count+1 < max) while (count+1 < max)
{ {
char32_t c = read_utf8_code_point(from, maxcode); char32_t c = read_utf8_code_point(from, maxcode);
if (c == char32_t(-1)) if (c > maxcode)
break; return from.next;
else if (c > max_single_utf16_unit) else if (c > max_single_utf16_unit)
++count; ++count;
++count; ++count;
...@@ -489,7 +522,7 @@ namespace ...@@ -489,7 +522,7 @@ namespace
while (from.size() && to.size()) while (from.size() && to.size())
{ {
char16_t c = from.next[0]; char16_t c = from.next[0];
if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair if (is_high_surrogate(c))
return codecvt_base::error; return codecvt_base::error;
if (c > maxcode) if (c > maxcode)
return codecvt_base::error; return codecvt_base::error;
...@@ -510,9 +543,9 @@ namespace ...@@ -510,9 +543,9 @@ namespace
while (from.size() && to.size()) while (from.size() && to.size())
{ {
const char32_t c = read_utf16_code_point(from, maxcode, mode); const char32_t c = read_utf16_code_point(from, maxcode, mode);
if (c == char32_t(-1)) if (c == incomplete_mb_character)
break; return codecvt_base::partial;
if (c >= maxcode) if (c > maxcode)
return codecvt_base::error; return codecvt_base::error;
*to.next++ = c; *to.next++ = c;
} }
......
...@@ -79,8 +79,7 @@ test01() ...@@ -79,8 +79,7 @@ test01()
codecvt_c16::state_type state01; codecvt_c16::state_type state01;
state01 = {}; state01 = {};
codecvt_base::result res = cvt->out(state01, u16dat, u16dat_end, codecvt_base::result res = cvt->out(state01, u16dat, u16dat_end, from_next,
from_next,
buffer, buffer_end, to_next); buffer, buffer_end, to_next);
VERIFY(res == codecvt_base::ok); VERIFY(res == codecvt_base::ok);
......
// { dg-options "-std=gnu++11" }
// Copyright (C) 2012 Free Software Foundation
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 22.3.3.2.3 Buffer conversions
#include <locale>
#include <sstream>
#include <testsuite_hooks.h>
// Conversion facet used by the tests below: std::codecvt between Elem
// and char.  Deriving from std::codecvt yields a type with a public
// destructor (std::codecvt's own destructor is protected), which the
// converter needs in order to create and destroy the facet it owns.
template<typename Elem>
struct cvt : std::codecvt<Elem, char, std::mbstate_t> { };
// Shorthand for a std::wbuffer_convert that uses cvt<Elem> as its
// conversion facet and Elem as its wide character type.
template<typename Elem>
using buf_conv = std::wbuffer_convert<cvt<Elem>, Elem>;
using std::string;
using std::stringstream;
using std::wstring;
using std::wstringstream;
void test01()
{
buf_conv<wchar_t> buf;
std::stringbuf sbuf;
VERIFY( buf.rdbuf() == nullptr );
VERIFY( buf.rdbuf(&sbuf) == nullptr );
VERIFY( buf.rdbuf() == &sbuf );
VERIFY( buf.rdbuf(nullptr) == &sbuf );
}
void test02()
{
std::stringbuf sbuf;
buf_conv<char> buf(&sbuf); // noconv
stringstream ss;
ss.std::ios::rdbuf(&buf);
string input = "King for a day...";
ss << input << std::flush;
string output = sbuf.str();
VERIFY( input == output );
}
void test03()
{
std::stringbuf sbuf;
buf_conv<wchar_t> buf(&sbuf);
wstringstream ss;
ss.std::wios::rdbuf(&buf);
wstring input = L"Fool for a lifetime";
ss << input << std::flush;
string output = sbuf.str();
VERIFY( output == "Fool for a lifetime" );
}
// Run every buffer-conversion test in order.
int main()
{
  test01();
  test02();
  test03();
  return 0;
}
...@@ -30,26 +30,43 @@ template<typename Elem> ...@@ -30,26 +30,43 @@ template<typename Elem>
using str_conv = std::wstring_convert<cvt<Elem>, Elem>; using str_conv = std::wstring_convert<cvt<Elem>, Elem>;
using std::string; using std::string;
using std::wstring; using std::u16string;
using std::u32string;
// test conversion errors, with and without error strings // test conversion errors, with and without error strings
void test01() void test01()
{ {
typedef str_conv<wchar_t> sc; typedef str_conv<char16_t> sc;
const sc::byte_string berr = "invalid wide string"; const sc::byte_string berr = "invalid wide string";
const sc::wide_string werr = L"invalid byte string"; const sc::wide_string werr = u"invalid byte string";
sc c(berr, werr); sc c(berr, werr);
string input = "Stop"; string input = "Stop";
input += char(0xFF);
u16string woutput = c.from_bytes(input);
VERIFY( werr == woutput );
u16string winput = u"Stop";
winput += char16_t(0xDC00);
string output = c.to_bytes(winput);
VERIFY( berr == output );
}
void test02()
{
typedef str_conv<char32_t> sc;
const sc::byte_string berr = "invalid wide string";
const sc::wide_string werr = U"invalid byte string";
sc c(berr, werr);
string input = "Halt";
input += char(0xff); input += char(0xff);
input += char(0xff); u32string woutput = c.from_bytes(input);
wstring woutput = c.from_bytes(input);
VERIFY( werr == woutput ); VERIFY( werr == woutput );
wstring winput = L"Stop"; u32string winput = U"Halt";
winput += wchar_t(0xff); winput += char32_t(-1);
winput += wchar_t(0xff);
string output = c.to_bytes(winput); string output = c.to_bytes(winput);
VERIFY( berr == output ); VERIFY( berr == output );
} }
...@@ -57,4 +74,5 @@ void test01() ...@@ -57,4 +74,5 @@ void test01()
int main() int main()
{ {
test01(); test01();
test02();
} }
// { dg-options "-std=gnu++11" }
// Copyright (C) 2012 Free Software Foundation
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 22.3.3.2.2 String conversions
#include <locale>
#include <string>
#include <testsuite_hooks.h>
// Conversion facet used by the test below: std::codecvt between Elem
// and char.  Deriving from std::codecvt yields a type with a public
// destructor (std::codecvt's own destructor is protected), which the
// converter needs in order to create and destroy the facet it owns.
template<typename Elem>
struct cvt : std::codecvt<Elem, char, std::mbstate_t> { };
// Shorthand for a std::wstring_convert that uses cvt<Elem> as its
// conversion facet and Elem as its wide character type.
template<typename Elem>
using str_conv = std::wstring_convert<cvt<Elem>, Elem>;
using std::string;
using std::u32string;
// test construction with state, for partial conversions
void test01()
{
typedef str_conv<char32_t> wsc;
wsc c;
string input = u8"\u00a3 shillings pence";
u32string woutput = c.from_bytes(input.substr(0, 1));
auto partial_state = c.state();
auto partial_count = c.converted();
auto woutput2 = c.from_bytes("state reset on next conversion");
VERIFY( woutput2 == U"state reset on next conversion" );
wsc c2(new cvt<char32_t>, partial_state);
woutput += c2.from_bytes(input.substr(partial_count));
VERIFY( U"\u00a3 shillings pence" == woutput );
string roundtrip = c2.to_bytes(woutput);
VERIFY( input == roundtrip );
}
// Run the partial-conversion test.
int main()
{
  test01();
  return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment