|
|
#include "libshit/utf_low_level.hpp"
|
|
|
|
|
|
#include "libshit/doctest.hpp"
|
|
|
#include "libshit/nonowning_string.hpp"
|
|
|
#include "libshit/shared_ptr.hpp"
|
|
|
|
|
|
#include <string>
|
|
|
#include <vector>
|
|
|
|
|
|
namespace Libshit::Utf::Test
|
|
|
{
|
|
|
TEST_SUITE_BEGIN("Libshit::Utf");
|
|
|
|
|
|
// Feeds the same input through PushBackSink with the target container handed
// over in four different ways: by value, through std::ref, through a raw
// pointer and through an object created by MakeShared.
TEST_CASE("Get")
{
  std::vector<int> in{1,2,3};
  std::vector<int> via_ref;
  std::vector<int> via_ptr;
  const std::vector<int> expected{1,2,3};

  // by value - mostly useless, nothing is inspected afterwards
  RunColl(in, PushBackSink(std::vector<int>{}));

  // reference_wrapper
  RunColl(in, PushBackSink(std::ref(via_ref)));
  CHECK(via_ref == expected);

  // ptr
  RunColl(in, PushBackSink(&via_ptr));
  CHECK(via_ptr == expected);

  // object returned by MakeShared
  auto via_shared = MakeShared<std::vector<int>>();
  RunColl(in, PushBackSink(via_shared));
  CHECK(*via_shared == expected);
}
|
|
|
|
|
|
// Map must apply the given callable to every element before forwarding it to
// the downstream sink.
TEST_CASE("Map")
{
  auto increment = [](int value) { return value+1; };

  std::vector<int> in{2,5,-3};
  std::vector<int> mapped;
  const std::vector<int> expected{3,6,-2};

  RunColl(in, Map(increment, PushBackSink(std::ref(mapped))));
  CHECK(mapped == expected);
}
|
|
|
|
|
|
// Exercises ReplaceInvalid on UTF-32 input. According to the expectations
// below, code points inside the surrogate range (0xd800-0xdfff) and code
// points above 0x10ffff are turned into U+FFFD REPLACEMENT CHARACTER, while
// every other code point is passed through unchanged.
//
// U+FFFD is always spelled with the \ufffd escape, so the expected values do
// not depend on how this source file is encoded or transcoded.
TEST_CASE("ReplaceInvalid")
{
  std::u32string out;
  // Runs `in` through ReplaceInvalid, leaving the result in `out`.
  auto run = [&](U32StringView in)
  {
    out.clear();
    RunColl(in, ReplaceInvalid(PushBackSink(std::ref(out))));
  };

  // valid cases
  run(U"abc"_ns); CHECK(out == U"abc"_ns);
  run(U"猫"_ns); CHECK(out == U"猫"_ns);
  run(U"💩"_ns); CHECK(out == U"💩"_ns);
  run(U"\xd799"_ns); CHECK(out == U"\xd799"_ns);
  run(U"\xe000"_ns); CHECK(out == U"\xe000"_ns); // first cp after surrogates
  run(U"\x10ffff"_ns); CHECK(out == U"\x10ffff"_ns); // last valid cp

  // invalid cases
  run(U"\xd800"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xd812"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xdbff"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xdc00"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xdfff"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\x110000"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xffffffff"_ns); CHECK(out == U"\ufffd"_ns);
  // this is valid utf-16 (surrogate pairs), but not 32
  run(U"\xd83d\xdca9"_ns); CHECK(out == U"\ufffd\ufffd"_ns);

  // mixed: only the invalid code point is replaced, neighbours are kept
  run(U"!\xd800!"_ns); CHECK(out == U"!\ufffd!"_ns);
  run(U"\x110000?"_ns); CHECK(out == U"\ufffd?"_ns);
}
|
|
|
|
|
|
// Checks the UTF-32 -> WTF-8 encoders. ToWtf8 and ToWtf8Overlong are expected
// to produce the same bytes for every input in the table below; the only
// difference tested here is the encoding of U+0000 (see the end of the test).
//
// U+FFFD is spelled with the \ufffd escape, so the expected values do not
// depend on how this source file is encoded or transcoded.
TEST_CASE("ToWtf8Gen")
{
  // Encodes `in` with both encoders and compares each result with `exp`.
  auto test = [](U32StringView in, StringView exp)
  {
    std::string out;
    RunColl(in, ToWtf8(PushBackSink(std::ref(out))));
    CHECK(NonowningString{out} == exp);

    out.clear();
    RunColl(in, ToWtf8Overlong(PushBackSink(std::ref(out))));
    CHECK(NonowningString{out} == exp);
  };

  // ascii
  test(U"abcd"_ns, "abcd"_ns);
  test(U"\1\2\33\x7f"_ns, "\1\2\33\x7f"_ns);
  // two-byte utf8
  test(U"\x80"_ns, u8"\u0080"_ns);
  test(U"\x81"_ns, "\xc2\x81"_ns);
  test(U"äßØ"_ns, u8"äßØ"_ns);
  test(U"߿"_ns, u8"߿"_ns); // 0x7ff
  // three-byte utf8
  test(U"ࠀ"_ns, u8"ࠀ"_ns); // 0x800
  test(U"猫ニャーニャー"_ns, u8"猫ニャーニャー"_ns);
  test(U"\xffff"_ns, "\xef\xbf\xbf"_ns);
  // four-byte utf-8
  test(U"𐀀"_ns, u8"𐀀"_ns); // 0x010000
  test(U"😂💩🤮"_ns, u8"😂💩🤮"_ns);
  test(U"\x10ffff"_ns, u8"\U0010ffff"_ns);
  // invalid: anything above 0x10ffff is encoded as U+FFFD
  test(U"\x110000"_ns, u8"\ufffd"_ns);
  test(U"\xffffffff"_ns, u8"\ufffd"_ns);

  // surrogates are encoded as-is (three bytes each), not replaced
  test(U"\xd83e"_ns, "\xed\xa0\xbe"_ns); // wtf-16 half surrogate pair
  test(U"\xd83e\xdd2e"_ns, "\xed\xa0\xbe""\xed\xb4\xae"_ns); // cesu-8

  // check overlong 0: ToWtf8 emits a plain NUL byte, ToWtf8Overlong emits the
  // two-byte sequence c0 80 instead
  std::string out;
  RunColl(U"a\0b"_ns, ToWtf8(PushBackSink(std::ref(out))));
  CHECK(out == "a\0b"_ns);

  out.clear();
  RunColl(U"a\0b"_ns, ToWtf8Overlong(PushBackSink(std::ref(out))));
  // literal is split so that 'b' is not parsed as part of the \x80 escape
  CHECK(out == "a\xc0\x80""b"_ns);
}
|
|
|
|
|
|
// Checks the UTF-32 -> WTF-16 encoder: code units below 0x10000 (including
// lone surrogates) are copied, larger code points become surrogate pairs and
// values above 0x10ffff become U+FFFD.
//
// U+FFFD is spelled with the \ufffd escape, so the expected values do not
// depend on how this source file is encoded or transcoded.
TEST_CASE("ToWtf16")
{
  std::u16string out;
  // Runs `in` through ToWtf16, leaving the result in `out`.
  auto run = [&](U32StringView in)
  {
    out.clear();
    RunColl(in, ToWtf16(PushBackSink(std::ref(out))));
  };

  run(U"abc"_ns); CHECK(out == u"abc"_ns);
  run(U"\0\1"_ns); CHECK(out == u"\0\1"_ns);
  // cp < 0x10000 are left alone
  run(U"\xd83e\xffff"_ns); CHECK(out == u"\xd83e\xffff"_ns);
  // cp >= 0x10000 converted to surrogate pairs
  run(U"𐀀"_ns); CHECK(out == u"𐀀"_ns);
  run(U"😂💩🤮"_ns); CHECK(out == u"😂💩🤮"_ns);
  run(U"\x10ffff"_ns); CHECK(out == u"\U0010ffff"_ns);
  // invalid: one U+FFFD per out-of-range input code point
  run(U"\x110000\xffffffff"_ns); CHECK(out == u"\ufffd\ufffd"_ns);
}
|
|
|
|
|
|
// Checks the WTF-8 -> UTF-32 decoder on well-formed input, on truncated or
// otherwise malformed sequences (which are expected to decode to U+FFFD) and
// on the non-UTF-8 extensions that the decoder accepts (overlong NUL, encoded
// surrogates).
//
// U+FFFD is spelled with the \ufffd escape, so the expected values do not
// depend on how this source file is encoded or transcoded.
TEST_CASE("FromWtf8")
{
  // Decodes `in` and compares the result with `exp`; the input is captured so
  // that a failing check reports which string was being decoded.
  auto check = [](StringView in, U32StringView exp)
  {
    CAPTURE(doctest::toString(in));
    std::u32string out;
    RunColl(in, FromWtf8(PushBackSink(std::ref(out))));
    CHECK(NonowningU32String{out} == exp);
  };

  check(u8""_ns, U""_ns);
  check(u8"abc"_ns, U"abc"_ns);
  check(u8"猫"_ns, U"猫"_ns);
  check(u8"💩x"_ns, U"💩x"_ns);
  check(u8"[💩]"_ns, U"[💩]"_ns);
  check("\xff"_ns, U"\ufffd"_ns);

  // two-byte sequence: truncated, interrupted, complete
  check("\xc3"_ns, U"\ufffd"_ns);
  check("\xc3?"_ns, U"\ufffd?"_ns);
  check("\xc3\xa1"_ns, U"á"_ns);
  // three-byte sequence: a truncated sequence yields a single U+FFFD, a stray
  // continuation byte yields one U+FFFD on its own
  check("\xe7"_ns, U"\ufffd"_ns);
  check("a\xe7"_ns, U"a\ufffd"_ns);
  check("\xe7\x8c"_ns, U"\ufffd"_ns);
  check("\xe7""a\x8c"_ns, U"\ufffd" U"a\ufffd"_ns);
  check("\xe7\x8c\xab"_ns, U"猫"_ns);
  check("\x8c\xab"_ns, U"\ufffd\ufffd"_ns);
  // four-byte sequence
  check("\xf0"_ns, U"\ufffd"_ns);
  check("\xf0\x9f"_ns, U"\ufffd"_ns);
  check("\xf0\x9f\x92"_ns, U"\ufffd"_ns);
  check("\xf0\x9f\x92\xa9"_ns, U"💩"_ns);

  check(u8"Ǐ옾⦆뺍圆𑄟𐰯🦝𘉾砓"_ns,
        U"Ǐ옾⦆뺍圆𑄟𐰯🦝𘉾砓"_ns);

  // overlong 0 is supported
  check("\xc0\x80"_ns, U"\0"_ns);
  // three-byte encoded lone surrogates are decoded as-is instead of being
  // replaced (NOTE(review): the previous comment said "this is utf-8, not
  // wtf-8", which looks inverted - strict UTF-8 forbids these; confirm)
  check("\xed\xa0\xbd"_ns, U"\xd83d"_ns);
  check("\xed\xb2\xa9"_ns, U"\xdca9"_ns);
  // CESU-8 half-decode: the two encoded surrogates stay two code points, they
  // are not combined into one
  check("\xed\xa0\xbd\xed\xb2\xa9"_ns, U"\xd83d\xdca9"_ns);
  // TODO: post unicode (0x110000)
  // check("\xf4\x90\x80\x80"_ns, U"\ufffd"_ns);
}
|
|
|
|
|
|
// Checks the WTF-16 -> UTF-32 decoder: proper surrogate pairs are combined,
// unpaired or misordered surrogates are expected to come through unchanged.
TEST_CASE("FromWtf16")
{
  std::u32string decoded;
  // Resets `decoded` and fills it with the decoding of `units`.
  auto decode = [&decoded](U16StringView units)
  {
    decoded.clear();
    RunColl(units, FromWtf16(PushBackSink(std::ref(decoded))));
  };

  // valid
  decode(u"abcd"_ns);
  CHECK(decoded == U"abcd"_ns);
  decode(u"💩"_ns);
  CHECK(decoded == U"💩"_ns);

  // missing surrogate pairs
  decode(u"\xd83d"_ns);
  CHECK(decoded == U"\xd83d"_ns);
  decode(u"\xd83dx"_ns);
  CHECK(decoded == U"\xd83dx"_ns);
  decode(u"\xdca9"_ns);
  CHECK(decoded == U"\xdca9"_ns);
  decode(u"x\xdca9"_ns);
  CHECK(decoded == U"x\xdca9"_ns);
  decode(u"\xdca9\xd83d"_ns);
  CHECK(decoded == U"\xdca9\xd83d"_ns);

  // also works with UTF-32 input
  decoded.clear();
  RunColl(
    U"\xd83d\xdca9💩ab\xd83d"_ns, FromWtf16(PushBackSink(std::ref(decoded))));
  CHECK(decoded == U"💩💩ab\xd83d"_ns);
}
|
|
|
|
|
|
TEST_SUITE_END();
|
|
|
}
|