// libshit/test/utf_low_level.cpp
// Unit tests for the low-level UTF conversion helpers.

#include "libshit/utf_low_level.hpp"
#include "libshit/doctest.hpp"
#include "libshit/nonowning_string.hpp"
#include "libshit/shared_ptr.hpp"
#include <string>
#include <vector>
namespace Libshit::Utf::Test
{
TEST_SUITE_BEGIN("Libshit::Utf");
TEST_CASE("Get")
{
  // PushBackSink is exercised with every way of handing it a container:
  // by value, through std::reference_wrapper, through a raw pointer and
  // through a shared pointer created by MakeShared.
  std::vector<int> in{1,2,3};
  std::vector<int> expected{1,2,3};

  // by value - mostly useless, the filled container is not observable here
  RunColl(in, PushBackSink(std::vector<int>{}));

  // reference_wrapper
  std::vector<int> via_ref;
  RunColl(in, PushBackSink(std::ref(via_ref)));
  CHECK(via_ref == expected);

  // raw pointer
  std::vector<int> via_ptr;
  RunColl(in, PushBackSink(&via_ptr));
  CHECK(via_ptr == expected);

  // shared pointer
  auto via_shared = MakeShared<std::vector<int>>();
  RunColl(in, PushBackSink(via_shared));
  CHECK(*via_shared == expected);
}
TEST_CASE("Map")
{
  // Every input element goes through the mapping function before it is
  // handed to the downstream sink.
  std::vector<int> input{2,5,-3};
  std::vector<int> mapped;
  auto increment = [](int value) { return value + 1; };

  RunColl(input, Map(increment, PushBackSink(std::ref(mapped))));

  std::vector<int> expected{3,6,-2};
  CHECK(mapped == expected);
}
TEST_CASE("ReplaceInvalid")
{
  // Runs `in` through ReplaceInvalid and leaves the produced code points in
  // `out` (cleared before every run) so the CHECK next to each call can
  // inspect it.
  std::u32string out;
  auto run = [&](U32StringView in)
  {
    out.clear();
    RunColl(in, ReplaceInvalid(PushBackSink(std::ref(out))));
  };

  // valid cases: expected to pass through unchanged
  run(U"abc"_ns); CHECK(out == U"abc"_ns);
  run(U""_ns); CHECK(out == U""_ns);
  run(U"💩"_ns); CHECK(out == U"💩"_ns);
  run(U"\xd799"_ns); CHECK(out == U"\xd799"_ns);
  run(U"\xe000"_ns); CHECK(out == U"\xe000"_ns);
  run(U"\x10ffff"_ns); CHECK(out == U"\x10ffff"_ns);

  // invalid cases: surrogates and values past 0x10ffff are expected to turn
  // into U+FFFD REPLACEMENT CHARACTER. The expectation is written as an
  // escape so it does not depend on the source file's encoding.
  run(U"\xd800"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xd812"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xdbff"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xdc00"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xdfff"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\x110000"_ns); CHECK(out == U"\ufffd"_ns);
  run(U"\xffffffff"_ns); CHECK(out == U"\ufffd"_ns);

  // this is valid utf-16 (surrogate pairs), but not 32: each half is
  // replaced on its own
  run(U"\xd83d\xdca9"_ns); CHECK(out == U"\ufffd\ufffd"_ns);

  // mixed: only the invalid code point is replaced, neighbours are kept
  run(U"!\xd800!"_ns); CHECK(out == U"!\ufffd!"_ns);
  run(U"\x110000?"_ns); CHECK(out == U"\ufffd?"_ns);
}
TEST_CASE("ToWtf8Gen")
{
  // Encodes `in` with both ToWtf8 and ToWtf8Overlong and checks that each
  // produces `exp`. Only usable for inputs where the two encoders agree;
  // the NUL case, where they differ, is checked separately at the end.
  auto test = [](U32StringView in, StringView exp)
  {
    std::string out;
    RunColl(in, ToWtf8(PushBackSink(std::ref(out))));
    CHECK(NonowningString{out} == exp);
    out.clear();
    RunColl(in, ToWtf8Overlong(PushBackSink(std::ref(out))));
    CHECK(NonowningString{out} == exp);
  };

  // ascii
  test(U"abcd"_ns, "abcd"_ns);
  test(U"\1\2\33\x7f"_ns, "\1\2\33\x7f"_ns);

  // two-byte utf8
  test(U"\x80"_ns, u8"\u0080"_ns);
  test(U"\x81"_ns, "\xc2\x81"_ns);
  test(U"äßØ"_ns, u8"äßØ"_ns);
  test(U"߿"_ns, u8"߿"_ns); // 0x7ff

  // three-byte utf8
  // Written as escapes: the literal U+0800 character did not survive in the
  // source, which silently turned this boundary case into an empty string.
  test(U"\u0800"_ns, u8"\u0800"_ns); // 0x800
  test(U"猫ニャーニャー"_ns, u8"猫ニャーニャー"_ns);
  test(U"\xffff"_ns, "\xef\xbf\xbf"_ns);

  // four-byte utf-8
  test(U"𐀀"_ns, u8"𐀀"_ns); // 0x010000
  test(U"😂💩🤮"_ns, u8"😂💩🤮"_ns);
  test(U"\x10ffff"_ns, u8"\U0010ffff"_ns);

  // invalid: values past 0x10ffff are expected to encode as U+FFFD
  test(U"\x110000"_ns, u8"\ufffd"_ns);
  test(U"\xffffffff"_ns, u8"\ufffd"_ns);
  // surrogates are encoded as-is (three bytes each) instead of being replaced
  test(U"\xd83e"_ns, "\xed\xa0\xbe"_ns); // wtf-16 half surrogate pair
  test(U"\xd83e\xdd2e"_ns, "\xed\xa0\xbe""\xed\xb4\xae"_ns); // cesu-8

  // check overlong 0: ToWtf8 emits a plain NUL byte, ToWtf8Overlong emits
  // the two-byte sequence C0 80 instead
  std::string out;
  RunColl(U"a\0b"_ns, ToWtf8(PushBackSink(std::ref(out))));
  CHECK(out == "a\0b"_ns);
  out.clear();
  RunColl(U"a\0b"_ns, ToWtf8Overlong(PushBackSink(std::ref(out))));
  CHECK(out == "a\xc0\x80""b"_ns);
}
TEST_CASE("ToWtf16")
{
  // Runs `in` through ToWtf16 and leaves the produced UTF-16 code units in
  // `out` (cleared before every run).
  std::u16string out;
  auto run = [&](U32StringView in)
  {
    out.clear();
    RunColl(in, ToWtf16(PushBackSink(std::ref(out))));
  };

  run(U"abc"_ns); CHECK(out == u"abc"_ns);
  run(U"\0\1"_ns); CHECK(out == u"\0\1"_ns);

  // cp < 0x10000 are left alone (including lone surrogates)
  run(U"\xd83e\xffff"_ns); CHECK(out == u"\xd83e\xffff"_ns);

  // cp >= 0x10000 converted to surrogate pairs
  run(U"𐀀"_ns); CHECK(out == u"𐀀"_ns);
  run(U"😂💩🤮"_ns); CHECK(out == u"😂💩🤮"_ns);
  run(U"\x10ffff"_ns); CHECK(out == u"\U0010ffff"_ns);

  // invalid: each value past 0x10ffff is expected to become one U+FFFD.
  // Written as escapes so the expectation does not depend on file encoding.
  run(U"\x110000\xffffffff"_ns); CHECK(out == u"\ufffd\ufffd"_ns);
}
TEST_CASE("FromWtf8")
{
  // Decodes the byte string `in` with FromWtf8 and checks that the resulting
  // code points equal `exp`. The input is captured so a failing CHECK
  // reports which case it belongs to.
  auto check = [](StringView in, U32StringView exp)
  {
    CAPTURE(doctest::toString(in));
    std::u32string out;
    RunColl(in, FromWtf8(PushBackSink(std::ref(out))));
    CHECK(NonowningU32String{out} == exp);
  };

  // well-formed input
  check(u8""_ns, U""_ns);
  check(u8"abc"_ns, U"abc"_ns);
  // NOTE(review): this literal was empty in the source (a duplicate of the
  // first case); presumably a three-byte character was lost. Restored as
  // U+732B to match the byte-level case below -- confirm against upstream.
  check(u8"\u732b"_ns, U"\u732b"_ns);
  check(u8"💩x"_ns, U"💩x"_ns);
  check(u8"[💩]"_ns, U"[💩]"_ns);

  // Malformed / truncated sequences are expected to decode to U+FFFD,
  // written as an escape so the expectation is encoding-independent.
  check("\xff"_ns, U"\ufffd"_ns);

  // two-byte sequence, truncated and complete
  check("\xc3"_ns, U"\ufffd"_ns);
  check("\xc3?"_ns, U"\ufffd?"_ns);
  check("\xc3\xa1"_ns, U"á"_ns);

  // three-byte sequence: a truncated sequence yields a single U+FFFD, a
  // stray continuation byte yields one of its own
  check("\xe7"_ns, U"\ufffd"_ns);
  check("a\xe7"_ns, U"a\ufffd"_ns);
  check("\xe7\x8c"_ns, U"\ufffd"_ns);
  check("\xe7""a\x8c"_ns, U"\ufffd""a\ufffd"_ns);
  check("\xe7\x8c\xab"_ns, U"\u732b"_ns); // E7 8C AB is U+732B
  check("\x8c\xab"_ns, U"\ufffd\ufffd"_ns);

  // four-byte sequence, truncated at every length and complete
  check("\xf0"_ns, U"\ufffd"_ns);
  check("\xf0\x9f"_ns, U"\ufffd"_ns);
  check("\xf0\x9f\x92"_ns, U"\ufffd"_ns);
  check("\xf0\x9f\x92\xa9"_ns, U"💩"_ns);

  // longer text mixing sequences of different lengths
  check(u8"Ǐ옾⦆𞔔𑥹뺍圆𑄟𐰯𚁨𛅭𒜶🦝𘉾砓𑷄"_ns,
        U"Ǐ옾⦆𞔔𑥹뺍圆𑄟𐰯𚁨𛅭𒜶🦝𘉾砓𑷄"_ns);

  // overlong 0 is supported
  check("\xc0\x80"_ns, U"\0"_ns);

  // encoded lone surrogates: this is wtf-8, not valid utf-8; they decode to
  // the surrogate code point itself
  check("\xed\xa0\xbd"_ns, U"\xd83d"_ns);
  check("\xed\xb2\xa9"_ns, U"\xdca9"_ns);
  // CESU-8 half-decode: the two halves are not merged into one code point
  check("\xed\xa0\xbd\xed\xb2\xa9"_ns, U"\xd83d\xdca9"_ns);

  // TODO: post unicode (0x110000)
  // check("\xf4\x90\x80\x80"_ns, U"\ufffd"_ns);
}
TEST_CASE("FromWtf16")
{
  // `decoded` receives the code points produced by the most recent run.
  std::u32string decoded;
  auto decode = [&decoded](U16StringView units)
  {
    decoded.clear();
    RunColl(units, FromWtf16(PushBackSink(std::ref(decoded))));
  };

  // valid: plain BMP text and a proper surrogate pair
  decode(u"abcd"_ns);
  CHECK(decoded == U"abcd"_ns);
  decode(u"💩"_ns);
  CHECK(decoded == U"💩"_ns);

  // missing surrogate pairs: unpaired halves come out unchanged
  decode(u"\xd83d"_ns);
  CHECK(decoded == U"\xd83d"_ns);
  decode(u"\xd83dx"_ns);
  CHECK(decoded == U"\xd83dx"_ns);
  decode(u"\xdca9"_ns);
  CHECK(decoded == U"\xdca9"_ns);
  decode(u"x\xdca9"_ns);
  CHECK(decoded == U"x\xdca9"_ns);
  // low half followed by high half is the wrong order, so nothing is merged
  decode(u"\xdca9\xd83d"_ns);
  CHECK(decoded == U"\xdca9\xd83d"_ns);

  // also works with UTF-32 input: the leading pair is merged, code points
  // that are already above the BMP are kept, the trailing lone half stays
  decoded.clear();
  RunColl(U"\xd83d\xdca9💩ab\xd83d"_ns, FromWtf16(PushBackSink(std::ref(decoded))));
  CHECK(decoded == U"💩💩ab\xd83d"_ns);
}
TEST_SUITE_END();
}