utf_low_level.cpp (6435B)
1 #include "libshit/utf_low_level.hpp" 2 3 #include "libshit/doctest.hpp" 4 #include "libshit/nonowning_string.hpp" 5 #include "libshit/shared_ptr.hpp" 6 7 #include <string> 8 #include <vector> 9 10 namespace Libshit::Utf::Test 11 { 12 TEST_SUITE_BEGIN("Libshit::Utf"); 13 14 TEST_CASE("Get") 15 { 16 std::vector<int> in{1,2,3}, o0, o1; 17 RunColl(in, PushBackSink(std::vector<int>{})); // by value - mostly useless 18 RunColl(in, PushBackSink(std::ref(o0))); // reference_wrapper 19 CHECK(o0 == std::vector<int>{1,2,3}); 20 RunColl(in, PushBackSink(&o1)); // ptr 21 CHECK(o1 == std::vector<int>{1,2,3}); 22 auto o2 = MakeShared<std::vector<int>>(); 23 RunColl(in, PushBackSink(o2)); 24 CHECK(*o2 == std::vector<int>{1,2,3}); 25 } 26 27 TEST_CASE("Map") 28 { 29 auto fun = [](int x) { return x+1; }; 30 std::vector<int> in{2,5,-3}, out; 31 RunColl(in, Map(fun, PushBackSink(std::ref(out)))); 32 CHECK(out == std::vector<int>{3,6,-2}); 33 } 34 35 TEST_CASE("ReplaceInvalid") 36 { 37 std::u32string out; 38 auto run = [&](U32StringView in) 39 { 40 out.clear(); 41 RunColl(in, ReplaceInvalid(PushBackSink(std::ref(out)))); 42 }; 43 44 // valid cases 45 run(U"abc"_ns); CHECK(out == U"abc"_ns); 46 run(U"猫"_ns); CHECK(out == U"猫"_ns); 47 run(U"💩"_ns); CHECK(out == U"💩"_ns); 48 run(U"\xd799"_ns); CHECK(out == U"\xd799"_ns); 49 run(U"\xe000"_ns); CHECK(out == U"\xe000"_ns); 50 run(U"\x10ffff"_ns); CHECK(out == U"\x10ffff"_ns); 51 52 // invalid cases 53 run(U"\xd800"_ns); CHECK(out == U"�"_ns); 54 run(U"\xd812"_ns); CHECK(out == U"�"_ns); 55 run(U"\xdbff"_ns); CHECK(out == U"�"_ns); 56 run(U"\xdc00"_ns); CHECK(out == U"�"_ns); 57 run(U"\xdfff"_ns); CHECK(out == U"�"_ns); 58 run(U"\x110000"_ns); CHECK(out == U"�"_ns); 59 run(U"\xffffffff"_ns); CHECK(out == U"�"_ns); 60 // this is valid utf-16 (surrogate pairs), but not 32 61 run(U"\xd83d\xdca9"_ns); CHECK(out == U"��"_ns); 62 63 // mixed 64 run(U"!\xd800!"_ns); CHECK(out == U"!�!"_ns); 65 run(U"\x110000?"_ns); CHECK(out == U"�?"_ns); 66 } 67 68 TEST_CASE("ToWtf8Gen") 69 { 70 auto test = [](U32StringView in, StringView exp) 71 { 72 std::string out; 73 RunColl(in, ToWtf8(PushBackSink(std::ref(out)))); 74 CHECK(NonowningString{out} == exp); 75 out.clear(); 76 RunColl(in, ToWtf8Overlong(PushBackSink(std::ref(out)))); 77 CHECK(NonowningString{out} == exp); 78 }; 79 80 // ascii 81 test(U"abcd"_ns, "abcd"_ns); 82 test(U"\1\2\33\x7f"_ns, "\1\2\33\x7f"_ns); 83 // two-byte utf8 84 test(U"\x80"_ns, u8"\u0080"_ns); 85 test(U"\x81"_ns, "\xc2\x81"_ns); 86 test(U"äߨ"_ns, u8"äߨ"_ns); 87 test(U"߿"_ns, u8"߿"_ns); // 0x7ff 88 // three-byte utf8 89 test(U"ࠀ"_ns, u8"ࠀ"_ns); // 0x800 90 test(U"猫ニャーニャー"_ns, u8"猫ニャーニャー"_ns); 91 test(U"\xffff"_ns, "\xef\xbf\xbf"_ns); 92 // four-byte utf-8 93 test(U"𐀀"_ns, u8"𐀀"_ns); // 0x010000 94 test(U"😂💩🤮"_ns, u8"😂💩🤮"_ns); 95 test(U"\x10ffff"_ns, u8"\U0010ffff"_ns); 96 // invalid 97 test(U"\x110000"_ns, u8"�"_ns); 98 test(U"\xffffffff"_ns, u8"�"_ns); 99 100 test(U"\xd83e"_ns, "\xed\xa0\xbe"_ns); // wtf-16 half surrogate pair 101 test(U"\xd83e\xdd2e"_ns, "\xed\xa0\xbe""\xed\xb4\xae"_ns); // cesu-8 102 103 // check overlong 0 104 std::string out; 105 RunColl(U"a\0b"_ns, ToWtf8(PushBackSink(std::ref(out)))); 106 CHECK(out == "a\0b"_ns); 107 out.clear(); 108 RunColl(U"a\0b"_ns, ToWtf8Overlong(PushBackSink(std::ref(out)))); 109 CHECK(out == "a\xc0\x80""b"_ns); 110 } 111 112 TEST_CASE("ToWtf16") 113 { 114 std::u16string out; 115 auto run = [&](U32StringView in) 116 { 117 out.clear(); 118 RunColl(in, ToWtf16(PushBackSink(std::ref(out)))); 119 }; 120 121 run(U"abc"_ns); CHECK(out == u"abc"_ns); 122 run(U"\0\1"_ns); CHECK(out == u"\0\1"_ns); 123 // cp < 0x10000 are left alone 124 run(U"\xd83e\xffff"_ns); CHECK(out == u"\xd83e\xffff"_ns); 125 // cp >= 0x10000 converted to surrogate pairs 126 run(U"𐀀"_ns); CHECK(out == u"𐀀"_ns); 127 run(U"😂💩🤮"_ns); CHECK(out == u"😂💩🤮"_ns); 128 run(U"\x10ffff"_ns); CHECK(out == u"\U0010ffff"_ns); 129 // invalid 130 run(U"\x110000\xffffffff"_ns); CHECK(out == u"��"_ns); 131 } 132 133 TEST_CASE("FromWtf8") 134 { 135 auto check = [](StringView in, U32StringView exp) 136 { 137 CAPTURE(doctest::toString(in)); 138 std::u32string out; 139 RunColl(in, FromWtf8(PushBackSink(std::ref(out)))); 140 CHECK(NonowningU32String{out} == exp); 141 }; 142 143 check(u8""_ns, U""_ns); 144 check(u8"abc"_ns, U"abc"_ns); 145 check(u8"猫"_ns, U"猫"_ns); 146 check(u8"💩x"_ns, U"💩x"_ns); 147 check(u8"[💩]"_ns, U"[💩]"_ns); 148 check("\xff"_ns, U"�"_ns); 149 150 check("\xc3"_ns, U"�"_ns); 151 check("\xc3?"_ns, U"�?"_ns); 152 check("\xc3\xa1"_ns, U"á"_ns); 153 check("\xe7"_ns, U"�"_ns); 154 check("a\xe7"_ns, U"a�"_ns); 155 check("\xe7\x8c"_ns, U"�"_ns); 156 check("\xe7""a\x8c"_ns, U"�a�"_ns); 157 check("\xe7\x8c\xab"_ns, U"猫"_ns); 158 check("\x8c\xab"_ns, U"��"_ns); 159 check("\xf0"_ns, U"�"_ns); 160 check("\xf0\x9f"_ns, U"�"_ns); 161 check("\xf0\x9f\x92"_ns, U"�"_ns); 162 check("\xf0\x9f\x92\xa9"_ns, U"💩"_ns); 163 164 check(u8"Ǐ옾⦆뺍圆𑄟𐰯🦝𘉾砓"_ns, 165 U"Ǐ옾⦆뺍圆𑄟𐰯🦝𘉾砓"_ns); 166 167 // overlong 0 is supported 168 check("\xc0\x80"_ns, U"\0"_ns); 169 // this is utf-8, not wtf-8 170 check("\xed\xa0\xbd"_ns, U"\xd83d"_ns); 171 check("\xed\xb2\xa9"_ns, U"\xdca9"_ns); 172 // CESU-8 half-decode 173 check("\xed\xa0\xbd\xed\xb2\xa9"_ns, U"\xd83d\xdca9"_ns); 174 // TODO: post unicode (0x110000) 175 // check("\xf4\x90\x80\x80"_ns, U"�"_ns); 176 } 177 178 TEST_CASE("FromWtf16") 179 { 180 std::u32string out; 181 auto run = [&](U16StringView in) 182 { 183 out.clear(); 184 RunColl(in, FromWtf16(PushBackSink(std::ref(out)))); 185 }; 186 187 // valid 188 run(u"abcd"_ns); CHECK(out == U"abcd"_ns); 189 run(u"💩"_ns); CHECK(out == U"💩"_ns); 190 // missing surrogate pairs 191 run(u"\xd83d"_ns); CHECK(out == U"\xd83d"_ns); 192 run(u"\xd83dx"_ns); CHECK(out == U"\xd83dx"_ns); 193 run(u"\xdca9"_ns); CHECK(out == U"\xdca9"_ns); 194 run(u"x\xdca9"_ns); CHECK(out == U"x\xdca9"_ns); 195 run(u"\xdca9\xd83d"_ns); CHECK(out == U"\xdca9\xd83d"_ns); 196 197 // also works with UTF-32 input 198 out.clear(); 199 RunColl(U"\xd83d\xdca9💩ab\xd83d"_ns, FromWtf16(PushBackSink(std::ref(out)))); 200 CHECK(out == U"💩💩ab\xd83d"_ns); 201 } 202 203 TEST_SUITE_END(); 204 }