libshit

Just some random shit
git clone https://git.neptards.moe/neptards/libshit.git
Log | Files | Refs | Submodules | README | LICENSE

utf_low_level.cpp (6435B)


      1 #include "libshit/utf_low_level.hpp"
      2 
      3 #include "libshit/doctest.hpp"
      4 #include "libshit/nonowning_string.hpp"
      5 #include "libshit/shared_ptr.hpp"
      6 
      7 #include <string>
      8 #include <vector>
      9 
     10 namespace Libshit::Utf::Test
     11 {
     12   TEST_SUITE_BEGIN("Libshit::Utf");
     13 
     14   TEST_CASE("Get") // PushBackSink: exercises each way of handing it the output container
     15   {
     16     std::vector<int> in{1,2,3}, o0, o1;
     17     RunColl(in, PushBackSink(std::vector<int>{})); // by value - mostly useless: the container is a temporary, so there is nothing to CHECK afterwards
     18     RunColl(in, PushBackSink(std::ref(o0))); // reference_wrapper: the elements must land in o0
     19     CHECK(o0 == std::vector<int>{1,2,3});
     20     RunColl(in, PushBackSink(&o1)); // ptr: the elements must land in o1
     21     CHECK(o1 == std::vector<int>{1,2,3});
     22     auto o2 = MakeShared<std::vector<int>>(); // pointer-like handle; MakeShared presumably comes from libshit/shared_ptr.hpp
     23     RunColl(in, PushBackSink(o2));
     24     CHECK(*o2 == std::vector<int>{1,2,3});
     25   }
     26 
     27   TEST_CASE("Map") // Map: applies a callable to every element before it reaches the wrapped sink
     28   {
     29     auto fun = [](int x) { return x+1; }; // transformation under test: increment by one
     30     std::vector<int> in{2,5,-3}, out;
     31     RunColl(in, Map(fun, PushBackSink(std::ref(out))));
     32     CHECK(out == std::vector<int>{3,6,-2}); // same order as the input, each value incremented
     33   }
     34 
     35   TEST_CASE("ReplaceInvalid") // ReplaceInvalid: 32-bit code points in, invalid ones swapped for the replacement character
     36   {
     37     std::u32string out;
     38     auto run = [&](U32StringView in) // helper: resets out, then feeds in through ReplaceInvalid into out
     39     {
     40       out.clear();
     41       RunColl(in, ReplaceInvalid(PushBackSink(std::ref(out))));
     42     };
     43 
     44     // valid cases: the output must equal the input
     45     run(U"abc"_ns); CHECK(out == U"abc"_ns);
     46     run(U"猫"_ns); CHECK(out == U"猫"_ns);
     47     run(U"💩"_ns); CHECK(out == U"💩"_ns);
     48     run(U"\xd799"_ns); CHECK(out == U"\xd799"_ns);
     49     run(U"\xe000"_ns); CHECK(out == U"\xe000"_ns);
     50     run(U"\x10ffff"_ns); CHECK(out == U"\x10ffff"_ns);
     51 
     52     // invalid cases: values in 0xd800-0xdfff and values above 0x10ffff each become one replacement character
     53     run(U"\xd800"_ns); CHECK(out == U"�"_ns);
     54     run(U"\xd812"_ns); CHECK(out == U"�"_ns);
     55     run(U"\xdbff"_ns); CHECK(out == U"�"_ns);
     56     run(U"\xdc00"_ns); CHECK(out == U"�"_ns);
     57     run(U"\xdfff"_ns); CHECK(out == U"�"_ns);
     58     run(U"\x110000"_ns); CHECK(out == U"�"_ns);
     59     run(U"\xffffffff"_ns); CHECK(out == U"�"_ns);
     60     // this is valid utf-16 (a surrogate pair), but not utf-32: each half is replaced on its own
     61     run(U"\xd83d\xdca9"_ns); CHECK(out == U"��"_ns);
     62 
     63     // mixed: valid code points next to an invalid one are passed through untouched
     64     run(U"!\xd800!"_ns); CHECK(out == U"!�!"_ns);
     65     run(U"\x110000?"_ns); CHECK(out == U"�?"_ns);
     66   }
     67 
     68   TEST_CASE("ToWtf8Gen") // ToWtf8 / ToWtf8Overlong: 32-bit code points in, bytes out
     69   {
     70     auto test = [](U32StringView in, StringView exp) // helper: in must produce exp with both ToWtf8 and ToWtf8Overlong
     71     {
     72       std::string out;
     73       RunColl(in, ToWtf8(PushBackSink(std::ref(out))));
     74       CHECK(NonowningString{out} == exp);
     75       out.clear(); // reuse the buffer for the second encoder
     76       RunColl(in, ToWtf8Overlong(PushBackSink(std::ref(out))));
     77       CHECK(NonowningString{out} == exp);
     78     };
     79 
     80     // ascii
     81     test(U"abcd"_ns, "abcd"_ns);
     82     test(U"\1\2\33\x7f"_ns, "\1\2\33\x7f"_ns);
     83     // two-byte utf8
     84     test(U"\x80"_ns, u8"\u0080"_ns);
     85     test(U"\x81"_ns, "\xc2\x81"_ns);
     86     test(U"äߨ"_ns, u8"äߨ"_ns);
     87     test(U"߿"_ns, u8"߿"_ns); // 0x7ff
     88     // three-byte utf8
     89     test(U"ࠀ"_ns, u8"ࠀ"_ns); // 0x800
     90     test(U"猫ニャーニャー"_ns, u8"猫ニャーニャー"_ns);
     91     test(U"\xffff"_ns, "\xef\xbf\xbf"_ns);
     92     // four-byte utf-8
     93     test(U"𐀀"_ns, u8"𐀀"_ns); // 0x010000
     94     test(U"😂💩🤮"_ns, u8"😂💩🤮"_ns);
     95     test(U"\x10ffff"_ns, u8"\U0010ffff"_ns);
     96     // invalid (above 0x10ffff): encoded as the replacement character
     97     test(U"\x110000"_ns, u8"�"_ns);
     98     test(U"\xffffffff"_ns, u8"�"_ns);
     99 
    100     test(U"\xd83e"_ns, "\xed\xa0\xbe"_ns); // wtf-16 half surrogate pair: encoded as three bytes, not replaced
    101     test(U"\xd83e\xdd2e"_ns, "\xed\xa0\xbe""\xed\xb4\xae"_ns); // cesu-8: the two surrogates are encoded separately, three bytes each
    102 
    103     // check overlong 0: the one tested input where the encoders differ (plain 0 byte vs. the two bytes c0 80)
    104     std::string out;
    105     RunColl(U"a\0b"_ns, ToWtf8(PushBackSink(std::ref(out))));
    106     CHECK(out == "a\0b"_ns);
    107     out.clear();
    108     RunColl(U"a\0b"_ns, ToWtf8Overlong(PushBackSink(std::ref(out))));
    109     CHECK(out == "a\xc0\x80""b"_ns);
    110   }
    111 
    112   TEST_CASE("ToWtf16") // ToWtf16: 32-bit code points in, 16-bit units out
    113   {
    114     std::u16string out;
    115     auto run = [&](U32StringView in) // helper: resets out, then feeds in through ToWtf16 into out
    116     {
    117       out.clear();
    118       RunColl(in, ToWtf16(PushBackSink(std::ref(out))));
    119     };
    120 
    121     run(U"abc"_ns); CHECK(out == u"abc"_ns);
    122     run(U"\0\1"_ns); CHECK(out == u"\0\1"_ns);
    123     // cp < 0x10000 are left alone, including a lone surrogate value such as 0xd83e
    124     run(U"\xd83e\xffff"_ns); CHECK(out == u"\xd83e\xffff"_ns);
    125     // cp >= 0x10000 converted to surrogate pairs
    126     run(U"𐀀"_ns); CHECK(out == u"𐀀"_ns);
    127     run(U"😂💩🤮"_ns); CHECK(out == u"😂💩🤮"_ns);
    128     run(U"\x10ffff"_ns); CHECK(out == u"\U0010ffff"_ns);
    129     // invalid (above 0x10ffff): one replacement character per input value
    130     run(U"\x110000\xffffffff"_ns); CHECK(out == u"��"_ns);
    131   }
    132 
    133   TEST_CASE("FromWtf8") // FromWtf8: bytes in, 32-bit code points out
    134   {
    135     auto check = [](StringView in, U32StringView exp) // helper: decoding in must produce exactly exp
    136     {
    137       CAPTURE(doctest::toString(in)); // attach the input to the failure message of the CHECK below
    138       std::u32string out;
    139       RunColl(in, FromWtf8(PushBackSink(std::ref(out))));
    140       CHECK(NonowningU32String{out} == exp);
    141     };
    142 
    143     check(u8""_ns, U""_ns); // empty input gives empty output
    144     check(u8"abc"_ns, U"abc"_ns);
    145     check(u8"猫"_ns, U"猫"_ns);
    146     check(u8"💩x"_ns, U"💩x"_ns);
    147     check(u8"[💩]"_ns, U"[💩]"_ns);
    148     check("\xff"_ns, U"�"_ns); // a lone 0xff byte decodes to the replacement character
    149 
    150     check("\xc3"_ns, U"�"_ns); // from here on: a sequence cut short yields ONE replacement character, however many of its bytes were seen
    151     check("\xc3?"_ns, U"�?"_ns);
    152     check("\xc3\xa1"_ns, U"á"_ns);
    153     check("\xe7"_ns, U"�"_ns);
    154     check("a\xe7"_ns, U"a�"_ns);
    155     check("\xe7\x8c"_ns, U"�"_ns);
    156     check("\xe7""a\x8c"_ns, U"�a�"_ns); // the interrupting byte is kept; the stray 0x8c after it is replaced on its own
    157     check("\xe7\x8c\xab"_ns, U"猫"_ns);
    158     check("\x8c\xab"_ns, U"��"_ns); // without the leading 0xe7, each remaining byte is replaced separately
    159     check("\xf0"_ns, U"�"_ns);
    160     check("\xf0\x9f"_ns, U"�"_ns);
    161     check("\xf0\x9f\x92"_ns, U"�"_ns);
    162     check("\xf0\x9f\x92\xa9"_ns, U"💩"_ns);
    163 
    164     check(u8"Ǐ옾⦆𞔔𑥹뺍圆𑄟𐰯𚁨𛅭𒜶🦝𘉾砓𑷄"_ns,
    165           U"Ǐ옾⦆𞔔𑥹뺍圆𑄟𐰯𚁨𛅭𒜶🦝𘉾砓𑷄"_ns);
    166 
    167     // overlong 0 is supported: the two bytes c0 80 decode to code point 0
    168     check("\xc0\x80"_ns, U"\0"_ns);
    169     // three-byte encoded lone surrogates decode to the surrogate value itself. NOTE(review): this comment used to read "this is utf-8, not wtf-8"; strict UTF-8 rejects such sequences, so presumably the opposite was meant - confirm
    170     check("\xed\xa0\xbd"_ns, U"\xd83d"_ns);
    171     check("\xed\xb2\xa9"_ns, U"\xdca9"_ns);
    172     // CESU-8 half-decode: the two encoded surrogates are decoded individually and NOT merged into one code point
    173     check("\xed\xa0\xbd\xed\xb2\xa9"_ns, U"\xd83d\xdca9"_ns);
    174     // TODO: post unicode (0x110000)
    175     // check("\xf4\x90\x80\x80"_ns, U"�"_ns);
    176   }
    177 
    178   TEST_CASE("FromWtf16") // FromWtf16: 16-bit units in, 32-bit code points out
    179   {
    180     std::u32string out;
    181     auto run = [&](U16StringView in) // helper: resets out, then feeds in through FromWtf16 into out
    182     {
    183       out.clear();
    184       RunColl(in, FromWtf16(PushBackSink(std::ref(out))));
    185     };
    186 
    187     // valid: a well-formed surrogate pair is merged into a single code point
    188     run(u"abcd"_ns); CHECK(out == U"abcd"_ns);
    189     run(u"💩"_ns); CHECK(out == U"💩"_ns);
    190     // missing surrogate pairs: unpaired or wrongly ordered surrogates are passed through unchanged, not replaced
    191     run(u"\xd83d"_ns); CHECK(out == U"\xd83d"_ns);
    192     run(u"\xd83dx"_ns); CHECK(out == U"\xd83dx"_ns);
    193     run(u"\xdca9"_ns); CHECK(out == U"\xdca9"_ns);
    194     run(u"x\xdca9"_ns); CHECK(out == U"x\xdca9"_ns);
    195     run(u"\xdca9\xd83d"_ns); CHECK(out == U"\xdca9\xd83d"_ns);
    196 
    197     // also works with UTF-32 input: a surrogate pair stored in 32-bit units is merged, other values pass through
    198     out.clear();
    199     RunColl(U"\xd83d\xdca9💩ab\xd83d"_ns, FromWtf16(PushBackSink(std::ref(out))));
    200     CHECK(out == U"💩💩ab\xd83d"_ns);
    201   }
    202 
    203   TEST_SUITE_END();
    204 }