encoding.c++ - capnproto - FORK: Cap'n Proto serialization/RPC system

encoding.c++ (31878B)
      1 // Copyright (c) 2017 Cloudflare, Inc.; Sandstorm Development Group, Inc.; and contributors
      2 // Licensed under the MIT License:
      3 //
      4 // Permission is hereby granted, free of charge, to any person obtaining a copy
      5 // of this software and associated documentation files (the "Software"), to deal
      6 // in the Software without restriction, including without limitation the rights
      7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      8 // copies of the Software, and to permit persons to whom the Software is
      9 // furnished to do so, subject to the following conditions:
     10 //
     11 // The above copyright notice and this permission notice shall be included in
     12 // all copies or substantial portions of the Software.
     13 //
     14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     20 // THE SOFTWARE.
     21 
     22 #include "encoding.h"
     23 #include "vector.h"
     24 #include "debug.h"
     25 
     26 namespace kj {
     27 
     28 namespace {
     29 
     30 #define GOTO_ERROR_IF(cond) if (KJ_UNLIKELY(cond)) goto error
     31 
     32 inline void addChar32(Vector<char16_t>& vec, char32_t u) {
     33   // Encode as surrogate pair.
     34   u -= 0x10000;
     35   vec.add(0xd800 | (u >> 10));
     36   vec.add(0xdc00 | (u & 0x03ff));
     37 }
     38 
     39 inline void addChar32(Vector<char32_t>& vec, char32_t u) {
     40   vec.add(u);
     41 }
     42 
     43 template <typename T>
     44 EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
     45   Vector<T> result(text.size() + nulTerminate);
     46   bool hadErrors = false;
     47 
     48   size_t i = 0;
     49   while (i < text.size()) {
     50     byte c = text[i++];
     51     if (c < 0x80) {
     52       // 0xxxxxxx -- ASCII
     53       result.add(c);
     54       continue;
     55     } else if (KJ_UNLIKELY(c < 0xc0)) {
     56       // 10xxxxxx -- malformed continuation byte
     57       goto error;
     58     } else if (c < 0xe0) {
     59       // 110xxxxx -- 2-byte
     60       byte c2;
     61       GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
     62       char16_t u = (static_cast<char16_t>(c  & 0x1f) <<  6)
     63                  | (static_cast<char16_t>(c2 & 0x3f)      );
     64 
     65       // Disallow overlong sequence.
     66       GOTO_ERROR_IF(u < 0x80);
     67 
     68       result.add(u);
     69       continue;
     70     } else if (c < 0xf0) {
     71       // 1110xxxx -- 3-byte
     72       byte c2, c3;
     73       GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
     74       GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i;
     75       char16_t u = (static_cast<char16_t>(c  & 0x0f) << 12)
     76                  | (static_cast<char16_t>(c2 & 0x3f) <<  6)
     77                  | (static_cast<char16_t>(c3 & 0x3f)      );
     78 
     79       // Disallow overlong sequence.
     80       GOTO_ERROR_IF(u < 0x0800);
     81 
     82       // Flag surrogate pair code points as errors, but allow them through.
     83       if (KJ_UNLIKELY((u & 0xf800) == 0xd800)) {
     84         if (result.size() > 0 &&
     85             (u & 0xfc00) == 0xdc00 &&
     86             (result.back() & 0xfc00) == 0xd800) {
     87           // Whoops, the *previous* character was also an invalid surrogate, and if we add this
     88           // one too, they'll form a valid surrogate pair. If we allowed this, then it would mean
     89           // invalid UTF-8 round-tripped to UTF-16 and back could actually change meaning entirely.
     90           // OTOH, the reason we allow dangling surrogates is to allow invalid UTF-16 to round-trip
     91           // to UTF-8 without loss, but if the original UTF-16 had a valid surrogate pair, it would
     92           // have been encoded as a valid single UTF-8 codepoint, not as separate UTF-8 codepoints
     93           // for each surrogate.
     94           goto error;
     95         }
     96 
     97         hadErrors = true;
     98       }
     99 
    100       result.add(u);
    101       continue;
    102     } else if (c < 0xf8) {
    103       // 11110xxx -- 4-byte
    104       byte c2, c3, c4;
    105       GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
    106       GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i;
    107       GOTO_ERROR_IF(i == text.size() || ((c4 = text[i]) & 0xc0) != 0x80); ++i;
    108       char32_t u = (static_cast<char32_t>(c  & 0x07) << 18)
    109                  | (static_cast<char32_t>(c2 & 0x3f) << 12)
    110                  | (static_cast<char32_t>(c3 & 0x3f) <<  6)
    111                  | (static_cast<char32_t>(c4 & 0x3f)      );
    112 
    113       // Disallow overlong sequence.
    114       GOTO_ERROR_IF(u < 0x10000);
    115 
    116       // Unicode ends at U+10FFFF
    117       GOTO_ERROR_IF(u >= 0x110000);
    118 
    119       addChar32(result, u);
    120       continue;
    121     } else {
    122       // 5-byte and 6-byte sequences are not legal as they'd result in codepoints outside the
    123       // range of Unicode.
    124       goto error;
    125     }
    126 
    127   error:
    128     result.add(0xfffd);
    129     hadErrors = true;
    130     // Ignore all continuation bytes.
    131     while (i < text.size() && (text[i] & 0xc0) == 0x80) {
    132       ++i;
    133     }
    134   }
    135 
    136   if (nulTerminate) result.add(0);
    137 
    138   return { result.releaseAsArray(), hadErrors };
    139 }
    140 
    141 }  // namespace
    142 
    143 EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
    144   return encodeUtf<char16_t>(text, nulTerminate);
    145 }
    146 
    147 EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
    148   return encodeUtf<char32_t>(text, nulTerminate);
    149 }
    150 
    151 EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
    152   Vector<char> result(utf16.size() + 1);
    153   bool hadErrors = false;
    154 
    155   size_t i = 0;
    156   while (i < utf16.size()) {
    157     char16_t u = utf16[i++];
    158 
    159     if (u < 0x80) {
    160       result.add(u);
    161       continue;
    162     } else if (u < 0x0800) {
    163       result.addAll<std::initializer_list<char>>({
    164         static_cast<char>(((u >>  6)       ) | 0xc0),
    165         static_cast<char>(((u      ) & 0x3f) | 0x80)
    166       });
    167       continue;
    168     } else if ((u & 0xf800) == 0xd800) {
    169       // surrogate pair
    170       char16_t u2;
    171       if (KJ_UNLIKELY(i == utf16.size()                         // missing second half
    172                    || (u & 0x0400) != 0                         // first half in wrong range
    173                    || ((u2 = utf16[i]) & 0xfc00) != 0xdc00)) {  // second half in wrong range
    174         hadErrors = true;
    175         goto threeByte;
    176       }
    177       ++i;
    178 
    179       char32_t u32 = (((u & 0x03ff) << 10) | (u2 & 0x03ff)) + 0x10000;
    180       result.addAll<std::initializer_list<char>>({
    181         static_cast<char>(((u32 >> 18)       ) | 0xf0),
    182         static_cast<char>(((u32 >> 12) & 0x3f) | 0x80),
    183         static_cast<char>(((u32 >>  6) & 0x3f) | 0x80),
    184         static_cast<char>(((u32      ) & 0x3f) | 0x80)
    185       });
    186       continue;
    187     } else {
    188     threeByte:
    189       result.addAll<std::initializer_list<char>>({
    190         static_cast<char>(((u >> 12)       ) | 0xe0),
    191         static_cast<char>(((u >>  6) & 0x3f) | 0x80),
    192         static_cast<char>(((u      ) & 0x3f) | 0x80)
    193       });
    194       continue;
    195     }
    196   }
    197 
    198   result.add(0);
    199   return { String(result.releaseAsArray()), hadErrors };
    200 }
    201 
    202 EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
    203   Vector<char> result(utf16.size() + 1);
    204   bool hadErrors = false;
    205 
    206   size_t i = 0;
    207   while (i < utf16.size()) {
    208     char32_t u = utf16[i++];
    209 
    210     if (u < 0x80) {
    211       result.add(u);
    212       continue;
    213     } else if (u < 0x0800) {
    214       result.addAll<std::initializer_list<char>>({
    215         static_cast<char>(((u >>  6)       ) | 0xc0),
    216         static_cast<char>(((u      ) & 0x3f) | 0x80)
    217       });
    218       continue;
    219     } else if (u < 0x10000) {
    220       if (KJ_UNLIKELY((u & 0xfffff800) == 0xd800)) {
    221         // no surrogates allowed in utf-32
    222         hadErrors = true;
    223       }
    224       result.addAll<std::initializer_list<char>>({
    225         static_cast<char>(((u >> 12)       ) | 0xe0),
    226         static_cast<char>(((u >>  6) & 0x3f) | 0x80),
    227         static_cast<char>(((u      ) & 0x3f) | 0x80)
    228       });
    229       continue;
    230     } else {
    231       GOTO_ERROR_IF(u >= 0x110000);  // outside Unicode range
    232       result.addAll<std::initializer_list<char>>({
    233         static_cast<char>(((u >> 18)       ) | 0xf0),
    234         static_cast<char>(((u >> 12) & 0x3f) | 0x80),
    235         static_cast<char>(((u >>  6) & 0x3f) | 0x80),
    236         static_cast<char>(((u      ) & 0x3f) | 0x80)
    237       });
    238       continue;
    239     }
    240 
    241   error:
    242     result.addAll(StringPtr(u8"\ufffd"));
    243     hadErrors = true;
    244   }
    245 
    246   result.add(0);
    247   return { String(result.releaseAsArray()), hadErrors };
    248 }
    249 
    250 namespace {
    251 
    252 #if __GNUC__ >= 8 && !__clang__
    253 // GCC 8's new class-memaccess warning rightly dislikes the following hacks, but we're really sure
    254 // we want to allow them so disable the warning.
    255 #pragma GCC diagnostic push
    256 #pragma GCC diagnostic ignored "-Wclass-memaccess"
    257 #endif
    258 
    259 template <typename To, typename From>
    260 Array<To> coerceTo(Array<From>&& array) {
    261   static_assert(sizeof(To) == sizeof(From), "incompatible coercion");
    262   Array<wchar_t> result;
    263   memcpy(&result, &array, sizeof(array));
    264   memset(&array, 0, sizeof(array));
    265   return result;
    266 }
    267 
    268 template <typename To, typename From>
    269 ArrayPtr<To> coerceTo(ArrayPtr<From> array) {
    270   static_assert(sizeof(To) == sizeof(From), "incompatible coercion");
    271   return arrayPtr(reinterpret_cast<To*>(array.begin()), array.size());
    272 }
    273 
    274 template <typename To, typename From>
    275 EncodingResult<Array<To>> coerceTo(EncodingResult<Array<From>>&& result) {
    276   return { coerceTo<To>(Array<From>(kj::mv(result))), result.hadErrors };
    277 }
    278 
    279 #if __GNUC__ >= 8 && !__clang__
    280 #pragma GCC diagnostic pop
    281 #endif
    282 
    283 template <size_t s>
    284 struct WideConverter;
    285 
    286 template <>
    287 struct WideConverter<sizeof(char)> {
    288   typedef char Type;
    289 
    290   static EncodingResult<Array<char>> encode(ArrayPtr<const char> text, bool nulTerminate) {
    291     auto result = heapArray<char>(text.size() + nulTerminate);
    292     memcpy(result.begin(), text.begin(), text.size());
    293     if (nulTerminate) result.back() = 0;
    294     return { kj::mv(result), false };
    295   }
    296 
    297   static EncodingResult<kj::String> decode(ArrayPtr<const char> text) {
    298     return { kj::heapString(text), false };
    299   }
    300 };
    301 
    302 template <>
    303 struct WideConverter<sizeof(char16_t)> {
    304   typedef char16_t Type;
    305 
    306   static inline EncodingResult<Array<char16_t>> encode(
    307       ArrayPtr<const char> text, bool nulTerminate) {
    308     return encodeUtf16(text, nulTerminate);
    309   }
    310 
    311   static inline EncodingResult<kj::String> decode(ArrayPtr<const char16_t> text) {
    312     return decodeUtf16(text);
    313   }
    314 };
    315 
    316 template <>
    317 struct WideConverter<sizeof(char32_t)> {
    318   typedef char32_t Type;
    319 
    320   static inline EncodingResult<Array<char32_t>> encode(
    321       ArrayPtr<const char> text, bool nulTerminate) {
    322     return encodeUtf32(text, nulTerminate);
    323   }
    324 
    325   static inline EncodingResult<kj::String> decode(ArrayPtr<const char32_t> text) {
    326     return decodeUtf32(text);
    327   }
    328 };
    329 
    330 }  // namespace
    331 
    332 EncodingResult<Array<wchar_t>> encodeWideString(ArrayPtr<const char> text, bool nulTerminate) {
    333   return coerceTo<wchar_t>(WideConverter<sizeof(wchar_t)>::encode(text, nulTerminate));
    334 }
    335 EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide) {
    336   using Converter = WideConverter<sizeof(wchar_t)>;
    337   return Converter::decode(coerceTo<const Converter::Type>(wide));
    338 }
    339 
    340 // =======================================================================================
    341 
    342 namespace {
    343 
    344 const char HEX_DIGITS[] = "0123456789abcdef";
    345 // Maps integer in the range [0,16) to a hex digit.
    346 
    347 const char HEX_DIGITS_URI[] = "0123456789ABCDEF";
    348 // RFC 3986 section 2.1 says "For consistency, URI producers and normalizers should use uppercase
    349 // hexadecimal digits for all percent-encodings.
    350 
    351 static Maybe<uint> tryFromHexDigit(char c) {
    352   if ('0' <= c && c <= '9') {
    353     return c - '0';
    354   } else if ('a' <= c && c <= 'f') {
    355     return c - ('a' - 10);
    356   } else if ('A' <= c && c <= 'F') {
    357     return c - ('A' - 10);
    358   } else {
    359     return nullptr;
    360   }
    361 }
    362 
    363 static Maybe<uint> tryFromOctDigit(char c) {
    364   if ('0' <= c && c <= '7') {
    365     return c - '0';
    366   } else {
    367     return nullptr;
    368   }
    369 }
    370 
    371 }  // namespace
    372 
    373 String encodeHex(ArrayPtr<const byte> input) {
    374   return strArray(KJ_MAP(b, input) {
    375     return heapArray<char>({HEX_DIGITS[b/16], HEX_DIGITS[b%16]});
    376   }, "");
    377 }
    378 
    379 EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text) {
    380   auto result = heapArray<byte>(text.size() / 2);
    381   bool hadErrors = text.size() % 2;
    382 
    383   for (auto i: kj::indices(result)) {
    384     byte b = 0;
    385     KJ_IF_MAYBE(d1, tryFromHexDigit(text[i*2])) {
    386       b = *d1 << 4;
    387     } else {
    388       hadErrors = true;
    389     }
    390     KJ_IF_MAYBE(d2, tryFromHexDigit(text[i*2+1])) {
    391       b |= *d2;
    392     } else {
    393       hadErrors = true;
    394     }
    395     result[i] = b;
    396   }
    397 
    398   return { kj::mv(result), hadErrors };
    399 }
    400 
    401 String encodeUriComponent(ArrayPtr<const byte> bytes) {
    402   Vector<char> result(bytes.size() + 1);
    403   for (byte b: bytes) {
    404     if (('A' <= b && b <= 'Z') ||
    405         ('a' <= b && b <= 'z') ||
    406         ('0' <= b && b <= '9') ||
    407         b == '-' || b == '_' || b == '.' || b == '!' || b == '~' || b == '*' || b == '\'' ||
    408         b == '(' || b == ')') {
    409       result.add(b);
    410     } else {
    411       result.add('%');
    412       result.add(HEX_DIGITS_URI[b/16]);
    413       result.add(HEX_DIGITS_URI[b%16]);
    414     }
    415   }
    416   result.add('\0');
    417   return String(result.releaseAsArray());
    418 }
    419 
    420 String encodeUriFragment(ArrayPtr<const byte> bytes) {
    421   Vector<char> result(bytes.size() + 1);
    422   for (byte b: bytes) {
    423     if (('?' <= b && b <= '_') || // covers A-Z
    424         ('a' <= b && b <= '~') || // covers a-z
    425         ('&' <= b && b <= ';') || // covers 0-9
    426         b == '!' || b == '=' || b == '#' || b == '$') {
    427       result.add(b);
    428     } else {
    429       result.add('%');
    430       result.add(HEX_DIGITS_URI[b/16]);
    431       result.add(HEX_DIGITS_URI[b%16]);
    432     }
    433   }
    434   result.add('\0');
    435   return String(result.releaseAsArray());
    436 }
    437 
    438 String encodeUriPath(ArrayPtr<const byte> bytes) {
    439   Vector<char> result(bytes.size() + 1);
    440   for (byte b: bytes) {
    441     if (('@' <= b && b <= '[') || // covers A-Z
    442         ('a' <= b && b <= 'z') ||
    443         ('0' <= b && b <= ';') || // covers 0-9
    444         ('&' <= b && b <= '.') ||
    445         b == '_' || b == '!' || b == '=' || b == ']' ||
    446         b == '^' || b == '|' || b == '~' || b == '$') {
    447       result.add(b);
    448     } else {
    449       result.add('%');
    450       result.add(HEX_DIGITS_URI[b/16]);
    451       result.add(HEX_DIGITS_URI[b%16]);
    452     }
    453   }
    454   result.add('\0');
    455   return String(result.releaseAsArray());
    456 }
    457 
    458 String encodeUriUserInfo(ArrayPtr<const byte> bytes) {
    459   Vector<char> result(bytes.size() + 1);
    460   for (byte b: bytes) {
    461     if (('A' <= b && b <= 'Z') ||
    462         ('a' <= b && b <= 'z') ||
    463         ('0' <= b && b <= '9') ||
    464         ('&' <= b && b <= '.') ||
    465         b == '_' || b == '!' || b == '~' || b == '$') {
    466       result.add(b);
    467     } else {
    468       result.add('%');
    469       result.add(HEX_DIGITS_URI[b/16]);
    470       result.add(HEX_DIGITS_URI[b%16]);
    471     }
    472   }
    473   result.add('\0');
    474   return String(result.releaseAsArray());
    475 }
    476 
    477 String encodeWwwForm(ArrayPtr<const byte> bytes) {
    478   Vector<char> result(bytes.size() + 1);
    479   for (byte b: bytes) {
    480     if (('A' <= b && b <= 'Z') ||
    481         ('a' <= b && b <= 'z') ||
    482         ('0' <= b && b <= '9') ||
    483         b == '-' || b == '_' || b == '.' || b == '*') {
    484       result.add(b);
    485     } else if (b == ' ') {
    486       result.add('+');
    487     } else {
    488       result.add('%');
    489       result.add(HEX_DIGITS_URI[b/16]);
    490       result.add(HEX_DIGITS_URI[b%16]);
    491     }
    492   }
    493   result.add('\0');
    494   return String(result.releaseAsArray());
    495 }
    496 
    497 EncodingResult<Array<byte>> decodeBinaryUriComponent(
    498     ArrayPtr<const char> text, DecodeUriOptions options) {
    499   Vector<byte> result(text.size() + options.nulTerminate);
    500   bool hadErrors = false;
    501 
    502   const char* ptr = text.begin();
    503   const char* end = text.end();
    504   while (ptr < end) {
    505     if (*ptr == '%') {
    506       ++ptr;
    507 
    508       if (ptr == end) {
    509         hadErrors = true;
    510       } else KJ_IF_MAYBE(d1, tryFromHexDigit(*ptr)) {
    511         byte b = *d1;
    512         ++ptr;
    513         if (ptr == end) {
    514           hadErrors = true;
    515         } else KJ_IF_MAYBE(d2, tryFromHexDigit(*ptr)) {
    516           b = (b << 4) | *d2;
    517           ++ptr;
    518         } else {
    519           hadErrors = true;
    520         }
    521         result.add(b);
    522       } else {
    523         hadErrors = true;
    524       }
    525     } else if (options.plusToSpace && *ptr == '+') {
    526       ++ptr;
    527       result.add(' ');
    528     } else {
    529       result.add(*ptr++);
    530     }
    531   }
    532 
    533   if (options.nulTerminate) result.add(0);
    534   return { result.releaseAsArray(), hadErrors };
    535 }
    536 
    537 // =======================================================================================
    538 
    539 namespace _ { // private
    540 
    541 String encodeCEscapeImpl(ArrayPtr<const byte> bytes, bool isBinary) {
    542   Vector<char> escaped(bytes.size());
    543 
    544   for (byte b: bytes) {
    545     switch (b) {
    546       case '\a': escaped.addAll(StringPtr("\\a")); break;
    547       case '\b': escaped.addAll(StringPtr("\\b")); break;
    548       case '\f': escaped.addAll(StringPtr("\\f")); break;
    549       case '\n': escaped.addAll(StringPtr("\\n")); break;
    550       case '\r': escaped.addAll(StringPtr("\\r")); break;
    551       case '\t': escaped.addAll(StringPtr("\\t")); break;
    552       case '\v': escaped.addAll(StringPtr("\\v")); break;
    553       case '\'': escaped.addAll(StringPtr("\\\'")); break;
    554       case '\"': escaped.addAll(StringPtr("\\\"")); break;
    555       case '\\': escaped.addAll(StringPtr("\\\\")); break;
    556       default:
    557         if (b < 0x20 || b == 0x7f || (isBinary && b > 0x7f)) {
    558           // Use octal escape, not hex, because hex escapes technically have no length limit and
    559           // so can create ambiguity with subsequent characters.
    560           escaped.add('\\');
    561           escaped.add(HEX_DIGITS[b / 64]);
    562           escaped.add(HEX_DIGITS[(b / 8) % 8]);
    563           escaped.add(HEX_DIGITS[b % 8]);
    564         } else {
    565           escaped.add(b);
    566         }
    567         break;
    568     }
    569   }
    570 
    571   escaped.add(0);
    572   return String(escaped.releaseAsArray());
    573 }
    574 
    575 } // namespace
    576 
    577 EncodingResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate) {
    578   Vector<byte> result(text.size() + nulTerminate);
    579   bool hadErrors = false;
    580 
    581   size_t i = 0;
    582   while (i < text.size()) {
    583     char c = text[i++];
    584     if (c == '\\') {
    585       if (i == text.size()) {
    586         hadErrors = true;
    587         continue;
    588       }
    589       char c2 = text[i++];
    590       switch (c2) {
    591         case 'a' : result.add('\a'); break;
    592         case 'b' : result.add('\b'); break;
    593         case 'f' : result.add('\f'); break;
    594         case 'n' : result.add('\n'); break;
    595         case 'r' : result.add('\r'); break;
    596         case 't' : result.add('\t'); break;
    597         case 'v' : result.add('\v'); break;
    598         case '\'': result.add('\''); break;
    599         case '\"': result.add('\"'); break;
    600         case '\\': result.add('\\'); break;
    601 
    602         case '0':
    603         case '1':
    604         case '2':
    605         case '3':
    606         case '4':
    607         case '5':
    608         case '6':
    609         case '7': {
    610           uint value = c2 - '0';
    611           for (uint j = 0; j < 2 && i < text.size(); j++) {
    612             KJ_IF_MAYBE(d, tryFromOctDigit(text[i])) {
    613               ++i;
    614               value = (value << 3) | *d;
    615             } else {
    616               break;
    617             }
    618           }
    619           if (value >= 0x100) hadErrors = true;
    620           result.add(value);
    621           break;
    622         }
    623 
    624         case 'x': {
    625           uint value = 0;
    626           while (i < text.size()) {
    627             KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) {
    628               ++i;
    629               value = (value << 4) | *d;
    630             } else {
    631               break;
    632             }
    633           }
    634           if (value >= 0x100) hadErrors = true;
    635           result.add(value);
    636           break;
    637         }
    638 
    639         case 'u': {
    640           char16_t value = 0;
    641           for (uint j = 0; j < 4; j++) {
    642             if (i == text.size()) {
    643               hadErrors = true;
    644               break;
    645             } else KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) {
    646               ++i;
    647               value = (value << 4) | *d;
    648             } else {
    649               hadErrors = true;
    650               break;
    651             }
    652           }
    653           auto utf = decodeUtf16(arrayPtr(&value, 1));
    654           if (utf.hadErrors) hadErrors = true;
    655           result.addAll(utf.asBytes());
    656           break;
    657         }
    658 
    659         case 'U': {
    660           char32_t value = 0;
    661           for (uint j = 0; j < 8; j++) {
    662             if (i == text.size()) {
    663               hadErrors = true;
    664               break;
    665             } else KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) {
    666               ++i;
    667               value = (value << 4) | *d;
    668             } else {
    669               hadErrors = true;
    670               break;
    671             }
    672           }
    673           auto utf = decodeUtf32(arrayPtr(&value, 1));
    674           if (utf.hadErrors) hadErrors = true;
    675           result.addAll(utf.asBytes());
    676           break;
    677         }
    678 
    679         default:
    680           result.add(c2);
    681       }
    682     } else {
    683       result.add(c);
    684     }
    685   }
    686 
    687   if (nulTerminate) result.add(0);
    688   return { result.releaseAsArray(), hadErrors };
    689 }
    690 
    691 // =======================================================================================
    692 // This code is derived from libb64 which has been placed in the public domain.
    693 // For details, see http://sourceforge.net/projects/libb64
    694 
    695 // -------------------------------------------------------------------
    696 // Encoder
    697 
    698 namespace {
    699 
    700 typedef enum {
    701   step_A, step_B, step_C
    702 } base64_encodestep;
    703 
    704 typedef struct {
    705   base64_encodestep step;
    706   char result;
    707   int stepcount;
    708 } base64_encodestate;
    709 
    710 const int CHARS_PER_LINE = 72;
    711 
    712 void base64_init_encodestate(base64_encodestate* state_in) {
    713   state_in->step = step_A;
    714   state_in->result = 0;
    715   state_in->stepcount = 0;
    716 }
    717 
    718 char base64_encode_value(char value_in) {
    719   static const char* encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    720   if (value_in > 63) return '=';
    721   return encoding[(int)value_in];
    722 }
    723 
    724 int base64_encode_block(const char* plaintext_in, int length_in,
    725                         char* code_out, base64_encodestate* state_in, bool breakLines) {
    726   const char* plainchar = plaintext_in;
    727   const char* const plaintextend = plaintext_in + length_in;
    728   char* codechar = code_out;
    729   char result;
    730   char fragment;
    731 
    732   result = state_in->result;
    733 
    734   switch (state_in->step) {
    735     while (1) {
    736       KJ_FALLTHROUGH;
    737   case step_A:
    738       if (plainchar == plaintextend) {
    739         state_in->result = result;
    740         state_in->step = step_A;
    741         return codechar - code_out;
    742       }
    743       fragment = *plainchar++;
    744       result = (fragment & 0x0fc) >> 2;
    745       *codechar++ = base64_encode_value(result);
    746       result = (fragment & 0x003) << 4;
    747       KJ_FALLTHROUGH;
    748   case step_B:
    749       if (plainchar == plaintextend) {
    750         state_in->result = result;
    751         state_in->step = step_B;
    752         return codechar - code_out;
    753       }
    754       fragment = *plainchar++;
    755       result |= (fragment & 0x0f0) >> 4;
    756       *codechar++ = base64_encode_value(result);
    757       result = (fragment & 0x00f) << 2;
    758       KJ_FALLTHROUGH;
    759   case step_C:
    760       if (plainchar == plaintextend) {
    761         state_in->result = result;
    762         state_in->step = step_C;
    763         return codechar - code_out;
    764       }
    765       fragment = *plainchar++;
    766       result |= (fragment & 0x0c0) >> 6;
    767       *codechar++ = base64_encode_value(result);
    768       result  = (fragment & 0x03f) >> 0;
    769       *codechar++ = base64_encode_value(result);
    770 
    771       ++(state_in->stepcount);
    772       if (breakLines && state_in->stepcount == CHARS_PER_LINE/4) {
    773         *codechar++ = '\n';
    774         state_in->stepcount = 0;
    775       }
    776     }
    777   }
    778   /* control should not reach here */
    779   return codechar - code_out;
    780 }
    781 
    782 int base64_encode_blockend(char* code_out, base64_encodestate* state_in, bool breakLines) {
    783   char* codechar = code_out;
    784 
    785   switch (state_in->step) {
    786   case step_B:
    787     *codechar++ = base64_encode_value(state_in->result);
    788     *codechar++ = '=';
    789     *codechar++ = '=';
    790     ++state_in->stepcount;
    791     break;
    792   case step_C:
    793     *codechar++ = base64_encode_value(state_in->result);
    794     *codechar++ = '=';
    795     ++state_in->stepcount;
    796     break;
    797   case step_A:
    798     break;
    799   }
    800   if (breakLines && state_in->stepcount > 0) {
    801     *codechar++ = '\n';
    802   }
    803 
    804   return codechar - code_out;
    805 }
    806 
    807 }  // namespace
    808 
    809 String encodeBase64(ArrayPtr<const byte> input, bool breakLines) {
    810   /* set up a destination buffer large enough to hold the encoded data */
    811   // equivalent to ceil(input.size() / 3) * 4
    812   auto numChars = (input.size() + 2) / 3 * 4;
    813   if (breakLines) {
    814     // Add space for newline characters.
    815     uint lineCount = numChars / CHARS_PER_LINE;
    816     if (numChars % CHARS_PER_LINE > 0) {
    817       // Partial line.
    818       ++lineCount;
    819     }
    820     numChars = numChars + lineCount;
    821   }
    822   auto output = heapString(numChars);
    823   /* keep track of our encoded position */
    824   char* c = output.begin();
    825   /* store the number of bytes encoded by a single call */
    826   int cnt = 0;
    827   size_t total = 0;
    828   /* we need an encoder state */
    829   base64_encodestate s;
    830 
    831   /*---------- START ENCODING ----------*/
    832   /* initialise the encoder state */
    833   base64_init_encodestate(&s);
    834   /* gather data from the input and send it to the output */
    835   cnt = base64_encode_block((const char *)input.begin(), input.size(), c, &s, breakLines);
    836   c += cnt;
    837   total += cnt;
    838 
    839   /* since we have encoded the entire input string, we know that
    840      there is no more input data; finalise the encoding */
    841   cnt = base64_encode_blockend(c, &s, breakLines);
    842   c += cnt;
    843   total += cnt;
    844   /*---------- STOP ENCODING  ----------*/
    845 
    846   KJ_ASSERT(total == output.size(), total, output.size());
    847 
    848   return output;
    849 }
    850 
    851 // -------------------------------------------------------------------
    852 // Decoder
    853 
    854 namespace {
    855 
    856 typedef enum {
    857   step_a, step_b, step_c, step_d
    858 } base64_decodestep;
    859 
    860 struct base64_decodestate {
    861   bool hadErrors = false;
    862   size_t nPaddingBytesSeen = 0;
    863   // Output state. `nPaddingBytesSeen` is not guaranteed to be correct if `hadErrors` is true. It is
    864   // included in the state purely to preserve the streaming capability of the algorithm while still
    865   // checking for errors correctly (consider chunk 1 = "abc=", chunk 2 = "d").
    866 
    867   base64_decodestep step = step_a;
    868   char plainchar = 0;
    869 };
    870 
    871 int base64_decode_value(char value_in) {
    872   // Returns either the fragment value or: -1 on whitespace, -2 on padding, -3 on invalid input.
    873   //
    874   // Note that the original libb64 implementation used -1 for invalid input, -2 on padding -- this
    875   // new scheme allows for some simpler error checks in steps A and B.
    876 
    877   static const signed char decoding[] = {
    878     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-1,-1,-3,-1,-1,-3,-3,
    879     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    880     -1,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,62,-3,-3,-3,63,
    881     52,53,54,55,56,57,58,59,  60,61,-3,-3,-3,-2,-3,-3,
    882     -3, 0, 1, 2, 3, 4, 5, 6,   7, 8, 9,10,11,12,13,14,
    883     15,16,17,18,19,20,21,22,  23,24,25,-3,-3,-3,-3,-3,
    884     -3,26,27,28,29,30,31,32,  33,34,35,36,37,38,39,40,
    885     41,42,43,44,45,46,47,48,  49,50,51,-3,-3,-3,-3,-3,
    886 
    887     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    888     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    889     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    890     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    891     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    892     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    893     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    894     -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    895   };
    896   static_assert(sizeof(decoding) == 256, "base64 decoding table size error");
    897   return decoding[(unsigned char)value_in];
    898 }
    899 
    900 int base64_decode_block(const char* code_in, const int length_in,
    901                         char* plaintext_out, base64_decodestate* state_in) {
    902   const char* codechar = code_in;
    903   char* plainchar = plaintext_out;
    904   signed char fragment;
    905 
    906   if (state_in->step != step_a) {
    907     *plainchar = state_in->plainchar;
    908   }
    909 
    910 #define ERROR_IF(predicate) state_in->hadErrors = state_in->hadErrors || (predicate)
    911 
    912   switch (state_in->step)
    913   {
    914     while (1)
    915     {
    916       KJ_FALLTHROUGH;
    917   case step_a:
    918       do {
    919         if (codechar == code_in+length_in) {
    920           state_in->step = step_a;
    921           state_in->plainchar = '\0';
    922           return plainchar - plaintext_out;
    923         }
    924         fragment = (signed char)base64_decode_value(*codechar++);
    925         // It is an error to see invalid or padding bytes in step A.
    926         ERROR_IF(fragment < -1);
    927       } while (fragment < 0);
    928       *plainchar    = (fragment & 0x03f) << 2;
    929       KJ_FALLTHROUGH;
    930   case step_b:
    931       do {
    932         if (codechar == code_in+length_in) {
    933           state_in->step = step_b;
    934           state_in->plainchar = *plainchar;
    935           // It is always an error to suspend from step B, because we don't have enough bits yet.
    936           // TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is
    937           //   to be called multiple times. We'll fix it if we ever care to support streaming.
    938           state_in->hadErrors = true;
    939           return plainchar - plaintext_out;
    940         }
    941         fragment = (signed char)base64_decode_value(*codechar++);
    942         // It is an error to see invalid or padding bytes in step B.
    943         ERROR_IF(fragment < -1);
    944       } while (fragment < 0);
    945       *plainchar++ |= (fragment & 0x030) >> 4;
    946       *plainchar    = (fragment & 0x00f) << 4;
    947       KJ_FALLTHROUGH;
    948   case step_c:
    949       do {
    950         if (codechar == code_in+length_in) {
    951           state_in->step = step_c;
    952           state_in->plainchar = *plainchar;
    953           // It is an error to complete from step C if we have seen incomplete padding.
    954           // TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is
    955           //   to be called multiple times. We'll fix it if we ever care to support streaming.
    956           ERROR_IF(state_in->nPaddingBytesSeen == 1);
    957           return plainchar - plaintext_out;
    958         }
    959         fragment = (signed char)base64_decode_value(*codechar++);
    960         // It is an error to see invalid bytes or more than two padding bytes in step C.
    961         ERROR_IF(fragment < -2 || (fragment == -2 && ++state_in->nPaddingBytesSeen > 2));
    962       } while (fragment < 0);
    963       // It is an error to continue from step C after having seen any padding.
    964       ERROR_IF(state_in->nPaddingBytesSeen > 0);
    965       *plainchar++ |= (fragment & 0x03c) >> 2;
    966       *plainchar    = (fragment & 0x003) << 6;
    967       KJ_FALLTHROUGH;
    968   case step_d:
    969       do {
    970         if (codechar == code_in+length_in) {
    971           state_in->step = step_d;
    972           state_in->plainchar = *plainchar;
    973           return plainchar - plaintext_out;
    974         }
    975         fragment = (signed char)base64_decode_value(*codechar++);
    976         // It is an error to see invalid bytes or more than one padding byte in step D.
    977         ERROR_IF(fragment < -2 || (fragment == -2 && ++state_in->nPaddingBytesSeen > 1));
    978       } while (fragment < 0);
    979       // It is an error to continue from step D after having seen padding bytes.
    980       ERROR_IF(state_in->nPaddingBytesSeen > 0);
    981       *plainchar++   |= (fragment & 0x03f);
    982     }
    983   }
    984 
    985 #undef ERROR_IF
    986 
    987   /* control should not reach here */
    988   return plainchar - plaintext_out;
    989 }
    990 
    991 }  // namespace
    992 
    993 EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> input) {
    994   base64_decodestate state;
    995 
    996   auto output = heapArray<byte>((input.size() * 6 + 7) / 8);
    997 
    998   size_t n = base64_decode_block(input.begin(), input.size(),
    999       reinterpret_cast<char*>(output.begin()), &state);
   1000 
   1001   if (n < output.size()) {
   1002     auto copy = heapArray<byte>(n);
   1003     memcpy(copy.begin(), output.begin(), n);
   1004     output = kj::mv(copy);
   1005   }
   1006 
   1007   return EncodingResult<Array<byte>>(kj::mv(output), state.hadErrors);
   1008 }
   1009 
   1010 String encodeBase64Url(ArrayPtr<const byte> bytes) {
   1011   // TODO(perf): Rewrite as single pass?
   1012   // TODO(someday): Write decoder?
   1013 
   1014   auto base64 = kj::encodeBase64(bytes);
   1015 
   1016   for (char& c: base64) {
   1017     if (c == '+') c = '-';
   1018     if (c == '/') c = '_';
   1019   }
   1020 
   1021   // Remove trailing '='s.
   1022   kj::ArrayPtr<const char> slice = base64;
   1023   while (slice.size() > 0 && slice.back() == '=') {
   1024     slice = slice.slice(0, slice.size() - 1);
   1025   }
   1026 
   1027   return kj::str(slice);
   1028 }
   1029 
   1030 } // namespace kj
	capnproto FORK: Cap'n Proto serialization/RPC system - core tools and C++ library
	git clone https://git.neptards.moe/neptards/capnproto.git
	Log \| Files \| Refs \| README \| LICENSE