capnproto

FORK: Cap'n Proto serialization/RPC system - core tools and C++ library
git clone https://git.neptards.moe/neptards/capnproto.git
Log | Files | Refs | README | LICENSE

encoding.h (19635B)


      1 // Copyright (c) 2017 Cloudflare, Inc. and contributors
      2 // Licensed under the MIT License:
      3 //
      4 // Permission is hereby granted, free of charge, to any person obtaining a copy
      5 // of this software and associated documentation files (the "Software"), to deal
      6 // in the Software without restriction, including without limitation the rights
      7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      8 // copies of the Software, and to permit persons to whom the Software is
      9 // furnished to do so, subject to the following conditions:
     10 //
     11 // The above copyright notice and this permission notice shall be included in
     12 // all copies or substantial portions of the Software.
     13 //
     14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     20 // THE SOFTWARE.
     21 
     22 #pragma once
     23 // Functions for encoding/decoding bytes and text in common formats, including:
     24 // - UTF-{8,16,32}
     25 // - Hex
     26 // - URI encoding
     27 // - Base64
     28 
     29 #include "string.h"
     30 
     31 KJ_BEGIN_HEADER
     32 
     33 namespace kj {
     34 
     35 template <typename ResultType>
     36 struct EncodingResult: ResultType {
     37   // Behaves as a ResultType (a String or wide-char array) in every respect, and additionally
     38   // carries the bool `hadErrors`, which reports whether any errors were found in the input.
     39   // Every encoding/decoding function returning this type "works around" errors somehow, so
     40   // checking the flag is optional for an application. E.g. the Unicode functions substitute
     41   // U+FFFD for errors in the output.
     42   //
     43   // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
     44   // exactly as if it were a Maybe<T> that is null in case of errors.
     45 
     46   EncodingResult(ResultType&& value, bool errorsSeen)
     47       : ResultType(kj::mv(value)), hadErrors(errorsSeen) {}
     48 
     49   const bool hadErrors;
     50 };
     51 
     52 template <typename T>
     53 inline auto KJ_STRINGIFY(const EncodingResult<T>& result)
     54     -> decltype(toCharSequence(implicitCast<const T&>(result))) {
          // Stringifies only the underlying T; the `hadErrors` flag plays no part in the output.
     55   return toCharSequence(implicitCast<const T&>(result));
     56 }
     57 
     58 EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
     59 EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
     60 // Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
     61 //
     62 // If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
     63 //
     64 // The returned arrays are in platform-native endianness (otherwise they wouldn't really be
     65 // char16_t / char32_t).
     66 //
     67 // Note that the KJ Unicode encoding and decoding functions actually implement
     68 // [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
     69 // handled. See comments on decodeUtf16() for more info.
     70 
     71 EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
     72 EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
     73 // Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
     74 //
     75 // The input should NOT include a NUL terminator; any NUL characters in the input array will be
     76 // preserved in the output.
     77 //
     78 // The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
     79 //
     80 // Note that the KJ Unicode encoding and decoding functions actually implement
     81 // [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
     82 // of char16_t and you pass it through any number of conversions to other Unicode encodings,
     83 // eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
     84 // exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
     85 // is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
     86 // and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
     87 // file names on Windows NT are encoded using 16-bit characters, without enforcing that the
     88 // character sequence is valid UTF-16. It is important that programs on Windows be able to handle
     89 // such filenames, even if they choose to convert the name to UTF-8 for internal processing.
     90 //
     91 // Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
     92 // UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
     93 // result), but will NOT be replaced with the Unicode replacement character as other erroneous
     94 // sequences would be, but rather encoded as an invalid surrogate codepoint in the target encoding.
     95 //
     96 // KJ makes the following guarantees about invalid input:
     97 // - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
     98 //   with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
     99 // - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
    100 //   the original input, or will have replaced some invalid sequences with the Unicode replacement
    101 //   character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
    102 //   and no code units will ever be added except to encode U+FFFD. If the original input was not
    103 //   valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
    104 //   raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
    105 //   all, is a valid code point).
    106 
    107 EncodingResult<Array<wchar_t>> encodeWideString(
    108     ArrayPtr<const char> text, bool nulTerminate = false);
    109 EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
    110 // Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
    111 // different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
    112 // but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
    113 // encoding UTF-8 (e.g. BeOS did this).
    114 //
    115 // KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
    116 // the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above
    117 // (or simply make a copy if wchar_t is 8 bits).
    118 
    119 String encodeHex(ArrayPtr<const byte> bytes);
    120 EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
    121 // Encode/decode bytes as hex strings.
    122 
    123 String encodeUriComponent(ArrayPtr<const byte> bytes);
    124 String encodeUriComponent(ArrayPtr<const char> bytes);
    125 EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
    126 // Encode/decode URI components using % escapes for all characters other than those listed as
    127 // "unreserved" in RFC 2396. This is the same behavior as JavaScript's `encodeURIComponent()`.
    128 //
    129 // See https://tools.ietf.org/html/rfc2396#section-2.3
    130 
    131 String encodeUriFragment(ArrayPtr<const byte> bytes);
    132 String encodeUriFragment(ArrayPtr<const char> bytes);
    133 // Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL
    134 // specification. Use decodeUriComponent() to decode.
    135 //
    136 // Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
    137 //   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
    138 //
    139 // See https://url.spec.whatwg.org/#fragment-percent-encode-set
    140 
    141 String encodeUriPath(ArrayPtr<const byte> bytes);
    142 String encodeUriPath(ArrayPtr<const char> bytes);
    143 // Encode URL path components (not entire paths!) using the path percent encode set defined by the
    144 // WHATWG URL specification. Use decodeUriComponent() to decode.
    145 //
    146 // Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
    147 //   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
    148 //
    149 // Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set
    150 //   defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this
    151 //   function on individual path components, and never entire paths, augmenting the character set to
    152 //   include these separators allows this function to be used to implement a URL class that stores
    153 //   its path components in percent-decoded form.
    154 //
    155 // See https://url.spec.whatwg.org/#path-percent-encode-set
    156 
    157 String encodeUriUserInfo(ArrayPtr<const byte> bytes);
    158 String encodeUriUserInfo(ArrayPtr<const char> bytes);
    159 // Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL
    160 // specification. Use decodeUriComponent() to decode.
    161 //
    162 // Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
    163 //   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
    164 //
    165 // See https://url.spec.whatwg.org/#userinfo-percent-encode-set
    166 
    167 String encodeWwwForm(ArrayPtr<const byte> bytes);
    168 String encodeWwwForm(ArrayPtr<const char> bytes);
    169 EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text);
    170 // Encode/decode URI components using % escapes and '+' (for spaces) according to the
    171 // application/x-www-form-urlencoded format defined by the WHATWG URL specification.
    172 //
    173 // Note: Like the fragment, path, and userinfo percent-encoding functions above, this function is
    174 //   not idempotent: we percent-encode '%' signs. However, in this particular case the spec happens
    175 //   to agree with us!
    176 //
    177 // See https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer
    178 
    179 struct DecodeUriOptions {
    180   // Options accepted by `decodeBinaryUriComponent()`.
    181 
    182   // Implicit conversion from bool is deliberate: it keeps older call sites compiling, from
    183   // back when the second parameter of `decodeBinaryUriComponent()` was a plain boolean.
    184   // For that reason this constructor must not be made `explicit`.
    185   DecodeUriOptions(bool appendNul = false, bool plusAsSpace = false)
    186       : nulTerminate(appendNul), plusToSpace(plusAsSpace) {}
    187 
    188   bool nulTerminate;
    189   // When true, a terminal NUL byte is appended.
    190 
    191   bool plusToSpace;
    192   // When true, '+' is converted to ' ' before percent decoding. Meant for decoding
    193   // application/x-www-form-urlencoded text, such as query strings.
    194 };
    195 EncodingResult<Array<byte>> decodeBinaryUriComponent(
    196     ArrayPtr<const char> text, DecodeUriOptions options = DecodeUriOptions());
    197 // Decode URI components using % escapes. This is a lower-level interface used to implement both
    198 // `decodeUriComponent()` and `decodeWwwForm()`
    199 
    200 String encodeCEscape(ArrayPtr<const byte> bytes);
    201 String encodeCEscape(ArrayPtr<const char> bytes);
    202 EncodingResult<Array<byte>> decodeBinaryCEscape(
    203     ArrayPtr<const char> text, bool nulTerminate = false);
    204 EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
        // Encode/decode bytes or text using C-style escapes. (NOTE(review): the exact set of escapes
        // is determined by the out-of-line implementation, which is not visible in this header --
        // confirm there.)
        //
        // Both encodeCEscape() overloads forward to _::encodeCEscapeImpl(): the byte overload passes
        // isBinary = true and the char overload passes isBinary = false. decodeCEscape() calls
        // decodeBinaryCEscape(text, true) and repackages the decoded bytes as a String, carrying
        // `hadErrors` over unchanged.
    205 
    206 String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
    207 // Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
    208 // into the output every 72 characters (e.g. for encoding e-mail bodies).
    209 
    210 EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
    211 // Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
    212 // https://html.spec.whatwg.org/multipage/webappapis.html#atob for details.
    213 
    214 String encodeBase64Url(ArrayPtr<const byte> bytes);
    215 // Encode the given bytes as URL-safe base64 text. (RFC 4648, section 5)
    216 
    217 // =======================================================================================
    218 // inline implementation details
    219 
    220 namespace _ {  // private
    221 
    222 template <typename T>
    223 NullableValue<T> readMaybe(EncodingResult<T>&& result) {
          // Rvalue overload: an error-free result has its payload moved out; a result flagged with
          // hadErrors reads as null.
    224   if (!result.hadErrors) {
    225     return kj::mv(result);
    226   } else {
    227     return nullptr;
    228   }
    229 }
    230 
    231 template <typename T>
    232 T* readMaybe(EncodingResult<T>& result) {
          // Mutable-lvalue overload: yields a pointer to the result itself (viewed as its T base), or
          // null when the result is flagged with hadErrors.
    233   T* payload = &result;
    234   return result.hadErrors ? nullptr : payload;
    235 }
    239 
    240 template <typename T>
    241 const T* readMaybe(const EncodingResult<T>& result) {
          // Const-lvalue overload: yields a const pointer to the result itself (viewed as its T base),
          // or null when the result is flagged with hadErrors.
    242   const T* payload = &result;
    243   return result.hadErrors ? nullptr : payload;
    244 }
    248 
    249 String encodeCEscapeImpl(ArrayPtr<const byte> bytes, bool isBinary);
        // Shared backend of both encodeCEscape() overloads, which pass isBinary = true for byte input
        // and isBinary = false for char input. Only declared here; the definition is not part of
        // this header.
    250 
    251 }  // namespace _ (private)
    252 
    253 inline String encodeUriComponent(ArrayPtr<const char> chars) {
          // Text overload: views the chars as bytes and defers to the byte overload.
    254   return encodeUriComponent(chars.asBytes());
    255 }
    256 inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
          // Decodes to bytes with a NUL terminator appended, then rewraps the buffer as a String.
          // `hadErrors` is a const member, so releasing the buffer cannot disturb it.
    257   auto decoded = decodeBinaryUriComponent(text, DecodeUriOptions(/*nulTerminate=*/true));
    258   return EncodingResult<String>(String(decoded.releaseAsChars()), decoded.hadErrors);
    259 }
    260 
    261 inline String encodeUriFragment(ArrayPtr<const char> chars) {
          // Text overload: views the chars as bytes and defers to the byte overload.
    262   return encodeUriFragment(chars.asBytes());
    263 }
    264 inline String encodeUriPath(ArrayPtr<const char> chars) {
          // Text overload: views the chars as bytes and defers to the byte overload.
    265   return encodeUriPath(chars.asBytes());
    266 }
    267 inline String encodeUriUserInfo(ArrayPtr<const char> chars) {
          // Text overload: views the chars as bytes and defers to the byte overload.
    268   return encodeUriUserInfo(chars.asBytes());
    269 }
    270 
    271 inline String encodeWwwForm(ArrayPtr<const char> chars) {
          // Text overload: views the chars as bytes and defers to the byte overload.
    272   return encodeWwwForm(chars.asBytes());
    273 }
    274 inline EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text) {
          // Same as decodeUriComponent(), except that '+' is additionally converted to ' '
          // (plusToSpace). The decoded bytes get a NUL terminator and are rewrapped as a String.
    275   auto decoded = decodeBinaryUriComponent(
    276       text, DecodeUriOptions(/*nulTerminate=*/true, /*plusToSpace=*/true));
    277   return EncodingResult<String>(String(decoded.releaseAsChars()), decoded.hadErrors);
    278 }
    279 
    280 inline String encodeCEscape(ArrayPtr<const char> chars) {
          // Text overload: runs the shared implementation over the chars' bytes, isBinary = false.
    281   return _::encodeCEscapeImpl(chars.asBytes(), /*isBinary=*/false);
    282 }
    283 
    284 inline String encodeCEscape(ArrayPtr<const byte> rawBytes) {
          // Byte overload: runs the shared implementation with isBinary = true.
    285   return _::encodeCEscapeImpl(rawBytes, /*isBinary=*/true);
    286 }
    287 
    288 inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
          // Decodes to bytes with a NUL terminator appended, then rewraps the buffer as a String.
          // `hadErrors` is a const member, so releasing the buffer cannot disturb it.
    289   auto decoded = decodeBinaryCEscape(text, /*nulTerminate=*/true);
    290   return EncodingResult<String>(String(decoded.releaseAsChars()), decoded.hadErrors);
    291 }
    292 
    293 // If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
    294 // terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
    295 // only even matters for encoding-test.c++.
    296 
    297 template <size_t N>
    298 inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&literal)[N], bool nulTerminate=false) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    299   return encodeUtf16(arrayPtr(literal, N - 1), nulTerminate);
    300 }
    301 template <size_t N>
    302 inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&literal)[N], bool nulTerminate=false) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    303   return encodeUtf32(arrayPtr(literal, N - 1), nulTerminate);
    304 }
    305 template <size_t N>
    306 inline EncodingResult<Array<wchar_t>> encodeWideString(
    307     const char (&literal)[N], bool nulTerminate=false) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    308   return encodeWideString(arrayPtr(literal, N - 1), nulTerminate);
    309 }
    310 template <size_t N>
    311 inline EncodingResult<String> decodeUtf16(const char16_t (&literal)[N]) {
          // Literal overload: forwards the first N - 1 code units, leaving out the trailing NUL.
    312   return decodeUtf16(arrayPtr(literal, N - 1));
    313 }
    314 template <size_t N>
    315 inline EncodingResult<String> decodeUtf32(const char32_t (&literal)[N]) {
          // Literal overload: forwards the first N - 1 code units, leaving out the trailing NUL.
    316   return decodeUtf32(arrayPtr(literal, N - 1));
    317 }
    318 template <size_t N>
    319 inline EncodingResult<String> decodeWideString(const wchar_t (&wide)[N]) {
          // Literal overload: forwards the first N - 1 wide chars, leaving out the trailing NUL.
    320   return decodeWideString(arrayPtr(wide, N - 1));
    321 }
    322 template <size_t N>
    323 inline EncodingResult<Array<byte>> decodeHex(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    324   return decodeHex(arrayPtr(literal, N - 1));
    325 }
    326 template <size_t N>
    327 inline String encodeUriComponent(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    328   return encodeUriComponent(arrayPtr(literal, N - 1));
    329 }
    330 template <size_t s>
    331 inline EncodingResult<Array<byte>> decodeBinaryUriComponent(const char (&text)[s]) {
          // String-literal overload: decodes the first s - 1 chars (leaving out the literal's NUL
          // terminator) with default DecodeUriOptions. Returns the full EncodingResult rather than a
          // bare Array<byte>, so the `hadErrors` flag reported by the ArrayPtr overload is not
          // sliced away; the result still converts to Array<byte> for existing callers.
    332   return decodeBinaryUriComponent(arrayPtr(text, s - 1));
    333 }
    334 template <size_t N>
    335 inline EncodingResult<String> decodeUriComponent(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    336   return decodeUriComponent(arrayPtr(literal, N - 1));
    337 }
    338 template <size_t N>
    339 inline String encodeUriFragment(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    340   return encodeUriFragment(arrayPtr(literal, N - 1));
    341 }
    342 template <size_t N>
    343 inline String encodeUriPath(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    344   return encodeUriPath(arrayPtr(literal, N - 1));
    345 }
    346 template <size_t N>
    347 inline String encodeUriUserInfo(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    348   return encodeUriUserInfo(arrayPtr(literal, N - 1));
    349 }
    350 template <size_t N>
    351 inline String encodeWwwForm(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    352   return encodeWwwForm(arrayPtr(literal, N - 1));
    353 }
    354 template <size_t N>
    355 inline EncodingResult<String> decodeWwwForm(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    356   return decodeWwwForm(arrayPtr(literal, N - 1));
    357 }
    358 template <size_t N>
    359 inline String encodeCEscape(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    360   return encodeCEscape(arrayPtr(literal, N - 1));
    361 }
    362 template <size_t N>
    363 inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
          // `nulTerminate` is left at the forwarded overload's default.
    364   return decodeBinaryCEscape(arrayPtr(literal, N - 1));
    365 }
    366 template <size_t N>
    367 inline EncodingResult<String> decodeCEscape(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    368   return decodeCEscape(arrayPtr(literal, N - 1));
    369 }
    370 template <size_t N>
    371 EncodingResult<Array<byte>> decodeBase64(const char (&literal)[N]) {
          // String-literal overload: forwards the first N - 1 chars, leaving out the trailing NUL.
    372   return decodeBase64(arrayPtr(literal, N - 1));
    373 }
    374 
    375 #if __cplusplus >= 202000L
    376 template <size_t N>
    377 inline EncodingResult<Array<char16_t>> encodeUtf16(const char8_t (&literal)[N], bool nulTerminate=false) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    378   return encodeUtf16(arrayPtr(reinterpret_cast<const char*>(literal), N - 1), nulTerminate);
    379 }
    380 template <size_t N>
    381 inline EncodingResult<Array<char32_t>> encodeUtf32(const char8_t (&literal)[N], bool nulTerminate=false) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    382   return encodeUtf32(arrayPtr(reinterpret_cast<const char*>(literal), N - 1), nulTerminate);
    383 }
    384 template <size_t N>
    385 inline EncodingResult<Array<wchar_t>> encodeWideString(
    386     const char8_t (&literal)[N], bool nulTerminate=false) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    387   return encodeWideString(arrayPtr(reinterpret_cast<const char*>(literal), N - 1), nulTerminate);
    388 }
    389 template <size_t N>
    390 inline EncodingResult<Array<byte>> decodeHex(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    391   return decodeHex(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    392 }
    393 template <size_t N>
    394 inline String encodeUriComponent(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    395   return encodeUriComponent(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    396 }
    397 template <size_t s>
    398 inline EncodingResult<Array<byte>> decodeBinaryUriComponent(const char8_t (&text)[s]) {
          // u8-literal overload: reads the char8_t units as chars and decodes the first s - 1 of them
          // (leaving out the literal's NUL terminator) with default DecodeUriOptions. Returns the
          // full EncodingResult rather than a bare Array<byte>, so the `hadErrors` flag reported by
          // the ArrayPtr overload is not sliced away; the result still converts to Array<byte>.
    399   return decodeBinaryUriComponent(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
    400 }
    401 template <size_t N>
    402 inline EncodingResult<String> decodeUriComponent(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    403   return decodeUriComponent(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    404 }
    405 template <size_t N>
    406 inline String encodeUriFragment(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    407   return encodeUriFragment(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    408 }
    409 template <size_t N>
    410 inline String encodeUriPath(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    411   return encodeUriPath(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    412 }
    413 template <size_t N>
    414 inline String encodeUriUserInfo(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    415   return encodeUriUserInfo(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    416 }
    417 template <size_t N>
    418 inline String encodeWwwForm(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    419   return encodeWwwForm(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    420 }
    421 template <size_t N>
    422 inline EncodingResult<String> decodeWwwForm(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    423   return decodeWwwForm(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    424 }
    425 template <size_t N>
    426 inline String encodeCEscape(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    427   return encodeCEscape(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    428 }
    429 template <size_t N>
    430 inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
          // `nulTerminate` is left at the forwarded overload's default.
    431   return decodeBinaryCEscape(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    432 }
    433 template <size_t N>
    434 inline EncodingResult<String> decodeCEscape(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    435   return decodeCEscape(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    436 }
    437 template <size_t N>
    438 EncodingResult<Array<byte>> decodeBase64(const char8_t (&literal)[N]) {
          // u8-literal overload: reads the char8_t units as chars and forwards the first N - 1.
    439   return decodeBase64(arrayPtr(reinterpret_cast<const char*>(literal), N - 1));
    440 }
    441 #endif
    442 
    443 } // namespace kj
    444 
    445 KJ_END_HEADER