encoding.h (19635B)
1 // Copyright (c) 2017 Cloudflare, Inc. and contributors 2 // Licensed under the MIT License: 3 // 4 // Permission is hereby granted, free of charge, to any person obtaining a copy 5 // of this software and associated documentation files (the "Software"), to deal 6 // in the Software without restriction, including without limitation the rights 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 // copies of the Software, and to permit persons to whom the Software is 9 // furnished to do so, subject to the following conditions: 10 // 11 // The above copyright notice and this permission notice shall be included in 12 // all copies or substantial portions of the Software. 13 // 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 // THE SOFTWARE. 21 22 #pragma once 23 // Functions for encoding/decoding bytes and text in common formats, including: 24 // - UTF-{8,16,32} 25 // - Hex 26 // - URI encoding 27 // - Base64 28 29 #include "string.h" 30 31 KJ_BEGIN_HEADER 32 33 namespace kj { 34 35 template <typename ResultType> 36 struct EncodingResult: public ResultType { 37 // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except 38 // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input. 39 // Each encoding/decoding function that returns this type will "work around" errors in some way, 40 // so an application doesn't strictly have to check for errors. E.g. the Unicode functions 41 // replace errors with U+FFFD in the output. 
42 // 43 // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T> 44 // exactly if it were a Maybe<T> that is null in case of errors. 45 46 inline EncodingResult(ResultType&& result, bool hadErrors) 47 : ResultType(kj::mv(result)), hadErrors(hadErrors) {} 48 49 const bool hadErrors; 50 }; 51 52 template <typename T> 53 inline auto KJ_STRINGIFY(const EncodingResult<T>& value) 54 -> decltype(toCharSequence(implicitCast<const T&>(value))) { 55 return toCharSequence(implicitCast<const T&>(value)); 56 } 57 58 EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false); 59 EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false); 60 // Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32. 61 // 62 // If `nulTerminate` is true, an extra NUL character will be added to the end of the output. 63 // 64 // The returned arrays are in platform-native endianness (otherwise they wouldn't really be 65 // char16_t / char32_t). 66 // 67 // Note that the KJ Unicode encoding and decoding functions actually implement 68 // [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is 69 // handled. See comments on decodeUtf16() for more info. 70 71 EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16); 72 EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32); 73 // Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use). 74 // 75 // The input should NOT include a NUL terminator; any NUL characters in the input array will be 76 // preserved in the output. 77 // 78 // The input must be in platform-native endianness. BOMs are NOT recognized by these functions. 79 // 80 // Note that the KJ Unicode encoding and decoding functions actually implement 81 // [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). 
// This means that if you start with an array
// of char16_t and you pass it through any number of conversions to other Unicode encodings,
// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
//
// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
// result), but will NOT be replaced with the Unicode replacement character as other erroneous
// sequences would be; instead they are encoded as an invalid surrogate codepoint in the target
// encoding.
//
// KJ makes the following guarantees about invalid input:
// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
//   with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
//   the original input, or will have replaced some invalid sequences with the Unicode replacement
//   character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
//   and no code units will ever be added except to encode U+FFFD.
If the original input was not 103 // valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be 104 // raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after 105 // all, is a valid code point). 106 107 EncodingResult<Array<wchar_t>> encodeWideString( 108 ArrayPtr<const char> text, bool nulTerminate = false); 109 EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide); 110 // Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have 111 // different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16, 112 // but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit, 113 // encoding UTF-8 (e.g. BeOS did this). 114 // 115 // KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on 116 // the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above 117 // (or simply make a copy if wchar_t is 8 bits). 118 119 String encodeHex(ArrayPtr<const byte> bytes); 120 EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text); 121 // Encode/decode bytes as hex strings. 122 123 String encodeUriComponent(ArrayPtr<const byte> bytes); 124 String encodeUriComponent(ArrayPtr<const char> bytes); 125 EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text); 126 // Encode/decode URI components using % escapes for characters listed as "reserved" in RFC 2396. 127 // This is the same behavior as JavaScript's `encodeURIComponent()`. 128 // 129 // See https://tools.ietf.org/html/rfc2396#section-2.3 130 131 String encodeUriFragment(ArrayPtr<const byte> bytes); 132 String encodeUriFragment(ArrayPtr<const char> bytes); 133 // Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL 134 // specification. Use decodeUriComponent() to decode. 
135 // 136 // Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent- 137 // decoded data. In other words, this function is not idempotent, in contrast to the URL spec. 138 // 139 // See https://url.spec.whatwg.org/#fragment-percent-encode-set 140 141 String encodeUriPath(ArrayPtr<const byte> bytes); 142 String encodeUriPath(ArrayPtr<const char> bytes); 143 // Encode URL path components (not entire paths!) using the path percent encode set defined by the 144 // WHATWG URL specification. Use decodeUriComponent() to decode. 145 // 146 // Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent- 147 // decoded data. In other words, this function is not idempotent, in contrast to the URL spec. 148 // 149 // Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set 150 // defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this 151 // function on individual path components, and never entire paths, augmenting the character set to 152 // include these separators allows this function to be used to implement a URL class that stores 153 // its path components in percent-decoded form. 154 // 155 // See https://url.spec.whatwg.org/#path-percent-encode-set 156 157 String encodeUriUserInfo(ArrayPtr<const byte> bytes); 158 String encodeUriUserInfo(ArrayPtr<const char> bytes); 159 // Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL 160 // specification. Use decodeUriComponent() to decode. 161 // 162 // Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent- 163 // decoded data. In other words, this function is not idempotent, in contrast to the URL spec. 
164 // 165 // See https://url.spec.whatwg.org/#userinfo-percent-encode-set 166 167 String encodeWwwForm(ArrayPtr<const byte> bytes); 168 String encodeWwwForm(ArrayPtr<const char> bytes); 169 EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text); 170 // Encode/decode URI components using % escapes and '+' (for spaces) according to the 171 // application/x-www-form-urlencoded format defined by the WHATWG URL specification. 172 // 173 // Note: Like the fragment, path, and userinfo percent-encoding functions above, this function is 174 // not idempotent: we percent-encode '%' signs. However, in this particular case the spec happens 175 // to agree with us! 176 // 177 // See https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer 178 179 struct DecodeUriOptions { 180 // Parameter to `decodeBinaryUriComponent()`. 181 182 // This struct is intentionally convertible from bool, in order to maintain backwards 183 // compatibility with code written when `decodeBinaryUriComponent()` took a boolean second 184 // parameter. 185 DecodeUriOptions(bool nulTerminate = false, bool plusToSpace = false) 186 : nulTerminate(nulTerminate), plusToSpace(plusToSpace) {} 187 188 bool nulTerminate; 189 // Append a terminal NUL byte. 190 191 bool plusToSpace; 192 // Convert '+' to ' ' characters before percent decoding. Used to decode 193 // application/x-www-form-urlencoded text, such as query strings. 194 }; 195 EncodingResult<Array<byte>> decodeBinaryUriComponent( 196 ArrayPtr<const char> text, DecodeUriOptions options = DecodeUriOptions()); 197 // Decode URI components using % escapes. 
This is a lower-level interface used to implement both 198 // `decodeUriComponent()` and `decodeWwwForm()` 199 200 String encodeCEscape(ArrayPtr<const byte> bytes); 201 String encodeCEscape(ArrayPtr<const char> bytes); 202 EncodingResult<Array<byte>> decodeBinaryCEscape( 203 ArrayPtr<const char> text, bool nulTerminate = false); 204 EncodingResult<String> decodeCEscape(ArrayPtr<const char> text); 205 206 String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false); 207 // Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted 208 // into the output every 72 characters (e.g. for encoding e-mail bodies). 209 210 EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text); 211 // Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see 212 // https://html.spec.whatwg.org/multipage/webappapis.html#atob for details. 213 214 String encodeBase64Url(ArrayPtr<const byte> bytes); 215 // Encode the given bytes as URL-safe base64 text. 
(RFC 4648, section 5) 216 217 // ======================================================================================= 218 // inline implementation details 219 220 namespace _ { // private 221 222 template <typename T> 223 NullableValue<T> readMaybe(EncodingResult<T>&& value) { 224 if (value.hadErrors) { 225 return nullptr; 226 } else { 227 return kj::mv(value); 228 } 229 } 230 231 template <typename T> 232 T* readMaybe(EncodingResult<T>& value) { 233 if (value.hadErrors) { 234 return nullptr; 235 } else { 236 return &value; 237 } 238 } 239 240 template <typename T> 241 const T* readMaybe(const EncodingResult<T>& value) { 242 if (value.hadErrors) { 243 return nullptr; 244 } else { 245 return &value; 246 } 247 } 248 249 String encodeCEscapeImpl(ArrayPtr<const byte> bytes, bool isBinary); 250 251 } // namespace _ (private) 252 253 inline String encodeUriComponent(ArrayPtr<const char> text) { 254 return encodeUriComponent(text.asBytes()); 255 } 256 inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) { 257 auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true }); 258 return { String(result.releaseAsChars()), result.hadErrors }; 259 } 260 261 inline String encodeUriFragment(ArrayPtr<const char> text) { 262 return encodeUriFragment(text.asBytes()); 263 } 264 inline String encodeUriPath(ArrayPtr<const char> text) { 265 return encodeUriPath(text.asBytes()); 266 } 267 inline String encodeUriUserInfo(ArrayPtr<const char> text) { 268 return encodeUriUserInfo(text.asBytes()); 269 } 270 271 inline String encodeWwwForm(ArrayPtr<const char> text) { 272 return encodeWwwForm(text.asBytes()); 273 } 274 inline EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text) { 275 auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true, 276 /*.plusToSpace=*/true }); 277 return { String(result.releaseAsChars()), result.hadErrors }; 278 } 279 280 inline String encodeCEscape(ArrayPtr<const char> 
text) { 281 return _::encodeCEscapeImpl(text.asBytes(), false); 282 } 283 284 inline String encodeCEscape(ArrayPtr<const byte> bytes) { 285 return _::encodeCEscapeImpl(bytes, true); 286 } 287 288 inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) { 289 auto result = decodeBinaryCEscape(text, true); 290 return { String(result.releaseAsChars()), result.hadErrors }; 291 } 292 293 // If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL 294 // termintator, which is surprising. Let's add overloads that avoid that. In practice this probably 295 // only even matters for encoding-test.c++. 296 297 template <size_t s> 298 inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) { 299 return encodeUtf16(arrayPtr(text, s - 1), nulTerminate); 300 } 301 template <size_t s> 302 inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) { 303 return encodeUtf32(arrayPtr(text, s - 1), nulTerminate); 304 } 305 template <size_t s> 306 inline EncodingResult<Array<wchar_t>> encodeWideString( 307 const char (&text)[s], bool nulTerminate=false) { 308 return encodeWideString(arrayPtr(text, s - 1), nulTerminate); 309 } 310 template <size_t s> 311 inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) { 312 return decodeUtf16(arrayPtr(utf16, s - 1)); 313 } 314 template <size_t s> 315 inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) { 316 return decodeUtf32(arrayPtr(utf32, s - 1)); 317 } 318 template <size_t s> 319 inline EncodingResult<String> decodeWideString(const wchar_t (&utf32)[s]) { 320 return decodeWideString(arrayPtr(utf32, s - 1)); 321 } 322 template <size_t s> 323 inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) { 324 return decodeHex(arrayPtr(text, s - 1)); 325 } 326 template <size_t s> 327 inline String encodeUriComponent(const char (&text)[s]) { 328 return 
encodeUriComponent(arrayPtr(text, s - 1)); 329 } 330 template <size_t s> 331 inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) { 332 return decodeBinaryUriComponent(arrayPtr(text, s - 1)); 333 } 334 template <size_t s> 335 inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) { 336 return decodeUriComponent(arrayPtr(text, s-1)); 337 } 338 template <size_t s> 339 inline String encodeUriFragment(const char (&text)[s]) { 340 return encodeUriFragment(arrayPtr(text, s - 1)); 341 } 342 template <size_t s> 343 inline String encodeUriPath(const char (&text)[s]) { 344 return encodeUriPath(arrayPtr(text, s - 1)); 345 } 346 template <size_t s> 347 inline String encodeUriUserInfo(const char (&text)[s]) { 348 return encodeUriUserInfo(arrayPtr(text, s - 1)); 349 } 350 template <size_t s> 351 inline String encodeWwwForm(const char (&text)[s]) { 352 return encodeWwwForm(arrayPtr(text, s - 1)); 353 } 354 template <size_t s> 355 inline EncodingResult<String> decodeWwwForm(const char (&text)[s]) { 356 return decodeWwwForm(arrayPtr(text, s-1)); 357 } 358 template <size_t s> 359 inline String encodeCEscape(const char (&text)[s]) { 360 return encodeCEscape(arrayPtr(text, s - 1)); 361 } 362 template <size_t s> 363 inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) { 364 return decodeBinaryCEscape(arrayPtr(text, s - 1)); 365 } 366 template <size_t s> 367 inline EncodingResult<String> decodeCEscape(const char (&text)[s]) { 368 return decodeCEscape(arrayPtr(text, s-1)); 369 } 370 template <size_t s> 371 EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) { 372 return decodeBase64(arrayPtr(text, s - 1)); 373 } 374 375 #if __cplusplus >= 202000L 376 template <size_t s> 377 inline EncodingResult<Array<char16_t>> encodeUtf16(const char8_t (&text)[s], bool nulTerminate=false) { 378 return encodeUtf16(arrayPtr(reinterpret_cast<const char*>(text), s - 1), nulTerminate); 379 } 380 template <size_t s> 381 inline 
EncodingResult<Array<char32_t>> encodeUtf32(const char8_t (&text)[s], bool nulTerminate=false) { 382 return encodeUtf32(arrayPtr(reinterpret_cast<const char*>(text), s - 1), nulTerminate); 383 } 384 template <size_t s> 385 inline EncodingResult<Array<wchar_t>> encodeWideString( 386 const char8_t (&text)[s], bool nulTerminate=false) { 387 return encodeWideString(arrayPtr(reinterpret_cast<const char*>(text), s - 1), nulTerminate); 388 } 389 template <size_t s> 390 inline EncodingResult<Array<byte>> decodeHex(const char8_t (&text)[s]) { 391 return decodeHex(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 392 } 393 template <size_t s> 394 inline String encodeUriComponent(const char8_t (&text)[s]) { 395 return encodeUriComponent(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 396 } 397 template <size_t s> 398 inline Array<byte> decodeBinaryUriComponent(const char8_t (&text)[s]) { 399 return decodeBinaryUriComponent(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 400 } 401 template <size_t s> 402 inline EncodingResult<String> decodeUriComponent(const char8_t (&text)[s]) { 403 return decodeUriComponent(arrayPtr(reinterpret_cast<const char*>(text), s-1)); 404 } 405 template <size_t s> 406 inline String encodeUriFragment(const char8_t (&text)[s]) { 407 return encodeUriFragment(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 408 } 409 template <size_t s> 410 inline String encodeUriPath(const char8_t (&text)[s]) { 411 return encodeUriPath(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 412 } 413 template <size_t s> 414 inline String encodeUriUserInfo(const char8_t (&text)[s]) { 415 return encodeUriUserInfo(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 416 } 417 template <size_t s> 418 inline String encodeWwwForm(const char8_t (&text)[s]) { 419 return encodeWwwForm(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 420 } 421 template <size_t s> 422 inline EncodingResult<String> decodeWwwForm(const char8_t (&text)[s]) { 423 return 
decodeWwwForm(arrayPtr(reinterpret_cast<const char*>(text), s-1)); 424 } 425 template <size_t s> 426 inline String encodeCEscape(const char8_t (&text)[s]) { 427 return encodeCEscape(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 428 } 429 template <size_t s> 430 inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char8_t (&text)[s]) { 431 return decodeBinaryCEscape(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 432 } 433 template <size_t s> 434 inline EncodingResult<String> decodeCEscape(const char8_t (&text)[s]) { 435 return decodeCEscape(arrayPtr(reinterpret_cast<const char*>(text), s-1)); 436 } 437 template <size_t s> 438 EncodingResult<Array<byte>> decodeBase64(const char8_t (&text)[s]) { 439 return decodeBase64(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); 440 } 441 #endif 442 443 } // namespace kj 444 445 KJ_END_HEADER