encoding.c++ (31878B)
1 // Copyright (c) 2017 Cloudflare, Inc.; Sandstorm Development Group, Inc.; and contributors 2 // Licensed under the MIT License: 3 // 4 // Permission is hereby granted, free of charge, to any person obtaining a copy 5 // of this software and associated documentation files (the "Software"), to deal 6 // in the Software without restriction, including without limitation the rights 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 // copies of the Software, and to permit persons to whom the Software is 9 // furnished to do so, subject to the following conditions: 10 // 11 // The above copyright notice and this permission notice shall be included in 12 // all copies or substantial portions of the Software. 13 // 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 // THE SOFTWARE. 21 22 #include "encoding.h" 23 #include "vector.h" 24 #include "debug.h" 25 26 namespace kj { 27 28 namespace { 29 30 #define GOTO_ERROR_IF(cond) if (KJ_UNLIKELY(cond)) goto error 31 32 inline void addChar32(Vector<char16_t>& vec, char32_t u) { 33 // Encode as surrogate pair. 34 u -= 0x10000; 35 vec.add(0xd800 | (u >> 10)); 36 vec.add(0xdc00 | (u & 0x03ff)); 37 } 38 39 inline void addChar32(Vector<char32_t>& vec, char32_t u) { 40 vec.add(u); 41 } 42 43 template <typename T> 44 EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) { 45 Vector<T> result(text.size() + nulTerminate); 46 bool hadErrors = false; 47 48 size_t i = 0; 49 while (i < text.size()) { 50 byte c = text[i++]; 51 if (c < 0x80) { 52 // 0xxxxxxx -- ASCII 53 result.add(c); 54 continue; 55 } else if (KJ_UNLIKELY(c < 0xc0)) { 56 // 10xxxxxx -- malformed continuation byte 57 goto error; 58 } else if (c < 0xe0) { 59 // 110xxxxx -- 2-byte 60 byte c2; 61 GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i; 62 char16_t u = (static_cast<char16_t>(c & 0x1f) << 6) 63 | (static_cast<char16_t>(c2 & 0x3f) ); 64 65 // Disallow overlong sequence. 66 GOTO_ERROR_IF(u < 0x80); 67 68 result.add(u); 69 continue; 70 } else if (c < 0xf0) { 71 // 1110xxxx -- 3-byte 72 byte c2, c3; 73 GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i; 74 GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i; 75 char16_t u = (static_cast<char16_t>(c & 0x0f) << 12) 76 | (static_cast<char16_t>(c2 & 0x3f) << 6) 77 | (static_cast<char16_t>(c3 & 0x3f) ); 78 79 // Disallow overlong sequence. 80 GOTO_ERROR_IF(u < 0x0800); 81 82 // Flag surrogate pair code points as errors, but allow them through. 83 if (KJ_UNLIKELY((u & 0xf800) == 0xd800)) { 84 if (result.size() > 0 && 85 (u & 0xfc00) == 0xdc00 && 86 (result.back() & 0xfc00) == 0xd800) { 87 // Whoops, the *previous* character was also an invalid surrogate, and if we add this 88 // one too, they'll form a valid surrogate pair. If we allowed this, then it would mean 89 // invalid UTF-8 round-tripped to UTF-16 and back could actually change meaning entirely. 90 // OTOH, the reason we allow dangling surrogates is to allow invalid UTF-16 to round-trip 91 // to UTF-8 without loss, but if the original UTF-16 had a valid surrogate pair, it would 92 // have been encoded as a valid single UTF-8 codepoint, not as separate UTF-8 codepoints 93 // for each surrogate. 94 goto error; 95 } 96 97 hadErrors = true; 98 } 99 100 result.add(u); 101 continue; 102 } else if (c < 0xf8) { 103 // 11110xxx -- 4-byte 104 byte c2, c3, c4; 105 GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i; 106 GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i; 107 GOTO_ERROR_IF(i == text.size() || ((c4 = text[i]) & 0xc0) != 0x80); ++i; 108 char32_t u = (static_cast<char32_t>(c & 0x07) << 18) 109 | (static_cast<char32_t>(c2 & 0x3f) << 12) 110 | (static_cast<char32_t>(c3 & 0x3f) << 6) 111 | (static_cast<char32_t>(c4 & 0x3f) ); 112 113 // Disallow overlong sequence. 114 GOTO_ERROR_IF(u < 0x10000); 115 116 // Unicode ends at U+10FFFF 117 GOTO_ERROR_IF(u >= 0x110000); 118 119 addChar32(result, u); 120 continue; 121 } else { 122 // 5-byte and 6-byte sequences are not legal as they'd result in codepoints outside the 123 // range of Unicode. 124 goto error; 125 } 126 127 error: 128 result.add(0xfffd); 129 hadErrors = true; 130 // Ignore all continuation bytes. 131 while (i < text.size() && (text[i] & 0xc0) == 0x80) { 132 ++i; 133 } 134 } 135 136 if (nulTerminate) result.add(0); 137 138 return { result.releaseAsArray(), hadErrors }; 139 } 140 141 } // namespace 142 143 EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) { 144 return encodeUtf<char16_t>(text, nulTerminate); 145 } 146 147 EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) { 148 return encodeUtf<char32_t>(text, nulTerminate); 149 } 150 151 EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) { 152 Vector<char> result(utf16.size() + 1); 153 bool hadErrors = false; 154 155 size_t i = 0; 156 while (i < utf16.size()) { 157 char16_t u = utf16[i++]; 158 159 if (u < 0x80) { 160 result.add(u); 161 continue; 162 } else if (u < 0x0800) { 163 result.addAll<std::initializer_list<char>>({ 164 static_cast<char>(((u >> 6) ) | 0xc0), 165 static_cast<char>(((u ) & 0x3f) | 0x80) 166 }); 167 continue; 168 } else if ((u & 0xf800) == 0xd800) { 169 // surrogate pair 170 char16_t u2; 171 if (KJ_UNLIKELY(i == utf16.size() // missing second half 172 || (u & 0x0400) != 0 // first half in wrong range 173 || ((u2 = utf16[i]) & 0xfc00) != 0xdc00)) { // second half in wrong range 174 hadErrors = true; 175 goto threeByte; 176 } 177 ++i; 178 179 char32_t u32 = (((u & 0x03ff) << 10) | (u2 & 0x03ff)) + 0x10000; 180 result.addAll<std::initializer_list<char>>({ 181 static_cast<char>(((u32 >> 18) ) | 0xf0), 182 static_cast<char>(((u32 >> 12) & 0x3f) | 0x80), 183 static_cast<char>(((u32 >> 6) & 0x3f) | 0x80), 184 static_cast<char>(((u32 ) & 0x3f) | 0x80) 185 }); 186 continue; 187 } else { 188 threeByte: 189 result.addAll<std::initializer_list<char>>({ 190 static_cast<char>(((u >> 12) ) | 0xe0), 191 static_cast<char>(((u >> 6) & 0x3f) | 0x80), 192 static_cast<char>(((u ) & 0x3f) | 0x80) 193 }); 194 continue; 195 } 196 } 197 198 result.add(0); 199 return { String(result.releaseAsArray()), hadErrors }; 200 } 201 202 EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) { 203 Vector<char> result(utf16.size() + 1); 204 bool hadErrors = false; 205 206 size_t i = 0; 207 while (i < utf16.size()) { 208 char32_t u = utf16[i++]; 209 210 if (u < 0x80) { 211 result.add(u); 212 continue; 213 } else if (u < 0x0800) { 214 result.addAll<std::initializer_list<char>>({ 215 static_cast<char>(((u >> 6) ) | 0xc0), 216 static_cast<char>(((u ) & 0x3f) | 0x80) 217 }); 218 continue; 219 } else if (u < 0x10000) { 220 if (KJ_UNLIKELY((u & 0xfffff800) == 0xd800)) { 221 // no surrogates allowed in utf-32 222 hadErrors = true; 223 } 224 result.addAll<std::initializer_list<char>>({ 225 static_cast<char>(((u >> 12) ) | 0xe0), 226 static_cast<char>(((u >> 6) & 0x3f) | 0x80), 227 static_cast<char>(((u ) & 0x3f) | 0x80) 228 }); 229 continue; 230 } else { 231 GOTO_ERROR_IF(u >= 0x110000); // outside Unicode range 232 result.addAll<std::initializer_list<char>>({ 233 static_cast<char>(((u >> 18) ) | 0xf0), 234 static_cast<char>(((u >> 12) & 0x3f) | 0x80), 235 static_cast<char>(((u >> 6) & 0x3f) | 0x80), 236 static_cast<char>(((u ) & 0x3f) | 0x80) 237 }); 238 continue; 239 } 240 241 error: 242 result.addAll(StringPtr(u8"\ufffd")); 243 hadErrors = true; 244 } 245 246 result.add(0); 247 return { String(result.releaseAsArray()), hadErrors }; 248 } 249 250 namespace { 251 252 #if __GNUC__ >= 8 && !__clang__ 253 // GCC 8's new class-memaccess warning rightly dislikes the following hacks, but we're really sure 254 // we want to allow them so disable the warning. 255 #pragma GCC diagnostic push 256 #pragma GCC diagnostic ignored "-Wclass-memaccess" 257 #endif 258 259 template <typename To, typename From> 260 Array<To> coerceTo(Array<From>&& array) { 261 static_assert(sizeof(To) == sizeof(From), "incompatible coercion"); 262 Array<wchar_t> result; 263 memcpy(&result, &array, sizeof(array)); 264 memset(&array, 0, sizeof(array)); 265 return result; 266 } 267 268 template <typename To, typename From> 269 ArrayPtr<To> coerceTo(ArrayPtr<From> array) { 270 static_assert(sizeof(To) == sizeof(From), "incompatible coercion"); 271 return arrayPtr(reinterpret_cast<To*>(array.begin()), array.size()); 272 } 273 274 template <typename To, typename From> 275 EncodingResult<Array<To>> coerceTo(EncodingResult<Array<From>>&& result) { 276 return { coerceTo<To>(Array<From>(kj::mv(result))), result.hadErrors }; 277 } 278 279 #if __GNUC__ >= 8 && !__clang__ 280 #pragma GCC diagnostic pop 281 #endif 282 283 template <size_t s> 284 struct WideConverter; 285 286 template <> 287 struct WideConverter<sizeof(char)> { 288 typedef char Type; 289 290 static EncodingResult<Array<char>> encode(ArrayPtr<const char> text, bool nulTerminate) { 291 auto result = heapArray<char>(text.size() + nulTerminate); 292 memcpy(result.begin(), text.begin(), text.size()); 293 if (nulTerminate) result.back() = 0; 294 return { kj::mv(result), false }; 295 } 296 297 static EncodingResult<kj::String> decode(ArrayPtr<const char> text) { 298 return { kj::heapString(text), false }; 299 } 300 }; 301 302 template <> 303 struct WideConverter<sizeof(char16_t)> { 304 typedef char16_t Type; 305 306 static inline EncodingResult<Array<char16_t>> encode( 307 ArrayPtr<const char> text, bool nulTerminate) { 308 return encodeUtf16(text, nulTerminate); 309 } 310 311 static inline EncodingResult<kj::String> decode(ArrayPtr<const char16_t> text) { 312 return decodeUtf16(text); 313 } 314 }; 315 316 template <> 317 struct WideConverter<sizeof(char32_t)> { 318 typedef char32_t Type; 319 320 static inline EncodingResult<Array<char32_t>> encode( 321 ArrayPtr<const char> text, bool nulTerminate) { 322 return encodeUtf32(text, nulTerminate); 323 } 324 325 static inline EncodingResult<kj::String> decode(ArrayPtr<const char32_t> text) { 326 return decodeUtf32(text); 327 } 328 }; 329 330 } // namespace 331 332 EncodingResult<Array<wchar_t>> encodeWideString(ArrayPtr<const char> text, bool nulTerminate) { 333 return coerceTo<wchar_t>(WideConverter<sizeof(wchar_t)>::encode(text, nulTerminate)); 334 } 335 EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide) { 336 using Converter = WideConverter<sizeof(wchar_t)>; 337 return Converter::decode(coerceTo<const Converter::Type>(wide)); 338 } 339 340 // ======================================================================================= 341 342 namespace { 343 344 const char HEX_DIGITS[] = "0123456789abcdef"; 345 // Maps integer in the range [0,16) to a hex digit. 346 347 const char HEX_DIGITS_URI[] = "0123456789ABCDEF"; 348 // RFC 3986 section 2.1 says "For consistency, URI producers and normalizers should use uppercase 349 // hexadecimal digits for all percent-encodings. 350 351 static Maybe<uint> tryFromHexDigit(char c) { 352 if ('0' <= c && c <= '9') { 353 return c - '0'; 354 } else if ('a' <= c && c <= 'f') { 355 return c - ('a' - 10); 356 } else if ('A' <= c && c <= 'F') { 357 return c - ('A' - 10); 358 } else { 359 return nullptr; 360 } 361 } 362 363 static Maybe<uint> tryFromOctDigit(char c) { 364 if ('0' <= c && c <= '7') { 365 return c - '0'; 366 } else { 367 return nullptr; 368 } 369 } 370 371 } // namespace 372 373 String encodeHex(ArrayPtr<const byte> input) { 374 return strArray(KJ_MAP(b, input) { 375 return heapArray<char>({HEX_DIGITS[b/16], HEX_DIGITS[b%16]}); 376 }, ""); 377 } 378 379 EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text) { 380 auto result = heapArray<byte>(text.size() / 2); 381 bool hadErrors = text.size() % 2; 382 383 for (auto i: kj::indices(result)) { 384 byte b = 0; 385 KJ_IF_MAYBE(d1, tryFromHexDigit(text[i*2])) { 386 b = *d1 << 4; 387 } else { 388 hadErrors = true; 389 } 390 KJ_IF_MAYBE(d2, tryFromHexDigit(text[i*2+1])) { 391 b |= *d2; 392 } else { 393 hadErrors = true; 394 } 395 result[i] = b; 396 } 397 398 return { kj::mv(result), hadErrors }; 399 } 400 401 String encodeUriComponent(ArrayPtr<const byte> bytes) { 402 Vector<char> result(bytes.size() + 1); 403 for (byte b: bytes) { 404 if (('A' <= b && b <= 'Z') || 405 ('a' <= b && b <= 'z') || 406 ('0' <= b && b <= '9') || 407 b == '-' || b == '_' || b == '.' || b == '!' || b == '~' || b == '*' || b == '\'' || 408 b == '(' || b == ')') { 409 result.add(b); 410 } else { 411 result.add('%'); 412 result.add(HEX_DIGITS_URI[b/16]); 413 result.add(HEX_DIGITS_URI[b%16]); 414 } 415 } 416 result.add('\0'); 417 return String(result.releaseAsArray()); 418 } 419 420 String encodeUriFragment(ArrayPtr<const byte> bytes) { 421 Vector<char> result(bytes.size() + 1); 422 for (byte b: bytes) { 423 if (('?' <= b && b <= '_') || // covers A-Z 424 ('a' <= b && b <= '~') || // covers a-z 425 ('&' <= b && b <= ';') || // covers 0-9 426 b == '!' || b == '=' || b == '#' || b == '$') { 427 result.add(b); 428 } else { 429 result.add('%'); 430 result.add(HEX_DIGITS_URI[b/16]); 431 result.add(HEX_DIGITS_URI[b%16]); 432 } 433 } 434 result.add('\0'); 435 return String(result.releaseAsArray()); 436 } 437 438 String encodeUriPath(ArrayPtr<const byte> bytes) { 439 Vector<char> result(bytes.size() + 1); 440 for (byte b: bytes) { 441 if (('@' <= b && b <= '[') || // covers A-Z 442 ('a' <= b && b <= 'z') || 443 ('0' <= b && b <= ';') || // covers 0-9 444 ('&' <= b && b <= '.') || 445 b == '_' || b == '!' || b == '=' || b == ']' || 446 b == '^' || b == '|' || b == '~' || b == '$') { 447 result.add(b); 448 } else { 449 result.add('%'); 450 result.add(HEX_DIGITS_URI[b/16]); 451 result.add(HEX_DIGITS_URI[b%16]); 452 } 453 } 454 result.add('\0'); 455 return String(result.releaseAsArray()); 456 } 457 458 String encodeUriUserInfo(ArrayPtr<const byte> bytes) { 459 Vector<char> result(bytes.size() + 1); 460 for (byte b: bytes) { 461 if (('A' <= b && b <= 'Z') || 462 ('a' <= b && b <= 'z') || 463 ('0' <= b && b <= '9') || 464 ('&' <= b && b <= '.') || 465 b == '_' || b == '!' || b == '~' || b == '$') { 466 result.add(b); 467 } else { 468 result.add('%'); 469 result.add(HEX_DIGITS_URI[b/16]); 470 result.add(HEX_DIGITS_URI[b%16]); 471 } 472 } 473 result.add('\0'); 474 return String(result.releaseAsArray()); 475 } 476 477 String encodeWwwForm(ArrayPtr<const byte> bytes) { 478 Vector<char> result(bytes.size() + 1); 479 for (byte b: bytes) { 480 if (('A' <= b && b <= 'Z') || 481 ('a' <= b && b <= 'z') || 482 ('0' <= b && b <= '9') || 483 b == '-' || b == '_' || b == '.' || b == '*') { 484 result.add(b); 485 } else if (b == ' ') { 486 result.add('+'); 487 } else { 488 result.add('%'); 489 result.add(HEX_DIGITS_URI[b/16]); 490 result.add(HEX_DIGITS_URI[b%16]); 491 } 492 } 493 result.add('\0'); 494 return String(result.releaseAsArray()); 495 } 496 497 EncodingResult<Array<byte>> decodeBinaryUriComponent( 498 ArrayPtr<const char> text, DecodeUriOptions options) { 499 Vector<byte> result(text.size() + options.nulTerminate); 500 bool hadErrors = false; 501 502 const char* ptr = text.begin(); 503 const char* end = text.end(); 504 while (ptr < end) { 505 if (*ptr == '%') { 506 ++ptr; 507 508 if (ptr == end) { 509 hadErrors = true; 510 } else KJ_IF_MAYBE(d1, tryFromHexDigit(*ptr)) { 511 byte b = *d1; 512 ++ptr; 513 if (ptr == end) { 514 hadErrors = true; 515 } else KJ_IF_MAYBE(d2, tryFromHexDigit(*ptr)) { 516 b = (b << 4) | *d2; 517 ++ptr; 518 } else { 519 hadErrors = true; 520 } 521 result.add(b); 522 } else { 523 hadErrors = true; 524 } 525 } else if (options.plusToSpace && *ptr == '+') { 526 ++ptr; 527 result.add(' '); 528 } else { 529 result.add(*ptr++); 530 } 531 } 532 533 if (options.nulTerminate) result.add(0); 534 return { result.releaseAsArray(), hadErrors }; 535 } 536 537 // ======================================================================================= 538 539 namespace _ { // private 540 541 String encodeCEscapeImpl(ArrayPtr<const byte> bytes, bool isBinary) { 542 Vector<char> escaped(bytes.size()); 543 544 for (byte b: bytes) { 545 switch (b) { 546 case '\a': escaped.addAll(StringPtr("\\a")); break; 547 case '\b': escaped.addAll(StringPtr("\\b")); break; 548 case '\f': escaped.addAll(StringPtr("\\f")); break; 549 case '\n': escaped.addAll(StringPtr("\\n")); break; 550 case '\r': escaped.addAll(StringPtr("\\r")); break; 551 case '\t': escaped.addAll(StringPtr("\\t")); break; 552 case '\v': escaped.addAll(StringPtr("\\v")); break; 553 case '\'': escaped.addAll(StringPtr("\\\'")); break; 554 case '\"': escaped.addAll(StringPtr("\\\"")); break; 555 case '\\': escaped.addAll(StringPtr("\\\\")); break; 556 default: 557 if (b < 0x20 || b == 0x7f || (isBinary && b > 0x7f)) { 558 // Use octal escape, not hex, because hex escapes technically have no length limit and 559 // so can create ambiguity with subsequent characters. 560 escaped.add('\\'); 561 escaped.add(HEX_DIGITS[b / 64]); 562 escaped.add(HEX_DIGITS[(b / 8) % 8]); 563 escaped.add(HEX_DIGITS[b % 8]); 564 } else { 565 escaped.add(b); 566 } 567 break; 568 } 569 } 570 571 escaped.add(0); 572 return String(escaped.releaseAsArray()); 573 } 574 575 } // namespace 576 577 EncodingResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate) { 578 Vector<byte> result(text.size() + nulTerminate); 579 bool hadErrors = false; 580 581 size_t i = 0; 582 while (i < text.size()) { 583 char c = text[i++]; 584 if (c == '\\') { 585 if (i == text.size()) { 586 hadErrors = true; 587 continue; 588 } 589 char c2 = text[i++]; 590 switch (c2) { 591 case 'a' : result.add('\a'); break; 592 case 'b' : result.add('\b'); break; 593 case 'f' : result.add('\f'); break; 594 case 'n' : result.add('\n'); break; 595 case 'r' : result.add('\r'); break; 596 case 't' : result.add('\t'); break; 597 case 'v' : result.add('\v'); break; 598 case '\'': result.add('\''); break; 599 case '\"': result.add('\"'); break; 600 case '\\': result.add('\\'); break; 601 602 case '0': 603 case '1': 604 case '2': 605 case '3': 606 case '4': 607 case '5': 608 case '6': 609 case '7': { 610 uint value = c2 - '0'; 611 for (uint j = 0; j < 2 && i < text.size(); j++) { 612 KJ_IF_MAYBE(d, tryFromOctDigit(text[i])) { 613 ++i; 614 value = (value << 3) | *d; 615 } else { 616 break; 617 } 618 } 619 if (value >= 0x100) hadErrors = true; 620 result.add(value); 621 break; 622 } 623 624 case 'x': { 625 uint value = 0; 626 while (i < text.size()) { 627 KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) { 628 ++i; 629 value = (value << 4) | *d; 630 } else { 631 break; 632 } 633 } 634 if (value >= 0x100) hadErrors = true; 635 result.add(value); 636 break; 637 } 638 639 case 'u': { 640 char16_t value = 0; 641 for (uint j = 0; j < 4; j++) { 642 if (i == text.size()) { 643 hadErrors = true; 644 break; 645 } else KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) { 646 ++i; 647 value = (value << 4) | *d; 648 } else { 649 hadErrors = true; 650 break; 651 } 652 } 653 auto utf = decodeUtf16(arrayPtr(&value, 1)); 654 if (utf.hadErrors) hadErrors = true; 655 result.addAll(utf.asBytes()); 656 break; 657 } 658 659 case 'U': { 660 char32_t value = 0; 661 for (uint j = 0; j < 8; j++) { 662 if (i == text.size()) { 663 hadErrors = true; 664 break; 665 } else KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) { 666 ++i; 667 value = (value << 4) | *d; 668 } else { 669 hadErrors = true; 670 break; 671 } 672 } 673 auto utf = decodeUtf32(arrayPtr(&value, 1)); 674 if (utf.hadErrors) hadErrors = true; 675 result.addAll(utf.asBytes()); 676 break; 677 } 678 679 default: 680 result.add(c2); 681 } 682 } else { 683 result.add(c); 684 } 685 } 686 687 if (nulTerminate) result.add(0); 688 return { result.releaseAsArray(), hadErrors }; 689 } 690 691 // ======================================================================================= 692 // This code is derived from libb64 which has been placed in the public domain. 693 // For details, see http://sourceforge.net/projects/libb64 694 695 // ------------------------------------------------------------------- 696 // Encoder 697 698 namespace { 699 700 typedef enum { 701 step_A, step_B, step_C 702 } base64_encodestep; 703 704 typedef struct { 705 base64_encodestep step; 706 char result; 707 int stepcount; 708 } base64_encodestate; 709 710 const int CHARS_PER_LINE = 72; 711 712 void base64_init_encodestate(base64_encodestate* state_in) { 713 state_in->step = step_A; 714 state_in->result = 0; 715 state_in->stepcount = 0; 716 } 717 718 char base64_encode_value(char value_in) { 719 static const char* encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 720 if (value_in > 63) return '='; 721 return encoding[(int)value_in]; 722 } 723 724 int base64_encode_block(const char* plaintext_in, int length_in, 725 char* code_out, base64_encodestate* state_in, bool breakLines) { 726 const char* plainchar = plaintext_in; 727 const char* const plaintextend = plaintext_in + length_in; 728 char* codechar = code_out; 729 char result; 730 char fragment; 731 732 result = state_in->result; 733 734 switch (state_in->step) { 735 while (1) { 736 KJ_FALLTHROUGH; 737 case step_A: 738 if (plainchar == plaintextend) { 739 state_in->result = result; 740 state_in->step = step_A; 741 return codechar - code_out; 742 } 743 fragment = *plainchar++; 744 result = (fragment & 0x0fc) >> 2; 745 *codechar++ = base64_encode_value(result); 746 result = (fragment & 0x003) << 4; 747 KJ_FALLTHROUGH; 748 case step_B: 749 if (plainchar == plaintextend) { 750 state_in->result = result; 751 state_in->step = step_B; 752 return codechar - code_out; 753 } 754 fragment = *plainchar++; 755 result |= (fragment & 0x0f0) >> 4; 756 *codechar++ = base64_encode_value(result); 757 result = (fragment & 0x00f) << 2; 758 KJ_FALLTHROUGH; 759 case step_C: 760 if (plainchar == plaintextend) { 761 state_in->result = result; 762 state_in->step = step_C; 763 return codechar - code_out; 764 } 765 fragment = *plainchar++; 766 result |= (fragment & 0x0c0) >> 6; 767 *codechar++ = base64_encode_value(result); 768 result = (fragment & 0x03f) >> 0; 769 *codechar++ = base64_encode_value(result); 770 771 ++(state_in->stepcount); 772 if (breakLines && state_in->stepcount == CHARS_PER_LINE/4) { 773 *codechar++ = '\n'; 774 state_in->stepcount = 0; 775 } 776 } 777 } 778 /* control should not reach here */ 779 return codechar - code_out; 780 } 781 782 int base64_encode_blockend(char* code_out, base64_encodestate* state_in, bool breakLines) { 783 char* codechar = code_out; 784 785 switch (state_in->step) { 786 case step_B: 787 *codechar++ = base64_encode_value(state_in->result); 788 *codechar++ = '='; 789 *codechar++ = '='; 790 ++state_in->stepcount; 791 break; 792 case step_C: 793 *codechar++ = base64_encode_value(state_in->result); 794 *codechar++ = '='; 795 ++state_in->stepcount; 796 break; 797 case step_A: 798 break; 799 } 800 if (breakLines && state_in->stepcount > 0) { 801 *codechar++ = '\n'; 802 } 803 804 return codechar - code_out; 805 } 806 807 } // namespace 808 809 String encodeBase64(ArrayPtr<const byte> input, bool breakLines) { 810 /* set up a destination buffer large enough to hold the encoded data */ 811 // equivalent to ceil(input.size() / 3) * 4 812 auto numChars = (input.size() + 2) / 3 * 4; 813 if (breakLines) { 814 // Add space for newline characters. 815 uint lineCount = numChars / CHARS_PER_LINE; 816 if (numChars % CHARS_PER_LINE > 0) { 817 // Partial line. 818 ++lineCount; 819 } 820 numChars = numChars + lineCount; 821 } 822 auto output = heapString(numChars); 823 /* keep track of our encoded position */ 824 char* c = output.begin(); 825 /* store the number of bytes encoded by a single call */ 826 int cnt = 0; 827 size_t total = 0; 828 /* we need an encoder state */ 829 base64_encodestate s; 830 831 /*---------- START ENCODING ----------*/ 832 /* initialise the encoder state */ 833 base64_init_encodestate(&s); 834 /* gather data from the input and send it to the output */ 835 cnt = base64_encode_block((const char *)input.begin(), input.size(), c, &s, breakLines); 836 c += cnt; 837 total += cnt; 838 839 /* since we have encoded the entire input string, we know that 840 there is no more input data; finalise the encoding */ 841 cnt = base64_encode_blockend(c, &s, breakLines); 842 c += cnt; 843 total += cnt; 844 /*---------- STOP ENCODING ----------*/ 845 846 KJ_ASSERT(total == output.size(), total, output.size()); 847 848 return output; 849 } 850 851 // ------------------------------------------------------------------- 852 // Decoder 853 854 namespace { 855 856 typedef enum { 857 step_a, step_b, step_c, step_d 858 } base64_decodestep; 859 860 struct base64_decodestate { 861 bool hadErrors = false; 862 size_t nPaddingBytesSeen = 0; 863 // Output state. `nPaddingBytesSeen` is not guaranteed to be correct if `hadErrors` is true. It is 864 // included in the state purely to preserve the streaming capability of the algorithm while still 865 // checking for errors correctly (consider chunk 1 = "abc=", chunk 2 = "d"). 866 867 base64_decodestep step = step_a; 868 char plainchar = 0; 869 }; 870 871 int base64_decode_value(char value_in) { 872 // Returns either the fragment value or: -1 on whitespace, -2 on padding, -3 on invalid input. 873 // 874 // Note that the original libb64 implementation used -1 for invalid input, -2 on padding -- this 875 // new scheme allows for some simpler error checks in steps A and B. 876 877 static const signed char decoding[] = { 878 -3,-3,-3,-3,-3,-3,-3,-3, -3,-1,-1,-3,-1,-1,-3,-3, 879 -3,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,-3,-3,-3,-3,-3, 880 -1,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,62,-3,-3,-3,63, 881 52,53,54,55,56,57,58,59, 60,61,-3,-3,-3,-2,-3,-3, 882 -3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 883 15,16,17,18,19,20,21,22, 23,24,25,-3,-3,-3,-3,-3, 884 -3,26,27,28,29,30,31,32, 33,34,35,36,37,38,39,40, 885 41,42,43,44,45,46,47,48, 49,50,51,-3,-3,-3,-3,-3, 886 887 -3,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,-3,-3,-3,-3,-3, 888 -3,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,-3,-3,-3,-3,-3, 889 -3,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,-3,-3,-3,-3,-3, 890 -3,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,-3,-3,-3,-3,-3, 891 -3,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,-3,-3,-3,-3,-3, 892 -3,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,-3,-3,-3,-3,-3, 893 -3,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,-3,-3,-3,-3,-3, 894 -3,-3,-3,-3,-3,-3,-3,-3, -3,-3,-3,-3,-3,-3,-3,-3, 895 }; 896 static_assert(sizeof(decoding) == 256, "base64 decoding table size error"); 897 return decoding[(unsigned char)value_in]; 898 } 899 900 int base64_decode_block(const char* code_in, const int length_in, 901 char* plaintext_out, base64_decodestate* state_in) { 902 const char* codechar = code_in; 903 char* plainchar = plaintext_out; 904 signed char fragment; 905 906 if (state_in->step != step_a) { 907 *plainchar = state_in->plainchar; 908 } 909 910 #define ERROR_IF(predicate) state_in->hadErrors = state_in->hadErrors || (predicate) 911 912 switch (state_in->step) 913 { 914 while (1) 915 { 916 KJ_FALLTHROUGH; 917 case step_a: 918 do { 919 if (codechar == code_in+length_in) { 920 state_in->step = step_a; 921 state_in->plainchar = '\0'; 922 return plainchar - plaintext_out; 923 } 924 fragment = (signed char)base64_decode_value(*codechar++); 925 // It is an error to see invalid or padding bytes in step A. 926 ERROR_IF(fragment < -1); 927 } while (fragment < 0); 928 *plainchar = (fragment & 0x03f) << 2; 929 KJ_FALLTHROUGH; 930 case step_b: 931 do { 932 if (codechar == code_in+length_in) { 933 state_in->step = step_b; 934 state_in->plainchar = *plainchar; 935 // It is always an error to suspend from step B, because we don't have enough bits yet. 936 // TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is 937 // to be called multiple times. We'll fix it if we ever care to support streaming. 938 state_in->hadErrors = true; 939 return plainchar - plaintext_out; 940 } 941 fragment = (signed char)base64_decode_value(*codechar++); 942 // It is an error to see invalid or padding bytes in step B. 943 ERROR_IF(fragment < -1); 944 } while (fragment < 0); 945 *plainchar++ |= (fragment & 0x030) >> 4; 946 *plainchar = (fragment & 0x00f) << 4; 947 KJ_FALLTHROUGH; 948 case step_c: 949 do { 950 if (codechar == code_in+length_in) { 951 state_in->step = step_c; 952 state_in->plainchar = *plainchar; 953 // It is an error to complete from step C if we have seen incomplete padding. 954 // TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is 955 // to be called multiple times. We'll fix it if we ever care to support streaming. 956 ERROR_IF(state_in->nPaddingBytesSeen == 1); 957 return plainchar - plaintext_out; 958 } 959 fragment = (signed char)base64_decode_value(*codechar++); 960 // It is an error to see invalid bytes or more than two padding bytes in step C. 961 ERROR_IF(fragment < -2 || (fragment == -2 && ++state_in->nPaddingBytesSeen > 2)); 962 } while (fragment < 0); 963 // It is an error to continue from step C after having seen any padding. 964 ERROR_IF(state_in->nPaddingBytesSeen > 0); 965 *plainchar++ |= (fragment & 0x03c) >> 2; 966 *plainchar = (fragment & 0x003) << 6; 967 KJ_FALLTHROUGH; 968 case step_d: 969 do { 970 if (codechar == code_in+length_in) { 971 state_in->step = step_d; 972 state_in->plainchar = *plainchar; 973 return plainchar - plaintext_out; 974 } 975 fragment = (signed char)base64_decode_value(*codechar++); 976 // It is an error to see invalid bytes or more than one padding byte in step D. 977 ERROR_IF(fragment < -2 || (fragment == -2 && ++state_in->nPaddingBytesSeen > 1)); 978 } while (fragment < 0); 979 // It is an error to continue from step D after having seen padding bytes. 980 ERROR_IF(state_in->nPaddingBytesSeen > 0); 981 *plainchar++ |= (fragment & 0x03f); 982 } 983 } 984 985 #undef ERROR_IF 986 987 /* control should not reach here */ 988 return plainchar - plaintext_out; 989 } 990 991 } // namespace 992 993 EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> input) { 994 base64_decodestate state; 995 996 auto output = heapArray<byte>((input.size() * 6 + 7) / 8); 997 998 size_t n = base64_decode_block(input.begin(), input.size(), 999 reinterpret_cast<char*>(output.begin()), &state); 1000 1001 if (n < output.size()) { 1002 auto copy = heapArray<byte>(n); 1003 memcpy(copy.begin(), output.begin(), n); 1004 output = kj::mv(copy); 1005 } 1006 1007 return EncodingResult<Array<byte>>(kj::mv(output), state.hadErrors); 1008 } 1009 1010 String encodeBase64Url(ArrayPtr<const byte> bytes) { 1011 // TODO(perf): Rewrite as single pass? 1012 // TODO(someday): Write decoder? 1013 1014 auto base64 = kj::encodeBase64(bytes); 1015 1016 for (char& c: base64) { 1017 if (c == '+') c = '-'; 1018 if (c == '/') c = '_'; 1019 } 1020 1021 // Remove trailing '='s. 1022 kj::ArrayPtr<const char> slice = base64; 1023 while (slice.size() > 0 && slice.back() == '=') { 1024 slice = slice.slice(0, slice.size() - 1); 1025 } 1026 1027 return kj::str(slice); 1028 } 1029 1030 } // namespace kj