capnproto

FORK: Cap'n Proto serialization/RPC system - core tools and C++ library
git clone https://git.neptards.moe/neptards/capnproto.git
Log | Files | Refs | README | LICENSE

url.c++ (15895B)


      1 // Copyright (c) 2017 Cloudflare, Inc. and contributors
      2 // Licensed under the MIT License:
      3 //
      4 // Permission is hereby granted, free of charge, to any person obtaining a copy
      5 // of this software and associated documentation files (the "Software"), to deal
      6 // in the Software without restriction, including without limitation the rights
      7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      8 // copies of the Software, and to permit persons to whom the Software is
      9 // furnished to do so, subject to the following conditions:
     10 //
     11 // The above copyright notice and this permission notice shall be included in
     12 // all copies or substantial portions of the Software.
     13 //
     14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     20 // THE SOFTWARE.
     21 
     22 #include "url.h"
     23 #include <kj/encoding.h>
     24 #include <kj/parse/char.h>
     25 #include <kj/debug.h>
     26 #include <stdlib.h>
     27 
     28 namespace kj {
     29 
     30 namespace {
     31 
     32 constexpr auto ALPHAS = parse::charRange('a', 'z').orRange('A', 'Z');
     33 constexpr auto DIGITS = parse::charRange('0', '9');
     34 
     35 constexpr auto END_AUTHORITY = parse::anyOfChars("/?#");
     36 
     37 // Authority, path, and query components can typically be terminated by the start of a fragment.
     38 // However, fragments are disallowed in HTTP_REQUEST and HTTP_PROXY_REQUEST contexts. As a quirk, we
     39 // allow the fragment start character ('#') to live unescaped in path and query components. We do
     40 // not currently allow it in the authority component, because our parser would reject it as a host
     41 // character anyway.
     42 
     43 const parse::CharGroup_& getEndPathPart(Url::Context context) {
     44   static constexpr auto END_PATH_PART_HREF = parse::anyOfChars("/?#");
     45   static constexpr auto END_PATH_PART_REQUEST = parse::anyOfChars("/?");
     46 
     47   switch (context) {
     48     case Url::REMOTE_HREF:        return END_PATH_PART_HREF;
     49     case Url::HTTP_PROXY_REQUEST: return END_PATH_PART_REQUEST;
     50     case Url::HTTP_REQUEST:       return END_PATH_PART_REQUEST;
     51   }
     52 
     53   KJ_UNREACHABLE;
     54 }
     55 
     56 const parse::CharGroup_& getEndQueryPart(Url::Context context) {
     57   static constexpr auto END_QUERY_PART_HREF = parse::anyOfChars("&#");
     58   static constexpr auto END_QUERY_PART_REQUEST = parse::anyOfChars("&");
     59 
     60   switch (context) {
     61     case Url::REMOTE_HREF:        return END_QUERY_PART_HREF;
     62     case Url::HTTP_PROXY_REQUEST: return END_QUERY_PART_REQUEST;
     63     case Url::HTTP_REQUEST:       return END_QUERY_PART_REQUEST;
     64   }
     65 
     66   KJ_UNREACHABLE;
     67 }
     68 
     69 constexpr auto SCHEME_CHARS = ALPHAS.orGroup(DIGITS).orAny("+-.");
     70 constexpr auto NOT_SCHEME_CHARS = SCHEME_CHARS.invert();
     71 
     72 constexpr auto HOST_CHARS = ALPHAS.orGroup(DIGITS).orAny(".-:[]_");
     73 // [] is for ipv6 literals.
     74 // _ is not allowed in domain names, but the WHATWG URL spec allows it in hostnames, so we do, too.
     75 // TODO(someday): The URL spec actually allows a lot more than just '_', and requires nameprepping
     76 //   to Punycode. We'll have to decide how we want to deal with all that.
     77 
     78 void toLower(String& text) {
     79   for (char& c: text) {
     80     if ('A' <= c && c <= 'Z') {
     81       c += 'a' - 'A';
     82     }
     83   }
     84 }
     85 
     86 Maybe<ArrayPtr<const char>> trySplit(StringPtr& text, char c) {
     87   KJ_IF_MAYBE(pos, text.findFirst(c)) {
     88     ArrayPtr<const char> result = text.slice(0, *pos);
     89     text = text.slice(*pos + 1);
     90     return result;
     91   } else {
     92     return nullptr;
     93   }
     94 }
     95 
     96 Maybe<ArrayPtr<const char>> trySplit(ArrayPtr<const char>& text, char c) {
     97   for (auto i: kj::indices(text)) {
     98     if (text[i] == c) {
     99       ArrayPtr<const char> result = text.slice(0, i);
    100       text = text.slice(i + 1, text.size());
    101       return result;
    102     }
    103   }
    104   return nullptr;
    105 }
    106 
    107 ArrayPtr<const char> split(StringPtr& text, const parse::CharGroup_& chars) {
    108   for (auto i: kj::indices(text)) {
    109     if (chars.contains(text[i])) {
    110       ArrayPtr<const char> result = text.slice(0, i);
    111       text = text.slice(i);
    112       return result;
    113     }
    114   }
    115   auto result = text.asArray();
    116   text = "";
    117   return result;
    118 }
    119 
    120 String percentDecode(ArrayPtr<const char> text, bool& hadErrors, const Url::Options& options) {
    121   if (options.percentDecode) {
    122     auto result = decodeUriComponent(text);
    123     if (result.hadErrors) hadErrors = true;
    124     return kj::mv(result);
    125   }
    126   return kj::str(text);
    127 }
    128 
    129 String percentDecodeQuery(ArrayPtr<const char> text, bool& hadErrors, const Url::Options& options) {
    130   if (options.percentDecode) {
    131     auto result = decodeWwwForm(text);
    132     if (result.hadErrors) hadErrors = true;
    133     return kj::mv(result);
    134   }
    135   return kj::str(text);
    136 }
    137 
    138 }  // namespace
    139 
    140 Url::~Url() noexcept(false) {}
    141 
    142 Url Url::clone() const {
    143   return {
    144     kj::str(scheme),
    145     userInfo.map([](const UserInfo& ui) -> UserInfo {
    146       return {
    147         kj::str(ui.username),
    148         ui.password.map([](const String& s) { return kj::str(s); })
    149       };
    150     }),
    151     kj::str(host),
    152     KJ_MAP(part, path) { return kj::str(part); },
    153     hasTrailingSlash,
    154     KJ_MAP(param, query) -> QueryParam {
    155       // Preserve the "allocated-ness" of `param.value` with this careful copy.
    156       return { kj::str(param.name), param.value.begin() == nullptr ? kj::String()
    157                                                                    : kj::str(param.value) };
    158     },
    159     fragment.map([](const String& s) { return kj::str(s); }),
    160     options
    161   };
    162 }
    163 
// Parses `url` by delegating to tryParse() with the same context and options, and returns the
// resulting Url. If tryParse() produces no URL, KJ_REQUIRE_NONNULL fails with the message
// "invalid URL" and the offending text attached.
Url Url::parse(StringPtr url, Context context, Options options) {
  return KJ_REQUIRE_NONNULL(tryParse(url, context, options), "invalid URL", url);
}
    167 
    168 Maybe<Url> Url::tryParse(StringPtr text, Context context, Options options) {
    169   Url result;
    170   result.options = options;
    171   bool err = false;  // tracks percent-decoding errors
    172 
    173   auto& END_PATH_PART = getEndPathPart(context);
    174   auto& END_QUERY_PART = getEndQueryPart(context);
    175 
    176   if (context == HTTP_REQUEST) {
    177     if (!text.startsWith("/")) {
    178       return nullptr;
    179     }
    180   } else {
    181     KJ_IF_MAYBE(scheme, trySplit(text, ':')) {
    182       result.scheme = kj::str(*scheme);
    183     } else {
    184       // missing scheme
    185       return nullptr;
    186     }
    187     toLower(result.scheme);
    188     if (result.scheme.size() == 0 ||
    189         !ALPHAS.contains(result.scheme[0]) ||
    190         !SCHEME_CHARS.containsAll(result.scheme.slice(1))) {
    191       // bad scheme
    192       return nullptr;
    193     }
    194 
    195     if (!text.startsWith("//")) {
    196       // We require an authority (hostname) part.
    197       return nullptr;
    198     }
    199     text = text.slice(2);
    200 
    201     {
    202       auto authority = split(text, END_AUTHORITY);
    203 
    204       KJ_IF_MAYBE(userpass, trySplit(authority, '@')) {
    205         if (context != REMOTE_HREF) {
    206           // No user/pass allowed here.
    207           return nullptr;
    208         }
    209         KJ_IF_MAYBE(username, trySplit(*userpass, ':')) {
    210           result.userInfo = UserInfo {
    211             percentDecode(*username, err, options),
    212             percentDecode(*userpass, err, options)
    213           };
    214         } else {
    215           result.userInfo = UserInfo {
    216             percentDecode(*userpass, err, options),
    217             nullptr
    218           };
    219         }
    220       }
    221 
    222       result.host = percentDecode(authority, err, options);
    223       if (!HOST_CHARS.containsAll(result.host)) return nullptr;
    224       toLower(result.host);
    225     }
    226   }
    227 
    228   while (text.startsWith("/")) {
    229     text = text.slice(1);
    230     auto part = split(text, END_PATH_PART);
    231     if (part.size() == 2 && part[0] == '.' && part[1] == '.') {
    232       if (result.path.size() != 0) {
    233         result.path.removeLast();
    234       }
    235       result.hasTrailingSlash = true;
    236     } else if ((part.size() == 0 && (!options.allowEmpty || text.size() == 0)) ||
    237                (part.size() == 1 && part[0] == '.')) {
    238       // Collapse consecutive slashes and "/./".
    239       result.hasTrailingSlash = true;
    240     } else {
    241       result.path.add(percentDecode(part, err, options));
    242       result.hasTrailingSlash = false;
    243     }
    244   }
    245 
    246   if (text.startsWith("?")) {
    247     do {
    248       text = text.slice(1);
    249       auto part = split(text, END_QUERY_PART);
    250 
    251       if (part.size() > 0 || options.allowEmpty) {
    252         KJ_IF_MAYBE(key, trySplit(part, '=')) {
    253           result.query.add(QueryParam { percentDecodeQuery(*key, err, options),
    254                                         percentDecodeQuery(part, err, options) });
    255         } else {
    256           result.query.add(QueryParam { percentDecodeQuery(part, err, options), nullptr });
    257         }
    258       }
    259     } while (text.startsWith("&"));
    260   }
    261 
    262   if (text.startsWith("#")) {
    263     if (context != REMOTE_HREF) {
    264       // No fragment allowed here.
    265       return nullptr;
    266     }
    267     result.fragment = percentDecode(text.slice(1), err, options);
    268   } else {
    269     // We should have consumed everything.
    270     KJ_ASSERT(text.size() == 0);
    271   }
    272 
    273   if (err) return nullptr;
    274 
    275   return kj::mv(result);
    276 }
    277 
// Resolves `url` against this URL by delegating to tryParseRelative() and returns the result.
// If tryParseRelative() produces no URL, KJ_REQUIRE_NONNULL fails with the message
// "invalid relative URL" and the offending text attached.
Url Url::parseRelative(StringPtr url) const {
  return KJ_REQUIRE_NONNULL(tryParseRelative(url), "invalid relative URL", url);
}
    281 
    282 Maybe<Url> Url::tryParseRelative(StringPtr text) const {
    283   if (text.size() == 0) return clone();
    284 
    285   Url result;
    286   result.options = options;
    287   bool err = false;  // tracks percent-decoding errors
    288 
    289   auto& END_PATH_PART = getEndPathPart(Url::REMOTE_HREF);
    290   auto& END_QUERY_PART = getEndQueryPart(Url::REMOTE_HREF);
    291 
    292   // scheme
    293   {
    294     bool gotScheme = false;
    295     for (auto i: kj::indices(text)) {
    296       if (text[i] == ':') {
    297         // found valid scheme
    298         result.scheme = kj::str(text.slice(0, i));
    299         text = text.slice(i + 1);
    300         gotScheme = true;
    301         break;
    302       } else if (NOT_SCHEME_CHARS.contains(text[i])) {
    303         // no scheme
    304         break;
    305       }
    306     }
    307     if (!gotScheme) {
    308       // copy scheme
    309       result.scheme = kj::str(this->scheme);
    310     }
    311   }
    312 
    313   // authority
    314   bool hadNewAuthority = text.startsWith("//");
    315   if (hadNewAuthority) {
    316     text = text.slice(2);
    317 
    318     auto authority = split(text, END_AUTHORITY);
    319 
    320     KJ_IF_MAYBE(userpass, trySplit(authority, '@')) {
    321       KJ_IF_MAYBE(username, trySplit(*userpass, ':')) {
    322         result.userInfo = UserInfo {
    323           percentDecode(*username, err, options),
    324           percentDecode(*userpass, err, options)
    325         };
    326       } else {
    327         result.userInfo = UserInfo {
    328           percentDecode(*userpass, err, options),
    329           nullptr
    330         };
    331       }
    332     }
    333 
    334     result.host = percentDecode(authority, err, options);
    335     if (!HOST_CHARS.containsAll(result.host)) return nullptr;
    336     toLower(result.host);
    337   } else {
    338     // copy authority
    339     result.host = kj::str(this->host);
    340     result.userInfo = this->userInfo.map([](const UserInfo& userInfo) {
    341       return UserInfo {
    342         kj::str(userInfo.username),
    343         userInfo.password.map([](const String& password) { return kj::str(password); }),
    344       };
    345     });
    346   }
    347 
    348   // path
    349   bool hadNewPath = text.size() > 0 && text[0] != '?' && text[0] != '#';
    350   if (hadNewPath) {
    351     // There's a new path.
    352 
    353     if (text[0] == '/') {
    354       // New path is absolute, so don't copy the old path.
    355       text = text.slice(1);
    356       result.hasTrailingSlash = true;
    357     } else if (this->path.size() > 0) {
    358       // New path is relative, so start from the old path, dropping everything after the last
    359       // slash.
    360       auto slice = this->path.slice(0, this->path.size() - (this->hasTrailingSlash ? 0 : 1));
    361       result.path = KJ_MAP(part, slice) { return kj::str(part); };
    362       result.hasTrailingSlash = true;
    363     }
    364 
    365     for (;;) {
    366       auto part = split(text, END_PATH_PART);
    367       if (part.size() == 2 && part[0] == '.' && part[1] == '.') {
    368         if (result.path.size() != 0) {
    369           result.path.removeLast();
    370         }
    371         result.hasTrailingSlash = true;
    372       } else if (part.size() == 0 || (part.size() == 1 && part[0] == '.')) {
    373         // Collapse consecutive slashes and "/./".
    374         result.hasTrailingSlash = true;
    375       } else {
    376         result.path.add(percentDecode(part, err, options));
    377         result.hasTrailingSlash = false;
    378       }
    379 
    380       if (!text.startsWith("/")) break;
    381       text = text.slice(1);
    382     }
    383   } else if (!hadNewAuthority) {
    384     // copy path
    385     result.path = KJ_MAP(part, this->path) { return kj::str(part); };
    386     result.hasTrailingSlash = this->hasTrailingSlash;
    387   }
    388 
    389   if (text.startsWith("?")) {
    390     do {
    391       text = text.slice(1);
    392       auto part = split(text, END_QUERY_PART);
    393 
    394       if (part.size() > 0) {
    395         KJ_IF_MAYBE(key, trySplit(part, '=')) {
    396           result.query.add(QueryParam { percentDecodeQuery(*key, err, options),
    397                                         percentDecodeQuery(part, err, options) });
    398         } else {
    399           result.query.add(QueryParam { percentDecodeQuery(part, err, options),
    400                                         nullptr });
    401         }
    402       }
    403     } while (text.startsWith("&"));
    404   } else if (!hadNewAuthority && !hadNewPath) {
    405     // copy query
    406     result.query = KJ_MAP(param, this->query) -> QueryParam {
    407       // Preserve the "allocated-ness" of `param.value` with this careful copy.
    408       return { kj::str(param.name), param.value.begin() == nullptr ? kj::String()
    409                                                                    : kj::str(param.value) };
    410     };
    411   }
    412 
    413   if (text.startsWith("#")) {
    414     result.fragment = percentDecode(text.slice(1), err, options);
    415   } else {
    416     // We should have consumed everything.
    417     KJ_ASSERT(text.size() == 0);
    418   }
    419 
    420   if (err) return nullptr;
    421 
    422   return kj::mv(result);
    423 }
    424 
    425 String Url::toString(Context context) const {
    426   Vector<char> chars(128);
    427 
    428   if (context != HTTP_REQUEST) {
    429     chars.addAll(scheme);
    430     chars.addAll(StringPtr("://"));
    431 
    432     if (context == REMOTE_HREF) {
    433       KJ_IF_MAYBE(user, userInfo) {
    434         chars.addAll(options.percentDecode ? encodeUriUserInfo(user->username)
    435                                           : kj::str(user->username));
    436         KJ_IF_MAYBE(pass, user->password) {
    437           chars.add(':');
    438           chars.addAll(options.percentDecode ? encodeUriUserInfo(*pass) : kj::str(*pass));
    439         }
    440         chars.add('@');
    441       }
    442     }
    443 
    444     // RFC3986 specifies that hosts can contain percent-encoding escapes while suggesting that
    445     // they should only be used for UTF-8 sequences. However, the DNS standard specifies a
    446     // different way to encode Unicode into domain names and doesn't permit any characters which
    447     // would need to be escaped. Meanwhile, encodeUriComponent() here would incorrectly try to
    448     // escape colons and brackets (e.g. around ipv6 literal addresses). So, instead, we throw if
    449     // the host is invalid.
    450     if (HOST_CHARS.containsAll(host)) {
    451       chars.addAll(host);
    452     } else {
    453       KJ_FAIL_REQUIRE("invalid hostname when stringifying URL", host) {
    454         chars.addAll(StringPtr("invalid-host"));
    455         break;
    456       }
    457     }
    458   }
    459 
    460   for (auto& pathPart: path) {
    461     // Protect against path injection.
    462     KJ_REQUIRE((pathPart != "" || options.allowEmpty) && pathPart != "." && pathPart != "..",
    463                "invalid name in URL path", path) {
    464       continue;
    465     }
    466     chars.add('/');
    467     chars.addAll(options.percentDecode ? encodeUriPath(pathPart) : kj::str(pathPart));
    468   }
    469   if (hasTrailingSlash || (path.size() == 0 && context == HTTP_REQUEST)) {
    470     chars.add('/');
    471   }
    472 
    473   bool first = true;
    474   for (auto& param: query) {
    475     chars.add(first ? '?' : '&');
    476     first = false;
    477     chars.addAll(options.percentDecode ? encodeWwwForm(param.name) : kj::str(param.name));
    478     if (param.value.begin() != nullptr) {
    479       chars.add('=');
    480       chars.addAll(options.percentDecode ? encodeWwwForm(param.value) : kj::str(param.value));
    481     }
    482   }
    483 
    484   if (context == REMOTE_HREF) {
    485     KJ_IF_MAYBE(f, fragment) {
    486       chars.add('#');
    487       chars.addAll(options.percentDecode ? encodeUriFragment(*f) : kj::str(*f));
    488     }
    489   }
    490 
    491   chars.add('\0');
    492   return String(chars.releaseAsArray());
    493 }
    494 
    495 } // namespace kj