url.c++ (15895B)
1 // Copyright (c) 2017 Cloudflare, Inc. and contributors 2 // Licensed under the MIT License: 3 // 4 // Permission is hereby granted, free of charge, to any person obtaining a copy 5 // of this software and associated documentation files (the "Software"), to deal 6 // in the Software without restriction, including without limitation the rights 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 // copies of the Software, and to permit persons to whom the Software is 9 // furnished to do so, subject to the following conditions: 10 // 11 // The above copyright notice and this permission notice shall be included in 12 // all copies or substantial portions of the Software. 13 // 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 // THE SOFTWARE. 21 22 #include "url.h" 23 #include <kj/encoding.h> 24 #include <kj/parse/char.h> 25 #include <kj/debug.h> 26 #include <stdlib.h> 27 28 namespace kj { 29 30 namespace { 31 32 constexpr auto ALPHAS = parse::charRange('a', 'z').orRange('A', 'Z'); 33 constexpr auto DIGITS = parse::charRange('0', '9'); 34 35 constexpr auto END_AUTHORITY = parse::anyOfChars("/?#"); 36 37 // Authority, path, and query components can typically be terminated by the start of a fragment. 38 // However, fragments are disallowed in HTTP_REQUEST and HTTP_PROXY_REQUEST contexts. As a quirk, we 39 // allow the fragment start character ('#') to live unescaped in path and query components. We do 40 // not currently allow it in the authority component, because our parser would reject it as a host 41 // character anyway. 42 43 const parse::CharGroup_& getEndPathPart(Url::Context context) { 44 static constexpr auto END_PATH_PART_HREF = parse::anyOfChars("/?#"); 45 static constexpr auto END_PATH_PART_REQUEST = parse::anyOfChars("/?"); 46 47 switch (context) { 48 case Url::REMOTE_HREF: return END_PATH_PART_HREF; 49 case Url::HTTP_PROXY_REQUEST: return END_PATH_PART_REQUEST; 50 case Url::HTTP_REQUEST: return END_PATH_PART_REQUEST; 51 } 52 53 KJ_UNREACHABLE; 54 } 55 56 const parse::CharGroup_& getEndQueryPart(Url::Context context) { 57 static constexpr auto END_QUERY_PART_HREF = parse::anyOfChars("&#"); 58 static constexpr auto END_QUERY_PART_REQUEST = parse::anyOfChars("&"); 59 60 switch (context) { 61 case Url::REMOTE_HREF: return END_QUERY_PART_HREF; 62 case Url::HTTP_PROXY_REQUEST: return END_QUERY_PART_REQUEST; 63 case Url::HTTP_REQUEST: return END_QUERY_PART_REQUEST; 64 } 65 66 KJ_UNREACHABLE; 67 } 68 69 constexpr auto SCHEME_CHARS = ALPHAS.orGroup(DIGITS).orAny("+-."); 70 constexpr auto NOT_SCHEME_CHARS = SCHEME_CHARS.invert(); 71 72 constexpr auto HOST_CHARS = ALPHAS.orGroup(DIGITS).orAny(".-:[]_"); 73 // [] is for ipv6 literals. 74 // _ is not allowed in domain names, but the WHATWG URL spec allows it in hostnames, so we do, too. 75 // TODO(someday): The URL spec actually allows a lot more than just '_', and requires nameprepping 76 // to Punycode. We'll have to decide how we want to deal with all that. 77 78 void toLower(String& text) { 79 for (char& c: text) { 80 if ('A' <= c && c <= 'Z') { 81 c += 'a' - 'A'; 82 } 83 } 84 } 85 86 Maybe<ArrayPtr<const char>> trySplit(StringPtr& text, char c) { 87 KJ_IF_MAYBE(pos, text.findFirst(c)) { 88 ArrayPtr<const char> result = text.slice(0, *pos); 89 text = text.slice(*pos + 1); 90 return result; 91 } else { 92 return nullptr; 93 } 94 } 95 96 Maybe<ArrayPtr<const char>> trySplit(ArrayPtr<const char>& text, char c) { 97 for (auto i: kj::indices(text)) { 98 if (text[i] == c) { 99 ArrayPtr<const char> result = text.slice(0, i); 100 text = text.slice(i + 1, text.size()); 101 return result; 102 } 103 } 104 return nullptr; 105 } 106 107 ArrayPtr<const char> split(StringPtr& text, const parse::CharGroup_& chars) { 108 for (auto i: kj::indices(text)) { 109 if (chars.contains(text[i])) { 110 ArrayPtr<const char> result = text.slice(0, i); 111 text = text.slice(i); 112 return result; 113 } 114 } 115 auto result = text.asArray(); 116 text = ""; 117 return result; 118 } 119 120 String percentDecode(ArrayPtr<const char> text, bool& hadErrors, const Url::Options& options) { 121 if (options.percentDecode) { 122 auto result = decodeUriComponent(text); 123 if (result.hadErrors) hadErrors = true; 124 return kj::mv(result); 125 } 126 return kj::str(text); 127 } 128 129 String percentDecodeQuery(ArrayPtr<const char> text, bool& hadErrors, const Url::Options& options) { 130 if (options.percentDecode) { 131 auto result = decodeWwwForm(text); 132 if (result.hadErrors) hadErrors = true; 133 return kj::mv(result); 134 } 135 return kj::str(text); 136 } 137 138 } // namespace 139 140 Url::~Url() noexcept(false) {} 141 142 Url Url::clone() const { 143 return { 144 kj::str(scheme), 145 userInfo.map([](const UserInfo& ui) -> UserInfo { 146 return { 147 kj::str(ui.username), 148 ui.password.map([](const String& s) { return kj::str(s); }) 149 }; 150 }), 151 kj::str(host), 152 KJ_MAP(part, path) { return kj::str(part); }, 153 hasTrailingSlash, 154 KJ_MAP(param, query) -> QueryParam { 155 // Preserve the "allocated-ness" of `param.value` with this careful copy. 156 return { kj::str(param.name), param.value.begin() == nullptr ? kj::String() 157 : kj::str(param.value) }; 158 }, 159 fragment.map([](const String& s) { return kj::str(s); }), 160 options 161 }; 162 } 163 164 Url Url::parse(StringPtr url, Context context, Options options) { 165 return KJ_REQUIRE_NONNULL(tryParse(url, context, options), "invalid URL", url); 166 } 167 168 Maybe<Url> Url::tryParse(StringPtr text, Context context, Options options) { 169 Url result; 170 result.options = options; 171 bool err = false; // tracks percent-decoding errors 172 173 auto& END_PATH_PART = getEndPathPart(context); 174 auto& END_QUERY_PART = getEndQueryPart(context); 175 176 if (context == HTTP_REQUEST) { 177 if (!text.startsWith("/")) { 178 return nullptr; 179 } 180 } else { 181 KJ_IF_MAYBE(scheme, trySplit(text, ':')) { 182 result.scheme = kj::str(*scheme); 183 } else { 184 // missing scheme 185 return nullptr; 186 } 187 toLower(result.scheme); 188 if (result.scheme.size() == 0 || 189 !ALPHAS.contains(result.scheme[0]) || 190 !SCHEME_CHARS.containsAll(result.scheme.slice(1))) { 191 // bad scheme 192 return nullptr; 193 } 194 195 if (!text.startsWith("//")) { 196 // We require an authority (hostname) part. 197 return nullptr; 198 } 199 text = text.slice(2); 200 201 { 202 auto authority = split(text, END_AUTHORITY); 203 204 KJ_IF_MAYBE(userpass, trySplit(authority, '@')) { 205 if (context != REMOTE_HREF) { 206 // No user/pass allowed here. 207 return nullptr; 208 } 209 KJ_IF_MAYBE(username, trySplit(*userpass, ':')) { 210 result.userInfo = UserInfo { 211 percentDecode(*username, err, options), 212 percentDecode(*userpass, err, options) 213 }; 214 } else { 215 result.userInfo = UserInfo { 216 percentDecode(*userpass, err, options), 217 nullptr 218 }; 219 } 220 } 221 222 result.host = percentDecode(authority, err, options); 223 if (!HOST_CHARS.containsAll(result.host)) return nullptr; 224 toLower(result.host); 225 } 226 } 227 228 while (text.startsWith("/")) { 229 text = text.slice(1); 230 auto part = split(text, END_PATH_PART); 231 if (part.size() == 2 && part[0] == '.' && part[1] == '.') { 232 if (result.path.size() != 0) { 233 result.path.removeLast(); 234 } 235 result.hasTrailingSlash = true; 236 } else if ((part.size() == 0 && (!options.allowEmpty || text.size() == 0)) || 237 (part.size() == 1 && part[0] == '.')) { 238 // Collapse consecutive slashes and "/./". 239 result.hasTrailingSlash = true; 240 } else { 241 result.path.add(percentDecode(part, err, options)); 242 result.hasTrailingSlash = false; 243 } 244 } 245 246 if (text.startsWith("?")) { 247 do { 248 text = text.slice(1); 249 auto part = split(text, END_QUERY_PART); 250 251 if (part.size() > 0 || options.allowEmpty) { 252 KJ_IF_MAYBE(key, trySplit(part, '=')) { 253 result.query.add(QueryParam { percentDecodeQuery(*key, err, options), 254 percentDecodeQuery(part, err, options) }); 255 } else { 256 result.query.add(QueryParam { percentDecodeQuery(part, err, options), nullptr }); 257 } 258 } 259 } while (text.startsWith("&")); 260 } 261 262 if (text.startsWith("#")) { 263 if (context != REMOTE_HREF) { 264 // No fragment allowed here. 265 return nullptr; 266 } 267 result.fragment = percentDecode(text.slice(1), err, options); 268 } else { 269 // We should have consumed everything. 270 KJ_ASSERT(text.size() == 0); 271 } 272 273 if (err) return nullptr; 274 275 return kj::mv(result); 276 } 277 278 Url Url::parseRelative(StringPtr url) const { 279 return KJ_REQUIRE_NONNULL(tryParseRelative(url), "invalid relative URL", url); 280 } 281 282 Maybe<Url> Url::tryParseRelative(StringPtr text) const { 283 if (text.size() == 0) return clone(); 284 285 Url result; 286 result.options = options; 287 bool err = false; // tracks percent-decoding errors 288 289 auto& END_PATH_PART = getEndPathPart(Url::REMOTE_HREF); 290 auto& END_QUERY_PART = getEndQueryPart(Url::REMOTE_HREF); 291 292 // scheme 293 { 294 bool gotScheme = false; 295 for (auto i: kj::indices(text)) { 296 if (text[i] == ':') { 297 // found valid scheme 298 result.scheme = kj::str(text.slice(0, i)); 299 text = text.slice(i + 1); 300 gotScheme = true; 301 break; 302 } else if (NOT_SCHEME_CHARS.contains(text[i])) { 303 // no scheme 304 break; 305 } 306 } 307 if (!gotScheme) { 308 // copy scheme 309 result.scheme = kj::str(this->scheme); 310 } 311 } 312 313 // authority 314 bool hadNewAuthority = text.startsWith("//"); 315 if (hadNewAuthority) { 316 text = text.slice(2); 317 318 auto authority = split(text, END_AUTHORITY); 319 320 KJ_IF_MAYBE(userpass, trySplit(authority, '@')) { 321 KJ_IF_MAYBE(username, trySplit(*userpass, ':')) { 322 result.userInfo = UserInfo { 323 percentDecode(*username, err, options), 324 percentDecode(*userpass, err, options) 325 }; 326 } else { 327 result.userInfo = UserInfo { 328 percentDecode(*userpass, err, options), 329 nullptr 330 }; 331 } 332 } 333 334 result.host = percentDecode(authority, err, options); 335 if (!HOST_CHARS.containsAll(result.host)) return nullptr; 336 toLower(result.host); 337 } else { 338 // copy authority 339 result.host = kj::str(this->host); 340 result.userInfo = this->userInfo.map([](const UserInfo& userInfo) { 341 return UserInfo { 342 kj::str(userInfo.username), 343 userInfo.password.map([](const String& password) { return kj::str(password); }), 344 }; 345 }); 346 } 347 348 // path 349 bool hadNewPath = text.size() > 0 && text[0] != '?' && text[0] != '#'; 350 if (hadNewPath) { 351 // There's a new path. 352 353 if (text[0] == '/') { 354 // New path is absolute, so don't copy the old path. 355 text = text.slice(1); 356 result.hasTrailingSlash = true; 357 } else if (this->path.size() > 0) { 358 // New path is relative, so start from the old path, dropping everything after the last 359 // slash. 360 auto slice = this->path.slice(0, this->path.size() - (this->hasTrailingSlash ? 0 : 1)); 361 result.path = KJ_MAP(part, slice) { return kj::str(part); }; 362 result.hasTrailingSlash = true; 363 } 364 365 for (;;) { 366 auto part = split(text, END_PATH_PART); 367 if (part.size() == 2 && part[0] == '.' && part[1] == '.') { 368 if (result.path.size() != 0) { 369 result.path.removeLast(); 370 } 371 result.hasTrailingSlash = true; 372 } else if (part.size() == 0 || (part.size() == 1 && part[0] == '.')) { 373 // Collapse consecutive slashes and "/./". 374 result.hasTrailingSlash = true; 375 } else { 376 result.path.add(percentDecode(part, err, options)); 377 result.hasTrailingSlash = false; 378 } 379 380 if (!text.startsWith("/")) break; 381 text = text.slice(1); 382 } 383 } else if (!hadNewAuthority) { 384 // copy path 385 result.path = KJ_MAP(part, this->path) { return kj::str(part); }; 386 result.hasTrailingSlash = this->hasTrailingSlash; 387 } 388 389 if (text.startsWith("?")) { 390 do { 391 text = text.slice(1); 392 auto part = split(text, END_QUERY_PART); 393 394 if (part.size() > 0) { 395 KJ_IF_MAYBE(key, trySplit(part, '=')) { 396 result.query.add(QueryParam { percentDecodeQuery(*key, err, options), 397 percentDecodeQuery(part, err, options) }); 398 } else { 399 result.query.add(QueryParam { percentDecodeQuery(part, err, options), 400 nullptr }); 401 } 402 } 403 } while (text.startsWith("&")); 404 } else if (!hadNewAuthority && !hadNewPath) { 405 // copy query 406 result.query = KJ_MAP(param, this->query) -> QueryParam { 407 // Preserve the "allocated-ness" of `param.value` with this careful copy. 408 return { kj::str(param.name), param.value.begin() == nullptr ? kj::String() 409 : kj::str(param.value) }; 410 }; 411 } 412 413 if (text.startsWith("#")) { 414 result.fragment = percentDecode(text.slice(1), err, options); 415 } else { 416 // We should have consumed everything. 417 KJ_ASSERT(text.size() == 0); 418 } 419 420 if (err) return nullptr; 421 422 return kj::mv(result); 423 } 424 425 String Url::toString(Context context) const { 426 Vector<char> chars(128); 427 428 if (context != HTTP_REQUEST) { 429 chars.addAll(scheme); 430 chars.addAll(StringPtr("://")); 431 432 if (context == REMOTE_HREF) { 433 KJ_IF_MAYBE(user, userInfo) { 434 chars.addAll(options.percentDecode ? encodeUriUserInfo(user->username) 435 : kj::str(user->username)); 436 KJ_IF_MAYBE(pass, user->password) { 437 chars.add(':'); 438 chars.addAll(options.percentDecode ? encodeUriUserInfo(*pass) : kj::str(*pass)); 439 } 440 chars.add('@'); 441 } 442 } 443 444 // RFC3986 specifies that hosts can contain percent-encoding escapes while suggesting that 445 // they should only be used for UTF-8 sequences. However, the DNS standard specifies a 446 // different way to encode Unicode into domain names and doesn't permit any characters which 447 // would need to be escaped. Meanwhile, encodeUriComponent() here would incorrectly try to 448 // escape colons and brackets (e.g. around ipv6 literal addresses). So, instead, we throw if 449 // the host is invalid. 450 if (HOST_CHARS.containsAll(host)) { 451 chars.addAll(host); 452 } else { 453 KJ_FAIL_REQUIRE("invalid hostname when stringifying URL", host) { 454 chars.addAll(StringPtr("invalid-host")); 455 break; 456 } 457 } 458 } 459 460 for (auto& pathPart: path) { 461 // Protect against path injection. 462 KJ_REQUIRE((pathPart != "" || options.allowEmpty) && pathPart != "." && pathPart != "..", 463 "invalid name in URL path", path) { 464 continue; 465 } 466 chars.add('/'); 467 chars.addAll(options.percentDecode ? encodeUriPath(pathPart) : kj::str(pathPart)); 468 } 469 if (hasTrailingSlash || (path.size() == 0 && context == HTTP_REQUEST)) { 470 chars.add('/'); 471 } 472 473 bool first = true; 474 for (auto& param: query) { 475 chars.add(first ? '?' : '&'); 476 first = false; 477 chars.addAll(options.percentDecode ? encodeWwwForm(param.name) : kj::str(param.name)); 478 if (param.value.begin() != nullptr) { 479 chars.add('='); 480 chars.addAll(options.percentDecode ? encodeWwwForm(param.value) : kj::str(param.value)); 481 } 482 } 483 484 if (context == REMOTE_HREF) { 485 KJ_IF_MAYBE(f, fragment) { 486 chars.add('#'); 487 chars.addAll(options.percentDecode ? encodeUriFragment(*f) : kj::str(*f)); 488 } 489 } 490 491 chars.add('\0'); 492 return String(chars.releaseAsArray()); 493 } 494 495 } // namespace kj