Serenity Operating System
at master 457 lines 16 kB view raw
1/* 2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org> 3 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch> 4 * 5 * SPDX-License-Identifier: BSD-2-Clause 6 */ 7 8#include <AK/CharacterTypes.h> 9#include <AK/Debug.h> 10#include <AK/LexicalPath.h> 11#include <AK/StringBuilder.h> 12#include <AK/URL.h> 13#include <AK/URLParser.h> 14#include <AK/Utf8View.h> 15 16namespace AK { 17 18// FIXME: It could make sense to force users of URL to use URLParser::parse() explicitly instead of using a constructor. 19URL::URL(StringView string) 20 : URL(URLParser::parse(string)) 21{ 22 if constexpr (URL_PARSER_DEBUG) { 23 if (m_valid) 24 dbgln("URL constructor: Parsed URL to be '{}'.", serialize()); 25 else 26 dbgln("URL constructor: Parsed URL to be invalid."); 27 } 28} 29 30DeprecatedString URL::path() const 31{ 32 if (cannot_be_a_base_url()) 33 return paths()[0]; 34 StringBuilder builder; 35 for (auto& path : m_paths) { 36 builder.append('/'); 37 builder.append(path); 38 } 39 return builder.to_deprecated_string(); 40} 41 42URL URL::complete_url(StringView relative_url) const 43{ 44 if (!is_valid()) 45 return {}; 46 47 return URLParser::parse(relative_url, this); 48} 49 50void URL::set_scheme(DeprecatedString scheme) 51{ 52 m_scheme = move(scheme); 53 m_valid = compute_validity(); 54} 55 56void URL::set_username(DeprecatedString username) 57{ 58 m_username = move(username); 59 m_valid = compute_validity(); 60} 61 62void URL::set_password(DeprecatedString password) 63{ 64 m_password = move(password); 65 m_valid = compute_validity(); 66} 67 68void URL::set_host(DeprecatedString host) 69{ 70 m_host = move(host); 71 m_valid = compute_validity(); 72} 73 74void URL::set_port(Optional<u16> port) 75{ 76 if (port == default_port_for_scheme(m_scheme)) { 77 m_port = {}; 78 return; 79 } 80 m_port = move(port); 81 m_valid = compute_validity(); 82} 83 84void URL::set_paths(Vector<DeprecatedString> paths) 85{ 86 m_paths = move(paths); 87 m_valid = compute_validity(); 88} 89 90void URL::set_query(DeprecatedString query) 91{ 92 m_query = move(query); 93} 94 95void URL::set_fragment(DeprecatedString fragment) 96{ 97 m_fragment = move(fragment); 98} 99 100// FIXME: This is by no means complete. 101// NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong. 102bool URL::compute_validity() const 103{ 104 if (m_scheme.is_empty()) 105 return false; 106 107 if (m_scheme == "data") { 108 if (m_data_mime_type.is_empty()) 109 return false; 110 if (m_data_payload_is_base64) { 111 if (m_data_payload.length() % 4 != 0) 112 return false; 113 for (auto character : m_data_payload) { 114 if (!is_ascii_alphanumeric(character) || character == '+' || character == '/' || character == '=') 115 return false; 116 } 117 } 118 } else if (m_cannot_be_a_base_url) { 119 if (m_paths.size() != 1) 120 return false; 121 if (m_paths[0].is_empty()) 122 return false; 123 } else { 124 if (m_scheme.is_one_of("about", "mailto")) 125 return false; 126 // NOTE: Maybe it is allowed to have a zero-segment path. 127 if (m_paths.size() == 0) 128 return false; 129 } 130 131 // NOTE: A file URL's host should be the empty string for localhost, not null. 132 if (m_scheme == "file" && m_host.is_null()) 133 return false; 134 135 return true; 136} 137 138bool URL::scheme_requires_port(StringView scheme) 139{ 140 return (default_port_for_scheme(scheme) != 0); 141} 142 143u16 URL::default_port_for_scheme(StringView scheme) 144{ 145 if (scheme == "http") 146 return 80; 147 if (scheme == "https") 148 return 443; 149 if (scheme == "gemini") 150 return 1965; 151 if (scheme == "irc") 152 return 6667; 153 if (scheme == "ircs") 154 return 6697; 155 if (scheme == "ws") 156 return 80; 157 if (scheme == "wss") 158 return 443; 159 return 0; 160} 161 162URL URL::create_with_file_scheme(DeprecatedString const& path, DeprecatedString const& fragment, DeprecatedString const& hostname) 163{ 164 LexicalPath lexical_path(path); 165 if (!lexical_path.is_absolute()) 166 return {}; 167 168 URL url; 169 url.set_scheme("file"); 170 // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string. 171 // This is because a file URL always needs a non-null hostname. 172 url.set_host(hostname.is_null() || hostname == "localhost" ? DeprecatedString::empty() : hostname); 173 url.set_paths(lexical_path.parts()); 174 // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment. 175 if (path.ends_with('/')) 176 url.append_path(""); 177 url.set_fragment(fragment); 178 return url; 179} 180 181URL URL::create_with_help_scheme(DeprecatedString const& path, DeprecatedString const& fragment, DeprecatedString const& hostname) 182{ 183 LexicalPath lexical_path(path); 184 185 URL url; 186 url.set_scheme("help"); 187 // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string. 188 // This is because a file URL always needs a non-null hostname. 189 url.set_host(hostname.is_null() || hostname == "localhost" ? DeprecatedString::empty() : hostname); 190 url.set_paths(lexical_path.parts()); 191 // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment. 192 if (path.ends_with('/')) 193 url.append_path(""); 194 url.set_fragment(fragment); 195 return url; 196} 197 198URL URL::create_with_url_or_path(DeprecatedString const& url_or_path) 199{ 200 URL url = url_or_path; 201 if (url.is_valid()) 202 return url; 203 204 DeprecatedString path = LexicalPath::canonicalized_path(url_or_path); 205 return URL::create_with_file_scheme(path); 206} 207 208// https://url.spec.whatwg.org/#special-scheme 209bool URL::is_special_scheme(StringView scheme) 210{ 211 return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss"); 212} 213 214DeprecatedString URL::serialize_data_url() const 215{ 216 VERIFY(m_scheme == "data"); 217 VERIFY(!m_data_mime_type.is_null()); 218 VERIFY(!m_data_payload.is_null()); 219 StringBuilder builder; 220 builder.append(m_scheme); 221 builder.append(':'); 222 builder.append(m_data_mime_type); 223 if (m_data_payload_is_base64) 224 builder.append(";base64"sv); 225 builder.append(','); 226 // NOTE: The specification does not say anything about encoding this, but we should encode at least control and non-ASCII 227 // characters (since this is also a valid representation of the same data URL). 228 builder.append(URL::percent_encode(m_data_payload, PercentEncodeSet::C0Control)); 229 return builder.to_deprecated_string(); 230} 231 232// https://url.spec.whatwg.org/#concept-url-serializer 233DeprecatedString URL::serialize(ExcludeFragment exclude_fragment) const 234{ 235 if (m_scheme == "data") 236 return serialize_data_url(); 237 StringBuilder builder; 238 builder.append(m_scheme); 239 builder.append(':'); 240 241 if (!m_host.is_null()) { 242 builder.append("//"sv); 243 244 if (includes_credentials()) { 245 builder.append(percent_encode(m_username, PercentEncodeSet::Userinfo)); 246 if (!m_password.is_empty()) { 247 builder.append(':'); 248 builder.append(percent_encode(m_password, PercentEncodeSet::Userinfo)); 249 } 250 builder.append('@'); 251 } 252 253 builder.append(m_host); 254 if (m_port.has_value()) 255 builder.appendff(":{}", *m_port); 256 } 257 258 if (cannot_be_a_base_url()) { 259 builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path)); 260 } else { 261 if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty()) 262 builder.append("/."sv); 263 for (auto& segment : m_paths) { 264 builder.append('/'); 265 builder.append(percent_encode(segment, PercentEncodeSet::Path)); 266 } 267 } 268 269 if (!m_query.is_null()) { 270 builder.append('?'); 271 builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query)); 272 } 273 274 if (exclude_fragment == ExcludeFragment::No && !m_fragment.is_null()) { 275 builder.append('#'); 276 builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment)); 277 } 278 279 return builder.to_deprecated_string(); 280} 281 282// https://url.spec.whatwg.org/#url-rendering 283// NOTE: This does e.g. not display credentials. 284// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points 285// resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible. 286DeprecatedString URL::serialize_for_display() const 287{ 288 VERIFY(m_valid); 289 if (m_scheme == "data") 290 return serialize_data_url(); 291 StringBuilder builder; 292 builder.append(m_scheme); 293 builder.append(':'); 294 295 if (!m_host.is_null()) { 296 builder.append("//"sv); 297 builder.append(m_host); 298 if (m_port.has_value()) 299 builder.appendff(":{}", *m_port); 300 } 301 302 if (cannot_be_a_base_url()) { 303 builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path)); 304 } else { 305 if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty()) 306 builder.append("/."sv); 307 for (auto& segment : m_paths) { 308 builder.append('/'); 309 builder.append(percent_encode(segment, PercentEncodeSet::Path)); 310 } 311 } 312 313 if (!m_query.is_null()) { 314 builder.append('?'); 315 builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query)); 316 } 317 318 if (!m_fragment.is_null()) { 319 builder.append('#'); 320 builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment)); 321 } 322 323 return builder.to_deprecated_string(); 324} 325 326// https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin 327// https://url.spec.whatwg.org/#concept-url-origin 328DeprecatedString URL::serialize_origin() const 329{ 330 VERIFY(m_valid); 331 332 if (m_scheme == "blob"sv) { 333 // TODO: 1. If URL’s blob URL entry is non-null, then return URL’s blob URL entry’s environment’s origin. 334 // 2. Let url be the result of parsing URL’s path[0]. 335 VERIFY(!m_paths.is_empty()); 336 URL url = m_paths[0]; 337 // 3. Return a new opaque origin, if url is failure, and url’s origin otherwise. 338 if (!url.is_valid()) 339 return "null"; 340 return url.serialize_origin(); 341 } else if (!m_scheme.is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) { // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin." 342 return "null"; 343 } 344 345 StringBuilder builder; 346 builder.append(m_scheme); 347 builder.append("://"sv); 348 builder.append(m_host); 349 if (m_port.has_value()) 350 builder.appendff(":{}", *m_port); 351 return builder.to_deprecated_string(); 352} 353 354bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const 355{ 356 if (this == &other) 357 return true; 358 if (!m_valid || !other.m_valid) 359 return false; 360 return serialize(exclude_fragments) == other.serialize(exclude_fragments); 361} 362 363DeprecatedString URL::basename() const 364{ 365 if (!m_valid) 366 return {}; 367 if (m_paths.is_empty()) 368 return {}; 369 return m_paths.last(); 370} 371 372void URL::append_percent_encoded(StringBuilder& builder, u32 code_point) 373{ 374 if (code_point <= 0x7f) 375 builder.appendff("%{:02X}", code_point); 376 else if (code_point <= 0x07ff) 377 builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80); 378 else if (code_point <= 0xffff) 379 builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80); 380 else if (code_point <= 0x10ffff) 381 builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80); 382 else 383 VERIFY_NOT_REACHED(); 384} 385 386// https://url.spec.whatwg.org/#c0-control-percent-encode-set 387bool URL::code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet set) 388{ 389 switch (set) { 390 case URL::PercentEncodeSet::C0Control: 391 return code_point < 0x20 || code_point > 0x7E; 392 case URL::PercentEncodeSet::Fragment: 393 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"<>`"sv.contains(code_point); 394 case URL::PercentEncodeSet::Query: 395 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"#<>"sv.contains(code_point); 396 case URL::PercentEncodeSet::SpecialQuery: 397 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || code_point == '\''; 398 case URL::PercentEncodeSet::Path: 399 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || "?`{}"sv.contains(code_point); 400 case URL::PercentEncodeSet::Userinfo: 401 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(code_point); 402 case URL::PercentEncodeSet::Component: 403 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(code_point); 404 case URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded: 405 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Component) || "!'()~"sv.contains(code_point); 406 case URL::PercentEncodeSet::EncodeURI: 407 // NOTE: This is the same percent encode set that JS encodeURI() uses. 408 // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI 409 return code_point > 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(static_cast<char>(code_point))); 410 default: 411 VERIFY_NOT_REACHED(); 412 } 413} 414 415void URL::append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, URL::PercentEncodeSet set) 416{ 417 if (code_point_is_in_percent_encode_set(code_point, set)) 418 append_percent_encoded(builder, code_point); 419 else 420 builder.append_code_point(code_point); 421} 422 423DeprecatedString URL::percent_encode(StringView input, URL::PercentEncodeSet set, SpaceAsPlus space_as_plus) 424{ 425 StringBuilder builder; 426 for (auto code_point : Utf8View(input)) { 427 if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ') 428 builder.append('+'); 429 else 430 append_percent_encoded_if_necessary(builder, code_point, set); 431 } 432 return builder.to_deprecated_string(); 433} 434 435DeprecatedString URL::percent_decode(StringView input) 436{ 437 if (!input.contains('%')) 438 return input; 439 StringBuilder builder; 440 Utf8View utf8_view(input); 441 for (auto it = utf8_view.begin(); !it.done(); ++it) { 442 if (*it != '%') { 443 builder.append_code_point(*it); 444 } else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) { 445 builder.append_code_point(*it); 446 } else { 447 ++it; 448 u8 byte = parse_ascii_hex_digit(*it) << 4; 449 ++it; 450 byte += parse_ascii_hex_digit(*it); 451 builder.append(byte); 452 } 453 } 454 return builder.to_deprecated_string(); 455} 456 457}