Serenity Operating System
1/*
2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
3 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
4 *
5 * SPDX-License-Identifier: BSD-2-Clause
6 */
7
8#include <AK/CharacterTypes.h>
9#include <AK/Debug.h>
10#include <AK/LexicalPath.h>
11#include <AK/StringBuilder.h>
12#include <AK/URL.h>
13#include <AK/URLParser.h>
14#include <AK/Utf8View.h>
15
16namespace AK {
17
18// FIXME: It could make sense to force users of URL to use URLParser::parse() explicitly instead of using a constructor.
19URL::URL(StringView string)
20 : URL(URLParser::parse(string))
21{
22 if constexpr (URL_PARSER_DEBUG) {
23 if (m_valid)
24 dbgln("URL constructor: Parsed URL to be '{}'.", serialize());
25 else
26 dbgln("URL constructor: Parsed URL to be invalid.");
27 }
28}
29
30DeprecatedString URL::path() const
31{
32 if (cannot_be_a_base_url())
33 return paths()[0];
34 StringBuilder builder;
35 for (auto& path : m_paths) {
36 builder.append('/');
37 builder.append(path);
38 }
39 return builder.to_deprecated_string();
40}
41
42URL URL::complete_url(StringView relative_url) const
43{
44 if (!is_valid())
45 return {};
46
47 return URLParser::parse(relative_url, this);
48}
49
50void URL::set_scheme(DeprecatedString scheme)
51{
52 m_scheme = move(scheme);
53 m_valid = compute_validity();
54}
55
56void URL::set_username(DeprecatedString username)
57{
58 m_username = move(username);
59 m_valid = compute_validity();
60}
61
62void URL::set_password(DeprecatedString password)
63{
64 m_password = move(password);
65 m_valid = compute_validity();
66}
67
68void URL::set_host(DeprecatedString host)
69{
70 m_host = move(host);
71 m_valid = compute_validity();
72}
73
74void URL::set_port(Optional<u16> port)
75{
76 if (port == default_port_for_scheme(m_scheme)) {
77 m_port = {};
78 return;
79 }
80 m_port = move(port);
81 m_valid = compute_validity();
82}
83
84void URL::set_paths(Vector<DeprecatedString> paths)
85{
86 m_paths = move(paths);
87 m_valid = compute_validity();
88}
89
90void URL::set_query(DeprecatedString query)
91{
92 m_query = move(query);
93}
94
95void URL::set_fragment(DeprecatedString fragment)
96{
97 m_fragment = move(fragment);
98}
99
100// FIXME: This is by no means complete.
101// NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
102bool URL::compute_validity() const
103{
104 if (m_scheme.is_empty())
105 return false;
106
107 if (m_scheme == "data") {
108 if (m_data_mime_type.is_empty())
109 return false;
110 if (m_data_payload_is_base64) {
111 if (m_data_payload.length() % 4 != 0)
112 return false;
113 for (auto character : m_data_payload) {
114 if (!is_ascii_alphanumeric(character) || character == '+' || character == '/' || character == '=')
115 return false;
116 }
117 }
118 } else if (m_cannot_be_a_base_url) {
119 if (m_paths.size() != 1)
120 return false;
121 if (m_paths[0].is_empty())
122 return false;
123 } else {
124 if (m_scheme.is_one_of("about", "mailto"))
125 return false;
126 // NOTE: Maybe it is allowed to have a zero-segment path.
127 if (m_paths.size() == 0)
128 return false;
129 }
130
131 // NOTE: A file URL's host should be the empty string for localhost, not null.
132 if (m_scheme == "file" && m_host.is_null())
133 return false;
134
135 return true;
136}
137
138bool URL::scheme_requires_port(StringView scheme)
139{
140 return (default_port_for_scheme(scheme) != 0);
141}
142
143u16 URL::default_port_for_scheme(StringView scheme)
144{
145 if (scheme == "http")
146 return 80;
147 if (scheme == "https")
148 return 443;
149 if (scheme == "gemini")
150 return 1965;
151 if (scheme == "irc")
152 return 6667;
153 if (scheme == "ircs")
154 return 6697;
155 if (scheme == "ws")
156 return 80;
157 if (scheme == "wss")
158 return 443;
159 return 0;
160}
161
162URL URL::create_with_file_scheme(DeprecatedString const& path, DeprecatedString const& fragment, DeprecatedString const& hostname)
163{
164 LexicalPath lexical_path(path);
165 if (!lexical_path.is_absolute())
166 return {};
167
168 URL url;
169 url.set_scheme("file");
170 // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string.
171 // This is because a file URL always needs a non-null hostname.
172 url.set_host(hostname.is_null() || hostname == "localhost" ? DeprecatedString::empty() : hostname);
173 url.set_paths(lexical_path.parts());
174 // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment.
175 if (path.ends_with('/'))
176 url.append_path("");
177 url.set_fragment(fragment);
178 return url;
179}
180
181URL URL::create_with_help_scheme(DeprecatedString const& path, DeprecatedString const& fragment, DeprecatedString const& hostname)
182{
183 LexicalPath lexical_path(path);
184
185 URL url;
186 url.set_scheme("help");
187 // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string.
188 // This is because a file URL always needs a non-null hostname.
189 url.set_host(hostname.is_null() || hostname == "localhost" ? DeprecatedString::empty() : hostname);
190 url.set_paths(lexical_path.parts());
191 // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment.
192 if (path.ends_with('/'))
193 url.append_path("");
194 url.set_fragment(fragment);
195 return url;
196}
197
198URL URL::create_with_url_or_path(DeprecatedString const& url_or_path)
199{
200 URL url = url_or_path;
201 if (url.is_valid())
202 return url;
203
204 DeprecatedString path = LexicalPath::canonicalized_path(url_or_path);
205 return URL::create_with_file_scheme(path);
206}
207
208// https://url.spec.whatwg.org/#special-scheme
209bool URL::is_special_scheme(StringView scheme)
210{
211 return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss");
212}
213
214DeprecatedString URL::serialize_data_url() const
215{
216 VERIFY(m_scheme == "data");
217 VERIFY(!m_data_mime_type.is_null());
218 VERIFY(!m_data_payload.is_null());
219 StringBuilder builder;
220 builder.append(m_scheme);
221 builder.append(':');
222 builder.append(m_data_mime_type);
223 if (m_data_payload_is_base64)
224 builder.append(";base64"sv);
225 builder.append(',');
226 // NOTE: The specification does not say anything about encoding this, but we should encode at least control and non-ASCII
227 // characters (since this is also a valid representation of the same data URL).
228 builder.append(URL::percent_encode(m_data_payload, PercentEncodeSet::C0Control));
229 return builder.to_deprecated_string();
230}
231
232// https://url.spec.whatwg.org/#concept-url-serializer
233DeprecatedString URL::serialize(ExcludeFragment exclude_fragment) const
234{
235 if (m_scheme == "data")
236 return serialize_data_url();
237 StringBuilder builder;
238 builder.append(m_scheme);
239 builder.append(':');
240
241 if (!m_host.is_null()) {
242 builder.append("//"sv);
243
244 if (includes_credentials()) {
245 builder.append(percent_encode(m_username, PercentEncodeSet::Userinfo));
246 if (!m_password.is_empty()) {
247 builder.append(':');
248 builder.append(percent_encode(m_password, PercentEncodeSet::Userinfo));
249 }
250 builder.append('@');
251 }
252
253 builder.append(m_host);
254 if (m_port.has_value())
255 builder.appendff(":{}", *m_port);
256 }
257
258 if (cannot_be_a_base_url()) {
259 builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path));
260 } else {
261 if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty())
262 builder.append("/."sv);
263 for (auto& segment : m_paths) {
264 builder.append('/');
265 builder.append(percent_encode(segment, PercentEncodeSet::Path));
266 }
267 }
268
269 if (!m_query.is_null()) {
270 builder.append('?');
271 builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query));
272 }
273
274 if (exclude_fragment == ExcludeFragment::No && !m_fragment.is_null()) {
275 builder.append('#');
276 builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment));
277 }
278
279 return builder.to_deprecated_string();
280}
281
282// https://url.spec.whatwg.org/#url-rendering
283// NOTE: This does e.g. not display credentials.
284// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
285// resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
286DeprecatedString URL::serialize_for_display() const
287{
288 VERIFY(m_valid);
289 if (m_scheme == "data")
290 return serialize_data_url();
291 StringBuilder builder;
292 builder.append(m_scheme);
293 builder.append(':');
294
295 if (!m_host.is_null()) {
296 builder.append("//"sv);
297 builder.append(m_host);
298 if (m_port.has_value())
299 builder.appendff(":{}", *m_port);
300 }
301
302 if (cannot_be_a_base_url()) {
303 builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path));
304 } else {
305 if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty())
306 builder.append("/."sv);
307 for (auto& segment : m_paths) {
308 builder.append('/');
309 builder.append(percent_encode(segment, PercentEncodeSet::Path));
310 }
311 }
312
313 if (!m_query.is_null()) {
314 builder.append('?');
315 builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query));
316 }
317
318 if (!m_fragment.is_null()) {
319 builder.append('#');
320 builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment));
321 }
322
323 return builder.to_deprecated_string();
324}
325
326// https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin
327// https://url.spec.whatwg.org/#concept-url-origin
328DeprecatedString URL::serialize_origin() const
329{
330 VERIFY(m_valid);
331
332 if (m_scheme == "blob"sv) {
333 // TODO: 1. If URL’s blob URL entry is non-null, then return URL’s blob URL entry’s environment’s origin.
334 // 2. Let url be the result of parsing URL’s path[0].
335 VERIFY(!m_paths.is_empty());
336 URL url = m_paths[0];
337 // 3. Return a new opaque origin, if url is failure, and url’s origin otherwise.
338 if (!url.is_valid())
339 return "null";
340 return url.serialize_origin();
341 } else if (!m_scheme.is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) { // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin."
342 return "null";
343 }
344
345 StringBuilder builder;
346 builder.append(m_scheme);
347 builder.append("://"sv);
348 builder.append(m_host);
349 if (m_port.has_value())
350 builder.appendff(":{}", *m_port);
351 return builder.to_deprecated_string();
352}
353
354bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const
355{
356 if (this == &other)
357 return true;
358 if (!m_valid || !other.m_valid)
359 return false;
360 return serialize(exclude_fragments) == other.serialize(exclude_fragments);
361}
362
363DeprecatedString URL::basename() const
364{
365 if (!m_valid)
366 return {};
367 if (m_paths.is_empty())
368 return {};
369 return m_paths.last();
370}
371
372void URL::append_percent_encoded(StringBuilder& builder, u32 code_point)
373{
374 if (code_point <= 0x7f)
375 builder.appendff("%{:02X}", code_point);
376 else if (code_point <= 0x07ff)
377 builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80);
378 else if (code_point <= 0xffff)
379 builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
380 else if (code_point <= 0x10ffff)
381 builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
382 else
383 VERIFY_NOT_REACHED();
384}
385
386// https://url.spec.whatwg.org/#c0-control-percent-encode-set
387bool URL::code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet set)
388{
389 switch (set) {
390 case URL::PercentEncodeSet::C0Control:
391 return code_point < 0x20 || code_point > 0x7E;
392 case URL::PercentEncodeSet::Fragment:
393 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"<>`"sv.contains(code_point);
394 case URL::PercentEncodeSet::Query:
395 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"#<>"sv.contains(code_point);
396 case URL::PercentEncodeSet::SpecialQuery:
397 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || code_point == '\'';
398 case URL::PercentEncodeSet::Path:
399 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || "?`{}"sv.contains(code_point);
400 case URL::PercentEncodeSet::Userinfo:
401 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(code_point);
402 case URL::PercentEncodeSet::Component:
403 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(code_point);
404 case URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded:
405 return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Component) || "!'()~"sv.contains(code_point);
406 case URL::PercentEncodeSet::EncodeURI:
407 // NOTE: This is the same percent encode set that JS encodeURI() uses.
408 // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
409 return code_point > 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(static_cast<char>(code_point)));
410 default:
411 VERIFY_NOT_REACHED();
412 }
413}
414
415void URL::append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, URL::PercentEncodeSet set)
416{
417 if (code_point_is_in_percent_encode_set(code_point, set))
418 append_percent_encoded(builder, code_point);
419 else
420 builder.append_code_point(code_point);
421}
422
423DeprecatedString URL::percent_encode(StringView input, URL::PercentEncodeSet set, SpaceAsPlus space_as_plus)
424{
425 StringBuilder builder;
426 for (auto code_point : Utf8View(input)) {
427 if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ')
428 builder.append('+');
429 else
430 append_percent_encoded_if_necessary(builder, code_point, set);
431 }
432 return builder.to_deprecated_string();
433}
434
435DeprecatedString URL::percent_decode(StringView input)
436{
437 if (!input.contains('%'))
438 return input;
439 StringBuilder builder;
440 Utf8View utf8_view(input);
441 for (auto it = utf8_view.begin(); !it.done(); ++it) {
442 if (*it != '%') {
443 builder.append_code_point(*it);
444 } else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) {
445 builder.append_code_point(*it);
446 } else {
447 ++it;
448 u8 byte = parse_ascii_hex_digit(*it) << 4;
449 ++it;
450 byte += parse_ascii_hex_digit(*it);
451 builder.append(byte);
452 }
453 }
454 return builder.to_deprecated_string();
455}
456
457}