Serenity Operating System
at master 578 lines 18 kB view raw
1/* 2 * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/ScopeGuard.h> 8#include <LibPDF/CommonNames.h> 9#include <LibPDF/Document.h> 10#include <LibPDF/Filter.h> 11#include <LibPDF/Parser.h> 12#include <LibTextCodec/Decoder.h> 13#include <ctype.h> 14 15namespace PDF { 16 17PDFErrorOr<Vector<Operator>> Parser::parse_operators(Document* document, ReadonlyBytes bytes) 18{ 19 Parser parser(document, bytes); 20 parser.m_disable_encryption = true; 21 return parser.parse_operators(); 22} 23 24Parser::Parser(Document* document, ReadonlyBytes bytes) 25 : m_reader(bytes) 26 , m_document(document) 27{ 28} 29 30Parser::Parser(ReadonlyBytes bytes) 31 : m_reader(bytes) 32{ 33} 34 35void Parser::set_document(WeakPtr<Document> const& document) 36{ 37 m_document = document; 38} 39 40DeprecatedString Parser::parse_comment() 41{ 42 StringBuilder comment; 43 while (true) { 44 if (!m_reader.matches('%')) 45 break; 46 47 m_reader.consume(); 48 auto comment_start_offset = m_reader.offset(); 49 m_reader.move_until([&](auto) { 50 return m_reader.matches_eol(); 51 }); 52 comment.append(m_reader.bytes().slice(comment_start_offset, m_reader.offset() - comment_start_offset)); 53 m_reader.consume_eol(); 54 m_reader.consume_whitespace(); 55 } 56 return comment.to_deprecated_string(); 57} 58 59PDFErrorOr<Value> Parser::parse_value(CanBeIndirectValue can_be_indirect_value) 60{ 61 parse_comment(); 62 63 if (m_reader.matches("null")) { 64 m_reader.move_by(4); 65 m_reader.consume_whitespace(); 66 return Value(nullptr); 67 } 68 69 if (m_reader.matches("true")) { 70 m_reader.move_by(4); 71 m_reader.consume_whitespace(); 72 return Value(true); 73 } 74 75 if (m_reader.matches("false")) { 76 m_reader.move_by(5); 77 m_reader.consume_whitespace(); 78 return Value(false); 79 } 80 81 if (m_reader.matches_number()) { 82 if (can_be_indirect_value == CanBeIndirectValue::Yes) 83 return parse_possible_indirect_value_or_ref(); 84 else 85 return parse_number(); 86 } 87 88 if (m_reader.matches('/')) 89 return MUST(parse_name()); 90 91 if (m_reader.matches("<<")) { 92 auto dict = TRY(parse_dict()); 93 if (m_reader.matches("stream")) 94 return TRY(parse_stream(dict)); 95 return dict; 96 } 97 98 if (m_reader.matches_any('(', '<')) 99 return parse_string(); 100 101 if (m_reader.matches('[')) 102 return TRY(parse_array()); 103 104 return error(DeprecatedString::formatted("Unexpected char \"{}\"", m_reader.peek())); 105} 106 107PDFErrorOr<Value> Parser::parse_possible_indirect_value_or_ref() 108{ 109 auto first_number = TRY(parse_number()); 110 if (!m_reader.matches_number()) 111 return first_number; 112 113 m_reader.save(); 114 auto second_number = parse_number(); 115 if (second_number.is_error()) { 116 m_reader.load(); 117 return first_number; 118 } 119 120 if (m_reader.matches('R')) { 121 m_reader.discard(); 122 m_reader.consume(); 123 m_reader.consume_whitespace(); 124 return Value(Reference(first_number.get<int>(), second_number.value().get<int>())); 125 } 126 127 if (m_reader.matches("obj")) { 128 m_reader.discard(); 129 auto index = first_number.get<int>(); 130 auto generation = second_number.value().get<int>(); 131 VERIFY(index >= 0); 132 VERIFY(generation >= 0); 133 return TRY(parse_indirect_value(index, generation)); 134 } 135 136 m_reader.load(); 137 return first_number; 138} 139 140PDFErrorOr<NonnullRefPtr<IndirectValue>> Parser::parse_indirect_value(u32 index, u32 generation) 141{ 142 if (!m_reader.matches("obj")) 143 return error("Expected \"obj\" at beginning of indirect value"); 144 m_reader.move_by(3); 145 m_reader.consume_whitespace(); 146 147 push_reference({ index, generation }); 148 auto value = TRY(parse_value()); 149 if (!m_reader.matches("endobj")) 150 return error("Expected \"endobj\" at end of indirect value"); 151 152 m_reader.consume(6); 153 m_reader.consume_whitespace(); 154 155 pop_reference(); 156 157 return make_object<IndirectValue>(index, generation, value); 158} 159 160PDFErrorOr<NonnullRefPtr<IndirectValue>> Parser::parse_indirect_value() 161{ 162 auto first_number = TRY(parse_number()); 163 auto second_number = TRY(parse_number()); 164 auto index = first_number.get<int>(); 165 auto generation = second_number.get<int>(); 166 VERIFY(index >= 0); 167 VERIFY(generation >= 0); 168 return parse_indirect_value(index, generation); 169} 170 171PDFErrorOr<Value> Parser::parse_number() 172{ 173 m_reader.consume_whitespace(); 174 175 size_t start_offset = m_reader.offset(); 176 bool is_float = false; 177 bool consumed_digit = false; 178 179 if (m_reader.matches('+') || m_reader.matches('-')) 180 m_reader.consume(); 181 182 while (!m_reader.done()) { 183 if (m_reader.matches('.')) { 184 if (is_float) 185 break; 186 is_float = true; 187 m_reader.consume(); 188 } else if (isdigit(m_reader.peek())) { 189 m_reader.consume(); 190 consumed_digit = true; 191 } else { 192 break; 193 } 194 } 195 196 if (!consumed_digit) 197 return error("Invalid number"); 198 199 m_reader.consume_whitespace(); 200 201 auto string = DeprecatedString(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset)); 202 if (is_float) 203 return Value(strtof(string.characters(), nullptr)); 204 205 return Value(atoi(string.characters())); 206} 207 208PDFErrorOr<NonnullRefPtr<NameObject>> Parser::parse_name() 209{ 210 if (!m_reader.consume('/')) 211 return error("Expected Name object to start with \"/\""); 212 213 StringBuilder builder; 214 215 while (true) { 216 if (!m_reader.matches_regular_character()) 217 break; 218 219 if (m_reader.matches('#')) { 220 m_reader.consume(); 221 int hex_value = 0; 222 for (int i = 0; i < 2; i++) { 223 auto ch = m_reader.consume(); 224 VERIFY(isxdigit(ch)); 225 hex_value *= 16; 226 if (ch <= '9') { 227 hex_value += ch - '0'; 228 } else { 229 hex_value += ch - 'A' + 10; 230 } 231 } 232 builder.append(static_cast<char>(hex_value)); 233 continue; 234 } 235 236 builder.append(m_reader.consume()); 237 } 238 239 m_reader.consume_whitespace(); 240 241 return make_object<NameObject>(builder.to_deprecated_string()); 242} 243 244NonnullRefPtr<StringObject> Parser::parse_string() 245{ 246 ScopeGuard guard([&] { m_reader.consume_whitespace(); }); 247 248 DeprecatedString string; 249 bool is_binary_string; 250 251 if (m_reader.matches('(')) { 252 string = parse_literal_string(); 253 is_binary_string = false; 254 } else { 255 string = parse_hex_string(); 256 is_binary_string = true; 257 } 258 259 VERIFY(!string.is_null()); 260 261 auto string_object = make_object<StringObject>(string, is_binary_string); 262 263 if (m_document->security_handler() && !m_disable_encryption) 264 m_document->security_handler()->decrypt(string_object, m_current_reference_stack.last()); 265 266 auto unencrypted_string = string_object->string(); 267 268 if (unencrypted_string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) { 269 // The string is encoded in UTF16-BE 270 string_object->set_string(TextCodec::decoder_for("utf-16be"sv)->to_utf8(unencrypted_string).release_value_but_fixme_should_propagate_errors().to_deprecated_string()); 271 } else if (unencrypted_string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) { 272 // The string is encoded in UTF-8. This is the default anyways, but if these bytes 273 // are explicitly included, we have to trim them 274 string_object->set_string(unencrypted_string.substring(3)); 275 } 276 277 return string_object; 278} 279 280DeprecatedString Parser::parse_literal_string() 281{ 282 VERIFY(m_reader.consume('(')); 283 StringBuilder builder; 284 auto opened_parens = 0; 285 286 while (true) { 287 if (m_reader.matches('(')) { 288 opened_parens++; 289 builder.append(m_reader.consume()); 290 } else if (m_reader.matches(')')) { 291 m_reader.consume(); 292 if (opened_parens == 0) 293 break; 294 opened_parens--; 295 builder.append(')'); 296 } else if (m_reader.matches('\\')) { 297 m_reader.consume(); 298 if (m_reader.matches_eol()) { 299 m_reader.consume_eol(); 300 continue; 301 } 302 303 if (m_reader.done()) 304 return {}; 305 306 auto ch = m_reader.consume(); 307 switch (ch) { 308 case 'n': 309 builder.append('\n'); 310 break; 311 case 'r': 312 builder.append('\r'); 313 break; 314 case 't': 315 builder.append('\t'); 316 break; 317 case 'b': 318 builder.append('\b'); 319 break; 320 case 'f': 321 builder.append('\f'); 322 break; 323 case '(': 324 builder.append('('); 325 break; 326 case ')': 327 builder.append(')'); 328 break; 329 case '\\': 330 builder.append('\\'); 331 break; 332 default: { 333 if (ch >= '0' && ch <= '7') { 334 int octal_value = ch - '0'; 335 for (int i = 0; i < 2; i++) { 336 auto octal_ch = m_reader.consume(); 337 if (octal_ch < '0' || octal_ch > '7') 338 break; 339 octal_value = octal_value * 8 + (octal_ch - '0'); 340 } 341 builder.append(static_cast<char>(octal_value)); 342 } else { 343 builder.append(ch); 344 } 345 } 346 } 347 } else if (m_reader.matches_eol()) { 348 m_reader.consume_eol(); 349 builder.append('\n'); 350 } else { 351 builder.append(m_reader.consume()); 352 } 353 } 354 355 return builder.to_deprecated_string(); 356} 357 358DeprecatedString Parser::parse_hex_string() 359{ 360 VERIFY(m_reader.consume('<')); 361 362 StringBuilder builder; 363 364 while (true) { 365 if (m_reader.matches('>')) { 366 m_reader.consume(); 367 return builder.to_deprecated_string(); 368 } else { 369 int hex_value = 0; 370 371 for (int i = 0; i < 2; i++) { 372 m_reader.consume_whitespace(); 373 auto ch = m_reader.consume(); 374 if (ch == '>') { 375 // The hex string contains an odd number of characters, and the last character 376 // is assumed to be '0' 377 m_reader.consume(); 378 hex_value *= 16; 379 builder.append(static_cast<char>(hex_value)); 380 return builder.to_deprecated_string(); 381 } 382 VERIFY(isxdigit(ch)); 383 384 hex_value *= 16; 385 if (ch <= '9') { 386 hex_value += ch - '0'; 387 } else if (ch >= 'A' && ch <= 'F') { 388 hex_value += ch - 'A' + 10; 389 } else { 390 hex_value += ch - 'a' + 10; 391 } 392 } 393 394 builder.append(static_cast<char>(hex_value)); 395 } 396 } 397} 398 399PDFErrorOr<NonnullRefPtr<ArrayObject>> Parser::parse_array() 400{ 401 if (!m_reader.consume('[')) 402 return error("Expected array to start with \"[\""); 403 m_reader.consume_whitespace(); 404 Vector<Value> values; 405 406 while (!m_reader.matches(']')) 407 values.append(TRY(parse_value())); 408 409 VERIFY(m_reader.consume(']')); 410 m_reader.consume_whitespace(); 411 412 return make_object<ArrayObject>(values); 413} 414 415PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_dict() 416{ 417 if (!m_reader.consume('<') || !m_reader.consume('<')) 418 return error("Expected dict to start with \"<<\""); 419 420 m_reader.consume_whitespace(); 421 HashMap<DeprecatedFlyString, Value> map; 422 423 while (!m_reader.done()) { 424 if (m_reader.matches(">>")) 425 break; 426 auto name = TRY(parse_name())->name(); 427 auto value = TRY(parse_value()); 428 map.set(name, value); 429 } 430 431 if (!m_reader.consume('>') || !m_reader.consume('>')) 432 return error("Expected dict to end with \">>\""); 433 m_reader.consume_whitespace(); 434 435 return make_object<DictObject>(map); 436} 437 438PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictObject> dict) 439{ 440 if (!m_reader.matches("stream")) 441 return error("Expected stream to start with \"stream\""); 442 m_reader.move_by(6); 443 if (!m_reader.consume_eol()) 444 return error("Expected \"stream\" to be followed by a newline"); 445 446 ReadonlyBytes bytes; 447 448 auto maybe_length = dict->get(CommonNames::Length); 449 if (maybe_length.has_value() && m_document->can_resolve_refefences()) { 450 // The PDF writer has kindly provided us with the direct length of the stream 451 m_reader.save(); 452 auto length = TRY(m_document->resolve_to<int>(maybe_length.value())); 453 m_reader.load(); 454 bytes = m_reader.bytes().slice(m_reader.offset(), length); 455 m_reader.move_by(length); 456 m_reader.consume_whitespace(); 457 } else { 458 // We have to look for the endstream keyword 459 auto stream_start = m_reader.offset(); 460 while (!m_reader.matches("endstream")) { 461 m_reader.consume(); 462 m_reader.move_until('e'); 463 } 464 auto stream_end = m_reader.offset(); 465 m_reader.consume_eol(); 466 bytes = m_reader.bytes().slice(stream_start, stream_end - stream_start); 467 } 468 469 m_reader.move_by(9); 470 m_reader.consume_whitespace(); 471 472 auto stream_object = make_object<StreamObject>(dict, MUST(ByteBuffer::copy(bytes))); 473 474 if (m_document->security_handler() && !m_disable_encryption) 475 m_document->security_handler()->decrypt(stream_object, m_current_reference_stack.last()); 476 477 if (dict->contains(CommonNames::Filter)) { 478 Vector<DeprecatedFlyString> filters; 479 480 // We may either get a single filter or an array of cascading filters 481 auto filter_object = TRY(dict->get_object(m_document, CommonNames::Filter)); 482 if (filter_object->is<ArrayObject>()) { 483 auto filter_array = filter_object->cast<ArrayObject>(); 484 for (size_t i = 0; i < filter_array->size(); ++i) 485 filters.append(TRY(filter_array->get_name_at(m_document, i))->name()); 486 } else { 487 filters.append(filter_object->cast<NameObject>()->name()); 488 } 489 490 // Every filter may get its own parameter dictionary 491 Vector<RefPtr<DictObject>> decode_parms_vector; 492 RefPtr<Object> decode_parms_object; 493 if (dict->contains(CommonNames::DecodeParms)) { 494 decode_parms_object = TRY(dict->get_object(m_document, CommonNames::DecodeParms)); 495 if (decode_parms_object->is<ArrayObject>()) { 496 auto decode_parms_array = decode_parms_object->cast<ArrayObject>(); 497 for (size_t i = 0; i < decode_parms_array->size(); ++i) { 498 RefPtr<DictObject> decode_parms; 499 auto entry = decode_parms_array->at(i); 500 if (entry.has<NonnullRefPtr<Object>>()) 501 decode_parms = entry.get<NonnullRefPtr<Object>>()->cast<DictObject>(); 502 decode_parms_vector.append(decode_parms); 503 } 504 } else { 505 decode_parms_vector.append(decode_parms_object->cast<DictObject>()); 506 } 507 } 508 509 VERIFY(decode_parms_vector.is_empty() || decode_parms_vector.size() == filters.size()); 510 511 for (size_t i = 0; i < filters.size(); ++i) { 512 RefPtr<DictObject> decode_parms; 513 if (!decode_parms_vector.is_empty()) 514 decode_parms = decode_parms_vector.at(i); 515 516 stream_object->buffer() = TRY(Filter::decode(stream_object->bytes(), filters.at(i), decode_parms)); 517 } 518 } 519 520 return stream_object; 521} 522 523PDFErrorOr<Vector<Operator>> Parser::parse_operators() 524{ 525 Vector<Operator> operators; 526 Vector<Value> operator_args; 527 528 constexpr static auto is_operator_char = [](char ch) { 529 return isalpha(ch) || ch == '*' || ch == '\''; 530 }; 531 532 m_reader.consume_whitespace(); 533 534 while (!m_reader.done()) { 535 auto ch = m_reader.peek(); 536 if (is_operator_char(ch)) { 537 auto operator_start = m_reader.offset(); 538 while (is_operator_char(ch)) { 539 m_reader.consume(); 540 if (m_reader.done()) 541 break; 542 ch = m_reader.peek(); 543 } 544 545 auto operator_string = StringView(m_reader.bytes().slice(operator_start, m_reader.offset() - operator_start)); 546 auto operator_type = Operator::operator_type_from_symbol(operator_string); 547 operators.append(Operator(operator_type, move(operator_args))); 548 operator_args = Vector<Value>(); 549 m_reader.consume_whitespace(); 550 551 continue; 552 } 553 554 // Note: We disallow parsing indirect values here, since 555 // operations like 0 0 0 RG would confuse the parser 556 auto v = TRY(parse_value(CanBeIndirectValue::No)); 557 operator_args.append(v); 558 } 559 560 return operators; 561} 562 563Error Parser::error( 564 DeprecatedString const& message 565#ifdef PDF_DEBUG 566 , 567 SourceLocation loc 568#endif 569) const 570{ 571#ifdef PDF_DEBUG 572 dbgln("\033[31m{} Parser error at offset {}: {}\033[0m", loc, m_reader.offset(), message); 573#endif 574 575 return Error { Error::Type::Parse, message }; 576} 577 578}