Serenity Operating System
at master 816 lines 33 kB view raw
1/* 2 * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org> 3 * Copyright (c) 2022, Julian Offenhäuser <offenhaeuser@protonmail.com> 4 * 5 * SPDX-License-Identifier: BSD-2-Clause 6 */ 7 8#include <AK/BitStream.h> 9#include <AK/Endian.h> 10#include <AK/MemoryStream.h> 11#include <AK/Tuple.h> 12#include <LibPDF/CommonNames.h> 13#include <LibPDF/Document.h> 14#include <LibPDF/DocumentParser.h> 15#include <LibPDF/ObjectDerivatives.h> 16 17namespace PDF { 18 19DocumentParser::DocumentParser(Document* document, ReadonlyBytes bytes) 20 : Parser(document, bytes) 21{ 22} 23 24PDFErrorOr<void> DocumentParser::initialize() 25{ 26 TRY(parse_header()); 27 28 auto const linearization_result = TRY(initialize_linearization_dict()); 29 30 if (linearization_result == LinearizationResult::NotLinearized) 31 return initialize_non_linearized_xref_table(); 32 33 bool is_linearized = m_linearization_dictionary.has_value(); 34 if (is_linearized) { 35 // The file may have been linearized at one point, but could have been updated afterwards, 36 // which means it is no longer a linearized PDF file. 37 is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size(); 38 39 if (!is_linearized) { 40 // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still 41 // split. This might take some tweaking to ensure correct behavior, which can be 42 // implemented later. 43 TODO(); 44 } 45 } 46 47 if (is_linearized) 48 return initialize_linearized_xref_table(); 49 50 return initialize_non_linearized_xref_table(); 51} 52 53PDFErrorOr<Value> DocumentParser::parse_object_with_index(u32 index) 54{ 55 VERIFY(m_xref_table->has_object(index)); 56 57 if (m_xref_table->is_object_compressed(index)) 58 // The object can be found in a object stream 59 return parse_compressed_object_with_index(index); 60 61 auto byte_offset = m_xref_table->byte_offset_for_object(index); 62 m_reader.move_to(byte_offset); 63 auto indirect_value = TRY(parse_indirect_value()); 64 VERIFY(indirect_value->index() == index); 65 return indirect_value->value(); 66} 67 68PDFErrorOr<void> DocumentParser::parse_header() 69{ 70 // FIXME: Do something with the version? 71 m_reader.set_reading_forwards(); 72 if (m_reader.remaining() == 0) 73 return error("Empty PDF document"); 74 75 m_reader.move_to(0); 76 if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-")) 77 return error("Not a PDF document"); 78 79 m_reader.move_by(5); 80 81 char major_ver = m_reader.read(); 82 if (major_ver != '1' && major_ver != '2') 83 return error(DeprecatedString::formatted("Unknown major version \"{}\"", major_ver)); 84 85 if (m_reader.read() != '.') 86 return error("Malformed PDF version"); 87 88 char minor_ver = m_reader.read(); 89 if (minor_ver < '0' || minor_ver > '7') 90 return error(DeprecatedString::formatted("Unknown minor version \"{}\"", minor_ver)); 91 92 m_reader.consume_eol(); 93 94 // Parse optional high-byte comment, which signifies a binary file 95 // FIXME: Do something with this? 96 auto comment = parse_comment(); 97 if (!comment.is_empty()) { 98 auto binary = comment.length() >= 4; 99 if (binary) { 100 for (size_t i = 0; i < comment.length() && binary; i++) 101 binary = static_cast<u8>(comment[i]) > 128; 102 } 103 } 104 105 return {}; 106} 107 108PDFErrorOr<DocumentParser::LinearizationResult> DocumentParser::initialize_linearization_dict() 109{ 110 // parse_header() is called immediately before this, so we are at the right location 111 auto indirect_value = Value(*TRY(parse_indirect_value())); 112 auto dict_value = TRY(m_document->resolve(indirect_value)); 113 if (!dict_value.has<NonnullRefPtr<Object>>()) 114 return error("Expected linearization object to be a dictionary"); 115 116 auto dict_object = dict_value.get<NonnullRefPtr<Object>>(); 117 if (!dict_object->is<DictObject>()) 118 return LinearizationResult::NotLinearized; 119 120 auto dict = dict_object->cast<DictObject>(); 121 122 if (!dict->contains(CommonNames::Linearized)) 123 return LinearizationResult::NotLinearized; 124 125 if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T)) 126 return error("Malformed linearization dictionary"); 127 128 auto length_of_file = dict->get_value(CommonNames::L); 129 auto hint_table = dict->get_value(CommonNames::H); 130 auto first_page_object_number = dict->get_value(CommonNames::O); 131 auto offset_of_first_page_end = dict->get_value(CommonNames::E); 132 auto number_of_pages = dict->get_value(CommonNames::N); 133 auto offset_of_main_xref_table = dict->get_value(CommonNames::T); 134 auto first_page = dict->get(CommonNames::P).value_or({}); 135 136 // Validation 137 if (!length_of_file.has_u32() 138 || !hint_table.has<NonnullRefPtr<Object>>() 139 || !first_page_object_number.has_u32() 140 || !number_of_pages.has_u16() 141 || !offset_of_main_xref_table.has_u32() 142 || (!first_page.has<Empty>() && !first_page.has_u32())) { 143 return error("Malformed linearization dictionary parameters"); 144 } 145 146 auto hint_table_array = hint_table.get<NonnullRefPtr<Object>>()->cast<ArrayObject>(); 147 auto hint_table_size = hint_table_array->size(); 148 if (hint_table_size != 2 && hint_table_size != 4) 149 return error("Expected hint table to be of length 2 or 4"); 150 151 auto primary_hint_stream_offset = hint_table_array->at(0); 152 auto primary_hint_stream_length = hint_table_array->at(1); 153 Value overflow_hint_stream_offset; 154 Value overflow_hint_stream_length; 155 156 if (hint_table_size == 4) { 157 overflow_hint_stream_offset = hint_table_array->at(2); 158 overflow_hint_stream_length = hint_table_array->at(3); 159 } 160 161 if (!primary_hint_stream_offset.has_u32() 162 || !primary_hint_stream_length.has_u32() 163 || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32()) 164 || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) { 165 return error("Malformed hint stream"); 166 } 167 168 m_linearization_dictionary = LinearizationDictionary { 169 length_of_file.get_u32(), 170 primary_hint_stream_offset.get_u32(), 171 primary_hint_stream_length.get_u32(), 172 overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(), 173 overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(), 174 first_page_object_number.get_u32(), 175 offset_of_first_page_end.get_u32(), 176 number_of_pages.get_u16(), 177 offset_of_main_xref_table.get_u32(), 178 first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(), 179 }; 180 181 return LinearizationResult::Linearized; 182} 183 184PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table() 185{ 186 // The linearization parameter dictionary has just been parsed, and the xref table 187 // comes immediately after it. We are in the correct spot. 188 m_xref_table = TRY(parse_xref_table()); 189 190 // Also parse the main xref table and merge into the first-page xref table. Note 191 // that we don't use the main xref table offset from the linearization dict because 192 // for some reason, it specified the offset of the whitespace after the object 193 // index start and length? So it's much easier to do it this way. 194 auto main_xref_table_offset = m_xref_table->trailer()->get_value(CommonNames::Prev).to_int(); 195 m_reader.move_to(main_xref_table_offset); 196 auto main_xref_table = TRY(parse_xref_table()); 197 TRY(m_xref_table->merge(move(*main_xref_table))); 198 199 return validate_xref_table_and_fix_if_necessary(); 200} 201 202PDFErrorOr<void> DocumentParser::initialize_hint_tables() 203{ 204 auto linearization_dict = m_linearization_dictionary.value(); 205 auto primary_offset = linearization_dict.primary_hint_stream_offset; 206 auto overflow_offset = linearization_dict.overflow_hint_stream_offset; 207 208 auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> { 209 m_reader.move_to(offset); 210 auto stream_indirect_value = parse_indirect_value(); 211 if (stream_indirect_value.is_error()) 212 return {}; 213 214 auto stream_value = stream_indirect_value.value()->value(); 215 if (!stream_value.has<NonnullRefPtr<Object>>()) 216 return {}; 217 218 auto stream_object = stream_value.get<NonnullRefPtr<Object>>(); 219 if (!stream_object->is<StreamObject>()) 220 return {}; 221 222 return stream_object->cast<StreamObject>(); 223 }; 224 225 auto primary_hint_stream = parse_hint_table(primary_offset); 226 if (!primary_hint_stream) 227 return error("Invalid primary hint stream"); 228 229 RefPtr<StreamObject> overflow_hint_stream; 230 if (overflow_offset != NumericLimits<u32>::max()) 231 overflow_hint_stream = parse_hint_table(overflow_offset); 232 233 ByteBuffer possible_merged_stream_buffer; 234 ReadonlyBytes hint_stream_bytes; 235 236 if (overflow_hint_stream) { 237 auto primary_size = primary_hint_stream->bytes().size(); 238 auto overflow_size = overflow_hint_stream->bytes().size(); 239 auto total_size = primary_size + overflow_size; 240 241 auto buffer_result = ByteBuffer::create_uninitialized(total_size); 242 if (buffer_result.is_error()) 243 return Error { Error::Type::Internal, "Failed to allocate hint stream buffer" }; 244 possible_merged_stream_buffer = buffer_result.release_value(); 245 MUST(possible_merged_stream_buffer.try_append(primary_hint_stream->bytes())); 246 MUST(possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes())); 247 hint_stream_bytes = possible_merged_stream_buffer.bytes(); 248 } else { 249 hint_stream_bytes = primary_hint_stream->bytes(); 250 } 251 252 auto hint_table = TRY(parse_page_offset_hint_table(hint_stream_bytes)); 253 auto hint_table_entries = TRY(parse_all_page_offset_hint_table_entries(hint_table, hint_stream_bytes)); 254 255 // FIXME: Do something with the hint tables 256 return {}; 257} 258 259PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table() 260{ 261 m_reader.move_to(m_reader.bytes().size() - 1); 262 if (!navigate_to_before_eof_marker()) 263 return error("No EOF marker"); 264 if (!navigate_to_after_startxref()) 265 return error("No xref"); 266 267 m_reader.set_reading_forwards(); 268 auto xref_offset_value = TRY(parse_number()); 269 auto xref_offset = TRY(m_document->resolve_to<int>(xref_offset_value)); 270 m_reader.move_to(xref_offset); 271 272 // As per 7.5.6 Incremental Updates: 273 // When a conforming reader reads the file, it shall build its cross-reference 274 // information in such a way that the most recent copy of each object shall be 275 // the one accessed from the file. 276 // NOTE: This means that we have to follow back the chain of XRef table sections 277 // and only add objects that were not already specified in a previous 278 // (and thus newer) XRef section. 279 while (1) { 280 auto xref_table = TRY(parse_xref_table()); 281 if (!m_xref_table) 282 m_xref_table = xref_table; 283 else 284 TRY(m_xref_table->merge(move(*xref_table))); 285 286 if (!xref_table->trailer() || !xref_table->trailer()->contains(CommonNames::Prev)) 287 break; 288 289 auto offset = TRY(m_document->resolve_to<int>(xref_table->trailer()->get_value(CommonNames::Prev))); 290 m_reader.move_to(offset); 291 } 292 293 return validate_xref_table_and_fix_if_necessary(); 294} 295 296PDFErrorOr<void> DocumentParser::validate_xref_table_and_fix_if_necessary() 297{ 298 /* While an xref table may start with an object number other than zero, this is 299 very uncommon and likely a sign of a document with broken indices. 300 Like most other PDF parsers seem to do, we still try to salvage the situation. 301 NOTE: This is probably not spec-compliant behavior.*/ 302 size_t first_valid_index = 0; 303 while (m_xref_table->byte_offset_for_object(first_valid_index) == invalid_byte_offset) 304 first_valid_index++; 305 306 if (first_valid_index) { 307 auto& entries = m_xref_table->entries(); 308 309 bool need_to_rebuild_table = true; 310 for (size_t i = first_valid_index; i < entries.size(); ++i) { 311 if (!entries[i].in_use) 312 continue; 313 314 size_t actual_object_number = 0; 315 if (entries[i].compressed) { 316 auto object_stream_index = m_xref_table->object_stream_for_object(i); 317 auto stream_offset = m_xref_table->byte_offset_for_object(object_stream_index); 318 m_reader.move_to(stream_offset); 319 auto first_number = TRY(parse_number()); 320 actual_object_number = first_number.get_u32(); 321 } else { 322 auto byte_offset = m_xref_table->byte_offset_for_object(i); 323 m_reader.move_to(byte_offset); 324 auto indirect_value = TRY(parse_indirect_value()); 325 actual_object_number = indirect_value->index(); 326 } 327 328 if (actual_object_number != i - first_valid_index) { 329 /* Our suspicion was wrong, not all object numbers are shifted equally. 330 This could mean that the document is hopelessly broken, or it just 331 starts at a non-zero object index for some reason. */ 332 need_to_rebuild_table = false; 333 break; 334 } 335 } 336 337 if (need_to_rebuild_table) { 338 warnln("Broken xref table detected, trying to fix it."); 339 entries.remove(0, first_valid_index); 340 } 341 } 342 343 return {}; 344} 345 346PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream() 347{ 348 auto first_number = TRY(parse_number()); 349 auto second_number = TRY(parse_number()); 350 351 if (!m_reader.matches("obj")) 352 return error("Malformed xref object"); 353 m_reader.move_by(3); 354 if (m_reader.matches_eol()) 355 m_reader.consume_eol(); 356 357 auto dict = TRY(parse_dict()); 358 auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name(); 359 if (type != "XRef") 360 return error("Malformed xref dictionary"); 361 362 auto field_sizes = TRY(dict->get_array(m_document, "W")); 363 if (field_sizes->size() != 3) 364 return error("Malformed xref dictionary"); 365 366 auto highest_object_number = dict->get_value("Size").get<int>() - 1; 367 368 Vector<Tuple<int, int>> subsections; 369 if (dict->contains(CommonNames::Index)) { 370 auto index_array = TRY(dict->get_array(m_document, CommonNames::Index)); 371 if (index_array->size() % 2 != 0) 372 return error("Malformed xref dictionary"); 373 374 for (size_t i = 0; i < index_array->size(); i += 2) 375 subsections.append({ index_array->at(i).get<int>(), index_array->at(i + 1).get<int>() - 1 }); 376 } else { 377 subsections.append({ 0, highest_object_number }); 378 } 379 auto stream = TRY(parse_stream(dict)); 380 auto table = adopt_ref(*new XRefTable()); 381 382 auto field_to_long = [](ReadonlyBytes field) -> long { 383 long value = 0; 384 const u8 max = (field.size() - 1) * 8; 385 for (size_t i = 0; i < field.size(); ++i) { 386 value |= static_cast<long>(field[i]) << (max - (i * 8)); 387 } 388 return value; 389 }; 390 391 size_t byte_index = 0; 392 size_t subsection_index = 0; 393 394 Vector<XRefEntry> entries; 395 396 for (int entry_index = 0; subsection_index < subsections.size(); ++entry_index) { 397 Array<long, 3> fields; 398 for (size_t field_index = 0; field_index < 3; ++field_index) { 399 auto field_size = field_sizes->at(field_index).get_u32(); 400 401 if (byte_index + field_size > stream->bytes().size()) 402 return error("The xref stream data cut off early"); 403 404 auto field = stream->bytes().slice(byte_index, field_size); 405 fields[field_index] = field_to_long(field); 406 byte_index += field_size; 407 } 408 409 u8 type = fields[0]; 410 if (!field_sizes->at(0).get_u32()) 411 type = 1; 412 413 entries.append({ fields[1], static_cast<u16>(fields[2]), type != 0, type == 2 }); 414 415 auto subsection = subsections[subsection_index]; 416 if (entry_index >= subsection.get<1>()) { 417 table->add_section({ subsection.get<0>(), subsection.get<1>(), entries }); 418 entries.clear(); 419 subsection_index++; 420 } 421 } 422 423 table->set_trailer(dict); 424 425 return table; 426} 427 428PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table() 429{ 430 if (!m_reader.matches("xref")) { 431 // Since version 1.5, there may be a cross-reference stream instead 432 return parse_xref_stream(); 433 } 434 435 m_reader.move_by(4); 436 if (!m_reader.consume_eol()) 437 return error("Expected newline after \"xref\""); 438 439 auto table = adopt_ref(*new XRefTable()); 440 441 while (m_reader.matches_number()) { 442 Vector<XRefEntry> entries; 443 444 auto starting_index_value = TRY(parse_number()); 445 auto starting_index = starting_index_value.get<int>(); 446 auto object_count_value = TRY(parse_number()); 447 auto object_count = object_count_value.get<int>(); 448 449 for (int i = 0; i < object_count; i++) { 450 auto offset_string = DeprecatedString(m_reader.bytes().slice(m_reader.offset(), 10)); 451 m_reader.move_by(10); 452 if (!m_reader.consume(' ')) 453 return error("Malformed xref entry"); 454 455 auto generation_string = DeprecatedString(m_reader.bytes().slice(m_reader.offset(), 5)); 456 m_reader.move_by(5); 457 if (!m_reader.consume(' ')) 458 return error("Malformed xref entry"); 459 460 auto letter = m_reader.read(); 461 if (letter != 'n' && letter != 'f') 462 return error("Malformed xref entry"); 463 464 // The line ending sequence can be one of the following: 465 // SP CR, SP LF, or CR LF 466 if (m_reader.matches(' ')) { 467 m_reader.consume(); 468 auto ch = m_reader.consume(); 469 if (ch != '\r' && ch != '\n') 470 return error("Malformed xref entry"); 471 } else { 472 if (!m_reader.matches("\r\n")) 473 return error("Malformed xref entry"); 474 m_reader.move_by(2); 475 } 476 477 auto offset = strtol(offset_string.characters(), nullptr, 10); 478 auto generation = strtol(generation_string.characters(), nullptr, 10); 479 480 entries.append({ offset, static_cast<u16>(generation), letter == 'n' }); 481 } 482 483 table->add_section({ starting_index, object_count, entries }); 484 } 485 486 m_reader.consume_whitespace(); 487 if (m_reader.matches("trailer")) 488 table->set_trailer(TRY(parse_file_trailer())); 489 490 return table; 491} 492 493PDFErrorOr<NonnullRefPtr<DictObject>> DocumentParser::parse_file_trailer() 494{ 495 while (m_reader.matches_eol()) 496 m_reader.consume_eol(); 497 498 if (!m_reader.matches("trailer")) 499 return error("Expected \"trailer\" keyword"); 500 m_reader.move_by(7); 501 m_reader.consume_whitespace(); 502 auto dict = TRY(parse_dict()); 503 504 if (!m_reader.matches("startxref")) 505 return error("Expected \"startxref\""); 506 m_reader.move_by(9); 507 m_reader.consume_whitespace(); 508 509 m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); 510 VERIFY(m_reader.consume_eol()); 511 if (!m_reader.matches("%%EOF")) 512 return error("Expected \"%%EOF\""); 513 514 m_reader.move_by(5); 515 m_reader.consume_whitespace(); 516 return dict; 517} 518 519PDFErrorOr<Value> DocumentParser::parse_compressed_object_with_index(u32 index) 520{ 521 auto object_stream_index = m_xref_table->object_stream_for_object(index); 522 auto stream_offset = m_xref_table->byte_offset_for_object(object_stream_index); 523 524 m_reader.move_to(stream_offset); 525 526 auto first_number = TRY(parse_number()); 527 auto second_number = TRY(parse_number()); 528 529 if (first_number.get<int>() != object_stream_index) 530 return error("Mismatching object stream index"); 531 if (second_number.get<int>() != 0) 532 return error("Non-zero object stream generation number"); 533 534 if (!m_reader.matches("obj")) 535 return error("Malformed object stream"); 536 m_reader.move_by(3); 537 if (m_reader.matches_eol()) 538 m_reader.consume_eol(); 539 540 auto dict = TRY(parse_dict()); 541 auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name(); 542 if (type != "ObjStm") 543 return error("Invalid object stream type"); 544 545 auto object_count = dict->get_value("N").get_u32(); 546 auto first_object_offset = dict->get_value("First").get_u32(); 547 548 auto stream = TRY(parse_stream(dict)); 549 Parser stream_parser(m_document, stream->bytes()); 550 551 for (u32 i = 0; i < object_count; ++i) { 552 auto object_number = TRY(stream_parser.parse_number()); 553 auto object_offset = TRY(stream_parser.parse_number()); 554 555 if (object_number.get_u32() == index) { 556 stream_parser.move_to(first_object_offset + object_offset.get_u32()); 557 break; 558 } 559 } 560 561 return TRY(stream_parser.parse_value()); 562} 563 564PDFErrorOr<DocumentParser::PageOffsetHintTable> DocumentParser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes) 565{ 566 if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable)) 567 return error("Hint stream is too small"); 568 569 size_t offset = 0; 570 571 auto read_u32 = [&] { 572 u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0]; 573 offset += 4; 574 return AK::convert_between_host_and_big_endian(data); 575 }; 576 577 auto read_u16 = [&] { 578 u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0]; 579 offset += 2; 580 return AK::convert_between_host_and_big_endian(data); 581 }; 582 583 PageOffsetHintTable hint_table { 584 read_u32(), 585 read_u32(), 586 read_u16(), 587 read_u32(), 588 read_u16(), 589 read_u32(), 590 read_u16(), 591 read_u32(), 592 read_u16(), 593 read_u16(), 594 read_u16(), 595 read_u16(), 596 read_u16(), 597 }; 598 599 // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric 600 // fields in PageOffsetHintTableEntry are u32 601 VERIFY(hint_table.bits_required_for_object_number <= 32); 602 VERIFY(hint_table.bits_required_for_page_length <= 32); 603 VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32); 604 VERIFY(hint_table.bits_required_for_content_stream_length <= 32); 605 VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32); 606 VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32); 607 VERIFY(hint_table.bits_required_for_fraction_numerator <= 32); 608 609 return hint_table; 610} 611 612PDFErrorOr<Vector<DocumentParser::PageOffsetHintTableEntry>> DocumentParser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes) 613{ 614 auto input_stream = TRY(try_make<FixedMemoryStream>(hint_stream_bytes)); 615 TRY(input_stream->seek(sizeof(PageOffsetHintTable))); 616 617 LittleEndianInputBitStream bit_stream { move(input_stream) }; 618 619 auto number_of_pages = m_linearization_dictionary.value().number_of_pages; 620 Vector<PageOffsetHintTableEntry> entries; 621 for (size_t i = 0; i < number_of_pages; i++) 622 entries.append(PageOffsetHintTableEntry {}); 623 624 auto bits_required_for_object_number = hint_table.bits_required_for_object_number; 625 auto bits_required_for_page_length = hint_table.bits_required_for_page_length; 626 auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets; 627 auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length; 628 auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs; 629 auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier; 630 auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator; 631 632 auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) -> ErrorOr<void> { 633 if (bit_size <= 0) 634 return {}; 635 636 for (int i = 0; i < number_of_pages; i++) { 637 auto& entry = entries[i]; 638 entry.*field = TRY(bit_stream.read_bits(bit_size)); 639 } 640 641 return {}; 642 }; 643 644 auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) -> ErrorOr<void> { 645 if (bit_size <= 0) 646 return {}; 647 648 for (int page = 1; page < number_of_pages; page++) { 649 auto number_of_shared_objects = entries[page].number_of_shared_objects; 650 Vector<u32> items; 651 items.ensure_capacity(number_of_shared_objects); 652 653 for (size_t i = 0; i < number_of_shared_objects; i++) 654 items.unchecked_append(TRY(bit_stream.read_bits(bit_size))); 655 656 entries[page].*field = move(items); 657 } 658 659 return {}; 660 }; 661 662 TRY(parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number)); 663 TRY(parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length)); 664 TRY(parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs)); 665 TRY(parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier)); 666 TRY(parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator)); 667 TRY(parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets)); 668 TRY(parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length)); 669 670 return entries; 671} 672 673bool DocumentParser::navigate_to_before_eof_marker() 674{ 675 m_reader.set_reading_backwards(); 676 677 while (!m_reader.done()) { 678 m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); 679 if (m_reader.done()) 680 return false; 681 682 m_reader.consume_eol(); 683 if (!m_reader.matches("%%EOF")) 684 continue; 685 686 m_reader.move_by(5); 687 if (!m_reader.matches_eol()) 688 continue; 689 m_reader.consume_eol(); 690 return true; 691 } 692 693 return false; 694} 695 696bool DocumentParser::navigate_to_after_startxref() 697{ 698 m_reader.set_reading_backwards(); 699 700 while (!m_reader.done()) { 701 m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); 702 auto offset = m_reader.offset() + 1; 703 704 m_reader.consume_eol(); 705 if (!m_reader.matches("startxref")) 706 continue; 707 708 m_reader.move_by(9); 709 if (!m_reader.matches_eol()) 710 continue; 711 712 m_reader.move_to(offset); 713 return true; 714 } 715 716 return false; 717} 718 719PDFErrorOr<RefPtr<DictObject>> DocumentParser::conditionally_parse_page_tree_node(u32 object_index) 720{ 721 auto dict_value = TRY(parse_object_with_index(object_index)); 722 auto dict_object = dict_value.get<NonnullRefPtr<Object>>(); 723 if (!dict_object->is<DictObject>()) 724 return error(DeprecatedString::formatted("Invalid page tree with xref index {}", object_index)); 725 726 auto dict = dict_object->cast<DictObject>(); 727 if (!dict->contains_any_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) 728 // This is a page, not a page tree node 729 return RefPtr<DictObject> {}; 730 731 if (!dict->contains(CommonNames::Type)) 732 return RefPtr<DictObject> {}; 733 auto type_object = TRY(dict->get_object(m_document, CommonNames::Type)); 734 if (!type_object->is<NameObject>()) 735 return RefPtr<DictObject> {}; 736 auto type_name = type_object->cast<NameObject>(); 737 if (type_name->name() != CommonNames::Pages) 738 return RefPtr<DictObject> {}; 739 740 return dict; 741} 742 743} 744 745namespace AK { 746 747template<> 748struct Formatter<PDF::DocumentParser::LinearizationDictionary> : Formatter<StringView> { 749 ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::LinearizationDictionary const& dict) 750 { 751 StringBuilder builder; 752 builder.append("{\n"sv); 753 builder.appendff(" length_of_file={}\n", dict.length_of_file); 754 builder.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset); 755 builder.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length); 756 builder.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset); 757 builder.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length); 758 builder.appendff(" first_page_object_number={}\n", dict.first_page_object_number); 759 builder.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end); 760 builder.appendff(" number_of_pages={}\n", dict.number_of_pages); 761 builder.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table); 762 builder.appendff(" first_page={}\n", dict.first_page); 763 builder.append('}'); 764 return Formatter<StringView>::format(format_builder, builder.to_deprecated_string()); 765 } 766}; 767 768template<> 769struct Formatter<PDF::DocumentParser::PageOffsetHintTable> : Formatter<StringView> { 770 ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::PageOffsetHintTable const& table) 771 { 772 StringBuilder builder; 773 builder.append("{\n"sv); 774 builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page); 775 builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object); 776 builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number); 777 builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page); 778 builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length); 779 builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream); 780 builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets); 781 builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length); 782 builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length); 783 builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs); 784 builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier); 785 builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator); 786 builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator); 787 builder.append('}'); 788 return Formatter<StringView>::format(format_builder, builder.to_deprecated_string()); 789 } 790}; 791 792template<> 793struct Formatter<PDF::DocumentParser::PageOffsetHintTableEntry> : Formatter<StringView> { 794 ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::PageOffsetHintTableEntry const& entry) 795 { 796 StringBuilder builder; 797 builder.append("{\n"sv); 798 builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number); 799 builder.appendff(" page_length_number={}\n", entry.page_length_number); 800 builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects); 801 builder.append(" shared_object_identifiers=["sv); 802 for (auto& identifier : entry.shared_object_identifiers) 803 builder.appendff(" {}", identifier); 804 builder.append(" ]\n"sv); 805 builder.append(" shared_object_location_numerators=["sv); 806 for (auto& numerator : entry.shared_object_location_numerators) 807 builder.appendff(" {}", numerator); 808 builder.append(" ]\n"sv); 809 builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number); 810 builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number); 811 builder.append('}'); 812 return Formatter<StringView>::format(format_builder, builder.to_deprecated_string()); 813 } 814}; 815 816}