Userland/Libraries/LibPDF/DocumentParser.cpp at master

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / Userland / Libraries / LibPDF / DocumentParser.cpp
at master 816 lines 33 kB view raw
wrap content
Julian Offenhäuser LibPDF: Allow reading documents with incremental updates 3y ago
34350ee9
  1/*
  2 * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org>
  3 * Copyright (c) 2022, Julian Offenhäuser <offenhaeuser@protonmail.com>
  4 *
  5 * SPDX-License-Identifier: BSD-2-Clause
  6 */
  7
  8#include <AK/BitStream.h>
  9#include <AK/Endian.h>
 10#include <AK/MemoryStream.h>
 11#include <AK/Tuple.h>
 12#include <LibPDF/CommonNames.h>
 13#include <LibPDF/Document.h>
 14#include <LibPDF/DocumentParser.h>
 15#include <LibPDF/ObjectDerivatives.h>
 16
 17namespace PDF {
 18
 19DocumentParser::DocumentParser(Document* document, ReadonlyBytes bytes)
 20    : Parser(document, bytes)
 21{
 22}
 23
 24PDFErrorOr<void> DocumentParser::initialize()
 25{
 26    TRY(parse_header());
 27
 28    auto const linearization_result = TRY(initialize_linearization_dict());
 29
 30    if (linearization_result == LinearizationResult::NotLinearized)
 31        return initialize_non_linearized_xref_table();
 32
 33    bool is_linearized = m_linearization_dictionary.has_value();
 34    if (is_linearized) {
 35        // The file may have been linearized at one point, but could have been updated afterwards,
 36        // which means it is no longer a linearized PDF file.
 37        is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size();
 38
 39        if (!is_linearized) {
 40            // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still
 41            // split. This might take some tweaking to ensure correct behavior, which can be
 42            // implemented later.
 43            TODO();
 44        }
 45    }
 46
 47    if (is_linearized)
 48        return initialize_linearized_xref_table();
 49
 50    return initialize_non_linearized_xref_table();
 51}
 52
 53PDFErrorOr<Value> DocumentParser::parse_object_with_index(u32 index)
 54{
 55    VERIFY(m_xref_table->has_object(index));
 56
 57    if (m_xref_table->is_object_compressed(index))
 58        // The object can be found in a object stream
 59        return parse_compressed_object_with_index(index);
 60
 61    auto byte_offset = m_xref_table->byte_offset_for_object(index);
 62    m_reader.move_to(byte_offset);
 63    auto indirect_value = TRY(parse_indirect_value());
 64    VERIFY(indirect_value->index() == index);
 65    return indirect_value->value();
 66}
 67
 68PDFErrorOr<void> DocumentParser::parse_header()
 69{
 70    // FIXME: Do something with the version?
 71    m_reader.set_reading_forwards();
 72    if (m_reader.remaining() == 0)
 73        return error("Empty PDF document");
 74
 75    m_reader.move_to(0);
 76    if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
 77        return error("Not a PDF document");
 78
 79    m_reader.move_by(5);
 80
 81    char major_ver = m_reader.read();
 82    if (major_ver != '1' && major_ver != '2')
 83        return error(DeprecatedString::formatted("Unknown major version \"{}\"", major_ver));
 84
 85    if (m_reader.read() != '.')
 86        return error("Malformed PDF version");
 87
 88    char minor_ver = m_reader.read();
 89    if (minor_ver < '0' || minor_ver > '7')
 90        return error(DeprecatedString::formatted("Unknown minor version \"{}\"", minor_ver));
 91
 92    m_reader.consume_eol();
 93
 94    // Parse optional high-byte comment, which signifies a binary file
 95    // FIXME: Do something with this?
 96    auto comment = parse_comment();
 97    if (!comment.is_empty()) {
 98        auto binary = comment.length() >= 4;
 99        if (binary) {
100            for (size_t i = 0; i < comment.length() && binary; i++)
101                binary = static_cast<u8>(comment[i]) > 128;
102        }
103    }
104
105    return {};
106}
107
108PDFErrorOr<DocumentParser::LinearizationResult> DocumentParser::initialize_linearization_dict()
109{
110    // parse_header() is called immediately before this, so we are at the right location
111    auto indirect_value = Value(*TRY(parse_indirect_value()));
112    auto dict_value = TRY(m_document->resolve(indirect_value));
113    if (!dict_value.has<NonnullRefPtr<Object>>())
114        return error("Expected linearization object to be a dictionary");
115
116    auto dict_object = dict_value.get<NonnullRefPtr<Object>>();
117    if (!dict_object->is<DictObject>())
118        return LinearizationResult::NotLinearized;
119
120    auto dict = dict_object->cast<DictObject>();
121
122    if (!dict->contains(CommonNames::Linearized))
123        return LinearizationResult::NotLinearized;
124
125    if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T))
126        return error("Malformed linearization dictionary");
127
128    auto length_of_file = dict->get_value(CommonNames::L);
129    auto hint_table = dict->get_value(CommonNames::H);
130    auto first_page_object_number = dict->get_value(CommonNames::O);
131    auto offset_of_first_page_end = dict->get_value(CommonNames::E);
132    auto number_of_pages = dict->get_value(CommonNames::N);
133    auto offset_of_main_xref_table = dict->get_value(CommonNames::T);
134    auto first_page = dict->get(CommonNames::P).value_or({});
135
136    // Validation
137    if (!length_of_file.has_u32()
138        || !hint_table.has<NonnullRefPtr<Object>>()
139        || !first_page_object_number.has_u32()
140        || !number_of_pages.has_u16()
141        || !offset_of_main_xref_table.has_u32()
142        || (!first_page.has<Empty>() && !first_page.has_u32())) {
143        return error("Malformed linearization dictionary parameters");
144    }
145
146    auto hint_table_array = hint_table.get<NonnullRefPtr<Object>>()->cast<ArrayObject>();
147    auto hint_table_size = hint_table_array->size();
148    if (hint_table_size != 2 && hint_table_size != 4)
149        return error("Expected hint table to be of length 2 or 4");
150
151    auto primary_hint_stream_offset = hint_table_array->at(0);
152    auto primary_hint_stream_length = hint_table_array->at(1);
153    Value overflow_hint_stream_offset;
154    Value overflow_hint_stream_length;
155
156    if (hint_table_size == 4) {
157        overflow_hint_stream_offset = hint_table_array->at(2);
158        overflow_hint_stream_length = hint_table_array->at(3);
159    }
160
161    if (!primary_hint_stream_offset.has_u32()
162        || !primary_hint_stream_length.has_u32()
163        || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32())
164        || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) {
165        return error("Malformed hint stream");
166    }
167
168    m_linearization_dictionary = LinearizationDictionary {
169        length_of_file.get_u32(),
170        primary_hint_stream_offset.get_u32(),
171        primary_hint_stream_length.get_u32(),
172        overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(),
173        overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(),
174        first_page_object_number.get_u32(),
175        offset_of_first_page_end.get_u32(),
176        number_of_pages.get_u16(),
177        offset_of_main_xref_table.get_u32(),
178        first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(),
179    };
180
181    return LinearizationResult::Linearized;
182}
183
184PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
185{
186    // The linearization parameter dictionary has just been parsed, and the xref table
187    // comes immediately after it. We are in the correct spot.
188    m_xref_table = TRY(parse_xref_table());
189
190    // Also parse the main xref table and merge into the first-page xref table. Note
191    // that we don't use the main xref table offset from the linearization dict because
192    // for some reason, it specified the offset of the whitespace after the object
193    // index start and length? So it's much easier to do it this way.
194    auto main_xref_table_offset = m_xref_table->trailer()->get_value(CommonNames::Prev).to_int();
195    m_reader.move_to(main_xref_table_offset);
196    auto main_xref_table = TRY(parse_xref_table());
197    TRY(m_xref_table->merge(move(*main_xref_table)));
198
199    return validate_xref_table_and_fix_if_necessary();
200}
201
202PDFErrorOr<void> DocumentParser::initialize_hint_tables()
203{
204    auto linearization_dict = m_linearization_dictionary.value();
205    auto primary_offset = linearization_dict.primary_hint_stream_offset;
206    auto overflow_offset = linearization_dict.overflow_hint_stream_offset;
207
208    auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> {
209        m_reader.move_to(offset);
210        auto stream_indirect_value = parse_indirect_value();
211        if (stream_indirect_value.is_error())
212            return {};
213
214        auto stream_value = stream_indirect_value.value()->value();
215        if (!stream_value.has<NonnullRefPtr<Object>>())
216            return {};
217
218        auto stream_object = stream_value.get<NonnullRefPtr<Object>>();
219        if (!stream_object->is<StreamObject>())
220            return {};
221
222        return stream_object->cast<StreamObject>();
223    };
224
225    auto primary_hint_stream = parse_hint_table(primary_offset);
226    if (!primary_hint_stream)
227        return error("Invalid primary hint stream");
228
229    RefPtr<StreamObject> overflow_hint_stream;
230    if (overflow_offset != NumericLimits<u32>::max())
231        overflow_hint_stream = parse_hint_table(overflow_offset);
232
233    ByteBuffer possible_merged_stream_buffer;
234    ReadonlyBytes hint_stream_bytes;
235
236    if (overflow_hint_stream) {
237        auto primary_size = primary_hint_stream->bytes().size();
238        auto overflow_size = overflow_hint_stream->bytes().size();
239        auto total_size = primary_size + overflow_size;
240
241        auto buffer_result = ByteBuffer::create_uninitialized(total_size);
242        if (buffer_result.is_error())
243            return Error { Error::Type::Internal, "Failed to allocate hint stream buffer" };
244        possible_merged_stream_buffer = buffer_result.release_value();
245        MUST(possible_merged_stream_buffer.try_append(primary_hint_stream->bytes()));
246        MUST(possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes()));
247        hint_stream_bytes = possible_merged_stream_buffer.bytes();
248    } else {
249        hint_stream_bytes = primary_hint_stream->bytes();
250    }
251
252    auto hint_table = TRY(parse_page_offset_hint_table(hint_stream_bytes));
253    auto hint_table_entries = TRY(parse_all_page_offset_hint_table_entries(hint_table, hint_stream_bytes));
254
255    // FIXME: Do something with the hint tables
256    return {};
257}
258
259PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table()
260{
261    m_reader.move_to(m_reader.bytes().size() - 1);
262    if (!navigate_to_before_eof_marker())
263        return error("No EOF marker");
264    if (!navigate_to_after_startxref())
265        return error("No xref");
266
267    m_reader.set_reading_forwards();
268    auto xref_offset_value = TRY(parse_number());
269    auto xref_offset = TRY(m_document->resolve_to<int>(xref_offset_value));
270    m_reader.move_to(xref_offset);
271
272    // As per 7.5.6 Incremental Updates:
273    // When a conforming reader reads the file, it shall build its cross-reference
274    // information in such a way that the most recent copy of each object shall be
275    // the one accessed from the file.
276    // NOTE: This means that we have to follow back the chain of XRef table sections
277    //       and only add objects that were not already specified in a previous
278    //       (and thus newer) XRef section.
279    while (1) {
280        auto xref_table = TRY(parse_xref_table());
281        if (!m_xref_table)
282            m_xref_table = xref_table;
283        else
284            TRY(m_xref_table->merge(move(*xref_table)));
285
286        if (!xref_table->trailer() || !xref_table->trailer()->contains(CommonNames::Prev))
287            break;
288
289        auto offset = TRY(m_document->resolve_to<int>(xref_table->trailer()->get_value(CommonNames::Prev)));
290        m_reader.move_to(offset);
291    }
292
293    return validate_xref_table_and_fix_if_necessary();
294}
295
296PDFErrorOr<void> DocumentParser::validate_xref_table_and_fix_if_necessary()
297{
298    /* While an xref table may start with an object number other than zero, this is
299       very uncommon and likely a sign of a document with broken indices.
300       Like most other PDF parsers seem to do, we still try to salvage the situation.
301       NOTE: This is probably not spec-compliant behavior.*/
302    size_t first_valid_index = 0;
303    while (m_xref_table->byte_offset_for_object(first_valid_index) == invalid_byte_offset)
304        first_valid_index++;
305
306    if (first_valid_index) {
307        auto& entries = m_xref_table->entries();
308
309        bool need_to_rebuild_table = true;
310        for (size_t i = first_valid_index; i < entries.size(); ++i) {
311            if (!entries[i].in_use)
312                continue;
313
314            size_t actual_object_number = 0;
315            if (entries[i].compressed) {
316                auto object_stream_index = m_xref_table->object_stream_for_object(i);
317                auto stream_offset = m_xref_table->byte_offset_for_object(object_stream_index);
318                m_reader.move_to(stream_offset);
319                auto first_number = TRY(parse_number());
320                actual_object_number = first_number.get_u32();
321            } else {
322                auto byte_offset = m_xref_table->byte_offset_for_object(i);
323                m_reader.move_to(byte_offset);
324                auto indirect_value = TRY(parse_indirect_value());
325                actual_object_number = indirect_value->index();
326            }
327
328            if (actual_object_number != i - first_valid_index) {
329                /* Our suspicion was wrong, not all object numbers are shifted equally.
330                   This could mean that the document is hopelessly broken, or it just
331                   starts at a non-zero object index for some reason. */
332                need_to_rebuild_table = false;
333                break;
334            }
335        }
336
337        if (need_to_rebuild_table) {
338            warnln("Broken xref table detected, trying to fix it.");
339            entries.remove(0, first_valid_index);
340        }
341    }
342
343    return {};
344}
345
346PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream()
347{
348    auto first_number = TRY(parse_number());
349    auto second_number = TRY(parse_number());
350
351    if (!m_reader.matches("obj"))
352        return error("Malformed xref object");
353    m_reader.move_by(3);
354    if (m_reader.matches_eol())
355        m_reader.consume_eol();
356
357    auto dict = TRY(parse_dict());
358    auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name();
359    if (type != "XRef")
360        return error("Malformed xref dictionary");
361
362    auto field_sizes = TRY(dict->get_array(m_document, "W"));
363    if (field_sizes->size() != 3)
364        return error("Malformed xref dictionary");
365
366    auto highest_object_number = dict->get_value("Size").get<int>() - 1;
367
368    Vector<Tuple<int, int>> subsections;
369    if (dict->contains(CommonNames::Index)) {
370        auto index_array = TRY(dict->get_array(m_document, CommonNames::Index));
371        if (index_array->size() % 2 != 0)
372            return error("Malformed xref dictionary");
373
374        for (size_t i = 0; i < index_array->size(); i += 2)
375            subsections.append({ index_array->at(i).get<int>(), index_array->at(i + 1).get<int>() - 1 });
376    } else {
377        subsections.append({ 0, highest_object_number });
378    }
379    auto stream = TRY(parse_stream(dict));
380    auto table = adopt_ref(*new XRefTable());
381
382    auto field_to_long = [](ReadonlyBytes field) -> long {
383        long value = 0;
384        const u8 max = (field.size() - 1) * 8;
385        for (size_t i = 0; i < field.size(); ++i) {
386            value |= static_cast<long>(field[i]) << (max - (i * 8));
387        }
388        return value;
389    };
390
391    size_t byte_index = 0;
392    size_t subsection_index = 0;
393
394    Vector<XRefEntry> entries;
395
396    for (int entry_index = 0; subsection_index < subsections.size(); ++entry_index) {
397        Array<long, 3> fields;
398        for (size_t field_index = 0; field_index < 3; ++field_index) {
399            auto field_size = field_sizes->at(field_index).get_u32();
400
401            if (byte_index + field_size > stream->bytes().size())
402                return error("The xref stream data cut off early");
403
404            auto field = stream->bytes().slice(byte_index, field_size);
405            fields[field_index] = field_to_long(field);
406            byte_index += field_size;
407        }
408
409        u8 type = fields[0];
410        if (!field_sizes->at(0).get_u32())
411            type = 1;
412
413        entries.append({ fields[1], static_cast<u16>(fields[2]), type != 0, type == 2 });
414
415        auto subsection = subsections[subsection_index];
416        if (entry_index >= subsection.get<1>()) {
417            table->add_section({ subsection.get<0>(), subsection.get<1>(), entries });
418            entries.clear();
419            subsection_index++;
420        }
421    }
422
423    table->set_trailer(dict);
424
425    return table;
426}
427
428PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()
429{
430    if (!m_reader.matches("xref")) {
431        // Since version 1.5, there may be a cross-reference stream instead
432        return parse_xref_stream();
433    }
434
435    m_reader.move_by(4);
436    if (!m_reader.consume_eol())
437        return error("Expected newline after \"xref\"");
438
439    auto table = adopt_ref(*new XRefTable());
440
441    while (m_reader.matches_number()) {
442        Vector<XRefEntry> entries;
443
444        auto starting_index_value = TRY(parse_number());
445        auto starting_index = starting_index_value.get<int>();
446        auto object_count_value = TRY(parse_number());
447        auto object_count = object_count_value.get<int>();
448
449        for (int i = 0; i < object_count; i++) {
450            auto offset_string = DeprecatedString(m_reader.bytes().slice(m_reader.offset(), 10));
451            m_reader.move_by(10);
452            if (!m_reader.consume(' '))
453                return error("Malformed xref entry");
454
455            auto generation_string = DeprecatedString(m_reader.bytes().slice(m_reader.offset(), 5));
456            m_reader.move_by(5);
457            if (!m_reader.consume(' '))
458                return error("Malformed xref entry");
459
460            auto letter = m_reader.read();
461            if (letter != 'n' && letter != 'f')
462                return error("Malformed xref entry");
463
464            // The line ending sequence can be one of the following:
465            // SP CR, SP LF, or CR LF
466            if (m_reader.matches(' ')) {
467                m_reader.consume();
468                auto ch = m_reader.consume();
469                if (ch != '\r' && ch != '\n')
470                    return error("Malformed xref entry");
471            } else {
472                if (!m_reader.matches("\r\n"))
473                    return error("Malformed xref entry");
474                m_reader.move_by(2);
475            }
476
477            auto offset = strtol(offset_string.characters(), nullptr, 10);
478            auto generation = strtol(generation_string.characters(), nullptr, 10);
479
480            entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
481        }
482
483        table->add_section({ starting_index, object_count, entries });
484    }
485
486    m_reader.consume_whitespace();
487    if (m_reader.matches("trailer"))
488        table->set_trailer(TRY(parse_file_trailer()));
489
490    return table;
491}
492
493PDFErrorOr<NonnullRefPtr<DictObject>> DocumentParser::parse_file_trailer()
494{
495    while (m_reader.matches_eol())
496        m_reader.consume_eol();
497
498    if (!m_reader.matches("trailer"))
499        return error("Expected \"trailer\" keyword");
500    m_reader.move_by(7);
501    m_reader.consume_whitespace();
502    auto dict = TRY(parse_dict());
503
504    if (!m_reader.matches("startxref"))
505        return error("Expected \"startxref\"");
506    m_reader.move_by(9);
507    m_reader.consume_whitespace();
508
509    m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
510    VERIFY(m_reader.consume_eol());
511    if (!m_reader.matches("%%EOF"))
512        return error("Expected \"%%EOF\"");
513
514    m_reader.move_by(5);
515    m_reader.consume_whitespace();
516    return dict;
517}
518
519PDFErrorOr<Value> DocumentParser::parse_compressed_object_with_index(u32 index)
520{
521    auto object_stream_index = m_xref_table->object_stream_for_object(index);
522    auto stream_offset = m_xref_table->byte_offset_for_object(object_stream_index);
523
524    m_reader.move_to(stream_offset);
525
526    auto first_number = TRY(parse_number());
527    auto second_number = TRY(parse_number());
528
529    if (first_number.get<int>() != object_stream_index)
530        return error("Mismatching object stream index");
531    if (second_number.get<int>() != 0)
532        return error("Non-zero object stream generation number");
533
534    if (!m_reader.matches("obj"))
535        return error("Malformed object stream");
536    m_reader.move_by(3);
537    if (m_reader.matches_eol())
538        m_reader.consume_eol();
539
540    auto dict = TRY(parse_dict());
541    auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name();
542    if (type != "ObjStm")
543        return error("Invalid object stream type");
544
545    auto object_count = dict->get_value("N").get_u32();
546    auto first_object_offset = dict->get_value("First").get_u32();
547
548    auto stream = TRY(parse_stream(dict));
549    Parser stream_parser(m_document, stream->bytes());
550
551    for (u32 i = 0; i < object_count; ++i) {
552        auto object_number = TRY(stream_parser.parse_number());
553        auto object_offset = TRY(stream_parser.parse_number());
554
555        if (object_number.get_u32() == index) {
556            stream_parser.move_to(first_object_offset + object_offset.get_u32());
557            break;
558        }
559    }
560
561    return TRY(stream_parser.parse_value());
562}
563
564PDFErrorOr<DocumentParser::PageOffsetHintTable> DocumentParser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes)
565{
566    if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable))
567        return error("Hint stream is too small");
568
569    size_t offset = 0;
570
571    auto read_u32 = [&] {
572        u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0];
573        offset += 4;
574        return AK::convert_between_host_and_big_endian(data);
575    };
576
577    auto read_u16 = [&] {
578        u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0];
579        offset += 2;
580        return AK::convert_between_host_and_big_endian(data);
581    };
582
583    PageOffsetHintTable hint_table {
584        read_u32(),
585        read_u32(),
586        read_u16(),
587        read_u32(),
588        read_u16(),
589        read_u32(),
590        read_u16(),
591        read_u32(),
592        read_u16(),
593        read_u16(),
594        read_u16(),
595        read_u16(),
596        read_u16(),
597    };
598
599    // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric
600    // fields in PageOffsetHintTableEntry are u32
601    VERIFY(hint_table.bits_required_for_object_number <= 32);
602    VERIFY(hint_table.bits_required_for_page_length <= 32);
603    VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32);
604    VERIFY(hint_table.bits_required_for_content_stream_length <= 32);
605    VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32);
606    VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32);
607    VERIFY(hint_table.bits_required_for_fraction_numerator <= 32);
608
609    return hint_table;
610}
611
612PDFErrorOr<Vector<DocumentParser::PageOffsetHintTableEntry>> DocumentParser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes)
613{
614    auto input_stream = TRY(try_make<FixedMemoryStream>(hint_stream_bytes));
615    TRY(input_stream->seek(sizeof(PageOffsetHintTable)));
616
617    LittleEndianInputBitStream bit_stream { move(input_stream) };
618
619    auto number_of_pages = m_linearization_dictionary.value().number_of_pages;
620    Vector<PageOffsetHintTableEntry> entries;
621    for (size_t i = 0; i < number_of_pages; i++)
622        entries.append(PageOffsetHintTableEntry {});
623
624    auto bits_required_for_object_number = hint_table.bits_required_for_object_number;
625    auto bits_required_for_page_length = hint_table.bits_required_for_page_length;
626    auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets;
627    auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length;
628    auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs;
629    auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier;
630    auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator;
631
632    auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) -> ErrorOr<void> {
633        if (bit_size <= 0)
634            return {};
635
636        for (int i = 0; i < number_of_pages; i++) {
637            auto& entry = entries[i];
638            entry.*field = TRY(bit_stream.read_bits(bit_size));
639        }
640
641        return {};
642    };
643
644    auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) -> ErrorOr<void> {
645        if (bit_size <= 0)
646            return {};
647
648        for (int page = 1; page < number_of_pages; page++) {
649            auto number_of_shared_objects = entries[page].number_of_shared_objects;
650            Vector<u32> items;
651            items.ensure_capacity(number_of_shared_objects);
652
653            for (size_t i = 0; i < number_of_shared_objects; i++)
654                items.unchecked_append(TRY(bit_stream.read_bits(bit_size)));
655
656            entries[page].*field = move(items);
657        }
658
659        return {};
660    };
661
662    TRY(parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number));
663    TRY(parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length));
664    TRY(parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs));
665    TRY(parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier));
666    TRY(parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator));
667    TRY(parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets));
668    TRY(parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length));
669
670    return entries;
671}
672
673bool DocumentParser::navigate_to_before_eof_marker()
674{
675    m_reader.set_reading_backwards();
676
677    while (!m_reader.done()) {
678        m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
679        if (m_reader.done())
680            return false;
681
682        m_reader.consume_eol();
683        if (!m_reader.matches("%%EOF"))
684            continue;
685
686        m_reader.move_by(5);
687        if (!m_reader.matches_eol())
688            continue;
689        m_reader.consume_eol();
690        return true;
691    }
692
693    return false;
694}
695
696bool DocumentParser::navigate_to_after_startxref()
697{
698    m_reader.set_reading_backwards();
699
700    while (!m_reader.done()) {
701        m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
702        auto offset = m_reader.offset() + 1;
703
704        m_reader.consume_eol();
705        if (!m_reader.matches("startxref"))
706            continue;
707
708        m_reader.move_by(9);
709        if (!m_reader.matches_eol())
710            continue;
711
712        m_reader.move_to(offset);
713        return true;
714    }
715
716    return false;
717}
718
719PDFErrorOr<RefPtr<DictObject>> DocumentParser::conditionally_parse_page_tree_node(u32 object_index)
720{
721    auto dict_value = TRY(parse_object_with_index(object_index));
722    auto dict_object = dict_value.get<NonnullRefPtr<Object>>();
723    if (!dict_object->is<DictObject>())
724        return error(DeprecatedString::formatted("Invalid page tree with xref index {}", object_index));
725
726    auto dict = dict_object->cast<DictObject>();
727    if (!dict->contains_any_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count))
728        // This is a page, not a page tree node
729        return RefPtr<DictObject> {};
730
731    if (!dict->contains(CommonNames::Type))
732        return RefPtr<DictObject> {};
733    auto type_object = TRY(dict->get_object(m_document, CommonNames::Type));
734    if (!type_object->is<NameObject>())
735        return RefPtr<DictObject> {};
736    auto type_name = type_object->cast<NameObject>();
737    if (type_name->name() != CommonNames::Pages)
738        return RefPtr<DictObject> {};
739
740    return dict;
741}
742
743}
744
745namespace AK {
746
747template<>
748struct Formatter<PDF::DocumentParser::LinearizationDictionary> : Formatter<StringView> {
749    ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::LinearizationDictionary const& dict)
750    {
751        StringBuilder builder;
752        builder.append("{\n"sv);
753        builder.appendff("  length_of_file={}\n", dict.length_of_file);
754        builder.appendff("  primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset);
755        builder.appendff("  primary_hint_stream_length={}\n", dict.primary_hint_stream_length);
756        builder.appendff("  overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset);
757        builder.appendff("  overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length);
758        builder.appendff("  first_page_object_number={}\n", dict.first_page_object_number);
759        builder.appendff("  offset_of_first_page_end={}\n", dict.offset_of_first_page_end);
760        builder.appendff("  number_of_pages={}\n", dict.number_of_pages);
761        builder.appendff("  offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table);
762        builder.appendff("  first_page={}\n", dict.first_page);
763        builder.append('}');
764        return Formatter<StringView>::format(format_builder, builder.to_deprecated_string());
765    }
766};
767
768template<>
769struct Formatter<PDF::DocumentParser::PageOffsetHintTable> : Formatter<StringView> {
770    ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::PageOffsetHintTable const& table)
771    {
772        StringBuilder builder;
773        builder.append("{\n"sv);
774        builder.appendff("  least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page);
775        builder.appendff("  location_of_first_page_object={}\n", table.location_of_first_page_object);
776        builder.appendff("  bits_required_for_object_number={}\n", table.bits_required_for_object_number);
777        builder.appendff("  least_length_of_a_page={}\n", table.least_length_of_a_page);
778        builder.appendff("  bits_required_for_page_length={}\n", table.bits_required_for_page_length);
779        builder.appendff("  least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream);
780        builder.appendff("  bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets);
781        builder.appendff("  least_content_stream_length={}\n", table.least_content_stream_length);
782        builder.appendff("  bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length);
783        builder.appendff("  bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs);
784        builder.appendff("  bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier);
785        builder.appendff("  bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator);
786        builder.appendff("  shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator);
787        builder.append('}');
788        return Formatter<StringView>::format(format_builder, builder.to_deprecated_string());
789    }
790};
791
792template<>
793struct Formatter<PDF::DocumentParser::PageOffsetHintTableEntry> : Formatter<StringView> {
794    ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::PageOffsetHintTableEntry const& entry)
795    {
796        StringBuilder builder;
797        builder.append("{\n"sv);
798        builder.appendff("  objects_in_page_number={}\n", entry.objects_in_page_number);
799        builder.appendff("  page_length_number={}\n", entry.page_length_number);
800        builder.appendff("  number_of_shared_objects={}\n", entry.number_of_shared_objects);
801        builder.append("  shared_object_identifiers=["sv);
802        for (auto& identifier : entry.shared_object_identifiers)
803            builder.appendff(" {}", identifier);
804        builder.append(" ]\n"sv);
805        builder.append("  shared_object_location_numerators=["sv);
806        for (auto& numerator : entry.shared_object_location_numerators)
807            builder.appendff(" {}", numerator);
808        builder.append(" ]\n"sv);
809        builder.appendff("  page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number);
810        builder.appendff("  page_content_stream_length_number={}\n", entry.page_content_stream_length_number);
811        builder.append('}');
812        return Formatter<StringView>::format(format_builder, builder.to_deprecated_string());
813    }
814};
815
816}