//! Zig wrapper around the C Gumbo HTML parser (`gumbo_*` extern functions).
//!
//! `parse` produces a `Tree`; nodes of the tree are exposed as `Node` values
//! that can be walked with `Node.iter` / `Node.query`.

const std = @import("std");
const Allocator = std.mem.Allocator;

/// namespace for extern decls and data structures I want to wrap instead of use
/// directly
const internal = struct {
    const Output = extern struct {
        document: *Node,
        root: *Node,
        errors: Vector(Error),
    };

    const AllocFunc = *const fn (?*anyopaque, usize) callconv(.c) ?*anyopaque;
    const FreeFunc = *const fn (?*anyopaque, ?*anyopaque) callconv(.c) void;

    const Options = extern struct {
        alloc: AllocFunc,
        free: FreeFunc,
        userdata: ?*anyopaque,
        parse_options: ParseOptions,
    };

    extern "c" fn gumbo_get_attribute(*const Vector(Attribute), [*:0]const u8) ?*Attribute;
    extern "c" fn gumbo_parse_with_options(*const Options, [*]const u8, usize) ?*Output;
    extern "c" fn gumbo_destroy_output(*const Options, *Output) void;
};

/// Opaque placeholder for parser error records; no fields are exposed here.
pub const Error = extern struct {};

/// A (pointer, length) pair describing a byte range that is not
/// null-terminated.
pub const StringPiece = extern struct {
    data: ?[*]const u8,
    length: usize,

    /// View the piece as a Zig slice. A null `data` pointer yields an empty
    /// slice.
    pub fn slice(self: @This()) []const u8 {
        const data = self.data orelse return &.{};
        return data[0..self.length];
    }
};

/// A position expressed as line, column and offset.
pub const SourcePosition = extern struct {
    line: c_uint,
    column: c_uint,
    offset: c_uint,
};

/// C-layout vector of pointers to `T`.
pub fn Vector(comptime T: type) type {
    return extern struct {
        const Self = @This();

        // TODO items should probably be `?*T`
        data: ?[*]*T,
        length: c_uint,
        capacity: c_uint,

        /// View the first `length` entries as a slice. A null `data` pointer
        /// is only expected together with `length == 0` (asserted) and
        /// yields an empty slice.
        pub fn slice(self: Self) []const *T {
            const data = self.data orelse {
                std.debug.assert(self.length == 0);
                return &.{};
            };
            return data[0..self.length];
        }
    };
}

/// A single attribute of an element.
pub const Attribute = extern struct {
    const Self = @This();

    pub const Namespace = enum(c_int) { none, xlink, xml, xmlns };

    namespace: Namespace,
    raw_name: [*:0]const u8,
    original_name: StringPiece,
    raw_value: [*:0]const u8,
    original_value: StringPiece,
    name_start: SourcePosition,
    name_end: SourcePosition,
    value_start: SourcePosition,
    value_end: SourcePosition,

    /// The null-terminated `raw_name` as a sentinel slice.
    pub fn name(self: Self) [:0]const u8 {
        return std.mem.span(self.raw_name);
    }

    /// The null-terminated `raw_value` as a sentinel slice.
    pub fn value(self: Self) [:0]const u8 {
        return std.mem.span(self.raw_value);
    }
};

/// A node of the parsed tree. The active member of `data` is selected by
/// `type`; use `get` to obtain a tagged view instead of touching `data`
/// directly.
pub const Node = extern struct {
    const Self = @This();

    // zig fmt: off
    pub const Tag = enum(c_int) {
        html, head, title, base, link, meta, style, script, noscript, template,
        body, article, section, nav, aside, h1, h2, h3, h4, h5, h6, hgroup,
        header, footer, address, p, hr, pre, blockquote, ol, ul, li, dl, dt,
        dd, figure, figcaption, main, div, a, em, strong, small, s, cite, q,
        dfn, abbr, data, time, code, @"var", samp, kbd, sub, sup, i, b, u,
        mark, ruby, rt, rp, bdi, bdo, span, br, wbr, ins, del, image, img,
        iframe, embed, object, param, video, audio, source, track, canvas,
        map, area, math, mi, mo, mn, ms, mtext, mglyph, malignmark,
        annotation_xml, svg, foreignobject, desc, table, caption, colgroup,
        col, tbody, thead, tfoot, tr, td, th, form, fieldset, legend, label,
        input, button, select, datalist, optgroup, option, textarea, keygen,
        output, progress, meter, details, summary, menu, menuitem, applet,
        acronym, bgsound, dir, frame, frameset, noframes, isindex, listing,
        xmp, nextid, noembed, plaintext, rb, strike, basefont, big, blink,
        center, font, marquee, multicol, nobr, spacer, tt, rtc,
        unknown_tag, last_tag
    };
    // zig fmt: on

    pub const Namespace = enum(c_int) { html, svg, mathml };

    pub const Type = enum(c_int) {
        document,
        element,
        text,
        cdata,
        comment,
        whitespace,
        template,
    };

    /// Bit flags attached to a node; the trailing `_` field pads the struct
    /// to 32 bits.
    pub const ParseFlags = packed struct(u32) {
        by_parser: bool,
        implicit_end_tag: bool,
        implied: bool,
        converted_from_end_tag: bool,
        is_index: bool,
        from_image: bool,
        reconstructed_formatting_element: bool,
        adoption_agency_closed: bool,
        adoption_agency_moved: bool,
        foster_parented: bool,
        _: u22,
    };

    pub const Document = extern struct {
        // TODO
    };

    /// Payload of `.element` and `.template` nodes.
    pub const Element = extern struct {
        children: Vector(Self),
        tag: Tag,
        namespace: Namespace,
        original_tag: StringPiece,
        original_end_tag: StringPiece,
        start_pos: SourcePosition,
        end_pos: SourcePosition,
        attributes: Vector(Attribute),

        /// Look up an attribute by name via `gumbo_get_attribute`; returns
        /// null when that call returns null.
        pub fn getAttr(self: *const @This(), name: [:0]const u8) ?*Attribute {
            return internal.gumbo_get_attribute(&self.attributes, name.ptr);
        }
    };

    /// Payload of `.text`, `.cdata`, `.comment` and `.whitespace` nodes.
    pub const Text = extern struct {
        raw_text: [*:0]const u8,
        original_text: StringPiece,
        start_pos: SourcePosition,

        /// The `original_text` piece as a slice.
        pub fn text(t: @This()) []const u8 {
            return t.original_text.slice();
        }
    };

    /// Tagged view of a node's payload, as produced by `get`.
    pub const Data = union(Type) {
        document: Document,
        element: Element,
        text: Text,
        cdata: Text,
        comment: Text,
        whitespace: Text,
        template: Element,
    };

    type: Type,
    parent: ?*Self,
    index_within_parent: usize,
    parse_flags: ParseFlags,
    data: extern union {
        document: Document,
        element: Element,
        text: Text,
    },

    /// Copy the payload out of the untagged `data` union into a `Data` value
    /// tagged by `self.type`.
    pub fn get(self: Self) Data {
        return switch (self.type) {
            .document => .{ .document = self.data.document },
            // element-shaped payloads
            inline .element, .template => |tag| @unionInit(Data, @tagName(tag), self.data.element),
            // everything else is text-shaped
            inline else => |tag| @unionInit(Data, @tagName(tag), self.data.text),
        };
    }

    /// Write a short, human-readable rendering of this node (not its
    /// children):
    /// - elements/templates: `<tag attr="value" ...>`
    /// - comments: `<!--text-->`
    /// - text: the text itself
    /// - anything else: `{type_name}`
    pub fn format(self: Self, w: *std.Io.Writer) std.Io.Writer.Error!void {
        switch (self.get()) {
            .element, .template => |el| {
                try w.print("<{s}", .{@tagName(el.tag)});
                for (el.attributes.slice()) |attr| {
                    try w.print(" {s}=\"{s}\"", .{ attr.name(), attr.value() });
                }
                try w.writeByte('>');
            },
            // `text` is a method and must be called; the format string needs
            // a placeholder for its single argument.
            .comment => |t| try w.print("<!--{s}-->", .{t.text()}),
            .text => |t| try w.print("{s}", .{t.text()}),
            // `get` tags its result with `self.type`, so this is the active
            // tag's name.
            else => try w.print("{{{s}}}", .{@tagName(self.type)}),
        }
    }

    /// Pre-order traversal over `base` and its descendants.
    pub const Iterator = struct {
        // root of the traversal; iteration never climbs above it
        base: *Self,
        // node to be returned by the next call to `next`, null when exhausted
        node: ?*Self,

        fn init(base: *Self) @This() {
            return .{ .base = base, .node = base };
        }

        /// Return the current node and advance to its successor: first
        /// child if any, otherwise the next sibling of the nearest ancestor
        /// (at or below `base`) that has one. Returns null once exhausted.
        pub fn next(self: *@This()) ?*Self {
            const node = self.node orelse return null;

            // try to traverse children
            switch (node.get()) {
                // TODO handle document
                .element, .template => |element| {
                    const children = element.children.slice();
                    if (children.len > 0) {
                        // TODO handle null?
                        self.node = children[0];
                        return node;
                    }
                },
                else => {},
            }

            if (node != self.base) {
                // try to get next child of parent (or parent of parent, etc.)
                var trav = node;
                while (trav.parent) |parent| : (trav = parent) {
                    const siblings = switch (parent.get()) {
                        .document => break,
                        .element, .template => |element| element.children.slice(),
                        // only element-shaped nodes carry children
                        else => unreachable,
                    };

                    // written as `idx + 1 < len` so an empty vector cannot
                    // underflow
                    const next_index = trav.index_within_parent + 1;
                    if (next_index < siblings.len) {
                        self.node = siblings[next_index];
                        return node;
                    }

                    if (parent == self.base) break;
                }
            }

            // no nodes left
            self.node = null;
            return node;
        }
    };

    /// iterate through all elements in a tree
    pub fn iter(base: *Self) Iterator {
        return .init(base);
    }

    /// Iterator over the nodes of a tree for which a predicate returns true.
    pub const QueryIterator = struct {
        nodes: Iterator,
        // opaque user pointer handed to `pred` on every call
        ctx: ?*anyopaque,
        pred: *const QueryPredicate,

        fn init(base: *Self, ctx: ?*anyopaque, pred: *const QueryPredicate) @This() {
            return .{
                .nodes = .init(base),
                .ctx = ctx,
                .pred = pred,
            };
        }

        /// Advance to the next node accepted by the predicate, or null when
        /// the underlying traversal is exhausted.
        pub fn next(self: *@This()) ?*Self {
            while (self.nodes.next()) |node| {
                if (self.pred(self.ctx, node)) return node;
            }
            return null;
        }
    };

    /// Predicate used by `query`; receives the `ctx` pointer given to
    /// `query` and the candidate node.
    pub const QueryPredicate = fn (ctx: ?*anyopaque, node: *Self) bool;

    /// filter nodes from a tree of elements
    pub fn query(base: *Self, ctx: ?*anyopaque, pred: *const QueryPredicate) QueryIterator {
        return .init(base, ctx, pred);
    }
};

/// Options forwarded to the parser as part of `internal.Options`.
pub const ParseOptions = extern struct {
    tab_stop: c_int = 4,
    stop_on_first_error: bool = false,
    max_errors: c_int = -1,
    fragment_context: Node.Tag = .last_tag,
    fragment_namespace: Node.Namespace = .html,
};

/// Result of `parse`. Must be released with `deinit`.
pub const Tree = struct {
    const Self = @This();

    options: internal.Options,
    output: *internal.Output,

    /// Destroy the parser output, then free the heap-allocated `Allocator`
    /// copy that `parse` stored in `options.userdata`.
    pub fn deinit(self: Self) void {
        internal.gumbo_destroy_output(&self.options, self.output);

        const ally_ptr: *Allocator = @ptrCast(@alignCast(self.options.userdata.?));
        // copy the interface out first: it is used to free its own storage
        const ally = ally_ptr.*;
        ally.destroy(ally_ptr);
    }

    pub fn document(self: Self) *Node {
        return self.output.document;
    }

    pub fn root(self: Self) *Node {
        return self.output.root;
    }
};

// number of bytes reserved in front of every allocation to record its size
const alloc_header_size = 8;

/// allocates from a Zig allocator interface for Gumbo
///
/// allocations use an extra 8 bytes to store length info, since gumbo's `free`
/// function doesn't track size info
fn allyAlloc(ctx: ?*anyopaque, nbytes: usize) callconv(.c) ?*anyopaque {
    const ally: *Allocator = @ptrCast(@alignCast(ctx.?));

    // a request so large that the header does not fit is reported as failure
    const total = std.math.add(usize, nbytes, alloc_header_size) catch return null;
    const slice = ally.alignedAlloc(u8, .@"8", total) catch return null;

    // record the requested size in the header, hand out the bytes after it
    @as(*usize, @ptrCast(slice.ptr)).* = nbytes;
    return @ptrFromInt(@intFromPtr(slice.ptr) + alloc_header_size);
}

/// Counterpart of `allyAlloc`: recovers the size from the header in front of
/// the pointer and frees the full allocation. A null pointer is ignored.
fn allyFree(ctx: ?*anyopaque, mb_ptr: ?*anyopaque) callconv(.c) void {
    const fat_ptr = mb_ptr orelse return;

    const ptr: [*]align(8) u8 = @ptrFromInt(@intFromPtr(fat_ptr) - alloc_header_size);
    const nbytes = @as(*usize, @ptrCast(ptr)).*;
    const slice = ptr[0 .. nbytes + alloc_header_size];

    const ally: *Allocator = @ptrCast(@alignCast(ctx.?));
    ally.free(slice);
}

/// Parse `text` with the given options, routing the parser's allocations
/// through `ally`.
///
/// Returns `error.ParseFailed` when `gumbo_parse_with_options` returns null,
/// or an allocation error if the allocator copy cannot be created. The caller
/// owns the returned `Tree` and must call `Tree.deinit` on it.
pub fn parse(ally: Allocator, text: []const u8, options: ParseOptions) !Tree {
    // the allocator interface needs a stable address to travel through the
    // C `userdata` pointer; it is released in `Tree.deinit`
    const ally_ptr = try ally.create(Allocator);
    errdefer ally.destroy(ally_ptr);
    ally_ptr.* = ally;

    const it_options: internal.Options = .{
        .userdata = ally_ptr,
        .alloc = allyAlloc,
        .free = allyFree,
        .parse_options = options,
    };

    const output = internal.gumbo_parse_with_options(&it_options, text.ptr, text.len) orelse {
        return error.ParseFailed;
    };

    return .{
        .options = it_options,
        .output = output,
    };
}