//! Gumbo HTML5 parser packaged for Zig.
1const std = @import("std");
2const Allocator = std.mem.Allocator;
3
/// Private namespace for the raw extern declarations and C-layout structs that
/// the rest of this file wraps instead of exposing directly.
const internal = struct {
    /// Allocation callback type; `allyAlloc` in this file is the
    /// implementation installed by `parse` (context pointer, byte count).
    const AllocFunc = *const fn (?*anyopaque, usize) callconv(.c) ?*anyopaque;

    /// Deallocation callback type; `allyFree` in this file is the
    /// implementation installed by `parse` (context pointer, pointer to free).
    const FreeFunc = *const fn (?*anyopaque, ?*anyopaque) callconv(.c) void;

    /// Options block handed to the C parser: the two memory callbacks, the
    /// opaque pointer forwarded to them, and the user-facing parse settings.
    const Options = extern struct {
        alloc: AllocFunc,
        free: FreeFunc,
        userdata: ?*anyopaque,
        parse_options: ParseOptions,
    };

    /// Result of a parse as returned by `gumbo_parse_with_options`.
    const Output = extern struct {
        document: *Node,
        root: *Node,
        errors: Vector(Error),
    };

    extern "c" fn gumbo_parse_with_options(*const Options, [*]const u8, usize) ?*Output;
    extern "c" fn gumbo_destroy_output(*const Options, *Output) void;

    extern "c" fn gumbo_get_attribute(*const Vector(Attribute), [*:0]const u8) ?*Attribute;
};
28
/// Element type of `Output.errors`. Declared without fields, so values of this
/// type are only ever handled by pointer and never inspected from Zig.
pub const Error = extern struct {};
30
/// A pointer-plus-length view of bytes, laid out for the C ABI.
pub const StringPiece = extern struct {
    /// First byte of the view, or null when there is none.
    data: ?[*]const u8,
    /// Number of bytes in the view.
    length: usize,

    /// Returns the view as a Zig slice; a null `data` yields an empty slice.
    pub fn slice(self: @This()) []const u8 {
        if (self.data) |bytes| {
            return bytes[0..self.length];
        }
        return &.{};
    }
};
40
/// A location in the parsed input, laid out for the C ABI.
pub const SourcePosition = extern struct {
    /// Line of the position.
    line: c_uint,
    /// Column of the position.
    column: c_uint,
    /// Offset of the position — presumably a byte offset into the input;
    /// TODO confirm against the C header.
    offset: c_uint,
};
46
/// C-ABI growable array of pointers to `T`.
pub fn Vector(comptime T: type) type {
    return extern struct {
        const Self = @This();

        // TODO items should probably be `?*T`
        /// Backing storage, or null when nothing has been stored.
        data: ?[*]*T,
        /// Number of live entries in `data`.
        length: c_uint,
        /// Number of entries `data` has room for.
        capacity: c_uint,

        /// Returns the live entries as a slice. A null `data` is only valid
        /// together with a zero `length` (asserted) and yields an empty slice.
        pub fn slice(self: Self) []const *T {
            if (self.data) |items| {
                return items[0..self.length];
            }

            std.debug.assert(self.length == 0);
            return &.{};
        }
    };
}
65
/// A single attribute attached to an element, laid out for the C ABI.
pub const Attribute = extern struct {
    const Self = @This();

    /// Namespace an attribute can belong to.
    pub const Namespace = enum(c_int) { none, xlink, xml, xmlns };

    namespace: Namespace,
    /// NUL-terminated attribute name; prefer `name()`.
    raw_name: [*:0]const u8,
    original_name: StringPiece,
    /// NUL-terminated attribute value; prefer `value()`.
    raw_value: [*:0]const u8,
    original_value: StringPiece,
    name_start: SourcePosition,
    name_end: SourcePosition,
    value_start: SourcePosition,
    value_end: SourcePosition,

    /// Returns `raw_name` as a sentinel-terminated slice.
    pub fn name(self: Self) [:0]const u8 {
        return std.mem.sliceTo(self.raw_name, 0);
    }

    /// Returns `raw_value` as a sentinel-terminated slice.
    pub fn value(self: Self) [:0]const u8 {
        return std.mem.sliceTo(self.raw_value, 0);
    }
};
88
/// A node of the parsed tree, laid out for the C ABI.
///
/// The payload lives in the untagged `data` union and is discriminated by
/// `type`; use `get()` to obtain it as a tagged union.
pub const Node = extern struct {
    const Self = @This();

    /// Element tag identifiers, numbered consecutively from zero in the order
    /// listed here.
    // zig fmt: off
    pub const Tag = enum(c_int) {
        html, head, title, base, link, meta, style, script,
        noscript, template, body, article, section, nav, aside, h1,
        h2, h3, h4, h5, h6, hgroup, header, footer,
        address, p, hr, pre, blockquote, ol, ul, li,
        dl, dt, dd, figure, figcaption, main, div, a,
        em, strong, small, s, cite, q, dfn, abbr,
        data, time, code, @"var", samp, kbd, sub, sup,
        i, b, u, mark, ruby, rt, rp, bdi,
        bdo, span, br, wbr, ins, del, image, img,
        iframe, embed, object, param, video, audio, source, track,
        canvas, map, area, math, mi, mo, mn, ms,
        mtext, mglyph, malignmark, annotation_xml, svg, foreignobject, desc, table,
        caption, colgroup, col, tbody, thead, tfoot, tr, td,
        th, form, fieldset, legend, label, input, button, select,
        datalist, optgroup, option, textarea, keygen, output, progress, meter,
        details, summary, menu, menuitem, applet, acronym, bgsound, dir,
        frame, frameset, noframes, isindex, listing, xmp, nextid, noembed,
        plaintext, rb, strike, basefont, big, blink, center, font,
        marquee, multicol, nobr, spacer, tt, rtc, unknown_tag, last_tag
    };
    // zig fmt: on

    /// Namespace an element can belong to.
    pub const Namespace = enum(c_int) { html, svg, mathml };

    /// Discriminant for the `data` union.
    pub const Type = enum(c_int) {
        document,
        element,
        text,
        cdata,
        comment,
        whitespace,
        template,
    };

    /// Bit flags describing how the parser inserted a node.
    ///
    /// gumbo.h assigns these flags the values `1 << 0`, `1 << 1`, then
    /// `1 << 3` onwards; `1 << 2` belonged to a flag that was removed
    /// upstream. The placeholder bit keeps every later field on the bit the C
    /// library actually sets.
    pub const ParseFlags = packed struct(u32) {
        by_parser: bool,
        implicit_end_tag: bool,
        /// Unassigned bit (`1 << 2`); not a real flag.
        _unused_bit2: bool = false,
        implied: bool,
        converted_from_end_tag: bool,
        is_index: bool,
        from_image: bool,
        reconstructed_formatting_element: bool,
        adoption_agency_closed: bool,
        adoption_agency_moved: bool,
        foster_parented: bool,
        _: u21,
    };

    /// Payload of a document node. Fields are not yet declared.
    pub const Document = extern struct {
        // TODO
    };

    /// Payload of an element or template node.
    pub const Element = extern struct {
        children: Vector(Self),
        tag: Tag,
        namespace: Namespace,
        original_tag: StringPiece,
        original_end_tag: StringPiece,
        start_pos: SourcePosition,
        end_pos: SourcePosition,
        attributes: Vector(Attribute),

        /// Looks `name` up in this element's attributes via
        /// `gumbo_get_attribute`; returns whatever that call returns, which is
        /// null when it finds nothing.
        pub fn getAttr(self: *const @This(), name: [:0]const u8) ?*Attribute {
            return internal.gumbo_get_attribute(&self.attributes, name.ptr);
        }
    };

    /// Payload of text, cdata, comment and whitespace nodes.
    pub const Text = extern struct {
        raw_text: [*:0]const u8,
        original_text: StringPiece,
        start_pos: SourcePosition,

        /// Returns `original_text` as a slice.
        pub fn text(t: @This()) []const u8 {
            return t.original_text.slice();
        }
    };

    /// Tagged view of a node's payload, as produced by `get()`.
    pub const Data = union(Type) {
        document: Document,
        element: Element,
        text: Text,
        cdata: Text,
        comment: Text,
        whitespace: Text,
        template: Element,
    };

    /// Which member of `data` is valid.
    type: Type,
    /// Parent node, or null when there is none.
    parent: ?*Self,
    /// Index of this node in its parent's child list.
    index_within_parent: usize,
    parse_flags: ParseFlags,
    /// Untagged payload; read it through `get()`.
    data: extern union {
        document: Document,
        element: Element,
        text: Text,
    },

    /// Copies the payload out of `data` into a tagged union selected by
    /// `type`. Element and template nodes share the `Element` payload; every
    /// remaining non-document type shares the `Text` payload.
    pub fn get(self: Self) Data {
        return switch (self.type) {
            .document => .{ .document = self.data.document },
            inline .element, .template => |tag| @unionInit(Data, @tagName(tag), self.data.element),
            inline else => |tag| @unionInit(Data, @tagName(tag), self.data.text),
        };
    }

    /// Writes a short, single-node rendering to `w`: an opening tag with its
    /// attributes for elements/templates, the text for text and comment
    /// nodes, and `{type}` for anything else. Children are not written.
    pub fn format(self: Self, w: *std.Io.Writer) std.Io.Writer.Error!void {
        switch (self.get()) {
            .element, .template => |el| {
                try w.print("<{s}", .{@tagName(el.tag)});
                for (el.attributes.slice()) |attr| {
                    try w.print(" {s}=\"{s}\"", .{ attr.name(), attr.value() });
                }
                try w.writeByte('>');
            },
            .comment => |t| {
                // `text` is a method, so it has to be called.
                // NOTE(review): `text()` returns `original_text`; if that span
                // already contains the comment delimiters they will appear
                // twice here — confirm against gumbo.h.
                try w.print("<!-- {s} -->", .{t.text()});
            },
            .text => |t| {
                try w.print("{s}", .{t.text()});
            },
            else => |tag| {
                try w.print("{{{s}}}", .{@tagName(tag)});
            },
        }
    }

    /// Pre-order walk over `base` and its descendants.
    pub const Iterator = struct {
        /// Root of the walk; traversal never climbs above it.
        base: *Self,
        /// Node that the next call to `next` will return, or null when done.
        node: ?*Self,

        fn init(base: *Self) @This() {
            return .{ .base = base, .node = base };
        }

        /// Returns the current node and advances to its successor: first
        /// child if any, otherwise the next sibling of the nearest ancestor
        /// (up to and including children of `base`) that has one. Returns
        /// null once the walk is exhausted.
        pub fn next(self: *@This()) ?*Self {
            const node = self.node orelse return null;

            // try to traverse children
            switch (node.get()) {
                // TODO handle document
                .element, .template => |element| {
                    const children = element.children.slice();
                    if (children.len > 0) {
                        // TODO handle null?
                        self.node = children[0];
                        return node;
                    }
                },
                else => {},
            }

            if (node != self.base) {
                // try to get next child of parent (or parent of parent, etc.)
                var trav = node;
                while (trav.parent) |parent| : (trav = parent) {
                    const siblings = switch (parent.get()) {
                        .document => break,
                        .element, .template => |element| element.children.slice(),
                        // only nodes with a child list can be a parent
                        else => unreachable,
                    };

                    // Saturating add: neither an empty sibling list nor an
                    // out-of-range index can wrap the comparison.
                    const next_index = trav.index_within_parent +| 1;
                    if (next_index < siblings.len) {
                        self.node = siblings[next_index];
                        return node;
                    }

                    if (parent == self.base) break;
                }
            }

            // no nodes left
            self.node = null;
            return node;
        }
    };

    /// iterate through all elements in a tree
    pub fn iter(base: *Self) Iterator {
        return .init(base);
    }

    /// Walks a tree like `Iterator` but yields only nodes accepted by `pred`.
    pub const QueryIterator = struct {
        nodes: Iterator,
        /// Opaque pointer forwarded to `pred` on every call.
        ctx: ?*anyopaque,
        pred: *const QueryPredicate,

        fn init(base: *Self, ctx: ?*anyopaque, pred: *const QueryPredicate) @This() {
            return .{
                .nodes = .init(base),
                .ctx = ctx,
                .pred = pred,
            };
        }

        /// Returns the next node for which `pred(ctx, node)` is true, or null
        /// when the underlying walk is exhausted.
        pub fn next(self: *@This()) ?*Self {
            while (self.nodes.next()) |node| {
                if (self.pred(self.ctx, node)) {
                    return node;
                }
            }

            return null;
        }
    };

    /// Filter callback for `query`: receives the caller's `ctx` and a
    /// candidate node, returns true to keep it.
    pub const QueryPredicate = fn (ctx: ?*anyopaque, node: *Self) bool;

    /// filter nodes from a tree of elements
    pub fn query(base: *Self, ctx: ?*anyopaque, pred: *const QueryPredicate) QueryIterator {
        return .init(base, ctx, pred);
    }
};
308
/// User-facing parser settings, embedded in the options block passed to the C
/// parser. Every field has a default, so `.{}` is a valid value.
pub const ParseOptions = extern struct {
    /// Tab stop width; defaults to 4.
    tab_stop: c_int = 4,
    /// Defaults to false — presumably makes the parser stop at the first
    /// error when set; confirm against the C header.
    stop_on_first_error: bool = false,
    /// Defaults to -1 — presumably meaning "no limit"; confirm against the C
    /// header.
    max_errors: c_int = -1,
    /// Defaults to `.last_tag` — presumably meaning "not a fragment parse";
    /// confirm against the C header.
    fragment_context: Node.Tag = .last_tag,
    /// Namespace paired with `fragment_context`; defaults to `.html`.
    fragment_namespace: Node.Namespace = .html,
};
316
/// Owning handle for a parse result. Created by `parse`; release it with
/// `deinit`.
pub const Tree = struct {
    const Self = @This();

    /// Options the tree was parsed with; `userdata` points at the
    /// heap-allocated `Allocator` created in `parse`.
    options: internal.Options,
    /// Parser output owned by this tree.
    output: *internal.Output,

    /// Returns the document node of the parse output.
    pub fn document(self: Self) *Node {
        return self.output.document;
    }

    /// Returns the root node of the parse output.
    pub fn root(self: Self) *Node {
        return self.output.root;
    }

    /// Destroys the parser output, then frees the boxed allocator that
    /// `options.userdata` points at.
    pub fn deinit(self: Self) void {
        internal.gumbo_destroy_output(&self.options, self.output);

        const boxed: *Allocator = @ptrCast(@alignCast(self.options.userdata.?));
        // Copy the interface out first: the box it lives in is about to be
        // freed through it.
        const backing = boxed.*;
        backing.destroy(boxed);
    }
};
339
/// Allocation callback for Gumbo, backed by a Zig allocator interface.
///
/// `ctx` must point at the `Allocator` to use. Gumbo's `free` callback is not
/// given a size, so each allocation is prefixed with an 8-byte header storing
/// the requested length; the returned pointer is the first byte after that
/// header.
///
/// Returns null when the underlying allocation fails or when `nbytes` plus the
/// header would overflow `usize`.
fn allyAlloc(ctx: ?*anyopaque, nbytes: usize) callconv(.c) ?*anyopaque {
    const header_len = 8;

    const ally: *Allocator = @ptrCast(@alignCast(ctx.?));
    // Checked add: an overflow here must become a failed allocation, not a
    // trap inside a C callback.
    const total = std.math.add(usize, nbytes, header_len) catch return null;
    const slice = ally.alignedAlloc(u8, .@"8", total) catch return null;

    @as(*usize, @ptrCast(slice.ptr)).* = nbytes;
    return @ptrFromInt(@intFromPtr(slice.ptr) + header_len);
}
350
/// Deallocation callback for Gumbo, the counterpart of `allyAlloc`.
///
/// `ctx` must point at the `Allocator` the block was allocated with. A null
/// `mb_ptr` is ignored. Otherwise the 8-byte length header stored just before
/// `mb_ptr` is read to rebuild the full allocation, which is then freed.
fn allyFree(ctx: ?*anyopaque, mb_ptr: ?*anyopaque) callconv(.c) void {
    if (mb_ptr == null) return;

    // Step back over the header to the true start of the allocation.
    const base: [*]align(8) u8 = @ptrFromInt(@intFromPtr(mb_ptr.?) - 8);
    const payload_len = @as(*usize, @ptrCast(base)).*;

    const ally: *Allocator = @ptrCast(@alignCast(ctx.?));
    ally.free(base[0 .. payload_len + 8]);
}
360
/// Parses `text` with Gumbo, routing all of the parser's allocations through
/// `ally`.
///
/// A copy of `ally` is heap-allocated so the C callbacks have a stable pointer
/// to it; that copy is released by `Tree.deinit`, or here if parsing fails.
///
/// Returns `error.OutOfMemory` if the allocator copy cannot be created and
/// `error.ParseFailed` if `gumbo_parse_with_options` returns null.
pub fn parse(ally: Allocator, text: []const u8, options: ParseOptions) !Tree {
    const boxed_ally = try ally.create(Allocator);
    errdefer ally.destroy(boxed_ally);
    boxed_ally.* = ally;

    const gumbo_options = internal.Options{
        .alloc = allyAlloc,
        .free = allyFree,
        .userdata = boxed_ally,
        .parse_options = options,
    };

    if (internal.gumbo_parse_with_options(&gumbo_options, text.ptr, text.len)) |output| {
        return .{
            .options = gumbo_options,
            .output = output,
        };
    }

    return error.ParseFailed;
}
380}