OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* CSS selector lexer *)
2
3exception Selector_error of Selector_error_code.t
4
5type t = {
6 input : string;
7 len : int;
8 mutable pos : int;
9}
10
11let create input = { input; len = String.length input; pos = 0 }
12
13let peek t =
14 if t.pos < t.len then Some t.input.[t.pos]
15 else None
16
17let advance t =
18 if t.pos < t.len then t.pos <- t.pos + 1
19
20let consume t =
21 let c = peek t in
22 advance t;
23 c
24
25let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r' || c = '\x0C'
26
27let is_name_start c =
28 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || Char.code c > 127
29
30let is_name_char c =
31 is_name_start c || (c >= '0' && c <= '9')
32
33let skip_whitespace t =
34 while t.pos < t.len && is_whitespace t.input.[t.pos] do
35 advance t
36 done
37
38let read_name t =
39 let start = t.pos in
40 while t.pos < t.len && is_name_char t.input.[t.pos] do
41 advance t
42 done;
43 String.sub t.input start (t.pos - start)
44
45let read_string t quote =
46 advance t; (* Skip opening quote *)
47 let buf = Buffer.create 32 in
48 let rec loop () =
49 match peek t with
50 | None -> raise (Selector_error Selector_error_code.Unterminated_string)
51 | Some c when c = quote -> advance t
52 | Some '\\' ->
53 advance t;
54 (match peek t with
55 | Some c -> Buffer.add_char buf c; advance t; loop ()
56 | None -> raise (Selector_error Selector_error_code.Unterminated_escape))
57 | Some c ->
58 Buffer.add_char buf c;
59 advance t;
60 loop ()
61 in
62 loop ();
63 Buffer.contents buf
64
65let read_unquoted_attr_value t =
66 let start = t.pos in
67 while t.pos < t.len &&
68 let c = t.input.[t.pos] in
69 not (is_whitespace c) && c <> ']' do
70 advance t
71 done;
72 String.sub t.input start (t.pos - start)
73
74let tokenize input =
75 let t = create input in
76 let tokens = ref [] in
77 let pending_ws = ref false in
78
79 while t.pos < t.len do
80 let c = t.input.[t.pos] in
81
82 if is_whitespace c then begin
83 pending_ws := true;
84 skip_whitespace t
85 end else if c = '>' || c = '+' || c = '~' then begin
86 pending_ws := false;
87 advance t;
88 skip_whitespace t;
89 tokens := Selector_token.Combinator (String.make 1 c) :: !tokens
90 end else begin
91 if !pending_ws && !tokens <> [] && c <> ',' then
92 tokens := Selector_token.Combinator " " :: !tokens;
93 pending_ws := false;
94
95 match c with
96 | '*' ->
97 advance t;
98 tokens := Selector_token.Universal :: !tokens
99 | '#' ->
100 advance t;
101 let name = read_name t in
102 if name = "" then raise (Selector_error Selector_error_code.Expected_identifier_after_hash);
103 tokens := Selector_token.Id name :: !tokens
104 | '.' ->
105 advance t;
106 let name = read_name t in
107 if name = "" then raise (Selector_error Selector_error_code.Expected_identifier_after_dot);
108 tokens := Selector_token.Class name :: !tokens
109 | '[' ->
110 advance t;
111 tokens := Selector_token.Attr_start :: !tokens;
112 skip_whitespace t;
113 let attr_name = read_name t in
114 if attr_name = "" then raise (Selector_error Selector_error_code.Expected_attribute_name);
115 tokens := Selector_token.Tag attr_name :: !tokens;
116 skip_whitespace t;
117
118 (match peek t with
119 | Some ']' ->
120 advance t;
121 tokens := Selector_token.Attr_end :: !tokens
122 | Some '=' ->
123 advance t;
124 tokens := Selector_token.Attr_op "=" :: !tokens;
125 skip_whitespace t;
126 let value = match peek t with
127 | Some '"' -> read_string t '"'
128 | Some '\'' -> read_string t '\''
129 | _ -> read_unquoted_attr_value t
130 in
131 tokens := Selector_token.String value :: !tokens;
132 skip_whitespace t;
133 if peek t <> Some ']' then raise (Selector_error Selector_error_code.Expected_closing_bracket);
134 advance t;
135 tokens := Selector_token.Attr_end :: !tokens
136 | Some ('~' | '|' | '^' | '$' | '*') as op_char ->
137 let op_c = Option.get op_char in
138 advance t;
139 if peek t <> Some '=' then
140 raise (Selector_error (Selector_error_code.Expected_equals_after_operator op_c));
141 advance t;
142 tokens := Selector_token.Attr_op (String.make 1 op_c ^ "=") :: !tokens;
143 skip_whitespace t;
144 let value = match peek t with
145 | Some '"' -> read_string t '"'
146 | Some '\'' -> read_string t '\''
147 | _ -> read_unquoted_attr_value t
148 in
149 tokens := Selector_token.String value :: !tokens;
150 skip_whitespace t;
151 if peek t <> Some ']' then raise (Selector_error Selector_error_code.Expected_closing_bracket);
152 advance t;
153 tokens := Selector_token.Attr_end :: !tokens
154 | _ -> raise (Selector_error Selector_error_code.Unexpected_character_in_attribute_selector))
155
156 | ',' ->
157 advance t;
158 skip_whitespace t;
159 tokens := Selector_token.Comma :: !tokens
160 | ':' ->
161 advance t;
162 tokens := Selector_token.Colon :: !tokens;
163 let name = read_name t in
164 if name = "" then raise (Selector_error Selector_error_code.Expected_pseudo_class_name);
165 tokens := Selector_token.Tag name :: !tokens;
166
167 if peek t = Some '(' then begin
168 advance t;
169 tokens := Selector_token.Paren_open :: !tokens;
170 skip_whitespace t;
171 (* Read argument until closing paren *)
172 let depth = ref 1 in
173 let start = t.pos in
174 while !depth > 0 && t.pos < t.len do
175 match t.input.[t.pos] with
176 | '(' -> incr depth; advance t
177 | ')' -> decr depth; if !depth > 0 then advance t
178 | _ -> advance t
179 done;
180 let arg = String.trim (String.sub t.input start (t.pos - start)) in
181 if arg <> "" then tokens := Selector_token.String arg :: !tokens;
182 if peek t <> Some ')' then raise (Selector_error Selector_error_code.Expected_closing_paren);
183 advance t;
184 tokens := Selector_token.Paren_close :: !tokens
185 end
186 | _ when is_name_start c ->
187 let name = read_name t in
188 tokens := Selector_token.Tag (String.lowercase_ascii name) :: !tokens
189 | _ ->
190 raise (Selector_error (Selector_error_code.Unexpected_character c))
191 end
192 done;
193
194 tokens := Selector_token.EOF :: !tokens;
195 List.rev !tokens