OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 6.5 kB view raw
1(* CSS selector lexer *) 2 3exception Selector_error of Selector_error_code.t 4 5type t = { 6 input : string; 7 len : int; 8 mutable pos : int; 9} 10 11let create input = { input; len = String.length input; pos = 0 } 12 13let peek t = 14 if t.pos < t.len then Some t.input.[t.pos] 15 else None 16 17let advance t = 18 if t.pos < t.len then t.pos <- t.pos + 1 19 20let consume t = 21 let c = peek t in 22 advance t; 23 c 24 25let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r' || c = '\x0C' 26 27let is_name_start c = 28 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || Char.code c > 127 29 30let is_name_char c = 31 is_name_start c || (c >= '0' && c <= '9') 32 33let skip_whitespace t = 34 while t.pos < t.len && is_whitespace t.input.[t.pos] do 35 advance t 36 done 37 38let read_name t = 39 let start = t.pos in 40 while t.pos < t.len && is_name_char t.input.[t.pos] do 41 advance t 42 done; 43 String.sub t.input start (t.pos - start) 44 45let read_string t quote = 46 advance t; (* Skip opening quote *) 47 let buf = Buffer.create 32 in 48 let rec loop () = 49 match peek t with 50 | None -> raise (Selector_error Selector_error_code.Unterminated_string) 51 | Some c when c = quote -> advance t 52 | Some '\\' -> 53 advance t; 54 (match peek t with 55 | Some c -> Buffer.add_char buf c; advance t; loop () 56 | None -> raise (Selector_error Selector_error_code.Unterminated_escape)) 57 | Some c -> 58 Buffer.add_char buf c; 59 advance t; 60 loop () 61 in 62 loop (); 63 Buffer.contents buf 64 65let read_unquoted_attr_value t = 66 let start = t.pos in 67 while t.pos < t.len && 68 let c = t.input.[t.pos] in 69 not (is_whitespace c) && c <> ']' do 70 advance t 71 done; 72 String.sub t.input start (t.pos - start) 73 74let tokenize input = 75 let t = create input in 76 let tokens = ref [] in 77 let pending_ws = ref false in 78 79 while t.pos < t.len do 80 let c = t.input.[t.pos] in 81 82 if is_whitespace c then begin 83 pending_ws := true; 84 skip_whitespace t 85 end else if c = '>' || c = '+' || c = '~' then begin 86 pending_ws := false; 87 advance t; 88 skip_whitespace t; 89 tokens := Selector_token.Combinator (String.make 1 c) :: !tokens 90 end else begin 91 if !pending_ws && !tokens <> [] && c <> ',' then 92 tokens := Selector_token.Combinator " " :: !tokens; 93 pending_ws := false; 94 95 match c with 96 | '*' -> 97 advance t; 98 tokens := Selector_token.Universal :: !tokens 99 | '#' -> 100 advance t; 101 let name = read_name t in 102 if name = "" then raise (Selector_error Selector_error_code.Expected_identifier_after_hash); 103 tokens := Selector_token.Id name :: !tokens 104 | '.' -> 105 advance t; 106 let name = read_name t in 107 if name = "" then raise (Selector_error Selector_error_code.Expected_identifier_after_dot); 108 tokens := Selector_token.Class name :: !tokens 109 | '[' -> 110 advance t; 111 tokens := Selector_token.Attr_start :: !tokens; 112 skip_whitespace t; 113 let attr_name = read_name t in 114 if attr_name = "" then raise (Selector_error Selector_error_code.Expected_attribute_name); 115 tokens := Selector_token.Tag attr_name :: !tokens; 116 skip_whitespace t; 117 118 (match peek t with 119 | Some ']' -> 120 advance t; 121 tokens := Selector_token.Attr_end :: !tokens 122 | Some '=' -> 123 advance t; 124 tokens := Selector_token.Attr_op "=" :: !tokens; 125 skip_whitespace t; 126 let value = match peek t with 127 | Some '"' -> read_string t '"' 128 | Some '\'' -> read_string t '\'' 129 | _ -> read_unquoted_attr_value t 130 in 131 tokens := Selector_token.String value :: !tokens; 132 skip_whitespace t; 133 if peek t <> Some ']' then raise (Selector_error Selector_error_code.Expected_closing_bracket); 134 advance t; 135 tokens := Selector_token.Attr_end :: !tokens 136 | Some ('~' | '|' | '^' | '$' | '*') as op_char -> 137 let op_c = Option.get op_char in 138 advance t; 139 if peek t <> Some '=' then 140 raise (Selector_error (Selector_error_code.Expected_equals_after_operator op_c)); 141 advance t; 142 tokens := Selector_token.Attr_op (String.make 1 op_c ^ "=") :: !tokens; 143 skip_whitespace t; 144 let value = match peek t with 145 | Some '"' -> read_string t '"' 146 | Some '\'' -> read_string t '\'' 147 | _ -> read_unquoted_attr_value t 148 in 149 tokens := Selector_token.String value :: !tokens; 150 skip_whitespace t; 151 if peek t <> Some ']' then raise (Selector_error Selector_error_code.Expected_closing_bracket); 152 advance t; 153 tokens := Selector_token.Attr_end :: !tokens 154 | _ -> raise (Selector_error Selector_error_code.Unexpected_character_in_attribute_selector)) 155 156 | ',' -> 157 advance t; 158 skip_whitespace t; 159 tokens := Selector_token.Comma :: !tokens 160 | ':' -> 161 advance t; 162 tokens := Selector_token.Colon :: !tokens; 163 let name = read_name t in 164 if name = "" then raise (Selector_error Selector_error_code.Expected_pseudo_class_name); 165 tokens := Selector_token.Tag name :: !tokens; 166 167 if peek t = Some '(' then begin 168 advance t; 169 tokens := Selector_token.Paren_open :: !tokens; 170 skip_whitespace t; 171 (* Read argument until closing paren *) 172 let depth = ref 1 in 173 let start = t.pos in 174 while !depth > 0 && t.pos < t.len do 175 match t.input.[t.pos] with 176 | '(' -> incr depth; advance t 177 | ')' -> decr depth; if !depth > 0 then advance t 178 | _ -> advance t 179 done; 180 let arg = String.trim (String.sub t.input start (t.pos - start)) in 181 if arg <> "" then tokens := Selector_token.String arg :: !tokens; 182 if peek t <> Some ')' then raise (Selector_error Selector_error_code.Expected_closing_paren); 183 advance t; 184 tokens := Selector_token.Paren_close :: !tokens 185 end 186 | _ when is_name_start c -> 187 let name = read_name t in 188 tokens := Selector_token.Tag (String.lowercase_ascii name) :: !tokens 189 | _ -> 190 raise (Selector_error (Selector_error_code.Unexpected_character c)) 191 end 192 done; 193 194 tokens := Selector_token.EOF :: !tokens; 195 List.rev !tokens