project to map out webrings

add parser for robots.txt

skoove 3540c54a f8887e13

+34
Cargo.lock
···
  ]

  [[package]]
+ name = "thiserror"
+ version = "2.0.17"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8"
+ dependencies = [
+  "thiserror-impl",
+ ]
+
+ [[package]]
+ name = "thiserror-impl"
+ version = "2.0.17"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913"
+ dependencies = [
+  "proc-macro2",
+  "quote",
+  "syn",
+ ]
+
+ [[package]]
  name = "tinystr"
  version = "0.8.2"
  source = "registry+https://github.com/rust-lang/crates.io-index"
···
  checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
  dependencies = [
   "pin-project-lite",
+  "tracing-attributes",
   "tracing-core",
+ ]
+
+ [[package]]
+ name = "tracing-attributes"
+ version = "0.1.30"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903"
+ dependencies = [
+  "proc-macro2",
+  "quote",
+  "syn",
  ]

  [[package]]
···
  dependencies = [
   "reqwest",
   "scraper",
+  "thiserror",
+  "tracing",
  ]

  [[package]]
+2
Cargo.toml
···
  [dependencies]
  reqwest = "0.12"
  scraper = "0.24"
+ thiserror = "2.0"
+ tracing = "0.1"
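thiserror backs the new RobotsTxtError enum in src/robotstxt.rs below; tracing is presumably groundwork for logging in the crawler, since nothing in this diff calls it yet.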
+1
flake.nix
···
    ])
    clang
    pkg-config
+   cargo-tarpaulin
  ] ++ librarys;

  LD_LIBRARY_PATH = "${pkgs.lib.makeLibraryPath librarys}";
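cargo-tarpaulin is added to the dev shell presumably so coverage of the new parser tests can be checked locally, e.g. with a plain `cargo tarpaulin` run from inside the flake's shell.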
+4 -1
src/lib.rs
- mod robotstxt;
+ #![allow(clippy::needless_return)]
+
+ pub mod robotstxt;
+ pub mod utils;

  pub static USER_AGENT: &str = concat!(
      env!("CARGO_PKG_NAME"),
+11 -1
src/main.rs
+ use webring_mapper::robotstxt::RobotsTxt;
+
  fn main() {
-     println!("{}", webring_mapper::USER_AGENT);
+     let input = "
+         User-agent: Googlebot
+         Disallow: /
+
+         User-agent: *
+         Disallow:
+     ";
+
+     dbg!(RobotsTxt::parse(input));
  }
+381 -6
src/robotstxt.rs
+ use thiserror::Error;
+
+ use crate::utils;
+
  // simple parsing for robots.txt, no sitemap because lazy

- #[derive(Default, Debug)]
- struct RobotsTxt {
+ #[derive(Default, Debug, PartialEq)]
+ pub struct RobotsTxt {
      agents: std::collections::HashMap<String, Rules>,
  }

- #[derive(Default, Debug)]
- struct Rules {
+ #[derive(Default, Debug, Clone, PartialEq)]
+ pub struct Rules {
      allow: Vec<String>,
      disallow: Vec<String>,
  }

+ #[derive(Error, Debug)]
+ pub enum RobotsTxtError {
+     /// a line is missing a colon, all non-comment lines in robots.txt should contain a colon
+     #[error("missing colon in line: {0}")]
+     MissingColon(String),
+
+     /// a `user-agent` directive was followed by an empty value, this is invalid
+     #[error("no user agent specified on line: {0}")]
+     EmptyUserAgent(String),
+
+     /// invalid directive, the only valid ones are `user-agent`, `allow`, `disallow` and `sitemap`
+     #[error("unknown directive \"{directive}\" on line: {line}")]
+     UnknownDirective { directive: String, line: String },
+
+     /// a rule was found without any `user-agent` directive before it, which is invalid
+     #[error("no user agent was specified before rule: {0}")]
+     NoUserAgent(String),
+ }
+
  impl RobotsTxt {
-     fn parse(input: &str) -> Self {
-         todo!()
+     pub fn parse(input: &str) -> Result<Self, RobotsTxtError> {
+         let mut robots = Self::default();
+         let mut current_agent = (String::default(), Rules::default());
+
+         for line in input.lines() {
+             let line = line.trim();
+
+             if line.is_empty() || line.starts_with('#') {
+                 continue;
+             }
+
+             let (key, value) = line
+                 .split_once(':')
+                 .map(|(k, v)| (k.trim().to_lowercase(), v.trim().to_string()))
+                 .ok_or_else(|| RobotsTxtError::MissingColon(line.to_string()))?;
+
+             let handle_rule = |list: &mut Vec<String>| -> Result<(), RobotsTxtError> {
+                 if current_agent.0.is_empty() {
+                     return Err(RobotsTxtError::NoUserAgent(line.to_string()));
+                 }
+
+                 if value.is_empty() {
+                     return Ok(());
+                 }
+
+                 list.push(value.to_string());
+                 Ok(())
+             };
+
+             match key.as_str() {
+                 "user-agent" => {
+                     println!("found user agent {value}");
+                     if value.is_empty() {
+                         return Err(RobotsTxtError::EmptyUserAgent(line.to_string()));
+                     }
+
+                     if !current_agent.0.is_empty() {
+                         robots
+                             .agents
+                             .insert(current_agent.0, current_agent.1.clone());
+                     }
+
+                     current_agent.0 = value.to_lowercase();
+                     current_agent.1.clear();
+                 }
+                 "allow" => handle_rule(&mut current_agent.1.allow)?,
+                 "disallow" => handle_rule(&mut current_agent.1.disallow)?,
+                 "sitemap" => (),
+                 _ => {
+                     return Err(RobotsTxtError::UnknownDirective {
+                         directive: key,
+                         line: line.to_string(),
+                     });
+                 }
+             };
+         }
+
+         if !current_agent.0.is_empty() {
+             robots
+                 .agents
+                 .insert(current_agent.0, current_agent.1.clone());
+         }
+
+         Ok(robots)
+     }
+
+     /// retrieve the rules for an input user agent
+     /// case insensitive, will remove everything after the `/`, and everything in `()` from the input
+     pub fn get_rules(&self, useragent: &str) -> Option<&Rules> {
+         let useragent = utils::clean_useragent(useragent);
+
+         self.agents.get(&useragent)
+     }
+
+     /// takes a useragent and a path, and tells you if it is allowed to access that path
+     pub fn is_allowed(&self, useragent: &str, path: &str) -> bool {
+         let useragent = utils::clean_useragent(useragent);
+
+         // first check agent-specific rules
+         if let Some(rules) = self.agents.get(&useragent) {
+             return rules.is_allowed(path);
+         }
+
+         // then check the wildcard rules
+         if let Some(rules) = self.agents.get("*") {
+             return rules.is_allowed(path);
+         }
+
+         true
+     }
+ }
+
+ impl Rules {
+     fn clear(&mut self) {
+         self.allow.clear();
+         self.disallow.clear();
+     }
+
+     pub fn allow(&self) -> &[String] {
+         &self.allow
+     }
+
+     pub fn disallow(&self) -> &[String] {
+         &self.disallow
+     }
+
+     pub fn is_allowed(&self, path: &str) -> bool {
+         let longest_match = |patterns: &[String]| {
+             patterns
+                 .iter()
+                 .filter_map(|p| {
+                     if path.starts_with(p) {
+                         Some(p.len())
+                     } else {
+                         None
+                     }
+                 })
+                 .max()
+                 .unwrap_or(0)
+         };
+
+         let allow_len = longest_match(&self.allow);
+         let disallow_len = longest_match(&self.disallow);
+
+         if disallow_len > allow_len {
+             false
+         } else {
+             true
+         }
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use crate::robotstxt::RobotsTxt;
+
+     #[test]
+     fn test_allow_rule_parsing() {
+         let input = "
+             user-agent: fooBot
+             allow: *
+         ";
+
+         let robots = RobotsTxt::parse(input).unwrap();
+
+         dbg!(&robots);
+
+         assert_eq!(
+             robots
+                 .get_rules("fooBot/1.0")
+                 .unwrap()
+                 .allow()
+                 .first()
+                 .unwrap(),
+             &"*".to_string()
+         )
+     }
+
+     #[test]
+     fn test_disallow_rule_parsing() {
+         let input = "
+             user-agent: fooBot
+             disallow: *
+         ";
+
+         let robots = RobotsTxt::parse(input).unwrap();
+
+         dbg!(&robots);
+
+         assert_eq!(
+             robots
+                 .get_rules("fooBot/1.0")
+                 .unwrap()
+                 .disallow()
+                 .first()
+                 .unwrap(),
+             &"*".to_string()
+         )
+     }
+
+     #[test]
+     fn test_combined_rule_parsing() {
+         let input = "
+             user-agent: fooBot
+             disallow: *
+             allow: /foo
+         ";
+
+         let robots = RobotsTxt::parse(input).unwrap();
+
+         dbg!(&robots);
+
+         assert_eq!(
+             robots
+                 .get_rules("fooBot/1.0")
+                 .unwrap()
+                 .disallow()
+                 .first()
+                 .unwrap(),
+             &"*".to_string()
+         );
+
+         assert_eq!(
+             robots
+                 .get_rules("fooBot/1.0")
+                 .unwrap()
+                 .allow()
+                 .first()
+                 .unwrap(),
+             &"/foo".to_string()
+         )
+     }
+
+     #[test]
+     fn missing_colon() {
+         let input = "
+             user-agent: FooBot
+             allow *
+             disallow: /private
+         ";
+
+         assert!(super::RobotsTxt::parse(input).is_err())
+     }
+
+     #[test]
+     fn empty_useragent() {
+         let input = "
+             user-agent:
+             allow: *
+             disallow: /private
+         ";
+
+         assert!(super::RobotsTxt::parse(input).is_err())
+     }
+
+     #[test]
+     fn unknown_directive() {
+         let input = "
+             user-agent: EvilBot
+             PLEASE-dont-go-here-evilbot: /secret-plans/
+         ";
+
+         assert!(super::RobotsTxt::parse(input).is_err())
+     }
+
+     #[test]
+     fn no_useragent() {
+         let input = "
+             allow: *
+         ";
+
+         assert!(super::RobotsTxt::parse(input).is_err())
+     }
+
+     #[test]
+     fn multiple_useragents() {
+         let input = "
+             User-agent: Googlebot
+             Disallow: /
+
+             User-agent: BotFoo
+             Disallow: /private
+
+             User-agent: FooBot
+             Disallow: /fooland
+         ";
+
+         let robots = RobotsTxt::parse(input).unwrap();
+
+         assert!(robots.get_rules("googlebot").is_some());
+         assert!(robots.get_rules("BotFoo").is_some());
+         assert!(robots.get_rules("foobot").is_some());
+     }
+
+     #[test]
+     fn empty_allow_and_disallow_rules() {
+         let input = "
+             User-agent: FooBot
+             Allow: /
+             Disallow:
+
+             User-agent: BotFoo
+             Allow:
+             Disallow: /
+         ";
+
+         let robots = RobotsTxt::parse(input).unwrap();
+
+         assert!(robots.get_rules("botfoo").is_some());
+         assert!(robots.get_rules("foobot").is_some());
+         assert_eq!(robots.get_rules("foobot").unwrap().allow().len(), 1);
+         assert_eq!(robots.get_rules("foobot").unwrap().disallow().len(), 0);
+         assert_eq!(robots.get_rules("botfoo").unwrap().allow().len(), 0);
+         assert_eq!(robots.get_rules("botfoo").unwrap().disallow().len(), 1);
+     }
+
+     #[test]
+     fn rules_is_allowed() {
+         let rules = super::Rules {
+             allow: vec!["/public".into()],
+             disallow: vec!["/".into()],
+         };
+
+         assert!(!rules.is_allowed("/private/page"));
+         assert!(rules.is_allowed("/public/info"));
+     }
+
+     #[test]
+     fn agents_is_allowed_explicit_allow() {
+         let input = "
+             user-agent: *
+             disallow: Private
+
+             user-agent: FooBot
+             Allow: /private
+         ";
+
+         let robots = RobotsTxt::parse(input).unwrap();
+
+         assert!(robots.is_allowed("foobot", "/private"))
+     }
+
+     #[test]
+     fn agents_is_allowed_explicit_disallow() {
+         let input = "
+             user-agent: *
+             Allow: /private
+
+             user-agent: FooBot
+             Disallow: /private
+         ";
+
+         let robots = RobotsTxt::parse(input).unwrap();
+
+         assert!(!robots.is_allowed("foobot", "/private"))
+     }
+
+     #[test]
+     fn agents_is_allowed_fallback_to_wildcard() {
+         let input = "
+             user-agent: *
+             Allow: /private
+         ";
+
+         let robots = RobotsTxt::parse(input).unwrap();
+
+         assert!(robots.is_allowed("foobot", "/private"))
+     }
+
+     #[test]
+     fn agents_is_allowed_empty_robots() {
+         let robots = RobotsTxt::default();
+
+         assert!(robots.is_allowed("foobot", "/private"))
      }
  }
+41
src/utils.rs
+ /// cleans the version number and the parenthesised part from a useragent, additionally makes it lowercase
+ /// e.g. `FooBot/99.0 (contact: foolium@foocorp.co)` -> `foobot`
+ pub fn clean_useragent(useragent: &str) -> String {
+     let mut s = useragent.trim().to_string();
+
+     if let Some(pos) = s.find('(') {
+         s.truncate(pos);
+     }
+
+     if let Some(pos) = s.find('/') {
+         s.truncate(pos);
+     }
+
+     let s = s.to_lowercase().trim().to_string();
+
+     return s;
+ }
+
+ #[cfg(test)]
+ mod tests {
+     #[test]
+     fn clean_useragent_with_parans() {
+         assert_eq!(
+             super::clean_useragent("FooBot (contact: foolium@foocorp.co)"),
+             "foobot"
+         )
+     }
+
+     #[test]
+     fn clean_useragent_with_forward_slash() {
+         assert_eq!(super::clean_useragent("FooBot/99.0"), "foobot")
+     }
+
+     #[test]
+     fn clean_useragent_with_all() {
+         assert_eq!(
+             super::clean_useragent("FooBot/99.0 (contact: foolium@foocorp.co)"),
+             "foobot"
+         )
+     }
+ }
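As a rough sketch of how this might be wired into the crawler later: `can_fetch` below is a hypothetical helper, not part of this commit; only `RobotsTxt`, `is_allowed` and the crate's `USER_AGENT` come from the code above.

use webring_mapper::{robotstxt::RobotsTxt, USER_AGENT};

// hypothetical helper (not in this commit): given the body of a site's
// robots.txt, decide whether our crawler may fetch `path`
fn can_fetch(robots_body: &str, path: &str) -> bool {
    match RobotsTxt::parse(robots_body) {
        // is_allowed cleans USER_AGENT (version and parens stripped, lowercased)
        // before looking up agent-specific rules, then falls back to "*"
        Ok(robots) => robots.is_allowed(USER_AGENT, path),
        // treat a robots.txt we cannot parse as "do not crawl", to stay polite
        Err(_) => false,
    }
}

fn main() {
    let body = "User-agent: *\nDisallow: /private";
    assert!(can_fetch(body, "/public/page"));
    assert!(!can_fetch(body, "/private/page"));
}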