Cargo.lock (+34)
···
 ]

 [[package]]
+name = "thiserror"
+version = "2.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
 name = "tinystr"
 version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
 dependencies = [
  "pin-project-lite",
+ "tracing-attributes",
  "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
 ]

 [[package]]
···
 dependencies = [
  "reqwest",
  "scraper",
+ "thiserror",
+ "tracing",
 ]

 [[package]]
Cargo.toml (+2)
flake.nix (+1)
src/lib.rs (+4 -1)
src/main.rs (+11 -1)
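The Cargo.toml, flake.nix, src/lib.rs, and src/main.rs hunks are collapsed above. Since robotstxt.rs imports `crate::utils` and its tests use `crate::robotstxt::RobotsTxt`, the lib.rs change presumably just declares the two new modules. A sketch of the likely lines (an assumption, not the actual hunk):

// src/lib.rs (hypothetical reconstruction; the real hunk is collapsed)
pub mod robotstxt;
pub mod utils;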
src/robotstxt.rs (+381 -6)
···
+use thiserror::Error;
+
+use crate::utils;
+
 // simple parsing for robots.txt, no sitemap because lazy

-#[derive(Default, Debug)]
-struct RobotsTxt {
+#[derive(Default, Debug, PartialEq)]
+pub struct RobotsTxt {
     agents: std::collections::HashMap<String, Rules>,
 }

-#[derive(Default, Debug)]
-struct Rules {
+#[derive(Default, Debug, Clone, PartialEq)]
+pub struct Rules {
     allow: Vec<String>,
     disallow: Vec<String>,
 }

+#[derive(Error, Debug)]
+pub enum RobotsTxtError {
+    /// a line is missing a colon; every non-comment line in robots.txt should contain one
+    #[error("missing colon in line: {0}")]
+    MissingColon(String),
+
+    /// a `user-agent` directive was followed by an empty value, which is invalid
+    #[error("no user agent specified on line: {0}")]
+    EmptyUserAgent(String),
+
+    /// invalid directive; the only valid ones are `user-agent`, `allow`, `disallow` and `sitemap`
+    #[error("unknown directive \"{directive}\" on line: {line}")]
+    UnknownDirective { directive: String, line: String },
+
+    /// a rule was found without a preceding `user-agent` directive, which is invalid
+    #[error("no user agent was specified before rule: {0}")]
+    NoUserAgent(String),
+}
+
 impl RobotsTxt {
-    fn parse(input: &str) -> Self {
-        todo!()
+    pub fn parse(input: &str) -> Result<Self, RobotsTxtError> {
+        let mut robots = Self::default();
+        let mut current_agent = (String::default(), Rules::default());
+
+        for line in input.lines() {
+            let line = line.trim();
+
+            if line.is_empty() || line.starts_with('#') {
+                continue;
+            }
+
+            let (key, value) = line
+                .split_once(':')
+                .map(|(k, v)| (k.trim().to_lowercase(), v.trim().to_string()))
+                .ok_or_else(|| RobotsTxtError::MissingColon(line.to_string()))?;
+
+            let handle_rule = |list: &mut Vec<String>| -> Result<(), RobotsTxtError> {
+                if current_agent.0.is_empty() {
+                    return Err(RobotsTxtError::NoUserAgent(line.to_string()));
+                }
+
+                if value.is_empty() {
+                    return Ok(());
+                }
+
+                list.push(value.to_string());
+                Ok(())
+            };
+
+            match key.as_str() {
+                "user-agent" => {
+                    tracing::debug!("found user agent {value}");
+                    if value.is_empty() {
+                        return Err(RobotsTxtError::EmptyUserAgent(line.to_string()));
+                    }
+
+                    if !current_agent.0.is_empty() {
+                        robots
+                            .agents
+                            .insert(current_agent.0, current_agent.1.clone());
+                    }
+
+                    current_agent.0 = value.to_lowercase();
+                    current_agent.1.clear();
+                }
+                "allow" => handle_rule(&mut current_agent.1.allow)?,
+                "disallow" => handle_rule(&mut current_agent.1.disallow)?,
+                "sitemap" => (),
+                _ => {
+                    return Err(RobotsTxtError::UnknownDirective {
+                        directive: key,
+                        line: line.to_string(),
+                    });
+                }
+            };
+        }
+
+        if !current_agent.0.is_empty() {
+            robots
+                .agents
+                .insert(current_agent.0, current_agent.1.clone());
+        }
+
+        Ok(robots)
+    }
+
+    /// retrieve the rules for an input user agent
+    /// case insensitive; strips everything after a `/` and anything in `()` from the input
+    pub fn get_rules(&self, useragent: &str) -> Option<&Rules> {
+        let useragent = utils::clean_useragent(useragent);
+
+        self.agents.get(&useragent)
+    }
+
+    /// takes a useragent and a path, and tells you whether that agent may access the path
+    pub fn is_allowed(&self, useragent: &str, path: &str) -> bool {
+        let useragent = utils::clean_useragent(useragent);
+
+        // first check agent-specific rules
+        if let Some(rules) = self.agents.get(&useragent) {
+            return rules.is_allowed(path);
+        }
+
+        // then check the wildcard rules
+        if let Some(rules) = self.agents.get("*") {
+            return rules.is_allowed(path);
+        }
+
+        true
+    }
+}
+
+impl Rules {
+    fn clear(&mut self) {
+        self.allow.clear();
+        self.disallow.clear();
+    }
+
+    pub fn allow(&self) -> &[String] {
+        &self.allow
+    }
+
+    pub fn disallow(&self) -> &[String] {
+        &self.disallow
+    }
+
+    pub fn is_allowed(&self, path: &str) -> bool {
+        let longest_match = |patterns: &[String]| {
+            patterns
+                .iter()
+                .filter_map(|p| {
+                    if path.starts_with(p) {
+                        Some(p.len())
+                    } else {
+                        None
+                    }
+                })
+                .max()
+                .unwrap_or(0)
+        };
+
+        let allow_len = longest_match(&self.allow);
+        let disallow_len = longest_match(&self.disallow);
+
+        allow_len >= disallow_len
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::robotstxt::RobotsTxt;
+
+    #[test]
+    fn test_allow_rule_parsing() {
+        let input = "
+            user-agent: fooBot
+            allow: *
+        ";
+
+        let robots = RobotsTxt::parse(input).unwrap();
+
+        dbg!(&robots);
+
+        assert_eq!(
+            robots
+                .get_rules("fooBot/1.0")
+                .unwrap()
+                .allow()
+                .first()
+                .unwrap(),
+            &"*".to_string()
+        )
+    }
+
+    #[test]
+    fn test_disallow_rule_parsing() {
+        let input = "
+            user-agent: fooBot
+            disallow: *
+        ";
+
+        let robots = RobotsTxt::parse(input).unwrap();
+
+        dbg!(&robots);
+
+        assert_eq!(
+            robots
+                .get_rules("fooBot/1.0")
+                .unwrap()
+                .disallow()
+                .first()
+                .unwrap(),
+            &"*".to_string()
+        )
+    }
+
+    #[test]
+    fn test_combined_rule_parsing() {
+        let input = "
+            user-agent: fooBot
+            disallow: *
+            allow: /foo
+        ";
+
+        let robots = RobotsTxt::parse(input).unwrap();
+
+        dbg!(&robots);
+
+        assert_eq!(
+            robots
+                .get_rules("fooBot/1.0")
+                .unwrap()
+                .disallow()
+                .first()
+                .unwrap(),
+            &"*".to_string()
+        );
+
+        assert_eq!(
+            robots
+                .get_rules("fooBot/1.0")
+                .unwrap()
+                .allow()
+                .first()
+                .unwrap(),
+            &"/foo".to_string()
+        )
+    }
+
+    #[test]
+    fn missing_colon() {
+        let input = "
+            user-agent: FooBot
+            allow *
+            disallow: /private
+        ";
+
+        assert!(super::RobotsTxt::parse(input).is_err())
+    }
+
+    #[test]
+    fn empty_useragent() {
+        let input = "
+            user-agent:
+            allow: *
+            disallow: /private
+        ";
+
+        assert!(super::RobotsTxt::parse(input).is_err())
+    }
+
+    #[test]
+    fn unknown_directive() {
+        let input = "
+            user-agent: EvilBot
+            PLEASE-dont-go-here-evilbot: /secret-plans/
+        ";
+
+        assert!(super::RobotsTxt::parse(input).is_err())
+    }
+
+    #[test]
+    fn no_useragent() {
+        let input = "
+            allow: *
+        ";
+
+        assert!(super::RobotsTxt::parse(input).is_err())
+    }
+
+    #[test]
+    fn multiple_useragents() {
+        let input = "
+            User-agent: Googlebot
+            Disallow: /
+
+            User-agent: BotFoo
+            Disallow: /private
+
+            User-agent: FooBot
+            Disallow: /fooland
+        ";
+
+        let robots = RobotsTxt::parse(input).unwrap();
+
+        assert!(robots.get_rules("googlebot").is_some());
+        assert!(robots.get_rules("BotFoo").is_some());
+        assert!(robots.get_rules("foobot").is_some());
+    }
+
+    #[test]
+    fn empty_allow_and_disallow_rules() {
+        let input = "
+            User-agent: FooBot
+            Allow: /
+            Disallow:
+
+            User-agent: BotFoo
+            Allow:
+            Disallow: /
+        ";
+
+        let robots = RobotsTxt::parse(input).unwrap();
+
+        assert!(robots.get_rules("botfoo").is_some());
+        assert!(robots.get_rules("foobot").is_some());
+        assert_eq!(robots.get_rules("foobot").unwrap().allow().len(), 1);
+        assert_eq!(robots.get_rules("foobot").unwrap().disallow().len(), 0);
+        assert_eq!(robots.get_rules("botfoo").unwrap().allow().len(), 0);
+        assert_eq!(robots.get_rules("botfoo").unwrap().disallow().len(), 1);
+    }
+
+    #[test]
+    fn rules_is_allowed() {
+        let rules = super::Rules {
+            allow: vec!["/public".into()],
+            disallow: vec!["/".into()],
+        };
+
+        assert!(!rules.is_allowed("/private/page"));
+        assert!(rules.is_allowed("/public/info"));
+    }
+
+    #[test]
+    fn agents_is_allowed_explicit_allow() {
+        let input = "
+            user-agent: *
+            disallow: Private
+
+            user-agent: FooBot
+            Allow: /private
+        ";
+
+        let robots = RobotsTxt::parse(input).unwrap();
+
+        assert!(robots.is_allowed("foobot", "/private"))
+    }
+
+    #[test]
+    fn agents_is_allowed_explicit_disallow() {
+        let input = "
+            user-agent: *
+            Allow: /private
+
+            user-agent: FooBot
+            Disallow: /private
+        ";
+
+        let robots = RobotsTxt::parse(input).unwrap();
+
+        assert!(!robots.is_allowed("foobot", "/private"))
+    }
+
+    #[test]
+    fn agents_is_allowed_fallback_to_wildcard() {
+        let input = "
+            user-agent: *
+            Allow: /private
+        ";
+
+        let robots = RobotsTxt::parse(input).unwrap();
+
+        assert!(robots.is_allowed("foobot", "/private"))
+    }
+
+    #[test]
+    fn agents_is_allowed_empty_robots() {
+        let robots = RobotsTxt::default();
+
+        assert!(robots.is_allowed("foobot", "/private"))
     }
 }
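Taken together, the new surface is RobotsTxt::parse, get_rules, and is_allowed. A minimal usage sketch, assuming the crate is named `crawler` (the package name is elided from the lockfile hunk above):

// usage sketch; `crawler` stands in for the real crate name
use crawler::robotstxt::RobotsTxt;

fn main() {
    let input = "
        user-agent: *
        disallow: /admin

        user-agent: FooBot
        allow: /admin/stats
    ";

    let robots = RobotsTxt::parse(input).expect("well-formed robots.txt");

    // the caller's token is normalized, so `FooBot/1.0` hits the `foobot` group,
    // whose longest matching prefix for this path is an allow rule
    assert!(robots.is_allowed("FooBot/1.0", "/admin/stats"));

    // agents without a group of their own fall back to the `*` rules
    assert!(!robots.is_allowed("BarBot/2.0", "/admin/pages"));
}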
src/utils.rs (+41)
···
+/// strips the version number and anything in parentheses from a useragent, and lowercases it
+/// eg: `FooBot/99.0 (contact: foolium@foocorp.co)` -> `foobot`
+pub fn clean_useragent(useragent: &str) -> String {
+    let mut s = useragent.trim().to_string();
+
+    if let Some(pos) = s.find('(') {
+        s.truncate(pos);
+    }
+
+    if let Some(pos) = s.find('/') {
+        s.truncate(pos);
+    }
+
+    s.trim().to_lowercase()
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn clean_useragent_with_parens() {
+        assert_eq!(
+            super::clean_useragent("FooBot (contact: foolium@foocorp.co)"),
+            "foobot"
+        )
+    }
+
+    #[test]
+    fn clean_useragent_with_forward_slash() {
+        assert_eq!(super::clean_useragent("FooBot/99.0"), "foobot")
+    }
+
+    #[test]
+    fn clean_useragent_with_all() {
+        assert_eq!(
+            super::clean_useragent("FooBot/99.0 (contact: foolium@foocorp.co)"),
+            "foobot"
+        )
+    }
+}
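Note how this lines up with robotstxt.rs: parse() lowercases agent names as it stores them, and get_rules()/is_allowed() pass the caller's useragent through clean_useragent(), so a full product token still finds its map entry. Two extra cases the tests above don't exercise (hypothetical inputs, same function):

// hypothetical inputs; '(' and '/' truncation compose with trimming and lowercasing
assert_eq!(clean_useragent("FooBot/1.0 (+https://foo.example)"), "foobot");
assert_eq!(clean_useragent("  FooBot  "), "foobot");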