+3 .gitignore

+3 crates/readability/rules/.readthedocs.io.txt

+5 crates/readability/rules/.stanford.edu.txt

+25 crates/readability/rules/.substack.com.txt
···
+author: //meta[@name="author"]/@content
+title: //meta[@property="og:title"]/@content
+body: //h3[contains(concat(' ',normalize-space(@class),' '),' subtitle ')] | //div[contains(concat(' ',normalize-space(@class),' '),' body ')]
+
+# Clean Twitter embeds
+strip: //div[contains(@class, 'tweet-footer')]//span
+strip_id_or_class: expanded-link-description
+strip_id_or_class: expanded-link-domain
+
+strip_id_or_class: header-anchor-widget
+strip_id_or_class: subscribe-widget
+
+strip: //button
+strip: //svg
+strip: //p[contains(concat(' ',normalize-space(@class),' '),' button-wrapper ')]
+
+wrap_in(blockquote): //div[@class='tweet']
+
+
+prune: no
+
+test_url: https://taibbi.substack.com/p/glenn-greenwald-on-his-resignation
+test_contains: Greenwald, by then furious, noted that neither Maass nor Reed had identified a factual inaccuracy
+test_url: https://jonathancook.substack.com/p/why-the-western-media-is-afraid-of
+test_contains: The goal of the corporate media is not unearthing truth
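
A note on the directives above: in FiveFilters-style site configs, strip_id_or_class matches its value against both the class and id attributes of every element. A minimal Rust sketch of that expansion, using the same whitespace-padded contains() idiom as the body rule; the helper name is hypothetical, not this crate's API:

    /// Expand a `strip_id_or_class: value` directive into an XPath union
    /// over @class and @id (illustrative sketch only).
    fn strip_id_or_class_xpath(value: &str) -> String {
        format!(
            "//*[contains(concat(' ', normalize-space(@class), ' '), ' {v} ') \
             or contains(concat(' ', normalize-space(@id), ' '), ' {v} ')]",
            v = value
        )
    }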

+10 crates/readability/rules/.theonion.com.txt
···
+title: //h2[@class='title'] | //h1[contains(concat(' ',normalize-space(@class),' '),'headline')]
+date: substring-before(//p[@class='meta'], '|')
+body: //div[@class='article_body'] | //div[@class='story'] | //div[contains(concat(' ',normalize-space(@class),' '),'post-content')]
+
+strip: //h2[@class='title']
+strip: //p[@class='meta']
+strip: //div[@class='ga_section']
+strip: //div[@id='recent_slider']
+
+test_url: https://politics.theonion.com/inconsolable-jeff-sessions-tries-to-commit-suicide-by-s-1826462420
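
The date rule above uses XPath 1.0's substring-before(), which returns everything before the first occurrence of the delimiter, or the empty string when the delimiter is absent. A rough Rust equivalent for reference:

    /// XPath 1.0 substring-before(): text before the first delimiter,
    /// or "" if the delimiter does not occur.
    fn substring_before<'a>(s: &'a str, delim: &str) -> &'a str {
        s.find(delim).map_or("", |i| &s[..i])
    }

    // e.g. substring_before("June 5, 2018 | News", "|") yields "June 5, 2018 "
    // (the sample meta text is hypothetical)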

+6 crates/readability/rules/theonion.com.txt

+21 crates/readability/src/config/parser.rs
···
         assert_eq!(config.strip.len(), 2);
         assert_eq!(config.strip_id_or_class.len(), 2);
     }
+    #[test]
+    fn test_parse_invalid_boolean() {
+        let content = "prune: perhaps";
+        let result = parse_config(content);
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            Error::ConfigError(msg) => assert_eq!(msg, "Invalid boolean value: perhaps"),
+            _ => panic!("Expected ConfigError"),
+        }
+    }
+
+    #[test]
+    fn test_parse_malformed_lines() {
+        let content = r#"
+title: //h1
+malformed line here
+another: valid
+"#;
+        let config = parse_config(content).unwrap();
+        assert_eq!(config.title.len(), 1);
+    }
 }
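
For context on what test_parse_invalid_boolean pins down: "prune: no" in the rule files must parse as false, while an unrecognized value becomes Error::ConfigError with the message asserted above. A sketch of boolean handling consistent with these tests; parse_bool is a hypothetical helper name, and every accepted spelling other than "no" is an assumption:

    // Hypothetical helper; only "no" (valid) and "perhaps" (invalid)
    // are pinned down by the tests, the other spellings are assumed.
    fn parse_bool(value: &str) -> Result<bool, Error> {
        match value.trim() {
            "yes" | "true" | "1" => Ok(true),
            "no" | "false" | "0" => Ok(false),
            other => Err(Error::ConfigError(format!("Invalid boolean value: {other}"))),
        }
    }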

+69 crates/readability/src/extractor/generic.rs
···
 
         assert!(result.body_html.contains("main article content"));
     }
+    #[test]
+    fn test_extract_body_simple_fallback() {
+        let html = r#"
+            <html>
+            <body>
+                <div class="article-content">
+                    Short content.
+                </div>
+            </body>
+            </html>
+        "#;
+
+        let extractor = GenericExtractor::new(html.to_string());
+        let document = Html::parse_document(html);
+        let body = extractor.extract_body_simple(&document);
+
+        assert!(body.is_some());
+        assert!(body.unwrap().contains("Short content"));
+    }
+
+    #[test]
+    fn test_extract_title_fallback_tag() {
+        let html = r#"
+            <html>
+            <head>
+                <title>Fallback Title</title>
+            </head>
+            <body></body>
+            </html>
+        "#;
+
+        let extractor = GenericExtractor::new(html.to_string());
+        let document = Html::parse_document(html);
+        let title = extractor.extract_title(&document);
+
+        assert_eq!(title, Some("Fallback Title".to_string()));
+    }
+
+    #[test]
+    fn test_extract_date_fallback_time_element() {
+        let html = r#"
+            <html>
+            <body>
+                <time datetime="2025-12-25">Christmas 2025</time>
+            </body>
+            </html>
+        "#;
+
+        let extractor = GenericExtractor::new(html.to_string());
+        let document = Html::parse_document(html);
+        let date = extractor.extract_date(&document);
+        assert_eq!(date, Some("2025-12-25".to_string()));
+    }
+
+    #[test]
+    fn test_extract_date_fallback_schema() {
+        let html = r#"
+            <html>
+            <body>
+                <span itemprop="datePublished" content="2025-01-01">Jan 1st</span>
+            </body>
+            </html>
+        "#;
+
+        let extractor = GenericExtractor::new(html.to_string());
+        let document = Html::parse_document(html);
+        let date = extractor.extract_date(&document);
+        assert_eq!(date, Some("2025-01-01".to_string()));
+    }
 }
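
The two date tests encode a fallback order: a <time datetime=...> element wins, then the content attribute of a schema.org itemprop="datePublished" node. A minimal sketch of that chain with the scraper crate; this is an assumed shape for extract_date, not necessarily the crate's actual implementation:

    use scraper::{Html, Selector};

    fn extract_date_fallback(document: &Html) -> Option<String> {
        // 1. Prefer an explicit <time datetime="..."> element.
        let time_sel = Selector::parse("time[datetime]").unwrap();
        if let Some(el) = document.select(&time_sel).next() {
            return el.value().attr("datetime").map(str::to_owned);
        }
        // 2. Fall back to schema.org microdata.
        let schema_sel = Selector::parse(r#"[itemprop="datePublished"][content]"#).unwrap();
        document
            .select(&schema_sel)
            .next()
            .and_then(|el| el.value().attr("content"))
            .map(str::to_owned)
    }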

+22 -4 crates/readability/src/extractor/scoring.rs
···
         let document = Html::parse_fragment(html);
         let selector = Selector::parse("div").unwrap();
         let element = document.select(&selector).next().unwrap();
-
         let weight = calculate_class_weight(element);
         assert!(weight > 0.0, "Should have positive weight for content/article classes");
     }
···
         let document = Html::parse_fragment(html);
         let selector = Selector::parse("div").unwrap();
         let element = document.select(&selector).next().unwrap();
-
         let weight = calculate_class_weight(element);
         assert!(weight < 0.0, "Should have negative weight for sidebar/comment classes");
     }
···
         let document = Html::parse_fragment(html);
         let selector = Selector::parse("div").unwrap();
         let element = document.select(&selector).next().unwrap();
-
         let density = calculate_link_density(element);
         assert!(density > 0.0 && density < 1.0, "Link density should be between 0 and 1");
     }
···
         let document = Html::parse_fragment(html);
         let selector = Selector::parse("div").unwrap();
         let element = document.select(&selector).next().unwrap();
-
         let density = calculate_link_density(element);
         assert!(
             density > 0.8,
···
 
         let score = calculate_tag_score(element);
         assert_eq!(score, -5.0, "Nav tag should score -5");
+    }
+    #[test]
+    fn test_mixed_signals() {
+        let html = r#"<div class="sidebar article-content">Content</div>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("div").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        assert!(
+            !is_unlikely_candidate(element),
+            "Mixed signals with positive pattern should be valid"
+        );
+    }
+
+    #[test]
+    fn test_empty_link_density() {
+        let html = r#"<div></div>"#;
+        let document = Html::parse_fragment(html);
+        let selector = Selector::parse("div").unwrap();
+        let element = document.select(&selector).next().unwrap();
+
+        assert_eq!(calculate_link_density(element), 0.0);
     }
 }
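
test_empty_link_density guards the divide-by-zero case in the classic readability link-density metric: the length of text inside <a> descendants divided by the element's total text length. A sketch of the usual formulation; the real calculate_link_density may differ in details such as whitespace handling:

    use scraper::{ElementRef, Selector};

    // Ratio of linked text to all text; 0.0 for elements with no text at all.
    fn link_density(element: ElementRef) -> f64 {
        let total: usize = element.text().map(str::len).sum();
        if total == 0 {
            return 0.0; // the empty <div> case pinned by test_empty_link_density
        }
        let anchors = Selector::parse("a").unwrap();
        let linked: usize = element.select(&anchors).flat_map(|a| a.text()).map(str::len).sum();
        linked as f64 / total as f64
    }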

+33 -3 crates/readability/src/extractor/xpath.rs
···
         assert!(body.contains("Main content here"));
         assert!(body.contains("Section Title"));
     }
+    #[test]
+    fn test_rebuild_void_elements() {
+        let html = r#"
+            <html>
+            <body>
+                <p>Text <br> with break</p>
+                <img src="test.jpg">
+                <div id="remove">Remove me</div>
+            </body>
+            </html>
+        "#;
+
+        let config = SiteConfig { strip: vec!["//*[@id='remove']".to_string()], ..Default::default() };
+        let extractor = XPathExtractor::new(html.to_string());
+        let result = extractor.apply_strip_rules(html, &config).unwrap();
+
+        assert!(result.contains("<br>"));
+        assert!(!result.contains("</br>"));
+        assert!(result.contains("<img src=\"test.jpg\">"));
+        assert!(!result.contains("</img>"));
+        assert!(!result.contains("Remove me"));
+    }
+
+    #[test]
+    fn test_unsupported_xpath() {
+        let html = "<html></html>";
+        let extractor = XPathExtractor::new(html.to_string());
+        let document = Html::parse_document(html);
+
+        // TODO: implement complex axis navigation
+        let result = extractor.evaluate_xpath(&document, "//div/following-sibling::p", false);
+        assert!(matches!(result, Err(Error::XPathError(_))));
+    }
 }
 
 #[test]
 fn test_wikipedia_xpath_patterns() {
     let extractor = XPathExtractor::new(String::new());
-
-    // Wikipedia title XPath
     let (css, filter) = extractor.xpath_to_css_with_attr("//h1[@id='firstHeading']").unwrap();
     assert_eq!(css, "h1#firstHeading");
     assert!(filter.is_none());
 
-    // Wikipedia body XPath (note space around =)
     let (css, filter) = extractor.xpath_to_css_with_attr("//div[@id = 'bodyContent']").unwrap();
     assert_eq!(css, "div#bodyContent");
     assert!(filter.is_none());
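
test_rebuild_void_elements pins down a serialization rule from the HTML spec: void elements such as <br> and <img> must never be written with a closing tag. A serializer typically checks tags against the spec's fixed list, roughly like this sketch (not necessarily how the crate does it):

    // The WHATWG void-element list; serializers emit these without a closing tag.
    const VOID_ELEMENTS: &[&str] = &[
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "source", "track", "wbr",
    ];

    fn is_void_element(tag: &str) -> bool {
        VOID_ELEMENTS.contains(&tag.to_ascii_lowercase().as_str())
    }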

+83 -49 crates/readability/tests/readability_tests.rs
···
 use malfestio_readability::Readability;
+use std::fs;
+use std::path::PathBuf;
 
-#[tokio::test]
-#[ignore = "requires network access"]
-async fn test_arxiv_extraction() {
-    let url = "https://arxiv.org/abs/2009.03017";
+fn get_test_html(filename: &str) -> Option<String> {
+    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    path.push("tests/data");
+    path.push(filename);
 
-    let client = reqwest::Client::builder()
-        .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
-        .build()
-        .unwrap();
+    if path.exists() {
+        Some(fs::read_to_string(path).unwrap())
+    } else {
+        println!("Test data file not found: {:?}. Skipping test.", path);
+        None
+    }
+}
 
-    let response = client.get(url).send().await.unwrap();
-    let html = response.text().await.unwrap();
+#[test]
+fn test_arxiv_extraction() {
+    let html = match get_test_html("arxiv.html") {
+        Some(h) => h,
+        None => return,
+    };
+    let url = "https://arxiv.org/abs/2009.03017";
 
     let readability = Readability::new(html, Some(url));
     let article = readability.parse().unwrap();
 
     assert!(!article.title.is_empty(), "Title should be extracted");
-    println!("Title: {}", article.title);
+    assert!(article.title.contains("Non-exponentially weighted aggregation"));
 
     assert!(!article.markdown.is_empty(), "Body/markdown should be extracted");
     assert!(article.markdown.len() > 50, "Abstract should have substantial content");
-    println!("Markdown length: {} chars", article.markdown.len());
 
-    assert!(article.author.is_some(), "Author should be extracted from meta tag");
-    println!("Author: {:?}", article.author);
-
-    assert!(
-        article.published_date.is_some(),
-        "Date should be extracted from meta tag"
-    );
-    println!("Date: {:?}", article.published_date);
+    // arXiv's citation_author meta tag uses "Lastname, Firstname" format: <meta name="citation_author" content="Alquier, Pierre" />
+    assert_eq!(article.author.as_deref(), Some("Alquier, Pierre"));
+    assert_eq!(article.published_date.as_deref(), Some("2020/09/07"));
 }
 
-#[tokio::test]
-#[ignore = "requires network access"]
-async fn test_wikipedia_extraction() {
+#[test]
+fn test_wikipedia_extraction() {
+    let html = match get_test_html("wikipedia.html") {
+        Some(h) => h,
+        None => return,
+    };
     let url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
 
-    let client = reqwest::Client::builder()
-        .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
-        .build()
-        .unwrap();
-
-    let response = client.get(url).send().await.unwrap();
-    let html = response.text().await.unwrap();
-
     let readability = Readability::new(html, Some(url));
     let article = readability.parse().unwrap();
 
     assert!(article.title.contains("Rust"), "Title should contain 'Rust'");
-    println!("Title: {}", article.title);
-
     assert!(
         article.markdown.len() > 1000,
         "Wikipedia article should have substantial content"
     );
-    println!("Markdown length: {} chars", article.markdown.len());
 
-    // Verify strip rules worked: mw-editsection elements should be removed
     assert!(
         !article.content.contains("mw-editsection"),
         "Edit section elements (mw-editsection) should be stripped"
     );
 }
 
-/// Test extraction for site without specific rules (falls back to generic)
-#[tokio::test]
-#[ignore = "requires network access"]
-async fn test_generic_fallback_extraction() {
+#[test]
+fn test_generic_fallback_extraction() {
+    let html = match get_test_html("generic.html") {
+        Some(h) => h,
+        None => return,
+    };
     let url = "https://www.rust-lang.org/";
-
-    let client = reqwest::Client::builder()
-        .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
-        .build()
-        .unwrap();
-
-    let response = client.get(url).send().await.unwrap();
-    let html = response.text().await.unwrap();
 
     let readability = Readability::new(html, Some(url));
     let article = readability.parse().unwrap();
 
     assert!(!article.title.is_empty(), "Title should be extracted via generic");
     assert!(!article.markdown.is_empty(), "Content should be extracted via generic");
+}
 
-    println!("Title: {}", article.title);
-    println!("Markdown length: {} chars", article.markdown.len());
+#[test]
+fn test_substack_extraction() {
+    let html = match get_test_html("substack.html") {
+        Some(h) => h,
+        None => return,
+    };
+    let url = "https://taibbi.substack.com/p/glenn-greenwald-on-his-resignation";
+
+    let readability = Readability::new(html, Some(url));
+    let article = readability.parse().unwrap();
+
+    assert!(!article.title.is_empty(), "Title should be extracted");
+    assert!(
+        article.title.contains("Glenn Greenwald"),
+        "Title should mention Glenn Greenwald"
+    );
+}
+
+#[test]
+fn test_theonion_extraction() {
+    let html = match get_test_html("theonion.html") {
+        Some(h) => h,
+        None => return,
+    };
+    let url = "https://www.theonion.com/theresa-may-narrowly-manages-to-survive-parliamentary-f-1831077604";
+
+    let readability = Readability::new(html, Some(url));
+    let article = readability.parse().unwrap();
+
+    assert!(!article.title.is_empty(), "Title should be extracted");
+    // The Onion usually exposes metadata via JSON-LD or meta tags; check whether our rules caught it.
+    // TODO: implement JSON-LD support
+}
+
+#[test]
+fn test_readthedocs_extraction() {
+    let html = match get_test_html("readthedocs.html") {
+        Some(h) => h,
+        None => return,
+    };
+    let url = "http://docs.readthedocs.io/en/latest/getting_started.html";
+
+    let readability = Readability::new(html, Some(url));
+    let article = readability.parse().unwrap();
+
+    assert!(!article.title.is_empty(), "Title should be extracted");
 }
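
A design note on get_test_html: building the path from env!("CARGO_MANIFEST_DIR") resolves fixtures relative to the crate's own Cargo.toml, so the tests locate tests/data/ regardless of the working directory cargo uses, and returning None instead of panicking lets each test degrade to a silent skip when the fixtures have not been fetched.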

+8 justfile
···
 start:
     cargo run --bin malfestio-cli start
 
+# Fetch test data for readability tests
+fetch-test-data:
+    ./scripts/fetch_test_data.sh
+
+# Run readability tests (fetches data first)
+test-readability: fetch-test-data
+    cargo test -p malfestio-readability --test readability_tests
+
 # Run all tests
 test:
     cargo test --quiet
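
Usage note: just test-readability chains the two recipes, so the fetch step is a cached no-op on repeat runs, while the plain just test target still executes these same tests, which simply skip any fixture that is missing.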

+24 scripts/fetch_test_data.sh
···
+#!/bin/bash
+mkdir -p crates/readability/tests/data
+
+fetch_if_missing() {
+    local url="$1"
+    local output="$2"
+
+    if [ -f "$output" ]; then
+        echo "Cached: $output"
+    else
+        echo "Fetching $url..."
+        curl -L -H "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" -o "$output" "$url"
+    fi
+}
+
+fetch_if_missing "https://arxiv.org/abs/2009.03017" "crates/readability/tests/data/arxiv.html"
+fetch_if_missing "https://en.wikipedia.org/wiki/Rust_(programming_language)" "crates/readability/tests/data/wikipedia.html"
+fetch_if_missing "https://dougshapiro.medium.com/how-will-the-disruption-of-hollywood-play-out-42f724c921e1" "crates/readability/tests/data/medium.html"
+fetch_if_missing "https://taibbi.substack.com/p/glenn-greenwald-on-his-resignation" "crates/readability/tests/data/substack.html"
+fetch_if_missing "https://www.theonion.com/theresa-may-narrowly-manages-to-survive-parliamentary-f-1831077604" "crates/readability/tests/data/theonion.html"
+fetch_if_missing "http://docs.readthedocs.io/en/latest/getting_started.html" "crates/readability/tests/data/readthedocs.html"
+fetch_if_missing "https://www.rust-lang.org/" "crates/readability/tests/data/generic.html"
+
+echo "Done."