Diff — round #0: +1891 −520

Cargo.lock (+260 −366)
···
 ]

 [[package]]
-name = "block-buffer"
-version = "0.10.4"
+name = "bstr"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
 dependencies = [
- "generic-array",
+ "lazy_static",
+ "memchr",
+ "regex-automata 0.1.10",
 ]

 [[package]]
···
 version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"

 [[package]]
 name = "chrono"
···
 checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"

 [[package]]
-name = "core-foundation"
-version = "0.9.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
-dependencies = [
- "core-foundation-sys",
- "libc",
-]
-
-[[package]]
-name = "core-foundation"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
-dependencies = [
- "core-foundation-sys",
- "libc",
-]
-
-[[package]]
 name = "core-foundation-sys"
 version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"

 [[package]]
-name = "cpufeatures"
-version = "0.2.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
-dependencies = [
- "libc",
-]
-
-[[package]]
 name = "crc32fast"
 version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"

 [[package]]
-name = "crypto-common"
-version = "0.1.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
-dependencies = [
- "generic-array",
- "typenum",
-]
-
-[[package]]
 name = "cssparser"
 version = "0.34.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
  "proc-macro2",
  "quote",
  "syn",
-]
-
-[[package]]
-name = "digest"
-version = "0.10.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
-dependencies = [
- "block-buffer",
- "crypto-common",
 ]

 [[package]]
···
 checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"

 [[package]]
+name = "feed-rs"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e4c0591d23efd0d595099af69a31863ac1823046b1b021e3b06ba3aae7e00991"
+dependencies = [
+ "chrono",
+ "mediatype",
+ "quick-xml",
+ "regex",
+ "serde",
+ "serde_json",
+ "siphasher",
+ "url",
+ "uuid",
+]
+
+[[package]]
 name = "find-msvc-tools"
 version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
-
-[[package]]
-name = "foreign-types"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
-dependencies = [
- "foreign-types-shared",
-]
-
-[[package]]
-name = "foreign-types-shared"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"

 [[package]]
 name = "form_urlencoded"
···
 checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"

 [[package]]
-name = "futures-sink"
-version = "0.3.32"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"
-
-[[package]]
 name = "futures-task"
 version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 ]

 [[package]]
-name = "generic-array"
-version = "0.14.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
-dependencies = [
- "typenum",
- "version_check",
-]
-
-[[package]]
 name = "getopts"
 version = "0.2.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
 dependencies = [
  "cfg-if",
+ "js-sys",
  "libc",
  "wasi",
+ "wasm-bindgen",
 ]

 [[package]]
···
 checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
 dependencies = [
  "cfg-if",
+ "js-sys",
  "libc",
  "r-efi",
  "wasip2",
+ "wasm-bindgen",
 ]

 [[package]]
···
 checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7"

 [[package]]
-name = "h2"
-version = "0.4.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54"
-dependencies = [
- "atomic-waker",
- "bytes",
- "fnv",
- "futures-core",
- "futures-sink",
- "http",
- "indexmap",
- "slab",
- "tokio",
- "tokio-util",
- "tracing",
-]
-
-[[package]]
 name = "hashbrown"
 version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
  "bytes",
  "futures-channel",
  "futures-core",
- "h2",
  "http",
  "http-body",
  "httparse",
···
  "tokio",
  "tokio-rustls",
  "tower-service",
-]
-
-[[package]]
-name = "hyper-tls"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
-dependencies = [
- "bytes",
- "http-body-util",
- "hyper",
- "hyper-util",
- "native-tls",
- "tokio",
- "tokio-native-tls",
- "tower-service",
+ "webpki-roots",
 ]

 [[package]]
···
  "percent-encoding",
  "pin-project-lite",
  "socket2",
- "system-configuration",
  "tokio",
  "tower-service",
  "tracing",
- "windows-registry",
 ]

 [[package]]
···
 ]

 [[package]]
+name = "lru-slab"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
+
+[[package]]
 name = "lz4_flex"
 version = "0.11.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
 dependencies = [
- "regex-automata",
+ "regex-automata 0.4.14",
 ]

 [[package]]
···
 checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"

 [[package]]
+name = "maud"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8156733e27020ea5c684db5beac5d1d611e1272ab17901a49466294b84fc217e"
+dependencies = [
+ "axum-core",
+ "http",
+ "itoa",
+ "maud_macros",
+]
+
+[[package]]
+name = "maud_macros"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7261b00f3952f617899bc012e3dbd56e4f0110a038175929fa5d18e5a19913ca"
+dependencies = [
+ "proc-macro2",
+ "proc-macro2-diagnostics",
+ "quote",
+ "syn",
+]
+
+[[package]]
 name = "measure_time"
 version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 ]

 [[package]]
+name = "mediatype"
+version = "0.19.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33746aadcb41349ec291e7f2f0a3aa6834d1d7c58066fb4b01f68efc4c4b7631"
+dependencies = [
+ "serde",
+]
+
+[[package]]
 name = "memchr"
 version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
  "clap",
  "color-eyre",
  "dirs",
+ "maud",
  "mime_guess",
  "motet_core",
+ "regex",
  "reqwest",
- "rust-embed",
  "serde",
  "serde_json",
  "tokio",
···
  "chrono",
  "dirs",
  "eyre",
+ "feed-rs",
  "reqwest",
  "rusqlite",
  "scraper",
  "serde",
  "serde_json",
  "tantivy",
+ "texting_robots",
  "thiserror 2.0.18",
  "tokio",
  "tracing",
···
 checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b"

 [[package]]
-name = "native-tls"
-version = "0.2.18"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2"
-dependencies = [
- "libc",
- "log",
- "openssl",
- "openssl-probe",
- "openssl-sys",
- "schannel",
- "security-framework",
- "security-framework-sys",
- "tempfile",
-]
-
-[[package]]
 name = "new_debug_unreachable"
 version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107"

 [[package]]
-name = "openssl"
-version = "0.10.75"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328"
-dependencies = [
- "bitflags",
- "cfg-if",
- "foreign-types",
- "libc",
- "once_cell",
- "openssl-macros",
- "openssl-sys",
-]
-
-[[package]]
-name = "openssl-macros"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "openssl-probe"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
-
-[[package]]
-name = "openssl-sys"
-version = "0.9.111"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
- "vcpkg",
-]
-
-[[package]]
 name = "option-ext"
 version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
 dependencies = [
  "phf_shared",
- "rand",
+ "rand 0.8.5",
 ]

 [[package]]
···
 ]

 [[package]]
+name = "proc-macro2-diagnostics"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "version_check",
+]
+
+[[package]]
+name = "quick-xml"
+version = "0.37.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
+dependencies = [
+ "encoding_rs",
+ "memchr",
+]
+
+[[package]]
+name = "quinn"
+version = "0.11.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
+dependencies = [
+ "bytes",
+ "cfg_aliases",
+ "pin-project-lite",
+ "quinn-proto",
+ "quinn-udp",
+ "rustc-hash 2.1.1",
+ "rustls",
+ "socket2",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-proto"
+version = "0.11.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
+dependencies = [
+ "bytes",
+ "getrandom 0.3.4",
+ "lru-slab",
+ "rand 0.9.2",
+ "ring",
+ "rustc-hash 2.1.1",
+ "rustls",
+ "rustls-pki-types",
+ "slab",
+ "thiserror 2.0.18",
+ "tinyvec",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-udp"
+version = "0.5.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
+dependencies = [
+ "cfg_aliases",
+ "libc",
+ "once_cell",
+ "socket2",
+ "tracing",
+ "windows-sys 0.60.2",
+]
+
+[[package]]
 name = "quote"
 version = "1.0.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
 dependencies = [
  "libc",
- "rand_chacha",
- "rand_core",
+ "rand_chacha 0.3.1",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.5",
 ]

 [[package]]
···
 checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
 dependencies = [
  "ppv-lite86",
- "rand_core",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.5",
 ]

 [[package]]
···
 ]

 [[package]]
+name = "rand_core"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom 0.3.4",
+]
+
+[[package]]
 name = "rand_distr"
 version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
 dependencies = [
  "num-traits",
- "rand",
+ "rand 0.8.5",
 ]

 [[package]]
···
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-automata",
+ "regex-automata 0.4.14",
  "regex-syntax",
 ]

 [[package]]
 name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+
+[[package]]
+name = "regex-automata"
 version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
···
 dependencies = [
  "base64",
  "bytes",
- "encoding_rs",
  "futures-core",
- "h2",
  "http",
  "http-body",
  "http-body-util",
  "hyper",
  "hyper-rustls",
- "hyper-tls",
  "hyper-util",
  "js-sys",
  "log",
- "mime",
- "native-tls",
  "percent-encoding",
  "pin-project-lite",
+ "quinn",
+ "rustls",
  "rustls-pki-types",
  "serde",
  "serde_json",
  "serde_urlencoded",
  "sync_wrapper",
  "tokio",
- "tokio-native-tls",
+ "tokio-rustls",
  "tower",
  "tower-http",
  "tower-service",
···
  "wasm-bindgen",
  "wasm-bindgen-futures",
  "web-sys",
+ "webpki-roots",
 ]

 [[package]]
···
 ]

 [[package]]
-name = "rust-embed"
-version = "8.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04113cb9355a377d83f06ef1f0a45b8ab8cd7d8b1288160717d66df5c7988d27"
-dependencies = [
- "axum",
- "rust-embed-impl",
- "rust-embed-utils",
- "walkdir",
-]
-
-[[package]]
-name = "rust-embed-impl"
-version = "8.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da0902e4c7c8e997159ab384e6d0fc91c221375f6894346ae107f47dd0f3ccaa"
-dependencies = [
- "proc-macro2",
- "quote",
- "rust-embed-utils",
- "syn",
- "walkdir",
-]
-
-[[package]]
-name = "rust-embed-utils"
-version = "8.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5bcdef0be6fe7f6fa333b1073c949729274b05f123a0ad7efcb8efd878e5c3b1"
-dependencies = [
- "sha2",
- "walkdir",
-]
-
-[[package]]
 name = "rust-stemmers"
 version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"

 [[package]]
+name = "rustc-hash"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+
+[[package]]
 name = "rustix"
 version = "0.38.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
 dependencies = [
  "once_cell",
+ "ring",
  "rustls-pki-types",
  "rustls-webpki",
  "subtle",
···
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
 dependencies = [
+ "web-time",
  "zeroize",
 ]

···
 checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"

 [[package]]
-name = "same-file"
-version = "1.0.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
-dependencies = [
- "winapi-util",
-]
-
-[[package]]
-name = "schannel"
-version = "0.1.28"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1"
-dependencies = [
- "windows-sys 0.61.2",
-]
-
-[[package]]
 name = "scopeguard"
 version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 ]

 [[package]]
-name = "security-framework"
-version = "3.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
-dependencies = [
- "bitflags",
- "core-foundation 0.10.1",
- "core-foundation-sys",
- "libc",
- "security-framework-sys",
-]
-
-[[package]]
-name = "security-framework-sys"
-version = "2.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
-dependencies = [
- "core-foundation-sys",
- "libc",
-]
-
-[[package]]
 name = "selectors"
 version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930"
 dependencies = [
  "stable_deref_trait",
-]
-
-[[package]]
-name = "sha2"
-version = "0.10.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
-dependencies = [
- "cfg-if",
- "cpufeatures",
- "digest",
 ]

 [[package]]
···
 ]

 [[package]]
-name = "system-configuration"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b"
-dependencies = [
- "bitflags",
- "core-foundation 0.9.4",
- "system-configuration-sys",
-]
-
-[[package]]
-name = "system-configuration-sys"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
-dependencies = [
- "core-foundation-sys",
- "libc",
-]
-
-[[package]]
 name = "tantivy"
 version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
  "rayon",
  "regex",
  "rust-stemmers",
- "rustc-hash",
+ "rustc-hash 1.1.0",
  "serde",
  "serde_json",
  "sketches-ddsketch",
···
 ]

 [[package]]
+name = "texting_robots"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82a718a28dda2e67ad6e0464597b58eae39e2e4d0451e03d1028d71e81bb4a"
+dependencies = [
+ "anyhow",
+ "bstr",
+ "lazy_static",
+ "nom",
+ "percent-encoding",
+ "regex",
+ "thiserror 1.0.69",
+ "url",
+]
+
+[[package]]
 name = "thiserror"
 version = "1.0.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 ]

 [[package]]
+name = "tinyvec"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
+[[package]]
 name = "tokio"
 version = "1.49.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 ]

 [[package]]
-name = "tokio-native-tls"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
-dependencies = [
- "native-tls",
- "tokio",
-]
-
-[[package]]
 name = "tokio-rustls"
 version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 ]

 [[package]]
-name = "tokio-util"
-version = "0.7.18"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098"
-dependencies = [
- "bytes",
- "futures-core",
- "futures-sink",
- "pin-project-lite",
- "tokio",
-]
-
-[[package]]
 name = "tower"
 version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
  "matchers",
  "nu-ansi-term",
  "once_cell",
- "regex-automata",
+ "regex-automata 0.4.14",
  "sharded-slab",
  "smallvec",
  "thread_local",
···
 checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"

 [[package]]
-name = "typenum"
-version = "1.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
-
-[[package]]
 name = "unicase"
 version = "2.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
-
-[[package]]
-name = "walkdir"
-version = "2.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
-dependencies = [
- "same-file",
- "winapi-util",
-]

 [[package]]
 name = "want"
···
 ]

 [[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "webpki-roots"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
 name = "winapi"
 version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
···
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-
-[[package]]
-name = "winapi-util"
-version = "0.1.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
-dependencies = [
- "windows-sys 0.61.2",
-]

 [[package]]
 name = "winapi-x86_64-pc-windows-gnu"
···
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
-
-[[package]]
-name = "windows-registry"
-version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720"
-dependencies = [
- "windows-link",
- "windows-result",
- "windows-strings",
-]

 [[package]]
 name = "windows-result"
Cargo.toml (+5 −2)
···
 color-eyre = "0.6"
 dirs = "6.0"
 eyre = "0.6"
+feed-rs = "2.3"
+maud = { version = "0.27", features = ["axum"] }
 mime_guess = "2.0"
-reqwest = { version = "0.12", features = ["json"] }
-rust-embed = { version = "8.0", features = ["axum"] }
+regex = "1.11"
+reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
 rusqlite = { version = "0.33", features = ["bundled"] }
 scraper = "0.22"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 tantivy = "0.22"
+texting_robots = "0.2"
 thiserror = "2.0"
 tokio = { version = "1.0", features = ["full"] }
 tower-http = { version = "0.6", features = ["cors"] }
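
Reviewer note on the reqwest line above: `default-features = false` drops the native-tls path (hence the openssl/security-framework/schannel removals in the Cargo.lock diff), and `rustls-tls` swaps in rustls with bundled webpki roots. Call sites don't change; a minimal sketch under those feature flags (the URL is a placeholder, not from this PR):

// Sketch: with `default-features = false, features = ["json", "rustls-tls"]`,
// reqwest negotiates TLS via rustls + webpki-roots. `use_rustls_tls()` is
// redundant when rustls is the only TLS backend, but makes the choice explicit.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::builder()
        .use_rustls_tls()
        .build()?;
    // Placeholder URL, for illustration only.
    let status = client.get("https://example.com").send().await?.status();
    println!("{status}");
    Ok(())
}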
motet_cli/Cargo.toml (+2 −1)
···
 clap = { workspace = true }
 color-eyre = { workspace = true }
 dirs = { workspace = true }
+maud = { workspace = true }
 mime_guess = { workspace = true }
+regex = { workspace = true }
 motet_core = { workspace = true }
 reqwest = { workspace = true }
-rust-embed = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 tokio = { workspace = true }
motet_cli/src/commands.rs (+82 −20)
···
 use clap::Args;
 use color_eyre::eyre::{Result, WrapErr};
 use motet_core::{
-    config,
-    crawler,
+    config::{self, SourceConfig},
+    crawler::{self, CrawledDocument},
     index::SearchIndex,
     query,
+    robots::RobotsChecker,
     store::{CrawlRecord, Store},
 };
-use std::path::PathBuf;
+use std::{path::PathBuf, sync::Arc};
 use tracing::info;

 /// Arguments for `motet crawl`.
···
     let store = Store::open(&data_dir.join("motet.db"))?;

     let client = reqwest::Client::builder()
-        .user_agent("motet/0.1 (personal search indexer)")
+        // Some CDNs (Akamai/CBC, Cloudflare/Bluesky) reject non-browser UAs.
+        // Since motet is a personal indexer for user-chosen sources, we use a
+        // real browser UA to avoid false-positive bot blocking.
+        .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+        .timeout(std::time::Duration::from_secs(30))
+        .connect_timeout(std::time::Duration::from_secs(10))
+
         .build()
         .wrap_err("failed to build HTTP client")?;

-    let sources: Vec<_> = match &args.source {
+    let sources: Vec<(String, SourceConfig)> = match &args.source {
         Some(name) => {
             if let Some(source_cfg) = cfg.sources.get(name) {
-                vec![(name.as_str(), source_cfg)]
+                vec![(name.clone(), source_cfg.clone())]
             } else {
                 eprintln!("Source {name:?} not found in config. Available sources:");
                 for key in cfg.sources.keys() {
···
                 return Ok(());
             }
         }
-        None => cfg.sources.iter().map(|(k, v)| (k.as_str(), v)).collect(),
+        None => cfg
+            .sources
+            .into_iter()
+            .collect(),
     };

-    for (name, source_cfg) in sources {
-        if args.dry_run {
+    if args.dry_run {
+        for (name, source_cfg) in &sources {
             println!("[dry run] Would crawl source: {name} (kind: {:?})", source_cfg.kind);
-            continue;
         }
+        return Ok(());
+    }

-        info!(%name, "starting crawl");
+    // Shared state for parallel crawling.
+    // reqwest::Client is Arc-based internally; RobotsChecker uses tokio::sync::Mutex.
+    let robots = Arc::new(RobotsChecker::new(client.clone()));

-        match crawler::dispatch(name, source_cfg, &client).await {
-            Ok(docs) => {
+    // Crawl all sources concurrently (bounded by JoinSet backpressure).
+    let max_concurrent = 6;
+    let mut join_set = tokio::task::JoinSet::new();
+    let mut pending = std::collections::VecDeque::from(sources);
+
+    // Seed the initial batch
+    while join_set.len() < max_concurrent {
+        let Some((name, source_cfg)) = pending.pop_front() else {
+            break;
+        };
+        let client = client.clone();
+        let robots = Arc::clone(&robots);
+        join_set.spawn(crawl_source(name, source_cfg, client, robots));
+    }
+
+    // Process results as they arrive, spawning more tasks to maintain concurrency.
+    while let Some(result) = join_set.join_next().await {
+        match result {
+            Ok(Ok((name, kind_label, docs))) => {
                 println!(" Crawled {} documents from {name}", docs.len());

-                // Store crawl records in SQLite
                 for doc in &docs {
                     store.upsert_crawl(&CrawlRecord {
                         url: doc.url.to_string(),
-                        source_name: name.to_string(),
+                        source_name: name.clone(),
                         crawled_at: doc.crawled_at,
                         etag: None,
                         title: Some(doc.title.clone()),
                     })?;
                 }

-                // Index in Tantivy
-                let kind_label = source_cfg.kind_label();
-                index.index_documents(name, kind_label, &docs)?;
+                index.index_documents(&name, &kind_label, &docs)?;
                 println!(" Indexed {} documents from {name}", docs.len());
             }
-            Err(e) => {
+            Ok(Err((name, e))) => {
                 eprintln!(" Error crawling {name}: {e:#}");
             }
+            Err(join_err) => {
+                eprintln!(" Task panicked: {join_err}");
+            }
         }
+
+        // Refill the pool
+        if let Some((name, source_cfg)) = pending.pop_front() {
+            let client = client.clone();
+            let robots = Arc::clone(&robots);
+            join_set.spawn(crawl_source(name, source_cfg, client, robots));
+        }
     }

     let total = index.num_docs()?;
···
     Ok(())
 }

+/// Crawl a single source. Returns `(source_name, kind_label, documents)` on
+/// success, or `(source_name, error)` on failure. This signature is designed
+/// for use inside a `JoinSet`.
+async fn crawl_source(
+    name: String,
+    config: SourceConfig,
+    client: reqwest::Client,
+    robots: Arc<RobotsChecker>,
+) -> std::result::Result<(String, String, Vec<CrawledDocument>), (String, color_eyre::eyre::Report)>
+{
+    info!(%name, "starting crawl");
+
+    let kind_label = config.kind_label().to_string();
+
+    match crawler::dispatch(&name, &config, &client, &robots).await {
+        Ok(docs) => Ok((name, kind_label, docs)),
+        Err(e) => Err((name, e)),
+    }
+}
+
 /// Run the search command.
 pub(crate) fn search(args: &SearchArgs) -> Result<()> {
     let query_str = args.query.join(" ");
···
     }

     let index = SearchIndex::open(&index_path)?;
-    let results = query::search(index.inner(), &query_str, args.limit)?;
+    let results = query::search(
+        index.inner(),
+        &query_str,
+        args.limit,
+        &query::SearchFilters::default(),
+    )?;

     if results.is_empty() {
         println!("No results for: {query_str}");
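
Reviewer note: the crawl loop above is a seed-then-refill worker pool over `tokio::task::JoinSet` — fill to `max_concurrent`, then spawn one replacement per completion, so at most six crawls are ever in flight. The same pattern in isolation, with a trivial stand-in task (`work` and its inputs are illustrative, not part of the PR):

use std::collections::VecDeque;
use tokio::task::JoinSet;

// Illustrative stand-in for `crawl_source`.
async fn work(input: u32) -> u32 {
    input * 2
}

#[tokio::main]
async fn main() {
    let max_concurrent = 6;
    let mut pending: VecDeque<u32> = (0..20).collect();
    let mut join_set = JoinSet::new();

    // Seed: fill the pool up to the concurrency bound.
    while join_set.len() < max_concurrent {
        let Some(input) = pending.pop_front() else { break };
        join_set.spawn(work(input));
    }

    // Drain: each completion frees a slot that is refilled immediately,
    // keeping at most `max_concurrent` tasks in flight.
    while let Some(result) = join_set.join_next().await {
        match result {
            Ok(output) => println!("done: {output}"),
            Err(join_err) => eprintln!("task panicked: {join_err}"),
        }
        if let Some(input) = pending.pop_front() {
            join_set.spawn(work(input));
        }
    }
}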
motet_cli/src/serve.rs (+360 −73)
···
-//! Web server — axum API + embedded React frontend.
+//! Web server — axum with server-side rendered HTML via maud.

 use axum::{
     Json, Router,
···
     routing::get,
 };
 use color_eyre::eyre::Result;
-use motet_core::{config, index::SearchIndex, query};
-use rust_embed::Embed;
+use maud::{DOCTYPE, Markup, PreEscaped, html};
+use motet_core::{config, index::SearchIndex, query, store::Store};
 use serde::{Deserialize, Serialize};
 use std::sync::Arc;
+use tokio::sync::Mutex;
 use tower_http::cors::CorsLayer;
 use tracing::info;

 use crate::commands::ServeArgs;

-/// Embedded React frontend assets.
-#[derive(Embed)]
-#[folder = "../motet_web/dist/"]
-struct WebAssets;
-
 /// Shared application state.
+///
+/// `Store` wraps a `rusqlite::Connection` which is not `Sync`, so we
+/// protect it with a `tokio::sync::Mutex`.
 struct AppState {
     index: SearchIndex,
+    store: Mutex<Store>,
 }

+// ---------------------------------------------------------------------------
+// HTML layout
+// ---------------------------------------------------------------------------
+
+/// Shared page shell: doctype, head, nav, footer.
+fn layout(title: &str, content: Markup) -> Markup {
+    html! {
+        (DOCTYPE)
+        html lang="en" {
+            head {
+                meta charset="utf-8";
+                meta name="viewport" content="width=device-width, initial-scale=1";
+                title { (title) " — motet" }
+                style { (CSS) }
+            }
+            body {
+                nav {
+                    a href="/" { "motet" }
+                }
+                main { (content) }
+                footer {
+                    p { "search your corner of the web" }
+                }
+            }
+        }
+    }
+}
+
+/// Minimal embedded CSS — no build step needed.
+const CSS: &str = r#"
+:root {
+  --bg: #fafaf9;
+  --fg: #1c1917;
+  --muted: #78716c;
+  --accent: #b45309;
+  --border: #d6d3d1;
+  --card-bg: #fff;
+  --radius: 6px;
+}
+@media (prefers-color-scheme: dark) {
+  :root {
+    --bg: #1c1917;
+    --fg: #fafaf9;
+    --muted: #a8a29e;
+    --accent: #f59e0b;
+    --border: #44403c;
+    --card-bg: #292524;
+  }
+}
+* { margin: 0; padding: 0; box-sizing: border-box; }
+body {
+  font-family: system-ui, -apple-system, sans-serif;
+  background: var(--bg);
+  color: var(--fg);
+  line-height: 1.6;
+  max-width: 48rem;
+  margin: 0 auto;
+  padding: 1rem 1.5rem;
+}
+nav { padding: 0.75rem 0; border-bottom: 1px solid var(--border); margin-bottom: 1.5rem; }
+nav a { color: var(--accent); text-decoration: none; font-weight: 700; font-size: 1.25rem; }
+footer { margin-top: 3rem; padding-top: 1rem; border-top: 1px solid var(--border); color: var(--muted); font-size: 0.85rem; }
+main { min-height: 60vh; }
+h1 { font-size: 1.5rem; margin-bottom: 1rem; }
+form { display: flex; gap: 0.5rem; margin-bottom: 1.5rem; }
+input[type=search] {
+  flex: 1;
+  padding: 0.5rem 0.75rem;
+  font-size: 1rem;
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  background: var(--card-bg);
+  color: var(--fg);
+}
+button {
+  padding: 0.5rem 1.25rem;
+  font-size: 1rem;
+  background: var(--accent);
+  color: #fff;
+  border: none;
+  border-radius: var(--radius);
+  cursor: pointer;
+}
+button:hover { opacity: 0.9; }
+.results { list-style: none; }
+.result { margin-bottom: 1.25rem; padding-bottom: 1.25rem; border-bottom: 1px solid var(--border); }
+.result:last-child { border-bottom: none; }
+.result-title { font-size: 1.1rem; }
+.result-title a { color: var(--accent); text-decoration: none; }
+.result-title a:hover { text-decoration: underline; }
+.result-url { font-size: 0.8rem; color: var(--muted); word-break: break-all; }
+.result-snippet { margin-top: 0.25rem; color: var(--fg); font-size: 0.95rem; }
+.result-meta { margin-top: 0.25rem; font-size: 0.8rem; color: var(--muted); }
+.empty { color: var(--muted); font-style: italic; }
+.stats-grid { display: grid; grid-template-columns: max-content 1fr; gap: 0.25rem 1rem; }
+.stats-grid dt { color: var(--muted); }
+.stats-grid dd { font-weight: 600; }
+.filters { margin-bottom: 1rem; font-size: 0.9rem; color: var(--muted); }
+.filter-tag {
+  display: inline-block;
+  padding: 0.15rem 0.5rem;
+  background: var(--card-bg);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  font-size: 0.85rem;
+  margin-right: 0.25rem;
+}
+.filter-tag a { color: var(--muted); text-decoration: none; margin-left: 0.25rem; }
+.filter-tag a:hover { color: var(--accent); }
+.filter-link { color: var(--muted); text-decoration: none; }
+.filter-link:hover { color: var(--accent); text-decoration: underline; }
+mark { background: #fef3c7; color: var(--fg); padding: 0.05rem 0.15rem; border-radius: 2px; }
+@media (prefers-color-scheme: dark) {
+  mark { background: #78350f; }
+}
+"#;
+
+// ---------------------------------------------------------------------------
+// HTML pages
+// ---------------------------------------------------------------------------
+
+/// Query parameters for the search page.
+#[derive(Debug, Deserialize)]
+struct SearchParams {
+    #[serde(default)]
+    q: String,
+    #[serde(default = "default_limit")]
+    limit: usize,
+    /// Filter by source name (e.g. `crates_io`).
+    #[serde(default)]
+    source: Option<String>,
+    /// Filter by source kind (e.g. `blog`).
+    #[serde(default)]
+    kind: Option<String>,
+}
+
+const fn default_limit() -> usize {
+    20
+}
+
+impl SearchParams {
+    fn filters(&self) -> query::SearchFilters {
+        query::SearchFilters {
+            source: self.source.clone(),
+            kind: self.kind.clone(),
+        }
+    }
+}
+
+/// GET / — landing page with search bar and recent crawl results.
+async fn page_home(State(state): State<Arc<AppState>>) -> Markup {
+    let total = state.index.num_docs().unwrap_or(0);
+    let recent = state.store.lock().await.recent(20).unwrap_or_default();
+
+    layout("Search", html! {
+        form action="/search" method="get" {
+            input type="search" name="q" placeholder="Search your web…" autofocus;
+            button type="submit" { "Search" }
+        }
+        p.empty { (total) " documents indexed" }
+
+        @if !recent.is_empty() {
+            h2 { "Recently crawled" }
+            ol.results {
+                @for record in &recent {
+                    li.result {
+                        div.result-title {
+                            a href=(record.url) {
+                                @if let Some(title) = &record.title {
+                                    (title)
+                                } @else {
+                                    (record.url)
+                                }
+                            }
+                        }
+                        div.result-url { (record.url) }
+                        div.result-meta {
+                            (record.source_name) " · " (record.crawled_at.format("%b %d, %Y"))
+                        }
+                    }
+                }
+            }
+        }
+    })
+}
+
+/// GET /search?q=...&limit=... — search results page.
+async fn page_search(
+    State(state): State<Arc<AppState>>,
+    Query(params): Query<SearchParams>,
+) -> Markup {
+    if params.q.is_empty() {
+        return layout("Search", html! {
+            form action="/search" method="get" {
+                input type="search" name="q" placeholder="Search your web…" autofocus;
+                button type="submit" { "Search" }
+            }
+        });
+    }
+
+    let filters = params.filters();
+    let results = query::search(state.index.inner(), &params.q, params.limit, &filters)
+        .unwrap_or_default();
+
+    let title = format!("{} — search", &params.q);
+
+    layout(&title, html! {
+        form action="/search" method="get" {
+            input type="search" name="q" value=(params.q) placeholder="Search your web…" autofocus;
+            @if let Some(source) = &params.source {
+                input type="hidden" name="source" value=(source);
+            }
+            @if let Some(kind) = &params.kind {
+                input type="hidden" name="kind" value=(kind);
+            }
+            button type="submit" { "Search" }
+        }
+
+        // Active filters
+        @if params.source.is_some() || params.kind.is_some() {
+            div.filters {
+                span { "Filtering: " }
+                @if let Some(source) = &params.source {
+                    span.filter-tag {
+                        (source) " "
+                        a href={"/search?q=" (params.q)} title="Remove filter" { "×" }
+                    }
+                }
+                @if let Some(kind) = &params.kind {
+                    span.filter-tag {
+                        (kind) " "
+                        a href={"/search?q=" (params.q)
+                            @if let Some(source) = &params.source {
+                                "&source=" (source)
+                            }
+                        } title="Remove filter" { "×" }
+                    }
+                }
+            }
+        }
+
+        @if results.is_empty() {
+            p.empty { "No results found." }
+        } @else {
+            p { (results.len()) " results" }
+            ol.results {
+                @for hit in &results {
+                    li.result {
+                        div.result-title { a href=(hit.url) { (hit.title) } }
+                        div.result-url { (hit.url) }
+                        @if !hit.snippet.is_empty() {
+                            div.result-snippet { (PreEscaped(highlight_terms(&hit.snippet, &params.q))) }
+                        }
+                        div.result-meta {
+                            a.filter-link href={"/search?q=" (params.q) "&source=" (hit.source_name)} {
+                                (hit.source_name)
+                            }
+                            " · "
+                            a.filter-link href={"/search?q=" (params.q) "&kind=" (hit.source_kind)} {
+                                (hit.source_kind)
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    })
+}
+
+// ---------------------------------------------------------------------------
+// Snippet highlighting
+// ---------------------------------------------------------------------------
+
+/// Highlight query terms in a snippet by wrapping them in `<mark>` tags.
+///
+/// The snippet text is HTML-escaped first to prevent XSS, then matched
+/// terms are wrapped. Matching is case-insensitive.
+fn highlight_terms(snippet: &str, query: &str) -> String {
+    // HTML-escape the snippet
+    let escaped = snippet
+        .replace('&', "&amp;")
+        .replace('<', "&lt;")
+        .replace('>', "&gt;");
+
+    // Split query into individual terms, skip very short ones
+    let terms: Vec<&str> = query
+        .split_whitespace()
+        .filter(|t| t.len() >= 2)
+        .collect();
+
+    if terms.is_empty() {
+        return escaped;
+    }
+
+    // Build a case-insensitive regex alternation: term1|term2|...
+    // Terms are regex-escaped so user input cannot inject metacharacters.
+    let pattern = terms
+        .iter()
+        .map(|t| regex::escape(t))
+        .collect::<Vec<_>>()
+        .join("|");
+
+    let Ok(re) = regex::RegexBuilder::new(&pattern)
+        .case_insensitive(true)
+        .build()
+    else {
+        return escaped;
+    };
+
+    re.replace_all(&escaped, "<mark>$0</mark>").into_owned()
+}
+
+// ---------------------------------------------------------------------------
+// JSON API (kept for programmatic access)
+// ---------------------------------------------------------------------------
+
 /// JSON response for search results.
 #[derive(Debug, Serialize)]
 struct SearchResponse {
···
     score: f32,
 }

-/// Query parameters for the search endpoint.
-#[derive(Debug, Deserialize)]
-struct SearchParams {
-    q: String,
-    #[serde(default = "default_limit")]
-    limit: usize,
-}
-
-const fn default_limit() -> usize {
-    20
-}
-
 /// JSON response for stats endpoint.
 #[derive(Debug, Serialize)]
 struct StatsResponse {
···
     kind: String,
 }

-/// Start the web server.
-pub(crate) async fn run(args: ServeArgs) -> Result<()> {
-    let data_dir = config::data_dir()?;
-    let index_path = data_dir.join("index");
-
-    if !index_path.exists() {
-        eprintln!("No index found. Run `motet crawl` first.");
-        return Ok(());
-    }
-
-    let index = SearchIndex::open(&index_path)?;
-    let state = Arc::new(AppState { index });
-
-    let app = Router::new()
-        .route("/api/search", get(api_search))
-        .route("/api/stats", get(api_stats))
-        .fallback(get(static_handler))
-        .layer(CorsLayer::permissive())
-        .with_state(state);
-
-    let addr = format!("{}:{}", args.bind, args.port);
-    let listener = tokio::net::TcpListener::bind(&addr).await?;
-
-    info!(%addr, "motet web UI starting");
-    println!("motet serving at http://{addr}");
-
-    axum::serve(listener, app).await?;
-    Ok(())
-}
-
 /// GET /api/search?q=...&limit=...
 async fn api_search(
     State(state): State<Arc<AppState>>,
     Query(params): Query<SearchParams>,
 ) -> impl IntoResponse {
-    let results = match query::search(state.index.inner(), &params.q, params.limit) {
+    let filters = params.filters();
+    let results = match query::search(state.index.inner(), &params.q, params.limit, &filters) {
         Ok(r) => r,
         Err(e) => {
             return (
···
     })
 }

-/// Serve embedded static files, falling back to index.html for SPA routing.
-async fn static_handler(uri: axum::http::Uri) -> impl IntoResponse {
-    let path = uri.path().trim_start_matches('/');
+// ---------------------------------------------------------------------------
+// Server bootstrap
+// ---------------------------------------------------------------------------

-    // Try the exact path first
-    if let Some(file) = WebAssets::get(path) {
-        let mime = mime_guess::from_path(path).first_or_octet_stream();
-        return (
-            StatusCode::OK,
-            [(axum::http::header::CONTENT_TYPE, mime.as_ref())],
-            file.data.into_owned(),
-        )
-            .into_response();
-    }
+/// Start the web server.
+pub(crate) async fn run(args: ServeArgs) -> Result<()> {
+    let data_dir = config::data_dir()?;
+    let index_path = data_dir.join("index");

-    // Fall back to index.html for SPA client-side routing
-    match WebAssets::get("index.html") {
-        Some(file) => (
-            StatusCode::OK,
-            [(axum::http::header::CONTENT_TYPE, "text/html")],
-            file.data.into_owned(),
-        )
-            .into_response(),
-        None => (StatusCode::NOT_FOUND, "motet web UI not built. Run `npm run build` in motet_web/ first.").into_response(),
+    if !index_path.exists() {
+        eprintln!("No index found. Run `motet crawl` first.");
+        return Ok(());
     }
+
+    let index = SearchIndex::open(&index_path)?;
+    let store = Store::open(&data_dir.join("motet.db"))?;
+    let state = Arc::new(AppState {
+        index,
+        store: Mutex::new(store),
+    });
+
+    let app = Router::new()
+        .route("/", get(page_home))
+        .route("/search", get(page_search))
+        .route("/api/search", get(api_search))
+        .route("/api/stats", get(api_stats))
+        .layer(CorsLayer::permissive())
+        .with_state(state);
+
+    let addr = format!("{}:{}", args.bind, args.port);
+    let listener = tokio::net::TcpListener::bind(&addr).await?;
+
+    info!(%addr, "motet web UI starting");
+    println!("motet serving at http://{addr}");
+
+    axum::serve(listener, app).await?;
+    Ok(())
 }
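
Reviewer note: `highlight_terms` escapes the snippet before inserting the `<mark>` tags, which is what keeps the `PreEscaped` in `page_search` safe. A hypothetical test (not in the PR) pinning that order down:

#[cfg(test)]
mod highlight_tests {
    use super::highlight_terms;

    #[test]
    fn escapes_html_then_marks_terms() {
        // `<b>` from the document is escaped; the query term is wrapped.
        let out = highlight_terms("a <b>rust</b> snippet", "rust");
        assert_eq!(out, "a &lt;b&gt;<mark>rust</mark>&lt;/b&gt; snippet");
    }

    #[test]
    fn short_terms_are_ignored() {
        // Single-character terms are filtered out, so nothing is marked.
        assert_eq!(highlight_terms("a b c", "a"), "a b c");
    }
}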
motet_core/Cargo.toml (+2)
···
 chrono = { workspace = true }
 dirs = { workspace = true }
 eyre = { workspace = true }
+feed-rs = { workspace = true }
 reqwest = { workspace = true }
 rusqlite = { workspace = true }
 scraper = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 tantivy = { workspace = true }
+texting_robots = { workspace = true }
 thiserror = { workspace = true }
 tokio = { workspace = true }
 tracing = { workspace = true }
+156
-18
motet_core/src/config.rs
+156
-18
motet_core/src/config.rs
···
10
10
//! "sources": {
11
11
//! "scout_magazine": {
12
12
//! "kind": "blog",
13
-
//! "url": "https://scoutmagazine.ca/category/food-drink/",
13
+
//! "url": "https://scoutmagazine.ca/food-drink",
14
14
//! "crawl_interval": "3d",
15
-
//! "selector": "article"
15
+
//! "selector": ".Card"
16
16
//! }
17
17
//! }
18
18
//! }
19
19
//! ```
20
20
21
-
use eyre::{Result, WrapErr, bail};
21
+
use eyre::{bail, Result, WrapErr};
22
22
use serde::{Deserialize, Serialize};
23
23
use std::{
24
24
collections::BTreeMap,
···
     /// Source kind label (e.g. "restaurant", "crate", "blog").
     #[serde(default)]
     pub source_kind_label: Option<String>,
+
+    /// For RSS sources: also fetch and index pages linked from article bodies.
+    #[serde(default)]
+    pub follow_links: bool,
 }

 /// The type of crawler to dispatch to.
···
     CratesIo,
     /// Reddit posts.
     Reddit,
+    /// RSS or Atom feed.
+    Rss,
     /// Yelp Fusion API.
     Yelp,
 }
···
         }

         match self.kind {
-            SourceKind::Blog | SourceKind::Reddit => "blog",
+            SourceKind::Blog | SourceKind::Reddit | SourceKind::Rss => "blog",
             SourceKind::CratesIo => "crate",
             SourceKind::Yelp => "restaurant",
         }
···
         return Ok(path);
     }

-    let default = Config {
+    let default = default_config();
+    let json = serde_json::to_string_pretty(&default)?;
+    std::fs::write(&path, json)?;
+
+    Ok(path)
+}
+
+/// Build a convenience RSS [`SourceConfig`].
+fn rss_source(url: &str, interval: &str, label: &str) -> SourceConfig {
+    SourceConfig {
+        kind: SourceKind::Rss,
+        url: Some(url.to_string()),
+        crawl_interval: interval.to_string(),
+        source_kind_label: Some(label.to_string()),
+        selector: None,
+        max_pages: None,
+        location: None,
+        categories: None,
+        api_key_env: None,
+        subreddit: None,
+        search_terms: None,
+        min_downloads: None,
+        follow_links: false,
+    }
+}
+
+/// Construct the default [`Config`] with all built-in sources.
+#[allow(clippy::too_many_lines)]
+fn default_config() -> Config {
+    Config {
         sources: BTreeMap::from([
             (
-                "this_week_in_rust".to_string(),
+                "ap_news".to_string(),
                 SourceConfig {
                     kind: SourceKind::Blog,
-                    url: Some("https://this-week-in-rust.org/".to_string()),
-                    crawl_interval: "7d".to_string(),
-                    selector: Some("article".to_string()),
-                    max_pages: Some(50),
+                    url: Some("https://apnews.com".to_string()),
+                    crawl_interval: "4h".to_string(),
+                    selector: Some(".PagePromo".to_string()),
+                    max_pages: Some(30),
+                    source_kind_label: Some("news".to_string()),
+                    location: None,
+                    categories: None,
+                    api_key_env: None,
+                    subreddit: None,
+                    search_terms: None,
+                    min_downloads: None,
+                    follow_links: false,
+                },
+            ),
+            (
+                "bmann_blog".to_string(),
+                rss_source("https://bmannconsulting.com/blog.xml", "1d", "blog"),
+            ),
+            (
+                "bmann_journal".to_string(),
+                rss_source("https://bmannconsulting.com/journal.xml", "1d", "blog"),
+            ),
+            (
+                "bsky_bmann".to_string(),
+                rss_source(
+                    "https://bsky.app/profile/did:plc:2cxgdrgtsmrbqnjkwyplmp43/rss",
+                    "1d",
+                    "bluesky",
+                ),
+            ),
+            (
+                "bsky_dustyweb".to_string(),
+                rss_source(
+                    "https://bsky.app/profile/did:plc:dyyvywontyeuaegemczcushz/rss",
+                    "1d",
+                    "bluesky",
+                ),
+            ),
+            (
+                "bsky_expede".to_string(),
+                rss_source(
+                    "https://bsky.app/profile/did:plc:oypgij57lv3ytni32p2jqbce/rss",
+                    "1d",
+                    "bluesky",
+                ),
+            ),
+            (
+                "cbc_bc".to_string(),
+                rss_source(
+                    "https://www.cbc.ca/webfeed/rss/rss-canada-britishcolumbia",
+                    "4h",
+                    "news",
+                ),
+            ),
+            (
+                "cbc_top_stories".to_string(),
+                rss_source(
+                    "https://www.cbc.ca/webfeed/rss/rss-topstories",
+                    "4h",
+                    "news",
+                ),
+            ),
+            (
+                "crates_io".to_string(),
+                SourceConfig {
+                    kind: SourceKind::CratesIo,
+                    url: None,
+                    crawl_interval: "1d".to_string(),
+                    selector: None,
+                    max_pages: Some(5),
+                    source_kind_label: Some("crate".to_string()),
+                    location: None,
+                    categories: None,
+                    api_key_env: None,
+                    subreddit: None,
+                    search_terms: None,
+                    min_downloads: Some(100),
+                    follow_links: false,
+                },
+            ),
+            (
+                "ink_and_switch".to_string(),
+                rss_source("https://inkandswitch.com/index.xml", "3d", "research"),
+            ),
+            (
+                "monad_nomad".to_string(),
+                SourceConfig {
+                    kind: SourceKind::Rss,
+                    url: Some("https://notes.brooklynzelenka.com/index.xml".to_string()),
+                    crawl_interval: "1d".to_string(),
                     source_kind_label: Some("blog".to_string()),
+                    follow_links: true,
+                    max_pages: Some(100),
+                    selector: None,
                     location: None,
                     categories: None,
                     api_key_env: None,
···
                 "scout_magazine".to_string(),
                 SourceConfig {
                     kind: SourceKind::Blog,
-                    url: Some("https://scoutmagazine.ca/category/food-drink/".to_string()),
+                    url: Some("https://scoutmagazine.ca/food-drink".to_string()),
                     crawl_interval: "3d".to_string(),
-                    selector: Some("article".to_string()),
+                    selector: Some(".Card".to_string()),
                     max_pages: Some(50),
                     source_kind_label: Some("restaurant".to_string()),
                     location: None,
···
                     subreddit: None,
                     search_terms: None,
                     min_downloads: None,
+                    follow_links: false,
+                },
+            ),
+            (
+                "this_week_in_rust".to_string(),
+                SourceConfig {
+                    kind: SourceKind::Blog,
+                    url: Some("https://this-week-in-rust.org/".to_string()),
+                    crawl_interval: "7d".to_string(),
+                    selector: Some(".post-title".to_string()),
+                    max_pages: Some(50),
+                    source_kind_label: Some("blog".to_string()),
+                    location: None,
+                    categories: None,
+                    api_key_env: None,
+                    subreddit: None,
+                    search_terms: None,
+                    min_downloads: None,
+                    follow_links: false,
                 },
             ),
         ]),
-    };
-
-    let json = serde_json::to_string_pretty(&default)?;
-    std::fs::write(&path, json)?;
-
-    Ok(path)
+    }
 }

 #[cfg(test)]
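Since the defaults are round-tripped through `serde_json` before being written to disk, the new `rss_source` helper is easy to sanity-check. A minimal test sketch that could live in the `#[cfg(test)]` module this hunk ends at, assuming `SourceConfig` derives `Serialize` (it must, given `to_string_pretty` above) and the snake_case `kind` tags shown in the crawler module docs below (`"kind": "rss"`):

```rust
#[test]
fn rss_source_defaults_serialize() {
    // `rss_source` fills every non-RSS field with None/false, so the JSON
    // written to disk stays minimal and predictable.
    let src = rss_source("https://example.com/feed.xml", "1d", "blog");
    let json = serde_json::to_value(&src).expect("serializable");

    assert_eq!(json["kind"], "rss");
    assert_eq!(json["crawl_interval"], "1d");
    assert_eq!(json["source_kind_label"], "blog");
    assert_eq!(json["follow_links"], false);
}
```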
motet_core/src/crawler/blog.rs  (+36 -10)
···
 //! for any blog with a listing page that links to individual articles.

 use super::{CrawledDocument, Crawler};
-use crate::config::SourceConfig;
+use crate::{config::SourceConfig, robots::RobotsChecker};
 use chrono::Utc;
 use eyre::{Result, WrapErr, bail};
 use scraper::{Html, Selector};
+use std::collections::HashSet;
 use tracing::{debug, info, warn};
 use url::Url;

···
         source_name: &str,
         config: &SourceConfig,
         client: &reqwest::Client,
+        robots: &RobotsChecker,
     ) -> Result<Vec<CrawledDocument>> {
         let base_url = config
             .url
···
         let base = Url::parse(base_url)
             .wrap_err_with(|| format!("invalid URL for source {source_name:?}"))?;

+        if !robots.allowed(&base).await {
+            info!(%source_name, %base, "skipping blog index (blocked by robots.txt)");
+            return Ok(vec![]);
+        }
+
         info!(%source_name, %base, "crawling blog index");
+
+        // Respect crawl delay if specified
+        let delay = robots.delay(&base).await;

         let index_html = client
             .get(base.as_str())
···
         let mut documents = Vec::with_capacity(urls.len());

         for url in &urls {
+            if !robots.allowed(url).await {
+                debug!(%url, "skipping article (blocked by robots.txt)");
+                continue;
+            }
+
+            if let Some(d) = delay {
+                tokio::time::sleep(d).await;
+            }
+
             match fetch_article(client, url).await {
                 Ok(doc) => {
                     debug!(url = %doc.url, title = %doc.title, "crawled article");
···
     let link_sel = Selector::parse("a[href]")
         .map_err(|e| eyre::eyre!("failed to parse link selector: {e:?}"))?;

+    let container_count = document.select(&selector).count();
+    debug!(container_sel, container_count, "matched containers");
+
+    let mut seen = HashSet::new();
     let mut urls = Vec::new();

     for element in document.select(&selector) {
-        // Try to find a link inside the article element
         if let Some(link) = element.select(&link_sel).next()
             && let Some(href) = link.value().attr("href")
             && let Ok(resolved) = base.join(href)
+            && resolved.host_str() == base.host_str()
+            && seen.insert(resolved.clone())
         {
-            // Only include links on the same host
-            if resolved.host_str() == base.host_str() {
-                urls.push(resolved);
-            }
+            urls.push(resolved);
         }
     }

-    urls.dedup();
     Ok(urls)
 }

···
     s.split_whitespace().collect::<Vec<_>>().join(" ")
 }

-/// Produce a display snippet of at most `max_len` characters.
+/// Produce a display snippet of at most `max_len` characters, respecting
+/// char boundaries.
 fn make_snippet(body: &str, max_len: usize) -> String {
     if body.len() <= max_len {
         return body.to_string();
     }

-    // Cut at a word boundary
-    let truncated = &body[..max_len];
+    // Walk back to a char boundary
+    let mut end = max_len;
+    while !body.is_char_boundary(end) {
+        end -= 1;
+    }
+
+    let truncated = &body[..end];
     match truncated.rfind(' ') {
         Some(pos) => format!("{}...", &truncated[..pos]),
         None => format!("{truncated}..."),
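The char-boundary walk in `make_snippet` fixes a real panic class: byte-slicing a `&str` at an index that falls inside a multi-byte UTF-8 sequence aborts at runtime. A standalone illustration (not part of the diff, pure std behavior):

```rust
fn main() {
    let body = "héllo wörld"; // 'é' occupies bytes 1..3
    let max_len = 2; // byte 2 lands mid-'é'

    // The old code did `&body[..max_len]`, which panics here:
    // "byte index 2 is not a char boundary".

    // The new approach walks back to the nearest boundary first.
    let mut end = max_len;
    while !body.is_char_boundary(end) {
        end -= 1;
    }
    assert_eq!(&body[..end], "h"); // safe truncation
}
```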
motet_core/src/crawler/crates_io.rs  (+201, new file)
+//! crates.io crawler using the public JSON API.
+//!
+//! Fetches recently-updated crates from the crates.io API and indexes each
+//! crate's name, description, and metadata as a searchable document.
+//!
+//! # API endpoint
+//!
+//! ```text
+//! GET https://crates.io/api/v1/crates?sort=recent-updates&per_page=50
+//! ```
+//!
+//! Pagination uses cursor-based `meta.next_page` URLs.
+//!
+//! # Example config
+//!
+//! ```json
+//! {
+//!     "crates_io": {
+//!         "kind": "crates_io",
+//!         "crawl_interval": "1d",
+//!         "max_pages": 5,
+//!         "min_downloads": 100
+//!     }
+//! }
+//! ```
+
+use super::{CrawledDocument, Crawler};
+use crate::{config::SourceConfig, robots::RobotsChecker};
+use chrono::Utc;
+use eyre::{Result, WrapErr};
+use serde::Deserialize;
+use tracing::{debug, info};
+use url::Url;
+
+const API_BASE: &str = "https://crates.io/api/v1/crates";
+const CRATES_IO_BASE: &str = "https://crates.io/crates";
+const PER_PAGE: u32 = 50;
+
+/// Crawler for the crates.io JSON API.
+#[derive(Debug, Clone, Copy)]
+pub struct CratesIoCrawler;
+
+#[derive(Debug, Deserialize)]
+struct CratesResponse {
+    crates: Vec<CrateEntry>,
+    meta: Meta,
+}
+
+#[derive(Debug, Deserialize)]
+struct CrateEntry {
+    name: String,
+    description: Option<String>,
+    downloads: u64,
+    repository: Option<String>,
+    homepage: Option<String>,
+    max_version: String,
+    #[serde(default)]
+    categories: Option<Vec<String>>,
+}
+
+#[derive(Debug, Deserialize)]
+struct Meta {
+    next_page: Option<String>,
+}
+
+impl Crawler for CratesIoCrawler {
+    async fn crawl(
+        &self,
+        source_name: &str,
+        config: &SourceConfig,
+        client: &reqwest::Client,
+        robots: &RobotsChecker,
+    ) -> Result<Vec<CrawledDocument>> {
+        let max_pages = config.max_pages.unwrap_or(5);
+        let min_downloads = config.min_downloads.unwrap_or(0);
+
+        // Check robots.txt for the API base URL
+        let api_url = Url::parse(API_BASE).wrap_err("invalid API base URL")?;
+        if !robots.allowed(&api_url).await {
+            info!(%source_name, "skipping crates.io (blocked by robots.txt)");
+            return Ok(vec![]);
+        }
+
+        // Respect crawl delay if specified
+        let delay = robots.delay(&api_url).await;
+
+        info!(%source_name, %max_pages, %min_downloads, "crawling crates.io");
+
+        let mut documents = Vec::new();
+        let mut next_url = Some(format!("{API_BASE}?sort=recent-updates&per_page={PER_PAGE}"));
+        let mut pages_fetched: usize = 0;
+
+        while let Some(url) = next_url.take() {
+            if pages_fetched >= max_pages {
+                break;
+            }
+
+            debug!(%url, page = pages_fetched + 1, "fetching crates page");
+
+            let resp: CratesResponse = client
+                .get(&url)
+                .send()
+                .await
+                .wrap_err_with(|| format!("failed to fetch crates.io page {}", pages_fetched + 1))?
+                .json()
+                .await
+                .wrap_err("failed to parse crates.io JSON response")?;
+
+            for krate in &resp.crates {
+                if krate.downloads < min_downloads {
+                    debug!(
+                        name = %krate.name,
+                        downloads = krate.downloads,
+                        %min_downloads,
+                        "skipping crate below download threshold"
+                    );
+                    continue;
+                }
+
+                let crate_url = Url::parse(&format!("{CRATES_IO_BASE}/{}", krate.name))
+                    .wrap_err("failed to build crate URL")?;
+
+                let body = build_crate_body(krate);
+                let snippet = krate
+                    .description
+                    .clone()
+                    .unwrap_or_default();
+
+                let mut tags: Vec<String> = krate
+                    .categories
+                    .clone()
+                    .unwrap_or_default();
+
+                tags.push(format!("v{}", krate.max_version));
+
+                documents.push(CrawledDocument {
+                    url: crate_url,
+                    title: krate.name.clone(),
+                    body,
+                    snippet,
+                    tags,
+                    crawled_at: Utc::now(),
+                });
+            }
+
+            pages_fetched += 1;
+
+            if let Some(d) = delay {
+                tokio::time::sleep(d).await;
+            }
+
+            // crates.io next_page is a relative query string like "?foo=bar"
+            next_url = resp.meta.next_page.map(|page| {
+                if page.starts_with("http") {
+                    page
+                } else {
+                    format!("{API_BASE}{page}")
+                }
+            });
+        }
+
+        info!(
+            %source_name,
+            count = documents.len(),
+            pages = pages_fetched,
+            "finished crawling crates.io"
+        );
+
+        Ok(documents)
+    }
+}
+
+/// Build a rich body string from crate metadata for full-text indexing.
+fn build_crate_body(krate: &CrateEntry) -> String {
+    let mut parts = Vec::new();
+
+    parts.push(krate.name.clone());
+
+    if let Some(desc) = &krate.description {
+        parts.push(desc.clone());
+    }
+
+    parts.push(format!("version {}", krate.max_version));
+    parts.push(format!("{} downloads", krate.downloads));
+
+    if let Some(repo) = &krate.repository {
+        parts.push(format!("repository: {repo}"));
+    }
+
+    if let Some(home) = &krate.homepage {
+        parts.push(format!("homepage: {home}"));
+    }
+
+    if let Some(cats) = &krate.categories
+        && !cats.is_empty()
+    {
+        parts.push(format!("categories: {}", cats.join(", ")));
+    }
+
+    parts.join(" | ")
+}
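One detail worth calling out is the `next_page` handling: crates.io returns a relative query string rather than a full URL, so the crawler re-anchors it on `API_BASE`. A standalone sketch of just that logic (the cursor values below are made up for illustration):

```rust
const API_BASE: &str = "https://crates.io/api/v1/crates";

/// Mirror of the closure in `CratesIoCrawler::crawl` above.
fn resolve_next_page(page: String) -> String {
    if page.starts_with("http") {
        page
    } else {
        format!("{API_BASE}{page}")
    }
}

fn main() {
    // Relative cursor query string, re-anchored on the API base:
    assert_eq!(
        resolve_next_page("?seek=abc123&per_page=50".into()),
        "https://crates.io/api/v1/crates?seek=abc123&per_page=50"
    );
    // Absolute URLs pass through untouched.
    assert_eq!(resolve_next_page("https://crates.io/x".into()), "https://crates.io/x");
}
```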
motet_core/src/crawler/rss.rs  (+383, new file)
+//! RSS/Atom feed crawler.
+//!
+//! Fetches a feed URL, parses it with [`feed_rs`], and converts each entry
+//! into a [`CrawledDocument`]. Works with both RSS 2.0 and Atom feeds.
+//!
+//! # Example config
+//!
+//! ```json
+//! {
+//!     "ink_and_switch": {
+//!         "kind": "rss",
+//!         "url": "https://inkandswitch.com/index.xml",
+//!         "crawl_interval": "1d"
+//!     }
+//! }
+//! ```
+
+use super::{CrawledDocument, Crawler};
+use crate::{config::SourceConfig, robots::RobotsChecker};
+use chrono::Utc;
+use eyre::{Result, WrapErr};
+use scraper::{Html, Selector};
+use std::collections::HashSet;
+use tracing::{debug, info, warn};
+use url::Url;
+
+/// Crawler for RSS and Atom feeds.
+#[derive(Debug, Clone, Copy)]
+pub struct RssCrawler;
+
+impl Crawler for RssCrawler {
+    async fn crawl(
+        &self,
+        source_name: &str,
+        config: &SourceConfig,
+        client: &reqwest::Client,
+        robots: &RobotsChecker,
+    ) -> Result<Vec<CrawledDocument>> {
+        let feed_url = config
+            .url
+            .as_deref()
+            .ok_or_else(|| eyre::eyre!("RSS source {source_name:?} missing `url`"))?;
+
+        let parsed_feed_url = Url::parse(feed_url)
+            .wrap_err_with(|| format!("invalid feed URL for {source_name}"))?;
+
+        if !robots.allowed(&parsed_feed_url).await {
+            info!(%source_name, %feed_url, "skipping RSS feed (blocked by robots.txt)");
+            return Ok(vec![]);
+        }
+
+        info!(%source_name, %feed_url, "fetching RSS feed");
+
+        let body = client
+            .get(feed_url)
+            .send()
+            .await
+            .wrap_err_with(|| format!("failed to fetch feed for {source_name}"))?
+            .bytes()
+            .await
+            .wrap_err_with(|| format!("failed to read feed body for {source_name}"))?;
+
+        let feed = feed_rs::parser::parse(&body[..])
+            .wrap_err_with(|| format!("failed to parse feed for {source_name}"))?;
+
+        let max_pages = config.max_pages.unwrap_or(100);
+        let mut documents = Vec::with_capacity(feed.entries.len().min(max_pages));
+
+        // Collect raw HTML bodies so we can extract links later if follow_links is set.
+        let mut raw_html_bodies: Vec<String> = Vec::new();
+
+        for entry in feed.entries.into_iter().take(max_pages) {
+            let Some(url) = entry_url(&entry) else {
+                warn!(%source_name, id = %entry.id, "entry has no link, skipping");
+                continue;
+            };
+
+            let raw_html = entry_body_raw(&entry);
+            let body = strip_html(&raw_html);
+
+            if config.follow_links {
+                raw_html_bodies.push(raw_html);
+            }
+
+            let title = entry
+                .title
+                .map_or_else(|| url.path().to_string(), |t| t.content);
+            let snippet = make_snippet(&body, 300);
+
+            let tags = entry
+                .categories
+                .iter()
+                .map(|c| c.term.clone())
+                .collect();
+
+            debug!(%url, %title, body_len = body.len(), "parsed feed entry");
+
+            documents.push(CrawledDocument {
+                url,
+                title,
+                body,
+                snippet,
+                tags,
+                crawled_at: Utc::now(),
+            });
+        }
+
+        info!(
+            %source_name,
+            count = documents.len(),
+            "finished parsing RSS feed entries"
+        );
+
+        // Follow links found in entry bodies, if enabled.
+        if config.follow_links {
+            let entry_urls: HashSet<Url> = documents.iter().map(|d| d.url.clone()).collect();
+            let remaining = max_pages.saturating_sub(documents.len());
+
+            let linked = follow_entry_links(
+                source_name,
+                &raw_html_bodies,
+                &entry_urls,
+                &parsed_feed_url,
+                remaining,
+                client,
+                robots,
+            )
+            .await;
+
+            info!(
+                %source_name,
+                followed = linked.len(),
+                "finished following links from feed entries"
+            );
+
+            documents.extend(linked);
+        }
+
+        info!(
+            %source_name,
+            total = documents.len(),
+            "finished crawling RSS source"
+        );
+
+        Ok(documents)
+    }
+}
+
+/// Extract the best URL from a feed entry.
+fn entry_url(entry: &feed_rs::model::Entry) -> Option<Url> {
+    // Prefer the `alternate` link, then any link, then fall back to entry ID as URL
+    let href = entry
+        .links
+        .iter()
+        .find(|l| l.rel.as_deref() == Some("alternate"))
+        .or_else(|| entry.links.first())
+        .map(|l| l.href.as_str())
+        .or_else(|| {
+            // Some feeds use the entry ID as the URL
+            if entry.id.starts_with("http") {
+                Some(entry.id.as_str())
+            } else {
+                None
+            }
+        })?;
+
+    Url::parse(href).ok()
+}
+
+/// Extract the raw HTML body from a feed entry, preferring `content` over
+/// `summary`. Returns the HTML as-is (not stripped) so callers can both
+/// extract links and produce plain text from the same source.
+fn entry_body_raw(entry: &feed_rs::model::Entry) -> String {
+    if let Some(content) = &entry.content
+        && let Some(body) = &content.body
+        && !body.trim().is_empty()
+    {
+        return body.clone();
+    }
+
+    if let Some(summary) = &entry.summary {
+        if !summary.content.trim().is_empty() {
+            return summary.content.clone();
+        }
+    }
+
+    for media in &entry.media {
+        if let Some(desc) = &media.description {
+            if !desc.content.trim().is_empty() {
+                return desc.content.clone();
+            }
+        }
+    }
+
+    String::new()
+}
+
+/// Extract all `<a href>` links from raw HTML, resolving them against `base`.
+fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
+    let doc = Html::parse_fragment(html);
+    let Ok(sel) = Selector::parse("a[href]") else {
+        return vec![];
+    };
+
+    let mut urls = Vec::new();
+    for el in doc.select(&sel) {
+        if let Some(href) = el.value().attr("href")
+            && let Ok(resolved) = base.join(href)
+        {
+            // Only follow http(s) links
+            if resolved.scheme() == "http" || resolved.scheme() == "https" {
+                urls.push(resolved);
+            }
+        }
+    }
+    urls
+}
+
+/// Follow links discovered in RSS entry bodies.
+///
+/// Deduplicates against already-indexed entry URLs, checks `robots.txt`,
+/// and fetches each allowed page up to `max_count`.
+async fn follow_entry_links(
+    source_name: &str,
+    raw_html_bodies: &[String],
+    entry_urls: &HashSet<Url>,
+    feed_url: &Url,
+    max_count: usize,
+    client: &reqwest::Client,
+    robots: &RobotsChecker,
+) -> Vec<CrawledDocument> {
+    if max_count == 0 {
+        return vec![];
+    }
+
+    // Collect all unique links from all entry bodies.
+    let mut seen: HashSet<Url> = entry_urls.clone();
+    let mut link_urls: Vec<Url> = Vec::new();
+
+    for html in raw_html_bodies {
+        for url in extract_links_from_html(html, feed_url) {
+            if seen.insert(url.clone()) {
+                link_urls.push(url);
+            }
+        }
+    }
+
+    debug!(
+        %source_name,
+        unique_links = link_urls.len(),
+        max_count,
+        "found links in feed entry bodies"
+    );
+
+    let delay = robots.delay(feed_url).await;
+    let mut documents = Vec::new();
+
+    for url in link_urls.into_iter().take(max_count) {
+        if !robots.allowed(&url).await {
+            debug!(%url, "skipping followed link (blocked by robots.txt)");
+            continue;
+        }
+
+        if let Some(d) = delay {
+            tokio::time::sleep(d).await;
+        }
+
+        match fetch_linked_page(client, &url).await {
+            Ok(doc) => {
+                debug!(url = %doc.url, title = %doc.title, "crawled followed link");
+                documents.push(doc);
+            }
+            Err(e) => {
+                warn!(%url, error = %e, "failed to fetch followed link, skipping");
+            }
+        }
+    }
+
+    documents
+}
+
+/// Fetch a single linked page and extract its title and body text.
+///
+/// Mirrors [`blog::fetch_article`] but is self-contained in the RSS module
+/// to avoid coupling the two crawlers.
+async fn fetch_linked_page(client: &reqwest::Client, url: &Url) -> Result<CrawledDocument> {
+    let resp = client
+        .get(url.as_str())
+        .send()
+        .await
+        .wrap_err("failed to fetch linked page")?;
+
+    if !resp.status().is_success() {
+        eyre::bail!("HTTP {} for {}", resp.status(), url);
+    }
+
+    let html = resp.text().await.wrap_err("failed to read linked page body")?;
+    let document = Html::parse_document(&html);
+
+    let title = extract_page_title(&document).unwrap_or_else(|| url.path().to_string());
+    let body = extract_page_body(&document);
+    let snippet = make_snippet(&body, 300);
+
+    Ok(CrawledDocument {
+        url: url.clone(),
+        title,
+        body,
+        snippet,
+        tags: vec![],
+        crawled_at: Utc::now(),
+    })
+}
+
+/// Extract the page title from `<title>` or `<h1>`.
+fn extract_page_title(doc: &Html) -> Option<String> {
+    for selector_str in &["title", "h1"] {
+        if let Ok(sel) = Selector::parse(selector_str)
+            && let Some(el) = doc.select(&sel).next()
+        {
+            let text: String = el.text().collect::<String>().trim().to_string();
+            if !text.is_empty() {
+                return Some(text);
+            }
+        }
+    }
+    None
+}
+
+/// Extract visible body text from a page, preferring `<article>` or `<main>`.
+fn extract_page_body(doc: &Html) -> String {
+    for tag in &["article", "main", "[role=main]"] {
+        if let Ok(sel) = Selector::parse(tag)
+            && let Some(el) = doc.select(&sel).next()
+        {
+            let text: String = el.text().collect::<Vec<_>>().join(" ");
+            let cleaned = normalize_whitespace(&text);
+            if cleaned.len() > 100 {
+                return cleaned;
+            }
+        }
+    }
+
+    if let Ok(body_sel) = Selector::parse("body")
+        && let Some(body) = doc.select(&body_sel).next()
+    {
+        let text: String = body.text().collect::<Vec<_>>().join(" ");
+        return normalize_whitespace(&text);
+    }
+
+    String::new()
+}
+
+/// Strip HTML tags from content, returning plain text.
+fn strip_html(html: &str) -> String {
+    let doc = scraper::Html::parse_fragment(html);
+    let text: String = doc.root_element().text().collect::<Vec<_>>().join(" ");
+    normalize_whitespace(&text)
+}
+
+/// Collapse runs of whitespace into single spaces.
+fn normalize_whitespace(s: &str) -> String {
+    s.split_whitespace().collect::<Vec<_>>().join(" ")
+}
+
+/// Produce a display snippet of at most `max_len` characters, respecting
+/// char boundaries.
+fn make_snippet(body: &str, max_len: usize) -> String {
+    if body.len() <= max_len {
+        return body.to_string();
+    }
+
+    // Walk back to a char boundary
+    let mut end = max_len;
+    while !body.is_char_boundary(end) {
+        end -= 1;
+    }
+
+    let truncated = &body[..end];
+    match truncated.rfind(' ') {
+        Some(pos) => format!("{}...", &truncated[..pos]),
+        None => format!("{truncated}..."),
+    }
+}
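Note how `max_pages` acts as a single budget here: feed entries are indexed first, and link-following only gets whatever is left over via `max_pages.saturating_sub(documents.len())`. A small standalone check of that arithmetic:

```rust
/// Mirror of the budget computation in `RssCrawler::crawl` above.
fn follow_budget(max_pages: usize, entries_indexed: usize) -> usize {
    max_pages.saturating_sub(entries_indexed)
}

fn main() {
    // 100-page budget, 40 entries indexed: up to 60 followed links.
    assert_eq!(follow_budget(100, 40), 60);
    // Entries alone can exhaust the budget; saturating_sub avoids underflow.
    assert_eq!(follow_budget(100, 100), 0);
    assert_eq!(follow_budget(50, 100), 0);
}
```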
motet_core/src/crawler.rs  (+17 -4)
···
 //! [`dispatch`] selects the right crawler for a given [`SourceKind`].

 pub mod blog;
+pub mod crates_io;
+pub mod rss;

-use crate::config::{SourceConfig, SourceKind};
+use crate::{
+    config::{SourceConfig, SourceKind},
+    robots::RobotsChecker,
+};
 use chrono::{DateTime, Utc};
 use eyre::Result;
 use url::Url;
···
 /// Trait for all crawlers. Each source type implements this.
 pub trait Crawler: Send + Sync {
     /// Crawl the source and return extracted documents.
+    ///
+    /// The [`RobotsChecker`] should be consulted before fetching any URL.
     fn crawl(
         &self,
         source_name: &str,
         config: &SourceConfig,
         client: &reqwest::Client,
+        robots: &RobotsChecker,
     ) -> impl std::future::Future<Output = Result<Vec<CrawledDocument>>> + Send;
 }

···
     source_name: &str,
     config: &SourceConfig,
     client: &reqwest::Client,
+    robots: &RobotsChecker,
 ) -> Result<Vec<CrawledDocument>> {
     match config.kind {
         SourceKind::Blog => {
             let crawler = blog::BlogCrawler;
-            crawler.crawl(source_name, config, client).await
+            crawler.crawl(source_name, config, client, robots).await
         }
         SourceKind::CratesIo => {
-            tracing::warn!("crates.io crawler not yet implemented");
-            Ok(vec![])
+            let crawler = crates_io::CratesIoCrawler;
+            crawler.crawl(source_name, config, client, robots).await
         }
         SourceKind::Reddit => {
             tracing::warn!("reddit crawler not yet implemented");
             Ok(vec![])
         }
+        SourceKind::Rss => {
+            let crawler = rss::RssCrawler;
+            crawler.crawl(source_name, config, client, robots).await
+        }
         SourceKind::Yelp => {
             tracing::warn!("yelp crawler not yet implemented");
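With the robots parameter threaded through, a caller now constructs one checker per crawl run and hands it to `dispatch`. A hedged usage sketch (the surrounding scheduler code is not part of this diff, so `crawl_one` is an illustrative name):

```rust
use motet_core::{config::SourceConfig, crawler, robots::RobotsChecker};

// Illustrative only: crawl a single configured source.
async fn crawl_one(name: &str, config: &SourceConfig) -> eyre::Result<()> {
    let client = reqwest::Client::new();
    // One checker per run; its per-origin robots.txt cache is shared
    // across every URL the dispatched crawler touches.
    let robots = RobotsChecker::new(client.clone());

    let docs = crawler::dispatch(name, config, &client, &robots).await?;
    println!("{name}: {} documents crawled", docs.len());
    Ok(())
}
```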
motet_core/src/lib.rs  (+1)
motet_core/src/query.rs  (+112 -18)
···
 //! Query engine — search the Tantivy index and return ranked results.
+//!
+//! Search results are ranked by BM25 text relevance multiplied by a freshness
+//! boost: an exponential decay based on how recently the document was crawled.

 use crate::schema::{self, field};
 use eyre::{Result, WrapErr};
 use tantivy::{
-    Index, collector::TopDocs, query::QueryParser,
-    schema::Value,
+    collector::TopDocs,
+    query::{BooleanQuery, Occur, QueryParser, TermQuery},
+    schema::{Facet, IndexRecordOption, Value},
+    DocId, Index, Score, SegmentReader, Term,
 };
 use tracing::debug;

···
     pub score: f32,
 }

-/// Search the index for the given query string.
+/// Optional filters to narrow search results by facet.
+#[derive(Debug, Clone, Default)]
+pub struct SearchFilters {
+    /// Filter to a specific source name (e.g. `crates_io`).
+    pub source: Option<String>,
+    /// Filter to a specific source kind (e.g. `blog`, `rss`).
+    pub kind: Option<String>,
+}
+
+/// Search the index for the given query string, optionally filtered by facets.
 ///
 /// Searches across `title` and `body` fields using Tantivy's default
-/// query parser with BM25 scoring.
+/// query parser with BM25 scoring. When filters are provided, results
+/// are intersected with `TermQuery` on the corresponding facet field.
 ///
 /// # Errors
 ///
···
     index: &Index,
     query_str: &str,
     limit: usize,
+    filters: &SearchFilters,
 ) -> Result<Vec<SearchResult>> {
     let schema = schema::build_schema();

···
     let body_field = schema.get_field(field::BODY).expect("body field");
     let url_field = schema.get_field(field::URL).expect("url field");
     let snippet_field = schema.get_field(field::SNIPPET).expect("snippet field");
-    let source_kind_field = schema.get_field(field::SOURCE_KIND).expect("source_kind field");
-    let source_name_field = schema.get_field(field::SOURCE_NAME).expect("source_name field");
+    let source_kind_field = schema
+        .get_field(field::SOURCE_KIND)
+        .expect("source_kind field");
+    let source_name_field = schema
+        .get_field(field::SOURCE_NAME)
+        .expect("source_name field");

     let query_parser = QueryParser::for_index(index, vec![title_field, body_field]);

-    let query = query_parser
+    let text_query = query_parser
         .parse_query(query_str)
         .wrap_err_with(|| format!("failed to parse query: {query_str:?}"))?;

-    let reader = index
-        .reader()
-        .wrap_err("failed to get index reader")?;
+    // Build facet filter clauses
+    let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = vec![(Occur::Must, text_query)];
+
+    if let Some(source) = &filters.source {
+        let facet =
+            Facet::from_text(&format!("/{source}")).wrap_err("invalid source name facet")?;
+        let term = Term::from_facet(source_name_field, &facet);
+        clauses.push((
+            Occur::Must,
+            Box::new(TermQuery::new(term, IndexRecordOption::Basic)),
+        ));
+    }
+
+    if let Some(kind) = &filters.kind {
+        let facet = Facet::from_text(&format!("/{kind}")).wrap_err("invalid source kind facet")?;
+        let term = Term::from_facet(source_kind_field, &facet);
+        clauses.push((
+            Occur::Must,
+            Box::new(TermQuery::new(term, IndexRecordOption::Basic)),
+        ));
+    }
+
+    let query = BooleanQuery::from(clauses);
+
+    let reader = index.reader().wrap_err("failed to get index reader")?;

     let searcher = reader.searcher();

+    let now_secs = chrono::Utc::now().timestamp();
+
+    let collector =
+        TopDocs::with_limit(limit).tweak_score(move |segment_reader: &SegmentReader| {
+            let date_reader = segment_reader
+                .fast_fields()
+                .date(field::CRAWLED_AT)
+                .ok()
+                .map(|col| col.first_or_default_col(Default::default()));
+
+            move |doc: DocId, original_score: Score| -> Score {
+                let Some(reader) = &date_reader else {
+                    return original_score;
+                };
+                let crawled_ts = reader.get_val(doc).into_timestamp_secs();
+                let age_days = (now_secs - crawled_ts) as f32 / 86_400.0;
+                original_score * freshness_multiplier(age_days)
+            }
+        });
+
     let top_docs = searcher
-        .search(&query, &TopDocs::with_limit(limit))
+        .search(&query, &collector)
         .wrap_err("search failed")?;

-    debug!(
-        query = query_str,
-        hits = top_docs.len(),
-        "search completed"
-    );
+    debug!(query = query_str, hits = top_docs.len(), "search completed");

     let mut results = Vec::with_capacity(top_docs.len());

···
     let source_kind = doc
         .get_first(source_kind_field)
         .and_then(|v| v.as_facet())
-        .map_or_else(String::new, std::string::ToString::to_string);
+        .map(|f| facet_label(f))
+        .unwrap_or_default();

     let source_name = doc
         .get_first(source_name_field)
         .and_then(|v| v.as_facet())
-        .map_or_else(String::new, std::string::ToString::to_string);
+        .map(|f| facet_label(f))
+        .unwrap_or_default();

     results.push(SearchResult {
         url,
···
     Ok(results)
 }
+
+/// Extract the last path component from a Tantivy facet (e.g. `/blog` → `blog`).
+fn facet_label(facet: &tantivy::schema::Facet) -> String {
+    facet.to_path().last().unwrap_or(&"").to_string()
+}
+
+/// Exponential freshness decay.
+///
+/// Returns a multiplier in `[FRESHNESS_FLOOR, FRESHNESS_FLOOR + 1.0]`:
+///
+/// - `age_days = 0` → `1.0 + FRESHNESS_FLOOR` (maximum boost)
+/// - `age_days = HALF_LIFE` → `0.5 + FRESHNESS_FLOOR`
+/// - `age_days → ∞` → `FRESHNESS_FLOOR` (floor, old content isn't buried)
+///
+/// ```text
+/// multiplier
+/// 1.5 ┤ ·
+///     │  ··
+/// 1.0 ┤    ····
+///     │        ········
+/// 0.5 ┤                ·····················
+///     └──────────┬──────────┬──────────────→ days
+///                7          30
+/// ```
+fn freshness_multiplier(age_days: f32) -> f32 {
+    /// Half-life in days: score contribution halves every 7 days.
+    const HALF_LIFE: f32 = 7.0;
+
+    /// Floor multiplier so old content isn't completely buried.
+    const FRESHNESS_FLOOR: f32 = 0.5;
+
+    let decay = (0.5_f32).powf(age_days.max(0.0) / HALF_LIFE);
+    decay + FRESHNESS_FLOOR
+}
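A quick numeric check of the decay curve documented above (standalone copy of `freshness_multiplier`, exact powers of one half so the assertions are safe):

```rust
fn freshness_multiplier(age_days: f32) -> f32 {
    const HALF_LIFE: f32 = 7.0;
    const FRESHNESS_FLOOR: f32 = 0.5;
    (0.5_f32).powf(age_days.max(0.0) / HALF_LIFE) + FRESHNESS_FLOOR
}

fn main() {
    assert!((freshness_multiplier(0.0) - 1.5).abs() < 1e-6); // fresh: full boost
    assert!((freshness_multiplier(7.0) - 1.0).abs() < 1e-6); // one half-life
    assert!((freshness_multiplier(14.0) - 0.75).abs() < 1e-6); // two half-lives
    assert!(freshness_multiplier(365.0) > 0.5); // never drops below the floor
    assert!((freshness_multiplier(-1.0) - 1.5).abs() < 1e-6); // clock skew clamps to "fresh"
}
```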
motet_core/src/robots.rs  (+223, new file)
+//! `robots.txt` compliance checker.
+//!
+//! Fetches and caches `robots.txt` per origin, then checks individual URLs
+//! against the parsed rules. Uses [`texting_robots`] for RFC 9309 compliant
+//! parsing.
+//!
+//! The checker is safe to share across concurrent crawl tasks via `Arc`.
+//! Internally it uses a [`tokio::sync::Mutex`] so all methods take `&self`.
+//!
+//! # Usage
+//!
+//! ```rust,no_run
+//! # async fn example() -> eyre::Result<()> {
+//! use motet_core::robots::RobotsChecker;
+//!
+//! let client = reqwest::Client::new();
+//! let checker = RobotsChecker::new(client);
+//!
+//! let url = url::Url::parse("https://example.com/page")?;
+//! if checker.allowed(&url).await {
+//!     // safe to crawl
+//! }
+//! # Ok(())
+//! # }
+//! ```
+
+use std::{collections::HashMap, time::Duration};
+use texting_robots::Robot;
+use tokio::sync::Mutex;
+use tracing::{debug, warn};
+use url::Url;
+
+/// User-agent string used when checking `robots.txt` rules.
+///
+/// This should match the token portion of the HTTP `User-Agent` header.
+const ROBOT_NAME: &str = "motet";
+
+/// Maximum size of a `robots.txt` file we're willing to parse
+/// (512 KiB, just above the 500 KiB limit Google recommends).
+const MAX_ROBOTS_SIZE: usize = 512 * 1024;
+
+/// Per-origin cache entry for parsed `robots.txt`.
+enum CacheEntry {
+    /// Successfully fetched and parsed.
+    Parsed(Robot),
+    /// Fetch failed or returned 4xx — assume no restrictions.
+    Permissive,
+}
+
+/// Fetches, caches, and checks `robots.txt` for crawled URLs.
+///
+/// Create one per crawl run and share across tasks via `Arc`. The internal
+/// cache is protected by a [`tokio::sync::Mutex`] so all methods take `&self`.
+///
+/// `reqwest::Client` is cheap to clone (it wraps an `Arc` internally), so
+/// the checker owns its client rather than borrowing — this avoids lifetime
+/// parameters and makes `Arc<RobotsChecker>` trivial.
+pub struct RobotsChecker {
+    client: reqwest::Client,
+    cache: Mutex<HashMap<String, CacheEntry>>,
+}
+
+impl std::fmt::Debug for RobotsChecker {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RobotsChecker").finish_non_exhaustive()
+    }
+}
+
+impl RobotsChecker {
+    /// Create a new checker backed by the given HTTP client.
+    ///
+    /// The client is cloned (cheap — `reqwest::Client` is `Arc`-based).
+    #[must_use]
+    pub fn new(client: reqwest::Client) -> Self {
+        Self {
+            client,
+            cache: Mutex::new(HashMap::new()),
+        }
+    }
+
+    /// Check whether the given URL is allowed by `robots.txt`.
+    ///
+    /// On first call for a given origin this fetches and parses the
+    /// `robots.txt` file. Subsequent calls for the same origin use the
+    /// cached result.
+    ///
+    /// Returns `true` (allow) when:
+    /// - The URL's origin has no `robots.txt` (404, other 4xx)
+    /// - The `robots.txt` explicitly allows the path
+    /// - The fetch failed (err on the side of crawling)
+    pub async fn allowed(&self, url: &Url) -> bool {
+        self.ensure_cached(url).await;
+
+        let cache = self.cache.lock().await;
+        let origin = origin_key(url);
+
+        match cache.get(&origin) {
+            Some(CacheEntry::Parsed(robot)) => {
+                let allowed = robot.allowed(url.as_str());
+                if !allowed {
+                    debug!(url = %url, "blocked by robots.txt");
+                }
+                allowed
+            }
+            Some(CacheEntry::Permissive) | None => true,
+        }
+    }
+
+    /// Return the crawl delay for the given URL's origin, if specified
+    /// in `robots.txt`.
+    pub async fn delay(&self, url: &Url) -> Option<Duration> {
+        self.ensure_cached(url).await;
+
+        let cache = self.cache.lock().await;
+        let origin = origin_key(url);
+
+        match cache.get(&origin) {
+            Some(CacheEntry::Parsed(robot)) => {
+                robot.delay.map(|secs| Duration::from_secs_f64(f64::from(secs)))
+            }
+            _ => None,
+        }
+    }
+
+    /// Ensure the cache has an entry for this URL's origin.
+    async fn ensure_cached(&self, url: &Url) {
+        let origin = origin_key(url);
+
+        // Quick check: already cached?
+        {
+            let cache = self.cache.lock().await;
+            if cache.contains_key(&origin) {
+                return;
+            }
+        }
+
+        // Fetch outside the lock to avoid holding it during I/O.
+        let entry = fetch_and_parse(&self.client, url).await;
+
+        let mut cache = self.cache.lock().await;
+        // Another task may have raced us; only insert if still missing.
+        cache.entry(origin).or_insert(entry);
+    }
+}
+
+/// Build a cache key from a URL's origin (scheme + host + port).
+fn origin_key(url: &Url) -> String {
+    url.origin().ascii_serialization()
+}
+
+/// Fetch `robots.txt` for the given URL's origin and parse it.
+async fn fetch_and_parse(client: &reqwest::Client, url: &Url) -> CacheEntry {
+    let robots_url = match texting_robots::get_robots_url(url.as_str()) {
+        Ok(u) => u,
+        Err(e) => {
+            warn!(url = %url, error = %e, "could not derive robots.txt URL, assuming permissive");
+            return CacheEntry::Permissive;
+        }
+    };
+
+    let resp = match client.get(&robots_url).send().await {
+        Ok(r) => r,
+        Err(e) => {
+            warn!(%robots_url, error = %e, "failed to fetch robots.txt, assuming permissive");
+            return CacheEntry::Permissive;
+        }
+    };
+
+    let status = resp.status();
+
+    // 4xx (except 429): no restrictions
+    if status.is_client_error() && status.as_u16() != 429 {
+        debug!(%robots_url, %status, "robots.txt not found, assuming permissive");
+        return CacheEntry::Permissive;
+    }
+
+    // 429 or 5xx: robots.txt is unavailable; warn, but err on the side of crawling
+    if status.as_u16() == 429 || status.is_server_error() {
+        warn!(
+            %robots_url,
+            %status,
+            "robots.txt unavailable (server error/rate limit), assuming permissive"
+        );
+        return CacheEntry::Permissive;
+    }
+
+    // 2xx: parse the body
+    let bytes = match resp.bytes().await {
+        Ok(b) => b,
+        Err(e) => {
+            warn!(%robots_url, error = %e, "failed to read robots.txt body, assuming permissive");
+            return CacheEntry::Permissive;
+        }
+    };
+
+    // Limit size per Google's recommendation
+    let bytes: &[u8] = if bytes.len() > MAX_ROBOTS_SIZE {
+        debug!(
+            %robots_url,
+            size = bytes.len(),
+            "robots.txt exceeds size limit, truncating"
+        );
+        bytes.get(..MAX_ROBOTS_SIZE).unwrap_or(&bytes)
+    } else {
+        &bytes
+    };
+
+    match Robot::new(ROBOT_NAME, bytes) {
+        Ok(robot) => {
+            debug!(
+                %robots_url,
+                delay = ?robot.delay,
+                sitemaps = robot.sitemaps.len(),
+                "parsed robots.txt"
+            );
+            CacheEntry::Parsed(robot)
+        }
+        Err(e) => {
+            warn!(%robots_url, error = %e, "failed to parse robots.txt, assuming permissive");
+            CacheEntry::Permissive
+        }
+    }
+}
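Putting `allowed` and `delay` together, the crawl loops in this diff all follow the same shape. A condensed sketch (the `urls` slice stands in for whatever links a crawler has discovered):

```rust
use motet_core::robots::RobotsChecker;
use url::Url;

async fn polite_fetch(client: &reqwest::Client, urls: &[Url]) -> eyre::Result<()> {
    let robots = RobotsChecker::new(client.clone());

    for url in urls {
        // Disallowed paths are skipped, never fetched.
        if !robots.allowed(url).await {
            continue;
        }
        // Honour Crawl-delay for the origin, when robots.txt specifies one.
        if let Some(d) = robots.delay(url).await {
            tokio::time::sleep(d).await;
        }
        let _body = client.get(url.as_str()).send().await?.text().await?;
    }
    Ok(())
}
```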
motet_core/src/schema.rs  (+3 -3)
···
     // Stored only — not indexed (display purposes)
     builder.add_text_field(field::SNIPPET, STORED);

-    // Facets for filtering
-    builder.add_facet_field(field::SOURCE_KIND, FacetOptions::default());
-    builder.add_facet_field(field::SOURCE_NAME, FacetOptions::default());
+    // Facets for filtering (stored so we can display them in results)
+    builder.add_facet_field(field::SOURCE_KIND, FacetOptions::default().set_stored());
+    builder.add_facet_field(field::SOURCE_NAME, FacetOptions::default().set_stored());

     // Date for freshness
     builder.add_date_field(field::CRAWLED_AT, DateOptions::default() | STORED | FAST);
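The `set_stored()` change is what lets `query.rs` above read the facets back out of retrieved documents: an indexed-only facet is still filterable via `TermQuery`, but `doc.get_first(...)` finds nothing for it at retrieval time. A minimal builder illustration of the distinction (field names here are placeholders, not the crate's):

```rust
use tantivy::schema::{FacetOptions, Schema};

fn main() {
    let mut builder = Schema::builder();
    // Indexed only: usable in a TermQuery filter, invisible at retrieval time.
    builder.add_facet_field("filter_only", FacetOptions::default());
    // Indexed *and* stored: filterable and readable from the stored document.
    builder.add_facet_field("kind", FacetOptions::default().set_stored());
    let _schema = builder.build();
}
```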
motet_core/src/store.rs  (+48 -5)
···
         }
     }

+    /// Return the most recently crawled records, ordered by crawl time descending.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the database query fails.
+    pub fn recent(&self, limit: usize) -> Result<Vec<CrawlRecord>> {
+        let mut stmt = self.conn.prepare(
+            "SELECT url, source_name, crawled_at, etag, title
+             FROM crawl_records
+             ORDER BY crawled_at DESC
+             LIMIT ?1",
+        )?;
+
+        let rows = stmt
+            .query_map(params![limit], |row| {
+                let crawled_at_str: String = row.get(2)?;
+                Ok((
+                    row.get::<_, String>(0)?,
+                    row.get::<_, String>(1)?,
+                    crawled_at_str,
+                    row.get::<_, Option<String>>(3)?,
+                    row.get::<_, Option<String>>(4)?,
+                ))
+            })?
+            .collect::<std::result::Result<Vec<_>, _>>()
+            .wrap_err("failed to read recent crawl records")?;
+
+        let mut records = Vec::with_capacity(rows.len());
+        for (url, source_name, crawled_at_str, etag, title) in rows {
+            let crawled_at = crawled_at_str
+                .parse::<DateTime<Utc>>()
+                .wrap_err("invalid datetime in database")?;
+
+            records.push(CrawlRecord {
+                url,
+                source_name,
+                crawled_at,
+                etag,
+                title,
+            });
+        }
+
+        Ok(records)
+    }
+
     /// Count total crawled documents, optionally filtered by source.
     ///
     /// # Errors
···
                 params![name],
                 |row| row.get(0),
             )?,
-            None => self.conn.query_row(
-                "SELECT COUNT(*) FROM crawl_records",
-                [],
-                |row| row.get(0),
-            )?,
+            None => self
+                .conn
+                .query_row("SELECT COUNT(*) FROM crawl_records", [], |row| row.get(0))?,
         };

         Ok(count)
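`recent` gives a CLI or UI layer a crawl log without touching the search index. A hedged usage sketch, assuming the type that owns `self.conn` here is named `Store` (its name is not visible in this diff) and that `CrawlRecord`'s fields are as constructed above:

```rust
// Illustrative only — `Store` stands in for the type that owns `self.conn`.
fn print_crawl_log(store: &Store) -> eyre::Result<()> {
    for rec in store.recent(10)? {
        println!(
            "{}  {:20}  {}",
            rec.crawled_at.to_rfc3339(),
            rec.source_name,
            // Fall back to the URL when a record has no stored title.
            rec.title.as_deref().unwrap_or(&rec.url),
        );
    }
    Ok(())
}
```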