this repo has no description

WIP: Flesh out #1

Open — opened by expede.wtf targeting main from flesh-out
Labels

None yet.

Assignees

None yet.

Participants 1
AT URI
at://did:plc:oypgij57lv3ytni32p2jqbce/sh.tangled.repo.pull/3mg54j2askk22
+1891 -520
Diff #0
+260 -366
Cargo.lock
··· 218 218 ] 219 219 220 220 [[package]] 221 - name = "block-buffer" 222 - version = "0.10.4" 221 + name = "bstr" 222 + version = "0.2.17" 223 223 source = "registry+https://github.com/rust-lang/crates.io-index" 224 - checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" 224 + checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" 225 225 dependencies = [ 226 - "generic-array", 226 + "lazy_static", 227 + "memchr", 228 + "regex-automata 0.1.10", 227 229 ] 228 230 229 231 [[package]] ··· 267 269 version = "1.0.4" 268 270 source = "registry+https://github.com/rust-lang/crates.io-index" 269 271 checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" 272 + 273 + [[package]] 274 + name = "cfg_aliases" 275 + version = "0.2.1" 276 + source = "registry+https://github.com/rust-lang/crates.io-index" 277 + checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" 270 278 271 279 [[package]] 272 280 name = "chrono" ··· 356 364 checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" 357 365 358 366 [[package]] 359 - name = "core-foundation" 360 - version = "0.9.4" 361 - source = "registry+https://github.com/rust-lang/crates.io-index" 362 - checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" 363 - dependencies = [ 364 - "core-foundation-sys", 365 - "libc", 366 - ] 367 - 368 - [[package]] 369 - name = "core-foundation" 370 - version = "0.10.1" 371 - source = "registry+https://github.com/rust-lang/crates.io-index" 372 - checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" 373 - dependencies = [ 374 - "core-foundation-sys", 375 - "libc", 376 - ] 377 - 378 - [[package]] 379 367 name = "core-foundation-sys" 380 368 version = "0.8.7" 381 369 source = "registry+https://github.com/rust-lang/crates.io-index" 382 370 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" 383 371 384 372 [[package]] 385 - 
name = "cpufeatures" 386 - version = "0.2.17" 387 - source = "registry+https://github.com/rust-lang/crates.io-index" 388 - checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" 389 - dependencies = [ 390 - "libc", 391 - ] 392 - 393 - [[package]] 394 373 name = "crc32fast" 395 374 version = "1.5.0" 396 375 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 440 419 checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" 441 420 442 421 [[package]] 443 - name = "crypto-common" 444 - version = "0.1.7" 445 - source = "registry+https://github.com/rust-lang/crates.io-index" 446 - checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" 447 - dependencies = [ 448 - "generic-array", 449 - "typenum", 450 - ] 451 - 452 - [[package]] 453 422 name = "cssparser" 454 423 version = "0.34.0" 455 424 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 491 460 "proc-macro2", 492 461 "quote", 493 462 "syn", 494 - ] 495 - 496 - [[package]] 497 - name = "digest" 498 - version = "0.10.7" 499 - source = "registry+https://github.com/rust-lang/crates.io-index" 500 - checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" 501 - dependencies = [ 502 - "block-buffer", 503 - "crypto-common", 504 463 ] 505 464 506 465 [[package]] ··· 628 587 checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" 629 588 630 589 [[package]] 590 + name = "feed-rs" 591 + version = "2.3.1" 592 + source = "registry+https://github.com/rust-lang/crates.io-index" 593 + checksum = "e4c0591d23efd0d595099af69a31863ac1823046b1b021e3b06ba3aae7e00991" 594 + dependencies = [ 595 + "chrono", 596 + "mediatype", 597 + "quick-xml", 598 + "regex", 599 + "serde", 600 + "serde_json", 601 + "siphasher", 602 + "url", 603 + "uuid", 604 + ] 605 + 606 + [[package]] 631 607 name = "find-msvc-tools" 632 608 version = "0.1.9" 633 609 source = "registry+https://github.com/rust-lang/crates.io-index" 
··· 644 620 version = "0.1.5" 645 621 source = "registry+https://github.com/rust-lang/crates.io-index" 646 622 checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" 647 - 648 - [[package]] 649 - name = "foreign-types" 650 - version = "0.3.2" 651 - source = "registry+https://github.com/rust-lang/crates.io-index" 652 - checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" 653 - dependencies = [ 654 - "foreign-types-shared", 655 - ] 656 - 657 - [[package]] 658 - name = "foreign-types-shared" 659 - version = "0.1.1" 660 - source = "registry+https://github.com/rust-lang/crates.io-index" 661 - checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" 662 623 663 624 [[package]] 664 625 name = "form_urlencoded" ··· 705 666 checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" 706 667 707 668 [[package]] 708 - name = "futures-sink" 709 - version = "0.3.32" 710 - source = "registry+https://github.com/rust-lang/crates.io-index" 711 - checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" 712 - 713 - [[package]] 714 669 name = "futures-task" 715 670 version = "0.3.32" 716 671 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 738 693 ] 739 694 740 695 [[package]] 741 - name = "generic-array" 742 - version = "0.14.7" 743 - source = "registry+https://github.com/rust-lang/crates.io-index" 744 - checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" 745 - dependencies = [ 746 - "typenum", 747 - "version_check", 748 - ] 749 - 750 - [[package]] 751 696 name = "getopts" 752 697 version = "0.2.24" 753 698 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 763 708 checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" 764 709 dependencies = [ 765 710 "cfg-if", 711 + "js-sys", 766 712 "libc", 767 713 "wasi", 714 + "wasm-bindgen", 768 715 ] 769 716 770 717 [[package]] ··· 774 721 checksum = 
"899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" 775 722 dependencies = [ 776 723 "cfg-if", 724 + "js-sys", 777 725 "libc", 778 726 "r-efi", 779 727 "wasip2", 728 + "wasm-bindgen", 780 729 ] 781 730 782 731 [[package]] ··· 799 748 checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" 800 749 801 750 [[package]] 802 - name = "h2" 803 - version = "0.4.13" 804 - source = "registry+https://github.com/rust-lang/crates.io-index" 805 - checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" 806 - dependencies = [ 807 - "atomic-waker", 808 - "bytes", 809 - "fnv", 810 - "futures-core", 811 - "futures-sink", 812 - "http", 813 - "indexmap", 814 - "slab", 815 - "tokio", 816 - "tokio-util", 817 - "tracing", 818 - ] 819 - 820 - [[package]] 821 751 name = "hashbrown" 822 752 version = "0.15.5" 823 753 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 928 858 "bytes", 929 859 "futures-channel", 930 860 "futures-core", 931 - "h2", 932 861 "http", 933 862 "http-body", 934 863 "httparse", ··· 955 884 "tokio", 956 885 "tokio-rustls", 957 886 "tower-service", 958 - ] 959 - 960 - [[package]] 961 - name = "hyper-tls" 962 - version = "0.6.0" 963 - source = "registry+https://github.com/rust-lang/crates.io-index" 964 - checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" 965 - dependencies = [ 966 - "bytes", 967 - "http-body-util", 968 - "hyper", 969 - "hyper-util", 970 - "native-tls", 971 - "tokio", 972 - "tokio-native-tls", 973 - "tower-service", 887 + "webpki-roots", 974 888 ] 975 889 976 890 [[package]] ··· 991 905 "percent-encoding", 992 906 "pin-project-lite", 993 907 "socket2", 994 - "system-configuration", 995 908 "tokio", 996 909 "tower-service", 997 910 "tracing", 998 - "windows-registry", 999 911 ] 1000 912 1001 913 [[package]] ··· 1310 1222 ] 1311 1223 1312 1224 [[package]] 1225 + name = "lru-slab" 1226 + version = "0.1.2" 1227 + source = 
"registry+https://github.com/rust-lang/crates.io-index" 1228 + checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" 1229 + 1230 + [[package]] 1313 1231 name = "lz4_flex" 1314 1232 version = "0.11.5" 1315 1233 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1352 1270 source = "registry+https://github.com/rust-lang/crates.io-index" 1353 1271 checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" 1354 1272 dependencies = [ 1355 - "regex-automata", 1273 + "regex-automata 0.4.14", 1356 1274 ] 1357 1275 1358 1276 [[package]] ··· 1362 1280 checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" 1363 1281 1364 1282 [[package]] 1283 + name = "maud" 1284 + version = "0.27.0" 1285 + source = "registry+https://github.com/rust-lang/crates.io-index" 1286 + checksum = "8156733e27020ea5c684db5beac5d1d611e1272ab17901a49466294b84fc217e" 1287 + dependencies = [ 1288 + "axum-core", 1289 + "http", 1290 + "itoa", 1291 + "maud_macros", 1292 + ] 1293 + 1294 + [[package]] 1295 + name = "maud_macros" 1296 + version = "0.27.0" 1297 + source = "registry+https://github.com/rust-lang/crates.io-index" 1298 + checksum = "7261b00f3952f617899bc012e3dbd56e4f0110a038175929fa5d18e5a19913ca" 1299 + dependencies = [ 1300 + "proc-macro2", 1301 + "proc-macro2-diagnostics", 1302 + "quote", 1303 + "syn", 1304 + ] 1305 + 1306 + [[package]] 1365 1307 name = "measure_time" 1366 1308 version = "0.8.3" 1367 1309 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1372 1314 ] 1373 1315 1374 1316 [[package]] 1317 + name = "mediatype" 1318 + version = "0.19.20" 1319 + source = "registry+https://github.com/rust-lang/crates.io-index" 1320 + checksum = "33746aadcb41349ec291e7f2f0a3aa6834d1d7c58066fb4b01f68efc4c4b7631" 1321 + dependencies = [ 1322 + "serde", 1323 + ] 1324 + 1325 + [[package]] 1375 1326 name = "memchr" 1376 1327 version = "2.8.0" 1377 1328 source = 
"registry+https://github.com/rust-lang/crates.io-index" ··· 1436 1387 "clap", 1437 1388 "color-eyre", 1438 1389 "dirs", 1390 + "maud", 1439 1391 "mime_guess", 1440 1392 "motet_core", 1393 + "regex", 1441 1394 "reqwest", 1442 - "rust-embed", 1443 1395 "serde", 1444 1396 "serde_json", 1445 1397 "tokio", ··· 1455 1407 "chrono", 1456 1408 "dirs", 1457 1409 "eyre", 1410 + "feed-rs", 1458 1411 "reqwest", 1459 1412 "rusqlite", 1460 1413 "scraper", 1461 1414 "serde", 1462 1415 "serde_json", 1463 1416 "tantivy", 1417 + "texting_robots", 1464 1418 "thiserror 2.0.18", 1465 1419 "tokio", 1466 1420 "tracing", ··· 1474 1428 checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" 1475 1429 1476 1430 [[package]] 1477 - name = "native-tls" 1478 - version = "0.2.18" 1479 - source = "registry+https://github.com/rust-lang/crates.io-index" 1480 - checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" 1481 - dependencies = [ 1482 - "libc", 1483 - "log", 1484 - "openssl", 1485 - "openssl-probe", 1486 - "openssl-sys", 1487 - "schannel", 1488 - "security-framework", 1489 - "security-framework-sys", 1490 - "tempfile", 1491 - ] 1492 - 1493 - [[package]] 1494 1431 name = "new_debug_unreachable" 1495 1432 version = "1.0.6" 1496 1433 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1569 1506 checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" 1570 1507 1571 1508 [[package]] 1572 - name = "openssl" 1573 - version = "0.10.75" 1574 - source = "registry+https://github.com/rust-lang/crates.io-index" 1575 - checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" 1576 - dependencies = [ 1577 - "bitflags", 1578 - "cfg-if", 1579 - "foreign-types", 1580 - "libc", 1581 - "once_cell", 1582 - "openssl-macros", 1583 - "openssl-sys", 1584 - ] 1585 - 1586 - [[package]] 1587 - name = "openssl-macros" 1588 - version = "0.1.1" 1589 - source = "registry+https://github.com/rust-lang/crates.io-index" 1590 
- checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" 1591 - dependencies = [ 1592 - "proc-macro2", 1593 - "quote", 1594 - "syn", 1595 - ] 1596 - 1597 - [[package]] 1598 - name = "openssl-probe" 1599 - version = "0.2.1" 1600 - source = "registry+https://github.com/rust-lang/crates.io-index" 1601 - checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" 1602 - 1603 - [[package]] 1604 - name = "openssl-sys" 1605 - version = "0.9.111" 1606 - source = "registry+https://github.com/rust-lang/crates.io-index" 1607 - checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" 1608 - dependencies = [ 1609 - "cc", 1610 - "libc", 1611 - "pkg-config", 1612 - "vcpkg", 1613 - ] 1614 - 1615 - [[package]] 1616 1509 name = "option-ext" 1617 1510 version = "0.2.0" 1618 1511 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1689 1582 checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" 1690 1583 dependencies = [ 1691 1584 "phf_shared", 1692 - "rand", 1585 + "rand 0.8.5", 1693 1586 ] 1694 1587 1695 1588 [[package]] ··· 1782 1675 ] 1783 1676 1784 1677 [[package]] 1678 + name = "proc-macro2-diagnostics" 1679 + version = "0.10.1" 1680 + source = "registry+https://github.com/rust-lang/crates.io-index" 1681 + checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" 1682 + dependencies = [ 1683 + "proc-macro2", 1684 + "quote", 1685 + "syn", 1686 + "version_check", 1687 + ] 1688 + 1689 + [[package]] 1690 + name = "quick-xml" 1691 + version = "0.37.5" 1692 + source = "registry+https://github.com/rust-lang/crates.io-index" 1693 + checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" 1694 + dependencies = [ 1695 + "encoding_rs", 1696 + "memchr", 1697 + ] 1698 + 1699 + [[package]] 1700 + name = "quinn" 1701 + version = "0.11.9" 1702 + source = "registry+https://github.com/rust-lang/crates.io-index" 1703 + checksum = 
"b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" 1704 + dependencies = [ 1705 + "bytes", 1706 + "cfg_aliases", 1707 + "pin-project-lite", 1708 + "quinn-proto", 1709 + "quinn-udp", 1710 + "rustc-hash 2.1.1", 1711 + "rustls", 1712 + "socket2", 1713 + "thiserror 2.0.18", 1714 + "tokio", 1715 + "tracing", 1716 + "web-time", 1717 + ] 1718 + 1719 + [[package]] 1720 + name = "quinn-proto" 1721 + version = "0.11.13" 1722 + source = "registry+https://github.com/rust-lang/crates.io-index" 1723 + checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" 1724 + dependencies = [ 1725 + "bytes", 1726 + "getrandom 0.3.4", 1727 + "lru-slab", 1728 + "rand 0.9.2", 1729 + "ring", 1730 + "rustc-hash 2.1.1", 1731 + "rustls", 1732 + "rustls-pki-types", 1733 + "slab", 1734 + "thiserror 2.0.18", 1735 + "tinyvec", 1736 + "tracing", 1737 + "web-time", 1738 + ] 1739 + 1740 + [[package]] 1741 + name = "quinn-udp" 1742 + version = "0.5.14" 1743 + source = "registry+https://github.com/rust-lang/crates.io-index" 1744 + checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" 1745 + dependencies = [ 1746 + "cfg_aliases", 1747 + "libc", 1748 + "once_cell", 1749 + "socket2", 1750 + "tracing", 1751 + "windows-sys 0.60.2", 1752 + ] 1753 + 1754 + [[package]] 1785 1755 name = "quote" 1786 1756 version = "1.0.44" 1787 1757 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1803 1773 checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 1804 1774 dependencies = [ 1805 1775 "libc", 1806 - "rand_chacha", 1807 - "rand_core", 1776 + "rand_chacha 0.3.1", 1777 + "rand_core 0.6.4", 1778 + ] 1779 + 1780 + [[package]] 1781 + name = "rand" 1782 + version = "0.9.2" 1783 + source = "registry+https://github.com/rust-lang/crates.io-index" 1784 + checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" 1785 + dependencies = [ 1786 + "rand_chacha 0.9.0", 1787 + "rand_core 0.9.5", 1808 1788 ] 1809 
1789 1810 1790 [[package]] ··· 1814 1794 checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 1815 1795 dependencies = [ 1816 1796 "ppv-lite86", 1817 - "rand_core", 1797 + "rand_core 0.6.4", 1798 + ] 1799 + 1800 + [[package]] 1801 + name = "rand_chacha" 1802 + version = "0.9.0" 1803 + source = "registry+https://github.com/rust-lang/crates.io-index" 1804 + checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" 1805 + dependencies = [ 1806 + "ppv-lite86", 1807 + "rand_core 0.9.5", 1818 1808 ] 1819 1809 1820 1810 [[package]] ··· 1827 1817 ] 1828 1818 1829 1819 [[package]] 1820 + name = "rand_core" 1821 + version = "0.9.5" 1822 + source = "registry+https://github.com/rust-lang/crates.io-index" 1823 + checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" 1824 + dependencies = [ 1825 + "getrandom 0.3.4", 1826 + ] 1827 + 1828 + [[package]] 1830 1829 name = "rand_distr" 1831 1830 version = "0.4.3" 1832 1831 source = "registry+https://github.com/rust-lang/crates.io-index" 1833 1832 checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" 1834 1833 dependencies = [ 1835 1834 "num-traits", 1836 - "rand", 1835 + "rand 0.8.5", 1837 1836 ] 1838 1837 1839 1838 [[package]] ··· 1884 1883 dependencies = [ 1885 1884 "aho-corasick", 1886 1885 "memchr", 1887 - "regex-automata", 1886 + "regex-automata 0.4.14", 1888 1887 "regex-syntax", 1889 1888 ] 1890 1889 1891 1890 [[package]] 1892 1891 name = "regex-automata" 1892 + version = "0.1.10" 1893 + source = "registry+https://github.com/rust-lang/crates.io-index" 1894 + checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" 1895 + 1896 + [[package]] 1897 + name = "regex-automata" 1893 1898 version = "0.4.14" 1894 1899 source = "registry+https://github.com/rust-lang/crates.io-index" 1895 1900 checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" ··· 1913 1918 dependencies = [ 1914 1919 "base64", 1915 1920 
"bytes", 1916 - "encoding_rs", 1917 1921 "futures-core", 1918 - "h2", 1919 1922 "http", 1920 1923 "http-body", 1921 1924 "http-body-util", 1922 1925 "hyper", 1923 1926 "hyper-rustls", 1924 - "hyper-tls", 1925 1927 "hyper-util", 1926 1928 "js-sys", 1927 1929 "log", 1928 - "mime", 1929 - "native-tls", 1930 1930 "percent-encoding", 1931 1931 "pin-project-lite", 1932 + "quinn", 1933 + "rustls", 1932 1934 "rustls-pki-types", 1933 1935 "serde", 1934 1936 "serde_json", 1935 1937 "serde_urlencoded", 1936 1938 "sync_wrapper", 1937 1939 "tokio", 1938 - "tokio-native-tls", 1940 + "tokio-rustls", 1939 1941 "tower", 1940 1942 "tower-http", 1941 1943 "tower-service", ··· 1943 1945 "wasm-bindgen", 1944 1946 "wasm-bindgen-futures", 1945 1947 "web-sys", 1948 + "webpki-roots", 1946 1949 ] 1947 1950 1948 1951 [[package]] ··· 1974 1977 ] 1975 1978 1976 1979 [[package]] 1977 - name = "rust-embed" 1978 - version = "8.11.0" 1979 - source = "registry+https://github.com/rust-lang/crates.io-index" 1980 - checksum = "04113cb9355a377d83f06ef1f0a45b8ab8cd7d8b1288160717d66df5c7988d27" 1981 - dependencies = [ 1982 - "axum", 1983 - "rust-embed-impl", 1984 - "rust-embed-utils", 1985 - "walkdir", 1986 - ] 1987 - 1988 - [[package]] 1989 - name = "rust-embed-impl" 1990 - version = "8.11.0" 1991 - source = "registry+https://github.com/rust-lang/crates.io-index" 1992 - checksum = "da0902e4c7c8e997159ab384e6d0fc91c221375f6894346ae107f47dd0f3ccaa" 1993 - dependencies = [ 1994 - "proc-macro2", 1995 - "quote", 1996 - "rust-embed-utils", 1997 - "syn", 1998 - "walkdir", 1999 - ] 2000 - 2001 - [[package]] 2002 - name = "rust-embed-utils" 2003 - version = "8.11.0" 2004 - source = "registry+https://github.com/rust-lang/crates.io-index" 2005 - checksum = "5bcdef0be6fe7f6fa333b1073c949729274b05f123a0ad7efcb8efd878e5c3b1" 2006 - dependencies = [ 2007 - "sha2", 2008 - "walkdir", 2009 - ] 2010 - 2011 - [[package]] 2012 1980 name = "rust-stemmers" 2013 1981 version = "1.2.0" 2014 1982 source = 
"registry+https://github.com/rust-lang/crates.io-index" ··· 2031 1999 checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 2032 2000 2033 2001 [[package]] 2002 + name = "rustc-hash" 2003 + version = "2.1.1" 2004 + source = "registry+https://github.com/rust-lang/crates.io-index" 2005 + checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" 2006 + 2007 + [[package]] 2034 2008 name = "rustix" 2035 2009 version = "0.38.44" 2036 2010 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2063 2037 checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" 2064 2038 dependencies = [ 2065 2039 "once_cell", 2040 + "ring", 2066 2041 "rustls-pki-types", 2067 2042 "rustls-webpki", 2068 2043 "subtle", ··· 2075 2050 source = "registry+https://github.com/rust-lang/crates.io-index" 2076 2051 checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" 2077 2052 dependencies = [ 2053 + "web-time", 2078 2054 "zeroize", 2079 2055 ] 2080 2056 ··· 2102 2078 checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" 2103 2079 2104 2080 [[package]] 2105 - name = "same-file" 2106 - version = "1.0.6" 2107 - source = "registry+https://github.com/rust-lang/crates.io-index" 2108 - checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 2109 - dependencies = [ 2110 - "winapi-util", 2111 - ] 2112 - 2113 - [[package]] 2114 - name = "schannel" 2115 - version = "0.1.28" 2116 - source = "registry+https://github.com/rust-lang/crates.io-index" 2117 - checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" 2118 - dependencies = [ 2119 - "windows-sys 0.61.2", 2120 - ] 2121 - 2122 - [[package]] 2123 2081 name = "scopeguard" 2124 2082 version = "1.2.0" 2125 2083 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2141 2099 ] 2142 2100 2143 2101 [[package]] 2144 - name = "security-framework" 2145 - version = "3.7.0" 2146 - source 
= "registry+https://github.com/rust-lang/crates.io-index" 2147 - checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" 2148 - dependencies = [ 2149 - "bitflags", 2150 - "core-foundation 0.10.1", 2151 - "core-foundation-sys", 2152 - "libc", 2153 - "security-framework-sys", 2154 - ] 2155 - 2156 - [[package]] 2157 - name = "security-framework-sys" 2158 - version = "2.17.0" 2159 - source = "registry+https://github.com/rust-lang/crates.io-index" 2160 - checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" 2161 - dependencies = [ 2162 - "core-foundation-sys", 2163 - "libc", 2164 - ] 2165 - 2166 - [[package]] 2167 2102 name = "selectors" 2168 2103 version = "0.26.0" 2169 2104 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2261 2196 checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" 2262 2197 dependencies = [ 2263 2198 "stable_deref_trait", 2264 - ] 2265 - 2266 - [[package]] 2267 - name = "sha2" 2268 - version = "0.10.9" 2269 - source = "registry+https://github.com/rust-lang/crates.io-index" 2270 - checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" 2271 - dependencies = [ 2272 - "cfg-if", 2273 - "cpufeatures", 2274 - "digest", 2275 2199 ] 2276 2200 2277 2201 [[package]] ··· 2411 2335 ] 2412 2336 2413 2337 [[package]] 2414 - name = "system-configuration" 2415 - version = "0.7.0" 2416 - source = "registry+https://github.com/rust-lang/crates.io-index" 2417 - checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" 2418 - dependencies = [ 2419 - "bitflags", 2420 - "core-foundation 0.9.4", 2421 - "system-configuration-sys", 2422 - ] 2423 - 2424 - [[package]] 2425 - name = "system-configuration-sys" 2426 - version = "0.6.0" 2427 - source = "registry+https://github.com/rust-lang/crates.io-index" 2428 - checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" 2429 - dependencies = [ 2430 - "core-foundation-sys", 2431 
- "libc", 2432 - ] 2433 - 2434 - [[package]] 2435 2338 name = "tantivy" 2436 2339 version = "0.22.1" 2437 2340 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2463 2366 "rayon", 2464 2367 "regex", 2465 2368 "rust-stemmers", 2466 - "rustc-hash", 2369 + "rustc-hash 1.1.0", 2467 2370 "serde", 2468 2371 "serde_json", 2469 2372 "sketches-ddsketch", ··· 2597 2500 ] 2598 2501 2599 2502 [[package]] 2503 + name = "texting_robots" 2504 + version = "0.2.2" 2505 + source = "registry+https://github.com/rust-lang/crates.io-index" 2506 + checksum = "5b82a718a28dda2e67ad6e0464597b58eae39e2e4d0451e03d1028d71e81bb4a" 2507 + dependencies = [ 2508 + "anyhow", 2509 + "bstr", 2510 + "lazy_static", 2511 + "nom", 2512 + "percent-encoding", 2513 + "regex", 2514 + "thiserror 1.0.69", 2515 + "url", 2516 + ] 2517 + 2518 + [[package]] 2600 2519 name = "thiserror" 2601 2520 version = "1.0.69" 2602 2521 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2687 2606 ] 2688 2607 2689 2608 [[package]] 2609 + name = "tinyvec" 2610 + version = "1.10.0" 2611 + source = "registry+https://github.com/rust-lang/crates.io-index" 2612 + checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" 2613 + dependencies = [ 2614 + "tinyvec_macros", 2615 + ] 2616 + 2617 + [[package]] 2618 + name = "tinyvec_macros" 2619 + version = "0.1.1" 2620 + source = "registry+https://github.com/rust-lang/crates.io-index" 2621 + checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" 2622 + 2623 + [[package]] 2690 2624 name = "tokio" 2691 2625 version = "1.49.0" 2692 2626 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2715 2649 ] 2716 2650 2717 2651 [[package]] 2718 - name = "tokio-native-tls" 2719 - version = "0.3.1" 2720 - source = "registry+https://github.com/rust-lang/crates.io-index" 2721 - checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" 2722 - dependencies = [ 2723 - "native-tls", 2724 - 
"tokio", 2725 - ] 2726 - 2727 - [[package]] 2728 2652 name = "tokio-rustls" 2729 2653 version = "0.26.4" 2730 2654 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2735 2659 ] 2736 2660 2737 2661 [[package]] 2738 - name = "tokio-util" 2739 - version = "0.7.18" 2740 - source = "registry+https://github.com/rust-lang/crates.io-index" 2741 - checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" 2742 - dependencies = [ 2743 - "bytes", 2744 - "futures-core", 2745 - "futures-sink", 2746 - "pin-project-lite", 2747 - "tokio", 2748 - ] 2749 - 2750 - [[package]] 2751 2662 name = "tower" 2752 2663 version = "0.5.3" 2753 2664 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2856 2767 "matchers", 2857 2768 "nu-ansi-term", 2858 2769 "once_cell", 2859 - "regex-automata", 2770 + "regex-automata 0.4.14", 2860 2771 "sharded-slab", 2861 2772 "smallvec", 2862 2773 "thread_local", ··· 2872 2783 checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" 2873 2784 2874 2785 [[package]] 2875 - name = "typenum" 2876 - version = "1.19.0" 2877 - source = "registry+https://github.com/rust-lang/crates.io-index" 2878 - checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" 2879 - 2880 - [[package]] 2881 2786 name = "unicase" 2882 2787 version = "2.9.0" 2883 2788 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2973 2878 version = "0.9.5" 2974 2879 source = "registry+https://github.com/rust-lang/crates.io-index" 2975 2880 checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 2976 - 2977 - [[package]] 2978 - name = "walkdir" 2979 - version = "2.5.0" 2980 - source = "registry+https://github.com/rust-lang/crates.io-index" 2981 - checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" 2982 - dependencies = [ 2983 - "same-file", 2984 - "winapi-util", 2985 - ] 2986 2881 2987 2882 [[package]] 2988 2883 name = "want" ··· 3121 3016 ] 
3122 3017 3123 3018 [[package]] 3019 + name = "web-time" 3020 + version = "1.1.0" 3021 + source = "registry+https://github.com/rust-lang/crates.io-index" 3022 + checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" 3023 + dependencies = [ 3024 + "js-sys", 3025 + "wasm-bindgen", 3026 + ] 3027 + 3028 + [[package]] 3029 + name = "webpki-roots" 3030 + version = "1.0.6" 3031 + source = "registry+https://github.com/rust-lang/crates.io-index" 3032 + checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" 3033 + dependencies = [ 3034 + "rustls-pki-types", 3035 + ] 3036 + 3037 + [[package]] 3124 3038 name = "winapi" 3125 3039 version = "0.3.9" 3126 3040 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 3135 3049 version = "0.4.0" 3136 3050 source = "registry+https://github.com/rust-lang/crates.io-index" 3137 3051 checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 3138 - 3139 - [[package]] 3140 - name = "winapi-util" 3141 - version = "0.1.11" 3142 - source = "registry+https://github.com/rust-lang/crates.io-index" 3143 - checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" 3144 - dependencies = [ 3145 - "windows-sys 0.61.2", 3146 - ] 3147 3052 3148 3053 [[package]] 3149 3054 name = "winapi-x86_64-pc-windows-gnu" ··· 3191 3096 version = "0.2.1" 3192 3097 source = "registry+https://github.com/rust-lang/crates.io-index" 3193 3098 checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" 3194 - 3195 - [[package]] 3196 - name = "windows-registry" 3197 - version = "0.6.1" 3198 - source = "registry+https://github.com/rust-lang/crates.io-index" 3199 - checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" 3200 - dependencies = [ 3201 - "windows-link", 3202 - "windows-result", 3203 - "windows-strings", 3204 - ] 3205 3099 3206 3100 [[package]] 3207 3101 name = "windows-result"
+5 -2
Cargo.toml
··· 25 25 color-eyre = "0.6" 26 26 dirs = "6.0" 27 27 eyre = "0.6" 28 + feed-rs = "2.3" 29 + maud = { version = "0.27", features = ["axum"] } 28 30 mime_guess = "2.0" 29 - reqwest = { version = "0.12", features = ["json"] } 30 - rust-embed = { version = "8.0", features = ["axum"] } 31 + regex = "1.11" 32 + reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } 31 33 rusqlite = { version = "0.33", features = ["bundled"] } 32 34 scraper = "0.22" 33 35 serde = { version = "1.0", features = ["derive"] } 34 36 serde_json = "1.0" 35 37 tantivy = "0.22" 38 + texting_robots = "0.2" 36 39 thiserror = "2.0" 37 40 tokio = { version = "1.0", features = ["full"] } 38 41 tower-http = { version = "0.6", features = ["cors"] }
+2 -1
motet_cli/Cargo.toml
··· 21 21 clap = { workspace = true } 22 22 color-eyre = { workspace = true } 23 23 dirs = { workspace = true } 24 + maud = { workspace = true } 24 25 mime_guess = { workspace = true } 26 + regex = { workspace = true } 25 27 motet_core = { workspace = true } 26 28 reqwest = { workspace = true } 27 - rust-embed = { workspace = true } 28 29 serde = { workspace = true } 29 30 serde_json = { workspace = true } 30 31 tokio = { workspace = true }
+82 -20
motet_cli/src/commands.rs
··· 3 3 use clap::Args; 4 4 use color_eyre::eyre::{Result, WrapErr}; 5 5 use motet_core::{ 6 - config, 7 - crawler, 6 + config::{self, SourceConfig}, 7 + crawler::{self, CrawledDocument}, 8 8 index::SearchIndex, 9 9 query, 10 + robots::RobotsChecker, 10 11 store::{CrawlRecord, Store}, 11 12 }; 12 - use std::path::PathBuf; 13 + use std::{path::PathBuf, sync::Arc}; 13 14 use tracing::info; 14 15 15 16 /// Arguments for `motet crawl`. ··· 75 76 let store = Store::open(&data_dir.join("motet.db"))?; 76 77 77 78 let client = reqwest::Client::builder() 78 - .user_agent("motet/0.1 (personal search indexer)") 79 + // Some CDNs (Akamai/CBC, Cloudflare/Bluesky) reject non-browser UAs. 80 + // Since motet is a personal indexer for user-chosen sources, we use a 81 + // real browser UA to avoid false-positive bot blocking. 82 + .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") 83 + .timeout(std::time::Duration::from_secs(30)) 84 + .connect_timeout(std::time::Duration::from_secs(10)) 85 + 79 86 .build() 80 87 .wrap_err("failed to build HTTP client")?; 81 88 82 - let sources: Vec<_> = match &args.source { 89 + let sources: Vec<(String, SourceConfig)> = match &args.source { 83 90 Some(name) => { 84 91 if let Some(source_cfg) = cfg.sources.get(name) { 85 - vec![(name.as_str(), source_cfg)] 92 + vec![(name.clone(), source_cfg.clone())] 86 93 } else { 87 94 eprintln!("Source {name:?} not found in config. 
Available sources:"); 88 95 for key in cfg.sources.keys() { ··· 91 98 return Ok(()); 92 99 } 93 100 } 94 - None => cfg.sources.iter().map(|(k, v)| (k.as_str(), v)).collect(), 101 + None => cfg 102 + .sources 103 + .into_iter() 104 + .collect(), 95 105 }; 96 106 97 - for (name, source_cfg) in sources { 98 - if args.dry_run { 107 + if args.dry_run { 108 + for (name, source_cfg) in &sources { 99 109 println!("[dry run] Would crawl source: {name} (kind: {:?})", source_cfg.kind); 100 - continue; 101 110 } 111 + return Ok(()); 112 + } 102 113 103 - info!(%name, "starting crawl"); 114 + // Shared state for parallel crawling. 115 + // reqwest::Client is Arc-based internally; RobotsChecker uses tokio::sync::Mutex. 116 + let robots = Arc::new(RobotsChecker::new(client.clone())); 104 117 105 - match crawler::dispatch(name, source_cfg, &client).await { 106 - Ok(docs) => { 118 + // Crawl all sources concurrently (bounded by JoinSet backpressure). 119 + let max_concurrent = 6; 120 + let mut join_set = tokio::task::JoinSet::new(); 121 + let mut pending = std::collections::VecDeque::from(sources); 122 + 123 + // Seed the initial batch 124 + while join_set.len() < max_concurrent { 125 + let Some((name, source_cfg)) = pending.pop_front() else { 126 + break; 127 + }; 128 + let client = client.clone(); 129 + let robots = Arc::clone(&robots); 130 + join_set.spawn(crawl_source(name, source_cfg, client, robots)); 131 + } 132 + 133 + // Process results as they arrive, spawning more tasks to maintain concurrency. 
134 + while let Some(result) = join_set.join_next().await { 135 + match result { 136 + Ok(Ok((name, kind_label, docs))) => { 107 137 println!(" Crawled {} documents from {name}", docs.len()); 108 138 109 - // Store crawl records in SQLite 110 139 for doc in &docs { 111 140 store.upsert_crawl(&CrawlRecord { 112 141 url: doc.url.to_string(), 113 - source_name: name.to_string(), 142 + source_name: name.clone(), 114 143 crawled_at: doc.crawled_at, 115 144 etag: None, 116 145 title: Some(doc.title.clone()), 117 146 })?; 118 147 } 119 148 120 - // Index in Tantivy 121 - let kind_label = source_cfg.kind_label(); 122 - index.index_documents(name, kind_label, &docs)?; 149 + index.index_documents(&name, &kind_label, &docs)?; 123 150 println!(" Indexed {} documents from {name}", docs.len()); 124 151 } 125 - Err(e) => { 152 + Ok(Err((name, e))) => { 126 153 eprintln!(" Error crawling {name}: {e:#}"); 154 + } 155 + Err(join_err) => { 156 + eprintln!(" Task panicked: {join_err}"); 127 157 } 128 158 } 159 + 160 + // Refill the pool 161 + if let Some((name, source_cfg)) = pending.pop_front() { 162 + let client = client.clone(); 163 + let robots = Arc::clone(&robots); 164 + join_set.spawn(crawl_source(name, source_cfg, client, robots)); 165 + } 129 166 } 130 167 131 168 let total = index.num_docs()?; ··· 134 171 Ok(()) 135 172 } 136 173 174 + /// Crawl a single source. Returns `(source_name, kind_label, documents)` on 175 + /// success, or `(source_name, error)` on failure. This signature is designed 176 + /// for use inside a `JoinSet`. 
177 + async fn crawl_source( 178 + name: String, 179 + config: SourceConfig, 180 + client: reqwest::Client, 181 + robots: Arc<RobotsChecker>, 182 + ) -> std::result::Result<(String, String, Vec<CrawledDocument>), (String, color_eyre::eyre::Report)> 183 + { 184 + info!(%name, "starting crawl"); 185 + 186 + let kind_label = config.kind_label().to_string(); 187 + 188 + match crawler::dispatch(&name, &config, &client, &robots).await { 189 + Ok(docs) => Ok((name, kind_label, docs)), 190 + Err(e) => Err((name, e)), 191 + } 192 + } 193 + 137 194 /// Run the search command. 138 195 pub(crate) fn search(args: &SearchArgs) -> Result<()> { 139 196 let query_str = args.query.join(" "); ··· 151 208 } 152 209 153 210 let index = SearchIndex::open(&index_path)?; 154 - let results = query::search(index.inner(), &query_str, args.limit)?; 211 + let results = query::search( 212 + index.inner(), 213 + &query_str, 214 + args.limit, 215 + &query::SearchFilters::default(), 216 + )?; 155 217 156 218 if results.is_empty() { 157 219 println!("No results for: {query_str}");
+360 -73
motet_cli/src/serve.rs
··· 1 - //! Web server — axum API + embedded React frontend. 1 + //! Web server — axum with server-side rendered HTML via maud. 2 2 3 3 use axum::{ 4 4 Json, Router, ··· 8 8 routing::get, 9 9 }; 10 10 use color_eyre::eyre::Result; 11 - use motet_core::{config, index::SearchIndex, query}; 12 - use rust_embed::Embed; 11 + use maud::{DOCTYPE, Markup, PreEscaped, html}; 12 + use motet_core::{config, index::SearchIndex, query, store::Store}; 13 13 use serde::{Deserialize, Serialize}; 14 14 use std::sync::Arc; 15 + use tokio::sync::Mutex; 15 16 use tower_http::cors::CorsLayer; 16 17 use tracing::info; 17 18 18 19 use crate::commands::ServeArgs; 19 20 20 - /// Embedded React frontend assets. 21 - #[derive(Embed)] 22 - #[folder = "../motet_web/dist/"] 23 - struct WebAssets; 24 - 25 21 /// Shared application state. 22 + /// 23 + /// `Store` wraps a `rusqlite::Connection` which is not `Sync`, so we 24 + /// protect it with a `tokio::sync::Mutex`. 26 25 struct AppState { 27 26 index: SearchIndex, 27 + store: Mutex<Store>, 28 28 } 29 29 30 + // --------------------------------------------------------------------------- 31 + // HTML layout 32 + // --------------------------------------------------------------------------- 33 + 34 + /// Shared page shell: doctype, head, nav, footer. 35 + fn layout(title: &str, content: Markup) -> Markup { 36 + html! { 37 + (DOCTYPE) 38 + html lang="en" { 39 + head { 40 + meta charset="utf-8"; 41 + meta name="viewport" content="width=device-width, initial-scale=1"; 42 + title { (title) " — motet" } 43 + style { (CSS) } 44 + } 45 + body { 46 + nav { 47 + a href="/" { "motet" } 48 + } 49 + main { (content) } 50 + footer { 51 + p { "search your corner of the web" } 52 + } 53 + } 54 + } 55 + } 56 + } 57 + 58 + /// Minimal embedded CSS — no build step needed. 
59 + const CSS: &str = r#" 60 + :root { 61 + --bg: #fafaf9; 62 + --fg: #1c1917; 63 + --muted: #78716c; 64 + --accent: #b45309; 65 + --border: #d6d3d1; 66 + --card-bg: #fff; 67 + --radius: 6px; 68 + } 69 + @media (prefers-color-scheme: dark) { 70 + :root { 71 + --bg: #1c1917; 72 + --fg: #fafaf9; 73 + --muted: #a8a29e; 74 + --accent: #f59e0b; 75 + --border: #44403c; 76 + --card-bg: #292524; 77 + } 78 + } 79 + * { margin: 0; padding: 0; box-sizing: border-box; } 80 + body { 81 + font-family: system-ui, -apple-system, sans-serif; 82 + background: var(--bg); 83 + color: var(--fg); 84 + line-height: 1.6; 85 + max-width: 48rem; 86 + margin: 0 auto; 87 + padding: 1rem 1.5rem; 88 + } 89 + nav { padding: 0.75rem 0; border-bottom: 1px solid var(--border); margin-bottom: 1.5rem; } 90 + nav a { color: var(--accent); text-decoration: none; font-weight: 700; font-size: 1.25rem; } 91 + footer { margin-top: 3rem; padding-top: 1rem; border-top: 1px solid var(--border); color: var(--muted); font-size: 0.85rem; } 92 + main { min-height: 60vh; } 93 + h1 { font-size: 1.5rem; margin-bottom: 1rem; } 94 + form { display: flex; gap: 0.5rem; margin-bottom: 1.5rem; } 95 + input[type=search] { 96 + flex: 1; 97 + padding: 0.5rem 0.75rem; 98 + font-size: 1rem; 99 + border: 1px solid var(--border); 100 + border-radius: var(--radius); 101 + background: var(--card-bg); 102 + color: var(--fg); 103 + } 104 + button { 105 + padding: 0.5rem 1.25rem; 106 + font-size: 1rem; 107 + background: var(--accent); 108 + color: #fff; 109 + border: none; 110 + border-radius: var(--radius); 111 + cursor: pointer; 112 + } 113 + button:hover { opacity: 0.9; } 114 + .results { list-style: none; } 115 + .result { margin-bottom: 1.25rem; padding-bottom: 1.25rem; border-bottom: 1px solid var(--border); } 116 + .result:last-child { border-bottom: none; } 117 + .result-title { font-size: 1.1rem; } 118 + .result-title a { color: var(--accent); text-decoration: none; } 119 + .result-title a:hover { text-decoration: 
underline; } 120 + .result-url { font-size: 0.8rem; color: var(--muted); word-break: break-all; } 121 + .result-snippet { margin-top: 0.25rem; color: var(--fg); font-size: 0.95rem; } 122 + .result-meta { margin-top: 0.25rem; font-size: 0.8rem; color: var(--muted); } 123 + .empty { color: var(--muted); font-style: italic; } 124 + .stats-grid { display: grid; grid-template-columns: max-content 1fr; gap: 0.25rem 1rem; } 125 + .stats-grid dt { color: var(--muted); } 126 + .stats-grid dd { font-weight: 600; } 127 + .filters { margin-bottom: 1rem; font-size: 0.9rem; color: var(--muted); } 128 + .filter-tag { 129 + display: inline-block; 130 + padding: 0.15rem 0.5rem; 131 + background: var(--card-bg); 132 + border: 1px solid var(--border); 133 + border-radius: var(--radius); 134 + font-size: 0.85rem; 135 + margin-right: 0.25rem; 136 + } 137 + .filter-tag a { color: var(--muted); text-decoration: none; margin-left: 0.25rem; } 138 + .filter-tag a:hover { color: var(--accent); } 139 + .filter-link { color: var(--muted); text-decoration: none; } 140 + .filter-link:hover { color: var(--accent); text-decoration: underline; } 141 + mark { background: #fef3c7; color: var(--fg); padding: 0.05rem 0.15rem; border-radius: 2px; } 142 + @media (prefers-color-scheme: dark) { 143 + mark { background: #78350f; } 144 + } 145 + "#; 146 + 147 + // --------------------------------------------------------------------------- 148 + // HTML pages 149 + // --------------------------------------------------------------------------- 150 + 151 + /// Query parameters for the search page. 152 + #[derive(Debug, Deserialize)] 153 + struct SearchParams { 154 + #[serde(default)] 155 + q: String, 156 + #[serde(default = "default_limit")] 157 + limit: usize, 158 + /// Filter by source name (e.g. `crates_io`). 159 + #[serde(default)] 160 + source: Option<String>, 161 + /// Filter by source kind (e.g. `blog`). 
162 + #[serde(default)] 163 + kind: Option<String>, 164 + } 165 + 166 + const fn default_limit() -> usize { 167 + 20 168 + } 169 + 170 + impl SearchParams { 171 + fn filters(&self) -> query::SearchFilters { 172 + query::SearchFilters { 173 + source: self.source.clone(), 174 + kind: self.kind.clone(), 175 + } 176 + } 177 + } 178 + 179 + /// GET / — landing page with search bar and recent crawl results. 180 + async fn page_home(State(state): State<Arc<AppState>>) -> Markup { 181 + let total = state.index.num_docs().unwrap_or(0); 182 + let recent = state.store.lock().await.recent(20).unwrap_or_default(); 183 + 184 + layout("Search", html! { 185 + form action="/search" method="get" { 186 + input type="search" name="q" placeholder="Search your web…" autofocus; 187 + button type="submit" { "Search" } 188 + } 189 + p.empty { (total) " documents indexed" } 190 + 191 + @if !recent.is_empty() { 192 + h2 { "Recently crawled" } 193 + ol.results { 194 + @for record in &recent { 195 + li.result { 196 + div.result-title { 197 + a href=(record.url) { 198 + @if let Some(title) = &record.title { 199 + (title) 200 + } @else { 201 + (record.url) 202 + } 203 + } 204 + } 205 + div.result-url { (record.url) } 206 + div.result-meta { 207 + (record.source_name) " · " (record.crawled_at.format("%b %d, %Y")) 208 + } 209 + } 210 + } 211 + } 212 + } 213 + }) 214 + } 215 + 216 + /// GET /search?q=...&limit=... — search results page. 217 + async fn page_search( 218 + State(state): State<Arc<AppState>>, 219 + Query(params): Query<SearchParams>, 220 + ) -> Markup { 221 + if params.q.is_empty() { 222 + return layout("Search", html! 
{ 223 + form action="/search" method="get" { 224 + input type="search" name="q" placeholder="Search your web…" autofocus; 225 + button type="submit" { "Search" } 226 + } 227 + }); 228 + } 229 + 230 + let filters = params.filters(); 231 + let results = query::search(state.index.inner(), &params.q, params.limit, &filters) 232 + .unwrap_or_default(); 233 + 234 + let title = format!("{} — search", &params.q); 235 + 236 + layout(&title, html! { 237 + form action="/search" method="get" { 238 + input type="search" name="q" value=(params.q) placeholder="Search your web…" autofocus; 239 + @if let Some(source) = &params.source { 240 + input type="hidden" name="source" value=(source); 241 + } 242 + @if let Some(kind) = &params.kind { 243 + input type="hidden" name="kind" value=(kind); 244 + } 245 + button type="submit" { "Search" } 246 + } 247 + 248 + // Active filters 249 + @if params.source.is_some() || params.kind.is_some() { 250 + div.filters { 251 + span { "Filtering: " } 252 + @if let Some(source) = &params.source { 253 + span.filter-tag { 254 + (source) " " 255 + a href={"/search?q=" (params.q)} title="Remove filter" { "×" } 256 + } 257 + } 258 + @if let Some(kind) = &params.kind { 259 + span.filter-tag { 260 + (kind) " " 261 + a href={"/search?q=" (params.q) 262 + @if let Some(source) = &params.source { 263 + "&source=" (source) 264 + } 265 + } title="Remove filter" { "×" } 266 + } 267 + } 268 + } 269 + } 270 + 271 + @if results.is_empty() { 272 + p.empty { "No results found." 
} 273 + } @else { 274 + p { (results.len()) " results" } 275 + ol.results { 276 + @for hit in &results { 277 + li.result { 278 + div.result-title { a href=(hit.url) { (hit.title) } } 279 + div.result-url { (hit.url) } 280 + @if !hit.snippet.is_empty() { 281 + div.result-snippet { (PreEscaped(highlight_terms(&hit.snippet, &params.q))) } 282 + } 283 + div.result-meta { 284 + a.filter-link href={"/search?q=" (params.q) "&source=" (hit.source_name)} { 285 + (hit.source_name) 286 + } 287 + " · " 288 + a.filter-link href={"/search?q=" (params.q) "&kind=" (hit.source_kind)} { 289 + (hit.source_kind) 290 + } 291 + } 292 + } 293 + } 294 + } 295 + } 296 + }) 297 + } 298 + 299 + // --------------------------------------------------------------------------- 300 + // Snippet highlighting 301 + // --------------------------------------------------------------------------- 302 + 303 + /// Highlight query terms in a snippet by wrapping them in `<mark>` tags. 304 + /// 305 + /// The snippet text is HTML-escaped first to prevent XSS, then matched 306 + /// terms are wrapped. Matching is case-insensitive on word boundaries. 307 + fn highlight_terms(snippet: &str, query: &str) -> String { 308 + // HTML-escape the snippet 309 + let escaped = snippet 310 + .replace('&', "&amp;") 311 + .replace('<', "&lt;") 312 + .replace('>', "&gt;"); 313 + 314 + // Split query into individual terms, skip very short ones 315 + let terms: Vec<&str> = query 316 + .split_whitespace() 317 + .filter(|t| t.len() >= 2) 318 + .collect(); 319 + 320 + if terms.is_empty() { 321 + return escaped; 322 + } 323 + 324 + // Build a case-insensitive regex alternation: term1|term2|... 325 + // Wrap in word boundaries to avoid partial matches inside words. 
326 + let pattern = terms 327 + .iter() 328 + .map(|t| regex::escape(t)) 329 + .collect::<Vec<_>>() 330 + .join("|"); 331 + 332 + let Ok(re) = regex::RegexBuilder::new(&pattern) 333 + .case_insensitive(true) 334 + .build() 335 + else { 336 + return escaped; 337 + }; 338 + 339 + re.replace_all(&escaped, "<mark>$0</mark>").into_owned() 340 + } 341 + 342 + // --------------------------------------------------------------------------- 343 + // JSON API (kept for programmatic access) 344 + // --------------------------------------------------------------------------- 345 + 30 346 /// JSON response for search results. 31 347 #[derive(Debug, Serialize)] 32 348 struct SearchResponse { ··· 46 362 score: f32, 47 363 } 48 364 49 - /// Query parameters for the search endpoint. 50 - #[derive(Debug, Deserialize)] 51 - struct SearchParams { 52 - q: String, 53 - #[serde(default = "default_limit")] 54 - limit: usize, 55 - } 56 - 57 - const fn default_limit() -> usize { 58 - 20 59 - } 60 - 61 365 /// JSON response for stats endpoint. 62 366 #[derive(Debug, Serialize)] 63 367 struct StatsResponse { ··· 72 376 kind: String, 73 377 } 74 378 75 - /// Start the web server. 76 - pub(crate) async fn run(args: ServeArgs) -> Result<()> { 77 - let data_dir = config::data_dir()?; 78 - let index_path = data_dir.join("index"); 79 - 80 - if !index_path.exists() { 81 - eprintln!("No index found. 
Run `motet crawl` first."); 82 - return Ok(()); 83 - } 84 - 85 - let index = SearchIndex::open(&index_path)?; 86 - let state = Arc::new(AppState { index }); 87 - 88 - let app = Router::new() 89 - .route("/api/search", get(api_search)) 90 - .route("/api/stats", get(api_stats)) 91 - .fallback(get(static_handler)) 92 - .layer(CorsLayer::permissive()) 93 - .with_state(state); 94 - 95 - let addr = format!("{}:{}", args.bind, args.port); 96 - let listener = tokio::net::TcpListener::bind(&addr).await?; 97 - 98 - info!(%addr, "motet web UI starting"); 99 - println!("motet serving at http://{addr}"); 100 - 101 - axum::serve(listener, app).await?; 102 - Ok(()) 103 - } 104 - 105 379 /// GET /api/search?q=...&limit=... 106 380 async fn api_search( 107 381 State(state): State<Arc<AppState>>, 108 382 Query(params): Query<SearchParams>, 109 383 ) -> impl IntoResponse { 110 - let results = match query::search(state.index.inner(), &params.q, params.limit) { 384 + let filters = params.filters(); 385 + let results = match query::search(state.index.inner(), &params.q, params.limit, &filters) { 111 386 Ok(r) => r, 112 387 Err(e) => { 113 388 return ( ··· 162 437 }) 163 438 } 164 439 165 - /// Serve embedded static files, falling back to index.html for SPA routing. 166 - async fn static_handler(uri: axum::http::Uri) -> impl IntoResponse { 167 - let path = uri.path().trim_start_matches('/'); 440 + // --------------------------------------------------------------------------- 441 + // Server bootstrap 442 + // --------------------------------------------------------------------------- 168 443 169 - // Try the exact path first 170 - if let Some(file) = WebAssets::get(path) { 171 - let mime = mime_guess::from_path(path).first_or_octet_stream(); 172 - return ( 173 - StatusCode::OK, 174 - [(axum::http::header::CONTENT_TYPE, mime.as_ref())], 175 - file.data.into_owned(), 176 - ) 177 - .into_response(); 178 - } 444 + /// Start the web server. 
445 + pub(crate) async fn run(args: ServeArgs) -> Result<()> { 446 + let data_dir = config::data_dir()?; 447 + let index_path = data_dir.join("index"); 179 448 180 - // Fall back to index.html for SPA client-side routing 181 - match WebAssets::get("index.html") { 182 - Some(file) => ( 183 - StatusCode::OK, 184 - [(axum::http::header::CONTENT_TYPE, "text/html")], 185 - file.data.into_owned(), 186 - ) 187 - .into_response(), 188 - None => (StatusCode::NOT_FOUND, "motet web UI not built. Run `npm run build` in motet_web/ first.").into_response(), 449 + if !index_path.exists() { 450 + eprintln!("No index found. Run `motet crawl` first."); 451 + return Ok(()); 189 452 } 453 + 454 + let index = SearchIndex::open(&index_path)?; 455 + let store = Store::open(&data_dir.join("motet.db"))?; 456 + let state = Arc::new(AppState { 457 + index, 458 + store: Mutex::new(store), 459 + }); 460 + 461 + let app = Router::new() 462 + .route("/", get(page_home)) 463 + .route("/search", get(page_search)) 464 + .route("/api/search", get(api_search)) 465 + .route("/api/stats", get(api_stats)) 466 + .layer(CorsLayer::permissive()) 467 + .with_state(state); 468 + 469 + let addr = format!("{}:{}", args.bind, args.port); 470 + let listener = tokio::net::TcpListener::bind(&addr).await?; 471 + 472 + info!(%addr, "motet web UI starting"); 473 + println!("motet serving at http://{addr}"); 474 + 475 + axum::serve(listener, app).await?; 476 + Ok(()) 190 477 }
+2
motet_core/Cargo.toml
··· 16 16 chrono = { workspace = true } 17 17 dirs = { workspace = true } 18 18 eyre = { workspace = true } 19 + feed-rs = { workspace = true } 19 20 reqwest = { workspace = true } 20 21 rusqlite = { workspace = true } 21 22 scraper = { workspace = true } 22 23 serde = { workspace = true } 23 24 serde_json = { workspace = true } 24 25 tantivy = { workspace = true } 26 + texting_robots = { workspace = true } 25 27 thiserror = { workspace = true } 26 28 tokio = { workspace = true } 27 29 tracing = { workspace = true }
+156 -18
motet_core/src/config.rs
··· 10 10 //! "sources": { 11 11 //! "scout_magazine": { 12 12 //! "kind": "blog", 13 - //! "url": "https://scoutmagazine.ca/category/food-drink/", 13 + //! "url": "https://scoutmagazine.ca/food-drink", 14 14 //! "crawl_interval": "3d", 15 - //! "selector": "article" 15 + //! "selector": ".Card" 16 16 //! } 17 17 //! } 18 18 //! } 19 19 //! ``` 20 20 21 - use eyre::{Result, WrapErr, bail}; 21 + use eyre::{bail, Result, WrapErr}; 22 22 use serde::{Deserialize, Serialize}; 23 23 use std::{ 24 24 collections::BTreeMap, ··· 82 82 /// Source kind label (e.g. "restaurant", "crate", "blog"). 83 83 #[serde(default)] 84 84 pub source_kind_label: Option<String>, 85 + 86 + /// For RSS sources: also fetch and index pages linked from article bodies. 87 + #[serde(default)] 88 + pub follow_links: bool, 85 89 } 86 90 87 91 /// The type of crawler to dispatch to. ··· 94 98 CratesIo, 95 99 /// Reddit posts. 96 100 Reddit, 101 + /// RSS or Atom feed. 102 + Rss, 97 103 /// Yelp Fusion API. 98 104 Yelp, 99 105 } ··· 120 126 } 121 127 122 128 match self.kind { 123 - SourceKind::Blog | SourceKind::Reddit => "blog", 129 + SourceKind::Blog | SourceKind::Reddit | SourceKind::Rss => "blog", 124 130 SourceKind::CratesIo => "crate", 125 131 SourceKind::Yelp => "restaurant", 126 132 } ··· 210 216 return Ok(path); 211 217 } 212 218 213 - let default = Config { 219 + let default = default_config(); 220 + let json = serde_json::to_string_pretty(&default)?; 221 + std::fs::write(&path, json)?; 222 + 223 + Ok(path) 224 + } 225 + 226 + /// Build a convenience RSS [`SourceConfig`]. 
227 + fn rss_source(url: &str, interval: &str, label: &str) -> SourceConfig { 228 + SourceConfig { 229 + kind: SourceKind::Rss, 230 + url: Some(url.to_string()), 231 + crawl_interval: interval.to_string(), 232 + source_kind_label: Some(label.to_string()), 233 + selector: None, 234 + max_pages: None, 235 + location: None, 236 + categories: None, 237 + api_key_env: None, 238 + subreddit: None, 239 + search_terms: None, 240 + min_downloads: None, 241 + follow_links: false, 242 + } 243 + } 244 + 245 + /// Construct the default [`Config`] with all built-in sources. 246 + #[allow(clippy::too_many_lines)] 247 + fn default_config() -> Config { 248 + Config { 214 249 sources: BTreeMap::from([ 215 250 ( 216 - "this_week_in_rust".to_string(), 251 + "ap_news".to_string(), 217 252 SourceConfig { 218 253 kind: SourceKind::Blog, 219 - url: Some("https://this-week-in-rust.org/".to_string()), 220 - crawl_interval: "7d".to_string(), 221 - selector: Some("article".to_string()), 222 - max_pages: Some(50), 254 + url: Some("https://apnews.com".to_string()), 255 + crawl_interval: "4h".to_string(), 256 + selector: Some(".PagePromo".to_string()), 257 + max_pages: Some(30), 258 + source_kind_label: Some("news".to_string()), 259 + location: None, 260 + categories: None, 261 + api_key_env: None, 262 + subreddit: None, 263 + search_terms: None, 264 + min_downloads: None, 265 + follow_links: false, 266 + }, 267 + ), 268 + ( 269 + "bmann_blog".to_string(), 270 + rss_source("https://bmannconsulting.com/blog.xml", "1d", "blog"), 271 + ), 272 + ( 273 + "bmann_journal".to_string(), 274 + rss_source("https://bmannconsulting.com/journal.xml", "1d", "blog"), 275 + ), 276 + ( 277 + "bsky_bmann".to_string(), 278 + rss_source( 279 + "https://bsky.app/profile/did:plc:2cxgdrgtsmrbqnjkwyplmp43/rss", 280 + "1d", 281 + "bluesky", 282 + ), 283 + ), 284 + ( 285 + "bsky_dustyweb".to_string(), 286 + rss_source( 287 + "https://bsky.app/profile/did:plc:dyyvywontyeuaegemczcushz/rss", 288 + "1d", 289 + "bluesky", 290 
+ ), 291 + ), 292 + ( 293 + "bsky_expede".to_string(), 294 + rss_source( 295 + "https://bsky.app/profile/did:plc:oypgij57lv3ytni32p2jqbce/rss", 296 + "1d", 297 + "bluesky", 298 + ), 299 + ), 300 + ( 301 + "cbc_bc".to_string(), 302 + rss_source( 303 + "https://www.cbc.ca/webfeed/rss/rss-canada-britishcolumbia", 304 + "4h", 305 + "news", 306 + ), 307 + ), 308 + ( 309 + "cbc_top_stories".to_string(), 310 + rss_source( 311 + "https://www.cbc.ca/webfeed/rss/rss-topstories", 312 + "4h", 313 + "news", 314 + ), 315 + ), 316 + ( 317 + "crates_io".to_string(), 318 + SourceConfig { 319 + kind: SourceKind::CratesIo, 320 + url: None, 321 + crawl_interval: "1d".to_string(), 322 + selector: None, 323 + max_pages: Some(5), 324 + source_kind_label: Some("crate".to_string()), 325 + location: None, 326 + categories: None, 327 + api_key_env: None, 328 + subreddit: None, 329 + search_terms: None, 330 + min_downloads: Some(100), 331 + follow_links: false, 332 + }, 333 + ), 334 + ( 335 + "ink_and_switch".to_string(), 336 + rss_source("https://inkandswitch.com/index.xml", "3d", "research"), 337 + ), 338 + ( 339 + "monad_nomad".to_string(), 340 + SourceConfig { 341 + kind: SourceKind::Rss, 342 + url: Some("https://notes.brooklynzelenka.com/index.xml".to_string()), 343 + crawl_interval: "1d".to_string(), 223 344 source_kind_label: Some("blog".to_string()), 345 + follow_links: true, 346 + max_pages: Some(100), 347 + selector: None, 224 348 location: None, 225 349 categories: None, 226 350 api_key_env: None, ··· 233 357 "scout_magazine".to_string(), 234 358 SourceConfig { 235 359 kind: SourceKind::Blog, 236 - url: Some("https://scoutmagazine.ca/category/food-drink/".to_string()), 360 + url: Some("https://scoutmagazine.ca/food-drink".to_string()), 237 361 crawl_interval: "3d".to_string(), 238 - selector: Some("article".to_string()), 362 + selector: Some(".Card".to_string()), 239 363 max_pages: Some(50), 240 364 source_kind_label: Some("restaurant".to_string()), 241 365 location: None, ··· 244 
368 subreddit: None, 245 369 search_terms: None, 246 370 min_downloads: None, 371 + follow_links: false, 372 + }, 373 + ), 374 + ( 375 + "this_week_in_rust".to_string(), 376 + SourceConfig { 377 + kind: SourceKind::Blog, 378 + url: Some("https://this-week-in-rust.org/".to_string()), 379 + crawl_interval: "7d".to_string(), 380 + selector: Some(".post-title".to_string()), 381 + max_pages: Some(50), 382 + source_kind_label: Some("blog".to_string()), 383 + location: None, 384 + categories: None, 385 + api_key_env: None, 386 + subreddit: None, 387 + search_terms: None, 388 + min_downloads: None, 389 + follow_links: false, 247 390 }, 248 391 ), 249 392 ]), 250 - }; 251 - 252 - let json = serde_json::to_string_pretty(&default)?; 253 - std::fs::write(&path, json)?; 254 - 255 - Ok(path) 393 + } 256 394 } 257 395 258 396 #[cfg(test)]
+36 -10
motet_core/src/crawler/blog.rs
··· 5 5 //! for any blog with a listing page that links to individual articles. 6 6 7 7 use super::{CrawledDocument, Crawler}; 8 - use crate::config::SourceConfig; 8 + use crate::{config::SourceConfig, robots::RobotsChecker}; 9 9 use chrono::Utc; 10 10 use eyre::{Result, WrapErr, bail}; 11 11 use scraper::{Html, Selector}; 12 + use std::collections::HashSet; 12 13 use tracing::{debug, info, warn}; 13 14 use url::Url; 14 15 ··· 22 23 source_name: &str, 23 24 config: &SourceConfig, 24 25 client: &reqwest::Client, 26 + robots: &RobotsChecker, 25 27 ) -> Result<Vec<CrawledDocument>> { 26 28 let base_url = config 27 29 .url ··· 31 33 let base = Url::parse(base_url) 32 34 .wrap_err_with(|| format!("invalid URL for source {source_name:?}"))?; 33 35 36 + if !robots.allowed(&base).await { 37 + info!(%source_name, %base, "skipping blog index (blocked by robots.txt)"); 38 + return Ok(vec![]); 39 + } 40 + 34 41 info!(%source_name, %base, "crawling blog index"); 42 + 43 + // Respect crawl delay if specified 44 + let delay = robots.delay(&base).await; 35 45 36 46 let index_html = client 37 47 .get(base.as_str()) ··· 55 65 let mut documents = Vec::with_capacity(urls.len()); 56 66 57 67 for url in &urls { 68 + if !robots.allowed(url).await { 69 + debug!(%url, "skipping article (blocked by robots.txt)"); 70 + continue; 71 + } 72 + 73 + if let Some(d) = delay { 74 + tokio::time::sleep(d).await; 75 + } 76 + 58 77 match fetch_article(client, url).await { 59 78 Ok(doc) => { 60 79 debug!(url = %doc.url, title = %doc.title, "crawled article"); ··· 95 114 let link_sel = Selector::parse("a[href]") 96 115 .map_err(|e| eyre::eyre!("failed to parse link selector: {e:?}"))?; 97 116 117 + let container_count = document.select(&selector).count(); 118 + debug!(container_sel, container_count, "matched containers"); 119 + 120 + let mut seen = HashSet::new(); 98 121 let mut urls = Vec::new(); 99 122 100 123 for element in document.select(&selector) { 101 - // Try to find a link inside the article 
/// Produce a display snippet of at most `max_len` bytes, never splitting a
/// multi-byte UTF-8 character, preferring to cut at the last space.
fn make_snippet(body: &str, max_len: usize) -> String {
    // Short bodies pass through untouched.
    if body.len() <= max_len {
        return body.to_string();
    }

    // Walk the cut point back until it lands on a char boundary, so the
    // slice below cannot panic on multi-byte characters.
    let mut cut = max_len;
    while !body.is_char_boundary(cut) {
        cut -= 1;
    }

    // Prefer ending on a word boundary when one exists; otherwise keep the
    // whole truncated prefix.
    let head = &body[..cut];
    let end = head.rfind(' ').unwrap_or(head.len());
    format!("{}...", &head[..end])
}
+201
motet_core/src/crawler/crates_io.rs
··· 1 + //! crates.io crawler using the public JSON API. 2 + //! 3 + //! Fetches recently-updated crates from the crates.io API and indexes each 4 + //! crate's name, description, and metadata as a searchable document. 5 + //! 6 + //! # API endpoint 7 + //! 8 + //! ```text 9 + //! GET https://crates.io/api/v1/crates?sort=recent-updates&per_page=50 10 + //! ``` 11 + //! 12 + //! Pagination uses cursor-based `meta.next_page` URLs. 13 + //! 14 + //! # Example config 15 + //! 16 + //! ```json 17 + //! { 18 + //! "crates_io": { 19 + //! "kind": "crates_io", 20 + //! "crawl_interval": "1d", 21 + //! "max_pages": 5, 22 + //! "min_downloads": 100 23 + //! } 24 + //! } 25 + //! ``` 26 + 27 + use super::{CrawledDocument, Crawler}; 28 + use crate::{config::SourceConfig, robots::RobotsChecker}; 29 + use chrono::Utc; 30 + use eyre::{Result, WrapErr}; 31 + use serde::Deserialize; 32 + use tracing::{debug, info}; 33 + use url::Url; 34 + 35 + const API_BASE: &str = "https://crates.io/api/v1/crates"; 36 + const CRATES_IO_BASE: &str = "https://crates.io/crates"; 37 + const PER_PAGE: u32 = 50; 38 + 39 + /// Crawler for the crates.io JSON API. 
40 + #[derive(Debug, Clone, Copy)] 41 + pub struct CratesIoCrawler; 42 + 43 + #[derive(Debug, Deserialize)] 44 + struct CratesResponse { 45 + crates: Vec<CrateEntry>, 46 + meta: Meta, 47 + } 48 + 49 + #[derive(Debug, Deserialize)] 50 + struct CrateEntry { 51 + name: String, 52 + description: Option<String>, 53 + downloads: u64, 54 + repository: Option<String>, 55 + homepage: Option<String>, 56 + max_version: String, 57 + #[serde(default)] 58 + categories: Option<Vec<String>>, 59 + } 60 + 61 + #[derive(Debug, Deserialize)] 62 + struct Meta { 63 + next_page: Option<String>, 64 + } 65 + 66 + impl Crawler for CratesIoCrawler { 67 + async fn crawl( 68 + &self, 69 + source_name: &str, 70 + config: &SourceConfig, 71 + client: &reqwest::Client, 72 + robots: &RobotsChecker, 73 + ) -> Result<Vec<CrawledDocument>> { 74 + let max_pages = config.max_pages.unwrap_or(5); 75 + let min_downloads = config.min_downloads.unwrap_or(0); 76 + 77 + // Check robots.txt for the API base URL 78 + let api_url = Url::parse(API_BASE).wrap_err("invalid API base URL")?; 79 + if !robots.allowed(&api_url).await { 80 + info!(%source_name, "skipping crates.io (blocked by robots.txt)"); 81 + return Ok(vec![]); 82 + } 83 + 84 + // Respect crawl delay if specified 85 + let delay = robots.delay(&api_url).await; 86 + 87 + info!(%source_name, %max_pages, %min_downloads, "crawling crates.io"); 88 + 89 + let mut documents = Vec::new(); 90 + let mut next_url = Some(format!("{API_BASE}?sort=recent-updates&per_page={PER_PAGE}")); 91 + let mut pages_fetched: usize = 0; 92 + 93 + while let Some(url) = next_url.take() { 94 + if pages_fetched >= max_pages { 95 + break; 96 + } 97 + 98 + debug!(%url, page = pages_fetched + 1, "fetching crates page"); 99 + 100 + let resp: CratesResponse = client 101 + .get(&url) 102 + .send() 103 + .await 104 + .wrap_err_with(|| format!("failed to fetch crates.io page {}", pages_fetched + 1))? 
105 + .json() 106 + .await 107 + .wrap_err("failed to parse crates.io JSON response")?; 108 + 109 + for krate in &resp.crates { 110 + if krate.downloads < min_downloads { 111 + debug!( 112 + name = %krate.name, 113 + downloads = krate.downloads, 114 + %min_downloads, 115 + "skipping crate below download threshold" 116 + ); 117 + continue; 118 + } 119 + 120 + let crate_url = Url::parse(&format!("{CRATES_IO_BASE}/{}", krate.name)) 121 + .wrap_err("failed to build crate URL")?; 122 + 123 + let body = build_crate_body(krate); 124 + let snippet = krate 125 + .description 126 + .clone() 127 + .unwrap_or_default(); 128 + 129 + let mut tags: Vec<String> = krate 130 + .categories 131 + .clone() 132 + .unwrap_or_default(); 133 + 134 + tags.push(format!("v{}", krate.max_version)); 135 + 136 + documents.push(CrawledDocument { 137 + url: crate_url, 138 + title: krate.name.clone(), 139 + body, 140 + snippet, 141 + tags, 142 + crawled_at: Utc::now(), 143 + }); 144 + } 145 + 146 + pages_fetched += 1; 147 + 148 + if let Some(d) = delay { 149 + tokio::time::sleep(d).await; 150 + } 151 + 152 + // crates.io next_page is a relative query string like "?foo=bar" 153 + next_url = resp.meta.next_page.map(|page| { 154 + if page.starts_with("http") { 155 + page 156 + } else { 157 + format!("{API_BASE}{page}") 158 + } 159 + }); 160 + } 161 + 162 + info!( 163 + %source_name, 164 + count = documents.len(), 165 + pages = pages_fetched, 166 + "finished crawling crates.io" 167 + ); 168 + 169 + Ok(documents) 170 + } 171 + } 172 + 173 + /// Build a rich body string from crate metadata for full-text indexing. 
174 + fn build_crate_body(krate: &CrateEntry) -> String { 175 + let mut parts = Vec::new(); 176 + 177 + parts.push(krate.name.clone()); 178 + 179 + if let Some(desc) = &krate.description { 180 + parts.push(desc.clone()); 181 + } 182 + 183 + parts.push(format!("version {}", krate.max_version)); 184 + parts.push(format!("{} downloads", krate.downloads)); 185 + 186 + if let Some(repo) = &krate.repository { 187 + parts.push(format!("repository: {repo}")); 188 + } 189 + 190 + if let Some(home) = &krate.homepage { 191 + parts.push(format!("homepage: {home}")); 192 + } 193 + 194 + if let Some(cats) = &krate.categories 195 + && !cats.is_empty() 196 + { 197 + parts.push(format!("categories: {}", cats.join(", "))); 198 + } 199 + 200 + parts.join(" | ") 201 + }
+383
motet_core/src/crawler/rss.rs
··· 1 + //! RSS/Atom feed crawler. 2 + //! 3 + //! Fetches a feed URL, parses it with [`feed_rs`], and converts each entry 4 + //! into a [`CrawledDocument`]. Works with both RSS 2.0 and Atom feeds. 5 + //! 6 + //! # Example config 7 + //! 8 + //! ```json 9 + //! { 10 + //! "ink_and_switch": { 11 + //! "kind": "rss", 12 + //! "url": "https://inkandswitch.com/index.xml", 13 + //! "crawl_interval": "1d" 14 + //! } 15 + //! } 16 + //! ``` 17 + 18 + use super::{CrawledDocument, Crawler}; 19 + use crate::{config::SourceConfig, robots::RobotsChecker}; 20 + use chrono::Utc; 21 + use eyre::{Result, WrapErr}; 22 + use scraper::{Html, Selector}; 23 + use std::collections::HashSet; 24 + use tracing::{debug, info, warn}; 25 + use url::Url; 26 + 27 + /// Crawler for RSS and Atom feeds. 28 + #[derive(Debug, Clone, Copy)] 29 + pub struct RssCrawler; 30 + 31 + impl Crawler for RssCrawler { 32 + async fn crawl( 33 + &self, 34 + source_name: &str, 35 + config: &SourceConfig, 36 + client: &reqwest::Client, 37 + robots: &RobotsChecker, 38 + ) -> Result<Vec<CrawledDocument>> { 39 + let feed_url = config 40 + .url 41 + .as_deref() 42 + .ok_or_else(|| eyre::eyre!("RSS source {source_name:?} missing `url`"))?; 43 + 44 + let parsed_feed_url = Url::parse(feed_url) 45 + .wrap_err_with(|| format!("invalid feed URL for {source_name}"))?; 46 + 47 + if !robots.allowed(&parsed_feed_url).await { 48 + info!(%source_name, %feed_url, "skipping RSS feed (blocked by robots.txt)"); 49 + return Ok(vec![]); 50 + } 51 + 52 + info!(%source_name, %feed_url, "fetching RSS feed"); 53 + 54 + let body = client 55 + .get(feed_url) 56 + .send() 57 + .await 58 + .wrap_err_with(|| format!("failed to fetch feed for {source_name}"))? 
59 + .bytes() 60 + .await 61 + .wrap_err_with(|| format!("failed to read feed body for {source_name}"))?; 62 + 63 + let feed = feed_rs::parser::parse(&body[..]) 64 + .wrap_err_with(|| format!("failed to parse feed for {source_name}"))?; 65 + 66 + let max_pages = config.max_pages.unwrap_or(100); 67 + let mut documents = Vec::with_capacity(feed.entries.len().min(max_pages)); 68 + 69 + // Collect raw HTML bodies so we can extract links later if follow_links is set. 70 + let mut raw_html_bodies: Vec<String> = Vec::new(); 71 + 72 + for entry in feed.entries.into_iter().take(max_pages) { 73 + let Some(url) = entry_url(&entry) else { 74 + warn!(%source_name, id = %entry.id, "entry has no link, skipping"); 75 + continue; 76 + }; 77 + 78 + let raw_html = entry_body_raw(&entry); 79 + let body = strip_html(&raw_html); 80 + 81 + if config.follow_links { 82 + raw_html_bodies.push(raw_html); 83 + } 84 + 85 + let title = entry 86 + .title 87 + .map_or_else(|| url.path().to_string(), |t| t.content); 88 + let snippet = make_snippet(&body, 300); 89 + 90 + let tags = entry 91 + .categories 92 + .iter() 93 + .map(|c| c.term.clone()) 94 + .collect(); 95 + 96 + debug!(%url, %title, body_len = body.len(), "parsed feed entry"); 97 + 98 + documents.push(CrawledDocument { 99 + url, 100 + title, 101 + body, 102 + snippet, 103 + tags, 104 + crawled_at: Utc::now(), 105 + }); 106 + } 107 + 108 + info!( 109 + %source_name, 110 + count = documents.len(), 111 + "finished parsing RSS feed entries" 112 + ); 113 + 114 + // Follow links found in entry bodies, if enabled. 
115 + if config.follow_links { 116 + let entry_urls: HashSet<Url> = documents.iter().map(|d| d.url.clone()).collect(); 117 + let remaining = max_pages.saturating_sub(documents.len()); 118 + 119 + let linked = follow_entry_links( 120 + source_name, 121 + &raw_html_bodies, 122 + &entry_urls, 123 + &parsed_feed_url, 124 + remaining, 125 + client, 126 + robots, 127 + ) 128 + .await; 129 + 130 + info!( 131 + %source_name, 132 + followed = linked.len(), 133 + "finished following links from feed entries" 134 + ); 135 + 136 + documents.extend(linked); 137 + } 138 + 139 + info!( 140 + %source_name, 141 + total = documents.len(), 142 + "finished crawling RSS source" 143 + ); 144 + 145 + Ok(documents) 146 + } 147 + } 148 + 149 + /// Extract the best URL from a feed entry. 150 + fn entry_url(entry: &feed_rs::model::Entry) -> Option<Url> { 151 + // Prefer `alternate` link, then any link, then fall back to entry ID as URL 152 + let href = entry 153 + .links 154 + .iter() 155 + .find(|l| l.rel.as_deref() == Some("alternate")) 156 + .or_else(|| entry.links.first()) 157 + .map(|l| l.href.as_str()) 158 + .or_else(|| { 159 + // Some feeds use the entry ID as the URL 160 + if entry.id.starts_with("http") { 161 + Some(entry.id.as_str()) 162 + } else { 163 + None 164 + } 165 + })?; 166 + 167 + Url::parse(href).ok() 168 + } 169 + 170 + /// Extract the raw HTML body from a feed entry, preferring `content` over 171 + /// `summary`. Returns the HTML as-is (not stripped) so callers can both 172 + /// extract links and produce plain text from the same source. 
173 + fn entry_body_raw(entry: &feed_rs::model::Entry) -> String { 174 + if let Some(content) = &entry.content 175 + && let Some(body) = &content.body 176 + && !body.trim().is_empty() 177 + { 178 + return body.clone(); 179 + } 180 + 181 + if let Some(summary) = &entry.summary { 182 + if !summary.content.trim().is_empty() { 183 + return summary.content.clone(); 184 + } 185 + } 186 + 187 + for media in &entry.media { 188 + if let Some(desc) = &media.description { 189 + if !desc.content.trim().is_empty() { 190 + return desc.content.clone(); 191 + } 192 + } 193 + } 194 + 195 + String::new() 196 + } 197 + 198 + /// Extract all `<a href>` links from raw HTML, resolving them against `base`. 199 + fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> { 200 + let doc = Html::parse_fragment(html); 201 + let Ok(sel) = Selector::parse("a[href]") else { 202 + return vec![]; 203 + }; 204 + 205 + let mut urls = Vec::new(); 206 + for el in doc.select(&sel) { 207 + if let Some(href) = el.value().attr("href") 208 + && let Ok(resolved) = base.join(href) 209 + { 210 + // Only follow http(s) links 211 + if resolved.scheme() == "http" || resolved.scheme() == "https" { 212 + urls.push(resolved); 213 + } 214 + } 215 + } 216 + urls 217 + } 218 + 219 + /// Follow links discovered in RSS entry bodies. 220 + /// 221 + /// Deduplicates against already-indexed entry URLs, checks `robots.txt`, 222 + /// and fetches each allowed page up to `max_count`. 223 + async fn follow_entry_links( 224 + source_name: &str, 225 + raw_html_bodies: &[String], 226 + entry_urls: &HashSet<Url>, 227 + feed_url: &Url, 228 + max_count: usize, 229 + client: &reqwest::Client, 230 + robots: &RobotsChecker, 231 + ) -> Vec<CrawledDocument> { 232 + if max_count == 0 { 233 + return vec![]; 234 + } 235 + 236 + // Collect all unique links from all entry bodies. 
237 + let mut seen: HashSet<Url> = entry_urls.clone(); 238 + let mut link_urls: Vec<Url> = Vec::new(); 239 + 240 + for html in raw_html_bodies { 241 + for url in extract_links_from_html(html, feed_url) { 242 + if seen.insert(url.clone()) { 243 + link_urls.push(url); 244 + } 245 + } 246 + } 247 + 248 + debug!( 249 + %source_name, 250 + unique_links = link_urls.len(), 251 + max_count, 252 + "found links in feed entry bodies" 253 + ); 254 + 255 + let delay = robots.delay(feed_url).await; 256 + let mut documents = Vec::new(); 257 + 258 + for url in link_urls.into_iter().take(max_count) { 259 + if !robots.allowed(&url).await { 260 + debug!(%url, "skipping followed link (blocked by robots.txt)"); 261 + continue; 262 + } 263 + 264 + if let Some(d) = delay { 265 + tokio::time::sleep(d).await; 266 + } 267 + 268 + match fetch_linked_page(client, &url).await { 269 + Ok(doc) => { 270 + debug!(url = %doc.url, title = %doc.title, "crawled followed link"); 271 + documents.push(doc); 272 + } 273 + Err(e) => { 274 + warn!(%url, error = %e, "failed to fetch followed link, skipping"); 275 + } 276 + } 277 + } 278 + 279 + documents 280 + } 281 + 282 + /// Fetch a single linked page and extract its title and body text. 283 + /// 284 + /// Mirrors [`blog::fetch_article`] but is self-contained in the RSS module 285 + /// to avoid coupling the two crawlers. 
286 + async fn fetch_linked_page(client: &reqwest::Client, url: &Url) -> Result<CrawledDocument> { 287 + let resp = client 288 + .get(url.as_str()) 289 + .send() 290 + .await 291 + .wrap_err("failed to fetch linked page")?; 292 + 293 + if !resp.status().is_success() { 294 + eyre::bail!("HTTP {} for {}", resp.status(), url); 295 + } 296 + 297 + let html = resp.text().await.wrap_err("failed to read linked page body")?; 298 + let document = Html::parse_document(&html); 299 + 300 + let title = extract_page_title(&document).unwrap_or_else(|| url.path().to_string()); 301 + let body = extract_page_body(&document); 302 + let snippet = make_snippet(&body, 300); 303 + 304 + Ok(CrawledDocument { 305 + url: url.clone(), 306 + title, 307 + body, 308 + snippet, 309 + tags: vec![], 310 + crawled_at: Utc::now(), 311 + }) 312 + } 313 + 314 + /// Extract the page title from `<title>` or `<h1>`. 315 + fn extract_page_title(doc: &Html) -> Option<String> { 316 + for selector_str in &["title", "h1"] { 317 + if let Ok(sel) = Selector::parse(selector_str) 318 + && let Some(el) = doc.select(&sel).next() 319 + { 320 + let text: String = el.text().collect::<String>().trim().to_string(); 321 + if !text.is_empty() { 322 + return Some(text); 323 + } 324 + } 325 + } 326 + None 327 + } 328 + 329 + /// Extract visible body text from a page, preferring `<article>` or `<main>`. 
330 + fn extract_page_body(doc: &Html) -> String { 331 + for tag in &["article", "main", "[role=main]"] { 332 + if let Ok(sel) = Selector::parse(tag) 333 + && let Some(el) = doc.select(&sel).next() 334 + { 335 + let text: String = el.text().collect::<Vec<_>>().join(" "); 336 + let cleaned = normalize_whitespace(&text); 337 + if cleaned.len() > 100 { 338 + return cleaned; 339 + } 340 + } 341 + } 342 + 343 + if let Ok(body_sel) = Selector::parse("body") 344 + && let Some(body) = doc.select(&body_sel).next() 345 + { 346 + let text: String = body.text().collect::<Vec<_>>().join(" "); 347 + return normalize_whitespace(&text); 348 + } 349 + 350 + String::new() 351 + } 352 + 353 + /// Strip HTML tags from content, returning plain text. 354 + fn strip_html(html: &str) -> String { 355 + let doc = scraper::Html::parse_fragment(html); 356 + let text: String = doc.root_element().text().collect::<Vec<_>>().join(" "); 357 + normalize_whitespace(&text) 358 + } 359 + 360 + /// Collapse runs of whitespace into single spaces. 361 + fn normalize_whitespace(s: &str) -> String { 362 + s.split_whitespace().collect::<Vec<_>>().join(" ") 363 + } 364 + 365 + /// Produce a display snippet of at most `max_len` characters, respecting 366 + /// char boundaries. 367 + fn make_snippet(body: &str, max_len: usize) -> String { 368 + if body.len() <= max_len { 369 + return body.to_string(); 370 + } 371 + 372 + // Walk back to a char boundary 373 + let mut end = max_len; 374 + while !body.is_char_boundary(end) { 375 + end -= 1; 376 + } 377 + 378 + let truncated = &body[..end]; 379 + match truncated.rfind(' ') { 380 + Some(pos) => format!("{}...", &truncated[..pos]), 381 + None => format!("{truncated}..."), 382 + } 383 + }
+17 -4
motet_core/src/crawler.rs
··· 5 5 //! [`dispatch`] selects the right crawler for a given [`SourceKind`]. 6 6 7 7 pub mod blog; 8 + pub mod crates_io; 9 + pub mod rss; 8 10 9 - use crate::config::{SourceConfig, SourceKind}; 11 + use crate::{ 12 + config::{SourceConfig, SourceKind}, 13 + robots::RobotsChecker, 14 + }; 10 15 use chrono::{DateTime, Utc}; 11 16 use eyre::Result; 12 17 use url::Url; ··· 36 41 /// Trait for all crawlers. Each source type implements this. 37 42 pub trait Crawler: Send + Sync { 38 43 /// Crawl the source and return extracted documents. 44 + /// 45 + /// The [`RobotsChecker`] should be consulted before fetching any URL. 39 46 fn crawl( 40 47 &self, 41 48 source_name: &str, 42 49 config: &SourceConfig, 43 50 client: &reqwest::Client, 51 + robots: &RobotsChecker, 44 52 ) -> impl std::future::Future<Output = Result<Vec<CrawledDocument>>> + Send; 45 53 } 46 54 ··· 53 61 source_name: &str, 54 62 config: &SourceConfig, 55 63 client: &reqwest::Client, 64 + robots: &RobotsChecker, 56 65 ) -> Result<Vec<CrawledDocument>> { 57 66 match config.kind { 58 67 SourceKind::Blog => { 59 68 let crawler = blog::BlogCrawler; 60 - crawler.crawl(source_name, config, client).await 69 + crawler.crawl(source_name, config, client, robots).await 61 70 } 62 71 SourceKind::CratesIo => { 63 - tracing::warn!("crates.io crawler not yet implemented"); 64 - Ok(vec![]) 72 + let crawler = crates_io::CratesIoCrawler; 73 + crawler.crawl(source_name, config, client, robots).await 65 74 } 66 75 SourceKind::Reddit => { 67 76 tracing::warn!("reddit crawler not yet implemented"); 68 77 Ok(vec![]) 78 + } 79 + SourceKind::Rss => { 80 + let crawler = rss::RssCrawler; 81 + crawler.crawl(source_name, config, client, robots).await 69 82 } 70 83 SourceKind::Yelp => { 71 84 tracing::warn!("yelp crawler not yet implemented");
+1
motet_core/src/lib.rs
··· 7 7 pub mod crawler; 8 8 pub mod index; 9 9 pub mod query; 10 + pub mod robots; 10 11 pub mod schema; 11 12 pub mod store;
+112 -18
motet_core/src/query.rs
··· 1 1 //! Query engine — search the Tantivy index and return ranked results. 2 + //! 3 + //! Search results are ranked by BM25 text relevance multiplied by a freshness 4 + //! boost: an exponential decay based on how recently the document was crawled. 2 5 3 6 use crate::schema::{self, field}; 4 7 use eyre::{Result, WrapErr}; 5 8 use tantivy::{ 6 - Index, collector::TopDocs, query::QueryParser, 7 - schema::Value, 9 + collector::TopDocs, 10 + query::{BooleanQuery, Occur, QueryParser, TermQuery}, 11 + schema::{Facet, IndexRecordOption, Value}, 12 + DocId, Index, Score, SegmentReader, Term, 8 13 }; 9 14 use tracing::debug; 10 15 ··· 30 35 pub score: f32, 31 36 } 32 37 33 - /// Search the index for the given query string. 38 + /// Optional filters to narrow search results by facet. 39 + #[derive(Debug, Clone, Default)] 40 + pub struct SearchFilters { 41 + /// Filter to a specific source name (e.g. `crates_io`). 42 + pub source: Option<String>, 43 + /// Filter to a specific source kind (e.g. `blog`, `rss`). 44 + pub kind: Option<String>, 45 + } 46 + 47 + /// Search the index for the given query string, optionally filtered by facets. 34 48 /// 35 49 /// Searches across `title` and `body` fields using Tantivy's default 36 - /// query parser with BM25 scoring. 50 + /// query parser with BM25 scoring. When filters are provided, results 51 + /// are intersected with `TermQuery` on the corresponding facet field. 
37 52 /// 38 53 /// # Errors 39 54 /// ··· 46 61 index: &Index, 47 62 query_str: &str, 48 63 limit: usize, 64 + filters: &SearchFilters, 49 65 ) -> Result<Vec<SearchResult>> { 50 66 let schema = schema::build_schema(); 51 67 ··· 53 69 let body_field = schema.get_field(field::BODY).expect("body field"); 54 70 let url_field = schema.get_field(field::URL).expect("url field"); 55 71 let snippet_field = schema.get_field(field::SNIPPET).expect("snippet field"); 56 - let source_kind_field = schema.get_field(field::SOURCE_KIND).expect("source_kind field"); 57 - let source_name_field = schema.get_field(field::SOURCE_NAME).expect("source_name field"); 72 + let source_kind_field = schema 73 + .get_field(field::SOURCE_KIND) 74 + .expect("source_kind field"); 75 + let source_name_field = schema 76 + .get_field(field::SOURCE_NAME) 77 + .expect("source_name field"); 58 78 59 79 let query_parser = QueryParser::for_index(index, vec![title_field, body_field]); 60 80 61 - let query = query_parser 81 + let text_query = query_parser 62 82 .parse_query(query_str) 63 83 .wrap_err_with(|| format!("failed to parse query: {query_str:?}"))?; 64 84 65 - let reader = index 66 - .reader() 67 - .wrap_err("failed to get index reader")?; 85 + // Build facet filter clauses 86 + let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = vec![(Occur::Must, text_query)]; 87 + 88 + if let Some(source) = &filters.source { 89 + let facet = 90 + Facet::from_text(&format!("/{source}")).wrap_err("invalid source name facet")?; 91 + let term = Term::from_facet(source_name_field, &facet); 92 + clauses.push(( 93 + Occur::Must, 94 + Box::new(TermQuery::new(term, IndexRecordOption::Basic)), 95 + )); 96 + } 97 + 98 + if let Some(kind) = &filters.kind { 99 + let facet = Facet::from_text(&format!("/{kind}")).wrap_err("invalid source kind facet")?; 100 + let term = Term::from_facet(source_kind_field, &facet); 101 + clauses.push(( 102 + Occur::Must, 103 + Box::new(TermQuery::new(term, IndexRecordOption::Basic)), 
104 + )); 105 + } 106 + 107 + let query = BooleanQuery::from(clauses); 108 + 109 + let reader = index.reader().wrap_err("failed to get index reader")?; 68 110 69 111 let searcher = reader.searcher(); 70 112 113 + let now_secs = chrono::Utc::now().timestamp(); 114 + 115 + let collector = 116 + TopDocs::with_limit(limit).tweak_score(move |segment_reader: &SegmentReader| { 117 + let date_reader = segment_reader 118 + .fast_fields() 119 + .date(field::CRAWLED_AT) 120 + .ok() 121 + .map(|col| col.first_or_default_col(Default::default())); 122 + 123 + move |doc: DocId, original_score: Score| -> Score { 124 + let Some(reader) = &date_reader else { 125 + return original_score; 126 + }; 127 + let crawled_ts = reader.get_val(doc).into_timestamp_secs(); 128 + let age_days = (now_secs - crawled_ts) as f32 / 86_400.0; 129 + original_score * freshness_multiplier(age_days) 130 + } 131 + }); 132 + 71 133 let top_docs = searcher 72 - .search(&query, &TopDocs::with_limit(limit)) 134 + .search(&query, &collector) 73 135 .wrap_err("search failed")?; 74 136 75 - debug!( 76 - query = query_str, 77 - hits = top_docs.len(), 78 - "search completed" 79 - ); 137 + debug!(query = query_str, hits = top_docs.len(), "search completed"); 80 138 81 139 let mut results = Vec::with_capacity(top_docs.len()); 82 140 ··· 106 164 let source_kind = doc 107 165 .get_first(source_kind_field) 108 166 .and_then(|v| v.as_facet()) 109 - .map_or_else(String::new, std::string::ToString::to_string); 167 + .map(|f| facet_label(f)) 168 + .unwrap_or_default(); 110 169 111 170 let source_name = doc 112 171 .get_first(source_name_field) 113 172 .and_then(|v| v.as_facet()) 114 - .map_or_else(String::new, std::string::ToString::to_string); 173 + .map(|f| facet_label(f)) 174 + .unwrap_or_default(); 115 175 116 176 results.push(SearchResult { 117 177 url, ··· 125 185 126 186 Ok(results) 127 187 } 188 + 189 + /// Extract the last path component from a Tantivy facet (e.g. `/blog` → `blog`). 
190 + fn facet_label(facet: &tantivy::schema::Facet) -> String { 191 + facet.to_path().last().unwrap_or(&"").to_string() 192 + } 193 + 194 + /// Exponential freshness decay. 195 + /// 196 + /// Returns a multiplier in `[FRESHNESS_FLOOR, FRESHNESS_FLOOR + 1.0]`: 197 + /// 198 + /// - `age_days = 0` → `1.0 + FRESHNESS_FLOOR` (maximum boost) 199 + /// - `age_days = HALF_LIFE` → `0.5 + FRESHNESS_FLOOR` 200 + /// - `age_days → ∞` → `FRESHNESS_FLOOR` (floor, old content isn't buried) 201 + /// 202 + /// ```text 203 + /// multiplier 204 + /// 1.5 ┤ · 205 + /// │ ·· 206 + /// 1.0 ┤ ···· 207 + /// │ ········ 208 + /// 0.5 ┤ ····················· 209 + /// └──────────┬──────────┬──────────────→ days 210 + /// 7 30 211 + /// ``` 212 + fn freshness_multiplier(age_days: f32) -> f32 { 213 + /// Half-life in days: score contribution halves every 7 days. 214 + const HALF_LIFE: f32 = 7.0; 215 + 216 + /// Floor multiplier so old content isn't completely buried. 217 + const FRESHNESS_FLOOR: f32 = 0.5; 218 + 219 + let decay = (0.5_f32).powf(age_days.max(0.0) / HALF_LIFE); 220 + decay + FRESHNESS_FLOOR 221 + }
+223
motet_core/src/robots.rs
··· 1 + //! `robots.txt` compliance checker. 2 + //! 3 + //! Fetches and caches `robots.txt` per origin, then checks individual URLs 4 + //! against the parsed rules. Uses [`texting_robots`] for RFC 9309 compliant 5 + //! parsing. 6 + //! 7 + //! The checker is safe to share across concurrent crawl tasks via `Arc`. 8 + //! Internally it uses a [`tokio::sync::Mutex`] so all methods take `&self`. 9 + //! 10 + //! # Usage 11 + //! 12 + //! ```rust,no_run 13 + //! # async fn example() -> eyre::Result<()> { 14 + //! use motet_core::robots::RobotsChecker; 15 + //! 16 + //! let client = reqwest::Client::new(); 17 + //! let checker = RobotsChecker::new(client); 18 + //! 19 + //! let url = url::Url::parse("https://example.com/page")?; 20 + //! if checker.allowed(&url).await { 21 + //! // safe to crawl 22 + //! } 23 + //! # Ok(()) 24 + //! # } 25 + //! ``` 26 + 27 + use std::{collections::HashMap, time::Duration}; 28 + use texting_robots::Robot; 29 + use tokio::sync::Mutex; 30 + use tracing::{debug, warn}; 31 + use url::Url; 32 + 33 + /// User-agent string used when checking `robots.txt` rules. 34 + /// 35 + /// This should match the token portion of the HTTP `User-Agent` header. 36 + const ROBOT_NAME: &str = "motet"; 37 + 38 + /// Maximum size of a `robots.txt` file we're willing to parse (500 KiB, 39 + /// per Google's recommendation). 40 + const MAX_ROBOTS_SIZE: usize = 512 * 1024; 41 + 42 + /// Per-origin cache entry for parsed `robots.txt`. 43 + enum CacheEntry { 44 + /// Successfully fetched and parsed. 45 + Parsed(Robot), 46 + /// Fetch failed or returned 4xx — assume no restrictions. 47 + Permissive, 48 + } 49 + 50 + /// Fetches, caches, and checks `robots.txt` for crawled URLs. 51 + /// 52 + /// Create one per crawl run and share across tasks via `Arc`. The internal 53 + /// cache is protected by a [`tokio::sync::Mutex`] so all methods take `&self`. 
54 + /// 55 + /// `reqwest::Client` is cheap to clone (it wraps an `Arc` internally), so 56 + /// the checker owns its client rather than borrowing — this avoids lifetime 57 + /// parameters and makes `Arc<RobotsChecker>` trivial. 58 + pub struct RobotsChecker { 59 + client: reqwest::Client, 60 + cache: Mutex<HashMap<String, CacheEntry>>, 61 + } 62 + 63 + impl std::fmt::Debug for RobotsChecker { 64 + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 65 + f.debug_struct("RobotsChecker").finish_non_exhaustive() 66 + } 67 + } 68 + 69 + impl RobotsChecker { 70 + /// Create a new checker backed by the given HTTP client. 71 + /// 72 + /// The client is cloned (cheap — `reqwest::Client` is `Arc`-based). 73 + #[must_use] 74 + pub fn new(client: reqwest::Client) -> Self { 75 + Self { 76 + client, 77 + cache: Mutex::new(HashMap::new()), 78 + } 79 + } 80 + 81 + /// Check whether the given URL is allowed by `robots.txt`. 82 + /// 83 + /// On first call for a given origin this fetches and parses the 84 + /// `robots.txt` file. Subsequent calls for the same origin use the 85 + /// cached result. 86 + /// 87 + /// Returns `true` (allow) when: 88 + /// - The URL's origin has no `robots.txt` (404, other 4xx) 89 + /// - The `robots.txt` explicitly allows the path 90 + /// - The fetch failed (err on the side of crawling) 91 + pub async fn allowed(&self, url: &Url) -> bool { 92 + self.ensure_cached(url).await; 93 + 94 + let cache = self.cache.lock().await; 95 + let origin = origin_key(url); 96 + 97 + match cache.get(&origin) { 98 + Some(CacheEntry::Parsed(robot)) => { 99 + let allowed = robot.allowed(url.as_str()); 100 + if !allowed { 101 + debug!(url = %url, "blocked by robots.txt"); 102 + } 103 + allowed 104 + } 105 + Some(CacheEntry::Permissive) | None => true, 106 + } 107 + } 108 + 109 + /// Return the crawl delay for the given URL's origin, if specified 110 + /// in `robots.txt`. 
111 + pub async fn delay(&self, url: &Url) -> Option<Duration> { 112 + self.ensure_cached(url).await; 113 + 114 + let cache = self.cache.lock().await; 115 + let origin = origin_key(url); 116 + 117 + match cache.get(&origin) { 118 + Some(CacheEntry::Parsed(robot)) => { 119 + robot.delay.map(|secs| Duration::from_secs_f64(f64::from(secs))) 120 + } 121 + _ => None, 122 + } 123 + } 124 + 125 + /// Ensure the cache has an entry for this URL's origin. 126 + async fn ensure_cached(&self, url: &Url) { 127 + let origin = origin_key(url); 128 + 129 + // Quick check: already cached? 130 + { 131 + let cache = self.cache.lock().await; 132 + if cache.contains_key(&origin) { 133 + return; 134 + } 135 + } 136 + 137 + // Fetch outside the lock to avoid holding it during I/O. 138 + let entry = fetch_and_parse(&self.client, url).await; 139 + 140 + let mut cache = self.cache.lock().await; 141 + // Another task may have raced us; only insert if still missing. 142 + cache.entry(origin).or_insert(entry); 143 + } 144 + } 145 + 146 + /// Build a cache key from a URL's origin (scheme + host + port). 147 + fn origin_key(url: &Url) -> String { 148 + url.origin().ascii_serialization() 149 + } 150 + 151 + /// Fetch `robots.txt` for the given URL's origin and parse it. 
152 + async fn fetch_and_parse(client: &reqwest::Client, url: &Url) -> CacheEntry { 153 + let robots_url = match texting_robots::get_robots_url(url.as_str()) { 154 + Ok(u) => u, 155 + Err(e) => { 156 + warn!(url = %url, error = %e, "could not derive robots.txt URL, assuming permissive"); 157 + return CacheEntry::Permissive; 158 + } 159 + }; 160 + 161 + let resp = match client.get(&robots_url).send().await { 162 + Ok(r) => r, 163 + Err(e) => { 164 + warn!(%robots_url, error = %e, "failed to fetch robots.txt, assuming permissive"); 165 + return CacheEntry::Permissive; 166 + } 167 + }; 168 + 169 + let status = resp.status(); 170 + 171 + // 4xx (except 429): no restrictions 172 + if status.is_client_error() && status.as_u16() != 429 { 173 + debug!(%robots_url, %status, "robots.txt not found, assuming permissive"); 174 + return CacheEntry::Permissive; 175 + } 176 + 177 + // 429 or 5xx: be conservative, assume no restrictions but warn 178 + if status.as_u16() == 429 || status.is_server_error() { 179 + warn!( 180 + %robots_url, 181 + %status, 182 + "robots.txt unavailable (server error/rate limit), assuming permissive" 183 + ); 184 + return CacheEntry::Permissive; 185 + } 186 + 187 + // 2xx: parse the body 188 + let bytes = match resp.bytes().await { 189 + Ok(b) => b, 190 + Err(e) => { 191 + warn!(%robots_url, error = %e, "failed to read robots.txt body, assuming permissive"); 192 + return CacheEntry::Permissive; 193 + } 194 + }; 195 + 196 + // Limit size per Google's recommendation 197 + let bytes: &[u8] = if bytes.len() > MAX_ROBOTS_SIZE { 198 + debug!( 199 + %robots_url, 200 + size = bytes.len(), 201 + "robots.txt exceeds 500 KiB, truncating" 202 + ); 203 + bytes.get(..MAX_ROBOTS_SIZE).unwrap_or(&bytes) 204 + } else { 205 + &bytes 206 + }; 207 + 208 + match Robot::new(ROBOT_NAME, bytes) { 209 + Ok(robot) => { 210 + debug!( 211 + %robots_url, 212 + delay = ?robot.delay, 213 + sitemaps = robot.sitemaps.len(), 214 + "parsed robots.txt" 215 + ); 216 + 
CacheEntry::Parsed(robot) 217 + } 218 + Err(e) => { 219 + warn!(%robots_url, error = %e, "failed to parse robots.txt, assuming permissive"); 220 + CacheEntry::Permissive 221 + } 222 + } 223 + }
+3 -3
motet_core/src/schema.rs
··· 65 65 // Stored only — not indexed (display purposes) 66 66 builder.add_text_field(field::SNIPPET, STORED); 67 67 68 - // Facets for filtering 69 - builder.add_facet_field(field::SOURCE_KIND, FacetOptions::default()); 70 - builder.add_facet_field(field::SOURCE_NAME, FacetOptions::default()); 68 + // Facets for filtering (stored so we can display them in results) 69 + builder.add_facet_field(field::SOURCE_KIND, FacetOptions::default().set_stored()); 70 + builder.add_facet_field(field::SOURCE_NAME, FacetOptions::default().set_stored()); 71 71 72 72 // Date for freshness 73 73 builder.add_date_field(field::CRAWLED_AT, DateOptions::default() | STORED | FAST);
+48 -5
motet_core/src/store.rs
··· 156 156 } 157 157 } 158 158 159 + /// Return the most recently crawled records, ordered by crawl time descending. 160 + /// 161 + /// # Errors 162 + /// 163 + /// Returns an error if the database query fails. 164 + pub fn recent(&self, limit: usize) -> Result<Vec<CrawlRecord>> { 165 + let mut stmt = self.conn.prepare( 166 + "SELECT url, source_name, crawled_at, etag, title 167 + FROM crawl_records 168 + ORDER BY crawled_at DESC 169 + LIMIT ?1", 170 + )?; 171 + 172 + let rows = stmt 173 + .query_map(params![limit], |row| { 174 + let crawled_at_str: String = row.get(2)?; 175 + Ok(( 176 + row.get::<_, String>(0)?, 177 + row.get::<_, String>(1)?, 178 + crawled_at_str, 179 + row.get::<_, Option<String>>(3)?, 180 + row.get::<_, Option<String>>(4)?, 181 + )) 182 + })? 183 + .collect::<std::result::Result<Vec<_>, _>>() 184 + .wrap_err("failed to read recent crawl records")?; 185 + 186 + let mut records = Vec::with_capacity(rows.len()); 187 + for (url, source_name, crawled_at_str, etag, title) in rows { 188 + let crawled_at = crawled_at_str 189 + .parse::<DateTime<Utc>>() 190 + .wrap_err("invalid datetime in database")?; 191 + 192 + records.push(CrawlRecord { 193 + url, 194 + source_name, 195 + crawled_at, 196 + etag, 197 + title, 198 + }); 199 + } 200 + 201 + Ok(records) 202 + } 203 + 159 204 /// Count total crawled documents, optionally filtered by source. 160 205 /// 161 206 /// # Errors ··· 168 213 params![name], 169 214 |row| row.get(0), 170 215 )?, 171 - None => self.conn.query_row( 172 - "SELECT COUNT(*) FROM crawl_records", 173 - [], 174 - |row| row.get(0), 175 - )?, 216 + None => self 217 + .conn 218 + .query_row("SELECT COUNT(*) FROM crawl_records", [], |row| row.get(0))?, 176 219 }; 177 220 178 221 Ok(count)

History

1 round 0 comments
sign up or login to add to the discussion
expede.wtf submitted #0
5 commits
expand
Fix basics
add more sources
fix request headers
flesh out
add more sources
no conflicts, ready to merge
expand 0 comments