feat: document import module with PDF parsing and DOCX stub

+8

CHANGELOG.md

··· 7 7 8 8 ## Added 9 9 10 + ### 2026-01-03 11 + 12 + - Implemented `malfestio-readability` crate: A custom, rule-based content extraction engine replacing `dom_smoothie`, featuring XPath support (ftr-site-config compatible) and a Mozilla Readability-based generic fallback. 13 + 10 14 ### 2026-01-02 11 15 12 16 - Published AT Protocol Lexicons for all core types (`org.stormlightlabs.malfestio.*`) 17 + 18 + ### 2025-12-* 19 + 20 + - *TODO*

+361 -10

Cargo.lock

··· 3 3 version = 4 4 4 5 5 [[package]] 6 + name = "adler2" 7 + version = "2.0.1" 8 + source = "registry+https://github.com/rust-lang/crates.io-index" 9 + checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" 10 + 11 + [[package]] 12 + name = "adobe-cmap-parser" 13 + version = "0.4.1" 14 + source = "registry+https://github.com/rust-lang/crates.io-index" 15 + checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3" 16 + dependencies = [ 17 + "pom", 18 + ] 19 + 20 + [[package]] 21 + name = "aes" 22 + version = "0.8.4" 23 + source = "registry+https://github.com/rust-lang/crates.io-index" 24 + checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" 25 + dependencies = [ 26 + "cfg-if", 27 + "cipher", 28 + "cpufeatures", 29 + ] 30 + 31 + [[package]] 6 32 name = "aho-corasick" 7 33 version = "1.1.4" 8 34 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 243 269 244 270 [[package]] 245 271 name = "bitflags" 272 + version = "1.3.2" 273 + source = "registry+https://github.com/rust-lang/crates.io-index" 274 + checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 275 + 276 + [[package]] 277 + name = "bitflags" 246 278 version = "2.10.0" 247 279 source = "registry+https://github.com/rust-lang/crates.io-index" 248 280 checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" ··· 257 289 ] 258 290 259 291 [[package]] 292 + name = "block-padding" 293 + version = "0.3.3" 294 + source = "registry+https://github.com/rust-lang/crates.io-index" 295 + checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93" 296 + dependencies = [ 297 + "generic-array", 298 + ] 299 + 300 + [[package]] 260 301 name = "bumpalo" 261 302 version = "3.19.1" 262 303 source = "registry+https://github.com/rust-lang/crates.io-index" 263 304 checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" 264 305 265 306 [[package]] 307 + name = 
"bytecount" 308 + version = "0.6.9" 309 + source = "registry+https://github.com/rust-lang/crates.io-index" 310 + checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" 311 + 312 + [[package]] 313 + name = "bytemuck" 314 + version = "1.24.0" 315 + source = "registry+https://github.com/rust-lang/crates.io-index" 316 + checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4" 317 + 318 + [[package]] 266 319 name = "byteorder" 267 320 version = "1.5.0" 268 321 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 275 328 checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" 276 329 277 330 [[package]] 331 + name = "cbc" 332 + version = "0.1.2" 333 + source = "registry+https://github.com/rust-lang/crates.io-index" 334 + checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6" 335 + dependencies = [ 336 + "cipher", 337 + ] 338 + 339 + [[package]] 278 340 name = "cbor4ii" 279 341 version = "0.2.14" 280 342 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 300 362 version = "1.1.0" 301 363 source = "registry+https://github.com/rust-lang/crates.io-index" 302 364 checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" 365 + 366 + [[package]] 367 + name = "cff-parser" 368 + version = "0.1.0" 369 + source = "registry+https://github.com/rust-lang/crates.io-index" 370 + checksum = "31f5b6e9141c036f3ff4ce7b2f7e432b0f00dee416ddcd4f17741d189ddc2e9d" 303 371 304 372 [[package]] 305 373 name = "cfg-if" ··· 342 410 ] 343 411 344 412 [[package]] 413 + name = "cipher" 414 + version = "0.4.4" 415 + source = "registry+https://github.com/rust-lang/crates.io-index" 416 + checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" 417 + dependencies = [ 418 + "crypto-common", 419 + "inout", 420 + ] 421 + 422 + [[package]] 345 423 name = "clap" 346 424 version = "4.5.53" 347 425 source = 
"registry+https://github.com/rust-lang/crates.io-index" ··· 380 458 version = "0.7.6" 381 459 source = "registry+https://github.com/rust-lang/crates.io-index" 382 460 checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" 461 + 462 + [[package]] 463 + name = "color_quant" 464 + version = "1.1.0" 465 + source = "registry+https://github.com/rust-lang/crates.io-index" 466 + checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" 383 467 384 468 [[package]] 385 469 name = "colorchoice" ··· 462 546 checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" 463 547 dependencies = [ 464 548 "libc", 549 + ] 550 + 551 + [[package]] 552 + name = "crc32fast" 553 + version = "1.5.0" 554 + source = "registry+https://github.com/rust-lang/crates.io-index" 555 + checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" 556 + dependencies = [ 557 + "cfg-if", 465 558 ] 466 559 467 560 [[package]] ··· 692 785 ] 693 786 694 787 [[package]] 788 + name = "docx-rs" 789 + version = "0.4.18" 790 + source = "registry+https://github.com/rust-lang/crates.io-index" 791 + checksum = "3f21be13b97bd2924f30323d674f5a8db382964972825abd93f30d08f21dad98" 792 + dependencies = [ 793 + "base64", 794 + "image", 795 + "serde", 796 + "serde_json", 797 + "thiserror 1.0.69", 798 + "xml-rs", 799 + "zip", 800 + ] 801 + 802 + [[package]] 695 803 name = "dotenvy" 696 804 version = "0.15.7" 697 805 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 710 818 checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" 711 819 dependencies = [ 712 820 "dtoa", 821 + ] 822 + 823 + [[package]] 824 + name = "ecb" 825 + version = "0.1.2" 826 + source = "registry+https://github.com/rust-lang/crates.io-index" 827 + checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7" 828 + dependencies = [ 829 + "cipher", 713 830 ] 714 831 715 832 [[package]] ··· 819 936 ] 820 937 821 938 
[[package]] 939 + name = "euclid" 940 + version = "0.20.14" 941 + source = "registry+https://github.com/rust-lang/crates.io-index" 942 + checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" 943 + dependencies = [ 944 + "num-traits", 945 + ] 946 + 947 + [[package]] 822 948 name = "fallible-iterator" 823 949 version = "0.2.0" 824 950 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 831 957 checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" 832 958 833 959 [[package]] 960 + name = "fdeflate" 961 + version = "0.3.7" 962 + source = "registry+https://github.com/rust-lang/crates.io-index" 963 + checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" 964 + dependencies = [ 965 + "simd-adler32", 966 + ] 967 + 968 + [[package]] 834 969 name = "ff" 835 970 version = "0.13.1" 836 971 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 853 988 checksum = "645cbb3a84e60b7531617d5ae4e57f7e27308f6445f5abf653209ea76dec8dff" 854 989 855 990 [[package]] 991 + name = "flate2" 992 + version = "1.1.5" 993 + source = "registry+https://github.com/rust-lang/crates.io-index" 994 + checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" 995 + dependencies = [ 996 + "crc32fast", 997 + "miniz_oxide", 998 + ] 999 + 1000 + [[package]] 856 1001 name = "fnv" 857 1002 version = "1.0.7" 858 1003 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1032 1177 "r-efi", 1033 1178 "wasip2", 1034 1179 "wasm-bindgen", 1180 + ] 1181 + 1182 + [[package]] 1183 + name = "gif" 1184 + version = "0.13.3" 1185 + source = "registry+https://github.com/rust-lang/crates.io-index" 1186 + checksum = "4ae047235e33e2829703574b54fdec96bfbad892062d97fed2f76022287de61b" 1187 + dependencies = [ 1188 + "color_quant", 1189 + "weezl", 1035 1190 ] 1036 1191 1037 1192 [[package]] ··· 1503 1658 ] 1504 1659 1505 1660 [[package]] 1661 + name = "image" 1662 + version = "0.24.9" 1663 + 
source = "registry+https://github.com/rust-lang/crates.io-index" 1664 + checksum = "5690139d2f55868e080017335e4b94cb7414274c74f1669c84fb5feba2c9f69d" 1665 + dependencies = [ 1666 + "bytemuck", 1667 + "byteorder", 1668 + "color_quant", 1669 + "gif", 1670 + "jpeg-decoder", 1671 + "num-traits", 1672 + "png", 1673 + "tiff", 1674 + ] 1675 + 1676 + [[package]] 1506 1677 name = "indexmap" 1507 1678 version = "2.12.1" 1508 1679 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1510 1681 dependencies = [ 1511 1682 "equivalent", 1512 1683 "hashbrown 0.16.1", 1684 + ] 1685 + 1686 + [[package]] 1687 + name = "inout" 1688 + version = "0.1.4" 1689 + source = "registry+https://github.com/rust-lang/crates.io-index" 1690 + checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" 1691 + dependencies = [ 1692 + "block-padding", 1693 + "generic-array", 1513 1694 ] 1514 1695 1515 1696 [[package]] ··· 1594 1775 ] 1595 1776 1596 1777 [[package]] 1778 + name = "jpeg-decoder" 1779 + version = "0.3.2" 1780 + source = "registry+https://github.com/rust-lang/crates.io-index" 1781 + checksum = "00810f1d8b74be64b13dbf3db89ac67740615d6c891f0e7b6179326533011a07" 1782 + 1783 + [[package]] 1597 1784 name = "js-sys" 1598 1785 version = "0.3.83" 1599 1786 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1635 1822 source = "registry+https://github.com/rust-lang/crates.io-index" 1636 1823 checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" 1637 1824 dependencies = [ 1638 - "bitflags", 1825 + "bitflags 2.10.0", 1639 1826 "libc", 1640 1827 "redox_syscall 0.7.0", 1641 1828 ] ··· 1674 1861 checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" 1675 1862 1676 1863 [[package]] 1864 + name = "lopdf" 1865 + version = "0.38.0" 1866 + source = "registry+https://github.com/rust-lang/crates.io-index" 1867 + checksum = "c7184fdea2bc3cd272a1acec4030c321a8f9875e877b3f92a53f2f6033fdc289" 1868 + dependencies = [ 
1869 + "aes", 1870 + "bitflags 2.10.0", 1871 + "cbc", 1872 + "ecb", 1873 + "encoding_rs", 1874 + "flate2", 1875 + "getrandom 0.3.4", 1876 + "indexmap", 1877 + "itoa", 1878 + "log", 1879 + "md-5", 1880 + "nom", 1881 + "nom_locate", 1882 + "rand 0.9.2", 1883 + "rangemap", 1884 + "sha2", 1885 + "stringprep", 1886 + "thiserror 2.0.17", 1887 + "ttf-parser", 1888 + "weezl", 1889 + ] 1890 + 1891 + [[package]] 1677 1892 name = "lru" 1678 1893 version = "0.12.5" 1679 1894 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1756 1971 "base64", 1757 1972 "chrono", 1758 1973 "deadpool-postgres", 1974 + "docx-rs", 1759 1975 "dotenvy", 1760 1976 "ed25519-dalek", 1761 1977 "getrandom 0.3.4", 1762 1978 "hickory-resolver 0.24.4", 1763 1979 "malfestio-core", 1764 1980 "malfestio-readability", 1981 + "pdf-extract", 1765 1982 "regex", 1766 1983 "reqwest", 1767 1984 "serde", ··· 1866 2083 checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" 1867 2084 1868 2085 [[package]] 2086 + name = "miniz_oxide" 2087 + version = "0.8.9" 2088 + source = "registry+https://github.com/rust-lang/crates.io-index" 2089 + checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" 2090 + dependencies = [ 2091 + "adler2", 2092 + "simd-adler32", 2093 + ] 2094 + 2095 + [[package]] 1869 2096 name = "mio" 1870 2097 version = "1.1.1" 1871 2098 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1940 2167 checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" 1941 2168 1942 2169 [[package]] 2170 + name = "nom" 2171 + version = "8.0.0" 2172 + source = "registry+https://github.com/rust-lang/crates.io-index" 2173 + checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" 2174 + dependencies = [ 2175 + "memchr", 2176 + ] 2177 + 2178 + [[package]] 2179 + name = "nom_locate" 2180 + version = "5.0.0" 2181 + source = "registry+https://github.com/rust-lang/crates.io-index" 2182 + checksum = 
"0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d" 2183 + dependencies = [ 2184 + "bytecount", 2185 + "memchr", 2186 + "nom", 2187 + ] 2188 + 2189 + [[package]] 1943 2190 name = "nu-ansi-term" 1944 2191 version = "0.50.3" 1945 2192 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1995 2242 source = "registry+https://github.com/rust-lang/crates.io-index" 1996 2243 checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" 1997 2244 dependencies = [ 1998 - "bitflags", 2245 + "bitflags 2.10.0", 1999 2246 "cfg-if", 2000 2247 "foreign-types", 2001 2248 "libc", ··· 2081 2328 ] 2082 2329 2083 2330 [[package]] 2331 + name = "pdf-extract" 2332 + version = "0.10.0" 2333 + source = "registry+https://github.com/rust-lang/crates.io-index" 2334 + checksum = "1e28ba1758a3d3f361459645780e09570b573fc3c82637449e9963174c813a98" 2335 + dependencies = [ 2336 + "adobe-cmap-parser", 2337 + "cff-parser", 2338 + "encoding_rs", 2339 + "euclid", 2340 + "log", 2341 + "lopdf", 2342 + "postscript", 2343 + "type1-encoding-parser", 2344 + "unicode-normalization", 2345 + ] 2346 + 2347 + [[package]] 2084 2348 name = "pem-rfc7468" 2085 2349 version = "0.7.0" 2086 2350 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2221 2485 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" 2222 2486 2223 2487 [[package]] 2488 + name = "png" 2489 + version = "0.17.16" 2490 + source = "registry+https://github.com/rust-lang/crates.io-index" 2491 + checksum = "82151a2fc869e011c153adc57cf2789ccb8d9906ce52c0b39a6b5697749d7526" 2492 + dependencies = [ 2493 + "bitflags 1.3.2", 2494 + "crc32fast", 2495 + "fdeflate", 2496 + "flate2", 2497 + "miniz_oxide", 2498 + ] 2499 + 2500 + [[package]] 2501 + name = "pom" 2502 + version = "1.1.0" 2503 + source = "registry+https://github.com/rust-lang/crates.io-index" 2504 + checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" 2505 + 2506 + [[package]] 2224 2507 
name = "portable-atomic" 2225 2508 version = "1.13.0" 2226 2509 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2258 2541 "serde_json", 2259 2542 "uuid", 2260 2543 ] 2544 + 2545 + [[package]] 2546 + name = "postscript" 2547 + version = "0.14.1" 2548 + source = "registry+https://github.com/rust-lang/crates.io-index" 2549 + checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" 2261 2550 2262 2551 [[package]] 2263 2552 name = "potential_utf" ··· 2443 2732 ] 2444 2733 2445 2734 [[package]] 2735 + name = "rangemap" 2736 + version = "1.7.1" 2737 + source = "registry+https://github.com/rust-lang/crates.io-index" 2738 + checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" 2739 + 2740 + [[package]] 2446 2741 name = "redox_syscall" 2447 2742 version = "0.5.18" 2448 2743 source = "registry+https://github.com/rust-lang/crates.io-index" 2449 2744 checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" 2450 2745 dependencies = [ 2451 - "bitflags", 2746 + "bitflags 2.10.0", 2452 2747 ] 2453 2748 2454 2749 [[package]] ··· 2457 2752 source = "registry+https://github.com/rust-lang/crates.io-index" 2458 2753 checksum = "49f3fe0889e69e2ae9e41f4d6c4c0181701d00e4697b356fb1f74173a5e0ee27" 2459 2754 dependencies = [ 2460 - "bitflags", 2755 + "bitflags 2.10.0", 2461 2756 ] 2462 2757 2463 2758 [[package]] ··· 2584 2879 source = "registry+https://github.com/rust-lang/crates.io-index" 2585 2880 checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" 2586 2881 dependencies = [ 2587 - "bitflags", 2882 + "bitflags 2.10.0", 2588 2883 "errno", 2589 2884 "libc", 2590 2885 "linux-raw-sys", ··· 2710 3005 source = "registry+https://github.com/rust-lang/crates.io-index" 2711 3006 checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" 2712 3007 dependencies = [ 2713 - "bitflags", 3008 + "bitflags 2.10.0", 2714 3009 "core-foundation 0.9.4", 2715 3010 
"core-foundation-sys", 2716 3011 "libc", ··· 2723 3018 source = "registry+https://github.com/rust-lang/crates.io-index" 2724 3019 checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" 2725 3020 dependencies = [ 2726 - "bitflags", 3021 + "bitflags 2.10.0", 2727 3022 "core-foundation 0.10.1", 2728 3023 "core-foundation-sys", 2729 3024 "libc", ··· 2746 3041 source = "registry+https://github.com/rust-lang/crates.io-index" 2747 3042 checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7" 2748 3043 dependencies = [ 2749 - "bitflags", 3044 + "bitflags 2.10.0", 2750 3045 "cssparser", 2751 3046 "derive_more", 2752 3047 "log", ··· 2930 3225 ] 2931 3226 2932 3227 [[package]] 3228 + name = "simd-adler32" 3229 + version = "0.3.8" 3230 + source = "registry+https://github.com/rust-lang/crates.io-index" 3231 + checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" 3232 + 3233 + [[package]] 2933 3234 name = "simdutf8" 2934 3235 version = "0.1.5" 2935 3236 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 3131 3432 source = "registry+https://github.com/rust-lang/crates.io-index" 3132 3433 checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" 3133 3434 dependencies = [ 3134 - "bitflags", 3435 + "bitflags 2.10.0", 3135 3436 "core-foundation 0.9.4", 3136 3437 "system-configuration-sys", 3137 3438 ] ··· 3226 3527 ] 3227 3528 3228 3529 [[package]] 3530 + name = "tiff" 3531 + version = "0.9.1" 3532 + source = "registry+https://github.com/rust-lang/crates.io-index" 3533 + checksum = "ba1310fcea54c6a9a4fd1aad794ecc02c31682f6bfbecdf460bf19533eed1e3e" 3534 + dependencies = [ 3535 + "flate2", 3536 + "jpeg-decoder", 3537 + "weezl", 3538 + ] 3539 + 3540 + [[package]] 3229 3541 name = "time" 3230 3542 version = "0.3.44" 3231 3543 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 3429 3741 source = "registry+https://github.com/rust-lang/crates.io-index" 3430 3742 
checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" 3431 3743 dependencies = [ 3432 - "bitflags", 3744 + "bitflags 2.10.0", 3433 3745 "bytes", 3434 3746 "futures-util", 3435 3747 "http", ··· 3523 3835 checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" 3524 3836 3525 3837 [[package]] 3838 + name = "ttf-parser" 3839 + version = "0.25.1" 3840 + source = "registry+https://github.com/rust-lang/crates.io-index" 3841 + checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31" 3842 + 3843 + [[package]] 3844 + name = "type1-encoding-parser" 3845 + version = "0.1.0" 3846 + source = "registry+https://github.com/rust-lang/crates.io-index" 3847 + checksum = "d3d6cc09e1a99c7e01f2afe4953789311a1c50baebbdac5b477ecf78e2e92a5b" 3848 + dependencies = [ 3849 + "pom", 3850 + ] 3851 + 3852 + [[package]] 3526 3853 name = "typed-arena" 3527 3854 version = "1.7.0" 3528 3855 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 3791 4118 ] 3792 4119 3793 4120 [[package]] 4121 + name = "weezl" 4122 + version = "0.1.12" 4123 + source = "registry+https://github.com/rust-lang/crates.io-index" 4124 + checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" 4125 + 4126 + [[package]] 3794 4127 name = "whoami" 3795 4128 version = "1.6.1" 3796 4129 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 4131 4464 checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" 4132 4465 4133 4466 [[package]] 4467 + name = "xml-rs" 4468 + version = "0.8.28" 4469 + source = "registry+https://github.com/rust-lang/crates.io-index" 4470 + checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f" 4471 + 4472 + [[package]] 4134 4473 name = "xml5ever" 4135 4474 version = "0.18.1" 4136 4475 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 4242 4581 "proc-macro2", 4243 4582 "quote", 4244 4583 "syn 2.0.111", 4584 + ] 4585 + 4586 + 
[[package]] 4587 + name = "zip" 4588 + version = "0.6.6" 4589 + source = "registry+https://github.com/rust-lang/crates.io-index" 4590 + checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" 4591 + dependencies = [ 4592 + "byteorder", 4593 + "crc32fast", 4594 + "crossbeam-utils", 4595 + "flate2", 4245 4596 ] 4246 4597 4247 4598 [[package]]

+5

README.md

··· 13 13 - [Information Architecture](./docs/information-architecture.md) - Navigation and URL structure 14 14 - [Data Model Mapping](./docs/data-model-mapping.md) - Lexicon to database mapping 15 15 - [Roadmap](./docs/todo.md) - Development milestones 16 + 17 + ## Test Data 18 + 19 + The file `crates/server/tests/data/1904.09828v2.pdf` is included for testing purposes. 20 + It contains the paper "Magic: The Gathering is Turing Complete" (arXiv:1904.09828).

+2

crates/server/Cargo.toml

··· 37 37 tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } 38 38 uuid = { version = "1.19.0", features = ["v4", "fast-rng"] } 39 39 hickory-resolver = "0.24" 40 + pdf-extract = "0.10" 41 + docx-rs = "0.4"

+15

crates/server/src/import/docx.rs

··· 1 + use super::DocumentParser; 2 + use anyhow::Result; 3 + use std::path::Path; 4 + 5 + pub struct DocxParser; 6 + 7 + impl DocumentParser for DocxParser { 8 + fn parse(&self, path: &Path) -> Result<String> { 9 + // TODO: Implement DOCX parsing using docx-rs 10 + Ok(format!( 11 + "DOCX parsing not yet implemented. File: {:?}", 12 + path.file_name() 13 + )) 14 + } 15 + }

+11

crates/server/src/import/mod.rs

//! Document import: parsers that read a file from disk and return its text.

use anyhow::Result;
use std::path::Path;

pub mod docx;
pub mod pdf;

/// Trait for parsing documents (PDF, DOCX, etc.) and extracting text.
///
/// Implemented by [`docx::DocxParser`] and [`pdf::PdfParser`].
pub trait DocumentParser {
    /// Parse the document at the given path and return the extracted text.
    ///
    /// # Errors
    ///
    /// Failures are reported through the returned [`anyhow::Result`]; what
    /// counts as a failure is decided by each implementation.
    fn parse(&self, path: &Path) -> Result<String>;
}

+13

crates/server/src/import/pdf.rs

··· 1 + use super::DocumentParser; 2 + use anyhow::{Context, Result}; 3 + use std::path::Path; 4 + 5 + pub struct PdfParser; 6 + 7 + impl DocumentParser for PdfParser { 8 + fn parse(&self, path: &Path) -> Result<String> { 9 + let text = 10 + pdf_extract::extract_text(path).with_context(|| format!("Failed to extract text from PDF: {:?}", path))?; 11 + Ok(text) 12 + } 13 + }

+1

crates/server/src/lib.rs

··· 1 1 pub mod api; 2 2 pub mod db; 3 3 pub mod firehose; 4 + pub mod import; 4 5 pub mod middleware; 5 6 pub mod oauth; 6 7 pub mod pds;

crates/server/tests/data/1904.09828v2.pdf

This is a binary file and will not be displayed.

+74

crates/server/tests/import_tests.rs

··· 1 + use malfestio_server::import::{DocumentParser, docx::DocxParser, pdf::PdfParser}; 2 + use std::path::PathBuf; 3 + 4 + fn get_test_data_path(filename: &str) -> PathBuf { 5 + let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); 6 + path.push("tests/data"); 7 + path.push(filename); 8 + path 9 + } 10 + 11 + #[test] 12 + fn test_pdf_extraction() { 13 + let path = get_test_data_path("1904.09828v2.pdf"); 14 + assert!(path.exists(), "Test PDF not found at {:?}", path); 15 + 16 + let parser = PdfParser; 17 + let result = parser.parse(&path); 18 + assert!(result.is_ok(), "PDF parsing should succeed"); 19 + 20 + let content = result.unwrap(); 21 + assert!(!content.is_empty(), "Extracted content should not be empty"); 22 + assert!( 23 + content.contains("Magic: The Gathering"), 24 + "Content should contain 'Magic: The Gathering'" 25 + ); 26 + assert!( 27 + content.contains("Turing Complete"), 28 + "Content should contain 'Turing Complete'" 29 + ); 30 + 31 + assert!( 32 + content.contains("Alex Churchill"), 33 + "Content should contain author 'Alex Churchill'" 34 + ); 35 + 36 + let content_lower = content.to_lowercase(); 37 + assert!(content_lower.contains("abstract"), "Content should contain 'Abstract'"); 38 + 39 + assert!( 40 + content_lower.contains("introduction"), 41 + "Content should contain 'Introduction'" 42 + ); 43 + assert!( 44 + content_lower.contains("references"), 45 + "Content should contain 'References'" 46 + ); 47 + 48 + assert!( 49 + content.len() > 5000, 50 + "Content should be substantial (likely > 5000 chars)" 51 + ); 52 + } 53 + 54 + #[test] 55 + fn test_docx_stub_extraction() { 56 + let path = get_test_data_path("dummy.docx"); 57 + let parser = DocxParser; 58 + let result = parser.parse(&path); 59 + 60 + assert!(result.is_ok(), "DOCX stub should return Ok"); 61 + let content = result.unwrap(); 62 + assert!( 63 + content.contains("not yet implemented"), 64 + "Content should indicate stub implementation" 65 + ); 66 + } 67 + 68 + #[test] 69 + fn 
test_pdf_missing_file() { 70 + let path = get_test_data_path("non_existent.pdf"); 71 + let parser = PdfParser; 72 + let result = parser.parse(&path); 73 + assert!(result.is_err(), "Parsing missing file should return error"); 74 + }

+40

docs/todo.md

··· 184 184 185 185 **Reference:** [Ozone Moderation Service](https://github.com/bluesky-social/ozone) 186 186 187 + ### Milestone P - Readability Updates 188 + 189 + #### Deliverables 190 + 191 + **Multi-page Support:** 192 + 193 + - [ ] `single_page_link` directive - find "view full article" link 194 + - [ ] `next_page_link` directive - paginate through article pages 195 + - [ ] Concatenate content from multiple pages 196 + - [ ] Avoid circular pagination 197 + 198 + **Advanced Directives:** 199 + 200 + - [ ] `http_header(name)` directive - custom headers for fetching 201 + - [ ] `replace_string(find): replace` directive - text replacement 202 + - [ ] `find_string` directive - text pattern matching 203 + 204 + **Quality:** 205 + 206 + - [ ] Better table handling in markdown 207 + - [ ] Image caption extraction 208 + - [ ] JSON-LD support 209 + 210 + **Performance:** 211 + 212 + - [ ] LRU cache for parsed configs 213 + - [ ] Parallel candidate scoring 214 + - [ ] Lazy XPath evaluation 215 + 216 + **Markdown Conversion:** 217 + 218 + - [ ] Custom markdown converter (more control than html2md) 219 + - [ ] Code block language detection 220 + 221 + #### Acceptance 222 + 223 + - [ ] Can correctly extract multi-page articles (e.g., long news reports). 224 + - [ ] Advanced string manipulation allows for cleaner output on tricky sites. 225 + - [ ] Performance remains stable under high load. 226 + 187 227 ## Open Question/Parked Decisions 188 228 189 229 - Full offline authoring + later publish

Configure Feed

Configure Feed