C++ Standard Template Library browser.
at main 5.3 kB view raw
1#include "indexer.hpp" 2#include "prelude.hpp" 3#include <cctype> 4#include <cstdlib> 5 6fn filter_divs(std::string_view html) noexcept -> String 7{ 8 var buffer = String(); 9 while (not html.empty()) { 10 if (html.front() == '<') [[unlikely]] { 11 while (html.front() != '>') 12 html.remove_prefix(1); 13 html.remove_prefix(1); 14 } else { 15 buffer += html.front(); 16 html.remove_prefix(1); 17 } 18 } 19 20 return buffer; 21} 22 23fn to_words(std::string_view lines) noexcept -> Vector<String> 24{ 25 var buffer = Vector<String>(); 26 let valid_char = [](Char ch) { 27 return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); 28 }; 29 while (not lines.empty()) { 30 if (not valid_char(lines.front())) [[unlikely]] { 31 while (not lines.empty() && 32 not valid_char(lines.front())) { 33 lines.remove_prefix(1); 34 } 35 } else { 36 var str = String(); 37 while (valid_char(lines.front())) { 38 str += std::toupper(lines.front()); 39 lines.remove_prefix(1); 40 } 41 buffer.push_back(str); 42 } 43 } 44 45 return buffer; 46} 47 48fn index_directory(std::filesystem::path dir) noexcept -> Vector<Document> 49{ 50 std::cout << "creating index of: " << dir << std::endl; 51 var documents = Vector<Document>(); 52 for (let& dir_entry : 53 std::filesystem::recursive_directory_iterator(dir)) { 54 let path = dir_entry.path(); 55 if (path.extension() != ".html") 56 continue; 57 let document = Document(path.native()); 58 documents.push_back(document); 59 } 60 61 return documents; 62} 63 64fn count_query(Ref<const Vector<Document>> documents, 65 Ref<const String> query) noexcept 66 -> Vector<std::pair<String, F64>> 67{ 68 var matches = Vector<std::pair<String, F64>>(); 69 for (let& document : documents) { 70 if (document.freqs.contains(query)) { 71 matches.push_back( 72 {document.filename, F64(document.freqs.at(query))}); 73 } 74 } 75 return matches; 76} 77 78fn count_tfidf(Ref<const Vector<Document>> documents, 79 Ref<const String> query) noexcept 80 -> Vector<std::pair<String, F64>> 81{ 82 var matches = Vector<std::pair<String, F64>>(); 83 var tf = F64(); 84 var df = U64(); 85 for (let& document : documents) { 86 if (document.freqs.contains(query)) { 87 tf = F64(document.freqs.at(query)) / 88 F64(document.freqs.size()); 89 ++df; 90 matches.push_back({document.filename, tf}); 91 } 92 } 93 94 for (var & [ _, count ] : matches) { 95 count /= df; 96 } 97 return matches; 98} 99 100void cache_index(Ref<const Vector<Document>> documents) noexcept 101{ 102 std::cout << "caching index in: "; 103 using namespace std::filesystem; 104 /* check if cache dir exists and has an index */ 105 let cache_dir = get_cache_dir(); 106 create_directories(cache_dir); 107 let cache = cache_dir / "cache.txt"s; 108 109 std::cout << cache << std::endl; 110 111 /* index into cache dir */ 112 var f = std::ofstream(); 113 if (not exists(cache)) { 114 f.open(cache.c_str()); 115 for (let& document : documents) { 116 f << document.filename << ' ' << document.freqs.size() << '\n'; 117 for (let& [word, count] : document.freqs) { 118 f << word << ' ' << count << '\n'; 119 } 120 } 121 f.close(); 122 } 123} 124 125fn get_cache_dir() noexcept -> std::filesystem::path 126{ 127 using namespace std::filesystem; 128 var cache_dir = String(); 129 if (std::getenv("XDG_CACHE_HOME")) { 130 cache_dir = path(std::getenv("XDG_CACHE_HOME")) / "stl-index"; 131 } else if (std::getenv("HOME")) { 132 cache_dir = path(std::getenv("HOME")) / ".cache" / "stl-index"; 133 } else { 134 cache_dir = path("."); 135 std::cerr << "Cache directory not found at `$XDG_CACHE_HOME` or `$HOME/.cache`." << std::endl; 136 std::cerr << "Cache will be dumped in the current directory. Do with it what you will." << std::endl; 137 } 138 139 return cache_dir; 140} 141 142fn load_index() noexcept -> Vector<Document> 143{ 144 std::cout << "loading index from: "; 145 var documents = Vector<Document>(); 146 let cache_dir = get_cache_dir(); 147 var cache = std::ifstream(); 148 var path = cache_dir / "cache.txt"; 149 std::cout << path << std::endl; 150 cache.open(path); 151 if (cache.is_open()) { 152 var n = 0ULL; 153 var count = 0ULL; 154 while (not cache.eof()) { 155 var freqs = Vector<std::pair<String, U64>>(); 156 var filename = String(); 157 var word = String(); 158 cache >> filename; 159 cache >> n; 160 while (n-- && not cache.eof()) { 161 cache >> word; 162 cache >> count; 163 freqs.push_back({ word, count }); 164 } 165 documents.emplace_back(filename, std::unordered_map(freqs.cbegin(), freqs.cend())); 166 } 167 } 168 169 return documents; 170} 171 172Document::Document(std::string_view name, map<String, U64>&& mp) 173 : freqs(mp), filename(name) 174{ 175} 176 177Document::Document(std::string_view name) : filename(name) 178{ 179 var f = std::ifstream(filename); 180 181 if (not f.is_open()) { 182 std::cout << "Failed to open: " << name << std::endl; 183 } else { 184 /* weird notation for converting a file to string */ 185 var buffer = std::stringstream(); 186 buffer << f.rdbuf(); 187 let html = buffer.str(); 188 189 let words = to_words(filter_divs(html)); 190 for (let& str : words) 191 ++freqs[str]; 192 } 193} 194 195void Document::show_freqs() const noexcept 196{ 197 for (let & [ term, freq ] : freqs) 198 std::cout << term << ": " << freq << std::endl; 199} 200 201fn top_n_matches(Ref<const Vector<Document>> documents, Ref<const String> query, U64 n) noexcept 202 -> Vector<std::pair<String, F64>> 203{ 204 var matches = count_tfidf(documents, query); 205 let greater = [](let& x, let& y) { return x.second > y.second; }; 206 if (matches.size() < n) 207 ra::sort(matches, greater); 208 else 209 ra::partial_sort(matches, matches.begin() + n, greater); 210 211 matches.resize(n); 212 213 return matches; 214}