C++ Standard Template Library browser.
#include "indexer.hpp"
#include "prelude.hpp"
#include <cctype>
#include <cstdlib>
#include <system_error>
#include <utility>
5
6fn filter_divs(std::string_view html) noexcept -> String
7{
8 var buffer = String();
9 while (not html.empty()) {
10 if (html.front() == '<') [[unlikely]] {
11 while (html.front() != '>')
12 html.remove_prefix(1);
13 html.remove_prefix(1);
14 } else {
15 buffer += html.front();
16 html.remove_prefix(1);
17 }
18 }
19
20 return buffer;
21}
22
23fn to_words(std::string_view lines) noexcept -> Vector<String>
24{
25 var buffer = Vector<String>();
26 let valid_char = [](Char ch) {
27 return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
28 };
29 while (not lines.empty()) {
30 if (not valid_char(lines.front())) [[unlikely]] {
31 while (not lines.empty() &&
32 not valid_char(lines.front())) {
33 lines.remove_prefix(1);
34 }
35 } else {
36 var str = String();
37 while (valid_char(lines.front())) {
38 str += std::toupper(lines.front());
39 lines.remove_prefix(1);
40 }
41 buffer.push_back(str);
42 }
43 }
44
45 return buffer;
46}
47
48fn index_directory(std::filesystem::path dir) noexcept -> Vector<Document>
49{
50 std::cout << "creating index of: " << dir << std::endl;
51 var documents = Vector<Document>();
52 for (let& dir_entry :
53 std::filesystem::recursive_directory_iterator(dir)) {
54 let path = dir_entry.path();
55 if (path.extension() != ".html")
56 continue;
57 let document = Document(path.native());
58 documents.push_back(document);
59 }
60
61 return documents;
62}
63
64fn count_query(Ref<const Vector<Document>> documents,
65 Ref<const String> query) noexcept
66 -> Vector<std::pair<String, F64>>
67{
68 var matches = Vector<std::pair<String, F64>>();
69 for (let& document : documents) {
70 if (document.freqs.contains(query)) {
71 matches.push_back(
72 {document.filename, F64(document.freqs.at(query))});
73 }
74 }
75 return matches;
76}
77
78fn count_tfidf(Ref<const Vector<Document>> documents,
79 Ref<const String> query) noexcept
80 -> Vector<std::pair<String, F64>>
81{
82 var matches = Vector<std::pair<String, F64>>();
83 var tf = F64();
84 var df = U64();
85 for (let& document : documents) {
86 if (document.freqs.contains(query)) {
87 tf = F64(document.freqs.at(query)) /
88 F64(document.freqs.size());
89 ++df;
90 matches.push_back({document.filename, tf});
91 }
92 }
93
94 for (var & [ _, count ] : matches) {
95 count /= df;
96 }
97 return matches;
98}
99
100void cache_index(Ref<const Vector<Document>> documents) noexcept
101{
102 std::cout << "caching index in: ";
103 using namespace std::filesystem;
104 /* check if cache dir exists and has an index */
105 let cache_dir = get_cache_dir();
106 create_directories(cache_dir);
107 let cache = cache_dir / "cache.txt"s;
108
109 std::cout << cache << std::endl;
110
111 /* index into cache dir */
112 var f = std::ofstream();
113 if (not exists(cache)) {
114 f.open(cache.c_str());
115 for (let& document : documents) {
116 f << document.filename << ' ' << document.freqs.size() << '\n';
117 for (let& [word, count] : document.freqs) {
118 f << word << ' ' << count << '\n';
119 }
120 }
121 f.close();
122 }
123}
124
125fn get_cache_dir() noexcept -> std::filesystem::path
126{
127 using namespace std::filesystem;
128 var cache_dir = String();
129 if (std::getenv("XDG_CACHE_HOME")) {
130 cache_dir = path(std::getenv("XDG_CACHE_HOME")) / "stl-index";
131 } else if (std::getenv("HOME")) {
132 cache_dir = path(std::getenv("HOME")) / ".cache" / "stl-index";
133 } else {
134 cache_dir = path(".");
135 std::cerr << "Cache directory not found at `$XDG_CACHE_HOME` or `$HOME/.cache`." << std::endl;
136 std::cerr << "Cache will be dumped in the current directory. Do with it what you will." << std::endl;
137 }
138
139 return cache_dir;
140}
141
142fn load_index() noexcept -> Vector<Document>
143{
144 std::cout << "loading index from: ";
145 var documents = Vector<Document>();
146 let cache_dir = get_cache_dir();
147 var cache = std::ifstream();
148 var path = cache_dir / "cache.txt";
149 std::cout << path << std::endl;
150 cache.open(path);
151 if (cache.is_open()) {
152 var n = 0ULL;
153 var count = 0ULL;
154 while (not cache.eof()) {
155 var freqs = Vector<std::pair<String, U64>>();
156 var filename = String();
157 var word = String();
158 cache >> filename;
159 cache >> n;
160 while (n-- && not cache.eof()) {
161 cache >> word;
162 cache >> count;
163 freqs.push_back({ word, count });
164 }
165 documents.emplace_back(filename, std::unordered_map(freqs.cbegin(), freqs.cend()));
166 }
167 }
168
169 return documents;
170}
171
172Document::Document(std::string_view name, map<String, U64>&& mp)
173 : freqs(mp), filename(name)
174{
175}
176
177Document::Document(std::string_view name) : filename(name)
178{
179 var f = std::ifstream(filename);
180
181 if (not f.is_open()) {
182 std::cout << "Failed to open: " << name << std::endl;
183 } else {
184 /* weird notation for converting a file to string */
185 var buffer = std::stringstream();
186 buffer << f.rdbuf();
187 let html = buffer.str();
188
189 let words = to_words(filter_divs(html));
190 for (let& str : words)
191 ++freqs[str];
192 }
193}
194
195void Document::show_freqs() const noexcept
196{
197 for (let & [ term, freq ] : freqs)
198 std::cout << term << ": " << freq << std::endl;
199}
200
201fn top_n_matches(Ref<const Vector<Document>> documents, Ref<const String> query, U64 n) noexcept
202 -> Vector<std::pair<String, F64>>
203{
204 var matches = count_tfidf(documents, query);
205 let greater = [](let& x, let& y) { return x.second > y.second; };
206 if (matches.size() < n)
207 ra::sort(matches, greater);
208 else
209 ra::partial_sort(matches, matches.begin() + n, greater);
210
211 matches.resize(n);
212
213 return matches;
214}