+12
nilla.nix
+12
nilla.nix
+69
ts/searchEngine/index.test.ts
+69
ts/searchEngine/index.test.ts
···
1
+
import { describe, it, beforeEach, expect } from "bun:test";
2
+
import { SearchIndex } from ".";
3
+
4
+
// Behavioural tests for SearchIndex: adding, updating, removing and querying pages.
describe("Search Index", () => {
  let index: SearchIndex;

  // Every test starts from a fresh, empty index so no state leaks between tests.
  beforeEach(() => {
    index = new SearchIndex();
  });

  it("should add a new page to the index", () => {
    index.addPage(
      "https://www.example.com",
      "This is a sample webpage about dogs",
    );
    expect(index.getPagesForKeyword("dogs")).toContain(
      "https://www.example.com",
    );
  });

  // Nothing has been indexed, so any keyword lookup must come back empty.
  it("should return an empty list for the keyword", () => {
    expect(index.getPagesForKeyword("pineapple")).toBeEmpty();
  });

  it("should update a page in the index", () => {
    index.addPage(
      "https://www.example.com",
      "This is a sample web page about dogs",
    );
    index.updatePage(
      "https://www.example.com",
      "This is a sample web page about cats",
    );
    // The old content's keyword must be gone, the new one must be present.
    expect(index.getPagesForKeyword("dogs")).not.toContain(
      "https://www.example.com",
    );
    expect(index.getPagesForKeyword("cats")).toContain(
      "https://www.example.com",
    );
  });

  it("should remove a page from the index", () => {
    index.addPage(
      "https://www.example.com",
      "This is a sample web page about cats",
    );
    index.removePage("https://www.example.com");
    expect(index.getPagesForKeyword("cats")).not.toContain(
      "https://www.example.com",
    );
  });

  it("should return relevant pages for a keyword", () => {
    index.addPage(
      "https://www.example.com",
      "This is a sample web page about cats",
    );
    expect(index.getPagesForKeyword("cats")).toContain(
      "https://www.example.com",
    );
  });

  it("should return multiple relevant pages that share a keyword", () => {
    index.addPage(
      "https://www.pineapple-world.com",
      "We have lots of pineapples. You've never seen this many pineapples before.",
    );
    index.addPage(
      "https://www.pineapple-is-my-favorite-fruit.com",
      "I love pineapples, it's all I eat. I mean I REALLY LOVE PINEAPPLES",
    );

    const pages = index.getPagesForKeyword("pineapples");
    expect(pages).toBeArrayOfSize(2);
    // Check the actual URLs, not just the count, so a wrong pair cannot pass.
    expect(pages).toContain("https://www.pineapple-world.com");
    expect(pages).toContain("https://www.pineapple-is-my-favorite-fruit.com");
  });
});
+181
ts/searchEngine/index.ts
+181
ts/searchEngine/index.ts
···
1
+
// Stop-word tables, grouped by part of speech. All entries are lowercase
// because tokens are lowercased before they are compared against these lists.
const articles = ["a", "an", "the", "this"];

const prepositions = [
  "in",
  "on",
  "at",
  "to",
  "from",
  "by",
  "with",
  "for",
  "of",
];

const conjunctions = ["and", "or", "but", "yet", "so"];

const pronouns = ["i", "me", "he", "she", "it", "we", "they", "you"];

const auxiliaryVerbs = [
  "is",
  "are",
  "am",
  "be",
  "been",
  "being",
  "has",
  "have",
  "had",
  "do",
  "does",
  "did",
];

const commonVerbs = ["go", "get", "make", "take", "see", "come", "think"];

const adverbs = ["very", "more", "most", "also", "just", "only"];

const otherCommon = [
  "not",
  "no",
  "yes",
  "some",
  "any",
  "all",
  "each",
  "every",
  "what",
  "which",
  "who",
  "when",
  "where",
  "why",
  "how",
];

// Contractions are spelled without apostrophes because tokenization strips
// punctuation before the stop-word check ("you've" -> "youve").
const contractions = [
  "its",
  "youve",
  "youre",
  "weve",
  "were",
  "itd",
  "youd",
  "yall",
];

/** Every word that is never indexed on its own and that breaks a phrase. */
const stopWords = new Set([
  ...articles,
  ...prepositions,
  ...adverbs,
  ...otherCommon,
  ...commonVerbs,
  ...conjunctions,
  ...pronouns,
  ...auxiliaryVerbs,
  ...contractions,
]);

/**
 * In-memory inverted index from keywords and phrases to the pages that
 * contain them.
 *
 * A "keyword" is any single token that is not a stop word. A "phrase" is any
 * run of two or more consecutive keywords (and every contiguous sub-run of at
 * least two words inside it), joined by single spaces.
 */
export class SearchIndex {
  /** keyword or phrase -> list of `[url, occurrence count]` postings. */
  index: Map<string, [string, number][]>;

  /**
   * Reverse lookup: url -> keys it was indexed under by `addPage`.
   * Lets `removePage` touch only the affected keys instead of the whole index.
   */
  private readonly keywordsByUrl = new Map<string, string[]>();

  /** Matches every run of characters that is not a letter, mark, digit or `_`. */
  private static readonly nonWord = /[^\p{L}\p{M}\p{N}_]+/gu;

  constructor() {
    this.index = new Map<string, [string, number][]>();
  }

  /**
   * Splits text on whitespace, strips punctuation from each piece, lowercases
   * it and drops pieces that end up empty.
   */
  private tokenize(text: string): string[] {
    return text
      .split(/\s+/)
      .map((token) => token.replace(SearchIndex.nonWord, "").toLowerCase())
      .filter((token) => token.length > 0);
  }

  /**
   * Counts every phrase found in `words`.
   *
   * @param words  Tokens in document order.
   * @param filter Parallel array; `true` where the token is a keyword,
   *               `false` where it is a stop word (which ends the current run).
   * @returns phrase -> number of occurrences.
   */
  private getPhrases(
    words: string[],
    filter: boolean[],
  ): Map<string, number> {
    const counts = new Map<string, number>();
    let run: string[] = [];

    // Records the phrases of the current run (if it has 2+ words) and resets it.
    const flush = (): void => {
      if (run.length > 1) {
        for (const phrase of this.getSubPhrases(run)) {
          counts.set(phrase, (counts.get(phrase) ?? 0) + 1);
        }
      }
      run = [];
    };

    words.forEach((word, position) => {
      if (filter[position]) {
        run.push(word);
      } else {
        flush();
      }
    });
    // The text may end in the middle of a run; flush it explicitly.
    flush();

    return counts;
  }

  /**
   * Returns the whole phrase plus every contiguous sub-phrase of at least two
   * words (window sizes 2 .. length-1), each joined by single spaces.
   */
  private getSubPhrases(phrase: string[]): string[] {
    const subPhrases: string[] = [phrase.join(" ")];
    for (let size = 2; size < phrase.length; size++) {
      for (let start = 0; start + size <= phrase.length; start++) {
        subPhrases.push(phrase.slice(start, start + size).join(" "));
      }
    }
    return subPhrases;
  }

  /**
   * Builds the occurrence table for a page: single keywords first (in order of
   * first appearance), followed by phrases.
   */
  private extractKeywords(pageContent: string): Map<string, number> {
    const words = this.tokenize(pageContent);
    const isKeyword = words.map((word) => !stopWords.has(word));

    const counts = new Map<string, number>();
    words.forEach((word, position) => {
      if (isKeyword[position]) {
        counts.set(word, (counts.get(word) ?? 0) + 1);
      }
    });

    // Phrases always contain a space, so they can never collide with a keyword.
    for (const [phrase, count] of this.getPhrases(words, isKeyword)) {
      counts.set(phrase, count);
    }
    return counts;
  }

  /**
   * Indexes a page. If the URL is already indexed, its previous entries are
   * replaced so a URL never appears twice under the same key.
   *
   * @param url         Identifier stored in the postings.
   * @param pageContent Raw text of the page.
   */
  addPage(url: string, pageContent: string): void {
    if (this.keywordsByUrl.has(url)) {
      this.removePage(url);
    }

    const keywords = this.extractKeywords(pageContent);
    for (const [keyword, count] of keywords) {
      const postings = this.index.get(keyword);
      if (postings) {
        postings.push([url, count]);
      } else {
        this.index.set(keyword, [[url, count]]);
      }
    }
    this.keywordsByUrl.set(url, [...keywords.keys()]);
  }

  /** Replaces the indexed content of `url` with `pageContent`. */
  updatePage(url: string, pageContent: string): void {
    this.removePage(url);
    this.addPage(url, pageContent);
  }

  /**
   * Removes every posting for `url`; keys left without postings are deleted.
   * Removing a URL that is not indexed is a no-op.
   */
  removePage(url: string): void {
    // Fall back to scanning every key when the URL is unknown to the reverse
    // map (e.g. `index` was populated directly rather than through addPage).
    const candidates = this.keywordsByUrl.get(url) ?? [...this.index.keys()];

    for (const keyword of candidates) {
      const postings = this.index.get(keyword);
      if (!postings) continue;

      const remaining = postings.filter(([pageUrl]) => pageUrl !== url);
      if (remaining.length === postings.length) continue;

      if (remaining.length === 0) {
        this.index.delete(keyword);
      } else {
        this.index.set(keyword, remaining);
      }
    }
    this.keywordsByUrl.delete(url);
  }

  /**
   * Looks up the pages indexed under a keyword or phrase.
   *
   * The query is normalized exactly like page content (lowercased, punctuation
   * stripped, whitespace collapsed) before the lookup.
   *
   * @returns URLs ordered from most to fewest occurrences; ties keep insertion
   *          order. Empty array when nothing matches.
   */
  getPagesForKeyword(keyword: string): string[] {
    const key = this.tokenize(keyword).join(" ");
    const postings = this.index.get(key);
    if (!postings) {
      return [];
    }
    // Copy before sorting so the stored postings keep their insertion order.
    return [...postings]
      .sort((a, b) => b[1] - a[1])
      .map(([url]) => url);
  }
}