CMU Coding Bootcamp
1const articles = ["a", "an", "the", "this"];
2
3const prepositions = [
4 "in",
5 "on",
6 "at",
7 "to",
8 "from",
9 "by",
10 "with",
11 "for",
12 "of",
13];
14
15const conjunctions = ["and", "or", "but", "yet", "so"];
16
17const pronouns = ["i", "me", "he", "she", "it", "we", "they", "you"];
18
19const auxiliaryVerbs = [
20 "is",
21 "are",
22 "am",
23 "be",
24 "been",
25 "being",
26 "has",
27 "have",
28 "had",
29 "do",
30 "does",
31 "did",
32];
33
34const commonVerbs = ["go", "get", "make", "take", "see", "come", "think"];
35
36const adverbs = ["very", "more", "most", "also", "just", "only"];
37
38const otherCommon = [
39 "not",
40 "no",
41 "yes",
42 "some",
43 "any",
44 "all",
45 "each",
46 "every",
47 "what",
48 "which",
49 "who",
50 "when",
51 "where",
52 "why",
53 "how",
54];
55
56const contractions = [
57 "its",
58 "youve",
59 "youre",
60 "weve",
61 "were",
62 "itd",
63 "youd",
64 "yall",
65];
66
67const stopWords = new Set([
68 ...articles,
69 ...prepositions,
70 ...adverbs,
71 ...otherCommon,
72 ...commonVerbs,
73 ...conjunctions,
74 ...pronouns,
75 ...auxiliaryVerbs,
76 ...contractions,
77]);
78
79export class SearchIndex {
80 private index: Map<string, [string, number][]>;
81
82 constructor() {
83 this.index = new Map<string, [string, number][]>();
84 }
85
86 private getPhrases(
87 words: string[],
88 filter: boolean[],
89 ): Map<string, number> {
90 const wordGroups: string[][] = [];
91 let currentSlice: string[] = [];
92 for (const [index, val] of filter.entries()) {
93 if (val) {
94 currentSlice.push(words[index]!);
95 continue;
96 }
97 if (currentSlice.length > 1) wordGroups.push(currentSlice);
98 currentSlice = [];
99 }
100 const subPhrases: string[] = wordGroups.flatMap((group) =>
101 this.getSubPhrases(group),
102 );
103 const subPhraseDict = new Map<string, number>();
104 for (const sp of subPhrases) {
105 subPhraseDict.set(sp, (subPhraseDict.get(sp) || 0) + 1);
106 }
107 return subPhraseDict;
108 }
109
110 private getSubPhrases(phrase: string[]): string[] {
111 const subPhrases: string[] = [phrase.join(" ")];
112 for (let i = 2; i < phrase.length; i++) {
113 for (let offset = 0; offset + i < phrase.length + 1; offset++) {
114 const subPhrase = phrase.slice(offset, offset + i).join(" ");
115 subPhrases.push(subPhrase);
116 }
117 }
118 return subPhrases;
119 }
120
121 private extractKeywords(pageContent: string): Map<string, number> {
122 let words: string[] = pageContent
123 .split(/\s+/)
124 .map((str) => str.replaceAll(/[^\w]+/g, "").toLowerCase())
125 .filter((str) => str.length > 0);
126 words = [...words, "a"];
127 const filter = words.map((word) => !stopWords.has(word));
128 const keywords = new Set<string>(words).difference(stopWords);
129 const keywordMap = new Map<string, number>(
130 keywords.values().map((kw) => [kw, 0]),
131 );
132
133 for (let word of words) {
134 if (keywords.has(word)) {
135 keywordMap.set(word, (keywordMap.get(word) || 0) + 1);
136 }
137 }
138 const phrases = this.getPhrases(words, filter);
139 return new Map([...keywordMap, ...phrases]);
140 }
141
142 addPage(url: string, pageContent: string): void {
143 let keywords = this.extractKeywords(pageContent);
144 for (let [kw, count] of keywords.entries()) {
145 if (this.index.has(kw)) {
146 let prev = this.index.get(kw)!;
147 prev.push([url, count]);
148 this.index.set(kw, prev);
149 } else {
150 this.index.set(kw, [[url, count]]);
151 }
152 }
153 }
154
155 updatePage(url: string, pageContent: string): void {
156 this.removePage(url);
157 this.addPage(url, pageContent);
158 }
159
160 removePage(url: string): void {
161 this.index.entries().forEach(([keyword, urls]) => {
162 const index = urls.findIndex(([u, _]) => u === url);
163 if (index >= 0) {
164 urls.splice(index, 1);
165 if (urls.length === 0) {
166 this.index.delete(keyword);
167 }
168 }
169 });
170 }
171
172 checkPage(search: string): boolean {
173 for (const urls of this.index.values()) {
174 for (const [url, _] of urls) {
175 if (search === url) {
176 return true;
177 }
178 }
179 }
180 return false;
181 }
182
183 size() {
184 return this.index.size;
185 }
186
187 getPagesForKeyword(keyword: string): string[] {
188 const pages = this.index.get(keyword);
189 if (!pages) {
190 return [];
191 }
192 return Array.from(pages)
193 .sort((a, b) => a[1] - b[1])
194 .map(([url, _]) => url);
195 }
196
197 search(query: string): string[] {
198 const urls = new Map<string, number>();
199 const keys = this.extractKeywords(query);
200 for (const [key, count] of keys.entries()) {
201 const pages = this.index.get(key);
202 if (!pages) continue;
203 for (const [url, v] of pages) {
204 urls.set(
205 url,
206 urls.has(url)
207 ? (urls.get(url) || 0) + v * count
208 : v * count,
209 );
210 }
211 }
212 urls.forEach((value, key) => {
213 if (key.includes(query)) {
214 value += 10;
215 }
216 });
217 return Array.from(urls.entries())
218 .sort((a, b) => b[1] - a[1])
219 .map(([url, _]) => url);
220 }
221}