CMU Coding Bootcamp
at main 5.7 kB view raw
1const articles = ["a", "an", "the", "this"]; 2 3const prepositions = [ 4 "in", 5 "on", 6 "at", 7 "to", 8 "from", 9 "by", 10 "with", 11 "for", 12 "of", 13]; 14 15const conjunctions = ["and", "or", "but", "yet", "so"]; 16 17const pronouns = ["i", "me", "he", "she", "it", "we", "they", "you"]; 18 19const auxiliaryVerbs = [ 20 "is", 21 "are", 22 "am", 23 "be", 24 "been", 25 "being", 26 "has", 27 "have", 28 "had", 29 "do", 30 "does", 31 "did", 32]; 33 34const commonVerbs = ["go", "get", "make", "take", "see", "come", "think"]; 35 36const adverbs = ["very", "more", "most", "also", "just", "only"]; 37 38const otherCommon = [ 39 "not", 40 "no", 41 "yes", 42 "some", 43 "any", 44 "all", 45 "each", 46 "every", 47 "what", 48 "which", 49 "who", 50 "when", 51 "where", 52 "why", 53 "how", 54]; 55 56const contractions = [ 57 "its", 58 "youve", 59 "youre", 60 "weve", 61 "were", 62 "itd", 63 "youd", 64 "yall", 65]; 66 67const stopWords = new Set([ 68 ...articles, 69 ...prepositions, 70 ...adverbs, 71 ...otherCommon, 72 ...commonVerbs, 73 ...conjunctions, 74 ...pronouns, 75 ...auxiliaryVerbs, 76 ...contractions, 77]); 78 79export class SearchIndex { 80 private index: Map<string, [string, number][]>; 81 82 constructor() { 83 this.index = new Map<string, [string, number][]>(); 84 } 85 86 private getPhrases( 87 words: string[], 88 filter: boolean[], 89 ): Map<string, number> { 90 const wordGroups: string[][] = []; 91 let currentSlice: string[] = []; 92 for (const [index, val] of filter.entries()) { 93 if (val) { 94 currentSlice.push(words[index]!); 95 continue; 96 } 97 if (currentSlice.length > 1) wordGroups.push(currentSlice); 98 currentSlice = []; 99 } 100 const subPhrases: string[] = wordGroups.flatMap((group) => 101 this.getSubPhrases(group), 102 ); 103 const subPhraseDict = new Map<string, number>(); 104 for (const sp of subPhrases) { 105 subPhraseDict.set(sp, (subPhraseDict.get(sp) || 0) + 1); 106 } 107 return subPhraseDict; 108 } 109 110 private getSubPhrases(phrase: string[]): string[] { 111 const subPhrases: string[] = [phrase.join(" ")]; 112 for (let i = 2; i < phrase.length; i++) { 113 for (let offset = 0; offset + i < phrase.length + 1; offset++) { 114 const subPhrase = phrase.slice(offset, offset + i).join(" "); 115 subPhrases.push(subPhrase); 116 } 117 } 118 return subPhrases; 119 } 120 121 private extractKeywords(pageContent: string): Map<string, number> { 122 let words: string[] = pageContent 123 .split(/\s+/) 124 .map((str) => str.replaceAll(/[^\w]+/g, "").toLowerCase()) 125 .filter((str) => str.length > 0); 126 words = [...words, "a"]; 127 const filter = words.map((word) => !stopWords.has(word)); 128 const keywords = new Set<string>(words).difference(stopWords); 129 const keywordMap = new Map<string, number>( 130 keywords.values().map((kw) => [kw, 0]), 131 ); 132 133 for (let word of words) { 134 if (keywords.has(word)) { 135 keywordMap.set(word, (keywordMap.get(word) || 0) + 1); 136 } 137 } 138 const phrases = this.getPhrases(words, filter); 139 return new Map([...keywordMap, ...phrases]); 140 } 141 142 addPage(url: string, pageContent: string): void { 143 let keywords = this.extractKeywords(pageContent); 144 for (let [kw, count] of keywords.entries()) { 145 if (this.index.has(kw)) { 146 let prev = this.index.get(kw)!; 147 prev.push([url, count]); 148 this.index.set(kw, prev); 149 } else { 150 this.index.set(kw, [[url, count]]); 151 } 152 } 153 } 154 155 updatePage(url: string, pageContent: string): void { 156 this.removePage(url); 157 this.addPage(url, pageContent); 158 } 159 160 removePage(url: string): void { 161 this.index.entries().forEach(([keyword, urls]) => { 162 const index = urls.findIndex(([u, _]) => u === url); 163 if (index >= 0) { 164 urls.splice(index, 1); 165 if (urls.length === 0) { 166 this.index.delete(keyword); 167 } 168 } 169 }); 170 } 171 172 checkPage(search: string): boolean { 173 for (const urls of this.index.values()) { 174 for (const [url, _] of urls) { 175 if (search === url) { 176 return true; 177 } 178 } 179 } 180 return false; 181 } 182 183 size() { 184 return this.index.size; 185 } 186 187 getPagesForKeyword(keyword: string): string[] { 188 const pages = this.index.get(keyword); 189 if (!pages) { 190 return []; 191 } 192 return Array.from(pages) 193 .sort((a, b) => a[1] - b[1]) 194 .map(([url, _]) => url); 195 } 196 197 search(query: string): string[] { 198 const urls = new Map<string, number>(); 199 const keys = this.extractKeywords(query); 200 for (const [key, count] of keys.entries()) { 201 const pages = this.index.get(key); 202 if (!pages) continue; 203 for (const [url, v] of pages) { 204 urls.set( 205 url, 206 urls.has(url) 207 ? (urls.get(url) || 0) + v * count 208 : v * count, 209 ); 210 } 211 } 212 urls.forEach((value, key) => { 213 if (key.includes(query)) { 214 value += 10; 215 } 216 }); 217 return Array.from(urls.entries()) 218 .sort((a, b) => b[1] - a[1]) 219 .map(([url, _]) => url); 220 } 221}