CMU Coding Bootcamp

feat: initial search engine logic

Co-authored-by: Max Mahn <mahn.maxwell@proton.me>

thecoded.prof c69c82eb 952376a8

verified
Changed files
+262
ts
searchEngine
+12
nilla.nix
··· 125 125 ]; 126 126 }; 127 127 }; 128 + shells.none = { 129 + systems = [ "x86_64-linux" ]; 130 + 131 + shell = { 132 + mkShell 133 + }: 134 + mkShell { 135 + shellHook = '' 136 + echo "hello" 137 + ''; 138 + }; 139 + }; 128 140 }; 129 141 } 130 142 )
+69
ts/searchEngine/index.test.ts
import { describe, it, beforeEach, expect } from "bun:test";
import { SearchIndex } from ".";

// Behavioural tests for SearchIndex: adding, updating, removing and querying
// pages by keyword. Every test starts from a fresh, empty index.
describe("Search Index", () => {
  let index: SearchIndex;

  beforeEach(() => {
    index = new SearchIndex();
  });

  it("should add a new page to the index", () => {
    index.addPage(
      "https://www.example.com",
      "This is a sample webpage about dogs",
    );
    expect(index.getPagesForKeyword("dogs")).toContain(
      "https://www.example.com",
    );
  });

  it("should return an empty list for the keyword", () => {
    // Nothing has been indexed, so any lookup must come back empty.
    expect(index.getPagesForKeyword("pineapple")).toBeEmpty();
  });

  it("should update a page in the index", () => {
    index.addPage(
      "https://www.example.com",
      "This is a sample web page about dogs",
    );
    index.updatePage(
      "https://www.example.com",
      "This is a sample web page about cats",
    );
    // The old content must no longer match, the new content must.
    expect(index.getPagesForKeyword("dogs")).not.toContain(
      "https://www.example.com",
    );
    expect(index.getPagesForKeyword("cats")).toContain(
      "https://www.example.com",
    );
  });

  it("should remove a page from the index", () => {
    index.addPage(
      "https://www.example.com",
      "This is a sample web page about cats",
    );
    index.removePage("https://www.example.com");
    expect(index.getPagesForKeyword("cats")).not.toContain(
      "https://www.example.com",
    );
  });

  it("should return relevant pages for a keyword", () => {
    index.addPage(
      "https://www.example.com",
      "This is a sample web page about cats",
    );
    expect(index.getPagesForKeyword("cats")).toContain(
      "https://www.example.com",
    );
  });

  it("should return multiple relevant pages that share a keyword", () => {
    index.addPage(
      "https://www.pineapple-world.com",
      "We have lots of pineapples. You've never seen this many pineapples before.",
    );
    index.addPage(
      "https://www.pineapple-is-my-favorite-fruit.com",
      "I love pineapples, it's all I eat. I mean I REALLY LOVE PINEAPPLES",
    );

    const pages = index.getPagesForKeyword("pineapples");
    // Check the size AND the identity of both results, so a duplicated URL
    // cannot satisfy the assertion.
    expect(pages).toBeArrayOfSize(2);
    expect(pages).toContain("https://www.pineapple-world.com");
    expect(pages).toContain("https://www.pineapple-is-my-favorite-fruit.com");
  });
});
+181
ts/searchEngine/index.ts
// Stop-word tables. Words listed here are never indexed on their own and act
// as separators between multi-word phrases.
const articles = ["a", "an", "the", "this"];

const prepositions = [
  "in",
  "on",
  "at",
  "to",
  "from",
  "by",
  "with",
  "for",
  "of",
];

const conjunctions = ["and", "or", "but", "yet", "so"];

const pronouns = ["i", "me", "he", "she", "it", "we", "they", "you"];

const auxiliaryVerbs = [
  "is",
  "are",
  "am",
  "be",
  "been",
  "being",
  "has",
  "have",
  "had",
  "do",
  "does",
  "did",
];

const commonVerbs = ["go", "get", "make", "take", "see", "come", "think"];

const adverbs = ["very", "more", "most", "also", "just", "only"];

const otherCommon = [
  "not",
  "no",
  "yes",
  "some",
  "any",
  "all",
  "each",
  "every",
  "what",
  "which",
  "who",
  "when",
  "where",
  "why",
  "how",
];

// Contractions appear without their apostrophe because punctuation is
// stripped from every token before the stop-word lookup.
const contractions = [
  "its",
  "youve",
  "youre",
  "weve",
  "were",
  "itd",
  "youd",
  "yall",
];

const stopWords = new Set([
  ...articles,
  ...prepositions,
  ...adverbs,
  ...otherCommon,
  ...commonVerbs,
  ...conjunctions,
  ...pronouns,
  ...auxiliaryVerbs,
  ...contractions,
]);

// Everything that is not a letter, combining mark, digit or underscore.
// Unicode-aware so that non-ASCII letters (e.g. "é") survive normalization.
const nonWordChars = /[^\p{L}\p{M}\p{N}_]+/gu;

/**
 * Splits free text into normalized tokens: whitespace-separated, stripped of
 * punctuation, lower-cased, with empty tokens dropped.
 */
function tokenize(text: string): string[] {
  return text
    .split(/\s+/)
    .map((raw) => raw.replace(nonWordChars, "").toLowerCase())
    .filter((token) => token.length > 0);
}

/**
 * In-memory inverted index mapping keywords and multi-word phrases to the
 * pages that contain them, together with an occurrence count per page.
 */
export class SearchIndex {
  /** term -> list of `[url, occurrenceCount]` postings. */
  index: Map<string, [string, number][]>;

  /** URLs added through `addPage`, used to detect re-adds cheaply. */
  private readonly indexedUrls: Set<string>;

  constructor() {
    this.index = new Map<string, [string, number][]>();
    this.indexedUrls = new Set<string>();
  }

  /**
   * Counts every multi-word phrase found in runs of consecutive keywords.
   *
   * @param words - Normalized tokens of the page, in document order.
   * @param filter - Parallel to `words`; `true` where the word is a keyword
   *   (not a stop word). A `false` entry ends the current run.
   * @returns Phrase -> number of occurrences. Runs of a single word yield no
   *   phrases.
   */
  private getPhrases(
    words: string[],
    filter: boolean[],
  ): Map<string, number> {
    const wordGroups: string[][] = [];
    let currentRun: string[] = [];
    const flushRun = (): void => {
      if (currentRun.length > 1) wordGroups.push(currentRun);
      currentRun = [];
    };

    words.forEach((word, position) => {
      if (filter[position]) {
        currentRun.push(word);
      } else {
        flushRun();
      }
    });
    // A run that reaches the end of the text has no stop word to close it.
    flushRun();

    const phraseCounts = new Map<string, number>();
    for (const group of wordGroups) {
      for (const phrase of this.getSubPhrases(group)) {
        phraseCounts.set(phrase, (phraseCounts.get(phrase) ?? 0) + 1);
      }
    }
    return phraseCounts;
  }

  /**
   * Lists the full phrase followed by every contiguous sub-phrase of at
   * least two words (shortest first).
   *
   * NOTE: the number of sub-phrases grows quadratically with the run length.
   */
  private getSubPhrases(phrase: string[]): string[] {
    const subPhrases = [phrase.join(" ")];
    for (let size = 2; size < phrase.length; size++) {
      for (let start = 0; start + size <= phrase.length; start++) {
        subPhrases.push(phrase.slice(start, start + size).join(" "));
      }
    }
    return subPhrases;
  }

  /**
   * Extracts the searchable terms of a page.
   *
   * @returns Term -> occurrence count, where a term is either a single
   *   non-stop word or a multi-word phrase of consecutive non-stop words.
   */
  private extractKeywords(pageContent: string): Map<string, number> {
    const words = tokenize(pageContent);
    const isKeyword = words.map((word) => !stopWords.has(word));

    const keywordCounts = new Map<string, number>();
    words.forEach((word, position) => {
      if (isKeyword[position]) {
        keywordCounts.set(word, (keywordCounts.get(word) ?? 0) + 1);
      }
    });

    // Phrases always contain a space, so they cannot collide with keywords.
    const phraseCounts = this.getPhrases(words, isKeyword);
    return new Map([...keywordCounts, ...phraseCounts]);
  }

  /**
   * Indexes a page under every keyword and phrase found in its content.
   * Adding a URL that is already indexed replaces its previous entries, so a
   * page never appears twice under the same term.
   */
  addPage(url: string, pageContent: string): void {
    if (this.indexedUrls.has(url)) {
      this.removePage(url);
    }

    for (const [term, count] of this.extractKeywords(pageContent)) {
      const postings = this.index.get(term);
      if (postings) {
        postings.push([url, count]);
      } else {
        this.index.set(term, [[url, count]]);
      }
    }
    this.indexedUrls.add(url);
  }

  /** Replaces the indexed content of `url` with `pageContent`. */
  updatePage(url: string, pageContent: string): void {
    this.removePage(url);
    this.addPage(url, pageContent);
  }

  /**
   * Removes every posting for `url`; terms left without any page are dropped
   * from the index. Removing an unknown URL is a no-op.
   */
  removePage(url: string): void {
    // Deleting or overwriting the entry currently being visited is safe
    // while iterating a Map.
    for (const [term, postings] of this.index) {
      const remaining = postings.filter(([postedUrl]) => postedUrl !== url);
      if (remaining.length === postings.length) continue;
      if (remaining.length === 0) {
        this.index.delete(term);
      } else {
        this.index.set(term, remaining);
      }
    }
    this.indexedUrls.delete(url);
  }

  /**
   * Looks up the pages that contain a keyword or phrase.
   *
   * The query is normalized exactly like indexed text (case and punctuation
   * are ignored, whitespace is collapsed).
   *
   * @returns Matching URLs, most occurrences first; ties keep insertion
   *   order. Empty when nothing matches.
   */
  getPagesForKeyword(keyword: string): string[] {
    const term = tokenize(keyword).join(" ");
    const postings = this.index.get(term);
    if (!postings) {
      return [];
    }
    return [...postings]
      .sort((a, b) => b[1] - a[1])
      .map(([url]) => url);
  }
}