CMU Coding Bootcamp

feat: crawler

Co-authored-by: Max Mahn <mahn.maxwell@proton.me>

thecoded.prof 80785524 0f13456e

verified
ts/bun.lock  +7

···
    "workspaces": {
      "": {
        "name": "ts",
+       "dependencies": {
+         "glob-to-regex.js": "^1.2.0",
+       },
        "devDependencies": {
          "@types/bun": "latest",
        },
···
    "bun-types": ["bun-types@1.3.0", "", { "dependencies": { "@types/node": "*" }, "peerDependencies": { "@types/react": "^19" } }, "sha512-u8X0thhx+yJ0KmkxuEo9HAtdfgCBaM/aI9K90VQcQioAmkVp3SG3FkwWGibUFz3WdXAdcsqOcbU40lK7tbHdkQ=="],

    "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="],
+
+   "glob-to-regex.js": ["glob-to-regex.js@1.2.0", "", { "peerDependencies": { "tslib": "2" } }, "sha512-QMwlOQKU/IzqMUOAZWubUOT8Qft+Y0KQWnX9nK3ch0CJg0tTp4TvGZsTfudYKv2NzoQSyPcnA6TYeIQ3jGichQ=="],
+
+   "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="],

    "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
ts/package.json  +4 -1

···
      "typescript": "^5"
    },
    "private": true,
-   "type": "module"
+   "type": "module",
+   "dependencies": {
+     "glob-to-regex.js": "^1.2.0"
+   }
  }
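The only new runtime dependency is glob-to-regex.js, which crawler.ts uses to turn robots.txt path patterns and User-agent patterns into RegExp objects via its toRegex export. A minimal sketch of that call follows; the expected results are hedged, since the exact wildcard and anchoring semantics belong to the library and are not spelled out in this commit:

```ts
import { toRegex } from "glob-to-regex.js";

// Convert a robots.txt-style glob into a RegExp and test paths against it.
// Assumption: "*" matches any run of characters, as in common glob syntax.
const adminRule = toRegex("/admin*");
console.log(adminRule.test("/admin/users"));       // expected: true
console.log(adminRule.test("/public/index.html")); // expected: false
```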
ts/searchEngine/crawler.test.ts  +48

···
+ import { describe, it, beforeEach, expect } from "bun:test";
+ import { SearchIndex } from ".";
+ import { Crawler, RobotsParser } from "./crawler";
+
+ describe("Robots Parser", () => {
+   it("should parse robots.txt file", () => {
+     const robotsTxt = `
+ User-agent: *
+ User-agent: crawl
+ Disallow: /admin
+ Allow: /public
+
+ User-Agent: crawl
+ Disallow: /no-robots
+ `;
+     const robotsParser = new RobotsParser(robotsTxt);
+     const { allows, disallows } = robotsParser.getUrlsForUA("crawl");
+     const urls = { allows, disallows };
+     expect(allows.has("/public")).toBe(true);
+     expect(disallows.has("/admin")).toBe(true);
+     expect(RobotsParser.checkUserAgent(urls, "/admin")).toBe(false);
+     expect(RobotsParser.checkUserAgent(urls, "/public")).toBe(true);
+   });
+ });
+
+ describe("Crawler", () => {
+   let crawler: Crawler;
+   beforeEach(() => {
+     crawler = new Crawler("SmartFridge", new SearchIndex());
+   });
+
+   it("should crawl a page", async () => {
+     const url = new URL("https://google.com");
+     // Register the listener before crawling so the first storePage event cannot be missed,
+     // and await it so the assertions actually run before the test finishes.
+     const stored = new Promise<URL>((resolve) => {
+       crawler.once("storePage", (storedUrl: URL) => resolve(storedUrl));
+     });
+     crawler.crawl(url);
+     const storedUrl = await stored;
+     console.log(`Page stored: ${storedUrl}`);
+     crawler.emit("stop");
+     // size() counts indexed keywords, so any successfully stored page makes it non-zero.
+     expect(crawler.index.size()).toBeGreaterThan(0);
+   });
+ });
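The test above exercises the merge behavior that matters most in practice: rule groups are keyed by their User-agent patterns, and getUrlsForUA collects every group whose pattern glob-matches the crawler's UA (lowercased). A small sketch with made-up robots.txt content, assuming the `*` pattern matches any agent string:

```ts
import { RobotsParser } from "./crawler";

// Made-up robots.txt: a wildcard group plus an agent-specific group.
const parser = new RobotsParser(`
User-agent: *
Disallow: /private

User-agent: smartfridge
Allow: /public
`);

// Both groups match "SmartFridge" (comparison is lowercased and glob-based),
// so their rules are merged into a single RobotUrls object.
const rules = parser.getUrlsForUA("SmartFridge");
console.log([...rules.disallows]); // ["/private"] (from the * group)
console.log([...rules.allows]);    // ["/public"]  (from the smartfridge group)
```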
ts/searchEngine/crawler.ts  +211

···
+ import { SearchIndex } from ".";
+ import { toRegex } from "glob-to-regex.js";
+ import { EventEmitter } from "node:events";
+
+ interface RobotUrls {
+   allows: Set<string>;
+   disallows: Set<string>;
+ }
+
+ export class RobotsParser {
+   disallow: Map<string, Set<string>> = new Map();
+   allow: Map<string, Set<string>> = new Map();
+
+   constructor(text: string) {
+     const lines = text
+       .split("\n")
+       .filter((l) => !/^\s*#.*$/.test(l)) // remove full-line comments
+       .map((l) => l.replace(/\s*#.*$/, "").trim()); // remove end-of-line comments and stray whitespace (handles \r\n endings)
+     lines.push("");
+
+     const blocks: Array<Array<string>> = [];
+     let current_block: Array<string> = [];
+     lines.forEach((line) => {
+       if (line == "") {
+         if (current_block.length == 0) return; // ignore consecutive empty lines
+         blocks.push(current_block);
+         current_block = new Array();
+       } else {
+         current_block.push(line);
+       }
+     });
+
+     blocks.forEach((block) => {
+       let uas: string[] = [];
+       let disallows: string[] = [];
+       let allows: string[] = [];
+       block.forEach((line) => {
+         line = line.trim().toLowerCase();
+         const fields: Array<string> = line.split(/\s*:\s*/);
+         if (fields.length < 2) return;
+         if (fields[0] == "user-agent") {
+           uas.push(fields[1]!);
+         } else if (fields[0] == "disallow") {
+           disallows.push(fields[1]!);
+         } else if (fields[0] == "allow") {
+           allows.push(fields[1]!);
+         }
+       });
+       uas.forEach((ua) => {
+         ua = ua.toLowerCase();
+         this.disallow.set(
+           ua,
+           new Set([...(this.disallow.get(ua) || []), ...disallows]),
+         );
+         this.allow.set(
+           ua,
+           new Set([...(this.allow.get(ua) || []), ...allows]),
+         );
+       });
+     });
+   }
+
+   // Allow rules take precedence: a URL matching any Allow pattern is permitted;
+   // otherwise it is permitted only if it matches no Disallow pattern.
+   static checkUserAgent(urls: RobotUrls, url: string): boolean {
+     const { allows, disallows } = urls;
+     const allowed = allows
+       .values()
+       .map((allow) => {
+         const regex = toRegex(allow);
+         return regex.test(url);
+       })
+       .reduce((acc, curr) => acc || curr, false);
+     if (allowed) {
+       return true;
+     }
+     const disallowed = disallows
+       .values()
+       .map((disallow) => {
+         const regex = toRegex(disallow);
+         return regex.test(url);
+       })
+       .reduce((acc, curr) => acc || curr, false);
+     return !disallowed;
+   }
+
+   // Merge the rule sets of every group whose User-agent pattern matches the given UA.
+   getUrlsForUA(ua: string): RobotUrls {
+     ua = ua.toLowerCase();
+     const allowUAs = this.allow
+       .keys()
+       .filter((key) => toRegex(key).test(ua));
+     const disallowUAs = this.disallow
+       .keys()
+       .filter((key) => toRegex(key).test(ua));
+     let allows = new Set<string>();
+     let disallows = new Set<string>();
+
+     allowUAs.forEach((ua) => {
+       const allow = this.allow.get(ua);
+       if (allow) {
+         allows = allows.union(allow);
+       }
+     });
+     disallowUAs.forEach((ua) => {
+       const disallow = this.disallow.get(ua);
+       if (disallow) {
+         disallows = disallows.union(disallow);
+       }
+     });
+     return {
+       allows,
+       disallows,
+     };
+   }
+ }
+
+ const urlRegex = /https?:\/\/[^\s"]+/g;
+ export class Crawler extends EventEmitter {
+   private robots: Map<string, RobotUrls> = new Map(); // hostname -> rules allowed/disallowed for the specified UA
+   private visited: Set<string> = new Set(); // URL strings (URL objects compare by reference, so a Set<URL> would never dedupe)
+
+   constructor(
+     private readonly UA: string,
+     public index: SearchIndex,
+   ) {
+     super();
+     this.on("addURL", (url: URL) => {
+       console.log(`Adding URL: ${url}`);
+       void this.processPage(url);
+     });
+     this.once("stop", () => {
+       this.removeAllListeners();
+     });
+   }
+
+   private async checkDisallowed(url: URL): Promise<boolean> {
+     const robots =
+       this.robots.get(url.hostname) || (await this.getRobotsTxt(url));
+     return !RobotsParser.checkUserAgent(robots, url.toString());
+   }
+
+   private async getRobotsTxt(url: URL): Promise<RobotUrls> {
+     const robotsTxtUrl = new URL(
+       `${url.protocol}//${url.hostname}/robots.txt`,
+     );
+
+     const response = await fetch(robotsTxtUrl, {
+       headers: {
+         "User-Agent": this.UA,
+       },
+     });
+     if (response.status !== 200)
+       return { allows: new Set(), disallows: new Set() };
+     if (!response.headers.get("content-type")?.startsWith("text/plain"))
+       return { allows: new Set(), disallows: new Set() };
+     const robotsTxt = await response.text();
+     const parsed = new RobotsParser(robotsTxt);
+     const forUA = parsed.getUrlsForUA(this.UA);
+     this.robots.set(url.hostname, forUA);
+     return forUA;
+   }
+
+   private async addOutlinks(html: string): Promise<void> {
+     const links = html.matchAll(urlRegex);
+     for (const [link] of links) {
+       console.log(link);
+       const url = new URL(link);
+       // Only queue URLs that robots.txt permits for this UA.
+       if (!(await this.checkDisallowed(url))) {
+         this.emit("addURL", url);
+       }
+     }
+   }
+
+   // private getText(html: string): string {
+   //   const parser = new DOMParser();
+   //   const doc = parser.parseFromString(html, "text/html");
+   //   return doc.body.textContent || "";
+   // }
+
+   private async getPage(url: URL) {
+     if (this.visited.has(url.toString())) return;
+     this.visited.add(url.toString()); // mark before awaiting so concurrent crawls don't fetch the same URL twice
+     if (await this.checkDisallowed(url)) return;
+     const page = await fetch(url);
+     if (!page.ok) return;
+     if (!page.headers.get("Content-Type")?.startsWith("text/html")) return;
+
+     return await page.text();
+   }
+
+   private async processPage(url: URL) {
+     const page = await this.getPage(url);
+     if (!page) return;
+     // Store the page first, then follow its outlinks.
+     this.index.addPage(url.toString(), page);
+     this.emit("storePage", url);
+     await this.addOutlinks(page);
+   }
+
+   crawl(url_str: string | URL) {
+     this.emit("addURL", new URL(url_str));
+   }
+ }
+
+ // Demo: only runs when this file is executed directly, not when it is imported (e.g. by the tests).
+ if (import.meta.main) {
+   const crawler = new Crawler("SmartFridge", new SearchIndex());
+
+   crawler.on("storePage", (url) => {
+     console.log(`Page stored: ${url}`);
+     console.log("entries:", crawler.index.size());
+     crawler.emit("stop");
+   });
+   crawler.crawl(new URL("https://example.com"));
+ }
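For context on how the pieces are meant to fit together, here is roughly how another module could drive the crawler: seed it, react to storePage events, and stop once enough pages are indexed. The seed URL, the five-page cutoff, and the sample query are illustrative only, not part of this commit:

```ts
import { SearchIndex } from ".";
import { Crawler } from "./crawler";

const index = new SearchIndex();
const crawler = new Crawler("SmartFridge", index);

// Illustrative policy: stop after five stored pages, then run a query.
let stored = 0;
crawler.on("storePage", (url: URL) => {
  console.log(`indexed ${url}`);
  if (++stored >= 5) {
    crawler.emit("stop"); // removes all listeners; in-flight fetches finish quietly
    console.log(index.search("beans")); // ranked list of matching URLs
  }
});

crawler.crawl("https://example.com"); // placeholder seed URL
```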
ts/searchEngine/index.test.ts  +4

···
  import { describe, it, beforeEach, expect } from "bun:test";
  import { SearchIndex } from ".";
+ import { Crawler } from "./crawler";

  describe("Search Index", () => {
    let index: SearchIndex;
···
      "beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans",
    );
    index.addPage("https://www.beans-are-ok.com", "beans are ok I guess");
+   index.addPage("https://testsite.com", "beans");
    index.addPage(
      "https://www.example.com/cats",
      "This is a sample web page about cats",
···
    const results = index.search("beans");
    expect(results.indexOf("https://www.beans.com")).toBe(0);
    expect(results.indexOf("https://www.beans-are-ok.com")).toBe(1);
+   expect(results.indexOf("https://testsite.com")).toBe(2);
    const results2 = index.search("beans beans");
    expect(results2.indexOf("https://www.beans.com")).toBe(0);
    expect(results2.indexOf("https://www.beans-are-ok.com")).toBe(1);
+   expect(results2.indexOf("https://testsite.com")).toBe(2);
  });
ts/searchEngine/index.ts  +21 -1

···
  ]);

  export class SearchIndex {
-   index: Map<string, [string, number][]>;
+   private index: Map<string, [string, number][]>;

    constructor() {
      this.index = new Map<string, [string, number][]>();
···
      });
    }

+   // True if the given URL has already been indexed under any keyword.
+   checkPage(search: string): boolean {
+     for (const urls of this.index.values()) {
+       for (const [url, _] of urls) {
+         if (search === url) {
+           return true;
+         }
+       }
+     }
+     return false;
+   }
+
+   // Number of distinct keywords currently in the index.
+   size(): number {
+     return this.index.size;
+   }
+
    getPagesForKeyword(keyword: string): string[] {
      const pages = this.index.get(keyword);
      if (!pages) {
···
          );
        }
      }
+     // Boost pages whose URL contains the query text; write the bump back into the map
+     // (mutating the callback parameter alone would not change the stored score).
+     urls.forEach((value, key) => {
+       if (key.includes(query)) {
+         urls.set(key, value + 10);
+       }
+     });
      return Array.from(urls.entries())
        .sort((a, b) => b[1] - a[1])
        .map(([url, _]) => url);
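The new block in search() gives a flat +10 to any page whose URL contains the query text (with the fix above that writes the bump back into the score map). A small illustration with made-up pages; the exact ordering otherwise depends on the keyword scoring earlier in search(), so the comment below is an expectation rather than a guarantee:

```ts
import { SearchIndex } from ".";

const index = new SearchIndex();
index.addPage("https://example.com/cats", "cats cats cats cats");
index.addPage("https://example.com/pets", "cats are pets");

// Only the first URL contains the query string "cats", so it also receives the
// +10 URL bonus on top of its keyword score.
console.log(index.search("cats"));
// expected: ["https://example.com/cats", "https://example.com/pets"]
```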
ts/searchEngine/mainLoop.plan

This is a binary file and will not be displayed.