+7  ts/bun.lock
···
   "workspaces": {
     "": {
       "name": "ts",
+      "dependencies": {
+        "glob-to-regex.js": "^1.2.0",
+      },
       "devDependencies": {
         "@types/bun": "latest",
       },
···
    "bun-types": ["bun-types@1.3.0", "", { "dependencies": { "@types/node": "*" }, "peerDependencies": { "@types/react": "^19" } }, "sha512-u8X0thhx+yJ0KmkxuEo9HAtdfgCBaM/aI9K90VQcQioAmkVp3SG3FkwWGibUFz3WdXAdcsqOcbU40lK7tbHdkQ=="],

    "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="],
+
+    "glob-to-regex.js": ["glob-to-regex.js@1.2.0", "", { "peerDependencies": { "tslib": "2" } }, "sha512-QMwlOQKU/IzqMUOAZWubUOT8Qft+Y0KQWnX9nK3ch0CJg0tTp4TvGZsTfudYKv2NzoQSyPcnA6TYeIQ3jGichQ=="],
+
+    "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="],

    "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
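Both new lockfile entries back the toRegex import used in ts/searchEngine/crawler.ts below, where robots.txt User-agent values and Allow/Disallow paths are treated as glob patterns. A rough sketch of the behaviour the crawler relies on (the exact anchoring rules belong to the library, so treat the expected values as assumptions):

import { toRegex } from "glob-to-regex.js";

toRegex("*").test("smartfridge");  // expected true: "*" should match any user agent
toRegex("/admin").test("/admin");  // expected true: a literal glob matches itself
toRegex("/admin").test("/public"); // expected false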
+4 -1  ts/package.json
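The package.json diff itself is not expanded here; assuming Bun's standard workflow, the dependency addition it records can be reproduced with:

    bun add glob-to-regex.js

which writes "glob-to-regex.js": "^1.2.0" into dependencies and records it, along with the tslib peer dependency, in bun.lock.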
+48  ts/searchEngine/crawler.test.ts
···
+import { describe, it, beforeEach, expect } from "bun:test";
+import { SearchIndex } from ".";
+import { Crawler, RobotsParser } from "./crawler";
+
+describe("Robots Parser", () => {
+  it("should parse robots.txt file", () => {
+    const robotsTxt = `
+User-agent: *
+User-agent: crawl
+Disallow: /admin
+Allow: /public
+
+User-Agent: crawl
+Disallow: /no-robots
+`;
+    const robotsParser = new RobotsParser(robotsTxt);
+    const { allows, disallows } = robotsParser.getUrlsForUA("crawl");
+    const urls = {
+      allows,
+      disallows,
+    };
+    expect(allows.has("/public")).toBe(true);
+    expect(disallows.has("/admin")).toBe(true);
+    expect(RobotsParser.checkUserAgent(urls, "/admin")).toBe(false);
+    expect(RobotsParser.checkUserAgent(urls, "/public")).toBe(true);
+  });
+});
+
+describe("Crawler", () => {
+  let crawler: Crawler;
+  beforeEach(() => {
+    crawler = new Crawler("SmartFridge", new SearchIndex());
+  });
+
+  it(
+    "should crawl a page",
+    async () => {
+      const url = new URL("https://google.com");
+      // Register the listener before crawling and await the first stored page;
+      // otherwise the test returns before the asynchronous crawl has indexed anything.
+      const stored = new Promise<URL>((resolve) => {
+        crawler.once("storePage", (storedUrl: URL) => resolve(storedUrl));
+      });
+      crawler.crawl(url);
+      const storedUrl = await stored;
+      console.log(`Page stored: ${storedUrl}`);
+      crawler.emit("stop");
+      // size() counts distinct keywords, so one crawled page yields many entries.
+      expect(crawler.index.size()).toBeGreaterThan(0);
+    },
+    20_000, // this test hits the live network, so allow extra time
+  );
+});
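Both suites run under Bun's built-in test runner. From the ts/ workspace something like the following should work; the RobotsParser case is pure, while the Crawler case fetches https://google.com and will fail without network access:

    bun test searchEngine/crawler.test.ts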
+211  ts/searchEngine/crawler.ts
···
+import { SearchIndex } from ".";
+import { toRegex } from "glob-to-regex.js";
+import { EventEmitter } from "node:events";
+
+interface RobotUrls {
+  allows: Set<string>;
+  disallows: Set<string>;
+}
+
+export class RobotsParser {
+  disallow: Map<string, Set<string>> = new Map();
+  allow: Map<string, Set<string>> = new Map();
+
+  constructor(text: string) {
+    const lines = text
+      .split("\n")
+      .filter((l) => !/^\s*#.*$/.test(l)) // remove full-line comments
+      .map((l) => l.replace(/\s*#.*$/, "")); // remove end-of-line comments
+    lines.push(""); // sentinel so the final block below gets flushed
+
+    // Group the file into blocks separated by blank lines: each block is a set
+    // of User-agent lines followed by the Allow/Disallow rules that apply to them.
+    const blocks: Array<Array<string>> = [];
+    let current_block: Array<string> = [];
+    lines.forEach((line) => {
+      if (line.trim() == "") {
+        if (current_block.length == 0) return; // ignore consecutive empty lines
+        blocks.push(current_block);
+        current_block = [];
+      } else {
+        current_block.push(line);
+      }
+    });
+
+    blocks.forEach((block) => {
+      const uas: string[] = [];
+      const disallows: string[] = [];
+      const allows: string[] = [];
+      block.forEach((line) => {
+        line = line.trim().toLowerCase();
+        const fields: Array<string> = line.split(/\s*:\s*/);
+        if (fields.length < 2) return;
+        if (fields[0] == "user-agent") {
+          uas.push(fields[1]!);
+        } else if (fields[0] == "disallow") {
+          disallows.push(fields[1]!);
+        } else if (fields[0] == "allow") {
+          allows.push(fields[1]!);
+        }
+      });
+      // Merge this block's rules with any rules already recorded for the same UA.
+      uas.forEach((ua) => {
+        this.disallow.set(
+          ua,
+          new Set([...(this.disallow.get(ua) || []), ...disallows]),
+        );
+        this.allow.set(
+          ua,
+          new Set([...(this.allow.get(ua) || []), ...allows]),
+        );
+      });
+    });
+  }
+
+  static checkUserAgent(urls: RobotUrls, path: string): boolean {
+    const { allows, disallows } = urls;
+    // An explicit Allow match wins; otherwise the path is permitted unless a
+    // Disallow rule matches. Rules are treated as glob patterns.
+    const allowed = allows
+      .values()
+      .map((allow) => toRegex(allow).test(path))
+      .reduce((acc, curr) => acc || curr, false);
+    if (allowed) {
+      return true;
+    }
+    const disallowed = disallows
+      .values()
+      .map((disallow) => toRegex(disallow).test(path))
+      .reduce((acc, curr) => acc || curr, false);
+    return !disallowed;
+  }
+
+  getUrlsForUA(ua: string): RobotUrls {
+    ua = ua.toLowerCase();
+    // User-agent values may themselves be globs (e.g. "*"), so match them as patterns.
+    const allowUAs = this.allow.keys().filter((key) => toRegex(key).test(ua));
+    const disallowUAs = this.disallow.keys().filter((key) => toRegex(key).test(ua));
+    let allows = new Set<string>();
+    let disallows = new Set<string>();
+
+    allowUAs.forEach((ua) => {
+      const allow = this.allow.get(ua);
+      if (allow) {
+        allows = allows.union(allow);
+      }
+    });
+    disallowUAs.forEach((ua) => {
+      const disallow = this.disallow.get(ua);
+      if (disallow) {
+        disallows = disallows.union(disallow);
+      }
+    });
+    return {
+      allows,
+      disallows,
+    };
+  }
+}
+
+const urlRegex = /https?:\/\/[^\s"]+/g;
+
+export class Crawler extends EventEmitter {
+  private robots: Map<string, RobotUrls> = new Map(); // hostname -> rules for the specified UA
+  private visited: Set<string> = new Set(); // visited URLs, stored as strings so has() compares by value
+
+  constructor(
+    private readonly UA: string,
+    public index: SearchIndex,
+  ) {
+    super();
+    this.on("addURL", (url: URL) => {
+      console.log(`Adding URL: ${url}`);
+      this.processPage(url).catch((err) =>
+        console.error(`Failed to process ${url}:`, err),
+      );
+    });
+    this.once("stop", () => {
+      this.removeAllListeners();
+    });
+  }
+
+  private async checkDisallowed(url: URL): Promise<boolean> {
+    const robots =
+      this.robots.get(url.hostname) || (await this.getRobotsTxt(url));
+    // robots.txt rules are path patterns, so match against the pathname.
+    return !RobotsParser.checkUserAgent(robots, url.pathname);
+  }
+
+  private async getRobotsTxt(url: URL): Promise<RobotUrls> {
+    const robotsTxtUrl = new URL(
+      `${url.protocol}//${url.hostname}/robots.txt`,
+    );
+
+    const response = await fetch(robotsTxtUrl, {
+      headers: {
+        "User-Agent": this.UA,
+      },
+    });
+    let forUA: RobotUrls = { allows: new Set(), disallows: new Set() };
+    if (
+      response.status === 200 &&
+      response.headers.get("content-type")?.startsWith("text/plain")
+    ) {
+      forUA = new RobotsParser(await response.text()).getUrlsForUA(this.UA);
+    }
+    // Cache the result (even an empty one) so robots.txt is fetched once per host.
+    this.robots.set(url.hostname, forUA);
+    return forUA;
+  }
+
+  private async addOutlinks(html: string): Promise<void> {
+    const links = html.matchAll(urlRegex);
+    for (const [link] of links) {
+      console.log(link);
+      const url = new URL(link);
+      // Only queue outlinks that are not disallowed by robots.txt.
+      if (!(await this.checkDisallowed(url))) {
+        this.emit("addURL", url);
+      }
+    }
+  }
+
+  // private getText(html: string): string {
+  //   const parser = new DOMParser();
+  //   const doc = parser.parseFromString(html, "text/html");
+  //   return doc.body.textContent || "";
+  // }
+
+  private async getPage(url: URL) {
+    if (this.visited.has(url.href)) return;
+    if (await this.checkDisallowed(url)) return;
+    const page = await fetch(url);
+    this.visited.add(url.href);
+    if (!page.ok) return;
+    if (!page.headers.get("Content-Type")?.startsWith("text/html")) return;
+
+    return await page.text();
+  }
+
+  private async processPage(url: URL) {
+    const page = await this.getPage(url);
+    if (!page) return;
+    // Index the page first so "storePage" fires promptly, then queue its outlinks.
+    this.index.addPage(url.toString(), page);
+    this.emit("storePage", url);
+    await this.addOutlinks(page);
+  }
+
+  crawl(url_str: string | URL) {
+    this.emit("addURL", new URL(url_str));
+  }
+}
+
+// Small manual demo: only runs when this file is executed directly
+// (e.g. `bun run crawler.ts`), not when it is imported by the tests.
+if (import.meta.main) {
+  const crawler = new Crawler("SmartFridge", new SearchIndex());
+
+  crawler.on("storePage", (url) => {
+    console.log(`Page stored: ${url}`);
+    console.log("entries:", crawler.index.size());
+    crawler.emit("stop");
+  });
+  crawler.crawl(new URL("https://example.com"));
+}
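One aside on addOutlinks above: it scrapes absolute URLs out of the raw HTML with a regex, and the DOMParser-based getText stays commented out because DOMParser is a browser API that Bun does not provide. A possible alternative sketch (not part of this diff; extractLinks is a hypothetical helper) uses Bun's HTMLRewriter to read hrefs from anchor tags and resolve relative links against the page URL:

async function extractLinks(response: Response, base: URL): Promise<URL[]> {
  const links: URL[] = [];
  const rewriter = new HTMLRewriter().on("a[href]", {
    element(el) {
      const href = el.getAttribute("href");
      if (!href) return;
      try {
        links.push(new URL(href, base)); // resolves relative hrefs against the page URL
      } catch {
        // ignore hrefs that fail to parse
      }
    },
  });
  // The element handlers only run as the transformed body is consumed.
  await rewriter.transform(response).arrayBuffer();
  return links;
}

Each resulting URL could then go through the same robots check before being queued with emit("addURL", url).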
+4  ts/searchEngine/index.test.ts
···
 import { describe, it, beforeEach, expect } from "bun:test";
 import { SearchIndex } from ".";
+import { Crawler } from "./crawler";

 describe("Search Index", () => {
   let index: SearchIndex;
···
      "beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans beans",
    );
    index.addPage("https://www.beans-are-ok.com", "beans are ok I guess");
+    index.addPage("https://testsite.com", "beans");
    index.addPage(
      "https://www.example.com/cats",
      "This is a sample web page about cats",
···
    const results = index.search("beans");
    expect(results.indexOf("https://www.beans.com")).toBe(0);
    expect(results.indexOf("https://www.beans-are-ok.com")).toBe(1);
+    expect(results.indexOf("https://testsite.com")).toBe(2);
    const results2 = index.search("beans beans");
    expect(results2.indexOf("https://www.beans.com")).toBe(0);
    expect(results2.indexOf("https://www.beans-are-ok.com")).toBe(1);
+    expect(results2.indexOf("https://testsite.com")).toBe(2);
  });
});
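On the expected ordering: "beans" occurs 26 times on www.beans.com, once on www.beans-are-ok.com, and once on testsite.com. Assuming the score is the per-page occurrence count plus the +10 boost that index.ts (below) applies when the URL itself contains the query, the scores work out to roughly 36, 11 and 1, which is why the expected indices are 0, 1 and 2.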
+21 -1  ts/searchEngine/index.ts
···
 ]);

 export class SearchIndex {
-  index: Map<string, [string, number][]>;
+  private index: Map<string, [string, number][]>;

   constructor() {
     this.index = new Map<string, [string, number][]>();
···
     });
   }

+  checkPage(search: string): boolean {
+    // True if the given URL already appears in any keyword's posting list.
+    for (const urls of this.index.values()) {
+      for (const [url, _] of urls) {
+        if (search === url) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  size() {
+    // Number of distinct keywords in the index (not the number of pages).
+    return this.index.size;
+  }
+
   getPagesForKeyword(keyword: string): string[] {
     const pages = this.index.get(keyword);
     if (!pages) {
···
         );
       }
     }
+    // Boost pages whose own URL contains the query; the score has to be written
+    // back with set(), since reassigning the callback parameter only changes a local copy.
+    urls.forEach((value, key) => {
+      if (key.includes(query)) {
+        urls.set(key, value + 10);
+      }
+    });
     return Array.from(urls.entries())
       .sort((a, b) => b[1] - a[1])
       .map(([url, _]) => url);
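A quick sketch of what the two new helpers report, using a hypothetical URL and assuming addPage splits the text into keywords (stop words may be filtered, so the exact count can vary):

const index = new SearchIndex();
index.addPage("https://example.com", "beans on toast");

index.checkPage("https://example.com"); // true: the URL appears in at least one posting list
index.checkPage("https://other.com");   // false: never indexed
index.size();                           // distinct keywords in the index, not the number of pages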
ts/searchEngine/mainLoop.plan
This is a binary file and will not be displayed.