// A social knowledge tool for researchers, built on ATProto.
1import { Result, ok, err } from '../../../shared/core/Result';
2import {
3 IVectorDatabase,
4 IndexUrlParams,
5 FindSimilarUrlsParams,
6 UrlSearchResult,
7} from '../domain/IVectorDatabase';
8import { UrlMetadataProps } from '../../cards/domain/value-objects/UrlMetadata';
9
/**
 * Record stored for each indexed URL.
 *
 * Fields are `readonly` because a record is built once when the URL is
 * indexed and is replaced wholesale (never patched) on re-indexing.
 */
interface IndexedUrl {
  /** The URL that was indexed; also used as the storage key. */
  readonly url: string;
  /** Text assembled from the URL's metadata and used for similarity comparison. */
  readonly content: string;
  /** Metadata captured at indexing time and returned with search results. */
  readonly metadata: UrlMetadataProps;
  /** Moment at which the record was created. */
  readonly indexedAt: Date;
}
16
/**
 * In-memory, process-wide singleton implementation of `IVectorDatabase`.
 *
 * No real embeddings are computed: each URL's metadata is flattened into a
 * text string and "similarity" is a word-overlap score between two such
 * strings (see `calculateSimilarity`). All state lives in a `Map` keyed by
 * URL and is lost when the process exits.
 */
export class InMemoryVectorDatabase implements IVectorDatabase {
  /**
   * Matches every character that is not a letter, combining mark, digit,
   * underscore or whitespace. Unicode-aware so that non-ASCII words such as
   * "naïve" or "über" stay intact instead of being split at the accented
   * letter (which the ASCII-only `\w` class would do).
   */
  private static readonly NON_WORD_CHARS = /[^\p{L}\p{M}\p{N}_\s]/gu;

  /** Separator used to split normalized text into tokens. */
  private static readonly WHITESPACE = /\s+/;

  /** Lazily created singleton; `undefined` until `getInstance` is first called. */
  private static instance: InMemoryVectorDatabase | undefined;

  /** Indexed records keyed by URL. */
  private readonly urls = new Map<string, IndexedUrl>();

  private constructor() {}

  /**
   * Returns the shared instance, creating it on first use.
   */
  public static getInstance(): InMemoryVectorDatabase {
    let instance = InMemoryVectorDatabase.instance;
    if (!instance) {
      instance = new InMemoryVectorDatabase();
      InMemoryVectorDatabase.instance = instance;
    }
    return instance;
  }

  /**
   * Stores (or overwrites) the record for `params.url`.
   *
   * @param params URL plus the metadata fields to store and to build the
   *   comparison text from.
   * @returns `ok(undefined)` on success, or `err(Error)` if anything throws.
   */
  async indexUrl(params: IndexUrlParams): Promise<Result<void>> {
    try {
      console.log('Indexing URL in InMemoryVectorDatabase:', params.url);

      // Prepare content for embedding (combine title, description, author, siteName)
      const content = this.prepareContentForEmbedding(
        params.title,
        params.description,
        params.author,
        params.siteName,
      );

      this.urls.set(params.url, {
        url: params.url,
        content,
        metadata: {
          url: params.url,
          title: params.title,
          description: params.description,
          author: params.author,
          publishedDate: params.publishedDate,
          siteName: params.siteName,
          imageUrl: params.imageUrl,
          type: params.type,
          retrievedAt: params.retrievedAt,
        },
        indexedAt: new Date(),
      });
      console.log('Current indexed URLs:', Array.from(this.urls.keys()));

      return ok(undefined);
    } catch (error) {
      return err(
        new Error(
          `Failed to index URL: ${error instanceof Error ? error.message : 'Unknown error'}`,
        ),
      );
    }
  }

  /**
   * Ranks every other indexed URL by word-overlap similarity to the query.
   *
   * The query text is the indexed content of `params.url`; when that URL is
   * not indexed (or its content is empty) the URL string itself is used.
   * The query URL is never included in the results.
   *
   * @param params `url` to compare against, optional `threshold` (minimum
   *   similarity, defaults to 0) and `limit` (maximum number of results).
   * @returns `ok` with matches sorted by descending similarity, or
   *   `err(Error)` if anything throws.
   */
  async findSimilarUrls(
    params: FindSimilarUrlsParams,
  ): Promise<Result<UrlSearchResult[]>> {
    try {
      console.log('all urls to compare', this.urls);
      // `||` rather than `??` on purpose: it also maps NaN to the default,
      // which would otherwise make every `>=` comparison below false.
      const threshold = params.threshold || 0; // Lower default threshold for more matches
      const results: UrlSearchResult[] = [];

      // Get the query URL's content for comparison
      const queryUrl = this.urls.get(params.url);
      const queryContent = queryUrl?.content || params.url;

      console.log('Query content for similarity:', queryContent);

      for (const [url, indexed] of this.urls.entries()) {
        // Skip the query URL itself
        if (url === params.url) continue;

        const similarity = this.calculateSimilarity(
          queryContent,
          indexed.content,
        );

        console.log(
          `Similarity between "${queryContent}" and "${indexed.content}": ${similarity}`,
        );

        if (similarity >= threshold) {
          results.push({
            url: indexed.url,
            similarity,
            metadata: indexed.metadata,
          });
        }
      }

      // Sort by similarity (highest first) and limit results
      results.sort((a, b) => b.similarity - a.similarity);

      // A missing limit means "no limit". A negative limit is clamped to 0:
      // passed straight to `slice` it would drop results from the END of the
      // ranking instead of returning none.
      const limit = params.limit;
      const limitedResults =
        limit == null ? results : results.slice(0, Math.max(0, limit));

      console.log(
        `Found ${limitedResults.length} similar URLs above threshold ${threshold}`,
      );

      return ok(limitedResults);
    } catch (error) {
      return err(
        new Error(
          `Failed to find similar URLs: ${error instanceof Error ? error.message : 'Unknown error'}`,
        ),
      );
    }
  }

  /**
   * Removes the record for `url`; a URL that is not indexed is a no-op.
   *
   * @returns `ok(undefined)` on success, or `err(Error)` if anything throws.
   */
  async deleteUrl(url: string): Promise<Result<void>> {
    try {
      this.urls.delete(url);
      return ok(undefined);
    } catch (error) {
      return err(
        new Error(
          `Failed to delete URL: ${error instanceof Error ? error.message : 'Unknown error'}`,
        ),
      );
    }
  }

  /**
   * Always reports healthy: there is no external dependency to probe.
   */
  async healthCheck(): Promise<Result<boolean>> {
    return ok(true);
  }

  /**
   * Word-overlap similarity between two texts, in the range [0, 1].
   *
   * Both texts are tokenized and turned into word-frequency maps. The score
   * is the sum over all words of `min(count1, count2)` divided by the sum of
   * `max(count1, count2)`, i.e. a frequency-weighted Jaccard index.
   *
   * @returns 1 when both texts have no tokens, 0 when exactly one has none,
   *   otherwise the ratio described above.
   */
  private calculateSimilarity(text1: string, text2: string): number {
    const words1 = this.tokenize(text1);
    const words2 = this.tokenize(text2);

    if (words1.length === 0 && words2.length === 0) return 1;
    if (words1.length === 0 || words2.length === 0) return 0;

    const freq1 = this.getWordFrequency(words1);
    const freq2 = this.getWordFrequency(words2);

    let sharedWords = 0;
    let totalWords = 0;

    // Visit each distinct word from either text exactly once.
    for (const word of new Set([...freq1.keys(), ...freq2.keys()])) {
      const count1 = freq1.get(word) ?? 0;
      const count2 = freq2.get(word) ?? 0;

      // `min` is 0 when the word is missing from one side, so no guard needed.
      sharedWords += Math.min(count1, count2);
      totalWords += Math.max(count1, count2);
    }

    return totalWords > 0 ? sharedWords / totalWords : 0;
  }

  /**
   * Counts how many times each word occurs in `words`.
   */
  private getWordFrequency(words: string[]): Map<string, number> {
    const freq = new Map<string, number>();
    for (const word of words) {
      freq.set(word, (freq.get(word) ?? 0) + 1);
    }
    return freq;
  }

  /**
   * Splits text into lower-cased word tokens.
   *
   * Punctuation and symbols are treated as separators; letters, combining
   * marks and digits of any script, plus underscore, are kept. Tokens of a
   * single UTF-16 code unit (and empty strings produced by leading or
   * trailing separators) are discarded.
   */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      .replace(InMemoryVectorDatabase.NON_WORD_CHARS, ' ')
      .split(InMemoryVectorDatabase.WHITESPACE)
      .filter((word) => word.length > 1);
  }

  /**
   * Clear all indexed URLs (useful for testing)
   */
  clear(): void {
    this.urls.clear();
  }

  /**
   * Get count of indexed URLs (useful for testing/monitoring)
   */
  getIndexedUrlCount(): number {
    return this.urls.size;
  }

  /**
   * Builds the comparison text for a URL from its metadata.
   *
   * Present (truthy) fields are joined with single spaces in the order
   * title, description, `by <author>`, `from <siteName>`; missing or empty
   * fields are skipped, so the result may be the empty string.
   */
  private prepareContentForEmbedding(
    title?: string,
    description?: string,
    author?: string,
    siteName?: string,
  ): string {
    const parts: string[] = [];

    if (title) parts.push(title);
    if (description) parts.push(description);
    if (author) parts.push(`by ${author}`);
    if (siteName) parts.push(`from ${siteName}`);

    return parts.join(' ');
  }
}