A social knowledge tool for researchers built on ATProto
at main 226 lines 6.3 kB view raw
import { Result, ok, err } from '../../../shared/core/Result';
import {
  IVectorDatabase,
  IndexUrlParams,
  FindSimilarUrlsParams,
  UrlSearchResult,
} from '../domain/IVectorDatabase';
import { UrlMetadataProps } from '../../cards/domain/value-objects/UrlMetadata';

/** One stored entry: the URL, the text used for comparison, and its metadata. */
interface IndexedUrl {
  url: string;
  content: string;
  metadata: UrlMetadataProps;
  indexedAt: Date;
}

/**
 * In-process implementation of `IVectorDatabase` backed by a `Map`.
 *
 * No embeddings are computed: "similarity" is a word-overlap score between
 * the text assembled from each URL's title, description, author and site name.
 * Entries live only in memory and are keyed by URL, so indexing the same URL
 * twice overwrites the earlier entry.
 *
 * The constructor is private; obtain the shared instance via `getInstance()`.
 */
export class InMemoryVectorDatabase implements IVectorDatabase {
  private static instance: InMemoryVectorDatabase | undefined;
  private readonly urls = new Map<string, IndexedUrl>();

  private constructor() {}

  /**
   * Returns the shared instance, creating it on first use.
   */
  public static getInstance(): InMemoryVectorDatabase {
    let instance = InMemoryVectorDatabase.instance;
    if (!instance) {
      instance = new InMemoryVectorDatabase();
      InMemoryVectorDatabase.instance = instance;
    }
    return instance;
  }

  /**
   * Stores (or replaces) the entry for `params.url`.
   *
   * @param params URL plus the metadata fields to keep alongside it.
   * @returns `ok(undefined)` on success, `err(Error)` if anything throws.
   */
  async indexUrl(params: IndexUrlParams): Promise<Result<void>> {
    try {
      console.log('Indexing URL in InMemoryVectorDatabase:', params.url);

      // Text that later similarity queries are compared against.
      const content = this.prepareContentForEmbedding(
        params.title,
        params.description,
        params.author,
        params.siteName,
      );

      this.urls.set(params.url, {
        url: params.url,
        content,
        metadata: {
          url: params.url,
          title: params.title,
          description: params.description,
          author: params.author,
          publishedDate: params.publishedDate,
          siteName: params.siteName,
          imageUrl: params.imageUrl,
          type: params.type,
          retrievedAt: params.retrievedAt,
        },
        indexedAt: new Date(),
      });
      // Log the size only; listing every key on each insert grows without bound.
      console.log('Indexed URL count:', this.urls.size);

      return ok(undefined);
    } catch (error) {
      return err(this.toError('Failed to index URL', error));
    }
  }

  /**
   * Ranks every other indexed URL by word-overlap similarity to `params.url`.
   *
   * If `params.url` is indexed with non-empty content, that content is the
   * query text; otherwise the URL string itself is used. The query URL's own
   * entry is never returned.
   *
   * @param params `url` to compare against, optional `threshold` (defaults to
   *   0, i.e. every entry qualifies) and `limit`, which is handed straight to
   *   `Array.prototype.slice` as the end index.
   * @returns `ok` with matches sorted by descending similarity, or
   *   `err(Error)` if anything throws.
   */
  async findSimilarUrls(
    params: FindSimilarUrlsParams,
  ): Promise<Result<UrlSearchResult[]>> {
    try {
      const threshold = params.threshold ?? 0;

      const queryContent = this.urls.get(params.url)?.content || params.url;
      // The query side is identical for every comparison, so count it once.
      const queryFrequencies = this.getWordFrequency(
        this.tokenize(queryContent),
      );

      const results: UrlSearchResult[] = [];
      for (const [url, indexed] of this.urls.entries()) {
        // Skip the query URL itself
        if (url === params.url) continue;

        const similarity = this.calculateSimilarity(
          queryFrequencies,
          this.getWordFrequency(this.tokenize(indexed.content)),
        );

        if (similarity >= threshold) {
          results.push({
            url: indexed.url,
            similarity,
            metadata: indexed.metadata,
          });
        }
      }

      // Sort by similarity (highest first) and limit results
      results.sort((a, b) => b.similarity - a.similarity);
      const limitedResults = results.slice(0, params.limit);

      console.log(
        `Found ${limitedResults.length} similar URLs above threshold ${threshold}`,
      );

      return ok(limitedResults);
    } catch (error) {
      return err(this.toError('Failed to find similar URLs', error));
    }
  }

  /**
   * Removes the entry for `url`; removing an unknown URL is not an error.
   *
   * @returns `ok(undefined)` on success, `err(Error)` if anything throws.
   */
  async deleteUrl(url: string): Promise<Result<void>> {
    try {
      this.urls.delete(url);
      return ok(undefined);
    } catch (error) {
      return err(this.toError('Failed to delete URL', error));
    }
  }

  /**
   * Always reports healthy: there is no external dependency to probe.
   */
  async healthCheck(): Promise<Result<boolean>> {
    return ok(true);
  }

  /**
   * Word-overlap score between two word-frequency maps.
   *
   * Computes sum(min(countA, countB)) / sum(max(countA, countB)) over every
   * word that appears in either map, which is the weighted (multiset) form of
   * Jaccard similarity. The result lies in [0, 1].
   *
   * @returns 1 when both maps are empty, 0 when exactly one is empty,
   *   otherwise the ratio described above.
   */
  private calculateSimilarity(
    queryFrequencies: ReadonlyMap<string, number>,
    candidateFrequencies: ReadonlyMap<string, number>,
  ): number {
    if (queryFrequencies.size === 0 && candidateFrequencies.size === 0) {
      return 1;
    }
    if (queryFrequencies.size === 0 || candidateFrequencies.size === 0) {
      return 0;
    }

    let sharedWords = 0;
    let totalWords = 0;

    for (const [word, queryCount] of queryFrequencies) {
      const candidateCount = candidateFrequencies.get(word) ?? 0;
      sharedWords += Math.min(queryCount, candidateCount);
      totalWords += Math.max(queryCount, candidateCount);
    }
    // Words seen only in the candidate enlarge the denominator, never the numerator.
    for (const [word, candidateCount] of candidateFrequencies) {
      if (!queryFrequencies.has(word)) {
        totalWords += candidateCount;
      }
    }

    return totalWords > 0 ? sharedWords / totalWords : 0;
  }

  /**
   * Counts how many times each word occurs in `words`.
   */
  private getWordFrequency(words: string[]): Map<string, number> {
    const freq = new Map<string, number>();
    for (const word of words) {
      freq.set(word, (freq.get(word) ?? 0) + 1);
    }
    return freq;
  }

  /**
   * Lower-cases `text` and splits it into words.
   *
   * Letters, combining marks, digits and underscores from any script are kept;
   * every other non-whitespace character becomes a separator. Tokens of length
   * 0 or 1 are discarded.
   */
  private tokenize(text: string): string[] {
    return (
      text
        .toLowerCase()
        // Unicode-aware: plain \w is ASCII-only and would cut "café" to "caf".
        .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
        .split(/\s+/)
        .filter((word) => word.length > 1)
    );
  }

  /**
   * Clear all indexed URLs (useful for testing)
   */
  clear(): void {
    this.urls.clear();
  }

  /**
   * Get count of indexed URLs (useful for testing/monitoring)
   */
  getIndexedUrlCount(): number {
    return this.urls.size;
  }

  /**
   * Joins the available text fields into the single string used for
   * comparison. Missing or empty fields are skipped; author and site name are
   * prefixed with "by " and "from " respectively.
   */
  private prepareContentForEmbedding(
    title?: string,
    description?: string,
    author?: string,
    siteName?: string,
  ): string {
    const parts: string[] = [];

    if (title) parts.push(title);
    if (description) parts.push(description);
    if (author) parts.push(`by ${author}`);
    if (siteName) parts.push(`from ${siteName}`);

    return parts.join(' ');
  }

  /**
   * Wraps a caught value in an `Error` whose message is `"<context>: <reason>"`,
   * using 'Unknown error' when the caught value is not an `Error`.
   */
  private toError(context: string, error: unknown): Error {
    const reason = error instanceof Error ? error.message : 'Unknown error';
    return new Error(`${context}: ${reason}`);
  }
}