fix: correct facet processing in FacetUtils for better link matching

Changed files
+132 -38
lib
+132 -38
lib/utils/facet_utils.dart
··· 50 50 int aLength = a.end - a.start; 51 51 int bLength = b.end - b.start; 52 52 53 - // For links, use the display text length 53 + // For links, use the length of the text that will actually be found 54 54 if (a.type?.contains('link') == true && a.data['uri'] != null) { 55 - String displayText = a.data['uri'] as String; 56 - displayText = _extractDisplayTextFromUri(displayText); 57 - aLength = displayText.length; 55 + final uri = a.data['uri'] as String; 56 + final possibleTexts = [_extractDisplayTextFromUri(uri), _extractDomainOnly(uri), uri]; 57 + // Use the longest text that exists in the original text 58 + for (final testText in possibleTexts) { 59 + if (text.contains(testText)) { 60 + aLength = testText.length; 61 + break; 62 + } 63 + } 58 64 } 59 65 60 66 if (b.type?.contains('link') == true && b.data['uri'] != null) { 61 - String displayText = b.data['uri'] as String; 62 - displayText = _extractDisplayTextFromUri(displayText); 63 - bLength = displayText.length; 67 + final uri = b.data['uri'] as String; 68 + final possibleTexts = [_extractDisplayTextFromUri(uri), _extractDomainOnly(uri), uri]; 69 + // Use the longest text that exists in the original text 70 + for (final testText in possibleTexts) { 71 + if (text.contains(testText)) { 72 + bLength = testText.length; 73 + break; 74 + } 75 + } 64 76 } 65 77 66 78 // Sort by length descending, then by start position ascending ··· 80 92 81 93 if (range.type?.contains('link') == true && range.data['uri'] != null) { 82 94 final uri = range.data['uri'] as String; 83 - final displayText = _extractDisplayTextFromUri(uri); 95 + 96 + // First, try to use the exact facet positions if they seem valid 97 + if (range.start >= 0 && range.end <= text.length && range.start < range.end) { 98 + final facetText = text.substring(range.start, range.end); 99 + 100 + // Check if the facet text matches any of our expected URL formats 101 + final possibleTexts = [ 102 + _extractDisplayTextFromUri(uri), // Full URL with protocol 103 
+ _extractDomainOnly(uri), // Just the domain 104 + uri, // Original URI as-is 105 + ]; 106 + 107 + bool facetTextMatches = possibleTexts.any( 108 + (possible) => 109 + facetText == possible || 110 + facetText.contains(possible) || 111 + possible.contains(facetText), 112 + ); 84 113 85 - // Find all occurrences of this text and pick the one that doesn't overlap with used positions 86 - int searchIndex = 0; 87 - bool foundValidMatch = false; 114 + if (facetTextMatches) { 115 + // Check if this range overlaps with used positions 116 + bool overlaps = false; 117 + for (int i = range.start; i < range.end; i++) { 118 + if (usedPositions.contains(i)) { 119 + overlaps = true; 120 + break; 121 + } 122 + } 88 123 89 - while (!foundValidMatch) { 90 - final globalIndex = text.indexOf(displayText, searchIndex); 91 - if (globalIndex == -1) break; 124 + if (!overlaps) { 125 + actualStart = range.start; 126 + actualEnd = range.end; 127 + actualContent = 128 + facetText; // Use exactly what's in the original text at facet position 92 129 93 - // Check if this range overlaps with any used positions 94 - bool overlaps = false; 95 - for (int i = globalIndex; i < globalIndex + displayText.length; i++) { 96 - if (usedPositions.contains(i)) { 97 - overlaps = true; 98 - break; 130 + // Mark these positions as used 131 + for (int i = actualStart; i < actualEnd; i++) { 132 + usedPositions.add(i); 133 + } 99 134 } 100 135 } 136 + } 101 137 102 - if (!overlaps) { 103 - actualStart = globalIndex; 104 - actualEnd = globalIndex + displayText.length; 105 - actualContent = displayText; 106 - foundValidMatch = true; 138 + // If facet positions didn't work, fall back to searching 139 + if (actualContent == null) { 140 + final possibleTexts = [ 141 + _extractDisplayTextFromUri(uri), // Full URL with protocol 142 + _extractDomainOnly(uri), // Just the domain 143 + uri, // Original URI as-is 144 + ]; 107 145 108 - // Mark these positions as used 109 - for (int i = actualStart; i < actualEnd; i++) { 
110 - usedPositions.add(i); 146 + int searchIndex = 0; 147 + bool foundValidMatch = false; 148 + 149 + // Try each possible text representation 150 + for (final searchText in possibleTexts) { 151 + searchIndex = 0; 152 + while (!foundValidMatch) { 153 + final globalIndex = text.indexOf(searchText, searchIndex); 154 + if (globalIndex == -1) break; 155 + 156 + // Check if this range overlaps with any used positions 157 + bool overlaps = false; 158 + for (int i = globalIndex; i < globalIndex + searchText.length; i++) { 159 + if (usedPositions.contains(i)) { 160 + overlaps = true; 161 + break; 162 + } 163 + } 164 + 165 + if (!overlaps) { 166 + actualStart = globalIndex; 167 + actualEnd = globalIndex + searchText.length; 168 + actualContent = searchText; // Use exactly what we found in the text 169 + foundValidMatch = true; 170 + 171 + // Mark these positions as used 172 + for (int i = actualStart; i < actualEnd; i++) { 173 + usedPositions.add(i); 174 + } 175 + break; 176 + } else { 177 + searchIndex = globalIndex + 1; 178 + } 111 179 } 112 - } else { 113 - searchIndex = globalIndex + 1; 180 + if (foundValidMatch) break; 114 181 } 115 182 } 116 183 } ··· 198 265 return spans; 199 266 } 200 267 201 - /// Extracts the display text from a URI (removes protocol but keeps subdomain, removes path) 268 + /// Extracts the display text from a URI (keeps protocol and domain, removes path) 202 269 static String _extractDisplayTextFromUri(String uri) { 203 - String displayText = uri; 270 + // Find the first slash after the protocol to remove the path 271 + String protocolAndDomain = uri; 272 + if (uri.startsWith('https://')) { 273 + final pathIndex = uri.indexOf('/', 8); // Start search after "https://" 274 + if (pathIndex != -1) { 275 + protocolAndDomain = uri.substring(0, pathIndex); 276 + } 277 + } else if (uri.startsWith('http://')) { 278 + final pathIndex = uri.indexOf('/', 7); // Start search after "http://" 279 + if (pathIndex != -1) { 280 + protocolAndDomain = 
uri.substring(0, pathIndex); 281 + } 282 + } else { 283 + // For URIs without protocol, just remove the path 284 + final slashIndex = uri.indexOf('/'); 285 + if (slashIndex != -1) { 286 + protocolAndDomain = uri.substring(0, slashIndex); 287 + } 288 + } 289 + 290 + return protocolAndDomain; 291 + } 292 + 293 + /// Extracts just the domain part from a URI (removes protocol and path) 294 + static String _extractDomainOnly(String uri) { 295 + String domain = uri; 204 296 if (uri.startsWith('https://')) { 205 - displayText = uri.substring(8); 297 + domain = uri.substring(8); 206 298 } else if (uri.startsWith('http://')) { 207 - displayText = uri.substring(7); 299 + domain = uri.substring(7); 208 300 } 209 - // Remove path but keep subdomain (everything before the first slash after protocol) 210 - final slashIndex = displayText.indexOf('/'); 301 + 302 + // Remove path 303 + final slashIndex = domain.indexOf('/'); 211 304 if (slashIndex != -1) { 212 - displayText = displayText.substring(0, slashIndex); 305 + domain = domain.substring(0, slashIndex); 213 306 } 214 - return displayText; 307 + 308 + return domain; 215 309 } 216 310 }