A server-side link shortening service powered by Linkat

feat(encoding): Enhance shortcode generation with domain-awareness

Implements a more robust, deterministic shortcode generation algorithm.

- Uses `tldts` to extract the **base domain** for generating a domain-specific prefix.
- **Normalises** URLs (e.g., sorting query params, standardising protocol/hostname) before hashing to ensure consistent shortcodes for identical content.
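- Updates the shortcode generation to use **BigInt** and an FNV-1a-style hash (XOR each UTF-16 code unit into the state, then multiply by the 64-bit FNV prime) for better distribution. Note that this is not standard 64-bit FNV-1a: the starting value differs from the standard offset basis and the running hash is not truncated to 64 bits.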
- **Increases** `DEFAULT_LENGTH` from 6 to 10.
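- Reorders the `CHARS` list for the base encoding. Note that the new list contains 60 characters rather than 62 (uppercase `G` and `K` are not present), so the encoding base changes from 62 to 60.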
- **Adds `tldts` dependency**.

ewancroft.uk 32a167f1 10604d24

verified
Changed files
+153 -66
.vscode
src
+8 -1
.vscode/settings.json
··· 1 1 { 2 2 "cSpell.words": [ 3 3 "atproto", 4 + "cailean", 5 + "ewancroft", 4 6 "FOUC", 5 7 "Linkat", 8 + "netx", 9 + "Normalise", 10 + "normalised", 6 11 "pdsls", 7 12 "RKEY", 8 - "shortlink" 13 + "shortlink", 14 + "standardise", 15 + "tldts" 9 16 ] 10 17 }
+20 -1
package-lock.json
··· 8 8 "name": "atproto-shortlink", 9 9 "version": "0.0.1", 10 10 "dependencies": { 11 - "@atproto/api": "^0.18.1" 11 + "@atproto/api": "^0.18.1", 12 + "tldts": "^7.0.19" 12 13 }, 13 14 "devDependencies": { 14 15 "@sveltejs/adapter-auto": "^7.0.0", ··· 1566 1567 "bin": { 1567 1568 "tlds": "bin.js" 1568 1569 } 1570 + }, 1571 + "node_modules/tldts": { 1572 + "version": "7.0.19", 1573 + "resolved": "https://registry.npmjs.org/tldts/-/tldts-7.0.19.tgz", 1574 + "integrity": "sha512-8PWx8tvC4jDB39BQw1m4x8y5MH1BcQ5xHeL2n7UVFulMPH/3Q0uiamahFJ3lXA0zO2SUyRXuVVbWSDmstlt9YA==", 1575 + "license": "MIT", 1576 + "dependencies": { 1577 + "tldts-core": "^7.0.19" 1578 + }, 1579 + "bin": { 1580 + "tldts": "bin/cli.js" 1581 + } 1582 + }, 1583 + "node_modules/tldts-core": { 1584 + "version": "7.0.19", 1585 + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.0.19.tgz", 1586 + "integrity": "sha512-lJX2dEWx0SGH4O6p+7FPwYmJ/bu1JbcGJ8RLaG9b7liIgZ85itUVEPbMtWRVrde/0fnDPEPHW10ZsKW3kVsE9A==", 1587 + "license": "MIT" 1569 1588 }, 1570 1589 "node_modules/totalist": { 1571 1590 "version": "3.0.1",
+2 -1
package.json
··· 15 15 "lint": "prettier --check ." 16 16 }, 17 17 "dependencies": { 18 - "@atproto/api": "^0.18.1" 18 + "@atproto/api": "^0.18.1", 19 + "tldts": "^7.0.19" 19 20 }, 20 21 "devDependencies": { 21 22 "@sveltejs/adapter-auto": "^7.0.0",
+2 -2
src/lib/constants.ts
··· 18 18 */ 19 19 export const SHORTCODE = { 20 20 /** Default length for generated shortcodes */ 21 - DEFAULT_LENGTH: 6, 21 + DEFAULT_LENGTH: 10, 22 22 23 23 /** Maximum collision resolution attempts */ 24 24 MAX_COLLISION_ATTEMPTS: 20, 25 25 26 26 /** character set */ 27 - CHARS: '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 27 + CHARS: 'abcABCdefDEFghiHIJjklLMNmnOPQopqRSTrstUVWuvwXYZxyz0123456789' 28 28 } as const; 29 29 30 30 /**
+121 -61
src/lib/utils/encoding.ts
··· 1 - /** 2 - * Utilities for encoding URLs into short codes 3 - */ 4 - 5 1 import { SHORTCODE } from '$lib/constants'; 2 + import { parse, getDomain } from 'tldts'; 6 3 7 - /** 8 - * characters used for encoding (0-9, a-z, A-Z) 9 - */ 10 4 const BASE_CHARS = SHORTCODE.CHARS; 11 5 const BASE = BASE_CHARS.length; 12 6 13 - /** 14 - * Generates a simple hash from a string 15 - * @param text - Input string to hash 16 - * @returns Numeric hash value 17 - */ 18 - function hashString(text: string): number { 19 - let hash = 0; 20 - for (let i = 0; i < text.length; i++) { 21 - const char = text.charCodeAt(i); 22 - hash = (hash << 5) - hash + char; 23 - hash = hash & hash; // Convert to 32-bit integer 24 - } 25 - return Math.abs(hash); 7 + function hashString(text: string): bigint { 8 + let hash = 1469598103934665603n; 9 + for (let i = 0; i < text.length; i++) { 10 + const char = BigInt(text.charCodeAt(i)); 11 + hash = (hash ^ char) * 1099511628211n; 12 + } 13 + return hash < 0n ? -hash : hash; 26 14 } 27 15 28 - /** 29 - * Encodes a number to base string 30 - * @param num - Number to encode 31 - * @param length - Target length of the encoded string 32 - * @returns encoded string 33 - */ 34 - function toBase(num: number, length: number): string { 35 - let encoded = ''; 36 - for (let i = 0; i < length; i++) { 37 - encoded = BASE_CHARS[num % BASE] + encoded; 38 - num = Math.floor(num / BASE); 39 - } 40 - return encoded; 16 + function toBase(num: bigint, length: number, seed = ''): string { 17 + let encoded = ''; 18 + let n = num; 19 + for (let i = 0; i < length; i++) { 20 + let rem: bigint; 21 + if (n > 0n) { 22 + rem = n % BigInt(BASE); 23 + n = n / BigInt(BASE); 24 + } else { 25 + const fallback = hashString(num.toString() + '::' + seed + '::' + i.toString()); 26 + rem = fallback % BigInt(BASE); 27 + } 28 + encoded = BASE_CHARS[Number(rem)] + encoded; 29 + } 30 + return encoded; 31 + } 32 + 33 + function normaliseUrl(url: string): string { 34 + try { 35 + const parsed = 
new URL(url.startsWith('http') ? url : `https://${url}`); 36 + parsed.hash = ''; 37 + 38 + const sortedParams = [...parsed.searchParams.entries()].sort((a, b) => a[0].localeCompare(b[0])); 39 + parsed.search = ''; 40 + for (const [key, value] of sortedParams) parsed.searchParams.append(key, value); 41 + 42 + parsed.hostname = parsed.hostname.toLowerCase(); 43 + parsed.protocol = 'https:'; 44 + return parsed.toString(); 45 + } catch (e) { 46 + return url.trim(); 47 + } 41 48 } 42 49 43 - /** 44 - * Encodes a URL to a short string 45 - * Uses a deterministic hash encoding 46 - * 47 - * @param url - URL to encode 48 - * @param length - Target length of the shortcode (default: 6) 49 - * @returns Short encoded string 50 - * 51 - * @example 52 - * encodeUrl('https://github.com/user') // Returns something like 'a3k9zx' 53 - */ 50 + function getBaseDomain(url: string): string { 51 + try { 52 + const domain = getDomain(url, { allowPrivateDomains: false }); 53 + if (domain) return domain.toLowerCase(); 54 + 55 + const parsed = parse(url, { extractHostname: true }); 56 + return (parsed.hostname ?? '').toLowerCase(); 57 + } catch (e) { 58 + return ''; 59 + } 60 + } 61 + 54 62 export function encodeUrl(url: string, length: number = SHORTCODE.DEFAULT_LENGTH): string { 55 - const hash = hashString(url); 56 - return toBase(hash, length); 63 + if (!Number.isInteger(length) || length < 3) length = SHORTCODE.DEFAULT_LENGTH; 64 + 65 + const DOMAIN_PREFIX_LENGTH = 2; 66 + 67 + const normalised = normaliseUrl(url); 68 + const apex = getBaseDomain(normalised) || ''; 69 + 70 + const domainHash = hashString(apex || normalised); 71 + const domainPrefix = toBase(domainHash, DOMAIN_PREFIX_LENGTH, 'domain'); 72 + 73 + const remaining = Math.max(1, length - DOMAIN_PREFIX_LENGTH); 74 + 75 + let hostname = ''; 76 + try { 77 + hostname = new URL(normalised).hostname.toLowerCase(); 78 + } catch (e) { 79 + try { hostname = new URL(url.startsWith('http') ? 
url : `https://${url}`).hostname.toLowerCase(); } catch { hostname = ''; } 80 + } 81 + 82 + let subLevels: string[] = []; 83 + if (apex && hostname && hostname !== apex) { 84 + const sub = hostname.replace(new RegExp(`\.${apex}$`), ''); 85 + subLevels = sub.split('.'); 86 + } 87 + 88 + const MIN_URL_CORE = 1; 89 + const MIN_TAIL = 1; 90 + const tailLength = remaining; 91 + 92 + const urlHash = hashString(normalised + '::url'); 93 + const urlCoreLength = remaining - subLevels.length; 94 + const urlCore = toBase(urlHash, Math.max(MIN_URL_CORE, urlCoreLength), 'url'); 95 + 96 + const subTail: string[] = []; 97 + const reversedSubLevels = subLevels.slice().reverse(); 98 + for (let i = 0; i < reversedSubLevels.length; i++) { 99 + const h = hashString(reversedSubLevels[i] + '::sub'); 100 + subTail.push(toBase(h, 1, 'sub' + i)); 101 + } 102 + 103 + let tail = subTail.join(''); 104 + if (!tail) { 105 + const fallbackHash = hashString(normalised + '::fallback'); 106 + tail = toBase(fallbackHash, tailLength, 'sub'); 107 + } 108 + 109 + let out = domainPrefix + urlCore + tail; 110 + if (out.length > length) out = out.slice(0, length); 111 + if (out.length < length) { 112 + let pad = ''; 113 + let i = 0; 114 + while (out.length + pad.length < length) { 115 + const h = hashString(normalised + '::pad2::' + i); 116 + pad += toBase(h, Math.min(4, length - out.length - pad.length), 'pad2' + i); 117 + i++; 118 + } 119 + out += pad.slice(0, length - out.length); 120 + } 121 + 122 + // --- LOGGING MAX COMBINATIONS --- 123 + const maxCombinations = BigInt(BASE) ** BigInt(length); 124 + console.log(`[Shortcode Info] URL: ${url}`); 125 + console.log(`[Shortcode Info] Length: ${length}, Charset: ${BASE} chars`); 126 + console.log(`[Shortcode Info] Max possible combinations: ${maxCombinations.toString()}`); 127 + console.log(`[Shortcode Info] Domain prefix: ${domainPrefix}, URL core: ${urlCore}, Subdomain tail: ${tail}`); 128 + 129 + return out; 57 130 } 58 131 59 - /** 60 - * Validates if 
a string is a valid shortcode 61 - * @param code - String to validate 62 - * @returns True if the code contains only valid characters 63 - */ 64 132 export function isValidShortcode(code: string): boolean { 65 - return /^[0-9a-zA-Z]+$/.test(code); 133 + return /^[0-9a-zA-Z]+$/.test(code); 66 134 } 67 135 68 - /** 69 - * Calculates the maximum number of possible shortcodes for a given length 70 - * @param length - Length of the shortcode 71 - * @returns Number of possible combinations 72 - * 73 - * @example 74 - * getMaxCombinations(6) 75 - */ 76 136 export function getMaxCombinations(length: number): number { 77 - return Math.pow(BASE, length); 137 + return Math.pow(BASE, length); 78 138 }