fork of indigo with slightly nicer lexgen

search: normalize URLs for indexing and query

+1 -1
search/query.go
@@ -123,6 +123,6 @@
 
 	if p.URL != "" {
 		filters = append(filters, map[string]interface{}{
-			"term": map[string]interface{}{"url": p.URL},
+			"term": map[string]interface{}{"url": NormalizeLossyURL(p.URL)},
 		})
 	}
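The `term` filter built here does exact string matching, so the query path has to apply the very same normalization as the indexing path or lookups silently miss. A minimal sketch of that symmetry, assuming the fork keeps indigo's `github.com/bluesky-social/indigo` module path (the URLs are made up):

    package main

    import (
    	"fmt"

    	"github.com/bluesky-social/indigo/search"
    )

    func main() {
    	// both spellings collapse to "https://bsky.app", so an exact-match
    	// term filter on the normalized field finds posts indexed either way
    	indexed := search.NormalizeLossyURL("https://www.bsky.app:443/index.html")
    	queried := search.NormalizeLossyURL("https://bsky.app")
    	fmt.Println(indexed, indexed == queried) // https://bsky.app true
    }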
+2 -2
search/testdata/transform-post-fixtures.json
@@ -11,7 +11,7 @@
       "embed": {
         "$type": "app.bsky.embed.external",
         "external": {
-          "uri": "https://bsky.app",
+          "uri": "https://www.bsky.app:443/index.html",
           "title": "Bluesky Social",
           "description": "See what's next.",
           "thumb": {
@@ -70,7 +70,7 @@
           "features": [
             {
               "$type": "app.bsky.richtext.facet#link",
-              "uri": "https://en.wikipedia.org/wiki/CBOR"
+              "uri": "https://en.wikipedia.org/wiki/CBOR?utm_campaign=123"
             }
           ]
         },
+4 -2
search/transform.go
@@ -167,8 +167,10 @@
 	}
 
 	var domains []string
-	for _, raw := range urls {
-		u, err := url.Parse(raw)
+	for i, raw := range urls {
+		clean := NormalizeLossyURL(raw)
+		urls[i] = clean
+		u, err := url.Parse(clean)
 		if nil == err {
 			domains = append(domains, u.Hostname())
 		}
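Because the slice is rewritten in place, the indexed document stores the normalized URLs, and the extracted domains come from the cleaned form, notably with any `www.` prefix already stripped. A rough sketch of the loop's effect on a made-up input:

    urls := []string{"https://www.example.com:443/a//b?utm_source=x&q=1"}
    for i, raw := range urls {
    	clean := NormalizeLossyURL(raw)
    	urls[i] = clean // urls[0] is now "https://example.com/a/b?q=1"
    	if u, err := url.Parse(clean); err == nil {
    		fmt.Println(u.Hostname()) // "example.com"
    	}
    }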
+56
search/url.go
@@ -0,0 +1,56 @@
+package search
+
+import (
+	"net/url"
+
+	"github.com/PuerkitoBio/purell"
+)
+
+var trackingParams = []string{
+	"__s",
+	"_ga",
+	"campaign_id",
+	"ceid",
+	"emci",
+	"emdi",
+	"fbclid",
+	"gclid",
+	"hootPostID",
+	"mc_eid",
+	"mkclid",
+	"mkt_tok",
+	"msclkid",
+	"pk_campaign",
+	"pk_kwd",
+	"sessionid",
+	"sourceid",
+	"utm_campaign",
+	"utm_content",
+	"utm_id",
+	"utm_medium",
+	"utm_source",
+	"utm_term",
+	"xpid",
+}
+
+// NormalizeLossyURL aggressively normalizes a URL for search indexing and
+// matching. The result may not be directly functional as a URL.
+func NormalizeLossyURL(raw string) string {
+	clean, err := purell.NormalizeURLString(raw, purell.FlagsUsuallySafeGreedy|purell.FlagRemoveDirectoryIndex|purell.FlagRemoveFragment|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveWWW|purell.FlagSortQuery)
+	if err != nil {
+		return raw
+	}
+
+	// strip well-known tracking params from the query string
+	u, err := url.Parse(clean)
+	if err != nil || u.RawQuery == "" {
+		return clean
+	}
+	params := u.Query()
+	for _, p := range trackingParams {
+		// Del is a no-op when the key is absent, so no Has check is needed
+		params.Del(p)
+	}
+	u.RawQuery = params.Encode()
+	return u.String()
+}
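The function works in two stages: purell handles the structural cleanup (lowercasing, default ports, `www.`, dot segments, fragments, directory indexes, duplicate slashes, query sorting), then a second pass drops well-known tracking parameters. Since `url.Values.Encode` emits keys in sorted order, the output stays deterministic even after deletions. A hypothetical `Example` function illustrating the end-to-end behavior (not part of the commit; it would live in the search package next to url.go):

    func ExampleNormalizeLossyURL() {
    	fmt.Println(NormalizeLossyURL("https://example.com/thing?c=123&utm_campaign=blah&a=first"))
    	// Output: https://example.com/thing?a=first&c=123
    }

Note the guard after `url.Parse`: if the already-normalized string somehow fails to re-parse, the purell output is returned as-is rather than dereferencing a nil URL.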
+32
search/url_test.go
@@ -0,0 +1,32 @@
+package search
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestNormalizeLossyURL(t *testing.T) {
+	assert := assert.New(t)
+
+	fixtures := []struct {
+		orig  string
+		clean string
+	}{
+		{orig: "", clean: ""},
+		{orig: "asdf", clean: "asdf"},
+		{orig: "HTTP://bSky.app:80/index.html", clean: "http://bsky.app"},
+		{orig: "https://example.com/thing?c=123&utm_campaign=blah&a=first", clean: "https://example.com/thing?a=first&c=123"},
+		{orig: "http://example.com/foo//bar.html", clean: "http://example.com/foo/bar.html"},
+		{orig: "http://example.com/bar.html#section1", clean: "http://example.com/bar.html"},
+		{orig: "http://example.com/foo/", clean: "http://example.com/foo"},
+		{orig: "http://example.com/", clean: "http://example.com"},
+		{orig: "http://example.com/%7Efoo", clean: "http://example.com/~foo"},
+		{orig: "http://example.com/foo/./bar/baz/../qux", clean: "http://example.com/foo/bar/qux"},
+		{orig: "http://www.example.com/", clean: "http://example.com"},
+	}
+
+	for _, fix := range fixtures {
+		assert.Equal(fix.clean, NormalizeLossyURL(fix.orig))
+	}
+}
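The table-driven test can be run on its own with the standard Go tooling:

    go test ./search/ -run TestNormalizeLossyURL -v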