+1
-1
search/query.go
+1
-1
search/query.go
+2
-2
search/testdata/transform-post-fixtures.json
+2
-2
search/testdata/transform-post-fixtures.json
···
11
11
"embed": {
12
12
"$type": "app.bsky.embed.external",
13
13
"external": {
14
-
"uri": "https://bsky.app",
14
+
"uri": "https://www.bsky.app:443/index.html",
15
15
"title": "Bluesky Social",
16
16
"description": "See what's next.",
17
17
"thumb": {
···
70
70
"features": [
71
71
{
72
72
"$type": "app.bsky.richtext.facet#link",
73
-
"uri": "https://en.wikipedia.org/wiki/CBOR"
73
+
"uri": "https://en.wikipedia.org/wiki/CBOR?utm_campaign=123"
74
74
}
75
75
]
76
76
},
+4
-2
search/transform.go
+4
-2
search/transform.go
···
167
167
}
168
168
169
169
var domains []string
170
-
for _, raw := range urls {
171
-
u, err := url.Parse(raw)
170
+
for i, raw := range urls {
171
+
clean := NormalizeLossyURL(raw)
172
+
urls[i] = clean
173
+
u, err := url.Parse(clean)
172
174
if nil == err {
173
175
domains = append(domains, u.Hostname())
174
176
}
+58
search/url.go
+58
search/url.go
···
1
+
package search
2
+
3
+
import (
4
+
"net/url"
5
+
6
+
"github.com/PuerkitoBio/purell"
7
+
)
8
+
9
+
// trackingParams lists the query parameter names that NormalizeLossyURL
// deletes from a URL's query string. Entries are kept in alphabetical order
// and grouped by common prefix for readability.
var trackingParams = []string{
	"__s", "_ga",
	"campaign_id", "ceid",
	"emci", "emdi",
	"fbclid",
	"gclid",
	"hootPostID",
	"mc_eid", "mkclid", "mkt_tok", "msclkid",
	"pk_campaign", "pk_kwd",
	"sessionid", "sourceid",
	"utm_campaign", "utm_content", "utm_id", "utm_medium", "utm_source", "utm_term",
	"xpid",
}
35
+
36
+
// aggressively normalizes URL, for search indexing and matching. it is possible the URL won't be directly functional after this normalization
37
+
func NormalizeLossyURL(raw string) string {
38
+
clean, err := purell.NormalizeURLString(raw, purell.FlagsUsuallySafeGreedy|purell.FlagRemoveDirectoryIndex|purell.FlagRemoveFragment|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveWWW|purell.FlagSortQuery)
39
+
if err != nil {
40
+
return raw
41
+
}
42
+
43
+
// remove tracking params
44
+
u, err := url.Parse(clean)
45
+
if u.RawQuery == "" {
46
+
return clean
47
+
}
48
+
params := u.Query()
49
+
50
+
// there is probably a more efficient way to do this
51
+
for _, p := range trackingParams {
52
+
if params.Has(p) {
53
+
params.Del(p)
54
+
}
55
+
}
56
+
u.RawQuery = params.Encode()
57
+
return u.String()
58
+
}
+33
search/url_test.go
+33
search/url_test.go
···
1
+
package search
2
+
3
+
import (
4
+
"testing"
5
+
6
+
"github.com/stretchr/testify/assert"
7
+
)
8
+
9
+
func TestNormalizeLossyURL(t *testing.T) {
10
+
assert := assert.New(t)
11
+
12
+
fixtures := []struct {
13
+
orig string
14
+
clean string
15
+
}{
16
+
{orig: "", clean: ""},
17
+
{orig: "asdf", clean: "asdf"},
18
+
{orig: "HTTP://bSky.app:80/index.html", clean: "http://bsky.app"},
19
+
{orig: "https://example.com/thing?c=123&utm_campaign=blah&a=first", clean: "https://example.com/thing?a=first&c=123"},
20
+
{orig: "https://example.com/thing?c=123&utm_campaign=blah&a=first", clean: "https://example.com/thing?a=first&c=123"},
21
+
{orig: "http://example.com/foo//bar.html", clean: "http://example.com/foo/bar.html"},
22
+
{orig: "http://example.com/bar.html#section1", clean: "http://example.com/bar.html"},
23
+
{orig: "http://example.com/foo/", clean: "http://example.com/foo"},
24
+
{orig: "http://example.com/", clean: "http://example.com"},
25
+
{orig: "http://example.com/%7Efoo", clean: "http://example.com/~foo"},
26
+
{orig: "http://example.com/foo/./bar/baz/../qux", clean: "http://example.com/foo/bar/qux"},
27
+
{orig: "http://www.example.com/", clean: "http://example.com"},
28
+
}
29
+
30
+
for _, fix := range fixtures {
31
+
assert.Equal(fix.clean, NormalizeLossyURL(fix.orig))
32
+
}
33
+
}