1package search
2
3import (
4 "net/url"
5
6 "github.com/PuerkitoBio/purell"
7)
8
// trackingParams lists query-string keys that appear to be added by
// analytics/marketing trackers (utm_*, fbclid, gclid, etc.).
// NormalizeLossyURL removes these parameters before indexing.
// The list is kept sorted alphabetically.
var trackingParams = []string{
	"__s",
	"_ga",
	"campaign_id",
	"ceid",
	"emci",
	"emdi",
	"fbclid",
	"gclid",
	"hootPostID",
	"mc_eid",
	"mkclid",
	"mkt_tok",
	"msclkid",
	"pk_campaign",
	"pk_kwd",
	"sessionid",
	"sourceid",
	"utm_campaign",
	"utm_content",
	"utm_id",
	"utm_medium",
	"utm_source",
	"utm_term",
	"xpid",
}
35
36// aggressively normalizes URL, for search indexing and matching. it is possible the URL won't be directly functional after this normalization
37func NormalizeLossyURL(raw string) string {
38 clean, err := purell.NormalizeURLString(raw, purell.FlagsUsuallySafeGreedy|purell.FlagRemoveDirectoryIndex|purell.FlagRemoveFragment|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveWWW|purell.FlagSortQuery)
39 if err != nil {
40 return raw
41 }
42
43 // remove tracking params
44 u, err := url.Parse(clean)
45 if err != nil {
46 return clean
47 }
48 if u.RawQuery == "" {
49 return clean
50 }
51 params := u.Query()
52
53 // there is probably a more efficient way to do this
54 for _, p := range trackingParams {
55 if params.Has(p) {
56 params.Del(p)
57 }
58 }
59 u.RawQuery = params.Encode()
60 return u.String()
61}