···167167 }
168168169169 var domains []string
170170- for _, raw := range urls {
171171- u, err := url.Parse(raw)
170170+ for i, raw := range urls {
171171+ clean := NormalizeLossyURL(raw)
172172+ urls[i] = clean
173173+ u, err := url.Parse(clean)
172174 if nil == err {
173175 domains = append(domains, u.Hostname())
174176 }
+58
search/url.go
···11+package search
22+33+import (
44+ "net/url"
55+66+ "github.com/PuerkitoBio/purell"
77+)
// trackingParams lists query-string keys that carry analytics / campaign
// tracking state rather than page content. NormalizeLossyURL strips every
// one of these from a URL's query. Keep the list sorted alphabetically.
var trackingParams = []string{
	"__s",
	"_ga",
	"campaign_id",
	"ceid",
	"emci",
	"emdi",
	"fbclid",
	"gclid",
	"hootPostID",
	"mc_eid",
	"mkclid",
	"mkt_tok",
	"msclkid",
	"pk_campaign",
	"pk_kwd",
	"sessionid",
	"sourceid",
	"utm_campaign",
	"utm_content",
	"utm_id",
	"utm_medium",
	"utm_source",
	"utm_term",
	"xpid",
}
3535+3636+// aggressively normalizes URL, for search indexing and matching. it is possible the URL won't be directly functional after this normalization
3737+func NormalizeLossyURL(raw string) string {
3838+ clean, err := purell.NormalizeURLString(raw, purell.FlagsUsuallySafeGreedy|purell.FlagRemoveDirectoryIndex|purell.FlagRemoveFragment|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveWWW|purell.FlagSortQuery)
3939+ if err != nil {
4040+ return raw
4141+ }
4242+4343+ // remove tracking params
4444+ u, err := url.Parse(clean)
4545+ if u.RawQuery == "" {
4646+ return clean
4747+ }
4848+ params := u.Query()
4949+5050+ // there is probably a more efficient way to do this
5151+ for _, p := range trackingParams {
5252+ if params.Has(p) {
5353+ params.Del(p)
5454+ }
5555+ }
5656+ u.RawQuery = params.Encode()
5757+ return u.String()
5858+}