Fork of indigo with slightly nicer lexgen

palomar: refactor query string parsing; updates to schemas and transform (#643)

TODO:

- [x] normalize URLs: maybe with `purell`, maybe stripping tracking parameters
- [x] date-level queries
- [x] add end-to-end tests (similar to Japanese queries)
- [x] implement "merge" of query params (parsed and actual params)
- [x] run lexgen and wire up actual query params
- [x] `from:me` post search behavior (server-side, using viewer param)
- [ ] alternate post "sort" order (top vs. latest)
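
For a sense of the resulting query mini-language, here are a few illustrative query strings (handles and dates below are made up; per `ParsePostQuery` further down, operators with unresolvable handles are silently dropped, and `from:me` only works when a `viewer` DID is supplied):

```
from:me #golang lang:en                    viewer's own posts, tag + language filter
cats from:@alice.example.com               author filter (handle resolved to DID)
mentions:bob.example.com since:2024-01-01  mention filter plus date-level lower bound
"exact phrase" domain:en.wikipedia.org     phrase match plus link-domain filter
until:2024-02-01 https://bsky.app          upper date bound plus normalized-URL filter
```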

Closes: https://github.com/bluesky-social/indigo/issues/191
Closes: https://github.com/bluesky-social/indigo/issues/593

Authored by bnewbold.net and committed by GitHub (44093230, 1ffd4cfe)

+6 -4
api/bsky/actorsearchActorsTypeahead.go
··· 19 19 // 20 20 // q: Search query prefix; not a full query string. 21 21 // term: DEPRECATED: use 'q' instead. 22 - func ActorSearchActorsTypeahead(ctx context.Context, c *xrpc.Client, limit int64, q string, term string) (*ActorSearchActorsTypeahead_Output, error) { 22 + // viewer: DID of the account making the request (not included for public/unauthenticated queries). Used to boost followed accounts in ranking. 23 + func ActorSearchActorsTypeahead(ctx context.Context, c *xrpc.Client, limit int64, q string, term string, viewer string) (*ActorSearchActorsTypeahead_Output, error) { 23 24 var out ActorSearchActorsTypeahead_Output 24 25 25 26 params := map[string]interface{}{ 26 - "limit": limit, 27 - "q": q, 28 - "term": term, 27 + "limit": limit, 28 + "q": q, 29 + "term": term, 30 + "viewer": viewer, 29 31 } 30 32 if err := c.Do(ctx, xrpc.Query, "", "app.bsky.actor.searchActorsTypeahead", params, nil, &out); err != nil { 31 33 return nil, err
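A minimal sketch of calling the regenerated typeahead client (the search prefix and `viewer` DID are placeholders; the deprecated `term` parameter is passed as an empty string, since lexgen output takes optional parameters positionally):

```go
package main

import (
	"context"
	"fmt"

	"github.com/bluesky-social/indigo/api/bsky"
	"github.com/bluesky-social/indigo/xrpc"
)

func typeaheadDemo(ctx context.Context, xrpcc *xrpc.Client) error {
	// limit=8, q="jay", term="" (deprecated), viewer is a placeholder DID
	out, err := bsky.ActorSearchActorsTypeahead(ctx, xrpcc, 8, "jay", "", "did:plc:example123")
	if err != nil {
		return err
	}
	for _, actor := range out.Actors {
		fmt.Println(actor.Did, actor.Handle)
	}
	return nil
}
```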
+22 -4
api/bsky/feedsearchPosts.go
··· 20 20 21 21 // FeedSearchPosts calls the XRPC method "app.bsky.feed.searchPosts". 22 22 // 23 + // author: Filter to posts by the given account. Handles are resolved to DID before query-time. 23 24 // cursor: Optional pagination mechanism; may not necessarily allow scrolling through entire result set. 25 + // domain: Filter to posts with URLs (facet links or embeds) linking to the given domain (hostname). Server may apply hostname normalization. 26 + // lang: Filter to posts in the given language. Expected to be based on post language field, though server may override language detection. 27 + // mentions: Filter to posts which mention the given account. Handles are resolved to DID before query-time. Only matches rich-text facet mentions. 24 28 // q: Search query string; syntax, phrase, boolean, and faceting is unspecified, but Lucene query syntax is recommended. 25 - func FeedSearchPosts(ctx context.Context, c *xrpc.Client, cursor string, limit int64, q string) (*FeedSearchPosts_Output, error) { 29 + // since: Filter results for posts after the indicated datetime (inclusive). Expected to use 'sortAt' timestamp, which may not match 'createdAt'. Can be a datetime, or just an ISO date (YYYY-MM-DD). 30 + // sort: Specifies the ranking order of results. 31 + // tag: Filter to posts with the given tag (hashtag), based on rich-text facet or tag field. Do not include the hash (#) prefix. Multiple tags can be specified, with 'AND' matching. 32 + // until: Filter results for posts before the indicated datetime (not inclusive). Expected to use 'sortAt' timestamp, which may not match 'createdAt'. Can be a datetime, or just an ISO date (YYYY-MM-DD). 33 + // url: Filter to posts with links (facet links or embeds) pointing to this URL. Server may apply URL normalization or fuzzy matching. 34 + func FeedSearchPosts(ctx context.Context, c *xrpc.Client, author string, cursor string, domain string, lang string, limit int64, mentions string, q string, since string, sort string, tag []string, until string, url string) (*FeedSearchPosts_Output, error) { 26 35 var out FeedSearchPosts_Output 27 36 28 37 params := map[string]interface{}{ 29 - "cursor": cursor, 30 - "limit": limit, 31 - "q": q, 38 + "author": author, 39 + "cursor": cursor, 40 + "domain": domain, 41 + "lang": lang, 42 + "limit": limit, 43 + "mentions": mentions, 44 + "q": q, 45 + "since": since, 46 + "sort": sort, 47 + "tag": tag, 48 + "until": until, 49 + "url": url, 32 50 } 33 51 if err := c.Do(ctx, xrpc.Query, "", "app.bsky.feed.searchPosts", params, nil, &out); err != nil { 34 52 return nil, err
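And the expanded searchPosts call, with each new filter passed as its positional zero value except the ones exercised (all values hypothetical; this assumes the same `ctx`/`xrpcc` setup as the sketch above):

```go
func searchDemo(ctx context.Context, xrpcc *xrpc.Client) error {
	// positional order: author, cursor, domain, lang, limit, mentions, q, since, sort, tag, until, url
	out, err := bsky.FeedSearchPosts(ctx, xrpcc,
		"alice.example.com", // author (handle or DID)
		"",                  // cursor
		"",                  // domain
		"en",                // lang
		25,                  // limit
		"",                  // mentions
		"golang",            // q
		"2024-01-01",        // since (date-level form)
		"latest",            // sort (per the open TODO: "top" vs. "latest")
		nil,                 // tag
		"",                  // until
		"",                  // url
	)
	if err != nil {
		return err
	}
	fmt.Println(len(out.Posts), "hits")
	return nil
}
```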
+3 -1
api/bsky/unspeccedsearchActorsSkeleton.go
··· 23 23 // cursor: Optional pagination mechanism; may not necessarily allow scrolling through entire result set. 24 24 // q: Search query string; syntax, phrase, boolean, and faceting is unspecified, but Lucene query syntax is recommended. For typeahead search, only simple term match is supported, not full syntax. 25 25 // typeahead: If true, acts as fast/simple 'typeahead' query. 26 - func UnspeccedSearchActorsSkeleton(ctx context.Context, c *xrpc.Client, cursor string, limit int64, q string, typeahead bool) (*UnspeccedSearchActorsSkeleton_Output, error) { 26 + // viewer: DID of the account making the request (not included for public/unauthenticated queries). Used to boost followed accounts in ranking. 27 + func UnspeccedSearchActorsSkeleton(ctx context.Context, c *xrpc.Client, cursor string, limit int64, q string, typeahead bool, viewer string) (*UnspeccedSearchActorsSkeleton_Output, error) { 27 28 var out UnspeccedSearchActorsSkeleton_Output 28 29 29 30 params := map[string]interface{}{ ··· 31 32 "limit": limit, 32 33 "q": q, 33 34 "typeahead": typeahead, 35 + "viewer": viewer, 34 36 } 35 37 if err := c.Do(ctx, xrpc.Query, "", "app.bsky.unspecced.searchActorsSkeleton", params, nil, &out); err != nil { 36 38 return nil, err
+24 -4
api/bsky/unspeccedsearchPostsSkeleton.go
··· 20 20 21 21 // UnspeccedSearchPostsSkeleton calls the XRPC method "app.bsky.unspecced.searchPostsSkeleton". 22 22 // 23 + // author: Filter to posts by the given account. Handles are resolved to DID before query-time. 23 24 // cursor: Optional pagination mechanism; may not necessarily allow scrolling through entire result set. 25 + // domain: Filter to posts with URLs (facet links or embeds) linking to the given domain (hostname). Server may apply hostname normalization. 26 + // lang: Filter to posts in the given language. Expected to be based on post language field, though server may override language detection. 27 + // mentions: Filter to posts which mention the given account. Handles are resolved to DID before query-time. Only matches rich-text facet mentions. 24 28 // q: Search query string; syntax, phrase, boolean, and faceting is unspecified, but Lucene query syntax is recommended. 25 - func UnspeccedSearchPostsSkeleton(ctx context.Context, c *xrpc.Client, cursor string, limit int64, q string) (*UnspeccedSearchPostsSkeleton_Output, error) { 29 + // since: Filter results for posts after the indicated datetime (inclusive). Expected to use 'sortAt' timestamp, which may not match 'createdAt'. Can be a datetime, or just an ISO date (YYYY-MM-DD). 30 + // sort: Specifies the ranking order of results. 31 + // tag: Filter to posts with the given tag (hashtag), based on rich-text facet or tag field. Do not include the hash (#) prefix. Multiple tags can be specified, with 'AND' matching. 32 + // until: Filter results for posts before the indicated datetime (not inclusive). Expected to use 'sortAt' timestamp, which may not match 'createdAt'. Can be a datetime, or just an ISO date (YYYY-MM-DD). 33 + // url: Filter to posts with links (facet links or embeds) pointing to this URL. Server may apply URL normalization or fuzzy matching. 34 + // viewer: DID of the account making the request (not included for public/unauthenticated queries). Used for 'from:me' queries. 35 + func UnspeccedSearchPostsSkeleton(ctx context.Context, c *xrpc.Client, author string, cursor string, domain string, lang string, limit int64, mentions string, q string, since string, sort string, tag []string, until string, url string, viewer string) (*UnspeccedSearchPostsSkeleton_Output, error) { 26 36 var out UnspeccedSearchPostsSkeleton_Output 27 37 28 38 params := map[string]interface{}{ 29 - "cursor": cursor, 30 - "limit": limit, 31 - "q": q, 39 + "author": author, 40 + "cursor": cursor, 41 + "domain": domain, 42 + "lang": lang, 43 + "limit": limit, 44 + "mentions": mentions, 45 + "q": q, 46 + "since": since, 47 + "sort": sort, 48 + "tag": tag, 49 + "until": until, 50 + "url": url, 51 + "viewer": viewer, 32 52 } 33 53 if err := c.Do(ctx, xrpc.Query, "", "app.bsky.unspecced.searchPostsSkeleton", params, nil, &out); err != nil { 34 54 return nil, err
+14 -8
cmd/palomar/main.go
··· 329 329 identity.DefaultDirectory(), // TODO: parse PLC arg 330 330 escli, 331 331 cctx.String("es-post-index"), 332 - strings.Join(cctx.Args().Slice(), " "), 333 - 0, 334 - 20, 332 + &search.PostSearchParams{ 333 + Query: strings.Join(cctx.Args().Slice(), " "), 334 + Offset: 0, 335 + Size: 20, 336 + }, 335 337 ) 336 338 if err != nil { 337 339 return err ··· 359 361 context.Background(), 360 362 escli, 361 363 cctx.String("es-profile-index"), 362 - strings.Join(cctx.Args().Slice(), " "), 363 - 10, 364 + &search.ActorSearchParams{ 365 + Query: strings.Join(cctx.Args().Slice(), " "), 366 + Size: 10, 367 + }, 364 368 ) 365 369 if err != nil { 366 370 return err ··· 372 376 identity.DefaultDirectory(), // TODO: parse PLC arg 373 377 escli, 374 378 cctx.String("es-profile-index"), 375 - strings.Join(cctx.Args().Slice(), " "), 376 - 0, 377 - 20, 379 + &search.ActorSearchParams{ 380 + Query: strings.Join(cctx.Args().Slice(), " "), 381 + Offset: 0, 382 + Size: 20, 383 + }, 378 384 ) 379 385 if err != nil { 380 386 return err
+1
go.mod
··· 74 74 ) 75 75 76 76 require ( 77 + github.com/PuerkitoBio/purell v1.2.1 // indirect 77 78 github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect 78 79 github.com/go-redis/redis v6.15.9+incompatible // indirect 79 80 github.com/hashicorp/golang-lru v1.0.2 // indirect
+2
go.sum
··· 35 35 dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= 36 36 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 37 37 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= 38 + github.com/PuerkitoBio/purell v1.2.1 h1:QsZ4TjvwiMpat6gBCBxEQI0rcS9ehtkKtSpiUnd9N28= 39 + github.com/PuerkitoBio/purell v1.2.1/go.mod h1:ZwHcC/82TOaovDi//J/804umJFFmbOHPngi8iYYv/Eo= 38 40 github.com/RussellLuo/slidingwindow v0.0.0-20200528002341-535bb99d338b h1:5/++qT1/z812ZqBvqQt6ToRswSuPZ/B33m6xVHRzADU= 39 41 github.com/RussellLuo/slidingwindow v0.0.0-20200528002341-535bb99d338b/go.mod h1:4+EPqMRApwwE/6yo6CxiHoSnBzjRr3jsqer7frxP8y4= 40 42 github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
+149 -13
search/handlers.go
··· 76 76 }) 77 77 } 78 78 79 + params := PostSearchParams{ 80 + Query: q, 81 + // TODO: parse/validate the sort options here? 82 + Sort: e.QueryParam("sort"), 83 + Domain: e.QueryParam("domain"), 84 + URL: e.QueryParam("url"), 85 + } 86 + 87 + viewerStr := e.QueryParam("viewer") 88 + if viewerStr != "" { 89 + d, err := syntax.ParseDID(viewerStr) 90 + if err != nil { 91 + return e.JSON(400, map[string]any{ 92 + "error": "BadRequest", 93 + "message": fmt.Sprintf("invalid DID for 'viewer': %s", err), 94 + }) 95 + } 96 + params.Viewer = &d 97 + } 98 + authorStr := e.QueryParam("author") 99 + if authorStr != "" { 100 + atid, err := syntax.ParseAtIdentifier(authorStr) 101 + if err != nil { 102 + return &echo.HTTPError{ 103 + Code: 400, 104 + Message: fmt.Sprintf("invalid DID for 'author': %s", err), 105 + } 106 + } 107 + if atid.IsHandle() { 108 + ident, err := s.dir.Lookup(context.TODO(), *atid) 109 + if err != nil { 110 + return e.JSON(400, map[string]any{ 111 + "error": "BadRequest", 112 + "message": fmt.Sprintf("invalid Handle for 'author': %s", err), 113 + }) 114 + } 115 + params.Author = &ident.DID 116 + } else { 117 + d, err := atid.AsDID() 118 + if err != nil { 119 + return err 120 + } 121 + params.Author = &d 122 + } 123 + } 124 + 125 + mentionsStr := e.QueryParam("mentions") 126 + if mentionsStr != "" { 127 + atid, err := syntax.ParseAtIdentifier(mentionsStr) 128 + if err != nil { 129 + return &echo.HTTPError{ 130 + Code: 400, 131 + Message: fmt.Sprintf("invalid DID for 'mentions': %s", err), 132 + } 133 + } 134 + if atid.IsHandle() { 135 + ident, err := s.dir.Lookup(context.TODO(), *atid) 136 + if err != nil { 137 + return e.JSON(400, map[string]any{ 138 + "error": "BadRequest", 139 + "message": fmt.Sprintf("invalid Handle for 'mentions': %s", err), 140 + }) 141 + } 142 + params.Mentions = &ident.DID 143 + } else { 144 + d, err := atid.AsDID() 145 + if err != nil { 146 + return err 147 + } 148 + params.Mentions = &d 149 + } 150 + } 151 + 152 + sinceStr := e.QueryParam("since") 153 + if sinceStr != "" { 154 + dt, err := syntax.ParseDatetime(sinceStr) 155 + if err != nil { 156 + return e.JSON(400, map[string]any{ 157 + "error": "BadRequest", 158 + "message": fmt.Sprintf("invalid Datetime for 'since': %s", err), 159 + }) 160 + } 161 + params.Since = &dt 162 + } 163 + 164 + untilStr := e.QueryParam("until") 165 + if untilStr != "" { 166 + dt, err := syntax.ParseDatetime(untilStr) 167 + if err != nil { 168 + return e.JSON(400, map[string]any{ 169 + "error": "BadRequest", 170 + "message": fmt.Sprintf("invalid Datetime for 'until': %s", err), 171 + }) 172 + } 173 + params.Until = &dt 174 + } 175 + 176 + langStr := e.QueryParam("lang") 177 + if langStr != "" { 178 + l, err := syntax.ParseLanguage(langStr) 179 + if err != nil { 180 + return e.JSON(400, map[string]any{ 181 + "error": "BadRequest", 182 + "message": fmt.Sprintf("invalid Language for 'lang': %s", err), 183 + }) 184 + } 185 + params.Lang = &l 186 + } 187 + // TODO: could be multiple tag params; guess we should "bind"? 
188 + tags := e.Request().URL.Query()["tag"] 189 + if len(tags) > 0 { 190 + params.Tags = tags 191 + } 192 + 79 193 offset, limit, err := parseCursorLimit(e) 80 194 if err != nil { 81 195 span.SetAttributes(attribute.String("error", fmt.Sprintf("invalid cursor/limit: %s", err))) ··· 83 197 return err 84 198 } 85 199 200 + params.Offset = offset 201 + params.Size = limit 86 202 span.SetAttributes(attribute.Int("offset", offset), attribute.Int("limit", limit)) 87 203 88 - out, err := s.SearchPosts(ctx, q, offset, limit) 204 + out, err := s.SearchPosts(ctx, &params) 89 205 if err != nil { 90 206 span.SetAttributes(attribute.String("error", fmt.Sprintf("failed to SearchPosts: %s", err))) 91 207 span.SetStatus(codes.Error, err.Error()) ··· 106 222 q := strings.TrimSpace(e.QueryParam("q")) 107 223 if q == "" { 108 224 return e.JSON(400, map[string]any{ 109 - "error": "must pass non-empty search query", 225 + "error": "BadRequest", 226 + "message": "must pass non-empty search query", 110 227 }) 111 228 } 112 229 ··· 122 239 typeahead = true 123 240 } 124 241 242 + params := ActorSearchParams{ 243 + Query: q, 244 + Typeahead: typeahead, 245 + Offset: offset, 246 + Size: limit, 247 + } 248 + 249 + viewerStr := e.QueryParam("viewer") 250 + if viewerStr != "" { 251 + d, err := syntax.ParseDID(viewerStr) 252 + if err != nil { 253 + return e.JSON(400, map[string]any{ 254 + "error": "BadRequest", 255 + "message": fmt.Sprintf("invalid DID for 'viewer': %s", err), 256 + }) 257 + } 258 + params.Viewer = &d 259 + } 260 + 125 261 span.SetAttributes( 126 262 attribute.Int("offset", offset), 127 263 attribute.Int("limit", limit), 128 264 attribute.Bool("typeahead", typeahead), 129 265 ) 130 266 131 - out, err := s.SearchProfiles(ctx, q, typeahead, offset, limit) 267 + out, err := s.SearchProfiles(ctx, &params) 132 268 if err != nil { 133 269 span.SetAttributes(attribute.String("error", fmt.Sprintf("failed to SearchProfiles: %s", err))) 134 270 span.SetStatus(codes.Error, err.Error()) ··· 193 329 }) 194 330 } 195 331 196 - func (s *Server) SearchPosts(ctx context.Context, q string, offset, size int) (*appbsky.UnspeccedSearchPostsSkeleton_Output, error) { 332 + func (s *Server) SearchPosts(ctx context.Context, params *PostSearchParams) (*appbsky.UnspeccedSearchPostsSkeleton_Output, error) { 197 333 ctx, span := tracer.Start(ctx, "SearchPosts") 198 334 defer span.End() 199 335 200 - resp, err := DoSearchPosts(ctx, s.dir, s.escli, s.postIndex, q, offset, size) 336 + resp, err := DoSearchPosts(ctx, s.dir, s.escli, s.postIndex, params) 201 337 if err != nil { 202 338 return nil, err ··· 220 356 } 221 357 222 358 out := appbsky.UnspeccedSearchPostsSkeleton_Output{Posts: posts} 223 - if len(posts) == size && (offset+size) < 10000 { 224 - s := fmt.Sprintf("%d", offset+size) 359 + if len(posts) == params.Size && (params.Offset+params.Size) < 10000 { 360 + s := fmt.Sprintf("%d", params.Offset+params.Size) 225 361 out.Cursor = &s 226 362 } 227 363 if resp.Hits.Total.Relation == "eq" { ··· 231 367 return &out, nil 232 368 } 233 369 234 - func (s *Server) SearchProfiles(ctx context.Context, q string, typeahead bool, offset, size int) (*appbsky.UnspeccedSearchActorsSkeleton_Output, error) { 370 + func (s *Server) SearchProfiles(ctx context.Context, params *ActorSearchParams) (*appbsky.UnspeccedSearchActorsSkeleton_Output, error) { 235 371 ctx, span := tracer.Start(ctx, "SearchProfiles") 236 372 defer span.End() 237 373 238 374 var resp *EsSearchResponse 239 375 var err error 240 - if typeahead { 241 - resp, err = DoSearchProfilesTypeahead(ctx, s.escli, s.profileIndex, q, size) 376 + if params.Typeahead { 377 + resp, err = DoSearchProfilesTypeahead(ctx, s.escli, s.profileIndex, params) 242 378 } else { 243 - resp, err = DoSearchProfiles(ctx, s.dir, s.escli, s.profileIndex, q, offset, size) 379 + resp, err = DoSearchProfiles(ctx, s.dir, s.escli, s.profileIndex, params) 244 380 } 245 381 if err != nil { 246 382 return nil, err ··· 264 400 } 265 401 266 402 out := appbsky.UnspeccedSearchActorsSkeleton_Output{Actors: actors} 267 - if len(actors) == size && (offset+size) < 10000 { 268 - s := fmt.Sprintf("%d", offset+size) 403 + if len(actors) == params.Size && (params.Offset+params.Size) < 10000 { 404 + s := fmt.Sprintf("%d", params.Offset+params.Size) 269 405 out.Cursor = &s 270 406 } 271 407 if resp.Hits.Total.Relation == "eq" {
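A rough way to exercise the new handler wiring end-to-end (the host below is hypothetical; the path follows the standard /xrpc/<NSID> convention). Explicit HTTP parameters win over operators parsed out of `q`, per `PostSearchParams.Update` in query.go below:

```go
import (
	"net/http"
	"net/url"
)

func queryPalomar() (*http.Response, error) {
	q := url.Values{
		"q":      {"cats from:bob.example.com"},
		"author": {"did:plc:abc111"}, // explicit param overrides the from: operator
		"tag":    {"pets"},           // may repeat for 'AND' matching
		"since":  {"2024-01-01T00:00:00Z"},
	}
	return http.Get("https://palomar.example.com/xrpc/app.bsky.unspecced.searchPostsSkeleton?" + q.Encode())
}
```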
+92 -38
search/parse_query.go
··· 2 2 3 3 import ( 4 4 "context" 5 - "fmt" 6 5 "log/slog" 7 6 "strings" 7 + "time" 8 8 9 9 "github.com/bluesky-social/indigo/atproto/identity" 10 10 "github.com/bluesky-social/indigo/atproto/syntax" 11 11 ) 12 12 13 13 // ParseQuery takes a query string and pulls out some facet patterns ("from:handle.net") as filters 14 - func ParseQuery(ctx context.Context, dir identity.Directory, raw string) (string, []map[string]interface{}) { 15 - var filters []map[string]interface{} 14 + func ParsePostQuery(ctx context.Context, dir identity.Directory, raw string, viewer *syntax.DID) PostSearchParams { 16 15 quoted := false 17 16 parts := strings.FieldsFunc(raw, func(r rune) bool { 18 17 if r == '"' { ··· 21 20 return r == ' ' && !quoted 22 21 }) 23 22 23 + params := PostSearchParams{} 24 + 24 25 keep := make([]string, 0, len(parts)) 25 26 for _, p := range parts { 26 - p = strings.Trim(p, "\"") 27 + // pass-through quoted, either phrase or single token 28 + if strings.HasPrefix(p, "\"") { 29 + keep = append(keep, p) 30 + continue 31 + } 27 32 33 + // tags (array) 28 34 if strings.HasPrefix(p, "#") && len(p) > 1 { 29 - filters = append(filters, map[string]interface{}{ 30 - "term": map[string]interface{}{ 31 - "tag": map[string]interface{}{ 32 - "value": p[1:], 33 - "case_insensitive": true, 34 - }, 35 - }, 36 - }) 35 + params.Tags = append(params.Tags, p[1:]) 36 + continue 37 + } 38 + 39 + // handle (mention) 40 + if strings.HasPrefix(p, "@") && len(p) > 1 { 41 + handle, err := syntax.ParseHandle(p[1:]) 42 + if err != nil { 43 + keep = append(keep, p) 44 + continue 45 + } 46 + id, err := dir.LookupHandle(ctx, handle) 47 + if err != nil { 48 + if err != identity.ErrHandleNotFound { 49 + slog.Error("failed to resolve handle", "err", err) 50 + } 51 + continue 52 + } 53 + params.Mentions = &id.DID 37 54 continue 38 55 } 39 - if strings.HasPrefix(p, "did:") { 40 - filters = append(filters, map[string]interface{}{ 41 - "term": map[string]interface{}{"did": p}, 42 - }) 56 + 57 + tokParts := strings.SplitN(p, ":", 2) 58 + if len(tokParts) == 1 { 59 + keep = append(keep, p) 43 60 continue 44 61 } 45 - if strings.HasPrefix(p, "from:") && len(p) > 6 { 46 - h := p[5:] 47 - if h[0] == '@' { 48 - h = h[1:] 62 + 63 + switch tokParts[0] { 64 + case "did": 65 + // TODO: not really clear what to do here; treating like a mention doesn't really make sense? 
66 + case "from", "to", "mentions": 67 + raw := tokParts[1] 68 + if raw == "me" { 69 + if viewer != nil && tokParts[0] == "from" { 70 + params.Author = viewer 71 + } else if viewer != nil { 72 + params.Mentions = viewer 73 + } 74 + continue 75 + } 76 + if strings.HasPrefix(raw, "@") && len(raw) > 1 { 77 + raw = raw[1:] 49 78 } 50 - handle, err := syntax.ParseHandle(h) 79 + handle, err := syntax.ParseHandle(raw) 51 80 if err != nil { 52 - keep = append(keep, p) 53 81 continue 54 82 } 55 83 id, err := dir.LookupHandle(ctx, handle) ··· 59 87 } 60 88 continue 61 89 } 62 - filters = append(filters, map[string]interface{}{ 63 - "term": map[string]interface{}{"did": id.DID.String()}, 64 - }) 90 + if tokParts[0] == "from" { 91 + params.Author = &id.DID 92 + } else { 93 + params.Mentions = &id.DID 94 + } 95 + continue 96 + case "http", "https": 97 + params.URL = p 98 + continue 99 + case "domain": 100 + params.Domain = tokParts[1] 101 + continue 102 + case "lang": 103 + lang, err := syntax.ParseLanguage(tokParts[1]) 104 + if nil == err { 105 + params.Lang = &lang 106 + } 107 + continue 108 + case "since", "until": 109 + var dt syntax.Datetime 110 + // first try just date 111 + date, err := time.Parse(time.DateOnly, tokParts[1]) 112 + if nil == err { 113 + dt = syntax.Datetime(date.Format(syntax.AtprotoDatetimeLayout)) 114 + } else { 115 + // fallback to formal atproto datetime format 116 + dt, err = syntax.ParseDatetimeLenient(tokParts[1]) 117 + if err != nil { 118 + continue 119 + } 120 + } 121 + if tokParts[0] == "since" { 122 + params.Since = &dt 123 + } else { 124 + params.Until = &dt 125 + } 65 126 continue 66 127 } 67 128 ··· 70 131 71 132 out := "" 72 133 for _, p := range keep { 73 - if strings.ContainsRune(p, ' ') { 74 - if out == "" { 75 - out = fmt.Sprintf(`"%s"`, p) 76 - } else { 77 - out += " " + fmt.Sprintf(`"%s"`, p) 78 - } 134 + if out == "" { 135 + out = p 79 136 } else { 80 - if out == "" { 81 - out = p 82 - } else { 83 - out += " " + p 84 - } 137 + out += " " + p 85 138 } 86 139 } 87 - if out == "" && len(filters) >= 1 { 140 + if out == "" { 88 141 out = "*" 89 142 } 90 - return out, filters 143 + params.Query = out 144 + return params 91 145 }
+62 -35
search/parse_query_test.go
··· 14 14 ctx := context.Background() 15 15 assert := assert.New(t) 16 16 dir := identity.NewMockDirectory() 17 - dir.Insert(identity.Identity{ 17 + ident := identity.Identity{ 18 18 Handle: syntax.Handle("known.example.com"), 19 19 DID: syntax.DID("did:plc:abc222"), 20 - }) 20 + } 21 + dir.Insert(ident) 22 + 23 + var p PostSearchParams 24 + 25 + p = ParsePostQuery(ctx, &dir, "", nil) 26 + assert.Equal("*", p.Query) 27 + assert.Empty(p.Filters()) 21 28 22 - var q string 23 - var f []map[string]interface{} 29 + q1 := "some +test \"with phrase\" -ok" 30 + p = ParsePostQuery(ctx, &dir, q1, nil) 31 + assert.Equal(q1, p.Query) 32 + assert.Empty(p.Filters()) 24 33 25 - q, f = ParseQuery(ctx, &dir, "") 26 - assert.Equal("", q) 27 - assert.Empty(f) 34 + q2 := "missing from:missing.example.com" 35 + p = ParsePostQuery(ctx, &dir, q2, nil) 36 + assert.Equal("missing", p.Query) 37 + assert.Empty(p.Filters()) 28 38 29 - p1 := "some +test \"with phrase\" -ok" 30 - q, f = ParseQuery(ctx, &dir, p1) 31 - assert.Equal(p1, q) 32 - assert.Empty(f) 39 + q3 := "known from:known.example.com" 40 + p = ParsePostQuery(ctx, &dir, q3, nil) 41 + assert.Equal("known", p.Query) 42 + assert.NotNil(p.Author) 43 + if p.Author != nil { 44 + assert.Equal("did:plc:abc222", p.Author.String()) 45 + } 33 46 34 - p2 := "missing from:missing.example.com" 35 - q, f = ParseQuery(ctx, &dir, p2) 36 - assert.Equal("missing", q) 37 - assert.Empty(f) 47 + q4 := "from:known.example.com" 48 + p = ParsePostQuery(ctx, &dir, q4, nil) 49 + assert.Equal("*", p.Query) 50 + assert.Equal(1, len(p.Filters())) 38 51 39 - p3 := "known from:known.example.com" 40 - q, f = ParseQuery(ctx, &dir, p3) 41 - assert.Equal("known", q) 42 - assert.Equal(1, len(f)) 52 + q5 := `from:known.example.com "multi word phrase" coolio blorg` 53 + p = ParsePostQuery(ctx, &dir, q5, nil) 54 + assert.Equal(`"multi word phrase" coolio blorg`, p.Query) 55 + assert.NotNil(p.Author) 56 + if p.Author != nil { 57 + assert.Equal("did:plc:abc222", p.Author.String()) 58 + } 59 + assert.Equal(1, len(p.Filters())) 43 60 44 - p4 := "from:known.example.com" 45 - q, f = ParseQuery(ctx, &dir, p4) 46 - assert.Equal("*", q) 47 - assert.Equal(1, len(f)) 61 + q6 := `from:known.example.com #cool_tag some other stuff` 62 + p = ParsePostQuery(ctx, &dir, q6, nil) 63 + assert.Equal(`some other stuff`, p.Query) 64 + assert.NotNil(p.Author) 65 + if p.Author != nil { 66 + assert.Equal("did:plc:abc222", p.Author.String()) 67 + } 68 + assert.Equal([]string{"cool_tag"}, p.Tags) 69 + assert.Equal(2, len(p.Filters())) 48 70 49 - p5 := `from:known.example.com "multi word phrase" coolio blorg` 50 - q, f = ParseQuery(ctx, &dir, p5) 51 - assert.Equal(`"multi word phrase" coolio blorg`, q) 52 - assert.Equal(1, len(f)) 71 + q7 := "known from:@known.example.com" 72 + p = ParsePostQuery(ctx, &dir, q7, nil) 73 + assert.Equal("known", p.Query) 74 + assert.NotNil(p.Author) 75 + if p.Author != nil { 76 + assert.Equal("did:plc:abc222", p.Author.String()) 77 + } 78 + assert.Equal(1, len(p.Filters())) 53 79 54 - p6 := `from:known.example.com #cool_tag some other stuff` 55 - q, f = ParseQuery(ctx, &dir, p6) 56 - assert.Equal(`some other stuff`, q) 57 - assert.Equal(2, len(f)) 80 + q8 := "known from:me" 81 + p = ParsePostQuery(ctx, &dir, q8, &ident.DID) 82 + assert.Equal("known", p.Query) 83 + assert.NotNil(p.Author) 84 + if p.Author != nil { 85 + assert.Equal("did:plc:abc222", p.Author.String()) 86 + } 87 + assert.Equal(1, len(p.Filters())) 58 88 59 - p7 := "known from:@known.example.com" 60 - q, f = ParseQuery(ctx, &dir, p7) 61 - assert.Equal("known", q) 62 - assert.Equal(1, len(f)) 89 + // TODO: more parsing tests: bare handles, to:, since:, until:, URL, domain:, lang 63 90 }
+4 -2
search/post_schema.json
··· 79 79 "lang_code": { "type": "keyword", "normalizer": "default" }, 80 80 "lang_code_iso2": { "type": "keyword", "normalizer": "default" }, 81 81 "mention_did": { "type": "keyword", "normalizer": "default" }, 82 - "link_url": { "type": "keyword", "normalizer": "default" }, 83 - "embed_url": { "type": "keyword", "normalizer": "default" }, 84 82 "embed_aturi": { "type": "keyword", "normalizer": "default" }, 85 83 "reply_root_aturi": { "type": "keyword", "normalizer": "default" }, 86 84 "embed_img_count": { "type": "integer" }, ··· 88 86 "embed_img_alt_text_ja": { "type": "text", "analyzer": "textJapanese", "search_analyzer": "textJapaneseSearch", "copy_to": "everything_ja" }, 89 87 "self_label": { "type": "keyword", "normalizer": "default" }, 90 88 89 + "url": { "type": "keyword", "normalizer": "default" }, 90 + "domain": { "type": "keyword", "normalizer": "default" }, 91 91 "tag": { "type": "keyword", "normalizer": "default" }, 92 92 "emoji": { "type": "keyword", "normalizer": "caseSensitive" }, 93 + 94 + "likesFuzzy": { "type": "integer" }, 93 95 94 96 "everything": { "type": "text", "analyzer": "textIcu", "search_analyzer": "textIcuSearch" }, 95 97 "everything_ja": { "type": "text", "analyzer": "textJapanese", "search_analyzer": "textJapaneseSearch" },
+5
search/profile_schema.json
··· 52 52 "img_alt_text": { "type": "text", "analyzer": "textIcu", "search_analyzer": "textIcuSearch", "copy_to": "everything" }, 53 53 "self_label": { "type": "keyword", "normalizer": "default" }, 54 54 55 + "url": { "type": "keyword", "normalizer": "default" }, 56 + "domain": { "type": "keyword", "normalizer": "default" }, 55 57 "tag": { "type": "keyword", "normalizer": "default" }, 56 58 "emoji": { "type": "keyword", "normalizer": "caseSensitive" }, 57 59 58 60 "has_avatar": { "type": "boolean" }, 59 61 "has_banner": { "type": "boolean" }, 62 + 63 + "pagerank": { "type": "float" }, 64 + "followersFuzzy": { "type": "integer" }, 60 65 61 66 "typeahead": { "type": "search_as_you_type", "analyzer": "textIcu", "search_analyzer": "textIcuSearch" }, 62 67 "everything": { "type": "text", "analyzer": "textIcu", "search_analyzer": "textIcuSearch" }
+156 -18
search/query.go
··· 49 49 Post any `json:"post"` 50 50 } 51 51 52 + type PostSearchParams struct { 53 + Query string `json:"q"` 54 + Sort string `json:"sort"` 55 + Author *syntax.DID `json:"author"` 56 + Since *syntax.Datetime `json:"since"` 57 + Until *syntax.Datetime `json:"until"` 58 + Mentions *syntax.DID `json:"mentions"` 59 + Lang *syntax.Language `json:"lang"` 60 + Domain string `json:"domain"` 61 + URL string `json:"url"` 62 + Tags []string `json:"tag"` 63 + Viewer *syntax.DID `json:"viewer"` 64 + Offset int `json:"offset"` 65 + Size int `json:"size"` 66 + } 67 + 68 + type ActorSearchParams struct { 69 + Query string `json:"q"` 70 + Typeahead bool `json:"typeahead"` 71 + Follows []syntax.DID `json:"follows"` 72 + Viewer *syntax.DID `json:"viewer"` 73 + Offset int `json:"offset"` 74 + Size int `json:"size"` 75 + } 76 + 77 + // Merges params from another param object into this one. Intended to meld parsed query with HTTP query params, so not all functionality is supported, and priority is with the "current" object 78 + func (p *PostSearchParams) Update(other *PostSearchParams) { 79 + p.Query = other.Query 80 + if p.Author == nil { 81 + p.Author = other.Author 82 + } 83 + if p.Since == nil { 84 + p.Since = other.Since 85 + } 86 + if p.Until == nil { 87 + p.Until = other.Until 88 + } 89 + if p.Mentions == nil { 90 + p.Mentions = other.Mentions 91 + } 92 + if p.Lang == nil { 93 + p.Lang = other.Lang 94 + } 95 + if p.Domain == "" { 96 + p.Domain = other.Domain 97 + } 98 + if p.URL == "" { 99 + p.URL = other.URL 100 + } 101 + if len(p.Tags) == 0 { 102 + p.Tags = other.Tags 103 + } 104 + } 105 + 106 + // turns search params into actual elasticsearch/opensearch filter DSL 107 + func (p *PostSearchParams) Filters() []map[string]interface{} { 108 + var filters []map[string]interface{} 109 + 110 + if p.Author != nil { 111 + filters = append(filters, map[string]interface{}{ 112 + "term": map[string]interface{}{"did": map[string]interface{}{ 113 + "value": p.Author.String(), 114 + "case_insensitive": true, 115 + }}, 116 + }) 117 + } 118 + 119 + if p.Mentions != nil { 120 + filters = append(filters, map[string]interface{}{ 121 + "term": map[string]interface{}{"mention_did": map[string]interface{}{ 122 + "value": p.Mentions.String(), 123 + "case_insensitive": true, 124 + }}, 125 + }) 126 + } 127 + 128 + if p.Lang != nil { 129 + // TODO: extracting just the 2-char code would be good 130 + filters = append(filters, map[string]interface{}{ 131 + "term": map[string]interface{}{"lang_code_iso2": map[string]interface{}{ 132 + "value": p.Lang.String(), 133 + "case_insensitive": true, 134 + }}, 135 + }) 136 + } 137 + 138 + if p.Since != nil { 139 + filters = append(filters, map[string]interface{}{ 140 + "range": map[string]interface{}{ 141 + "created_at": map[string]interface{}{ 142 + "gte": p.Since.String(), 143 + }, 144 + }, 145 + }) 146 + } 147 + 148 + if p.Until != nil { 149 + filters = append(filters, map[string]interface{}{ 150 + "range": map[string]interface{}{ 151 + "created_at": map[string]interface{}{ 152 + "lt": p.Until.String(), 153 + }, 154 + }, 155 + }) 156 + } 157 + 158 + if p.URL != "" { 159 + filters = append(filters, map[string]interface{}{ 160 + "term": map[string]interface{}{"url": map[string]interface{}{ 161 + "value": NormalizeLossyURL(p.URL), 162 + "case_insensitive": true, 163 + }}, 164 + }) 165 + } 166 + 167 + if p.Domain != "" { 168 + filters = append(filters, map[string]interface{}{ 169 + "term": map[string]interface{}{"domain": map[string]interface{}{ 170 + "value": p.Domain, 171 + "case_insensitive": true, 172 + }}, 173 + }) 174 + } 175 + 176 + for _, tag := range p.Tags { 177 + filters = append(filters, map[string]interface{}{ 178 + "term": map[string]interface{}{ 179 + "tag": map[string]interface{}{ 180 + "value": tag, 181 + "case_insensitive": true, 182 + }, 183 + }, 184 + }) 185 + } 186 + 187 + return filters 188 + } 189 + 52 190 func checkParams(offset, size int) error { 53 191 if offset+size > 10000 || size > 250 || offset > 10000 || offset < 0 || size < 0 { 54 192 return fmt.Errorf("disallowed size/offset parameters") ··· 56 194 return nil 57 195 } 58 196 59 - func DoSearchPosts(ctx context.Context, dir identity.Directory, escli *es.Client, index, q string, offset, size int) (*EsSearchResponse, error) { 197 + func DoSearchPosts(ctx context.Context, dir identity.Directory, escli *es.Client, index string, params *PostSearchParams) (*EsSearchResponse, error) { 60 198 ctx, span := tracer.Start(ctx, "DoSearchPosts") 61 199 defer span.End() 62 200 63 - if err := checkParams(offset, size); err != nil { 201 + if err := checkParams(params.Offset, params.Size); err != nil { 64 202 return nil, err 65 203 } 66 - queryStr, filters := ParseQuery(ctx, dir, q) 204 + queryStringParams := ParsePostQuery(ctx, dir, params.Query, params.Viewer) 205 + params.Update(&queryStringParams) 67 206 idx := "everything" 68 - if containsJapanese(queryStr) { 207 + if containsJapanese(params.Query) { 69 208 idx = "everything_ja" 70 209 } 71 210 basic := map[string]interface{}{ 72 211 "simple_query_string": map[string]interface{}{ 73 - "query": queryStr, 212 + "query": params.Query, 74 213 "fields": []string{idx}, 75 214 "flags": "AND|NOT|OR|PHRASE|PRECEDENCE|WHITESPACE", 76 215 "default_operator": "and", ··· 78 217 "analyze_wildcard": false, 79 218 }, 80 219 } 220 + filters := params.Filters() 81 221 // filter out future posts (TODO: temporary hack) 82 222 now := syntax.DatetimeNow() 83 223 filters = append(filters, map[string]interface{}{ ··· 99 239 "order": "desc", 100 240 }, 101 241 }, 102 - "size": size, 103 - "from": offset, 242 + "size": params.Size, 243 + "from": params.Offset, 104 244 } 105 245 106 246 return doSearch(ctx, escli, index, query) 107 247 } 108 248 109 - func DoSearchProfiles(ctx context.Context, dir identity.Directory, escli *es.Client, index, q string, offset, size int) (*EsSearchResponse, error) { 249 + func DoSearchProfiles(ctx context.Context, dir identity.Directory, escli *es.Client, index string, params *ActorSearchParams) (*EsSearchResponse, error) { 110 250 ctx, span := tracer.Start(ctx, "DoSearchProfiles") 111 251 defer span.End() 112 252 113 - if err := checkParams(offset, size); err != nil { 253 + if err := checkParams(params.Offset, params.Size); err != nil { 114 254 return nil, err 115 255 } 116 256 117 - queryStr, filters := ParseQuery(ctx, dir, q) 118 257 basic := map[string]interface{}{ 119 258 "simple_query_string": map[string]interface{}{ 120 - "query": queryStr, 259 + "query": params.Query, 121 260 "fields": []string{"everything"}, 122 261 "flags": "AND|NOT|OR|PHRASE|PRECEDENCE|WHITESPACE", 123 262 "default_operator": "and", ··· 135 274 map[string]interface{}{"term": map[string]interface{}{"has_banner": true}}, 136 275 }, 137 276 "minimum_should_match": 0, 138 - "filter": filters, 139 277 "boost": 0.5, 140 278 }, 141 279 }, 142 - "size": size, 143 - "from": offset, 280 + "size": params.Size, 281 + "from": params.Offset, 144 282 } 145 283 146 284 return doSearch(ctx, escli, index, query) 147 285 } 148 286 149 - func DoSearchProfilesTypeahead(ctx context.Context, escli *es.Client, index, q string, size int) (*EsSearchResponse, error) { 287 + func DoSearchProfilesTypeahead(ctx context.Context, escli *es.Client, index string, params *ActorSearchParams) (*EsSearchResponse, error) { 150 288 ctx, span := tracer.Start(ctx, "DoSearchProfilesTypeahead") 151 289 defer span.End() 152 290 153 - if err := checkParams(0, size); err != nil { 291 + if err := checkParams(0, params.Size); err != nil { 154 292 return nil, err 155 293 } 156 294 157 295 query := map[string]interface{}{ 158 296 "query": map[string]interface{}{ 159 297 "multi_match": map[string]interface{}{ 160 - "query": q, 298 + "query": params.Query, 161 299 "type": "bool_prefix", 162 300 "operator": "and", 163 301 "fields": []string{ ··· 169 307 }, 170 308 }, 171 309 }, 172 - "size": size, 310 + "size": params.Size, 173 311 } 174 312 175 313 return doSearch(ctx, escli, index, query)
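To make the merge-and-translate flow concrete, here is roughly what `DoSearchPosts` now does internally (a sketch assuming `bob.example.com` resolves in the directory):

```go
// HTTP-level params arrive first; operators embedded in q are parsed second.
params := &PostSearchParams{Query: "cats from:bob.example.com", Tags: []string{"pets"}}
parsed := ParsePostQuery(ctx, dir, params.Query, params.Viewer)
params.Update(&parsed)
// params.Query == "cats"; params.Author is bob's DID (from the from: operator);
// params.Tags stays ["pets"], since the explicit HTTP value has priority.
// params.Filters() then yields term filters such as:
//   {"term": {"did": {"value": "did:plc:...", "case_insensitive": true}}}
//   {"term": {"tag": {"value": "pets", "case_insensitive": true}}}
```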
+208
search/query_test.go
··· 200 200 } 201 201 assert.Equal(1, len(res.Hits.Hits)) 202 202 } 203 + 204 + func TestParsedQuery(t *testing.T) { 205 + assert := assert.New(t) 206 + ctx := context.Background() 207 + escli := testEsClient(t) 208 + dir := identity.NewMockDirectory() 209 + srv := testServer(ctx, t, escli, &dir) 210 + ident := identity.Identity{ 211 + DID: syntax.DID("did:plc:abc111"), 212 + Handle: syntax.Handle("handle.example.com"), 213 + } 214 + other := identity.Identity{ 215 + DID: syntax.DID("did:plc:abc222"), 216 + Handle: syntax.Handle("other.example.com"), 217 + } 218 + dir.Insert(ident) 219 + dir.Insert(other) 220 + 221 + res, err := DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "english", Size: 20}) 222 + if err != nil { 223 + t.Fatal(err) 224 + } 225 + assert.Equal(0, len(res.Hits.Hits)) 226 + 227 + p1 := appbsky.FeedPost{Text: "basic english post", CreatedAt: "2024-01-02T03:04:05.006Z"} 228 + assert.NoError(srv.indexPost(ctx, &ident, &p1, "app.bsky.feed.post/3kpnillluoh2y", cid.Undef)) 229 + p2 := appbsky.FeedPost{Text: "another english post", CreatedAt: "2024-01-02T03:04:05.006Z"} 230 + assert.NoError(srv.indexPost(ctx, &ident, &p2, "app.bsky.feed.post/3kpnilllu2222", cid.Undef)) 231 + p3 := appbsky.FeedPost{ 232 + Text: "#cat post with hashtag", 233 + CreatedAt: "2024-01-02T03:04:05.006Z", 234 + Facets: []*appbsky.RichtextFacet{ 235 + &appbsky.RichtextFacet{ 236 + Features: []*appbsky.RichtextFacet_Features_Elem{ 237 + &appbsky.RichtextFacet_Features_Elem{ 238 + RichtextFacet_Tag: &appbsky.RichtextFacet_Tag{ 239 + Tag: "trick", 240 + }, 241 + }, 242 + }, 243 + Index: &appbsky.RichtextFacet_ByteSlice{ 244 + ByteStart: 0, 245 + ByteEnd: 4, 246 + }, 247 + }, 248 + }, 249 + } 250 + assert.NoError(srv.indexPost(ctx, &ident, &p3, "app.bsky.feed.post/3kpnilllu3333", cid.Undef)) 251 + p4 := appbsky.FeedPost{ 252 + Text: "@other.example.com post with mention", 253 + CreatedAt: "2024-01-02T03:04:05.006Z", 254 + Facets: []*appbsky.RichtextFacet{ 255 + &appbsky.RichtextFacet{ 256 + Features: []*appbsky.RichtextFacet_Features_Elem{ 257 + &appbsky.RichtextFacet_Features_Elem{ 258 + RichtextFacet_Mention: &appbsky.RichtextFacet_Mention{ 259 + Did: "did:plc:abc222", 260 + }, 261 + }, 262 + }, 263 + Index: &appbsky.RichtextFacet_ByteSlice{ 264 + ByteStart: 0, 265 + ByteEnd: 18, 266 + }, 267 + }, 268 + }, 269 + } 270 + assert.NoError(srv.indexPost(ctx, &ident, &p4, "app.bsky.feed.post/3kpnilllu4444", cid.Undef)) 271 + p5 := appbsky.FeedPost{ 272 + Text: "https://bsky.app... post with hashtag #cat", 273 + CreatedAt: "2024-01-02T03:04:05.006Z", 274 + Facets: []*appbsky.RichtextFacet{ 275 + &appbsky.RichtextFacet{ 276 + Features: []*appbsky.RichtextFacet_Features_Elem{ 277 + &appbsky.RichtextFacet_Features_Elem{ 278 + RichtextFacet_Link: &appbsky.RichtextFacet_Link{ 279 + Uri: "htTPS://www.en.wikipedia.org/wiki/CBOR?q=3&a=1&utm_campaign=123", 280 + }, 281 + }, 282 + }, 283 + Index: &appbsky.RichtextFacet_ByteSlice{ 284 + ByteStart: 0, 285 + ByteEnd: 19, 286 + }, 287 + }, 288 + }, 289 + } 290 + assert.NoError(srv.indexPost(ctx, &ident, &p5, "app.bsky.feed.post/3kpnilllu5555", cid.Undef)) 291 + p6 := appbsky.FeedPost{ 292 + Text: "post with lang (deutsch)", 293 + CreatedAt: "2024-01-02T03:04:05.006Z", 294 + Langs: []string{"ja", "de-DE"}, 295 + } 296 + assert.NoError(srv.indexPost(ctx, &ident, &p6, "app.bsky.feed.post/3kpnilllu6666", cid.Undef)) 297 + p7 := appbsky.FeedPost{Text: "post with old date", CreatedAt: "2020-05-03T03:04:05.006Z"} 298 + assert.NoError(srv.indexPost(ctx, &ident, &p7, "app.bsky.feed.post/3kpnilllu7777", cid.Undef)) 299 + 300 + _, err = srv.escli.Indices.Refresh() 301 + assert.NoError(err) 302 + 303 + // expect all to be indexed 304 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "*", Size: 20}) 305 + if err != nil { 306 + t.Fatal(err) 307 + } 308 + assert.Equal(7, len(res.Hits.Hits)) 309 + 310 + // check that english matches both 311 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "english", Size: 20}) 312 + if err != nil { 313 + t.Fatal(err) 314 + } 315 + assert.Equal(2, len(res.Hits.Hits)) 316 + 317 + // phrase only matches one 318 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "\"basic english\"", Size: 20}) 319 + if err != nil { 320 + t.Fatal(err) 321 + } 322 + assert.Equal(1, len(res.Hits.Hits)) 323 + 324 + // posts-by 325 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "from:handle.example.com", Size: 20}) 326 + if err != nil { 327 + t.Fatal(err) 328 + } 329 + assert.Equal(7, len(res.Hits.Hits)) 330 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "from:@handle.example.com", Size: 20}) 331 + if err != nil { 332 + t.Fatal(err) 333 + } 334 + assert.Equal(7, len(res.Hits.Hits)) 335 + 336 + // hashtag query 337 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "post #trick", Size: 20}) 338 + if err != nil { 339 + t.Fatal(err) 340 + } 341 + assert.Equal(1, len(res.Hits.Hits)) 342 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "post #Trick", Size: 20}) 343 + if err != nil { 344 + t.Fatal(err) 345 + } 346 + assert.Equal(1, len(res.Hits.Hits)) 347 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "post #trick #allMustMatch", Size: 20}) 348 + if err != nil { 349 + t.Fatal(err) 350 + } 351 + assert.Equal(0, len(res.Hits.Hits)) 352 + 353 + // mention query 354 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "@other.example.com", Size: 20}) 355 + if err != nil { 356 + t.Fatal(err) 357 + } 358 + assert.Equal(1, len(res.Hits.Hits)) 359 + 360 + // URL and domain queries 361 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "https://en.wikipedia.org/wiki/CBOR?a=1&q=3", Size: 20}) 362 + if err != nil { 363 + t.Fatal(err) 364 + } 365 + assert.Equal(1, len(res.Hits.Hits)) 366 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "\"https://en.wikipedia.org/wiki/CBOR?a=1&q=3\"", Size: 20}) 367 + if err != nil { 368 + t.Fatal(err) 369 + } 370 + assert.Equal(0, len(res.Hits.Hits)) 371 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "https://en.wikipedia.org/wiki/CBOR", Size: 20}) 372 + if err != nil { 373 + t.Fatal(err) 374 + } 375 + assert.Equal(0, len(res.Hits.Hits)) 376 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "domain:en.wikipedia.org", Size: 20}) 377 + if err != nil { 378 + t.Fatal(err) 379 + } 380 + assert.Equal(1, len(res.Hits.Hits)) 381 + 382 + // lang filter 383 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "lang:de", Size: 20}) 384 + if err != nil { 385 + t.Fatal(err) 386 + } 387 + assert.Equal(1, len(res.Hits.Hits)) 388 + 389 + // date range filters 390 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "since:2023-01-01T00:00:00Z", Size: 20}) 391 + if err != nil { 392 + t.Fatal(err) 393 + } 394 + assert.Equal(6, len(res.Hits.Hits)) 395 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "since:2023-01-01", Size: 20}) 396 + if err != nil { 397 + t.Fatal(err) 398 + } 399 + assert.Equal(6, len(res.Hits.Hits)) 400 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "until:2023-01-01", Size: 20}) 401 + if err != nil { 402 + t.Fatal(err) 403 + } 404 + assert.Equal(1, len(res.Hits.Hits)) 405 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, &PostSearchParams{Query: "until:asdf", Size: 20}) 406 + if err != nil { 407 + t.Fatal(err) 408 + } 409 + assert.Equal(7, len(res.Hits.Hits)) 410 + }
+6 -4
search/testdata/transform-post-fixtures.json
··· 11 11 "embed": { 12 12 "$type": "app.bsky.embed.external", 13 13 "external": { 14 - "uri": "https://bsky.app", 14 + "uri": "https://www.bsky.app:443/index.html", 15 15 "title": "Bluesky Social", 16 16 "description": "See what's next.", 17 17 "thumb": { ··· 34 34 "record_cid": "bafyreibjifzpqj6o6wcq3hejh7y4z4z2vmiklkvykc57tw3pcbx3kxifpm", 35 35 "created_at": "2023-08-07T05:46:14.423045Z", 36 36 "text": "post which embeds an external URL as a card", 37 - "embed_url": "https://bsky.app", 37 + "url": ["https://bsky.app"], 38 + "domain": ["bsky.app"], 38 39 "embed_img_count": 0 39 40 } 40 41 }, ··· 69 70 "features": [ 70 71 { 71 72 "$type": "app.bsky.richtext.facet#link", 72 - "uri": "https://en.wikipedia.org/wiki/CBOR" 73 + "uri": "https://en.wikipedia.org/wiki/CBOR?utm_campaign=123" 73 74 } 74 75 ] 75 76 }, ··· 123 124 "created_at": "2023-08-07T05:46:14.423045Z", 124 125 "text": "longer example with #some #hashtags, emoji \u2620 \ud83d\ude42 \ud83c\udf85\ud83c\udfff, flags \ud83c\uddf8\ud83c\udde8 ", 125 126 "reply_root_aturi": "at://did:plc:u5cwb2mwiv2bfq53cjufe6yn/app.bsky.feed.post/3k43tv4rft22g", 126 - "link_url": [ "https://en.wikipedia.org/wiki/CBOR" ], 127 127 "mention_did": [ "did:plc:ewvi7nxzyoun6zhxrhs64oiz" ], 128 128 "embed_aturi": "at://did:plc:u5cwb2mwiv2bfq53cjufe6yn/app.bsky.feed.post/3k44deefqdk2g", 129 129 "lang_code": ["th", "en-US"], 130 130 "lang_code_iso2": ["th", "en"], 131 131 "self_label": ["nudity"], 132 + "url": [ "https://en.wikipedia.org/wiki/CBOR" ], 133 + "domain": [ "en.wikipedia.org" ], 132 134 "tag": ["some", "thing"], 133 135 "emoji": ["\u2620", "\ud83d\ude42", "\ud83c\udf85\ud83c\udfff", "\ud83c\uddf8\ud83c\udde8"], 134 136 "embed_img_count": 0
+20 -8
search/transform.go
··· 2 2 3 3 import ( 4 4 "log/slog" 5 + "net/url" 5 6 "strings" 6 7 "time" 7 8 ··· 21 22 Description *string `json:"description,omitempty"` 22 23 ImgAltText []string `json:"img_alt_text,omitempty"` 23 24 SelfLabel []string `json:"self_label,omitempty"` 25 + URL []string `json:"url,omitempty"` 26 + Domain []string `json:"domain,omitempty"` 24 27 Tag []string `json:"tag,omitempty"` 25 28 Emoji []string `json:"emoji,omitempty"` 26 29 HasAvatar bool `json:"has_avatar"` ··· 38 41 LangCode []string `json:"lang_code,omitempty"` 39 42 LangCodeIso2 []string `json:"lang_code_iso2,omitempty"` 40 43 MentionDID []string `json:"mention_did,omitempty"` 41 - LinkURL []string `json:"link_url,omitempty"` 42 - EmbedURL *string `json:"embed_url,omitempty"` 43 44 EmbedATURI *string `json:"embed_aturi,omitempty"` 44 45 ReplyRootATURI *string `json:"reply_root_aturi,omitempty"` 45 46 EmbedImgCount int `json:"embed_img_count"` 46 47 EmbedImgAltText []string `json:"embed_img_alt_text,omitempty"` 47 48 EmbedImgAltTextJA []string `json:"embed_img_alt_text_ja,omitempty"` 48 49 SelfLabel []string `json:"self_label,omitempty"` 50 + URL []string `json:"url,omitempty"` 51 + Domain []string `json:"domain,omitempty"` 49 52 Tag []string `json:"tag,omitempty"` 50 53 Emoji []string `json:"emoji,omitempty"` 51 54 } ··· 117 120 } 118 121 } 119 122 var mentionDIDs []string 120 - var linkURLs []string 123 + var urls []string 121 124 for _, facet := range post.Facets { 122 125 for _, feat := range facet.Features { 123 126 if feat.RichtextFacet_Mention != nil { 124 127 mentionDIDs = append(mentionDIDs, feat.RichtextFacet_Mention.Did) 125 128 } 126 129 if feat.RichtextFacet_Link != nil { 127 - linkURLs = append(linkURLs, feat.RichtextFacet_Link.Uri) 130 + urls = append(urls, feat.RichtextFacet_Link.Uri) 128 131 } 129 132 } 130 133 } ··· 132 135 if post.Reply != nil { 133 136 replyRootATURI = &(post.Reply.Root.Uri) 134 137 } 135 - var embedURL *string 136 138 if post.Embed != nil && post.Embed.EmbedExternal != nil { 137 - embedURL = &post.Embed.EmbedExternal.External.Uri 139 + urls = append(urls, post.Embed.EmbedExternal.External.Uri) 138 140 } 139 141 var embedATURI *string 140 142 if post.Embed != nil && post.Embed.EmbedRecord != nil { ··· 164 166 } 165 167 } 166 168 169 + var domains []string 170 + for i, raw := range urls { 171 + clean := NormalizeLossyURL(raw) 172 + urls[i] = clean 173 + u, err := url.Parse(clean) 174 + if nil == err { 175 + domains = append(domains, u.Hostname()) 176 + } 177 + } 178 + 167 179 doc := PostDoc{ 168 180 DocIndexTs: syntax.DatetimeNow().String(), 169 181 DID: ident.DID.String(), ··· 173 185 LangCode: post.Langs, 174 186 LangCodeIso2: langCodeIso2, 175 187 MentionDID: mentionDIDs, 176 - LinkURL: linkURLs, 177 - EmbedURL: embedURL, 178 188 EmbedATURI: embedATURI, 179 189 ReplyRootATURI: replyRootATURI, 180 190 EmbedImgCount: embedImgCount, 181 191 EmbedImgAltText: embedImgAltText, 182 192 EmbedImgAltTextJA: embedImgAltTextJA, 183 193 SelfLabel: selfLabels, 194 + URL: urls, 195 + Domain: domains, 184 196 Tag: parsePostTags(post), 185 197 Emoji: parseEmojis(post.Text), 186 198 }
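For the facet link used in the tests and fixtures above, the extraction pipeline produces roughly the following (a worked example; the values follow from the purell flags in url.go below):

```go
clean := NormalizeLossyURL("htTPS://www.en.wikipedia.org/wiki/CBOR?q=3&a=1&utm_campaign=123")
// clean == "https://en.wikipedia.org/wiki/CBOR?a=1&q=3"
// (lowercased, "www." stripped, query sorted, utm_campaign removed)
u, err := url.Parse(clean)
if err == nil {
	fmt.Println(u.Hostname()) // "en.wikipedia.org", indexed in the "domain" field
}
```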
+61
search/url.go
··· 1 + package search 2 + 3 + import ( 4 + "net/url" 5 + 6 + "github.com/PuerkitoBio/purell" 7 + ) 8 + 9 + var trackingParams = []string{ 10 + "__s", 11 + "_ga", 12 + "campaign_id", 13 + "ceid", 14 + "emci", 15 + "emdi", 16 + "fbclid", 17 + "gclid", 18 + "hootPostID", 19 + "mc_eid", 20 + "mkclid", 21 + "mkt_tok", 22 + "msclkid", 23 + "pk_campaign", 24 + "pk_kwd", 25 + "sessionid", 26 + "sourceid", 27 + "utm_campaign", 28 + "utm_content", 29 + "utm_id", 30 + "utm_medium", 31 + "utm_source", 32 + "utm_term", 33 + "xpid", 34 + } 35 + 36 + // aggressively normalizes URL, for search indexing and matching. it is possible the URL won't be directly functional after this normalization 37 + func NormalizeLossyURL(raw string) string { 38 + clean, err := purell.NormalizeURLString(raw, purell.FlagsUsuallySafeGreedy|purell.FlagRemoveDirectoryIndex|purell.FlagRemoveFragment|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveWWW|purell.FlagSortQuery) 39 + if err != nil { 40 + return raw 41 + } 42 + 43 + // remove tracking params 44 + u, err := url.Parse(clean) 45 + if err != nil { 46 + return clean 47 + } 48 + if u.RawQuery == "" { 49 + return clean 50 + } 51 + params := u.Query() 52 + 53 + // there is probably a more efficient way to do this 54 + for _, p := range trackingParams { 55 + if params.Has(p) { 56 + params.Del(p) 57 + } 58 + } 59 + u.RawQuery = params.Encode() 60 + return u.String() 61 + }
+33
search/url_test.go
··· 1 + package search 2 + 3 + import ( 4 + "testing" 5 + 6 + "github.com/stretchr/testify/assert" 7 + ) 8 + 9 + func TestNormalizeLossyURL(t *testing.T) { 10 + assert := assert.New(t) 11 + 12 + fixtures := []struct { 13 + orig string 14 + clean string 15 + }{ 16 + {orig: "", clean: ""}, 17 + {orig: "asdf", clean: "asdf"}, 18 + {orig: "HTTP://bSky.app:80/index.html", clean: "http://bsky.app"}, 19 + {orig: "https://example.com/thing?c=123&utm_campaign=blah&a=first", clean: "https://example.com/thing?a=first&c=123"}, 20 + {orig: "https://example.com/thing?c=123&utm_campaign=blah&a=first", clean: "https://example.com/thing?a=first&c=123"}, 21 + {orig: "http://example.com/foo//bar.html", clean: "http://example.com/foo/bar.html"}, 22 + {orig: "http://example.com/bar.html#section1", clean: "http://example.com/bar.html"}, 23 + {orig: "http://example.com/foo/", clean: "http://example.com/foo"}, 24 + {orig: "http://example.com/", clean: "http://example.com"}, 25 + {orig: "http://example.com/%7Efoo", clean: "http://example.com/~foo"}, 26 + {orig: "http://example.com/foo/./bar/baz/../qux", clean: "http://example.com/foo/bar/qux"}, 27 + {orig: "http://www.example.com/", clean: "http://example.com"}, 28 + } 29 + 30 + for _, fix := range fixtures { 31 + assert.Equal(fix.clean, NormalizeLossyURL(fix.orig)) 32 + } 33 + }