[package]
name = "trap"
version = "0.0.0"
edition = "2024"
publish = false

[dependencies]
anyhow = "1.0.100"
clap = { version = "4.5.53", features = ["derive", "env"] }
data-encoding = "2.9.0"
futures-util = { version = "0.3.31", features = ["sink"] }
reqwest = { version = "0.12.26", features = ["json"] }
serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.145"
sqlx = { version = "0.8.6", features = ["runtime-tokio", "tls-native-tls", "postgres", "time", "json", "macros", "derive"] }
thiserror = "2.0.17"
tokio = { version = "1.47.1", features = ["io-util", "macros", "net", "process", "signal", "rt-multi-thread"] }
tokio-tungstenite = { version = "0.28.0", features = ["native-tls"] }
tracing = "0.1.43"
tracing-subscriber = { version = "0.3.20", features = ["env-filter"] }
url = "2.5.7"

[profile.release]
panic = "abort"
lto = "fat"
strip = true
+102
README.md
# Trap

Traverse records received from a Tap service and dump them into a PostgreSQL database.

## Example Usage

In this example we'll *tap into* (😉) everything in the "sh.tangled.*" NSID starting from the @tangled.org repo (ATproto repo, not git repo).

1. Set up a PostgreSQL cluster and create a database

...

Let's assume you've created a DB called `trap_tangled`.

2. Tap

```bash
TAP_COLLECTION_FILTERS="sh.tangled.*" TAP_BIND=127.0.0.1:2480 tap run
```

`trap` will collect *any* records the Tap service sends. You can control this with the `TAP_COLLECTION_FILTERS` variable.

3. Trap

Run `trap`, seeding from the DID of @tangled.org:

```bash
RUST_LOG=debug,sqlx=warn INDEX_DATABASE_URL=postgresql:///trap_tangled trap --seed did:plc:wshs7t2adsemcrrd4snkeqli
```

`trap` will submit the seed DIDs to the Tap service. Each record returned by Tap will be scanned, and any DIDs found will also be added to the Tap service.

4. Wait.

*Eventually*, and I mean *eventually*, you'll end up with a table named `record` filled with every "sh.tangled.*" record reachable from the @tangled.org repo.

5. Perform *Data Science*

Time to jump into `psql`!

The `record_by_collection` view counts how many records have been indexed for each collection.

```
trap_tangled=# select * from record_by_collection ;
          collection           | count
-------------------------------+-------
 sh.tangled.feed.star          |  5350
 sh.tangled.spindle.member     |  4821
 sh.tangled.graph.follow       |  4425
 sh.tangled.knot.member        |  3607
 sh.tangled.repo               |  2618
 sh.tangled.repo.pull          |  1785
 sh.tangled.repo.issue         |  1390
 sh.tangled.repo.issue.comment |  1386
 sh.tangled.publicKey          |  1298
 sh.tangled.repo.pull.comment  |  1127
 sh.tangled.actor.profile      |   713
 sh.tangled.label.op           |   628
 sh.tangled.feed.reaction      |   479
 sh.tangled.string             |   364
 sh.tangled.repo.issue.state   |   320
 sh.tangled.knot               |   158
 sh.tangled.repo.collaborator  |   146
 sh.tangled.label.definition   |   106
 sh.tangled.repo.artifact      |    69
 sh.tangled.spindle            |    51
(20 rows)

trap_tangled=#
```

Analyse SSH public-key statistics:

```
trap_tangled=# SELECT split_part(data->>'key', ' ', 1) AS key_type,
                      count(*) AS count
                 FROM record
                WHERE collection = 'sh.tangled.publicKey'
             GROUP BY (split_part(data->>'key', ' ', 1))
             ORDER BY (count(*)) DESC;
              key_type              | count
------------------------------------+-------
 ssh-ed25519                        |   989
 ssh-rsa                            |   239
 sk-ssh-ed25519@openssh.com         |    44
 ecdsa-sha2-nistp256                |    22
 sh-ed25519                         |     2
 sk-ecdsa-sha2-nistp256@openssh.com |     1
 ecdsa-sha2-nistp521                |     1
(7 rows)

trap_tangled=#
```

Fascinating!

## Future work

????

Suggestions and PRs welcome!
+10
migrations/20251216142421_init.down.sql
-- Revert 20251216142421_init.
-- IF EXISTS on every statement keeps the down migration idempotent,
-- mirroring the IF NOT EXISTS style of the up migration: re-running it
-- (or running it against a partially-applied schema) must not error.
-- Objects are dropped in reverse dependency order: the view reads from
-- record, and identity rows use the identity_status type.
DROP VIEW IF EXISTS record_by_collection;

DROP INDEX IF EXISTS identity_handle_idx;
DROP TABLE IF EXISTS identity;
DROP TYPE IF EXISTS identity_status;

DROP TABLE IF EXISTS deleted_record;

DROP INDEX IF EXISTS record_collection_idx;
DROP TABLE IF EXISTS record;
+48
migrations/20251216142421_init.up.sql
-- 20251216142421_init: base schema for records trapped from a Tap service.

-- One row per record, keyed by the components of its AT-URI.
CREATE TABLE IF NOT EXISTS record (
    did text NOT NULL,          -- repository DID the record belongs to
    collection text NOT NULL,   -- NSID, e.g. 'sh.tangled.repo'
    rkey text NOT NULL,         -- record key within the collection
    rev text NOT NULL,          -- repo revision the record was seen at
    cid text NOT NULL,          -- content identifier (hash) of the record
    live boolean NOT NULL,      -- NOTE(review): presumably false once superseded upstream — confirm against writer
    data jsonb NOT NULL,        -- raw record body

    PRIMARY KEY (did, collection, rkey)
);

-- Named explicitly (rather than relying on Postgres auto-naming) so the
-- down migration's DROP INDEX record_collection_idx cannot drift out of
-- sync; IF NOT EXISTS keeps the migration idempotent like the tables.
CREATE INDEX IF NOT EXISTS record_collection_idx ON record (collection);

-- Graveyard for deleted records; same columns/defaults as record.
CREATE TABLE IF NOT EXISTS deleted_record (
    LIKE record
    INCLUDING all
);

-- CREATE TYPE has no IF NOT EXISTS clause, so swallow duplicate_object
-- to keep the migration re-runnable.
DO $$ BEGIN
    CREATE TYPE identity_status AS ENUM (
        'active',
        'takendown',
        'suspended',
        'deactivated',
        'deleted'
    );
EXCEPTION
    WHEN duplicate_object THEN null;
END $$;

-- Latest known DID -> handle mapping and account state.
CREATE TABLE IF NOT EXISTS identity (
    did text NOT NULL,
    handle text NOT NULL,
    active boolean NOT NULL,
    status identity_status NOT NULL,

    PRIMARY KEY (did)
);

-- Named explicitly to match the down migration's DROP INDEX.
CREATE INDEX IF NOT EXISTS identity_handle_idx ON identity (handle);

-- Record counts per collection, largest first.
CREATE OR REPLACE VIEW record_by_collection AS
    SELECT collection,
           count(*) AS count
      FROM record
     GROUP BY collection
     ORDER BY count(*) DESC;
+7
queries/upsert_record.sql
-- Upsert a record, keeping whichever row carries the newer revision.
-- The conflict target is the primary-key column list rather than
-- ON CONSTRAINT record_pkey: equivalent (Postgres infers the PK index),
-- but it does not break if the constraint is ever renamed.
-- The WHERE guard makes stale or out-of-order deliveries a no-op.
-- NOTE(review): rev is compared as text — assumes revs (ATproto TIDs)
-- sort lexicographically in chronological order; confirm.
INSERT INTO record (did, collection, rkey, rev, cid, live, data)
    VALUES ($1, $2, $3, $4, $5, $6, $7)
    ON CONFLICT (did, collection, rkey)
    DO UPDATE
        SET (rev, cid, live, data) = (EXCLUDED.rev, EXCLUDED.cid, EXCLUDED.live, EXCLUDED.data)
        WHERE EXCLUDED.rev > record.rev;