Archive of the USPol Labeler's automatic labeling script. Out-of-date. Must run alongside Ozone. DO NOT OPEN ISSUES OR PULLS -- THEY WILL BE IGNORED/CLOSED
1import { AppBskyActorGetProfile, AppBskyEmbedExternal, AppBskyEmbedImages, AppBskyEmbedVideo, AppBskyFeedDefs, AppBskyFeedPost } from "@atproto/api";
2import { ESCALATION_THRESHOLD, ESCALATE_AND_LABEL_THRESHOLD, keywordList, LABEL_THRESHOLD, overridesList, redis, VERBOSE, CRAWL_THREAD, DRY_RUN, MISCELLANEOUS_LABEL, NO_ALT_TEXT_LABEL } from "../main.ts";
3import { Agent } from "@atproto/api";
4import { retry } from "@atproto/common";
5import { CredentialSession } from "@atproto/api";
6import { hasAlreadyHandled } from "./redis.ts";
7
/**
 * Points added to every category when a post matches two or more categories
 * (the check in `auditPostForKeywords` is `>= 2`, and the pseudo-category
 * "bonus-points-only" counts towards that total).
 *
 * If a "political" post is detected, it's more likely that other keywords
 * are related to their intended subjects. For example, a post that says
 * "vote trump" should definitely be labeled under the Trump label, and maybe
 * the Election label. But either separately could refer to a "trump card" or
 * to a PTA or corporate election, or benefits election, which we don't care about.
 *
 * When the post also collected a positive "bonus-points-only" score, that
 * score is added to each category instead of this flat value.
 * This value can be tweaked as needed.
 */
export const CATEGORY_SHARE_POINTS = 25;

/**
 * The "bonus points" added to every category if the post came to our attention via a report.
 * If a maybe-political post is reported, it's much more likely that it is indeed political.
 * If it still doesn't meet criteria, the report is not dismissed.
 */
export const REPORT_BONUS_POINTS = 25;

/**
 * If this post was reached because of crawling, that means another post in the thread matched
 * a keyword. Therefore, this one is more likely to be political, and therefore it gets bonus
 * points. Only applied when both `crawled` and `rootLabeled` are set in the audit options.
 */
export const CRAWLED_BONUS_POINTS = 10;
26
/**
 * Agent bound to the public Bluesky API host. It is constructed with no
 * third session argument (`undefined`) and is used for thread lookups while crawling.
 */
const publicApiSession = new CredentialSession(
  new URL("https://public.api.bsky.app"),
  fetch,
  undefined,
);
export const publicAgent = new Agent(publicApiSession);
28
/**
 * Backslash-escapes every regular-expression metacharacter in `string`, so the
 * result can be embedded in a `RegExp` source and match the input literally.
 */
function escapeRegExp(string: string) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  // Prefix each metacharacter that was found with a single backslash.
  return string.replace(metaCharacters, (hit) => `\\${hit}`);
}
32
/**
 * Matches one word boundary: start of text, end of text, or a single separator
 * character. Any whitespace (`\s`) counts as a separator, so words split by a
 * newline or tab are tokenised the same way as words split by a space.
 */
const WORD_SEPARATOR_PATTERN = /(?:^|[\s",!.?:_~#+=$%&*)(\[\]{}<>'’-]|$)/;
/**
 * Source text of WORD_SEPARATOR_PATTERN, for concatenating into phrase regexes.
 * `.source` is used instead of slicing `toString()` so that adding a flag to
 * the pattern later cannot corrupt the string.
 */
const WSP_STRING = WORD_SEPARATOR_PATTERN.source;
35
/** Optional switches accepted by `auditPostForKeywords`. */
export interface AuditPostOpts {
  /** Set when the post is being audited because it was reported. */
  isReport?: boolean;
  /** Did you crawl from another post to find this one? */
  crawled?: boolean;
  /**
   * Used with [crawled]; indicates whether or not the root post got a label.
   * The bonus points are not applied if no label was applied to the initial post.
   */
  rootLabeled?: boolean;
}

/** Fallback for every option the caller leaves out: all flags off. */
const defaultAuditOpts: Required<AuditPostOpts> = {
  isReport: false,
  crawled: false,
  rootLabeled: false,
};
49
/**
 * Scores one post against `keywordList` and emits the matching Ozone
 * moderation events through `agent`.
 *
 * Flow:
 *  1. Skip (optionally escalating a report) when the author is marked `skip`
 *     in `overridesList` or the post URI was already handled.
 *  2. Collect text from the post body, external-embed title/description,
 *     image/video alt text, `bridgyOriginalText` and tags; missing or
 *     placeholder alt text sets `NO_ALT_TEXT_LABEL` to `LABEL_THRESHOLD`.
 *  3. For every keyword entry, add `entry.score` to `entry.label` on the first
 *     keyword that matches (each entry contributes at most once).
 *  4. Resolve the `or:` / `bo:` pseudo-labels, then apply share, report and
 *     crawl bonuses.
 *  5. Mark the URI as handled in redis, tag the subject `auto-handled`, and
 *     emit a label / escalate / comment event depending on the thresholds.
 *  6. If `CRAWL_THREAD` is on and this post was not itself reached by
 *     crawling, audit the parents and replies of the thread recursively.
 *
 * @param post        Post view to audit.
 * @param agent       Agent used for `tools.ozone.moderation.emitEvent`.
 * @param me          Profile response whose `data.did` is sent as `createdBy`.
 * @param zzzoptions  Overrides for `defaultAuditOpts`; may be omitted.
 * @returns Resolves to `undefined`; all results are side effects (events, redis, logs).
 */
export async function auditPostForKeywords(post: AppBskyFeedDefs.PostView, agent: Agent, me: AppBskyActorGetProfile.Response, zzzoptions: Partial<AuditPostOpts> = {}) {
  const opts: AuditPostOpts = {...defaultAuditOpts, ...zzzoptions};
  const scores: Record<string, number> = {};
  const matchedKeywords: Record<string, string[]> = {};
  const record = post.record as AppBskyFeedPost.Record;

  const alreadyHandled = await hasAlreadyHandled(post.uri);
  if (alreadyHandled && VERBOSE) console.log("Already handled this one:", post.uri);

  const overrideData = overridesList.find((v) => v.subject == post.author.did);
  if (overrideData?.skip || alreadyHandled) {
    const _suffix = overrideData?.skip ? "user is skipped in overrides" : "post was recently handled"
    // Reports are never dropped silently, but a dry run must not emit events
    // (every other emitEvent call in this function is guarded the same way).
    if (opts.isReport && DRY_RUN) console.info(` * Escalated due to report (${_suffix})`);
    if (opts.isReport && !DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
      event: {
        $type: "tools.ozone.moderation.defs#modEventEscalate",
        comment: `[Automated] Escalated due to report (${_suffix})`,
      },
      subject: {
        $type: "com.atproto.repo.strongRef",
        uri: post.uri,
        cid: post.cid
      },
      createdBy: me.data.did,
    });
    return; // don't do anything for "skipped" users
  } else if (overrideData?.score && MISCELLANEOUS_LABEL) {
    // apply to "misc-or-bonus", which gets augmented and applied later
    const overrideKey = "or:"+MISCELLANEOUS_LABEL;
    scores[overrideKey] = (scores[overrideKey] ?? 0) + overrideData.score;
  }

  // Alt text that is present but carries no information counts as missing.
  const _badAltText = ["alt text", "screenshot"];
  const allTextList = [record.text]
  // It would be nice if we could check records (quotes) here,
  // but that would hit the rate limit really fast.
  // Looking for the reply chain would do the same.
  // So neither of those can be done. Such a shame.
  const embed = record.embed;
  if (AppBskyEmbedExternal.isMain(embed)) {
    allTextList.push(embed.external.title)
    allTextList.push(embed.external.description)
  } else if (AppBskyEmbedImages.isMain(embed)) {
    for (const image of embed.images) {
      if (image.alt) allTextList.push(image.alt)
      if (!image.alt || _badAltText.includes(image.alt.toLowerCase())) {
        if (NO_ALT_TEXT_LABEL) scores[NO_ALT_TEXT_LABEL] = LABEL_THRESHOLD;
        if (DRY_RUN || VERBOSE) console.info(" * Missing alt text");
      } else if (DRY_RUN || VERBOSE) console.info(" * Has alt text for image");
    }
  } else if (AppBskyEmbedVideo.isMain(embed)) {
    if (embed.alt) allTextList.push(embed.alt)
    if (!embed.alt || _badAltText.includes(embed.alt.toLowerCase())) {
      if (NO_ALT_TEXT_LABEL) scores[NO_ALT_TEXT_LABEL] = LABEL_THRESHOLD;
      if (DRY_RUN || VERBOSE) console.info(" * Missing alt text");
    } else if (DRY_RUN || VERBOSE) console.info(" * Has alt text for video");
  }
  if (Object.keys(post.record).includes("bridgyOriginalText")) {
    // deno-lint-ignore no-explicit-any
    const bridgyOriginalText = (post.record as any).bridgyOriginalText;
    allTextList.push(bridgyOriginalText)
    // TODO: if BridgyFed ever adds a CW field, I can add it here
  }
  // for...of, not for...in: we want the tag strings, not their array indices.
  for (const tag of record.tags ?? []) {
    allTextList.push("#"+tag);
  }
  const allText = allTextList.join(" ");
  const lowerText = allText.toLowerCase();
  const splitPost = lowerText.split(WORD_SEPARATOR_PATTERN).filter((v) => v != "")

  /** True when `phrase` occurs in the post as a whole word or whole phrase. */
  const containsPhrase = (phrase: string): boolean =>
    splitPost.includes(phrase) ||
    // The regex also matches multi-word phrases, but it's a little more
    // intensive, so a cheap substring check runs first.
    (lowerText.includes(phrase) && new RegExp(WSP_STRING+escapeRegExp(phrase)+WSP_STRING).test(lowerText));

  /** Credits `score` to `label` and remembers which keyword caused it. */
  const recordMatch = (label: string, score: number, keyword: string): void => {
    scores[label] = (scores[label] ?? 0) + score;
    (matchedKeywords[label] ??= []).push(keyword);
  };

  for (const entry of keywordList) {
    // null: the entry has no $lang: keyword (so far); false: it has one and
    // none matched yet; true: one of them matched the post's langs.
    // NOTE(review): a non-matching $lang: keyword abandons the entry
    // immediately, so later $lang: alternatives are never tried — confirm
    // that this is intended.
    let matchesLang: boolean|null = null;
    inner: for (const keyword of entry.keywords) {
      if (matchedKeywords[entry.label]?.join(", ").includes(keyword)) {
        // Already credited under this label (substring check); don't count twice.
        break inner
      }
      if (keyword.startsWith("-") && !keyword.startsWith("-$")) {
        // keywords starting with - have a negative effect; the keyword _must not_ be present to match
        if (containsPhrase(keyword.substring(1))) break inner;
        continue;
      }
      if (keyword.startsWith("$")) {
        // keywords starting with $ check self-labels (or language)
        if (keyword.startsWith("$lang:")) {
          matchesLang ??= false;
          // language check
          const value = keyword.replace("$lang:","");
          if (record.langs?.includes(value)) {
            matchesLang = true;
          }
        }
        if (post.labels?.find((v) => v.val == keyword.substring(1))) {
          recordMatch(entry.label, entry.score, keyword);
          break inner;
        }
      } else if (keyword.startsWith("-$lang:")) {
        // prefix -$lang: negates language check: posts marked with this language
        // don't match the keyword (i.e. dem,-$lang:de)
        const value = keyword.replace("-$lang:","");
        if (record.langs?.includes(value)) {
          break inner;
        }
      } else if (keyword.startsWith("-$")) {
        // Negated self-label: strip both prefix characters ("-$") so the
        // comparison is against the bare label value.
        if (post.labels?.find((v) => v.val == keyword.substring(2))) {
          break inner;
        }
      }
      if (matchesLang === false) break inner;

      // Single words and phrases currently score the same. (An earlier idea
      // was to make phrase matches less potent: Math.max(entry.score-10,0).)
      if (containsPhrase(keyword)) {
        recordMatch(entry.label, entry.score, keyword);
        break inner
      }
    }
  }
  for (const [fullLabel, score] of Object.entries(scores).filter(([k]) => k.startsWith("or:"))) {
    // A label in keywords.tsv with the or: prefix, i.e. or:miscellaneous-uspol, indicates that
    // if there are no labels to match it to, the prefixed label should be used.
    // If there are other labels, it applies as bonus points instead.
    const label = fullLabel.slice("or:".length);
    // Scores that came only from an override have no matched keywords.
    const sourceKeywords = matchedKeywords[fullLabel];
    const categoryCount = Object.keys(scores).length;
    // The misc-or-bonus pseudo-label converts into bonus points, or into miscellaneous-uspol if there is no other category.
    if ((categoryCount >= 2 && !scores["bonus-points-only"]) || categoryCount >= 3) {
      // Another category! Make it bonus points
      scores["bonus-points-only"] = (scores["bonus-points-only"] ?? 0) + score;
      (matchedKeywords["bonus-points-only"] ??= []).push(...(sourceKeywords ?? []));
    } else {
      // No other category! Make it miscellaneous-uspol
      scores[label] = score
      if (sourceKeywords) matchedKeywords[label] = sourceKeywords;
    }
    delete scores[fullLabel];
    delete matchedKeywords[fullLabel];
  }
  for (const [fullLabel, score] of Object.entries(scores).filter(([k]) => k.startsWith("bo:"))) {
    // Applies bonus points to a specific label ONLY, and only if it already exists
    // (so bonus points alone can't label a post)
    // in other words, for a bo:<LABEL> to be considered, another keyword under
    // <LABEL> has to already have matched
    const label = fullLabel.slice("bo:".length);
    if ((scores[label] ?? 0) > 0) {
      scores[label] += score;
      (matchedKeywords[label] ??= []).push(...(matchedKeywords[fullLabel] ?? []));
    }
    delete scores[fullLabel];
    delete matchedKeywords[fullLabel];
  }
  for (const fullLabel of Object.keys(scores).filter((k) => k.includes(":"))) {
    // Clean up any invalid-prefixed labels
    delete scores[fullLabel];
    delete matchedKeywords[fullLabel];
  }
  if (Object.values(scores).some((v) => v > 0)) {
    if (Object.keys(scores).length == 1 && scores["bonus-points-only"]) {
      // Bonus points alone never label a post; they only matter for reports.
      if (!opts.isReport) return;
      if (DRY_RUN) {
        console.info(" * Escalated due to report");
        console.info(" * Bonus points matched: "+matchedKeywords["bonus-points-only"].join(", "));
      }
      if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
        event: {
          $type: "tools.ozone.moderation.defs#modEventEscalate",
          comment: `[Automated] Escalated due to report (only bonus points matched: ${matchedKeywords["bonus-points-only"].join(", ")})`,
        },
        subject: {
          $type: "com.atproto.repo.strongRef",
          uri: post.uri,
          cid: post.cid
        },
        createdBy: me.data.did,
      });
    }
    // Add share points to each category, if multiple categories are present:
    if (Object.keys(scores).length >= 2) {
      // if there are multiple entries...
      for (const key of Object.keys(scores)) {
        if (key == "bonus-points-only") continue;
        // ...every real category gets the collected bonus points when there
        // are any, and CATEGORY_SHARE_POINTS otherwise.
        if (scores["bonus-points-only"] > 0) {
          scores[key] += scores["bonus-points-only"]
        } else {
          scores[key] += CATEGORY_SHARE_POINTS
        }
      }
    }
    // The pseudo-category has been folded in; it must not become a label.
    delete scores["bonus-points-only"];
    // Add bonus points for reports
    if (opts.isReport) {
      for (const key of Object.keys(scores)) {
        scores[key] += REPORT_BONUS_POINTS
      }
    }
    // Add bonus for crawled threads/quotes
    if (opts.crawled && opts.rootLabeled) {
      if (VERBOSE) console.log("Reached by crawling:",post.uri)
      for (const key of Object.keys(scores)) {
        scores[key] += CRAWLED_BONUS_POINTS
      }
    }
    let comment = "[Automated] Confidence levels:";
    const escalatingLabels: string[] = [];
    const likelyLabels: string[] = [];
    const certainLabels: string[] = [];
    if (NO_ALT_TEXT_LABEL && scores[NO_ALT_TEXT_LABEL]) {
      // The alt-text label is always applied outright and is left out of the
      // per-label confidence listing below.
      comment = Object.keys(scores).length == 1
        ? "[Automated] Missing alt text!"
        : "[Automated] Missing alt text!\r\nConfidence levels:";
      certainLabels.push(NO_ALT_TEXT_LABEL)
    }
    for (const [key, score] of Object.entries(scores)) {
      if (NO_ALT_TEXT_LABEL && key == NO_ALT_TEXT_LABEL) continue;
      const kws = matchedKeywords[key] ?? ["unknown"];
      comment += `\r\n${key}: ${score} (matched: ${kws.join(", ")}); `;
      if (score >= LABEL_THRESHOLD) {
        certainLabels.push(key)
      } else if (score >= ESCALATE_AND_LABEL_THRESHOLD) {
        likelyLabels.push(key)
      } else if (score >= ESCALATION_THRESHOLD) {
        escalatingLabels.push(key)
      }
    }
    if (matchedKeywords["bonus-points-only"]) {
      comment += `\r\nBonus point keywords: ${matchedKeywords["bonus-points-only"].join(", ")}`;
    }
    // Remember this URI (field expiry 86400, set only if none exists) so it is
    // not audited again while the entry lives.
    await retry(async () => await redis?.pipelineCommands([
      ["MULTI"],
      ["HSET", "alreadyHandled", post.uri, "true"],
      ["HEXPIRE", "alreadyHandled", 86400, "NX", "FIELDS", 1, post.uri],
      ["EXEC"],
    ]), {maxRetries: 3});
    if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
      event: {
        $type: "tools.ozone.moderation.defs#modEventTag",
        add: ["auto-handled"],
        remove: [],
      },
      subject: {
        $type: "com.atproto.repo.strongRef",
        uri: post.uri,
        cid: post.cid
      },
      createdBy: me.data.did,
    });
    const combinedLabels = [...certainLabels, ...likelyLabels];
    // One event carries the confidence comment: a label event when anything
    // is being labeled, otherwise an escalation, otherwise a plain comment.
    if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
      event: {
        $type: combinedLabels.length > 0
          ? "tools.ozone.moderation.defs#modEventLabel"
          : escalatingLabels.length > 0
            ? "tools.ozone.moderation.defs#modEventEscalate"
            : "tools.ozone.moderation.defs#modEventComment",
        comment: comment,
        createLabelVals: combinedLabels.length == 0 ? undefined : combinedLabels,
        negateLabelVals: combinedLabels.length == 0 ? undefined : [],
      },
      subject: {
        $type: "com.atproto.repo.strongRef",
        uri: post.uri,
        cid: post.cid
      },
      createdBy: me.data.did,
    });
    if (DRY_RUN) {
      if (certainLabels.length > 0) console.info(" * Auto label:", comment)
      else if (likelyLabels.length > 0) console.info(" * Auto label and escalate:", comment)
      else if (escalatingLabels.length > 0) console.info(" * Escalate:", comment)
      else console.info(" * No action:", comment)
    }
    if (certainLabels.length > 0) {
      // Confident enough: acknowledge so the subject leaves the review queue.
      if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
        event: {
          $type: "tools.ozone.moderation.defs#modEventAcknowledge",
        },
        subject: {
          $type: "com.atproto.repo.strongRef",
          uri: post.uri,
          cid: post.cid
        },
        createdBy: me.data.did,
      });
    } else if (likelyLabels.length > 0) {
      // Labeled, but not confidently: ask a human to double-check.
      if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
        event: {
          $type: "tools.ozone.moderation.defs#modEventEscalate",
          comment: "[Automated] Escalated due to labels: "+likelyLabels.join(",")
        },
        subject: {
          $type: "com.atproto.repo.strongRef",
          uri: post.uri,
          cid: post.cid
        },
        createdBy: me.data.did,
      });
    }
    if (CRAWL_THREAD && !opts.crawled) try {
      // If the parent has already/recently been handled, there's not
      // much use in crawling the thread again.
      if (await hasAlreadyHandled(record.reply?.parent?.uri??"__defnot__")) return;
      const thread = await publicAgent.app.bsky.feed.getPostThread({
        uri: post.uri,
        // depth: 6,
        // parentHeight: 80,
      });
      if (thread.success) {
        const allPosts: AppBskyFeedDefs.PostView[] = [];
        /** Collects the posts of every parent and reply, recursively.
         *  The starting post itself is not added. */
        const crawl = (threadPost: AppBskyFeedDefs.ThreadViewPost): void => {
          if (AppBskyFeedDefs.isThreadViewPost(threadPost.parent)) {
            allPosts.push(threadPost.parent.post);
            crawl(threadPost.parent);
          }
          for (const reply of (threadPost.replies??[])) {
            if (AppBskyFeedDefs.isThreadViewPost(reply)) {
              allPosts.push(reply.post);
              crawl(reply);
            }
          }
        };
        // Not adding allPosts.push(thread.post) here,
        // because we already did that one
        if (AppBskyFeedDefs.isThreadViewPost(thread.data.thread)) {
          crawl(thread.data.thread)
        }
        const rootLabeled = certainLabels.length > 0;
        for (const threadPost of allPosts) {
          await auditPostForKeywords(threadPost, agent, me, {
            crawled: true,
            rootLabeled: rootLabeled,
            isReport: false, //opts.isReport,
          });
        }
      }
    } catch(e) {
      // Crawling is best-effort; the audit of this post has already been emitted.
      console.error("Failed while crawling %s:", post.uri, e)
    }
    return
  }
  // Nothing scored above zero: a report is still passed on to a human.
  if (opts.isReport && DRY_RUN) console.info(" * Escalated due to report");
  if (opts.isReport && !DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
    event: {
      $type: "tools.ozone.moderation.defs#modEventEscalate",
      comment: "[Automated] Escalated due to report",
    },
    subject: {
      $type: "com.atproto.repo.strongRef",
      uri: post.uri,
      cid: post.cid
    },
    createdBy: me.data.did,
  });
}