Archive of the USPol Labeler's automatic labeling script. Out-of-date. Must run alongside Ozone. DO NOT OPEN ISSUES OR PULLS -- THEY WILL BE IGNORED/CLOSED
at tip 429 lines 19 kB view raw
import { AppBskyActorGetProfile, AppBskyEmbedExternal, AppBskyEmbedImages, AppBskyEmbedVideo, AppBskyFeedDefs, AppBskyFeedPost } from "@atproto/api";
import { ESCALATION_THRESHOLD, ESCALATE_AND_LABEL_THRESHOLD, keywordList, LABEL_THRESHOLD, overridesList, redis, VERBOSE, CRAWL_THREAD, DRY_RUN, MISCELLANEOUS_LABEL, NO_ALT_TEXT_LABEL } from "../main.ts";
import { Agent } from "@atproto/api";
import { retry } from "@atproto/common";
import { CredentialSession } from "@atproto/api";
import { hasAlreadyHandled } from "./redis.ts";

/** If two or more categories are set, each of them gets 25 points added.
 * So if a "political" post is detected, it's more likely that other keywords
 * may be related to their intended subjects. For example, a post that says
 * "vote trump" should definitely be labeled under the Trump label, and maybe
 * the Election label. But either separately could refer to a "trump card" or
 * to a PTA or corporate election, or benefits election, which we don't care about.
 * This value can be tweaked as needed. */
export const CATEGORY_SHARE_POINTS = 25;

/** The "bonus points" added to every category if the post came to our attention via a report.
 * If a maybe-political post is reported, it's much more likely that it is indeed political.
 * If it still doesn't meet criteria, the report is not dismissed. */
export const REPORT_BONUS_POINTS = 25;

/** If this post was reached because of crawling, that means another post in the thread matched
 * a keyword. Therefore, this one is more likely to be political, and therefore it gets bonus
 * points. */
export const CRAWLED_BONUS_POINTS = 10;

/** Unauthenticated agent pointed at the public AppView; used for thread lookups while crawling. */
export const publicAgent = new Agent(new CredentialSession(new URL("https://public.api.bsky.app"), fetch, undefined));

/** Escapes every regular-expression metacharacter in `string` so it can be embedded literally in a RegExp. */
function escapeRegExp(string: string) {
  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string
}

/** Matches a word boundary: start/end of the text, any whitespace, or one of the listed punctuation marks.
 * `\s` (rather than a literal space) is used so that words separated by newlines or tabs are split too. */
const WORD_SEPARATOR_PATTERN = /(?:^|[\s",!.?:_~#+=$%&*)(\[\]{}<>'’-]|$)/;
/** Source text of WORD_SEPARATOR_PATTERN, for composing phrase-matching expressions. */
const WSP_STRING = WORD_SEPARATOR_PATTERN.source;

/** Pseudo-label that only ever contributes bonus points to other labels; it is never applied itself. */
const BONUS_ONLY = "bonus-points-only";
/** Key prefix: use the label on its own if nothing else matched, otherwise convert it to bonus points. */
const OR_PREFIX = "or:";
/** Key prefix: bonus points for one specific label, counted only if that label already scored. */
const BO_PREFIX = "bo:";
/** Alt texts that are treated the same as having no alt text at all. */
const BAD_ALT_TEXT = ["alt text", "screenshot"];

export interface AuditPostOpts {
  isReport?: boolean;
  /** Did you crawl from another post to find this one? */
  crawled?: boolean;
  /** Used with [crawled]; indicates whether or not the root post got a label.
   * The bonus points are not applied if no label was applied to the initial post. */
  rootLabeled?: boolean;
}
const defaultAuditOpts: Required<AuditPostOpts> = {
  isReport: false,
  crawled: false,
  rootLabeled: false,
};

/** True when `phrase` occurs in `lowerText` delimited by word separators on both sides.
 * The cheap substring test runs first so the RegExp is only built for plausible candidates. */
function containsPhrase(lowerText: string, phrase: string): boolean {
  return lowerText.includes(phrase) &&
    new RegExp(WSP_STRING + escapeRegExp(phrase) + WSP_STRING).test(lowerText);
}

/** Adds `points` to `label` and remembers which keyword caused it. */
function recordMatch(scores: Record<string, number>, matched: Record<string, string[]>, label: string, points: number, keyword: string): void {
  scores[label] = (scores[label] ?? 0) + points;
  (matched[label] ??= []).push(keyword);
}

/** Collects every post reachable from `root` through `parent` and `replies` links.
 * The root's own post is deliberately NOT included; the caller has already handled it. */
function collectThreadPosts(root: AppBskyFeedDefs.ThreadViewPost): AppBskyFeedDefs.PostView[] {
  const found: AppBskyFeedDefs.PostView[] = [];
  const visit = (node: AppBskyFeedDefs.ThreadViewPost): void => {
    if (AppBskyFeedDefs.isThreadViewPost(node.parent)) {
      found.push(node.parent.post);
      visit(node.parent);
    }
    for (const reply of node.replies ?? []) {
      if (AppBskyFeedDefs.isThreadViewPost(reply)) {
        found.push(reply.post);
        visit(reply);
      }
    }
  };
  visit(root);
  return found;
}

/**
 * Scores a post against the keyword list and emits the matching Ozone moderation events.
 *
 * Text is gathered from the post body, external-embed title/description, image/video alt
 * text, Bridgy Fed's `bridgyOriginalText` and the post's tags. Each keyword entry contributes
 * at most one match. Scores are then adjusted (override points, `or:`/`bo:` pseudo-labels,
 * category share points, report bonus, crawl bonus) and compared against LABEL_THRESHOLD,
 * ESCALATE_AND_LABEL_THRESHOLD and ESCALATION_THRESHOLD to decide between labeling,
 * label-and-escalate, escalate-only, or a plain comment.
 *
 * When DRY_RUN is set, no moderation events are emitted; the decisions are logged instead.
 *
 * @param post The post to audit.
 * @param agent Authenticated agent used to emit Ozone moderation events.
 * @param me Profile of the labeler account; its DID is used as `createdBy`.
 * @param zzzoptions Optional flags, see {@link AuditPostOpts}. Missing fields default to `false`.
 */
export async function auditPostForKeywords(post: AppBskyFeedDefs.PostView, agent: Agent, me: AppBskyActorGetProfile.Response, zzzoptions: Partial<AuditPostOpts> = {}): Promise<void> {
  const opts: AuditPostOpts = { ...defaultAuditOpts, ...zzzoptions };
  const scores: Record<string, number> = {};
  const matchedKeywords: Record<string, string[]> = {};
  const record = post.record as AppBskyFeedPost.Record;
  // Every moderation event below targets this exact version of the post.
  const subject = {
    $type: "com.atproto.repo.strongRef",
    uri: post.uri,
    cid: post.cid,
  } as const;

  const alreadyHandled = await hasAlreadyHandled(post.uri);
  if (alreadyHandled && VERBOSE) console.log("Already handled this one:", post.uri);

  const overrideData = overridesList.find((v) => v.subject == post.author.did);
  if (overrideData?.skip || alreadyHandled) {
    // Skipped users and recently handled posts are never scored, but a report is still escalated.
    const reason = overrideData?.skip ? "user is skipped in overrides" : "post was recently handled";
    if (opts.isReport) {
      if (DRY_RUN) {
        console.info(` * Escalated due to report (${reason})`);
      } else {
        await agent.tools.ozone.moderation.emitEvent({
          event: {
            $type: "tools.ozone.moderation.defs#modEventEscalate",
            comment: `[Automated] Escalated due to report (${reason})`,
          },
          subject,
          createdBy: me.data.did,
        });
      }
    }
    return;
  }
  if (overrideData?.score && MISCELLANEOUS_LABEL) {
    // apply to "misc-or-bonus", which gets augmented and applied later
    const overrideKey = OR_PREFIX + MISCELLANEOUS_LABEL;
    scores[overrideKey] = (scores[overrideKey] ?? 0) + overrideData.score;
  }

  const allTextList: string[] = [record.text];
  /** Adds usable alt text to the searchable text and flags media whose alt text is missing or useless. */
  const checkAltText = (alt: string | undefined, mediaKind: string): void => {
    if (alt) allTextList.push(alt);
    if (!alt || BAD_ALT_TEXT.includes(alt.toLowerCase())) {
      if (NO_ALT_TEXT_LABEL) scores[NO_ALT_TEXT_LABEL] = LABEL_THRESHOLD;
      if (DRY_RUN || VERBOSE) console.info(" * Missing alt text");
    } else if (DRY_RUN || VERBOSE) console.info(" * Has alt text for " + mediaKind);
  };
  // It would be nice if we could check records (quotes) here,
  // but that would hit the rate limit really fast.
  // Looking for the reply chain would do the same.
  // So neither of those can be done. Such a shame.
  const embed = record.embed;
  if (AppBskyEmbedExternal.isMain(embed)) {
    allTextList.push(embed.external.title, embed.external.description);
  } else if (AppBskyEmbedImages.isMain(embed)) {
    for (const image of embed.images) checkAltText(image.alt, "image");
  } else if (AppBskyEmbedVideo.isMain(embed)) {
    checkAltText(embed.alt, "video");
  }
  if (Object.keys(post.record).includes("bridgyOriginalText")) {
    const bridgyOriginalText = (post.record as Record<string, unknown>).bridgyOriginalText;
    // Only strings are searchable; anything else would be stringified into noise.
    if (typeof bridgyOriginalText === "string") allTextList.push(bridgyOriginalText);
    // TODO: if BridgyFed ever adds a CW field, I can add it here
  }
  // for...of yields the tag values (for...in would yield the array indices).
  for (const tag of record.tags ?? []) {
    allTextList.push("#" + tag);
  }
  const lowerText = allTextList.join(" ").toLowerCase();
  const words = new Set(lowerText.split(WORD_SEPARATOR_PATTERN).filter((v) => v != ""));
  /** Whole-word match first (cheap), then the more expensive phrase match. */
  const textHas = (term: string): boolean => words.has(term) || containsPhrase(lowerText, term);

  for (const entry of keywordList) {
    // null: entry has no $lang: keyword so far; false: it has some but none matched; true: one matched.
    let matchesLang: boolean | null = null;
    for (const keyword of entry.keywords) {
      if (matchedKeywords[entry.label]?.join(", ").includes(keyword)) {
        break;
      }
      if (keyword.startsWith("-") && !keyword.startsWith("-$")) {
        // keywords starting with - have a negative effect; the keyword _must not_ be present to match
        if (textHas(keyword.substring(1))) break;
        continue;
      }
      if (keyword.startsWith("$")) {
        // keywords starting with $ check self-labels (or language)
        if (keyword.startsWith("$lang:")) {
          matchesLang ??= false;
          if (record.langs?.includes(keyword.replace("$lang:", ""))) {
            matchesLang = true;
          }
        }
        if (post.labels?.find((v) => v.val == keyword.substring(1))) {
          recordMatch(scores, matchedKeywords, entry.label, entry.score, keyword);
          break;
        }
      } else if (keyword.startsWith("-$lang:")) {
        // prefix -$lang: negates language check: posts marked with this language
        // don't match the keyword (i.e. dem,-$lang:de)
        if (record.langs?.includes(keyword.replace("-$lang:", ""))) {
          break;
        }
      } else if (keyword.startsWith("-$")) {
        // prefix -$ negates a self-label check; strip both prefix characters to get the label value
        if (post.labels?.find((v) => v.val == keyword.slice(2))) {
          break;
        }
      }
      if (matchesLang === false) break;

      if (textHas(keyword)) {
        recordMatch(scores, matchedKeywords, entry.label, entry.score, keyword);
        break;
      }
    }
  }

  for (const [fullLabel, score] of Object.entries(scores).filter(([k]) => k.startsWith(OR_PREFIX))) {
    // A label in keywords.tsv with the or: prefix, i.e. or:miscellaneous-uspol, indicates that
    // if there are no labels to match it to, the prefixed label should be used.
    // If there are other labels, it applies as bonus points instead.
    const label = fullLabel.slice(OR_PREFIX.length);
    // Override scores are added without a keyword, so there may be nothing recorded here.
    const kws = matchedKeywords[fullLabel] ?? [];
    const categoryCount = Object.keys(scores).length;
    if ((categoryCount >= 2 && !scores[BONUS_ONLY]) || categoryCount >= 3) {
      // Another category! Make it bonus points
      scores[BONUS_ONLY] = (scores[BONUS_ONLY] ?? 0) + score;
      (matchedKeywords[BONUS_ONLY] ??= []).push(...kws);
    } else {
      // No other category! Make it miscellaneous-uspol
      scores[label] = score;
      if (kws.length > 0) matchedKeywords[label] = kws;
    }
    delete scores[fullLabel];
    delete matchedKeywords[fullLabel];
  }
  for (const [fullLabel, score] of Object.entries(scores).filter(([k]) => k.startsWith(BO_PREFIX))) {
    // Applies bonus points to a specific label ONLY, and only if it already exists
    // (so bonus points alone can't label a post)
    // in other words, for a bo:<LABEL> to be considered, another keyword under
    // <LABEL> has to already have matched
    const label = fullLabel.slice(BO_PREFIX.length);
    const current = scores[label];
    if (current && current > 0) {
      scores[label] = current + score;
      (matchedKeywords[label] ??= []).push(...(matchedKeywords[fullLabel] ?? []));
    }
    delete scores[fullLabel];
    delete matchedKeywords[fullLabel];
  }
  for (const fullLabel of Object.keys(scores).filter((k) => k.includes(":"))) {
    // Clean up any invalid-prefixed labels
    delete scores[fullLabel];
    delete matchedKeywords[fullLabel];
  }

  const hasPositiveScore = Object.values(scores).some((v) => v > 0);
  if (!hasPositiveScore) {
    // Nothing matched: a report still gets escalated so a human looks at it.
    if (opts.isReport && DRY_RUN) console.info(" * Escalated due to report");
    if (opts.isReport && !DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
      event: {
        $type: "tools.ozone.moderation.defs#modEventEscalate",
        comment: "[Automated] Escalated due to report",
      },
      subject,
      createdBy: me.data.did,
    });
    return;
  }

  if (Object.keys(scores).length == 1 && scores[BONUS_ONLY]) {
    // Only bonus keywords matched: ignore the post unless it was reported.
    if (!opts.isReport) return;
    const bonusKeywords = (matchedKeywords[BONUS_ONLY] ?? []).join(", ");
    if (DRY_RUN) {
      console.info(" * Escalated due to report");
      console.info(" * Bonus points matched: " + bonusKeywords);
    } else {
      await agent.tools.ozone.moderation.emitEvent({
        event: {
          $type: "tools.ozone.moderation.defs#modEventEscalate",
          comment: `[Automated] Escalated due to report (only bonus points matched: ${bonusKeywords})`,
        },
        subject,
        createdBy: me.data.did,
      });
    }
  }
  // Add share points to each category, if multiple categories are present:
  if (Object.keys(scores).length >= 2) {
    // Accumulated bonus points replace the flat CATEGORY_SHARE_POINTS when there are any.
    const bonus = scores[BONUS_ONLY] ?? 0;
    const sharePoints = bonus > 0 ? bonus : CATEGORY_SHARE_POINTS;
    for (const key of Object.keys(scores)) {
      if (key == BONUS_ONLY) continue;
      scores[key] = (scores[key] ?? 0) + sharePoints;
    }
  }
  delete scores[BONUS_ONLY];
  // Add bonus points for reports
  if (opts.isReport) {
    for (const key of Object.keys(scores)) {
      scores[key] = (scores[key] ?? 0) + REPORT_BONUS_POINTS;
    }
  }
  // Add bonus for crawled threads/quotes
  if (opts.crawled && opts.rootLabeled) {
    if (VERBOSE) console.log("Reached by crawling:", post.uri);
    for (const key of Object.keys(scores)) {
      scores[key] = (scores[key] ?? 0) + CRAWLED_BONUS_POINTS;
    }
  }

  let comment = "[Automated] Confidence levels:";
  const escalatingLabels: string[] = [];
  const likelyLabels: string[] = [];
  const certainLabels: string[] = [];
  if (NO_ALT_TEXT_LABEL && scores[NO_ALT_TEXT_LABEL]) {
    // Missing alt text is always applied outright; it has no confidence level of its own.
    comment = Object.keys(scores).length == 1
      ? "[Automated] Missing alt text!"
      : "[Automated] Missing alt text!\r\nConfidence levels:";
    certainLabels.push(NO_ALT_TEXT_LABEL);
  }
  for (const [key, score] of Object.entries(scores)) {
    if (NO_ALT_TEXT_LABEL && key == NO_ALT_TEXT_LABEL) continue;
    const kws = matchedKeywords[key] ?? ["unknown"];
    comment += `\r\n${key}: ${score} (matched: ${kws.join(", ")}); `;
    if (score >= LABEL_THRESHOLD) {
      certainLabels.push(key);
    } else if (score >= ESCALATE_AND_LABEL_THRESHOLD) {
      likelyLabels.push(key);
    } else if (score >= ESCALATION_THRESHOLD) {
      escalatingLabels.push(key);
    }
  }
  if (matchedKeywords[BONUS_ONLY]) {
    comment += `\r\nBonus point keywords: ${matchedKeywords[BONUS_ONLY].join(", ")}`;
  }

  // Remember this post for a day so it is not processed again (see hasAlreadyHandled).
  await retry(async () => await redis?.pipelineCommands([
    ["MULTI"],
    ["HSET", "alreadyHandled", post.uri, "true"],
    ["HEXPIRE", "alreadyHandled", 86400, "NX", "FIELDS", 1, post.uri],
    ["EXEC"],
  ]), { maxRetries: 3 });

  if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
    event: {
      $type: "tools.ozone.moderation.defs#modEventTag",
      add: ["auto-handled"],
      remove: [],
    },
    subject,
    createdBy: me.data.did,
  });

  const combinedLabels = [...certainLabels, ...likelyLabels];
  if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
    event: {
      // Label if anything qualified, otherwise escalate, otherwise just leave the scores as a comment.
      $type: combinedLabels.length > 0
        ? "tools.ozone.moderation.defs#modEventLabel"
        : escalatingLabels.length > 0
          ? "tools.ozone.moderation.defs#modEventEscalate"
          : "tools.ozone.moderation.defs#modEventComment",
      comment: comment,
      createLabelVals: combinedLabels.length == 0 ? undefined : combinedLabels,
      negateLabelVals: combinedLabels.length == 0 ? undefined : [],
    },
    subject,
    createdBy: me.data.did,
  });
  if (DRY_RUN) {
    if (certainLabels.length > 0) console.info(" * Auto label:", comment);
    else if (likelyLabels.length > 0) console.info(" * Auto label and escalate:", comment);
    else if (escalatingLabels.length > 0) console.info(" * Escalate:", comment);
    else console.info(" * No action:", comment);
  }

  if (certainLabels.length > 0) {
    // Confident labels need no human review: acknowledge right away.
    if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
      event: {
        $type: "tools.ozone.moderation.defs#modEventAcknowledge",
      },
      subject,
      createdBy: me.data.did,
    });
  } else if (likelyLabels.length > 0) {
    // Labeled, but not confidently: ask a human to double-check.
    if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
      event: {
        $type: "tools.ozone.moderation.defs#modEventEscalate",
        comment: "[Automated] Escalated due to labels: " + likelyLabels.join(","),
      },
      subject,
      createdBy: me.data.did,
    });
  }

  if (CRAWL_THREAD && !opts.crawled) try {
    // If the parent has already/recently been handled, there's not
    // much use in crawling the thread again.
    if (await hasAlreadyHandled(record.reply?.parent?.uri ?? "__defnot__") == true) return;
    const thread = await publicAgent.app.bsky.feed.getPostThread({
      uri: post.uri,
      // depth: 6,
      // parentHeight: 80,
    });
    if (thread.success && AppBskyFeedDefs.isThreadViewPost(thread.data.thread)) {
      const rootLabeled = certainLabels.length > 0;
      // Audited one at a time on purpose; each audit awaits several network calls.
      for (const threadPost of collectThreadPosts(thread.data.thread)) {
        await auditPostForKeywords(threadPost, agent, me, {
          crawled: true,
          rootLabeled: rootLabeled,
          isReport: false, //opts.isReport,
        });
      }
    }
  } catch (e) {
    console.error("Failed while crawling %s:", post.uri, e);
  }
}