import { AppBskyActorGetProfile, AppBskyEmbedExternal, AppBskyEmbedImages, AppBskyEmbedVideo, AppBskyFeedDefs, AppBskyFeedPost } from "@atproto/api";
import { ESCALATION_THRESHOLD, ESCALATE_AND_LABEL_THRESHOLD, keywordList, LABEL_THRESHOLD, overridesList, redis, VERBOSE, CRAWL_THREAD, DRY_RUN, MISCELLANEOUS_LABEL, NO_ALT_TEXT_LABEL } from "../main.ts";
import { Agent } from "@atproto/api";
import { retry } from "@atproto/common";
import { CredentialSession } from "@atproto/api";
import { hasAlreadyHandled } from "./redis.ts";

/**
 * Points added to each matched category when a post matches several categories at once.
 *
 * If a "political" post is detected, it's more likely that its other keywords are related
 * to their intended subjects. For example, a post that says "vote trump" should definitely
 * be labeled under the Trump label, and maybe the Election label. But either word on its own
 * could refer to a "trump card", or to a PTA, corporate, or benefits election, which we
 * don't care about.
 *
 * This value can be tweaked as needed.
 *
 * NOTE(review): the previous wording said "more than 2 categories ... both of them", which is
 * self-contradictory; confirm the exact threshold against the code that applies this constant.
 */
export const CATEGORY_SHARE_POINTS = 25;

/**
 * The "bonus points" added to every category if the post came to our attention via a report.
 * If a maybe-political post is reported, it's much more likely that it is indeed political.
 * If it still doesn't meet criteria, the report is not dismissed.
 */
export const REPORT_BONUS_POINTS = 25;

/**
 * If this post was reached because of crawling, that means another post in the thread matched
 * a keyword. Therefore, this one is more likely to be political, and therefore it gets bonus
 * points.
 */
export const CRAWLED_BONUS_POINTS = 10;

/** Agent whose session targets the `https://public.api.bsky.app` service URL. */
export const publicAgent = new Agent(new CredentialSession(new URL("https://public.api.bsky.app"), fetch, undefined));

/**
 * Escapes every regular-expression metacharacter in `string` so the result can be embedded
 * in a `RegExp` source and match the input literally.
 *
 * @param string Raw text to escape.
 * @returns The text with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegExp(string: string): string {
  // In the replacement string, `$&` stands for the whole matched substring.
  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Matches a single word boundary: the start of the input, the end of the input, or one of
 * the listed separator characters (space, quotes, punctuation, brackets, hyphen).
 */
const WORD_SEPARATOR_PATTERN = /(?:^|[ ",!.?:_~#+=$%&*)(\[\]{}<>'’-]|$)/;

/**
 * Source text of {@link WORD_SEPARATOR_PATTERN} without the surrounding slashes, suitable for
 * concatenating into a dynamically built `RegExp`. Using `.source` (rather than slicing the
 * `toString()` output) stays correct even if flags are ever added to the pattern.
 */
const WSP_STRING = WORD_SEPARATOR_PATTERN.source;

/** Options accepted when auditing a post for keywords. */
export interface AuditPostOpts {
  /** Whether the audit was triggered by a report (see `REPORT_BONUS_POINTS`). */
  isReport?: boolean;
  /** Did you crawl from another post to find this one? */
  crawled?: boolean;
  /**
   * Used with [crawled]; indicates whether or not the root post got a label.
   * The bonus points are not applied if no label was applied to the initial post.
   */
  rootLabeled?: boolean;
}

/** Values used for any option the caller leaves unset; every flag defaults to `false`. */
const defaultAuditOpts: Required<AuditPostOpts> = {
  isReport: false,
  crawled: false,
  rootLabeled: false,
};

export async function auditPostForKeywords(post: AppBskyFeedDefs.PostView, agent: Agent, me: AppBskyActorGetProfile.Response, zzzoptions: Partial) {
  const opts: AuditPostOpts = {...defaultAuditOpts, ...zzzoptions};
  const scores: Record = {}
  const matchedKeywords: Record = {}
  const alreadyHandled = await hasAlreadyHandled(post.uri);
  if (alreadyHandled && VERBOSE) console.log("Already handled this one:", post.uri);
  const overrideData = overridesList.find((v,_,__) => v.subject == post.author.did);
  if (overrideData?.skip || alreadyHandled) {
    const _suffix = overrideData?.skip ?
"user is skipped in overrides" : "post was recently handled" if (opts.isReport) await agent.tools.ozone.moderation.emitEvent({ event: { $type: "tools.ozone.moderation.defs#modEventEscalate", comment: `[Automated] Escalated due to report (${_suffix})`, }, subject: { $type: "com.atproto.repo.strongRef", uri: post.uri, cid: post.cid }, createdBy: me.data.did, }); return; // don't do anything for "skipped" users } else if (overrideData?.score && MISCELLANEOUS_LABEL) { // apply to "misc-or-bonus", which gets augmented and applied later scores["or:"+MISCELLANEOUS_LABEL] ??= 0 scores["or:"+MISCELLANEOUS_LABEL] += overrideData.score } const _badAltText = ["alt text", "screenshot"]; const allTextList = [(post.record as AppBskyFeedPost.Record).text] // It would be nice if we could check records (quotes) here, // but that would hit the rate limit really fast. // Looking for the reply chain would do the same. // So neither of those can be done. Such a shame. const embed = (post.record as AppBskyFeedPost.Record).embed; if (AppBskyEmbedExternal.isMain(embed)) { allTextList.push(embed?.external.title) allTextList.push(embed?.external.description) } else if (AppBskyEmbedImages.isMain(embed)) { for (const image of embed?.images) { if (image.alt) allTextList.push(image.alt) if (!image.alt || _badAltText.includes(image.alt.toLowerCase())) { if (NO_ALT_TEXT_LABEL) scores[NO_ALT_TEXT_LABEL] = LABEL_THRESHOLD; if (DRY_RUN || VERBOSE) console.info(" * Missing alt text"); } else if (DRY_RUN || VERBOSE) console.info(" * Has alt text for image"); } } else if (AppBskyEmbedVideo.isMain(embed)) { if (embed?.alt) allTextList.push(embed?.alt) if (!embed?.alt || _badAltText.includes(embed?.alt.toLowerCase())) { if (NO_ALT_TEXT_LABEL) scores[NO_ALT_TEXT_LABEL] = LABEL_THRESHOLD; if (DRY_RUN || VERBOSE) console.info(" * Missing alt text"); } else if (DRY_RUN || VERBOSE) console.info(" * Has alt text for video"); } if (Object.keys(post.record).includes("bridgyOriginalText")) { // deno-lint-ignore 
no-explicit-any const bridgyOriginalText = (post.record as any).bridgyOriginalText; allTextList.push(bridgyOriginalText) // TODO: if BridgyFed ever adds a CW field, I can add it here } for (const tag in (post.record as AppBskyFeedPost.Record).tags) { allTextList.push("#"+tag); } const allText = allTextList.join(" "); const splitPost = allText.toLowerCase().split(WORD_SEPARATOR_PATTERN).filter((v,_,__) => v != "") for (const entry of keywordList) { // deno-lint-ignore no-inner-declarations no-var var matchesLang: boolean|null = null; inner: for (const keyword of entry.keywords) { if (matchedKeywords[entry.label]?.join(", ").includes(keyword)) { break inner } if (keyword.startsWith("-") && !keyword.startsWith("-$")) { // keywords starting with - have a negative effect; the keyword _must not_ be present to match const kw = keyword.substring(1); if (splitPost.includes(kw)) { break inner } else if (allText.toLowerCase().includes(kw) && allText.toLowerCase().match(new RegExp(WSP_STRING+escapeRegExp(kw)+WSP_STRING))) { // This'll match phrases, but it's a little more intensive so we'll do a light check first break inner } continue; } if (keyword.startsWith("$")) { // keywords starting with $ check self-labels (or language) if (keyword.startsWith("$lang:")) { matchesLang ??= false; // language check const value = keyword.replace("$lang:",""); // deno-lint-ignore no-explicit-any if (((post.record as any).langs as string[] | undefined)?.includes(value)) { matchesLang = true; } } if (post.labels?.find((v,_,__) => v.val == keyword.substring(1))) { scores[entry.label] ??= 0 scores[entry.label] += entry.score matchedKeywords[entry.label] ??= [] matchedKeywords[entry.label].push(keyword) break inner; } } else if (keyword.startsWith("-$lang:")) { // prefix -$lang: negates language check: posts marked with this language // don't match the keyword (i.e. 
dem,-$lang:de) const value = keyword.replace("-$lang:",""); // deno-lint-ignore no-explicit-any if (((post.record as any).langs as string[] | undefined)?.includes(value)) { break inner; } } else if (keyword.startsWith("-$")) { if (post.labels?.find((v,_,__) => v.val == keyword.substring(1))) { break inner; } } if (matchesLang === false) break inner; if (splitPost.includes(keyword)) { scores[entry.label] ??= 0 scores[entry.label] += entry.score matchedKeywords[entry.label] ??= [] matchedKeywords[entry.label].push(keyword) break inner } else if (allText.toLowerCase().includes(keyword) && allText.toLowerCase().match(new RegExp(WSP_STRING+escapeRegExp(keyword)+WSP_STRING))) { // This'll match phrases, but it's a little more intensive so we'll do a light check first scores[entry.label] ??= 0 // make them less potent //scores[entry.label] += Math.max(entry.score-10,0) scores[entry.label] += entry.score matchedKeywords[entry.label] ??= [] matchedKeywords[entry.label].push(keyword) break inner } } } for (const [fullLabel, score] of Object.entries(scores).filter(([k,_]) => k.endsWith("or:"))) { // A label in keywords.tsv with the or: prefix, i.e. or:miscellaneous-uspol, indicates that // if there are no labels to match it to, the prefixed label should be used. // If there are other labels, it applies as bonus points instead. const label = fullLabel.replace("or:",""); // The misc-or-bonus pseudo-label converts into bonus points, or into miscellaneous-uspol if there is no other category. if ((Object.keys(scores).length >= 2 && !scores["bonus-points-only"]) || Object.keys(scores).length >= 3) { // Another category! Make it bonus points scores["bonus-points-only"] ??= 0 scores["bonus-points-only"] += score matchedKeywords["bonus-points-only"] ??= [] matchedKeywords["bonus-points-only"].push(...matchedKeywords[fullLabel]) } else { // No other category! 
Make it miscellaneous-uspol scores[label] = score matchedKeywords[label] = matchedKeywords[fullLabel] } delete scores[fullLabel]; delete matchedKeywords[fullLabel]; } for (const [fullLabel, score] of Object.entries(scores).filter(([k,_]) => k.endsWith("bo:"))) { // Applies bonus points to a specific label ONLY, and only if it already exists // (so bonus points alone can't label a post) // in other words, for a bo: