src/handler.ts at tip · haileys.quest/star-spangled

Archive of the USPol Labeler's automatic labeling script. Out-of-date. Must run alongside Ozone. DO NOT OPEN ISSUES OR PULLS -- THEY WILL BE IGNORED/CLOSED
fork atom
star-spangled / src / handler.ts
at tip 429 lines 19 kB view raw
wrap content
Blake Leonard fix(report): bonus point only matches wouldn't get escalated 1y ago
b7a55a6e
  1import { AppBskyActorGetProfile, AppBskyEmbedExternal, AppBskyEmbedImages, AppBskyEmbedVideo, AppBskyFeedDefs, AppBskyFeedPost } from "@atproto/api";
  2import { ESCALATION_THRESHOLD, ESCALATE_AND_LABEL_THRESHOLD, keywordList, LABEL_THRESHOLD, overridesList, redis, VERBOSE, CRAWL_THREAD, DRY_RUN, MISCELLANEOUS_LABEL, NO_ALT_TEXT_LABEL } from "../main.ts";
  3import { Agent } from "@atproto/api";
  4import { retry } from "@atproto/common";
  5import { CredentialSession } from "@atproto/api";
  6import { hasAlreadyHandled } from "./redis.ts";
  7
/** Points added to every category when a post matches two or more categories
    (and no positive "bonus-points-only" score is present, in which case that
    score is added instead).
    If a "political" post is detected, it's more likely that other keywords
    relate to their intended subjects. For example, a post that says
    "vote trump" should definitely be labeled under the Trump label, and maybe
    the Election label. But either word on its own could refer to a "trump card",
    or to a PTA, corporate, or benefits election, which we don't care about.
    This value can be tweaked as needed. */
export const CATEGORY_SHARE_POINTS = 25;

/** The "bonus points" added to every category if the post came to our attention via a report.
 *  If a maybe-political post is reported, it's much more likely that it is indeed political.
 *  If the post still doesn't meet the criteria, the report is not dismissed: it is escalated instead. */
export const REPORT_BONUS_POINTS = 25;

/** If this post was reached because of crawling, that means another post in the thread matched
 *  a keyword. Therefore, this one is more likely to be political, and it gets bonus points.
 *  Only applied when the post that started the crawl actually received a label. */
export const CRAWLED_BONUS_POINTS = 10;

/** Agent pointed at the public Bluesky API host; this file uses it to fetch the thread
 *  around a post when crawling. No login is performed on this session in this file. */
export const publicAgent = new Agent(new CredentialSession(new URL("https://public.api.bsky.app"), fetch, undefined));
 28
 29function escapeRegExp(string: string) {
 30    return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string
 31}
 32
 33const WORD_SEPARATOR_PATTERN = /(?:^|[ ",!.?:_~#+=$%&*)(\[\]{}<>'’-]|$)/;
 34const WSP_STRING = WORD_SEPARATOR_PATTERN.toString().slice(1).slice(0,-1);
 35
 36export interface AuditPostOpts {
 37  isReport?: boolean;
 38  /** Did you crawl from another post to find this one? */
 39  crawled?: boolean;
 40  /** Used with [crawled]; indicates whether or not the root post got a label.
 41   * The bonus points are not applied if no label was applied to the initial post. */
 42  rootLabeled?: boolean;
 43};
 44const defaultAuditOpts: Required<AuditPostOpts> = {
 45  isReport: false,
 46  crawled: false,
 47  rootLabeled: false,
 48}
 49
 50export async function auditPostForKeywords(post: AppBskyFeedDefs.PostView, agent: Agent, me: AppBskyActorGetProfile.Response, zzzoptions: Partial<AuditPostOpts>) {
 51  const opts: AuditPostOpts = {...defaultAuditOpts, ...zzzoptions};
 52  const scores: Record<string, number> = {}
 53  const matchedKeywords: Record<string, string[]> = {}
 54
 55  const alreadyHandled = await hasAlreadyHandled(post.uri);
 56  if (alreadyHandled && VERBOSE) console.log("Already handled this one:", post.uri);
 57
 58  const overrideData = overridesList.find((v,_,__) => v.subject == post.author.did);
 59  if (overrideData?.skip || alreadyHandled) {
 60    const _suffix = overrideData?.skip ? "user is skipped in overrides" : "post was recently handled"
 61    if (opts.isReport) await agent.tools.ozone.moderation.emitEvent({
 62      event: {
 63        $type: "tools.ozone.moderation.defs#modEventEscalate",
 64        comment: `[Automated] Escalated due to report (${_suffix})`,
 65      },
 66      subject: {
 67        $type: "com.atproto.repo.strongRef",
 68        uri: post.uri,
 69        cid: post.cid
 70      },
 71      createdBy: me.data.did,
 72    });
 73    return; // don't do anything for "skipped" users
 74  } else if (overrideData?.score && MISCELLANEOUS_LABEL) {
 75    // apply to "misc-or-bonus", which gets augmented and applied later
 76    scores["or:"+MISCELLANEOUS_LABEL] ??= 0
 77    scores["or:"+MISCELLANEOUS_LABEL] += overrideData.score
 78  }
 79
 80  const _badAltText = ["alt text", "screenshot"];
 81  const allTextList = [(post.record as AppBskyFeedPost.Record).text]
 82  // It would be nice if we could check records (quotes) here,
 83  // but that would hit the rate limit really fast.
 84  // Looking for the reply chain would do the same.
 85  // So neither of those can be done. Such a shame.
 86  const embed = (post.record as AppBskyFeedPost.Record).embed;
 87  if (AppBskyEmbedExternal.isMain(embed)) {
 88    allTextList.push(embed?.external.title)
 89    allTextList.push(embed?.external.description)
 90  } else if (AppBskyEmbedImages.isMain(embed)) {
 91    for (const image of embed?.images) {
 92      if (image.alt) allTextList.push(image.alt)
 93      if (!image.alt || _badAltText.includes(image.alt.toLowerCase())) {
 94        if (NO_ALT_TEXT_LABEL) scores[NO_ALT_TEXT_LABEL] = LABEL_THRESHOLD;
 95        if (DRY_RUN || VERBOSE) console.info(" * Missing alt text");
 96      } else if (DRY_RUN || VERBOSE) console.info(" * Has alt text for image");
 97    }
 98  } else if (AppBskyEmbedVideo.isMain(embed)) {
 99    if (embed?.alt) allTextList.push(embed?.alt)
100    if (!embed?.alt || _badAltText.includes(embed?.alt.toLowerCase())) {
101      if (NO_ALT_TEXT_LABEL) scores[NO_ALT_TEXT_LABEL] = LABEL_THRESHOLD;
102      if (DRY_RUN || VERBOSE) console.info(" * Missing alt text");
103    } else if (DRY_RUN || VERBOSE) console.info(" * Has alt text for video");
104  }
105  if (Object.keys(post.record).includes("bridgyOriginalText")) {
106    // deno-lint-ignore no-explicit-any
107    const bridgyOriginalText = (post.record as any).bridgyOriginalText;
108    allTextList.push(bridgyOriginalText)
109    // TODO: if BridgyFed ever adds a CW field, I can add it here
110  }
111  for (const tag in (post.record as AppBskyFeedPost.Record).tags) {
112    allTextList.push("#"+tag);
113  }
114  const allText = allTextList.join(" ");
115  const splitPost = allText.toLowerCase().split(WORD_SEPARATOR_PATTERN).filter((v,_,__) => v != "")
116
117  for (const entry of keywordList) {
118    // deno-lint-ignore no-inner-declarations no-var
119    var matchesLang: boolean|null = null;
120    inner: for (const keyword of entry.keywords) {
121      if (matchedKeywords[entry.label]?.join(", ").includes(keyword)) {
122        break inner
123      }
124      if (keyword.startsWith("-") && !keyword.startsWith("-$")) {
125        // keywords starting with - have a negative effect; the keyword _must not_ be present to match
126        const kw = keyword.substring(1);
127        if (splitPost.includes(kw)) {
128          break inner
129        } else if (allText.toLowerCase().includes(kw) && allText.toLowerCase().match(new RegExp(WSP_STRING+escapeRegExp(kw)+WSP_STRING))) {
130          // This'll match phrases, but it's a little more intensive so we'll do a light check first
131          break inner
132        }
133        continue;
134      }
135      if (keyword.startsWith("$")) {
136        // keywords starting with $ check self-labels (or language)
137        if (keyword.startsWith("$lang:")) {
138          matchesLang ??= false;
139          // language check
140          const value = keyword.replace("$lang:","");
141          // deno-lint-ignore no-explicit-any
142          if (((post.record as any).langs as string[] | undefined)?.includes(value)) {
143            matchesLang = true;
144          }
145        }
146        if (post.labels?.find((v,_,__) => v.val == keyword.substring(1))) {
147          scores[entry.label] ??= 0
148          scores[entry.label] += entry.score
149          matchedKeywords[entry.label] ??= []
150          matchedKeywords[entry.label].push(keyword)
151          break inner;
152        }
153      } else if (keyword.startsWith("-$lang:")) {
154        // prefix -$lang: negates language check: posts marked with this language 
155        // don't match the keyword (i.e. dem,-$lang:de)
156        const value = keyword.replace("-$lang:","");
157        // deno-lint-ignore no-explicit-any
158        if (((post.record as any).langs as string[] | undefined)?.includes(value)) {
159          break inner;
160        }
161      } else if (keyword.startsWith("-$")) {
162        if (post.labels?.find((v,_,__) => v.val == keyword.substring(1))) {
163          break inner;
164        }
165      }
166      if (matchesLang === false) break inner;
167
168      if (splitPost.includes(keyword)) {
169        scores[entry.label] ??= 0
170        scores[entry.label] += entry.score
171        matchedKeywords[entry.label] ??= []
172        matchedKeywords[entry.label].push(keyword)
173        break inner
174      } else if (allText.toLowerCase().includes(keyword) && allText.toLowerCase().match(new RegExp(WSP_STRING+escapeRegExp(keyword)+WSP_STRING))) {
175        // This'll match phrases, but it's a little more intensive so we'll do a light check first
176        scores[entry.label] ??= 0
177        // make them less potent
178        //scores[entry.label] += Math.max(entry.score-10,0)
179        scores[entry.label] += entry.score
180        matchedKeywords[entry.label] ??= []
181        matchedKeywords[entry.label].push(keyword)
182        break inner
183      }
184    }
185  }
186  for (const [fullLabel, score] of Object.entries(scores).filter(([k,_]) => k.endsWith("or:"))) {
187    // A label in keywords.tsv with the or: prefix, i.e. or:miscellaneous-uspol, indicates that
188    // if there are no labels to match it to, the prefixed label should be used.
189    // If there are other labels, it applies as bonus points instead.
190    const label = fullLabel.replace("or:","");
191    // The misc-or-bonus pseudo-label converts into bonus points, or into miscellaneous-uspol if there is no other category.
192    if ((Object.keys(scores).length >= 2 && !scores["bonus-points-only"]) || Object.keys(scores).length >= 3) {
193      // Another category! Make it bonus points
194      scores["bonus-points-only"] ??= 0
195      scores["bonus-points-only"] += score
196      matchedKeywords["bonus-points-only"] ??= []
197      matchedKeywords["bonus-points-only"].push(...matchedKeywords[fullLabel])
198    } else {
199      // No other category! Make it miscellaneous-uspol
200      scores[label] = score
201      matchedKeywords[label] = matchedKeywords[fullLabel]
202    }
203    delete scores[fullLabel];
204    delete matchedKeywords[fullLabel];
205  }
206  for (const [fullLabel, score] of Object.entries(scores).filter(([k,_]) => k.endsWith("bo:"))) {
207    // Applies bonus points to a specific label ONLY, and only if it already exists
208    // (so bonus points alone can't label a post)
209    // in other words, for a bo:<LABEL> to be considered, another keyword under
210    // <LABEL> has to already have matched
211    const label = fullLabel.replace("or:","");
212    if (scores[label] && scores[label] > 0) {
213      scores[label] += score;
214      matchedKeywords[label] ??= []; // shouldn't be necesssary
215      matchedKeywords[label].push(...matchedKeywords[fullLabel])
216    }
217    delete scores[fullLabel];
218    delete matchedKeywords[fullLabel];
219  }
220  for (const [fullLabel, _] of Object.entries(scores).filter(([k,_]) => k.includes(":"))) {
221    // Clean up any invalid-prefixed labels
222    delete scores[fullLabel];
223    delete matchedKeywords[fullLabel];
224  }
225  if (Object.keys(scores).length > 0 && Object.entries(scores).some(([_,v]) => v > 0)) {
226    if (Object.keys(scores).length == 1 && scores["bonus-points-only"]) {
227      if (!opts.isReport) return;
228      if (DRY_RUN) {
229        console.info(" * Escalated due to report");
230        console.info(" * Bonus points matched: "+matchedKeywords["bonus-points-only"].join(", "));
231      }
232      if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
233        event: {
234          $type: "tools.ozone.moderation.defs#modEventEscalate",
235          comment: `[Automated] Escalated due to report (only bonus points matched: ${matchedKeywords["bonus-points-only"].join(", ")})`,
236        },
237        subject: {
238          $type: "com.atproto.repo.strongRef",
239          uri: post.uri,
240          cid: post.cid
241        },
242        createdBy: me.data.did,
243      });
244    };
245    // Add share points to each category, if multiple categories are present:
246    if (Object.keys(scores).length >= 2) {
247      // if there are multiple entries...
248      for (const key in scores) {
249        // add CATEGORY_SHARE_POINTS (25 right now) to every entry
250        //scores.set(key, scores.get(key)! + CATEGORY_SHARE_POINTS)
251        if (key == "bonus-points-only") {continue}
252        if (scores["bonus-points-only"] > 0) {
253          scores[key] += scores["bonus-points-only"]
254        } else {
255          scores[key] += CATEGORY_SHARE_POINTS
256        }
257      }
258    }
259    // deno-lint-ignore no-empty
260    try {delete scores["bonus-points-only"]} finally {}
261    // Add bonus points for reports
262    if (opts.isReport) {
263      for (const key in scores) {
264        scores[key] += REPORT_BONUS_POINTS
265      }
266    }
267    // Add bonus for crawled threads/quotes
268    if (opts.crawled && opts.rootLabeled) {
269      if (VERBOSE) console.log("Reached by crawling:",post.uri)
270      for (const key in scores) {
271        scores[key] += CRAWLED_BONUS_POINTS
272      }
273    }
274    // deno-lint-ignore no-inner-declarations no-var
275    var comment = "[Automated] Confidence levels:";
276    const escalatingLabels: string[] = [];
277    const likelyLabels: string[] = [];
278    const certainLabels: string[] = [];
279    if (NO_ALT_TEXT_LABEL && scores[NO_ALT_TEXT_LABEL] && Object.keys(scores).length == 1) {
280      comment = "[Automated] Missing alt text!";
281      if (!certainLabels.includes(NO_ALT_TEXT_LABEL)) certainLabels.push(NO_ALT_TEXT_LABEL)
282    } else if (NO_ALT_TEXT_LABEL && scores[NO_ALT_TEXT_LABEL]) {
283      comment = "[Automated] Missing alt text!\r\nConfidence levels:"
284      if (!certainLabels.includes(NO_ALT_TEXT_LABEL)) certainLabels.push(NO_ALT_TEXT_LABEL)
285    }
286    for (const key in scores) {
287      if (key === undefined) continue;
288      if (NO_ALT_TEXT_LABEL && key == NO_ALT_TEXT_LABEL) continue;
289      const score = scores[key]!;
290      const kws = matchedKeywords[key] ?? ["unknown"];
291      comment += `\r\n${key}: ${score} (matched: ${kws.join(", ")}); `;
292      if (score >= LABEL_THRESHOLD) {
293        certainLabels.push(key)
294      } else if (score >= ESCALATE_AND_LABEL_THRESHOLD) {
295        likelyLabels.push(key)
296      } else if (score >= ESCALATION_THRESHOLD) {
297        escalatingLabels.push(key)
298      }
299    }
300    if (matchedKeywords["bonus-points-only"]) {
301      comment += `\r\nBonus point keywords: ${matchedKeywords["bonus-points-only"].join(", ")}`;
302    }
303    await retry(async () => await redis?.pipelineCommands([
304      ["MULTI"],
305      ["HSET", "alreadyHandled", post.uri, "true"],
306      ["HEXPIRE", "alreadyHandled", 86400, "NX", "FIELDS", 1, post.uri],
307      ["EXEC"],
308    ]), {maxRetries: 3});
309    if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
310      event: {
311        $type: "tools.ozone.moderation.defs#modEventTag",
312        add: ["auto-handled"],
313        remove: [],
314      },
315      subject: {
316        $type: "com.atproto.repo.strongRef",
317        uri: post.uri,
318        cid: post.cid
319      },
320      createdBy: me.data.did,
321    });
322    const combinedLabels = [...certainLabels, ...likelyLabels];
323    if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
324      event: {
325        $type: combinedLabels.length > 0
326          ? "tools.ozone.moderation.defs#modEventLabel"
327          : escalatingLabels.length > 0
328          ? "tools.ozone.moderation.defs#modEventEscalate"
329          : "tools.ozone.moderation.defs#modEventComment",
330        comment: comment,
331        createLabelVals: combinedLabels.length == 0 ? undefined : combinedLabels,
332        negateLabelVals: combinedLabels.length == 0 ? undefined : [],
333      },
334      subject: {
335        $type: "com.atproto.repo.strongRef",
336        uri: post.uri,
337        cid: post.cid
338      },
339      createdBy: me.data.did,
340    });
341    if (DRY_RUN && certainLabels.length > 0) console.info(" * Auto label:", comment)
342    else if (DRY_RUN && likelyLabels.length > 0) console.info(" * Auto label and escalate:", comment)
343    else if (DRY_RUN && escalatingLabels.length > 0) console.info(" * Escalate:", comment)
344    else if (DRY_RUN) console.info(" * No action:", comment)
345    if (certainLabels.length > 0) {
346      if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
347        event: {
348          $type: "tools.ozone.moderation.defs#modEventAcknowledge",
349        },
350        subject: {
351          $type: "com.atproto.repo.strongRef",
352          uri: post.uri,
353          cid: post.cid
354        },
355        createdBy: me.data.did,
356      });
357    } else if (likelyLabels.length > 0) {
358      if (!DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
359        event: {
360          $type: "tools.ozone.moderation.defs#modEventEscalate",
361          comment: "[Automated] Escalated due to labels: "+likelyLabels.join(",")
362        },
363        subject: {
364          $type: "com.atproto.repo.strongRef",
365          uri: post.uri,
366          cid: post.cid
367        },
368        createdBy: me.data.did,
369      });
370    }
371    if (CRAWL_THREAD && !opts.crawled) try {
372      if (await hasAlreadyHandled((post.record as AppBskyFeedPost.Record).reply?.parent?.uri??"__defnot__") == true) return;
373      // If the parent has already/recently been handled, there's not
374      // much use in crawling the thread again.
375      const thread = await publicAgent.app.bsky.feed.getPostThread({
376        uri: post.uri,
377        // depth: 6,
378        // parentHeight: 80,
379      });
380      if (thread.success) {
381        const allPosts: AppBskyFeedDefs.PostView[] = [];
382        // deno-lint-ignore no-inner-declarations
383        function crawl(threadPost: AppBskyFeedDefs.ThreadViewPost) {
384          /// Crawls just the parent and replies fields.
385          /// The "post" field is added next to crawl().
386          if (AppBskyFeedDefs.isThreadViewPost(threadPost.parent)) {
387            allPosts.push(threadPost.parent.post);
388            crawl(threadPost.parent);
389          }
390          for (const reply of (threadPost.replies??[])) {
391            if (AppBskyFeedDefs.isThreadViewPost(reply)) {
392              allPosts.push(reply.post);
393              crawl(reply);
394            }
395          }
396        }
397        // Not adding allPosts.push(thread.post) here,
398        // because we already did that one
399        if (AppBskyFeedDefs.isThreadViewPost(thread.data.thread)) {
400          crawl(thread.data.thread)
401        }
402        for (const threadPost of allPosts) {
403          const rootLabeled = certainLabels.length > 0;
404          await auditPostForKeywords(threadPost, agent, me, {
405            crawled: true,
406            rootLabeled: rootLabeled,
407            isReport: false, //opts.isReport,
408          });
409        }
410      }
411    } catch(e) {
412      console.error("Failed while crawling %s:", post.uri, e)
413    }
414    return
415  }
416  if (opts.isReport && DRY_RUN) console.info(" * Escalated due to report");
417  if (opts.isReport && !DRY_RUN) await agent.tools.ozone.moderation.emitEvent({
418    event: {
419      $type: "tools.ozone.moderation.defs#modEventEscalate",
420      comment: "[Automated] Escalated due to report",
421    },
422    subject: {
423      $type: "com.atproto.repo.strongRef",
424      uri: post.uri,
425      cid: post.cid
426    },
427    createdBy: me.data.did,
428  });
429}