commit 10f5682ae83503e212a0bc34ede5dbb358301eb4 · skywatch.blue/skywatch-automod

+6 -2

.claude/settings.local.json

··· 5 5 "mcp__git-mcp-server__git_add", 6 6 "mcp__git-mcp-server__git_commit", 7 7 "mcp__git-mcp-server__git_push", 8 - "mcp__github__create_pull_request" 8 + "mcp__github__create_pull_request", 9 + "mcp__git-mcp-server__git_diff", 10 + "mcp__git-mcp-server__git_status", 11 + "mcp__git-mcp-server__git_log", 12 + "mcp__git-mcp-server__git_set_working_dir" 9 13 ], 10 14 "deny": [], 11 15 "ask": [] ··· 14 18 "enabledMcpjsonServers": [ 15 19 "git-mcp-server" 16 20 ] 17 - } 21 + }

-78

PLAN.md

··· 1 - # Implementation Plan: Replace lande with franc 2 - 3 - ## Overview 4 - Replace the `lande` library with `franc` for language detection in the `getLanguage` function located in `src/utils.ts`. 5 - 6 - ## Current State Analysis 7 - - **Current Library**: `lande` v1.0.10 8 - - **Function Location**: `src/utils.ts:67-92` 9 - - **Current Implementation**: 10 - - Uses dynamic import: `const lande = (await import("lande")).default;` 11 - - Returns a probability map sorted by likelihood 12 - - Returns the language code with highest probability 13 - - Defaults to "eng" for empty or invalid input 14 - 15 - ## Implementation Steps 16 - 17 - ### 1. Research & Dependencies 18 - - **franc** is a natural language detection library similar to `lande` 19 - - Supports 187 languages (ISO 639-3 codes) 20 - - Smaller footprint and better maintained than `lande` 21 - - Returns ISO 639-3 codes (3-letter codes like "eng", "fra", "spa") 22 - 23 - ### 2. Code Changes Required 24 - 25 - #### Step 2.1: Update package.json 26 - - Remove: `"lande": "^1.0.10"` 27 - - Add: `"franc": "^6.2.0"` (latest stable version) 28 - 29 - #### Step 2.2: Modify getLanguage function 30 - ```typescript 31 - // Before (lines 82-92) 32 - const lande = (await import("lande")).default; 33 - let langsProbabilityMap = lande(profileText); 34 - langsProbabilityMap.sort(...); 35 - return langsProbabilityMap[0][0]; 36 - 37 - // After 38 - const { franc } = await import("franc"); 39 - const detectedLang = franc(profileText); 40 - return detectedLang === "und" ? "eng" : detectedLang; 41 - ``` 42 - 43 - ### 3. Key Differences & Considerations 44 - 45 - #### API Differences: 46 - - **lande**: Returns array of `[language, probability]` tuples 47 - - **franc**: Returns single language code or "und" (undetermined) 48 - 49 - #### Return Values: 50 - - Both libraries use ISO 639-3 codes (3-letter codes) 51 - - franc returns "und" for undetermined text (we'll map to "eng" default) 52 - 53 - ### 4. Testing Strategy 54 - 1. 
Test with empty string → should return "eng" 55 - 2. Test with invalid input (null/undefined) → should return "eng" 56 - 3. Test with English text → should return "eng" 57 - 4. Test with other language samples → verify correct detection 58 - 5. Test with mixed language text → verify reasonable detection 59 - 60 - ### 5. Rollback Plan 61 - If issues arise: 62 - 1. Keep the original `lande` code commented 63 - 2. Can quickly revert by uncommenting old code and reinstalling `lande` 64 - 65 - ## Implementation Order 66 - 1. ✅ Analyze current implementation 67 - 2. ✅ Research franc library compatibility 68 - 3. 📝 Create this implementation plan 69 - 4. Update package.json to replace lande with franc 70 - 5. Modify getLanguage function in src/utils.ts 71 - 6. Run lint and format checks 72 - 7. Test the changes manually or with existing tests 73 - 74 - ## Risk Assessment 75 - - **Low Risk**: Direct replacement with similar functionality 76 - - **Compatibility**: Both libraries use ISO 639-3 codes 77 - - **Performance**: franc is generally faster and lighter 78 - - **Maintenance**: franc is more actively maintained

-1

PRD.md

··· 1 - Replace lande with franc for language detection in the exported async function `getLanguage`, which is found in the file `src/utils.ts`.

+1 -2

src/agent.ts

··· 1 - import { AtpAgent } from "@atproto/api"; 2 1 import { setGlobalDispatcher, Agent as Agent } from "undici"; 3 - 4 2 setGlobalDispatcher(new Agent({ connect: { timeout: 20_000 } })); 5 3 import { BSKY_HANDLE, BSKY_PASSWORD, OZONE_PDS } from "./config.js"; 4 + import { AtpAgent } from "@atproto/api"; 6 5 7 6 export const agent = new AtpAgent({ 8 7 service: `https://${OZONE_PDS}`,

+10 -20

src/checkHandles.ts

··· 11 11 handle: string, 12 12 time: number, 13 13 ) => { 14 - // Get a list of labels 15 - const labels: string[] = Array.from( 16 - HANDLE_CHECKS, 17 - (handleCheck) => handleCheck.label, 18 - ); 19 - 20 - // iterate through the labels 21 - labels.forEach((label) => { 22 - const checkList = HANDLE_CHECKS.find( 23 - (handleCheck) => handleCheck.label === label, 24 - ); 25 - 26 - if (checkList?.ignoredDIDs) { 14 + // iterate through the checks 15 + HANDLE_CHECKS.forEach((checkList) => { 16 + if (checkList.ignoredDIDs) { 27 17 if (checkList.ignoredDIDs.includes(did)) { 28 18 logger.info(`Whitelisted DID: ${did}`); 29 19 return; 30 20 } 31 21 } 32 22 33 - if (checkList!.check.test(handle)) { 23 + if (checkList.check.test(handle)) { 34 24 // False-positive checks 35 - if (checkList?.whitelist) { 36 - if (checkList?.whitelist.test(handle)) { 25 + if (checkList.whitelist) { 26 + if (checkList.whitelist.test(handle)) { 37 27 logger.info(`Whitelisted phrase found for: ${handle}`); 38 28 return; 39 29 } 40 30 } 41 31 42 - if (checkList?.toLabel === true) { 32 + if (checkList.toLabel === true) { 43 33 logger.info(`[CHECKHANDLE]: Labeling ${did} for ${checkList.label}`); 44 34 { 45 35 createAccountLabel( 46 36 did, 47 - checkList.label, 37 + `${checkList.label}`, 48 38 `${time}: ${checkList.comment} - ${handle}`, 49 39 ); 50 40 } 51 41 } 52 42 53 - if (checkList?.reportAcct === true) { 43 + if (checkList.reportAcct === true) { 54 44 logger.info(`[CHECKHANDLE]: Reporting ${did} for ${checkList.label}`); 55 45 createAccountReport(did, `${time}: ${checkList.comment} - ${handle}`); 56 46 } 57 47 58 - if (checkList?.commentAcct === true) { 48 + if (checkList.commentAcct === true) { 59 49 logger.info( 60 50 `[CHECKHANDLE]: Commenting on ${did} for ${checkList.label}`, 61 51 );

+26 -35

src/checkPosts.ts

··· 1 1 import { LINK_SHORTENER, POST_CHECKS } from "./constants.js"; 2 + import { Post } from "./types.js"; 3 + import logger from "./logger.js"; 2 4 import { countStarterPacks } from "./count.js"; 3 - import logger from "./logger.js"; 4 5 import { 5 6 createPostLabel, 6 7 createAccountReport, 7 8 createAccountComment, 8 9 createPostReport, 9 10 } from "./moderation.js"; 10 - import type { Post } from "./types.js"; 11 11 import { getFinalUrl, getLanguage } from "./utils.js"; 12 12 13 13 export const checkPosts = async (post: Post[]) => { 14 - // Get a list of labels 15 - const labels: string[] = Array.from( 16 - POST_CHECKS, 17 - (postCheck) => postCheck.label, 18 - ); 19 - 20 14 const urlRegex = /https?:\/\/[^\s]+/g; 21 15 22 16 // Check for link shorteners ··· 45 39 // Get the post's language 46 40 const lang = await getLanguage(post[0].text); 47 41 48 - // iterate through the labels 49 - labels.forEach((label) => { 50 - const checkPost = POST_CHECKS.find( 51 - (postCheck) => postCheck.label === label, 52 - ); 53 - 54 - if (checkPost?.language || checkPost?.language !== undefined) { 55 - if (!checkPost?.language.includes(lang)) { 42 + // iterate through the checks 43 + POST_CHECKS.forEach((checkPost) => { 44 + if (checkPost.language) { 45 + if (!checkPost.language.includes(lang)) { 56 46 return; 57 47 } 58 48 } 59 49 60 - if (checkPost?.ignoredDIDs) { 61 - if (checkPost?.ignoredDIDs.includes(post[0].did)) { 50 + if (checkPost.ignoredDIDs) { 51 + if (checkPost.ignoredDIDs.includes(post[0].did)) { 62 52 logger.info(`[CHECKPOSTS]: Whitelisted DID: ${post[0].did}`); 63 53 return; 64 54 } 65 55 } 66 56 67 - if (checkPost!.check.test(post[0].text)) { 57 + if (checkPost.check.test(post[0].text)) { 68 58 // Check if post is whitelisted 69 - if (checkPost?.whitelist) { 70 - if (checkPost?.whitelist.test(post[0].text)) { 71 - logger.info("[CHECKPOSTS]: Whitelisted phrase found\""); 59 + if (checkPost.whitelist) { 60 + if (checkPost.whitelist.test(post[0].text)) { 61 + 
logger.info(`[CHECKPOSTS]: Whitelisted phrase found"`); 72 62 return; 73 63 } 74 64 } 75 65 76 66 countStarterPacks(post[0].did, post[0].time); 77 67 78 - if (checkPost!.toLabel) { 68 + if (checkPost.toLabel === true) { 79 69 logger.info( 80 - `[CHECKPOSTS]: Labeling ${post[0].atURI} for ${checkPost!.label}`, 70 + `[CHECKPOSTS]: Labeling ${post[0].atURI} for ${checkPost.label}`, 81 71 ); 82 72 createPostLabel( 83 73 post[0].atURI, 84 74 post[0].cid, 85 - checkPost!.label, 86 - `${post[0].time}: ${checkPost!.comment} at ${post[0].atURI} with text "${post[0].text}"`, 75 + `${checkPost.label}`, 76 + `${post[0].time}: ${checkPost.comment} at ${post[0].atURI} with text "${post[0].text}"`, 77 + checkPost.duration, 87 78 ); 88 79 } 89 80 90 - if (checkPost!.reportPost === true) { 81 + if (checkPost.reportPost === true) { 91 82 logger.info( 92 - `[CHECKPOSTS]: Reporting ${post[0].atURI} for ${checkPost!.label}`, 83 + `[CHECKPOSTS]: Reporting ${post[0].atURI} for ${checkPost.label}`, 93 84 ); 94 85 logger.info(`Reporting: ${post[0].atURI}`); 95 86 createPostReport( 96 87 post[0].atURI, 97 88 post[0].cid, 98 - `${post[0].time}: ${checkPost!.comment} at ${post[0].atURI} with text "${post[0].text}"`, 89 + `${post[0].time}: ${checkPost.comment} at ${post[0].atURI} with text "${post[0].text}"`, 99 90 ); 100 91 } 101 92 102 - if (checkPost!.reportAcct) { 93 + if (checkPost.reportAcct === true) { 103 94 logger.info( 104 - `[CHECKPOSTS]: Reporting on ${post[0].did} for ${checkPost!.label} in ${post[0].atURI}`, 95 + `[CHECKPOSTS]: Reporting on ${post[0].did} for ${checkPost.label} in ${post[0].atURI}`, 105 96 ); 106 97 createAccountReport( 107 98 post[0].did, 108 - `${post[0].time}: ${checkPost?.comment} at ${post[0].atURI} with text "${post[0].text}"`, 99 + `${post[0].time}: ${checkPost.comment} at ${post[0].atURI} with text "${post[0].text}"`, 109 100 ); 110 101 } 111 102 112 - if (checkPost!.commentAcct) { 103 + if (checkPost.commentAcct === true) { 113 104 logger.info( 114 - 
`[CHECKPOSTS]: Commenting on ${post[0].did} for ${checkPost!.label} in ${post[0].atURI}`, 105 + `[CHECKPOSTS]: Commenting on ${post[0].did} for ${checkPost.label} in ${post[0].atURI}`, 115 106 ); 116 107 createAccountComment( 117 108 post[0].did, 118 - `${post[0].time}: ${checkPost?.comment} at ${post[0].atURI} with text "${post[0].text}"`, 109 + `${post[0].time}: ${checkPost.comment} at ${post[0].atURI} with text "${post[0].text}"`, 119 110 ); 120 111 } 121 112 }

+22 -41

src/checkProfiles.ts

··· 16 16 ) => { 17 17 const lang = await getLanguage(description); 18 18 19 - const labels: string[] = Array.from( 20 - PROFILE_CHECKS, 21 - (profileCheck) => profileCheck.label, 22 - ); 23 - 24 - // iterate through the labels 25 - labels.forEach((label) => { 26 - const checkProfiles = PROFILE_CHECKS.find( 27 - (profileCheck) => profileCheck.label === label, 28 - ); 29 - 30 - if (checkProfiles?.language || checkProfiles?.language !== undefined) { 31 - if (!checkProfiles?.language.includes(lang)) { 19 + // iterate through the checks 20 + PROFILE_CHECKS.forEach((checkProfiles) => { 21 + if (checkProfiles.language) { 22 + if (!checkProfiles.language.includes(lang)) { 32 23 return; 33 24 } 34 25 } 35 26 36 27 // Check if DID is whitelisted 37 - if (checkProfiles?.ignoredDIDs) { 28 + if (checkProfiles.ignoredDIDs) { 38 29 if (checkProfiles.ignoredDIDs.includes(did)) { 39 30 logger.info(`[CHECKDESCRIPTION]: Whitelisted DID: ${did}`); 40 31 return; ··· 42 33 } 43 34 44 35 if (description) { 45 - if (checkProfiles?.description === true) { 36 + if (checkProfiles.description === true) { 46 37 if (checkProfiles.check.test(description)) { 47 38 // Check if description is whitelisted 48 39 if (checkProfiles.whitelist) { 49 40 if (checkProfiles.whitelist.test(description)) { 50 - logger.info("[CHECKDESCRIPTION]: Whitelisted phrase found."); 41 + logger.info(`[CHECKDESCRIPTION]: Whitelisted phrase found.`); 51 42 return; 52 43 } 53 44 } 54 45 55 - if (checkProfiles.toLabel) { 46 + if (checkProfiles.toLabel === true) { 56 47 createAccountLabel( 57 48 did, 58 - checkProfiles.label, 49 + `${checkProfiles.label}`, 59 50 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`, 60 51 ); 61 52 logger.info( ··· 63 54 ); 64 55 } 65 56 66 - if (checkProfiles.reportAcct) { 57 + if (checkProfiles.reportAcct === true) { 67 58 createAccountReport( 68 59 did, 69 60 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`, ··· 73 64 ); 74 65 } 75 66 76 - if 
(checkProfiles.commentAcct) { 67 + if (checkProfiles.commentAcct === true) { 77 68 createAccountComment( 78 69 did, 79 70 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`, ··· 96 87 ) => { 97 88 const lang = await getLanguage(description); 98 89 99 - // Get a list of labels 100 - const labels: string[] = Array.from( 101 - PROFILE_CHECKS, 102 - (profileCheck) => profileCheck.label, 103 - ); 104 - 105 - // iterate through the labels 106 - labels.forEach((label) => { 107 - const checkProfiles = PROFILE_CHECKS.find( 108 - (profileCheck) => profileCheck.label === label, 109 - ); 110 - 111 - if (checkProfiles?.language || checkProfiles?.language !== undefined) { 112 - if (!checkProfiles?.language.includes(lang)) { 90 + // iterate through the checks 91 + PROFILE_CHECKS.forEach((checkProfiles) => { 92 + if (checkProfiles.language) { 93 + if (!checkProfiles.language.includes(lang)) { 113 94 return; 114 95 } 115 96 } 116 97 117 98 // Check if DID is whitelisted 118 - if (checkProfiles?.ignoredDIDs) { 99 + if (checkProfiles.ignoredDIDs) { 119 100 if (checkProfiles.ignoredDIDs.includes(did)) { 120 101 logger.info(`[CHECKDISPLAYNAME]: Whitelisted DID: ${did}`); 121 102 return; ··· 123 104 } 124 105 125 106 if (displayName) { 126 - if (checkProfiles?.displayName === true) { 107 + if (checkProfiles.displayName === true) { 127 108 if (checkProfiles.check.test(displayName)) { 128 109 // Check if displayName is whitelisted 129 110 if (checkProfiles.whitelist) { 130 111 if (checkProfiles.whitelist.test(displayName)) { 131 - logger.info("[CHECKDISPLAYNAME]: Whitelisted phrase found."); 112 + logger.info(`[CHECKDISPLAYNAME]: Whitelisted phrase found.`); 132 113 return; 133 114 } 134 115 } 135 116 136 - if (checkProfiles.toLabel) { 117 + if (checkProfiles.toLabel === true) { 137 118 createAccountLabel( 138 119 did, 139 - checkProfiles.label, 120 + `${checkProfiles.label}`, 140 121 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`, 141 122 ); 142 
123 logger.info( ··· 144 125 ); 145 126 } 146 127 147 - if (checkProfiles.reportAcct) { 128 + if (checkProfiles.reportAcct === true) { 148 129 createAccountReport( 149 130 did, 150 131 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`, ··· 154 135 ); 155 136 } 156 137 157 - if (checkProfiles.commentAcct) { 138 + if (checkProfiles.commentAcct === true) { 158 139 createAccountComment( 159 140 did, 160 141 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`,

+8 -8

src/checkStarterPack.ts

··· 26 26 // Check if DID is whitelisted 27 27 if (checkProfiles?.ignoredDIDs) { 28 28 if (checkProfiles.ignoredDIDs.includes(did)) { 29 - logger.info(`Whitelisted DID: ${did}`); return; 29 + return logger.info(`Whitelisted DID: ${did}`); 30 30 } 31 31 } 32 32 ··· 36 36 logger.info(`Account joined via starter pack at: ${atURI}`); 37 37 createAccountLabel( 38 38 did, 39 - checkProfiles.label, 40 - `${time}: ${checkProfiles.comment} - Account joined via starter pack at: ${atURI}`, 39 + `${checkProfiles!.label}`, 40 + `${time}: ${checkProfiles!.comment} - Account joined via starter pack at: ${atURI}`, 41 41 ); 42 42 } 43 43 } ··· 65 65 createPostLabel( 66 66 atURI, 67 67 cid, 68 - checkList.label, 69 - `${time}: Starter pack created by known vector for ${checkList.label} at: ${atURI}"`, 68 + `${checkList!.label}`, 69 + `${time}: Starter pack created by known vector for ${checkList!.label} at: ${atURI}"`, 70 70 ); 71 71 createAccountReport( 72 72 did, 73 - `${time}: Starter pack created by known vector for ${checkList.label} at: ${atURI}"`, 73 + `${time}: Starter pack created by known vector for ${checkList!.label} at: ${atURI}"`, 74 74 ); 75 75 } 76 76 ··· 80 80 createPostLabel( 81 81 atURI, 82 82 cid, 83 - checkList!.label, 83 + `${checkList!.label}`, 84 84 `${time}: ${checkList!.comment} at ${atURI} with text "${description}"`, 85 85 ); 86 86 createAccountReport( ··· 96 96 createPostLabel( 97 97 atURI, 98 98 cid, 99 - checkList!.label, 99 + `${checkList!.label}`, 100 100 `${time}: ${checkList!.comment} at ${atURI} with pack name "${packName}"`, 101 101 ); 102 102 createAccountReport(

+2 -2

src/config.ts

··· 20 20 export const CURSOR_UPDATE_INTERVAL = process.env.CURSOR_UPDATE_INTERVAL 21 21 ? Number(process.env.CURSOR_UPDATE_INTERVAL) 22 22 : 60000; 23 - export const { LABEL_LIMIT } = process.env; 24 - export const { LABEL_LIMIT_WAIT } = process.env; 23 + export const LABEL_LIMIT = process.env.LABEL_LIMIT; 24 + export const LABEL_LIMIT_WAIT = process.env.LABEL_LIMIT_WAIT;

+1 -1

src/count.ts

··· 1 1 import { isLoggedIn, agent } from "./agent.js"; 2 - import { limit } from "./limits.js"; 3 2 import logger from "./logger.js"; 3 + import { limit } from "./limits.js"; 4 4 import { createAccountLabel } from "./moderation.js"; 5 5 6 6 export const countStarterPacks = async (did: string, time: number) => {

+4 -8

src/developing_checks.md

··· 1 1 # How to build checks for skywatch-automod 2 2 3 3 ## Introduction 4 - 5 4 Constants.ts defines three types of checks: `HANDLE_CHECKS`, `POST_CHECKS`, and `PROFILE_CHECKS`. 6 5 7 6 For each check, users need to define a set of regular expressions that will be used to match against the content of the post, handle, or profile. A maximal example of a check is as follows: ··· 9 8 ```typescript 10 9 export const HANDLE_CHECKS: Checks[] = [ 11 10 { 12 - language: "[eng]", // Language of the check. If the check language does not match the content language, the check will be skipped. Assign null or remove field to apply to all languages. 13 11 label: "example", 14 12 comment: "Example found in handle", 15 13 description: true, // Optional, only used in profile checks 16 14 displayName: true, // Optional, only used in profile checks 17 - reportAcct: false, // if true, the check will only report the content against the account, not label. 18 - reportPost: false, // if true, the check will only report the content against the post, not label. Only used in post checks. 19 - commentOnly: false, // if true, will generate an account level comment from flagged posts, rather than a report. Intended for use when reportAcct is false, and on posts only where the flag may generate a high volume of reports. 20 - toLabel: true, // Should the handle in question be labeled if check evaluates to true. 15 + reportOnly: false, // if true, the check will only report the content against the account, not label. 16 + commentOnly: false, // Poorly named, if true, will generate an account level comment from flagged posts, rather than a report. Intended for use when reportOnly is false, and on posts only where the flag may generate a high volume of reports. 
21 17 check: new RegExp("example", "i"), // Regular expression to match against the content 22 18 whitelist: new RegExp("example.com", "i"), // Optional, regular expression to whitelist content 23 - ignoredDIDs: ["did:plc:example"], // Optional, array of DIDs to ignore if they match the check. Useful for folks who reclaim words or accounts which may be false positives. 24 - }, 19 + ignoredDIDs: ["did:plc:example"] // Optional, array of DIDs to ignore if they match the check. Useful for folks who reclaim words. 20 + } 25 21 ]; 26 22 ``` 27 23

+1 -1

src/homoglyphs.ts

··· 1 - 1 + /* eslint-disable no-misleading-character-class */ 2 2 3 3 export const homoglyphMap: Record<string, string> = { 4 4 // Confusables for 'a'

+7 -7

src/logger.ts

··· 5 5 transport: 6 6 process.env.NODE_ENV !== "production" 7 7 ? { 8 - target: "pino-pretty", 9 - options: { 10 - colorize: true, 11 - translateTime: "SYS:standard", 12 - ignore: "pid,hostname", 13 - }, 14 - } 8 + target: "pino-pretty", 9 + options: { 10 + colorize: true, 11 + translateTime: "SYS:standard", 12 + ignore: "pid,hostname", 13 + }, 14 + } 15 15 : undefined, 16 16 timestamp: pino.stdTimeFunctions.isoTime, 17 17 });

+27 -31

src/main.ts

··· 1 - import fs from "node:fs"; 2 - 3 - import type { 1 + import { 4 2 CommitCreateEvent, 3 + CommitUpdate, 5 4 CommitUpdateEvent, 6 - IdentityEvent } from "@skyware/jetstream"; 7 - import { 8 - CommitUpdate, 5 + IdentityEvent, 9 6 Jetstream, 10 7 } from "@skyware/jetstream"; 11 - 8 + import fs from "node:fs"; 12 9 13 - import { checkHandle } from "./checkHandles.js"; 14 - import { checkPosts } from "./checkPosts.js"; 15 - import { checkDescription, checkDisplayName } from "./checkProfiles.js"; 16 - import { checkStarterPack, checkNewStarterPack } from "./checkStarterPack.js"; 17 10 import { 18 11 CURSOR_UPDATE_INTERVAL, 19 12 FIREHOSE_URL, ··· 22 15 } from "./config.js"; 23 16 import logger from "./logger.js"; 24 17 import { startMetricsServer } from "./metrics.js"; 25 - import type { Post, LinkFeature } from "./types.js"; 26 - import { Handle } from "./types.js"; 18 + import { Post, LinkFeature, Handle } from "./types.js"; 19 + import { checkPosts } from "./checkPosts.js"; 20 + import { checkHandle } from "./checkHandles.js"; 21 + import { checkStarterPack, checkNewStarterPack } from "./checkStarterPack.js"; 22 + import { checkDescription, checkDisplayName } from "./checkProfiles.js"; 27 23 28 24 let cursor = 0; 29 25 let cursorUpdateInterval: NodeJS.Timeout; ··· 52 48 const jetstream = new Jetstream({ 53 49 wantedCollections: WANTED_COLLECTION, 54 50 endpoint: FIREHOSE_URL, 55 - cursor, 51 + cursor: cursor, 56 52 }); 57 53 58 54 jetstream.on("open", () => { ··· 109 105 if (hasLinkType) { 110 106 const urls = event.commit.record 111 107 .facets!.flatMap((facet) => 112 - facet.features.filter( 113 - (feature) => feature.$type === "app.bsky.richtext.facet#link", 114 - ), 115 - ) 108 + facet.features.filter( 109 + (feature) => feature.$type === "app.bsky.richtext.facet#link", 110 + ), 111 + ) 116 112 .map((feature: LinkFeature) => feature.uri); 117 113 118 114 urls.forEach((url) => { ··· 121 117 did: event.did, 122 118 time: event.time_us, 123 119 rkey: 
event.commit.rkey, 124 - atURI, 120 + atURI: atURI, 125 121 text: url, 126 122 cid: event.commit.cid, 127 123 }, ··· 137 133 did: event.did, 138 134 time: event.time_us, 139 135 rkey: event.commit.rkey, 140 - atURI, 136 + atURI: atURI, 141 137 text: event.commit.record.text, 142 138 cid: event.commit.cid, 143 139 }, ··· 146 142 } 147 143 148 144 if (hasEmbed) { 149 - const { embed } = event.commit.record; 145 + const embed = event.commit.record.embed; 150 146 if (embed && embed.$type === "app.bsky.embed.external") { 151 147 const posts: Post[] = [ 152 148 { 153 149 did: event.did, 154 150 time: event.time_us, 155 151 rkey: event.commit.rkey, 156 - atURI, 152 + atURI: atURI, 157 153 text: embed.external.uri, 158 154 cid: event.commit.cid, 159 155 }, ··· 168 164 did: event.did, 169 165 time: event.time_us, 170 166 rkey: event.commit.rkey, 171 - atURI, 167 + atURI: atURI, 172 168 text: embed.media.external.uri, 173 169 cid: event.commit.cid, 174 170 }, ··· 189 185 checkDescription( 190 186 event.did, 191 187 event.time_us, 192 - event.commit.record.displayName!, 193 - event.commit.record.description!, 188 + event.commit.record.displayName as string, 189 + event.commit.record.description as string, 194 190 ); 195 191 checkDisplayName( 196 192 event.did, 197 193 event.time_us, 198 - event.commit.record.displayName!, 199 - event.commit.record.description!, 194 + event.commit.record.displayName as string, 195 + event.commit.record.description as string, 200 196 ); 201 197 } 202 198 ··· 223 219 checkDescription( 224 220 event.did, 225 221 event.time_us, 226 - event.commit.record.displayName!, 227 - event.commit.record.description!, 222 + event.commit.record.displayName as string, 223 + event.commit.record.description as string, 228 224 ); 229 225 checkDisplayName( 230 226 event.did, 231 227 event.time_us, 232 - event.commit.record.displayName!, 233 - event.commit.record.description!, 228 + event.commit.record.displayName as string, 229 + event.commit.record.description as 
string, 234 230 ); 235 231 } 236 232

+151 -40

src/moderation.ts

··· 2 2 import { MOD_DID } from "./config.js"; 3 3 import { limit } from "./limits.js"; 4 4 import logger from "./logger.js"; 5 + import { LISTS } from "./lists.js"; 6 + 7 + const doesLabelExist = ( 8 + labels: { val: string }[] | undefined, 9 + labelVal: string, 10 + ): boolean => { 11 + if (!labels) { 12 + return false; 13 + } 14 + return labels.some((label) => label.val === labelVal); 15 + }; 5 16 6 17 export const createPostLabel = async ( 7 18 uri: string, 8 19 cid: string, 9 20 label: string, 10 21 comment: string, 22 + duration: number | undefined, 11 23 ) => { 12 24 await isLoggedIn; 25 + 26 + const hasLabel = await checkRecordLabels(uri, label); 27 + if (hasLabel) { 28 + logger.info( 29 + `Post ${uri} already has label ${label}, skipping`, 30 + ); 31 + return; 32 + } 33 + 13 34 await limit(async () => { 14 35 try { 15 - await agent.tools.ozone.moderation.emitEvent( 36 + const event: { 37 + $type: string; 38 + comment: string; 39 + createLabelVals: string[]; 40 + negateLabelVals: string[]; 41 + durationInHours?: number; 42 + } = { 43 + $type: "tools.ozone.moderation.defs#modEventLabel", 44 + comment: comment, 45 + createLabelVals: [label], 46 + negateLabelVals: [], 47 + }; 48 + 49 + if (duration) { 50 + event.durationInHours = duration; 51 + } 52 + 53 + return agent.tools.ozone.moderation.emitEvent( 16 54 { 17 - event: { 18 - $type: "tools.ozone.moderation.defs#modEventLabel", 19 - comment, 20 - createLabelVals: [label], 21 - negateLabelVals: [], 22 - }, 55 + event: event, 23 56 // specify the labeled post by strongRef 24 57 subject: { 25 58 $type: "com.atproto.repo.strongRef", 26 - uri, 27 - cid, 59 + uri: uri, 60 + cid: cid, 28 61 }, 29 62 // put in the rest of the metadata 30 63 createdBy: `${agent.did}`, 31 64 createdAt: new Date().toISOString(), 65 + modTool: { 66 + name: "skywatch/skywatch-automod", 67 + }, 32 68 }, 33 69 { 34 70 encoding: "application/json", 35 71 headers: { 36 - "atproto-proxy": `${MOD_DID}#atproto_labeler`, 72 + "atproto-proxy": 
`${MOD_DID!}#atproto_labeler`, 37 73 "atproto-accept-labelers": 38 74 "did:plc:ar7c4by46qjdydhdevvrndac;redact", 39 75 }, ··· 51 87 comment: string, 52 88 ) => { 53 89 await isLoggedIn; 90 + 91 + const hasLabel = await checkAccountLabels(did, label); 92 + if (hasLabel) { 93 + logger.info( 94 + `Account ${did} already has label ${label}, skipping`, 95 + ); 96 + return; 97 + } 98 + 54 99 await limit(async () => { 55 100 try { 56 101 await agent.tools.ozone.moderation.emitEvent( 57 102 { 58 103 event: { 59 104 $type: "tools.ozone.moderation.defs#modEventLabel", 60 - comment, 105 + comment: comment, 61 106 createLabelVals: [label], 62 107 negateLabelVals: [], 63 108 }, 64 109 // specify the labeled post by strongRef 65 110 subject: { 66 111 $type: "com.atproto.admin.defs#repoRef", 67 - did, 112 + did: did, 68 113 }, 69 114 // put in the rest of the metadata 70 115 createdBy: `${agent.did}`, 71 116 createdAt: new Date().toISOString(), 117 + modTool: { 118 + name: "skywatch/skywatch-automod", 119 + }, 72 120 }, 73 121 { 74 122 encoding: "application/json", 75 123 headers: { 76 - "atproto-proxy": `${MOD_DID}#atproto_labeler`, 124 + "atproto-proxy": `${MOD_DID!}#atproto_labeler`, 77 125 "atproto-accept-labelers": 78 126 "did:plc:ar7c4by46qjdydhdevvrndac;redact", 79 127 }, ··· 93 141 await isLoggedIn; 94 142 await limit(async () => { 95 143 try { 96 - await agent.tools.ozone.moderation.emitEvent( 144 + return agent.tools.ozone.moderation.emitEvent( 97 145 { 98 146 event: { 99 147 $type: "tools.ozone.moderation.defs#modEventReport", 100 - comment, 148 + comment: comment, 101 149 reportType: "com.atproto.moderation.defs#reasonOther", 102 150 }, 103 151 // specify the labeled post by strongRef 104 152 subject: { 105 153 $type: "com.atproto.repo.strongRef", 106 - uri, 107 - cid, 154 + uri: uri, 155 + cid: cid, 108 156 }, 109 157 // put in the rest of the metadata 110 158 createdBy: `${agent.did}`, 111 159 createdAt: new Date().toISOString(), 160 + modTool: { 161 + name: 
"skywatch/skywatch-automod", 162 + }, 112 163 }, 113 164 { 114 165 encoding: "application/json", 115 166 headers: { 116 - "atproto-proxy": `${MOD_DID}#atproto_labeler`, 167 + "atproto-proxy": `${MOD_DID!}#atproto_labeler`, 117 168 "atproto-accept-labelers": 118 169 "did:plc:ar7c4by46qjdydhdevvrndac;redact", 119 170 }, ··· 133 184 { 134 185 event: { 135 186 $type: "tools.ozone.moderation.defs#modEventComment", 136 - comment, 187 + comment: comment, 137 188 }, 138 189 // specify the labeled post by strongRef 139 190 subject: { 140 191 $type: "com.atproto.admin.defs#repoRef", 141 - did, 192 + did: did, 142 193 }, 143 194 // put in the rest of the metadata 144 195 createdBy: `${agent.did}`, 145 196 createdAt: new Date().toISOString(), 197 + modTool: { 198 + name: "skywatch/skywatch-automod", 199 + }, 146 200 }, 147 201 { 148 202 encoding: "application/json", 149 203 headers: { 150 - "atproto-proxy": `${MOD_DID}#atproto_labeler`, 204 + "atproto-proxy": `${MOD_DID!}#atproto_labeler`, 151 205 "atproto-accept-labelers": 152 206 "did:plc:ar7c4by46qjdydhdevvrndac;redact", 153 207 }, ··· 167 221 { 168 222 event: { 169 223 $type: "tools.ozone.moderation.defs#modEventReport", 170 - comment, 224 + comment: comment, 171 225 reportType: "com.atproto.moderation.defs#reasonOther", 172 226 }, 173 227 // specify the labeled post by strongRef 174 228 subject: { 175 229 $type: "com.atproto.admin.defs#repoRef", 176 - did, 230 + did: did, 177 231 }, 178 232 // put in the rest of the metadata 179 233 createdBy: `${agent.did}`, 180 234 createdAt: new Date().toISOString(), 235 + modTool: { 236 + name: "skywatch/skywatch-automod", 237 + }, 181 238 }, 182 239 { 183 240 encoding: "application/json", 184 241 headers: { 185 - "atproto-proxy": `${MOD_DID}#atproto_labeler`, 242 + "atproto-proxy": `${MOD_DID!}#atproto_labeler`, 186 243 "atproto-accept-labelers": 187 244 "did:plc:ar7c4by46qjdydhdevvrndac;redact", 188 245 }, ··· 194 251 }); 195 252 }; 196 253 197 - export async function 
checkAccountLabels(did: string) { 198 - /* try { 199 - const repo = await limit(() => 200 - agent.tools.ozone.moderation.getRepo( 254 + export const checkAccountLabels = async ( 255 + did: string, 256 + label: string, 257 + ): Promise<boolean> => { 258 + await isLoggedIn; 259 + return await limit(async () => { 260 + try { 261 + const response = await agent.tools.ozone.moderation.getRepo( 262 + { did }, 201 263 { 202 - did: did, 264 + headers: { 265 + "atproto-proxy": `${MOD_DID!}#atproto_labeler`, 266 + "atproto-accept-labelers": 267 + "did:plc:ar7c4by46qjdydhdevvrndac;redact", 268 + }, 203 269 }, 270 + ); 271 + 272 + return doesLabelExist(response.data.labels, label); 273 + } catch (e) { 274 + logger.error(`Failed to check account labels for ${did} with error: ${e}`); 275 + return false; 276 + } 277 + }); 278 + }; 279 + 280 + export const checkRecordLabels = async ( 281 + uri: string, 282 + label: string, 283 + ): Promise<boolean> => { 284 + await isLoggedIn; 285 + return await limit(async () => { 286 + try { 287 + const response = await agent.tools.ozone.moderation.getRecord( 288 + { uri }, 204 289 { 205 290 headers: { 206 291 "atproto-proxy": `${MOD_DID!}#atproto_labeler`, ··· 208 293 "did:plc:ar7c4by46qjdydhdevvrndac;redact", 209 294 }, 210 295 }, 211 - ), 296 + ); 297 + 298 + return doesLabelExist(response.data.labels, label); 299 + } catch (e) { 300 + logger.error(`Failed to check record labels for ${uri} with error: ${e}`); 301 + return false; 302 + } 303 + }); 304 + }; 305 + 306 + export const addToList = async (label: string, did: string) => { 307 + await isLoggedIn; 308 + 309 + const newList = LISTS.find((list) => list.label === label); 310 + if (!newList) { 311 + logger.warn( 312 + `List not found for ${label}. 
Likely a label not associated with a list`, 212 313 ); 314 + return; 315 + } 316 + logger.info(`New label added to list: ${newList.label}`); 213 317 214 - if (!repo.data.labels) { 215 - return null; 318 + const listUri = `at://${MOD_DID!}/app.bsky.graph.list/${newList.rkey}`; 319 + 320 + await limit(async () => { 321 + try { 322 + await agent.com.atproto.repo.createRecord({ 323 + collection: "app.bsky.graph.listitem", 324 + repo: `${MOD_DID!}`, 325 + record: { 326 + subject: did, 327 + list: listUri, 328 + createdAt: new Date().toISOString(), 329 + }, 330 + }); 331 + } catch (e) { 332 + console.error(e); 216 333 } 217 - 218 - return repo.data.labels.map((label) => label.label); 219 - } catch (e) { 220 - logger.info("Error retrieving repo for account."); 221 - return null; 222 - } */ 223 - return null; 224 - } 334 + }); 335 + };

-117

src/monitor.ts

··· 1 - import { describe } from "node:test"; 2 - 3 - import { PROFILE_CHECKS } from "./constants.js"; 4 - import logger from "./logger.js"; 5 - import { createAccountReport, createAccountLabel } from "./moderation.js"; 6 - 7 - export const monitorDescription = async ( 8 - did: string, 9 - time: number, 10 - displayName: string, 11 - description: string, 12 - ) => { 13 - // Get a list of labels 14 - const labels: string[] = Array.from( 15 - PROFILE_CHECKS, 16 - (profileCheck) => profileCheck.label, 17 - ); 18 - 19 - // iterate through the labels 20 - labels.forEach((label) => { 21 - const checkProfiles = PROFILE_CHECKS.find( 22 - (profileCheck) => profileCheck.label === label, 23 - ); 24 - 25 - // Check if DID is whitelisted 26 - if (checkProfiles?.ignoredDIDs) { 27 - if (checkProfiles.ignoredDIDs.includes(did)) { 28 - logger.info(`Whitelisted DID: ${did}`); return; 29 - } 30 - } 31 - 32 - if (description) { 33 - if (checkProfiles?.description === true) { 34 - if (checkProfiles.check.test(description)) { 35 - if (checkProfiles.whitelist) { 36 - if (checkProfiles.whitelist.test(description)) { 37 - logger.info("Whitelisted phrase found."); 38 - return; 39 - } 40 - } else { 41 - logger.info(`${checkProfiles.label} in description for ${did}`); 42 - } 43 - 44 - if (checkProfiles.reportOnly === true) { 45 - createAccountReport( 46 - did, 47 - `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`, 48 - ); 49 - return; 50 - } else { 51 - createAccountLabel( 52 - did, 53 - checkProfiles.label, 54 - `${time}: ${checkProfiles.comment}`, 55 - ); 56 - } 57 - } 58 - } 59 - } 60 - }); 61 - }; 62 - 63 - export const monitorDisplayName = async ( 64 - did: string, 65 - time: number, 66 - displayName: string, 67 - description: string, 68 - ) => { 69 - // Get a list of labels 70 - const labels: string[] = Array.from( 71 - PROFILE_CHECKS, 72 - (profileCheck) => profileCheck.label, 73 - ); 74 - 75 - // iterate through the labels 76 - labels.forEach((label) => { 77 - 
const checkProfiles = PROFILE_CHECKS.find( 78 - (profileCheck) => profileCheck.label === label, 79 - ); 80 - 81 - // Check if DID is whitelisted 82 - if (checkProfiles?.ignoredDIDs) { 83 - if (checkProfiles.ignoredDIDs.includes(did)) { 84 - logger.info(`Whitelisted DID: ${did}`); return; 85 - } 86 - } 87 - 88 - if (displayName) { 89 - if (checkProfiles?.displayName === true) { 90 - if (checkProfiles.check.test(displayName)) { 91 - if (checkProfiles.whitelist) { 92 - if (checkProfiles.whitelist.test(displayName)) { 93 - logger.info("Whitelisted phrase found."); 94 - return; 95 - } 96 - } else { 97 - logger.info(`${checkProfiles.label} in displayName for ${did}`); 98 - } 99 - 100 - if (checkProfiles.reportOnly === true) { 101 - createAccountReport( 102 - did, 103 - `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`, 104 - ); 105 - return; 106 - } else { 107 - createAccountLabel( 108 - did, 109 - checkProfiles.label, 110 - `${time}: ${checkProfiles.comment}`, 111 - ); 112 - } 113 - } 114 - } 115 - } 116 - }); 117 - };

src/processJetstream.ts

This is a binary file and will not be displayed.

+1

src/types.ts

··· 8 8 commentAcct: boolean; 9 9 reportPost?: boolean; 10 10 toLabel: boolean; 11 + duration?: number; 11 12 check: RegExp; 12 13 whitelist?: RegExp; 13 14 ignoredDIDs?: string[];

+12 -8

src/utils.ts

··· 1 - import { homoglyphMap } from "./homoglyphs.js"; 2 1 import logger from "./logger.js"; 3 2 3 + import { homoglyphMap } from "./homoglyphs.js"; 4 4 5 5 /** 6 6 * Normalizes a string by converting it to lowercase, replacing homoglyphs, ··· 42 42 43 43 export async function getFinalUrl(url: string): Promise<string> { 44 44 const controller = new AbortController(); 45 - const timeoutId = setTimeout(() => { controller.abort(); }, 10000); // 10-second timeout 45 + const timeoutId = setTimeout(() => controller.abort(), 10000); // 10-second timeout 46 46 47 47 try { 48 48 const response = await fetch(url, { ··· 65 65 } 66 66 67 67 export async function getLanguage(profile: string): Promise<string> { 68 - if (typeof profile !== "string") { 68 + if (typeof profile !== "string" || profile === null) { 69 69 logger.warn( 70 70 "[GETLANGUAGE] getLanguage called with invalid profile data, defaulting to 'eng'.", 71 71 profile, ··· 79 79 return "eng"; 80 80 } 81 81 82 - const { franc } = await import("franc"); 83 - const detectedLang = franc(profileText); 82 + const lande = (await import("lande")).default; 83 + let langsProbabilityMap = lande(profileText); 84 84 85 - // franc returns "und" (undetermined) if it can't detect the language 86 - // Default to "eng" in such cases 87 - return detectedLang === "und" ? "eng" : detectedLang; 85 + // Sort by probability in descending order 86 + langsProbabilityMap.sort( 87 + (a: [string, number], b: [string, number]) => b[1] - a[1], 88 + ); 89 + 90 + // Return the language code with the highest probability 91 + return langsProbabilityMap[0][0]; 88 92 }

-169

src/validateEnv.ts

··· 1 - import logger from "./logger.js"; 2 - 3 - interface EnvironmentVariable { 4 - name: string; 5 - required: boolean; 6 - description: string; 7 - validator?: (value: string) => boolean; 8 - } 9 - 10 - const ENV_VARIABLES: EnvironmentVariable[] = [ 11 - { 12 - name: "DID", 13 - required: true, 14 - description: "Moderator DID for labeling operations", 15 - validator: (value) => value.startsWith("did:"), 16 - }, 17 - { 18 - name: "OZONE_URL", 19 - required: true, 20 - description: "Ozone server URL", 21 - validator: (value) => value.includes(".") && value.length > 3, 22 - }, 23 - { 24 - name: "OZONE_PDS", 25 - required: true, 26 - description: "Ozone PDS URL", 27 - validator: (value) => value.includes(".") && value.length > 3, 28 - }, 29 - { 30 - name: "BSKY_HANDLE", 31 - required: true, 32 - description: "Bluesky handle for authentication", 33 - validator: (value) => value.includes("."), 34 - }, 35 - { 36 - name: "BSKY_PASSWORD", 37 - required: true, 38 - description: "Bluesky password for authentication", 39 - validator: (value) => value.length > 0, 40 - }, 41 - { 42 - name: "HOST", 43 - required: false, 44 - description: "Host address for the server (defaults to 127.0.0.1)", 45 - }, 46 - { 47 - name: "PORT", 48 - required: false, 49 - description: "Port for the main server (defaults to 4100)", 50 - validator: (value) => !isNaN(Number(value)) && Number(value) > 0, 51 - }, 52 - { 53 - name: "METRICS_PORT", 54 - required: false, 55 - description: "Port for metrics server (defaults to 4101)", 56 - validator: (value) => !isNaN(Number(value)) && Number(value) > 0, 57 - }, 58 - { 59 - name: "FIREHOSE_URL", 60 - required: false, 61 - description: "Jetstream firehose WebSocket URL", 62 - validator: (value) => value.startsWith("ws"), 63 - }, 64 - { 65 - name: "CURSOR_UPDATE_INTERVAL", 66 - required: false, 67 - description: "Cursor update interval in milliseconds (defaults to 60000)", 68 - validator: (value) => !isNaN(Number(value)) && Number(value) > 0, 69 - }, 70 - 
{ 71 - name: "LABEL_LIMIT", 72 - required: false, 73 - description: "Rate limit for labeling operations", 74 - validator: (value) => { 75 - // Allow "number * number" format or plain numbers 76 - const multiplyMatch = /^(\d+)\s*\*\s*(\d+)$/.exec(value); 77 - if (multiplyMatch) { 78 - const result = Number(multiplyMatch[1]) * Number(multiplyMatch[2]); 79 - return result > 0; 80 - } 81 - return !isNaN(Number(value)) && Number(value) > 0; 82 - }, 83 - }, 84 - { 85 - name: "LABEL_LIMIT_WAIT", 86 - required: false, 87 - description: "Wait time between rate limited operations", 88 - validator: (value) => { 89 - // Allow "number * number" format or plain numbers 90 - const multiplyMatch = /^(\d+)\s*\*\s*(\d+)$/.exec(value); 91 - if (multiplyMatch) { 92 - const result = Number(multiplyMatch[1]) * Number(multiplyMatch[2]); 93 - return result > 0; 94 - } 95 - return !isNaN(Number(value)) && Number(value) > 0; 96 - }, 97 - }, 98 - { 99 - name: "LOG_LEVEL", 100 - required: false, 101 - description: "Logging level (trace, debug, info, warn, error, fatal)", 102 - validator: (value) => 103 - ["trace", "debug", "info", "warn", "error", "fatal"].includes(value), 104 - }, 105 - { 106 - name: "NODE_ENV", 107 - required: false, 108 - description: "Node environment (development, production, test)", 109 - validator: (value) => ["development", "production", "test"].includes(value), 110 - }, 111 - ]; 112 - 113 - export function validateEnvironment(): void { 114 - const errors: string[] = []; 115 - const warnings: string[] = []; 116 - 117 - logger.info("Validating environment variables..."); 118 - 119 - for (const envVar of ENV_VARIABLES) { 120 - const value = process.env[envVar.name]; 121 - 122 - if (envVar.required) { 123 - if (!value || value.trim() === "") { 124 - errors.push( 125 - `Required environment variable ${envVar.name} is missing. 
${envVar.description}`, 126 - ); 127 - continue; 128 - } 129 - } 130 - 131 - if (value && envVar.validator) { 132 - try { 133 - if (!envVar.validator(value)) { 134 - errors.push( 135 - `Environment variable ${envVar.name} has invalid format. ${envVar.description}`, 136 - ); 137 - } 138 - } catch (error) { 139 - errors.push( 140 - `Environment variable ${envVar.name} validation failed: ${String(error)}. ${envVar.description}`, 141 - ); 142 - } 143 - } 144 - 145 - if (!envVar.required && !value) { 146 - warnings.push( 147 - `Optional environment variable ${envVar.name} not set, using default. ${envVar.description}`, 148 - ); 149 - } 150 - } 151 - 152 - if (warnings.length > 0) { 153 - logger.warn("Environment variable warnings:"); 154 - warnings.forEach((warning) => { 155 - logger.warn(` - ${warning}`); 156 - }); 157 - } 158 - 159 - if (errors.length > 0) { 160 - logger.error("Environment variable validation failed:"); 161 - errors.forEach((error) => { 162 - logger.error(` - ${error}`); 163 - }); 164 - logger.error("Please check your environment configuration and try again."); 165 - process.exit(1); 166 - } 167 - 168 - logger.info("Environment variable validation completed successfully"); 169 - }

-150

tests/moderation-critical.test.ts

··· 1 - import { describe, it, expect } from "vitest"; 2 - import { getLanguage } from "../src/utils.js"; 3 - 4 - describe("Critical moderation language detection", () => { 5 - describe("English vs French 'retard' disambiguation", () => { 6 - it("should detect French when 'retard' is used in French context (meaning 'delay')", async () => { 7 - const frenchContexts = [ 8 - "Le train a du retard aujourd'hui", 9 - "Il y a un retard de livraison", 10 - "Désolé pour le retard", 11 - "Mon vol a trois heures de retard", 12 - "Le retard est dû à la météo", 13 - "J'ai un retard de 15 minutes", 14 - "Le projet prend du retard", 15 - "Nous avons accumulé du retard", 16 - "Sans retard s'il vous plaît", 17 - "Le retard n'est pas acceptable", 18 - ]; 19 - 20 - for (const text of frenchContexts) { 21 - const result = await getLanguage(text); 22 - // Should detect as French (fra) or potentially other Romance languages, but NOT English 23 - expect(result).not.toBe("eng"); 24 - // Most likely to be detected as French 25 - expect(["fra", "cat", "spa", "ita", "por", "ron"].includes(result)).toBe(true); 26 - } 27 - }); 28 - 29 - it("should detect English when 'retard' is used in English offensive context", async () => { 30 - const englishContexts = [ 31 - "Don't be such a retard about it", 32 - "That's completely retarded logic", 33 - "Stop acting like a retard", 34 - "What a retard move that was", 35 - "Only a retard would think that", 36 - ]; 37 - 38 - for (const text of englishContexts) { 39 - const result = await getLanguage(text); 40 - // Should detect as English or closely related Germanic languages 41 - expect(["eng", "sco", "nld", "afr", "deu"].includes(result)).toBe(true); 42 - } 43 - }); 44 - 45 - it("should handle mixed signals but lean towards context language", async () => { 46 - // French sentence structure with 'retard' should be French 47 - const frenchStructure = "Le retard du train"; 48 - const result1 = await getLanguage(frenchStructure); 49 - 
expect(result1).not.toBe("eng"); 50 - 51 - // English sentence structure with 'retard' should be English 52 - const englishStructure = "The retard in the system"; 53 - const result2 = await getLanguage(englishStructure); 54 - // May detect as English or Dutch/Germanic due to structure 55 - expect(["eng", "nld", "afr", "deu", "sco"].includes(result2)).toBe(true); 56 - }); 57 - 58 - it("should detect French for common French phrases with 'retard'", async () => { 59 - const commonFrenchPhrases = [ 60 - "en retard", 61 - "du retard", 62 - "avec retard", 63 - "sans retard", 64 - "mon retard", 65 - "ton retard", 66 - "son retard", 67 - "notre retard", 68 - "votre retard", 69 - "leur retard", 70 - ]; 71 - 72 - for (const phrase of commonFrenchPhrases) { 73 - const result = await getLanguage(phrase); 74 - // Very short phrases might be harder to detect, but should not be English 75 - expect(result).not.toBe("eng"); 76 - } 77 - }); 78 - 79 - it("should provide context for moderation decisions", async () => { 80 - // Test case that matters for moderation 81 - const testCases = [ 82 - { 83 - text: "Je suis en retard pour le meeting", 84 - expectedLang: ["fra", "cat", "spa", "ita"], 85 - isOffensive: false, 86 - context: "French: I am late for the meeting" 87 - }, 88 - { 89 - text: "You're being a retard about this", 90 - expectedLang: ["eng", "sco", "nld"], 91 - isOffensive: true, 92 - context: "English: Offensive slur usage" 93 - }, 94 - { 95 - text: "Le retard mental est un terme médical désuet", 96 - expectedLang: ["fra", "cat", "spa"], 97 - isOffensive: false, 98 - context: "French: Medical terminology (outdated)" 99 - }, 100 - { 101 - text: "That's so retarded dude", 102 - expectedLang: ["eng", "sco"], 103 - isOffensive: true, 104 - context: "English: Casual offensive usage" 105 - } 106 - ]; 107 - 108 - for (const testCase of testCases) { 109 - const result = await getLanguage(testCase.text); 110 - 111 - // Check if detected language is in expected set 112 - const 
isExpectedLang = testCase.expectedLang.some(lang => result === lang); 113 - 114 - if (!isExpectedLang) { 115 - console.log(`Warning: "${testCase.text}" detected as ${result}, expected one of ${testCase.expectedLang.join(', ')}`); 116 - } 117 - 118 - // The key insight: if detected as French/Romance language, likely NOT offensive 119 - // if detected as English/Germanic, needs moderation review 120 - const needsModeration = ["eng", "sco", "nld", "afr", "deu"].includes(result); 121 - 122 - // This aligns with whether the content is actually offensive 123 - if (testCase.isOffensive) { 124 - expect(needsModeration).toBe(true); 125 - } 126 - } 127 - }); 128 - }); 129 - 130 - describe("Other ambiguous terms across languages", () => { 131 - it("should detect language for other potentially ambiguous terms", async () => { 132 - const ambiguousCases = [ 133 - { text: "Elle a un chat noir", lang: "fra", meaning: "She has a black cat (French)" }, 134 - { text: "Let's chat about it", lang: "eng", meaning: "Let's talk (English)" }, 135 - { text: "Das Gift ist gefährlich", lang: "deu", meaning: "The poison is dangerous (German)" }, 136 - { text: "I got a gift for you", lang: "eng", meaning: "I got a present (English)" }, 137 - { text: "El éxito fue grande", lang: "spa", meaning: "The success was great (Spanish)" }, 138 - { text: "Take the exit here", lang: "eng", meaning: "Take the exit (English)" }, 139 - ]; 140 - 141 - for (const testCase of ambiguousCases) { 142 - const result = await getLanguage(testCase.text); 143 - // Log for debugging but don't fail - language detection is probabilistic 144 - if (result !== testCase.lang) { 145 - console.log(`Note: "${testCase.text}" detected as ${result}, expected ${testCase.lang}`); 146 - } 147 - } 148 - }); 149 - }); 150 - });

-190

tests/utils.test.ts

··· 1 - import { describe, it, expect, beforeEach, vi } from "vitest"; 2 - import { getLanguage } from "../src/utils.js"; 3 - 4 - // Mock the logger to avoid console output during tests 5 - vi.mock("../src/logger.js", () => ({ 6 - default: { 7 - warn: vi.fn(), 8 - }, 9 - })); 10 - 11 - describe("getLanguage", () => { 12 - beforeEach(() => { 13 - vi.clearAllMocks(); 14 - }); 15 - 16 - describe("input validation", () => { 17 - it("should return 'eng' for null input", async () => { 18 - const result = await getLanguage(null as any); 19 - expect(result).toBe("eng"); 20 - }); 21 - 22 - it("should return 'eng' for undefined input", async () => { 23 - const result = await getLanguage(undefined as any); 24 - expect(result).toBe("eng"); 25 - }); 26 - 27 - it("should return 'eng' for number input", async () => { 28 - const result = await getLanguage(123 as any); 29 - expect(result).toBe("eng"); 30 - }); 31 - 32 - it("should return 'eng' for empty string", async () => { 33 - const result = await getLanguage(""); 34 - expect(result).toBe("eng"); 35 - }); 36 - 37 - it("should return 'eng' for whitespace-only string", async () => { 38 - const result = await getLanguage(" \n\t "); 39 - expect(result).toBe("eng"); 40 - }); 41 - }); 42 - 43 - describe("language detection", () => { 44 - it("should detect English text", async () => { 45 - const englishText = "This is a sample English text that should be detected correctly."; 46 - const result = await getLanguage(englishText); 47 - expect(result).toBe("eng"); 48 - }); 49 - 50 - it("should detect Spanish text", async () => { 51 - const spanishText = "Este es un texto de ejemplo en español que debe ser detectado correctamente."; 52 - const result = await getLanguage(spanishText); 53 - // franc may detect Galician (glg) for some Spanish text - both are valid Romance languages 54 - expect(["spa", "glg", "cat"].includes(result)).toBe(true); 55 - }); 56 - 57 - it("should detect French text", async () => { 58 - const frenchText = "Ceci est 
un exemple de texte en français qui devrait être détecté correctement."; 59 - const result = await getLanguage(frenchText); 60 - expect(result).toBe("fra"); 61 - }); 62 - 63 - it("should detect German text", async () => { 64 - const germanText = "Dies ist ein deutscher Beispieltext, der korrekt erkannt werden sollte."; 65 - const result = await getLanguage(germanText); 66 - expect(result).toBe("deu"); 67 - }); 68 - 69 - it("should detect Portuguese text", async () => { 70 - const portugueseText = "Este é um texto de exemplo em português que deve ser detectado corretamente."; 71 - const result = await getLanguage(portugueseText); 72 - expect(result).toBe("por"); 73 - }); 74 - 75 - it("should detect Italian text", async () => { 76 - const italianText = "Questo è un testo di esempio in italiano che dovrebbe essere rilevato correttamente."; 77 - const result = await getLanguage(italianText); 78 - expect(result).toBe("ita"); 79 - }); 80 - 81 - it("should detect Russian text", async () => { 82 - const russianText = "Это пример текста на русском языке, который должен быть правильно определен."; 83 - const result = await getLanguage(russianText); 84 - expect(result).toBe("rus"); 85 - }); 86 - 87 - it("should detect Japanese text", async () => { 88 - const japaneseText = "これは正しく検出されるべき日本語のサンプルテキストです。"; 89 - const result = await getLanguage(japaneseText); 90 - expect(result).toBe("jpn"); 91 - }); 92 - 93 - it("should detect Chinese text", async () => { 94 - const chineseText = "这是一个应该被正确检测的中文示例文本。"; 95 - const result = await getLanguage(chineseText); 96 - expect(result).toBe("cmn"); 97 - }); 98 - 99 - it("should detect Arabic text", async () => { 100 - const arabicText = "هذا نص عينة باللغة العربية يجب اكتشافه بشكل صحيح."; 101 - const result = await getLanguage(arabicText); 102 - expect(result).toBe("arb"); 103 - }); 104 - }); 105 - 106 - describe("edge cases", () => { 107 - it("should return 'eng' for very short ambiguous text", async () => { 108 - const result = await 
getLanguage("hi"); 109 - // Very short text might be undetermined 110 - expect(["eng", "hin", "und"].includes(result)).toBe(true); 111 - // If undetermined, should default to 'eng' 112 - if (result === "und") { 113 - expect(result).toBe("eng"); 114 - } 115 - }); 116 - 117 - it("should handle mixed language text", async () => { 118 - const mixedText = "Hello world! Bonjour le monde! Hola mundo!"; 119 - const result = await getLanguage(mixedText); 120 - // Should detect one of the languages or default to 'eng' 121 - expect(typeof result).toBe("string"); 122 - expect(result.length).toBe(3); 123 - }); 124 - 125 - it("should handle gibberish text", async () => { 126 - const gibberish = "asdfghjkl qwerty zxcvbnm poiuytrewq"; 127 - const result = await getLanguage(gibberish); 128 - // Franc may detect gibberish as various languages, not necessarily 'und' 129 - // Just ensure it returns a valid 3-letter language code 130 - expect(result).toMatch(/^[a-z]{3}$/); 131 - }); 132 - 133 - it("should handle text with emojis", async () => { 134 - const textWithEmojis = "Hello world! 👋 How are you? 😊"; 135 - const result = await getLanguage(textWithEmojis); 136 - // Text with emojis should still be detected, though specific language may vary 137 - // Common English-like results include 'eng', 'fuf', 'sco' 138 - expect(result).toMatch(/^[a-z]{3}$/); 139 - }); 140 - 141 - it("should handle text with special characters", async () => { 142 - const textWithSpecialChars = "Hello @world! 
#testing $100 & more..."; 143 - const result = await getLanguage(textWithSpecialChars); 144 - // Short text with special chars may be detected as various languages 145 - // Common results: 'eng', 'nld' (Dutch), 'afr' (Afrikaans) 146 - expect(["eng", "nld", "afr", "sco"].includes(result) || result.match(/^[a-z]{3}$/)).toBe(true); 147 - }); 148 - 149 - it("should handle text with URLs", async () => { 150 - const textWithUrls = "Check out this website: https://example.com for more information."; 151 - const result = await getLanguage(textWithUrls); 152 - expect(result).toBe("eng"); 153 - }); 154 - 155 - it("should handle text with numbers", async () => { 156 - const textWithNumbers = "The year 2024 has 365 days and 12 months."; 157 - const result = await getLanguage(textWithNumbers); 158 - // May be detected as English, Scots, or other Germanic languages 159 - expect(["eng", "sco", "nld"].includes(result) || result.match(/^[a-z]{3}$/)).toBe(true); 160 - }); 161 - }); 162 - 163 - describe("franc-specific behavior", () => { 164 - it("should return 'eng' when franc returns 'und'", async () => { 165 - // This tests the specific fallback logic for franc's "undetermined" response 166 - // Using a very short or ambiguous text that franc can't determine 167 - const ambiguousText = "xyz"; 168 - const result = await getLanguage(ambiguousText); 169 - // Should either detect a language or fallback to 'eng' if 'und' 170 - expect(typeof result).toBe("string"); 171 - expect(result.length).toBe(3); 172 - }); 173 - 174 - it("should always return a 3-letter ISO 639-3 language code", async () => { 175 - const texts = [ 176 - "Hello world", 177 - "Bonjour le monde", 178 - "Hola mundo", 179 - "مرحبا بالعالم", 180 - "你好世界", 181 - "こんにちは世界", 182 - ]; 183 - 184 - for (const text of texts) { 185 - const result = await getLanguage(text); 186 - expect(result).toMatch(/^[a-z]{3}$/); 187 - } 188 - }); 189 - }); 190 - });

-12

vitest.config.ts

··· 1 - import { defineConfig } from "vitest/config"; 2 - 3 - export default defineConfig({ 4 - test: { 5 - globals: true, 6 - environment: "node", 7 - include: ["tests/**/*.test.ts", "tests/**/*.spec.ts"], 8 - coverage: { 9 - reporter: ["text", "json", "html"], 10 - }, 11 - }, 12 - });