social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky

feat: strip mfm from crossposted posts, leave a link to the original post. (this code sucks)
fix: html in markdown

zenfyr.dev c004136e 5918f9b4

verified
+11
bluesky/common.py
···
131 131      def get_attachments(self) -> list[MediaInfo]:
132 132          return self.attachments
133 133
    134 +    def get_text_type(self) -> str:
    135 +        return "text/plain"
    136 +
    137 +    def get_post_url(self) -> str | None:
    138 +        at_uri: str = self.post['$xpost.strongRef']['uri'][len("at://"):]
    139 +
    140 +        parts = at_uri.split("/")
    141 +        did, _, post_id = parts
    142 +
    143 +        return f"https://bsky.app/profile/{did}/post/{post_id}"
    144 +
134 145
135 146  def tokens_to_richtext(tokens: list[cross.Token]) -> client_utils.TextBuilder | None:
136 147      builder = client_utils.TextBuilder()
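Note on the new get_post_url: a Bluesky record is addressed by an AT URI of the form at://<did>/<collection>/<rkey>, and the public web URL drops the collection segment. A minimal sketch of the mapping, using a made-up DID and record key:

    at_uri = "at://did:plc:abc123/app.bsky.feed.post/3kxyzexample"[len("at://"):]
    did, _collection, post_id = at_uri.split("/")  # the collection segment is discarded
    print(f"https://bsky.app/profile/{did}/post/{post_id}")
    # -> https://bsky.app/profile/did:plc:abc123/post/3kxyzexample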
+8 -1
bluesky/output.py
···
  8   8  from bluesky.common import SERVICE, ADULT_PATTERN, PORN_PATTERN, tokens_to_richtext
  9   9
 10  10  import cross, util.database as database
     11 +import misskey.mfm_util as mfm_util
 11  12  from util.util import LOGGER, as_envvar
 12  13  from util.media import MediaInfo, get_filename_from_url, get_media_meta, compress_image, convert_to_mp4
 13  14  from util.database import DataBaseWorker
···
194 195                  f"[{get_filename_from_url(attachment.url)}]"
195 196              ))
196 197              tokens.append(cross.TextToken(' '))
197     -
    198 +
    199 +        if post.get_text_type() == "text/x.misskeymarkdown":
    200 +            tokens, status = mfm_util.strip_mfm(tokens)
    201 +            post_url = post.get_post_url()
    202 +            if status and post_url:
    203 +                tokens.append(cross.TextToken('\n'))
    204 +                tokens.append(cross.LinkToken(post_url, "[Post contains MFM, see original]"))
198 205
199 206          split_tokens: list[list[cross.Token]] = cross.split_tokens(tokens, 300)
200 207          post_text: list[client_utils.TextBuilder] = []
+10 -83
cross.py
···
  6   6  import re
  7   7
  8   8  ALTERNATE = re.compile(r'\S+|\s+')
  9     - URL = re.compile(r'(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+', re.IGNORECASE)
 10     - MD_INLINE_LINK = re.compile(r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", re.IGNORECASE)
 11     - MD_AUTOLINK = re.compile(r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE)
 12     - HASHTAG = re.compile(r'(?<!\w)\#([\w]+)')
 13     - FEDIVERSE_HANDLE = re.compile(r'(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?')
 14   9
 15  10  # generic token
 16  11  class Token():
···
 59  54
 60  55  class Post():
 61  56      def __init__(self) -> None:
     57 +        self.now_timestamp = datetime.now(timezone.utc).isoformat()
 62  58          pass
 63  59
 64  60      def get_tokens(self) -> list[Token]:
···
 68  64          return None
 69  65
 70  66      def get_post_date_iso(self) -> str:
 71      -        return datetime.now(timezone.utc).isoformat()
     67 +        return self.now_timestamp
 72  68
 73  69      def get_attachments(self) -> list[MediaInfo]:
 74  70          return []
···
 85  81      def is_sensitive(self) -> bool:
 86  82          return False
 87  83
     84 +    # returns input text type.
     85 +    # text/plain, text/markdown, text/x.misskeymarkdown
     86 +    def get_text_type(self) -> str:
     87 +        return 'text/plain'
     88 +
     89 +    def get_post_url(self) -> str | None:
     90 +        return None
     91 +
 88  92  # generic input service.
 89  93  # user and service for db queries
 90  94  class Input():
···
142 146              return False
143 147
144 148      return True
145     -
146     - def tokenize_markdown(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[Token]:
147     -     if not text:
148     -         return []
149     -
150     -     index: int = 0
151     -     total: int = len(text)
152     -     buffer: list[str] = []
153     -
154     -     tokens: list[Token] = []
155     -
156     -     def flush():
157     -         nonlocal buffer
158     -         if buffer:
159     -             tokens.append(TextToken(''.join(buffer)))
160     -             buffer = []
161     -
162     -     while index < total:
163     -         if text[index] == '[':
164     -             md_inline = MD_INLINE_LINK.match(text, index)
165     -             if md_inline:
166     -                 flush()
167     -                 label = md_inline.group(1)
168     -                 href = md_inline.group(2)
169     -                 tokens.append(LinkToken(href, label))
170     -                 index = md_inline.end()
171     -                 continue
172     -
173     -         if text[index] == '<':
174     -             md_auto = MD_AUTOLINK.match(text, index)
175     -             if md_auto:
176     -                 flush()
177     -                 href = md_auto.group(1)
178     -                 tokens.append(LinkToken(href, href))
179     -                 index = md_auto.end()
180     -                 continue
181     -
182     -         if text[index] == '#':
183     -             tag = HASHTAG.match(text, index)
184     -             if tag:
185     -                 tag_text = tag.group(1)
186     -                 if tag_text.lower() in tags:
187     -                     flush()
188     -                     tokens.append(TagToken(tag_text))
189     -                     index = tag.end()
190     -                     continue
191     -
192     -         if text[index] == '@':
193     -             handle = FEDIVERSE_HANDLE.match(text, index)
194     -             if handle:
195     -                 handle_text = handle.group(0)
196     -                 stripped_handle = handle_text.strip()
197     -
198     -                 match = next(
199     -                     (pair for pair in handles if stripped_handle in pair),
200     -                     None
201     -                 )
202     -
203     -                 if match:
204     -                     flush()
205     -                     tokens.append(MentionToken(match[1], ''))  # TODO: misskey doesn’t provide a uri
206     -                     index = handle.end()
207     -                     continue
208     -
209     -         url = URL.match(text, index)
210     -         if url:
211     -             flush()
212     -             href = url.group(0)
213     -             tokens.append(LinkToken(href, href))
214     -             index = url.end()
215     -             continue
216     -
217     -         buffer.append(text[index])
218     -         index += 1
219     -
220     -     flush()
221     -     return tokens
222 149
223 150  def split_tokens(tokens: list[Token], max_chars: int, max_link_len: int = 35) -> list[list[Token]]:
224 151      def new_block():
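Why now_timestamp moved into __init__: the old get_post_date_iso produced a fresh "now" on every call, so two outputs crossposting the same Post could record different dates. Caching the value at construction makes the fallback date stable; a quick sanity check, assuming hypothetical direct usage of the base class:

    import cross

    post = cross.Post()
    first = post.get_post_date_iso()
    second = post.get_post_date_iso()  # any amount of time later
    assert first == second             # captured once in __init__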
+10 -6
mastodon/common.py
···
  7   7          self.status = status
  8   8          self.media_attachments = media_attachments
  9   9          self.tokens = tokens
     10 +        self.content_type = status.get('content_type', 'text/plain')
 10  11
 11  12      def get_tokens(self) -> list[cross.Token]:
 12  13          return self.tokens
···
 15  16          return self.status.get('in_reply_to_id')
 16  17
 17  18      def get_post_date_iso(self) -> str:
 18      -        date = self.status.get('created_at')
 19      -        return date or super().get_post_date_iso()
     19 +        return self.status.get('created_at') or self.now_timestamp
 20  20
 21  21      def get_cw(self) -> str:
 22  22          return self.status.get('spoiler_text') or ''
···
 25  25          return self.status['id']
 26  26
 27  27      def get_languages(self) -> list[str]:
 28      -        if self.status.get('language'):
 29      -            return [self.status['language']]
 30      -        return []
     28 +        return [self.status['language']] if self.status.get('language') else []
 31  29
 32  30      def is_sensitive(self) -> bool:
 33  31          return self.status.get('sensitive', False)
 34  32
 35  33      def get_attachments(self) -> list[MediaInfo]:
 36      -        return self.media_attachments
     34 +        return self.media_attachments
     35 +
     36 +    def get_text_type(self) -> str:
     37 +        return self.content_type
     38 +
     39 +    def get_post_url(self) -> str | None:
     40 +        return self.status.get('url')
+13 -13
mastodon/html_util.py → util/html_util.py
···
  7   7          self.tokens: list[cross.Token] = []
  8   8          self.status: dict
  9   9
     10 +        self.mentions: list[tuple[str, str]]
     11 +        self.tags: list[str]
     12 +
 10  13          self.in_pre = False
 11  14          self.in_code = False
···
 98 101          self.anchor_data = []
 99 102
100 103          if anchor_data.startswith('#'):
101     -            tags: list[dict] = self.status.get('tags', [])
102     -
103 104              as_tag = anchor_data[1:].lower()
104     -            if any(as_tag == block.get('name') for block in tags):
    105 +            if any(as_tag == block for block in self.tags):
105 106                  self.tokens.append(cross.TagToken(anchor_data[1:]))
106 107          elif anchor_data.startswith('@'):
107     -            mentions: list[dict] = self.status.get('mentions', [])
    108 +            match = next(
    109 +                (pair for pair in self.mentions if anchor_data in pair),
    110 +                None
    111 +            )
108 112
109     -            as_mention = anchor_data[1:]
110     -            for block in mentions:
111     -                if href == block.get('url'):
112     -                    self.tokens.append(cross.MentionToken(block['acct'], block['url']))
113     -                    break
114     -                elif as_mention == block.get('acct') or as_mention == block.get('username'):
115     -                    self.tokens.append(cross.MentionToken(block['acct'], block['url']))
116     -                    break
    113 +            if match:
    114 +                self.tokens.append(cross.MentionToken(match[1], ''))
117 115          else:
118 116              self.tokens.append(cross.LinkToken(href, anchor_data))
···
180 178          """Reset the parser state for reuse."""
181 179          super().reset()
182 180          self.tokens = []
183     -        self.status = {}
    181 +
    182 +        self.mentions = []
    183 +        self.tags = []
184 184
185 185          self.in_pre = False
186 186          self.in_code = False
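The tokenizer no longer reaches into a Mastodon status dict; callers now hand it pre-normalized tags (lower-cased names) and mentions ((local, canonical) handle pairs), which is what lets the markdown path in util/md_util.py reuse the same parser. A hypothetical setup, with all names and URLs made up:

    import util.html_util as html_util

    tokenizer = html_util.HTMLPostTokenizer()
    tokenizer.tags = ['fediverse']                              # lower-cased tag names
    tokenizer.mentions = [('@alice', '@alice@example.social')]  # (local, canonical) pairs
    tokenizer.feed('<p><a href="https://example.social/@alice">@alice</a> '
                   'on <a href="https://example.social/tags/fediverse">#fediverse</a></p>')
    tokens = tokenizer.get_tokens()
    # '@alice' matches a mention pair -> MentionToken('@alice@example.social', '')
    # '#fediverse' matches a tag      -> TagToken('fediverse')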
+6 -4
mastodon/input.py
···
  4   4  import asyncio
  5   5
  6   6  from mastodon.common import MastodonPost
  7     - import mastodon.html_util as html_util
      7 +import util.html_util as html_util
      8 +import util.md_util as md_util
  8   9
  9  10  import cross, util.database as database
 10  11  from util.util import LOGGER, as_envvar
···
 69  70              mentions.append(('@' + mention['username'], '@' + mention['acct']))
 70  71
 71  72          if raw_text and content_type in MARKDOWNY:
 72      -            return cross.tokenize_markdown(raw_text, tags, mentions)
     73 +            return md_util.tokenize_markdown(raw_text, tags, mentions)
 73  74
 74  75          akkoma_ext: dict | None = status.get('akkoma', {}).get('source')
 75  76          if akkoma_ext:
 76  77              if akkoma_ext.get('mediaType') in MARKDOWNY:
 77      -                return cross.tokenize_markdown(akkoma_ext["content"], tags, mentions)
     78 +                return md_util.tokenize_markdown(akkoma_ext["content"], tags, mentions)
 78  79
 79  80          tokenizer = html_util.HTMLPostTokenizer()
 80      -        tokenizer.status = status
     81 +        tokenizer.mentions = mentions
     82 +        tokenizer.tags = tags
 81  83          tokenizer.feed(status.get('content', ""))
 82  84          return tokenizer.get_tokens()
 83  85
+11 -2
mastodon/output.py
···
  1   1  import requests, time
  2   2
  3   3  import cross, util.database as database
      4 +import misskey.mfm_util as mfm_util
  4   5  from util.util import LOGGER, as_envvar, canonical_label
  5   6  from util.media import MediaInfo
  6   7  from util.database import DataBaseWorker
···
249 250              lang = post.get_languages()[0]
250 251          else:
251 252              lang = 'en'
252     -
253     -        raw_statuses = self.split_tokens_media(post.get_tokens(), post.get_attachments())
    253 +
    254 +        post_tokens = post.get_tokens()
    255 +        if post.get_text_type() == "text/x.misskeymarkdown":
    256 +            post_tokens, status = mfm_util.strip_mfm(post_tokens)
    257 +            post_url = post.get_post_url()
    258 +            if status and post_url:
    259 +                post_tokens.append(cross.TextToken('\n'))
    260 +                post_tokens.append(cross.LinkToken(post_url, "[Post contains MFM, see original]"))
    261 +
    262 +        raw_statuses = self.split_tokens_media(post_tokens, post.get_attachments())
254 263          if not raw_statuses:
255 264              LOGGER.error("Failed to split post into statuses?")
256 265              return None
+9 -2
misskey/common.py
···
  2   2  from util.media import MediaInfo
  3   3
  4   4  class MisskeyPost(cross.Post):
  5      -    def __init__(self, note: dict, tokens: list[cross.Token], files: list[MediaInfo]) -> None:
      5 +    def __init__(self, instance_url: str, note: dict, tokens: list[cross.Token], files: list[MediaInfo]) -> None:
  6   6          super().__init__()
  7   7          self.note = note
  8   8          self.sensitive = any([a.get('isSensitive', False) for a in note.get('files', [])])
  9   9          self.media_attachments = files
 10  10          self.tokens = tokens
     11 +        self.url = instance_url + '/notes/' + note['id']
 11  12
 12  13      def get_tokens(self) -> list[cross.Token]:
 13  14          return self.tokens
···
 32  33          return []
 33  34
 34  35      def is_sensitive(self) -> bool:
 35      -        return self.sensitive
     36 +        return self.sensitive
     37 +
     38 +    def get_text_type(self) -> str:
     39 +        return "text/x.misskeymarkdown"
     40 +
     41 +    def get_post_url(self) -> str | None:
     42 +        return self.url
+3 -2
misskey/input.py
···
  6   6  from misskey.common import MisskeyPost
  7   7
  8   8  import cross, util.database as database
      9 +import util.md_util as md_util
  9  10  from util.media import MediaInfo, download_media
 10  11  from util.util import LOGGER, as_envvar
···
 75  76          for key, value in mention_handles.items():
 76  77              handles.append((value, value))
 77  78
 78      -        tokens = cross.tokenize_markdown(note.get('text', ''), tags, handles)
     79 +        tokens = md_util.tokenize_markdown(note.get('text', ''), tags, handles)
 79  80          if not cross.test_filters(tokens, self.options.filters):
 80  81              LOGGER.info("Skipping '%s'. Matched a filter!", note['id'])
 81  82              return
···
 91  92                  return
 92  93              media_attachments.append(info)
 93  94
 94      -        cross_post = MisskeyPost(note, tokens, media_attachments)
     95 +        cross_post = MisskeyPost(self.service, note, tokens, media_attachments)
 95  96          for output in outputs:
 96  97              output.accept_post(cross_post)
 97  98
+35
misskey/mfm_util.py
···
      1 +import re, cross
      2 +
      3 +MFM_PATTERN = re.compile(r'\$\[([^\[\]]+)\]')
      4 +
      5 +def strip_mfm(tokens: list[cross.Token]) -> tuple[list[cross.Token], bool]:
      6 +    modified = False
      7 +
      8 +    for tk in tokens:
      9 +        if isinstance(tk, cross.TextToken):
     10 +            original = tk.text
     11 +            cleaned = __strip_mfm(original)
     12 +            if cleaned != original:
     13 +                modified = True
     14 +                tk.text = cleaned
     15 +
     16 +        elif isinstance(tk, cross.LinkToken):
     17 +            original = tk.label
     18 +            cleaned = __strip_mfm(original)
     19 +            if cleaned != original:
     20 +                modified = True
     21 +                tk.label = cleaned
     22 +
     23 +    return tokens, modified
     24 +
     25 +def __strip_mfm(text: str) -> str:
     26 +    def match_contents(match: re.Match[str]):
     27 +        content = match.group(1).strip()
     28 +        parts = content.split(' ', 1)
     29 +        return parts[1] if len(parts) > 1 else ''
     30 +
     31 +    while MFM_PATTERN.search(text):
     32 +        text = MFM_PATTERN.sub(match_contents, text)
     33 +
     34 +    return text
     35 +
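How the stripping works: MFM_PATTERN only matches an innermost $[...] group (its contents may not contain brackets), and match_contents keeps everything after the first space, i.e. drops the effect name and its arguments. Looping until no match remains unwraps nested effects from the inside out. A small demonstration of the intended behavior:

    import cross
    from misskey.mfm_util import strip_mfm

    tokens = [cross.TextToken('$[shake $[rainbow hello]] world')]
    tokens, modified = strip_mfm(tokens)
    print(tokens[0].text, modified)
    # first pass:  '$[shake hello] world'
    # second pass: 'hello world', with modified == True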
+112
util/md_util.py
···
      1 +import re
      2 +
      3 +import cross
      4 +import util.html_util as html_util
      5 +import util.util as util
      6 +
      7 +URL = re.compile(r'(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+', re.IGNORECASE)
      8 +MD_INLINE_LINK = re.compile(r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", re.IGNORECASE)
      9 +MD_AUTOLINK = re.compile(r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE)
     10 +HASHTAG = re.compile(r'(?<!\w)\#([\w]+)')
     11 +FEDIVERSE_HANDLE = re.compile(r'(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?')
     12 +
     13 +def tokenize_markdown(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[cross.Token]:
     14 +    if not text:
     15 +        return []
     16 +
     17 +    tokenizer = html_util.HTMLPostTokenizer()
     18 +    tokenizer.mentions = handles
     19 +    tokenizer.tags = tags
     20 +    tokenizer.feed(text)
     21 +    html_tokens = tokenizer.get_tokens()
     22 +
     23 +    tokens: list[cross.Token] = []
     24 +
     25 +    for tk in html_tokens:
     26 +        if isinstance(tk, cross.TextToken):
     27 +            tokens.extend(__tokenize_md(tk.text, tags, handles))
     28 +        elif isinstance(tk, cross.LinkToken):
     29 +            if not tk.label or util.canonical_label(tk.label, tk.href):
     30 +                tokens.append(tk)
     31 +                continue
     32 +
     33 +            tokens.extend(__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles))
     34 +        else:
     35 +            tokens.append(tk)
     36 +
     37 +    return tokens
     38 +
     39 +
     40 +def __tokenize_md(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[cross.Token]:
     41 +    index: int = 0
     42 +    total: int = len(text)
     43 +    buffer: list[str] = []
     44 +
     45 +    tokens: list[cross.Token] = []
     46 +
     47 +    def flush():
     48 +        nonlocal buffer
     49 +        if buffer:
     50 +            tokens.append(cross.TextToken(''.join(buffer)))
     51 +            buffer = []
     52 +
     53 +    while index < total:
     54 +        if text[index] == '[':
     55 +            md_inline = MD_INLINE_LINK.match(text, index)
     56 +            if md_inline:
     57 +                flush()
     58 +                label = md_inline.group(1)
     59 +                href = md_inline.group(2)
     60 +                tokens.append(cross.LinkToken(href, label))
     61 +                index = md_inline.end()
     62 +                continue
     63 +
     64 +        if text[index] == '<':
     65 +            md_auto = MD_AUTOLINK.match(text, index)
     66 +            if md_auto:
     67 +                flush()
     68 +                href = md_auto.group(1)
     69 +                tokens.append(cross.LinkToken(href, href))
     70 +                index = md_auto.end()
     71 +                continue
     72 +
     73 +        if text[index] == '#':
     74 +            tag = HASHTAG.match(text, index)
     75 +            if tag:
     76 +                tag_text = tag.group(1)
     77 +                if tag_text.lower() in tags:
     78 +                    flush()
     79 +                    tokens.append(cross.TagToken(tag_text))
     80 +                    index = tag.end()
     81 +                    continue
     82 +
     83 +        if text[index] == '@':
     84 +            handle = FEDIVERSE_HANDLE.match(text, index)
     85 +            if handle:
     86 +                handle_text = handle.group(0)
     87 +                stripped_handle = handle_text.strip()
     88 +
     89 +                match = next(
     90 +                    (pair for pair in handles if stripped_handle in pair),
     91 +                    None
     92 +                )
     93 +
     94 +                if match:
     95 +                    flush()
     96 +                    tokens.append(cross.MentionToken(match[1], ''))  # TODO: misskey doesn’t provide a uri
     97 +                    index = handle.end()
     98 +                    continue
     99 +
    100 +        url = URL.match(text, index)
    101 +        if url:
    102 +            flush()
    103 +            href = url.group(0)
    104 +            tokens.append(cross.LinkToken(href, href))
    105 +            index = url.end()
    106 +            continue
    107 +
    108 +        buffer.append(text[index])
    109 +        index += 1
    110 +
    111 +    flush()
    112 +    return tokens
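This is the "fix: html in markdown" half of the commit: markdown-flavored statuses can still contain HTML anchors, so tokenize_markdown runs the HTML tokenizer first and then re-scans the surviving plain-text tokens (and non-canonical link labels) for inline links, autolinks, hashtags, handles, and bare URLs. A rough illustration with a hypothetical input:

    import util.md_util as md_util

    tokens = md_util.tokenize_markdown(
        'see [docs](https://example.com/docs) and #news',
        tags=['news'],   # matched lower-cased
        handles=[],
    )
    # -> roughly: TextToken('see '), LinkToken('https://example.com/docs', 'docs'),
    #             TextToken(' and '), TagToken('news')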