bluesky/common.py (+11)

@@ -131,6 +131,17 @@
     def get_attachments(self) -> list[MediaInfo]:
         return self.attachments

+    def get_text_type(self) -> str:
+        return "text/plain"
+
+    def get_post_url(self) -> str | None:
+        at_uri: str = self.post['$xpost.strongRef']['uri'][len("at://"):]
+
+        parts = at_uri.split("/")
+        did, _, post_id = parts
+
+        return f"https://bsky.app/profile/{did}/post/{post_id}"
+

 def tokens_to_richtext(tokens: list[cross.Token]) -> client_utils.TextBuilder | None:
     builder = client_utils.TextBuilder()
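
Note on the new get_post_url: the record keeps the cross-posted note's AT URI in its strongRef, and the method rewrites that into a public bsky.app URL. A quick sanity check of the mapping, with a made-up DID and record key:

    # at://<did>/<collection>/<rkey>, with the "at://" prefix already stripped
    at_uri = "did:plc:abc123/app.bsky.feed.post/3kabcdef"
    did, _, post_id = at_uri.split("/")
    assert f"https://bsky.app/profile/{did}/post/{post_id}" == \
        "https://bsky.app/profile/did:plc:abc123/post/3kabcdef"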
bluesky/output.py (+8 -1)

@@ -8,6 +8,7 @@
 from bluesky.common import SERVICE, ADULT_PATTERN, PORN_PATTERN, tokens_to_richtext

 import cross, util.database as database
+import misskey.mfm_util as mfm_util
 from util.util import LOGGER, as_envvar
 from util.media import MediaInfo, get_filename_from_url, get_media_meta, compress_image, convert_to_mp4
 from util.database import DataBaseWorker
@@ -194,7 +195,13 @@
                 f"[{get_filename_from_url(attachment.url)}]"
             ))
             tokens.append(cross.TextToken(' '))
-
+
+        if post.get_text_type() == "text/x.misskeymarkdown":
+            tokens, status = mfm_util.strip_mfm(tokens)
+            post_url = post.get_post_url()
+            if status and post_url:
+                tokens.append(cross.TextToken('\n'))
+                tokens.append(cross.LinkToken(post_url, "[Post contains MFM, see original]"))

         split_tokens: list[list[cross.Token]] = cross.split_tokens(tokens, 300)
         post_text: list[client_utils.TextBuilder] = []
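
The new block is the output-side MFM fallback: Bluesky cannot render Misskey's `$[...]` decorations, so decorated text is flattened and, when anything actually changed, a link back to the original note is appended. A minimal sketch of the flow (note text and URL are made up):

    import cross
    import misskey.mfm_util as mfm_util

    tokens = [cross.TextToken('$[spin hello] world')]
    tokens, changed = mfm_util.strip_mfm(tokens)   # -> 'hello world', changed == True
    if changed:  # mirrors the new block above
        tokens.append(cross.TextToken('\n'))
        tokens.append(cross.LinkToken('https://misskey.example/notes/xyz',
                                      '[Post contains MFM, see original]'))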
cross.py (+10 -83)

@@ -6,11 +6,6 @@
 import re

 ALTERNATE = re.compile(r'\S+|\s+')
-URL = re.compile(r'(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+', re.IGNORECASE)
-MD_INLINE_LINK = re.compile(r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", re.IGNORECASE)
-MD_AUTOLINK = re.compile(r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE)
-HASHTAG = re.compile(r'(?<!\w)\#([\w]+)')
-FEDIVERSE_HANDLE = re.compile(r'(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?')

 # generic token
 class Token():
@@ -59,6 +54,7 @@

 class Post():
     def __init__(self) -> None:
+        self.now_timestamp = datetime.now(timezone.utc).isoformat()
         pass

     def get_tokens(self) -> list[Token]:
@@ -68,7 +64,7 @@
         return None

     def get_post_date_iso(self) -> str:
-        return datetime.now(timezone.utc).isoformat()
+        return self.now_timestamp

     def get_attachments(self) -> list[MediaInfo]:
         return []
@@ -85,6 +81,14 @@
     def is_sensitive(self) -> bool:
         return False

+    # returns input text type.
+    # text/plain, text/markdown, text/x.misskeymarkdown
+    def get_text_type(self) -> str:
+        return 'text/plain'
+
+    def get_post_url(self) -> str | None:
+        return None
+
 # generic input service.
 # user and service for db queries
 class Input():
@@ -142,83 +146,6 @@
         return False

     return True
-
-def tokenize_markdown(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[Token]:
-    if not text:
-        return []
-
-    index: int = 0
-    total: int = len(text)
-    buffer: list[str] = []
-
-    tokens: list[Token] = []
-
-    def flush():
-        nonlocal buffer
-        if buffer:
-            tokens.append(TextToken(''.join(buffer)))
-            buffer = []
-
-    while index < total:
-        if text[index] == '[':
-            md_inline = MD_INLINE_LINK.match(text, index)
-            if md_inline:
-                flush()
-                label = md_inline.group(1)
-                href = md_inline.group(2)
-                tokens.append(LinkToken(href, label))
-                index = md_inline.end()
-                continue
-
-        if text[index] == '<':
-            md_auto = MD_AUTOLINK.match(text, index)
-            if md_auto:
-                flush()
-                href = md_auto.group(1)
-                tokens.append(LinkToken(href, href))
-                index = md_auto.end()
-                continue
-
-        if text[index] == '#':
-            tag = HASHTAG.match(text, index)
-            if tag:
-                tag_text = tag.group(1)
-                if tag_text.lower() in tags:
-                    flush()
-                    tokens.append(TagToken(tag_text))
-                    index = tag.end()
-                    continue
-
-        if text[index] == '@':
-            handle = FEDIVERSE_HANDLE.match(text, index)
-            if handle:
-                handle_text = handle.group(0)
-                stripped_handle = handle_text.strip()
-
-                match = next(
-                    (pair for pair in handles if stripped_handle in pair),
-                    None
-                )
-
-                if match:
-                    flush()
-                    tokens.append(MentionToken(match[1], ''))  # TODO: misskey doesn’t provide a uri
-                    index = handle.end()
-                    continue
-
-        url = URL.match(text, index)
-        if url:
-            flush()
-            href = url.group(0)
-            tokens.append(LinkToken(href, href))
-            index = url.end()
-            continue
-
-        buffer.append(text[index])
-        index += 1
-
-    flush()
-    return tokens

 def split_tokens(tokens: list[Token], max_chars: int, max_link_len: int = 35) -> list[list[Token]]:
     def new_block():
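
Side effect of moving the timestamp into __init__: get_post_date_iso is now stable across calls for inputs that don't supply a date, instead of drifting with each invocation. The new defaults are also observable on the base class directly:

    import cross

    post = cross.Post()
    assert post.get_post_date_iso() == post.get_post_date_iso()  # memoized, no drift
    assert post.get_text_type() == 'text/plain'                  # default input text type
    assert post.get_post_url() is None                           # no original URL known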
mastodon/common.py (+10 -6)

@@ -7,6 +7,7 @@
         self.status = status
         self.media_attachments = media_attachments
         self.tokens = tokens
+        self.content_type = status.get('content_type', 'text/plain')

     def get_tokens(self) -> list[cross.Token]:
         return self.tokens
@@ -15,8 +16,7 @@
         return self.status.get('in_reply_to_id')

     def get_post_date_iso(self) -> str:
-        date = self.status.get('created_at')
-        return date or super().get_post_date_iso()
+        return self.status.get('created_at') or self.now_timestamp

     def get_cw(self) -> str:
         return self.status.get('spoiler_text') or ''
@@ -25,12 +25,16 @@
         return self.status['id']

     def get_languages(self) -> list[str]:
-        if self.status.get('language'):
-            return [self.status['language']]
-        return []
+        return [self.status['language']] if self.status.get('language') else []

     def is_sensitive(self) -> bool:
         return self.status.get('sensitive', False)

     def get_attachments(self) -> list[MediaInfo]:
-        return self.media_attachments
+        return self.media_attachments
+
+    def get_text_type(self) -> str:
+        return self.content_type
+
+    def get_post_url(self) -> str | None:
+        return self.status.get('url')
mastodon/html_util.py -> util/html_util.py (renamed, +13 -13)

@@ -7,6 +7,9 @@
         self.tokens: list[cross.Token] = []
         self.status: dict

+        self.mentions: list[tuple[str, str]]
+        self.tags: list[str]
+
         self.in_pre = False
         self.in_code = False

@@ -98,22 +101,17 @@
         self.anchor_data = []

         if anchor_data.startswith('#'):
-            tags: list[dict] = self.status.get('tags', [])
-
             as_tag = anchor_data[1:].lower()
-            if any(as_tag == block.get('name') for block in tags):
+            if any(as_tag == block for block in self.tags):
                 self.tokens.append(cross.TagToken(anchor_data[1:]))
         elif anchor_data.startswith('@'):
-            mentions: list[dict] = self.status.get('mentions', [])
+            match = next(
+                (pair for pair in self.mentions if anchor_data in pair),
+                None
+            )

-            as_mention = anchor_data[1:]
-            for block in mentions:
-                if href == block.get('url'):
-                    self.tokens.append(cross.MentionToken(block['acct'], block['url']))
-                    break
-                elif as_mention == block.get('acct') or as_mention == block.get('username'):
-                    self.tokens.append(cross.MentionToken(block['acct'], block['url']))
-                    break
+            if match:
+                self.tokens.append(cross.MentionToken(match[1], ''))
         else:
             self.tokens.append(cross.LinkToken(href, anchor_data))

@@ -180,7 +178,9 @@
         """Reset the parser state for reuse."""
         super().reset()
         self.tokens = []
-        self.status = {}
+
+        self.mentions = []
+        self.tags = []

         self.in_pre = False
         self.in_code = False
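
The tokenizer no longer reads the raw Mastodon status; callers now inject service-agnostic data: tags as lowercase strings and mentions as (display, canonical) handle pairs, the same shape mastodon/input.py builds below. A hypothetical setup (markup and handles made up):

    import util.html_util as html_util

    tokenizer = html_util.HTMLPostTokenizer()
    tokenizer.mentions = [('@alice', '@alice@example.social')]  # (display, canonical)
    tokenizer.tags = ['fediverse']
    tokenizer.feed('<p>Hi <a href="https://example.social/@alice">@alice</a> #fediverse</p>')
    tokens = tokenizer.get_tokens()  # MentionToken + TagToken instead of bare links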
mastodon/input.py (+6 -4)

@@ -4,7 +4,8 @@
 import asyncio

 from mastodon.common import MastodonPost
-import mastodon.html_util as html_util
+import util.html_util as html_util
+import util.md_util as md_util

 import cross, util.database as database
 from util.util import LOGGER, as_envvar
@@ -69,15 +70,16 @@
         mentions.append(('@' + mention['username'], '@' + mention['acct']))

     if raw_text and content_type in MARKDOWNY:
-        return cross.tokenize_markdown(raw_text, tags, mentions)
+        return md_util.tokenize_markdown(raw_text, tags, mentions)

     akkoma_ext: dict | None = status.get('akkoma', {}).get('source')
     if akkoma_ext:
         if akkoma_ext.get('mediaType') in MARKDOWNY:
-            return cross.tokenize_markdown(akkoma_ext["content"], tags, mentions)
+            return md_util.tokenize_markdown(akkoma_ext["content"], tags, mentions)

     tokenizer = html_util.HTMLPostTokenizer()
-    tokenizer.status = status
+    tokenizer.mentions = mentions
+    tokenizer.tags = tags
     tokenizer.feed(status.get('content', ""))
     return tokenizer.get_tokens()

mastodon/output.py (+11 -2)

@@ -1,6 +1,7 @@
 import requests, time

 import cross, util.database as database
+import misskey.mfm_util as mfm_util
 from util.util import LOGGER, as_envvar, canonical_label
 from util.media import MediaInfo
 from util.database import DataBaseWorker
@@ -249,8 +250,16 @@
             lang = post.get_languages()[0]
         else:
             lang = 'en'
-
-        raw_statuses = self.split_tokens_media(post.get_tokens(), post.get_attachments())
+
+        post_tokens = post.get_tokens()
+        if post.get_text_type() == "text/x.misskeymarkdown":
+            post_tokens, status = mfm_util.strip_mfm(post_tokens)
+            post_url = post.get_post_url()
+            if status and post_url:
+                post_tokens.append(cross.TextToken('\n'))
+                post_tokens.append(cross.LinkToken(post_url, "[Post contains MFM, see original]"))
+
+        raw_statuses = self.split_tokens_media(post_tokens, post.get_attachments())
         if not raw_statuses:
             LOGGER.error("Failed to split post into statuses?")
             return None
misskey/common.py (+9 -2)

@@ -2,12 +2,13 @@
 from util.media import MediaInfo

 class MisskeyPost(cross.Post):
-    def __init__(self, note: dict, tokens: list[cross.Token], files: list[MediaInfo]) -> None:
+    def __init__(self, instance_url: str, note: dict, tokens: list[cross.Token], files: list[MediaInfo]) -> None:
         super().__init__()
         self.note = note
         self.sensitive = any([a.get('isSensitive', False) for a in note.get('files', [])])
         self.media_attachments = files
         self.tokens = tokens
+        self.url = instance_url + '/notes/' + note['id']

     def get_tokens(self) -> list[cross.Token]:
         return self.tokens
@@ -32,4 +33,10 @@
         return []

     def is_sensitive(self) -> bool:
-        return self.sensitive
+        return self.sensitive
+
+    def get_text_type(self) -> str:
+        return "text/x.misskeymarkdown"
+
+    def get_post_url(self) -> str | None:
+        return self.url
misskey/input.py (+3 -2)

@@ -6,6 +6,7 @@
 from misskey.common import MisskeyPost

 import cross, util.database as database
+import util.md_util as md_util
 from util.media import MediaInfo, download_media
 from util.util import LOGGER, as_envvar

@@ -75,7 +76,7 @@
         for key, value in mention_handles.items():
             handles.append((value, value))

-        tokens = cross.tokenize_markdown(note.get('text', ''), tags, handles)
+        tokens = md_util.tokenize_markdown(note.get('text', ''), tags, handles)
        if not cross.test_filters(tokens, self.options.filters):
             LOGGER.info("Skipping '%s'. Matched a filter!", note['id'])
             return
@@ -91,7 +92,7 @@
                 return
             media_attachments.append(info)

-        cross_post = MisskeyPost(note, tokens, media_attachments)
+        cross_post = MisskeyPost(self.service, note, tokens, media_attachments)
         for output in outputs:
             output.accept_post(cross_post)

misskey/mfm_util.py (new file, +35)

@@ -0,0 +1,35 @@
+import re, cross
+
+MFM_PATTERN = re.compile(r'\$\[([^\[\]]+)\]')
+
+def strip_mfm(tokens: list[cross.Token]) -> tuple[list[cross.Token], bool]:
+    modified = False
+
+    for tk in tokens:
+        if isinstance(tk, cross.TextToken):
+            original = tk.text
+            cleaned = __strip_mfm(original)
+            if cleaned != original:
+                modified = True
+                tk.text = cleaned
+
+        elif isinstance(tk, cross.LinkToken):
+            original = tk.label
+            cleaned = __strip_mfm(original)
+            if cleaned != original:
+                modified = True
+                tk.label = cleaned
+
+    return tokens, modified
+
+def __strip_mfm(text: str) -> str:
+    def match_contents(match: re.Match[str]):
+        content = match.group(1).strip()
+        parts = content.split(' ', 1)
+        return parts[1] if len(parts) > 1 else ''
+
+    while MFM_PATTERN.search(text):
+        text = MFM_PATTERN.sub(match_contents, text)
+
+    return text
+
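
The stripper rewrites token text in place and reports whether anything changed; the while loop is what handles nesting, since each pass only matches innermost bracket-free `$[...]` groups. For example (input text made up):

    import cross
    import misskey.mfm_util as mfm_util

    tokens = [cross.TextToken('$[tada big $[x2 news]] today')]
    tokens, modified = mfm_util.strip_mfm(tokens)
    # pass 1: '$[x2 news]' -> 'news'; pass 2: '$[tada big news]' -> 'big news'
    assert tokens[0].text == 'big news today' and modified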
util/md_util.py (new file, +112)

@@ -0,0 +1,112 @@
+import re
+
+import cross
+import util.html_util as html_util
+import util.util as util
+
+URL = re.compile(r'(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+', re.IGNORECASE)
+MD_INLINE_LINK = re.compile(r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", re.IGNORECASE)
+MD_AUTOLINK = re.compile(r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE)
+HASHTAG = re.compile(r'(?<!\w)\#([\w]+)')
+FEDIVERSE_HANDLE = re.compile(r'(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?')
+
+def tokenize_markdown(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[cross.Token]:
+    if not text:
+        return []
+
+    tokenizer = html_util.HTMLPostTokenizer()
+    tokenizer.mentions = handles
+    tokenizer.tags = tags
+    tokenizer.feed(text)
+    html_tokens = tokenizer.get_tokens()
+
+    tokens: list[cross.Token] = []
+
+    for tk in html_tokens:
+        if isinstance(tk, cross.TextToken):
+            tokens.extend(__tokenize_md(tk.text, tags, handles))
+        elif isinstance(tk, cross.LinkToken):
+            if not tk.label or util.canonical_label(tk.label, tk.href):
+                tokens.append(tk)
+                continue
+
+            tokens.extend(__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles))
+        else:
+            tokens.append(tk)
+
+    return tokens
+
+
+def __tokenize_md(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[cross.Token]:
+    index: int = 0
+    total: int = len(text)
+    buffer: list[str] = []
+
+    tokens: list[cross.Token] = []
+
+    def flush():
+        nonlocal buffer
+        if buffer:
+            tokens.append(cross.TextToken(''.join(buffer)))
+            buffer = []
+
+    while index < total:
+        if text[index] == '[':
+            md_inline = MD_INLINE_LINK.match(text, index)
+            if md_inline:
+                flush()
+                label = md_inline.group(1)
+                href = md_inline.group(2)
+                tokens.append(cross.LinkToken(href, label))
+                index = md_inline.end()
+                continue
+
+        if text[index] == '<':
+            md_auto = MD_AUTOLINK.match(text, index)
+            if md_auto:
+                flush()
+                href = md_auto.group(1)
+                tokens.append(cross.LinkToken(href, href))
+                index = md_auto.end()
+                continue
+
+        if text[index] == '#':
+            tag = HASHTAG.match(text, index)
+            if tag:
+                tag_text = tag.group(1)
+                if tag_text.lower() in tags:
+                    flush()
+                    tokens.append(cross.TagToken(tag_text))
+                    index = tag.end()
+                    continue
+
+        if text[index] == '@':
+            handle = FEDIVERSE_HANDLE.match(text, index)
+            if handle:
+                handle_text = handle.group(0)
+                stripped_handle = handle_text.strip()
+
+                match = next(
+                    (pair for pair in handles if stripped_handle in pair),
+                    None
+                )
+
+                if match:
+                    flush()
+                    tokens.append(cross.MentionToken(match[1], ''))  # TODO: misskey doesn’t provide a uri
+                    index = handle.end()
+                    continue
+
+        url = URL.match(text, index)
+        if url:
+            flush()
+            href = url.group(0)
+            tokens.append(cross.LinkToken(href, href))
+            index = url.end()
+            continue
+
+        buffer.append(text[index])
+        index += 1
+
+    flush()
+    return tokens
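
tokenize_markdown now pre-passes the text through the HTML tokenizer (fediverse "markdown" frequently carries inline HTML), then re-scans the resulting text and any non-canonical link labels for markdown constructs. A usage sketch with made-up input (see mastodon/input.py and misskey/input.py above for the real call sites):

    import util.md_util as md_util

    tokens = md_util.tokenize_markdown(
        'Read [the docs](https://example.com/docs) #release',
        ['release'],   # tags, lowercase
        [],            # (display, canonical) handle pairs
    )
    # roughly: TextToken('Read '), LinkToken(href, 'the docs'),
    #          TextToken(' '), TagToken('release')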