+61
-109
mastodon/markeddown.py
+61
-109
mastodon/markeddown.py
···
1
-
import re
2
1
from html.parser import HTMLParser
3
-
from html import unescape
4
2
5
-
### VIBECODED CODE ALERT!!! ###
6
-
7
-
class HTMLToMarkdownParser(HTMLParser):
8
-
def __init__(self):
3
+
class MastoHTMLToMarkdownParser(HTMLParser):
4
+
def __init__(self) -> None:
9
5
super().__init__()
10
6
self.markdown = []
7
+
self.current_tag_stack = []
11
8
12
9
self.in_pre = False
13
10
self.in_code = False
14
11
15
-
self.current_tag_stack = []
16
12
self.list_stack = []
17
-
18
-
self.table_data = []
19
-
self.current_row = []
20
-
self.in_table = False
21
-
22
13
self.link_stack = []
23
-
self.preserve_spaces = False
24
-
25
-
def handle_starttag(self, tag, attrs):
14
+
15
+
def get_markdown(self):
16
+
return ''.join(self.markdown)
17
+
18
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
26
19
attrs_dict = dict(attrs)
27
20
28
-
if tag == {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
29
-
level = int(tag[1])
30
-
self.markdown.append("\n" + "#" * level + " ")
31
-
elif tag == 'p':
32
-
#self.markdown.append('\n\n')
33
-
pass
34
-
elif tag == 'br':
21
+
if tag == 'br':
35
22
self.markdown.append(' \n')
23
+
24
+
elif tag == 'a':
25
+
href = attrs_dict.get('href', '')
26
+
self.link_stack.append(href)
27
+
self.markdown.append('[')
28
+
36
29
elif tag == 'strong' or tag == 'b':
37
30
self.markdown.append('**')
31
+
38
32
elif tag == 'em' or tag == 'i':
39
33
self.markdown.append('*')
34
+
35
+
elif tag == 'del' or tag == 's':
36
+
self.markdown.append('~~')
37
+
40
38
elif tag == 'code':
41
39
if not self.in_pre:
42
40
self.markdown.append('`')
43
41
self.in_code = True
42
+
44
43
elif tag == 'pre':
45
-
self.markdown.append('\n```\n')
44
+
if self.markdown and not str(self.markdown[-1]).endswith('\n'):
45
+
self.markdown.append('\n')
46
+
47
+
self.markdown.append('```\n')
46
48
self.in_pre = True
49
+
47
50
elif tag == 'blockquote':
51
+
if self.markdown and not str(self.markdown[-1]).endswith('\n'):
52
+
self.markdown.append('\n')
53
+
48
54
self.markdown.append('\n> ')
55
+
49
56
elif tag == 'ul':
50
57
self.list_stack.append('ul')
51
58
self.markdown.append('\n')
59
+
52
60
elif tag == 'ol':
53
61
self.list_stack.append('ol')
54
62
self.markdown.append('\n')
63
+
55
64
elif tag == 'li':
56
65
indent = ' ' * (len(self.list_stack) - 1)
57
66
if self.list_stack and self.list_stack[-1] == 'ul':
58
67
self.markdown.append(f'{indent}- ')
59
68
elif self.list_stack and self.list_stack[-1] == 'ol':
60
69
self.markdown.append(f'{indent}1. ')
61
-
elif tag == 'a':
62
-
href = attrs_dict.get('href', '')
63
-
self.link_stack.append(href)
64
-
self.markdown.append('[')
65
-
elif tag == 'img':
66
-
src = attrs_dict.get('src', '')
67
-
alt = attrs_dict.get('alt', '')
68
-
title = attrs_dict.get('title', '')
69
-
if title:
70
-
self.markdown.append(f'')
71
-
else:
72
-
self.markdown.append(f'')
73
-
elif tag == 'hr':
74
-
self.markdown.append('\n---\n')
75
-
elif tag == 'table':
76
-
self.in_table = True
77
-
self.table_data = []
78
-
elif tag == 'tr':
79
-
self.current_row = []
80
-
elif tag == 'th' or tag == 'td':
81
-
pass # Handle in handle_data
82
-
elif tag == 'del' or tag == 's':
83
-
self.markdown.append('~~')
84
-
70
+
71
+
elif tag == {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
72
+
level = int(tag[1])
73
+
self.markdown.append("\n" + "#" * level + " ")
74
+
85
75
self.current_tag_stack.append(tag)
86
76
87
-
def handle_endtag(self, tag):
77
+
def handle_endtag(self, tag: str) -> None:
88
78
if not self.current_tag_stack:
89
79
return
90
-
91
-
# Remove the tag from stack
80
+
92
81
if tag in self.current_tag_stack:
93
82
self.current_tag_stack.remove(tag)
94
83
95
-
if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
96
-
self.markdown.append('\n')
97
-
elif tag == 'p':
84
+
if tag == 'p':
98
85
self.markdown.append('\n\n')
86
+
87
+
elif tag == 'a':
88
+
if self.link_stack:
89
+
href = self.link_stack.pop()
90
+
self.markdown.append(f']({href})')
91
+
99
92
elif tag == 'strong' or tag == 'b':
100
93
self.markdown.append('**')
94
+
101
95
elif tag == 'em' or tag == 'i':
102
96
self.markdown.append('*')
97
+
98
+
elif tag == 'del' or tag == 's':
99
+
self.markdown.append('~~')
100
+
103
101
elif tag == 'code':
104
102
if not self.in_pre and self.in_code:
105
103
self.markdown.append('`')
106
104
self.in_code = False
105
+
107
106
elif tag == 'pre':
108
107
self.markdown.append('\n```\n')
109
108
self.in_pre = False
109
+
110
110
elif tag == 'blockquote':
111
111
self.markdown.append('\n')
112
+
112
113
elif tag == 'ul' or tag == 'ol':
113
114
if self.list_stack:
114
115
self.list_stack.pop()
115
116
self.markdown.append('\n')
117
+
116
118
elif tag == 'li':
117
119
self.markdown.append('\n')
118
-
elif tag == 'a':
119
-
if self.link_stack:
120
-
href = self.link_stack.pop()
121
-
self.markdown.append(f']({href})')
122
-
elif tag == 'table':
123
-
self.in_table = False
124
-
self._process_table()
125
-
elif tag == 'tr':
126
-
if self.in_table:
127
-
self.table_data.append(self.current_row[:])
128
-
self.current_row = []
129
-
elif tag == 'del' or tag == 's':
130
-
self.markdown.append('~~')
131
-
132
-
def handle_data(self, data):
133
-
# Clean up whitespace, but preserve intentional spacing
134
-
if self.in_pre:
135
-
self.markdown.append(data)
136
-
else:
137
-
# Check if we're in a table cell
138
-
if self.in_table and (not self.current_tag_stack or
139
-
self.current_tag_stack[-1] in ['td', 'th']):
140
-
self.current_row.append(data.strip())
141
-
else:
142
-
cleaned_data = re.sub(r'[\r\n\t]+', ' ', data)
143
-
# Remove leading/trailing whitespace only from the entire content
144
-
if cleaned_data.strip():
145
-
self.markdown.append(cleaned_data)
146
-
147
-
def _process_table(self):
148
-
if not self.table_data:
149
-
return
150
-
151
-
self.markdown.append('\n')
152
120
153
-
# Process header row if exists
154
-
if self.table_data:
155
-
header = self.table_data[0]
156
-
self.markdown.append('| ' + ' | '.join(header) + ' |\n')
157
-
self.markdown.append('| ' + ' | '.join(['---'] * len(header)) + ' |\n')
158
-
159
-
# Process data rows
160
-
for row in self.table_data[1:]:
161
-
# Pad row to match header length
162
-
while len(row) < len(header):
163
-
row.append('')
164
-
self.markdown.append('| ' + ' | '.join(row) + ' |\n')
165
-
166
-
self.markdown.append('\n')
167
-
168
-
def get_markdown(self):
169
-
return ''.join(self.markdown)
121
+
elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
122
+
self.markdown.append('\n')
170
123
171
124
def reset(self):
172
125
"""Reset the parser state for reuse."""
173
126
super().reset()
174
127
self.markdown = []
175
128
self.current_tag_stack = []
176
-
self.list_stack = []
129
+
177
130
self.in_pre = False
178
131
self.in_code = False
179
-
self.table_data = []
180
-
self.current_row = []
181
-
self.in_table = False
182
-
self.link_stack = []
132
+
133
+
self.link_stack = []
134
+
self.list_stack = []