···1818 "github.com/gomarkdown/markdown/html"
1919 "github.com/gomarkdown/markdown/parser"
2020 "github.com/stormlightlabs/noteleaf/internal/models"
2121+ exhtml "golang.org/x/net/html"
2122)
22232324//go:embed rules/*.txt
// ParsingRule represents XPath rules for extracting content from a specific domain.
type ParsingRule struct {
	Domain            string            // host the rule applies to (and its subdomains)
	Title             string            // XPath selecting the article title
	Author            string            // XPath selecting the author byline
	Date              string            // XPath selecting the publication date
	Body              string            // XPath selecting the main article body
	Strip             []string          // XPath selectors for elements to remove
	StripIDsOrClasses []string          // id/class names whose elements are removed from the body
	TestURLs          []string          // sample URLs used to exercise the rule
	Headers           map[string]string // extra HTTP request headers, keyed by canonical header name
	Prune             bool              // flag parsed from "prune:" entries — applied elsewhere; TODO confirm semantics
	Tidy              bool              // flag parsed from "tidy:" entries — applied elsewhere; TODO confirm semantics
}
45504651// Parser interface defines methods for parsing articles from URLs
···138143 rule.Body = value
139144 case "strip":
140145 rule.Strip = append(rule.Strip, value)
146146+ case "strip_id_or_class":
147147+ rule.StripIDsOrClasses = append(rule.StripIDsOrClasses, value)
148148+ case "prune":
149149+ rule.Prune = parseBool(value)
150150+ case "tidy":
151151+ rule.Tidy = parseBool(value)
141152 case "test_url":
142153 rule.TestURLs = append(rule.TestURLs, value)
154154+ default:
155155+ if strings.HasPrefix(key, "http_header(") && strings.HasSuffix(key, ")") {
156156+ headerName := strings.TrimSuffix(strings.TrimPrefix(key, "http_header("), ")")
157157+ if headerName != "" {
158158+ if rule.Headers == nil {
159159+ rule.Headers = make(map[string]string)
160160+ }
161161+ rule.Headers[http.CanonicalHeaderKey(headerName)] = value
162162+ }
163163+ }
143164 }
144165 }
145166···150171 return rule, nil
151172}
// parseBool reports whether a rule-file value spells an enabled flag.
// Accepted truthy values (case-insensitive, surrounding whitespace
// ignored): "1", "true", "yes", "on". Everything else is false.
func parseBool(value string) bool {
	normalized := strings.ToLower(strings.TrimSpace(value))
	return normalized == "1" || normalized == "true" || normalized == "yes" || normalized == "on"
}
182182+183183+func (p *ArticleParser) findRule(domain string) *ParsingRule {
184184+ for ruleDomain, rule := range p.rules {
185185+ if domain == ruleDomain || strings.HasSuffix(domain, ruleDomain) {
186186+ return rule
187187+ }
188188+ }
189189+ return nil
190190+}
191191+153192// ParseURL extracts article content from a given URL
154193func (p *ArticleParser) ParseURL(s string) (*ParsedContent, error) {
155194 parsedURL, err := url.Parse(s)
···159198160199 domain := parsedURL.Hostname()
161200162162- resp, err := p.client.Get(s)
201201+ rule := p.findRule(domain)
202202+203203+ req, err := http.NewRequest(http.MethodGet, s, nil)
204204+ if err != nil {
205205+ return nil, fmt.Errorf("failed to create request: %w", err)
206206+ }
207207+208208+ if rule != nil {
209209+ for header, value := range rule.Headers {
210210+ if value == "" {
211211+ continue
212212+ }
213213+ if req.Header.Get(header) == "" {
214214+ req.Header.Set(header, value)
215215+ }
216216+ }
217217+ }
218218+219219+ resp, err := p.client.Do(req)
163220 if err != nil {
164221 return nil, fmt.Errorf("failed to fetch URL: %w", err)
165222 }
···179236180237// ParseHTML extracts article content from HTML string using domain-specific rules
181238func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
182182- var rule *ParsingRule
183183- for ruleDomain, r := range p.rules {
184184- if strings.Contains(domain, ruleDomain) {
185185- rule = r
186186- break
187187- }
188188- }
239239+ rule := p.findRule(domain)
189240190241 if rule == nil {
191242 return nil, fmt.Errorf("no parsing rule found for domain: %s", domain)
···217268 }
218269219270 if rule.Body != "" {
220220- if bodyNode := htmlquery.FindOne(doc, rule.Body); bodyNode != nil {
221221- for _, stripXPath := range rule.Strip {
222222- stripNodes := htmlquery.Find(bodyNode, stripXPath)
223223- for _, node := range stripNodes {
224224- node.Parent.RemoveChild(node)
225225- }
226226- }
271271+ bodyNode := htmlquery.FindOne(doc, rule.Body)
272272+ if bodyNode == nil {
273273+ return nil, fmt.Errorf("could not extract body content from HTML")
274274+ }
227275228228- content.Content = strings.TrimSpace(htmlquery.InnerText(bodyNode))
276276+ for _, stripXPath := range rule.Strip {
277277+ removeNodesByXPath(bodyNode, stripXPath)
229278 }
279279+280280+ for _, identifier := range rule.StripIDsOrClasses {
281281+ removeNodesByIdentifier(bodyNode, identifier)
282282+ }
283283+284284+ removeDefaultNonContentNodes(bodyNode)
285285+286286+ content.Content = normalizeWhitespace(htmlquery.InnerText(bodyNode))
230287 }
231288232289 if content.Title == "" {
···236293 return content, nil
237294}
238295296296+func removeNodesByXPath(root *exhtml.Node, xpath string) {
297297+ if root == nil {
298298+ return
299299+ }
300300+301301+ xpath = strings.TrimSpace(xpath)
302302+ if xpath == "" {
303303+ return
304304+ }
305305+306306+ nodes := htmlquery.Find(root, xpath)
307307+ for _, node := range nodes {
308308+ if node != nil && node.Parent != nil {
309309+ node.Parent.RemoveChild(node)
310310+ }
311311+ }
312312+}
313313+314314+func removeNodesByIdentifier(root *exhtml.Node, identifier string) {
315315+ identifier = strings.TrimSpace(identifier)
316316+ if root == nil || identifier == "" {
317317+ return
318318+ }
319319+320320+ idLiteral := buildXPathLiteral(identifier)
321321+ removeNodesByXPath(root, fmt.Sprintf(".//*[@id=%s]", idLiteral))
322322+323323+ classLiteral := buildXPathLiteral(" " + identifier + " ")
324324+ removeNodesByXPath(root, fmt.Sprintf(".//*[contains(concat(' ', normalize-space(@class), ' '), %s)]", classLiteral))
325325+}
326326+327327+func removeDefaultNonContentNodes(root *exhtml.Node) {
328328+ for _, xp := range []string{
329329+ ".//script",
330330+ ".//style",
331331+ ".//noscript",
332332+ } {
333333+ removeNodesByXPath(root, xp)
334334+ }
335335+}
// normalizeWhitespace replaces non-breaking spaces (U+00A0) with regular
// spaces and trims leading/trailing whitespace from extracted text.
func normalizeWhitespace(value string) string {
	return strings.TrimSpace(strings.ReplaceAll(value, "\u00a0", " "))
}
// buildXPathLiteral quotes an arbitrary string as an XPath 1.0 string
// literal. XPath 1.0 has no escape sequences inside literals, so a value
// containing only one kind of quote is wrapped in the other kind, and a
// value containing both is assembled with concat(), splicing each
// apostrophe in as a double-quoted "'" piece.
func buildXPathLiteral(value string) string {
	switch {
	case !strings.Contains(value, "'"):
		return "'" + value + "'"
	case !strings.Contains(value, `"`):
		return `"` + value + `"`
	}

	// Both quote kinds present: split on apostrophes, single-quote each
	// segment (segments cannot contain '), and rejoin via concat().
	segments := strings.Split(value, "'")
	pieces := make([]string, 0, 2*len(segments)-1)
	for i, segment := range segments {
		if i > 0 {
			pieces = append(pieces, `"'"`)
		}
		if segment == "" {
			pieces = append(pieces, "''")
		} else {
			pieces = append(pieces, "'"+segment+"'")
		}
	}
	return "concat(" + strings.Join(pieces, ", ") + ")"
}
371371+239372// Convert HTML content directly to markdown using domain-specific rules
240373func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error) {
241374 content, err := p.Parse(htmlContent, domain, sourceURL)
···268401269402 baseMarkdownPath := filepath.Join(dir, slug+".md")
270403 baseHTMLPath := filepath.Join(dir, slug+".html")
271271-272404 markdownPath = baseMarkdownPath
273405 htmlPath = baseHTMLPath
274406
+16-13
internal/articles/parser_test.go
···218218 if err == nil {
219219 t.Error("Expected error when no title can be extracted")
220220 }
221221- if !strings.Contains(err.Error(), "could not extract title") {
222222- t.Errorf("Expected 'could not extract title' error, got %v", err)
221221+ if !strings.Contains(err.Error(), "could not extract title") &&
222222+ !strings.Contains(err.Error(), "could not extract body content") {
223223+ t.Errorf("Expected title or body extraction error, got %v", err)
223224 }
224225 })
225226···229230 <body>
230231 <h1 id="firstHeading">Test Article Title</h1>
231232 <div id="bodyContent">
233233+ <style>.mw-parser-output .hatnote{font-style:italic;}</style>
232234 <p>This is the main content of the article.</p>
233235 <div class="noprint">This should be stripped</div>
236236+ <div class="editsection">Edit this section</div>
234237 <p>More content here.</p>
235238 </div>
236239 </body>
···253256 if strings.Contains(markdown, "This should be stripped") {
254257 t.Error("Expected stripped content to be removed from markdown")
255258 }
259259+ if strings.Contains(markdown, ".mw-parser-output") {
260260+ t.Error("Expected style content to be removed from markdown")
261261+ }
262262+ if strings.Contains(markdown, "Edit this section") {
263263+ t.Error("Expected edit section markers to be removed from markdown")
264264+ }
256265 })
257266 })
258267···265274 w.WriteHeader(http.StatusOK)
266275 w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
267276 default:
268268- // Return Wikipedia-like structure for localhost rule
269277 w.WriteHeader(http.StatusOK)
270278 w.Write([]byte(`<html>
271279 <head><title>Test Article</title></head>
···555563 }
556564 })
557565558558- t.Run("fails with invalid directory", func(t *testing.T) {
559559- // Skip this test as it would require network access to test with real URLs
560560- t.Skip("Skipping invalid directory test - requires network access")
561561- })
562562-563566 t.Run("fails with malformed HTML", func(t *testing.T) {
564567 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
565568 w.WriteHeader(http.StatusOK)
···584587 if err == nil {
585588 t.Error("Expected error for malformed HTML")
586589 }
587587- if !strings.Contains(err.Error(), "failed to parse HTML") && !strings.Contains(err.Error(), "could not extract title") {
588588- t.Errorf("Expected HTML parsing or title extraction error, got %v", err)
590590+ if !strings.Contains(err.Error(), "failed to parse HTML") &&
591591+ !strings.Contains(err.Error(), "could not extract title") &&
592592+ !strings.Contains(err.Error(), "could not extract body content") {
593593+ t.Errorf("Expected HTML parsing or extraction error, got %v", err)
589594 }
590595 })
591596···599604 <p>Content without proper title</p>
600605 </div>
601606 </body>
602602- </html>`)) // No h1 with id="firstHeading"
607607+ </html>`))
603608 }))
604609 defer server.Close()
605610···664669 t.Fatalf("Failed to save article: %v", err)
665670 }
666671667667- // Test that it creates a proper models.Article structure (simulating CreateArticleFromURL)
668672 article := &models.Article{
669673 URL: server.URL,
670674 Title: content.Title,
···779783 t.Fatalf("Failed to save article: %v", err)
780784 }
781785782782- // Verify markdown contains all metadata
783786 mdContent, err := os.ReadFile(mdPath)
784787 if err != nil {
785788 t.Fatalf("Failed to read markdown file: %v", err)
···11+// TODO: create variants of colored output without icons
22+// TODO: refactor existing (relevant) calls to old styles
33+// TODO: k v wrappers
44+package ui
55+66+import (
77+ "fmt"
88+99+ "github.com/charmbracelet/lipgloss"
1010+)
1111+1212+func newStyle() lipgloss.Style { return lipgloss.NewStyle() }
1313+func newPStyle(v, h int) lipgloss.Style { return lipgloss.NewStyle().Padding(v, h) }
1414+func newBoldStyle() lipgloss.Style { return newStyle().Bold(true) }
1515+func newPBoldStyle(v, h int) lipgloss.Style { return newPStyle(v, h).Bold(true) }
1616+func newEmStyle() lipgloss.Style { return newStyle().Italic(true) }
1717+1818+func success(msg string) string { return SuccessStyle.Render("โ " + msg) }
1919+func errorMsg(msg string) string { return ErrorStyle.Render("โ " + msg) }
2020+func warning(msg string) string { return WarningStyle.Render("โ " + msg) }
2121+func info(msg string) string { return InfoStyle.Render("โน " + msg) }
2222+func title(msg string) string { return TitleStyle.Render(msg) }
2323+func subtitle(msg string) string { return SubtitleStyle.Render(msg) }
2424+func box(content string) string { return BoxStyle.Render(content) }
2525+func errorBox(content string) string { return ErrorBoxStyle.Render(content) }
2626+func text(content string) string { return TextStyle.Render(content) }
2727+func header(content string) string { return HeaderStyle.Render(content) }
2828+2929+// Success prints a formatted success message
3030+func Success(format string, a ...any) {
3131+ fmt.Print(success(fmt.Sprintf(format, a...)))
3232+}
3333+3434+// Successln prints a formatted success message with a newline
3535+func Successln(format string, a ...any) {
3636+ fmt.Println(success(fmt.Sprintf(format, a...)))
3737+}
3838+3939+// Error prints a formatted error message
4040+func Error(format string, a ...any) {
4141+ fmt.Print(errorMsg(fmt.Sprintf(format, a...)))
4242+}
4343+4444+// Errorln prints a formatted error message with a newline
4545+func Errorln(format string, a ...any) {
4646+ fmt.Println(errorMsg(fmt.Sprintf(format, a...)))
4747+}
4848+4949+// Warning prints a formatted warning message
5050+func Warning(format string, a ...any) {
5151+ fmt.Print(warning(fmt.Sprintf(format, a...)))
5252+}
5353+5454+// Warningln prints a formatted warning message with a newline
5555+func Warningln(format string, a ...any) {
5656+ fmt.Println(warning(fmt.Sprintf(format, a...)))
5757+}
5858+5959+// Info prints a formatted info message
6060+func Info(format string, a ...any) {
6161+ fmt.Print(info(fmt.Sprintf(format, a...)))
6262+}
6363+6464+// Infoln prints a formatted info message with a newline
6565+func Infoln(format string, a ...any) {
6666+ fmt.Println(info(fmt.Sprintf(format, a...)))
6767+}
6868+6969+// Title prints a formatted title
7070+func Title(format string, a ...any) {
7171+ fmt.Print(title(fmt.Sprintf(format, a...)))
7272+}
7373+7474+// Titleln prints a formatted title with a newline
7575+func Titleln(format string, a ...any) {
7676+ fmt.Println(title(fmt.Sprintf(format, a...)))
7777+}
7878+7979+// Subtitle prints a formatted subtitle
8080+func Subtitle(format string, a ...any) {
8181+ fmt.Print(subtitle(fmt.Sprintf(format, a...)))
8282+}
8383+8484+// Subtitleln prints a formatted subtitle with a newline
8585+func Subtitleln(format string, a ...any) {
8686+ fmt.Println(subtitle(fmt.Sprintf(format, a...)))
8787+}
8888+8989+// Box prints content in a styled box
9090+func Box(format string, a ...any) {
9191+ fmt.Print(box(fmt.Sprintf(format, a...)))
9292+}
9393+9494+// Boxln prints content in a styled box with a newline
9595+func Boxln(format string, a ...any) {
9696+ fmt.Println(box(fmt.Sprintf(format, a...)))
9797+}
9898+9999+// ErrorBox prints error content in a styled error box
100100+func ErrorBox(format string, a ...any) {
101101+ fmt.Print(errorBox(fmt.Sprintf(format, a...)))
102102+}
103103+104104+// ErrorBoxln prints error content in a styled error box with a newline
105105+func ErrorBoxln(format string, a ...any) {
106106+ fmt.Println(errorBox(fmt.Sprintf(format, a...)))
107107+}
108108+109109+func Newline() { fmt.Println() }
110110+func Plain(format string, a ...any) { fmt.Print(text(fmt.Sprintf(format, a...))) }
111111+func Plainln(format string, a ...any) { fmt.Println(text(fmt.Sprintf(format, a...))) }
112112+func Header(format string, a ...any) { fmt.Print(header(fmt.Sprintf(format, a...))) }
113113+func Headerln(format string, a ...any) { fmt.Print(header(fmt.Sprintf(format, a...))) }
+2-1
internal/ui/logo.go
···11// See https://patorjk.com/software/taag/
22+//
33+// NOTE: these aren't used anymore but are left in because they're cool
24package ui
3546import (
···7880}
79818082// Colored returns a colored version of the logo using lipgloss with vertical spiral design
8181-//
8283// Creates a vertical spiral effect by coloring character by character:
8384//
8485// Combine line position and character position & use modulo to build wave-like transitions