+61
-16
pkg/appview/readme/fetcher.go
+61
-16
pkg/appview/readme/fetcher.go
···
70
70
// FetchAndRender fetches a README from a URL and renders it as HTML
71
71
// Returns the rendered HTML and any error
72
72
func (f *Fetcher) FetchAndRender(ctx context.Context, readmeURL string) (string, error) {
73
-
// Validate URL
74
-
if readmeURL == "" {
75
-
return "", fmt.Errorf("empty README URL")
76
-
}
77
-
78
-
parsedURL, err := url.Parse(readmeURL)
79
-
if err != nil {
80
-
return "", fmt.Errorf("invalid README URL: %w", err)
81
-
}
82
-
83
-
// Only allow HTTP/HTTPS
84
-
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
85
-
return "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme)
86
-
}
87
-
88
-
// Fetch content
73
+
// Fetch content (includes URL validation, Content-Type check, and HTML detection)
89
74
content, baseURL, err := f.fetchContent(ctx, readmeURL)
90
75
if err != nil {
91
76
return "", err
···
100
85
return html, nil
101
86
}
102
87
88
+
// FetchRaw fetches raw README content from a URL without rendering
89
+
// Returns raw bytes with Content-Type and HTML validation
90
+
// Use this when you need to store the raw markdown (e.g., in PDS records)
91
+
func (f *Fetcher) FetchRaw(ctx context.Context, readmeURL string) ([]byte, error) {
92
+
// Fetch content (includes URL validation, Content-Type check, and HTML detection)
93
+
content, _, err := f.fetchContent(ctx, readmeURL)
94
+
if err != nil {
95
+
return nil, err
96
+
}
97
+
98
+
return content, nil
99
+
}
100
+
103
101
// fetchContent fetches the raw content from a URL
104
102
func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, string, error) {
103
+
// Validate URL
104
+
if urlStr == "" {
105
+
return nil, "", fmt.Errorf("empty README URL")
106
+
}
107
+
108
+
parsedURL, err := url.Parse(urlStr)
109
+
if err != nil {
110
+
return nil, "", fmt.Errorf("invalid README URL: %w", err)
111
+
}
112
+
113
+
// Only allow HTTP/HTTPS
114
+
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
115
+
return nil, "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme)
116
+
}
117
+
105
118
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
106
119
if err != nil {
107
120
return nil, "", fmt.Errorf("failed to create request: %w", err)
···
120
133
return nil, "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
121
134
}
122
135
136
+
// Reject HTML content types (catches proper error pages)
137
+
contentType := resp.Header.Get("Content-Type")
138
+
if contentType != "" {
139
+
ct := strings.ToLower(contentType)
140
+
if strings.Contains(ct, "text/html") || strings.Contains(ct, "application/xhtml") {
141
+
return nil, "", fmt.Errorf("unsupported content type: %s (expected markdown or plain text)", contentType)
142
+
}
143
+
}
144
+
123
145
// Limit content size to 1MB
124
146
limitedReader := io.LimitReader(resp.Body, 1*1024*1024)
125
147
content, err := io.ReadAll(limitedReader)
···
127
149
return nil, "", fmt.Errorf("failed to read response body: %w", err)
128
150
}
129
151
152
+
// Detect HTML content by checking for common markers (catches soft 404s)
153
+
if LooksLikeHTML(content) {
154
+
return nil, "", fmt.Errorf("detected HTML content instead of markdown")
155
+
}
156
+
130
157
// Get base URL for relative link resolution
131
158
baseURL := getBaseURL(resp.Request.URL)
132
159
133
160
return content, baseURL, nil
161
+
}
162
+
163
+
// LooksLikeHTML reports whether content appears to be an HTML (or XML)
// document rather than markdown.
//
// Only the first 512 bytes are inspected. After dropping an optional UTF-8
// byte-order mark and any leading whitespace, the content is considered HTML
// when it starts, case-insensitively, with "<!doctype", "<html", or "<?xml".
// Markdown that merely contains inline HTML further down is not flagged.
// Empty content returns false.
//
// Exported for use by other packages that fetch README content.
func LooksLikeHTML(content []byte) bool {
	if len(content) == 0 {
		return false
	}

	const (
		sniffLen = 512
		utf8BOM  = "\xef\xbb\xbf"
	)

	head := string(content[:min(len(content), sniffLen)])
	// strings.TrimSpace does not treat the BOM as whitespace, so strip it
	// explicitly; otherwise a BOM-prefixed error page would slip through.
	head = strings.TrimPrefix(head, utf8BOM)
	head = strings.ToLower(strings.TrimSpace(head))

	for _, marker := range [...]string{"<!doctype", "<html", "<?xml"} {
		if strings.HasPrefix(head, marker) {
			return true
		}
	}
	return false
}
135
180
136
181
// renderMarkdown renders markdown content to sanitized HTML
+271
-1
pkg/appview/readme/fetcher_test.go
+271
-1
pkg/appview/readme/fetcher_test.go
···
1
1
package readme
2
2
3
3
import (
4
+
"context"
5
+
"net/http"
6
+
"net/http/httptest"
4
7
"net/url"
8
+
"strings"
5
9
"testing"
6
10
)
7
11
···
305
309
return false
306
310
}
307
311
308
-
// TODO: Add README fetching and caching tests
312
+
func TestLooksLikeHTML(t *testing.T) {
313
+
tests := []struct {
314
+
name string
315
+
content string
316
+
expected bool
317
+
}{
318
+
{
319
+
name: "empty content",
320
+
content: "",
321
+
expected: false,
322
+
},
323
+
{
324
+
name: "markdown content",
325
+
content: "# Hello World\n\nThis is a README.",
326
+
expected: false,
327
+
},
328
+
{
329
+
name: "plain text",
330
+
content: "Just some plain text without any HTML.",
331
+
expected: false,
332
+
},
333
+
{
334
+
name: "doctype html",
335
+
content: "<!DOCTYPE html>\n<html><body>Page</body></html>",
336
+
expected: true,
337
+
},
338
+
{
339
+
name: "doctype html lowercase",
340
+
content: "<!doctype html>\n<html><body>Page</body></html>",
341
+
expected: true,
342
+
},
343
+
{
344
+
name: "html tag only",
345
+
content: "<html><head></head><body>Page</body></html>",
346
+
expected: true,
347
+
},
348
+
{
349
+
name: "html tag with whitespace",
350
+
content: " \n <html>\n<body>Page</body></html>",
351
+
expected: true,
352
+
},
353
+
{
354
+
name: "xml declaration",
355
+
content: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html>...</html>",
356
+
expected: true,
357
+
},
358
+
{
359
+
name: "soft 404 page",
360
+
content: "<!DOCTYPE html><html><head><title>Page Not Found</title></head><body><h1>404</h1></body></html>",
361
+
expected: true,
362
+
},
363
+
{
364
+
name: "markdown with inline html",
365
+
content: "# Title\n\nSome text with <strong>bold</strong> inline.",
366
+
expected: false,
367
+
},
368
+
{
369
+
name: "markdown starting with hash",
370
+
content: "## Section\n\nContent here.",
371
+
expected: false,
372
+
},
373
+
}
374
+
375
+
for _, tt := range tests {
376
+
t.Run(tt.name, func(t *testing.T) {
377
+
result := LooksLikeHTML([]byte(tt.content))
378
+
if result != tt.expected {
379
+
t.Errorf("looksLikeHTML(%q) = %v, want %v", tt.content, result, tt.expected)
380
+
}
381
+
})
382
+
}
383
+
}
384
+
385
+
func TestFetcher_FetchRaw(t *testing.T) {
386
+
fetcher := NewFetcher()
387
+
388
+
tests := []struct {
389
+
name string
390
+
handler http.HandlerFunc
391
+
wantErr bool
392
+
errContains string
393
+
wantContent string
394
+
}{
395
+
{
396
+
name: "successful markdown fetch",
397
+
handler: func(w http.ResponseWriter, r *http.Request) {
398
+
w.Header().Set("Content-Type", "text/plain")
399
+
w.Write([]byte("# Hello World\n\nThis is markdown."))
400
+
},
401
+
wantErr: false,
402
+
wantContent: "# Hello World",
403
+
},
404
+
{
405
+
name: "rejects HTML content type",
406
+
handler: func(w http.ResponseWriter, r *http.Request) {
407
+
w.Header().Set("Content-Type", "text/html; charset=utf-8")
408
+
w.Write([]byte("<html><body>Error</body></html>"))
409
+
},
410
+
wantErr: true,
411
+
errContains: "unsupported content type",
412
+
},
413
+
{
414
+
name: "rejects soft 404 HTML content",
415
+
handler: func(w http.ResponseWriter, r *http.Request) {
416
+
w.Header().Set("Content-Type", "text/plain")
417
+
w.Write([]byte("<!DOCTYPE html><html><body>404 Not Found</body></html>"))
418
+
},
419
+
wantErr: true,
420
+
errContains: "detected HTML content",
421
+
},
422
+
{
423
+
name: "rejects 404 status",
424
+
handler: func(w http.ResponseWriter, r *http.Request) {
425
+
w.WriteHeader(http.StatusNotFound)
426
+
w.Write([]byte("Not Found"))
427
+
},
428
+
wantErr: true,
429
+
errContains: "unexpected status code: 404",
430
+
},
431
+
{
432
+
name: "rejects 500 status",
433
+
handler: func(w http.ResponseWriter, r *http.Request) {
434
+
w.WriteHeader(http.StatusInternalServerError)
435
+
w.Write([]byte("Internal Server Error"))
436
+
},
437
+
wantErr: true,
438
+
errContains: "unexpected status code: 500",
439
+
},
440
+
}
441
+
442
+
for _, tt := range tests {
443
+
t.Run(tt.name, func(t *testing.T) {
444
+
server := httptest.NewServer(tt.handler)
445
+
defer server.Close()
446
+
447
+
content, err := fetcher.FetchRaw(context.Background(), server.URL)
448
+
449
+
if tt.wantErr {
450
+
if err == nil {
451
+
t.Errorf("FetchRaw() expected error containing %q, got nil", tt.errContains)
452
+
return
453
+
}
454
+
if !strings.Contains(err.Error(), tt.errContains) {
455
+
t.Errorf("FetchRaw() error = %q, want error containing %q", err.Error(), tt.errContains)
456
+
}
457
+
return
458
+
}
459
+
460
+
if err != nil {
461
+
t.Errorf("FetchRaw() unexpected error: %v", err)
462
+
return
463
+
}
464
+
465
+
if !strings.Contains(string(content), tt.wantContent) {
466
+
t.Errorf("FetchRaw() content = %q, want content containing %q", string(content), tt.wantContent)
467
+
}
468
+
})
469
+
}
470
+
}
471
+
472
+
func TestFetcher_FetchRaw_URLValidation(t *testing.T) {
473
+
fetcher := NewFetcher()
474
+
475
+
tests := []struct {
476
+
name string
477
+
url string
478
+
errContains string
479
+
}{
480
+
{
481
+
name: "empty URL",
482
+
url: "",
483
+
errContains: "empty README URL",
484
+
},
485
+
{
486
+
name: "invalid URL scheme",
487
+
url: "ftp://example.com/README.md",
488
+
errContains: "invalid URL scheme",
489
+
},
490
+
{
491
+
name: "file URL scheme",
492
+
url: "file:///etc/passwd",
493
+
errContains: "invalid URL scheme",
494
+
},
495
+
}
496
+
497
+
for _, tt := range tests {
498
+
t.Run(tt.name, func(t *testing.T) {
499
+
_, err := fetcher.FetchRaw(context.Background(), tt.url)
500
+
if err == nil {
501
+
t.Errorf("FetchRaw(%q) expected error, got nil", tt.url)
502
+
return
503
+
}
504
+
if !strings.Contains(err.Error(), tt.errContains) {
505
+
t.Errorf("FetchRaw(%q) error = %q, want error containing %q", tt.url, err.Error(), tt.errContains)
506
+
}
507
+
})
508
+
}
509
+
}
510
+
511
+
func TestFetcher_FetchAndRender(t *testing.T) {
512
+
fetcher := NewFetcher()
513
+
514
+
tests := []struct {
515
+
name string
516
+
handler http.HandlerFunc
517
+
wantErr bool
518
+
errContains string
519
+
wantContain string
520
+
}{
521
+
{
522
+
name: "renders markdown to HTML",
523
+
handler: func(w http.ResponseWriter, r *http.Request) {
524
+
w.Header().Set("Content-Type", "text/plain")
525
+
w.Write([]byte("# Hello World\n\nThis is **bold** text."))
526
+
},
527
+
wantErr: false,
528
+
wantContain: "<strong>bold</strong>",
529
+
},
530
+
{
531
+
name: "rejects HTML content type",
532
+
handler: func(w http.ResponseWriter, r *http.Request) {
533
+
w.Header().Set("Content-Type", "text/html")
534
+
w.Write([]byte("<html><body>Error</body></html>"))
535
+
},
536
+
wantErr: true,
537
+
errContains: "unsupported content type",
538
+
},
539
+
{
540
+
name: "rejects soft 404",
541
+
handler: func(w http.ResponseWriter, r *http.Request) {
542
+
w.Header().Set("Content-Type", "text/plain")
543
+
w.Write([]byte("<!doctype html><html><body>Not Found</body></html>"))
544
+
},
545
+
wantErr: true,
546
+
errContains: "detected HTML content",
547
+
},
548
+
}
549
+
550
+
for _, tt := range tests {
551
+
t.Run(tt.name, func(t *testing.T) {
552
+
server := httptest.NewServer(tt.handler)
553
+
defer server.Close()
554
+
555
+
html, err := fetcher.FetchAndRender(context.Background(), server.URL)
556
+
557
+
if tt.wantErr {
558
+
if err == nil {
559
+
t.Errorf("FetchAndRender() expected error containing %q, got nil", tt.errContains)
560
+
return
561
+
}
562
+
if !strings.Contains(err.Error(), tt.errContains) {
563
+
t.Errorf("FetchAndRender() error = %q, want error containing %q", err.Error(), tt.errContains)
564
+
}
565
+
return
566
+
}
567
+
568
+
if err != nil {
569
+
t.Errorf("FetchAndRender() unexpected error: %v", err)
570
+
return
571
+
}
572
+
573
+
if !strings.Contains(html, tt.wantContain) {
574
+
t.Errorf("FetchAndRender() = %q, want HTML containing %q", html, tt.wantContain)
575
+
}
576
+
})
577
+
}
578
+
}
+40
-67
pkg/appview/storage/manifest_store.go
+40
-67
pkg/appview/storage/manifest_store.go
···
424
424
return nil
425
425
}
426
426
427
-
// ensureRepoPage creates or updates a repo page record in the user's PDS if needed
427
+
// ensureRepoPage creates or updates a repo page record in the user's PDS
428
428
// This syncs repository metadata from manifest annotations to the io.atcr.repo.page collection
429
-
// Only creates a new record if one doesn't exist (doesn't overwrite user's custom content)
429
+
// Always updates the description on push (since users can't edit it via appview yet)
430
+
// Preserves user's avatar if they've set one via the appview
430
431
func (s *ManifestStore) ensureRepoPage(ctx context.Context, manifestRecord *atproto.ManifestRecord) {
431
-
// Check if repo page already exists (don't overwrite user's custom content)
432
432
rkey := s.ctx.Repository
433
-
_, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey)
434
-
if err == nil {
435
-
// Record already exists - don't overwrite
436
-
slog.Debug("Repo page already exists, skipping creation", "did", s.ctx.DID, "repository", s.ctx.Repository)
437
-
return
438
-
}
439
433
440
-
// Only continue if it's a "not found" error - other errors mean we should skip
441
-
if !errors.Is(err, atproto.ErrRecordNotFound) {
434
+
// Check for existing record to preserve user's avatar
435
+
var existingAvatarRef *atproto.ATProtoBlobRef
436
+
var existingRecord *atproto.RepoPageRecord
437
+
record, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey)
438
+
if err == nil && record != nil {
439
+
// Unmarshal the Value to get the RepoPageRecord
440
+
var repoPage atproto.RepoPageRecord
441
+
if unmarshalErr := json.Unmarshal(record.Value, &repoPage); unmarshalErr == nil {
442
+
existingRecord = &repoPage
443
+
existingAvatarRef = repoPage.Avatar
444
+
slog.Debug("Found existing repo page, will update", "did", s.ctx.DID, "repository", s.ctx.Repository, "hasExistingAvatar", existingAvatarRef != nil)
445
+
} else {
446
+
slog.Warn("Failed to unmarshal existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", unmarshalErr)
447
+
}
448
+
} else if err != nil && !errors.Is(err, atproto.ErrRecordNotFound) {
449
+
// Unexpected error - log and continue (will create new record)
442
450
slog.Warn("Failed to check for existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err)
443
-
return
444
451
}
445
452
446
453
// Get annotations (may be nil if image has no OCI labels)
···
458
465
description = annotations["org.opencontainers.image.description"]
459
466
}
460
467
461
-
// Try to fetch and upload icon from io.atcr.icon annotation
462
-
var avatarRef *atproto.ATProtoBlobRef
468
+
// Determine avatar: prefer new icon from annotations, otherwise keep existing
469
+
avatarRef := existingAvatarRef
463
470
if iconURL := annotations["io.atcr.icon"]; iconURL != "" {
464
-
avatarRef = s.fetchAndUploadIcon(ctx, iconURL)
471
+
if newAvatar := s.fetchAndUploadIcon(ctx, iconURL); newAvatar != nil {
472
+
avatarRef = newAvatar
473
+
}
465
474
}
466
475
467
-
// Create new repo page record with description and optional avatar
476
+
// Create/update repo page record with description and avatar
468
477
repoPage := atproto.NewRepoPageRecord(s.ctx.Repository, description, avatarRef)
469
478
470
-
slog.Info("Creating repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil)
479
+
isUpdate := existingRecord != nil
480
+
action := "Creating"
481
+
if isUpdate {
482
+
action = "Updating"
483
+
}
484
+
slog.Info(action+" repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil)
471
485
472
486
_, err = s.ctx.ATProtoClient.PutRecord(ctx, atproto.RepoPageCollection, rkey, repoPage)
473
487
if err != nil {
474
-
slog.Warn("Failed to create repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err)
488
+
slog.Warn("Failed to "+strings.ToLower(action)+" repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err)
475
489
return
476
490
}
477
491
478
-
slog.Info("Repo page created successfully", "did", s.ctx.DID, "repository", s.ctx.Repository)
492
+
slog.Info("Repo page "+strings.ToLower(action)+"d successfully", "did", s.ctx.DID, "repository", s.ctx.Repository)
479
493
}
480
494
481
495
// fetchReadmeContent attempts to fetch README content from external sources
482
496
// Priority: io.atcr.readme annotation > derived from org.opencontainers.image.source
483
497
// Returns the raw markdown content, or empty string if not available
498
+
// Uses the shared readme.Fetcher which validates Content-Type and rejects HTML content
484
499
func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[string]string) string {
485
500
if s.ctx.ReadmeFetcher == nil {
486
501
return ""
···
492
507
493
508
// Priority 1: Direct README URL from io.atcr.readme annotation
494
509
if readmeURL := annotations["io.atcr.readme"]; readmeURL != "" {
495
-
content, err := s.fetchRawReadme(fetchCtx, readmeURL)
510
+
content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL)
496
511
if err != nil {
497
512
slog.Debug("Failed to fetch README from io.atcr.readme annotation", "url", readmeURL, "error", err)
498
-
} else if content != "" {
513
+
} else if len(content) > 0 {
499
514
slog.Info("Fetched README from io.atcr.readme annotation", "url", readmeURL, "length", len(content))
500
-
return content
515
+
return string(content)
501
516
}
502
517
}
503
518
···
510
525
continue
511
526
}
512
527
513
-
content, err := s.fetchRawReadme(fetchCtx, readmeURL)
528
+
content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL)
514
529
if err != nil {
515
530
// Only log non-404 errors (404 is expected when trying main vs master)
516
531
if !readme.Is404(err) {
···
519
534
continue
520
535
}
521
536
522
-
if content != "" {
537
+
if len(content) > 0 {
523
538
slog.Info("Fetched README from source URL", "sourceURL", sourceURL, "branch", branch, "length", len(content))
524
-
return content
539
+
return string(content)
525
540
}
526
541
}
527
542
}
528
543
529
544
return ""
530
-
}
531
-
532
-
// fetchRawReadme fetches raw markdown content from a URL
533
-
// Returns the raw markdown (not rendered HTML) for storage in the repo page record
534
-
func (s *ManifestStore) fetchRawReadme(ctx context.Context, readmeURL string) (string, error) {
535
-
// Use a simple HTTP client to fetch raw content
536
-
// We want raw markdown, not rendered HTML (the Fetcher renders to HTML)
537
-
req, err := http.NewRequestWithContext(ctx, "GET", readmeURL, nil)
538
-
if err != nil {
539
-
return "", fmt.Errorf("failed to create request: %w", err)
540
-
}
541
-
542
-
req.Header.Set("User-Agent", "ATCR-README-Fetcher/1.0")
543
-
544
-
client := &http.Client{
545
-
Timeout: 10 * time.Second,
546
-
CheckRedirect: func(req *http.Request, via []*http.Request) error {
547
-
if len(via) >= 5 {
548
-
return fmt.Errorf("too many redirects")
549
-
}
550
-
return nil
551
-
},
552
-
}
553
-
554
-
resp, err := client.Do(req)
555
-
if err != nil {
556
-
return "", fmt.Errorf("failed to fetch URL: %w", err)
557
-
}
558
-
defer resp.Body.Close()
559
-
560
-
if resp.StatusCode != http.StatusOK {
561
-
return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
562
-
}
563
-
564
-
// Limit content size to 100KB (repo page description has 100KB limit in lexicon)
565
-
limitedReader := io.LimitReader(resp.Body, 100*1024)
566
-
content, err := io.ReadAll(limitedReader)
567
-
if err != nil {
568
-
return "", fmt.Errorf("failed to read response body: %w", err)
569
-
}
570
-
571
-
return string(content), nil
572
545
}
573
546
574
547
// fetchAndUploadIcon fetches an image from a URL and uploads it as a blob to the user's PDS