A container registry that uses the AT Protocol for manifest storage and S3 for blob storage. atcr.io
docker container atproto go

Fix an issue where soft 404 pages were being rendered as README content. Always update the repo page content on push.

evan.jarrett.net f74bc301 6dd612e1

verified
Changed files
+372 -84
pkg
+61 -16
pkg/appview/readme/fetcher.go
··· 70 70 // FetchAndRender fetches a README from a URL and renders it as HTML 71 71 // Returns the rendered HTML and any error 72 72 func (f *Fetcher) FetchAndRender(ctx context.Context, readmeURL string) (string, error) { 73 - // Validate URL 74 - if readmeURL == "" { 75 - return "", fmt.Errorf("empty README URL") 76 - } 77 - 78 - parsedURL, err := url.Parse(readmeURL) 79 - if err != nil { 80 - return "", fmt.Errorf("invalid README URL: %w", err) 81 - } 82 - 83 - // Only allow HTTP/HTTPS 84 - if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" { 85 - return "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme) 86 - } 87 - 88 - // Fetch content 73 + // Fetch content (includes URL validation, Content-Type check, and HTML detection) 89 74 content, baseURL, err := f.fetchContent(ctx, readmeURL) 90 75 if err != nil { 91 76 return "", err ··· 100 85 return html, nil 101 86 } 102 87 88 + // FetchRaw fetches raw README content from a URL without rendering 89 + // Returns raw bytes with Content-Type and HTML validation 90 + // Use this when you need to store the raw markdown (e.g., in PDS records) 91 + func (f *Fetcher) FetchRaw(ctx context.Context, readmeURL string) ([]byte, error) { 92 + // Fetch content (includes URL validation, Content-Type check, and HTML detection) 93 + content, _, err := f.fetchContent(ctx, readmeURL) 94 + if err != nil { 95 + return nil, err 96 + } 97 + 98 + return content, nil 99 + } 100 + 103 101 // fetchContent fetches the raw content from a URL 104 102 func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, string, error) { 103 + // Validate URL 104 + if urlStr == "" { 105 + return nil, "", fmt.Errorf("empty README URL") 106 + } 107 + 108 + parsedURL, err := url.Parse(urlStr) 109 + if err != nil { 110 + return nil, "", fmt.Errorf("invalid README URL: %w", err) 111 + } 112 + 113 + // Only allow HTTP/HTTPS 114 + if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" { 115 + return nil, "", 
fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme) 116 + } 117 + 105 118 req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) 106 119 if err != nil { 107 120 return nil, "", fmt.Errorf("failed to create request: %w", err) ··· 120 133 return nil, "", fmt.Errorf("unexpected status code: %d", resp.StatusCode) 121 134 } 122 135 136 + // Reject HTML content types (catches proper error pages) 137 + contentType := resp.Header.Get("Content-Type") 138 + if contentType != "" { 139 + ct := strings.ToLower(contentType) 140 + if strings.Contains(ct, "text/html") || strings.Contains(ct, "application/xhtml") { 141 + return nil, "", fmt.Errorf("unsupported content type: %s (expected markdown or plain text)", contentType) 142 + } 143 + } 144 + 123 145 // Limit content size to 1MB 124 146 limitedReader := io.LimitReader(resp.Body, 1*1024*1024) 125 147 content, err := io.ReadAll(limitedReader) ··· 127 149 return nil, "", fmt.Errorf("failed to read response body: %w", err) 128 150 } 129 151 152 + // Detect HTML content by checking for common markers (catches soft 404s) 153 + if LooksLikeHTML(content) { 154 + return nil, "", fmt.Errorf("detected HTML content instead of markdown") 155 + } 156 + 130 157 // Get base URL for relative link resolution 131 158 baseURL := getBaseURL(resp.Request.URL) 132 159 133 160 return content, baseURL, nil 161 + } 162 + 163 + // LooksLikeHTML checks if content appears to be HTML rather than markdown 164 + // Exported for use by other packages that fetch README content 165 + func LooksLikeHTML(content []byte) bool { 166 + if len(content) == 0 { 167 + return false 168 + } 169 + 170 + // Check first 512 bytes for HTML markers 171 + checkLen := min(len(content), 512) 172 + 173 + trimmed := strings.TrimSpace(string(content[:checkLen])) 174 + lower := strings.ToLower(trimmed) 175 + 176 + return strings.HasPrefix(lower, "<!doctype") || 177 + strings.HasPrefix(lower, "<html") || 178 + strings.HasPrefix(lower, "<?xml") 134 179 } 135 180 136 181 // 
renderMarkdown renders markdown content to sanitized HTML
+271 -1
pkg/appview/readme/fetcher_test.go
··· 1 1 package readme 2 2 3 3 import ( 4 + "context" 5 + "net/http" 6 + "net/http/httptest" 4 7 "net/url" 8 + "strings" 5 9 "testing" 6 10 ) 7 11 ··· 305 309 return false 306 310 } 307 311 308 - // TODO: Add README fetching and caching tests 312 + func TestLooksLikeHTML(t *testing.T) { 313 + tests := []struct { 314 + name string 315 + content string 316 + expected bool 317 + }{ 318 + { 319 + name: "empty content", 320 + content: "", 321 + expected: false, 322 + }, 323 + { 324 + name: "markdown content", 325 + content: "# Hello World\n\nThis is a README.", 326 + expected: false, 327 + }, 328 + { 329 + name: "plain text", 330 + content: "Just some plain text without any HTML.", 331 + expected: false, 332 + }, 333 + { 334 + name: "doctype html", 335 + content: "<!DOCTYPE html>\n<html><body>Page</body></html>", 336 + expected: true, 337 + }, 338 + { 339 + name: "doctype html lowercase", 340 + content: "<!doctype html>\n<html><body>Page</body></html>", 341 + expected: true, 342 + }, 343 + { 344 + name: "html tag only", 345 + content: "<html><head></head><body>Page</body></html>", 346 + expected: true, 347 + }, 348 + { 349 + name: "html tag with whitespace", 350 + content: " \n <html>\n<body>Page</body></html>", 351 + expected: true, 352 + }, 353 + { 354 + name: "xml declaration", 355 + content: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html>...</html>", 356 + expected: true, 357 + }, 358 + { 359 + name: "soft 404 page", 360 + content: "<!DOCTYPE html><html><head><title>Page Not Found</title></head><body><h1>404</h1></body></html>", 361 + expected: true, 362 + }, 363 + { 364 + name: "markdown with inline html", 365 + content: "# Title\n\nSome text with <strong>bold</strong> inline.", 366 + expected: false, 367 + }, 368 + { 369 + name: "markdown starting with hash", 370 + content: "## Section\n\nContent here.", 371 + expected: false, 372 + }, 373 + } 374 + 375 + for _, tt := range tests { 376 + t.Run(tt.name, func(t *testing.T) { 377 + result := 
LooksLikeHTML([]byte(tt.content)) 378 + if result != tt.expected { 379 + t.Errorf("looksLikeHTML(%q) = %v, want %v", tt.content, result, tt.expected) 380 + } 381 + }) 382 + } 383 + } 384 + 385 + func TestFetcher_FetchRaw(t *testing.T) { 386 + fetcher := NewFetcher() 387 + 388 + tests := []struct { 389 + name string 390 + handler http.HandlerFunc 391 + wantErr bool 392 + errContains string 393 + wantContent string 394 + }{ 395 + { 396 + name: "successful markdown fetch", 397 + handler: func(w http.ResponseWriter, r *http.Request) { 398 + w.Header().Set("Content-Type", "text/plain") 399 + w.Write([]byte("# Hello World\n\nThis is markdown.")) 400 + }, 401 + wantErr: false, 402 + wantContent: "# Hello World", 403 + }, 404 + { 405 + name: "rejects HTML content type", 406 + handler: func(w http.ResponseWriter, r *http.Request) { 407 + w.Header().Set("Content-Type", "text/html; charset=utf-8") 408 + w.Write([]byte("<html><body>Error</body></html>")) 409 + }, 410 + wantErr: true, 411 + errContains: "unsupported content type", 412 + }, 413 + { 414 + name: "rejects soft 404 HTML content", 415 + handler: func(w http.ResponseWriter, r *http.Request) { 416 + w.Header().Set("Content-Type", "text/plain") 417 + w.Write([]byte("<!DOCTYPE html><html><body>404 Not Found</body></html>")) 418 + }, 419 + wantErr: true, 420 + errContains: "detected HTML content", 421 + }, 422 + { 423 + name: "rejects 404 status", 424 + handler: func(w http.ResponseWriter, r *http.Request) { 425 + w.WriteHeader(http.StatusNotFound) 426 + w.Write([]byte("Not Found")) 427 + }, 428 + wantErr: true, 429 + errContains: "unexpected status code: 404", 430 + }, 431 + { 432 + name: "rejects 500 status", 433 + handler: func(w http.ResponseWriter, r *http.Request) { 434 + w.WriteHeader(http.StatusInternalServerError) 435 + w.Write([]byte("Internal Server Error")) 436 + }, 437 + wantErr: true, 438 + errContains: "unexpected status code: 500", 439 + }, 440 + } 441 + 442 + for _, tt := range tests { 443 + 
t.Run(tt.name, func(t *testing.T) { 444 + server := httptest.NewServer(tt.handler) 445 + defer server.Close() 446 + 447 + content, err := fetcher.FetchRaw(context.Background(), server.URL) 448 + 449 + if tt.wantErr { 450 + if err == nil { 451 + t.Errorf("FetchRaw() expected error containing %q, got nil", tt.errContains) 452 + return 453 + } 454 + if !strings.Contains(err.Error(), tt.errContains) { 455 + t.Errorf("FetchRaw() error = %q, want error containing %q", err.Error(), tt.errContains) 456 + } 457 + return 458 + } 459 + 460 + if err != nil { 461 + t.Errorf("FetchRaw() unexpected error: %v", err) 462 + return 463 + } 464 + 465 + if !strings.Contains(string(content), tt.wantContent) { 466 + t.Errorf("FetchRaw() content = %q, want content containing %q", string(content), tt.wantContent) 467 + } 468 + }) 469 + } 470 + } 471 + 472 + func TestFetcher_FetchRaw_URLValidation(t *testing.T) { 473 + fetcher := NewFetcher() 474 + 475 + tests := []struct { 476 + name string 477 + url string 478 + errContains string 479 + }{ 480 + { 481 + name: "empty URL", 482 + url: "", 483 + errContains: "empty README URL", 484 + }, 485 + { 486 + name: "invalid URL scheme", 487 + url: "ftp://example.com/README.md", 488 + errContains: "invalid URL scheme", 489 + }, 490 + { 491 + name: "file URL scheme", 492 + url: "file:///etc/passwd", 493 + errContains: "invalid URL scheme", 494 + }, 495 + } 496 + 497 + for _, tt := range tests { 498 + t.Run(tt.name, func(t *testing.T) { 499 + _, err := fetcher.FetchRaw(context.Background(), tt.url) 500 + if err == nil { 501 + t.Errorf("FetchRaw(%q) expected error, got nil", tt.url) 502 + return 503 + } 504 + if !strings.Contains(err.Error(), tt.errContains) { 505 + t.Errorf("FetchRaw(%q) error = %q, want error containing %q", tt.url, err.Error(), tt.errContains) 506 + } 507 + }) 508 + } 509 + } 510 + 511 + func TestFetcher_FetchAndRender(t *testing.T) { 512 + fetcher := NewFetcher() 513 + 514 + tests := []struct { 515 + name string 516 + handler 
http.HandlerFunc 517 + wantErr bool 518 + errContains string 519 + wantContain string 520 + }{ 521 + { 522 + name: "renders markdown to HTML", 523 + handler: func(w http.ResponseWriter, r *http.Request) { 524 + w.Header().Set("Content-Type", "text/plain") 525 + w.Write([]byte("# Hello World\n\nThis is **bold** text.")) 526 + }, 527 + wantErr: false, 528 + wantContain: "<strong>bold</strong>", 529 + }, 530 + { 531 + name: "rejects HTML content type", 532 + handler: func(w http.ResponseWriter, r *http.Request) { 533 + w.Header().Set("Content-Type", "text/html") 534 + w.Write([]byte("<html><body>Error</body></html>")) 535 + }, 536 + wantErr: true, 537 + errContains: "unsupported content type", 538 + }, 539 + { 540 + name: "rejects soft 404", 541 + handler: func(w http.ResponseWriter, r *http.Request) { 542 + w.Header().Set("Content-Type", "text/plain") 543 + w.Write([]byte("<!doctype html><html><body>Not Found</body></html>")) 544 + }, 545 + wantErr: true, 546 + errContains: "detected HTML content", 547 + }, 548 + } 549 + 550 + for _, tt := range tests { 551 + t.Run(tt.name, func(t *testing.T) { 552 + server := httptest.NewServer(tt.handler) 553 + defer server.Close() 554 + 555 + html, err := fetcher.FetchAndRender(context.Background(), server.URL) 556 + 557 + if tt.wantErr { 558 + if err == nil { 559 + t.Errorf("FetchAndRender() expected error containing %q, got nil", tt.errContains) 560 + return 561 + } 562 + if !strings.Contains(err.Error(), tt.errContains) { 563 + t.Errorf("FetchAndRender() error = %q, want error containing %q", err.Error(), tt.errContains) 564 + } 565 + return 566 + } 567 + 568 + if err != nil { 569 + t.Errorf("FetchAndRender() unexpected error: %v", err) 570 + return 571 + } 572 + 573 + if !strings.Contains(html, tt.wantContain) { 574 + t.Errorf("FetchAndRender() = %q, want HTML containing %q", html, tt.wantContain) 575 + } 576 + }) 577 + } 578 + }
+40 -67
pkg/appview/storage/manifest_store.go
··· 424 424 return nil 425 425 } 426 426 427 - // ensureRepoPage creates or updates a repo page record in the user's PDS if needed 427 + // ensureRepoPage creates or updates a repo page record in the user's PDS 428 428 // This syncs repository metadata from manifest annotations to the io.atcr.repo.page collection 429 - // Only creates a new record if one doesn't exist (doesn't overwrite user's custom content) 429 + // Always updates the description on push (since users can't edit it via appview yet) 430 + // Preserves user's avatar if they've set one via the appview 430 431 func (s *ManifestStore) ensureRepoPage(ctx context.Context, manifestRecord *atproto.ManifestRecord) { 431 - // Check if repo page already exists (don't overwrite user's custom content) 432 432 rkey := s.ctx.Repository 433 - _, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey) 434 - if err == nil { 435 - // Record already exists - don't overwrite 436 - slog.Debug("Repo page already exists, skipping creation", "did", s.ctx.DID, "repository", s.ctx.Repository) 437 - return 438 - } 439 433 440 - // Only continue if it's a "not found" error - other errors mean we should skip 441 - if !errors.Is(err, atproto.ErrRecordNotFound) { 434 + // Check for existing record to preserve user's avatar 435 + var existingAvatarRef *atproto.ATProtoBlobRef 436 + var existingRecord *atproto.RepoPageRecord 437 + record, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey) 438 + if err == nil && record != nil { 439 + // Unmarshal the Value to get the RepoPageRecord 440 + var repoPage atproto.RepoPageRecord 441 + if unmarshalErr := json.Unmarshal(record.Value, &repoPage); unmarshalErr == nil { 442 + existingRecord = &repoPage 443 + existingAvatarRef = repoPage.Avatar 444 + slog.Debug("Found existing repo page, will update", "did", s.ctx.DID, "repository", s.ctx.Repository, "hasExistingAvatar", existingAvatarRef != nil) 445 + } else { 446 + slog.Warn("Failed to unmarshal 
existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", unmarshalErr) 447 + } 448 + } else if err != nil && !errors.Is(err, atproto.ErrRecordNotFound) { 449 + // Unexpected error - log and continue (will create new record) 442 450 slog.Warn("Failed to check for existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err) 443 - return 444 451 } 445 452 446 453 // Get annotations (may be nil if image has no OCI labels) ··· 458 465 description = annotations["org.opencontainers.image.description"] 459 466 } 460 467 461 - // Try to fetch and upload icon from io.atcr.icon annotation 462 - var avatarRef *atproto.ATProtoBlobRef 468 + // Determine avatar: prefer new icon from annotations, otherwise keep existing 469 + avatarRef := existingAvatarRef 463 470 if iconURL := annotations["io.atcr.icon"]; iconURL != "" { 464 - avatarRef = s.fetchAndUploadIcon(ctx, iconURL) 471 + if newAvatar := s.fetchAndUploadIcon(ctx, iconURL); newAvatar != nil { 472 + avatarRef = newAvatar 473 + } 465 474 } 466 475 467 - // Create new repo page record with description and optional avatar 476 + // Create/update repo page record with description and avatar 468 477 repoPage := atproto.NewRepoPageRecord(s.ctx.Repository, description, avatarRef) 469 478 470 - slog.Info("Creating repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil) 479 + isUpdate := existingRecord != nil 480 + action := "Creating" 481 + if isUpdate { 482 + action = "Updating" 483 + } 484 + slog.Info(action+" repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil) 471 485 472 486 _, err = s.ctx.ATProtoClient.PutRecord(ctx, atproto.RepoPageCollection, rkey, repoPage) 473 487 if err != nil { 474 - slog.Warn("Failed to create repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, 
"error", err) 488 + slog.Warn("Failed to "+strings.ToLower(action)+" repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err) 475 489 return 476 490 } 477 491 478 - slog.Info("Repo page created successfully", "did", s.ctx.DID, "repository", s.ctx.Repository) 492 + slog.Info("Repo page "+strings.ToLower(action)+"d successfully", "did", s.ctx.DID, "repository", s.ctx.Repository) 479 493 } 480 494 481 495 // fetchReadmeContent attempts to fetch README content from external sources 482 496 // Priority: io.atcr.readme annotation > derived from org.opencontainers.image.source 483 497 // Returns the raw markdown content, or empty string if not available 498 + // Uses the shared readme.Fetcher which validates Content-Type and rejects HTML content 484 499 func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[string]string) string { 485 500 if s.ctx.ReadmeFetcher == nil { 486 501 return "" ··· 492 507 493 508 // Priority 1: Direct README URL from io.atcr.readme annotation 494 509 if readmeURL := annotations["io.atcr.readme"]; readmeURL != "" { 495 - content, err := s.fetchRawReadme(fetchCtx, readmeURL) 510 + content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL) 496 511 if err != nil { 497 512 slog.Debug("Failed to fetch README from io.atcr.readme annotation", "url", readmeURL, "error", err) 498 - } else if content != "" { 513 + } else if len(content) > 0 { 499 514 slog.Info("Fetched README from io.atcr.readme annotation", "url", readmeURL, "length", len(content)) 500 - return content 515 + return string(content) 501 516 } 502 517 } 503 518 ··· 510 525 continue 511 526 } 512 527 513 - content, err := s.fetchRawReadme(fetchCtx, readmeURL) 528 + content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL) 514 529 if err != nil { 515 530 // Only log non-404 errors (404 is expected when trying main vs master) 516 531 if !readme.Is404(err) { ··· 519 534 continue 520 535 } 521 536 522 - if content != "" { 537 + if 
len(content) > 0 { 523 538 slog.Info("Fetched README from source URL", "sourceURL", sourceURL, "branch", branch, "length", len(content)) 524 - return content 539 + return string(content) 525 540 } 526 541 } 527 542 } 528 543 529 544 return "" 530 - } 531 - 532 - // fetchRawReadme fetches raw markdown content from a URL 533 - // Returns the raw markdown (not rendered HTML) for storage in the repo page record 534 - func (s *ManifestStore) fetchRawReadme(ctx context.Context, readmeURL string) (string, error) { 535 - // Use a simple HTTP client to fetch raw content 536 - // We want raw markdown, not rendered HTML (the Fetcher renders to HTML) 537 - req, err := http.NewRequestWithContext(ctx, "GET", readmeURL, nil) 538 - if err != nil { 539 - return "", fmt.Errorf("failed to create request: %w", err) 540 - } 541 - 542 - req.Header.Set("User-Agent", "ATCR-README-Fetcher/1.0") 543 - 544 - client := &http.Client{ 545 - Timeout: 10 * time.Second, 546 - CheckRedirect: func(req *http.Request, via []*http.Request) error { 547 - if len(via) >= 5 { 548 - return fmt.Errorf("too many redirects") 549 - } 550 - return nil 551 - }, 552 - } 553 - 554 - resp, err := client.Do(req) 555 - if err != nil { 556 - return "", fmt.Errorf("failed to fetch URL: %w", err) 557 - } 558 - defer resp.Body.Close() 559 - 560 - if resp.StatusCode != http.StatusOK { 561 - return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode) 562 - } 563 - 564 - // Limit content size to 100KB (repo page description has 100KB limit in lexicon) 565 - limitedReader := io.LimitReader(resp.Body, 100*1024) 566 - content, err := io.ReadAll(limitedReader) 567 - if err != nil { 568 - return "", fmt.Errorf("failed to read response body: %w", err) 569 - } 570 - 571 - return string(content), nil 572 545 } 573 546 574 547 // fetchAndUploadIcon fetches an image from a URL and uploads it as a blob to the user's PDS