Downloads Cedarville Publishing books as PDF
at main 3.9 kB view raw
#!/usr/bin/env python3
"""
Create PDF using Playwright's native PDF printing (creates vectors!).

For every page number the script looks for two optional layers next to this
file:

* ``webp_highres/page_NNNN_3.webp`` - raster layer
* ``svg_layers/page_NNNN.svg``      - vector layer, stacked on top

The layers are overlaid in a throwaway HTML document, printed to a
single-page PDF with headless Chromium, and the per-page PDFs are finally
merged into one output file with PyPDF2.
"""

import asyncio
from pathlib import Path
from typing import List, Tuple

from PIL import Image
from playwright.async_api import async_playwright

# Pages are numbered 1..TOTAL_PAGES (inclusive).
TOTAL_PAGES = 340
# Page whose raster image is preferred for measuring the page dimensions.
SAMPLE_PAGE = 20
# Pixel-to-inch conversion factor (the script assumes 96 DPI).
CSS_DPI = 96
# File name of the merged PDF, written next to this script.
OUTPUT_NAME = "Invitation_to_Cybersecurity.pdf"


def _read_page_size(webp_dir: Path) -> Tuple[int, int]:
    """Return ``(width, height)`` in pixels of a representative page image.

    The image for ``SAMPLE_PAGE`` is preferred; if it is missing, the first
    ``page_*_3.webp`` file (in sorted order) is used instead.

    Raises:
        FileNotFoundError: if ``webp_dir`` contains no page image at all.
    """
    sample = webp_dir / f"page_{SAMPLE_PAGE:04d}_3.webp"
    if not sample.exists():
        candidates = sorted(webp_dir.glob("page_*_3.webp"))
        if not candidates:
            raise FileNotFoundError(
                f"No page images (page_*_3.webp) found in {webp_dir}"
            )
        sample = candidates[0]

    # Context manager so the underlying file handle is released immediately.
    with Image.open(sample) as img:
        return img.size


def _build_page_html(page_width: int, page_height: int,
                     layers: List[Path]) -> str:
    """Return an HTML document stacking ``layers`` at the page's top-left.

    Every layer is absolutely positioned and stretched to the full page size,
    so later entries in ``layers`` are painted over earlier ones.
    """
    parts = [f"""<!DOCTYPE html>
<html>
<head>
    <style>
        * {{ margin: 0; padding: 0; }}
        body {{
            width: {page_width}px;
            height: {page_height}px;
            position: relative;
        }}
        .layer {{
            position: absolute;
            top: 0;
            left: 0;
            width: {page_width}px;
            height: {page_height}px;
        }}
    </style>
</head>
<body>
"""]
    for layer in layers:
        # as_uri() percent-encodes spaces, quotes, '#', '%' etc., which a
        # hand-built "file://" + path string would leave broken.
        parts.append(
            f'    <img class="layer" src="{layer.absolute().as_uri()}" />\n'
        )
    parts.append("""</body>
</html>""")
    return "".join(parts)


async def _render_pages(pdf_dir: Path, svg_dir: Path, webp_dir: Path,
                        page_width: int, page_height: int) -> List[Path]:
    """Print every page to its own PDF in ``pdf_dir``.

    Returns the generated PDF paths in page order. A page with neither layer
    present still produces an (empty) PDF page.
    """
    width_inches = page_width / CSS_DPI
    height_inches = page_height / CSS_DPI
    rendered: List[Path] = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            for page_num in range(1, TOTAL_PAGES + 1):
                if page_num % 10 == 0:
                    print(f"  Creating PDF: {page_num}/{TOTAL_PAGES}...")

                stem = f"page_{page_num:04d}"
                # Raster first, SVG second, so the SVG ends up on top.
                candidates = (webp_dir / f"{stem}_3.webp",
                              svg_dir / f"{stem}.svg")
                layers = [layer for layer in candidates if layer.exists()]

                html_file = pdf_dir / f"{stem}.html"
                html_file.write_text(
                    _build_page_html(page_width, page_height, layers),
                    encoding="utf-8",
                )
                pdf_file = pdf_dir / f"{stem}.pdf"
                try:
                    await page.goto(html_file.absolute().as_uri())
                    await page.wait_for_load_state("networkidle")
                    await page.pdf(
                        path=str(pdf_file),
                        width=f"{width_inches}in",
                        height=f"{height_inches}in",
                        print_background=True,
                        margin={"top": "0", "bottom": "0",
                                "left": "0", "right": "0"},
                    )
                finally:
                    # Remove the temporary HTML even when rendering fails.
                    html_file.unlink()

                rendered.append(pdf_file)
        finally:
            # Always shut the browser down, also on errors.
            await browser.close()

    return rendered


def _merge_pdfs(pdf_files: List[Path], output_pdf: Path) -> None:
    """Concatenate ``pdf_files`` in order into ``output_pdf`` using PyPDF2."""
    # Imported lazily, as before, so PyPDF2 is only needed at merge time.
    from PyPDF2 import PdfMerger

    merger = PdfMerger()
    try:
        for i, pdf_file in enumerate(pdf_files, 1):
            if i % 50 == 0:
                print(f"  Merging: {i}/{len(pdf_files)}...")
            merger.append(str(pdf_file))
        merger.write(str(output_pdf))
    finally:
        merger.close()


async def main() -> None:
    """Render all pages to PDF, merge them and remove the intermediates.

    Reads the layers from ``svg_layers/`` and ``webp_highres/`` next to this
    script, uses ``pdf_pages/`` as a scratch directory and writes the merged
    document to ``OUTPUT_NAME`` in the script directory.
    """
    script_dir = Path(__file__).parent
    svg_dir = script_dir / "svg_layers"
    webp_dir = script_dir / "webp_highres"
    pdf_dir = script_dir / "pdf_pages"
    pdf_dir.mkdir(exist_ok=True)

    print("Creating vector PDFs using Playwright Print-to-PDF...")
    print()

    # All pages are printed at the size of one representative page image.
    page_width, page_height = _read_page_size(webp_dir)

    pdf_files = await _render_pages(
        pdf_dir, svg_dir, webp_dir, page_width, page_height
    )

    print()
    print("Merging individual PDFs...")

    # Merge exactly the files produced by this run, so stale PDFs left in
    # the scratch directory by an earlier run cannot leak into the output.
    output_pdf = script_dir / OUTPUT_NAME
    _merge_pdfs(pdf_files, output_pdf)

    # Cleanup individual PDFs
    print("\nCleaning up...")
    for pdf_file in pdf_files:
        pdf_file.unlink()
    try:
        pdf_dir.rmdir()
    except OSError:
        # Directory holds files this run did not create; the merged PDF is
        # already written, so keep the directory instead of failing.
        print(f"  Note: {pdf_dir} is not empty and was left in place.")

    file_size = output_pdf.stat().st_size / 1024 / 1024
    print()
    print("✓ Vector PDF created!")
    print(f"  Location: {output_pdf}")
    print(f"  Pages: {len(pdf_files)}")
    print(f"  Size: {file_size:.1f} MB")
    print()
    print("Text should be vector with embedded fonts!")


if __name__ == "__main__":
    asyncio.run(main())