create_pdf.py at main · dunkirk.sh/cedrus

dunkirk.sh / cedrus
downloads cedarville publishing books as pdf
cedrus / create_pdf.py
at main 3.9 kB view raw
  1#!/usr/bin/env python3
  2"""
  3Create PDF using Playwright's native PDF printing (creates vectors!).
  4"""
  5
  6import asyncio
  7from pathlib import Path
  8from PIL import Image
  9from playwright.async_api import async_playwright
 10
 11async def main():
 12    script_dir = Path(__file__).parent
 13    svg_dir = script_dir / "svg_layers"
 14    webp_dir = script_dir / "webp_highres"
 15    pdf_dir = script_dir / "pdf_pages"
 16    pdf_dir.mkdir(exist_ok=True)
 17    
 18    print("Creating vector PDFs using Playwright Print-to-PDF...")
 19    print()
 20    
 21    # Get page dimensions
 22    sample_webp = webp_dir / "page_0020_3.webp"
 23    img = Image.open(sample_webp)
 24    page_width, page_height = img.size
 25    
 26    # Convert pixels to inches (assuming 96 DPI)
 27    width_inches = page_width / 96
 28    height_inches = page_height / 96
 29    
 30    async with async_playwright() as p:
 31        browser = await p.chromium.launch(headless=True)
 32        page = await browser.new_page()
 33        
 34        # Generate PDF for each page
 35        for page_num in range(1, 341):
 36            if page_num % 10 == 0:
 37                print(f"  Creating PDF: {page_num}/340...")
 38            
 39            svg_file = svg_dir / f"page_{page_num:04d}.svg"
 40            webp_file = webp_dir / f"page_{page_num:04d}_3.webp"
 41            
 42            # Create HTML
 43            html = f"""<!DOCTYPE html>
 44<html>
 45<head>
 46    <style>
 47        * {{ margin: 0; padding: 0; }}
 48        body {{ 
 49            width: {page_width}px; 
 50            height: {page_height}px;
 51            position: relative;
 52        }}
 53        .layer {{
 54            position: absolute;
 55            top: 0;
 56            left: 0;
 57            width: {page_width}px;
 58            height: {page_height}px;
 59        }}
 60    </style>
 61</head>
 62<body>
 63"""
 64            
 65            if webp_file.exists():
 66                html += f'    <img class="layer" src="file://{webp_file.absolute()}" />\n'
 67            
 68            if svg_file.exists():
 69                html += f'    <img class="layer" src="file://{svg_file.absolute()}" />\n'
 70            
 71            html += """</body>
 72</html>"""
 73            
 74            # Save HTML
 75            html_file = pdf_dir / f"page_{page_num:04d}.html"
 76            with open(html_file, 'w') as f:
 77                f.write(html)
 78            
 79            # Navigate and print to PDF
 80            await page.goto(f"file://{html_file.absolute()}")
 81            await page.wait_for_load_state('networkidle')
 82            
 83            pdf_file = pdf_dir / f"page_{page_num:04d}.pdf"
 84            await page.pdf(
 85                path=str(pdf_file),
 86                width=f"{width_inches}in",
 87                height=f"{height_inches}in",
 88                print_background=True,
 89                margin={'top': '0', 'bottom': '0', 'left': '0', 'right': '0'}
 90            )
 91            
 92            # Cleanup HTML
 93            html_file.unlink()
 94        
 95        await browser.close()
 96    
 97    print()
 98    print("Merging individual PDFs...")
 99    
100    # Merge all PDFs using PyPDF2
101    from PyPDF2 import PdfMerger
102    
103    merger = PdfMerger()
104    pdf_files = sorted(pdf_dir.glob("page_*.pdf"))
105    
106    for i, pdf_file in enumerate(pdf_files, 1):
107        if i % 50 == 0:
108            print(f"  Merging: {i}/{len(pdf_files)}...")
109        merger.append(str(pdf_file))
110    
111    output_pdf = script_dir / "Invitation_to_Cybersecurity.pdf"
112    merger.write(str(output_pdf))
113    merger.close()
114    
115    # Cleanup individual PDFs
116    print("\nCleaning up...")
117    for pdf_file in pdf_files:
118        pdf_file.unlink()
119    pdf_dir.rmdir()
120    
121    file_size = output_pdf.stat().st_size / 1024 / 1024
122    print()
123    print(f"✓ Vector PDF created!")
124    print(f"  Location: {output_pdf}")
125    print(f"  Pages: {len(pdf_files)}")
126    print(f"  Size: {file_size:.1f} MB")
127    print()
128    print("Text should be vector with embedded fonts!")
129
if __name__ == "__main__":
    # Executed only when run as a script: drive the async pipeline to the end.
    asyncio.run(main())