Downloads Cedarville Publishing books as PDF.
1#!/usr/bin/env python3
2"""
3Create PDF using Playwright's native PDF printing (creates vectors!).
4"""
5
6import asyncio
7from pathlib import Path
8from PIL import Image
9from playwright.async_api import async_playwright
10
def _build_page_html(page_width, page_height, webp_file, svg_file):
    """Return the HTML document that stacks one page's image layers.

    Args:
        page_width: Page width in CSS pixels.
        page_height: Page height in CSS pixels.
        webp_file: Path of the raster layer; skipped when it does not exist.
        svg_file: Path of the SVG layer; skipped when it does not exist.

    Returns:
        The complete HTML source as a string. The raster layer is emitted
        first so the SVG layer is drawn on top of it.
    """
    # Path.as_uri() percent-encodes spaces, '#', '%', etc. and produces a
    # valid file:// URL on every platform, unlike "file://" + str(path).
    layer_markup = "".join(
        f' <img class="layer" src="{layer.absolute().as_uri()}" />\n'
        for layer in (webp_file, svg_file)
        if layer.exists()
    )
    return f"""<!DOCTYPE html>
<html>
<head>
 <style>
 * {{ margin: 0; padding: 0; }}
 body {{
 width: {page_width}px;
 height: {page_height}px;
 position: relative;
 }}
 .layer {{
 position: absolute;
 top: 0;
 left: 0;
 width: {page_width}px;
 height: {page_height}px;
 }}
 </style>
</head>
<body>
{layer_markup}</body>
</html>"""


async def main() -> None:
    """Print every page to PDF with headless Chromium and merge the results.

    For each page number the function looks for ``svg_layers/page_NNNN.svg``
    and ``webp_highres/page_NNNN_3.webp`` next to this script, writes a
    temporary HTML file that overlays whichever of them exist, prints it to
    ``pdf_pages/page_NNNN.pdf``, and finally merges all page PDFs into
    ``Invitation_to_Cybersecurity.pdf`` beside the script. The intermediate
    PDFs and the ``pdf_pages`` directory are removed afterwards.

    The page size is taken from ``webp_highres/page_0020_3.webp``, which must
    exist; opening it fails otherwise.
    """
    total_pages = 340

    script_dir = Path(__file__).parent
    svg_dir = script_dir / "svg_layers"
    webp_dir = script_dir / "webp_highres"
    pdf_dir = script_dir / "pdf_pages"
    pdf_dir.mkdir(exist_ok=True)

    print("Creating vector PDFs using Playwright Print-to-PDF...")
    print()

    # Get page dimensions from one representative page; the context manager
    # releases the file handle as soon as the size has been read.
    sample_webp = webp_dir / "page_0020_3.webp"
    with Image.open(sample_webp) as img:
        page_width, page_height = img.size

    # Convert pixels to inches (assuming 96 DPI)
    width_inches = page_width / 96
    height_inches = page_height / 96

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Generate PDF for each page
            for page_num in range(1, total_pages + 1):
                if page_num % 10 == 0:
                    print(f" Creating PDF: {page_num}/{total_pages}...")

                svg_file = svg_dir / f"page_{page_num:04d}.svg"
                webp_file = webp_dir / f"page_{page_num:04d}_3.webp"
                html = _build_page_html(
                    page_width, page_height, webp_file, svg_file
                )

                # Chromium needs a real file to navigate to, so the HTML is
                # written to disk and removed again once the PDF is printed.
                html_file = pdf_dir / f"page_{page_num:04d}.html"
                html_file.write_text(html, encoding="utf-8")
                try:
                    # Navigate and print to PDF
                    await page.goto(html_file.absolute().as_uri())
                    await page.wait_for_load_state('networkidle')

                    pdf_file = pdf_dir / f"page_{page_num:04d}.pdf"
                    await page.pdf(
                        path=str(pdf_file),
                        width=f"{width_inches}in",
                        height=f"{height_inches}in",
                        print_background=True,
                        margin={
                            'top': '0',
                            'bottom': '0',
                            'left': '0',
                            'right': '0',
                        },
                    )
                finally:
                    # Cleanup HTML even when navigation or printing fails
                    html_file.unlink()
        finally:
            # Always shut the browser down, including on a failed page
            await browser.close()

    print()
    print("Merging individual PDFs...")

    # Merge all PDFs using PyPDF2
    from PyPDF2 import PdfMerger

    merger = PdfMerger()
    # Zero-padded page numbers make lexicographic order equal page order.
    pdf_files = sorted(pdf_dir.glob("page_*.pdf"))
    output_pdf = script_dir / "Invitation_to_Cybersecurity.pdf"
    try:
        for i, pdf_file in enumerate(pdf_files, 1):
            if i % 50 == 0:
                print(f" Merging: {i}/{len(pdf_files)}...")
            merger.append(str(pdf_file))

        merger.write(str(output_pdf))
    finally:
        merger.close()

    # Cleanup individual PDFs
    print("\nCleaning up...")
    for pdf_file in pdf_files:
        pdf_file.unlink()
    pdf_dir.rmdir()

    file_size = output_pdf.stat().st_size / 1024 / 1024
    print()
    print("✓ Vector PDF created!")
    print(f" Location: {output_pdf}")
    print(f" Pages: {len(pdf_files)}")
    print(f" Size: {file_size:.1f} MB")
    print()
    print("Text should be vector with embedded fonts!")
129
if __name__ == "__main__":
    # Start the event loop and run main() only when executed as a script,
    # so importing this module does not trigger any PDF generation.
    asyncio.run(main())