commit 0848ddfc33cba06bc3f1fbb055df13a7cca6e2b4 · dunkirk.sh/cedrus

+42

.gitignore

··· 1 + # Python 2 + venv/ 3 + __pycache__/ 4 + *.pyc 5 + *.pyo 6 + *.pyd 7 + .Python 8 + *.egg-info/ 9 + dist/ 10 + build/ 11 + 12 + # Downloaded content 13 + svg_layers/ 14 + webp_highres/ 15 + webp_layers/ 16 + merged_pages/ 17 + 18 + # Generated PDFs 19 + *.pdf 20 + 21 + # Temporary files 22 + temp_*.html 23 + test_*.png 24 + test_*.webp 25 + merged_page_*.png 26 + 27 + # OS files 28 + .DS_Store 29 + .AppleDouble 30 + .LSOverride 31 + Thumbs.db 32 + Desktop.ini 33 + 34 + # IDE 35 + .vscode/ 36 + .idea/ 37 + *.swp 38 + *.swo 39 + *~ 40 + 41 + # Crush 42 + .crush/

+108

README.md

··· 1 + # Cedarville Cybersecurity Textbook PDF Creator 2 + 3 + Automated tool to download and convert the Cedarville "Invitation to Cybersecurity" textbook to PDF format. 4 + 5 + ## Features 6 + 7 + - Downloads all 340 pages (SVG text layers + high-res WebP images) 8 + - Composites layers with proper font rendering 9 + - Creates high-quality PDF (1045x1350 pixels per page) 10 + - Optional: Add searchable text with OCR 11 + 12 + ## Quick Start 13 + 14 + ```bash 15 + ./build.sh 16 + ``` 17 + 18 + That's it! The script will: 19 + 1. Create a Python virtual environment 20 + 2. Install dependencies 21 + 3. Download all page layers (~10-15 min) 22 + 4. Create the PDF (~8-10 min) 23 + 5. Optionally add OCR for selectable text (~30-60 min) 24 + 25 + ## Manual Steps 26 + 27 + If you prefer to run steps individually: 28 + 29 + ### 1. Setup Environment 30 + 31 + ```bash 32 + python3 -m venv venv 33 + source venv/bin/activate 34 + pip install -r requirements.txt 35 + python -m playwright install chromium 36 + ``` 37 + 38 + ### 2. Download Layers 39 + 40 + ```bash 41 + python download_layers.py 42 + ``` 43 + 44 + Downloads 340 pages: 45 + - SVG layers (text, vector graphics) → `svg_layers/` 46 + - High-res WebP images (1045x1350) → `webp_highres/` 47 + 48 + ### 3. Create PDF 49 + 50 + ```bash 51 + python create_pdf.py 52 + ``` 53 + 54 + Composites SVG + WebP and creates `Invitation_to_Cybersecurity.pdf` 55 + 56 + ### 4. Add OCR (Optional) 57 + 58 + ```bash 59 + brew install ocrmypdf 60 + ocrmypdf Invitation_to_Cybersecurity.pdf Invitation_to_Cybersecurity_OCR.pdf 61 + ``` 62 + 63 + Creates a version with selectable/searchable text. 
64 + 65 + ## Requirements 66 + 67 + - Python 3.9+ 68 + - macOS (tested on Apple Silicon) 69 + - Homebrew (for OCR step) 70 + 71 + ## Output 72 + 73 + - **Invitation_to_Cybersecurity.pdf** - 340 pages, ~70-80 MB, high quality 74 + - **Invitation_to_Cybersecurity_OCR.pdf** - Same as above + searchable text (optional) 75 + 76 + ## File Structure 77 + 78 + ``` 79 + cedrus/ 80 + ├── build.sh # Main build script 81 + ├── requirements.txt # Python dependencies 82 + ├── download_layers.py # Download SVG + WebP 83 + ├── create_pdf.py # Composite and create PDF 84 + ├── svg_layers/ # Downloaded SVG files 85 + ├── webp_highres/ # Downloaded WebP files 86 + ├── merged_pages/ # Temporary composited PNGs 87 + └── Invitation_to_Cybersecurity.pdf 88 + ``` 89 + 90 + ## Troubleshooting 91 + 92 + **"Command not found: python3"** 93 + - Install Python 3: `brew install python3` 94 + 95 + **"ocrmypdf not found"** 96 + - OCR step is optional. Install with: `brew install ocrmypdf` 97 + 98 + **Fonts look wrong** 99 + - The script uses Playwright (Chromium) which properly renders embedded fonts 100 + - If issues persist, check that Playwright browser installed: `python -m playwright install chromium` 101 + 102 + ## Notes 103 + 104 + - Total time: ~20-30 minutes (without OCR) 105 + - With OCR: ~50-90 minutes total 106 + - Disk space needed: ~500 MB temporary files 107 + - The script downloads from the official Cedarville publication server 108 + - Be patient - high-quality rendering takes time!

+109

build.sh

#!/bin/bash
# End-to-end build: virtualenv -> Python deps -> Playwright browser ->
# download page layers -> composite into a PDF -> optional OCR pass.
# Safe to re-run: finished steps are detected and skipped.
set -e

echo "================================================"
echo "Cedarville Cybersecurity Textbook PDF Creator"
echo "================================================"
echo ""

# Colors for output
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Get the directory where this script is located
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd "$SCRIPT_DIR"

# Succeeds only if directory $1 exists and contains at least one entry.
# A bare existence test is not enough: download_layers.py creates both
# directories before fetching anything, so a failed run leaves them empty.
has_files() {
    [ -d "$1" ] && [ -n "$(ls -A "$1" 2>/dev/null)" ]
}

# Step 1: Setup virtual environment
echo -e "${BLUE}[Step 1/6] Setting up Python virtual environment...${NC}"
if [ ! -d "venv" ]; then
    python3 -m venv venv
    echo "✓ Virtual environment created"
else
    echo "✓ Virtual environment already exists"
fi

source venv/bin/activate

# Step 2: Install Python dependencies
echo -e "${BLUE}[Step 2/6] Installing Python dependencies...${NC}"
pip install -q --upgrade pip
pip install -q -r requirements.txt
echo "✓ Python packages installed"

# Step 3: Install Playwright browsers
# Always run the installer instead of probing a cache directory: the cache
# location is platform-specific (~/.cache/ms-playwright is the Linux path),
# and an existing directory does not prove that the Chromium build matching
# the pinned Playwright version is present. The installer itself is
# idempotent and returns quickly when the browser is already there.
echo -e "${BLUE}[Step 3/6] Installing Playwright browsers...${NC}"
python -m playwright install chromium
echo "✓ Playwright browser installed"

# Step 4: Download page layers (SVG + WebP)
echo -e "${BLUE}[Step 4/6] Downloading page layers...${NC}"
if ! has_files "svg_layers" || ! has_files "webp_highres"; then
    echo "This will download 340 pages (SVG + high-res WebP)"
    echo "Estimated time: 10-15 minutes"
    echo ""
    python download_layers.py
    echo -e "${GREEN}✓ All page layers downloaded${NC}"
else
    echo "✓ Page layers already downloaded"
fi

# Step 5: Create PDF
echo -e "${BLUE}[Step 5/6] Creating PDF from layers...${NC}"
if [ ! -f "Invitation_to_Cybersecurity.pdf" ]; then
    echo "This will composite SVG + WebP and create the PDF"
    echo "Estimated time: 8-10 minutes"
    echo ""
    python create_pdf.py
    echo -e "${GREEN}✓ PDF created successfully${NC}"
else
    echo "✓ PDF already exists"
    # -r keeps backslashes in the answer literal.
    read -r -p "Recreate PDF? (y/N): " recreate
    if [[ $recreate =~ ^[Yy]$ ]]; then
        rm -f Invitation_to_Cybersecurity.pdf
        python create_pdf.py
        echo -e "${GREEN}✓ PDF recreated${NC}"
    fi
fi

# Step 6: Add OCR text layer (optional)
echo -e "${BLUE}[Step 6/6] Adding OCR text layer (optional)...${NC}"
if command -v ocrmypdf &> /dev/null; then
    if [ ! -f "Invitation_to_Cybersecurity_OCR.pdf" ]; then
        read -r -p "Add searchable text layer with OCR? This will take 30-60 minutes. (y/N): " add_ocr
        if [[ $add_ocr =~ ^[Yy]$ ]]; then
            echo "Running OCR (this will take a while)..."
            ocrmypdf --force-ocr Invitation_to_Cybersecurity.pdf Invitation_to_Cybersecurity_OCR.pdf
            echo -e "${GREEN}✓ OCR PDF created with selectable text${NC}"
        else
            echo "Skipped OCR step"
        fi
    else
        echo "✓ OCR PDF already exists"
    fi
else
    echo "⚠ ocrmypdf not installed. To add selectable text, run:"
    echo "  brew install ocrmypdf"
    echo "  ocrmypdf Invitation_to_Cybersecurity.pdf Invitation_to_Cybersecurity_OCR.pdf"
fi

echo ""
echo -e "${GREEN}================================================${NC}"
echo -e "${GREEN}✓ Complete!${NC}"
echo -e "${GREEN}================================================${NC}"
echo ""
echo "Output files:"
if [ -f "Invitation_to_Cybersecurity.pdf" ]; then
    SIZE=$(du -h "Invitation_to_Cybersecurity.pdf" | cut -f1)
    echo "  📄 Invitation_to_Cybersecurity.pdf ($SIZE)"
fi
if [ -f "Invitation_to_Cybersecurity_OCR.pdf" ]; then
    SIZE=$(du -h "Invitation_to_Cybersecurity_OCR.pdf" | cut -f1)
    echo "  📄 Invitation_to_Cybersecurity_OCR.pdf ($SIZE) [with selectable text]"
fi
echo ""

+23

clean.sh

#!/bin/bash
# Cleanup temporary files and start fresh
#
# Removes everything build.sh generates: downloaded layers, merged pages,
# the generated PDFs and the virtual environment.

# Abort on the first error. In particular, if the cd below fails, none of
# the rm commands may run.
set -e

# Operate on the directory containing this script (same approach as
# build.sh). Without this, the relative `rm -rf` calls below would delete
# e.g. an unrelated venv/ in whatever directory the script was invoked from.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd "$SCRIPT_DIR"

echo "Cleaning up temporary files..."

# Remove downloaded layers
rm -rf svg_layers webp_highres
echo "✓ Removed downloaded layers"

# Remove merged pages
rm -rf merged_pages
echo "✓ Removed merged pages"

# Remove generated PDFs
rm -f Invitation_to_Cybersecurity.pdf Invitation_to_Cybersecurity_OCR.pdf
echo "✓ Removed PDFs"

# Remove virtual environment
rm -rf venv
echo "✓ Removed virtual environment"

echo ""
echo "Cleanup complete! Run ./build.sh to start fresh."

+145

create_pdf.py

#!/usr/bin/env python3
"""
Create PDF by compositing SVG + WebP layers using Playwright.
Ensures proper font rendering.
"""

import asyncio
from pathlib import Path
from PIL import Image
from playwright.async_api import async_playwright
import sys

# Number of pages in the book; pages are numbered 1..TOTAL_PAGES.
TOTAL_PAGES = 340

# Page size in pixels used when a page has no WebP layer to measure.
DEFAULT_PAGE_SIZE = (1045, 1350)

# Name of the PDF written next to this script.
PDF_NAME = "Invitation_to_Cybersecurity.pdf"


def _build_html(width, height, layer_uris):
    """Return an HTML page stacking the given images in a width x height box.

    Layers are absolutely positioned on top of each other in the order
    given, so the first URI ends up at the bottom.
    """
    layers = "\n".join(
        f'<img class="layer" src="{uri}" />' for uri in layer_uris
    )
    return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<style>
    body {{ margin: 0; padding: 0; }}
    .container {{
        position: relative;
        width: {width}px;
        height: {height}px;
    }}
    .layer {{
        position: absolute;
        top: 0;
        left: 0;
        width: 100%;
        height: 100%;
    }}
</style>
</head>
<body>
<div class="container">
{layers}
</div>
</body>
</html>
"""


async def merge_page(page, page_num, svg_dir, webp_dir, output_file):
    """Merge SVG and WebP using browser rendering.

    Args:
        page: Playwright page used to render the composite.
        page_num: 1-based page number; selects ``page_NNNN.svg`` and
            ``page_NNNN_3.webp``.
        svg_dir: Directory containing the SVG layers.
        webp_dir: Directory containing the WebP layers.
        output_file: Path of the PNG to write.

    Returns:
        True if the PNG was written. False if neither layer exists or if
        rendering failed (the error is printed, not raised, so one bad page
        does not abort the whole run).
    """
    svg_file = svg_dir / f"page_{page_num:04d}.svg"
    webp_file = webp_dir / f"page_{page_num:04d}_3.webp"
    has_webp = webp_file.exists()
    has_svg = svg_file.exists()

    if not has_webp and not has_svg:
        return False

    temp_html = output_file.parent / "temp_render.html"
    try:
        # Get dimensions from WebP; the context manager closes the file
        # handle instead of leaving it to the garbage collector.
        if has_webp:
            with Image.open(webp_file) as img:
                width, height = img.size
        else:
            width, height = DEFAULT_PAGE_SIZE

        # WebP background first, SVG text/vector layer on top.
        # as_uri() percent-encodes the path, so directories containing
        # spaces, '#', '?' or quotes still yield a valid URL.
        layer_uris = []
        if has_webp:
            layer_uris.append(webp_file.absolute().as_uri())
        if has_svg:
            layer_uris.append(svg_file.absolute().as_uri())

        temp_html.write_text(
            _build_html(width, height, layer_uris), encoding="utf-8"
        )

        await page.goto(temp_html.absolute().as_uri())
        await page.wait_for_load_state('networkidle', timeout=10000)

        container = await page.query_selector('.container')
        if container is None:
            raise RuntimeError("rendered page has no .container element")
        output_file.write_bytes(await container.screenshot())
        return True
    except Exception as e:
        # Per-page boundary: report and carry on with the next page.
        print(f"  Error page {page_num}: {e}")
        return False
    finally:
        # Single cleanup path for success and failure alike.
        temp_html.unlink(missing_ok=True)


async def main():
    """Render every page to a PNG, then combine all PNGs into one PDF.

    Exits with status 1 if no page images are available, so a calling
    script does not report success without a PDF.
    """
    script_dir = Path(__file__).parent
    svg_dir = script_dir / "svg_layers"
    webp_dir = script_dir / "webp_highres"
    merged_dir = script_dir / "merged_pages"
    merged_dir.mkdir(exist_ok=True)

    print("Creating PDF with Playwright (proper font rendering)...")
    print("Estimated time: 8-10 minutes")
    print()

    skipped = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Merge all pages
            for page_num in range(1, TOTAL_PAGES + 1):
                if page_num % 10 == 0:
                    print(f"  Rendering: {page_num}/{TOTAL_PAGES} pages...")
                    sys.stdout.flush()

                output_file = merged_dir / f"page_{page_num:04d}.png"
                rendered = await merge_page(
                    page, page_num, svg_dir, webp_dir, output_file
                )
                if not rendered:
                    skipped.append(page_num)
        finally:
            await browser.close()

    print()
    if skipped:
        print(f"Warning: {len(skipped)} page(s) could not be rendered: {skipped}")
    print("All pages rendered! Creating PDF...")

    # Convert to PDF
    image_files = sorted(merged_dir.glob("page_*.png"))

    images = []
    for i, img_file in enumerate(image_files, 1):
        if i % 50 == 0:
            print(f"  Adding to PDF: {i}/{len(image_files)}...")
            sys.stdout.flush()
        # convert() returns an independent in-memory copy, so the source
        # file can be closed right away.
        with Image.open(img_file) as img:
            images.append(img.convert('RGB'))

    if not images:
        print("Error: no rendered pages found; PDF not created.", file=sys.stderr)
        sys.exit(1)

    output_pdf = script_dir / PDF_NAME
    images[0].save(
        output_pdf,
        "PDF",
        resolution=100.0,
        save_all=True,
        append_images=images[1:]
    )

    file_size = output_pdf.stat().st_size / 1024 / 1024
    print()
    print("✓ PDF created successfully!")
    print(f"  Location: {output_pdf}")
    print(f"  Pages: {len(images)}")
    print(f"  Size: {file_size:.1f} MB")


if __name__ == "__main__":
    asyncio.run(main())

+81

download_layers.py

#!/usr/bin/env python3
"""
Download all page layers (SVG + high-res WebP) from Cedarville textbook.
"""

import requests
from pathlib import Path
import time
import sys

GUID = "abc80436024deb05cedb27129b7ae6b0"
BASE_URL = "https://publications.cedarville.edu/cedrus_press/invitation_to_cybersecurity/files/assets"

# Number of pages in the book; pages are numbered 1..TOTAL_PAGES.
TOTAL_PAGES = 340
# Per-request timeout in seconds.
REQUEST_TIMEOUT = 30
# Pause between pages in seconds, to be polite to the server.
REQUEST_DELAY = 0.3


def _fetch_to_file(url, output_file):
    """Download ``url`` into ``output_file``.

    Returns True only if the server answered 200 and the file was written.
    A non-200 answer returns False silently: create_pdf.py copes with a
    page that lacks one of its layers. Network and disk errors are
    reported on stderr and also return False, so one bad page does not
    abort the run. KeyboardInterrupt and SystemExit are deliberately not
    caught, so Ctrl-C stops the script.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            return False
        output_file.write_bytes(response.content)
    except (requests.RequestException, OSError) as exc:
        print(f"  Warning: {output_file.name}: {exc}", file=sys.stderr)
        return False
    return True


def download_svg(page_num, output_dir):
    """Download SVG text layer.

    Saves page ``page_num`` as ``page_NNNN.svg`` in ``output_dir`` and
    returns True on success, False otherwise.
    """
    url = f"{BASE_URL}/common/page-vectorlayers/{page_num:04d}.svg?uni={GUID}"
    return _fetch_to_file(url, output_dir / f"page_{page_num:04d}.svg")


def download_webp(page_num, output_dir):
    """Download high-res WebP background.

    Saves page ``page_num`` as ``page_NNNN_3.webp`` in ``output_dir`` and
    returns True on success, False otherwise.
    """
    url = f"{BASE_URL}/common/page-html5-substrates/page{page_num:04d}_3.webp?uni={GUID}"
    return _fetch_to_file(url, output_dir / f"page_{page_num:04d}_3.webp")


def main():
    """Download both layers of every page and print a summary."""
    script_dir = Path(__file__).parent
    svg_dir = script_dir / "svg_layers"
    webp_dir = script_dir / "webp_highres"

    svg_dir.mkdir(exist_ok=True)
    webp_dir.mkdir(exist_ok=True)

    print(f"Downloading {TOTAL_PAGES} pages (SVG + high-res WebP)...")
    print(f"  SVG layers → {svg_dir}")
    print(f"  WebP layers → {webp_dir}")
    print()

    svg_count = 0
    webp_count = 0

    for page_num in range(1, TOTAL_PAGES + 1):
        # Download both layers
        if download_svg(page_num, svg_dir):
            svg_count += 1
        if download_webp(page_num, webp_dir):
            webp_count += 1

        # Progress update
        if page_num % 20 == 0:
            print(f"  Progress: {page_num}/{TOTAL_PAGES} (SVG: {svg_count}, WebP: {webp_count})")
            sys.stdout.flush()

        # Be polite to server
        time.sleep(REQUEST_DELAY)

    print()
    print("Download complete!")
    print(f"  SVG layers: {svg_count}/{TOTAL_PAGES}")
    print(f"  WebP layers: {webp_count}/{TOTAL_PAGES}")


if __name__ == "__main__":
    main()

+3

requirements.txt

··· 1 + requests==2.31.0 2 + Pillow==10.1.0 3 + playwright==1.40.0