Update main.py · jaspermayone.com/ocr-schedule-microservice@581a615

+137 -61

1 changed file

expand all

main.py

+137 -61

main.py

··· 2 2 from fastapi import FastAPI, File, UploadFile 3 3 from fastapi.middleware.cors import CORSMiddleware 4 4 import pytesseract 5 - from PIL import Image 5 + from PIL import Image, ImageEnhance 6 6 import io 7 - import cv2 7 + import re 8 + from datetime import datetime 8 9 import numpy as np 9 - import uvicorn 10 10 11 11 app = FastAPI() 12 12 13 - # Add CORS middleware 14 13 app.add_middleware( 15 14 CORSMiddleware, 16 - allow_origins=["*"], # Allows all origins 15 + allow_origins=["*"], 17 16 allow_credentials=True, 18 - allow_methods=["*"], # Allows all methods 19 - allow_headers=["*"], # Allows all headers 17 + allow_methods=["*"], 18 + allow_headers=["*"], 20 19 ) 21 20 21 + def clean_shift(shift): 22 + """Clean and standardize shift notation""" 23 + if not shift: 24 + return '' 25 + 26 + # Convert to uppercase for consistent comparison 27 + shift = str(shift).strip().upper() 28 + 29 + # Return special cases as-is 30 + if shift in ['OFF', 'GH']: 31 + return shift 32 + 33 + # Handle special cases with words 34 + if 'CLOSE' in shift or 'OPEN' in shift: 35 + return shift 36 + 37 + # Handle multiple shifts (separated by comma, 'and', or '/') 38 + shifts = re.split(r'[,/]|\s+AND\s+', shift) 39 + cleaned_shifts = [] 40 + 41 + for s in shifts: 42 + s = s.strip() 43 + # Extract times using regex 44 + times = re.findall(r'\d{1,2}(?::\d{2})?(?:-\d{1,2}(?::\d{2})?)?', s) 45 + if times: 46 + cleaned_shifts.extend(times) 47 + 48 + return ' & '.join(cleaned_shifts) if cleaned_shifts else shift 49 + 22 50 def process_image(image_bytes): 23 - # Convert bytes to numpy array 24 - nparr = np.frombuffer(image_bytes, np.uint8) 25 - img = cv2.imdecode(nparr, cv2.IMREAD_COLOR) 51 + # Open image with Pillow 52 + image = Image.open(io.BytesIO(image_bytes)) 26 53 27 54 # Convert to grayscale 28 - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 55 + image = image.convert('L') 29 56 30 - # Apply thresholding 31 - thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] 57 + # Enhance contrast 58 + enhancer = ImageEnhance.Contrast(image) 59 + image = enhancer.enhance(2.0) 32 60 33 - # Convert back to PIL Image 34 - pil_image = Image.fromarray(thresh) 61 + # Enhance sharpness 62 + enhancer = ImageEnhance.Sharpness(image) 63 + image = enhancer.enhance(2.0) 35 64 36 - # Perform OCR 37 - text = pytesseract.image_to_string(pil_image) 65 + # Perform OCR with specific config 66 + custom_config = r'--oem 3 --psm 6' 67 + text = pytesseract.image_to_string(image, config=custom_config) 38 68 39 69 return text 40 70 71 + def parse_date(date_str): 72 + """Parse date string expecting MM/DD format""" 73 + if not date_str: 74 + return '' 75 + 76 + # Remove any non-numeric/slash characters 77 + date_str = re.sub(r'[^\d/]', '', date_str) 78 + 79 + try: 80 + if '/' in date_str: 81 + month, day = map(str, date_str.split('/')) 82 + # Ensure both parts exist and are numbers 83 + if month.isdigit() and day.isdigit(): 84 + return f"{int(month):02d}/{int(day):02d}" 85 + except: 86 + pass 87 + 88 + return date_str 89 + 41 90 def parse_schedule(text): 42 91 lines = text.split('\n') 43 - schedule = {} 92 + schedule = { 93 + 'metadata': { 94 + 'title': '', 95 + 'updated': '', 96 + 'notes': [] 97 + }, 98 + 'weeks': [] 99 + } 100 + 101 + current_week = { 102 + 'dates': {}, 103 + 'employees': {} 104 + } 105 + 44 106 current_person = None 45 - dates = {} 107 + dates_found = False 46 108 47 109 for line in lines: 48 - # Skip empty lines 49 - if not line.strip(): 110 + line = line.strip() 111 + if not line: 112 + continue 113 + 114 + # Capture metadata 115 + if 'PARADISE SCHEDULE' in line: 116 + schedule['metadata']['title'] = 'PARADISE SCHEDULE' 117 + # Look for updated date 118 + update_match = re.search(r'Updated\s+(\d{1,2}/\d{1,2})', line) 119 + if update_match: 120 + schedule['metadata']['updated'] = update_match.group(1) 121 + continue 122 + 123 + # Capture notes at bottom 124 + if 'DELI' in line.upper() or 'OPEN TILL' in line.upper(): 125 + schedule['metadata']['notes'].append(line.strip()) 50 126 continue 51 127 52 128 parts = line.split() 53 129 if not parts: 54 130 continue 55 131 56 - # If we find the date row (contains multiple '1/' patterns) 57 - if sum('1/' in part for part in parts) > 1: 58 - dates = { 59 - 'monday': next((p for p in parts if '1/' in p), ''), 60 - 'tuesday': next((p for p in parts[1:] if '1/' in p), ''), 61 - 'wednesday': next((p for p in parts[2:] if '1/' in p), ''), 62 - 'thursday': next((p for p in parts[3:] if '1/' in p), ''), 63 - 'friday': next((p for p in parts[4:] if '1/' in p), ''), 64 - 'saturday': next((p for p in parts[5:] if '1/' in p), ''), 65 - 'sunday': next((p for p in parts[6:] if '1/' in p), '') 132 + # If we find a row with multiple dates 133 + if sum(bool(re.search(r'\d{1,2}/\d{1,2}', part)) for part in parts) > 1: 134 + dates_found = True 135 + current_week['dates'] = { 136 + 'monday': parse_date(parts[0]) if len(parts) > 0 else '', 137 + 'tuesday': parse_date(parts[1]) if len(parts) > 1 else '', 138 + 'wednesday': parse_date(parts[2]) if len(parts) > 2 else '', 139 + 'thursday': parse_date(parts[3]) if len(parts) > 3 else '', 140 + 'friday': parse_date(parts[4]) if len(parts) > 4 else '', 141 + 'saturday': parse_date(parts[5]) if len(parts) > 5 else '', 142 + 'sunday': parse_date(parts[6]) if len(parts) > 6 else '' 66 143 } 67 144 continue 68 145 69 - # Check if this line starts with a name (no numbers or 'OFF') 70 - if len(parts) >= 1 and not parts[0].replace('-', '').isdigit() and parts[0] != 'OFF' and ':' not in parts[0]: 71 - # Ignore "CASHIERS" header 72 - if parts[0] == "CASHIERS": 73 - continue 74 - current_person = ' '.join(parts) 75 - schedule[current_person] = { 76 - 'monday': {'date': dates.get('monday', ''), 'shift': ''}, 77 - 'tuesday': {'date': dates.get('tuesday', ''), 'shift': ''}, 78 - 'wednesday': {'date': dates.get('wednesday', ''), 'shift': ''}, 79 - 'thursday': {'date': dates.get('thursday', ''), 'shift': ''}, 80 - 'friday': {'date': dates.get('friday', ''), 'shift': ''}, 81 - 'saturday': {'date': dates.get('saturday', ''), 'shift': ''}, 82 - 'sunday': {'date': dates.get('sunday', ''), 'shift': ''} 146 + # Skip headers 147 + if parts[0] == "CASHIERS" or "2024" in line: 148 + continue 149 + 150 + # Check for employee name 151 + if not parts[0].replace('-', '').isdigit() and parts[0] != 'OFF' and ':' not in parts[0]: 152 + current_person = ' '.join(p for p in parts if not re.match(r'\d{1,2}[-:]\d{1,2}', p)) 153 + if current_person not in current_week['employees']: 154 + current_week['employees'][current_person] = { 155 + 'monday': '', 'tuesday': '', 'wednesday': '', 'thursday': '', 156 + 'friday': '', 'saturday': '', 'sunday': '' 157 + } 158 + continue 159 + 160 + # Process shifts for current person 161 + if current_person and len(parts) >= 7: 162 + shifts = { 163 + 'monday': clean_shift(parts[0]), 164 + 'tuesday': clean_shift(parts[1]), 165 + 'wednesday': clean_shift(parts[2]), 166 + 'thursday': clean_shift(parts[3]), 167 + 'friday': clean_shift(parts[4]), 168 + 'saturday': clean_shift(parts[5]), 169 + 'sunday': clean_shift(parts[6]) 83 170 } 84 - # If we have shifts data and a current person 85 - elif current_person and len(parts) >= 7: 86 - schedule[current_person] = { 87 - 'monday': {'date': dates.get('monday', ''), 'shift': parts[0]}, 88 - 'tuesday': {'date': dates.get('tuesday', ''), 'shift': parts[1]}, 89 - 'wednesday': {'date': dates.get('wednesday', ''), 'shift': parts[2]}, 90 - 'thursday': {'date': dates.get('thursday', ''), 'shift': parts[3]}, 91 - 'friday': {'date': dates.get('friday', ''), 'shift': parts[4]}, 92 - 'saturday': {'date': dates.get('saturday', ''), 'shift': parts[5]}, 93 - 'sunday': {'date': dates.get('sunday', ''), 'shift': parts[6]} 94 - } 171 + current_week['employees'][current_person].update(shifts) 172 + 173 + if dates_found: 174 + schedule['weeks'].append(current_week) 95 175 96 176 return schedule 97 177 98 178 @app.post("/ocr") 99 179 async def ocr_endpoint(file: UploadFile = File(...)): 100 - # Read the image file 101 180 image_bytes = await file.read() 102 - 103 - # Process the image and get text 104 181 text = process_image(image_bytes) 105 - 106 - # Parse the schedule 107 182 schedule = parse_schedule(text) 108 183 109 184 return { ··· 112 187 } 113 188 114 189 if __name__ == "__main__": 190 + import uvicorn 115 191 uvicorn.run(app, host="0.0.0.0", port=8000)

Configure Feed

Configure Feed