diff --git a/fotograf-de-scraper/backend/Dockerfile b/fotograf-de-scraper/backend/Dockerfile index eeda8e71..7a9bd63d 100644 --- a/fotograf-de-scraper/backend/Dockerfile +++ b/fotograf-de-scraper/backend/Dockerfile @@ -38,8 +38,11 @@ RUN pip install --no-cache-dir -r requirements.txt # Copy the application code COPY . . +# Create directory for error screenshots +RUN mkdir -p /app/errors && chmod 777 /app/errors + # Expose the port FastAPI will run on EXPOSE 8000 -# Command to run the application -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] +# Command to run the application with DEBUG logging +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--log-level", "debug"] diff --git a/fotograf-de-scraper/backend/main.py b/fotograf-de-scraper/backend/main.py index 0ff018b2..4b26b715 100644 --- a/fotograf-de-scraper/backend/main.py +++ b/fotograf-de-scraper/backend/main.py @@ -1,10 +1,5 @@ import os -from dotenv import load_dotenv -from fastapi import FastAPI, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import FileResponse -from typing import List, Dict, Any, Optional -import time +import logging import datetime import base64 import re @@ -13,6 +8,12 @@ from jinja2 import Environment, FileSystemLoader from weasyprint import HTML import tempfile import shutil +import time +from dotenv import load_dotenv +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse +from typing import List, Dict, Any, Optional from selenium import webdriver from selenium.webdriver.chrome.options import Options @@ -21,6 +22,16 @@ from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import TimeoutException, NoSuchElementException +# --- Logging Configuration --- +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - 
%(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler() + ] +) +logger = logging.getLogger("fotograf-scraper") + # Load environment variables load_dotenv() @@ -49,38 +60,46 @@ SELECTORS = { "job_row_date": ".//td[count(//th[contains(., 'Datum')]/preceding-sibling::th) + 1]", "job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]", "export_dropdown": "[data-qa-id='dropdown:export']", - "export_csv_link": "//a[contains(text(), 'CSV') or contains(., 'CSV')]", # Common pattern for CSV export in dropdowns + "export_csv_link": "//a[contains(text(), 'CSV') or contains(., 'CSV')]", } -# --- PDF Generation Logic (Reused from List-Generator) --- +# --- PDF Generation Logic --- def get_logo_base64(): logo_path = os.path.join(os.path.dirname(__file__), "assets", "logo.png") + logger.debug(f"Loading logo from: {logo_path}") try: with open(logo_path, "rb") as image_file: return base64.b64encode(image_file.read()).decode('utf-8') except FileNotFoundError: - print(f"Warning: Logo file not found at {logo_path}") + logger.warning(f"Logo file not found at {logo_path}") return None def generate_pdf_from_csv(csv_path: str, institution: str, date_info: str, list_type: str, output_path: str): + logger.info(f"Generating PDF for {institution} from {csv_path}") df = None - # Try different separators - for sep in [";", ","]: + for sep in [';', ',']: try: + logger.debug(f"Trying CSV separator: '{sep}'") test_df = pd.read_csv(csv_path, sep=sep, encoding="utf-8-sig", nrows=5) if len(test_df.columns) > 1: df = pd.read_csv(csv_path, sep=sep, encoding="utf-8-sig") + logger.debug(f"Successfully read CSV with separator '{sep}'") break - except Exception: + except Exception as e: + logger.debug(f"Failed to read with separator '{sep}': {e}") continue + if df is None: + logger.error("Could not read CSV with standard separators.") try: df = pd.read_csv(csv_path, sep=";", encoding="latin1") + logger.info("Fallback to latin1 encoding 
successful.") except: raise Exception("CSV konnte nicht gelesen werden.") df.columns = df.columns.str.strip().str.replace("\"", "") + logger.debug(f"CSV Columns: {list(df.columns)}") group_label = "Gruppe" if list_type == 'k' else "Klasse" person_label_plural = "Kinder" if list_type == 'k' else "Schüler" @@ -100,6 +119,7 @@ def generate_pdf_from_csv(csv_path: str, institution: str, date_info: str, list_ for col in ["Vorname", "Nachname", group_label]: if col not in df.columns: + logger.warning(f"Column '{col}' not found in CSV, using default values.") df[col] = "Alle" if col == group_label else "" df = df.sort_values(by=[group_label, "Nachname", "Vorname"]) @@ -113,6 +133,7 @@ def generate_pdf_from_csv(csv_path: str, institution: str, date_info: str, list_ total_students = len(df) template_dir = os.path.join(os.path.dirname(__file__), "templates") + logger.debug(f"Using template directory: {template_dir}") env = Environment(loader=FileSystemLoader(template_dir)) template = env.get_template("school_list.html") @@ -132,13 +153,27 @@ def generate_pdf_from_csv(csv_path: str, institution: str, date_info: str, list_ "group_column_name": group_label } + logger.debug("Rendering HTML template...") html_out = template.render(render_context) + logger.info(f"Writing PDF to: {output_path}") HTML(string=html_out).write_pdf(output_path) # --- Selenium Scraper Functions --- +def take_error_screenshot(driver, error_name): + errors_dir = os.path.join(os.path.dirname(__file__), 'errors') + os.makedirs(errors_dir, exist_ok=True) + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"error_{error_name}_{timestamp}.png" + filepath = os.path.join(errors_dir, filename) + try: + driver.save_screenshot(filepath) + logger.error(f"!!! Error screenshot saved to: {filepath}") + except Exception as e: + logger.error(f"!!! 
Could not save screenshot: {e}")
+
 def setup_driver(download_path: str = None):
-    print("Initialisiere Chrome WebDriver...")
+    logger.info("Initializing Chrome WebDriver...")
     options = Options()
     options.add_argument('--headless')
     options.add_argument('--no-sandbox')
@@ -147,6 +182,7 @@ def setup_driver(download_path: str = None):
     options.binary_location = '/usr/bin/chromium'
 
     if download_path:
+        logger.debug(f"Configuring download path: {download_path}")
         prefs = {
             "download.default_directory": download_path,
             "download.prompt_for_download": False,
@@ -157,48 +193,58 @@
     try:
         driver = webdriver.Chrome(options=options)
         if download_path:
-            # Crucial for headless mode: Allow downloads
+            logger.debug("Allowing downloads in headless mode via CDP...")
             driver.execute_cdp_cmd('Page.setDownloadBehavior', {
                 'behavior': 'allow',
                 'downloadPath': download_path
             })
-
         return driver
     except Exception as e:
-        print(f"Fehler bei der Initialisierung des WebDrivers: {e}")
+        logger.error(f"Failed to initialize WebDriver: {e}")
         return None
 
 def login(driver, username, password):
-    print("Starte Login-Vorgang...")
+    logger.info(f"Starting login process for user: {username}")
     try:
         driver.get(LOGIN_URL)
-        wait = WebDriverWait(driver, 45)
+        wait = WebDriverWait(driver, 30)
 
         try:
+            logger.debug("Checking for cookie banner...")
             cookie_wait = WebDriverWait(driver, 5)
             cookie_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
-            time.sleep(1)
+            logger.info("Cookie banner accepted.")
         except:
-            pass
+            logger.debug("No cookie banner found.")
 
+        logger.debug("Entering credentials...")
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, SELECTORS["login_user"]))).send_keys(username)
         driver.find_element(By.CSS_SELECTOR, SELECTORS["login_pass"]).send_keys(password)
+
+        logger.info("Clicking login button...")
         driver.find_element(By.CSS_SELECTOR, SELECTORS["login_button"]).click()
+        logger.info("Waiting for 
dashboard redirect...") wait.until(EC.url_contains('/config_dashboard/index')) + logger.info("Login successful!") return True except Exception as e: - print(f"Login fehlgeschlagen: {e}") + logger.error(f"Login failed: {e}") + take_error_screenshot(driver, "login_error") return False def get_jobs_list(driver) -> List[Dict[str, Any]]: jobs_list_url = "https://app.fotograf.de/config_jobs/index" + logger.info(f"Navigating to jobs list: {jobs_list_url}") driver.get(jobs_list_url) - wait = WebDriverWait(driver, 45) + wait = WebDriverWait(driver, 30) jobs = [] try: + logger.debug("Waiting for job rows to appear...") job_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["dashboard_jobs_table_rows"]))) + logger.info(f"Found {len(job_rows)} job rows.") + for row in job_rows: try: name_element = row.find_element(By.XPATH, SELECTORS["job_row_name_link"]) @@ -206,6 +251,9 @@ def get_jobs_list(driver) -> List[Dict[str, Any]]: job_url = name_element.get_attribute('href') job_id_match = re.search(r'/(\d+)$', job_url) job_id = job_id_match.group(1) if job_id_match else None + + logger.debug(f"Parsing job: {job_name} (ID: {job_id})") + status_element = row.find_element(By.XPATH, SELECTORS["job_row_status"]) job_status = status_element.text.strip() date_element = row.find_element(By.XPATH, SELECTORS["job_row_date"]) @@ -221,10 +269,13 @@ def get_jobs_list(driver) -> List[Dict[str, Any]]: "date": job_date, "shooting_type": shooting_type, }) - except: + except Exception as e: + logger.warning(f"Error parsing single job row: {e}") continue - except: - pass + except Exception as e: + logger.error(f"Error retrieving job list: {e}") + take_error_screenshot(driver, "job_list_error") + return jobs # --- API Endpoints --- @@ -235,9 +286,11 @@ async def health_check(): @app.get("/api/jobs", response_model=List[Dict[str, Any]]) async def get_jobs(account_type: str): + logger.info(f"API Request: GET /api/jobs for {account_type}") username = 
os.getenv(f"{account_type.upper()}_USER") password = os.getenv(f"{account_type.upper()}_PW") if not username or not password: + logger.error(f"Credentials for {account_type} not found in .env") raise HTTPException(status_code=400, detail="Credentials not found.") driver = None @@ -247,43 +300,47 @@ async def get_jobs(account_type: str): raise HTTPException(status_code=401, detail="Login failed.") return get_jobs_list(driver) finally: - if driver: driver.quit() + if driver: + logger.debug("Closing driver.") + driver.quit() @app.get("/api/jobs/{job_id}/generate-pdf") async def generate_pdf(job_id: str, account_type: str): + logger.info(f"API Request: Generate PDF for job {job_id} ({account_type})") username = os.getenv(f"{account_type.upper()}_USER") password = os.getenv(f"{account_type.upper()}_PW") with tempfile.TemporaryDirectory() as temp_dir: + logger.debug(f"Using temp directory for download: {temp_dir}") driver = setup_driver(download_path=temp_dir) try: if not login(driver, username, password): raise HTTPException(status_code=401, detail="Login failed.") - # 1. Navigate to registrations page reg_url = f"https://app.fotograf.de/config_children/view_registrations/{job_id}" - print(f"Navigiere zu Registrierungen: {reg_url}") + logger.info(f"Navigating to registrations page: {reg_url}") driver.get(reg_url) wait = WebDriverWait(driver, 30) - # Get Institution Name for PDF try: institution = driver.find_element(By.TAG_NAME, "h1").text.strip() + logger.debug(f"Detected institution name: {institution}") except: institution = "Fotoauftrag" - # 2. 
Click Export and trigger CSV download - print("Trigger Export...") + logger.info("Triggering CSV Export...") export_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["export_dropdown"]))) export_btn.click() - time.sleep(1) # Wait for menu + logger.debug("Export dropdown clicked, waiting for menu items...") + time.sleep(1) csv_btn = wait.until(EC.element_to_be_clickable((By.XPATH, SELECTORS["export_csv_link"]))) csv_btn.click() + logger.info("CSV Export link clicked.") - # 3. Wait for download to finish - print("Warte auf CSV Download...") - timeout = 30 + # Wait for file to appear + logger.debug("Waiting for CSV file in download directory...") + timeout = 45 start_time = time.time() csv_file = None while time.time() - start_time < timeout: @@ -291,14 +348,15 @@ async def generate_pdf(job_id: str, account_type: str): csv_files = [f for f in files if f.endswith('.csv')] if csv_files: csv_file = os.path.join(temp_dir, csv_files[0]) + logger.info(f"Download complete: {csv_file}") break time.sleep(1) if not csv_file: + logger.error(f"Download timed out after {timeout} seconds.") + take_error_screenshot(driver, "download_timeout") raise HTTPException(status_code=500, detail="CSV Download fehlgeschlagen.") - # 4. Generate PDF - print(f"Generiere PDF aus {csv_file}...") output_pdf_name = f"Listen_{job_id}.pdf" output_pdf_path = os.path.join(temp_dir, output_pdf_name) @@ -306,17 +364,19 @@ async def generate_pdf(job_id: str, account_type: str): csv_path=csv_file, institution=institution, date_info=datetime.datetime.now().strftime("%d.%m.%Y"), - list_type=account_type, # 'k' or 'schule' + list_type=account_type, output_path=output_pdf_path ) - # 5. Return PDF final_storage = os.path.join("/tmp", output_pdf_name) + logger.info(f"PDF successfully generated. 
Copying to {final_storage}") shutil.copy(output_pdf_path, final_storage) return FileResponse(path=final_storage, filename=output_pdf_name, media_type="application/pdf") except Exception as e: - print(f"Fehler bei PDF Generierung: {e}") + logger.exception("Unexpected error during PDF generation") raise HTTPException(status_code=500, detail=str(e)) finally: - if driver: driver.quit() \ No newline at end of file + if driver: + logger.debug("Closing driver.") + driver.quit()