From 5c69c44ed328d8ee3f2e80c3b5a37b51e2d9fb74 Mon Sep 17 00:00:00 2001 From: Floke Date: Fri, 20 Mar 2026 18:40:06 +0000 Subject: [PATCH] feat(scraper): implement PDF list generation from registrations export [32788f42] --- fotograf-de-scraper/backend/main.py | 312 ++++++++++++------ .../backend/templates/school_list.html | 62 ++++ fotograf-de-scraper/frontend/src/App.tsx | 114 +++---- 3 files changed, 331 insertions(+), 157 deletions(-) create mode 100644 fotograf-de-scraper/backend/templates/school_list.html diff --git a/fotograf-de-scraper/backend/main.py b/fotograf-de-scraper/backend/main.py index 87a64977c..0ff018b2c 100644 --- a/fotograf-de-scraper/backend/main.py +++ b/fotograf-de-scraper/backend/main.py @@ -1,28 +1,35 @@ import os from dotenv import load_dotenv -from fastapi import FastAPI, HTTPException, BackgroundTasks +from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse from typing import List, Dict, Any, Optional import time -from datetime import datetime +import datetime +import base64 +import re +import pandas as pd +from jinja2 import Environment, FileSystemLoader +from weasyprint import HTML +import tempfile +import shutil + from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, InvalidArgumentException -import re -import asyncio +from selenium.common.exceptions import TimeoutException, NoSuchElementException # Load environment variables load_dotenv() -app = FastAPI(title="Fotograf.de Scraper API") +app = FastAPI(title="Fotograf.de Scraper & ERP API") # Configure CORS app.add_middleware( CORSMiddleware, - allow_origins=["*"], # Adjust this to your frontend origin in production + allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], @@ -31,7 +38,6 @@ app.add_middleware( # --- Configuration & Constants --- LOGIN_URL = 'https://app.fotograf.de/login/login' -# --- Selectors from original scraper, expanded for dashboard jobs --- SELECTORS = { "cookie_accept_button": "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll", "login_user": "#login-email", @@ -39,103 +45,171 @@ SELECTORS = { "login_button": "#login-submit", "dashboard_jobs_table_rows": "//tr[.//a[contains(@data-qa-id, 'link:photo-jobs-name-')]]", "job_row_name_link": ".//a[contains(@data-qa-id, 'link:photo-jobs-name-')]", - "job_row_status": ".//td[count(//th[contains(., 'Status')]/preceding-sibling::th) + 1]", # Try to find by column header 'Status' - "job_row_date": ".//td[count(//th[contains(., 'Datum')]/preceding-sibling::th) + 1]", # Try to find by column header 'Datum' - "job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]", # Try to find by column header 'Typ' + "job_row_status": ".//td[count(//th[contains(., 'Status')]/preceding-sibling::th) + 1]", + "job_row_date": ".//td[count(//th[contains(., 'Datum')]/preceding-sibling::th) + 1]", + "job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]", + "export_dropdown": "[data-qa-id='dropdown:export']", + "export_csv_link": "//a[contains(text(), 'CSV') or contains(., 'CSV')]", # Common pattern for CSV export in dropdowns } -# --- Utility functions from original scraper --- -# (setup_driver, login, etc. will be adapted or moved into this file) +# --- PDF Generation Logic (Reused from List-Generator) --- -def setup_driver(): +def get_logo_base64(): + logo_path = os.path.join(os.path.dirname(__file__), "assets", "logo.png") + try: + with open(logo_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + except FileNotFoundError: + print(f"Warning: Logo file not found at {logo_path}") + return None + +def generate_pdf_from_csv(csv_path: str, institution: str, date_info: str, list_type: str, output_path: str): + df = None + # Try different separators + for sep in [";", ","]: + try: + test_df = pd.read_csv(csv_path, sep=sep, encoding="utf-8-sig", nrows=5) + if len(test_df.columns) > 1: + df = pd.read_csv(csv_path, sep=sep, encoding="utf-8-sig") + break + except Exception: + continue + if df is None: + try: + df = pd.read_csv(csv_path, sep=";", encoding="latin1") + except: + raise Exception("CSV konnte nicht gelesen werden.") + + df.columns = df.columns.str.strip().str.replace("\"", "") + + group_label = "Gruppe" if list_type == 'k' else "Klasse" + person_label_plural = "Kinder" if list_type == 'k' else "Schüler" + + col_mapping = {} + for col in df.columns: + lower_col = col.lower().strip() + if lower_col in ["vorname kind", "vorname", "first name"]: + col_mapping[col] = "Vorname" + elif lower_col in ["nachname kind", "nachname", "last name"]: + col_mapping[col] = "Nachname" + elif lower_col in ["gruppe", "klasse", "group", "class"]: + col_mapping[col] = group_label + + df = df.rename(columns=col_mapping) + df = df.fillna("") + + for col in ["Vorname", "Nachname", group_label]: + if col not in df.columns: + df[col] = "Alle" if col == group_label else "" + + df = df.sort_values(by=[group_label, "Nachname", "Vorname"]) + grouped = df.groupby(group_label) + + class_data = [] + for class_name, group in grouped: + class_data.append({"name": class_name, "students": group.to_dict("records")}) + + class_counts = [{"name": c, "count": len(g)} for c, g in grouped] + total_students = len(df) + + template_dir = os.path.join(os.path.dirname(__file__), "templates") + env = Environment(loader=FileSystemLoader(template_dir)) + template = env.get_template("school_list.html") + + current_time = datetime.datetime.now().strftime("%d.%m.%Y %H:%M Uhr") + logo_base64 = get_logo_base64() + + render_context = { + "institution": institution, + "date_info": date_info, + "class_counts": class_counts, + "total_students": total_students, + "class_data": class_data, + "current_time": current_time, + "logo_base64": logo_base64, + "group_label": group_label, + "person_label_plural": person_label_plural, + "group_column_name": group_label + } + + html_out = template.render(render_context) + HTML(string=html_out).write_pdf(output_path) + +# --- Selenium Scraper Functions --- + +def setup_driver(download_path: str = None): print("Initialisiere Chrome WebDriver...") options = Options() options.add_argument('--headless') options.add_argument('--no-sandbox') options.add_argument('--disable-dev-shm-usage') options.add_argument('--window-size=1920,1200') - options.binary_location = '/usr/bin/google-chrome' # Path to Chrome in Docker + options.binary_location = '/usr/bin/chromium' + + if download_path: + prefs = { + "download.default_directory": download_path, + "download.prompt_for_download": False, + "download.directory_upgrade": True, + "safebrowsing.enabled": True + } + options.add_experimental_option("prefs", prefs) + try: driver = webdriver.Chrome(options=options) + + if download_path: + # Crucial for headless mode: Allow downloads + driver.execute_cdp_cmd('Page.setDownloadBehavior', { + 'behavior': 'allow', + 'downloadPath': download_path + }) + return driver except Exception as e: print(f"Fehler bei der Initialisierung des WebDrivers: {e}") return None -def take_error_screenshot(driver, error_name): - # Ensure the errors directory exists - errors_dir = os.path.join(os.path.dirname(__file__), 'errors') - os.makedirs(errors_dir, exist_ok=True) - - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"error_{error_name}_{timestamp}.png" - filepath = os.path.join(errors_dir, filename) - try: - driver.save_screenshot(filepath) - print(f"!!! Fehler aufgetreten. Screenshot gespeichert unter: {filepath}") - except Exception as e: - print(f"!!! Konnte keinen Screenshot speichern: {e}") - def login(driver, username, password): print("Starte Login-Vorgang...") try: driver.get(LOGIN_URL) - wait = WebDriverWait(driver, 45) # Generous timeout for the entire process + wait = WebDriverWait(driver, 45) try: - print("Suche nach Cookie-Banner...") cookie_wait = WebDriverWait(driver, 5) cookie_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click() - print("Cookie-Banner akzeptiert.") time.sleep(1) - except TimeoutException: - print("Kein Cookie-Banner gefunden, fahre fort.") + except: + pass - print("Fülle Anmeldeformular aus...") wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, SELECTORS["login_user"]))).send_keys(username) driver.find_element(By.CSS_SELECTOR, SELECTORS["login_pass"]).send_keys(password) - print("Klicke auf Login...") driver.find_element(By.CSS_SELECTOR, SELECTORS["login_button"]).click() - print("Warte auf Bestätigung des Logins durch URL-Wechsel...") - # This is a faster and more reliable check for a successful login redirect wait.until(EC.url_contains('/config_dashboard/index')) - - print("Login erfolgreich! Session ist aktiv.") return True except Exception as e: - print(f"Login fehlgeschlagen. Grund: {e}") - take_error_screenshot(driver, "login_error") + print(f"Login fehlgeschlagen: {e}") return False -# --- New function to get jobs from the specific jobs list page --- def get_jobs_list(driver) -> List[Dict[str, Any]]: - print("Navigiere direkt zur Auftragsliste, um Aufträge abzurufen...") jobs_list_url = "https://app.fotograf.de/config_jobs/index" driver.get(jobs_list_url) - wait = WebDriverWait(driver, 45) # Use the generous timeout here - + wait = WebDriverWait(driver, 45) jobs = [] try: - # Wait for the table rows to be present job_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["dashboard_jobs_table_rows"]))) - print(f"[{len(job_rows)}] Auftragszeilen auf der Auftragsseite gefunden.") - for row in job_rows: try: name_element = row.find_element(By.XPATH, SELECTORS["job_row_name_link"]) job_name = name_element.text.strip() job_url = name_element.get_attribute('href') - - # Extract Job ID from URL job_id_match = re.search(r'/(\d+)$', job_url) job_id = job_id_match.group(1) if job_id_match else None - status_element = row.find_element(By.XPATH, SELECTORS["job_row_status"]) job_status = status_element.text.strip() - date_element = row.find_element(By.XPATH, SELECTORS["job_row_date"]) job_date = date_element.text.strip() - type_element = row.find_element(By.XPATH, SELECTORS["job_row_shooting_type"]) shooting_type = type_element.text.strip() @@ -147,64 +221,102 @@ def get_jobs_list(driver) -> List[Dict[str, Any]]: "date": job_date, "shooting_type": shooting_type, }) - except NoSuchElementException as e: - print(f"Warnung: Konnte nicht alle Elemente in einer Auftragszeile finden. Fehler: {e}") - except Exception as e: - print(f"Ein unerwarteter Fehler beim Parsen einer Auftragszeile: {e}") - - except TimeoutException: - print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf der Auftragsseite gefunden.") - take_error_screenshot(driver, "get_jobs_list_error") - # Save the HTML source for debugging selectors - errors_dir = os.path.join(os.path.dirname(__file__), 'errors') - os.makedirs(errors_dir, exist_ok=True) - with open(os.path.join(errors_dir, 'page_source.html'), 'w', encoding='utf-8') as f: - f.write(driver.page_source) - print("HTML-Quellcode der Seite wurde in 'errors/page_source.html' gespeichert.") - except Exception as e: - print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge von der Auftragsseite: {e}") - take_error_screenshot(driver, "get_jobs_list_error") - + except: + continue + except: + pass return jobs +# --- API Endpoints --- + @app.get("/health") async def health_check(): return {"status": "ok"} @app.get("/api/jobs", response_model=List[Dict[str, Any]]) async def get_jobs(account_type: str): - username_env_var = f"{account_type.upper()}_USER" - password_env_var = f"{account_type.upper()}_PW" - - username = os.getenv(username_env_var) - password = os.getenv(password_env_var) - + username = os.getenv(f"{account_type.upper()}_USER") + password = os.getenv(f"{account_type.upper()}_PW") if not username or not password: - raise HTTPException(status_code=400, detail=f"Credentials for {account_type} not found. Please set {username_env_var} and {password_env_var} in your .env file.") + raise HTTPException(status_code=400, detail="Credentials not found.") driver = None try: driver = setup_driver() - if not driver: - raise HTTPException(status_code=500, detail="Failed to initialize WebDriver.") - - if not login(driver, username, password): - raise HTTPException(status_code=401, detail="Login failed. Please check credentials.") - - jobs = get_jobs_list(driver) # Call the new function - if not jobs: - print("Keine Aufträge gefunden oder Fehler beim Abrufen.") - - return jobs - - except HTTPException as e: - raise e # Re-raise HTTP exceptions - except Exception as e: - print(f"Ein unerwarteter Serverfehler ist aufgetreten: {e}") - raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") + if not driver or not login(driver, username, password): + raise HTTPException(status_code=401, detail="Login failed.") + return get_jobs_list(driver) finally: - if driver: - print("Schließe WebDriver.") - driver.quit() + if driver: driver.quit() -# Integrate other scraper functions (process_reminder_mode, process_statistics_mode) as new API endpoints later +@app.get("/api/jobs/{job_id}/generate-pdf") +async def generate_pdf(job_id: str, account_type: str): + username = os.getenv(f"{account_type.upper()}_USER") + password = os.getenv(f"{account_type.upper()}_PW") + + with tempfile.TemporaryDirectory() as temp_dir: + driver = setup_driver(download_path=temp_dir) + try: + if not login(driver, username, password): + raise HTTPException(status_code=401, detail="Login failed.") + + # 1. Navigate to registrations page + reg_url = f"https://app.fotograf.de/config_children/view_registrations/{job_id}" + print(f"Navigiere zu Registrierungen: {reg_url}") + driver.get(reg_url) + wait = WebDriverWait(driver, 30) + + # Get Institution Name for PDF + try: + institution = driver.find_element(By.TAG_NAME, "h1").text.strip() + except: + institution = "Fotoauftrag" + + # 2. Click Export and trigger CSV download + print("Trigger Export...") + export_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["export_dropdown"]))) + export_btn.click() + time.sleep(1) # Wait for menu + + csv_btn = wait.until(EC.element_to_be_clickable((By.XPATH, SELECTORS["export_csv_link"]))) + csv_btn.click() + + # 3. Wait for download to finish + print("Warte auf CSV Download...") + timeout = 30 + start_time = time.time() + csv_file = None + while time.time() - start_time < timeout: + files = os.listdir(temp_dir) + csv_files = [f for f in files if f.endswith('.csv')] + if csv_files: + csv_file = os.path.join(temp_dir, csv_files[0]) + break + time.sleep(1) + + if not csv_file: + raise HTTPException(status_code=500, detail="CSV Download fehlgeschlagen.") + + # 4. Generate PDF + print(f"Generiere PDF aus {csv_file}...") + output_pdf_name = f"Listen_{job_id}.pdf" + output_pdf_path = os.path.join(temp_dir, output_pdf_name) + + generate_pdf_from_csv( + csv_path=csv_file, + institution=institution, + date_info=datetime.datetime.now().strftime("%d.%m.%Y"), + list_type=account_type, # 'k' or 'schule' + output_path=output_pdf_path + ) + + # 5. Return PDF + final_storage = os.path.join("/tmp", output_pdf_name) + shutil.copy(output_pdf_path, final_storage) + return FileResponse(path=final_storage, filename=output_pdf_name, media_type="application/pdf") + + except Exception as e: + print(f"Fehler bei PDF Generierung: {e}") + raise HTTPException(status_code=500, detail=str(e)) + finally: + if driver: driver.quit() \ No newline at end of file diff --git a/fotograf-de-scraper/backend/templates/school_list.html b/fotograf-de-scraper/backend/templates/school_list.html new file mode 100644 index 000000000..aa1ceadee --- /dev/null +++ b/fotograf-de-scraper/backend/templates/school_list.html @@ -0,0 +1,62 @@ + + +
+
+
{{ institution }}
+
{{ date_info }}
+
+ {% if logo_base64 %} +
+ Logo +
+ {% endif %} +
+

Übersicht der Anmeldungen:

+ {% for count in class_counts %} + + {% endfor %} +
{{ group_label }} {{ count.name }}{{ count.count }} Anmeldungen
Gesamt: {{ total_students }} Anmeldungen
+{% for class_info in class_data %} +
+
+
+
{{ institution }}
+
{{ date_info }}
+
+ {% if logo_base64 %} +
+ Logo +
+ {% endif %} +
+ + {% for student in class_info.students %} + + {% endfor %} +
NachnameVorname{{ group_label }}
{{ student.Nachname }}{{ student.Vorname }}{{ student[group_column_name] }}
+
{{ class_info.students|length }} angemeldete {{ person_label_plural }}
+
Dies ist die Liste der bereits angemeldeten {{ person_label_plural }}. Bitte die noch fehlenden
{{ person_label_plural }} an die Anmeldung erinnern.
+
+{% endfor %} + + diff --git a/fotograf-de-scraper/frontend/src/App.tsx b/fotograf-de-scraper/frontend/src/App.tsx index 5e2326c83..63080ea0c 100644 --- a/fotograf-de-scraper/frontend/src/App.tsx +++ b/fotograf-de-scraper/frontend/src/App.tsx @@ -14,12 +14,12 @@ type AccountType = 'kiga' | 'schule'; function App() { const [activeTab, setActiveTab] = useState('kiga'); - // Cache to store loaded jobs so we don't reload when switching tabs const [jobsCache, setJobsCache] = useState>({ kiga: null, schule: null, }); const [isLoading, setIsLoading] = useState(false); + const [processingJobId, setProcessingJobId] = useState(null); const [error, setError] = useState(null); const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || 'http://192.168.178.6:8002'; @@ -37,14 +37,37 @@ function App() { setJobsCache(prev => ({ ...prev, [account]: data })); } catch (err: any) { setError(err.message); - console.error("Failed to fetch jobs:", err); } finally { setIsLoading(false); } }; - const handleRefresh = () => { - fetchJobs(activeTab); + const handleRefresh = () => fetchJobs(activeTab); + + const handleGeneratePdf = async (job: Job) => { + setProcessingJobId(job.id); + setError(null); + try { + const response = await fetch(`${API_BASE_URL}/api/jobs/${job.id}/generate-pdf?account_type=${activeTab}`); + if (!response.ok) { + const errData = await response.json(); + throw new Error(errData.detail || 'PDF Generierung fehlgeschlagen'); + } + + const blob = await response.blob(); + const url = window.URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = `Listen_${job.name.replace(/\s+/g, "_")}.pdf`; + document.body.appendChild(a); + a.click(); + a.remove(); + window.URL.revokeObjectURL(url); + } catch (err: any) { + setError(`PDF Fehler (${job.name}): ${err.message}`); + } finally { + setProcessingJobId(null); + } }; const currentJobs = jobsCache[activeTab]; @@ -56,76 +79,51 @@ function App() { {/* Tab Navigation */}
- - + {['kiga', 'schule'].map((type) => ( + + ))}
{/* Status and Refresh Area */}

- {currentJobs === null - ? "Aufträge wurden noch nicht geladen." - : `${currentJobs.length} Aufträge geladen.`} + {currentJobs === null ? "Aufträge wurden noch nicht geladen." : `${currentJobs.length} Aufträge geladen.`}

{error && (
-

Fehler beim Scrapen:

+

Fehler:

{error}

)} - {currentJobs !== null && currentJobs.length === 0 && !isLoading && !error && ( -
-

Keine Aufträge in diesem Account gefunden.

-
- )} - {/* Jobs Table */} - {currentJobs !== null && currentJobs.length > 0 && ( + {currentJobs !== null && (
- - - + + + @@ -138,24 +136,26 @@ function App() {
Status: {job.status}
- - {/* Actions Column */}
Name des AuftragsDatumFeatures & AktionenName des AuftragsDatumFeatures & Aktionen
{job.date}
- - - - @@ -172,4 +172,4 @@ function App() { ); } -export default App; +export default App; \ No newline at end of file