feat(scraper): implement PDF list generation from registrations export [32788f42]

This commit is contained in:
2026-03-20 18:40:06 +00:00
parent ae61cc44e1
commit 5c69c44ed3
3 changed files with 331 additions and 157 deletions

View File

@@ -1,28 +1,35 @@
import os
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from typing import List, Dict, Any, Optional
import time
from datetime import datetime
import datetime
import base64
import re
import pandas as pd
from jinja2 import Environment, FileSystemLoader
from weasyprint import HTML
import tempfile
import shutil
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, InvalidArgumentException
import re
import asyncio
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# Load environment variables
load_dotenv()
app = FastAPI(title="Fotograf.de Scraper API")
app = FastAPI(title="Fotograf.de Scraper & ERP API")
# Configure CORS
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Adjust this to your frontend origin in production
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
@@ -31,7 +38,6 @@ app.add_middleware(
# --- Configuration & Constants ---
LOGIN_URL = 'https://app.fotograf.de/login/login'
# --- Selectors from original scraper, expanded for dashboard jobs ---
SELECTORS = {
"cookie_accept_button": "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
"login_user": "#login-email",
@@ -39,103 +45,171 @@ SELECTORS = {
"login_button": "#login-submit",
"dashboard_jobs_table_rows": "//tr[.//a[contains(@data-qa-id, 'link:photo-jobs-name-')]]",
"job_row_name_link": ".//a[contains(@data-qa-id, 'link:photo-jobs-name-')]",
"job_row_status": ".//td[count(//th[contains(., 'Status')]/preceding-sibling::th) + 1]", # Try to find by column header 'Status'
"job_row_date": ".//td[count(//th[contains(., 'Datum')]/preceding-sibling::th) + 1]", # Try to find by column header 'Datum'
"job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]", # Try to find by column header 'Typ'
"job_row_status": ".//td[count(//th[contains(., 'Status')]/preceding-sibling::th) + 1]",
"job_row_date": ".//td[count(//th[contains(., 'Datum')]/preceding-sibling::th) + 1]",
"job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]",
"export_dropdown": "[data-qa-id='dropdown:export']",
"export_csv_link": "//a[contains(text(), 'CSV') or contains(., 'CSV')]", # Common pattern for CSV export in dropdowns
}
# --- Utility functions from original scraper ---
# (setup_driver, login, etc. will be adapted or moved into this file)
# --- PDF Generation Logic (Reused from List-Generator) ---
def setup_driver():
def get_logo_base64() -> Optional[str]:
    """Return the bundled logo as base64 text for embedding in the PDF template.

    The logo is read from ``assets/logo.png`` next to this module.

    Returns:
        The base64-encoded file content, or ``None`` when the file is missing
        or cannot be read, so PDF generation can continue without a logo.
    """
    logo_path = os.path.join(os.path.dirname(__file__), "assets", "logo.png")
    try:
        with open(logo_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
    except FileNotFoundError:
        print(f"Warning: Logo file not found at {logo_path}")
    except OSError as err:
        # The logo is optional: permission problems or a directory in its
        # place must not abort the whole PDF run.
        print(f"Warning: Logo file at {logo_path} could not be read: {err}")
    return None
def _read_registrations_csv(csv_path: str):
    """Load the export, probing encodings and separators until one yields several columns."""
    for encoding in ("utf-8-sig", "latin1"):
        for sep in (";", ","):
            try:
                # Cheap probe first; only parse the whole file once the
                # separator is known to split the header into columns.
                probe = pd.read_csv(csv_path, sep=sep, encoding=encoding, nrows=5)
                if len(probe.columns) > 1:
                    return pd.read_csv(csv_path, sep=sep, encoding=encoding)
            except Exception:
                # Wrong encoding/separator (or unreadable file): try the next combination.
                continue
    # Last resort, e.g. for a genuine single-column file.
    try:
        return pd.read_csv(csv_path, sep=";", encoding="latin1")
    except Exception as err:
        # Keep the original message and type, but preserve the root cause.
        raise Exception("CSV konnte nicht gelesen werden.") from err


def generate_pdf_from_csv(csv_path: str, institution: str, date_info: str, list_type: str, output_path: str) -> None:
    """Render the registrations CSV export as a grouped PDF list.

    Args:
        csv_path: Path of the CSV export to read.
        institution: Name passed to the template as ``institution``.
        date_info: Date text passed to the template as ``date_info``.
        list_type: ``'k'`` selects the labels "Gruppe"/"Kinder"; any other
            value selects "Klasse"/"Schüler".
        output_path: Destination path of the generated PDF.

    Raises:
        Exception: If the CSV cannot be read with any supported
            encoding/separator combination.
    """
    df = _read_registrations_csv(csv_path)
    df.columns = df.columns.str.strip().str.replace("\"", "")

    group_label = "Gruppe" if list_type == 'k' else "Klasse"
    person_label_plural = "Kinder" if list_type == 'k' else "Schüler"

    # Known header spellings (lower-cased) mapped to the canonical column names.
    aliases = {
        "vorname kind": "Vorname",
        "vorname": "Vorname",
        "first name": "Vorname",
        "nachname kind": "Nachname",
        "nachname": "Nachname",
        "last name": "Nachname",
        "gruppe": group_label,
        "klasse": group_label,
        "group": group_label,
        "class": group_label,
    }
    col_mapping = {}
    for col in df.columns:
        target = aliases.get(col.lower().strip())
        if target is not None:
            col_mapping[col] = target
    df = df.rename(columns=col_mapping)
    df = df.fillna("")

    # Guarantee the columns the sort and the template rely on.
    for col in ["Vorname", "Nachname", group_label]:
        if col not in df.columns:
            df[col] = "Alle" if col == group_label else ""

    df = df.sort_values(by=[group_label, "Nachname", "Vorname"])

    class_data = [
        {"name": class_name, "students": group.to_dict("records")}
        for class_name, group in df.groupby(group_label)
    ]
    class_counts = [
        {"name": entry["name"], "count": len(entry["students"])}
        for entry in class_data
    ]
    total_students = len(df)

    template_dir = os.path.join(os.path.dirname(__file__), "templates")
    # Names and the institution title come from scraped/exported data, so
    # escape them instead of letting them inject markup into the HTML that
    # is handed to the PDF renderer.
    env = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
    template = env.get_template("school_list.html")

    render_context = {
        "institution": institution,
        "date_info": date_info,
        "class_counts": class_counts,
        "total_students": total_students,
        "class_data": class_data,
        "current_time": datetime.datetime.now().strftime("%d.%m.%Y %H:%M Uhr"),
        "logo_base64": get_logo_base64(),
        "group_label": group_label,
        "person_label_plural": person_label_plural,
        "group_column_name": group_label
    }

    html_out = template.render(render_context)
    HTML(string=html_out).write_pdf(output_path)
# --- Selenium Scraper Functions ---
def setup_driver(download_path: Optional[str] = None):
    """Start a headless Chrome WebDriver.

    Args:
        download_path: Optional directory for browser downloads. When given,
            Chrome's download preferences are set and downloads are enabled
            via the ``Page.setDownloadBehavior`` CDP command.

    Returns:
        The WebDriver instance, or ``None`` if initialization failed.
    """
    print("Initialisiere Chrome WebDriver...")
    options = Options()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage', '--window-size=1920,1200'):
        options.add_argument(flag)
    # Previously assigned twice; only the final value ever took effect.
    options.binary_location = '/usr/bin/chromium'

    if download_path:
        prefs = {
            "download.default_directory": download_path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True
        }
        options.add_experimental_option("prefs", prefs)

    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        if download_path:
            # Crucial for headless mode: Allow downloads
            driver.execute_cdp_cmd('Page.setDownloadBehavior', {
                'behavior': 'allow',
                'downloadPath': download_path
            })
        return driver
    except Exception as e:
        print(f"Fehler bei der Initialisierung des WebDrivers: {e}")
        if driver is not None:
            # Chrome started but a later step failed: shut it down so the
            # browser process is not leaked when we report failure.
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"WebDriver konnte nicht beendet werden: {quit_error}")
        return None
def take_error_screenshot(driver, error_name):
    """Save a timestamped screenshot into the ``errors`` directory next to this module.

    Args:
        driver: Object exposing ``save_screenshot(path)``.
        error_name: Label embedded in the file name
            (``error_<error_name>_<timestamp>.png``).

    Failures while saving are logged and never raised, so this can be called
    safely from error handlers.
    """
    # Ensure the errors directory exists
    errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
    os.makedirs(errors_dir, exist_ok=True)
    # ``datetime`` is bound to the module in this file, so the class must be
    # reached as ``datetime.datetime`` (matching the other call sites).
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(errors_dir, f"error_{error_name}_{timestamp}.png")
    try:
        driver.save_screenshot(filepath)
        print(f"!!! Fehler aufgetreten. Screenshot gespeichert unter: {filepath}")
    except Exception as e:
        print(f"!!! Konnte keinen Screenshot speichern: {e}")
def login(driver, username, password):
    """Log the given WebDriver session in via ``LOGIN_URL``.

    Dismisses the cookie banner if it becomes clickable within 5 seconds,
    submits the credentials and waits up to 45 seconds for the URL to contain
    ``/config_dashboard/index``.

    Args:
        driver: Selenium WebDriver to operate on.
        username: Value typed into the e-mail field.
        password: Value typed into the password field.

    Returns:
        ``True`` once the dashboard URL is reached, ``False`` on any error
        (a screenshot is attempted in that case).
    """
    print("Starte Login-Vorgang...")
    try:
        driver.get(LOGIN_URL)
        wait = WebDriverWait(driver, 45)  # Generous timeout for the entire process

        # The cookie banner is optional; failing to dismiss it must not abort the login.
        try:
            print("Suche nach Cookie-Banner...")
            cookie_wait = WebDriverWait(driver, 5)
            cookie_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
            print("Cookie-Banner akzeptiert.")
            time.sleep(1)
        except TimeoutException:
            print("Kein Cookie-Banner gefunden, fahre fort.")
        except Exception as banner_error:
            # Best effort, but no longer silent and no longer swallowing
            # KeyboardInterrupt/SystemExit like a bare ``except`` would.
            print(f"Cookie-Banner konnte nicht geschlossen werden: {banner_error}")

        print("Fülle Anmeldeformular aus...")
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, SELECTORS["login_user"]))).send_keys(username)
        driver.find_element(By.CSS_SELECTOR, SELECTORS["login_pass"]).send_keys(password)
        print("Klicke auf Login...")
        driver.find_element(By.CSS_SELECTOR, SELECTORS["login_button"]).click()

        print("Warte auf Bestätigung des Logins durch URL-Wechsel...")
        # The redirect to the dashboard is the success signal.
        wait.until(EC.url_contains('/config_dashboard/index'))
        print("Login erfolgreich! Session ist aktiv.")
        return True
    except Exception as e:
        print(f"Login fehlgeschlagen. Grund: {e}")
        take_error_screenshot(driver, "login_error")
        return False
# --- New function to get jobs from the specific jobs list page ---
def get_jobs_list(driver) -> List[Dict[str, Any]]:
print("Navigiere direkt zur Auftragsliste, um Aufträge abzurufen...")
jobs_list_url = "https://app.fotograf.de/config_jobs/index"
driver.get(jobs_list_url)
wait = WebDriverWait(driver, 45) # Use the generous timeout here
wait = WebDriverWait(driver, 45)
jobs = []
try:
# Wait for the table rows to be present
job_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["dashboard_jobs_table_rows"])))
print(f"[{len(job_rows)}] Auftragszeilen auf der Auftragsseite gefunden.")
for row in job_rows:
try:
name_element = row.find_element(By.XPATH, SELECTORS["job_row_name_link"])
job_name = name_element.text.strip()
job_url = name_element.get_attribute('href')
# Extract Job ID from URL
job_id_match = re.search(r'/(\d+)$', job_url)
job_id = job_id_match.group(1) if job_id_match else None
status_element = row.find_element(By.XPATH, SELECTORS["job_row_status"])
job_status = status_element.text.strip()
date_element = row.find_element(By.XPATH, SELECTORS["job_row_date"])
job_date = date_element.text.strip()
type_element = row.find_element(By.XPATH, SELECTORS["job_row_shooting_type"])
shooting_type = type_element.text.strip()
@@ -147,64 +221,102 @@ def get_jobs_list(driver) -> List[Dict[str, Any]]:
"date": job_date,
"shooting_type": shooting_type,
})
except NoSuchElementException as e:
print(f"Warnung: Konnte nicht alle Elemente in einer Auftragszeile finden. Fehler: {e}")
except Exception as e:
print(f"Ein unerwarteter Fehler beim Parsen einer Auftragszeile: {e}")
except TimeoutException:
print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf der Auftragsseite gefunden.")
take_error_screenshot(driver, "get_jobs_list_error")
# Save the HTML source for debugging selectors
errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
os.makedirs(errors_dir, exist_ok=True)
with open(os.path.join(errors_dir, 'page_source.html'), 'w', encoding='utf-8') as f:
f.write(driver.page_source)
print("HTML-Quellcode der Seite wurde in 'errors/page_source.html' gespeichert.")
except Exception as e:
print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge von der Auftragsseite: {e}")
take_error_screenshot(driver, "get_jobs_list_error")
except:
continue
except:
pass
return jobs
# --- API Endpoints ---
@app.get("/health")
async def health_check():
    """Liveness probe: always answers with a static ``status: ok`` payload."""
    payload = dict(status="ok")
    return payload
def _scrape_jobs_blocking(username: str, password: str) -> List[Dict[str, Any]]:
    """Blocking part of ``get_jobs``: start a browser, log in and read the job list."""
    driver = setup_driver()
    if not driver:
        # A browser start-up problem is a server fault, not an auth failure.
        raise HTTPException(status_code=500, detail="Failed to initialize WebDriver.")
    try:
        if not login(driver, username, password):
            raise HTTPException(status_code=401, detail="Login failed. Please check credentials.")
        jobs = get_jobs_list(driver)
        if not jobs:
            print("Keine Aufträge gefunden oder Fehler beim Abrufen.")
        return jobs
    except HTTPException:
        # Keep deliberate HTTP errors (status code and detail) untouched.
        raise
    except Exception as e:
        print(f"Ein unerwarteter Serverfehler ist aufgetreten: {e}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") from e
    finally:
        print("Schließe WebDriver.")
        driver.quit()


@app.get("/api/jobs", response_model=List[Dict[str, Any]])
async def get_jobs(account_type: str):
    """Return the jobs listed for the account selected by ``account_type``.

    Credentials are read from the environment variables
    ``<ACCOUNT_TYPE>_USER`` and ``<ACCOUNT_TYPE>_PW``.

    Raises:
        HTTPException: 400 if the credentials are not configured, 500 if the
            WebDriver cannot be started or an unexpected error occurs, 401 if
            the login fails.
    """
    # Imported locally so this block does not depend on the file's import section.
    from fastapi.concurrency import run_in_threadpool

    username_env_var = f"{account_type.upper()}_USER"
    password_env_var = f"{account_type.upper()}_PW"
    username = os.getenv(username_env_var)
    password = os.getenv(password_env_var)
    if not username or not password:
        raise HTTPException(status_code=400, detail=f"Credentials for {account_type} not found. Please set {username_env_var} and {password_env_var} in your .env file.")

    # Selenium calls block for many seconds; run them in a worker thread so
    # the event loop (and e.g. /health) stays responsive.
    return await run_in_threadpool(_scrape_jobs_blocking, username, password)
# Integrate other scraper functions (process_reminder_mode, process_statistics_mode) as new API endpoints later
def _wait_for_csv_download(download_dir: str, timeout: float = 30.0, poll_interval: float = 1.0) -> Optional[str]:
    """Poll ``download_dir`` until a ``.csv`` file shows up; return its path or ``None`` on timeout."""
    start_time = time.time()
    while time.time() - start_time < timeout:
        csv_files = sorted(name for name in os.listdir(download_dir) if name.endswith('.csv'))
        if csv_files:
            return os.path.join(download_dir, csv_files[0])
        time.sleep(poll_interval)
    return None


def _build_registration_pdf(job_id: str, account_type: str, username: str, password: str) -> str:
    """Blocking part of ``generate_pdf``: export the CSV, render the PDF, return the PDF path.

    The returned file lives outside the scratch directory and must be deleted
    by the caller once it has been sent.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        driver = setup_driver(download_path=temp_dir)
        if not driver:
            raise HTTPException(status_code=500, detail="Failed to initialize WebDriver.")
        try:
            if not login(driver, username, password):
                raise HTTPException(status_code=401, detail="Login failed.")

            # 1. Navigate to registrations page
            reg_url = f"https://app.fotograf.de/config_children/view_registrations/{job_id}"
            print(f"Navigiere zu Registrierungen: {reg_url}")
            driver.get(reg_url)
            wait = WebDriverWait(driver, 30)

            # Get Institution Name for PDF (best effort, with a neutral fallback)
            try:
                institution = driver.find_element(By.TAG_NAME, "h1").text.strip() or "Fotoauftrag"
            except Exception:
                institution = "Fotoauftrag"

            # 2. Click Export and trigger CSV download
            print("Trigger Export...")
            export_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["export_dropdown"])))
            export_btn.click()
            time.sleep(1)  # Wait for menu
            csv_btn = wait.until(EC.element_to_be_clickable((By.XPATH, SELECTORS["export_csv_link"])))
            csv_btn.click()

            # 3. Wait for download to finish
            print("Warte auf CSV Download...")
            csv_file = _wait_for_csv_download(temp_dir, timeout=30)
            if not csv_file:
                raise HTTPException(status_code=500, detail="CSV Download fehlgeschlagen.")

            # 4. Generate PDF
            print(f"Generiere PDF aus {csv_file}...")
            output_pdf_path = os.path.join(temp_dir, f"Listen_{job_id}.pdf")
            generate_pdf_from_csv(
                csv_path=csv_file,
                institution=institution,
                date_info=datetime.datetime.now().strftime("%d.%m.%Y"),
                list_type=account_type,  # 'k' or 'schule'
                output_path=output_pdf_path
            )

            # 5. Copy the PDF out of the scratch directory (which is removed
            # when the ``with`` block exits). A unique name avoids collisions
            # between concurrent requests for the same job.
            fd, final_storage = tempfile.mkstemp(prefix=f"Listen_{job_id}_", suffix=".pdf")
            os.close(fd)
            shutil.copy(output_pdf_path, final_storage)
            return final_storage
        except HTTPException:
            # Do not turn the deliberate 401/500 responses above into a generic 500.
            raise
        except Exception as e:
            print(f"Fehler bei PDF Generierung: {e}")
            raise HTTPException(status_code=500, detail=str(e)) from e
        finally:
            driver.quit()


@app.get("/api/jobs/{job_id}/generate-pdf")
async def generate_pdf(job_id: str, account_type: str):
    """Export the registrations of a job as CSV and return them as a PDF list.

    Args:
        job_id: Numeric job identifier used in the registrations URL.
        account_type: Selects the ``<ACCOUNT_TYPE>_USER`` / ``<ACCOUNT_TYPE>_PW``
            environment variables and is passed on as the list type.

    Returns:
        A ``FileResponse`` with the PDF named ``Listen_<job_id>.pdf``.

    Raises:
        HTTPException: 400 for a non-numeric job id or missing credentials,
            401 if the login fails, 500 for WebDriver, download or rendering
            errors.
    """
    # Imported locally so this block does not depend on the file's import section.
    from fastapi.concurrency import run_in_threadpool
    from starlette.background import BackgroundTask

    # The id ends up in a URL and in a file name; accept plain digits only.
    if not (job_id.isascii() and job_id.isdigit()):
        raise HTTPException(status_code=400, detail="Invalid job id.")

    username = os.getenv(f"{account_type.upper()}_USER")
    password = os.getenv(f"{account_type.upper()}_PW")
    if not username or not password:
        raise HTTPException(status_code=400, detail="Credentials not found.")

    # Selenium and PDF rendering block for a long time; keep them off the event loop.
    pdf_path = await run_in_threadpool(_build_registration_pdf, job_id, account_type, username, password)

    return FileResponse(
        path=pdf_path,
        filename=f"Listen_{job_id}.pdf",
        media_type="application/pdf",
        # Remove the temporary copy once the response has been sent.
        background=BackgroundTask(os.remove, pdf_path),
    )