[32788f42] feat: implement database persistence, modernized UI with Tailwind, and Calendly-integrated QR card generator for Fotograf.de scraper

This commit is contained in:
2026-03-21 09:04:03 +00:00
parent 22fe4dbd9f
commit c02facdf5d
6975 changed files with 1835694 additions and 179 deletions

View File

@@ -10,10 +10,14 @@ import tempfile
import shutil
import time
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, HTTPException, Depends, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from typing import List, Dict, Any, Optional
from sqlalchemy.orm import Session
from database import get_db, Job as DBJob, engine, Base
import math
import uuid
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
@@ -35,6 +39,9 @@ logger = logging.getLogger("fotograf-scraper")
# Load environment variables
load_dotenv()
# Ensure DB is created
Base.metadata.create_all(bind=engine)
app = FastAPI(title="Fotograf.de Scraper & ERP API")
# Configure CORS
@@ -61,6 +68,14 @@ SELECTORS = {
"job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]",
"export_dropdown": "[data-qa-id='dropdown:export']",
"export_csv_link": "button[data-qa-id='button:csv']",
# --- Statistics Selectors ---
"album_overview_rows": "//table/tbody/tr",
"album_overview_link": ".//td[2]//a",
"access_code_count": "//span[text()='Zugangscodes']/following-sibling::strong",
"person_rows": "//div[contains(@class, 'border-legacy-silver-550') and .//span[text()='Logins']]",
"person_all_photos": ".//div[@data-key]",
"person_purchased_photos": ".//div[@data-key and .//img[@alt='Bestellungen mit diesem Foto']]",
"person_access_card_photo": ".//div[@data-key and contains(@class, 'opacity-50')]",
}
# --- PDF Generation Logic ---
@@ -278,15 +293,214 @@ def get_jobs_list(driver) -> List[Dict[str, Any]]:
return jobs
# --- Background Task Engine ---
task_store: Dict[str, Dict[str, Any]] = {}
def process_statistics(task_id: str, job_id: str, account_type: str):
    """Background task: scrape per-album purchase statistics for a fotograf.de job.

    Logs into fotograf.de with account-type-specific credentials, walks every
    album of the given job, and counts per album how many children (guest
    accesses) exist, how many bought at least one photo, and how many bought
    every buyable photo. Progress and the final result are published through
    the module-level ``task_store`` under ``task_id`` so the polling endpoint
    can report status.

    Args:
        task_id: Key under which status/progress/result are stored in ``task_store``.
        job_id: fotograf.de job id used to build the albums-overview URL.
        account_type: Prefix for the credential env vars (``<TYPE>_USER`` / ``<TYPE>_PW``).

    Returns:
        None. All output goes into ``task_store[task_id]`` (keys: ``status``,
        ``progress``, and on success ``result`` with a list of per-album dicts).
    """
    logger.info(f"Task {task_id}: Starting statistics calculation for job {job_id}")
    task_store[task_id] = {"status": "running", "progress": "Initialisiere Browser...", "result": None}
    # Credentials are read from env vars named after the account type,
    # e.g. KITA_USER / KITA_PW. May be None if unset — login() then fails.
    username = os.getenv(f"{account_type.upper()}_USER")
    password = os.getenv(f"{account_type.upper()}_PW")
    driver = None
    try:
        driver = setup_driver()
        if not driver or not login(driver, username, password):
            task_store[task_id] = {"status": "error", "progress": "Login fehlgeschlagen. Überprüfe die Zugangsdaten."}
            return
        task_store[task_id]["progress"] = f"Lade Alben-Übersicht für Auftrag..."
        albums_overview_url = f"https://app.fotograf.de/config_jobs_photos/index/{job_id}"
        logger.info(f"Navigating to albums: {albums_overview_url}")
        driver.get(albums_overview_url)
        wait = WebDriverWait(driver, 15)
        # Collect (name, url) for every album first, so later page navigations
        # don't invalidate the row elements mid-iteration.
        albums_to_visit = []
        try:
            album_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["album_overview_rows"])))
            for row in album_rows:
                try:
                    album_link = row.find_element(By.XPATH, SELECTORS["album_overview_link"])
                    albums_to_visit.append({"name": album_link.text, "url": album_link.get_attribute('href')})
                except NoSuchElementException:
                    # Rows without a link (e.g. header/placeholder rows) are skipped.
                    continue
        except TimeoutException:
            task_store[task_id] = {"status": "error", "progress": "Konnte die Album-Liste nicht finden."}
            return
        total_albums = len(albums_to_visit)
        task_store[task_id]["progress"] = f"{total_albums} Alben gefunden. Starte Auswertung..."
        statistics = []
        for index, album in enumerate(albums_to_visit):
            album_name = album['name']
            task_store[task_id]["progress"] = f"Bearbeite Album {index + 1}/{total_albums}: '{album_name}'..."
            driver.get(album['url'])
            try:
                # Total access-code count drives pagination; the page size is
                # assumed to be 20 entries (see the division below) — matches
                # the ?page_guest_accesses= paging observed on the site.
                total_codes_text = wait.until(EC.visibility_of_element_located((By.XPATH, SELECTORS["access_code_count"]))).text
                num_pages = math.ceil(int(total_codes_text) / 20)
                total_children_in_album = 0
                children_with_purchase = 0
                children_with_all_purchased = 0
                for page_num in range(1, num_pages + 1):
                    task_store[task_id]["progress"] = f"Bearbeite Album {index + 1}/{total_albums}: '{album_name}' (Seite {page_num}/{num_pages})..."
                    if page_num > 1:
                        # Page 1 is already loaded by the album URL itself.
                        driver.get(album['url'] + f"?page_guest_accesses={page_num}")
                    person_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["person_rows"])))
                    for person_row in person_rows:
                        total_children_in_album += 1
                        try:
                            # The photo grid is the sibling <div> directly after the person row.
                            photo_container = person_row.find_element(By.XPATH, "./following-sibling::div[1]")
                            num_total_photos = len(photo_container.find_elements(By.XPATH, SELECTORS["person_all_photos"]))
                            num_purchased_photos = len(photo_container.find_elements(By.XPATH, SELECTORS["person_purchased_photos"]))
                            num_access_cards = len(photo_container.find_elements(By.XPATH, SELECTORS["person_access_card_photo"]))
                            # Access-card images are not purchasable, so exclude them
                            # before comparing against the purchased count.
                            buyable_photos = num_total_photos - num_access_cards
                            if num_purchased_photos > 0:
                                children_with_purchase += 1
                            if buyable_photos > 0 and buyable_photos == num_purchased_photos:
                                children_with_all_purchased += 1
                        except NoSuchElementException:
                            # Person without a photo container: counted in the total,
                            # but contributes nothing to the purchase stats.
                            continue
                statistics.append({
                    "Album": album_name,
                    "Kinder_insgesamt": total_children_in_album,
                    "Kinder_mit_Käufen": children_with_purchase,
                    "Kinder_Alle_Bilder_gekauft": children_with_all_purchased
                })
            except Exception as e:
                # One broken album must not abort the whole run; it is simply
                # missing from the result list.
                logger.error(f"Fehler bei Auswertung von Album '{album_name}': {e}")
                continue
        task_store[task_id] = {
            "status": "completed",
            "progress": "Auswertung erfolgreich abgeschlossen!",
            "result": statistics
        }
    except Exception as e:
        logger.exception(f"Unexpected error in task {task_id}")
        task_store[task_id] = {"status": "error", "progress": f"Unerwarteter Fehler: {str(e)}"}
    finally:
        # Always release the browser, whatever happened above.
        if driver:
            logger.debug(f"Task {task_id}: Closing driver.")
            driver.quit()
from fastapi import FastAPI, HTTPException, Depends, BackgroundTasks, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, JSONResponse
from typing import List, Dict, Any, Optional
from sqlalchemy.orm import Session
from database import get_db, Job as DBJob, engine, Base
import math
import uuid
from qr_generator import get_calendly_events, overlay_text_on_pdf
# --- API Endpoints ---
@app.get("/api/calendly/events")
async def fetch_calendly_events(start_time: str, end_time: str, event_type_name: Optional[str] = None):
    """
    Debug endpoint: return the raw Calendly event payload for inspection.

    Requires the CALENDLY_TOKEN env var; responds 400 when it is missing and
    500 when the Calendly fetch itself fails.
    """
    api_token = os.getenv("CALENDLY_TOKEN")
    if not api_token:
        raise HTTPException(status_code=400, detail="Calendly API token missing.")
    try:
        # Imported lazily so the module loads even if qr_generator is unavailable.
        from qr_generator import get_calendly_events_raw
        payload = get_calendly_events_raw(api_token, start_time, end_time, event_type_name)
        return {"count": len(payload), "events": payload}
    except Exception as e:
        logger.error(f"Error fetching Calendly events: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/qr-cards/generate")
async def generate_qr_cards(
    start_time: str = Form(...),
    end_time: str = Form(...),
    event_type_name: str = Form(None),
    pdf_file: UploadFile = File(...)
):
    """
    Generate personalized QR cards by overlaying Calendly event data on an
    uploaded template PDF.

    Args:
        start_time / end_time: Calendly query window (passed through verbatim).
        event_type_name: Optional filter on the Calendly event type.
        pdf_file: Blank template PDF onto which the texts are overlaid.

    Returns:
        FileResponse with the generated PDF, 404 JSON when no events match,
        400 when CALENDLY_TOKEN is missing, 500 on any processing error.
    """
    logger.info(f"API Request: Generate QR cards from {start_time} to {end_time} for event type '{event_type_name}'")
    api_token = os.getenv("CALENDLY_TOKEN")
    if not api_token:
        raise HTTPException(status_code=400, detail="Calendly API token missing.")
    # Save uploaded PDF temporarily; uuid avoids collisions between requests.
    temp_dir = tempfile.gettempdir()
    base_pdf_path = os.path.join(temp_dir, f"upload_{uuid.uuid4()}.pdf")
    try:
        with open(base_pdf_path, "wb") as buffer:
            shutil.copyfileobj(pdf_file.file, buffer)
        # 1. Fetch formatted data from Calendly
        texts = get_calendly_events(api_token, start_time, end_time, event_type_name)
        if not texts:
            return JSONResponse(status_code=404, content={"message": "Keine passenden Termine gefunden."})
        # 2. Overlay text on blank PDF
        output_name = f"QR_Karten_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
        output_path = os.path.join(temp_dir, output_name)
        overlay_text_on_pdf(base_pdf_path, output_path, texts)
        return FileResponse(path=output_path, filename=output_name, media_type="application/pdf")
    except Exception as e:
        logger.error(f"Error generating QR cards: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Fix: the original only removed the upload on the success/404 paths,
        # leaking the temp file whenever the Calendly fetch or overlay raised.
        if os.path.exists(base_pdf_path):
            os.remove(base_pdf_path)
@app.get("/health")
async def health_check():
    """Liveness probe: always reports the service as up."""
    return dict(status="ok")
@app.get("/api/jobs", response_model=List[Dict[str, Any]])
async def get_jobs(account_type: str):
logger.info(f"API Request: GET /api/jobs for {account_type}")
async def get_jobs(account_type: str, force_refresh: bool = False, db: Session = Depends(get_db)):
logger.info(f"API Request: GET /api/jobs for {account_type} (force_refresh={force_refresh})")
# 1. Check database first if not forcing a refresh
if not force_refresh:
cached_jobs = db.query(DBJob).filter(DBJob.account_type == account_type).all()
if cached_jobs:
logger.info(f"Returning {len(cached_jobs)} cached jobs for {account_type}")
return [
{
"id": job.id,
"name": job.name,
"url": job.url,
"status": job.status,
"date": job.date,
"shooting_type": job.shooting_type,
"last_updated": job.last_updated.isoformat() if job.last_updated else None
}
for job in cached_jobs
]
else:
logger.info(f"No cached jobs found for {account_type}. Initiating scrape...")
# 2. Scrape from fotograf.de if forcing refresh or no cached jobs
username = os.getenv(f"{account_type.upper()}_USER")
password = os.getenv(f"{account_type.upper()}_PW")
if not username or not password:
@@ -298,12 +512,61 @@ async def get_jobs(account_type: str):
driver = setup_driver()
if not driver or not login(driver, username, password):
raise HTTPException(status_code=401, detail="Login failed.")
return get_jobs_list(driver)
scraped_jobs = get_jobs_list(driver)
# 3. Save to database
if scraped_jobs:
logger.info(f"Saving {len(scraped_jobs)} jobs to database for {account_type}...")
# Clear old jobs for this account type
db.query(DBJob).filter(DBJob.account_type == account_type).delete()
# Insert new jobs
now = datetime.datetime.utcnow()
for job_data in scraped_jobs:
if job_data["id"]: # Ensure we have an ID
new_job = DBJob(
id=job_data["id"],
name=job_data["name"],
url=job_data["url"],
status=job_data["status"],
date=job_data["date"],
shooting_type=job_data["shooting_type"],
account_type=account_type,
last_updated=now
)
db.add(new_job)
# Update dict for return value
job_data["last_updated"] = now.isoformat()
db.commit()
logger.info("Database updated successfully.")
return scraped_jobs
except Exception as e:
logger.error(f"Error during scraping or database save: {e}")
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
finally:
if driver:
logger.debug("Closing driver.")
driver.quit()
@app.get("/api/tasks/{task_id}")
async def get_task_status(task_id: str):
    """Return the stored state of a background task, or 404 if the id is unknown."""
    logger.debug(f"API Request: Check task status for {task_id}")
    task = task_store.get(task_id)
    if task is None:
        raise HTTPException(status_code=404, detail="Task nicht gefunden.")
    return task
@app.post("/api/jobs/{job_id}/statistics")
async def start_statistics(job_id: str, account_type: str, background_tasks: BackgroundTasks):
    """Schedule the statistics scrape as a background task and return its id for polling."""
    logger.info(f"API Request: Start statistics for job {job_id} ({account_type})")
    new_task_id = f"{uuid.uuid4()}"
    background_tasks.add_task(process_statistics, new_task_id, job_id, account_type)
    return {"task_id": new_task_id}
@app.get("/api/jobs/{job_id}/generate-pdf")
async def generate_pdf(job_id: str, account_type: str):
logger.info(f"API Request: Generate PDF for job {job_id} ({account_type})")