[32788f42] feat: implement database persistence, modernized UI with Tailwind, and Calendly-integrated QR card generator for Fotograf.de scraper

This commit is contained in:
2026-03-21 09:04:03 +00:00
parent 22fe4dbd9f
commit c02facdf5d
6975 changed files with 1835694 additions and 179 deletions

View File

@@ -10,10 +10,14 @@ import tempfile
import shutil
import time
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, HTTPException, Depends, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from typing import List, Dict, Any, Optional
from sqlalchemy.orm import Session
from database import get_db, Job as DBJob, engine, Base
import math
import uuid
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
@@ -35,6 +39,9 @@ logger = logging.getLogger("fotograf-scraper")
# Load environment variables
load_dotenv()
# Ensure DB is created
Base.metadata.create_all(bind=engine)
app = FastAPI(title="Fotograf.de Scraper & ERP API")
# Configure CORS
@@ -61,6 +68,14 @@ SELECTORS = {
"job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]",
"export_dropdown": "[data-qa-id='dropdown:export']",
"export_csv_link": "button[data-qa-id='button:csv']",
# --- Statistics Selectors ---
"album_overview_rows": "//table/tbody/tr",
"album_overview_link": ".//td[2]//a",
"access_code_count": "//span[text()='Zugangscodes']/following-sibling::strong",
"person_rows": "//div[contains(@class, 'border-legacy-silver-550') and .//span[text()='Logins']]",
"person_all_photos": ".//div[@data-key]",
"person_purchased_photos": ".//div[@data-key and .//img[@alt='Bestellungen mit diesem Foto']]",
"person_access_card_photo": ".//div[@data-key and contains(@class, 'opacity-50')]",
}
# --- PDF Generation Logic ---
@@ -278,15 +293,214 @@ def get_jobs_list(driver) -> List[Dict[str, Any]]:
return jobs
# --- Background Task Engine ---
task_store: Dict[str, Dict[str, Any]] = {}
def process_statistics(task_id: str, job_id: str, account_type: str):
    """Background task: scrape per-album purchase statistics for a fotograf.de job.

    Logs into fotograf.de with account-type-specific credentials, walks every
    album of the given job, and counts per album how many children (guest
    accesses) exist, how many bought at least one photo, and how many bought
    every buyable photo. Progress and the final result are published through
    the module-level ``task_store`` under ``task_id`` so the polling endpoint
    can report status.

    Args:
        task_id: Key under which status/progress/result are stored in ``task_store``.
        job_id: fotograf.de job id used to build the albums-overview URL.
        account_type: Prefix for the credential env vars (``<TYPE>_USER`` / ``<TYPE>_PW``).

    Returns:
        None. All output goes into ``task_store[task_id]`` (keys: ``status``,
        ``progress``, and on success ``result`` with a list of per-album dicts).
    """
    logger.info(f"Task {task_id}: Starting statistics calculation for job {job_id}")
    task_store[task_id] = {"status": "running", "progress": "Initialisiere Browser...", "result": None}
    # Credentials are read from env vars named after the account type,
    # e.g. KITA_USER / KITA_PW. May be None if unset — login() then fails.
    username = os.getenv(f"{account_type.upper()}_USER")
    password = os.getenv(f"{account_type.upper()}_PW")
    driver = None
    try:
        driver = setup_driver()
        if not driver or not login(driver, username, password):
            task_store[task_id] = {"status": "error", "progress": "Login fehlgeschlagen. Überprüfe die Zugangsdaten."}
            return
        task_store[task_id]["progress"] = f"Lade Alben-Übersicht für Auftrag..."
        albums_overview_url = f"https://app.fotograf.de/config_jobs_photos/index/{job_id}"
        logger.info(f"Navigating to albums: {albums_overview_url}")
        driver.get(albums_overview_url)
        wait = WebDriverWait(driver, 15)
        # Collect (name, url) for every album first, so later page navigations
        # don't invalidate the row elements mid-iteration.
        albums_to_visit = []
        try:
            album_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["album_overview_rows"])))
            for row in album_rows:
                try:
                    album_link = row.find_element(By.XPATH, SELECTORS["album_overview_link"])
                    albums_to_visit.append({"name": album_link.text, "url": album_link.get_attribute('href')})
                except NoSuchElementException:
                    # Rows without a link (e.g. header/placeholder rows) are skipped.
                    continue
        except TimeoutException:
            task_store[task_id] = {"status": "error", "progress": "Konnte die Album-Liste nicht finden."}
            return
        total_albums = len(albums_to_visit)
        task_store[task_id]["progress"] = f"{total_albums} Alben gefunden. Starte Auswertung..."
        statistics = []
        for index, album in enumerate(albums_to_visit):
            album_name = album['name']
            task_store[task_id]["progress"] = f"Bearbeite Album {index + 1}/{total_albums}: '{album_name}'..."
            driver.get(album['url'])
            try:
                # Total access-code count drives pagination; the page size is
                # assumed to be 20 entries (see the division below) — matches
                # the ?page_guest_accesses= paging observed on the site.
                total_codes_text = wait.until(EC.visibility_of_element_located((By.XPATH, SELECTORS["access_code_count"]))).text
                num_pages = math.ceil(int(total_codes_text) / 20)
                total_children_in_album = 0
                children_with_purchase = 0
                children_with_all_purchased = 0
                for page_num in range(1, num_pages + 1):
                    task_store[task_id]["progress"] = f"Bearbeite Album {index + 1}/{total_albums}: '{album_name}' (Seite {page_num}/{num_pages})..."
                    if page_num > 1:
                        # Page 1 is already loaded by the album URL itself.
                        driver.get(album['url'] + f"?page_guest_accesses={page_num}")
                    person_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["person_rows"])))
                    for person_row in person_rows:
                        total_children_in_album += 1
                        try:
                            # The photo grid is the sibling <div> directly after the person row.
                            photo_container = person_row.find_element(By.XPATH, "./following-sibling::div[1]")
                            num_total_photos = len(photo_container.find_elements(By.XPATH, SELECTORS["person_all_photos"]))
                            num_purchased_photos = len(photo_container.find_elements(By.XPATH, SELECTORS["person_purchased_photos"]))
                            num_access_cards = len(photo_container.find_elements(By.XPATH, SELECTORS["person_access_card_photo"]))
                            # Access-card images are not purchasable, so exclude them
                            # before comparing against the purchased count.
                            buyable_photos = num_total_photos - num_access_cards
                            if num_purchased_photos > 0:
                                children_with_purchase += 1
                            if buyable_photos > 0 and buyable_photos == num_purchased_photos:
                                children_with_all_purchased += 1
                        except NoSuchElementException:
                            # Person without a photo container: counted in the total,
                            # but contributes nothing to the purchase stats.
                            continue
                statistics.append({
                    "Album": album_name,
                    "Kinder_insgesamt": total_children_in_album,
                    "Kinder_mit_Käufen": children_with_purchase,
                    "Kinder_Alle_Bilder_gekauft": children_with_all_purchased
                })
            except Exception as e:
                # One broken album must not abort the whole run; it is simply
                # missing from the result list.
                logger.error(f"Fehler bei Auswertung von Album '{album_name}': {e}")
                continue
        task_store[task_id] = {
            "status": "completed",
            "progress": "Auswertung erfolgreich abgeschlossen!",
            "result": statistics
        }
    except Exception as e:
        logger.exception(f"Unexpected error in task {task_id}")
        task_store[task_id] = {"status": "error", "progress": f"Unerwarteter Fehler: {str(e)}"}
    finally:
        # Always release the browser, whatever happened above.
        if driver:
            logger.debug(f"Task {task_id}: Closing driver.")
            driver.quit()
from fastapi import FastAPI, HTTPException, Depends, BackgroundTasks, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, JSONResponse
from typing import List, Dict, Any, Optional
from sqlalchemy.orm import Session
from database import get_db, Job as DBJob, engine, Base
import math
import uuid
from qr_generator import get_calendly_events, overlay_text_on_pdf
# --- API Endpoints ---
@app.get("/api/calendly/events")
async def fetch_calendly_events(start_time: str, end_time: str, event_type_name: Optional[str] = None):
    """
    Debug endpoint: return the raw Calendly event payload for inspection.

    Requires the CALENDLY_TOKEN env var; responds 400 when it is missing and
    500 when the Calendly fetch itself fails.
    """
    api_token = os.getenv("CALENDLY_TOKEN")
    if not api_token:
        raise HTTPException(status_code=400, detail="Calendly API token missing.")
    try:
        # Imported lazily so the module loads even if qr_generator is unavailable.
        from qr_generator import get_calendly_events_raw
        payload = get_calendly_events_raw(api_token, start_time, end_time, event_type_name)
        return {"count": len(payload), "events": payload}
    except Exception as e:
        logger.error(f"Error fetching Calendly events: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/qr-cards/generate")
async def generate_qr_cards(
    start_time: str = Form(...),
    end_time: str = Form(...),
    event_type_name: str = Form(None),
    pdf_file: UploadFile = File(...)
):
    """
    Generate personalized QR cards by overlaying Calendly event data on an
    uploaded template PDF.

    Args:
        start_time / end_time: Calendly query window (passed through verbatim).
        event_type_name: Optional filter on the Calendly event type.
        pdf_file: Blank template PDF onto which the texts are overlaid.

    Returns:
        FileResponse with the generated PDF, 404 JSON when no events match,
        400 when CALENDLY_TOKEN is missing, 500 on any processing error.
    """
    logger.info(f"API Request: Generate QR cards from {start_time} to {end_time} for event type '{event_type_name}'")
    api_token = os.getenv("CALENDLY_TOKEN")
    if not api_token:
        raise HTTPException(status_code=400, detail="Calendly API token missing.")
    # Save uploaded PDF temporarily; uuid avoids collisions between requests.
    temp_dir = tempfile.gettempdir()
    base_pdf_path = os.path.join(temp_dir, f"upload_{uuid.uuid4()}.pdf")
    try:
        with open(base_pdf_path, "wb") as buffer:
            shutil.copyfileobj(pdf_file.file, buffer)
        # 1. Fetch formatted data from Calendly
        texts = get_calendly_events(api_token, start_time, end_time, event_type_name)
        if not texts:
            return JSONResponse(status_code=404, content={"message": "Keine passenden Termine gefunden."})
        # 2. Overlay text on blank PDF
        output_name = f"QR_Karten_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
        output_path = os.path.join(temp_dir, output_name)
        overlay_text_on_pdf(base_pdf_path, output_path, texts)
        return FileResponse(path=output_path, filename=output_name, media_type="application/pdf")
    except Exception as e:
        logger.error(f"Error generating QR cards: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Fix: the original only removed the upload on the success/404 paths,
        # leaking the temp file whenever the Calendly fetch or overlay raised.
        if os.path.exists(base_pdf_path):
            os.remove(base_pdf_path)
@app.get("/health")
async def health_check():
    """Liveness probe: always reports the service as up."""
    return dict(status="ok")
@app.get("/api/jobs", response_model=List[Dict[str, Any]])
async def get_jobs(account_type: str):
logger.info(f"API Request: GET /api/jobs for {account_type}")
async def get_jobs(account_type: str, force_refresh: bool = False, db: Session = Depends(get_db)):
logger.info(f"API Request: GET /api/jobs for {account_type} (force_refresh={force_refresh})")
# 1. Check database first if not forcing a refresh
if not force_refresh:
cached_jobs = db.query(DBJob).filter(DBJob.account_type == account_type).all()
if cached_jobs:
logger.info(f"Returning {len(cached_jobs)} cached jobs for {account_type}")
return [
{
"id": job.id,
"name": job.name,
"url": job.url,
"status": job.status,
"date": job.date,
"shooting_type": job.shooting_type,
"last_updated": job.last_updated.isoformat() if job.last_updated else None
}
for job in cached_jobs
]
else:
logger.info(f"No cached jobs found for {account_type}. Initiating scrape...")
# 2. Scrape from fotograf.de if forcing refresh or no cached jobs
username = os.getenv(f"{account_type.upper()}_USER")
password = os.getenv(f"{account_type.upper()}_PW")
if not username or not password:
@@ -298,12 +512,61 @@ async def get_jobs(account_type: str):
driver = setup_driver()
if not driver or not login(driver, username, password):
raise HTTPException(status_code=401, detail="Login failed.")
return get_jobs_list(driver)
scraped_jobs = get_jobs_list(driver)
# 3. Save to database
if scraped_jobs:
logger.info(f"Saving {len(scraped_jobs)} jobs to database for {account_type}...")
# Clear old jobs for this account type
db.query(DBJob).filter(DBJob.account_type == account_type).delete()
# Insert new jobs
now = datetime.datetime.utcnow()
for job_data in scraped_jobs:
if job_data["id"]: # Ensure we have an ID
new_job = DBJob(
id=job_data["id"],
name=job_data["name"],
url=job_data["url"],
status=job_data["status"],
date=job_data["date"],
shooting_type=job_data["shooting_type"],
account_type=account_type,
last_updated=now
)
db.add(new_job)
# Update dict for return value
job_data["last_updated"] = now.isoformat()
db.commit()
logger.info("Database updated successfully.")
return scraped_jobs
except Exception as e:
logger.error(f"Error during scraping or database save: {e}")
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
finally:
if driver:
logger.debug("Closing driver.")
driver.quit()
@app.get("/api/tasks/{task_id}")
async def get_task_status(task_id: str):
    """Return the stored state of a background task, or 404 if the id is unknown."""
    logger.debug(f"API Request: Check task status for {task_id}")
    task = task_store.get(task_id)
    if task is None:
        raise HTTPException(status_code=404, detail="Task nicht gefunden.")
    return task
@app.post("/api/jobs/{job_id}/statistics")
async def start_statistics(job_id: str, account_type: str, background_tasks: BackgroundTasks):
    """Schedule the statistics scrape as a background task and return its id for polling."""
    logger.info(f"API Request: Start statistics for job {job_id} ({account_type})")
    new_task_id = f"{uuid.uuid4()}"
    background_tasks.add_task(process_statistics, new_task_id, job_id, account_type)
    return {"task_id": new_task_id}
@app.get("/api/jobs/{job_id}/generate-pdf")
async def generate_pdf(job_id: str, account_type: str):
logger.info(f"API Request: Generate PDF for job {job_id} ({account_type})")