Files
Brancheneinstufung2/fotograf-de-scraper/backend/main.py

186 lines
7.6 KiB
Python

import os
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict, Any, Optional
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, InvalidArgumentException
import re
import asyncio
# Load environment variables (credentials live in a .env file, read per request).
load_dotenv()
app = FastAPI(title="Fotograf.de Scraper API")
# Configure CORS so the (separately hosted) frontend can call this API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Adjust this to your frontend origin in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# --- Configuration & Constants ---
LOGIN_URL = 'https://app.fotograf.de/login/login'
# --- Selectors from original scraper, expanded for dashboard jobs ---
# cookie/login entries are CSS selectors; the dashboard/job-row entries are
# XPath (job_row_* are relative XPaths evaluated against a table row element).
SELECTORS = {
    "cookie_accept_button": "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
    "login_user": "#login-email",
    "login_pass": "#login-password",
    "login_button": "#login-submit",
    "dashboard_jobs_table_rows": "//table[contains(@class, 'table-legacy')]/tbody/tr", # Assuming there's a table for jobs
    "job_row_name_link": ".//td[contains(@class, 'table-col-jobname')]//a",
    "job_row_status": ".//td[contains(@class, 'table-col-status')]//span",
    "job_row_date": ".//td[contains(@class, 'table-col-shootingDate')]",
    "job_row_shooting_type": ".//td[contains(@class, 'table-col-shootingType')]",
}
# --- Utility functions from original scraper ---
# (setup_driver, login, etc. will be adapted or moved into this file)
def setup_driver():
    """Create a headless Chrome WebDriver suitable for the Docker container.

    Returns:
        A ready ``webdriver.Chrome`` instance, or ``None`` when the driver
        could not be initialized (the failure is logged to stdout).
    """
    print("Initialisiere Chrome WebDriver...")
    chrome_options = Options()
    # Flags required for stable headless operation inside a container.
    for flag in (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--window-size=1920,1200',
    ):
        chrome_options.add_argument(flag)
    chrome_options.binary_location = '/usr/bin/google-chrome'  # Path to Chrome in Docker
    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as e:
        print(f"Fehler bei der Initialisierung des WebDrivers: {e}")
        return None
def login(driver, username, password):
    """Sign in to fotograf.de with the given credentials.

    Accepts the cookie banner if one appears, submits the login form and
    waits until the dashboard URL is reached.

    Returns:
        ``True`` on successful login, ``False`` on any failure.
    """
    print("Starte Login-Vorgang...")
    try:
        driver.get(LOGIN_URL)
        waiter = WebDriverWait(driver, 10)
        # The consent banner is optional; a timeout here is not an error.
        try:
            print("Suche nach Cookie-Banner...")
            cookie_btn = (By.CSS_SELECTOR, SELECTORS["cookie_accept_button"])
            waiter.until(EC.element_to_be_clickable(cookie_btn)).click()
            print("Cookie-Banner akzeptiert.")
            time.sleep(1)
        except TimeoutException:
            print("Kein Cookie-Banner gefunden, fahre fort.")
        print("Fülle Anmeldeformular aus...")
        user_field = (By.CSS_SELECTOR, SELECTORS["login_user"])
        waiter.until(EC.visibility_of_element_located(user_field)).send_keys(username)
        driver.find_element(By.CSS_SELECTOR, SELECTORS["login_pass"]).send_keys(password)
        print("Klicke auf Login...")
        driver.find_element(By.CSS_SELECTOR, SELECTORS["login_button"]).click()
        print("Warte auf die nächste Seite...")
        # A redirect to the dashboard is the success signal.
        waiter.until(EC.url_contains('/config_dashboard/index'))
        print("Login erfolgreich!")
        return True
    except Exception as e:
        print(f"Login fehlgeschlagen. Grund: {e}")
        # take_error_screenshot(driver, "login_error") # Removed for now, will re-add later if needed
        return False
# --- New function to get jobs from dashboard ---
def get_jobs_from_dashboard(driver) -> List[Dict[str, Any]]:
    """Scrape the jobs table from the fotograf.de dashboard.

    Args:
        driver: A logged-in Selenium WebDriver.

    Returns:
        A list of dicts with keys ``id``, ``name``, ``url``, ``status``,
        ``date`` and ``shooting_type``. Rows that cannot be parsed are
        skipped with a warning; an empty list is returned on timeout or
        any other scraping failure.
    """
    print("Navigiere zum Dashboard, um Aufträge abzurufen...")
    dashboard_url = "https://app.fotograf.de/config_dashboard/index"
    driver.get(dashboard_url)
    wait = WebDriverWait(driver, 20)  # Increased timeout for dashboard load
    jobs = []
    try:
        # Wait for the table rows to be present
        job_rows = wait.until(
            EC.presence_of_all_elements_located(
                (By.XPATH, SELECTORS["dashboard_jobs_table_rows"])
            )
        )
        print(f"[{len(job_rows)}] Auftragszeilen auf dem Dashboard gefunden.")
        for row in job_rows:
            try:
                name_element = row.find_element(By.XPATH, SELECTORS["job_row_name_link"])
                job_name = name_element.text.strip()
                job_url = name_element.get_attribute('href')
                # Extract the numeric job ID from the end of the URL.
                # get_attribute('href') can return None; guard it so a
                # missing href does not throw away the whole row.
                job_id = None
                if job_url:
                    job_id_match = re.search(r'/(\d+)$', job_url)
                    if job_id_match:
                        job_id = job_id_match.group(1)
                status_element = row.find_element(By.XPATH, SELECTORS["job_row_status"])
                job_status = status_element.text.strip()
                date_element = row.find_element(By.XPATH, SELECTORS["job_row_date"])
                job_date = date_element.text.strip()
                type_element = row.find_element(By.XPATH, SELECTORS["job_row_shooting_type"])
                shooting_type = type_element.text.strip()
                jobs.append({
                    "id": job_id,
                    "name": job_name,
                    "url": job_url,
                    "status": job_status,
                    "date": job_date,
                    "shooting_type": shooting_type,
                })
            except NoSuchElementException as e:
                print(f"Warnung: Konnte nicht alle Elemente in einer Auftragszeile finden. Fehler: {e}")
            except Exception as e:
                print(f"Ein unerwarteter Fehler beim Parsen einer Auftragszeile: {e}")
    except TimeoutException:
        print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf dem Dashboard gefunden.")
    except Exception as e:
        print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge vom Dashboard: {e}")
    return jobs
@app.get("/health")
async def health_check():
    """Liveness probe used by container orchestration / monitoring."""
    return {"status": "ok"}
@app.get("/api/jobs", response_model=List[Dict[str, Any]])
def get_jobs(account_type: str):
    """Log in with the credentials for *account_type* and return its dashboard jobs.

    Credentials are read from the environment variables
    ``<ACCOUNT_TYPE>_USER`` and ``<ACCOUNT_TYPE>_PW``.

    Declared as a plain ``def`` (not ``async def``): the Selenium calls are
    blocking, and FastAPI executes sync endpoints in its threadpool, so the
    event loop is not stalled for the duration of the scrape.

    Raises:
        HTTPException 400 if the credentials are not configured,
        401 if the login fails, 500 on WebDriver or other server errors.
    """
    username_env_var = f"{account_type.upper()}_USER"
    password_env_var = f"{account_type.upper()}_PW"
    username = os.getenv(username_env_var)
    password = os.getenv(password_env_var)
    if not username or not password:
        raise HTTPException(status_code=400, detail=f"Credentials for {account_type} not found. Please set {username_env_var} and {password_env_var} in your .env file.")
    driver = None
    try:
        driver = setup_driver()
        if not driver:
            raise HTTPException(status_code=500, detail="Failed to initialize WebDriver.")
        if not login(driver, username, password):
            raise HTTPException(status_code=401, detail="Login failed. Please check credentials.")
        jobs = get_jobs_from_dashboard(driver)
        if not jobs:
            print("Keine Aufträge gefunden oder Fehler beim Abrufen vom Dashboard.")
            # Returning an empty list when login succeeded but no jobs were
            # found; callers can distinguish this from the error statuses above.
        return jobs
    except HTTPException:
        # Re-raise HTTP exceptions untouched (bare raise keeps the traceback).
        raise
    except Exception as e:
        print(f"Ein unerwarteter Serverfehler ist aufgetreten: {e}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
    finally:
        # Always release the browser, even when an exception escapes.
        if driver:
            print("Schließe WebDriver.")
            driver.quit()
# Integrate other scraper functions (process_reminder_mode, process_statistics_mode) as new API endpoints later