feat(scraper): job list extraction is working [32788f42]

This commit is contained in:
2026-03-20 17:50:13 +00:00
parent 07b70762ee
commit a5f0d0473d
2 changed files with 117 additions and 24 deletions

View File

@@ -0,0 +1,68 @@
import os
from dotenv import load_dotenv
# We import directly from main to reuse the already configured functions
from main import setup_driver, login, get_jobs_list
def run_debug(account_to_test: str = "kiga"):
    """
    Runs the scraper logic directly for debugging purposes inside the container.

    Loads the credentials for the selected account from the environment
    (after reading the .env file), logs in, fetches the job list and prints
    every job that was found. The WebDriver is always closed at the end,
    whether the run succeeded or not.

    Args:
        account_to_test: Account prefix used to build the names of the
            credential variables ``<PREFIX>_USER`` and ``<PREFIX>_PW``.
            Defaults to "kiga"; pass "schule" to test the other account.

    Returns:
        None. All results are reported on stdout.
    """
    load_dotenv()
    print("--- Starting Standalone Scraper Debug ---")

    # --- Configuration ---
    user_var = f"{account_to_test.upper()}_USER"
    pw_var = f"{account_to_test.upper()}_PW"
    username = os.getenv(user_var)
    password = os.getenv(pw_var)
    if not username or not password:
        print(f"!!! FATAL ERROR: Credentials for {account_to_test} not found in .env file.")
        # Name the variables that were actually looked up, so the hint stays
        # correct when an account other than the default is tested.
        print(f"Please ensure {user_var} and {pw_var} are set correctly.")
        return

    print(f"Attempting to log in with user: {username}")
    driver = None
    try:
        driver = setup_driver()
        if not driver:
            print("!!! FATAL ERROR: WebDriver initialization failed.")
            return

        # Perform the login
        if not login(driver, username, password):
            print("\n❌ LOGIN FAILED.")
            print("Please check credentials in .env and the login selectors in main.py.")
            print("A screenshot of the error might have been saved if the scraper has permission.")
            return

        print("\n✅ LOGIN SUCCESSFUL!")
        print("-----------------------------------------")
        print("Now attempting to fetch jobs from the dashboard...")

        # Fetch the jobs
        jobs = get_jobs_list(driver)
        if not jobs:
            print("\n⚠️ WARNING: Login seemed successful, but no jobs were found on the dashboard.")
            print("This could be due to incorrect page selectors for the job list.")
            return

        print(f"\n✅ SUCCESS: Found {len(jobs)} jobs!")
        for position, job in enumerate(jobs, start=1):
            # .get keeps the listing going when a single field could not be
            # scraped instead of aborting the whole debug run with a KeyError.
            print(f" {position}. Name: {job.get('name', '<missing>')}")
            print(f" Status: {job.get('status', '<missing>')}")
            print(f" Date: {job.get('date', '<missing>')}")
    except Exception as e:
        # Top-level boundary of a debug script: report everything, never crash.
        print(f"\n\n!!! AN UNEXPECTED ERROR OCCURRED: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Runs for every early return above as well, so the browser never leaks.
        if driver:
            print("\n--- Debug script finished. Closing WebDriver. ---")
            driver.quit()


if __name__ == "__main__":
    import sys

    # An optional first command-line argument selects the account to test;
    # without it the previous default ("kiga") is used.
    run_debug(sys.argv[1] if len(sys.argv) > 1 else "kiga")

View File

@@ -37,11 +37,11 @@ SELECTORS = {
"login_user": "#login-email",
"login_pass": "#login-password",
"login_button": "#login-submit",
"dashboard_jobs_table_rows": "//table[contains(@class, 'table-legacy')]/tbody/tr", # Assuming there's a table for jobs
"job_row_name_link": ".//td[contains(@class, 'table-col-jobname')]//a",
"job_row_status": ".//td[contains(@class, 'table-col-status')]//span",
"job_row_date": ".//td[contains(@class, 'table-col-shootingDate')]",
"job_row_shooting_type": ".//td[contains(@class, 'table-col-shootingType')]",
"dashboard_jobs_table_rows": "//tr[.//a[contains(@data-qa-id, 'link:photo-jobs-name-')]]",
"job_row_name_link": ".//a[contains(@data-qa-id, 'link:photo-jobs-name-')]",
"job_row_status": ".//td[count(//th[contains(., 'Status')]/preceding-sibling::th) + 1]", # Try to find by column header 'Status'
"job_row_date": ".//td[count(//th[contains(., 'Datum')]/preceding-sibling::th) + 1]", # Try to find by column header 'Datum'
"job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]", # Try to find by column header 'Typ'
}
# --- Utility functions from original scraper ---
@@ -62,44 +62,63 @@ def setup_driver():
print(f"Fehler bei der Initialisierung des WebDrivers: {e}")
return None
def take_error_screenshot(driver, error_name):
    """Save a screenshot of the current browser page into the 'errors' directory.

    The file is written next to this module as
    ``errors/error_<error_name>_<YYYYmmdd_HHMMSS>.png``. This is a best-effort
    diagnostic helper that is called from error handlers, so it never raises:
    every failure (directory creation, driver error, failed write) is only
    reported on stdout.

    Args:
        driver: WebDriver-like object exposing ``save_screenshot(path)``.
        error_name: Short label embedded in the file name.

    Returns:
        None.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"error_{error_name}_{timestamp}.png"
    try:
        # Directory creation lives inside the try block: a failure here must
        # not escape and mask the error that triggered the screenshot.
        errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
        os.makedirs(errors_dir, exist_ok=True)
        filepath = os.path.join(errors_dir, filename)
        # save_screenshot signals a failed write through its return value
        # rather than an exception, so the result has to be checked.
        if driver.save_screenshot(filepath):
            print(f"!!! Fehler aufgetreten. Screenshot gespeichert unter: {filepath}")
        else:
            print(f"!!! Konnte keinen Screenshot speichern: Schreiben nach {filepath} fehlgeschlagen.")
    except Exception as e:
        print(f"!!! Konnte keinen Screenshot speichern: {e}")
def login(driver, username, password):
# Opens LOGIN_URL, optionally accepts the cookie banner, fills in the login
# form and waits for the redirect to the dashboard URL.
# Returns True once the URL contains '/config_dashboard/index'; any exception
# on the way is printed and turned into a False return value.
# NOTE(review): this span comes from a rendered diff hunk - indentation is lost
# and removed/added lines appear side by side without +/- markers. The notes
# below marking lines as "old" or "new" are inferences; confirm against the raw diff.
print("Starte Login-Vorgang...")
try:
driver.get(LOGIN_URL)
# Two consecutive assignments to `wait`: presumably the 10 s line is the removed
# version and the 45 s line its replacement - TODO confirm.
wait = WebDriverWait(driver, 10)
wait = WebDriverWait(driver, 45) # Generous timeout for the entire process
try:
print("Suche nach Cookie-Banner...")
# Presumably the removed variant: it waits for the banner with the long login timeout.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
# A separate 5 s wait for the optional cookie banner, so that its absence is
# detected quickly instead of consuming the long login timeout.
cookie_wait = WebDriverWait(driver, 5)
cookie_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
print("Cookie-Banner akzeptiert.")
# The duplicated sleep looks like an old/new pair of the same statement - TODO confirm.
time.sleep(1)
time.sleep(1)
except TimeoutException:
# A missing banner is not an error; the login simply continues.
print("Kein Cookie-Banner gefunden, fahre fort.")
print("Fülle Anmeldeformular aus...")
# Only the user field is waited for; the password field and the button are
# looked up directly afterwards.
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, SELECTORS["login_user"]))).send_keys(username)
driver.find_element(By.CSS_SELECTOR, SELECTORS["login_pass"]).send_keys(password)
print("Klicke auf Login...")
driver.find_element(By.CSS_SELECTOR, SELECTORS["login_button"]).click()
# The next two prints look like an old/new pair of the same status message - TODO confirm.
print("Warte auf die nächste Seite...")
print("Warte auf Bestätigung des Logins durch URL-Wechsel...")
# This is a faster and more reliable check for a successful login redirect
wait.until(EC.url_contains('/config_dashboard/index'))
# Likewise presumably an old/new pair of the success message - TODO confirm.
print("Login erfolgreich!")
print("Login erfolgreich! Session ist aktiv.")
return True
except Exception as e:
# Catches every failure above, including a timeout of the URL check.
print(f"Login fehlgeschlagen. Grund: {e}")
# take_error_screenshot(driver, "login_error") # Removed for now, will re-add later if needed
# The active call below is presumably the re-enabled replacement of the
# commented-out line above - TODO confirm.
take_error_screenshot(driver, "login_error")
return False
# --- New function to get jobs from dashboard ---
def get_jobs_from_dashboard(driver) -> List[Dict[str, Any]]:
print("Navigiere zum Dashboard, um Aufträge abzurufen...")
dashboard_url = "https://app.fotograf.de/config_dashboard/index"
driver.get(dashboard_url)
wait = WebDriverWait(driver, 20) # Increased timeout for dashboard load
# --- New function to get jobs from the specific jobs list page ---
def get_jobs_list(driver) -> List[Dict[str, Any]]:
print("Navigiere direkt zur Auftragsliste, um Aufträge abzurufen...")
jobs_list_url = "https://app.fotograf.de/config_jobs/index"
driver.get(jobs_list_url)
wait = WebDriverWait(driver, 45) # Use the generous timeout here
jobs = []
try:
# Wait for the table rows to be present
job_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["dashboard_jobs_table_rows"])))
print(f"[{len(job_rows)}] Auftragszeilen auf dem Dashboard gefunden.")
print(f"[{len(job_rows)}] Auftragszeilen auf der Auftragsseite gefunden.")
for row in job_rows:
try:
@@ -134,9 +153,17 @@ def get_jobs_from_dashboard(driver) -> List[Dict[str, Any]]:
print(f"Ein unerwarteter Fehler beim Parsen einer Auftragszeile: {e}")
except TimeoutException:
print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf dem Dashboard gefunden.")
print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf der Auftragsseite gefunden.")
take_error_screenshot(driver, "get_jobs_list_error")
# Save the HTML source for debugging selectors
errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
os.makedirs(errors_dir, exist_ok=True)
with open(os.path.join(errors_dir, 'page_source.html'), 'w', encoding='utf-8') as f:
f.write(driver.page_source)
print("HTML-Quellcode der Seite wurde in 'errors/page_source.html' gespeichert.")
except Exception as e:
print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge vom Dashboard: {e}")
print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge von der Auftragsseite: {e}")
take_error_screenshot(driver, "get_jobs_list_error")
return jobs
@@ -164,11 +191,9 @@ async def get_jobs(account_type: str):
if not login(driver, username, password):
raise HTTPException(status_code=401, detail="Login failed. Please check credentials.")
jobs = get_jobs_from_dashboard(driver)
jobs = get_jobs_list(driver) # Call the new function
if not jobs:
print("Keine Aufträge gefunden oder Fehler beim Abrufen vom Dashboard.")
# Depending on desired behavior, might raise HTTPException or return empty list
# For now, returning empty list if no jobs found but login was successful.
print("Keine Aufträge gefunden oder Fehler beim Abrufen.")
return jobs