feat(scraper): job list extraction is working [32788f42]

2026-03-20 17:50:13 +00:00
parent 07b70762ee
commit a5f0d0473d
2 changed files with 117 additions and 24 deletions
--- a/fotograf-de-scraper/backend/debug_scraper.py
+++ b/fotograf-de-scraper/backend/debug_scraper.py
@@ -0,0 +1,68 @@
+import os
+from dotenv import load_dotenv
+# We import directly from main to reuse the already configured functions
+from main import setup_driver, login, get_jobs_list
+
+def run_debug():
+    """Run login and job-list scraping once for a single account and print the outcome.
+
+    Credentials are read from the environment (after load_dotenv()) as
+    <ACCOUNT>_USER / <ACCOUNT>_PW. Nothing is returned; results and errors
+    are printed, and the WebDriver is quit on exit if one was created.
+    """
+    load_dotenv()
+    print("--- Starting Standalone Scraper Debug ---")
+
+    # Change this to 'schule' to test the other account
+    ACCOUNT_TO_TEST = "kiga"
+    user_var = f"{ACCOUNT_TO_TEST.upper()}_USER"
+    pw_var = f"{ACCOUNT_TO_TEST.upper()}_PW"
+    username = os.getenv(user_var)
+    password = os.getenv(pw_var)
+
+    if not username or not password:
+        print(f"!!! FATAL ERROR: Credentials for {ACCOUNT_TO_TEST} not found in .env file.")
+        # Name the variables that were actually looked up, not a hard-coded KIGA_* hint.
+        print(f"Please ensure {user_var} and {pw_var} are set correctly.")
+        return
+
+    print(f"Attempting to log in with user: {username}")
+
+    driver = None
+    try:
+        driver = setup_driver()
+        if not driver:
+            print("!!! FATAL ERROR: WebDriver initialization failed.")
+            return
+
+        if login(driver, username, password):
+            print("\n✅ LOGIN SUCCESSFUL!")
+            print("-----------------------------------------")
+            print("Now attempting to fetch jobs from the dashboard...")
+
+            jobs = get_jobs_list(driver)
+            if jobs:
+                print(f"\n✅ SUCCESS: Found {len(jobs)} jobs!")
+                for position, job in enumerate(jobs, start=1):
+                    print(f"  {position}. Name: {job['name']}")
+                    print(f"     Status: {job['status']}")
+                    print(f"     Date: {job['date']}")
+            else:
+                print("\n⚠️ WARNING: Login seemed successful, but no jobs were found on the dashboard.")
+                print("This could be due to incorrect page selectors for the job list.")
+        else:
+            print("\n❌ LOGIN FAILED.")
+            print("Please check credentials in .env and the login selectors in main.py.")
+            print("A screenshot of the error might have been saved if the scraper has permission.")
+    except Exception as e:
+        # Top-level boundary of a debug script: report any failure together with its traceback.
+        print(f"\n\n!!! AN UNEXPECTED ERROR OCCURRED: {e}")
+        import traceback
+        traceback.print_exc()
+    finally:
+        if driver:
+            print("\n--- Debug script finished. Closing WebDriver. ---")
+            driver.quit()
+
+if __name__ == "__main__":
+    run_debug()
--- a/fotograf-de-scraper/backend/main.py
+++ b/fotograf-de-scraper/backend/main.py
@@ -37,11 +37,11 @@ SELECTORS = {
    "login_user": "#login-email",
    "login_pass": "#login-password",
    "login_button": "#login-submit",
-    "dashboard_jobs_table_rows": "//table[contains(@class, 'table-legacy')]/tbody/tr", # Assuming there's a table for jobs
-    "job_row_name_link": ".//td[contains(@class, 'table-col-jobname')]//a",
-    "job_row_status": ".//td[contains(@class, 'table-col-status')]//span",
-    "job_row_date": ".//td[contains(@class, 'table-col-shootingDate')]",
-    "job_row_shooting_type": ".//td[contains(@class, 'table-col-shootingType')]",
+    "dashboard_jobs_table_rows": "//tr[.//a[contains(@data-qa-id, 'link:photo-jobs-name-')]]", 
+    "job_row_name_link": ".//a[contains(@data-qa-id, 'link:photo-jobs-name-')]",
+    "job_row_status": ".//td[count(//th[contains(., 'Status')]/preceding-sibling::th) + 1]", # Try to find by column header 'Status'
+    "job_row_date": ".//td[count(//th[contains(., 'Datum')]/preceding-sibling::th) + 1]", # Try to find by column header 'Datum'
+    "job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]", # Try to find by column header 'Typ'
 }

 # --- Utility functions from original scraper ---
@@ -62,44 +62,63 @@ def setup_driver():
        print(f"Fehler bei der Initialisierung des WebDrivers: {e}")
        return None

+def take_error_screenshot(driver, error_name):
+    """Save a timestamped screenshot in the 'errors' directory next to this file.
+
+    Best-effort: any failure is printed instead of raised (callers are error handlers)."""
+    try:
+        errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
+        os.makedirs(errors_dir, exist_ok=True)
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filepath = os.path.join(errors_dir, f"error_{error_name}_{timestamp}.png")
+        driver.save_screenshot(filepath)
+        print(f"!!! Fehler aufgetreten. Screenshot gespeichert unter: {filepath}")
+    except Exception as e:
+        print(f"!!! Konnte keinen Screenshot speichern: {e}")
+
 def login(driver, username, password):
    print("Starte Login-Vorgang...")
    try:
        driver.get(LOGIN_URL)
-        wait = WebDriverWait(driver, 10)
+        wait = WebDriverWait(driver, 45) # Generous timeout; it applies to each wait.until() call separately, not to the login as a whole
        try:
            print("Suche nach Cookie-Banner...")
-            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
+            cookie_wait = WebDriverWait(driver, 5) # Short wait so an absent cookie banner does not cost the full 45 s
+            cookie_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
            print("Cookie-Banner akzeptiert.")
-            time.sleep(1) 
+            time.sleep(1)
        except TimeoutException:
            print("Kein Cookie-Banner gefunden, fahre fort.")
+
        print("Fülle Anmeldeformular aus...")
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, SELECTORS["login_user"]))).send_keys(username)
        driver.find_element(By.CSS_SELECTOR, SELECTORS["login_pass"]).send_keys(password)
        print("Klicke auf Login...")
        driver.find_element(By.CSS_SELECTOR, SELECTORS["login_button"]).click()
-        print("Warte auf die nächste Seite...")
+
+        print("Warte auf Bestätigung des Logins durch URL-Wechsel...")
+        # Login counts as successful once the browser URL contains the dashboard path
        wait.until(EC.url_contains('/config_dashboard/index'))
-        print("Login erfolgreich!")
+
+        print("Login erfolgreich! Session ist aktiv.")
        return True
    except Exception as e:
        print(f"Login fehlgeschlagen. Grund: {e}")
-        # take_error_screenshot(driver, "login_error") # Removed for now, will re-add later if needed
+        take_error_screenshot(driver, "login_error")
        return False

-# --- New function to get jobs from dashboard ---
-def get_jobs_from_dashboard(driver) -> List[Dict[str, Any]]:
-    print("Navigiere zum Dashboard, um Aufträge abzurufen...")
-    dashboard_url = "https://app.fotograf.de/config_dashboard/index"
-    driver.get(dashboard_url)
-    wait = WebDriverWait(driver, 20) # Increased timeout for dashboard load
+# --- New function to get jobs from the specific jobs list page ---
+def get_jobs_list(driver) -> List[Dict[str, Any]]:
+    print("Navigiere direkt zur Auftragsliste, um Aufträge abzurufen...")
+    jobs_list_url = "https://app.fotograf.de/config_jobs/index"
+    driver.get(jobs_list_url)
+    wait = WebDriverWait(driver, 45) # Use the generous timeout here

    jobs = []
    try:
        # Wait for the table rows to be present
        job_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["dashboard_jobs_table_rows"])))
-        print(f"[{len(job_rows)}] Auftragszeilen auf dem Dashboard gefunden.")
+        print(f"[{len(job_rows)}] Auftragszeilen auf der Auftragsseite gefunden.")

        for row in job_rows:
            try:
@@ -134,9 +153,17 @@ def get_jobs_from_dashboard(driver) -> List[Dict[str, Any]]:
                print(f"Ein unerwarteter Fehler beim Parsen einer Auftragszeile: {e}")

    except TimeoutException:
-        print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf dem Dashboard gefunden.")
+        print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf der Auftragsseite gefunden.")
+        take_error_screenshot(driver, "get_jobs_list_error")
+        # Save the HTML source for debugging selectors
+        errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
+        os.makedirs(errors_dir, exist_ok=True)
+        with open(os.path.join(errors_dir, 'page_source.html'), 'w', encoding='utf-8') as f:
+            f.write(driver.page_source)
+            print("HTML-Quellcode der Seite wurde in 'errors/page_source.html' gespeichert.")
    except Exception as e:
-        print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge vom Dashboard: {e}")
+        print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge von der Auftragsseite: {e}")
+        take_error_screenshot(driver, "get_jobs_list_error")
    
    return jobs

@@ -164,11 +191,9 @@ async def get_jobs(account_type: str):
        if not login(driver, username, password):
            raise HTTPException(status_code=401, detail="Login failed. Please check credentials.")
        
-        jobs = get_jobs_from_dashboard(driver)
+        jobs = get_jobs_list(driver) # Call the new function
        if not jobs:
-            print("Keine Aufträge gefunden oder Fehler beim Abrufen vom Dashboard.")
-            # Depending on desired behavior, might raise HTTPException or return empty list
-            # For now, returning empty list if no jobs found but login was successful.
+            print("Keine Aufträge gefunden oder Fehler beim Abrufen.")
        
        return jobs