# Commentary only: this text sits ahead of the first "diff --git" header and is
# not part of any hunk.
#
# Summary of the patch
#   * Adds backend/debug_scraper.py: a standalone script that loads credentials
#     from the environment, logs in via main.login() and prints the jobs
#     returned by main.get_jobs_list().
#   * backend/main.py:
#       - SELECTORS: job rows/cells are now located through data-qa-id links
#         and header-relative column positions instead of CSS class names.
#       - New helper take_error_screenshot(); login() now calls it on failure.
#       - login(): 45 s overall wait, separate 5 s wait for the cookie banner.
#       - get_jobs_from_dashboard() is renamed to get_jobs_list() and loads
#         /config_jobs/index; on timeout it saves a screenshot and page source.
#
# Review notes (not applied here, so that blob ids and hunk counts stay valid)
#   * get_jobs_list(): the page-source dump inside "except TimeoutException"
#     is unguarded. An OSError raised there is NOT handled by the sibling
#     "except Exception" clause, so the function would raise instead of
#     returning the (empty) jobs list. Consider wrapping the dump in try/except.
#   * SELECTORS: "count(//th[contains(., 'Status')]/preceding-sibling::th) + 1"
#     evaluates to 1 when no header matches, silently selecting the first <td>;
#     "//th" is document-wide and contains() is a substring match, so a second
#     table or a header merely containing 'Typ'/'Datum' can shift the index.
#   * take_error_screenshot() uses os and datetime; their imports are outside
#     the visible hunks -- TODO confirm main.py imports both.
#   * debug_scraper.py prints the login user name; fine for a debug tool, but
#     avoid sharing its output unredacted.
#
# Restoration notes
#   * The source text had been collapsed onto a few long lines. Line breaks,
#     indentation and blank context lines were reconstructed so that every
#     hunk matches its declared line counts; runs of whitespace inside string
#     literals and before inline comments could not be recovered and are kept
#     as single spaces, exactly as they appeared.
#   * The "time.sleep(1)" removal/addition pair differed only in whitespace
#     that is no longer recoverable; both sides are shown identically.
diff --git a/fotograf-de-scraper/backend/debug_scraper.py b/fotograf-de-scraper/backend/debug_scraper.py
new file mode 100644
index 00000000..24142acf
--- /dev/null
+++ b/fotograf-de-scraper/backend/debug_scraper.py
@@ -0,0 +1,68 @@
+import os
+from dotenv import load_dotenv
+# We import directly from main to reuse the already configured functions
+from main import setup_driver, login, get_jobs_list
+
+def run_debug():
+    """
+    Runs the scraper logic directly for debugging purposes inside the container.
+    """
+    load_dotenv()
+    print("--- Starting Standalone Scraper Debug ---")
+
+    # --- Configuration ---
+    # Change this to 'schule' to test the other account
+    ACCOUNT_TO_TEST = "kiga"
+
+    username = os.getenv(f"{ACCOUNT_TO_TEST.upper()}_USER")
+    password = os.getenv(f"{ACCOUNT_TO_TEST.upper()}_PW")
+
+    if not username or not password:
+        print(f"!!! FATAL ERROR: Credentials for {ACCOUNT_TO_TEST} not found in .env file.")
+        print("Please ensure KIGA_USER, KIGA_PW, etc. are set correctly.")
+        return
+
+    print(f"Attempting to log in with user: {username}")
+
+    driver = None
+    try:
+        driver = setup_driver()
+        if not driver:
+            print("!!! FATAL ERROR: WebDriver initialization failed.")
+            return
+
+        # Perform the login
+        if login(driver, username, password):
+            print("\n✅ LOGIN SUCCESSFUL!")
+            print("-----------------------------------------")
+            print("Now attempting to fetch jobs from the dashboard...")
+
+            # Fetch the jobs
+            jobs = get_jobs_list(driver)
+
+            if jobs:
+                print(f"\n✅ SUCCESS: Found {len(jobs)} jobs!")
+                for i, job in enumerate(jobs):
+                    print(f" {i+1}. Name: {job['name']}")
+                    print(f" Status: {job['status']}")
+                    print(f" Date: {job['date']}")
+            else:
+                print("\n⚠️ WARNING: Login seemed successful, but no jobs were found on the dashboard.")
+                print("This could be due to incorrect page selectors for the job list.")
+
+        else:
+            print("\n❌ LOGIN FAILED.")
+            print("Please check credentials in .env and the login selectors in main.py.")
+            print("A screenshot of the error might have been saved if the scraper has permission.")
+
+    except Exception as e:
+        print(f"\n\n!!! AN UNEXPECTED ERROR OCCURRED: {e}")
+        import traceback
+        traceback.print_exc()
+    finally:
+        if driver:
+            print("\n--- Debug script finished. Closing WebDriver. ---")
+            driver.quit()
+
+if __name__ == "__main__":
+    run_debug()
diff --git a/fotograf-de-scraper/backend/main.py b/fotograf-de-scraper/backend/main.py
index eaf3e7da..87a64977 100644
--- a/fotograf-de-scraper/backend/main.py
+++ b/fotograf-de-scraper/backend/main.py
@@ -37,11 +37,11 @@ SELECTORS = {
     "login_user": "#login-email",
     "login_pass": "#login-password",
     "login_button": "#login-submit",
-    "dashboard_jobs_table_rows": "//table[contains(@class, 'table-legacy')]/tbody/tr", # Assuming there's a table for jobs
-    "job_row_name_link": ".//td[contains(@class, 'table-col-jobname')]//a",
-    "job_row_status": ".//td[contains(@class, 'table-col-status')]//span",
-    "job_row_date": ".//td[contains(@class, 'table-col-shootingDate')]",
-    "job_row_shooting_type": ".//td[contains(@class, 'table-col-shootingType')]",
+    "dashboard_jobs_table_rows": "//tr[.//a[contains(@data-qa-id, 'link:photo-jobs-name-')]]",
+    "job_row_name_link": ".//a[contains(@data-qa-id, 'link:photo-jobs-name-')]",
+    "job_row_status": ".//td[count(//th[contains(., 'Status')]/preceding-sibling::th) + 1]", # Try to find by column header 'Status'
+    "job_row_date": ".//td[count(//th[contains(., 'Datum')]/preceding-sibling::th) + 1]", # Try to find by column header 'Datum'
+    "job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]", # Try to find by column header 'Typ'
 }
 
 # --- Utility functions from original scraper ---
@@ -62,44 +62,63 @@ def setup_driver():
         print(f"Fehler bei der Initialisierung des WebDrivers: {e}")
         return None
 
+def take_error_screenshot(driver, error_name):
+    # Ensure the errors directory exists
+    errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
+    os.makedirs(errors_dir, exist_ok=True)
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    filename = f"error_{error_name}_{timestamp}.png"
+    filepath = os.path.join(errors_dir, filename)
+    try:
+        driver.save_screenshot(filepath)
+        print(f"!!! Fehler aufgetreten. Screenshot gespeichert unter: {filepath}")
+    except Exception as e:
+        print(f"!!! Konnte keinen Screenshot speichern: {e}")
+
 def login(driver, username, password):
     print("Starte Login-Vorgang...")
     try:
         driver.get(LOGIN_URL)
-        wait = WebDriverWait(driver, 10)
+        wait = WebDriverWait(driver, 45) # Generous timeout for the entire process
         try:
             print("Suche nach Cookie-Banner...")
-            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
+            cookie_wait = WebDriverWait(driver, 5)
+            cookie_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
             print("Cookie-Banner akzeptiert.")
-            time.sleep(1)
+            time.sleep(1)
         except TimeoutException:
             print("Kein Cookie-Banner gefunden, fahre fort.")
+
         print("Fülle Anmeldeformular aus...")
         wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, SELECTORS["login_user"]))).send_keys(username)
         driver.find_element(By.CSS_SELECTOR, SELECTORS["login_pass"]).send_keys(password)
         print("Klicke auf Login...")
         driver.find_element(By.CSS_SELECTOR, SELECTORS["login_button"]).click()
-        print("Warte auf die nächste Seite...")
+
+        print("Warte auf Bestätigung des Logins durch URL-Wechsel...")
+        # This is a faster and more reliable check for a successful login redirect
         wait.until(EC.url_contains('/config_dashboard/index'))
-        print("Login erfolgreich!")
+
+        print("Login erfolgreich! Session ist aktiv.")
         return True
     except Exception as e:
         print(f"Login fehlgeschlagen. Grund: {e}")
-        # take_error_screenshot(driver, "login_error") # Removed for now, will re-add later if needed
+        take_error_screenshot(driver, "login_error")
         return False
 
-# --- New function to get jobs from dashboard ---
-def get_jobs_from_dashboard(driver) -> List[Dict[str, Any]]:
-    print("Navigiere zum Dashboard, um Aufträge abzurufen...")
-    dashboard_url = "https://app.fotograf.de/config_dashboard/index"
-    driver.get(dashboard_url)
-    wait = WebDriverWait(driver, 20) # Increased timeout for dashboard load
+# --- New function to get jobs from the specific jobs list page ---
+def get_jobs_list(driver) -> List[Dict[str, Any]]:
+    print("Navigiere direkt zur Auftragsliste, um Aufträge abzurufen...")
+    jobs_list_url = "https://app.fotograf.de/config_jobs/index"
+    driver.get(jobs_list_url)
+    wait = WebDriverWait(driver, 45) # Use the generous timeout here
 
     jobs = []
     try:
         # Wait for the table rows to be present
         job_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["dashboard_jobs_table_rows"])))
-        print(f"[{len(job_rows)}] Auftragszeilen auf dem Dashboard gefunden.")
+        print(f"[{len(job_rows)}] Auftragszeilen auf der Auftragsseite gefunden.")
 
         for row in job_rows:
             try:
@@ -134,9 +153,17 @@ def get_jobs_from_dashboard(driver) -> List[Dict[str, Any]]:
                 print(f"Ein unerwarteter Fehler beim Parsen einer Auftragszeile: {e}")
 
     except TimeoutException:
-        print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf dem Dashboard gefunden.")
+        print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf der Auftragsseite gefunden.")
+        take_error_screenshot(driver, "get_jobs_list_error")
+        # Save the HTML source for debugging selectors
+        errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
+        os.makedirs(errors_dir, exist_ok=True)
+        with open(os.path.join(errors_dir, 'page_source.html'), 'w', encoding='utf-8') as f:
+            f.write(driver.page_source)
+        print("HTML-Quellcode der Seite wurde in 'errors/page_source.html' gespeichert.")
     except Exception as e:
-        print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge vom Dashboard: {e}")
+        print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge von der Auftragsseite: {e}")
+        take_error_screenshot(driver, "get_jobs_list_error")
 
     return jobs
 
@@ -164,11 +191,9 @@ async def get_jobs(account_type: str):
         if not login(driver, username, password):
             raise HTTPException(status_code=401, detail="Login failed. Please check credentials.")
 
-        jobs = get_jobs_from_dashboard(driver)
+        jobs = get_jobs_list(driver) # Call the new function
         if not jobs:
-            print("Keine Aufträge gefunden oder Fehler beim Abrufen vom Dashboard.")
-            # Depending on desired behavior, might raise HTTPException or return empty list
-            # For now, returning empty list if no jobs found but login was successful.
+            print("Keine Aufträge gefunden oder Fehler beim Abrufen.")
 
         return jobs
 