feat(scraper): job list extraction is working [32788f42]
This commit is contained in:
68
fotograf-de-scraper/backend/debug_scraper.py
Normal file
68
fotograf-de-scraper/backend/debug_scraper.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
# We import directly from main to reuse the already configured functions
|
||||
from main import setup_driver, login, get_jobs_list
|
||||
|
||||
def run_debug(account_to_test="kiga"):
    """
    Runs the scraper logic directly for debugging purposes inside the container.

    Loads the credentials for the selected account from the environment
    (populated from .env), logs in, fetches the job list and prints what was
    found. All outcomes are reported on stdout; nothing is returned.

    Args:
        account_to_test: Account prefix used to build the environment variable
            names ``<PREFIX>_USER`` and ``<PREFIX>_PW``. Defaults to "kiga";
            pass "schule" to test the other account.
    """
    load_dotenv()
    print("--- Starting Standalone Scraper Debug ---")

    # --- Configuration ---
    # Build the variable names once so the lookup and the error hint below
    # can never disagree about which variables are expected.
    env_prefix = account_to_test.upper()
    user_var = f"{env_prefix}_USER"
    pw_var = f"{env_prefix}_PW"

    username = os.getenv(user_var)
    password = os.getenv(pw_var)

    if not username or not password:
        print(f"!!! FATAL ERROR: Credentials for {account_to_test} not found in .env file.")
        print(f"Please ensure {user_var} and {pw_var} are set correctly.")
        return

    print(f"Attempting to log in with user: {username}")

    driver = None
    try:
        driver = setup_driver()
        if not driver:
            print("!!! FATAL ERROR: WebDriver initialization failed.")
            return

        # Perform the login
        if login(driver, username, password):
            print("\n✅ LOGIN SUCCESSFUL!")
            print("-----------------------------------------")
            print("Now attempting to fetch jobs from the dashboard...")

            # Fetch the jobs
            jobs = get_jobs_list(driver)

            if jobs:
                print(f"\n✅ SUCCESS: Found {len(jobs)} jobs!")
                for position, job in enumerate(jobs, start=1):
                    print(f" {position}. Name: {job['name']}")
                    print(f" Status: {job['status']}")
                    print(f" Date: {job['date']}")
            else:
                print("\n⚠️ WARNING: Login seemed successful, but no jobs were found on the dashboard.")
                print("This could be due to incorrect page selectors for the job list.")
        else:
            print("\n❌ LOGIN FAILED.")
            print("Please check credentials in .env and the login selectors in main.py.")
            print("A screenshot of the error might have been saved if the scraper has permission.")

    except Exception as e:
        # Top-level boundary of a debug script: report everything, including
        # the full traceback, instead of letting the process die silently.
        print(f"\n\n!!! AN UNEXPECTED ERROR OCCURRED: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Always release the browser, even after an early return or an error.
        if driver:
            print("\n--- Debug script finished. Closing WebDriver. ---")
            driver.quit()
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_debug()
|
||||
@@ -37,11 +37,11 @@ SELECTORS = {
|
||||
"login_user": "#login-email",
|
||||
"login_pass": "#login-password",
|
||||
"login_button": "#login-submit",
|
||||
"dashboard_jobs_table_rows": "//table[contains(@class, 'table-legacy')]/tbody/tr", # Assuming there's a table for jobs
|
||||
"job_row_name_link": ".//td[contains(@class, 'table-col-jobname')]//a",
|
||||
"job_row_status": ".//td[contains(@class, 'table-col-status')]//span",
|
||||
"job_row_date": ".//td[contains(@class, 'table-col-shootingDate')]",
|
||||
"job_row_shooting_type": ".//td[contains(@class, 'table-col-shootingType')]",
|
||||
"dashboard_jobs_table_rows": "//tr[.//a[contains(@data-qa-id, 'link:photo-jobs-name-')]]",
|
||||
"job_row_name_link": ".//a[contains(@data-qa-id, 'link:photo-jobs-name-')]",
|
||||
"job_row_status": ".//td[count(//th[contains(., 'Status')]/preceding-sibling::th) + 1]", # Try to find by column header 'Status'
|
||||
"job_row_date": ".//td[count(//th[contains(., 'Datum')]/preceding-sibling::th) + 1]", # Try to find by column header 'Datum'
|
||||
"job_row_shooting_type": ".//td[count(//th[contains(., 'Typ')]/preceding-sibling::th) + 1]", # Try to find by column header 'Typ'
|
||||
}
|
||||
|
||||
# --- Utility functions from original scraper ---
|
||||
@@ -62,44 +62,63 @@ def setup_driver():
|
||||
print(f"Fehler bei der Initialisierung des WebDrivers: {e}")
|
||||
return None
|
||||
|
||||
def take_error_screenshot(driver, error_name):
    """Save a timestamped screenshot of the current browser state for debugging.

    The image is written to an ``errors`` directory next to this file as
    ``error_<error_name>_<YYYYmmdd_HHMMSS>.png``. This is a best-effort
    helper: every failure is printed and never raised, so calling it from an
    exception handler cannot mask the error that is being handled.

    Args:
        driver: Object exposing ``save_screenshot(path)``.
        error_name: Short label embedded in the file name.
    """
    errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(errors_dir, f"error_{error_name}_{timestamp}.png")

    try:
        # Ensure the errors directory exists. Done inside the try so that a
        # permission or disk problem is reported like any other screenshot
        # failure instead of propagating to the caller.
        os.makedirs(errors_dir, exist_ok=True)

        # An explicit False return signals that the file could not be
        # written; do not claim success in that case.
        if driver.save_screenshot(filepath) is False:
            print(f"!!! Konnte keinen Screenshot speichern: Schreiben nach {filepath} fehlgeschlagen")
            return

        print(f"!!! Fehler aufgetreten. Screenshot gespeichert unter: {filepath}")
    except Exception as e:
        print(f"!!! Konnte keinen Screenshot speichern: {e}")
|
||||
|
||||
def login(driver, username, password):
|
||||
print("Starte Login-Vorgang...")
|
||||
try:
|
||||
driver.get(LOGIN_URL)
|
||||
wait = WebDriverWait(driver, 10)
|
||||
wait = WebDriverWait(driver, 45) # Generous timeout for the entire process
|
||||
try:
|
||||
print("Suche nach Cookie-Banner...")
|
||||
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
|
||||
cookie_wait = WebDriverWait(driver, 5)
|
||||
cookie_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["cookie_accept_button"]))).click()
|
||||
print("Cookie-Banner akzeptiert.")
|
||||
time.sleep(1)
|
||||
time.sleep(1)
|
||||
except TimeoutException:
|
||||
print("Kein Cookie-Banner gefunden, fahre fort.")
|
||||
|
||||
print("Fülle Anmeldeformular aus...")
|
||||
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, SELECTORS["login_user"]))).send_keys(username)
|
||||
driver.find_element(By.CSS_SELECTOR, SELECTORS["login_pass"]).send_keys(password)
|
||||
print("Klicke auf Login...")
|
||||
driver.find_element(By.CSS_SELECTOR, SELECTORS["login_button"]).click()
|
||||
print("Warte auf die nächste Seite...")
|
||||
|
||||
print("Warte auf Bestätigung des Logins durch URL-Wechsel...")
|
||||
# This is a faster and more reliable check for a successful login redirect
|
||||
wait.until(EC.url_contains('/config_dashboard/index'))
|
||||
print("Login erfolgreich!")
|
||||
|
||||
print("Login erfolgreich! Session ist aktiv.")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Login fehlgeschlagen. Grund: {e}")
|
||||
# take_error_screenshot(driver, "login_error") # Removed for now, will re-add later if needed
|
||||
take_error_screenshot(driver, "login_error")
|
||||
return False
|
||||
|
||||
# --- New function to get jobs from dashboard ---
|
||||
def get_jobs_from_dashboard(driver) -> List[Dict[str, Any]]:
|
||||
print("Navigiere zum Dashboard, um Aufträge abzurufen...")
|
||||
dashboard_url = "https://app.fotograf.de/config_dashboard/index"
|
||||
driver.get(dashboard_url)
|
||||
wait = WebDriverWait(driver, 20) # Increased timeout for dashboard load
|
||||
# --- New function to get jobs from the specific jobs list page ---
|
||||
def get_jobs_list(driver) -> List[Dict[str, Any]]:
|
||||
print("Navigiere direkt zur Auftragsliste, um Aufträge abzurufen...")
|
||||
jobs_list_url = "https://app.fotograf.de/config_jobs/index"
|
||||
driver.get(jobs_list_url)
|
||||
wait = WebDriverWait(driver, 45) # Use the generous timeout here
|
||||
|
||||
jobs = []
|
||||
try:
|
||||
# Wait for the table rows to be present
|
||||
job_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["dashboard_jobs_table_rows"])))
|
||||
print(f"[{len(job_rows)}] Auftragszeilen auf dem Dashboard gefunden.")
|
||||
print(f"[{len(job_rows)}] Auftragszeilen auf der Auftragsseite gefunden.")
|
||||
|
||||
for row in job_rows:
|
||||
try:
|
||||
@@ -134,9 +153,17 @@ def get_jobs_from_dashboard(driver) -> List[Dict[str, Any]]:
|
||||
print(f"Ein unerwarteter Fehler beim Parsen einer Auftragszeile: {e}")
|
||||
|
||||
except TimeoutException:
|
||||
print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf dem Dashboard gefunden.")
|
||||
print("Timeout: Keine Auftrags-Tabelle oder -Zeilen auf der Auftragsseite gefunden.")
|
||||
take_error_screenshot(driver, "get_jobs_list_error")
|
||||
# Save the HTML source for debugging selectors
|
||||
errors_dir = os.path.join(os.path.dirname(__file__), 'errors')
|
||||
os.makedirs(errors_dir, exist_ok=True)
|
||||
with open(os.path.join(errors_dir, 'page_source.html'), 'w', encoding='utf-8') as f:
|
||||
f.write(driver.page_source)
|
||||
print("HTML-Quellcode der Seite wurde in 'errors/page_source.html' gespeichert.")
|
||||
except Exception as e:
|
||||
print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge vom Dashboard: {e}")
|
||||
print(f"Ein Fehler ist aufgetreten beim Abrufen der Aufträge von der Auftragsseite: {e}")
|
||||
take_error_screenshot(driver, "get_jobs_list_error")
|
||||
|
||||
return jobs
|
||||
|
||||
@@ -164,11 +191,9 @@ async def get_jobs(account_type: str):
|
||||
if not login(driver, username, password):
|
||||
raise HTTPException(status_code=401, detail="Login failed. Please check credentials.")
|
||||
|
||||
jobs = get_jobs_from_dashboard(driver)
|
||||
jobs = get_jobs_list(driver) # Call the new function
|
||||
if not jobs:
|
||||
print("Keine Aufträge gefunden oder Fehler beim Abrufen vom Dashboard.")
|
||||
# Depending on desired behavior, might raise HTTPException or return empty list
|
||||
# For now, returning empty list if no jobs found but login was successful.
|
||||
print("Keine Aufträge gefunden oder Fehler beim Abrufen.")
|
||||
|
||||
return jobs
|
||||
|
||||
|
||||
Reference in New Issue
Block a user