diff --git a/scrape_fotograf.py b/scrape_fotograf.py index 29a571a7..7b7ef3d8 100644 --- a/scrape_fotograf.py +++ b/scrape_fotograf.py @@ -2,6 +2,7 @@ import json import os import time import csv +import math from datetime import datetime from selenium import webdriver from selenium.webdriver.chrome.options import Options @@ -16,7 +17,7 @@ OUTPUT_DIR = 'output' OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'nutzer_ohne_logins.csv') LOGIN_URL = 'https://app.fotograf.de/login/login' -# --- Selektoren --- +# --- Selektoren (FINALE, VOLLSTÄNDIGE VERSION) --- SELECTORS = { "cookie_accept_button": "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll", "login_user": "#login-email", @@ -25,6 +26,8 @@ SELECTORS = { "job_name": "h1", "album_overview_rows": "//table/tbody/tr", "album_overview_link": ".//td[2]//a", + # NEU: Selector für die Gesamtzahl der Zugangscodes + "access_code_count": "//span[text()='Zugangscodes']/following-sibling::strong", "person_rows": "//div[contains(@class, 'border-legacy-silver-550') and .//span[text()='Logins']]", "person_vorname": ".//span[text()='Vorname']/following-sibling::strong", "person_logins": ".//span[text()='Logins']/following-sibling::strong", @@ -100,7 +103,7 @@ def process_full_job(driver, job_url): try: driver.get(job_url) except InvalidArgumentException: - print(f"!!! FEHLER: Die URL '{job_url}' wurde von Selenium als ungültig angesehen. Bitte prüfen Sie die Eingabe.") + print(f"!!! FEHLER: Die URL '{job_url}' wurde von Selenium als ungültig angesehen.") return [] try: @@ -137,59 +140,69 @@ def process_full_job(driver, job_url): print(f"\n--- Betrete Album: {album['name']} ---") driver.get(album['url']) try: - num_persons = len(wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["person_rows"])))) - print(f"{num_persons} Personen in diesem Album gefunden.") + # NEU: Pagination-Logik + total_codes_text = wait.until(EC.visibility_of_element_located((By.XPATH, SELECTORS["access_code_count"]))).text + num_pages = math.ceil(int(total_codes_text) / 20) + print(f"Album hat {total_codes_text} Zugangscodes auf {num_pages} Seite(n).") - for i in range(num_persons): - person_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["person_rows"]))) - person_row = person_rows[i] + for page_num in range(1, num_pages + 1): + current_page_url = album['url'] + if page_num > 1: + current_page_url += f"?page_guest_accesses={page_num}" - login_count_text = person_row.find_element(By.XPATH, SELECTORS["person_logins"]).text - if int(login_count_text) == 0: - vorname = person_row.find_element(By.XPATH, SELECTORS["person_vorname"]).text - print(f" --> ERFOLG: '{vorname}' mit 0 Logins gefunden!") - - access_code_page_url = person_row.find_element(By.XPATH, SELECTORS["person_access_code_link"]).get_attribute('href') - driver.get(access_code_page_url) - print(f" Navigiere zur Kommunikations-Seite für '{vorname}'...") - - for attempt in range(3): - try: - wait.until(EC.visibility_of_element_located((By.XPATH, SELECTORS["quick_login_url"]))) - schnell_login_url = driver.find_element(By.XPATH, SELECTORS["quick_login_url"]).get_attribute('href') - potential_buyer_element = driver.find_element(By.XPATH, SELECTORS["potential_buyer_link"]) - kaeufer_name = potential_buyer_element.text - - print(f" Käufer: '{kaeufer_name}', Schnell-Login: GEFUNDEN") - potential_buyer_element.click() - - print(f" Navigiere zur Käufer-Detailseite...") - email = wait.until(EC.visibility_of_element_located((By.XPATH, SELECTORS["buyer_email"]))).text - print(f" FINALE ERFOLG: E-Mail gefunden: {email}") - - final_results.append({ - "Name des Kindes": vorname, - "Name Käufer": kaeufer_name, - "E-Mail-Adresse Käufer": email, - "Schnell Login URL": schnell_login_url - }) - break - - except StaleElementReferenceException: - print(f" Timing-Fehler (StaleElement), Versuch {attempt + 1}/3. Warte kurz...") - time.sleep(1) - if attempt == 2: - print(" Fehler war persistent, überspringe diese Person.") - take_error_screenshot(driver, f"stale_error_{vorname}") - - except TimeoutException: - print(f" Timeout beim Warten auf Details für '{vorname}'. Überspringe.") - take_error_screenshot(driver, f"timeout_error_{vorname}") - break + print(f" Verarbeite Seite {page_num}...") + driver.get(current_page_url) - print(f" Kehre zurück zur Album-Übersicht '{album['name']}'...") - driver.get(album['url']) - wait.until(EC.presence_of_element_located((By.XPATH, SELECTORS["person_rows"]))) + num_persons = len(wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["person_rows"])))) + print(f" {num_persons} Personen auf dieser Seite gefunden.") + + for i in range(num_persons): + person_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, SELECTORS["person_rows"]))) + person_row = person_rows[i] + + login_count_text = person_row.find_element(By.XPATH, SELECTORS["person_logins"]).text + if int(login_count_text) == 0: + vorname = person_row.find_element(By.XPATH, SELECTORS["person_vorname"]).text + print(f" --> ERFOLG: '{vorname}' mit 0 Logins gefunden!") + + access_code_page_url = person_row.find_element(By.XPATH, SELECTORS["person_access_code_link"]).get_attribute('href') + driver.get(access_code_page_url) + print(f" Navigiere zur Kommunikations-Seite für '{vorname}'...") + + for attempt in range(3): + try: + wait.until(EC.visibility_of_element_located((By.XPATH, SELECTORS["quick_login_url"]))) + schnell_login_url = driver.find_element(By.XPATH, SELECTORS["quick_login_url"]).get_attribute('href') + potential_buyer_element = driver.find_element(By.XPATH, SELECTORS["potential_buyer_link"]) + kaeufer_name = potential_buyer_element.text + + print(f" Käufer: '{kaeufer_name}', Schnell-Login: GEFUNDEN") + potential_buyer_element.click() + + print(f" Navigiere zur Käufer-Detailseite...") + email = wait.until(EC.visibility_of_element_located((By.XPATH, SELECTORS["buyer_email"]))).text + print(f" FINALE ERFOLG: E-Mail gefunden: {email}") + + final_results.append({ + "Name des Kindes": vorname, + "Name Käufer": kaeufer_name, + "E-Mail-Adresse Käufer": email, + "Schnell Login URL": schnell_login_url + }) + break + + except StaleElementReferenceException: + print(f" Timing-Fehler, Versuch {attempt + 1}/3...") + time.sleep(1) + if attempt == 2: raise + except TimeoutException: + print(f" Timeout beim Warten auf Details für '{vorname}'.") + take_error_screenshot(driver, f"timeout_error_{vorname}") + break + + print(f" Kehre zurück zur Album-Seite {page_num}...") + driver.get(current_page_url) + wait.until(EC.presence_of_element_located((By.XPATH, SELECTORS["person_rows"]))) except TimeoutException: print(f" Keine Personen-Daten im Album '{album['name']}' gefunden. Überspringe.") take_error_screenshot(driver, f"album_{album['name']}_error") @@ -231,7 +244,6 @@ def main(): credentials = get_profile_choice() if not credentials: return - # GEÄNDERT: URL-Eingabe wird explizit von "Bracketed Paste" Codes bereinigt job_url_raw = input("Bitte gib die URL des zu bearbeitenden Fotoauftrags ein (Einstellungs-Seite): ") job_url_cleaned = job_url_raw.replace("\x1b[200~", "").replace("\x1b[201~", "") job_url = job_url_cleaned.strip()