"""Headless-Chrome scraper that logs into Dealfront and collects company names and websites."""

import json
import logging
import os
import time

from selenium import webdriver
# ChromeOptions is exported by ``selenium.webdriver`` (alias of chrome.options.Options).
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import TempConfig  # project configuration class

# Configure logging
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)


class DealfrontScraper:
    """Drive a headless Chrome session through Dealfront login, search selection and paging.

    All paths, URLs and timeouts are read from ``TempConfig``.
    """

    def __init__(self):
        """Start the Chrome driver and load the login credentials.

        Raises:
            Exception: whatever ``webdriver.Chrome`` or ``_load_credentials``
                raises; in the latter case the already-started driver is quit
                first so no browser process is left behind.
        """
        logger.info("Initialisiere den DealfrontScraper...")

        chrome_options = ChromeOptions()
        # Content setting 2 = block: skip image loading to speed up page loads.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        for argument in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--window-size=1920,1080",
        ):
            chrome_options.add_argument(argument)

        service = Service(executable_path=TempConfig.CHROMEDRIVER_PATH)
        try:
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise

        try:
            self.wait = WebDriverWait(self.driver, TempConfig.DEFAULT_TIMEOUT)
            self.username, self.password = self._load_credentials()
        except Exception:
            # The browser is already running; shut it down before propagating.
            self.driver.quit()
            raise
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Read ``username`` and ``password`` from the JSON credentials file.

        Returns:
            A ``(username, password)`` tuple; an entry is ``None`` when its key
            is missing from the file.

        Raises:
            Exception: any error from opening or parsing the file, after
                logging it.
        """
        try:
            with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
            return creds.get('username'), creds.get('password')
        except Exception as e:
            logger.error("Credentials-Datei konnte nicht geladen werden: %s", e)
            raise

    def _save_debug_artifacts(self):
        """Write a screenshot and the page source to ``TempConfig.OUTPUT_DIR``.

        Best effort: any failure is logged and swallowed so that it never masks
        the error that triggered the dump.
        """
        try:
            os.makedirs(TempConfig.OUTPUT_DIR, exist_ok=True)
            ts = time.strftime("%Y%m%d-%H%M%S")
            png = os.path.join(TempConfig.OUTPUT_DIR, f"error_{ts}.png")
            html = os.path.join(TempConfig.OUTPUT_DIR, f"error_{ts}.html")

            self.driver.save_screenshot(png)
            logger.error("Screenshot '%s' gespeichert.", png)

            with open(html, 'w', encoding='utf-8') as f:
                f.write(self.driver.page_source)
            logger.error("HTML-Source '%s' gespeichert.", html)
        except Exception as e:
            logger.error("Debug-Artefakte konnten nicht gespeichert werden: %s", e)

    @staticmethod
    def _xpath_literal(text):
        """Return *text* as an XPath 1.0 string literal, whatever quotes it contains."""
        if "'" not in text:
            return f"'{text}'"
        if '"' not in text:
            return f'"{text}"'
        # Both quote kinds present: XPath 1.0 has no escaping, so stitch the
        # pieces together with concat() and a double-quoted apostrophe.
        pieces = [f"'{piece}'" for piece in text.split("'")]
        return "concat(" + ", \"'\", ".join(pieces) + ")"

    def login_and_find_list(self, search_name):
        """Log in, open the product tile and select the saved search *search_name*.

        Args:
            search_name: Exact (whitespace-normalised) label of the predefined
                search to open.
        """
        # Login
        logger.info("Navigiere zur Login-Seite: %s", TempConfig.DEALFRONT_LOGIN_URL)
        self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
        email_field = self.wait.until(
            EC.visibility_of_element_located((By.NAME, 'email'))
        )
        email_field.send_keys(self.username)
        self.driver.find_element(
            By.CSS_SELECTOR, "input[type='password']"
        ).send_keys(self.password)
        self.driver.find_element(
            By.XPATH, "//button[normalize-space()='Log in']"
        ).click()
        logger.info("Login gesendet.")

        # Product tile ('Prospects finden')
        tile = self.wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "a[data-test-target-product-tile]")
            )
        )
        tile.click()
        logger.info("'Prospects finden' geklickt.")

        # Select the predefined search; quote the name safely for XPath.
        quoted_name = self._xpath_literal(search_name)
        sel = (
            By.XPATH,
            f"//div[contains(@class,'truncate') and normalize-space()={quoted_name}]",
        )
        item = self.wait.until(EC.element_to_be_clickable(sel))
        item.click()
        logger.info("Suche '%s' geladen.", search_name)

    def extract_current_page_results(self):
        """Collect name and website for every row of the currently shown result page.

        Returns:
            A list of ``{'name': ..., 'website': ...}`` dicts; rows without a
            name link are skipped with a warning.
        """
        # Short implicit wait so per-row lookups of absent elements fail fast.
        self.driver.implicitly_wait(1)
        try:
            # Wait for the first data element to be visible.
            first_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
            self.wait.until(EC.visibility_of_element_located(first_locator))

            logger.info("Extrahiere aktuelle Seite...")
            results = []

            # Wait for at least one row.
            rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
            self.wait.until(EC.presence_of_all_elements_located(rows_sel))
            rows = self.driver.find_elements(*rows_sel)
            logger.info("%s Zeilen gefunden.", len(rows))

            # Extract names and websites.
            for i, row in enumerate(rows, 1):
                names = row.find_elements(
                    By.CSS_SELECTOR, ".sticky-column a.t-highlight-text"
                )
                if not names:
                    logger.warning("Zeile %s: Kein Name gefunden.", i)
                    continue
                name_elem = names[0]
                name = (name_elem.get_attribute('title') or name_elem.text).strip()

                # Prefer the link text in the third cell, else the cell's own text.
                webs = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
                if webs:
                    web = webs[0].text.strip()
                else:
                    cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
                    web = cell[0].text.strip() if cell else ''

                results.append({'name': name, 'website': web})

            logger.info("Extraktion abgeschlossen: %s Firmen.", len(results))
            return results
        finally:
            # Restore the configured implicit wait even when a wait above timed out.
            self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT)

    def click_next_page(self) -> bool:
        """Advance to the next result page.

        Returns:
            ``True`` once the active pagination button has changed, ``False``
            when there is no pagination or the last button is disabled.
        """
        btns = self.driver.find_elements(
            By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button"
        )
        if not btns:
            return False

        nxt = btns[-1]
        # get_attribute returns None when the element has no class attribute.
        css_classes = nxt.get_attribute('class') or ''
        if not nxt.is_enabled() or 'disabled' in css_classes:
            return False

        active_sel = (
            By.CSS_SELECTOR,
            "nav.eb-pagination a.eb-pagination-button.active",
        )
        curr = self.driver.find_element(*active_sel).text
        nxt.click()
        # The page has switched once the active button shows a different label.
        self.wait.until(lambda d: d.find_element(*active_sel).text != curr)
        return True

    def run(self, search_name):
        """Log in, open *search_name* and scrape every result page.

        On any failure a screenshot and HTML dump are saved before the
        exception is re-raised. The driver is always quit afterwards.

        Returns:
            The concatenated per-page results as a list of dicts.
        """
        try:
            self.login_and_find_list(search_name)
            all_res = []
            while True:
                all_res.extend(self.extract_current_page_results())
                if not self.click_next_page():
                    break
            return all_res
        except Exception:
            # Capture the page state while the browser is still alive.
            self._save_debug_artifacts()
            raise
        finally:
            self.driver.quit()


if __name__ == '__main__':
    scraper = DealfrontScraper()
    data = scraper.run('Facility Management')
    for d in data:
        print(d)