import os
import json
import time
import logging

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


class TempConfig:
    """Temporary, self-contained configuration (replaces an external config.py)."""

    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_SEARCH_NAME = "Facility Management"  # may be adjusted
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    CHROMEDRIVER_PATH = "/usr/bin/chromedriver"
    DEFAULT_TIMEOUT = 30
    IMPLICIT_WAIT = 10
    OUTPUT_DIR = "/app/output"


# Configure logging
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logging.getLogger("selenium").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Ensure OUTPUT_DIR exists.
# BUGFIX: the original contained a stray duplicate call
# "ios.makedirs(TempConfig.OUTPUT_DIR, exist_ok=True)" which raised
# NameError at import time ("ios" is undefined); removed.
os.makedirs(TempConfig.OUTPUT_DIR, exist_ok=True)


class DealfrontScraper:
    """Scrape company name/website pairs from a predefined Dealfront search.

    Drives a headless Chrome instance through login, opens the search named
    in ``TempConfig.TARGET_SEARCH_NAME`` and walks every results page.
    """

    def __init__(self):
        """Set up the headless Chrome WebDriver and load login credentials.

        Raises:
            Exception: re-raised if the WebDriver cannot be initialised or
                the credentials file cannot be read.
        """
        logger.info("Initialisiere den DealfrontScraper...")

        # Chrome options: headless + container-friendly flags; images are
        # disabled (pref value 2) to speed up page loads.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)

        # WebDriver
        service = Service(executable_path=TempConfig.CHROMEDRIVER_PATH)
        try:
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, TempConfig.DEFAULT_TIMEOUT)
        self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT)

        # Load credentials
        self.username, self.password = self._load_credentials()
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Read username and password from the JSON credentials file.

        Returns:
            tuple: ``(username, password)`` as stored in the file.

        Raises:
            Exception: re-raised after logging if the file is missing,
                not valid JSON, or lacks the expected keys.
        """
        try:
            with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
            return creds['username'], creds['password']
        except Exception as e:
            logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}")
            raise

    def login_and_find_list(self):
        """Log in to Dealfront and open the predefined target search."""
        # Login
        logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}")
        self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
        self.wait.until(
            EC.visibility_of_element_located((By.NAME, 'email'))
        ).send_keys(self.username)
        self.driver.find_element(By.NAME, 'password').send_keys(self.password)
        self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
        logger.info("Login gesendet.")

        # Click the 'Prospects finden' product tile
        tile = self.wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "a[data-test-target-product-tile='Prospects finden']")
            )
        )
        tile.click()
        logger.info("'Prospects finden' geklickt.")

        # Select the predefined search by its visible label
        sel = (
            By.XPATH,
            f"//div[contains(@class,'truncate') and normalize-space()='{TempConfig.TARGET_SEARCH_NAME}']",
        )
        item = self.wait.until(EC.element_to_be_clickable(sel))
        item.click()
        logger.info(f"Suche '{TempConfig.TARGET_SEARCH_NAME}' geladen.")

    def extract_current_page_results(self):
        """Extract ``{'name', 'website'}`` dicts for every row on the current page.

        Returns:
            list[dict]: one entry per result row; rows without a company
                name are skipped with a warning.
        """
        # Wait for the first data element to become visible
        first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
        self.wait.until(EC.visibility_of_element_located(first))
        logger.info("Extrahiere aktuelle Seite...")
        results = []

        # Wait for the result rows
        rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
        self.wait.until(EC.presence_of_all_elements_located(rows_sel))
        rows = self.driver.find_elements(*rows_sel)
        logger.info(f"{len(rows)} Zeilen gefunden.")

        for i, row in enumerate(rows, 1):
            # Company name (prefer the 'title' attribute over the visible text)
            name_el = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
            if not name_el:
                logger.warning(f"Zeile {i}: Kein Name gefunden. Überspringe.")
                continue
            name = (name_el[0].get_attribute('title') or name_el[0].text).strip()

            # Website: prefer the link text, fall back to the raw cell text
            web_el = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
            if web_el:
                web = web_el[0].text.strip()
            else:
                cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
                web = cell[0].text.strip() if cell else ''
            results.append({'name': name, 'website': web})

        logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
        return results

    def click_next_page(self) -> bool:
        """Advance to the next results page, if one exists.

        Returns:
            bool: True if the next page was opened, False when the last
                pagination button is missing or disabled.
        """
        btns = self.driver.find_elements(
            By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button"
        )
        if not btns:
            return False
        nxt = btns[-1]  # the last pagination button is assumed to be 'next'
        if not nxt.is_enabled() or 'disabled' in nxt.get_attribute('class'):
            return False
        # Remember the current active-page label so we can wait until it changes.
        curr = self.driver.find_element(
            By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active"
        ).text
        nxt.click()
        self.wait.until(
            lambda d: d.find_element(
                By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active"
            ).text != curr
        )
        return True

    def run(self):
        """Log in, walk every results page, and return all extracted rows.

        The WebDriver is always quit via ``finally``, even on error.

        Returns:
            list[dict]: all rows collected across every page.
        """
        try:
            self.login_and_find_list()
            all_data = []
            while True:
                all_data.extend(self.extract_current_page_results())
                if not self.click_next_page():
                    break
            return all_data
        finally:
            self.driver.quit()


if __name__ == '__main__':
    scraper = DealfrontScraper()
    data = scraper.run()
    for entry in data:
        print(entry)