From 8c69b2d7e132ec7e5b58ebe0bcc8529b5368a387 Mon Sep 17 00:00:00 2001 From: Floke Date: Tue, 8 Jul 2025 19:07:01 +0000 Subject: [PATCH] dealfront_enrichment.py aktualisiert --- dealfront_enrichment.py | 82 +++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/dealfront_enrichment.py b/dealfront_enrichment.py index 764792fe..ade4b776 100644 --- a/dealfront_enrichment.py +++ b/dealfront_enrichment.py @@ -1,45 +1,56 @@ import os -import time import json +import time import logging - from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.chrome.options import Options from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import NoSuchElementException -from config import TempConfig # Deine Konfigurationsklasse mit Pfaden und URLs +# Temporäre, autarke Konfiguration (ersetzt externes config.py) +class TempConfig: + DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login" + TARGET_SEARCH_NAME = "Facility Management" # Kann angepasst werden + DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json" + CHROMEDRIVER_PATH = "/usr/bin/chromedriver" + DEFAULT_TIMEOUT = 30 + IMPLICIT_WAIT = 10 + OUTPUT_DIR = "/app/output" # Logging konfigurieren -template = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' -logging.basicConfig(level=logging.INFO, format=template) +LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' +logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) +logging.getLogger("selenium").setLevel(logging.WARNING) logger = logging.getLogger(__name__) +# Sicherstellen, dass OUTPUT_DIR existiert +os.makedirs(TempConfig.OUTPUT_DIR, exist_ok=True) class DealfrontScraper: def 
__init__(self): logger.info("Initialisiere den DealfrontScraper...") # Chrome-Optionen chrome_options = Options() - prefs = {"profile.managed_default_content_settings.images": 2} - chrome_options.add_experimental_option("prefs", prefs) chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--window-size=1920,1080") + prefs = {"profile.managed_default_content_settings.images": 2} + chrome_options.add_experimental_option("prefs", prefs) - # WebDriver-Service + # WebDriver service = Service(executable_path=TempConfig.CHROMEDRIVER_PATH) try: self.driver = webdriver.Chrome(service=service, options=chrome_options) except Exception: logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True) raise - - # Explicit Wait self.wait = WebDriverWait(self.driver, TempConfig.DEFAULT_TIMEOUT) + self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT) + # Credentials laden self.username, self.password = self._load_credentials() logger.info("WebDriver erfolgreich initialisiert.") @@ -53,8 +64,8 @@ class DealfrontScraper: logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}") raise - def login_and_find_list(self, search_name): - # Login-Flow + def login_and_find_list(self): + # Login logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}") self.driver.get(TempConfig.DEALFRONT_LOGIN_URL) self.wait.until(EC.visibility_of_element_located((By.NAME, 'email'))).send_keys(self.username) @@ -62,7 +73,7 @@ class DealfrontScraper: self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click() logger.info("Login gesendet.") - # Klicken auf 'Prospects finden' + # 'Prospects finden' anklicken tile = self.wait.until( EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-test-target-product-tile='Prospects finden']")) ) @@ -70,37 +81,32 @@ class DealfrontScraper: logger.info("'Prospects finden' geklickt.") # 
Vordefinierte Suche auswählen - selector = (By.XPATH, f"//div[contains(@class,'truncate') and normalize-space()='{search_name}']") - item = self.wait.until(EC.element_to_be_clickable(selector)) + sel = (By.XPATH, f"//div[contains(@class,'truncate') and normalize-space()='{TempConfig.TARGET_SEARCH_NAME}']") + item = self.wait.until(EC.element_to_be_clickable(sel)) item.click() - logger.info(f"Suche '{search_name}' geladen.") + logger.info(f"Suche '{TempConfig.TARGET_SEARCH_NAME}' geladen.") def extract_current_page_results(self): - # Kurz Implicit-Wait - self.driver.implicitly_wait(1) - - # Warten auf erstes Daten-Element + # Warte auf erstes Daten-Element first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") self.wait.until(EC.visibility_of_element_located(first)) logger.info("Extrahiere aktuelle Seite...") results = [] - # Warten auf mindestens eine Tabellen-Zeile + # Warten auf Zeilen rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]") self.wait.until(EC.presence_of_all_elements_located(rows_sel)) rows = self.driver.find_elements(*rows_sel) logger.info(f"{len(rows)} Zeilen gefunden.") - # Extraktion Namen & Website for i, row in enumerate(rows, 1): # Name name_el = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") if not name_el: logger.warning(f"Zeile {i}: Kein Name gefunden. 
Überspringe.") continue - elem = name_el[0] - name = (elem.get_attribute('title') or elem.text).strip() + name = (name_el[0].get_attribute('title') or name_el[0].text).strip() # Website web_el = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a") @@ -113,40 +119,36 @@ class DealfrontScraper: results.append({'name': name, 'website': web}) logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.") - # Implicit-Wait zurücksetzen - self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT) return results def click_next_page(self) -> bool: - buttons = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button") - if not buttons: + btns = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button") + if not btns: return False - nxt = buttons[-1] + nxt = btns[-1] if not nxt.is_enabled() or 'disabled' in nxt.get_attribute('class'): return False - current = self.driver.find_element(By.CSS_SELECTOR, - "nav.eb-pagination a.eb-pagination-button.active").text + curr = self.driver.find_element(By.CSS_SELECTOR, + "nav.eb-pagination a.eb-pagination-button.active").text nxt.click() self.wait.until(lambda d: d.find_element(By.CSS_SELECTOR, - "nav.eb-pagination a.eb-pagination-button.active").text != current) + "nav.eb-pagination a.eb-pagination-button.active").text != curr) return True - def run(self, search_name): + def run(self): try: - self.login_and_find_list(search_name) + self.login_and_find_list() all_data = [] while True: - page = self.extract_current_page_results() - all_data.extend(page) + all_data.extend(self.extract_current_page_results()) if not self.click_next_page(): break return all_data finally: self.driver.quit() - if __name__ == '__main__': scraper = DealfrontScraper() - data = scraper.run('Facility Management') + data = scraper.run() for entry in data: print(entry)