import os
import json
import time
import logging

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# ==============================================================================
# TEMPORARY, SELF-CONTAINED CONFIGURATION
# ==============================================================================
class TempConfig:
    # --- Values defined directly here to bypass config.py ---
    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_SEARCH_NAME = "Facility Management"  # <-- adjust to your saved search
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
# ==============================================================================

OUTPUT_DIR = "/app/output"
# Shared format for console and file logging (the file handler below reuses it).
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
logging.getLogger("selenium").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Mirror all log output (down to DEBUG) into a timestamped file in OUTPUT_DIR.
log_filename = f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.txt"
log_filepath = os.path.join(OUTPUT_DIR, log_filename)
try:
    # Ensure the output directory exists before FileHandler tries to open a file in it.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
    logging.getLogger().addHandler(file_handler)
    logger.info(f"Logging konfiguriert. Log-Datei: {log_filepath}")
except Exception as e:
    # Best-effort: the run continues with console logging only.
    logger.error(f"Konnte Log-Datei nicht erstellen: {e}")


class DealfrontScraper:
    """Automates login to Dealfront Target and extraction of company results."""

    def __init__(self):
        """Start a headless Chrome WebDriver and load the login credentials.

        Raises:
            Exception: re-raised if the WebDriver cannot be initialised.
        """
        logger.info("Initialisiere den DealfrontScraper...")
        chrome_options = ChromeOptions()
        # Disable image loading to speed up headless page rendering.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        service = Service(executable_path='/usr/bin/chromedriver')
        try:
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, 30)
        self.username, self.password = self._load_credentials()
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Read (username, password) from the JSON credentials file.

        Returns:
            Tuple of (username, password); (None, None) if the file is missing
            or unreadable, so callers must handle a failed login themselves.
        """
        try:
            with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r') as f:
                creds = json.load(f)
            return creds.get("username"), creds.get("password")
        except Exception as e:
            logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}")
            return None, None

    def _save_debug_artifacts(self):
        """Dump a screenshot and the current page source to OUTPUT_DIR for analysis."""
        try:
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            screenshot_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.png")
            html_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.html")
            self.driver.save_screenshot(screenshot_filepath)
            logger.error(f"Screenshot '{screenshot_filepath}' wurde für die Analyse gespeichert.")
            with open(html_filepath, "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.error(f"HTML-Quellcode '{html_filepath}' wurde für die Analyse gespeichert.")
        except Exception as e:
            logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")
zur Target-Übersichtsseite navigieren - self.driver.get("https://app.dealfront.com/target") - - # 4) Sidebar mit Such-List laden - sidebar_sel = "ul[data-userpilot-id='sidebar-searches-list']" - self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, sidebar_sel))) - - # 5) Deine Suche anklicken (div[title=…]) - div_sel = f"div[title='{SEARCH_NAME}']" - el = self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, div_sel))) - self.driver.execute_script("arguments[0].click()", el) - - # 6) Erstes Daten-Element abwarten - first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") - self.wait.until(EC.visibility_of_element_located(first)) - time.sleep(1) + def _save_debug_artifacts(self): + # ... (Diese Methode bleibt unverändert) ... + try: + os.makedirs(OUTPUT_DIR, exist_ok=True) + timestamp = time.strftime("%Y%m%d-%H%M%S") + screenshot_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.png") + html_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.html") + self.driver.save_screenshot(screenshot_filepath) + logger.error(f"Screenshot '{screenshot_filepath}' wurde für die Analyse gespeichert.") + with open(html_filepath, "w", encoding="utf-8") as f: + f.write(self.driver.page_source) + logger.error(f"HTML-Quellcode '{html_filepath}' wurde für die Analyse gespeichert.") + except Exception as e: + logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}") + def login_and_find_list(self, search_name): + # ... (Diese Methode bleibt unverändert, verwendet aber jetzt TempConfig) ... 
+ try: + logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}") + self.driver.get(TempConfig.DEALFRONT_LOGIN_URL) + self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username) + self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password) + self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click() + logger.info("Login-Befehl gesendet.") + logger.info("Warte auf Dashboard und den 'Prospects finden' Quick-Link...") + prospects_link_selector = (By.XPATH, "//a[@data-test-target-product-tile]") + prospects_link = self.wait.until(EC.element_to_be_clickable(prospects_link_selector)) + prospects_link.click() + logger.info("'Prospects finden' geklickt.") + logger.info(f"Warte auf die Liste der Suchen und klicke auf '{search_name}'...") + search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']") + search_item = self.wait.until(EC.element_to_be_clickable(search_item_selector)) + search_item.click() + logger.info(f"Suche '{search_name}' geladen. 
Warte auf das Rendern der Ergebnistabelle.") + first_row_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") + self.wait.until( + EC.visibility_of_element_located(first_row_locator) + ) + time.sleep(5) + logger.info("Zielseite mit Ergebnissen erfolgreich erreicht.") + return True + except Exception as e: + logger.critical(f"Der Prozess ist fehlgeschlagen: {type(e).__name__}", exc_info=True) + self._save_debug_artifacts() + return False def extract_current_page_results(self): # 1) Kurz Implicit-Wait absenken self.driver.implicitly_wait(1) - # 2) Auf erstes Daten-Element warten und Puffer - first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") - self.wait.until(EC.visibility_of_element_located(first)) + # 2) Auf das erste Daten-Element warten + first_row_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") + self.wait.until(EC.visibility_of_element_located(first_row_locator)) time.sleep(1) try: logger.info("Extrahiere Ergebnisse von der aktuellen Seite...") results = [] - # 3) Auf mindestens eine Tabellenzeile warten - rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]") - self.wait.until(EC.presence_of_all_elements_located(rows_sel)) - rows = self.driver.find_elements(*rows_sel) + # 3) Warten auf mindestens eine Tabellen-Zeile + rows_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]") + self.wait.until(EC.presence_of_all_elements_located(rows_selector)) + rows = self.driver.find_elements(*rows_selector) logger.info(f"{len(rows)} Firmen-Zeilen gefunden.") - # 4) Schleife ohne weitere Sleeps for i, row in enumerate(rows, 1): + # Name-Extraktion (bewährter Selector) name_elems = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") if not name_elems: logger.warning(f"Zeile {i}: Kein Name-Element gefunden. 
Überspringe.") continue - ne = name_elems[0] - company_name = (ne.get_attribute("title") or ne.text).strip() + name_elem = name_elems[0] + company_name = (name_elem.get_attribute("title") or name_elem.text).strip() - web_elems = row.find_elements(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text") - website = web_elems[0].text.strip() if web_elems else "" + # Website-Extraktion aus 3. Spalte + web_elems = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a") + if web_elems: + # Link-Text ist der Domain-Name + website = web_elems[0].text.strip() + else: + # Fallback: reiner Zellen-Text + cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)") + website = cell[0].text.strip() if cell else "" results.append({'name': company_name, 'website': website}) logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.") return results - except Exception as e: - logger.error(f"Schwerwiegender Fehler bei der Extraktion: {type(e).__name__}", exc_info=True) - self._save_debug_artifacts() - return [] - finally: - # 5) Implicit-Wait wieder auf Standard setzen (z.B. 10 s) + # 4) Implicit-Wait auf Standard zurücksetzen self.driver.implicitly_wait(10) - def click_next_page(self): - # Paginator-Buttons greifen - btns = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button") - if not btns: - return False - nxt = btns[-1] - # Ende erreicht? 
- if not nxt.is_enabled() or "disabled" in nxt.get_attribute("class"): - return False + + def close(self): + if self.driver: + logger.info("Schließe den WebDriver.") + self.driver.quit() - current = self.driver.find_element( - By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active" - ).text - nxt.click() - # auf Seitenwechsel warten - self.wait.until(lambda d: d.find_element( - By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active" - ).text != current) - return True - - def run(self): - logger.info("Starte Login und Sucheauswahl…") - self.login_and_select_search() - - all_res = [] - page = 1 - while True: - logger.info(f"Seite {page}: Extrahiere Daten…") - all_res.extend(self.extract_current_page_results()) - if not self.click_next_page(): - break - page += 1 - return all_res - -def main(): - user, pwd = load_creds(CREDS_FILE) - - opts = Options() - opts.add_argument("--headless") - opts.add_argument("--no-sandbox") - opts.add_argument("--disable-dev-shm-usage") - service = Service(CHROMEDRIVER_PATH) - driver = webdriver.Chrome(service=service, options=opts) - wait = WebDriverWait(driver, 30) - - try: - scraper = DealfrontScraper(driver, wait, user, pwd) - results = scraper.run() - finally: - driver.quit() - - os.makedirs(OUTPUT_DIR, exist_ok=True) - path = os.path.join(OUTPUT_DIR, "results.json") - with open(path, "w", encoding="utf-8") as f: - json.dump(results, f, ensure_ascii=False, indent=2) - - logger.info(f"✅ Fertig: {len(results)} Einträge in {path}") if __name__ == "__main__": - main() + logger.info("Starte Dealfront Automatisierung - DEBUG-MODUS") + scraper = None + try: + scraper = DealfrontScraper() + if not scraper.driver: + raise Exception("WebDriver konnte nicht initialisiert werden.") + + if not scraper.login_and_find_list(TempConfig.TARGET_SEARCH_NAME): + raise Exception("Der Prozess vom Login bis zum Laden der Liste ist fehlgeschlagen.") + + # In dieser Version gibt es keine handle_overlays Methode mehr + # 
scraper.handle_overlays() + + companies = scraper.extract_current_page_results() + if companies: + df = pd.DataFrame(companies) + pd.set_option('display.max_rows', None) + pd.set_option('display.max_columns', None) + pd.set_option('display.width', 1000) + pd.set_option('display.max_colwidth', None) + print("\n" + "="*80) + print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "=")) + print("="*80) + print(df.to_string(index=False)) + print("="*80 + "\n") + else: + logger.warning("Obwohl die Seite geladen wurde, konnten keine Firmen extrahiert werden.") + + logger.info("Test erfolgreich abgeschlossen. Warte vor dem Schließen...") + time.sleep(10) + + except Exception as e: + logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=False) + finally: + if scraper: + scraper.close() + + logger.info("Dealfront Automatisierung beendet.") \ No newline at end of file