Files
Brancheneinstufung2/dealfront_enrichment.py

149 lines
6.6 KiB
Python

import os
import json
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# --- Konfiguration ---
class Config:
    """Static configuration for the Dealfront scraping run."""

    # Entry points in the Dealfront web app.
    LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
    # Name of the saved Prospector search to load -- adjust to your own saved search.
    SEARCH_NAME = "Facility Management"
    # Container paths for credentials and run artifacts.
    CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    OUTPUT_DIR = "/app/output"
# --- Logging Setup ---
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
logging.getLogger("selenium.webdriver.remote").setLevel(logging.WARNING) # Reduziert Selenium-Spam
logger = logging.getLogger(__name__)
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
log_filepath = os.path.join(Config.OUTPUT_DIR, f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.log")
file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger().addHandler(file_handler)
class DealfrontScraper:
    """Automates Dealfront: log in, open a saved Prospector search and
    scrape company name/website pairs from the first page of results."""

    def __init__(self):
        """Start a headless Chrome WebDriver and load login credentials.

        Raises:
            ValueError: if username or password cannot be read from the
                credentials file (fail fast instead of failing at login).
        """
        logger.info("Initialisiere WebDriver...")
        chrome_options = ChromeOptions()
        # Pref value 2 == "block": skip image downloads to speed up page loads.
        chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, 30)
        self.username, self.password = self._load_credentials()
        # FAIL-FAST: abort immediately when credentials are missing.
        if not self.username or not self.password:
            raise ValueError("Benutzername oder Passwort konnten nicht aus der Credentials-Datei geladen werden. Breche ab.")
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Return ``(username, password)`` from ``Config.CREDENTIALS_FILE``.

        Returns ``(None, None)`` on any read/parse error; the caller decides
        whether that is fatal.
        """
        try:
            with open(Config.CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
            return creds.get("username"), creds.get("password")
        except Exception as e:
            logger.error(f"Credentials-Datei {Config.CREDENTIALS_FILE} konnte nicht geladen werden: {e}")
            return None, None

    def _save_debug_artifacts(self):
        """Best-effort dump of a screenshot and the page HTML for debugging."""
        try:
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            screenshot_path = os.path.join(Config.OUTPUT_DIR, f"error_{timestamp}.png")
            html_path = os.path.join(Config.OUTPUT_DIR, f"error_{timestamp}.html")
            self.driver.save_screenshot(screenshot_path)
            logger.error(f"Debug-Screenshot gespeichert: {screenshot_path}")
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.error(f"Debug-HTML-Quellcode gespeichert: {html_path}")
        except Exception as e:
            # Debug dumps must never mask the original failure.
            logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")

    def _login(self):
        """Submit the login form and allow time for the redirect."""
        logger.info(f"Navigiere zur Login-Seite: {Config.LOGIN_URL}")
        self.driver.get(Config.LOGIN_URL)
        self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
        self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
        self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
        logger.info("Login-Befehl gesendet. Kurze Pause für die Weiterleitung.")
        # NOTE(review): fixed sleep for post-login redirect; an explicit
        # URL-change wait would be more robust -- kept to preserve behavior.
        time.sleep(5)

    def _load_saved_search(self):
        """Open the Prospector page and click the configured saved search."""
        logger.info(f"Navigiere direkt zur Target-Seite und lade die Suche: '{Config.SEARCH_NAME}'")
        self.driver.get(Config.TARGET_URL)
        self.wait.until(EC.url_contains("/t/prospector/"))
        search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{Config.SEARCH_NAME}']")
        self.wait.until(EC.element_to_be_clickable(search_item_selector)).click()

    def _extract_companies(self):
        """Scrape ``{'name', 'website'}`` dicts from the first result page."""
        logger.info("Suche geladen. Extrahiere Ergebnisse der ersten Seite.")
        results_table_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
        self.wait.until(EC.presence_of_element_located(results_table_selector))
        data_rows = self.driver.find_elements(By.XPATH, "//tr[.//a[contains(@class, 't-highlight-text')]]")
        logger.info(f"{len(data_rows)} gültige Datenzeilen gefunden.")
        companies = []
        for row in data_rows:
            try:
                name = row.find_element(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text").get_attribute("title").strip()
                website = row.find_element(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text").text.strip()
                companies.append({'name': name, 'website': website})
            except NoSuchElementException:
                # Rows without both cells (headers/spacers) are skipped.
                continue
        return companies

    def run(self):
        """Execute the workflow: login, load the saved search, extract rows.

        Returns:
            list[dict]: one ``{'name': ..., 'website': ...}`` dict per company.

        Raises:
            TimeoutException / NoSuchElementException: re-raised after debug
                artifacts have been written.
        """
        try:
            self._login()
            self._load_saved_search()
            return self._extract_companies()
        except (TimeoutException, NoSuchElementException):
            # FIX: _save_debug_artifacts() was defined but never called, so
            # failures left no screenshot/HTML behind. Dump artifacts before
            # propagating the error to the caller.
            self._save_debug_artifacts()
            raise

    def close(self):
        """Quit the WebDriver if one was started."""
        if self.driver:
            self.driver.quit()
            logger.info("WebDriver geschlossen.")
if __name__ == "__main__":
scraper = None
try:
scraper = DealfrontScraper()
company_list = scraper.run()
if company_list:
df = pd.DataFrame(company_list)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
pd.set_option('display.max_colwidth', None)
print("\n" + "="*80)
print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "="))
print("="*80)
print(df.to_string(index=False))
print("="*80 + "\n")
else:
logger.warning("Keine Firmen konnten extrahiert werden.")
logger.info("Workflow erfolgreich abgeschlossen.")
except Exception as e:
logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=False)
finally:
if scraper:
scraper.close()
logger.info("Dealfront Automatisierung beendet.")