# Files
# Brancheneinstufung2/dealfront_enrichment.py
#
# 138 lines
# 6.1 KiB
# Python
import os
import json
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# --- Configuration ---
class Config:
    """Static settings for the Dealfront scraping run."""

    LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
    # Name of the saved Prospector search to open — adjust to your own saved search.
    SEARCH_NAME = "Facility Management"
    CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    OUTPUT_DIR = "/app/output"
# --- Logging setup ---
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
# Selenium's remote-connection logger is very chatty at INFO; quiet it down.
logging.getLogger("selenium.webdriver.remote").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Mirror all log output into a timestamped file inside the output directory.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
_run_stamp = time.strftime('%Y%m%d-%H%M%S')
log_filepath = os.path.join(Config.OUTPUT_DIR, f"dealfront_run_{_run_stamp}.log")
file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger().addHandler(file_handler)
class DealfrontScraper:
    """Logs into Dealfront, opens a saved Prospector search and extracts
    the company names/websites shown on the first result page."""

    def __init__(self):
        """Start a headless Chrome driver and load the credentials.

        Raises:
            ValueError: if the credentials file cannot be read or is incomplete.
            Exception: whatever ``webdriver.Chrome`` raises if startup fails
                (logged with traceback, then re-raised).
        """
        logger.info("Initialisiere WebDriver...")
        chrome_options = ChromeOptions()
        # Disable image loading — pages render faster and we only need text/DOM.
        chrome_options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2}
        )
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1200")
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, 30)
        self.username, self.password = self._load_credentials()
        if not self.username or not self.password:
            raise ValueError("Credentials konnten nicht geladen werden. Breche ab.")
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Read username/password from ``Config.CREDENTIALS_FILE``.

        Returns:
            tuple: ``(username, password)``; ``(None, None)`` on any error
            (missing file, unreadable file, invalid JSON, missing keys).
        """
        try:
            with open(Config.CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
            return creds.get("username"), creds.get("password")
        except FileNotFoundError:
            logger.error(f"Credentials-Datei {Config.CREDENTIALS_FILE} nicht gefunden.")
        except (OSError, json.JSONDecodeError) as e:
            # Previously every failure was reported as "file not found",
            # hiding permission and JSON syntax errors — log the real cause.
            logger.error(f"Credentials-Datei {Config.CREDENTIALS_FILE} konnte nicht gelesen werden: {e}")
        return None, None

    def _save_debug_artifacts(self, suffix=""):
        """Save a screenshot and the page source for post-mortem debugging.

        Best-effort: failures here are logged but never raised, so they
        cannot mask the original error.
        """
        try:
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            filename_base = os.path.join(Config.OUTPUT_DIR, f"error_{suffix}_{timestamp}")
            self.driver.save_screenshot(f"{filename_base}.png")
            with open(f"{filename_base}.html", "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.error(f"Debug-Artefakte gespeichert: {filename_base}.*")
        except Exception as e:
            logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")

    def run(self):
        """Execute the full workflow: login, open saved search, scrape page one.

        Returns:
            list[dict]: one ``{'name': ..., 'website': ...}`` entry per company row.

        Raises:
            TimeoutException / NoSuchElementException: on failed waits or
            missing elements; debug artifacts are saved before re-raising.
        """
        try:
            # 1. LOGIN
            logger.info(f"Navigiere zu: {Config.LOGIN_URL}")
            self.driver.get(Config.LOGIN_URL)
            self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
            self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
            self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
            logger.info("Login-Befehl gesendet. Warte 5s auf Session-Etablierung.")
            time.sleep(5)

            # 2. NAVIGATION & LOAD SAVED SEARCH
            logger.info(f"Navigiere direkt zur Target-Seite und lade Suche: '{Config.SEARCH_NAME}'")
            self.driver.get(Config.TARGET_URL)
            self.wait.until(
                EC.element_to_be_clickable((By.XPATH, f"//*[normalize-space()='{Config.SEARCH_NAME}']"))
            ).click()

            # 3. EXTRACT RESULTS
            logger.info("Suche geladen. Extrahiere Ergebnisse der ersten Seite.")
            self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table")))
            time.sleep(3)  # final short pause for client-side rendering
            company_elements = self.driver.find_elements(By.CSS_SELECTOR, "td.sticky-column a.t-highlight-text")
            website_elements = self.driver.find_elements(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text")
            logger.info(f"{len(company_elements)} Firmen und {len(website_elements)} Webseiten gefunden.")

            results = []
            for i, company_el in enumerate(company_elements):
                # "title" may be missing (None) — fall back to the visible
                # link text instead of crashing on None.strip().
                name = (company_el.get_attribute("title") or company_el.text or "").strip()
                # Website column can have fewer entries than the name column.
                website = website_elements[i].text.strip() if i < len(website_elements) else "N/A"
                results.append({'name': name, 'website': website})
            return results
        except (TimeoutException, NoSuchElementException):
            # Capture the page state so "Bitte Debug-Artefakte prüfen" in
            # __main__ actually has artifacts to point at.
            self._save_debug_artifacts("run")
            raise

    def close(self):
        """Quit the driver if it was ever created (safe after a failed __init__)."""
        if getattr(self, "driver", None):
            self.driver.quit()
if __name__ == "__main__":
    scraper = None
    try:
        scraper = DealfrontScraper()
        companies = scraper.run()
        if not companies:
            logger.warning("Keine Firmen konnten extrahiert werden. Bitte Debug-Artefakte prüfen.")
        else:
            df = pd.DataFrame(companies)
            # Disable all pandas truncation so the full table is printed.
            pd.set_option('display.max_rows', None)
            pd.set_option('display.max_columns', None)
            pd.set_option('display.width', 120)
            pd.set_option('display.max_colwidth', None)
            banner = "=" * 80
            print("\n" + banner)
            print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "="))
            print(banner)
            print(df.to_string(index=False))
            print(banner + "\n")
            logger.info("Workflow erfolgreich abgeschlossen.")
    except Exception as e:
        logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=False)
    finally:
        if scraper:
            scraper.close()
        logger.info("Dealfront Automatisierung beendet.")