# File: Brancheneinstufung2/dealfront_enrichment.py
# 157 lines, 6.3 KiB, Python
import os
import json
import time
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
# Self-contained, temporary configuration (stands in for the external config.py).
class TempConfig:
    """Static settings for the Dealfront scraping run."""

    # Target site and saved search.
    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_SEARCH_NAME = "Facility Management"  # adjust as needed

    # Local file-system resources.
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    CHROMEDRIVER_PATH = "/usr/bin/chromedriver"
    OUTPUT_DIR = "/app/output"

    # Selenium wait settings (seconds).
    DEFAULT_TIMEOUT = 30
    IMPLICIT_WAIT = 10
# Configure logging: informative app-level logs, quieter Selenium internals.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logging.getLogger("selenium").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Ensure OUTPUT_DIR exists before any results are written.
# FIX: removed a mistyped duplicate call (`ios.makedirs(...)`) that raised
# NameError at import time; the single os.makedirs below is sufficient.
os.makedirs(TempConfig.OUTPUT_DIR, exist_ok=True)
class DealfrontScraper:
    """Logs in to Dealfront via headless Chrome, opens a saved prospect
    search, and scrapes company name/website pairs from every result page.

    NOTE(review): the selectors and click flow are tied to the Dealfront web
    UI as seen at the time of writing; any UI change will break the waits,
    CSS selectors, and XPaths below.
    """

    def __init__(self):
        """Start a headless Chrome WebDriver and load login credentials.

        Raises:
            Exception: re-raised (after a critical log entry) if the
                WebDriver cannot be created, or if the credentials file
                cannot be read (see _load_credentials).
        """
        logger.info("Initialisiere den DealfrontScraper...")
        # Chrome options: headless, container-friendly flags, fixed viewport.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        # Value 2 disables image loading to speed up page rendering.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        # WebDriver bound to the configured chromedriver binary.
        service = Service(executable_path=TempConfig.CHROMEDRIVER_PATH)
        try:
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        # Explicit wait object plus a driver-wide implicit wait.
        self.wait = WebDriverWait(self.driver, TempConfig.DEFAULT_TIMEOUT)
        self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT)
        # Load credentials from the JSON file up front so a missing file
        # fails fast, before any navigation happens.
        self.username, self.password = self._load_credentials()
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Return (username, password) read from the credentials JSON file.

        Raises:
            Exception: re-raised (after logging) if the file is missing,
                not valid JSON, or lacks the 'username'/'password' keys.
        """
        try:
            with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
            return creds['username'], creds['password']
        except Exception as e:
            logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}")
            raise

    def login_and_find_list(self):
        """Log in to Dealfront and open the saved search TARGET_SEARCH_NAME."""
        # Login: fill email/password, click the 'Log in' button.
        logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}")
        self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
        self.wait.until(EC.visibility_of_element_located((By.NAME, 'email'))).send_keys(self.username)
        self.driver.find_element(By.NAME, 'password').send_keys(self.password)
        self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
        logger.info("Login gesendet.")
        # Click the 'Prospects finden' product tile on the landing page.
        tile = self.wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-test-target-product-tile='Prospects finden']"))
        )
        tile.click()
        logger.info("'Prospects finden' geklickt.")
        # Select the predefined saved search by its visible (truncated) label.
        sel = (By.XPATH, f"//div[contains(@class,'truncate') and normalize-space()='{TempConfig.TARGET_SEARCH_NAME}']")
        item = self.wait.until(EC.element_to_be_clickable(sel))
        item.click()
        logger.info(f"Suche '{TempConfig.TARGET_SEARCH_NAME}' geladen.")

    def extract_current_page_results(self):
        """Scrape the currently visible results table.

        Returns:
            list[dict]: one {'name': str, 'website': str} dict per table row;
            rows without a company-name link are skipped with a warning.
        """
        # Wait for the first name link so the table is actually rendered.
        first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
        self.wait.until(EC.visibility_of_element_located(first))
        logger.info("Extrahiere aktuelle Seite...")
        results = []
        # Wait for the data rows (only <tr> elements that carry an id).
        rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
        self.wait.until(EC.presence_of_all_elements_located(rows_sel))
        rows = self.driver.find_elements(*rows_sel)
        logger.info(f"{len(rows)} Zeilen gefunden.")
        for i, row in enumerate(rows, 1):
            # Company name: prefer the 'title' attribute, fall back to text.
            name_el = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
            if not name_el:
                logger.warning(f"Zeile {i}: Kein Name gefunden. Überspringe.")
                continue
            name = (name_el[0].get_attribute('title') or name_el[0].text).strip()
            # Website: anchor text in the 3rd column if present, otherwise
            # the cell's raw text, otherwise empty string.
            web_el = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
            if web_el:
                web = web_el[0].text.strip()
            else:
                cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
                web = cell[0].text.strip() if cell else ''
            results.append({'name': name, 'website': web})
        logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
        return results

    def click_next_page(self) -> bool:
        """Advance to the next result page, if any.

        Returns:
            bool: True if a next page was opened; False when pagination is
            absent or the 'next' button is disabled (last page reached).
        """
        btns = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button")
        if not btns:
            return False
        # Assumes the LAST pagination button is 'next' — TODO confirm in UI.
        nxt = btns[-1]
        if not nxt.is_enabled() or 'disabled' in nxt.get_attribute('class'):
            return False
        # Remember the active page label, then wait until it changes so the
        # new page's content is loaded before the caller re-extracts.
        curr = self.driver.find_element(By.CSS_SELECTOR,
            "nav.eb-pagination a.eb-pagination-button.active").text
        nxt.click()
        self.wait.until(lambda d: d.find_element(By.CSS_SELECTOR,
            "nav.eb-pagination a.eb-pagination-button.active").text != curr)
        return True

    def run(self):
        """Execute the full scrape; the driver is always quit on exit.

        Returns:
            list[dict]: accumulated results from every paginated page.
        """
        try:
            self.login_and_find_list()
            all_data = []
            # Extract each page until click_next_page reports no next page.
            while True:
                all_data.extend(self.extract_current_page_results())
                if not self.click_next_page():
                    break
            return all_data
        finally:
            self.driver.quit()
if __name__ == '__main__':
    # Run the full scrape and dump each company record to stdout.
    scraper = DealfrontScraper()
    for entry in scraper.run():
        print(entry)