Files
Brancheneinstufung2/dealfront_enrichment.py
2025-07-11 07:22:31 +00:00

157 lines
7.3 KiB
Python

import os
import json
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# --- Configuration ---
class TempConfig:
    """Static configuration for the Dealfront scraping run."""
    # Entry point for authentication.
    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    # Prospector company-list view that hosts the saved searches.
    DEALFRONT_TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
    # Name of the saved search to open -- PLEASE ADAPT TO YOUR OWN SEARCH.
    TARGET_SEARCH_NAME = "Facility Management"
    # JSON file expected to hold {"username": ..., "password": ...}.
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
# --- Logging Setup ---
# Console + per-run file logging. Everything below runs at import time.
OUTPUT_DIR = "/app/output"
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
# force=True replaces any handlers a previous basicConfig call installed.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
logging.getLogger("selenium").setLevel(logging.INFO) # reduce Selenium log noise to INFO
logger = logging.getLogger(__name__)
os.makedirs(OUTPUT_DIR, exist_ok=True)
# One timestamped log file per run, written next to the debug artifacts.
log_filepath = os.path.join(OUTPUT_DIR, f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.log")
file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger().addHandler(file_handler)
class DealfrontScraper:
    """Drives a headless Chrome session through the Dealfront web UI.

    Logs in, opens a saved Prospector search, and scrapes the company
    name/website pairs from the first result page. On any failure a
    screenshot and the page HTML are written to OUTPUT_DIR.
    """

    def __init__(self):
        """Start the headless Chrome driver and load the credentials.

        Raises:
            Whatever the WebDriver constructor raised if Chrome cannot
            be started (logged as critical, then re-raised).
        """
        logger.info("Initialisiere WebDriver...")
        chrome_options = ChromeOptions()
        # Disable image loading to speed up page loads.
        chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        chrome_options.add_argument("--headless=new")
        # --no-sandbox / --disable-dev-shm-usage are required when Chrome
        # runs inside a container.
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, 30)
        self.username, self.password = self._load_credentials()
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Return (username, password) from the JSON credentials file.

        Returns (None, None) when the file is missing or malformed.
        """
        try:
            with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r') as f:
                # BUG FIX: the original called json.load(f) twice on the
                # same stream; the second call always raised because the
                # stream was already exhausted, so (None, None) was
                # returned even for a valid file. Parse once instead.
                creds = json.load(f)
            return creds.get("username"), creds.get("password")
        except Exception:
            logger.error(f"Credentials-Datei {TempConfig.DEALFRONT_CREDENTIALS_FILE} nicht gefunden oder fehlerhaft.")
            return None, None

    def _save_debug_artifacts(self):
        """Dump a screenshot and the page HTML for post-mortem debugging."""
        try:
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            screenshot_path = os.path.join(OUTPUT_DIR, f"error_{timestamp}.png")
            html_path = os.path.join(OUTPUT_DIR, f"error_{timestamp}.html")
            self.driver.save_screenshot(screenshot_path)
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.error(f"Debug-Artefakte gespeichert: {screenshot_path}, {html_path}")
        except Exception as e:
            # Best effort only -- never let artifact saving mask the real error.
            logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")

    def login(self):
        """Submit the login form.

        Returns:
            True when the login click was issued, False on any failure
            (the failure is logged and debug artifacts are saved).
        """
        try:
            logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}")
            self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
            self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
            self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
            self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
            logger.info("Login-Befehl gesendet. Warte 5 Sekunden, damit die Session etabliert wird.")
            # Fixed delay: there is no reliable post-login element to wait
            # on, so give the session cookies time to be established.
            time.sleep(5)
            return True
        except Exception:
            logger.critical("Login-Prozess fehlgeschlagen.", exc_info=True)
            self._save_debug_artifacts()
            return False

    def navigate_and_load_search(self, search_name):
        """Open the Prospector page and click the saved search *search_name*.

        Returns:
            True once the result table is visible, False on failure.
        """
        try:
            logger.info(f"Navigiere direkt zur Target-URL und lade die Suche...")
            self.driver.get(TempConfig.DEALFRONT_TARGET_URL)
            self.wait.until(EC.url_contains("/t/prospector/"))
            logger.info("Target-Seite erreicht. Klicke auf die Suche: '{}'".format(search_name))
            search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']")
            self.wait.until(EC.element_to_be_clickable(search_item_selector)).click()
            logger.info("Suche geladen. Warte auf die Ergebnistabelle.")
            self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table tbody tr")))
            return True
        except Exception:
            logger.critical("Navigation oder Laden der Suche fehlgeschlagen.", exc_info=True)
            self._save_debug_artifacts()
            return False

    def extract_results_from_page(self):
        """Scrape the currently displayed result page.

        Returns:
            A list of {'name': ..., 'website': ...} dicts; an empty list
            when extraction fails entirely. Rows missing the expected
            elements are skipped with a warning.
        """
        try:
            logger.info("Extrahiere Daten von der aktuellen Seite...")
            results = []
            # Only rows that actually contain a company link count as data rows.
            rows_selector = (By.XPATH, "//table[@id='t-result-table']/tbody/tr[.//a[contains(@class, 't-highlight-text')]]")
            data_rows = self.wait.until(EC.presence_of_all_elements_located(rows_selector))
            logger.info(f"{len(data_rows)} gültige Firmen-Datenzeilen gefunden.")
            for row in data_rows:
                try:
                    name = row.find_element(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text").get_attribute("title").strip()
                    website = row.find_element(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text").text.strip()
                    results.append({'name': name, 'website': website})
                except NoSuchElementException:
                    logger.warning("Einzelne Zeile konnte nicht verarbeitet werden, überspringe.")
                    continue
            return results
        except Exception:
            logger.error("Fehler bei der Extraktion.", exc_info=True)
            self._save_debug_artifacts()
            return []

    def close(self):
        """Quit the WebDriver if it was started."""
        if self.driver:
            logger.info("Schließe den WebDriver.")
            self.driver.quit()
if __name__ == "__main__":
    logger.info("Starte Dealfront Automatisierung - Finaler, robuster Workflow")
    scraper = None
    try:
        scraper = DealfrontScraper()
        # Abort the run explicitly when a phase fails instead of continuing
        # with a half-initialized session. RuntimeError (instead of the
        # original bare Exception) is still caught by the handler below.
        if not scraper.login():
            raise RuntimeError("Login-Phase fehlgeschlagen")
        if not scraper.navigate_and_load_search(TempConfig.TARGET_SEARCH_NAME):
            raise RuntimeError("Navigations-Phase fehlgeschlagen")
        companies = scraper.extract_results_from_page()
        if companies:
            # Print the scraped companies as a plain table to stdout.
            df = pd.DataFrame(companies)
            print("\n" + "=" * 80)
            print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "="))
            print("=" * 80)
            print(df.to_string(index=False))
            print("=" * 80 + "\n")
        else:
            logger.warning("Keine Firmen konnten extrahiert werden.")
        logger.info("Test erfolgreich abgeschlossen.")
    except Exception as e:
        logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}")
    finally:
        # Always release the browser, even after a failure.
        if scraper:
            scraper.close()
        logger.info("Dealfront Automatisierung beendet.")