Files
Brancheneinstufung2/dealfront_enrichment.py

149 lines
6.6 KiB
Python

import os
import json
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# --- Konfiguration ---
class Config:
    """Static configuration for the Dealfront scraping run."""

    # Entry points in the Dealfront web app.
    LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
    # Name of the saved Prospector search to load -- adjust to your own saved search.
    SEARCH_NAME = "Facility Management"
    # Container paths for credentials and run artifacts.
    CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    OUTPUT_DIR = "/app/output"
# --- Logging Setup ---
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
logging.getLogger("selenium.webdriver.remote").setLevel(logging.WARNING) # Reduziert Selenium-Spam
logger = logging.getLogger(__name__)
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
log_filepath = os.path.join(Config.OUTPUT_DIR, f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.log")
file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger().addHandler(file_handler)
class DealfrontScraper:
    """Automates Dealfront: log in, open a saved Prospector search and
    scrape company name/website pairs from the first page of results."""

    def __init__(self):
        """Start a headless Chrome WebDriver and load login credentials.

        Raises:
            ValueError: if username or password cannot be read from the
                credentials file (fail fast instead of failing at login).
        """
        logger.info("Initialisiere WebDriver...")
        chrome_options = ChromeOptions()
        # Pref value 2 == "block": skip image downloads to speed up page loads.
        chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, 30)
        self.username, self.password = self._load_credentials()
        # FAIL-FAST: abort immediately when credentials are missing.
        if not self.username or not self.password:
            raise ValueError("Benutzername oder Passwort konnten nicht aus der Credentials-Datei geladen werden. Breche ab.")
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Return ``(username, password)`` from ``Config.CREDENTIALS_FILE``.

        Returns ``(None, None)`` on any read/parse error; the caller decides
        whether that is fatal.
        """
        try:
            with open(Config.CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
            return creds.get("username"), creds.get("password")
        except Exception as e:
            logger.error(f"Credentials-Datei {Config.CREDENTIALS_FILE} konnte nicht geladen werden: {e}")
            return None, None

    def _save_debug_artifacts(self):
        """Best-effort dump of a screenshot and the page HTML for debugging."""
        try:
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            screenshot_path = os.path.join(Config.OUTPUT_DIR, f"error_{timestamp}.png")
            html_path = os.path.join(Config.OUTPUT_DIR, f"error_{timestamp}.html")
            self.driver.save_screenshot(screenshot_path)
            logger.error(f"Debug-Screenshot gespeichert: {screenshot_path}")
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.error(f"Debug-HTML-Quellcode gespeichert: {html_path}")
        except Exception as e:
            # Debug dumps must never mask the original failure.
            logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")

    def _login(self):
        """Submit the login form and allow time for the redirect."""
        logger.info(f"Navigiere zur Login-Seite: {Config.LOGIN_URL}")
        self.driver.get(Config.LOGIN_URL)
        self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
        self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
        self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
        logger.info("Login-Befehl gesendet. Kurze Pause für die Weiterleitung.")
        # NOTE(review): fixed sleep for post-login redirect; an explicit
        # URL-change wait would be more robust -- kept to preserve behavior.
        time.sleep(5)

    def _load_saved_search(self):
        """Open the Prospector page and click the configured saved search."""
        logger.info(f"Navigiere direkt zur Target-Seite und lade die Suche: '{Config.SEARCH_NAME}'")
        self.driver.get(Config.TARGET_URL)
        self.wait.until(EC.url_contains("/t/prospector/"))
        search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{Config.SEARCH_NAME}']")
        self.wait.until(EC.element_to_be_clickable(search_item_selector)).click()

    def _extract_companies(self):
        """Scrape ``{'name', 'website'}`` dicts from the first result page."""
        logger.info("Suche geladen. Extrahiere Ergebnisse der ersten Seite.")
        results_table_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
        self.wait.until(EC.presence_of_element_located(results_table_selector))
        data_rows = self.driver.find_elements(By.XPATH, "//tr[.//a[contains(@class, 't-highlight-text')]]")
        logger.info(f"{len(data_rows)} gültige Datenzeilen gefunden.")
        companies = []
        for row in data_rows:
            try:
                name = row.find_element(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text").get_attribute("title").strip()
                website = row.find_element(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text").text.strip()
                companies.append({'name': name, 'website': website})
            except NoSuchElementException:
                # Rows without both cells (headers/spacers) are skipped.
                continue
        return companies

    def run(self):
        """Execute the workflow: login, load the saved search, extract rows.

        Returns:
            list[dict]: one ``{'name': ..., 'website': ...}`` dict per company.

        Raises:
            TimeoutException / NoSuchElementException: re-raised after debug
                artifacts have been written.
        """
        try:
            self._login()
            self._load_saved_search()
            return self._extract_companies()
        except (TimeoutException, NoSuchElementException):
            # FIX: _save_debug_artifacts() was defined but never called, so
            # failures left no screenshot/HTML behind. Dump artifacts before
            # propagating the error to the caller.
            self._save_debug_artifacts()
            raise

    def close(self):
        """Quit the WebDriver if one was started."""
        if self.driver:
            self.driver.quit()
            logger.info("WebDriver geschlossen.")
if __name__ == "__main__":
scraper = None
try:
scraper = DealfrontScraper()
company_list = scraper.run()
if company_list:
df = pd.DataFrame(company_list)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
pd.set_option('display.max_colwidth', None)
print("\n" + "="*80)
print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "="))
print("="*80)
print(df.to_string(index=False))
print("="*80 + "\n")
else:
logger.warning("Keine Firmen konnten extrahiert werden.")
logger.info("Workflow erfolgreich abgeschlossen.")
except Exception as e:
logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=False)
finally:
if scraper:
scraper.close()
logger.info("Dealfront Automatisierung beendet.")