dealfront_enrichment.py aktualisiert

This commit is contained in:
2025-07-10 11:36:03 +00:00
parent d086df91e3
commit 4333a6e70a

View File

@@ -1,182 +1,207 @@
#!/usr/bin/env python3
import os
import sys
import json
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# ────────────────────────────────────────────────────────────────
# Legacy top-level configuration. NOTE(review): TempConfig below defines
# overlapping values (login URL, search name, credentials path) that the
# newer code path uses instead — confirm which set is authoritative.
LOGIN_URL = "https://app.dealfront.com/login"  # Dealfront sign-in page
TARGET_TAB = "Target"  # product tab opened after login
SEARCH_NAME = "Facility Management"  # saved search to load
CREDS_FILE = "dealfront_credentials.json"  # JSON file with username/password
OUTPUT_DIR = "output"  # destination for results/logs/debug artifacts
CHROMEDRIVER_PATH = "/usr/bin/chromedriver"  # chromedriver binary location
LOG_FORMAT = "%(asctime)s %(levelname)-8s %(message)s"  # log record layout
# ────────────────────────────────────────────────────────────────
# ==============================================================================
# TEMPORARY, SELF-CONTAINED CONFIGURATION
# ==============================================================================
class TempConfig:
    # --- Values defined directly here to bypass config.py ---
    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_SEARCH_NAME = "Facility Management"  # <-- ADJUST TO YOUR OWN SEARCH
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
# ==============================================================================
# Configure root logging exactly once. The original code called
# logging.basicConfig twice; because the second call used force=True it
# replaced the first configuration entirely, so the first call was dead code.
OUTPUT_DIR = "/app/output"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s',
    force=True,
)
# Selenium is very chatty at INFO; keep only warnings and above.
logging.getLogger("selenium").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
def load_creds(path):
    """Load Dealfront credentials from a JSON file.

    Args:
        path: Path to a JSON file containing "username" and "password" keys.

    Returns:
        Tuple (username, password).

    Exits the process with status 1 when the file is missing or either key
    is absent/empty.
    """
    if not os.path.exists(path):
        logger.error("Credentials-Datei nicht gefunden: %s", path)
        sys.exit(1)
    # Original leaked the file handle via json.load(open(...)); use a
    # context manager so the file is closed deterministically.
    with open(path, encoding="utf-8") as fh:
        creds = json.load(fh)
    user, pwd = creds.get("username"), creds.get("password")
    if not user or not pwd:
        logger.error("username/password fehlen in %s", path)
        sys.exit(1)
    return user, pwd
# Mirror every record (DEBUG and up) into a timestamped log file so each run
# leaves a complete trace alongside the scraped output.
log_filename = f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.txt"
log_filepath = os.path.join(OUTPUT_DIR, log_filename)
try:
    # FileHandler raises if the directory is missing; create it first.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
    logging.getLogger().addHandler(file_handler)
    logger.info(f"Logging konfiguriert. Log-Datei: {log_filepath}")
except Exception as e:
    # Best effort: the run continues with console logging only.
    logger.error(f"Konnte Log-Datei nicht erstellen: {e}")
class DealfrontScraper:
    """Automates login and company extraction from the Dealfront web app."""

    def __init__(self, driver=None, wait=None, user=None, pwd=None):
        """Create a scraper.

        The class previously declared two ``__init__`` methods; the second
        silently overrode the first, breaking every caller that passed a
        driver. This constructor merges both signatures backward-compatibly:

        Args:
            driver: An existing Selenium WebDriver. When None, a headless
                Chrome instance is created here.
            wait: A WebDriverWait bound to *driver*. When None, a 30 s wait
                is created.
            user: Dealfront username. When None (together with *pwd*),
                credentials are read from TempConfig.DEALFRONT_CREDENTIALS_FILE.
            pwd: Dealfront password (see *user*).
        """
        logger.info("Initialisiere den DealfrontScraper...")
        if driver is None:
            chrome_options = ChromeOptions()
            # Skip image downloads to speed up page loads.
            prefs = {"profile.managed_default_content_settings.images": 2}
            chrome_options.add_experimental_option("prefs", prefs)
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--window-size=1920,1080")
            service = Service(executable_path='/usr/bin/chromedriver')
            try:
                driver = webdriver.Chrome(service=service, options=chrome_options)
            except Exception:
                logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
                raise
        self.driver = driver
        self.wait = wait if wait is not None else WebDriverWait(self.driver, 30)
        if user is None or pwd is None:
            user, pwd = self._load_credentials()
        # Both attribute spellings are used across the methods below
        # (self.user/self.pwd and self.username/self.password); keep them in sync.
        self.user = self.username = user
        self.pwd = self.password = pwd
        logger.info("WebDriver erfolgreich initialisiert.")
def login_and_select_search(self):
# 1) Login abschließen
self.driver.get(LOGIN_URL)
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[type='email'],input[type='text']")))
self.driver.find_element(By.CSS_SELECTOR, "input[type='email'],input[type='text']").send_keys(self.user)
self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.pwd)
self.driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
def _load_credentials(self):
try:
with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r') as f:
creds = json.load(f)
return creds.get("username"), creds.get("password")
except Exception as e:
logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}")
return None, None
# 2) Warte kurz auf URL-Wechsel nach Login
self.wait.until(lambda d: d.current_url != LOGIN_URL)
# 3) Direkt zur Target-Übersichtsseite navigieren
self.driver.get("https://app.dealfront.com/target")
# 4) Sidebar mit Such-List laden
sidebar_sel = "ul[data-userpilot-id='sidebar-searches-list']"
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, sidebar_sel)))
# 5) Deine Suche anklicken (div[title=…])
div_sel = f"div[title='{SEARCH_NAME}']"
el = self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, div_sel)))
self.driver.execute_script("arguments[0].click()", el)
# 6) Erstes Daten-Element abwarten
first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
self.wait.until(EC.visibility_of_element_located(first))
time.sleep(1)
def _save_debug_artifacts(self):
# ... (Diese Methode bleibt unverändert) ...
try:
os.makedirs(OUTPUT_DIR, exist_ok=True)
timestamp = time.strftime("%Y%m%d-%H%M%S")
screenshot_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.png")
html_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.html")
self.driver.save_screenshot(screenshot_filepath)
logger.error(f"Screenshot '{screenshot_filepath}' wurde für die Analyse gespeichert.")
with open(html_filepath, "w", encoding="utf-8") as f:
f.write(self.driver.page_source)
logger.error(f"HTML-Quellcode '{html_filepath}' wurde für die Analyse gespeichert.")
except Exception as e:
logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")
def login_and_find_list(self, search_name):
# ... (Diese Methode bleibt unverändert, verwendet aber jetzt TempConfig) ...
try:
logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}")
self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
logger.info("Login-Befehl gesendet.")
logger.info("Warte auf Dashboard und den 'Prospects finden' Quick-Link...")
prospects_link_selector = (By.XPATH, "//a[@data-test-target-product-tile]")
prospects_link = self.wait.until(EC.element_to_be_clickable(prospects_link_selector))
prospects_link.click()
logger.info("'Prospects finden' geklickt.")
logger.info(f"Warte auf die Liste der Suchen und klicke auf '{search_name}'...")
search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']")
search_item = self.wait.until(EC.element_to_be_clickable(search_item_selector))
search_item.click()
logger.info(f"Suche '{search_name}' geladen. Warte auf das Rendern der Ergebnistabelle.")
first_row_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
self.wait.until(
EC.visibility_of_element_located(first_row_locator)
)
time.sleep(5)
logger.info("Zielseite mit Ergebnissen erfolgreich erreicht.")
return True
except Exception as e:
logger.critical(f"Der Prozess ist fehlgeschlagen: {type(e).__name__}", exc_info=True)
self._save_debug_artifacts()
return False
def extract_current_page_results(self):
# 1) Kurz Implicit-Wait absenken
self.driver.implicitly_wait(1)
# 2) Auf erstes Daten-Element warten und Puffer
first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
self.wait.until(EC.visibility_of_element_located(first))
# 2) Auf das erste Daten-Element warten
first_row_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
self.wait.until(EC.visibility_of_element_located(first_row_locator))
time.sleep(1)
try:
logger.info("Extrahiere Ergebnisse von der aktuellen Seite...")
results = []
# 3) Auf mindestens eine Tabellenzeile warten
rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
self.wait.until(EC.presence_of_all_elements_located(rows_sel))
rows = self.driver.find_elements(*rows_sel)
# 3) Warten auf mindestens eine Tabellen-Zeile
rows_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
self.wait.until(EC.presence_of_all_elements_located(rows_selector))
rows = self.driver.find_elements(*rows_selector)
logger.info(f"{len(rows)} Firmen-Zeilen gefunden.")
# 4) Schleife ohne weitere Sleeps
for i, row in enumerate(rows, 1):
# Name-Extraktion (bewährter Selector)
name_elems = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
if not name_elems:
logger.warning(f"Zeile {i}: Kein Name-Element gefunden. Überspringe.")
continue
ne = name_elems[0]
company_name = (ne.get_attribute("title") or ne.text).strip()
name_elem = name_elems[0]
company_name = (name_elem.get_attribute("title") or name_elem.text).strip()
web_elems = row.find_elements(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text")
website = web_elems[0].text.strip() if web_elems else ""
# Website-Extraktion aus 3. Spalte
web_elems = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
if web_elems:
# Link-Text ist der Domain-Name
website = web_elems[0].text.strip()
else:
# Fallback: reiner Zellen-Text
cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
website = cell[0].text.strip() if cell else ""
results.append({'name': company_name, 'website': website})
logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
return results
except Exception as e:
logger.error(f"Schwerwiegender Fehler bei der Extraktion: {type(e).__name__}", exc_info=True)
self._save_debug_artifacts()
return []
finally:
# 5) Implicit-Wait wieder auf Standard setzen (z.B. 10 s)
# 4) Implicit-Wait auf Standard zurücksetzen
self.driver.implicitly_wait(10)
def click_next_page(self):
# Paginator-Buttons greifen
btns = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button")
if not btns:
return False
nxt = btns[-1]
# Ende erreicht?
if not nxt.is_enabled() or "disabled" in nxt.get_attribute("class"):
return False
def close(self):
if self.driver:
logger.info("Schließe den WebDriver.")
self.driver.quit()
current = self.driver.find_element(
By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active"
).text
nxt.click()
# auf Seitenwechsel warten
self.wait.until(lambda d: d.find_element(
By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active"
).text != current)
return True
def run(self):
logger.info("Starte Login und Sucheauswahl…")
self.login_and_select_search()
all_res = []
page = 1
while True:
logger.info(f"Seite {page}: Extrahiere Daten…")
all_res.extend(self.extract_current_page_results())
if not self.click_next_page():
break
page += 1
return all_res
def main():
    """Entry point: scrape all result pages and write them to results.json."""
    user, pwd = load_creds(CREDS_FILE)
    options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)
    driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=options)
    wait = WebDriverWait(driver, 30)
    try:
        scraper = DealfrontScraper(driver, wait, user, pwd)
        results = scraper.run()
    finally:
        # Always release the browser, even when scraping fails.
        driver.quit()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    out_path = os.path.join(OUTPUT_DIR, "results.json")
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(results, fh, ensure_ascii=False, indent=2)
    logger.info(f"✅ Fertig: {len(results)} Einträge in {out_path}")
if __name__ == "__main__":
    # Debug entry point: log in, load the configured search and print the
    # companies extracted from the first results page.
    #
    # NOTE: the diff extraction had fused the old guard (which called the
    # removed ``main()`` flow) with this newer script body; the ``main()``
    # call is dropped because it targeted the superseded constructor
    # signature, and the script body is placed under the guard so importing
    # this module has no side effects.
    logger.info("Starte Dealfront Automatisierung - DEBUG-MODUS")
    scraper = None
    try:
        scraper = DealfrontScraper()
        if not scraper.driver:
            raise Exception("WebDriver konnte nicht initialisiert werden.")
        if not scraper.login_and_find_list(TempConfig.TARGET_SEARCH_NAME):
            raise Exception("Der Prozess vom Login bis zum Laden der Liste ist fehlgeschlagen.")
        companies = scraper.extract_current_page_results()
        if companies:
            df = pd.DataFrame(companies)
            # Disable pandas truncation so the full table is printed.
            pd.set_option('display.max_rows', None)
            pd.set_option('display.max_columns', None)
            pd.set_option('display.width', 1000)
            pd.set_option('display.max_colwidth', None)
            print("\n" + "="*80)
            print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "="))
            print("="*80)
            print(df.to_string(index=False))
            print("="*80 + "\n")
        else:
            logger.warning("Obwohl die Seite geladen wurde, konnten keine Firmen extrahiert werden.")
        logger.info("Test erfolgreich abgeschlossen. Warte vor dem Schließen...")
        # Keep the session open briefly for manual inspection of logs.
        time.sleep(10)
    except Exception as e:
        logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=False)
    finally:
        if scraper:
            scraper.close()
        logger.info("Dealfront Automatisierung beendet.")