dealfront_enrichment.py aktualisiert

This commit is contained in:
2025-07-10 11:36:03 +00:00
parent d086df91e3
commit 4333a6e70a

View File

@@ -1,182 +1,207 @@
#!/usr/bin/env python3
import json
import logging
import os
import sys
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# ==============================================================================
# TEMPORARY, SELF-CONTAINED CONFIGURATION
# (inlined here to bypass the project's config.py during debugging)
# ==============================================================================
class TempConfig:
    """Hard-coded runtime configuration used instead of config.py."""

    # Dealfront login page.
    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    # Display name of the saved Target search to open -- adjust to your search.
    TARGET_SEARCH_NAME = "Facility Management"
    # JSON file containing {"username": ..., "password": ...}.
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
# ==============================================================================
# --- Module-level logging setup ----------------------------------------------
OUTPUT_DIR = "/app/output"

# Shared format string for console and file handlers.  NOTE: the previous
# revision referenced LOG_FORMAT inside the try-block without defining it,
# so the file handler always failed with a NameError that the except-clause
# silently swallowed -- file logging never actually worked.
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
logging.getLogger("selenium").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

log_filename = f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.txt"
log_filepath = os.path.join(OUTPUT_DIR, log_filename)
try:
    # FileHandler does not create missing parent directories, so ensure the
    # output directory exists before opening the log file.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
    logging.getLogger().addHandler(file_handler)
    logger.info(f"Logging konfiguriert. Log-Datei: {log_filepath}")
except Exception as e:
    # Best effort: file logging is optional; console logging keeps working.
    logger.error(f"Konnte Log-Datei nicht erstellen: {e}")
class DealfrontScraper:
    """Automates login to Dealfront and extraction of Target search results."""

    def __init__(self):
        """Start a headless Chrome driver and load the login credentials.

        Raises:
            Exception: re-raised unchanged if the WebDriver cannot be started.
        """
        logger.info("Initialisiere den DealfrontScraper...")
        chrome_options = ChromeOptions()
        # Disable image loading to speed up page rendering.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        service = Service(executable_path='/usr/bin/chromedriver')
        try:
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, 30)
        # May be (None, None) if the credentials file is missing or invalid.
        self.username, self.password = self._load_credentials()
        logger.info("WebDriver erfolgreich initialisiert.")
def login_and_select_search(self): def _load_credentials(self):
# 1) Login abschließen try:
self.driver.get(LOGIN_URL) with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r') as f:
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[type='email'],input[type='text']"))) creds = json.load(f)
self.driver.find_element(By.CSS_SELECTOR, "input[type='email'],input[type='text']").send_keys(self.user) return creds.get("username"), creds.get("password")
self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.pwd) except Exception as e:
self.driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click() logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}")
return None, None
# 2) Warte kurz auf URL-Wechsel nach Login def _save_debug_artifacts(self):
self.wait.until(lambda d: d.current_url != LOGIN_URL) # ... (Diese Methode bleibt unverändert) ...
try:
# 3) Direkt zur Target-Übersichtsseite navigieren os.makedirs(OUTPUT_DIR, exist_ok=True)
self.driver.get("https://app.dealfront.com/target") timestamp = time.strftime("%Y%m%d-%H%M%S")
screenshot_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.png")
# 4) Sidebar mit Such-List laden html_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.html")
sidebar_sel = "ul[data-userpilot-id='sidebar-searches-list']" self.driver.save_screenshot(screenshot_filepath)
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, sidebar_sel))) logger.error(f"Screenshot '{screenshot_filepath}' wurde für die Analyse gespeichert.")
with open(html_filepath, "w", encoding="utf-8") as f:
# 5) Deine Suche anklicken (div[title=…]) f.write(self.driver.page_source)
div_sel = f"div[title='{SEARCH_NAME}']" logger.error(f"HTML-Quellcode '{html_filepath}' wurde für die Analyse gespeichert.")
el = self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, div_sel))) except Exception as e:
self.driver.execute_script("arguments[0].click()", el) logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")
# 6) Erstes Daten-Element abwarten
first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
self.wait.until(EC.visibility_of_element_located(first))
time.sleep(1)
def login_and_find_list(self, search_name):
# ... (Diese Methode bleibt unverändert, verwendet aber jetzt TempConfig) ...
try:
logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}")
self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
logger.info("Login-Befehl gesendet.")
logger.info("Warte auf Dashboard und den 'Prospects finden' Quick-Link...")
prospects_link_selector = (By.XPATH, "//a[@data-test-target-product-tile]")
prospects_link = self.wait.until(EC.element_to_be_clickable(prospects_link_selector))
prospects_link.click()
logger.info("'Prospects finden' geklickt.")
logger.info(f"Warte auf die Liste der Suchen und klicke auf '{search_name}'...")
search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']")
search_item = self.wait.until(EC.element_to_be_clickable(search_item_selector))
search_item.click()
logger.info(f"Suche '{search_name}' geladen. Warte auf das Rendern der Ergebnistabelle.")
first_row_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
self.wait.until(
EC.visibility_of_element_located(first_row_locator)
)
time.sleep(5)
logger.info("Zielseite mit Ergebnissen erfolgreich erreicht.")
return True
except Exception as e:
logger.critical(f"Der Prozess ist fehlgeschlagen: {type(e).__name__}", exc_info=True)
self._save_debug_artifacts()
return False
def extract_current_page_results(self): def extract_current_page_results(self):
# 1) Kurz Implicit-Wait absenken # 1) Kurz Implicit-Wait absenken
self.driver.implicitly_wait(1) self.driver.implicitly_wait(1)
# 2) Auf erstes Daten-Element warten und Puffer # 2) Auf das erste Daten-Element warten
first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") first_row_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
self.wait.until(EC.visibility_of_element_located(first)) self.wait.until(EC.visibility_of_element_located(first_row_locator))
time.sleep(1) time.sleep(1)
try: try:
logger.info("Extrahiere Ergebnisse von der aktuellen Seite...") logger.info("Extrahiere Ergebnisse von der aktuellen Seite...")
results = [] results = []
# 3) Auf mindestens eine Tabellenzeile warten # 3) Warten auf mindestens eine Tabellen-Zeile
rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]") rows_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
self.wait.until(EC.presence_of_all_elements_located(rows_sel)) self.wait.until(EC.presence_of_all_elements_located(rows_selector))
rows = self.driver.find_elements(*rows_sel) rows = self.driver.find_elements(*rows_selector)
logger.info(f"{len(rows)} Firmen-Zeilen gefunden.") logger.info(f"{len(rows)} Firmen-Zeilen gefunden.")
# 4) Schleife ohne weitere Sleeps
for i, row in enumerate(rows, 1): for i, row in enumerate(rows, 1):
# Name-Extraktion (bewährter Selector)
name_elems = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") name_elems = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
if not name_elems: if not name_elems:
logger.warning(f"Zeile {i}: Kein Name-Element gefunden. Überspringe.") logger.warning(f"Zeile {i}: Kein Name-Element gefunden. Überspringe.")
continue continue
ne = name_elems[0] name_elem = name_elems[0]
company_name = (ne.get_attribute("title") or ne.text).strip() company_name = (name_elem.get_attribute("title") or name_elem.text).strip()
web_elems = row.find_elements(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text") # Website-Extraktion aus 3. Spalte
website = web_elems[0].text.strip() if web_elems else "" web_elems = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
if web_elems:
# Link-Text ist der Domain-Name
website = web_elems[0].text.strip()
else:
# Fallback: reiner Zellen-Text
cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
website = cell[0].text.strip() if cell else ""
results.append({'name': company_name, 'website': website}) results.append({'name': company_name, 'website': website})
logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.") logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
return results return results
except Exception as e:
logger.error(f"Schwerwiegender Fehler bei der Extraktion: {type(e).__name__}", exc_info=True)
self._save_debug_artifacts()
return []
finally: finally:
# 5) Implicit-Wait wieder auf Standard setzen (z.B. 10 s) # 4) Implicit-Wait auf Standard zurücksetzen
self.driver.implicitly_wait(10) self.driver.implicitly_wait(10)
def click_next_page(self):
# Paginator-Buttons greifen def close(self):
btns = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button") if self.driver:
if not btns: logger.info("Schließe den WebDriver.")
return False self.driver.quit()
nxt = btns[-1]
# Ende erreicht?
if not nxt.is_enabled() or "disabled" in nxt.get_attribute("class"):
return False
current = self.driver.find_element(
By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active"
).text
nxt.click()
# auf Seitenwechsel warten
self.wait.until(lambda d: d.find_element(
By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active"
).text != current)
return True
def run(self):
logger.info("Starte Login und Sucheauswahl…")
self.login_and_select_search()
all_res = []
page = 1
while True:
logger.info(f"Seite {page}: Extrahiere Daten…")
all_res.extend(self.extract_current_page_results())
if not self.click_next_page():
break
page += 1
return all_res
if __name__ == "__main__":
    logger.info("Starte Dealfront Automatisierung - DEBUG-MODUS")
    scraper = None
    try:
        scraper = DealfrontScraper()
        if not scraper.driver:
            raise Exception("WebDriver konnte nicht initialisiert werden.")
        if not scraper.login_and_find_list(TempConfig.TARGET_SEARCH_NAME):
            raise Exception("Der Prozess vom Login bis zum Laden der Liste ist fehlgeschlagen.")
        companies = scraper.extract_current_page_results()
        if companies:
            # Print the first page of results as a wide, untruncated table.
            df = pd.DataFrame(companies)
            pd.set_option('display.max_rows', None)
            pd.set_option('display.max_columns', None)
            pd.set_option('display.width', 1000)
            pd.set_option('display.max_colwidth', None)
            print("\n" + "="*80)
            print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "="))
            print("="*80)
            print(df.to_string(index=False))
            print("="*80 + "\n")
        else:
            logger.warning("Obwohl die Seite geladen wurde, konnten keine Firmen extrahiert werden.")
        logger.info("Test erfolgreich abgeschlossen. Warte vor dem Schließen...")
        # Keep the browser open briefly for manual inspection in debug runs.
        time.sleep(10)
    except Exception as e:
        logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=False)
    finally:
        if scraper:
            scraper.close()
        logger.info("Dealfront Automatisierung beendet.")