Files
Brancheneinstufung2/dealfront_enrichment.py

113 lines
4.8 KiB
Python

import json
import logging
import os
import time
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# --- Configuration ---
class Config:
LOGIN_URL = "https://app.dealfront.com/login"
BASE_TARGET_URL = "https://app.dealfront.com/t/prospector/companies/p/"
SEARCH_NAME = "Facility Management"
CREDENTIALS_FILE = "/app/dealfront_credentials.json"
OUTPUT_DIR = "/app/output"
# --- Logging Setup ---
# ... (unchanged) ...
class DealfrontScraper:
# ... (__init__, _load_credentials, _save_debug_artifacts unchanged) ...
def login_and_prepare_search(self):
# ... (Login bleibt unverändert) ...
# Navigieren Sie zur ersten Seite der Suche, um die Session zu initialisieren
self.driver.get(f"{Config.BASE_TARGET_URL}1?search_name={Config.SEARCH_NAME}")
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table")))
logger.info("Erste Ergebnisseite erfolgreich geladen.")
return True
def extract_data_with_js(self):
"""Extrahiert die Daten mit einem direkt im Browser ausgeführten JavaScript."""
script = """
const results = [];
const rows = document.querySelectorAll("table#t-result-table tbody tr[id]");
rows.forEach(row => {
const companyElem = row.querySelector(".sticky-column a.t-highlight-text");
const websiteElem = row.querySelector("a.text-gray-400.t-highlight-text");
if (companyElem) {
results.push({
name: companyElem.getAttribute('title') || companyElem.innerText,
website: websiteElem ? websiteElem.innerText : 'N/A'
});
}
});
return results;
"""
try:
return self.driver.execute_script(script)
except Exception as e:
logger.error(f"JavaScript-Extraktion fehlgeschlagen: {e}")
return []
def run_full_extraction(self, max_pages=6):
all_companies = {}
if not self.login_and_prepare_search():
return []
for page_number in range(1, max_pages + 1):
try:
page_url = f"{Config.BASE_TARGET_URL}{page_number}?search_name={Config.SEARCH_NAME}"
logger.info(f"--- Navigiere zu Seite {page_number}: {page_url} ---")
self.driver.get(page_url)
# Warten auf ein stabiles Element, das anzeigt, dass die Tabelle da ist
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table")))
time.sleep(3) # Kurze Pause, damit das JS rendern kann
page_results = self.extract_data_with_js()
if not page_results:
logger.warning(f"Seite {page_number}: Keine Daten extrahiert. Möglicherweise das Ende erreicht.")
break
for company in page_results:
unique_key = (company['name'], company['website'])
if unique_key not in all_companies:
all_companies[unique_key] = company
logger.info(f"Seite {page_number}: {len(page_results)} Firmen gefunden. Gesamt einzigartig: {len(all_companies)}")
except TimeoutException:
logger.warning(f"Timeout beim Laden von Seite {page_number}. Breche Paginierung ab.")
self._save_debug_artifacts(f"page_{page_number}")
break
return list(all_companies.values())
# ... (close method unchanged) ...
if __name__ == "__main__":
scraper = None
try:
scraper = DealfrontScraper()
all_companies = scraper.run_full_extraction(max_pages=6) # Setzen Sie hier die maximale Seitenzahl
if all_companies:
df = pd.DataFrame(all_companies)
output_csv_path = os.path.join(Config.OUTPUT_DIR, f"dealfront_results_{time.strftime('%Y%m%d-%H%M%S')}.csv")
df.to_csv(output_csv_path, index=False, sep=';', encoding='utf-8-sig')
logger.info(f"Alle Ergebnisse ({len(df)} Firmen) erfolgreich gespeichert: {output_csv_path}")
else:
logger.warning("Keine Firmen konnten extrahiert werden.")
except Exception as e:
logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten.", exc_info=True)
finally:
if scraper:
scraper.close()