dealfront_enrichment.py aktualisiert

This commit is contained in:
2025-07-08 19:07:01 +00:00
parent eb2d7dacd1
commit 8c69b2d7e1

View File

@@ -1,45 +1,56 @@
import os
import time
import json
import time
import logging
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
class TempConfig:
    """Temporary, self-contained configuration (replaces the external config.py).

    NOTE(review): the stale ``from config import TempConfig`` residue was removed —
    it would raise ImportError and is superseded by this class.
    """
    # URL of the Dealfront login page.
    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    # Name of the saved prospect search to load (adjust as needed).
    TARGET_SEARCH_NAME = "Facility Management"
    # Path to the JSON file holding the login credentials.
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    # Location of the chromedriver binary inside the container.
    CHROMEDRIVER_PATH = "/usr/bin/chromedriver"
    # Explicit-wait timeout in seconds (used by WebDriverWait).
    DEFAULT_TIMEOUT = 30
    # Selenium implicit wait in seconds.
    IMPLICIT_WAIT = 10
    # Directory where scraped output is written.
    OUTPUT_DIR = "/app/output"
# Configure logging for the module and quieten Selenium's verbose logger.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logging.getLogger("selenium").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
# Ensure the output directory exists (fixed: was a typo `ios.makedirs`, a NameError).
os.makedirs(TempConfig.OUTPUT_DIR, exist_ok=True)
class DealfrontScraper:
def __init__(self):
    """Initialise a headless Chrome WebDriver, waits, and credentials.

    Raises:
        Exception: re-raised if the WebDriver cannot be started.
    """
    logger.info("Initialisiere den DealfrontScraper...")
    # Chrome options for a container-friendly headless browser.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    # Disable image loading to speed up page loads (set once; the diff-merge
    # residue that duplicated these two lines has been removed).
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    # WebDriver service pointing at the configured chromedriver binary.
    service = Service(executable_path=TempConfig.CHROMEDRIVER_PATH)
    try:
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
    except Exception:
        logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
        raise
    # Explicit wait for targeted conditions; implicit wait as a fallback.
    self.wait = WebDriverWait(self.driver, TempConfig.DEFAULT_TIMEOUT)
    self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT)
    # Load login credentials from the configured JSON file.
    self.username, self.password = self._load_credentials()
    logger.info("WebDriver erfolgreich initialisiert.")
@@ -53,8 +64,8 @@ class DealfrontScraper:
logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}")
raise
def login_and_find_list(self, search_name):
# Login-Flow
def login_and_find_list(self):
# Login
logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}")
self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
self.wait.until(EC.visibility_of_element_located((By.NAME, 'email'))).send_keys(self.username)
@@ -62,7 +73,7 @@ class DealfrontScraper:
self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
logger.info("Login gesendet.")
# Klicken auf 'Prospects finden'
# 'Prospects finden' anklicken
tile = self.wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-test-target-product-tile='Prospects finden']"))
)
@@ -70,37 +81,32 @@ class DealfrontScraper:
logger.info("'Prospects finden' geklickt.")
# Vordefinierte Suche auswählen
selector = (By.XPATH, f"//div[contains(@class,'truncate') and normalize-space()='{search_name}']")
item = self.wait.until(EC.element_to_be_clickable(selector))
sel = (By.XPATH, f"//div[contains(@class,'truncate') and normalize-space()='{TempConfig.TARGET_SEARCH_NAME}']")
item = self.wait.until(EC.element_to_be_clickable(sel))
item.click()
logger.info(f"Suche '{search_name}' geladen.")
logger.info(f"Suche '{TempConfig.TARGET_SEARCH_NAME}' geladen.")
def extract_current_page_results(self):
# Kurz Implicit-Wait
self.driver.implicitly_wait(1)
# Warten auf erstes Daten-Element
# Warte auf erstes Daten-Element
first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
self.wait.until(EC.visibility_of_element_located(first))
logger.info("Extrahiere aktuelle Seite...")
results = []
# Warten auf mindestens eine Tabellen-Zeile
# Warten auf Zeilen
rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
self.wait.until(EC.presence_of_all_elements_located(rows_sel))
rows = self.driver.find_elements(*rows_sel)
logger.info(f"{len(rows)} Zeilen gefunden.")
# Extraktion Namen & Website
for i, row in enumerate(rows, 1):
# Name
name_el = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
if not name_el:
logger.warning(f"Zeile {i}: Kein Name gefunden. Überspringe.")
continue
elem = name_el[0]
name = (elem.get_attribute('title') or elem.text).strip()
name = (name_el[0].get_attribute('title') or name_el[0].text).strip()
# Website
web_el = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
@@ -113,40 +119,36 @@ class DealfrontScraper:
results.append({'name': name, 'website': web})
logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
# Implicit-Wait zurücksetzen
self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT)
return results
def click_next_page(self) -> bool:
    """Advance to the next results page.

    Returns:
        bool: True if a next page was clicked and became active,
        False if there is no (enabled) next-page button.
    """
    # Fixed: the merged diff text duplicated these statements and left a
    # stray continuation line that made the method a syntax error.
    btns = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button")
    if not btns:
        return False
    # The last pagination button is the "next" control.
    nxt = btns[-1]
    if not nxt.is_enabled() or 'disabled' in nxt.get_attribute('class'):
        return False
    # Remember the currently active page label so we can detect the change.
    curr = self.driver.find_element(By.CSS_SELECTOR,
        "nav.eb-pagination a.eb-pagination-button.active").text
    nxt.click()
    # Wait until the active page label differs from the one we started on.
    self.wait.until(lambda d: d.find_element(By.CSS_SELECTOR,
        "nav.eb-pagination a.eb-pagination-button.active").text != curr)
    return True
def run(self, search_name):
def run(self):
try:
self.login_and_find_list(search_name)
self.login_and_find_list()
all_data = []
while True:
page = self.extract_current_page_results()
all_data.extend(page)
all_data.extend(self.extract_current_page_results())
if not self.click_next_page():
break
return all_data
finally:
self.driver.quit()
if __name__ == '__main__':
    # Script entry point: run the scraper with the configured search
    # (fixed: removed the stale `scraper.run('Facility Management')`
    # duplicate line left over from the diff merge).
    scraper = DealfrontScraper()
    data = scraper.run()
    for entry in data:
        print(entry)