Files
Brancheneinstufung2/dealfront_enrichment.py

163 lines
6.5 KiB
Python

import os
import time
import json
import logging
from selenium import webdriver
from selenium.webdriver.chrome.options import Options, ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from config import TempConfig  # import of the project configuration class
# Configure logging (NOTE: basicConfig at import time is a module-level side effect)
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)
class DealfrontScraper:
    """Scrape company names and websites from a saved Dealfront search.

    Drives a headless Chrome browser through the Dealfront login, opens the
    'Prospects' product tile, loads a predefined search by name, and pages
    through the result table collecting ``{'name': ..., 'website': ...}``
    records.
    """

    def __init__(self):
        """Start a headless Chrome WebDriver and load the login credentials.

        Raises:
            Exception: re-raised when the WebDriver cannot be started or the
                credentials file cannot be read (see ``_load_credentials``).
        """
        logger.info("Initialisiere den DealfrontScraper...")
        chrome_options = ChromeOptions()
        # Disable image loading; the scraper only reads text from the DOM.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        service = Service(executable_path=TempConfig.CHROMEDRIVER_PATH)
        try:
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, TempConfig.DEFAULT_TIMEOUT)
        self.username, self.password = self._load_credentials()
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Return ``(username, password)`` read from the JSON credentials file.

        Missing keys yield ``None`` values; an unreadable or invalid file is
        logged and the original exception re-raised.
        """
        try:
            with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
            return creds.get('username'), creds.get('password')
        except Exception as e:
            logger.error("Credentials-Datei konnte nicht geladen werden: %s", e)
            raise

    def _save_debug_artifacts(self):
        """Best-effort dump of a screenshot and the page HTML for debugging.

        Failures are logged and swallowed so debugging never masks the
        original error.
        """
        try:
            os.makedirs(TempConfig.OUTPUT_DIR, exist_ok=True)
            ts = time.strftime("%Y%m%d-%H%M%S")
            png = os.path.join(TempConfig.OUTPUT_DIR, f"error_{ts}.png")
            html = os.path.join(TempConfig.OUTPUT_DIR, f"error_{ts}.html")
            self.driver.save_screenshot(png)
            logger.error("Screenshot '%s' gespeichert.", png)
            with open(html, 'w', encoding='utf-8') as f:
                f.write(self.driver.page_source)
            logger.error("HTML-Source '%s' gespeichert.", html)
        except Exception as e:
            logger.error("Debug-Artefakte konnten nicht gespeichert werden: %s", e)

    def login_and_find_list(self, search_name):
        """Log in to Dealfront and open the predefined search *search_name*."""
        # Login
        logger.info("Navigiere zur Login-Seite: %s", TempConfig.DEALFRONT_LOGIN_URL)
        self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
        self.wait.until(EC.visibility_of_element_located((By.NAME, 'email'))).send_keys(self.username)
        self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
        self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
        logger.info("Login gesendet.")
        # Open the 'Prospects' product tile.
        tile = self.wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-test-target-product-tile]"))
        )
        tile.click()
        logger.info("'Prospects finden' geklickt.")
        # Select the predefined search by its visible label.
        sel = (By.XPATH, f"//div[contains(@class,'truncate') and normalize-space()='{search_name}']")
        item = self.wait.until(EC.element_to_be_clickable(sel))
        item.click()
        logger.info("Suche '%s' geladen.", search_name)

    def extract_current_page_results(self):
        """Collect name/website pairs from the currently visible result page.

        Returns:
            list[dict]: one ``{'name': str, 'website': str}`` dict per table
            row that exposes a company name; rows without a name are skipped
            with a warning.
        """
        # Shorten the implicit wait so the per-row find_elements probes
        # below fail fast when a cell is missing.
        self.driver.implicitly_wait(1)
        try:
            # Wait for the first data element before touching the table.
            first_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
            self.wait.until(EC.visibility_of_element_located(first_locator))
            logger.info("Extrahiere aktuelle Seite...")
            results = []
            # Wait for at least one data row.
            rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
            self.wait.until(EC.presence_of_all_elements_located(rows_sel))
            rows = self.driver.find_elements(*rows_sel)
            logger.info("%d Zeilen gefunden.", len(rows))
            # Extract names & websites row by row.
            for i, row in enumerate(rows, 1):
                names = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
                if not names:
                    logger.warning("Zeile %d: Kein Name gefunden.", i)
                    continue
                name_elem = names[0]
                # Prefer the title attribute; the visible text may be truncated.
                name = (name_elem.get_attribute('title') or name_elem.text).strip()
                webs = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
                if webs:
                    web = webs[0].text.strip()
                else:
                    # No link — fall back to the cell's plain text, if any.
                    cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
                    web = cell[0].text.strip() if cell else ''
                results.append({'name': name, 'website': web})
            logger.info("Extraktion abgeschlossen: %d Firmen.", len(results))
            return results
        finally:
            # BUGFIX: restore the configured implicit wait even when a wait
            # above times out; previously a timeout left the driver stuck
            # at the 1-second value for all subsequent calls.
            self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT)

    def click_next_page(self) -> bool:
        """Advance the result table to the next page.

        Returns:
            bool: ``True`` when the pagination advanced, ``False`` when no
            pagination exists or the next button is disabled.
        """
        btns = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button")
        if not btns:
            return False
        nxt = btns[-1]  # assumes the last pagination button is "next" — TODO confirm
        # BUGFIX: get_attribute may return None, which would make the
        # membership test raise TypeError — coerce to '' first.
        if not nxt.is_enabled() or 'disabled' in (nxt.get_attribute('class') or ''):
            return False
        curr = self.driver.find_element(
            By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active"
        ).text
        nxt.click()
        # Wait until the active page indicator changes before scraping again.
        self.wait.until(
            lambda d: d.find_element(
                By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button.active"
            ).text != curr
        )
        return True

    def run(self, search_name):
        """Log in, open *search_name*, and scrape every result page.

        The WebDriver is always quit, even when scraping fails.
        """
        try:
            self.login_and_find_list(search_name)
            all_res = []
            while True:
                all_res.extend(self.extract_current_page_results())
                if not self.click_next_page():
                    break
            return all_res
        finally:
            self.driver.quit()
if __name__ == '__main__':
    # Ad-hoc entry point: scrape the 'Facility Management' search and
    # print one company record per line.
    results = DealfrontScraper().run('Facility Management')
    for record in results:
        print(record)