From ec4fc642ff7556450e9742d07c65603a6f8247ec Mon Sep 17 00:00:00 2001 From: Floke Date: Fri, 8 Aug 2025 05:43:45 +0000 Subject: [PATCH] =?UTF-8?q?url=5Fcheck=20nur=20f=C3=BCr=20matching?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- duplicate_checker.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/duplicate_checker.py b/duplicate_checker.py index e0dabb2b..6471bda4 100644 --- a/duplicate_checker.py +++ b/duplicate_checker.py @@ -7,15 +7,15 @@ from helpers import normalize_company_name, simple_normalize_url, serp_website_l from config import Config from google_sheet_handler import GoogleSheetHandler -# duplicate_checker.py v2.10 (Mit SerpAPI-Fallback für fehlende Domains) -# Version: 2025-08-06_18-45 +# duplicate_checker.py v2.11 (SerpAPI nur für Matching-Accounts) +# Version: 2025-08-08_10-00 # --- Konfiguration --- CRM_SHEET_NAME = "CRM_Accounts" MATCHING_SHEET_NAME = "Matching_Accounts" SCORE_THRESHOLD = 80 # Score-Schwelle LOG_DIR = "Log" -LOG_FILE = "duplicate_check_v2.10.log" +LOG_FILE = "duplicate_check_v2.11.txt" # --- Logging Setup --- if not os.path.exists(LOG_DIR): @@ -35,7 +35,7 @@ fh.setFormatter(formatter) root.addHandler(fh) logger = logging.getLogger(__name__) logger.info(f"Logging to console and file: {log_path}") -logger.info("Starting duplicate_checker.py v2.10 | Version: 2025-08-06_18-45") +logger.info("Starting duplicate_checker.py v2.11 | Version: 2025-08-08_10-00") # --- SerpAPI Key laden --- try: @@ -67,7 +67,7 @@ def calculate_similarity(record1, record2): # --- Hauptfunktion --- def main(): - logger.info("Starte Duplikats-Check v2.10 mit SerpAPI-Fallback") + logger.info("Starte Duplikats-Check v2.11 mit SerpAPI-Fallback (nur Matching)") try: sheet = GoogleSheetHandler() logger.info("GoogleSheetHandler initialisiert") @@ -85,18 +85,28 @@ def main(): logger.critical("Leere Daten in einem der Sheets. Abbruch.") return - # --- SerpAPI-Fallback für leere Domains --- + # --- SerpAPI-Fallback für leere Domains (nur MATCHING) --- if serp_key: - for df, label in [(crm_df,'CRM'), (match_df,'Matching')]: - for idx, row in df[df['CRM Website'].fillna('').astype(str).str.strip()==''].iterrows(): + empty_mask = match_df['CRM Website'].fillna('').astype(str).str.strip() == '' + empty_count = int(empty_mask.sum()) + if empty_count > 0: + logger.info(f"Serp-Fallback für Matching: {empty_count} Firmen ohne URL") + found_cnt = 0 + for idx, row in match_df[empty_mask].iterrows(): company = row['CRM Name'] try: url = serp_website_lookup(company) - if url and 'http' in url: - df.at[idx,'CRM Website'] = url - logger.info(f"Serp-Fallback ({label}): '{company}' -> {url}") + if url and 'http' in url and 'k.A.' not in url: + match_df.at[idx, 'CRM Website'] = url + logger.info(f" ✓ URL gefunden: '{company}' -> {url}") + found_cnt += 1 + else: + logger.debug(f" ✗ Keine eindeutige URL: '{company}' -> {url}") except Exception as e: - logger.warning(f"Serp lookup fehlgeschlagen für '{company}': {e}") + logger.warning(f" ! Serp-Fehler für '{company}': {e}") + logger.info(f"Serp-Fallback beendet: {found_cnt}/{empty_count} URLs ergänzt") + else: + logger.info("Serp-Fallback übersprungen: keine fehlenden Matching-URLs") # Normalisierung & Blocking-Key for df, label in [(crm_df,'CRM'), (match_df,'Matching')]: