From bb7710d8a174bc5a03924732b040244ba3ac2ceb Mon Sep 17 00:00:00 2001
From: Floke
Date: Fri, 8 Aug 2025 06:31:57 +0000
Subject: [PATCH] duplicate_checker.py updated

---
 duplicate_checker.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/duplicate_checker.py b/duplicate_checker.py
index a1c0fd14..4fbe4bb6 100644
--- a/duplicate_checker.py
+++ b/duplicate_checker.py
@@ -10,7 +10,7 @@ from helpers import normalize_company_name, simple_normalize_url, serp_website_l
 from config import Config
 from google_sheet_handler import GoogleSheetHandler
 
-# duplicate_checker.py v2.13 (Quality-first: Domain-Gate, Location-Penalties, Smart Blocking, Serp-Trust, Metrics)
+# duplicate_checker.py v2.14 (Quality-first + SERP only if B/E empty: Domain-Gate, Location-Penalties, Smart Blocking, Serp-Trust, Metrics)
 # Version-Build: dynamic timestamp below
 
 # --- Configuration ---
@@ -24,7 +24,7 @@ PREFILTER_MIN_PARTIAL = 60  # Prefilter across the whole CRM list for missing
 PREFILTER_LIMIT = 50  # Max. candidates from the prefilter
 LOG_DIR = "Log"
 now = datetime.now().strftime('%Y-%m-%d_%H-%M')
-LOG_FILE = f"{now}_duplicate_check_v2.13.txt"
+LOG_FILE = f"{now}_duplicate_check_v2.14.txt"
 
 # --- Logging Setup ---
 if not os.path.exists(LOG_DIR):
@@ -45,7 +45,7 @@ fh.setFormatter(formatter)
 root.addHandler(fh)
 logger = logging.getLogger(__name__)
 logger.info(f"Logging to console and file: {log_path}")
-logger.info(f"Starting duplicate_checker.py v2.13 | Build: {now}")
+logger.info(f"Starting duplicate_checker.py v2.14 | Build: {now}")
 
 # --- Load SerpAPI key ---
 try:
@@ -174,7 +174,7 @@ def choose_rarest_token(norm_name: str, token_freq: Counter):
 
 # --- Main function ---
 def main():
-    logger.info("Starting duplicate check v2.13 (Quality-first)")
+    logger.info("Starting duplicate check v2.14 (Quality-first)")
     try:
         sheet = GoogleSheetHandler()
         logger.info("GoogleSheetHandler initialized")
@@ -190,12 +190,18 @@
         logger.critical("Empty data in one of the sheets. Aborting.")
         return
 
-    # SerpAPI only for matching (missing URLs) → store in 'Gefundene Website'
+    # SerpAPI only for matching (missing URLs in B/E) → store in 'Gefundene Website'
     if serp_key:
-        empty_mask = match_df['CRM Website'].fillna('').astype(str).str.strip() == ''
+        # Make sure column E exists
+        if 'Gefundene Website' not in match_df.columns:
+            match_df['Gefundene Website'] = ''
+        # Both B and E empty? Only then search. Everything else: skip.
+        b_empty = match_df['CRM Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
+        e_empty = match_df['Gefundene Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
+        empty_mask = b_empty & e_empty
         empty_count = int(empty_mask.sum())
         if empty_count > 0:
-            logger.info(f"Serp fallback for matching: {empty_count} companies without a URL")
+            logger.info(f"Serp fallback for matching: {empty_count} companies without a URL in B/E")
             found_cnt = 0
             trust_stats = Counter()
             for idx, row in match_df[empty_mask].iterrows():
@@ -217,7 +223,7 @@
                 logger.warning(f" ! Serp error for '{company}': {e}")
             logger.info(f"Serp fallback finished: {found_cnt}/{empty_count} URLs added | Trust: {dict(trust_stats)}")
         else:
-            logger.info("Serp fallback skipped: no missing matching URLs")
+            logger.info("Serp fallback skipped: B or E already filled (no missing matching URLs)")
 
     # CRM normalization
     crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
@@ -361,4 +367,4 @@
     logger.info(f"Config: TH={SCORE_THRESHOLD}, MIN_NAME_FOR_DOMAIN={MIN_NAME_FOR_DOMAIN}, Penalties(city={CITY_MISMATCH_PENALTY},country={COUNTRY_MISMATCH_PENALTY}), Prefilter(partial>={PREFILTER_MIN_PARTIAL}, limit={PREFILTER_LIMIT})")
 
 if __name__=='__main__':
-    main()
\ No newline at end of file
+    main()
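
Editor's note: the sketch below restates, outside the patch itself, the B/E gate that v2.14 introduces: the SERP fallback fires only for rows where both column B ('CRM Website') and column E ('Gefundene Website') are effectively empty, with placeholders such as 'k.a.' and 'n/a' counted as empty. The column names, the placeholder list, and the mask construction are taken from the patch; the sample DataFrame and the serp_lookup stub are hypothetical stand-ins for the real sheet data and SerpAPI call.

import pandas as pd

# Values treated as "no URL" (the patch's isin(...) list, factored into one constant).
EMPTY_PLACEHOLDERS = ['', 'k.a.', 'k.a', 'n/a', 'na']

def is_effectively_empty(col: pd.Series) -> pd.Series:
    """True where a cell is blank or one of the known placeholders."""
    return col.fillna('').astype(str).str.strip().str.lower().isin(EMPTY_PLACEHOLDERS)

def serp_lookup(company: str) -> str:
    """Hypothetical stand-in for the real SerpAPI lookup."""
    return f"https://example.com/{company.lower().replace(' ', '-')}"

# Hypothetical sample rows: only 'Beta AG' has both B and E empty.
match_df = pd.DataFrame({
    'CRM Name': ['Acme GmbH', 'Beta AG', 'Gamma KG'],
    'CRM Website': ['https://acme.example', 'k.a.', ''],       # column B
    'Gefundene Website': ['', '', 'https://gamma.example'],    # column E
})

# Ensure column E exists before masking, as the patch does.
if 'Gefundene Website' not in match_df.columns:
    match_df['Gefundene Website'] = ''

# SERP fallback only where B and E are both empty.
empty_mask = (is_effectively_empty(match_df['CRM Website'])
              & is_effectively_empty(match_df['Gefundene Website']))

for idx, row in match_df[empty_mask].iterrows():
    match_df.at[idx, 'Gefundene Website'] = serp_lookup(row['CRM Name'])

print(match_df)  # only the 'Beta AG' row gains a found website

Factoring the placeholder list into one constant is an editorial suggestion for readability; v2.14 itself repeats the isin(...) literal on the b_empty and e_empty lines.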