From aa4cf6ed240b843942e356890ab110a64611d033 Mon Sep 17 00:00:00 2001 From: Floke Date: Fri, 8 Aug 2025 05:34:32 +0000 Subject: [PATCH] =?UTF-8?q?url=20check=20erg=C3=A4nzt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- duplicate_checker.py | 130 +++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/duplicate_checker.py b/duplicate_checker.py index 6eceeee0..e0dabb2b 100644 --- a/duplicate_checker.py +++ b/duplicate_checker.py @@ -3,18 +3,19 @@ import sys import logging import pandas as pd from thefuzz import fuzz -from helpers import normalize_company_name, simple_normalize_url +from helpers import normalize_company_name, simple_normalize_url, serp_website_lookup +from config import Config from google_sheet_handler import GoogleSheetHandler -# duplicate_checker.py v2.9 (Bulletproof Name-Partial/SORT/SET + Bonus) -# Version: 2025-08-06_18-10 +# duplicate_checker.py v2.10 (Mit SerpAPI-Fallback für fehlende Domains) +# Version: 2025-08-06_18-45 # --- Konfiguration --- CRM_SHEET_NAME = "CRM_Accounts" MATCHING_SHEET_NAME = "Matching_Accounts" SCORE_THRESHOLD = 80 # Score-Schwelle LOG_DIR = "Log" -LOG_FILE = "duplicate_check_v2.9.txt" +LOG_FILE = "duplicate_check_v2.10.log" # --- Logging Setup --- if not os.path.exists(LOG_DIR): @@ -22,58 +23,51 @@ if not os.path.exists(LOG_DIR): log_path = os.path.join(LOG_DIR, LOG_FILE) root = logging.getLogger() root.setLevel(logging.DEBUG) -# Remove existing handlers -for h in list(root.handlers): - root.removeHandler(h) +for h in list(root.handlers): root.removeHandler(h) formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s") -# Console handler (INFO+) ch = logging.StreamHandler(sys.stdout) ch.setLevel(logging.INFO) ch.setFormatter(formatter) root.addHandler(ch) -# File handler (DEBUG+) fh = logging.FileHandler(log_path, mode='a', encoding='utf-8') fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) root.addHandler(fh) logger = logging.getLogger(__name__) logger.info(f"Logging to console and file: {log_path}") -logger.info("Starting duplicate_checker.py v2.9 | Version: 2025-08-06_18-10") +logger.info("Starting duplicate_checker.py v2.10 | Version: 2025-08-06_18-45") + +# --- SerpAPI Key laden --- +try: + Config.load_api_keys() + serp_key = Config.API_KEYS.get('serpapi') + if not serp_key: + logger.warning("SerpAPI Key nicht gefunden; Serp-Fallback deaktiviert.") +except Exception as e: + logger.warning(f"Fehler beim Laden API-Keys: {e}") + serp_key = None # --- Ähnlichkeitsberechnung --- def calculate_similarity(record1, record2): - """Berechnet Score-Komponenten: Domain, Name (SET,PARTIAL,SORT), Ort und Bonus.""" - # Domain exact match - dom1 = record1.get('normalized_domain', '') - dom2 = record2.get('normalized_domain', '') + dom1 = record1.get('normalized_domain','') + dom2 = record2.get('normalized_domain','') domain_flag = 1 if dom1 and dom1 == dom2 else 0 - - # Location exact match - loc_flag = 1 if (record1.get('CRM Ort') == record2.get('CRM Ort') and - record1.get('CRM Land') == record2.get('CRM Land')) else 0 - - # Name scores - n1 = record1.get('normalized_name', '') - n2 = record2.get('normalized_name', '') + loc_flag = 1 if (record1.get('CRM Ort')==record2.get('CRM Ort') and record1.get('CRM Land')==record2.get('CRM Land')) else 0 + n1, n2 = record1.get('normalized_name',''), record2.get('normalized_name','') if n1 and n2: - ts = fuzz.token_set_ratio(n1, n2) - pr = fuzz.partial_ratio(n1, n2) - ss = fuzz.token_sort_ratio(n1, n2) - name_score = max(ts, pr, ss) + ts = fuzz.token_set_ratio(n1,n2) + pr = fuzz.partial_ratio(n1,n2) + ss = fuzz.token_sort_ratio(n1,n2) + name_score = max(ts,pr,ss) else: name_score = 0 - - # Bonus für reine Name-Matches - bonus_flag = 1 if domain_flag == 0 and loc_flag == 0 and name_score >= 85 else 0 - - # Gesamtscore - total = domain_flag * 100 + name_score * 1.0 + loc_flag * 20 + bonus_flag * 20 + bonus_flag = 1 if domain_flag==0 and loc_flag==0 and name_score>=85 else 0 + total = domain_flag*100 + name_score*1.0 + loc_flag*20 + bonus_flag*20 return round(total), domain_flag, name_score, loc_flag, bonus_flag # --- Hauptfunktion --- def main(): - logger.info("Starte Duplikats-Check v2.9 (Bulletproof)") - # GoogleSheetHandler init + logger.info("Starte Duplikats-Check v2.10 mit SerpAPI-Fallback") try: sheet = GoogleSheetHandler() logger.info("GoogleSheetHandler initialisiert") @@ -81,20 +75,31 @@ def main(): logger.critical(f"Init GoogleSheetHandler fehlgeschlagen: {e}") sys.exit(1) - # Daten laden logger.info(f"Lade CRM-Daten aus '{CRM_SHEET_NAME}'...") crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME) logger.info(f"{0 if crm_df is None else len(crm_df)} CRM-Datensätze geladen") logger.info(f"Lade Matching-Daten aus '{MATCHING_SHEET_NAME}'...") match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME) logger.info(f"{0 if match_df is None else len(match_df)} Matching-Datensätze geladen") - if crm_df is None or crm_df.empty or match_df is None or match_df.empty: logger.critical("Leere Daten in einem der Sheets. Abbruch.") return + # --- SerpAPI-Fallback für leere Domains --- + if serp_key: + for df, label in [(crm_df,'CRM'), (match_df,'Matching')]: + for idx, row in df[df['CRM Website'].fillna('').astype(str).str.strip()==''].iterrows(): + company = row['CRM Name'] + try: + url = serp_website_lookup(company) + if url and 'http' in url: + df.at[idx,'CRM Website'] = url + logger.info(f"Serp-Fallback ({label}): '{company}' -> {url}") + except Exception as e: + logger.warning(f"Serp lookup fehlgeschlagen für '{company}': {e}") + # Normalisierung & Blocking-Key - for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]: + for df, label in [(crm_df,'CRM'), (match_df,'Matching')]: df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name) df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url) df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip() @@ -102,52 +107,47 @@ def main(): df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None) logger.debug(f"{label}-Sample: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}") - # Blocking-Index erzeugen + # Blocking-Index erstellen crm_index = {} for _, row in crm_df.iterrows(): key = row['block_key'] if key: - crm_index.setdefault(key, []).append(row) + crm_index.setdefault(key,[]).append(row) logger.info(f"Blocking-Index mit {len(crm_index)} Keys erstellt") # Matching - results = [] - total = len(match_df) + results=[] + total=len(match_df) logger.info("Starte Matching-Prozess...") - for i, mrow in match_df.iterrows(): - key = mrow['block_key'] - cands = crm_index.get(key, []) + for i,mrow in match_df.iterrows(): + key = mrow['block_key']; cands=crm_index.get(key,[]) logger.info(f"Prüfe {i+1}/{total}: '{mrow['CRM Name']}' -> {len(cands)} Kandidaten") if not cands: - results.append({'Match':'', 'Score':0}) - continue - - scored = [] + results.append({'Match':'','Score':0}); continue + scored=[] for crow in cands: - sc, dm, ns, lm, bf = calculate_similarity(mrow, crow) - scored.append((crow['CRM Name'], sc, dm, ns, lm, bf)) - # Top 3 loggen - for name, sc, dm, ns, lm, bf in sorted(scored, key=lambda x: x[1], reverse=True)[:3]: + sc,dm,ns,lm,bf=calculate_similarity(mrow,crow) + scored.append((crow['CRM Name'],sc,dm,ns,lm,bf)) + for name,sc,dm,ns,lm,bf in sorted(scored,key=lambda x:x[1],reverse=True)[:3]: logger.debug(f" Kandidat: {name}, Score={sc}, Dom={dm}, Name={ns}, Ort={lm}, Bonus={bf}") - - best_name, best_score, dm, ns, lm, bf = max(scored, key=lambda x: x[1]) - if best_score >= SCORE_THRESHOLD: - results.append({'Match':best_name, 'Score':best_score}) - logger.info(f" --> Match: '{best_name}' ({best_score}) [Dom={dm}, Name={ns}, Ort={lm}, Bonus={bf}]") + best_name,best_score,dm,ns,lm,bf=max(scored,key=lambda x:x[1]) + if best_score>=SCORE_THRESHOLD: + results.append({'Match':best_name,'Score':best_score}) + logger.info(f" --> Match: '{best_name}' ({best_score}) [Dom={dm},Name={ns},Ort={lm},Bonus={bf}]") else: - results.append({'Match':'', 'Score':best_score}) - logger.info(f" --> Kein Match (Score={best_score}) [Dom={dm}, Name={ns}, Ort={lm}, Bonus={bf}]") + results.append({'Match':'','Score':best_score}) + logger.info(f" --> Kein Match (Score={best_score}) [Dom={dm},Name={ns},Ort={lm},Bonus={bf}]") # Ergebnisse zurückschreiben logger.info("Schreibe Ergebnisse ins Sheet...") - out = pd.DataFrame(results) - output = match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].copy() - output = pd.concat([output.reset_index(drop=True), out], axis=1) - data = [output.columns.tolist()] + output.values.tolist() - if sheet.clear_and_write_data(MATCHING_SHEET_NAME, data): + out=pd.DataFrame(results) + output=match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].copy() + output=pd.concat([output.reset_index(drop=True),out],axis=1) + data=[output.columns.tolist()]+output.values.tolist() + if sheet.clear_and_write_data(MATCHING_SHEET_NAME,data): logger.info("Ergebnisse erfolgreich geschrieben") else: logger.error("Fehler beim Schreiben ins Google Sheet") -if __name__ == '__main__': +if __name__=='__main__': main()