duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-08 06:31:57 +00:00
parent aea5d45c7d
commit 96ba680cb9

View File

@@ -10,7 +10,7 @@ from helpers import normalize_company_name, simple_normalize_url, serp_website_l
from config import Config from config import Config
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.13 (Quality-first: Domain-Gate, Location-Penalties, Smart Blocking, Serp-Trust, Metrics) # duplicate_checker.py v2.14 (Quality-first + SERP nur falls B/E leer: Domain-Gate, Location-Penalties, Smart Blocking, Serp-Trust, Metrics)
# Version-Build: dynamic timestamp below # Version-Build: dynamic timestamp below
# --- Konfiguration --- # --- Konfiguration ---
@@ -24,7 +24,7 @@ PREFILTER_MIN_PARTIAL = 60 # Vorfilter über gesamte CRM-Liste bei fehlende
PREFILTER_LIMIT = 50 # Max. Kandidaten aus Vorfilter PREFILTER_LIMIT = 50 # Max. Kandidaten aus Vorfilter
LOG_DIR = "Log" LOG_DIR = "Log"
now = datetime.now().strftime('%Y-%m-%d_%H-%M') now = datetime.now().strftime('%Y-%m-%d_%H-%M')
LOG_FILE = f"{now}_duplicate_check_v2.13.txt" LOG_FILE = f"{now}_duplicate_check_v2.14.txt"
# --- Logging Setup --- # --- Logging Setup ---
if not os.path.exists(LOG_DIR): if not os.path.exists(LOG_DIR):
@@ -45,7 +45,7 @@ fh.setFormatter(formatter)
root.addHandler(fh) root.addHandler(fh)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.info(f"Logging to console and file: {log_path}") logger.info(f"Logging to console and file: {log_path}")
logger.info(f"Starting duplicate_checker.py v2.13 | Build: {now}") logger.info(f"Starting duplicate_checker.py v2.14 | Build: {now}")
# --- SerpAPI Key laden --- # --- SerpAPI Key laden ---
try: try:
@@ -174,7 +174,7 @@ def choose_rarest_token(norm_name: str, token_freq: Counter):
# --- Hauptfunktion --- # --- Hauptfunktion ---
def main(): def main():
logger.info("Starte Duplikats-Check v2.13 (Quality-first)") logger.info("Starte Duplikats-Check v2.14 (Quality-first)")
try: try:
sheet = GoogleSheetHandler() sheet = GoogleSheetHandler()
logger.info("GoogleSheetHandler initialisiert") logger.info("GoogleSheetHandler initialisiert")
@@ -190,12 +190,18 @@ def main():
logger.critical("Leere Daten in einem der Sheets. Abbruch.") logger.critical("Leere Daten in einem der Sheets. Abbruch.")
return return
# SerpAPI nur für Matching (fehlende URLs) → in 'Gefundene Website' speichern # SerpAPI nur für Matching (fehlende URLs in B/E) → in 'Gefundene Website' speichern
if serp_key: if serp_key:
empty_mask = match_df['CRM Website'].fillna('').astype(str).str.strip() == '' # Stelle sicher, dass Spalte E existiert
if 'Gefundene Website' not in match_df.columns:
match_df['Gefundene Website'] = ''
# B/E beide leer? Dann erst suchen. Alles andere: überspringen.
b_empty = match_df['CRM Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
e_empty = match_df['Gefundene Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
empty_mask = b_empty & e_empty
empty_count = int(empty_mask.sum()) empty_count = int(empty_mask.sum())
if empty_count > 0: if empty_count > 0:
logger.info(f"Serp-Fallback für Matching: {empty_count} Firmen ohne URL") logger.info(f"Serp-Fallback für Matching: {empty_count} Firmen ohne URL in B/E")
found_cnt = 0 found_cnt = 0
trust_stats = Counter() trust_stats = Counter()
for idx, row in match_df[empty_mask].iterrows(): for idx, row in match_df[empty_mask].iterrows():
@@ -217,7 +223,7 @@ def main():
logger.warning(f" ! Serp-Fehler für '{company}': {e}") logger.warning(f" ! Serp-Fehler für '{company}': {e}")
logger.info(f"Serp-Fallback beendet: {found_cnt}/{empty_count} URLs ergänzt | Trust: {dict(trust_stats)}") logger.info(f"Serp-Fallback beendet: {found_cnt}/{empty_count} URLs ergänzt | Trust: {dict(trust_stats)}")
else: else:
logger.info("Serp-Fallback übersprungen: keine fehlenden Matching-URLs") logger.info("Serp-Fallback übersprungen: B oder E bereits befüllt (keine fehlenden Matching-URLs)")
# Normalisierung CRM # Normalisierung CRM
crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name) crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
@@ -361,4 +367,4 @@ def main():
logger.info(f"Config: TH={SCORE_THRESHOLD}, MIN_NAME_FOR_DOMAIN={MIN_NAME_FOR_DOMAIN}, Penalties(city={CITY_MISMATCH_PENALTY},country={COUNTRY_MISMATCH_PENALTY}), Prefilter(partial>={PREFILTER_MIN_PARTIAL}, limit={PREFILTER_LIMIT})") logger.info(f"Config: TH={SCORE_THRESHOLD}, MIN_NAME_FOR_DOMAIN={MIN_NAME_FOR_DOMAIN}, Penalties(city={CITY_MISMATCH_PENALTY},country={COUNTRY_MISMATCH_PENALTY}), Prefilter(partial>={PREFILTER_MIN_PARTIAL}, limit={PREFILTER_LIMIT})")
if __name__=='__main__': if __name__=='__main__':
main() main()