url_check nur für matching

This commit is contained in:
2025-08-08 05:43:45 +00:00
parent aa4cf6ed24
commit be3f48aceb

View File

@@ -7,15 +7,15 @@ from helpers import normalize_company_name, simple_normalize_url, serp_website_l
from config import Config
from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.10 (Mit SerpAPI-Fallback für fehlende Domains)
# Version: 2025-08-06_18-45
# duplicate_checker.py v2.11 (SerpAPI nur für Matching-Accounts)
# Version: 2025-08-08_10-00
# --- Konfiguration ---
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
SCORE_THRESHOLD = 80 # Score-Schwelle
LOG_DIR = "Log"
LOG_FILE = "duplicate_check_v2.10.log"
LOG_FILE = "duplicate_check_v2.11.txt"
# --- Logging Setup ---
if not os.path.exists(LOG_DIR):
@@ -35,7 +35,7 @@ fh.setFormatter(formatter)
root.addHandler(fh)
logger = logging.getLogger(__name__)
logger.info(f"Logging to console and file: {log_path}")
logger.info("Starting duplicate_checker.py v2.10 | Version: 2025-08-06_18-45")
logger.info("Starting duplicate_checker.py v2.11 | Version: 2025-08-08_10-00")
# --- SerpAPI Key laden ---
try:
@@ -67,7 +67,7 @@ def calculate_similarity(record1, record2):
# --- Hauptfunktion ---
def main():
logger.info("Starte Duplikats-Check v2.10 mit SerpAPI-Fallback")
logger.info("Starte Duplikats-Check v2.11 mit SerpAPI-Fallback (nur Matching)")
try:
sheet = GoogleSheetHandler()
logger.info("GoogleSheetHandler initialisiert")
@@ -85,18 +85,28 @@ def main():
logger.critical("Leere Daten in einem der Sheets. Abbruch.")
return
# --- SerpAPI-Fallback für leere Domains ---
# --- SerpAPI-Fallback für leere Domains (nur MATCHING) ---
if serp_key:
for df, label in [(crm_df,'CRM'), (match_df,'Matching')]:
for idx, row in df[df['CRM Website'].fillna('').astype(str).str.strip()==''].iterrows():
empty_mask = match_df['CRM Website'].fillna('').astype(str).str.strip() == ''
empty_count = int(empty_mask.sum())
if empty_count > 0:
logger.info(f"Serp-Fallback für Matching: {empty_count} Firmen ohne URL")
found_cnt = 0
for idx, row in match_df[empty_mask].iterrows():
company = row['CRM Name']
try:
url = serp_website_lookup(company)
if url and 'http' in url:
df.at[idx,'CRM Website'] = url
logger.info(f"Serp-Fallback ({label}): '{company}' -> {url}")
if url and 'http' in url and 'k.A.' not in url:
match_df.at[idx, 'CRM Website'] = url
logger.info(f" ✓ URL gefunden: '{company}' -> {url}")
found_cnt += 1
else:
logger.debug(f" ✗ Keine eindeutige URL: '{company}' -> {url}")
except Exception as e:
logger.warning(f"Serp lookup fehlgeschlagen für '{company}': {e}")
logger.warning(f" ! Serp-Fehler für '{company}': {e}")
logger.info(f"Serp-Fallback beendet: {found_cnt}/{empty_count} URLs ergänzt")
else:
logger.info("Serp-Fallback übersprungen: keine fehlenden Matching-URLs")
# Normalisierung & Blocking-Key
for df, label in [(crm_df,'CRM'), (match_df,'Matching')]: