url_check nur für matching
This commit is contained in:
@@ -7,15 +7,15 @@ from helpers import normalize_company_name, simple_normalize_url, serp_website_l
|
||||
from config import Config
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
|
||||
-# duplicate_checker.py v2.10 (Mit SerpAPI-Fallback für fehlende Domains)
|
||||
-# Version: 2025-08-06_18-45
|
||||
+# duplicate_checker.py v2.11 (SerpAPI nur für Matching-Accounts)
|
||||
+# Version: 2025-08-08_10-00
|
||||
|
||||
# --- Konfiguration ---
|
||||
CRM_SHEET_NAME = "CRM_Accounts"
|
||||
MATCHING_SHEET_NAME = "Matching_Accounts"
|
||||
SCORE_THRESHOLD = 80 # Score-Schwelle
|
||||
LOG_DIR = "Log"
|
||||
-LOG_FILE = "duplicate_check_v2.10.log"
|
||||
+LOG_FILE = "duplicate_check_v2.11.txt"
|
||||
|
||||
# --- Logging Setup ---
|
||||
if not os.path.exists(LOG_DIR):
|
||||
@@ -35,7 +35,7 @@ fh.setFormatter(formatter)
|
||||
root.addHandler(fh)
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info(f"Logging to console and file: {log_path}")
|
||||
-logger.info("Starting duplicate_checker.py v2.10 | Version: 2025-08-06_18-45")
|
||||
+logger.info("Starting duplicate_checker.py v2.11 | Version: 2025-08-08_10-00")
|
||||
|
||||
# --- SerpAPI Key laden ---
|
||||
try:
|
||||
@@ -67,7 +67,7 @@ def calculate_similarity(record1, record2):
|
||||
|
||||
# --- Hauptfunktion ---
|
||||
def main():
|
||||
-logger.info("Starte Duplikats-Check v2.10 mit SerpAPI-Fallback")
|
||||
+logger.info("Starte Duplikats-Check v2.11 mit SerpAPI-Fallback (nur Matching)")
|
||||
try:
|
||||
sheet = GoogleSheetHandler()
|
||||
logger.info("GoogleSheetHandler initialisiert")
|
||||
@@ -85,18 +85,28 @@ def main():
|
||||
logger.critical("Leere Daten in einem der Sheets. Abbruch.")
|
||||
return
|
||||
|
||||
-# --- SerpAPI-Fallback für leere Domains ---
|
||||
+# --- SerpAPI-Fallback für leere Domains (nur MATCHING) ---
|
||||
if serp_key:
|
||||
-for df, label in [(crm_df,'CRM'), (match_df,'Matching')]:
|
||||
-for idx, row in df[df['CRM Website'].fillna('').astype(str).str.strip()==''].iterrows():
|
||||
+empty_mask = match_df['CRM Website'].fillna('').astype(str).str.strip() == ''
|
||||
+empty_count = int(empty_mask.sum())
|
||||
+if empty_count > 0:
|
||||
+logger.info(f"Serp-Fallback für Matching: {empty_count} Firmen ohne URL")
|
||||
+found_cnt = 0
|
||||
+for idx, row in match_df[empty_mask].iterrows():
|
||||
company = row['CRM Name']
|
||||
try:
|
||||
url = serp_website_lookup(company)
|
||||
-if url and 'http' in url:
|
||||
-df.at[idx,'CRM Website'] = url
|
||||
-logger.info(f"Serp-Fallback ({label}): '{company}' -> {url}")
|
||||
+if url and 'http' in url and 'k.A.' not in url:
|
||||
+match_df.at[idx, 'CRM Website'] = url
|
||||
+logger.info(f" ✓ URL gefunden: '{company}' -> {url}")
|
||||
+found_cnt += 1
|
||||
+else:
|
||||
+logger.debug(f" ✗ Keine eindeutige URL: '{company}' -> {url}")
|
||||
except Exception as e:
|
||||
-logger.warning(f"Serp lookup fehlgeschlagen für '{company}': {e}")
|
||||
+logger.warning(f" ! Serp-Fehler für '{company}': {e}")
|
||||
+logger.info(f"Serp-Fallback beendet: {found_cnt}/{empty_count} URLs ergänzt")
|
||||
+else:
|
||||
+logger.info("Serp-Fallback übersprungen: keine fehlenden Matching-URLs")
|
||||
|
||||
# Normalisierung & Blocking-Key
|
||||
for df, label in [(crm_df,'CRM'), (match_df,'Matching')]:
|
||||
|
||||
Reference in New Issue
Block a user