duplicate_checker.py aktualisiert
This commit is contained in:
@@ -1,15 +1,19 @@
|
||||
# duplicate_checker.py (v2.0 - mit Blocking-Strategie)
|
||||
|
||||
import logging
|
||||
import pandas as pd
|
||||
from thefuzz import fuzz
|
||||
from config import Config
|
||||
from helpers import normalize_company_name, simple_normalize_url
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
--- Konfiguration ---
|
||||
|
||||
# --- Konfiguration ---
|
||||
CRM_SHEET_NAME = "CRM_Accounts"
|
||||
MATCHING_SHEET_NAME = "Matching_Accounts"
|
||||
SCORE_THRESHOLD = 80
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
def calculate_similarity(record1, record2):
|
||||
"""Berechnet einen gewichteten Ähnlichkeits-Score zwischen zwei Datensätzen."""
|
||||
total_score = 0
|
||||
@@ -22,9 +26,11 @@ if record1['CRM Ort'] and record1['CRM Ort'] == record2['CRM Ort']:
|
||||
if record1['CRM Land'] and record1['CRM Land'] == record2['CRM Land']:
|
||||
total_score += 20
|
||||
return round(total_score)
|
||||
|
||||
def main():
|
||||
"""Hauptfunktion zum Laden, Vergleichen und Schreiben der Daten."""
|
||||
logging.info("Starte den Duplikats-Check (v2.0 mit Blocking)...")
|
||||
|
||||
try:
|
||||
sheet_handler = GoogleSheetHandler()
|
||||
except Exception as e:
|
||||
@@ -103,5 +109,6 @@ if success:
|
||||
logging.info(f"Ergebnisse erfolgreich in das Tabellenblatt '{MATCHING_SHEET_NAME}' geschrieben.")
|
||||
else:
|
||||
logging.error("FEHLER beim Schreiben der Ergebnisse ins Google Sheet.")
|
||||
if name == "main":
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user