duplicate_checker.py aktualisiert
This commit is contained in:
@@ -1,15 +1,19 @@
|
|||||||
# duplicate_checker.py (v2.0 - mit Blocking-Strategie)
|
# duplicate_checker.py (v2.0 - mit Blocking-Strategie)
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from thefuzz import fuzz
|
from thefuzz import fuzz
|
||||||
from config import Config
|
from config import Config
|
||||||
from helpers import normalize_company_name, simple_normalize_url
|
from helpers import normalize_company_name, simple_normalize_url
|
||||||
from google_sheet_handler import GoogleSheetHandler
|
from google_sheet_handler import GoogleSheetHandler
|
||||||
--- Konfiguration ---
|
|
||||||
|
# --- Konfiguration ---
|
||||||
CRM_SHEET_NAME = "CRM_Accounts"
|
CRM_SHEET_NAME = "CRM_Accounts"
|
||||||
MATCHING_SHEET_NAME = "Matching_Accounts"
|
MATCHING_SHEET_NAME = "Matching_Accounts"
|
||||||
SCORE_THRESHOLD = 80
|
SCORE_THRESHOLD = 80
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
|
||||||
def calculate_similarity(record1, record2):
|
def calculate_similarity(record1, record2):
|
||||||
"""Berechnet einen gewichteten Ähnlichkeits-Score zwischen zwei Datensätzen."""
|
"""Berechnet einen gewichteten Ähnlichkeits-Score zwischen zwei Datensätzen."""
|
||||||
total_score = 0
|
total_score = 0
|
||||||
@@ -22,9 +26,11 @@ if record1['CRM Ort'] and record1['CRM Ort'] == record2['CRM Ort']:
|
|||||||
if record1['CRM Land'] and record1['CRM Land'] == record2['CRM Land']:
|
if record1['CRM Land'] and record1['CRM Land'] == record2['CRM Land']:
|
||||||
total_score += 20
|
total_score += 20
|
||||||
return round(total_score)
|
return round(total_score)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Hauptfunktion zum Laden, Vergleichen und Schreiben der Daten."""
|
"""Hauptfunktion zum Laden, Vergleichen und Schreiben der Daten."""
|
||||||
logging.info("Starte den Duplikats-Check (v2.0 mit Blocking)...")
|
logging.info("Starte den Duplikats-Check (v2.0 mit Blocking)...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sheet_handler = GoogleSheetHandler()
|
sheet_handler = GoogleSheetHandler()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -103,5 +109,6 @@ if success:
|
|||||||
logging.info(f"Ergebnisse erfolgreich in das Tabellenblatt '{MATCHING_SHEET_NAME}' geschrieben.")
|
logging.info(f"Ergebnisse erfolgreich in das Tabellenblatt '{MATCHING_SHEET_NAME}' geschrieben.")
|
||||||
else:
|
else:
|
||||||
logging.error("FEHLER beim Schreiben der Ergebnisse ins Google Sheet.")
|
logging.error("FEHLER beim Schreiben der Ergebnisse ins Google Sheet.")
|
||||||
if name == "main":
|
|
||||||
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
Reference in New Issue
Block a user