[31f88f42] Keine neuen Commits in dieser Session.

Keine neuen Commits in dieser Session.
2026-03-10 13:54:07 +00:00
parent a3f79db2d2
commit 3fd3c5acfa
8 changed files with 268 additions and 9 deletions
--- a/company-explorer/backend/app.py
+++ b/company-explorer/backend/app.py
@@ -107,6 +107,12 @@ class ReportMistakeRequest(BaseModel):
    quote: Optional[str] = None
    user_comment: Optional[str] = None

+class CompanyMatchRequest(BaseModel):  # request body of POST /api/match-company
+    name: str  # company name to look up (required)
+    website: Optional[str] = None  # optional; passed to the matcher as 'website'
+    city: Optional[str] = None  # optional; passed to the matcher as 'city'
+    country: Optional[str] = "Deutschland"  # used as 'country' when the caller omits it
+
 class ProvisioningRequest(BaseModel):
    so_contact_id: int
    so_person_id: Optional[int] = None
@@ -302,6 +308,58 @@ def unsubscribe_contact(token: str, db: Session = Depends(get_db)):
 def health_check(username: str = Depends(authenticate_user)):
    return {"status": "ok", "version": settings.VERSION, "db": settings.DATABASE_URL}

+@app.post("/api/match-company/reload")
+def reload_matching_service(db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
+    """
+    Rebuilds the matching service (Deduplicator) from the DB and returns the number of records loaded; any failure becomes HTTP 500.
+    Plain `def` (like the other DB endpoints) so the blocking load stays off the event loop. Call after major imports or SuperOffice syncs.
+    """
+    try:
+        app.state.deduplicator = Deduplicator(db)
+        return {
+            "status": "success", 
+            "records_loaded": len(app.state.deduplicator.reference_data)
+        }
+    except Exception as e:
+        logger.exception(f"Failed to reload matching service: {e}")  # also records the traceback
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/api/match-company")
+def match_company(request: CompanyMatchRequest, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
+    """
+    Centralized Account Matching Service.
+    Checks if a company already exists in SuperOffice (via Company Explorer DB).
+    Returns the query, a match_found flag, best_match (first result or None) and all_matches with scores and CRM IDs.
+    Any failure is reported as HTTP 500. Plain `def` so the blocking matching work stays off the event loop.
+    """
+    try:
+        # Lazy initialization of Deduplicator
+        if not hasattr(app.state, 'deduplicator'):
+            logger.info("Initializing Deduplicator for the first time...")
+            app.state.deduplicator = Deduplicator(db)
+        
+        # Prepare Candidate dict for the service
+        candidate = {
+            'name': request.name,
+            'website': request.website,
+            'city': request.city,
+            'country': request.country
+        }
+        
+        results = app.state.deduplicator.find_duplicates(candidate)
+        
+        # Return structured results
+        return {
+            "query": candidate,
+            "match_found": bool(results),
+            "best_match": results[0] if results else None,
+            "all_matches": results
+        }
+    except Exception as e:
+        # logger.exception attaches the traceback itself, so no manual traceback formatting is needed
+        logger.exception(f"Error in company matching: {e}")
+        raise HTTPException(status_code=500, detail=f"Matching failed: {str(e)}")
+
@app.post("/api/provision/superoffice-contact", response_model=ProvisioningResponse)
 def provision_superoffice_contact(
    req: ProvisioningRequest,
--- a/company-explorer/backend/services/deduplication.py
+++ b/company-explorer/backend/services/deduplication.py
@@ -63,7 +63,8 @@ class Deduplicator:
        Optimized for 10k-50k records.
        """
        logger.info("Loading reference data for deduplication...")
-        query = self.db.query(Company.id, Company.name, Company.website, Company.city, Company.country)
+        # Include crm_id in the query
+        query = self.db.query(Company.id, Company.name, Company.website, Company.city, Company.country, Company.crm_id)
        companies = query.all()
        
        for c in companies:
@@ -72,6 +73,7 @@ class Deduplicator:
            
            record = {
                'id': c.id,
+                'crm_id': c.crm_id,
                'name': c.name,
                'normalized_name': norm_name,
                'normalized_domain': norm_domain,
@@ -81,7 +83,7 @@ class Deduplicator:
            self.reference_data.append(record)
            
            # Build Indexes
-            if norm_domain:
+            if norm_domain and norm_domain != "k.a.":
                self.domain_index.setdefault(norm_domain, []).append(record)
            
            # Token Frequency
@@ -113,7 +115,7 @@ class Deduplicator:
        candidates_to_check = {} # Map ID -> Record

        # 1. Domain Match (Fastest)
-        if c_norm_domain and c_norm_domain in self.domain_index:
+        if c_norm_domain and c_norm_domain != "k.a." and c_norm_domain in self.domain_index:
            for r in self.domain_index[c_norm_domain]:
                candidates_to_check[r['id']] = r

@@ -123,6 +125,14 @@ class Deduplicator:
            for r in self.token_index[rtok]:
                candidates_to_check[r['id']] = r

+        if not candidates_to_check:
+            # Fallback: if neither the domain nor a rare token produced a candidate, an exact name match may still exist that was never indexed (e.g. all tokens are stop words).
+            # This is rare but possible. Scan reference_data for an identical normalized name, but only when the name is longer than 3 characters and the candidate pool is still empty.
+            if len(c_norm_name) > 3:
+                for r in self.reference_data:
+                    if r['normalized_name'] == c_norm_name:
+                        candidates_to_check[r['id']] = r
+
        if not candidates_to_check:
            return []

@@ -135,12 +145,14 @@ class Deduplicator:
            )
            
            # Threshold Logic (Weak vs Strong)
+            # A match is "weak" if there is no domain match AND no location match
            is_weak = (details['domain_match'] == 0 and not (details['loc_match']))
            threshold = SCORE_THRESHOLD_WEAK if is_weak else SCORE_THRESHOLD
            
            if score >= threshold:
                matches.append({
                    'company_id': db_rec['id'],
+                    'crm_id': db_rec['crm_id'],
                    'name': db_rec['name'],
                    'score': score,
                    'details': details
@@ -155,11 +167,11 @@ class Deduplicator:
        
        # Exact Name Shortcut
        if n1 and n1 == n2:
-            return 100, {'exact': True, 'domain_match': 0, 'loc_match': 0}
+            return 100, {'exact': True, 'domain_match': 0, 'loc_match': 1 if (cand['c'] and ref['city'] and cand['c'] == ref['city']) else 0, 'name_score': 100, 'penalties': 0}

        # Domain
        d1, d2 = cand['d'], ref['normalized_domain']
-        domain_match = 1 if (d1 and d2 and d1 == d2) else 0
+        domain_match = 1 if (d1 and d2 and d1 != "k.a." and d1 == d2) else 0

        # Location
        city_match = 1 if (cand['c'] and ref['city'] and cand['c'] == ref['city']) else 0
@@ -176,7 +188,8 @@ class Deduplicator:
            ss = fuzz.token_sort_ratio(clean1, clean2)
            name_score = max(ts, pr, ss)
        else:
-            name_score = 0
+            # If cleaning removed everything, fallback to raw fuzzy on normalized names
+            name_score = fuzz.ratio(n1, n2) if (n1 and n2) else 0

        # Penalties
        penalties = 0
@@ -194,7 +207,7 @@ class Deduplicator:
            total = name_score
        
        if loc_match:
-            total += 10 # Bonus
+            total += 10 # Bonus for location match
        
        total -= penalties
        
--- a/company-explorer/backend/tests/test_matching_logic.py
+++ b/company-explorer/backend/tests/test_matching_logic.py
@@ -0,0 +1,44 @@
+import sys
+import os
+import logging
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+# Add backend to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from database import Company
+from services.deduplication import Deduplicator
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Uses the live SQLite file directly (no mock); engine and session are created at import time.
+# The config uses /data/companies_v3_fixed_2.db in Docker, but locally it's in the root. NOTE: this relative path is resolved against the current working directory.
+DB_PATH = "../../companies_v3_fixed_2.db"
+engine = create_engine(f"sqlite:///{DB_PATH}")
+Session = sessionmaker(bind=engine)
+db = Session()
+
+def test_matching():
+    """Print up to three duplicate candidates for each sample query (manual smoke check, no assertions)."""
+    matcher = Deduplicator(db)
+    sample_queries = [
+        {"name": "Wolfra", "website": "wolfra.de", "city": "Erding"},
+        {"name": "Wolfra Kelterei", "website": "wolfra.de", "city": "Erding"},
+        {"name": "Wolfra Fruchtsaft GmbH", "website": "https://www.wolfra.de/", "city": "Erding"},
+        {"name": "Müller GmbH", "city": "München"}, # Broad search
+        {"name": "NonExistentCompany", "city": "Berlin"}
+    ]
+    
+    for query in sample_queries:
+        print(f"\n--- Matching Query: {query['name']} ({query.get('website', 'no-url')}) ---")
+        hits = matcher.find_duplicates(query)
+        if not hits:
+            print("  No matches found.")
+            continue
+        for rank, hit in enumerate(hits[:3], start=1):
+            print(f"  [{rank}] Match: {hit['name']} (Score: {hit['score']}) | CRM ID: {hit['crm_id']}")
+
+if __name__ == "__main__":  # allows running this file directly as a script
+    test_matching()