[31f88f42] Keine neuen Commits in dieser Session.
Keine neuen Commits in dieser Session.
This commit is contained in:
@@ -107,6 +107,12 @@ class ReportMistakeRequest(BaseModel):
|
||||
quote: Optional[str] = None
|
||||
user_comment: Optional[str] = None
|
||||
|
||||
class CompanyMatchRequest(BaseModel):
    """Request payload describing the company to look up via POST /api/match-company."""

    # Company name to match; the only required field.
    name: str
    # Optional website of the company.
    website: Optional[str] = None
    # Optional city of the company.
    city: Optional[str] = None
    # Country of the company; defaults to "Deutschland" when omitted.
    country: Optional[str] = "Deutschland"
|
||||
|
||||
class ProvisioningRequest(BaseModel):
|
||||
so_contact_id: int
|
||||
so_person_id: Optional[int] = None
|
||||
@@ -302,6 +308,58 @@ def unsubscribe_contact(token: str, db: Session = Depends(get_db)):
|
||||
def health_check(username: str = Depends(authenticate_user)):
|
||||
return {"status": "ok", "version": settings.VERSION, "db": settings.DATABASE_URL}
|
||||
|
||||
@app.post("/api/match-company/reload")
async def reload_matching_service(db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """
    Force the matching service (Deduplicator) to reload all company records from DB.

    A fresh ``Deduplicator`` is built from ``db`` and stored on
    ``app.state.deduplicator``, replacing any previously cached instance.
    Should be called after major imports or SuperOffice syncs.

    Args:
        db: Database session injected via ``get_db``.
        username: Injected via ``authenticate_user``; not used in the body.

    Returns:
        A dict with ``status`` set to ``"success"`` and ``records_loaded`` set
        to the number of entries in the new instance's ``reference_data``.

    Raises:
        HTTPException: Status 500 with the error text as detail when the
            reload fails.
    """
    # NOTE(review): this is an ``async def`` that performs synchronous work
    # (building the Deduplicator from a DB session); presumably this blocks
    # the event loop while loading - confirm whether that is acceptable.
    try:
        app.state.deduplicator = Deduplicator(db)
        return {
            "status": "success",
            "records_loaded": len(app.state.deduplicator.reference_data),
        }
    except Exception as e:
        # logger.exception keeps the original message and adds the traceback,
        # matching the level of detail logged by the matching endpoint.
        logger.exception("Failed to reload matching service: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
@app.post("/api/match-company")
async def match_company(request: CompanyMatchRequest, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """
    Centralized Account Matching Service.

    Checks if a company already exists in SuperOffice (via Company Explorer DB).
    Returns list of matches with scores and CRM IDs.

    Args:
        request: Company attributes (name, website, city, country) to match.
        db: Database session injected via ``get_db``; only used to build the
            Deduplicator on the first call.
        username: Injected via ``authenticate_user``; not used in the body.

    Returns:
        A dict with the echoed ``query``, a ``match_found`` flag, the
        ``best_match`` (first result or ``None``) and ``all_matches``.

    Raises:
        HTTPException: Status 500 with detail ``"Matching failed: <error>"``
            when initialization or matching fails.
    """
    try:
        # Lazy initialization of Deduplicator: built once and cached on
        # app.state so later requests reuse the loaded reference data.
        if not hasattr(app.state, 'deduplicator'):
            logger.info("Initializing Deduplicator for the first time...")
            app.state.deduplicator = Deduplicator(db)

        # Prepare Candidate dict for the service
        candidate = {
            'name': request.name,
            'website': request.website,
            'city': request.city,
            'country': request.country,
        }

        results = app.state.deduplicator.find_duplicates(candidate)

        # Return structured results
        return {
            "query": candidate,
            "match_found": len(results) > 0,
            "best_match": results[0] if results else None,
            "all_matches": results,
        }
    except Exception as e:
        # logger.exception records the message and the full traceback in one
        # call, replacing the manual traceback.format_exc() logging.
        logger.exception("Error in company matching: %s", e)
        raise HTTPException(status_code=500, detail=f"Matching failed: {e}") from e
|
||||
|
||||
@app.post("/api/provision/superoffice-contact", response_model=ProvisioningResponse)
|
||||
def provision_superoffice_contact(
|
||||
req: ProvisioningRequest,
|
||||
|
||||
@@ -63,7 +63,8 @@ class Deduplicator:
|
||||
Optimized for 10k-50k records.
|
||||
"""
|
||||
logger.info("Loading reference data for deduplication...")
|
||||
query = self.db.query(Company.id, Company.name, Company.website, Company.city, Company.country)
|
||||
# Include crm_id in the query
|
||||
query = self.db.query(Company.id, Company.name, Company.website, Company.city, Company.country, Company.crm_id)
|
||||
companies = query.all()
|
||||
|
||||
for c in companies:
|
||||
@@ -72,6 +73,7 @@ class Deduplicator:
|
||||
|
||||
record = {
|
||||
'id': c.id,
|
||||
'crm_id': c.crm_id,
|
||||
'name': c.name,
|
||||
'normalized_name': norm_name,
|
||||
'normalized_domain': norm_domain,
|
||||
@@ -81,7 +83,7 @@ class Deduplicator:
|
||||
self.reference_data.append(record)
|
||||
|
||||
# Build Indexes
|
||||
if norm_domain:
|
||||
if norm_domain and norm_domain != "k.a.":
|
||||
self.domain_index.setdefault(norm_domain, []).append(record)
|
||||
|
||||
# Token Frequency
|
||||
@@ -113,7 +115,7 @@ class Deduplicator:
|
||||
candidates_to_check = {} # Map ID -> Record
|
||||
|
||||
# 1. Domain Match (Fastest)
|
||||
if c_norm_domain and c_norm_domain in self.domain_index:
|
||||
if c_norm_domain and c_norm_domain != "k.a." and c_norm_domain in self.domain_index:
|
||||
for r in self.domain_index[c_norm_domain]:
|
||||
candidates_to_check[r['id']] = r
|
||||
|
||||
@@ -123,6 +125,14 @@ class Deduplicator:
|
||||
for r in self.token_index[rtok]:
|
||||
candidates_to_check[r['id']] = r
|
||||
|
||||
if not candidates_to_check:
|
||||
# Fallback: if no domain or rare token match, we might have an exact name match that wasn't indexed correctly (e.g. all tokens are stop words)
|
||||
# This is rare but possible. We scan reference_data directly when the normalized name is longer than 3 characters and the candidate pool is empty.
|
||||
if len(c_norm_name) > 3:
|
||||
for r in self.reference_data:
|
||||
if r['normalized_name'] == c_norm_name:
|
||||
candidates_to_check[r['id']] = r
|
||||
|
||||
if not candidates_to_check:
|
||||
return []
|
||||
|
||||
@@ -135,12 +145,14 @@ class Deduplicator:
|
||||
)
|
||||
|
||||
# Threshold Logic (Weak vs Strong)
|
||||
# A match is "weak" if there is no domain match AND no location match
|
||||
is_weak = (details['domain_match'] == 0 and not (details['loc_match']))
|
||||
threshold = SCORE_THRESHOLD_WEAK if is_weak else SCORE_THRESHOLD
|
||||
|
||||
if score >= threshold:
|
||||
matches.append({
|
||||
'company_id': db_rec['id'],
|
||||
'crm_id': db_rec['crm_id'],
|
||||
'name': db_rec['name'],
|
||||
'score': score,
|
||||
'details': details
|
||||
@@ -155,11 +167,11 @@ class Deduplicator:
|
||||
|
||||
# Exact Name Shortcut
|
||||
if n1 and n1 == n2:
|
||||
return 100, {'exact': True, 'domain_match': 0, 'loc_match': 0}
|
||||
return 100, {'exact': True, 'domain_match': 0, 'loc_match': 1 if (cand['c'] and ref['city'] and cand['c'] == ref['city']) else 0, 'name_score': 100, 'penalties': 0}
|
||||
|
||||
# Domain
|
||||
d1, d2 = cand['d'], ref['normalized_domain']
|
||||
domain_match = 1 if (d1 and d2 and d1 == d2) else 0
|
||||
domain_match = 1 if (d1 and d2 and d1 != "k.a." and d1 == d2) else 0
|
||||
|
||||
# Location
|
||||
city_match = 1 if (cand['c'] and ref['city'] and cand['c'] == ref['city']) else 0
|
||||
@@ -176,7 +188,8 @@ class Deduplicator:
|
||||
ss = fuzz.token_sort_ratio(clean1, clean2)
|
||||
name_score = max(ts, pr, ss)
|
||||
else:
|
||||
name_score = 0
|
||||
# If cleaning removed everything, fallback to raw fuzzy on normalized names
|
||||
name_score = fuzz.ratio(n1, n2) if (n1 and n2) else 0
|
||||
|
||||
# Penalties
|
||||
penalties = 0
|
||||
@@ -194,7 +207,7 @@ class Deduplicator:
|
||||
total = name_score
|
||||
|
||||
if loc_match:
|
||||
total += 10 # Bonus
|
||||
total += 10 # Bonus for location match
|
||||
|
||||
total -= penalties
|
||||
|
||||
|
||||
44
company-explorer/backend/tests/test_matching_logic.py
Normal file
44
company-explorer/backend/tests/test_matching_logic.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
# Add backend to path
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from database import Company
|
||||
from services.deduplication import Deduplicator
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Connects to the SQLite database file directly; nothing is mocked here.
# The config uses /data/companies_v3_fixed_2.db in Docker, but locally it's in the root.
# NOTE(review): DB_PATH is relative, so it presumably resolves against the
# current working directory rather than this file's location - confirm how
# the script is expected to be launched.
DB_PATH = "../../companies_v3_fixed_2.db"
engine = create_engine(f"sqlite:///{DB_PATH}")
Session = sessionmaker(bind=engine)
# Module-level session that is handed to the Deduplicator in test_matching().
db = Session()
|
||||
|
||||
def test_matching():
    """
    Run a fixed set of sample queries through the Deduplicator and print results.

    Builds a ``Deduplicator`` from the module-level ``db`` session, calls
    ``find_duplicates`` for each sample company and prints up to three matches
    per query (name, score and CRM ID). This is a smoke run: it prints results
    and makes no assertions. The session is closed once the run finishes, even
    if a query raises.
    """
    test_cases = [
        {"name": "Wolfra", "website": "wolfra.de", "city": "Erding"},
        {"name": "Wolfra Kelterei", "website": "wolfra.de", "city": "Erding"},
        {"name": "Wolfra Fruchtsaft GmbH", "website": "https://www.wolfra.de/", "city": "Erding"},
        {"name": "Müller GmbH", "city": "München"},  # Broad search
        {"name": "NonExistentCompany", "city": "Berlin"},
    ]

    try:
        dedup = Deduplicator(db)

        for case in test_cases:
            print(f"\n--- Matching Query: {case['name']} ({case.get('website', 'no-url')}) ---")
            results = dedup.find_duplicates(case)
            if results:
                # Only the top three matches are shown per query.
                for i, res in enumerate(results[:3]):
                    print(f" [{i+1}] Match: {res['name']} (Score: {res['score']}) | CRM ID: {res['crm_id']}")
            else:
                print(" No matches found.")
    finally:
        # Release the connection held by the module-level session.
        db.close()
|
||||
|
||||
# Allow running this file directly as a script.
if __name__ == "__main__":
    test_matching()
|
||||
Reference in New Issue
Block a user