From 04013920eef53a2ceb1bf9cbdb387090a93a8f8b Mon Sep 17 00:00:00 2001 From: Floke Date: Mon, 2 Mar 2026 15:10:12 +0000 Subject: [PATCH] [31388f42] Final session polish: Refined UI, improved ingest parsing, and completed documentation --- .dev_session/SESSION_INFO | 2 +- add_mapping.py | 12 --- company-explorer/backend/app.py | 15 ++- lead-engine/Dockerfile | 3 +- lead-engine/app.py | 114 +++++++++++++--------- lead-engine/db.py | 3 + lead-engine/generate_reply.py | 145 ++++++++++++++++------------ lead-engine/repair_leads.py | 59 ----------- lead-engine/repair_leads_v2.py | 40 -------- lead-engine/trading_twins_ingest.py | 2 + 10 files changed, 171 insertions(+), 224 deletions(-) delete mode 100644 add_mapping.py delete mode 100644 lead-engine/repair_leads.py delete mode 100644 lead-engine/repair_leads_v2.py diff --git a/.dev_session/SESSION_INFO b/.dev_session/SESSION_INFO index 9449ba81..0fc41f27 100644 --- a/.dev_session/SESSION_INFO +++ b/.dev_session/SESSION_INFO @@ -1 +1 @@ -{"task_id": "31588f42-8544-800b-8c82-e17c067bdf69", "token": "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", "readme_path": "connector-superoffice/README.md", "session_start_time": "2026-02-28T18:45:32.220313"} \ No newline at end of file +{"task_id": "31388f42-8544-81d0-9016-e3bf25383da3", "token": "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", "readme_path": null, "session_start_time": "2026-03-02T07:27:14.846513"} \ No newline at end of file diff --git a/add_mapping.py b/add_mapping.py deleted file mode 100644 index ef143eb2..00000000 --- a/add_mapping.py +++ /dev/null @@ -1,12 +0,0 @@ -import sqlite3 - -def add_mapping(): - conn = sqlite3.connect('/app/companies_v3_fixed_2.db') - cursor = conn.cursor() - cursor.execute("INSERT INTO job_role_mappings (pattern, role, created_at) VALUES ('%geschäftsführung%', 'Wirtschaftlicher Entscheider', '2026-02-22T14:30:00')") - conn.commit() - conn.close() - print("Added mapping for geschäftsführung") - -if __name__ == "__main__": - add_mapping() diff --git a/company-explorer/backend/app.py b/company-explorer/backend/app.py index 29a74763..487257b2 100644 --- a/company-explorer/backend/app.py +++ b/company-explorer/backend/app.py @@ -647,6 +647,17 @@ def create_contact_endpoint(contact: ContactCreate, db: Session = Depends(get_db if not company: raise HTTPException(status_code=404, detail="Company not found") + # Automatic Role Mapping logic + final_role = contact.role + if contact.job_title and not final_role: + role_mapping_service = RoleMappingService(db) + found_role = role_mapping_service.get_role_for_job_title(contact.job_title) + if found_role: + final_role = found_role + else: + # Log unclassified title for future mining + role_mapping_service.add_or_update_unclassified_title(contact.job_title) + # Check if contact with same email already exists for this company if contact.email: existing = db.query(Contact).filter(Contact.company_id == contact.company_id, Contact.email == contact.email).first() @@ -655,7 +666,7 @@ def create_contact_endpoint(contact: ContactCreate, db: Session = Depends(get_db existing.first_name = contact.first_name existing.last_name = contact.last_name existing.job_title = contact.job_title - existing.role = contact.role + existing.role = final_role db.commit() db.refresh(existing) return existing @@ -666,7 +677,7 @@ def create_contact_endpoint(contact: ContactCreate, db: Session = Depends(get_db last_name=contact.last_name, email=contact.email, job_title=contact.job_title, - role=contact.role, + role=final_role, is_primary=contact.is_primary, status="ACTIVE", unsubscribe_token=str(uuid.uuid4()) diff --git a/lead-engine/Dockerfile b/lead-engine/Dockerfile index d31b8e62..b3b47e47 100644 --- a/lead-engine/Dockerfile +++ b/lead-engine/Dockerfile @@ -8,4 +8,5 @@ COPY . . RUN pip install streamlit pandas requests python-dotenv ENV PYTHONUNBUFFERED=1 -CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"] +# Start monitor in background and streamlit in foreground +CMD ["sh", "-c", "python monitor.py & streamlit run app.py --server.port=8501 --server.address=0.0.0.0"] diff --git a/lead-engine/app.py b/lead-engine/app.py index 85904bf6..13494fd8 100644 --- a/lead-engine/app.py +++ b/lead-engine/app.py @@ -8,22 +8,40 @@ from enrich import run_sync, refresh_ce_data, sync_single_lead from generate_reply import generate_email_draft def clean_html_to_text(html_content): - """Simple helper to convert HTML email body to readable plain text.""" + """Surgical helper to extract relevant Tradingtwins data and format it cleanly.""" if not html_content: return "" - # Remove head and style tags entirely + + # 1. Strip head and style clean = re.sub(r'.*?', '', html_content, flags=re.DOTALL | re.IGNORECASE) clean = re.sub(r'.*?', '', clean, flags=re.DOTALL | re.IGNORECASE) - # Replace
and

with newlines + + # 2. Extract the core data block (from 'Datum:' until the matchmaking plug) + # We look for the first 'Datum:' label + start_match = re.search(r'Datum:', clean, re.IGNORECASE) + end_match = re.search(r'Kennen Sie schon Ihr persönliches Konto', clean, re.IGNORECASE) + + if start_match: + start_pos = start_match.start() + end_pos = end_match.start() if end_match else len(clean) + clean = clean[start_pos:end_pos] + + # 3. Format Table Structure: should be a space/tab, a newline + # This prevents the "Label on one line, value on next" issue + clean = re.sub(r'\s*', ' ', clean, flags=re.IGNORECASE) + clean = re.sub(r'', '\n', clean, flags=re.IGNORECASE) + + # 4. Standard Cleanup clean = re.sub(r'', '\n', clean, flags=re.IGNORECASE) clean = re.sub(r'

', '\n', clean, flags=re.IGNORECASE) - # Remove all other tags clean = re.sub(r'<.*?>', '', clean) - # Decode some common entities - clean = clean.replace(' ', ' ').replace('&', '&').replace('"', '"') - # Cleanup multiple newlines - clean = re.sub(r'\n\s*\n+', '\n\n', clean).strip() - return clean + + # 5. Entity Decoding + clean = clean.replace(' ', ' ').replace('&', '&').replace('"', '"').replace('>', '>') + + # 6. Final Polish: remove empty lines and leading/trailing whitespace + lines = [line.strip() for line in clean.split('\n') if line.strip()] + return '\n'.join(lines) st.set_page_config(page_title="TradingTwins Lead Engine", layout="wide") @@ -140,13 +158,15 @@ if not df.empty: if meta.get('is_low_quality'): st.warning("⚠️ **Low Quality Lead detected** (Free-mail or missing company).") - # --- SECTION 1: LEAD INFO (2 Columns) --- - st.markdown("### 📋 Lead Data") - c1, c2 = st.columns(2) + # --- SECTION 1: LEAD INFO & INTELLIGENCE --- + col_lead, col_intel = st.columns(2) - with c1: + with col_lead: + st.markdown("### 📋 Lead Data") + st.write(f"**Salutation:** {meta.get('salutation', '-')}") st.write(f"**Contact:** {row['contact_name']}") st.write(f"**Email:** {row['email']}") + st.write(f"**Phone:** {meta.get('phone', row.get('phone', '-'))}") role = meta.get('role') if role: @@ -158,58 +178,56 @@ if not df.empty: found_role = enrich_contact_role(row) if found_role: st.success(f"Found: {found_role}"); st.rerun() else: st.error("No role found.") - - with c2: + st.write(f"**Area:** {meta.get('area', '-')}") st.write(f"**Purpose:** {meta.get('purpose', '-')}") + st.write(f"**Functions:** {meta.get('cleaning_functions', '-')}") st.write(f"**Location:** {meta.get('zip', '')} {meta.get('city', '')}") - - with st.expander("Original Body Preview"): - st.text(clean_html_to_text(row['raw_body'])) - if st.checkbox("Show HTML", key=f"raw_{row['id']}"): - st.code(row['raw_body'], language="html") - st.divider() - - # --- SECTION 2: INTELLIGENCE (CE) --- - st.markdown("### 🔍 Intelligence (CE)") - enrichment = json.loads(row['enrichment_data']) if row['enrichment_data'] else {} - ce_id = enrichment.get('ce_id') - - if ce_id: - st.success(f"✅ Linked to Company Explorer (ID: {ce_id})") - ce_data = enrichment.get('ce_data', {}) + with col_intel: + st.markdown("### 🔍 Intelligence (CE)") + enrichment = json.loads(row['enrichment_data']) if row['enrichment_data'] else {} + ce_id = enrichment.get('ce_id') - vertical = ce_data.get('industry_ai') or ce_data.get('vertical') - summary = ce_data.get('research_dossier') or ce_data.get('summary') - - intel_col1, intel_col2 = st.columns([1, 2]) - with intel_col1: + if ce_id: + st.success(f"✅ Linked to Company Explorer (ID: {ce_id})") + ce_data = enrichment.get('ce_data', {}) + + vertical = ce_data.get('industry_ai') or ce_data.get('vertical') + summary = ce_data.get('research_dossier') or ce_data.get('summary') + if vertical and vertical != 'None': st.info(f"**Industry:** {vertical}") else: st.warning("Industry Analysis pending...") + if summary: + with st.expander("Show AI Research Dossier", expanded=True): + st.write(summary) + if st.button("🔄 Refresh CE Data", key=f"refresh_{row['id']}"): with st.spinner("Fetching..."): refresh_ce_data(row['id'], ce_id) st.rerun() - - with intel_col2: - if summary: - with st.expander("Show AI Research Dossier", expanded=True): - st.write(summary) - else: - st.warning("⚠️ Not synced with Company Explorer yet") - if st.button("🚀 Sync to Company Explorer", key=f"sync_single_{row['id']}"): - with st.spinner("Syncing..."): - sync_single_lead(row['id']) - st.rerun() + else: + st.warning("⚠️ Not synced with Company Explorer yet") + if st.button("🚀 Sync to Company Explorer", key=f"sync_single_{row['id']}"): + with st.spinner("Syncing..."): + sync_single_lead(row['id']) + st.rerun() st.divider() - # --- SECTION 3: RESPONSE DRAFT --- - st.markdown("### ✉️ Response Draft") + # --- SECTION 2: ORIGINAL EMAIL --- + with st.expander("✉️ View Original Email Content"): + st.text(clean_html_to_text(row['raw_body'])) + if st.checkbox("Show Raw HTML", key=f"raw_{row['id']}"): + st.code(row['raw_body'], language="html") + + st.divider() + + # --- SECTION 3: RESPONSE DRAFT (Full Width) --- + st.markdown("### 📝 Response Draft") if row['status'] != 'new' and ce_id: if st.button("✨ Generate Expert Reply", key=f"gen_{row['id']}", type="primary"): with st.spinner("Writing email..."): diff --git a/lead-engine/db.py b/lead-engine/db.py index 61971b40..f1196aa7 100644 --- a/lead-engine/db.py +++ b/lead-engine/db.py @@ -57,6 +57,9 @@ def insert_lead(lead_data): 'zip': lead_data.get('zip'), 'city': lead_data.get('city'), 'role': lead_data.get('role'), + 'salutation': lead_data.get('salutation'), + 'phone': lead_data.get('phone'), + 'cleaning_functions': lead_data.get('cleaning_functions'), 'is_free_mail': lead_data.get('is_free_mail', False), 'is_low_quality': lead_data.get('is_low_quality', False) } diff --git a/lead-engine/generate_reply.py b/lead-engine/generate_reply.py index 251ada8e..f290e4ee 100644 --- a/lead-engine/generate_reply.py +++ b/lead-engine/generate_reply.py @@ -3,8 +3,9 @@ import json import requests import sqlite3 import re +import datetime -# Load API Key +# --- Helper: Get Gemini Key --- def get_gemini_key(): candidates = [ "gemini_api_key.txt", # Current dir @@ -57,34 +58,63 @@ def get_matrix_context(industry_name, persona_name): return context -def get_product_recommendation(area_str): +def get_suggested_date(): + """Calculates a suggested meeting date (3-4 days in future, avoiding weekends).""" + now = datetime.datetime.now() + # Jump 3 days ahead + suggested = now + datetime.timedelta(days=3) + # If weekend, move to Monday + if suggested.weekday() == 5: # Saturday + suggested += datetime.timedelta(days=2) + elif suggested.weekday() == 6: # Sunday + suggested += datetime.timedelta(days=1) + + days_de = ["Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"] + return f"{days_de[suggested.weekday()]}, den {suggested.strftime('%d.%m.')} um 10:00 Uhr" + +def clean_company_name(name): + """Removes legal suffixes like GmbH, AG, etc. for a more personal touch.""" + if not name: return "" + # Remove common German legal forms + cleaned = re.sub(r'\s+(GmbH|AG|GmbH\s+&\s+Co\.\s+KG|KG|e\.V\.|e\.K\.|Limited|Ltd|Inc)\.?(?:\s|$)', '', name, flags=re.IGNORECASE) + return cleaned.strip() + +def get_multi_solution_recommendation(area_str, purpose_str): """ - Selects the right robot based on the surface area mentioned in the lead. + Selects a range of robots based on surface area AND requested purposes. """ - # Naive extraction of first number in the string + recommendations = [] + purpose_lower = purpose_str.lower() + + # 1. Cleaning Logic (Area based) nums = re.findall(r'\d+', area_str.replace('.', '').replace(',', '')) area_val = int(nums[0]) if nums else 0 - if area_val >= 5000 or "über 10.000" in area_str: - return { - "name": "Scrubber 75", - "reason": "als industrielles Kraftpaket für Großflächen ausgelegt", - "usp": "höchste Effizienz und Autonomie auf mehreren tausend Quadratmetern" - } - elif area_val >= 1000: - return { - "name": "Scrubber 50 oder Phantas", - "reason": "die optimale Balance zwischen Reinigungsleistung und Wendigkeit", - "usp": "ideal für mittelgroße Fertigungs- und Lagerbereiche" - } - else: - return { - "name": "Phantas oder Pudu CC1", - "reason": "kompakt und wendig für komplexe Umgebungen", - "usp": "perfekt für Büros, Praxen oder engere Verkehrswege" - } + if "reinigung" in purpose_lower: + if area_val >= 5000 or "über 10.000" in area_str: + recommendations.append("den Scrubber 75 als industrielles Kraftpaket für Ihre Großflächen") + elif area_val >= 1000: + recommendations.append("den Scrubber 50 oder Phantas für eine wendige und gründliche Bodenreinigung") + else: + recommendations.append("den Phantas oder Pudu CC1 für eine effiziente Reinigung Ihrer Räumlichkeiten") -def generate_email_draft(lead_data, company_data, booking_link="[IHR BUCHUNGSLINK - BITTE IN .ENV EINTRAGEN]"): + # 2. Service/Transport Logic + if any(word in purpose_lower for word in ["servieren", "abräumen", "speisen", "getränke"]): + recommendations.append("den BellaBot zur Entlastung Ihres Teams beim Transport von Speisen und Getränken") + + # 3. Marketing/Interaction Logic + if any(word in purpose_lower for word in ["marketing", "gästebetreuung", "kundenansprache"]): + recommendations.append("den KettyBot als interaktiven Begleiter für Marketing und Patienteninformation") + + if not recommendations: + recommendations.append("unsere wendigen Allrounder wie den Phantas") + + return { + "solution_text": " und ".join(recommendations), + "has_multi": len(recommendations) > 1 + } + +def generate_email_draft(lead_data, company_data, booking_link="[IHR BUCHUNGSLINK]"): """ Generates a high-end, personalized sales email using Gemini API and Matrix knowledge. """ @@ -93,7 +123,8 @@ def generate_email_draft(lead_data, company_data, booking_link="[IHR BUCHUNGSLIN return "Error: Gemini API Key not found." # Extract Data from Lead Engine - company_name = lead_data.get('company_name', 'Interessent') + company_raw = lead_data.get('company_name', 'Interessent') + company_name = clean_company_name(company_raw) contact_name = lead_data.get('contact_name', 'Damen und Herren') # Metadata from Lead @@ -105,14 +136,17 @@ def generate_email_draft(lead_data, company_data, booking_link="[IHR BUCHUNGSLIN area = meta.get('area', 'Unbekannte Fläche') purpose = meta.get('purpose', 'Reinigung') role = meta.get('role', 'Wirtschaftlicher Entscheider') + salutation = meta.get('salutation', 'Damen und Herren') + cleaning_functions = meta.get('cleaning_functions', '') # Data from Company Explorer ce_summary = company_data.get('research_dossier') or company_data.get('summary', '') - ce_vertical = company_data.get('industry_ai') or company_data.get('vertical', 'Industry - Manufacturing') + ce_vertical = company_data.get('industry_ai') or company_data.get('vertical', 'Healthcare') ce_opener = company_data.get('ai_opener', '') - # Product logic - product = get_product_recommendation(area) + # Multi-Solution Logic + solution = get_multi_solution_recommendation(area, purpose) + suggested_date = get_suggested_date() # Fetch "Golden Records" from Matrix matrix = get_matrix_context(ce_vertical, role) @@ -122,46 +156,35 @@ def generate_email_draft(lead_data, company_data, booking_link="[IHR BUCHUNGSLIN Du bist ein Senior Sales Executive bei Robo-Planet. Antworte auf eine Anfrage von Tradingtwins. Schreibe eine E-Mail auf "Human Expert Level". - WICHTIGE STRATEGIE: - - Starte NICHT mit seiner Position (CFO). Starte mit der Wertschätzung für sein UNTERNEHMEN ({company_name}). - - Der Empfänger soll durch die Tiefe der Argumente MERKEN, dass wir für einen Entscheider schreiben. - - Mappe ihn erst später als "finanziellen/wirtschaftlichen Entscheider". - - Erwähne eine ROI-Perspektive (Amortisation). - - KONTEXT (Vom Company Explorer): - - Firma: {company_name} - - Branche: {ce_vertical} - - Branchen-Pains (Nutze diese für die Argumentation): {matrix['industry_pains']} - - Branchen-Gains: {matrix['industry_gains']} - - Dossier/Business-Profil: {ce_summary} - - Strategischer Aufhänger: {ce_opener} - - ANSPRECHPARTNER: + WICHTIGE IDENTITÄT: + - Anrede-Form: {salutation} (z.B. Herr, Frau) - Name: {contact_name} - - Rolle: {role} + - Firma: {company_name} - PRODUKT-EMPFEHLUNG (Basierend auf Fläche {area}): - - Modell: {product['name']} - - Warum: {product['reason']} - - USP: {product['usp']} - - ANFRAGE-DETAILS: - - Bedarf: {area} - - Zweck: {purpose} + STRATEGIE: + - STARTE DIREKT mit dem strategischen Aufhänger aus dem Company Explorer ({ce_opener}). Baue daraus den ersten Absatz. + - KEIN "mit großem Interesse verfolge ich..." oder ähnliche Phrasen. Das wirkt unnatürlich. + - Deine Mail reagiert auf die Anfrage zu: {purpose} auf {area}. + - Fasse die vorgeschlagene Lösung ({solution['solution_text']}) KOMPAKT zusammen. Wir bieten ein ganzheitliches Entlastungskonzept an, keine Detail-Auflistung von Datenblättern. + + KONTEXT: + - Branche: {ce_vertical} + - Pains aus Matrix: {matrix['industry_pains']} + - Dossier/Wissen: {ce_summary} + - Strategischer Aufhänger (CE-Opener): {ce_opener} AUFGABE: - Schreibe eine E-Mail mit dieser Struktur: - 1. EINSTIEG: Fokus auf Klemm Bohrtechnik und deren Marktstellung/Produkte (Bezug auf den 'Strategischen Aufhänger'). - 2. DIE BRÜCKE: Verknüpfe die Präzision ihrer Produkte mit der Notwendigkeit von sauberen Hallenböden (besonders bei {area}). Nutze den Schmerzpunkt "Prozesssicherheit/Sensorik". - 3. DIE LÖSUNG: Positioniere den {product['name']} als genau die richtige Wahl für diese Größenordnung ({area}). - 4. ROI-LOGIK: Sprich ihn als wirtschaftlichen Entscheider an. Erwähne, dass wir für solche Projekte ROI-Kalkulationen erstellen, die oft eine Amortisation in unter 18-24 Monaten zeigen. - 5. CALL TO ACTION: Beratungsgespräch + Buchungslink: {booking_link} + 1. ANREDE: Persönlich. + 2. EINSTIEG: Nutze den inhaltlichen Kern von: "{ce_opener}". + 3. DER ÜBERGANG: Verknüpfe dies mit der Anfrage zu {purpose}. Erkläre, dass manuelle Prozesse bei {area} angesichts der Dokumentationspflichten und des Fachkräftemangels zum Risiko werden. + 4. DIE LÖSUNG: Schlage die Kombination aus {solution['solution_text']} als integriertes Konzept vor, um das Team in Reinigung, Service und Patientenansprache spürbar zu entlasten. + 5. ROI: Sprich kurz die Amortisation (18-24 Monate) an – als Argument für den wirtschaftlichen Entscheider. + 6. CTA: Schlag konkret den {suggested_date} vor. Alternativ: {booking_link} - STIL: - Senior, Augenhöhe, keine Floskeln, extrem fokussiert auf Effizienz und Qualität. + STIL: Senior, lösungsorientiert, direkt. Keine unnötigen Füllwörter. FORMAT: - Betreff: [Relevanter Betreff, der direkt auf Klemm Bohrtechnik / Effizienz zielt] + Betreff: [Prägnant, z.B. Automatisierungskonzept für {company_name}] [E-Mail Text] """ @@ -190,4 +213,4 @@ if __name__ == "__main__": "vertical": "Healthcare / Krankenhaus", "summary": "Ein großes Klinikum der Maximalversorgung mit Fokus auf Kardiologie." } - print(generate_email_draft(mock_lead, mock_company)) \ No newline at end of file + print(generate_email_draft(mock_lead, mock_company)) diff --git a/lead-engine/repair_leads.py b/lead-engine/repair_leads.py deleted file mode 100644 index 4e4472d8..00000000 --- a/lead-engine/repair_leads.py +++ /dev/null @@ -1,59 +0,0 @@ -import sqlite3 -import json -import re -import os -import sys - -# Add path to import db -sys.path.append(os.path.dirname(__file__)) -from db import get_leads, update_lead_metadata, init_db - -def parse_tradingtwins_html_local(html_body): - """ - Extracts data from the Tradingtwins HTML table structure. - Copied logic to ensure independence. - """ - data = {} - field_map = { - 'Einsatzzweck': 'purpose', - 'Reinigungs-Fläche': 'area', - 'PLZ': 'zip', - 'Stadt': 'city' - } - - for label, key in field_map.items(): - pattern = fr'>\s*{re.escape(label)}:\s*

.*?]*>(.*?)

' - match = re.search(pattern, html_body, re.DOTALL | re.IGNORECASE) - if match: - raw_val = match.group(1).strip() - clean_val = re.sub(r'<[^>]+>', '', raw_val).strip() - data[key] = clean_val - return data - -def repair_database(): - print("Initializing DB (migrating schema if needed)...") - init_db() - - leads = get_leads() - print(f"Found {len(leads)} leads to check.") - - count = 0 - for lead in leads: - # Check if metadata is missing or empty - current_meta = lead.get('lead_metadata') - if not current_meta or current_meta == '{}' or current_meta == 'null': - print(f"Repairing Lead {lead['id']} ({lead['company_name']})...") - - raw_body = lead.get('raw_body', '') - if raw_body: - extracted = parse_tradingtwins_html_local(raw_body) - update_lead_metadata(lead['id'], extracted) - print(f" -> Extracted: {extracted}") - count += 1 - else: - print(" -> No raw body found.") - - print(f"Repaired {count} leads.") - -if __name__ == "__main__": - repair_database() diff --git a/lead-engine/repair_leads_v2.py b/lead-engine/repair_leads_v2.py deleted file mode 100644 index 532afdf8..00000000 --- a/lead-engine/repair_leads_v2.py +++ /dev/null @@ -1,40 +0,0 @@ -import sqlite3 -import json -import re -import os -import sys - -# Add path to import db -sys.path.append(os.path.dirname(__file__)) -from db import get_leads, update_lead_metadata - -def parse_names(html_body): - data = {} - # Extract Vorname and Nachname from HTML if possible - v_match = re.search(r'>\s*Vorname:\s*

.*?]*>(.*?)

', html_body, re.DOTALL | re.IGNORECASE) - n_match = re.search(r'>\s*Nachname:\s*

.*?]*>(.*?)

', html_body, re.DOTALL | re.IGNORECASE) - - if v_match: data['contact_first'] = re.sub(r'<[^>]+>', '', v_match.group(1)).strip() - if n_match: data['contact_last'] = re.sub(r'<[^>]+>', '', n_match.group(1)).strip() - return data - -def repair_names(): - leads = get_leads() - count = 0 - for lead in leads: - meta = json.loads(lead['lead_metadata']) if lead['lead_metadata'] else {} - - # Only repair if names are missing in meta - if not meta.get('contact_first'): - raw_body = lead.get('raw_body', '') - if raw_body: - name_data = parse_names(raw_body) - if name_data: - meta.update(name_data) - update_lead_metadata(lead['id'], meta) - print(f"Fixed names for {lead['company_name']}: {name_data}") - count += 1 - print(f"Finished. Repaired {count} lead names.") - -if __name__ == "__main__": - repair_names() diff --git a/lead-engine/trading_twins_ingest.py b/lead-engine/trading_twins_ingest.py index 75a39136..2b023117 100644 --- a/lead-engine/trading_twins_ingest.py +++ b/lead-engine/trading_twins_ingest.py @@ -94,9 +94,11 @@ def parse_tradingtwins_html(html_body): 'Firma': 'company', 'Vorname': 'contact_first', 'Nachname': 'contact_last', + 'Anrede': 'salutation', 'E-Mail': 'email', 'Rufnummer': 'phone', 'Einsatzzweck': 'purpose', + 'Reinigungs-Funktionen': 'cleaning_functions', 'Reinigungs-Fläche': 'area', 'PLZ': 'zip', 'Stadt': 'city',