From a8715c8dce4f3f25008fb8a08986999683fdb7db Mon Sep 17 00:00:00 2001 From: Floke Date: Mon, 2 Mar 2026 07:39:46 +0000 Subject: [PATCH] [31388f42] Implement end-to-end email ingest for Tradingtwins leads via MS Graph API --- lead-engine/ingest.py | 34 +++++++ lead-engine/trading_twins_ingest.py | 143 ++++++++++++++++++++++++++++ trading_twins_tool.py | 64 ++++++++----- 3 files changed, 219 insertions(+), 22 deletions(-) create mode 100644 lead-engine/trading_twins_ingest.py diff --git a/lead-engine/ingest.py b/lead-engine/ingest.py index 49f09a12..153fa980 100644 --- a/lead-engine/ingest.py +++ b/lead-engine/ingest.py @@ -28,6 +28,40 @@ def parse_tradingtwins_email(body): data['raw_body'] = body return data +def parse_roboplanet_form(body): + """ + Parses the Roboplanet website contact form (HTML format). + Example: Vorname: BÄKO
Nachname: eG
Email: Alexander.Grau@baeko-hr.de ... + """ + data = {} + + # Helper to strip HTML tags if needed, but we'll use regex on the content + patterns = { + 'contact_first': r'Vorname:\s*(.*?)\s*
', + 'contact_last': r'Nachname:\s*(.*?)\s*
', + 'email': r'Email:\s*(.*?)\s*
', + 'phone': r'Telefon:\s*(.*?)\s*
', + 'company': r'Firma:\s*(.*?)\s*
', + 'zip': r'PLZ:\s*(.*?)\s*
', + 'message': r'Nachricht:\s*(.*?)\s*(?:
|--|$)' + } + + for key, pattern in patterns.items(): + # Use re.DOTALL for message if it spans lines, but usually it's one block + match = re.search(pattern, body, re.IGNORECASE | re.DOTALL) + if match: + # Clean HTML tags from the captured value if any + val = re.sub(r'<.*?>', '', match.group(1)).strip() + data[key] = val + + # Combine names + if 'contact_first' in data and 'contact_last' in data: + data['contact'] = f"{data['contact_first']} {data['contact_last']}" + + # For Roboplanet forms, we use the timestamp as ID or a hash if missing + data['raw_body'] = body + return data + def ingest_mock_leads(): # Mock data from the session context leads = [ diff --git a/lead-engine/trading_twins_ingest.py b/lead-engine/trading_twins_ingest.py new file mode 100644 index 00000000..6e6e5987 --- /dev/null +++ b/lead-engine/trading_twins_ingest.py @@ -0,0 +1,143 @@ +import os +import sys +import re +import logging +import requests +import json +from datetime import datetime +from dotenv import load_dotenv + +# Ensure we can import from root directory +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +# Import from root modules +try: + from company_explorer_connector import handle_company_workflow +except ImportError: + # Fallback/Mock for testing if run in isolation without full env + def handle_company_workflow(company_name): + return {"status": "mock", "data": {"name": company_name, "id": "mock-id"}} + +# Configuration +load_dotenv(override=True) +CLIENT_ID = os.getenv("INFO_Application_ID") +TENANT_ID = os.getenv("INFO_Tenant_ID") +CLIENT_SECRET = os.getenv("INFO_Secret") +USER_EMAIL = "info@robo-planet.de" + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def get_access_token(): + url = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token" + data = { + "client_id": CLIENT_ID, + "scope": "https://graph.microsoft.com/.default", + "client_secret": CLIENT_SECRET, + "grant_type": "client_credentials" + } + response = requests.post(url, data=data) + response.raise_for_status() + return response.json().get("access_token") + +def fetch_tradingtwins_emails(token, limit=20): + url = f"https://graph.microsoft.com/v1.0/users/{USER_EMAIL}/messages" + headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json" + } + # Filter for Tradingtwins subject + params = { + "$top": limit, + "$select": "id,subject,receivedDateTime,body", + "$orderby": "receivedDateTime desc" + } + response = requests.get(url, headers=headers, params=params) + if response.status_code != 200: + logger.error(f"Graph API Error: {response.status_code} - {response.text}") + return [] + + all_msgs = response.json().get("value", []) + # Filter strictly for the subject pattern we saw + return [m for m in all_msgs if "Neue Anfrage zum Thema Roboter" in m.get('subject', '')] + +def parse_tradingtwins_html(html_body): + """ + Extracts data from the Tradingtwins HTML table structure. + Pattern:

Label:

...

Value

+ """ + data = {} + + # Map label names in HTML to our keys + field_map = { + 'Firma': 'company', + 'Vorname': 'first_name', + 'Nachname': 'last_name', + 'E-Mail': 'email', + 'Rufnummer': 'phone', + 'Einsatzzweck': 'purpose', + 'Reinigungs-Fläche': 'area', + 'PLZ': 'zip', + 'Stadt': 'city', + 'Lead-ID': 'lead_id' + } + + for label, key in field_map.items(): + # Regex explanation: + # >\s*{label}:\s*

-> Finds the label inside a p tag, ending with colon + # .*? -> Non-greedy match for table cell closing/opening + # ]*> -> Finds the start of the value paragraph + # (.*?) -> Captures the value + #

-> Ends at closing paragraph tag + pattern = fr'>\s*{re.escape(label)}:\s*

.*?]*>(.*?)

' + + match = re.search(pattern, html_body, re.DOTALL | re.IGNORECASE) + if match: + # Clean up the value (remove HTML tags inside if any, though usually plain text) + raw_val = match.group(1).strip() + # Remove any link tags if present (e.g. for email/phone) + clean_val = re.sub(r'<[^>]+>', '', raw_val).strip() + data[key] = clean_val + + # Composite fields + if data.get('first_name') and data.get('last_name'): + data['contact_name'] = f"{data['first_name']} {data['last_name']}" + + return data + +def process_leads(): + try: + token = get_access_token() + emails = fetch_tradingtwins_emails(token) + logger.info(f"Found {len(emails)} Tradingtwins emails.") + + for email in emails: + body = email.get('body', {}).get('content', '') + lead_data = parse_tradingtwins_html(body) + + company_name = lead_data.get('company') + if not company_name or company_name == '-': + # Fallback if company is empty (sometimes happens with private persons) + # Use contact name as company name + company_name = lead_data.get('contact_name') + + if not company_name: + logger.warning(f"Skipping email {email['id']}: No company or contact name found.") + continue + + logger.info(f"Processing Lead: {company_name} (ID: {lead_data.get('lead_id')})") + + # Trigger Company Explorer Workflow + # Note: In a real scenario, we might want to check if we already processed this message ID + # to avoid duplicates. For now, we rely on the Company Explorer's deduplication. + logger.info(f" -> Triggering Company Explorer for '{company_name}'...") + result = handle_company_workflow(company_name) + + logger.info(f" -> Result: {result.get('status')} (ID: {result.get('data', {}).get('id')})") + + except Exception as e: + logger.error(f"Error in process_leads: {e}") + +if __name__ == "__main__": + process_leads() diff --git a/trading_twins_tool.py b/trading_twins_tool.py index 773cc18b..1f4df58a 100644 --- a/trading_twins_tool.py +++ b/trading_twins_tool.py @@ -1,6 +1,16 @@ import json import time import os +import sys + +# Ensure we can import from lead-engine +sys.path.append(os.path.join(os.path.dirname(__file__), 'lead-engine')) +try: + from trading_twins_ingest import process_leads +except ImportError: + print("Warning: Could not import trading_twins_ingest from lead-engine. Email ingestion disabled.") + process_leads = None + from company_explorer_connector import handle_company_workflow def run_trading_twins_process(target_company_name: str): @@ -46,6 +56,14 @@ def run_trading_twins_process(target_company_name: str): print(f"Trading Twins Analyse für {target_company_name} abgeschlossen.") print(f"{'='*50}\n") +def run_email_ingest(): + """Starts the automated email ingestion process for Tradingtwins leads.""" + if process_leads: + print("\nStarting automated email ingestion via Microsoft Graph...") + process_leads() + print("Email ingestion completed.") + else: + print("Error: Email ingestion module not available.") if __name__ == "__main__": # Simulieren der Umgebungsvariablen für diesen Testlauf, falls nicht gesetzt @@ -54,26 +72,28 @@ if __name__ == "__main__": if "COMPANY_EXPLORER_API_PASSWORD" not in os.environ: os.environ["COMPANY_EXPLORER_API_PASSWORD"] = "gemini" - # Testfall 1: Ein Unternehmen, das wahrscheinlich bereits existiert - # Da 'Robo-Planet GmbH' bei den vorherigen Läufen erstellt wurde, sollte es jetzt gefunden werden. - run_trading_twins_process("Robo-Planet GmbH") - - # Kurze Pause zwischen den Testläufen - time.sleep(5) - - # Testfall 1b: Ein bekanntes, real existierendes Unternehmen - run_trading_twins_process("Klinikum Landkreis Erding") - - # Kurze Pause zwischen den Testläufen - time.sleep(5) - - # Testfall 2: Ein neues, eindeutiges Unternehmen - new_unique_company_name = f"Trading Twins New Target {int(time.time())}" - run_trading_twins_process(new_unique_company_name) - - # Kurze Pause - time.sleep(5) + print("Trading Twins Tool - Main Menu") + print("1. Process specific company name") + print("2. Ingest leads from Email (info@robo-planet.de)") + print("3. Run demo sequence (Robo-Planet, Erding, etc.)") - # Testfall 3: Ein weiteres neues Unternehmen, um die Erstellung zu prüfen - another_new_company_name = f"Another Demo Corp {int(time.time())}" - run_trading_twins_process(another_new_company_name) + choice = input("\nSelect option (1-3): ").strip() + + if choice == "1": + name = input("Enter company name: ").strip() + if name: + run_trading_twins_process(name) + elif choice == "2": + run_email_ingest() + elif choice == "3": + # Testfall 1: Ein Unternehmen, das wahrscheinlich bereits existiert + run_trading_twins_process("Robo-Planet GmbH") + time.sleep(2) + # Testfall 1b: Ein bekanntes, real existierendes Unternehmen + run_trading_twins_process("Klinikum Landkreis Erding") + time.sleep(2) + # Testfall 2: Ein neues, eindeutiges Unternehmen + new_unique_company_name = f"Trading Twins New Target {int(time.time())}" + run_trading_twins_process(new_unique_company_name) + else: + print("Invalid choice.")