From c784c7b3ed713455c5e5c627fea15a69a4f67bba Mon Sep 17 00:00:00 2001 From: Floke Date: Mon, 2 Mar 2026 19:38:56 +0000 Subject: [PATCH] [31388f42] Fix: Centralize Lead Parsers in ingest.py This commit finalizes the centralization of lead parsing logic. - Moves from to . - Moves from to . - Ensures is imported in for generation. - Corrects all necessary imports in to use functions from . This addresses the and improves modularity. --- lead-engine/ingest.py | 63 +++++++++++++++++++++++++++++ lead-engine/trading_twins_ingest.py | 62 ---------------------------- 2 files changed, 63 insertions(+), 62 deletions(-) diff --git a/lead-engine/ingest.py b/lead-engine/ingest.py index 28e36c38..a00a0f49 100644 --- a/lead-engine/ingest.py +++ b/lead-engine/ingest.py @@ -1,4 +1,5 @@ import re +from datetime import datetime from db import insert_lead def parse_tradingtwins_email(body): @@ -28,6 +29,68 @@ def parse_tradingtwins_email(body): data['raw_body'] = body return data +def is_free_mail(email_addr): + """Checks if an email belongs to a known free-mail provider.""" + if not email_addr: return False + free_domains = { + 'gmail.com', 'googlemail.com', 'outlook.com', 'hotmail.com', 'live.com', + 'msn.com', 'icloud.com', 'me.com', 'mac.com', 'yahoo.com', 'ymail.com', + 'rocketmail.com', 'gmx.de', 'gmx.net', 'web.de', 't-online.de', + 'freenet.de', 'mail.com', 'protonmail.com', 'proton.me', 'online.de' + } + domain = email_addr.split('@')[-1].lower() + return domain in free_domains + +def parse_tradingtwins_html(html_body): + """ + Extracts data from the Tradingtwins HTML table structure. + Pattern:

Label:

...

Value

+ """ + data = {} + + # Map label names in HTML to our keys + field_map = { + 'Firma': 'company', + 'Vorname': 'contact_first', + 'Nachname': 'contact_last', + 'Anrede': 'salutation', + 'E-Mail': 'email', + 'Rufnummer': 'phone', + 'Einsatzzweck': 'purpose', + 'Reinigungs-Funktionen': 'cleaning_functions', + 'Reinigungs-Fläche': 'area', + 'PLZ': 'zip', + 'Stadt': 'city', + 'Lead-ID': 'source_id' + } + + for label, key in field_map.items(): + pattern = fr'>\s*{re.escape(label)}:\s*

.*?]*>(.*?)

' + match = re.search(pattern, html_body, re.DOTALL | re.IGNORECASE) + if match: + raw_val = match.group(1).strip() + clean_val = re.sub(r'<[^>]+>', '', raw_val).strip() + data[key] = clean_val + + # Composite fields + if data.get('contact_first') and data.get('contact_last'): + data['contact'] = f"{data['contact_first']} {data['contact_last']}" + + # Quality Check: Free mail or missing company + email = data.get('email', '') + company = data.get('company', '-') + + data['is_free_mail'] = is_free_mail(email) + data['is_low_quality'] = data['is_free_mail'] or company == '-' or not company + + # Ensure source_id is present and map to 'id' for db.py compatibility + if not data.get('source_id'): + data['source_id'] = f"tt_unknown_{int(datetime.now().timestamp())}" + + data['id'] = data['source_id'] # db.py expects 'id' for source_id column + + return data + def parse_roboplanet_form(html_body): """ Parses the Roboplanet website contact form (HTML format). diff --git a/lead-engine/trading_twins_ingest.py b/lead-engine/trading_twins_ingest.py index d4897d7c..3444b6a7 100644 --- a/lead-engine/trading_twins_ingest.py +++ b/lead-engine/trading_twins_ingest.py @@ -72,68 +72,6 @@ def fetch_new_leads_emails(token, limit=200): )] return filtered -def is_free_mail(email_addr): - """Checks if an email belongs to a known free-mail provider.""" - if not email_addr: return False - free_domains = { - 'gmail.com', 'googlemail.com', 'outlook.com', 'hotmail.com', 'live.com', - 'msn.com', 'icloud.com', 'me.com', 'mac.com', 'yahoo.com', 'ymail.com', - 'rocketmail.com', 'gmx.de', 'gmx.net', 'web.de', 't-online.de', - 'freenet.de', 'mail.com', 'protonmail.com', 'proton.me', 'online.de' - } - domain = email_addr.split('@')[-1].lower() - return domain in free_domains - -def parse_tradingtwins_html(html_body): - """ - Extracts data from the Tradingtwins HTML table structure. - Pattern:

Label:

...

Value

- """ - data = {} - - # Map label names in HTML to our keys - field_map = { - 'Firma': 'company', - 'Vorname': 'contact_first', - 'Nachname': 'contact_last', - 'Anrede': 'salutation', - 'E-Mail': 'email', - 'Rufnummer': 'phone', - 'Einsatzzweck': 'purpose', - 'Reinigungs-Funktionen': 'cleaning_functions', - 'Reinigungs-Fläche': 'area', - 'PLZ': 'zip', - 'Stadt': 'city', - 'Lead-ID': 'source_id' - } - - for label, key in field_map.items(): - pattern = fr'>\s*{re.escape(label)}:\s*

.*?]*>(.*?)

' - match = re.search(pattern, html_body, re.DOTALL | re.IGNORECASE) - if match: - raw_val = match.group(1).strip() - clean_val = re.sub(r'<[^>]+>', '', raw_val).strip() - data[key] = clean_val - - # Composite fields - if data.get('contact_first') and data.get('contact_last'): - data['contact'] = f"{data['contact_first']} {data['contact_last']}" - - # Quality Check: Free mail or missing company - email = data.get('email', '') - company = data.get('company', '-') - - data['is_free_mail'] = is_free_mail(email) - data['is_low_quality'] = data['is_free_mail'] or company == '-' or not company - - # Ensure source_id is present and map to 'id' for db.py compatibility - if not data.get('source_id'): - data['source_id'] = f"tt_unknown_{int(datetime.now().timestamp())}" - - data['id'] = data['source_id'] # db.py expects 'id' for source_id column - - return data - def process_leads(auto_sync=False): init_db() new_count = 0