Files
Brancheneinstufung2/lead-engine/repair_leads.py

60 lines
1.8 KiB
Python

import sqlite3
import json
import re
import os
import sys
# Add path to import db
sys.path.append(os.path.dirname(__file__))
from db import get_leads, update_lead_metadata, init_db
def parse_tradingtwins_html_local(html_body):
    """Extract lead fields from the Tradingtwins HTML table structure.

    This is a local copy of the parsing logic so that this script stays
    independent of the code it was copied from.

    For each known label (``Einsatzzweck``, ``Reinigungs-Fläche``, ``PLZ``,
    ``Stadt``) the HTML is searched, case-insensitively, for a ``<p>``
    element ending in ``<label>:``. The content of the next ``<p>`` element
    after it is taken as the value, with nested tags and surrounding
    whitespace removed.

    Args:
        html_body: HTML text to search. ``None`` or an empty string is
            treated as "nothing to parse".

    Returns:
        A dict containing any of the keys ``purpose``, ``area``, ``zip`` and
        ``city`` whose label was found. Labels that do not occur are omitted,
        so the result may be empty.
    """
    # re.search() raises TypeError on None; a missing body simply has no data.
    if not html_body:
        return {}

    field_map = {
        'Einsatzzweck': 'purpose',
        'Reinigungs-Fläche': 'area',
        'PLZ': 'zip',
        'Stadt': 'city',
    }

    data = {}
    for label, key in field_map.items():
        # Label paragraph, then (lazily) anything up to the next <p ...> whose
        # inner HTML is captured as the raw value.
        pattern = fr'>\s*{re.escape(label)}:\s*</p>.*?<p[^>]*>(.*?)</p>'
        match = re.search(pattern, html_body, re.DOTALL | re.IGNORECASE)
        if match:
            raw_val = match.group(1).strip()
            # Drop inline markup such as <b>...</b> inside the value cell.
            clean_val = re.sub(r'<[^>]+>', '', raw_val).strip()
            data[key] = clean_val
    return data
def repair_database():
    """Backfill ``lead_metadata`` for leads where it is missing or empty.

    Calls ``init_db()`` first, then walks every lead returned by
    ``get_leads()``. A lead needs repair when its ``lead_metadata`` is falsy
    or equals the string ``'{}'`` or ``'null'``. For such a lead, a non-empty
    ``raw_body`` is parsed with ``parse_tradingtwins_html_local`` and the
    result is passed to ``update_lead_metadata``. Progress and the final
    number of updated leads are printed to stdout.
    """
    print("Initializing DB (migrating schema if needed)...")
    init_db()

    leads = get_leads()
    print(f"Found {len(leads)} leads to check.")

    count = 0
    for lead in leads:
        meta = lead.get('lead_metadata')
        needs_repair = not meta or meta == '{}' or meta == 'null'
        if not needs_repair:
            # Metadata already present; leave this lead untouched.
            continue

        print(f"Repairing Lead {lead['id']} ({lead['company_name']})...")

        body = lead.get('raw_body', '')
        if not body:
            print(" -> No raw body found.")
            continue

        extracted = parse_tradingtwins_html_local(body)
        update_lead_metadata(lead['id'], extracted)
        print(f" -> Extracted: {extracted}")
        count += 1

    print(f"Repaired {count} leads.")
# Run the repair only when this file is executed as a script, not on import.
if __name__ == "__main__":
    repair_database()