[31388f42] Final session polish: Refined UI, improved ingest parsing, and completed documentation

2026-03-02 15:10:12 +00:00
parent aa38c555d8
commit ee2dfd5b00
10 changed files with 171 additions and 224 deletions
--- a/lead-engine/app.py
+++ b/lead-engine/app.py
@@ -8,22 +8,40 @@ from enrich import run_sync, refresh_ce_data, sync_single_lead
 from generate_reply import generate_email_draft

 def clean_html_to_text(html_content):
-    """Simple helper to convert HTML email body to readable plain text."""
+    """Convert a Tradingtwins HTML email body to compact plain text, keeping only the 'Datum:' data block when that label is found."""
    if not html_content:
        return ""
-    # Remove head and style tags entirely
+    
+    # 1. Strip head and style
    clean = re.sub(r'<head.*?>.*?</head>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
    clean = re.sub(r'<style.*?>.*?</style>', '', clean, flags=re.DOTALL | re.IGNORECASE)
-    # Replace <br> and </p> with newlines
+    
+    # 2. Extract the core data block: from the first 'Datum:' label up to the
+    # 'Kennen Sie schon Ihr persönliches Konto' marker (or to the end of the text if that marker is missing)
+    start_match = re.search(r'Datum:', clean, re.IGNORECASE)
+    end_match = re.search(r'Kennen Sie schon Ihr persönliches Konto', clean, re.IGNORECASE)
+    
+    if start_match:
+        start_pos = start_match.start()
+        end_pos = end_match.start() if end_match else len(clean)
+        clean = clean[start_pos:end_pos]
+    
+    # 3. Format Table Structure: a </td><td> boundary becomes two spaces, </tr> becomes a newline
+    # This prevents the "Label on one line, value on next" issue
+    clean = re.sub(r'</td>\s*<td.*?>', '  ', clean, flags=re.IGNORECASE)
+    clean = re.sub(r'</tr>', '\n', clean, flags=re.IGNORECASE)
+    
+    # 4. Standard Cleanup
    clean = re.sub(r'<br\s*/?>', '\n', clean, flags=re.IGNORECASE)
    clean = re.sub(r'</p>', '\n', clean, flags=re.IGNORECASE)
-    # Remove all other tags
    clean = re.sub(r'<.*?>', '', clean)
-    # Decode some common entities
-    clean = clean.replace('&nbsp;', ' ').replace('&amp;', '&').replace('&quot;', '"')
-    # Cleanup multiple newlines
-    clean = re.sub(r'\n\s*\n+', '\n\n', clean).strip()
-    return clean
+    
+    # 5. Entity Decoding
+    clean = clean.replace('&nbsp;', ' ').replace('&amp;', '&').replace('&quot;', '"').replace('&gt;', '>')
+    
+    # 6. Final Polish: remove empty lines and leading/trailing whitespace
+    lines = [line.strip() for line in clean.split('\n') if line.strip()]
+    return '\n'.join(lines)

 st.set_page_config(page_title="TradingTwins Lead Engine", layout="wide")

@@ -140,13 +158,15 @@ if not df.empty:
            if meta.get('is_low_quality'):
                st.warning("⚠️ **Low Quality Lead detected** (Free-mail or missing company).")

-            # --- SECTION 1: LEAD INFO (2 Columns) ---
-            st.markdown("### 📋 Lead Data")
-            c1, c2 = st.columns(2)
+            # --- SECTION 1: LEAD INFO & INTELLIGENCE ---
+            col_lead, col_intel = st.columns(2)
            
-            with c1:
+            with col_lead:
+                st.markdown("### 📋 Lead Data")
+                st.write(f"**Salutation:** {meta.get('salutation', '-')}")
                st.write(f"**Contact:** {row['contact_name']}")
                st.write(f"**Email:** {row['email']}")
+                st.write(f"**Phone:** {meta.get('phone', row.get('phone', '-'))}")
                
                role = meta.get('role')
                if role:
@@ -158,58 +178,56 @@ if not df.empty:
                            found_role = enrich_contact_role(row)
                            if found_role: st.success(f"Found: {found_role}"); st.rerun()
                            else: st.error("No role found.")
-            
-            with c2:
+                
                st.write(f"**Area:** {meta.get('area', '-')}")
                st.write(f"**Purpose:** {meta.get('purpose', '-')}")
+                st.write(f"**Functions:** {meta.get('cleaning_functions', '-')}")
                st.write(f"**Location:** {meta.get('zip', '')} {meta.get('city', '')}")
-                
-                with st.expander("Original Body Preview"):
-                    st.text(clean_html_to_text(row['raw_body']))
-                    if st.checkbox("Show HTML", key=f"raw_{row['id']}"):
-                        st.code(row['raw_body'], language="html")

-            st.divider()
-
-            # --- SECTION 2: INTELLIGENCE (CE) ---
-            st.markdown("### 🔍 Intelligence (CE)")
-            enrichment = json.loads(row['enrichment_data']) if row['enrichment_data'] else {}
-            ce_id = enrichment.get('ce_id')
-            
-            if ce_id:
-                st.success(f"✅ Linked to Company Explorer (ID: {ce_id})")
-                ce_data = enrichment.get('ce_data', {})
+            with col_intel:
+                st.markdown("### 🔍 Intelligence (CE)")
+                enrichment = json.loads(row['enrichment_data']) if row['enrichment_data'] else {}
+                ce_id = enrichment.get('ce_id')
                
-                vertical = ce_data.get('industry_ai') or ce_data.get('vertical')
-                summary = ce_data.get('research_dossier') or ce_data.get('summary')
-                
-                intel_col1, intel_col2 = st.columns([1, 2])
-                with intel_col1:
+                if ce_id:
+                    st.success(f"✅ Linked to Company Explorer (ID: {ce_id})")
+                    ce_data = enrichment.get('ce_data', {})
+                    
+                    vertical = ce_data.get('industry_ai') or ce_data.get('vertical')
+                    summary = ce_data.get('research_dossier') or ce_data.get('summary')
+                    
                    if vertical and vertical != 'None':
                        st.info(f"**Industry:** {vertical}")
                    else:
                        st.warning("Industry Analysis pending...")
                    
+                    if summary:
+                        with st.expander("Show AI Research Dossier", expanded=True):
+                            st.write(summary)
+                    
                    if st.button("🔄 Refresh CE Data", key=f"refresh_{row['id']}"):
                        with st.spinner("Fetching..."):
                            refresh_ce_data(row['id'], ce_id)
                            st.rerun()
-                
-                with intel_col2:
-                    if summary:
-                        with st.expander("Show AI Research Dossier", expanded=True):
-                            st.write(summary)
-            else:
-                st.warning("⚠️ Not synced with Company Explorer yet")
-                if st.button("🚀 Sync to Company Explorer", key=f"sync_single_{row['id']}"):
-                    with st.spinner("Syncing..."):
-                        sync_single_lead(row['id'])
-                        st.rerun()
+                else:
+                    st.warning("⚠️ Not synced with Company Explorer yet")
+                    if st.button("🚀 Sync to Company Explorer", key=f"sync_single_{row['id']}"):
+                        with st.spinner("Syncing..."):
+                            sync_single_lead(row['id'])
+                            st.rerun()

            st.divider()

-            # --- SECTION 3: RESPONSE DRAFT ---
-            st.markdown("### ✉️ Response Draft")
+            # --- SECTION 2: ORIGINAL EMAIL ---
+            with st.expander("✉️ View Original Email Content"):
+                st.text(clean_html_to_text(row['raw_body']))
+                if st.checkbox("Show Raw HTML", key=f"raw_{row['id']}"):
+                    st.code(row['raw_body'], language="html")
+
+            st.divider()
+
+            # --- SECTION 3: RESPONSE DRAFT (Full Width) ---
+            st.markdown("### 📝 Response Draft")
            if row['status'] != 'new' and ce_id:
                if st.button("✨ Generate Expert Reply", key=f"gen_{row['id']}", type="primary"):
                    with st.spinner("Writing email..."):