feat(company-explorer): add impressum scraping, robust json parsing, and enhanced ui polling

- Implemented Impressum scraping with Root-URL fallback and enhanced keyword detection.
- Added 'clean_json_response' helper to strip Markdown from LLM outputs, preventing JSONDecodeErrors.
- Improved numeric extraction for German formatting (thousands separators vs decimals).
- Updated Inspector UI with polling logic for auto-refresh and display of the AI Dossier and Legal Data.
- Added Manual Override for Website URL.
This commit is contained in:
2026-01-08 11:59:11 +00:00
parent a43b01bb6e
commit dbc3ce9b34
5 changed files with 296 additions and 49 deletions

View File

@@ -383,6 +383,18 @@ def run_analysis_task(company_id: int, url: str):
)
db.add(new_signal)
# Save Full Analysis Blob (Business Model + Evidence)
existing_analysis = db.query(EnrichmentData).filter(
EnrichmentData.company_id == company.id,
EnrichmentData.source_type == "ai_analysis"
).first()
if not existing_analysis:
db.add(EnrichmentData(company_id=company.id, source_type="ai_analysis", content=analysis))
else:
existing_analysis.content = analysis
existing_analysis.updated_at = datetime.utcnow()
company.status = "ENRICHED"
company.last_classification_at = datetime.utcnow()
logger.info(f"Robotics analysis complete for {company.name}.")

View File

@@ -124,6 +124,7 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
"""
Extracts a numeric value from a string, handling 'Mio', 'Mrd', etc.
Returns string representation of the number or 'k.A.'.
Handles German number formatting (1.000 = 1000, 1,5 = 1.5).
"""
if not raw_value:
return "k.A."
@@ -134,25 +135,50 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
# Simple multiplier handling
multiplier = 1.0
if 'mrd' in raw_value or 'billion' in raw_value:
if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
multiplier = 1000.0 if is_umsatz else 1000000000.0
elif 'mio' in raw_value or 'million' in raw_value:
elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
multiplier = 1.0 if is_umsatz else 1000000.0
elif 'tsd' in raw_value or 'thousand' in raw_value:
multiplier = 0.001 if is_umsatz else 1000.0
# Extract number
# Matches 123,45 or 123.45
matches = re.findall(r'(\d+[.,]?\d*)', raw_value)
# Extract number candidates
# Regex for "1.000,50" or "1,000.50" or "1000"
matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value)
if not matches:
return "k.A."
try:
# Take the first number found
num_str = matches[0].replace(',', '.')
# Fix for thousands separator if like 1.000.000 -> 1000000
if num_str.count('.') > 1:
num_str = num_str.replace('.', '')
num_str = matches[0]
# Heuristic for German formatting (1.000,00) vs English (1,000.00)
# If it contains both, the last separator is likely the decimal
if '.' in num_str and ',' in num_str:
if num_str.rfind(',') > num_str.rfind('.'):
# German: 1.000,00 -> remove dots, replace comma with dot
num_str = num_str.replace('.', '').replace(',', '.')
else:
# English: 1,000.00 -> remove commas
num_str = num_str.replace(',', '')
elif '.' in num_str:
# Ambiguous: 1.005 could be 1005 or 1.005
# Assumption: If it's employees (integer), and looks like "1.xxx", it's likely thousands
parts = num_str.split('.')
if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz:
# Likely thousands separator for employees (e.g. 1.005)
num_str = num_str.replace('.', '')
elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3:
# For revenue, 375.6 vs 1.000 is tricky.
# But usually revenue in millions is small numbers with decimals (250.5).
# Large integers usually mean thousands.
# Let's assume dot is decimal for revenue unless context implies otherwise,
# but for "375.6" it works. For "1.000" it becomes 1.0.
# Let's keep dot as decimal for revenue by default unless we detect multiple dots
if num_str.count('.') > 1:
num_str = num_str.replace('.', '')
elif ',' in num_str:
# German decimal: 1,5 -> 1.5
num_str = num_str.replace(',', '.')
val = float(num_str) * multiplier
@@ -173,6 +199,20 @@ def fuzzy_similarity(str1: str, str2: str) -> float:
return 0.0
return fuzz.ratio(str1, str2) / 100.0
def clean_json_response(response_text: str) -> str:
    """
    Clean an LLM response so it can be passed to ``json.loads``.

    LLMs frequently wrap JSON payloads in Markdown code fences
    (```json ... ```), which makes the raw text invalid JSON. This strips
    the opening and closing fences and surrounding whitespace.

    Args:
        response_text: Raw text returned by the LLM (may be empty or None).

    Returns:
        The cleaned JSON string, or "{}" for empty input so callers can
        always hand the result to json.loads without a None check.
    """
    if not response_text:
        return "{}"
    # Strip an opening ```json fence; IGNORECASE covers models that emit
    # ```JSON or ```Json.
    cleaned = re.sub(r'^```json\s*', '', response_text,
                     flags=re.MULTILINE | re.IGNORECASE)
    # Strip a bare opening fence (```) and any closing fence.
    cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'\s*```$', '', cleaned, flags=re.MULTILINE)
    return cleaned.strip()
# ==============================================================================
# 3. LLM WRAPPER (GEMINI)

View File

@@ -2,7 +2,7 @@ import json
import logging
import os
from typing import Dict, Any, List
from ..lib.core_utils import call_gemini
from ..lib.core_utils import call_gemini, clean_json_response
from ..config import settings
from ..database import SessionLocal, RoboticsCategory
@@ -55,7 +55,7 @@ class ClassificationService:
prompt = f"""
You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor.
Your task is to analyze a target company based on their website text to determine their **operational need** for service robotics.
Your task is to analyze the target company based on their website text and create a concise **Dossier**.
--- TARGET COMPANY ---
Name: {company_name}
@@ -66,36 +66,33 @@ class ClassificationService:
You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
{json.dumps(self.allowed_industries, ensure_ascii=False)}
--- ANALYSIS GUIDELINES (CHAIN OF THOUGHT) ---
1. **Infrastructure Analysis:** What physical assets does this company likely operate based on their business model?
- Factories / Production Plants? (-> Needs Cleaning, Security, Intralogistics)
- Large Warehouses? (-> Needs Intralogistics, Security, Floor Washing)
- Offices / Headquarters? (-> Needs Vacuuming, Window Cleaning)
- Critical Infrastructure (Solar Parks, Wind Farms)? (-> Needs Perimeter Security, Inspection)
- Hotels / Hospitals? (-> Needs Service, Cleaning, Transport)
2. **Provider vs. User Distinction (CRITICAL):**
- If a company SELLS cleaning products (e.g., 3M, Henkel), they do NOT necessarily have a higher need for cleaning robots than any other manufacturer. Do not score them high just because the word "cleaning" appears. Score them based on their *factories*.
- If a company SELLS security services, they might be a potential PARTNER, but check if they *manage* sites.
3. **Scale Assessment:**
- 5 locations implies more need than 1.
- "Global player" implies large facilities.
--- ANALYSIS PART 1: BUSINESS MODEL ---
1. Identify the core products/services.
2. Summarize in 2-3 German sentences: What do they do and for whom? (Target: "business_model")
--- SCORING CATEGORIES (0-100) ---
Based on the current strategic focus of Roboplanet:
--- ANALYSIS PART 2: INFRASTRUCTURE & POTENTIAL (Chain of Thought) ---
1. **Infrastructure Scan:** Look for evidence of physical assets like *Factories, Large Warehouses, Production Lines, Campuses, Hospitals*.
2. **Provider vs. User Check:**
- Does the company USE this infrastructure (Potential Customer)?
- Or do they SELL products for it (Competitor/Partner)?
- *Example:* "Cleaning" -> Do they sell soap (Provider) or do they have a 50,000sqm factory (User)?
3. **Evidence Extraction:** Extract 1-2 key sentences from the text proving this infrastructure. (Target: "infrastructure_evidence")
--- ANALYSIS PART 3: SCORING (0-100) ---
Based on the identified infrastructure, score the potential for these categories:
{category_guidance}
--- OUTPUT FORMAT (JSON ONLY) ---
{{
"industry": "String (from list)",
"summary": "Concise analysis of their infrastructure and business model (German)",
"business_model": "2-3 sentences summary (German)",
"infrastructure_evidence": "1-2 key sentences proving physical assets (German)",
"potentials": {{
"cleaning": {{ "score": 0-100, "reason": "Specific reasoning based on infrastructure (e.g. 'Operates 5 production plants in DE')." }},
"transport": {{ "score": 0-100, "reason": "..." }},
"security": {{ "score": 0-100, "reason": "..." }},
"service": {{ "score": 0-100, "reason": "..." }}
"cleaning": {{ "score": 0-100, "reason": "Reasoning based on infrastructure." }},
"transport": {{ "score": 0-100, "reason": "Reasoning based on logistics volume." }},
"security": {{ "score": 0-100, "reason": "Reasoning based on perimeter/assets." }},
"service": {{ "score": 0-100, "reason": "Reasoning based on guest interaction." }}
}}
}}
"""
@@ -106,7 +103,7 @@ class ClassificationService:
json_mode=True,
temperature=0.1 # Very low temp for analytical reasoning
)
return json.loads(response_text)
return json.loads(clean_json_response(response_text))
except Exception as e:
logger.error(f"Classification failed: {e}")
return {"error": str(e)}

View File

@@ -2,9 +2,11 @@ import logging
import requests
import random
import re
import json
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from typing import Optional, Dict
from ..lib.core_utils import clean_text, retry_on_failure
from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response
logger = logging.getLogger(__name__)
@@ -22,6 +24,7 @@ class ScraperService:
def scrape_url(self, url: str) -> Dict[str, str]:
"""
Fetches a URL and returns cleaned text content + meta info.
Also attempts to find and scrape the Impressum (Imprint).
"""
if not url.startswith("http"):
url = "https://" + url
@@ -38,7 +41,36 @@ class ScraperService:
logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
return {"error": "Not HTML"}
return self._parse_html(response.content)
# Parse Main Page
result = self._parse_html(response.content)
# --- IMPRESSUM LOGIC ---
soup = BeautifulSoup(response.content, 'html.parser')
impressum_url = self._find_impressum_link(soup, url)
# FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
if not impressum_url and url.count('/') > 3:
try:
parsed = urlparse(url)
root_url = f"{parsed.scheme}://{parsed.netloc}/"
logger.info(f"No Impressum on deep URL. Checking Root: {root_url}")
root_resp = requests.get(root_url, headers=headers, timeout=10, verify=False)
if root_resp.status_code == 200:
root_soup = BeautifulSoup(root_resp.content, 'html.parser')
impressum_url = self._find_impressum_link(root_soup, root_url)
except Exception as ex:
logger.warning(f"Root URL fallback failed: {ex}")
if impressum_url:
logger.info(f"Found Impressum URL: {impressum_url}")
impressum_data = self._scrape_impressum_data(impressum_url)
result["impressum"] = impressum_data
else:
logger.info(f"No Impressum link found for {url}")
result["impressum"] = None
return result
except requests.exceptions.SSLError:
# Retry with HTTP if HTTPS fails
@@ -50,13 +82,96 @@ class ScraperService:
logger.error(f"Scraping failed for {url}: {e}")
return {"error": str(e)}
def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
    """
    Locate the site's legal-notice ("Impressum") page among all anchors.

    Link text and href are matched against a fixed keyword list;
    mailto:/tel:/javascript: pseudo-links are skipped. Candidates whose
    text or href literally contain "impressum" are ranked highest.

    Returns the absolute URL of the best candidate, or None if no link
    matches.
    """
    keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
    scored_links = []
    for anchor in soup.find_all('a', href=True):
        label = clean_text(anchor.get_text()).lower()
        target = anchor['href'].lower()
        # Keep only anchors whose visible text or href mentions a keyword.
        if not (any(kw in label for kw in keywords) or any(kw in target for kw in keywords)):
            continue
        # Pseudo-links cannot point to an Impressum page.
        if any(scheme in target for scheme in ("mailto:", "tel:", "javascript:")):
            continue
        absolute = urljoin(base_url, anchor['href'])
        # The literal word "impressum" beats generic keywords; a match in
        # the link text beats one in the href.
        rank = (10 if "impressum" in label else 0) + (5 if "impressum" in target else 0)
        scored_links.append((rank, absolute))
    if not scored_links:
        return None
    scored_links.sort(key=lambda entry: entry[0], reverse=True)
    winner = scored_links[0][1]
    logger.info(f"Impressum Link Selection: Found {len(scored_links)} candidates. Winner: {winner}")
    return winner
def _scrape_impressum_data(self, url: str) -> Optional[Dict[str, str]]:
    """
    Fetch the Impressum page and extract structured company details via LLM.

    Args:
        url: Absolute URL of the Impressum page.

    Returns:
        Dict with keys 'legal_name', 'street', 'zip', 'city', 'email',
        'phone', 'ceo_name' (values may be null), or None when the fetch
        or the LLM extraction fails. (Annotation fixed: the error path
        returns None, so the return type is Optional.)
    """
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS certificate validation —
        # kept for parity with the rest of the scraper, but worth revisiting.
        response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Aggressively drop non-content elements so the LLM only sees the
        # legal text itself.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav']):
            element.decompose()
        raw_text = soup.get_text(separator=' ', strip=True)[:10000]  # Limit context
        # LLM Extraction
        prompt = f"""
Extract the official company details from this German 'Impressum' text.
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.
If a field is missing, use null.
Text:
{raw_text}
"""
        response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
        return json.loads(clean_json_response(response_text))
    except Exception as e:
        logger.error(f"Impressum scrape failed for {url}: {e}")
        return None
def _parse_html(self, html_content: bytes) -> Dict[str, str]:
soup = BeautifulSoup(html_content, 'html.parser')
# 1. Cleanup Junk
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
# 1. Cleanup Junk (Aggressive, matching legacy logic)
# Removed 'a' tags to prevent menu links from polluting the text analysis
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
element.decompose()
# 1b. Remove common Cookie Banners / Popups by class/id heuristics
for div in soup.find_all("div"):
classes = str(div.get("class", "")).lower()
ids = str(div.get("id", "")).lower()
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
div.decompose()
# 2. Extract Title & Meta Description
title = soup.title.string if soup.title else ""
meta_desc = ""

View File

@@ -38,25 +38,52 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
const [loading, setLoading] = useState(false)
const [isProcessing, setIsProcessing] = useState(false)
// Polling Logic
useEffect(() => {
let interval: NodeJS.Timeout;
if (isProcessing) {
interval = setInterval(() => {
fetchData(true) // Silent fetch
}, 2000)
}
return () => clearInterval(interval)
}, [isProcessing, companyId]) // Dependencies
// Manual Override State
const [isEditingWiki, setIsEditingWiki] = useState(false)
const [wikiUrlInput, setWikiUrlInput] = useState("")
const [isEditingWebsite, setIsEditingWebsite] = useState(false)
const [websiteInput, setWebsiteInput] = useState("")
const fetchData = () => {
const fetchData = (silent = false) => {
if (!companyId) return
setLoading(true)
if (!silent) setLoading(true)
axios.get(`${apiBase}/companies/${companyId}`)
.then(res => setData(res.data))
.then(res => {
const newData = res.data
setData(newData)
// Auto-stop processing if status changes to ENRICHED or we see data
if (isProcessing) {
const hasWiki = newData.enrichment_data?.some((e:any) => e.source_type === 'wikipedia')
const hasAnalysis = newData.enrichment_data?.some((e:any) => e.source_type === 'ai_analysis')
// If we were waiting for Discover (Wiki) or Analyze (AI)
if ((hasWiki && newData.status === 'DISCOVERED') || (hasAnalysis && newData.status === 'ENRICHED')) {
setIsProcessing(false)
}
}
})
.catch(console.error)
.finally(() => setLoading(false))
.finally(() => { if (!silent) setLoading(false) })
}
useEffect(() => {
fetchData()
setIsEditingWiki(false)
setIsEditingWebsite(false)
setIsProcessing(false) // Reset on ID change
}, [companyId])
const handleDiscover = async () => {
@@ -64,10 +91,9 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
setIsProcessing(true)
try {
await axios.post(`${apiBase}/enrich/discover`, { company_id: companyId })
setTimeout(fetchData, 3000)
// Polling effect will handle the rest
} catch (e) {
console.error(e)
} finally {
setIsProcessing(false)
}
}
@@ -77,10 +103,9 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
setIsProcessing(true)
try {
await axios.post(`${apiBase}/enrich/analyze`, { company_id: companyId })
setTimeout(fetchData, 5000)
// Polling effect will handle the rest
} catch (e) {
console.error(e)
} finally {
setIsProcessing(false)
}
}
@@ -120,6 +145,11 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
const wikiEntry = data?.enrichment_data?.find(e => e.source_type === 'wikipedia')
const wiki = wikiEntry?.content
const isLocked = wikiEntry?.is_locked
const aiAnalysis = data?.enrichment_data?.find(e => e.source_type === 'ai_analysis')?.content
const scrapeData = data?.enrichment_data?.find(e => e.source_type === 'website_scrape')?.content
const impressum = scrapeData?.impressum
return (
<div className="fixed inset-y-0 right-0 w-[550px] bg-slate-900 border-l border-slate-800 shadow-2xl transform transition-transform duration-300 ease-in-out z-40 overflow-y-auto">
@@ -135,7 +165,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
<h2 className="text-xl font-bold text-white leading-tight">{data.name}</h2>
<div className="flex items-center gap-2">
<button
onClick={fetchData}
onClick={() => fetchData(true)}
className="p-1.5 text-slate-500 hover:text-white transition-colors"
title="Refresh"
>
@@ -227,6 +257,59 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
</div>
<div className="p-6 space-y-8">
{/* Impressum / Legal Data (NEW) */}
{impressum && (
<div className="bg-slate-950 rounded-lg p-4 border border-slate-800 flex flex-col gap-2">
<div className="flex items-center gap-2 mb-1">
<div className="p-1 bg-slate-800 rounded text-slate-400">
<Briefcase className="h-3 w-3" />
</div>
<span className="text-[10px] uppercase font-bold text-slate-500 tracking-wider">Official Legal Data</span>
</div>
<div className="text-sm font-medium text-white">
{impressum.legal_name || "Unknown Legal Name"}
</div>
<div className="flex items-start gap-2 text-xs text-slate-400">
<MapPin className="h-3 w-3 mt-0.5 shrink-0" />
<div>
<div>{impressum.street}</div>
<div>{impressum.zip} {impressum.city}</div>
</div>
</div>
{(impressum.email || impressum.phone) && (
<div className="mt-2 pt-2 border-t border-slate-900 flex gap-4 text-[10px] text-slate-500 font-mono">
{impressum.email && <span>{impressum.email}</span>}
{impressum.phone && <span>{impressum.phone}</span>}
</div>
)}
</div>
)}
{/* AI Analysis Dossier (NEW) */}
{aiAnalysis && (
<div className="space-y-4">
<h3 className="text-sm font-semibold text-slate-400 uppercase tracking-wider flex items-center gap-2">
<Bot className="h-4 w-4" /> AI Strategic Dossier
</h3>
<div className="bg-slate-800/30 rounded-xl p-5 border border-slate-800/50 space-y-4">
<div>
<div className="text-[10px] text-blue-400 uppercase font-bold tracking-tight mb-1">Business Model</div>
<p className="text-sm text-slate-200 leading-relaxed">{aiAnalysis.business_model || "No summary available."}</p>
</div>
{aiAnalysis.infrastructure_evidence && (
<div className="pt-4 border-t border-slate-800/50">
<div className="text-[10px] text-orange-400 uppercase font-bold tracking-tight mb-1">Infrastructure Evidence</div>
<p className="text-sm text-slate-300 italic leading-relaxed">"{aiAnalysis.infrastructure_evidence}"</p>
</div>
)}
</div>
</div>
)}
{/* Wikipedia Section */}
<div className="space-y-4">
<div className="flex items-center justify-between">
@@ -309,7 +392,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
</div>
<div>
<div className="text-[10px] text-slate-500 uppercase font-bold tracking-tight">Revenue</div>
<div className="text-sm text-slate-200 font-medium">{wiki.umsatz ? `${wiki.umsatz} Mio. €` : 'k.A.'}</div>
<div className="text-sm text-slate-200 font-medium">{wiki.umsatz && wiki.umsatz !== 'k.A.' ? `${wiki.umsatz} Mio. €` : 'k.A.'}</div>
</div>
</div>