feat(company-explorer): add impressum scraping, robust json parsing, and enhanced ui polling

- Implemented Impressum scraping with root-URL fallback and enhanced keyword detection.
- Added 'clean_json_response' helper to strip Markdown code fences from LLM outputs, preventing JSONDecodeErrors.
- Improved numeric extraction for German number formatting (thousands separators vs. decimals).
- Updated Inspector UI with polling logic for auto-refresh and display of the AI Dossier and Legal Data.
- Added Manual Override for the Website URL.
2026-01-08 11:59:11 +00:00
parent a43b01bb6e
commit dbc3ce9b34
5 changed files with 296 additions and 49 deletions
--- a/company-explorer/backend/app.py
+++ b/company-explorer/backend/app.py
@@ -383,6 +383,18 @@ def run_analysis_task(company_id: int, url: str):
                    )
                    db.add(new_signal)
                # Save Full Analysis Blob (Business Model + Evidence)
                existing_analysis = db.query(EnrichmentData).filter(
                    EnrichmentData.company_id == company.id,
                    EnrichmentData.source_type == "ai_analysis"
                ).first()
                if not existing_analysis:
                    db.add(EnrichmentData(company_id=company.id, source_type="ai_analysis", content=analysis))
                else:
                    existing_analysis.content = analysis
                    existing_analysis.updated_at = datetime.utcnow()
                company.status = "ENRICHED"
                company.last_classification_at = datetime.utcnow()
                logger.info(f"Robotics analysis complete for {company.name}.")
--- a/company-explorer/backend/lib/core_utils.py
+++ b/company-explorer/backend/lib/core_utils.py
@@ -124,6 +124,7 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    """
    Extracts a numeric value from a string, handling 'Mio', 'Mrd', etc.
    Returns string representation of the number or 'k.A.'.
    Handles German number formatting (1.000 = 1000, 1,5 = 1.5).
    """
    if not raw_value:
        return "k.A."
@@ -134,25 +135,50 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    # Simple multiplier handling
    multiplier = 1.0
-    if 'mrd' in raw_value or 'billion' in raw_value:
+    if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
        multiplier = 1000.0 if is_umsatz else 1000000000.0
-    elif 'mio' in raw_value or 'million' in raw_value:
+    elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
        multiplier = 1.0 if is_umsatz else 1000000.0
    elif 'tsd' in raw_value or 'thousand' in raw_value:
        multiplier = 0.001 if is_umsatz else 1000.0
-    # Extract number
+    # Extract number candidates
-    # Matches 123,45 or 123.45
+    # Regex for "1.000,50" or "1,000.50" or "1000"
-    matches = re.findall(r'(\d+[.,]?\d*)', raw_value)
+    matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value)
    if not matches:
        return "k.A."
    try:
-        # Take the first number found
+        num_str = matches[0]
-        num_str = matches[0].replace(',', '.')
+        
-        # Fix for thousands separator if like 1.000.000 -> 1000000
+        # Heuristic for German formatting (1.000,00) vs English (1,000.00)
        # If it contains both, the last separator is likely the decimal
        if '.' in num_str and ',' in num_str:
            if num_str.rfind(',') > num_str.rfind('.'):
                # German: 1.000,00 -> remove dots, replace comma with dot
                num_str = num_str.replace('.', '').replace(',', '.')
            else:
                # English: 1,000.00 -> remove commas
                num_str = num_str.replace(',', '')
        elif '.' in num_str:
            # Ambiguous: 1.005 could be 1005 or 1.005
            # Assumption: If it's employees (integer), and looks like "1.xxx", it's likely thousands
            parts = num_str.split('.')
            if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz:
                 # Likely thousands separator for employees (e.g. 1.005)
                 num_str = num_str.replace('.', '')
            elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3:
                 # For revenue, 375.6 vs 1.000 is tricky. 
                 # But usually revenue in millions is small numbers with decimals (250.5).
                 # Large integers usually mean thousands.
                 # Let's assume dot is decimal for revenue unless context implies otherwise, 
                 # but for "375.6" it works. For "1.000" it becomes 1.0.
                 # Let's keep dot as decimal for revenue by default unless we detect multiple dots
                 if num_str.count('.') > 1:
                     num_str = num_str.replace('.', '')
        elif ',' in num_str:
            # German decimal: 1,5 -> 1.5
            num_str = num_str.replace(',', '.')
        val = float(num_str) * multiplier
@@ -173,6 +199,20 @@ def fuzzy_similarity(str1: str, str2: str) -> float:
        return 0.0
    return fuzz.ratio(str1, str2) / 100.0
 def clean_json_response(response_text: str) -> str:
    """
    Strip Markdown code fences from an LLM response so it can be fed to json.loads.

    If the response contains a complete fenced block (```json ... ```, with any
    or no language tag, in any letter case), only the content of the first such
    block is returned; surrounding prose is discarded. Otherwise a stray leading
    and/or trailing fence is removed and the remaining text is returned.

    Args:
        response_text: Raw text returned by the LLM.

    Returns:
        The stripped payload text, or "{}" when response_text is empty/None.
        The result is NOT validated as JSON; callers still need json.loads.
    """
    if not response_text:
        return "{}"
    # Preferred path: a complete fenced block. The opening fence must start a
    # line and the closing fence must end one, so backticks embedded inside a
    # JSON string value do not terminate the match early.
    fenced = re.search(
        r'^[ \t]*```[A-Za-z0-9_+-]*[ \t]*\r?\n?(.*?)```[ \t]*(?:\r?\n|\Z)',
        response_text,
        flags=re.DOTALL | re.MULTILINE,
    )
    if fenced:
        return fenced.group(1).strip()
    # Fallback: unterminated or trailing-only fence (e.g. truncated output).
    cleaned = re.sub(r'\A\s*```[A-Za-z0-9_+-]*[ \t]*', '', response_text)
    cleaned = re.sub(r'\s*```\s*\Z', '', cleaned)
    return cleaned.strip()
 # ==============================================================================
 # 3. LLM WRAPPER (GEMINI)
--- a/company-explorer/backend/services/classification.py
+++ b/company-explorer/backend/services/classification.py
@@ -2,7 +2,7 @@ import json
 import logging
 import os
 from typing import Dict, Any, List
-from ..lib.core_utils import call_gemini
+from ..lib.core_utils import call_gemini, clean_json_response
 from ..config import settings
 from ..database import SessionLocal, RoboticsCategory
@@ -55,7 +55,7 @@ class ClassificationService:
        prompt = f"""
        You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor.
-        Your task is to analyze a target company based on their website text to determine their **operational need** for service robotics.
+        Your task is to analyze the target company based on their website text and create a concise **Dossier**.
        --- TARGET COMPANY ---
        Name: {company_name}
@@ -66,36 +66,33 @@ class ClassificationService:
        You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
        {json.dumps(self.allowed_industries, ensure_ascii=False)}
-        --- ANALYSIS GUIDELINES (CHAIN OF THOUGHT) ---
+        --- ANALYSIS PART 1: BUSINESS MODEL ---
-        1. **Infrastructure Analysis:** What physical assets does this company likely operate based on their business model? 
+        1. Identify the core products/services.
-           - Factories / Production Plants? (-> Needs Cleaning, Security, Intralogistics)
+        2. Summarize in 2-3 German sentences: What do they do and for whom? (Target: "business_model")
           - Large Warehouses? (-> Needs Intralogistics, Security, Floor Washing)
           - Offices / Headquarters? (-> Needs Vacuuming, Window Cleaning)
           - Critical Infrastructure (Solar Parks, Wind Farms)? (-> Needs Perimeter Security, Inspection)
           - Hotels / Hospitals? (-> Needs Service, Cleaning, Transport)
-        2. **Provider vs. User Distinction (CRITICAL):**
+        --- ANALYSIS PART 2: INFRASTRUCTURE & POTENTIAL (Chain of Thought) ---
-           - If a company SELLS cleaning products (e.g., 3M, Henkel), they do NOT necessarily have a higher need for cleaning robots than any other manufacturer. Do not score them high just because the word "cleaning" appears. Score them based on their *factories*.
+        1. **Infrastructure Scan:** Look for evidence of physical assets like *Factories, Large Warehouses, Production Lines, Campuses, Hospitals*.
-           - If a company SELLS security services, they might be a potential PARTNER, but check if they *manage* sites.
+        2. **Provider vs. User Check:** 
           - Does the company USE this infrastructure (Potential Customer)?
           - Or do they SELL products for it (Competitor/Partner)? 
           - *Example:* "Cleaning" -> Do they sell soap (Provider) or do they have a 50,000sqm factory (User)?
        3. **Evidence Extraction:** Extract 1-2 key sentences from the text proving this infrastructure. (Target: "infrastructure_evidence")
-        3. **Scale Assessment:** 
+        --- ANALYSIS PART 3: SCORING (0-100) ---
-           - 5 locations implies more need than 1. 
+        Based on the identified infrastructure, score the potential for these categories:
           - "Global player" implies large facilities.
        --- SCORING CATEGORIES (0-100) ---
        Based on the current strategic focus of Roboplanet:
        {category_guidance}
        --- OUTPUT FORMAT (JSON ONLY) ---
        {{
            "industry": "String (from list)",
-            "summary": "Concise analysis of their infrastructure and business model (German)",
+            "business_model": "2-3 sentences summary (German)",
            "infrastructure_evidence": "1-2 key sentences proving physical assets (German)",
            "potentials": {{
-                "cleaning": {{ "score": 0-100, "reason": "Specific reasoning based on infrastructure (e.g. 'Operates 5 production plants in DE')." }},
+                "cleaning": {{ "score": 0-100, "reason": "Reasoning based on infrastructure." }},
-                "transport": {{ "score": 0-100, "reason": "..." }},
+                "transport": {{ "score": 0-100, "reason": "Reasoning based on logistics volume." }},
-                "security": {{ "score": 0-100, "reason": "..." }},
+                "security": {{ "score": 0-100, "reason": "Reasoning based on perimeter/assets." }},
-                "service": {{ "score": 0-100, "reason": "..." }}
+                "service": {{ "score": 0-100, "reason": "Reasoning based on guest interaction." }}
            }}
        }}
        """
@@ -106,7 +103,7 @@ class ClassificationService:
                json_mode=True,
                temperature=0.1 # Very low temp for analytical reasoning
            )
-            return json.loads(response_text)
+            return json.loads(clean_json_response(response_text))
        except Exception as e:
            logger.error(f"Classification failed: {e}")
            return {"error": str(e)}
--- a/company-explorer/backend/services/scraping.py
+++ b/company-explorer/backend/services/scraping.py
@@ -2,9 +2,11 @@ import logging
 import requests
 import random
 import re
 import json
 from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from typing import Optional, Dict
-from ..lib.core_utils import clean_text, retry_on_failure
+from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response
 logger = logging.getLogger(__name__)
@@ -22,6 +24,7 @@ class ScraperService:
    def scrape_url(self, url: str) -> Dict[str, str]:
        """
        Fetches a URL and returns cleaned text content + meta info.
        Also attempts to find and scrape the Impressum (Imprint).
        """
        if not url.startswith("http"):
            url = "https://" + url
@@ -38,7 +41,36 @@ class ScraperService:
                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
                return {"error": "Not HTML"}
-            return self._parse_html(response.content)
+            # Parse Main Page
            result = self._parse_html(response.content)
            # --- IMPRESSUM LOGIC ---
            soup = BeautifulSoup(response.content, 'html.parser')
            impressum_url = self._find_impressum_link(soup, url)
            # FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
            if not impressum_url and url.count('/') > 3:
                try:
                    parsed = urlparse(url)
                    root_url = f"{parsed.scheme}://{parsed.netloc}/"
                    logger.info(f"No Impressum on deep URL. Checking Root: {root_url}")
                    root_resp = requests.get(root_url, headers=headers, timeout=10, verify=False)
                    if root_resp.status_code == 200:
                        root_soup = BeautifulSoup(root_resp.content, 'html.parser')
                        impressum_url = self._find_impressum_link(root_soup, root_url)
                except Exception as ex:
                    logger.warning(f"Root URL fallback failed: {ex}")
            if impressum_url:
                logger.info(f"Found Impressum URL: {impressum_url}")
                impressum_data = self._scrape_impressum_data(impressum_url)
                result["impressum"] = impressum_data
            else:
                logger.info(f"No Impressum link found for {url}")
                result["impressum"] = None
            return result
        except requests.exceptions.SSLError:
            # Retry with HTTP if HTTPS fails
@@ -50,13 +82,96 @@ class ScraperService:
            logger.error(f"Scraping failed for {url}: {e}")
            return {"error": str(e)}
    def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
        """
        Pick the most likely Impressum/imprint link from a parsed page.

        Every <a href> whose visible text or href contains one of the legal
        keywords becomes a candidate (mailto:, tel: and javascript: links are
        skipped). Candidates are scored and the highest score wins; on a tie
        the link that appears first in the document is returned.

        Scoring:
            - "impressum" in link text: +10, in href: +5.
            - a specific imprint term ("imprint", "legal notice",
              "anbieterkennzeichnung") in text: +2, in href: +1.
            - generic terms ("legal", "rechtliches", "disclaimer") only
              qualify the link; they add no score.
          The specific-term bonus (max 3) never outranks an "impressum" hit
          (min 5); it only lets an explicit imprint link beat a generic
          "Legal"/"Disclaimer" link that happens to come earlier on the page.

        Args:
            soup: Parsed HTML of the page to scan.
            base_url: URL the page was fetched from; used to resolve relative hrefs.

        Returns:
            Absolute URL of the best candidate, or None if no link matched.
        """
        keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
        specific_terms = ("imprint", "legal notice", "anbieterkennzeichnung")
        excluded_schemes = ("mailto:", "tel:", "javascript:")

        candidates = []
        for a in soup.find_all('a', href=True):
            text = clean_text(a.get_text()).lower()
            href = a['href'].lower()

            # Only links mentioning a legal keyword in text or href qualify
            if not any(kw in text or kw in href for kw in keywords):
                continue
            # Contact / script pseudo-links cannot be scraped
            if any(scheme in href for scheme in excluded_schemes):
                continue

            score = 0
            # Prioritize 'impressum' in text over href
            if "impressum" in text:
                score += 10
            if "impressum" in href:
                score += 5
            if any(term in text for term in specific_terms):
                score += 2
            if any(term in href for term in specific_terms):
                score += 1

            # Resolve against the original-case href, not the lowercased copy
            candidates.append((score, urljoin(base_url, a['href'])))

        if not candidates:
            return None

        # max() returns the first maximal element, i.e. document order breaks ties
        best_match = max(candidates, key=lambda candidate: candidate[0])[1]
        logger.info(f"Impressum Link Selection: Found {len(candidates)} candidates. Winner: {best_match}")
        return best_match
    def _scrape_impressum_data(self, url: str) -> Optional[Dict[str, str]]:
        """
        Fetch the Impressum page and use the LLM to extract structured company data.

        The page is downloaded, boilerplate elements are removed, and at most the
        first 10000 characters of the remaining text are sent to call_gemini with
        a prompt asking for the keys 'legal_name', 'street', 'zip', 'city',
        'email', 'phone', 'ceo_name'.

        Args:
            url: Absolute URL of the Impressum page.

        Returns:
            The parsed JSON object returned by the LLM, or None when the request
            fails, the page has no text, or the LLM answer is not a JSON object.
            This method is best-effort: failures are logged and never raised.
        """
        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            # NOTE(review): verify=False disables TLS certificate validation - confirm this is intended
            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Aggressive cleaning for Impressum too
            for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav']):
                element.decompose()
            raw_text = soup.get_text(separator=' ', strip=True)[:10000] # Limit context
            if not raw_text:
                # Nothing to extract from; avoid a pointless LLM call
                logger.warning(f"Impressum page has no text content: {url}")
                return None
            # LLM Extraction
            prompt = f"""
            Extract the official company details from this German 'Impressum' text.
            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.
            If a field is missing, use null.
            Text:
            {raw_text}
            """
            response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
            data = json.loads(clean_json_response(response_text))
            if not isinstance(data, dict):
                # Callers read fields by key; a list/str/number answer is unusable
                logger.warning(f"Impressum extraction returned non-object JSON for {url}")
                return None
            return data
        except Exception as e:
            logger.error(f"Impressum scrape failed for {url}: {e}")
            return None
    def _parse_html(self, html_content: bytes) -> Dict[str, str]:
        soup = BeautifulSoup(html_content, 'html.parser')
-        # 1. Cleanup Junk
+        # 1. Cleanup Junk (Aggressive, matching legacy logic)
-        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
+        # Removed 'a' tags to prevent menu links from polluting the text analysis
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
            element.decompose()
        # 1b. Remove common Cookie Banners / Popups by class/id heuristics
        for div in soup.find_all("div"):
            classes = str(div.get("class", "")).lower()
            ids = str(div.get("id", "")).lower()
            if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
                div.decompose()
        # 2. Extract Title & Meta Description
        title = soup.title.string if soup.title else ""
        meta_desc = ""
--- a/company-explorer/frontend/src/components/Inspector.tsx
+++ b/company-explorer/frontend/src/components/Inspector.tsx
@@ -38,25 +38,52 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
  const [loading, setLoading] = useState(false)
  const [isProcessing, setIsProcessing] = useState(false)
  // Polling: while `isProcessing` is true, re-fetch the company every 2000 ms.
  useEffect(() => {
      let interval: NodeJS.Timeout;
      if (isProcessing) {
          interval = setInterval(() => {
              fetchData(true) // Silent fetch: skips the `loading` toggle in fetchData
          }, 2000)
      }
      // Cleanup: stop the timer before the effect re-runs or the component unmounts
      return () => clearInterval(interval)
  }, [isProcessing, companyId]) // Re-run when the processing flag or the selected company changes
  // Manual Override State
  const [isEditingWiki, setIsEditingWiki] = useState(false)
  const [wikiUrlInput, setWikiUrlInput] = useState("")
  const [isEditingWebsite, setIsEditingWebsite] = useState(false)
  const [websiteInput, setWebsiteInput] = useState("")
-  const fetchData = () => {
+  const fetchData = (silent = false) => {
    if (!companyId) return
-    setLoading(true)
+    if (!silent) setLoading(true)
    axios.get(`${apiBase}/companies/${companyId}`)
-      .then(res => setData(res.data))
+      .then(res => {
          const newData = res.data
          setData(newData)
          // Auto-stop processing if status changes to ENRICHED or we see data
          if (isProcessing) {
             const hasWiki = newData.enrichment_data?.some((e:any) => e.source_type === 'wikipedia')
             const hasAnalysis = newData.enrichment_data?.some((e:any) => e.source_type === 'ai_analysis')
             // If we were waiting for Discover (Wiki) or Analyze (AI)
             if ((hasWiki && newData.status === 'DISCOVERED') || (hasAnalysis && newData.status === 'ENRICHED')) {
                 setIsProcessing(false)
             }
          }
      })
      .catch(console.error)
-      .finally(() => setLoading(false))
+      .finally(() => { if (!silent) setLoading(false) })
  }
  useEffect(() => {
    fetchData()
    setIsEditingWiki(false)
    setIsEditingWebsite(false)
    setIsProcessing(false) // Reset on ID change
  }, [companyId])
  const handleDiscover = async () => {
@@ -64,10 +91,9 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
    setIsProcessing(true)
    try {
      await axios.post(`${apiBase}/enrich/discover`, { company_id: companyId })
-      setTimeout(fetchData, 3000)
+      // Polling effect will handle the rest
    } catch (e) {
      console.error(e)
    } finally {
      setIsProcessing(false)
    }
  }
@@ -77,10 +103,9 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
    setIsProcessing(true)
    try {
      await axios.post(`${apiBase}/enrich/analyze`, { company_id: companyId })
-      setTimeout(fetchData, 5000)
+      // Polling effect will handle the rest
    } catch (e) {
      console.error(e)
    } finally {
      setIsProcessing(false)
    }
  }
@@ -121,6 +146,11 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
  const wiki = wikiEntry?.content
  const isLocked = wikiEntry?.is_locked
  const aiAnalysis = data?.enrichment_data?.find(e => e.source_type === 'ai_analysis')?.content
  const scrapeData = data?.enrichment_data?.find(e => e.source_type === 'website_scrape')?.content
  const impressum = scrapeData?.impressum
  return (
    <div className="fixed inset-y-0 right-0 w-[550px] bg-slate-900 border-l border-slate-800 shadow-2xl transform transition-transform duration-300 ease-in-out z-40 overflow-y-auto">
      {loading ? (
@@ -135,7 +165,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
              <h2 className="text-xl font-bold text-white leading-tight">{data.name}</h2>
              <div className="flex items-center gap-2">
                <button 
-                  onClick={fetchData} 
+                  onClick={() => fetchData(true)} 
                  className="p-1.5 text-slate-500 hover:text-white transition-colors"
                  title="Refresh"
                >
@@ -227,6 +257,59 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
          </div>
          <div className="p-6 space-y-8">
            {/* Impressum / Legal Data (NEW) */}
            {impressum && (
                <div className="bg-slate-950 rounded-lg p-4 border border-slate-800 flex flex-col gap-2">
                    <div className="flex items-center gap-2 mb-1">
                        <div className="p-1 bg-slate-800 rounded text-slate-400">
                            <Briefcase className="h-3 w-3" />
                        </div>
                        <span className="text-[10px] uppercase font-bold text-slate-500 tracking-wider">Official Legal Data</span>
                    </div>
                    <div className="text-sm font-medium text-white">
                        {impressum.legal_name || "Unknown Legal Name"}
                    </div>
                    <div className="flex items-start gap-2 text-xs text-slate-400">
                        <MapPin className="h-3 w-3 mt-0.5 shrink-0" />
                        <div>
                            <div>{impressum.street}</div>
                            <div>{impressum.zip} {impressum.city}</div>
                        </div>
                    </div>
                    {(impressum.email || impressum.phone) && (
                        <div className="mt-2 pt-2 border-t border-slate-900 flex gap-4 text-[10px] text-slate-500 font-mono">
                            {impressum.email && <span>{impressum.email}</span>}
                            {impressum.phone && <span>{impressum.phone}</span>}
                        </div>
                    )}
                </div>
            )}
            {/* AI Analysis Dossier (NEW) */}
            {aiAnalysis && (
                <div className="space-y-4">
                    <h3 className="text-sm font-semibold text-slate-400 uppercase tracking-wider flex items-center gap-2">
                        <Bot className="h-4 w-4" /> AI Strategic Dossier
                    </h3>
                    <div className="bg-slate-800/30 rounded-xl p-5 border border-slate-800/50 space-y-4">
                        <div>
                            <div className="text-[10px] text-blue-400 uppercase font-bold tracking-tight mb-1">Business Model</div>
                            <p className="text-sm text-slate-200 leading-relaxed">{aiAnalysis.business_model || "No summary available."}</p>
                        </div>
                        {aiAnalysis.infrastructure_evidence && (
                            <div className="pt-4 border-t border-slate-800/50">
                                <div className="text-[10px] text-orange-400 uppercase font-bold tracking-tight mb-1">Infrastructure Evidence</div>
                                <p className="text-sm text-slate-300 italic leading-relaxed">"{aiAnalysis.infrastructure_evidence}"</p>
                            </div>
                        )}
                    </div>
                </div>
            )}
            {/* Wikipedia Section */}
            <div className="space-y-4">
               <div className="flex items-center justify-between">
@@ -309,7 +392,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
                      </div>
                      <div>
                        <div className="text-[10px] text-slate-500 uppercase font-bold tracking-tight">Revenue</div>
-                        <div className="text-sm text-slate-200 font-medium">{wiki.umsatz ? `${wiki.umsatz} Mio. €` : 'k.A.'}</div>
+                        <div className="text-sm text-slate-200 font-medium">{wiki.umsatz && wiki.umsatz !== 'k.A.' ? `${wiki.umsatz} Mio. €` : 'k.A.'}</div>
                      </div>
                    </div>