feat(company-explorer): add impressum scraping, robust json parsing, and enhanced ui polling

- Implemented Impressum scraping with Root-URL fallback and enhanced keyword detection.
- Added 'clean_json_response' helper to strip Markdown from LLM outputs, preventing JSONDecodeErrors.
- Improved numeric extraction for German formatting (thousands separators vs decimals).
- Updated Inspector UI with polling logic for auto-refresh and display of the AI Dossier and Legal Data.
- Added Manual Override for Website URL.
This commit is contained in:
2026-01-08 11:59:11 +00:00
parent a43b01bb6e
commit dbc3ce9b34
5 changed files with 296 additions and 49 deletions

View File

@@ -383,6 +383,18 @@ def run_analysis_task(company_id: int, url: str):
)
db.add(new_signal)
# Save Full Analysis Blob (Business Model + Evidence)
existing_analysis = db.query(EnrichmentData).filter(
EnrichmentData.company_id == company.id,
EnrichmentData.source_type == "ai_analysis"
).first()
if not existing_analysis:
db.add(EnrichmentData(company_id=company.id, source_type="ai_analysis", content=analysis))
else:
existing_analysis.content = analysis
existing_analysis.updated_at = datetime.utcnow()
company.status = "ENRICHED"
company.last_classification_at = datetime.utcnow()
logger.info(f"Robotics analysis complete for {company.name}.")

View File

@@ -124,6 +124,7 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
"""
Extracts a numeric value from a string, handling 'Mio', 'Mrd', etc.
Returns string representation of the number or 'k.A.'.
Handles German number formatting (1.000 = 1000, 1,5 = 1.5).
"""
if not raw_value:
return "k.A."
@@ -134,25 +135,50 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
# Simple multiplier handling
multiplier = 1.0
if 'mrd' in raw_value or 'billion' in raw_value:
if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
multiplier = 1000.0 if is_umsatz else 1000000000.0
elif 'mio' in raw_value or 'million' in raw_value:
elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
multiplier = 1.0 if is_umsatz else 1000000.0
elif 'tsd' in raw_value or 'thousand' in raw_value:
multiplier = 0.001 if is_umsatz else 1000.0
# Extract number
# Matches 123,45 or 123.45
matches = re.findall(r'(\d+[.,]?\d*)', raw_value)
# Extract number candidates
# Regex for "1.000,50" or "1,000.50" or "1000"
matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value)
if not matches:
return "k.A."
try:
# Take the first number found
num_str = matches[0].replace(',', '.')
# Fix for thousands separator if like 1.000.000 -> 1000000
if num_str.count('.') > 1:
num_str = num_str.replace('.', '')
num_str = matches[0]
# Heuristic for German formatting (1.000,00) vs English (1,000.00)
# If it contains both, the last separator is likely the decimal
if '.' in num_str and ',' in num_str:
if num_str.rfind(',') > num_str.rfind('.'):
# German: 1.000,00 -> remove dots, replace comma with dot
num_str = num_str.replace('.', '').replace(',', '.')
else:
# English: 1,000.00 -> remove commas
num_str = num_str.replace(',', '')
elif '.' in num_str:
# Ambiguous: 1.005 could be 1005 or 1.005
# Assumption: If it's employees (integer), and looks like "1.xxx", it's likely thousands
parts = num_str.split('.')
if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz:
# Likely thousands separator for employees (e.g. 1.005)
num_str = num_str.replace('.', '')
elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3:
# For revenue, 375.6 vs 1.000 is tricky.
# But usually revenue in millions is small numbers with decimals (250.5).
# Large integers usually mean thousands.
# Let's assume dot is decimal for revenue unless context implies otherwise,
# but for "375.6" it works. For "1.000" it becomes 1.0.
# Let's keep dot as decimal for revenue by default unless we detect multiple dots
if num_str.count('.') > 1:
num_str = num_str.replace('.', '')
elif ',' in num_str:
# German decimal: 1,5 -> 1.5
num_str = num_str.replace(',', '.')
val = float(num_str) * multiplier
@@ -173,6 +199,20 @@ def fuzzy_similarity(str1: str, str2: str) -> float:
return 0.0
return fuzz.ratio(str1, str2) / 100.0
def clean_json_response(response_text: str) -> str:
    """
    Clean an LLM response so it can be passed to ``json.loads``.

    LLMs frequently wrap JSON payloads in Markdown code fences
    (```json ... ```), which makes the raw text invalid JSON. This strips
    the opening and closing fences and surrounding whitespace.

    Args:
        response_text: Raw text returned by the LLM (may be empty or None).

    Returns:
        The cleaned JSON string, or "{}" for empty input so callers can
        always hand the result to json.loads without a None check.
    """
    if not response_text:
        return "{}"
    # Strip an opening ```json fence; IGNORECASE covers models that emit
    # ```JSON or ```Json.
    cleaned = re.sub(r'^```json\s*', '', response_text,
                     flags=re.MULTILINE | re.IGNORECASE)
    # Strip a bare opening fence (```) and any closing fence.
    cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'\s*```$', '', cleaned, flags=re.MULTILINE)
    return cleaned.strip()
# ==============================================================================
# 3. LLM WRAPPER (GEMINI)

View File

@@ -2,7 +2,7 @@ import json
import logging
import os
from typing import Dict, Any, List
from ..lib.core_utils import call_gemini
from ..lib.core_utils import call_gemini, clean_json_response
from ..config import settings
from ..database import SessionLocal, RoboticsCategory
@@ -55,7 +55,7 @@ class ClassificationService:
prompt = f"""
You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor.
Your task is to analyze a target company based on their website text to determine their **operational need** for service robotics.
Your task is to analyze the target company based on their website text and create a concise **Dossier**.
--- TARGET COMPANY ---
Name: {company_name}
@@ -66,36 +66,33 @@ class ClassificationService:
You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
{json.dumps(self.allowed_industries, ensure_ascii=False)}
--- ANALYSIS GUIDELINES (CHAIN OF THOUGHT) ---
1. **Infrastructure Analysis:** What physical assets does this company likely operate based on their business model?
- Factories / Production Plants? (-> Needs Cleaning, Security, Intralogistics)
- Large Warehouses? (-> Needs Intralogistics, Security, Floor Washing)
- Offices / Headquarters? (-> Needs Vacuuming, Window Cleaning)
- Critical Infrastructure (Solar Parks, Wind Farms)? (-> Needs Perimeter Security, Inspection)
- Hotels / Hospitals? (-> Needs Service, Cleaning, Transport)
2. **Provider vs. User Distinction (CRITICAL):**
- If a company SELLS cleaning products (e.g., 3M, Henkel), they do NOT necessarily have a higher need for cleaning robots than any other manufacturer. Do not score them high just because the word "cleaning" appears. Score them based on their *factories*.
- If a company SELLS security services, they might be a potential PARTNER, but check if they *manage* sites.
3. **Scale Assessment:**
- 5 locations implies more need than 1.
- "Global player" implies large facilities.
--- ANALYSIS PART 1: BUSINESS MODEL ---
1. Identify the core products/services.
2. Summarize in 2-3 German sentences: What do they do and for whom? (Target: "business_model")
--- SCORING CATEGORIES (0-100) ---
Based on the current strategic focus of Roboplanet:
--- ANALYSIS PART 2: INFRASTRUCTURE & POTENTIAL (Chain of Thought) ---
1. **Infrastructure Scan:** Look for evidence of physical assets like *Factories, Large Warehouses, Production Lines, Campuses, Hospitals*.
2. **Provider vs. User Check:**
- Does the company USE this infrastructure (Potential Customer)?
- Or do they SELL products for it (Competitor/Partner)?
- *Example:* "Cleaning" -> Do they sell soap (Provider) or do they have a 50,000sqm factory (User)?
3. **Evidence Extraction:** Extract 1-2 key sentences from the text proving this infrastructure. (Target: "infrastructure_evidence")
--- ANALYSIS PART 3: SCORING (0-100) ---
Based on the identified infrastructure, score the potential for these categories:
{category_guidance}
--- OUTPUT FORMAT (JSON ONLY) ---
{{
"industry": "String (from list)",
"summary": "Concise analysis of their infrastructure and business model (German)",
"business_model": "2-3 sentences summary (German)",
"infrastructure_evidence": "1-2 key sentences proving physical assets (German)",
"potentials": {{
"cleaning": {{ "score": 0-100, "reason": "Specific reasoning based on infrastructure (e.g. 'Operates 5 production plants in DE')." }},
"transport": {{ "score": 0-100, "reason": "..." }},
"security": {{ "score": 0-100, "reason": "..." }},
"service": {{ "score": 0-100, "reason": "..." }}
"cleaning": {{ "score": 0-100, "reason": "Reasoning based on infrastructure." }},
"transport": {{ "score": 0-100, "reason": "Reasoning based on logistics volume." }},
"security": {{ "score": 0-100, "reason": "Reasoning based on perimeter/assets." }},
"service": {{ "score": 0-100, "reason": "Reasoning based on guest interaction." }}
}}
}}
"""
@@ -106,7 +103,7 @@ class ClassificationService:
json_mode=True,
temperature=0.1 # Very low temp for analytical reasoning
)
return json.loads(response_text)
return json.loads(clean_json_response(response_text))
except Exception as e:
logger.error(f"Classification failed: {e}")
return {"error": str(e)}

View File

@@ -2,9 +2,11 @@ import logging
import requests
import random
import re
import json
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from typing import Optional, Dict
from ..lib.core_utils import clean_text, retry_on_failure
from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response
logger = logging.getLogger(__name__)
@@ -22,6 +24,7 @@ class ScraperService:
def scrape_url(self, url: str) -> Dict[str, str]:
"""
Fetches a URL and returns cleaned text content + meta info.
Also attempts to find and scrape the Impressum (Imprint).
"""
if not url.startswith("http"):
url = "https://" + url
@@ -38,7 +41,36 @@ class ScraperService:
logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
return {"error": "Not HTML"}
return self._parse_html(response.content)
# Parse Main Page
result = self._parse_html(response.content)
# --- IMPRESSUM LOGIC ---
soup = BeautifulSoup(response.content, 'html.parser')
impressum_url = self._find_impressum_link(soup, url)
# FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
if not impressum_url and url.count('/') > 3:
try:
parsed = urlparse(url)
root_url = f"{parsed.scheme}://{parsed.netloc}/"
logger.info(f"No Impressum on deep URL. Checking Root: {root_url}")
root_resp = requests.get(root_url, headers=headers, timeout=10, verify=False)
if root_resp.status_code == 200:
root_soup = BeautifulSoup(root_resp.content, 'html.parser')
impressum_url = self._find_impressum_link(root_soup, root_url)
except Exception as ex:
logger.warning(f"Root URL fallback failed: {ex}")
if impressum_url:
logger.info(f"Found Impressum URL: {impressum_url}")
impressum_data = self._scrape_impressum_data(impressum_url)
result["impressum"] = impressum_data
else:
logger.info(f"No Impressum link found for {url}")
result["impressum"] = None
return result
except requests.exceptions.SSLError:
# Retry with HTTP if HTTPS fails
@@ -50,13 +82,96 @@ class ScraperService:
logger.error(f"Scraping failed for {url}: {e}")
return {"error": str(e)}
def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
    """
    Locate the site's legal-notice ("Impressum") page among all anchors.

    Link text and href are matched against a fixed keyword list;
    mailto:/tel:/javascript: pseudo-links are skipped. Candidates whose
    text or href literally contain "impressum" are ranked highest.

    Returns the absolute URL of the best candidate, or None if no link
    matches.
    """
    keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
    scored_links = []
    for anchor in soup.find_all('a', href=True):
        label = clean_text(anchor.get_text()).lower()
        target = anchor['href'].lower()
        # Keep only anchors whose visible text or href mentions a keyword.
        if not (any(kw in label for kw in keywords) or any(kw in target for kw in keywords)):
            continue
        # Pseudo-links cannot point to an Impressum page.
        if any(scheme in target for scheme in ("mailto:", "tel:", "javascript:")):
            continue
        absolute = urljoin(base_url, anchor['href'])
        # The literal word "impressum" beats generic keywords; a match in
        # the link text beats one in the href.
        rank = (10 if "impressum" in label else 0) + (5 if "impressum" in target else 0)
        scored_links.append((rank, absolute))
    if not scored_links:
        return None
    scored_links.sort(key=lambda entry: entry[0], reverse=True)
    winner = scored_links[0][1]
    logger.info(f"Impressum Link Selection: Found {len(scored_links)} candidates. Winner: {winner}")
    return winner
def _scrape_impressum_data(self, url: str) -> Optional[Dict[str, str]]:
    """
    Fetch the Impressum page and extract structured company details via LLM.

    Args:
        url: Absolute URL of the Impressum page.

    Returns:
        Dict with keys 'legal_name', 'street', 'zip', 'city', 'email',
        'phone', 'ceo_name' (values may be null), or None when the fetch
        or the LLM extraction fails. (Annotation fixed: the error path
        returns None, so the return type is Optional.)
    """
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS certificate validation —
        # kept for parity with the rest of the scraper, but worth revisiting.
        response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Aggressively drop non-content elements so the LLM only sees the
        # legal text itself.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav']):
            element.decompose()
        raw_text = soup.get_text(separator=' ', strip=True)[:10000]  # Limit context
        # LLM Extraction
        prompt = f"""
Extract the official company details from this German 'Impressum' text.
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.
If a field is missing, use null.
Text:
{raw_text}
"""
        response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
        return json.loads(clean_json_response(response_text))
    except Exception as e:
        logger.error(f"Impressum scrape failed for {url}: {e}")
        return None
def _parse_html(self, html_content: bytes) -> Dict[str, str]:
soup = BeautifulSoup(html_content, 'html.parser')
# 1. Cleanup Junk
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
# 1. Cleanup Junk (Aggressive, matching legacy logic)
# Removed 'a' tags to prevent menu links from polluting the text analysis
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
element.decompose()
# 1b. Remove common Cookie Banners / Popups by class/id heuristics
for div in soup.find_all("div"):
classes = str(div.get("class", "")).lower()
ids = str(div.get("id", "")).lower()
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
div.decompose()
# 2. Extract Title & Meta Description
title = soup.title.string if soup.title else ""
meta_desc = ""

View File

@@ -38,25 +38,52 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
const [loading, setLoading] = useState(false)
const [isProcessing, setIsProcessing] = useState(false)
// Polling Logic
useEffect(() => {
let interval: NodeJS.Timeout;
if (isProcessing) {
interval = setInterval(() => {
fetchData(true) // Silent fetch
}, 2000)
}
return () => clearInterval(interval)
}, [isProcessing, companyId]) // Dependencies
// Manual Override State
const [isEditingWiki, setIsEditingWiki] = useState(false)
const [wikiUrlInput, setWikiUrlInput] = useState("")
const [isEditingWebsite, setIsEditingWebsite] = useState(false)
const [websiteInput, setWebsiteInput] = useState("")
const fetchData = () => {
const fetchData = (silent = false) => {
if (!companyId) return
setLoading(true)
if (!silent) setLoading(true)
axios.get(`${apiBase}/companies/${companyId}`)
.then(res => setData(res.data))
.then(res => {
const newData = res.data
setData(newData)
// Auto-stop processing if status changes to ENRICHED or we see data
if (isProcessing) {
const hasWiki = newData.enrichment_data?.some((e:any) => e.source_type === 'wikipedia')
const hasAnalysis = newData.enrichment_data?.some((e:any) => e.source_type === 'ai_analysis')
// If we were waiting for Discover (Wiki) or Analyze (AI)
if ((hasWiki && newData.status === 'DISCOVERED') || (hasAnalysis && newData.status === 'ENRICHED')) {
setIsProcessing(false)
}
}
})
.catch(console.error)
.finally(() => setLoading(false))
.finally(() => { if (!silent) setLoading(false) })
}
useEffect(() => {
fetchData()
setIsEditingWiki(false)
setIsEditingWebsite(false)
setIsProcessing(false) // Reset on ID change
}, [companyId])
const handleDiscover = async () => {
@@ -64,10 +91,9 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
setIsProcessing(true)
try {
await axios.post(`${apiBase}/enrich/discover`, { company_id: companyId })
setTimeout(fetchData, 3000)
// Polling effect will handle the rest
} catch (e) {
console.error(e)
} finally {
setIsProcessing(false)
}
}
@@ -77,10 +103,9 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
setIsProcessing(true)
try {
await axios.post(`${apiBase}/enrich/analyze`, { company_id: companyId })
setTimeout(fetchData, 5000)
// Polling effect will handle the rest
} catch (e) {
console.error(e)
} finally {
setIsProcessing(false)
}
}
@@ -120,6 +145,11 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
const wikiEntry = data?.enrichment_data?.find(e => e.source_type === 'wikipedia')
const wiki = wikiEntry?.content
const isLocked = wikiEntry?.is_locked
const aiAnalysis = data?.enrichment_data?.find(e => e.source_type === 'ai_analysis')?.content
const scrapeData = data?.enrichment_data?.find(e => e.source_type === 'website_scrape')?.content
const impressum = scrapeData?.impressum
return (
<div className="fixed inset-y-0 right-0 w-[550px] bg-slate-900 border-l border-slate-800 shadow-2xl transform transition-transform duration-300 ease-in-out z-40 overflow-y-auto">
@@ -135,7 +165,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
<h2 className="text-xl font-bold text-white leading-tight">{data.name}</h2>
<div className="flex items-center gap-2">
<button
onClick={fetchData}
onClick={() => fetchData(true)}
className="p-1.5 text-slate-500 hover:text-white transition-colors"
title="Refresh"
>
@@ -227,6 +257,59 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
</div>
<div className="p-6 space-y-8">
{/* Impressum / Legal Data (NEW) */}
{impressum && (
<div className="bg-slate-950 rounded-lg p-4 border border-slate-800 flex flex-col gap-2">
<div className="flex items-center gap-2 mb-1">
<div className="p-1 bg-slate-800 rounded text-slate-400">
<Briefcase className="h-3 w-3" />
</div>
<span className="text-[10px] uppercase font-bold text-slate-500 tracking-wider">Official Legal Data</span>
</div>
<div className="text-sm font-medium text-white">
{impressum.legal_name || "Unknown Legal Name"}
</div>
<div className="flex items-start gap-2 text-xs text-slate-400">
<MapPin className="h-3 w-3 mt-0.5 shrink-0" />
<div>
<div>{impressum.street}</div>
<div>{impressum.zip} {impressum.city}</div>
</div>
</div>
{(impressum.email || impressum.phone) && (
<div className="mt-2 pt-2 border-t border-slate-900 flex gap-4 text-[10px] text-slate-500 font-mono">
{impressum.email && <span>{impressum.email}</span>}
{impressum.phone && <span>{impressum.phone}</span>}
</div>
)}
</div>
)}
{/* AI Analysis Dossier (NEW) */}
{aiAnalysis && (
<div className="space-y-4">
<h3 className="text-sm font-semibold text-slate-400 uppercase tracking-wider flex items-center gap-2">
<Bot className="h-4 w-4" /> AI Strategic Dossier
</h3>
<div className="bg-slate-800/30 rounded-xl p-5 border border-slate-800/50 space-y-4">
<div>
<div className="text-[10px] text-blue-400 uppercase font-bold tracking-tight mb-1">Business Model</div>
<p className="text-sm text-slate-200 leading-relaxed">{aiAnalysis.business_model || "No summary available."}</p>
</div>
{aiAnalysis.infrastructure_evidence && (
<div className="pt-4 border-t border-slate-800/50">
<div className="text-[10px] text-orange-400 uppercase font-bold tracking-tight mb-1">Infrastructure Evidence</div>
<p className="text-sm text-slate-300 italic leading-relaxed">"{aiAnalysis.infrastructure_evidence}"</p>
</div>
)}
</div>
</div>
)}
{/* Wikipedia Section */}
<div className="space-y-4">
<div className="flex items-center justify-between">
@@ -309,7 +392,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
</div>
<div>
<div className="text-[10px] text-slate-500 uppercase font-bold tracking-tight">Revenue</div>
<div className="text-sm text-slate-200 font-medium">{wiki.umsatz ? `${wiki.umsatz} Mio. €` : 'k.A.'}</div>
<div className="text-sm text-slate-200 font-medium">{wiki.umsatz && wiki.umsatz !== 'k.A.' ? `${wiki.umsatz} Mio. €` : 'k.A.'}</div>
</div>
</div>