diff --git a/company-explorer/backend/app.py b/company-explorer/backend/app.py index 0c120234..31da4028 100644 --- a/company-explorer/backend/app.py +++ b/company-explorer/backend/app.py @@ -383,6 +383,18 @@ def run_analysis_task(company_id: int, url: str): ) db.add(new_signal) + # Save Full Analysis Blob (Business Model + Evidence) + existing_analysis = db.query(EnrichmentData).filter( + EnrichmentData.company_id == company.id, + EnrichmentData.source_type == "ai_analysis" + ).first() + + if not existing_analysis: + db.add(EnrichmentData(company_id=company.id, source_type="ai_analysis", content=analysis)) + else: + existing_analysis.content = analysis + existing_analysis.updated_at = datetime.utcnow() + company.status = "ENRICHED" company.last_classification_at = datetime.utcnow() logger.info(f"Robotics analysis complete for {company.name}.") diff --git a/company-explorer/backend/lib/core_utils.py b/company-explorer/backend/lib/core_utils.py index 0850cce6..9a2ebd78 100644 --- a/company-explorer/backend/lib/core_utils.py +++ b/company-explorer/backend/lib/core_utils.py @@ -124,6 +124,7 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str: """ Extracts a numeric value from a string, handling 'Mio', 'Mrd', etc. Returns string representation of the number or 'k.A.'. + Handles German number formatting (1.000 = 1000, 1,5 = 1.5). """ if not raw_value: return "k.A." @@ -134,25 +135,50 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str: # Simple multiplier handling multiplier = 1.0 - if 'mrd' in raw_value or 'billion' in raw_value: + if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value: multiplier = 1000.0 if is_umsatz else 1000000000.0 - elif 'mio' in raw_value or 'million' in raw_value: + elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value: multiplier = 1.0 if is_umsatz else 1000000.0 elif 'tsd' in raw_value or 'thousand' in raw_value: multiplier = 0.001 if is_umsatz else 1000.0 - # Extract number - # Matches 123,45 or 123.45 - matches = re.findall(r'(\d+[.,]?\d*)', raw_value) + # Extract number candidates + # Regex for "1.000,50" or "1,000.50" or "1000" + matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value) if not matches: return "k.A." try: - # Take the first number found - num_str = matches[0].replace(',', '.') - # Fix for thousands separator if like 1.000.000 -> 1000000 - if num_str.count('.') > 1: - num_str = num_str.replace('.', '') + num_str = matches[0] + + # Heuristic for German formatting (1.000,00) vs English (1,000.00) + # If it contains both, the last separator is likely the decimal + if '.' in num_str and ',' in num_str: + if num_str.rfind(',') > num_str.rfind('.'): + # German: 1.000,00 -> remove dots, replace comma with dot + num_str = num_str.replace('.', '').replace(',', '.') + else: + # English: 1,000.00 -> remove commas + num_str = num_str.replace(',', '') + elif '.' in num_str: + # Ambiguous: 1.005 could be 1005 or 1.005 + # Assumption: If it's employees (integer), and looks like "1.xxx", it's likely thousands + parts = num_str.split('.') + if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz: + # Likely thousands separator for employees (e.g. 1.005) + num_str = num_str.replace('.', '') + elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3: + # For revenue, 375.6 vs 1.000 is tricky. + # But usually revenue in millions is small numbers with decimals (250.5). + # Large integers usually mean thousands. + # Let's assume dot is decimal for revenue unless context implies otherwise, + # but for "375.6" it works. For "1.000" it becomes 1.0. + # Let's keep dot as decimal for revenue by default unless we detect multiple dots + if num_str.count('.') > 1: + num_str = num_str.replace('.', '') + elif ',' in num_str: + # German decimal: 1,5 -> 1.5 + num_str = num_str.replace(',', '.') val = float(num_str) * multiplier @@ -173,6 +199,20 @@ def fuzzy_similarity(str1: str, str2: str) -> float: return 0.0 return fuzz.ratio(str1, str2) / 100.0 +def clean_json_response(response_text: str) -> str: + """ + Cleans LLM response to ensure valid JSON. + Removes Markdown code blocks (```json ... ```). + """ + if not response_text: return "{}" + + # Remove markdown code blocks + cleaned = re.sub(r'^```json\s*', '', response_text, flags=re.MULTILINE) + cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE) + cleaned = re.sub(r'\s*```$', '', cleaned, flags=re.MULTILINE) + + return cleaned.strip() + # ============================================================================== # 3. LLM WRAPPER (GEMINI) diff --git a/company-explorer/backend/services/classification.py b/company-explorer/backend/services/classification.py index d6493a5d..3a077691 100644 --- a/company-explorer/backend/services/classification.py +++ b/company-explorer/backend/services/classification.py @@ -2,7 +2,7 @@ import json import logging import os from typing import Dict, Any, List -from ..lib.core_utils import call_gemini +from ..lib.core_utils import call_gemini, clean_json_response from ..config import settings from ..database import SessionLocal, RoboticsCategory @@ -55,7 +55,7 @@ class ClassificationService: prompt = f""" You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor. - Your task is to analyze a target company based on their website text to determine their **operational need** for service robotics. + Your task is to analyze the target company based on their website text and create a concise **Dossier**. --- TARGET COMPANY --- Name: {company_name} @@ -66,36 +66,33 @@ class ClassificationService: You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige". {json.dumps(self.allowed_industries, ensure_ascii=False)} - --- ANALYSIS GUIDELINES (CHAIN OF THOUGHT) --- - 1. **Infrastructure Analysis:** What physical assets does this company likely operate based on their business model? - - Factories / Production Plants? (-> Needs Cleaning, Security, Intralogistics) - - Large Warehouses? (-> Needs Intralogistics, Security, Floor Washing) - - Offices / Headquarters? (-> Needs Vacuuming, Window Cleaning) - - Critical Infrastructure (Solar Parks, Wind Farms)? (-> Needs Perimeter Security, Inspection) - - Hotels / Hospitals? (-> Needs Service, Cleaning, Transport) - - 2. **Provider vs. User Distinction (CRITICAL):** - - If a company SELLS cleaning products (e.g., 3M, Henkel), they do NOT necessarily have a higher need for cleaning robots than any other manufacturer. Do not score them high just because the word "cleaning" appears. Score them based on their *factories*. - - If a company SELLS security services, they might be a potential PARTNER, but check if they *manage* sites. - - 3. **Scale Assessment:** - - 5 locations implies more need than 1. - - "Global player" implies large facilities. + --- ANALYSIS PART 1: BUSINESS MODEL --- + 1. Identify the core products/services. + 2. Summarize in 2-3 German sentences: What do they do and for whom? (Target: "business_model") - --- SCORING CATEGORIES (0-100) --- - Based on the current strategic focus of Roboplanet: + --- ANALYSIS PART 2: INFRASTRUCTURE & POTENTIAL (Chain of Thought) --- + 1. **Infrastructure Scan:** Look for evidence of physical assets like *Factories, Large Warehouses, Production Lines, Campuses, Hospitals*. + 2. **Provider vs. User Check:** + - Does the company USE this infrastructure (Potential Customer)? + - Or do they SELL products for it (Competitor/Partner)? + - *Example:* "Cleaning" -> Do they sell soap (Provider) or do they have a 50,000sqm factory (User)? + 3. **Evidence Extraction:** Extract 1-2 key sentences from the text proving this infrastructure. (Target: "infrastructure_evidence") + + --- ANALYSIS PART 3: SCORING (0-100) --- + Based on the identified infrastructure, score the potential for these categories: {category_guidance} --- OUTPUT FORMAT (JSON ONLY) --- {{ "industry": "String (from list)", - "summary": "Concise analysis of their infrastructure and business model (German)", + "business_model": "2-3 sentences summary (German)", + "infrastructure_evidence": "1-2 key sentences proving physical assets (German)", "potentials": {{ - "cleaning": {{ "score": 0-100, "reason": "Specific reasoning based on infrastructure (e.g. 'Operates 5 production plants in DE')." }}, - "transport": {{ "score": 0-100, "reason": "..." }}, - "security": {{ "score": 0-100, "reason": "..." }}, - "service": {{ "score": 0-100, "reason": "..." }} + "cleaning": {{ "score": 0-100, "reason": "Reasoning based on infrastructure." }}, + "transport": {{ "score": 0-100, "reason": "Reasoning based on logistics volume." }}, + "security": {{ "score": 0-100, "reason": "Reasoning based on perimeter/assets." }}, + "service": {{ "score": 0-100, "reason": "Reasoning based on guest interaction." }} }} }} """ @@ -106,7 +103,7 @@ class ClassificationService: json_mode=True, temperature=0.1 # Very low temp for analytical reasoning ) - return json.loads(response_text) + return json.loads(clean_json_response(response_text)) except Exception as e: logger.error(f"Classification failed: {e}") return {"error": str(e)} diff --git a/company-explorer/backend/services/scraping.py b/company-explorer/backend/services/scraping.py index 470b07a1..875410b9 100644 --- a/company-explorer/backend/services/scraping.py +++ b/company-explorer/backend/services/scraping.py @@ -2,9 +2,11 @@ import logging import requests import random import re +import json +from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from typing import Optional, Dict -from ..lib.core_utils import clean_text, retry_on_failure +from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response logger = logging.getLogger(__name__) @@ -22,6 +24,7 @@ class ScraperService: def scrape_url(self, url: str) -> Dict[str, str]: """ Fetches a URL and returns cleaned text content + meta info. + Also attempts to find and scrape the Impressum (Imprint). """ if not url.startswith("http"): url = "https://" + url @@ -38,7 +41,36 @@ class ScraperService: logger.warning(f"Skipping non-HTML content for {url}: {content_type}") return {"error": "Not HTML"} - return self._parse_html(response.content) + # Parse Main Page + result = self._parse_html(response.content) + + # --- IMPRESSUM LOGIC --- + soup = BeautifulSoup(response.content, 'html.parser') + impressum_url = self._find_impressum_link(soup, url) + + # FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL + if not impressum_url and url.count('/') > 3: + try: + parsed = urlparse(url) + root_url = f"{parsed.scheme}://{parsed.netloc}/" + logger.info(f"No Impressum on deep URL. Checking Root: {root_url}") + + root_resp = requests.get(root_url, headers=headers, timeout=10, verify=False) + if root_resp.status_code == 200: + root_soup = BeautifulSoup(root_resp.content, 'html.parser') + impressum_url = self._find_impressum_link(root_soup, root_url) + except Exception as ex: + logger.warning(f"Root URL fallback failed: {ex}") + + if impressum_url: + logger.info(f"Found Impressum URL: {impressum_url}") + impressum_data = self._scrape_impressum_data(impressum_url) + result["impressum"] = impressum_data + else: + logger.info(f"No Impressum link found for {url}") + result["impressum"] = None + + return result except requests.exceptions.SSLError: # Retry with HTTP if HTTPS fails @@ -50,13 +82,96 @@ class ScraperService: logger.error(f"Scraping failed for {url}: {e}") return {"error": str(e)} + def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]: + """ + Scans all links for keywords like 'Impressum', 'Legal', 'Imprint'. + Returns the absolute URL. + """ + keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"] + + # Candidate tracking + candidates = [] + + for a in soup.find_all('a', href=True): + text = clean_text(a.get_text()).lower() + href = a['href'].lower() + + # Debug log for potential candidates (verbose) + # if "imp" in text or "imp" in href: + # logger.debug(f"Checking link: '{text}' -> {href}") + + # Check text content or href keywords + if any(kw in text for kw in keywords) or any(kw in href for kw in keywords): + # Avoid mailto links or purely social links if possible + if "mailto:" in href or "tel:" in href or "javascript:" in href: + continue + + full_url = urljoin(base_url, a['href']) + + # Prioritize 'impressum' in text over href + score = 0 + if "impressum" in text: score += 10 + if "impressum" in href: score += 5 + + candidates.append((score, full_url)) + + if candidates: + # Sort by score desc + candidates.sort(key=lambda x: x[0], reverse=True) + best_match = candidates[0][1] + logger.info(f"Impressum Link Selection: Found {len(candidates)} candidates. Winner: {best_match}") + return best_match + + return None + + def _scrape_impressum_data(self, url: str) -> Dict[str, str]: + """ + Fetches the Impressum page and uses LLM to extract structured data. + """ + try: + headers = {'User-Agent': random.choice(USER_AGENTS)} + response = requests.get(url, headers=headers, timeout=self.timeout, verify=False) + response.raise_for_status() + + soup = BeautifulSoup(response.content, 'html.parser') + # Aggressive cleaning for Impressum too + for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav']): + element.decompose() + + raw_text = soup.get_text(separator=' ', strip=True)[:10000] # Limit context + + # LLM Extraction + prompt = f""" + Extract the official company details from this German 'Impressum' text. + Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'. + If a field is missing, use null. + + Text: + {raw_text} + """ + + response_text = call_gemini(prompt, json_mode=True, temperature=0.1) + return json.loads(clean_json_response(response_text)) + + except Exception as e: + logger.error(f"Impressum scrape failed for {url}: {e}") + return None + def _parse_html(self, html_content: bytes) -> Dict[str, str]: soup = BeautifulSoup(html_content, 'html.parser') - # 1. Cleanup Junk - for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']): + # 1. Cleanup Junk (Aggressive, matching legacy logic) + # Removed 'a' tags to prevent menu links from polluting the text analysis + for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']): element.decompose() + # 1b. Remove common Cookie Banners / Popups by class/id heuristics + for div in soup.find_all("div"): + classes = str(div.get("class", "")).lower() + ids = str(div.get("id", "")).lower() + if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]): + div.decompose() + # 2. Extract Title & Meta Description title = soup.title.string if soup.title else "" meta_desc = "" diff --git a/company-explorer/frontend/src/components/Inspector.tsx b/company-explorer/frontend/src/components/Inspector.tsx index 32a0d0c8..20970179 100644 --- a/company-explorer/frontend/src/components/Inspector.tsx +++ b/company-explorer/frontend/src/components/Inspector.tsx @@ -38,25 +38,52 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) { const [loading, setLoading] = useState(false) const [isProcessing, setIsProcessing] = useState(false) + // Polling Logic + useEffect(() => { + let interval: NodeJS.Timeout; + if (isProcessing) { + interval = setInterval(() => { + fetchData(true) // Silent fetch + }, 2000) + } + return () => clearInterval(interval) + }, [isProcessing, companyId]) // Dependencies + // Manual Override State const [isEditingWiki, setIsEditingWiki] = useState(false) const [wikiUrlInput, setWikiUrlInput] = useState("") const [isEditingWebsite, setIsEditingWebsite] = useState(false) const [websiteInput, setWebsiteInput] = useState("") - const fetchData = () => { + const fetchData = (silent = false) => { if (!companyId) return - setLoading(true) + if (!silent) setLoading(true) + axios.get(`${apiBase}/companies/${companyId}`) - .then(res => setData(res.data)) + .then(res => { + const newData = res.data + setData(newData) + + // Auto-stop processing if status changes to ENRICHED or we see data + if (isProcessing) { + const hasWiki = newData.enrichment_data?.some((e:any) => e.source_type === 'wikipedia') + const hasAnalysis = newData.enrichment_data?.some((e:any) => e.source_type === 'ai_analysis') + + // If we were waiting for Discover (Wiki) or Analyze (AI) + if ((hasWiki && newData.status === 'DISCOVERED') || (hasAnalysis && newData.status === 'ENRICHED')) { + setIsProcessing(false) + } + } + }) .catch(console.error) - .finally(() => setLoading(false)) + .finally(() => { if (!silent) setLoading(false) }) } useEffect(() => { fetchData() setIsEditingWiki(false) setIsEditingWebsite(false) + setIsProcessing(false) // Reset on ID change }, [companyId]) const handleDiscover = async () => { @@ -64,10 +91,9 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) { setIsProcessing(true) try { await axios.post(`${apiBase}/enrich/discover`, { company_id: companyId }) - setTimeout(fetchData, 3000) + // Polling effect will handle the rest } catch (e) { console.error(e) - } finally { setIsProcessing(false) } } @@ -77,10 +103,9 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) { setIsProcessing(true) try { await axios.post(`${apiBase}/enrich/analyze`, { company_id: companyId }) - setTimeout(fetchData, 5000) + // Polling effect will handle the rest } catch (e) { console.error(e) - } finally { setIsProcessing(false) } } @@ -120,6 +145,11 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) { const wikiEntry = data?.enrichment_data?.find(e => e.source_type === 'wikipedia') const wiki = wikiEntry?.content const isLocked = wikiEntry?.is_locked + + const aiAnalysis = data?.enrichment_data?.find(e => e.source_type === 'ai_analysis')?.content + + const scrapeData = data?.enrichment_data?.find(e => e.source_type === 'website_scrape')?.content + const impressum = scrapeData?.impressum return (
{aiAnalysis.business_model || "No summary available."}
+"{aiAnalysis.infrastructure_evidence}"
+