feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides
- Ported robust Wikipedia extraction logic (categories, first paragraph) from the legacy system.
- Implemented database-driven Robotics Category configuration with a frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added manual-override features for the Wikipedia URL (with locking) and the Website URL (with a re-scrape trigger).
- Enhanced the Inspector UI with a Wikipedia profile, category tags, and action buttons.
This commit is contained in:
@@ -17,7 +17,7 @@ setup_logging()
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .database import init_db, get_db, Company, Signal, EnrichmentData
|
||||
from .database import init_db, get_db, Company, Signal, EnrichmentData, RoboticsCategory
|
||||
from .services.deduplication import Deduplicator
|
||||
from .services.discovery import DiscoveryService
|
||||
from .services.scraping import ScraperService
|
||||
@@ -97,7 +97,10 @@ def list_companies(
|
||||
|
||||
@app.get("/api/companies/{company_id}")
|
||||
def get_company(company_id: int, db: Session = Depends(get_db)):
|
||||
company = db.query(Company).options(joinedload(Company.signals)).filter(Company.id == company_id).first()
|
||||
company = db.query(Company).options(
|
||||
joinedload(Company.signals),
|
||||
joinedload(Company.enrichment_data)
|
||||
).filter(Company.id == company_id).first()
|
||||
if not company:
|
||||
raise HTTPException(status_code=404, detail="Company not found")
|
||||
return company
|
||||
@@ -154,6 +157,27 @@ def bulk_import_names(req: BulkImportRequest, db: Session = Depends(get_db)):
|
||||
db.rollback()
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/api/robotics/categories")
def list_robotics_categories(db: Session = Depends(get_db)):
    """Return every robotics category definition currently stored in the database."""
    categories = db.query(RoboticsCategory).all()
    return categories
|
||||
|
||||
class CategoryUpdate(BaseModel):
    """Request payload for updating a robotics category definition."""
    # Core definition text injected into LLM prompts.
    description: str
    # Instructions guiding the Chain-of-Thought scoring for this category.
    reasoning_guide: str
|
||||
|
||||
@app.put("/api/robotics/categories/{id}")
def update_robotics_category(id: int, cat: CategoryUpdate, db: Session = Depends(get_db)):
    """Updates a robotics category definition."""
    # Look up the record by primary key; 404 when it does not exist.
    record = db.query(RoboticsCategory).filter(RoboticsCategory.id == id).first()
    if not record:
        raise HTTPException(404, "Category not found")

    # Overwrite both editable fields and persist.
    record.description = cat.description
    record.reasoning_guide = cat.reasoning_guide
    db.commit()
    return record
|
||||
|
||||
@app.post("/api/enrich/discover")
|
||||
def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
|
||||
"""
|
||||
@@ -172,6 +196,71 @@ def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db
|
||||
logger.error(f"Discovery Error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/api/companies/{company_id}/override/wiki")
def override_wiki_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)):
    """
    Manually sets the Wikipedia URL for a company and triggers re-extraction.
    Locks the data against auto-discovery.

    Raises:
        HTTPException 404 if the company does not exist.

    Returns:
        {"status": "updated", "data": <extracted wiki payload>}
    """
    company = db.query(Company).filter(Company.id == company_id).first()
    if not company:
        raise HTTPException(404, "Company not found")

    logger.info(f"Manual Override for {company.name}: Setting Wiki URL to {url}")

    # Update or create EnrichmentData entry
    existing_wiki = db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company.id,
        EnrichmentData.source_type == "wikipedia"
    ).first()

    # Extract data immediately
    # "k.A." is the project-wide sentinel for "no value"; skip extraction then.
    wiki_data = {"url": url}
    if url and url != "k.A.":
        try:
            wiki_data = discovery.extract_wikipedia_data(url)
            wiki_data['url'] = url  # Ensure URL is correct
        except Exception as e:
            # Best-effort: persist the URL plus the error rather than failing the request.
            logger.error(f"Extraction failed for manual URL: {e}")
            wiki_data["error"] = str(e)

    if not existing_wiki:
        # First manual override for this company: create a locked record so
        # the background discovery task will not overwrite it.
        db.add(EnrichmentData(
            company_id=company.id,
            source_type="wikipedia",
            content=wiki_data,
            is_locked=True
        ))
    else:
        existing_wiki.content = wiki_data
        existing_wiki.updated_at = datetime.utcnow()
        existing_wiki.is_locked = True  # LOCK IT
|
||||
|
||||
@app.post("/api/companies/{company_id}/override/website")
def override_website_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)):
    """
    Manually sets the Website URL for a company.
    Clears existing scrape data to force a fresh analysis on next run.

    Raises:
        HTTPException 404 if the company does not exist.

    Returns:
        {"status": "updated", "website": <the new URL>}
    """
    company = db.query(Company).filter(Company.id == company_id).first()
    if not company:
        raise HTTPException(404, "Company not found")

    logger.info(f"Manual Override for {company.name}: Setting Website to {url}")
    company.website = url

    # Remove old scrape data since URL changed
    # (bulk delete on the query; no ORM cascade events are needed here)
    db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company.id,
        EnrichmentData.source_type == "website_scrape"
    ).delete()

    db.commit()
    return {"status": "updated", "website": url}
|
||||
|
||||
def run_discovery_task(company_id: int):
|
||||
# New Session for Background Task
|
||||
from .database import SessionLocal
|
||||
@@ -182,27 +271,38 @@ def run_discovery_task(company_id: int):
|
||||
|
||||
logger.info(f"Running Discovery Task for {company.name}")
|
||||
|
||||
# 1. Website Search
|
||||
# 1. Website Search (Always try if missing)
|
||||
if not company.website or company.website == "k.A.":
|
||||
found_url = discovery.find_company_website(company.name, company.city)
|
||||
if found_url and found_url != "k.A.":
|
||||
company.website = found_url
|
||||
logger.info(f"-> Found URL: {found_url}")
|
||||
|
||||
# 2. Wikipedia Search
|
||||
wiki_url = discovery.find_wikipedia_url(company.name)
|
||||
company.last_wiki_search_at = datetime.utcnow()
|
||||
|
||||
# 2. Wikipedia Search & Extraction
|
||||
# Check if locked
|
||||
existing_wiki = db.query(EnrichmentData).filter(
|
||||
EnrichmentData.company_id == company.id,
|
||||
EnrichmentData.source_type == "wikipedia_url"
|
||||
EnrichmentData.source_type == "wikipedia"
|
||||
).first()
|
||||
|
||||
if not existing_wiki:
|
||||
db.add(EnrichmentData(company_id=company.id, source_type="wikipedia_url", content={"url": wiki_url}))
|
||||
|
||||
if existing_wiki and existing_wiki.is_locked:
|
||||
logger.info(f"Skipping Wiki Discovery for {company.name} - Data is LOCKED.")
|
||||
else:
|
||||
existing_wiki.content = {"url": wiki_url}
|
||||
existing_wiki.updated_at = datetime.utcnow()
|
||||
# Pass available info for better validation
|
||||
current_website = company.website if company.website and company.website != "k.A." else None
|
||||
wiki_url = discovery.find_wikipedia_url(company.name, website=current_website, city=company.city)
|
||||
company.last_wiki_search_at = datetime.utcnow()
|
||||
|
||||
wiki_data = {"url": wiki_url}
|
||||
if wiki_url and wiki_url != "k.A.":
|
||||
logger.info(f"Extracting full data from Wikipedia for {company.name}...")
|
||||
wiki_data = discovery.extract_wikipedia_data(wiki_url)
|
||||
|
||||
if not existing_wiki:
|
||||
db.add(EnrichmentData(company_id=company.id, source_type="wikipedia", content=wiki_data))
|
||||
else:
|
||||
existing_wiki.content = wiki_data
|
||||
existing_wiki.updated_at = datetime.utcnow()
|
||||
|
||||
if company.status == "NEW" and company.website and company.website != "k.A.":
|
||||
company.status = "DISCOVERED"
|
||||
|
||||
@@ -77,13 +77,30 @@ class EnrichmentData(Base):
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
company_id = Column(Integer, ForeignKey("companies.id"))
|
||||
|
||||
source_type = Column(String) # "website_scrape", "wikipedia_api", "google_serp"
|
||||
source_type = Column(String) # "website_scrape", "wikipedia", "google_serp"
|
||||
content = Column(JSON) # The raw data
|
||||
is_locked = Column(Boolean, default=False) # Manual override flag
|
||||
|
||||
created_at = Column(DateTime, default=datetime.utcnow)
|
||||
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
company = relationship("Company", back_populates="enrichment_data")
|
||||
|
||||
|
||||
class RoboticsCategory(Base):
    """
    Stores definitions for robotics categories to allow user customization via UI.

    The description and reasoning_guide fields are injected verbatim into the
    LLM classification prompt (see ClassificationService._get_category_prompts).
    """
    __tablename__ = "robotics_categories"

    id = Column(Integer, primary_key=True, index=True)
    key = Column(String, unique=True, index=True)  # e.g. "cleaning", "service"
    name = Column(String)  # Display Name
    description = Column(Text)  # The core definition used in LLM prompts
    reasoning_guide = Column(Text)  # Instructions for the Chain-of-Thought scoring

    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
class ImportLog(Base):
|
||||
"""
|
||||
Logs bulk imports (e.g. from Excel lists).
|
||||
@@ -104,6 +121,47 @@ class ImportLog(Base):
|
||||
|
||||
def init_db():
    """Create any missing tables, then seed the default robotics categories."""
    Base.metadata.create_all(bind=engine)
    init_robotics_defaults()
|
||||
|
||||
def init_robotics_defaults():
    """Seeds the database with default robotics categories if empty.

    Idempotent: inserts only when the table has no rows, so it is safe to
    call on every startup. On failure the session is rolled back before
    being closed, so the connection is returned to the pool in a clean state
    (the original version skipped the rollback, leaving a dirty session).
    """
    db = SessionLocal()
    try:
        if db.query(RoboticsCategory).count() == 0:
            defaults = [
                {
                    "key": "cleaning",
                    "name": "Cleaning Robots",
                    "description": "Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)",
                    "reasoning_guide": "High (80-100): Large industrial floors, shopping malls, hospitals, airports. Medium (40-79): Mid-sized production, large offices, supermarkets. Low (0-39): Small offices, software consultancies."
                },
                {
                    "key": "transport",
                    "name": "Intralogistics / Transport",
                    "description": "Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)",
                    "reasoning_guide": "High: Manufacturing, E-Commerce fulfillment, Hospitals. Low: Pure service providers, law firms."
                },
                {
                    "key": "security",
                    "name": "Security & Surveillance",
                    "description": "Do they have large perimeters, solar parks, wind farms, or night patrols? (Keywords: Werkschutz, Security, Monitoring)",
                    "reasoning_guide": "High: Critical infrastructure, large open-air storage, factories with valuable assets, 24/7 operations. Medium: Standard corporate HQs. Low: Offices in shared buildings."
                },
                {
                    "key": "service",
                    "name": "Service / Waiter Robots",
                    "description": "Do they operate restaurants, nursing homes, or event venues where food/items need to be served to people?",
                    "reasoning_guide": "High: Restaurants, Hotels (Room Service), Nursing Homes (Meal delivery). Low: B2B manufacturing, closed offices, pure installation services."
                }
            ]
            # add_all replaces the manual per-row add() loop.
            db.add_all([RoboticsCategory(**d) for d in defaults])
            db.commit()
            print("Seeded Robotics Categories.")
    except Exception as e:
        db.rollback()  # leave the session clean before close()
        print(f"Error seeding robotics defaults: {e}")
    finally:
        db.close()
|
||||
|
||||
def get_db():
|
||||
db = SessionLocal()
|
||||
|
||||
@@ -3,8 +3,11 @@ import logging
|
||||
import random
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
from urllib.parse import urlparse
|
||||
from functools import wraps
|
||||
from typing import Optional, Union, List
|
||||
from thefuzz import fuzz
|
||||
|
||||
# Versuche neue Google GenAI Lib (v1.0+)
|
||||
try:
|
||||
@@ -64,6 +67,10 @@ def clean_text(text: str) -> str:
|
||||
if not text:
|
||||
return ""
|
||||
text = str(text).strip()
|
||||
# Normalize unicode characters
|
||||
text = unicodedata.normalize('NFKC', text)
|
||||
# Remove control characters
|
||||
text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text
|
||||
|
||||
@@ -71,8 +78,104 @@ def normalize_string(s: str) -> str:
|
||||
"""Basic normalization (lowercase, stripped)."""
|
||||
return s.lower().strip() if s else ""
|
||||
|
||||
def simple_normalize_url(url: str) -> str:
    """Normalizes a URL to its core domain (e.g. 'https://www.example.com/foo' -> 'example.com')."""
    # Empty values and the project's "no data" sentinels map to "k.A.".
    if not url or url.lower() in ("k.a.", "nan", "none"):
        return "k.A."

    # urlparse only populates netloc when a scheme is present.
    candidate = url if url.startswith(('http://', 'https://')) else 'http://' + url

    try:
        parts = urlparse(candidate)
        # Fall back to the path component for degenerate inputs.
        host = parts.netloc if parts.netloc else parts.path

        # Drop a leading "www." prefix, then lowercase the result.
        if host.startswith('www.'):
            host = host[4:]
        return host.lower()
    except Exception:
        return "k.A."
|
||||
|
||||
def normalize_company_name(name: str) -> str:
    """Normalizes a company name by removing legal forms and special characters."""
    if not name:
        return ""

    result = name.lower()

    # Strip common legal forms (German and international) as whole words.
    for pattern in (
        r'\bgmbh\b', r'\bag\b', r'\bkg\b', r'\bohg\b', r'\bug\b', r'\bltd\b',
        r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b',
    ):
        result = re.sub(pattern, '', result)

    # Drop remaining punctuation, then collapse whitespace runs.
    result = re.sub(r'[^\w\s]', '', result)
    return re.sub(r'\s+', ' ', result).strip()
|
||||
|
||||
def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    """
    Extracts a numeric value from a string, handling 'Mio', 'Mrd', etc.

    Args:
        raw_value: Free text, e.g. "ca. 1,5 Mio. Euro" or "250 Mitarbeiter".
        is_umsatz: If True the result is expressed in millions (revenue
            convention); otherwise it is an absolute integer (e.g. headcount).

    Returns:
        String representation of the number or 'k.A.'.

    Bug fix: the previous regex (\\d+[.,]?\\d*) captured at most ONE separator
    group, so "1.000.000" was read as "1.000" -> 1.0 and the multi-dot
    thousands-separator fix below could never fire. The pattern now captures
    the full separated number.
    """
    if not raw_value:
        return "k.A."

    raw_value = str(raw_value).strip().lower()
    if raw_value in ["k.a.", "nan", "none"]:
        return "k.A."

    # Simple multiplier handling
    multiplier = 1.0
    if 'mrd' in raw_value or 'billion' in raw_value:
        multiplier = 1000.0 if is_umsatz else 1000000000.0
    elif 'mio' in raw_value or 'million' in raw_value:
        multiplier = 1.0 if is_umsatz else 1000000.0
    elif 'tsd' in raw_value or 'thousand' in raw_value:
        multiplier = 0.001 if is_umsatz else 1000.0

    # Extract the FULL number including all separator groups,
    # e.g. "1.000.000", "1.234,56", "123,45".
    matches = re.findall(r'(\d+(?:[.,]\d+)*)', raw_value)
    if not matches:
        return "k.A."

    try:
        # Take the first number found
        num_str = matches[0]
        if ',' in num_str and '.' in num_str:
            # Mixed separators (e.g. "1.234,56"): dots are thousands marks,
            # the comma is the decimal point.
            num_str = num_str.replace('.', '').replace(',', '.')
        else:
            num_str = num_str.replace(',', '.')
            # A single repeated separator (e.g. "1.000.000" or "1,000,000")
            # marks thousands only.
            if num_str.count('.') > 1:
                num_str = num_str.replace('.', '')

        val = float(num_str) * multiplier

        # Round appropriately
        if is_umsatz:
            # Return in millions, e.g. "250.5"
            return f"{val:.2f}".rstrip('0').rstrip('.')
        else:
            # Return integer for employees
            return str(int(val))

    except ValueError:
        return "k.A."
|
||||
|
||||
def fuzzy_similarity(str1: str, str2: str) -> float:
    """Returns fuzzy similarity between two strings (0.0 to 1.0)."""
    # Either side missing/empty -> no similarity.
    if str1 and str2:
        # fuzz.ratio yields 0..100; scale down to 0..1.
        return fuzz.ratio(str1, str2) / 100.0
    return 0.0
|
||||
|
||||
# ==============================================================================
|
||||
# 3. LLM WRAPPER (GEMINI)
|
||||
|
||||
# ==============================================================================
|
||||
|
||||
@retry_on_failure(max_retries=3)
|
||||
|
||||
@@ -4,6 +4,7 @@ import os
|
||||
from typing import Dict, Any, List
|
||||
from ..lib.core_utils import call_gemini
|
||||
from ..config import settings
|
||||
from ..database import SessionLocal, RoboticsCategory
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -21,6 +22,27 @@ class ClassificationService:
|
||||
logger.error(f"Failed to load allowed industries: {e}")
|
||||
return ["Sonstige"]
|
||||
|
||||
def _get_category_prompts(self) -> str:
    """
    Fetches the latest category definitions from the database.

    Builds one markdown bullet per category (definition + scoring guide)
    for injection into the classification prompt. Returns an error string
    rather than raising so prompt assembly never fails hard.
    """
    db = SessionLocal()
    try:
        categories = db.query(RoboticsCategory).all()
        if not categories:
            return "Error: No categories defined."

        bullets = [
            f"* **{cat.name} ({cat.key}):**\n - Definition: {cat.description}\n - Scoring Guide: {cat.reasoning_guide}"
            for cat in categories
        ]
        return "\n".join(bullets)
    except Exception as e:
        logger.error(f"Error fetching categories: {e}")
        return "Error loading categories."
    finally:
        db.close()
|
||||
|
||||
def analyze_robotics_potential(self, company_name: str, website_text: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyzes the company for robotics potential based on website content.
|
||||
@@ -28,36 +50,49 @@ class ClassificationService:
|
||||
"""
|
||||
if not website_text or len(website_text) < 100:
|
||||
return {"error": "Insufficient text content"}
|
||||
|
||||
category_guidance = self._get_category_prompts()
|
||||
|
||||
prompt = f"""
|
||||
You are a Senior B2B Market Analyst for 'Roboplanet', a robotics distributor.
|
||||
Your job is to analyze a target company based on their website text and determine their potential for using robots.
|
||||
You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor.
|
||||
Your task is to analyze a target company based on their website text to determine their **operational need** for service robotics.
|
||||
|
||||
--- TARGET COMPANY ---
|
||||
Name: {company_name}
|
||||
Website Content (Excerpt):
|
||||
{website_text[:15000]}
|
||||
{website_text[:20000]}
|
||||
|
||||
--- ALLOWED INDUSTRIES (STRICT) ---
|
||||
You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
|
||||
{json.dumps(self.allowed_industries, ensure_ascii=False)}
|
||||
|
||||
--- ANALYSIS TASKS ---
|
||||
1. **Industry Classification:** Pick one from the list.
|
||||
2. **Robotics Potential Scoring (0-100):**
|
||||
- **Cleaning:** Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)
|
||||
- **Transport/Logistics:** Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)
|
||||
- **Security:** Do they have large perimeters or night patrols? (Keywords: Werkschutz, Security, Monitoring)
|
||||
- **Service:** Do they interact with guests/patients? (Keywords: Reception, Restaurant, Nursing)
|
||||
--- ANALYSIS GUIDELINES (CHAIN OF THOUGHT) ---
|
||||
1. **Infrastructure Analysis:** What physical assets does this company likely operate based on their business model?
|
||||
- Factories / Production Plants? (-> Needs Cleaning, Security, Intralogistics)
|
||||
- Large Warehouses? (-> Needs Intralogistics, Security, Floor Washing)
|
||||
- Offices / Headquarters? (-> Needs Vacuuming, Window Cleaning)
|
||||
- Critical Infrastructure (Solar Parks, Wind Farms)? (-> Needs Perimeter Security, Inspection)
|
||||
- Hotels / Hospitals? (-> Needs Service, Cleaning, Transport)
|
||||
|
||||
3. **Explanation:** A short, strategic reason for the scoring (German).
|
||||
2. **Provider vs. User Distinction (CRITICAL):**
|
||||
- If a company SELLS cleaning products (e.g., 3M, Henkel), they do NOT necessarily have a higher need for cleaning robots than any other manufacturer. Do not score them high just because the word "cleaning" appears. Score them based on their *factories*.
|
||||
- If a company SELLS security services, they might be a potential PARTNER, but check if they *manage* sites.
|
||||
|
||||
3. **Scale Assessment:**
|
||||
- 5 locations implies more need than 1.
|
||||
- "Global player" implies large facilities.
|
||||
|
||||
--- SCORING CATEGORIES (0-100) ---
|
||||
Based on the current strategic focus of Roboplanet:
|
||||
|
||||
{category_guidance}
|
||||
|
||||
--- OUTPUT FORMAT (JSON ONLY) ---
|
||||
{{
|
||||
"industry": "String (from list)",
|
||||
"summary": "Short business summary (German)",
|
||||
"summary": "Concise analysis of their infrastructure and business model (German)",
|
||||
"potentials": {{
|
||||
"cleaning": {{ "score": 0-100, "reason": "..." }},
|
||||
"cleaning": {{ "score": 0-100, "reason": "Specific reasoning based on infrastructure (e.g. 'Operates 5 production plants in DE')." }},
|
||||
"transport": {{ "score": 0-100, "reason": "..." }},
|
||||
"security": {{ "score": 0-100, "reason": "..." }},
|
||||
"service": {{ "score": 0-100, "reason": "..." }}
|
||||
@@ -69,7 +104,7 @@ class ClassificationService:
|
||||
response_text = call_gemini(
|
||||
prompt=prompt,
|
||||
json_mode=True,
|
||||
temperature=0.2 # Low temp for consistency
|
||||
temperature=0.1 # Very low temp for analytical reasoning
|
||||
)
|
||||
return json.loads(response_text)
|
||||
except Exception as e:
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import Optional, Dict, Tuple
|
||||
from urllib.parse import urlparse
|
||||
from ..config import settings
|
||||
from ..lib.core_utils import retry_on_failure, normalize_string
|
||||
from .wikipedia_service import WikipediaService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -21,6 +22,9 @@ class DiscoveryService:
|
||||
self.api_key = settings.SERP_API_KEY
|
||||
if not self.api_key:
|
||||
logger.warning("SERP_API_KEY not set. Discovery features will fail.")
|
||||
|
||||
# Initialize the specialized Wikipedia Service
|
||||
self.wiki_service = WikipediaService()
|
||||
|
||||
@retry_on_failure(max_retries=2)
|
||||
def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
|
||||
@@ -67,42 +71,42 @@ class DiscoveryService:
|
||||
return "k.A."
|
||||
|
||||
@retry_on_failure(max_retries=2)
|
||||
def find_wikipedia_url(self, company_name: str) -> str:
|
||||
def find_wikipedia_url(self, company_name: str, website: str = None, city: str = None) -> str:
|
||||
"""
|
||||
Searches for a specific German Wikipedia article.
|
||||
Searches for a specific German Wikipedia article using the robust WikipediaService.
|
||||
Includes validation via website domain and city.
|
||||
"""
|
||||
if not self.api_key:
|
||||
return "k.A."
|
||||
|
||||
query = f"{company_name} Wikipedia"
|
||||
|
||||
try:
|
||||
params = {
|
||||
"engine": "google",
|
||||
"q": query,
|
||||
"api_key": self.api_key,
|
||||
"num": 3,
|
||||
"gl": "de",
|
||||
"hl": "de"
|
||||
}
|
||||
response = requests.get("https://serpapi.com/search", params=params, timeout=15)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
for result in data.get("organic_results", []):
|
||||
link = result.get("link", "")
|
||||
if "de.wikipedia.org/wiki/" in link:
|
||||
# Basic validation: Is the title roughly the company?
|
||||
title = result.get("title", "").replace(" – Wikipedia", "")
|
||||
if self._check_name_similarity(company_name, title):
|
||||
return link
|
||||
# Delegate to the robust service
|
||||
# parent_name could be added if available in the future
|
||||
page = self.wiki_service.search_company_article(
|
||||
company_name=company_name,
|
||||
website=website,
|
||||
crm_city=city
|
||||
)
|
||||
|
||||
if page:
|
||||
return page.url
|
||||
|
||||
return "k.A."
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Wiki Search Error: {e}")
|
||||
logger.error(f"Wiki Search Error via Service: {e}")
|
||||
return "k.A."
|
||||
|
||||
def extract_wikipedia_data(self, url: str) -> dict:
    """
    Extracts full company data from a given Wikipedia URL.

    Delegates to WikipediaService.extract_company_data. Never raises: on
    failure a minimal dict containing the url and an 'error' message is
    returned so callers can persist the attempt.
    """
    try:
        return self.wiki_service.extract_company_data(url)
    except Exception as e:
        logger.error(f"Wiki Extraction Error for {url}: {e}")
        return {"url": url, "error": str(e)}
|
||||
|
||||
def _is_credible_url(self, url: str) -> bool:
|
||||
"""Filters out social media, directories, and junk."""
|
||||
if not url: return False
|
||||
@@ -118,9 +122,3 @@ class DiscoveryService:
|
||||
except:
|
||||
return False
|
||||
|
||||
def _check_name_similarity(self, name1: str, name2: str) -> bool:
    """Simple fuzzy check for validation."""
    a = normalize_string(name1)
    b = normalize_string(name2)
    # Very permissive: accept containment in either direction.
    return (a in b) or (b in a)
|
||||
|
||||
448
company-explorer/backend/services/wikipedia_service.py
Normal file
448
company-explorer/backend/services/wikipedia_service.py
Normal file
@@ -0,0 +1,448 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
wikipedia_service.py
|
||||
|
||||
Service class for interacting with Wikipedia, including search,
|
||||
validation, and extraction of company data.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import unquote
|
||||
|
||||
import requests
|
||||
import wikipedia
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Import settings and helpers
|
||||
from ..config import settings
|
||||
from ..lib.core_utils import (
|
||||
retry_on_failure,
|
||||
simple_normalize_url,
|
||||
normalize_company_name,
|
||||
extract_numeric_value,
|
||||
clean_text,
|
||||
fuzzy_similarity
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class WikipediaService:
|
||||
"""
|
||||
Handles searching for Wikipedia articles and extracting relevant
|
||||
company data. Includes validation logic for articles.
|
||||
"""
|
||||
def __init__(self, user_agent=None):
    """
    Initialize the scraper with a requests session.

    Args:
        user_agent: Optional custom User-Agent header; a bot-identifying
            default is used when omitted.
    """
    self.user_agent = user_agent or 'Mozilla/5.0 (compatible; CompanyExplorer/1.0; +http://www.example.com/bot)'
    self.session = requests.Session()
    self.session.headers.update({'User-Agent': self.user_agent})

    # Synonyms used to match infobox row labels to canonical fields
    # (German and English variants).
    self.keywords_map = {
        'branche': ['branche', 'wirtschaftszweig', 'industry', 'taetigkeit', 'sektor', 'produkte', 'leistungen'],
        'umsatz': ['umsatz', 'erloes', 'revenue', 'jahresumsatz', 'konzernumsatz', 'ergebnis'],
        'mitarbeiter': ['mitarbeiter', 'mitarbeiterzahl', 'beschaeftigte', 'employees', 'number of employees', 'personal', 'belegschaft'],
        'sitz': ['sitz', 'hauptsitz', 'unternehmenssitz', 'firmensitz', 'headquarters', 'standort', 'sitz des unternehmens', 'anschrift', 'adresse']
    }

    # NOTE: wikipedia.set_lang / set_rate_limiting mutate module-level state,
    # so the setting is shared by all WikipediaService instances in the process.
    try:
        # Default to German for now, could be configurable
        wiki_lang = 'de'
        wikipedia.set_lang(wiki_lang)
        wikipedia.set_rate_limiting(False)
        logger.info(f"Wikipedia library language set to '{wiki_lang}'. Rate limiting DISABLED.")
    except Exception as e:
        logger.warning(f"Error setting Wikipedia language or rate limiting: {e}")
|
||||
|
||||
@retry_on_failure(max_retries=3)
def serp_wikipedia_lookup(self, company_name: str, lang: str = 'de') -> "str | None":
    """
    Searches for the best Wikipedia URL for a company using Google Search (via SerpAPI).
    Prioritizes Knowledge Graph hits and then organic results.

    Args:
        company_name (str): The name of the company to search for.
        lang (str): The language code for Wikipedia search (e.g., 'de').

    Returns:
        str | None: The URL of the best hit or None if nothing suitable was found.
    """
    logger.info(f"Starting SerpAPI Wikipedia search for '{company_name}'...")
    serp_key = settings.SERP_API_KEY
    if not serp_key:
        logger.warning("SerpAPI Key not configured. Skipping search.")
        return None

    # Quoted company name restricted to the language-specific Wikipedia domain.
    query = f'site:{lang}.wikipedia.org "{company_name}"'
    params = {"engine": "google", "q": query, "api_key": serp_key, "hl": lang}

    try:
        response = requests.get("https://serpapi.com/search", params=params, timeout=15)
        response.raise_for_status()
        data = response.json()

        # 1. Check Knowledge Graph (highest priority)
        if "knowledge_graph" in data and "source" in data["knowledge_graph"]:
            source = data["knowledge_graph"]["source"]
            if "link" in source and f"{lang}.wikipedia.org" in source["link"]:
                url = source["link"]
                logger.info(f" -> Hit found in Knowledge Graph: {url}")
                return url

        # 2. Check organic results (first Wikipedia link in ranking order wins)
        if "organic_results" in data:
            for result in data.get("organic_results", []):
                link = result.get("link")
                if link and f"{lang}.wikipedia.org/wiki/" in link:
                    logger.info(f" -> Best organic hit found: {link}")
                    return link

        logger.warning(f" -> No suitable Wikipedia URL found for '{company_name}' in SerpAPI results.")
        return None
    except Exception as e:
        # NOTE(review): this broad except also swallows JSON/key errors and
        # returns None, so the retry decorator never sees them — confirm intended.
        logger.error(f"Error during SerpAPI request for '{company_name}': {e}")
        return None
|
||||
|
||||
@retry_on_failure(max_retries=3)
def _get_page_soup(self, url: str) -> "BeautifulSoup | None":
    """
    Fetches HTML from a URL and returns a BeautifulSoup object.

    Returns None for syntactically invalid URLs. Network/parse errors are
    re-raised so the retry_on_failure decorator can retry the fetch.
    """
    if not url or not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
        logger.warning(f"_get_page_soup: Invalid URL '{str(url)[:100]}...'")
        return None
    try:
        response = self.session.get(url, timeout=15)
        response.raise_for_status()
        # Handle encoding: trust the detected (apparent) encoding over headers.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    except Exception as e:
        logger.error(f"_get_page_soup: Error fetching or parsing HTML from {str(url)[:100]}...: {e}")
        raise e
|
||||
|
||||
def _extract_first_paragraph_from_soup(self, soup: BeautifulSoup) -> str:
    """
    Extracts the first meaningful paragraph from the Wikipedia article soup.
    Mimics the sophisticated cleaning from the legacy system.

    Returns:
        The first cleaned paragraph (capped at 2000 chars), or "k.A." when
        no suitable paragraph is found or on error.
    """
    if not soup: return "k.A."
    paragraph_text = "k.A."
    try:
        # Article body lives under .mw-parser-output; fall back to whole page.
        content_div = soup.find('div', class_='mw-parser-output')
        search_area = content_div if content_div else soup
        # Prefer direct children first (skips paragraphs nested in boxes/tables).
        paragraphs = search_area.find_all('p', recursive=False)
        if not paragraphs: paragraphs = search_area.find_all('p')

        for p in paragraphs:
            # Remove references [1], [2], etc.
            for sup in p.find_all('sup', class_='reference'): sup.decompose()
            # Remove hidden spans
            for span in p.find_all('span', style=lambda v: v and 'display:none' in v): span.decompose()
            # Remove coordinates
            for span in p.find_all('span', id='coordinates'): span.decompose()

            text = clean_text(p.get_text(separator=' ', strip=True))

            # Filter out meta-paragraphs or too short ones; take the first survivor.
            if text != "k.A." and len(text) > 50 and not re.match(r'^(Datei:|Abbildung:|Siehe auch:|Einzelnachweise|Siehe auch|Literatur)', text, re.IGNORECASE):
                paragraph_text = text[:2000]  # Limit length
                break
    except Exception as e:
        logger.error(f"Error extracting first paragraph: {e}")
    return paragraph_text
|
||||
|
||||
def extract_categories(self, soup: BeautifulSoup) -> str:
    """
    Extracts Wikipedia categories from the soup object, filtering out meta-categories.

    Returns:
        Comma-separated category names, or "k.A." when none are found.
    """
    if not soup: return "k.A."
    cats_filtered = []
    try:
        # "mw-normal-catlinks" is the visible category bar at the page bottom.
        cat_div = soup.find('div', id="mw-normal-catlinks")
        if cat_div:
            ul = cat_div.find('ul')
            if ul:
                cats = [clean_text(li.get_text()) for li in ul.find_all('li')]
                # Drop empties and the "Kategorien:" label itself.
                cats_filtered = [c for c in cats if c and isinstance(c, str) and c.strip() and "kategorien:" not in c.lower()]
    except Exception as e:
        logger.error(f"Error extracting categories: {e}")
    return ", ".join(cats_filtered) if cats_filtered else "k.A."
|
||||
|
||||
def _validate_article(self, page, company_name: str, website: str, crm_city: str, parent_name: str = None) -> bool:
    """Fact-based check whether a Wikipedia article matches the company.

    Evidence is evaluated strongest-first; the first hit wins:
      1. Domain  - the company's normalized domain appears in external/infobox links.
      2. City    - the CRM city occurs in the infobox "Sitz" value.
      3. Parent  - the normalized parent name occurs in title + summary.
      4. Name    - fuzzy title/name similarity above a strict 0.85 threshold.

    Returns True when any stage validates, False otherwise (including when
    the page object is unusable or its HTML cannot be parsed).
    """
    if not page or not hasattr(page, 'html'):
        return False

    logger.debug(f"Validating article '{page.title}' for company '{company_name}'...")

    try:
        soup = BeautifulSoup(page.html(), 'html.parser')
    except Exception as e:
        logger.error(f"Could not parse HTML for article '{page.title}': {e}")
        return False

    # --- Stage 1: Website Domain Validation (very strong signal) ---
    normalized_domain = simple_normalize_url(website)
    if normalized_domain != "k.A.":
        # Scan external links and infobox anchors for the company domain.
        for anchor in soup.select('.external, .infobox a[href*="."]'):
            if normalized_domain in anchor.get('href', ''):
                logger.info(f" => VALIDATION SUCCESS (Domain Match): Domain '{normalized_domain}' found in links.")
                return True

    # --- Stage 2: City Validation (strong signal) ---
    if crm_city and crm_city.lower() != 'k.a.':
        infobox_sitz_raw = self._extract_infobox_value(soup, 'sitz')
        if (infobox_sitz_raw and infobox_sitz_raw.lower() != 'k.a.'
                and crm_city.lower() in infobox_sitz_raw.lower()):
            logger.info(f" => VALIDATION SUCCESS (City Match): CRM City '{crm_city}' found in Infobox City '{infobox_sitz_raw}'.")
            return True

    # --- Stage 3: Parent Validation ---
    normalized_parent = normalize_company_name(parent_name) if parent_name else None
    if normalized_parent:
        haystack = (page.title + " " + page.summary).lower()
        if normalized_parent in haystack:
            logger.info(f" => VALIDATION SUCCESS (Parent Match): Parent Name '{parent_name}' found in article.")
            return True

    # --- Stage 4: Name Similarity (Fallback with stricter rules) ---
    similarity = fuzzy_similarity(
        normalize_company_name(page.title),
        normalize_company_name(company_name),
    )
    if similarity > 0.85:  # Stricter threshold
        logger.info(f" => VALIDATION SUCCESS (High Similarity): High name similarity ({similarity:.2f}).")
        return True

    logger.debug(f" => VALIDATION FAILED: No hard fact (Domain, City, Parent) and similarity ({similarity:.2f}) too low.")
    return False
|
||||
|
||||
def search_company_article(self, company_name: str, website: str = None, crm_city: str = None, parent_name: str = None):
    """Find and validate a Wikipedia article using the 'Google-First' strategy.

    Steps:
      1. Ask SerpAPI for the most promising Wikipedia URL.
      2. Load that page and confirm it with the fact-based validator.

    Returns:
        The validated wikipedia page object, or None when no candidate URL
        was found, the page could not be loaded, or validation failed.
    """
    if not company_name:
        return None

    logger.info(f"Starting 'Google-First' Wikipedia search for '{company_name}'...")

    # Step 1: best URL candidate via Google Search.
    url_candidate = self.serp_wikipedia_lookup(company_name)
    if not url_candidate:
        logger.warning(" -> No URL found via SerpAPI. Search aborted.")
        return None

    # Step 2: load and validate the found article.
    try:
        title = unquote(url_candidate.split('/wiki/')[-1].replace('_', ' '))
        page = wikipedia.page(title=title, auto_suggest=False, redirect=True)

        # Fact-based validation (domain / city / parent / similarity).
        if not self._validate_article(page, company_name, website, crm_city, parent_name):
            logger.warning(f" -> Article '{page.title}' could not be validated.")
            return None

        logger.info(f" -> Article '{page.title}' successfully validated.")
        return page
    except wikipedia.exceptions.PageError:
        logger.error(f" -> Error: Found URL '{url_candidate}' did not lead to a valid Wikipedia page.")
        return None
    except Exception as e:
        logger.error(f" -> Unexpected error processing page '{url_candidate}': {e}")
        return None
|
||||
|
||||
def _extract_infobox_value(self, soup: BeautifulSoup, target: str) -> str:
    """
    Targetedly extracts values (Industry, Revenue, etc.) from the infobox.

    Scans the first table whose class contains "infobox" row by row, looking
    for a header cell matching one of the configured keywords for *target*
    (from self.keywords_map), then parses the adjacent value cell according
    to the target kind.

    Args:
        soup: Parsed article HTML; may be None.
        target: One of the keys in self.keywords_map
            (e.g. 'branche', 'umsatz', 'mitarbeiter', 'sitz').

    Returns:
        The extracted value as a string, or "k.A." when the soup/target is
        invalid, no infobox or matching row exists, or parsing fails.
    """
    if not soup or target not in self.keywords_map:
        return "k.A."
    keywords = self.keywords_map[target]
    infobox = soup.select_one('table[class*="infobox"]')
    if not infobox: return "k.A."

    value_found = "k.A."
    try:
        rows = infobox.find_all('tr')
        for row in rows:
            # Only direct children, so nested tables don't pollute the row.
            cells = row.find_all(['th', 'td'], recursive=False)
            header_text, value_cell = None, None

            if len(cells) >= 2:
                if cells[0].name == 'th':
                    # Canonical layout: <th>label</th><td>value</td>.
                    header_text, value_cell = cells[0].get_text(strip=True), cells[1]
                elif cells[0].name == 'td' and cells[1].name == 'td':
                    # Some infoboxes fake a header with a bold-styled <td>;
                    # treat bold inline style or <b>/<strong> children as a header.
                    style = cells[0].get('style', '').lower()
                    is_header_like = 'font-weight' in style and ('bold' in style or '700' in style) or cells[0].find(['b', 'strong'], recursive=False)
                    if is_header_like:
                        header_text, value_cell = cells[0].get_text(strip=True), cells[1]

            if header_text and value_cell:
                if any(kw in header_text.lower() for kw in keywords):
                    # Strip footnote markers and auxiliary spans before reading text.
                    for sup in value_cell.find_all(['sup', 'span']):
                        sup.decompose()

                    raw_value_text = value_cell.get_text(separator=' ', strip=True)

                    # Target-specific parsing: free text for branche/sitz,
                    # numeric extraction for revenue/employee counts.
                    if target == 'branche' or target == 'sitz':
                        value_found = clean_text(raw_value_text).split('\n')[0].strip()
                    elif target == 'umsatz':
                        value_found = extract_numeric_value(raw_value_text, is_umsatz=True)
                    elif target == 'mitarbeiter':
                        value_found = extract_numeric_value(raw_value_text, is_umsatz=False)

                    value_found = value_found if value_found else "k.A."
                    logger.info(f" --> Infobox '{target}' found: '{value_found}'")
                    # First matching row wins.
                    break
    except Exception as e:
        logger.error(f"Error iterating infobox rows for '{target}': {e}")
        return "k.A."

    return value_found
|
||||
|
||||
def _parse_sitz_string_detailed(self, raw_sitz_string_input: str) -> dict:
|
||||
"""
|
||||
Attempts to extract City and Country in detail from a raw Sitz string.
|
||||
"""
|
||||
sitz_stadt_val, sitz_land_val = "k.A.", "k.A."
|
||||
if not raw_sitz_string_input or not isinstance(raw_sitz_string_input, str):
|
||||
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
||||
|
||||
temp_sitz = raw_sitz_string_input.strip()
|
||||
if not temp_sitz or temp_sitz.lower() == "k.a.":
|
||||
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
||||
|
||||
known_countries_detailed = {
|
||||
"deutschland": "Deutschland", "germany": "Deutschland", "de": "Deutschland",
|
||||
"österreich": "Österreich", "austria": "Österreich", "at": "Österreich",
|
||||
"schweiz": "Schweiz", "switzerland": "Schweiz", "ch": "Schweiz", "suisse": "Schweiz",
|
||||
"usa": "USA", "u.s.": "USA", "united states": "USA", "vereinigte staaten": "USA",
|
||||
"vereinigtes königreich": "Vereinigtes Königreich", "united kingdom": "Vereinigtes Königreich", "uk": "Vereinigtes Königreich",
|
||||
}
|
||||
region_to_country = {
|
||||
"nrw": "Deutschland", "nordrhein-westfalen": "Deutschland", "bayern": "Deutschland", "hessen": "Deutschland",
|
||||
"zg": "Schweiz", "zug": "Schweiz", "zh": "Schweiz", "zürich": "Schweiz",
|
||||
"ca": "USA", "california": "USA", "ny": "USA", "new york": "USA",
|
||||
}
|
||||
|
||||
extracted_country = ""
|
||||
original_temp_sitz = temp_sitz
|
||||
|
||||
klammer_match = re.search(r'\(([^)]+)\)$', temp_sitz)
|
||||
if klammer_match:
|
||||
suffix_in_klammer = klammer_match.group(1).strip().lower()
|
||||
if suffix_in_klammer in known_countries_detailed:
|
||||
extracted_country = known_countries_detailed[suffix_in_klammer]
|
||||
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
||||
elif suffix_in_klammer in region_to_country:
|
||||
extracted_country = region_to_country[suffix_in_klammer]
|
||||
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
||||
|
||||
if not extracted_country and ',' in temp_sitz:
|
||||
parts = [p.strip() for p in temp_sitz.split(',')]
|
||||
if len(parts) > 1:
|
||||
last_part_lower = parts[-1].lower()
|
||||
if last_part_lower in known_countries_detailed:
|
||||
extracted_country = known_countries_detailed[last_part_lower]
|
||||
temp_sitz = ", ".join(parts[:-1]).strip(" ,")
|
||||
elif last_part_lower in region_to_country:
|
||||
extracted_country = region_to_country[last_part_lower]
|
||||
temp_sitz = ", ".join(parts[:-1]).strip(" ,")
|
||||
|
||||
sitz_land_val = extracted_country if extracted_country else "k.A."
|
||||
sitz_stadt_val = re.sub(r'^\d{4,8}\s*', '', temp_sitz).strip(" ,")
|
||||
|
||||
if not sitz_stadt_val:
|
||||
sitz_stadt_val = "k.A." if sitz_land_val != "k.A." else re.sub(r'^\d{4,8}\s*', '', original_temp_sitz).strip(" ,") or "k.A."
|
||||
|
||||
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
||||
|
||||
@retry_on_failure(max_retries=3)
def extract_company_data(self, url_or_page) -> dict:
    """
    Extracts structured company data from a Wikipedia article (URL or page object).

    Args:
        url_or_page: Either a full Wikipedia article URL (str containing
            "wikipedia.org") or an already-loaded wikipedia page object.

    Returns:
        dict with keys 'url', 'title', 'sitz_stadt', 'sitz_land',
        'first_paragraph', 'branche', 'umsatz', 'mitarbeiter', 'categories',
        'full_text'. Unavailable values are "k.A." ('' for 'full_text').
        Errors are caught and logged; on failure the default result is
        returned (with 'url' preserved when the input was a string).
    """
    default_result = {
        'url': 'k.A.', 'title': 'k.A.', 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.',
        'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
        'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_text': ''
    }
    page = None

    try:
        if isinstance(url_or_page, str) and "wikipedia.org" in url_or_page:
            # Resolve the article title from the URL and load the page.
            page_title = unquote(url_or_page.split('/wiki/')[-1].replace('_', ' '))
            page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True)
        elif not isinstance(url_or_page, str):  # Assumption: it is a page object
            page = url_or_page
        else:
            # FIX: the warning previously had an unbalanced quote around the input.
            logger.warning(f"extract_company_data: Invalid Input '{str(url_or_page)[:100]}...'")
            return default_result

        logger.info(f"Extracting data for Wiki Article: {page.title[:100]}...")

        # Basic data from the page object; these serve as the fallback
        # values when the HTML soup cannot be loaded below.
        first_paragraph = page.summary.split('\n')[0] if page.summary else 'k.A.'
        categories = ", ".join(page.categories)
        full_text = page.content

        # BeautifulSoup needed for infobox and refined extraction
        soup = self._get_page_soup(page.url)
        if not soup:
            logger.warning(f" -> Could not load page for Soup parsing. Extracting basic data only.")
            # FIX: reuse the already-computed basic values instead of
            # recomputing page.summary / page.categories (identical result).
            return {
                **default_result,
                'url': page.url,
                'title': page.title,
                'first_paragraph': first_paragraph,
                'categories': categories,
                'full_text': full_text,
            }

        # Refined extraction from the soup (overrides the basic values).
        first_paragraph = self._extract_first_paragraph_from_soup(soup)
        categories = self.extract_categories(soup)

        # Extract infobox data
        branche_val = self._extract_infobox_value(soup, 'branche')
        umsatz_val = self._extract_infobox_value(soup, 'umsatz')
        mitarbeiter_val = self._extract_infobox_value(soup, 'mitarbeiter')
        raw_sitz_string = self._extract_infobox_value(soup, 'sitz')
        parsed_sitz = self._parse_sitz_string_detailed(raw_sitz_string)
        sitz_stadt_val = parsed_sitz['sitz_stadt']
        sitz_land_val = parsed_sitz['sitz_land']

        result = {
            'url': page.url,
            'title': page.title,
            'sitz_stadt': sitz_stadt_val,
            'sitz_land': sitz_land_val,
            'first_paragraph': first_paragraph,
            'branche': branche_val,
            'umsatz': umsatz_val,
            'mitarbeiter': mitarbeiter_val,
            'categories': categories,
            'full_text': full_text
        }

        logger.info(f" -> Extracted Data: City='{sitz_stadt_val}', Country='{sitz_land_val}', Rev='{umsatz_val}', Emp='{mitarbeiter_val}'")
        return result

    except wikipedia.exceptions.PageError:
        logger.error(f" -> Error: Wikipedia article for '{str(url_or_page)[:100]}' could not be found (PageError).")
        return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
    except Exception as e:
        logger.error(f" -> Unexpected error extracting from '{str(url_or_page)[:100]}': {e}")
        return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
Reference in New Issue
Block a user