feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides

- Ported robust Wikipedia extraction logic (categories, first paragraph) from legacy system.
- Implemented database-driven Robotics Category configuration with frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for Wikipedia URL (with locking) and Website URL (with re-scrape trigger).
- Enhanced Inspector UI with Wikipedia profile, category tags, and action buttons.
2026-01-08 10:08:21 +00:00
parent 6fda69a611
commit 565c56dc9a
12 changed files with 1320 additions and 160 deletions
--- a/company-explorer/backend/services/discovery.py
+++ b/company-explorer/backend/services/discovery.py
@@ -5,6 +5,7 @@ from typing import Optional, Dict, Tuple
 from urllib.parse import urlparse
 from ..config import settings
 from ..lib.core_utils import retry_on_failure, normalize_string
+from .wikipedia_service import WikipediaService

 logger = logging.getLogger(__name__)

@@ -21,6 +22,9 @@ class DiscoveryService:
        self.api_key = settings.SERP_API_KEY
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")
+        
+        # Initialize the specialized Wikipedia Service
+        self.wiki_service = WikipediaService()

    @retry_on_failure(max_retries=2)
    def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
@@ -67,42 +71,42 @@ class DiscoveryService:
            return "k.A."

    @retry_on_failure(max_retries=2)
-    def find_wikipedia_url(self, company_name: str) -> str:
+    def find_wikipedia_url(self, company_name: str, website: str = None, city: str = None) -> str:
        """
-        Searches for a specific German Wikipedia article.
+        Searches for a specific German Wikipedia article using the robust WikipediaService.
+        Includes validation via website domain and city.
        """
        if not self.api_key:
            return "k.A."
            
-        query = f"{company_name} Wikipedia"
-        
        try:
-            params = {
-                "engine": "google",
-                "q": query,
-                "api_key": self.api_key,
-                "num": 3,
-                "gl": "de",
-                "hl": "de"
-            }
-            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
-            response.raise_for_status()
-            data = response.json()
-
-            for result in data.get("organic_results", []):
-                link = result.get("link", "")
-                if "de.wikipedia.org/wiki/" in link:
-                    # Basic validation: Is the title roughly the company?
-                    title = result.get("title", "").replace(" – Wikipedia", "")
-                    if self._check_name_similarity(company_name, title):
-                        return link
+            # Delegate to the robust service
+            # parent_name could be added if available in the future
+            page = self.wiki_service.search_company_article(
+                company_name=company_name,
+                website=website,
+                crm_city=city
+            )
+            
+            if page:
+                return page.url
            
            return "k.A."

        except Exception as e:
-            logger.error(f"Wiki Search Error: {e}")
+            logger.error(f"Wiki Search Error via Service: {e}")
            return "k.A."

+    def extract_wikipedia_data(self, url: str) -> dict:
+        """
+        Extracts full company data from a given Wikipedia URL.
+        """
+        try:
+            return self.wiki_service.extract_company_data(url)
+        except Exception as e:
+            logger.error(f"Wiki Extraction Error for {url}: {e}")
+            return {"url": url, "error": str(e)}
+
    def _is_credible_url(self, url: str) -> bool:
        """Filters out social media, directories, and junk."""
        if not url: return False
@@ -118,9 +122,3 @@ class DiscoveryService:
        except:
            return False

-    def _check_name_similarity(self, name1: str, name2: str) -> bool:
-        """Simple fuzzy check for validation."""
-        n1 = normalize_string(name1)
-        n2 = normalize_string(name2)
-        # Very permissive: if one is contained in the other
-        return n1 in n2 or n2 in n1