fix(ce): Resolve database schema mismatch and restore docs

- Fixed a critical schema mismatch bug in the company-explorer by forcing a database re-initialization with a new database file. This ensures the application code is in sync with the database schema.
- Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md.

- Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that was accidentally deleted. The guide now again includes detailed troubleshooting steps and code templates for common migration pitfalls.
This commit is contained in:
2026-01-15 15:54:45 +00:00
parent 4fcbbe3723
commit 4a336f6374
13 changed files with 724 additions and 555 deletions

View File

@@ -1,10 +1,11 @@
import logging
import requests
import re
from typing import Optional, Dict, Tuple
from typing import Optional, Dict, Tuple, Any
from urllib.parse import urlparse
from ..config import settings
from ..lib.core_utils import retry_on_failure, normalize_string
from ..lib.core_utils import retry_on_failure, normalize_string, normalize_company_name, simple_normalize_url
from .wikipedia_service import WikipediaService
logger = logging.getLogger(__name__)
@@ -23,7 +24,6 @@ class DiscoveryService:
if not self.api_key:
logger.warning("SERP_API_KEY not set. Discovery features will fail.")
# Initialize the specialized Wikipedia Service
self.wiki_service = WikipediaService()
@retry_on_failure(max_retries=2)
@@ -60,42 +60,31 @@ class DiscoveryService:
for result in data["organic_results"]:
link = result.get("link", "")
if self._is_credible_url(link):
# Simple heuristic: If the company name is part of the domain, high confidence
# Otherwise, take the first credible result.
return link
return "k.A."
except Exception as e:
logger.error(f"SerpAPI Error: {e}")
logger.error(f"SerpAPI Error: {e}", exc_info=True)
return "k.A."
@retry_on_failure(max_retries=2)
def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str:
    """
    Search for a specific German Wikipedia article for a company.

    Delegates to the robust WikipediaService, which validates candidate
    articles against the company's website domain and city when provided.

    Args:
        company_name: Name of the company to look up.
        website: Optional company website URL used for validation.
        city: Optional CRM city used for validation.

    Returns:
        The article URL, or "k.A." ("keine Angabe") when no API key is
        configured or no matching article is found.

    Note:
        Exceptions are deliberately NOT caught here so that the
        @retry_on_failure decorator on this method can retry transient
        failures.
    """
    if not self.api_key:
        # Without a SERP_API_KEY the discovery pipeline is disabled.
        return "k.A."
    # Pass all available info for robust search and validation.
    page = self.wiki_service.search_company_article(
        company_name=company_name,
        website=website,
        crm_city=city,
    )
    if page:
        return page.url
    return "k.A."
def extract_wikipedia_data(self, url: str) -> dict:
    """
    Extract structured company data from a Wikipedia article URL.

    Delegates to the WikipediaService; any failure is logged with a full
    traceback and reported back to the caller instead of raising, so a
    single bad article does not abort a batch run.

    Args:
        url: URL of the Wikipedia article to extract data from.

    Returns:
        The extracted data dict on success, or ``{"url": ..., "error": ...}``
        describing the failure.
    """
    try:
        return self.wiki_service.extract_company_data(url)
    except Exception as e:
        # exc_info=True preserves the traceback for debugging extraction bugs.
        logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True)
        return {"url": url, "error": str(e)}
def _is_credible_url(self, url: str) -> bool:
"""Filters out social media, directories, and junk."""
"""
Filters out social media, directories, and junk.
"""
if not url: return False
try:
domain = urlparse(url).netloc.lower().replace("www.", "")
if domain in BLACKLIST_DOMAINS:
return False
# Check for subdomains of blacklist (e.g. de.linkedin.com)
for bad in BLACKLIST_DOMAINS:
if domain.endswith("." + bad):
return False
return True
except:
return False
return False