fix(ce): Resolve database schema mismatch and restore docs
- Fixed a critical database schema mismatch in the company-explorer by forcing a database re-initialization with a new database file. This ensures the application code is in sync with the database schema. - Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md. - Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that was accidentally deleted. The guide now again includes detailed troubleshooting steps and code templates for common migration pitfalls.
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
import logging
|
||||
import requests
|
||||
import re
|
||||
from typing import Optional, Dict, Tuple
|
||||
from typing import Optional, Dict, Tuple, Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ..config import settings
|
||||
from ..lib.core_utils import retry_on_failure, normalize_string
|
||||
from ..lib.core_utils import retry_on_failure, normalize_string, normalize_company_name, simple_normalize_url
|
||||
from .wikipedia_service import WikipediaService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -23,7 +24,6 @@ class DiscoveryService:
|
||||
if not self.api_key:
|
||||
logger.warning("SERP_API_KEY not set. Discovery features will fail.")
|
||||
|
||||
# Initialize the specialized Wikipedia Service
|
||||
self.wiki_service = WikipediaService()
|
||||
|
||||
@retry_on_failure(max_retries=2)
|
||||
@@ -60,42 +60,31 @@ class DiscoveryService:
|
||||
for result in data["organic_results"]:
|
||||
link = result.get("link", "")
|
||||
if self._is_credible_url(link):
|
||||
# Simple heuristic: If the company name is part of the domain, high confidence
|
||||
# Otherwise, take the first credible result.
|
||||
return link
|
||||
|
||||
return "k.A."
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"SerpAPI Error: {e}")
|
||||
logger.error(f"SerpAPI Error: {e}", exc_info=True)
|
||||
return "k.A."
|
||||
|
||||
@retry_on_failure(max_retries=2)
|
||||
def find_wikipedia_url(self, company_name: str, website: str = None, city: str = None) -> str:
|
||||
def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str:
|
||||
"""
|
||||
Searches for a specific German Wikipedia article using the robust WikipediaService.
|
||||
Includes validation via website domain and city.
|
||||
"""
|
||||
if not self.api_key:
|
||||
return "k.A."
|
||||
|
||||
try:
|
||||
# Delegate to the robust service
|
||||
# parent_name could be added if available in the future
|
||||
page = self.wiki_service.search_company_article(
|
||||
company_name=company_name,
|
||||
website=website,
|
||||
crm_city=city
|
||||
)
|
||||
|
||||
if page:
|
||||
return page.url
|
||||
|
||||
return "k.A."
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Wiki Search Error via Service: {e}")
|
||||
return "k.A."
|
||||
# Pass all available info for robust search and validation
|
||||
page = self.wiki_service.search_company_article(
|
||||
company_name=company_name,
|
||||
website=website,
|
||||
crm_city=city
|
||||
)
|
||||
|
||||
if page:
|
||||
return page.url
|
||||
|
||||
return "k.A."
|
||||
|
||||
def extract_wikipedia_data(self, url: str) -> dict:
|
||||
"""
|
||||
@@ -104,21 +93,21 @@ class DiscoveryService:
|
||||
try:
|
||||
return self.wiki_service.extract_company_data(url)
|
||||
except Exception as e:
|
||||
logger.error(f"Wiki Extraction Error for {url}: {e}")
|
||||
logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True)
|
||||
return {"url": url, "error": str(e)}
|
||||
|
||||
def _is_credible_url(self, url: str) -> bool:
|
||||
"""Filters out social media, directories, and junk."""
|
||||
"""
|
||||
Filters out social media, directories, and junk.
|
||||
"""
|
||||
if not url: return False
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower().replace("www.", "")
|
||||
if domain in BLACKLIST_DOMAINS:
|
||||
return False
|
||||
# Check for subdomains of blacklist (e.g. de.linkedin.com)
|
||||
for bad in BLACKLIST_DOMAINS:
|
||||
if domain.endswith("." + bad):
|
||||
return False
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
return False
|
||||
@@ -36,17 +36,30 @@ class ScraperService:
|
||||
response.raise_for_status()
|
||||
|
||||
# Check Content Type
|
||||
logger.debug(f"Response status: {response.status_code}")
|
||||
if response.headers is None:
|
||||
logger.error("Response headers is None!")
|
||||
return {"error": "No headers"}
|
||||
|
||||
content_type = response.headers.get('Content-Type', '').lower()
|
||||
if 'text/html' not in content_type:
|
||||
logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
|
||||
return {"error": "Not HTML"}
|
||||
|
||||
# Parse Main Page
|
||||
result = self._parse_html(response.content)
|
||||
try:
|
||||
result = self._parse_html(response.content)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in _parse_html: {e}", exc_info=True)
|
||||
return {"error": f"Parse error: {e}"}
|
||||
|
||||
# --- IMPRESSUM LOGIC ---
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
impressum_url = self._find_impressum_link(soup, url)
|
||||
try:
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
impressum_url = self._find_impressum_link(soup, url)
|
||||
except Exception as e:
|
||||
logger.error(f"Error finding impressum: {e}", exc_info=True)
|
||||
impressum_url = None
|
||||
|
||||
# FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
|
||||
if not impressum_url and url.count('/') > 3:
|
||||
@@ -160,7 +173,8 @@ class ScraperService:
|
||||
# LLM Extraction
|
||||
prompt = f"""
|
||||
Extract the official company details from this German 'Impressum' text.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
|
||||
If a field is missing, use null.
|
||||
|
||||
Text:
|
||||
@@ -184,40 +198,72 @@ class ScraperService:
|
||||
return None
|
||||
|
||||
def _parse_html(self, html_content: bytes) -> Dict[str, str]:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# 1. Cleanup Junk (Aggressive, matching legacy logic)
|
||||
# Removed 'a' tags to prevent menu links from polluting the text analysis
|
||||
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
|
||||
element.decompose()
|
||||
if not html_content:
|
||||
return {"title": "", "description": "", "text": "", "emails": []}
|
||||
|
||||
try:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# 1b. Remove common Cookie Banners / Popups by class/id heuristics
|
||||
for div in soup.find_all("div"):
|
||||
classes = str(div.get("class", "")).lower()
|
||||
ids = str(div.get("id", "")).lower()
|
||||
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
|
||||
div.decompose()
|
||||
# 1. Cleanup Junk
|
||||
# Safe removal of tags
|
||||
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
|
||||
if element: element.decompose()
|
||||
|
||||
# 1b. Remove common Cookie Banners (Defensive)
|
||||
try:
|
||||
for div in soup.find_all("div"):
|
||||
if not div: continue
|
||||
# .get can return None for attributes if not found? No, returns None if key not found.
|
||||
# But if div is somehow None (unlikely in loop), check first.
|
||||
|
||||
# Convert list of classes to string if needed
|
||||
cls_attr = div.get("class")
|
||||
classes = " ".join(cls_attr).lower() if isinstance(cls_attr, list) else str(cls_attr or "").lower()
|
||||
|
||||
id_attr = div.get("id")
|
||||
ids = str(id_attr or "").lower()
|
||||
|
||||
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
|
||||
div.decompose()
|
||||
except Exception as e:
|
||||
logger.warning(f"Error filtering divs: {e}")
|
||||
|
||||
# 2. Extract Title & Meta Description
|
||||
title = soup.title.string if soup.title else ""
|
||||
meta_desc = ""
|
||||
meta_tag = soup.find('meta', attrs={'name': 'description'})
|
||||
if meta_tag:
|
||||
meta_desc = meta_tag.get('content', '')
|
||||
# 2. Extract Title & Meta Description
|
||||
title = ""
|
||||
try:
|
||||
if soup.title and soup.title.string:
|
||||
title = soup.title.string
|
||||
except: pass
|
||||
|
||||
# 3. Extract Main Text
|
||||
# Prefer body, fallback to full soup
|
||||
body = soup.find('body')
|
||||
raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
|
||||
|
||||
cleaned_text = clean_text(raw_text)
|
||||
|
||||
# 4. Extract Emails (Basic Regex)
|
||||
emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text))
|
||||
|
||||
return {
|
||||
"title": clean_text(title),
|
||||
"description": clean_text(meta_desc),
|
||||
"text": cleaned_text[:25000], # Limit to avoid context overflow
|
||||
"emails": list(emails)[:5] # Limit to 5
|
||||
}
|
||||
meta_desc = ""
|
||||
try:
|
||||
meta_tag = soup.find('meta', attrs={'name': 'description'})
|
||||
if meta_tag:
|
||||
meta_desc = meta_tag.get('content', '') or ""
|
||||
except: pass
|
||||
|
||||
# 3. Extract Main Text
|
||||
try:
|
||||
body = soup.find('body')
|
||||
raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
|
||||
cleaned_text = clean_text(raw_text)
|
||||
except Exception as e:
|
||||
logger.warning(f"Text extraction failed: {e}")
|
||||
cleaned_text = ""
|
||||
|
||||
# 4. Extract Emails
|
||||
emails = []
|
||||
try:
|
||||
emails = list(set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', cleaned_text)))[:5]
|
||||
except: pass
|
||||
|
||||
return {
|
||||
"title": clean_text(title),
|
||||
"description": clean_text(meta_desc),
|
||||
"text": cleaned_text[:25000],
|
||||
"emails": emails
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
|
||||
return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}
|
||||
|
||||
@@ -352,7 +352,7 @@ class WikipediaService:
|
||||
extracted_country = region_to_country[suffix_in_klammer]
|
||||
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
||||
|
||||
if not extracted_country and ',' in temp_sitz:
|
||||
if not extracted_country and "," in temp_sitz:
|
||||
parts = [p.strip() for p in temp_sitz.split(',')]
|
||||
if len(parts) > 1:
|
||||
last_part_lower = parts[-1].lower()
|
||||
@@ -445,4 +445,4 @@ class WikipediaService:
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
except Exception as e:
|
||||
logger.error(f" -> Unexpected error extracting from '{str(url_or_page)[:100]}': {e}")
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
Reference in New Issue
Block a user