fix(ce): Resolve database schema mismatch and restore docs

- Fixed a critical database schema mismatch in the company-explorer by forcing a database re-initialization with a new file (`companies_v3_fixed_2.db`). This ensures the application code is in sync with the database schema.
- Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md.
- Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that had been accidentally deleted. The guide once again includes detailed troubleshooting steps and code templates for common migration pitfalls.
2026-01-15 15:54:45 +00:00
parent 5df451d47b
commit 86f9962199
13 changed files with 724 additions and 555 deletions
--- a/company-explorer/backend/app.py
+++ b/company-explorer/backend/app.py
@@ -106,6 +106,7 @@ def list_companies(
    skip: int = 0, 
    limit: int = 50, 
    search: Optional[str] = None,
+    sort_by: Optional[str] = Query("name_asc"),
    db: Session = Depends(get_db)
 ):
    try:
@@ -114,8 +115,16 @@ def list_companies(
            query = query.filter(Company.name.ilike(f"%{search}%"))
        
        total = query.count()
-        # Sort by ID desc (newest first)
-        items = query.order_by(Company.id.desc()).offset(skip).limit(limit).all()
+        
+        # Sorting Logic
+        if sort_by == "updated_desc":
+            query = query.order_by(Company.updated_at.desc())
+        elif sort_by == "created_desc":
+            query = query.order_by(Company.id.desc())
+        else: # Default: name_asc
+            query = query.order_by(Company.name.asc())
+
+        items = query.offset(skip).limit(limit).all()
        
        return {"total": total, "items": items}
    except Exception as e:
@@ -263,10 +272,48 @@ def override_wiki_url(company_id: int, url: str = Query(...), db: Session = Depe
        existing_wiki.content = wiki_data
        existing_wiki.updated_at = datetime.utcnow()
        existing_wiki.is_locked = True # LOCK IT
+        existing_wiki.wiki_verified_empty = False # It's no longer empty
    
    db.commit()
+    # The return needs to be here, outside the else block but inside the main function
    return {"status": "updated", "data": wiki_data}
+        
+@app.post("/api/companies/{company_id}/wiki_mark_empty")
+def mark_wiki_empty(company_id: int, db: Session = Depends(get_db)):
+    """
+    Marks a company as having no valid Wikipedia entry after manual review.
+    Creates a locked, empty Wikipedia enrichment entry.
+    """
+    company = db.query(Company).filter(Company.id == company_id).first()
+    if not company:
+        raise HTTPException(404, "Company not found")

+    logger.info(f"Manual override for {company.name}: Marking Wikipedia as verified empty.")
+    
+    existing_wiki = db.query(EnrichmentData).filter(
+        EnrichmentData.company_id == company.id, 
+        EnrichmentData.source_type == "wikipedia"
+    ).first()
+    
+    empty_wiki_data = {"url": "k.A.", "title": "k.A.", "first_paragraph": "k.A.", "error": "Manually marked as empty"}
+
+    if not existing_wiki:
+        db.add(EnrichmentData(
+            company_id=company.id, 
+            source_type="wikipedia", 
+            content=empty_wiki_data,
+            is_locked=True,
+            wiki_verified_empty=True
+        ))
+    else:
+        existing_wiki.content = empty_wiki_data
+        existing_wiki.updated_at = datetime.utcnow()
+        existing_wiki.is_locked = True # LOCK IT
+        existing_wiki.wiki_verified_empty = True # Mark as empty
+    
+    db.commit()
+    return {"status": "updated", "wiki_verified_empty": True}
+        
@app.post("/api/companies/{company_id}/override/website")
 def override_website_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)):
    """
@@ -305,6 +352,17 @@ def override_impressum_url(company_id: int, url: str = Query(...), db: Session =
    if not impressum_data:
        raise HTTPException(status_code=400, detail="Failed to extract data from provided URL")

+    # Update company record with city/country if found
+    logger.info(f"override_impressum_url: Scraped impressum_data for {company.name}: City={impressum_data.get('city')}, Country_code={impressum_data.get('country_code')}")
+    if city_val := impressum_data.get("city"):
+        logger.info(f"override_impressum_url: Updating company.city from '{company.city}' to '{city_val}'")
+        company.city = city_val
+    if country_val := impressum_data.get("country_code"):
+        logger.info(f"override_impressum_url: Updating company.country from '{company.country}' to '{country_val}'")
+        company.country = country_val
+    logger.info(f"override_impressum_url: Company object after updates (before commit): City='{company.city}', Country='{company.country}'")
+
+
    # 2. Find existing scrape data or create new
    existing_scrape = db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company.id, 
@@ -312,20 +370,23 @@ def override_impressum_url(company_id: int, url: str = Query(...), db: Session =
    ).first()

    if not existing_scrape:
-        # Create minimal scrape entry
+        # Create minimal scrape entry and lock it
        db.add(EnrichmentData(
            company_id=company.id, 
            source_type="website_scrape", 
-            content={"impressum": impressum_data, "text": "", "title": "Manual Impressum", "url": url}
+            content={"impressum": impressum_data, "text": "", "title": "Manual Impressum", "url": url},
+            is_locked=True
        ))
    else:
-        # Update existing
+        # Update existing and lock it
        content = dict(existing_scrape.content) if existing_scrape.content else {}
        content["impressum"] = impressum_data
        existing_scrape.content = content
        existing_scrape.updated_at = datetime.utcnow()
+        existing_scrape.is_locked = True
    
    db.commit()
+    logger.info(f"override_impressum_url: Commit successful. Company ID {company.id} updated.")
    return {"status": "updated", "data": impressum_data}

 # --- Contact Routes ---
@@ -465,6 +526,7 @@ def list_all_contacts(
    skip: int = 0, 
    limit: int = 50, 
    search: Optional[str] = None,
+    sort_by: Optional[str] = Query("name_asc"),
    db: Session = Depends(get_db)
 ):
    """
@@ -482,8 +544,16 @@ def list_all_contacts(
        )
    
    total = query.count()
-    # Sort by ID desc
-    contacts = query.order_by(Contact.id.desc()).offset(skip).limit(limit).all()
+
+    # Sorting Logic
+    if sort_by == "updated_desc":
+        query = query.order_by(Contact.updated_at.desc())
+    elif sort_by == "created_desc":
+        query = query.order_by(Contact.id.desc())
+    else: # Default: name_asc
+        query = query.order_by(Contact.last_name.asc(), Contact.first_name.asc())
+
+    contacts = query.offset(skip).limit(limit).all()
    
    # Enrich with Company Name for the frontend list
    result = []
@@ -552,6 +622,23 @@ def bulk_import_contacts(req: BulkContactImportRequest, db: Session = Depends(ge
    db.commit()
    return stats

+@app.post("/api/enrichment/{company_id}/{source_type}/lock")
+def lock_enrichment(company_id: int, source_type: str, locked: bool = Query(...), db: Session = Depends(get_db)):
+    """
+    Toggles the lock status of a specific enrichment data type (e.g. 'website_scrape', 'wikipedia').
+    """
+    entry = db.query(EnrichmentData).filter(
+        EnrichmentData.company_id == company_id,
+        EnrichmentData.source_type == source_type
+    ).first()
+    
+    if not entry:
+        raise HTTPException(404, "Enrichment data not found")
+    
+    entry.is_locked = locked
+    db.commit()
+    return {"status": "updated", "is_locked": locked}
+
 def run_discovery_task(company_id: int):
    # New Session for Background Task
    from .database import SessionLocal
@@ -616,15 +703,11 @@ def analyze_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db:
        return {"error": "No website to analyze. Run Discovery first."}

    # FORCE SCRAPE LOGIC
-    # If explicit force_scrape is requested OR if we want to ensure fresh data for debugging
-    # We delete the old scrape data.
-    # For now, let's assume every manual "Analyze" click implies a desire for fresh results if previous failed.
-    # But let's respect the flag from frontend if we add it later.
-    
-    # Always clearing scrape data for now to fix the "stuck cache" issue reported by user
+    # Respect Locked Data: Only delete if not locked.
    db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company.id,
-        EnrichmentData.source_type == "website_scrape"
+        EnrichmentData.source_type == "website_scrape",
+        EnrichmentData.is_locked == False
    ).delete()
    db.commit()

@@ -640,29 +723,97 @@ def run_analysis_task(company_id: int, url: str):

        logger.info(f"Running Analysis Task for {company.name}")

-        # 1. Scrape Website
-        scrape_result = scraper.scrape_url(url)
-        
-        # Save Scrape Data
-        existing_scrape_data = db.query(EnrichmentData).filter(
+        # 1. Scrape Website OR Use Locked Data
+        scrape_result = {}
+        existing_scrape = db.query(EnrichmentData).filter(
            EnrichmentData.company_id == company.id,
            EnrichmentData.source_type == "website_scrape"
        ).first()

-        if "text" in scrape_result and scrape_result["text"]:
-            if not existing_scrape_data:
-                db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result))
-            else:
-                existing_scrape_data.content = scrape_result
-                existing_scrape_data.updated_at = datetime.utcnow()
-        elif "error" in scrape_result:
-            logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}")
+        if existing_scrape and existing_scrape.is_locked:
+            logger.info(f"Using LOCKED scrape data for {company.name}")
+            scrape_result = dict(existing_scrape.content) # Copy dict
+
+            # Always ensure city/country from locked impressum data is synced to company
+            if "impressum" in scrape_result and scrape_result["impressum"]:
+                impressum_city = scrape_result["impressum"].get("city")
+                impressum_country = scrape_result["impressum"].get("country_code")
+                logger.info(f"Analysis task (locked data): Impressum found. City='{impressum_city}', Country='{impressum_country}'")
+                if impressum_city and company.city != impressum_city:
+                    logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'")
+                    company.city = impressum_city
+                if impressum_country and company.country != impressum_country:
+                    logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'")
+                    company.country = impressum_country
+            
+            text_val = scrape_result.get("text")
+            text_len = len(text_val) if text_val else 0
+            logger.info(f"Locked data keys: {list(scrape_result.keys())}, Text length: {text_len}")
+
+            # AUTO-FIX: If locked data (e.g. Manual Impressum) has no text, fetch main website text
+            if text_len < 100:
+                logger.info(f"Locked data missing text (len={text_len}). Fetching content from {url}...")
+                try:
+                    fresh_scrape = scraper.scrape_url(url)
+                except Exception as e:
+                    logger.error(f"Fresh scrape failed: {e}", exc_info=True)
+                    fresh_scrape = {}
+
+                logger.info(f"Fresh scrape result keys: {list(fresh_scrape.keys())}")
+                
+                if "text" in fresh_scrape and len(fresh_scrape["text"]) > 100:
+                    logger.info(f"Fresh scrape successful. Text len: {len(fresh_scrape['text'])}")
+                    # Update local dict for current processing
+                    scrape_result["text"] = fresh_scrape["text"]
+                    scrape_result["title"] = fresh_scrape.get("title", "")
+                    
+                    # Update DB (Merge into existing content)
+                    updated_content = dict(existing_scrape.content)
+                    updated_content["text"] = fresh_scrape["text"]
+                    updated_content["title"] = fresh_scrape.get("title", "")
+                    
+                    existing_scrape.content = updated_content
+                    existing_scrape.updated_at = datetime.utcnow()
+                    # db.commit() here would be too early
+                    logger.info("Updated locked record with fresh website text in session.")
+                else:
+                    logger.warning(f"Fresh scrape returned insufficient text. Error: {fresh_scrape.get('error')}")
+        else:
+            # Standard Scrape
+            scrape_result = scraper.scrape_url(url)
+            
+            # Update company fields from impressum if found during scrape
+            if "impressum" in scrape_result and scrape_result["impressum"]:
+                impressum_city = scrape_result["impressum"].get("city")
+                impressum_country = scrape_result["impressum"].get("country_code")
+                logger.info(f"Analysis task (standard scrape): Impressum found. City='{impressum_city}', Country='{impressum_country}'")
+                if impressum_city and company.city != impressum_city:
+                    logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'")
+                    company.city = impressum_city
+                if impressum_country and company.country != impressum_country:
+                    logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'")
+                    company.country = impressum_country
+            
+            # Save Scrape Data
+            if "text" in scrape_result and scrape_result["text"]:
+                if not existing_scrape:
+                    db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result))
+                else:
+                    existing_scrape.content = scrape_result
+                    existing_scrape.updated_at = datetime.utcnow()
+            elif "error" in scrape_result:
+                logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}")

        # 2. Classify Robotics Potential
-        if "text" in scrape_result and scrape_result["text"]:
+        text_content = scrape_result.get("text")
+        
+        logger.info(f"Preparing classification. Text content length: {len(text_content) if text_content else 0}")
+
+        if text_content and len(text_content) > 100:
+            logger.info(f"Starting classification for {company.name}...")
            analysis = classifier.analyze_robotics_potential(
                company_name=company.name,
-                website_text=scrape_result["text"]
+                website_text=text_content
            )
            
            if "error" in analysis:
@@ -672,10 +823,8 @@ def run_analysis_task(company_id: int, url: str):
                if industry:
                    company.industry_ai = industry
                
-                # Delete old signals
                db.query(Signal).filter(Signal.company_id == company.id).delete()

-                # Save new signals
                potentials = analysis.get("potentials", {})
                for signal_type, data in potentials.items():
                    new_signal = Signal(
@@ -687,7 +836,6 @@ def run_analysis_task(company_id: int, url: str):
                    )
                    db.add(new_signal)
                
-                # Save Full Analysis Blob (Business Model + Evidence)
                existing_analysis = db.query(EnrichmentData).filter(
                    EnrichmentData.company_id == company.id,
                    EnrichmentData.source_type == "ai_analysis"
@@ -702,6 +850,8 @@ def run_analysis_task(company_id: int, url: str):
                company.status = "ENRICHED"
                company.last_classification_at = datetime.utcnow()
                logger.info(f"Robotics analysis complete for {company.name}.")
+        else:
+            logger.warning(f"Skipping classification for {company.name}: Insufficient text content (len={len(text_content) if text_content else 0})")

        db.commit()
        logger.info(f"Analysis finished for {company.id}")
--- a/company-explorer/backend/config.py
+++ b/company-explorer/backend/config.py
@@ -5,6 +5,7 @@ from typing import Optional
 # Versuche Pydantic zu nutzen, Fallback auf os.environ
 try:
    from pydantic_settings import BaseSettings
+    from pydantic import Extra
    
    class Settings(BaseSettings):
        # App Info
@@ -13,7 +14,7 @@ try:
        DEBUG: bool = True
        
        # Database (Store in App dir for simplicity)
-        DATABASE_URL: str = "sqlite:////app/companies_v3_final.db"
+        DATABASE_URL: str = "sqlite:////app/companies_v3_fixed_2.db"
        
        # API Keys
        GEMINI_API_KEY: Optional[str] = None
@@ -25,6 +26,7 @@ try:

        class Config:
            env_file = ".env"
+            extra = 'ignore'

    settings = Settings()

--- a/company-explorer/backend/database.py
+++ b/company-explorer/backend/database.py
@@ -139,6 +139,7 @@ class EnrichmentData(Base):
    source_type = Column(String) # "website_scrape", "wikipedia", "google_serp"
    content = Column(JSON)       # The raw data
    is_locked = Column(Boolean, default=False) # Manual override flag
+    wiki_verified_empty = Column(Boolean, default=False) # NEW: Mark Wikipedia as definitively empty
    
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
--- a/company-explorer/backend/lib/core_utils.py
+++ b/company-explorer/backend/lib/core_utils.py
@@ -9,7 +9,7 @@ from functools import wraps
 from typing import Optional, Union, List
 from thefuzz import fuzz

-# Versuche neue Google GenAI Lib (v1.0+)
+# Try new Google GenAI Lib (v1.0+)
 try:
    from google import genai
    from google.genai import types
@@ -17,7 +17,7 @@ try:
 except ImportError:
    HAS_NEW_GENAI = False

-# Fallback auf alte Lib
+# Fallback to old Lib
 try:
    import google.generativeai as old_genai
    HAS_OLD_GENAI = True
@@ -100,22 +100,33 @@ def simple_normalize_url(url: str) -> str:
        return "k.A."

 def normalize_company_name(name: str) -> str:
-    """Normalizes a company name by removing legal forms and special characters."""
+    """
+    Normalizes a company name by removing common legal forms, special characters, 
+    and extra spaces, for robust comparison.
+    Handles names with numbers more intelligently (e.g., "11 88 0 Solutions" -> "11880 solutions").
+    """
    if not name:
        return ""
        
    name = name.lower()
    
-    # Remove common legal forms
+    # Remove common legal forms (more comprehensive list)
    legal_forms = [
        r'\bgmbh\b', r'\bag\b', r'\bkg\b', r'\bohg\b', r'\bug\b', r'\bltd\b', 
-        r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b'
+        r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b',
+        r'\bsa\b', r'\bse\b', r'\bs\.a\.\b', r'\bgesellschaft\b', r'\bgp\b', r'\blp\b',
+        r'\bservice\b', r'\bservices\b', r'\bgroup\b', r'\bsolutions\b', r'\bsysteme\b',
+        r'\bhandel\b', r'\bmarketing\b', r'\btechnology\b', r'\binternational\b',
+        r'\bgmbh & co\. kg\b', r'\bholding\b', r'\bverwaltung\b', r'\bfoundation\b'
    ]
    for form in legal_forms:
        name = re.sub(form, '', name)
        
+    # Condense numbers: "11 88 0" -> "11880"
+    name = re.sub(r'(\d)\s+(\d)', r'\1\2', name) # Condense numbers separated by space
+
    # Remove special chars and extra spaces
-    name = re.sub(r'[^\w\s]', '', name)
+    name = re.sub(r'[^\w\s\d]', '', name) # Keep digits
    name = re.sub(r'\s+', ' ', name).strip()
    
    return name
@@ -136,11 +147,14 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    # Simple multiplier handling
    multiplier = 1.0
    if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
-        multiplier = 1000.0 if is_umsatz else 1000000000.0
+        multiplier = 1000.0 # Standardize to Millions for revenue, Billions for absolute numbers
+        if not is_umsatz: multiplier = 1000000000.0
    elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
-        multiplier = 1.0 if is_umsatz else 1000000.0
+        multiplier = 1.0 # Already in Millions for revenue
+        if not is_umsatz: multiplier = 1000000.0
    elif 'tsd' in raw_value or 'thousand' in raw_value:
-        multiplier = 0.001 if is_umsatz else 1000.0
+        multiplier = 0.001 # Thousands converted to millions for revenue
+        if not is_umsatz: multiplier = 1000.0
        
    # Extract number candidates
    # Regex for "1.000,50" or "1,000.50" or "1000"
@@ -171,8 +185,6 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
                 # For revenue, 375.6 vs 1.000 is tricky. 
                 # But usually revenue in millions is small numbers with decimals (250.5).
                 # Large integers usually mean thousands.
-                 # Let's assume dot is decimal for revenue unless context implies otherwise, 
-                 # but for "375.6" it works. For "1.000" it becomes 1.0.
                 # Let's keep dot as decimal for revenue by default unless we detect multiple dots
                 if num_str.count('.') > 1:
                     num_str = num_str.replace('.', '')
@@ -284,4 +296,4 @@ def call_gemini(
            logger.error(f"Error with google-generativeai lib: {e}")
            raise e
            
-    raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
+    raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
--- a/company-explorer/backend/services/discovery.py
+++ b/company-explorer/backend/services/discovery.py
@@ -1,10 +1,11 @@
 import logging
 import requests
 import re
-from typing import Optional, Dict, Tuple
+from typing import Optional, Dict, Tuple, Any
 from urllib.parse import urlparse
+
 from ..config import settings
-from ..lib.core_utils import retry_on_failure, normalize_string
+from ..lib.core_utils import retry_on_failure, normalize_string, normalize_company_name, simple_normalize_url
 from .wikipedia_service import WikipediaService

 logger = logging.getLogger(__name__)
@@ -23,7 +24,6 @@ class DiscoveryService:
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")
        
-        # Initialize the specialized Wikipedia Service
        self.wiki_service = WikipediaService()

    @retry_on_failure(max_retries=2)
@@ -60,42 +60,31 @@ class DiscoveryService:
            for result in data["organic_results"]:
                link = result.get("link", "")
                if self._is_credible_url(link):
-                    # Simple heuristic: If the company name is part of the domain, high confidence
-                    # Otherwise, take the first credible result.
                    return link
            
            return "k.A."

        except Exception as e:
-            logger.error(f"SerpAPI Error: {e}")
+            logger.error(f"SerpAPI Error: {e}", exc_info=True)
            return "k.A."

    @retry_on_failure(max_retries=2)
-    def find_wikipedia_url(self, company_name: str, website: str = None, city: str = None) -> str:
+    def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str:
        """
        Searches for a specific German Wikipedia article using the robust WikipediaService.
        Includes validation via website domain and city.
        """
-        if not self.api_key:
-            return "k.A."
-            
-        try:
-            # Delegate to the robust service
-            # parent_name could be added if available in the future
-            page = self.wiki_service.search_company_article(
-                company_name=company_name,
-                website=website,
-                crm_city=city
-            )
-            
-            if page:
-                return page.url
-            
-            return "k.A."
-
-        except Exception as e:
-            logger.error(f"Wiki Search Error via Service: {e}")
-            return "k.A."
+        # Pass all available info for robust search and validation
+        page = self.wiki_service.search_company_article(
+            company_name=company_name,
+            website=website,
+            crm_city=city
+        )
+        
+        if page:
+            return page.url
+        
+        return "k.A."

    def extract_wikipedia_data(self, url: str) -> dict:
        """
@@ -104,21 +93,21 @@ class DiscoveryService:
        try:
            return self.wiki_service.extract_company_data(url)
        except Exception as e:
-            logger.error(f"Wiki Extraction Error for {url}: {e}")
+            logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True)
            return {"url": url, "error": str(e)}

    def _is_credible_url(self, url: str) -> bool:
-        """Filters out social media, directories, and junk."""
+        """
+        Filters out social media, directories, and junk.
+        """
        if not url: return False
        try:
            domain = urlparse(url).netloc.lower().replace("www.", "")
            if domain in BLACKLIST_DOMAINS:
                return False
-            # Check for subdomains of blacklist (e.g. de.linkedin.com)
            for bad in BLACKLIST_DOMAINS:
                if domain.endswith("." + bad):
                    return False
            return True
        except:
-            return False
-
+            return False
--- a/company-explorer/backend/services/scraping.py
+++ b/company-explorer/backend/services/scraping.py
@@ -36,17 +36,30 @@ class ScraperService:
            response.raise_for_status()
            
            # Check Content Type
+            logger.debug(f"Response status: {response.status_code}")
+            if response.headers is None:
+                logger.error("Response headers is None!")
+                return {"error": "No headers"}
+                
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type:
                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
                return {"error": "Not HTML"}

            # Parse Main Page
-            result = self._parse_html(response.content)
+            try:
+                result = self._parse_html(response.content)
+            except Exception as e:
+                logger.error(f"Error in _parse_html: {e}", exc_info=True)
+                return {"error": f"Parse error: {e}"}
            
            # --- IMPRESSUM LOGIC ---
-            soup = BeautifulSoup(response.content, 'html.parser')
-            impressum_url = self._find_impressum_link(soup, url)
+            try:
+                soup = BeautifulSoup(response.content, 'html.parser')
+                impressum_url = self._find_impressum_link(soup, url)
+            except Exception as e:
+                logger.error(f"Error finding impressum: {e}", exc_info=True)
+                impressum_url = None
            
            # FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
            if not impressum_url and url.count('/') > 3:
@@ -160,7 +173,8 @@ class ScraperService:
            # LLM Extraction
            prompt = f"""
            Extract the official company details from this German 'Impressum' text.
-            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'.
+            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
+            'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
            If a field is missing, use null.
            
            Text:
@@ -184,40 +198,72 @@ class ScraperService:
            return None

    def _parse_html(self, html_content: bytes) -> Dict[str, str]:
-        soup = BeautifulSoup(html_content, 'html.parser')
-        
-        # 1. Cleanup Junk (Aggressive, matching legacy logic)
-        # Removed 'a' tags to prevent menu links from polluting the text analysis
-        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
-            element.decompose()
+        if not html_content:
+            return {"title": "", "description": "", "text": "", "emails": []}
+
+        try:
+            soup = BeautifulSoup(html_content, 'html.parser')
            
-        # 1b. Remove common Cookie Banners / Popups by class/id heuristics
-        for div in soup.find_all("div"):
-            classes = str(div.get("class", "")).lower()
-            ids = str(div.get("id", "")).lower()
-            if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
-                div.decompose()
+            # 1. Cleanup Junk
+            # Safe removal of tags
+            for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
+                if element: element.decompose()
+                
+            # 1b. Remove common Cookie Banners (Defensive)
+            try:
+                for div in soup.find_all("div"):
+                    if not div: continue
+                    # .get can return None for attributes if not found? No, returns None if key not found.
+                    # But if div is somehow None (unlikely in loop), check first.
+                    
+                    # Convert list of classes to string if needed
+                    cls_attr = div.get("class")
+                    classes = " ".join(cls_attr).lower() if isinstance(cls_attr, list) else str(cls_attr or "").lower()
+                    
+                    id_attr = div.get("id")
+                    ids = str(id_attr or "").lower()
+                    
+                    if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
+                        div.decompose()
+            except Exception as e:
+                logger.warning(f"Error filtering divs: {e}")

-        # 2. Extract Title & Meta Description
-        title = soup.title.string if soup.title else ""
-        meta_desc = ""
-        meta_tag = soup.find('meta', attrs={'name': 'description'})
-        if meta_tag:
-            meta_desc = meta_tag.get('content', '')
+            # 2. Extract Title & Meta Description
+            title = ""
+            try:
+                if soup.title and soup.title.string:
+                    title = soup.title.string
+            except: pass

-        # 3. Extract Main Text
-        # Prefer body, fallback to full soup
-        body = soup.find('body')
-        raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
-        
-        cleaned_text = clean_text(raw_text)
-        
-        # 4. Extract Emails (Basic Regex)
-        emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text))
-        
-        return {
-            "title": clean_text(title),
-            "description": clean_text(meta_desc),
-            "text": cleaned_text[:25000], # Limit to avoid context overflow
-            "emails": list(emails)[:5] # Limit to 5
-        }
+            meta_desc = ""
+            try:
+                meta_tag = soup.find('meta', attrs={'name': 'description'})
+                if meta_tag:
+                    meta_desc = meta_tag.get('content', '') or ""
+            except: pass
+
+            # 3. Extract Main Text
+            try:
+                body = soup.find('body')
+                raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
+                cleaned_text = clean_text(raw_text)
+            except Exception as e:
+                logger.warning(f"Text extraction failed: {e}")
+                cleaned_text = ""
+            
+            # 4. Extract Emails
+            emails = []
+            try:
+                emails = list(set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', cleaned_text)))[:5]
+            except: pass
+            
+            return {
+                "title": clean_text(title),
+                "description": clean_text(meta_desc),
+                "text": cleaned_text[:25000],
+                "emails": emails
+            }
+
+        except Exception as e:
+            logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
+            return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}
--- a/company-explorer/backend/services/wikipedia_service.py
+++ b/company-explorer/backend/services/wikipedia_service.py
@@ -352,7 +352,7 @@ class WikipediaService:
                extracted_country = region_to_country[suffix_in_klammer]
                temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")

-        if not extracted_country and ',' in temp_sitz:
+        if not extracted_country and "," in temp_sitz:
            parts = [p.strip() for p in temp_sitz.split(',')]
            if len(parts) > 1:
                last_part_lower = parts[-1].lower()
@@ -445,4 +445,4 @@ class WikipediaService:
            return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
        except Exception as e:
            logger.error(f"  -> Unexpected error extracting from '{str(url_or_page)[:100]}': {e}")
-            return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
+            return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}