fix(ce): Resolve database schema mismatch and restore docs

- Fixed a critical database schema mismatch in the company-explorer by forcing a database re-initialization with a new database file. This ensures the application code is in sync with the database schema.
- Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md.
- Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that had been accidentally deleted. The guide once again includes detailed troubleshooting steps and code templates for common migration pitfalls.
2026-01-15 15:54:45 +00:00
parent 4fcbbe3723
commit 4a336f6374
13 changed files with 724 additions and 555 deletions
--- a/company-explorer/backend/app.py
+++ b/company-explorer/backend/app.py
@@ -106,6 +106,7 @@ def list_companies(
    skip: int = 0, 
    limit: int = 50, 
    search: Optional[str] = None,
+    sort_by: Optional[str] = Query("name_asc"),
    db: Session = Depends(get_db)
 ):
    try:
@@ -114,8 +115,16 @@ def list_companies(
            query = query.filter(Company.name.ilike(f"%{search}%"))
        
        total = query.count()
-        # Sort by ID desc (newest first)
-        items = query.order_by(Company.id.desc()).offset(skip).limit(limit).all()
+        
+        # Sorting Logic
+        if sort_by == "updated_desc":
+            query = query.order_by(Company.updated_at.desc())
+        elif sort_by == "created_desc":
+            query = query.order_by(Company.id.desc())
+        else: # Default: name_asc
+            query = query.order_by(Company.name.asc())
+
+        items = query.offset(skip).limit(limit).all()
        
        return {"total": total, "items": items}
    except Exception as e:
@@ -263,10 +272,48 @@ def override_wiki_url(company_id: int, url: str = Query(...), db: Session = Depe
        existing_wiki.content = wiki_data
        existing_wiki.updated_at = datetime.utcnow()
        existing_wiki.is_locked = True # LOCK IT
+        existing_wiki.wiki_verified_empty = False # It's no longer empty
    
    db.commit()
+    # The return needs to be here, outside the else block but inside the main function
    return {"status": "updated", "data": wiki_data}
+        
+@app.post("/api/companies/{company_id}/wiki_mark_empty")
+def mark_wiki_empty(company_id: int, db: Session = Depends(get_db)):
+    """
+    Marks a company as having no valid Wikipedia entry after manual review.
+    Creates a locked, empty Wikipedia enrichment entry.
+    """
+    company = db.query(Company).filter(Company.id == company_id).first()
+    if not company:
+        raise HTTPException(404, "Company not found")

+    logger.info(f"Manual override for {company.name}: Marking Wikipedia as verified empty.")
+    
+    existing_wiki = db.query(EnrichmentData).filter(
+        EnrichmentData.company_id == company.id, 
+        EnrichmentData.source_type == "wikipedia"
+    ).first()
+    
+    empty_wiki_data = {"url": "k.A.", "title": "k.A.", "first_paragraph": "k.A.", "error": "Manually marked as empty"}
+
+    if not existing_wiki:
+        db.add(EnrichmentData(
+            company_id=company.id, 
+            source_type="wikipedia", 
+            content=empty_wiki_data,
+            is_locked=True,
+            wiki_verified_empty=True
+        ))
+    else:
+        existing_wiki.content = empty_wiki_data
+        existing_wiki.updated_at = datetime.utcnow()
+        existing_wiki.is_locked = True # LOCK IT
+        existing_wiki.wiki_verified_empty = True # Mark as empty
+    
+    db.commit()
+    return {"status": "updated", "wiki_verified_empty": True}
+        
@app.post("/api/companies/{company_id}/override/website")
 def override_website_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)):
    """
@@ -305,6 +352,17 @@ def override_impressum_url(company_id: int, url: str = Query(...), db: Session =
    if not impressum_data:
        raise HTTPException(status_code=400, detail="Failed to extract data from provided URL")

+    # Update company record with city/country if found
+    logger.info(f"override_impressum_url: Scraped impressum_data for {company.name}: City={impressum_data.get('city')}, Country_code={impressum_data.get('country_code')}")
+    if city_val := impressum_data.get("city"):
+        logger.info(f"override_impressum_url: Updating company.city from '{company.city}' to '{city_val}'")
+        company.city = city_val
+    if country_val := impressum_data.get("country_code"):
+        logger.info(f"override_impressum_url: Updating company.country from '{company.country}' to '{country_val}'")
+        company.country = country_val
+    logger.info(f"override_impressum_url: Company object after updates (before commit): City='{company.city}', Country='{company.country}'")
+
+
    # 2. Find existing scrape data or create new
    existing_scrape = db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company.id, 
@@ -312,20 +370,23 @@ def override_impressum_url(company_id: int, url: str = Query(...), db: Session =
    ).first()

    if not existing_scrape:
-        # Create minimal scrape entry
+        # Create minimal scrape entry and lock it
        db.add(EnrichmentData(
            company_id=company.id, 
            source_type="website_scrape", 
-            content={"impressum": impressum_data, "text": "", "title": "Manual Impressum", "url": url}
+            content={"impressum": impressum_data, "text": "", "title": "Manual Impressum", "url": url},
+            is_locked=True
        ))
    else:
-        # Update existing
+        # Update existing and lock it
        content = dict(existing_scrape.content) if existing_scrape.content else {}
        content["impressum"] = impressum_data
        existing_scrape.content = content
        existing_scrape.updated_at = datetime.utcnow()
+        existing_scrape.is_locked = True
    
    db.commit()
+    logger.info(f"override_impressum_url: Commit successful. Company ID {company.id} updated.")
    return {"status": "updated", "data": impressum_data}

 # --- Contact Routes ---
@@ -465,6 +526,7 @@ def list_all_contacts(
    skip: int = 0, 
    limit: int = 50, 
    search: Optional[str] = None,
+    sort_by: Optional[str] = Query("name_asc"),
    db: Session = Depends(get_db)
 ):
    """
@@ -482,8 +544,16 @@ def list_all_contacts(
        )
    
    total = query.count()
-    # Sort by ID desc
-    contacts = query.order_by(Contact.id.desc()).offset(skip).limit(limit).all()
+
+    # Sorting Logic
+    if sort_by == "updated_desc":
+        query = query.order_by(Contact.updated_at.desc())
+    elif sort_by == "created_desc":
+        query = query.order_by(Contact.id.desc())
+    else: # Default: name_asc
+        query = query.order_by(Contact.last_name.asc(), Contact.first_name.asc())
+
+    contacts = query.offset(skip).limit(limit).all()
    
    # Enrich with Company Name for the frontend list
    result = []
@@ -552,6 +622,23 @@ def bulk_import_contacts(req: BulkContactImportRequest, db: Session = Depends(ge
    db.commit()
    return stats

+@app.post("/api/enrichment/{company_id}/{source_type}/lock")
+def lock_enrichment(company_id: int, source_type: str, locked: bool = Query(...), db: Session = Depends(get_db)):
+    """
+    Toggles the lock status of a specific enrichment data type (e.g. 'website_scrape', 'wikipedia').
+    """
+    entry = db.query(EnrichmentData).filter(
+        EnrichmentData.company_id == company_id,
+        EnrichmentData.source_type == source_type
+    ).first()
+    
+    if not entry:
+        raise HTTPException(404, "Enrichment data not found")
+    
+    entry.is_locked = locked
+    db.commit()
+    return {"status": "updated", "is_locked": locked}
+
 def run_discovery_task(company_id: int):
    # New Session for Background Task
    from .database import SessionLocal
@@ -616,15 +703,11 @@ def analyze_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db:
        return {"error": "No website to analyze. Run Discovery first."}

    # FORCE SCRAPE LOGIC
-    # If explicit force_scrape is requested OR if we want to ensure fresh data for debugging
-    # We delete the old scrape data.
-    # For now, let's assume every manual "Analyze" click implies a desire for fresh results if previous failed.
-    # But let's respect the flag from frontend if we add it later.
-    
-    # Always clearing scrape data for now to fix the "stuck cache" issue reported by user
+    # Respect Locked Data: Only delete if not locked.
    db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company.id,
-        EnrichmentData.source_type == "website_scrape"
+        EnrichmentData.source_type == "website_scrape",
+        EnrichmentData.is_locked == False
    ).delete()
    db.commit()

@@ -640,29 +723,97 @@ def run_analysis_task(company_id: int, url: str):

        logger.info(f"Running Analysis Task for {company.name}")

-        # 1. Scrape Website
-        scrape_result = scraper.scrape_url(url)
-        
-        # Save Scrape Data
-        existing_scrape_data = db.query(EnrichmentData).filter(
+        # 1. Scrape Website OR Use Locked Data
+        scrape_result = {}
+        existing_scrape = db.query(EnrichmentData).filter(
            EnrichmentData.company_id == company.id,
            EnrichmentData.source_type == "website_scrape"
        ).first()

-        if "text" in scrape_result and scrape_result["text"]:
-            if not existing_scrape_data:
-                db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result))
-            else:
-                existing_scrape_data.content = scrape_result
-                existing_scrape_data.updated_at = datetime.utcnow()
-        elif "error" in scrape_result:
-            logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}")
+        if existing_scrape and existing_scrape.is_locked:
+            logger.info(f"Using LOCKED scrape data for {company.name}")
+            scrape_result = dict(existing_scrape.content) # Copy dict
+
+            # Always ensure city/country from locked impressum data is synced to company
+            if "impressum" in scrape_result and scrape_result["impressum"]:
+                impressum_city = scrape_result["impressum"].get("city")
+                impressum_country = scrape_result["impressum"].get("country_code")
+                logger.info(f"Analysis task (locked data): Impressum found. City='{impressum_city}', Country='{impressum_country}'")
+                if impressum_city and company.city != impressum_city:
+                    logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'")
+                    company.city = impressum_city
+                if impressum_country and company.country != impressum_country:
+                    logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'")
+                    company.country = impressum_country
+            
+            text_val = scrape_result.get("text")
+            text_len = len(text_val) if text_val else 0
+            logger.info(f"Locked data keys: {list(scrape_result.keys())}, Text length: {text_len}")
+
+            # AUTO-FIX: If locked data (e.g. Manual Impressum) has no text, fetch main website text
+            if text_len < 100:
+                logger.info(f"Locked data missing text (len={text_len}). Fetching content from {url}...")
+                try:
+                    fresh_scrape = scraper.scrape_url(url)
+                except Exception as e:
+                    logger.error(f"Fresh scrape failed: {e}", exc_info=True)
+                    fresh_scrape = {}
+
+                logger.info(f"Fresh scrape result keys: {list(fresh_scrape.keys())}")
+                
+                if "text" in fresh_scrape and len(fresh_scrape["text"]) > 100:
+                    logger.info(f"Fresh scrape successful. Text len: {len(fresh_scrape['text'])}")
+                    # Update local dict for current processing
+                    scrape_result["text"] = fresh_scrape["text"]
+                    scrape_result["title"] = fresh_scrape.get("title", "")
+                    
+                    # Update DB (Merge into existing content)
+                    updated_content = dict(existing_scrape.content)
+                    updated_content["text"] = fresh_scrape["text"]
+                    updated_content["title"] = fresh_scrape.get("title", "")
+                    
+                    existing_scrape.content = updated_content
+                    existing_scrape.updated_at = datetime.utcnow()
+                    # db.commit() here would be too early
+                    logger.info("Updated locked record with fresh website text in session.")
+                else:
+                    logger.warning(f"Fresh scrape returned insufficient text. Error: {fresh_scrape.get('error')}")
+        else:
+            # Standard Scrape
+            scrape_result = scraper.scrape_url(url)
+            
+            # Update company fields from impressum if found during scrape
+            if "impressum" in scrape_result and scrape_result["impressum"]:
+                impressum_city = scrape_result["impressum"].get("city")
+                impressum_country = scrape_result["impressum"].get("country_code")
+                logger.info(f"Analysis task (standard scrape): Impressum found. City='{impressum_city}', Country='{impressum_country}'")
+                if impressum_city and company.city != impressum_city:
+                    logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'")
+                    company.city = impressum_city
+                if impressum_country and company.country != impressum_country:
+                    logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'")
+                    company.country = impressum_country
+            
+            # Save Scrape Data
+            if "text" in scrape_result and scrape_result["text"]:
+                if not existing_scrape:
+                    db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result))
+                else:
+                    existing_scrape.content = scrape_result
+                    existing_scrape.updated_at = datetime.utcnow()
+            elif "error" in scrape_result:
+                logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}")

        # 2. Classify Robotics Potential
-        if "text" in scrape_result and scrape_result["text"]:
+        text_content = scrape_result.get("text")
+        
+        logger.info(f"Preparing classification. Text content length: {len(text_content) if text_content else 0}")
+
+        if text_content and len(text_content) > 100:
+            logger.info(f"Starting classification for {company.name}...")
            analysis = classifier.analyze_robotics_potential(
                company_name=company.name,
-                website_text=scrape_result["text"]
+                website_text=text_content
            )
            
            if "error" in analysis:
@@ -672,10 +823,8 @@ def run_analysis_task(company_id: int, url: str):
                if industry:
                    company.industry_ai = industry
                
-                # Delete old signals
                db.query(Signal).filter(Signal.company_id == company.id).delete()

-                # Save new signals
                potentials = analysis.get("potentials", {})
                for signal_type, data in potentials.items():
                    new_signal = Signal(
@@ -687,7 +836,6 @@ def run_analysis_task(company_id: int, url: str):
                    )
                    db.add(new_signal)
                
-                # Save Full Analysis Blob (Business Model + Evidence)
                existing_analysis = db.query(EnrichmentData).filter(
                    EnrichmentData.company_id == company.id,
                    EnrichmentData.source_type == "ai_analysis"
@@ -702,6 +850,8 @@ def run_analysis_task(company_id: int, url: str):
                company.status = "ENRICHED"
                company.last_classification_at = datetime.utcnow()
                logger.info(f"Robotics analysis complete for {company.name}.")
+        else:
+            logger.warning(f"Skipping classification for {company.name}: Insufficient text content (len={len(text_content) if text_content else 0})")

        db.commit()
        logger.info(f"Analysis finished for {company.id}")