fix(ce): Resolve database schema mismatch and restore docs

- Fixed a critical schema mismatch bug in the company-explorer by forcing a database re-initialization with a new database file. This ensures the application code is in sync with the database schema.
- Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md.

- Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that was accidentally deleted. The guide now again includes detailed troubleshooting steps and code templates for common migration pitfalls.
This commit is contained in:
2026-01-15 15:54:45 +00:00
parent 4fcbbe3723
commit 4a336f6374
13 changed files with 724 additions and 555 deletions

View File

@@ -1,10 +1,11 @@
import logging
import requests
import re
from typing import Optional, Dict, Tuple
from typing import Optional, Dict, Tuple, Any
from urllib.parse import urlparse
from ..config import settings
from ..lib.core_utils import retry_on_failure, normalize_string
from ..lib.core_utils import retry_on_failure, normalize_string, normalize_company_name, simple_normalize_url
from .wikipedia_service import WikipediaService
logger = logging.getLogger(__name__)
@@ -23,7 +24,6 @@ class DiscoveryService:
if not self.api_key:
logger.warning("SERP_API_KEY not set. Discovery features will fail.")
# Initialize the specialized Wikipedia Service
self.wiki_service = WikipediaService()
@retry_on_failure(max_retries=2)
@@ -60,42 +60,31 @@ class DiscoveryService:
for result in data["organic_results"]:
link = result.get("link", "")
if self._is_credible_url(link):
# Simple heuristic: If the company name is part of the domain, high confidence
# Otherwise, take the first credible result.
return link
return "k.A."
except Exception as e:
logger.error(f"SerpAPI Error: {e}")
logger.error(f"SerpAPI Error: {e}", exc_info=True)
return "k.A."
@retry_on_failure(max_retries=2)
def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str:
    """
    Search for a specific German Wikipedia article for a company.

    Delegates to the robust WikipediaService, which validates candidate
    articles against the company's website domain and city when provided.

    Args:
        company_name: Name of the company to look up.
        website: Optional company website URL used for validation.
        city: Optional CRM city used for validation.

    Returns:
        The article URL, or "k.A." ("keine Angabe") when no API key is
        configured or no matching article is found.

    Note:
        Exceptions are deliberately NOT caught here so that the
        @retry_on_failure decorator on this method can retry transient
        failures.
    """
    if not self.api_key:
        # Without a SERP_API_KEY the discovery pipeline is disabled.
        return "k.A."
    # Pass all available info for robust search and validation.
    page = self.wiki_service.search_company_article(
        company_name=company_name,
        website=website,
        crm_city=city,
    )
    if page:
        return page.url
    return "k.A."
def extract_wikipedia_data(self, url: str) -> dict:
    """
    Extract structured company data from a Wikipedia article URL.

    Delegates to the WikipediaService; any failure is logged with a full
    traceback and reported back to the caller instead of raising, so a
    single bad article does not abort a batch run.

    Args:
        url: URL of the Wikipedia article to extract data from.

    Returns:
        The extracted data dict on success, or ``{"url": ..., "error": ...}``
        describing the failure.
    """
    try:
        return self.wiki_service.extract_company_data(url)
    except Exception as e:
        # exc_info=True preserves the traceback for debugging extraction bugs.
        logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True)
        return {"url": url, "error": str(e)}
def _is_credible_url(self, url: str) -> bool:
"""Filters out social media, directories, and junk."""
"""
Filters out social media, directories, and junk.
"""
if not url: return False
try:
domain = urlparse(url).netloc.lower().replace("www.", "")
if domain in BLACKLIST_DOMAINS:
return False
# Check for subdomains of blacklist (e.g. de.linkedin.com)
for bad in BLACKLIST_DOMAINS:
if domain.endswith("." + bad):
return False
return True
except:
return False
return False