fix(ce): Resolve database schema mismatch and restore docs
- Fixed a critical database schema mismatch in the company-explorer by forcing a database re-initialization with a new database file. This ensures the application code is in sync with the database schema. - Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md. - Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that was accidentally deleted. The guide now again includes detailed troubleshooting steps and code templates for common migration pitfalls.
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
import logging
|
||||
import requests
|
||||
import re
|
||||
from typing import Optional, Dict, Tuple
|
||||
from typing import Optional, Dict, Tuple, Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ..config import settings
|
||||
from ..lib.core_utils import retry_on_failure, normalize_string
|
||||
from ..lib.core_utils import retry_on_failure, normalize_string, normalize_company_name, simple_normalize_url
|
||||
from .wikipedia_service import WikipediaService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -23,7 +24,6 @@ class DiscoveryService:
|
||||
if not self.api_key:
|
||||
logger.warning("SERP_API_KEY not set. Discovery features will fail.")
|
||||
|
||||
# Initialize the specialized Wikipedia Service
|
||||
self.wiki_service = WikipediaService()
|
||||
|
||||
@retry_on_failure(max_retries=2)
|
||||
@@ -60,42 +60,31 @@ class DiscoveryService:
|
||||
for result in data["organic_results"]:
|
||||
link = result.get("link", "")
|
||||
if self._is_credible_url(link):
|
||||
# Simple heuristic: If the company name is part of the domain, high confidence
|
||||
# Otherwise, take the first credible result.
|
||||
return link
|
||||
|
||||
return "k.A."
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"SerpAPI Error: {e}")
|
||||
logger.error(f"SerpAPI Error: {e}", exc_info=True)
|
||||
return "k.A."
|
||||
|
||||
@retry_on_failure(max_retries=2)
|
||||
def find_wikipedia_url(self, company_name: str, website: str = None, city: str = None) -> str:
|
||||
def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str:
|
||||
"""
|
||||
Searches for a specific German Wikipedia article using the robust WikipediaService.
|
||||
Includes validation via website domain and city.
|
||||
"""
|
||||
if not self.api_key:
|
||||
return "k.A."
|
||||
|
||||
try:
|
||||
# Delegate to the robust service
|
||||
# parent_name could be added if available in the future
|
||||
page = self.wiki_service.search_company_article(
|
||||
company_name=company_name,
|
||||
website=website,
|
||||
crm_city=city
|
||||
)
|
||||
|
||||
if page:
|
||||
return page.url
|
||||
|
||||
return "k.A."
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Wiki Search Error via Service: {e}")
|
||||
return "k.A."
|
||||
# Pass all available info for robust search and validation
|
||||
page = self.wiki_service.search_company_article(
|
||||
company_name=company_name,
|
||||
website=website,
|
||||
crm_city=city
|
||||
)
|
||||
|
||||
if page:
|
||||
return page.url
|
||||
|
||||
return "k.A."
|
||||
|
||||
def extract_wikipedia_data(self, url: str) -> dict:
|
||||
"""
|
||||
@@ -104,21 +93,21 @@ class DiscoveryService:
|
||||
try:
|
||||
return self.wiki_service.extract_company_data(url)
|
||||
except Exception as e:
|
||||
logger.error(f"Wiki Extraction Error for {url}: {e}")
|
||||
logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True)
|
||||
return {"url": url, "error": str(e)}
|
||||
|
||||
def _is_credible_url(self, url: str) -> bool:
|
||||
"""Filters out social media, directories, and junk."""
|
||||
"""
|
||||
Filters out social media, directories, and junk.
|
||||
"""
|
||||
if not url: return False
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower().replace("www.", "")
|
||||
if domain in BLACKLIST_DOMAINS:
|
||||
return False
|
||||
# Check for subdomains of blacklist (e.g. de.linkedin.com)
|
||||
for bad in BLACKLIST_DOMAINS:
|
||||
if domain.endswith("." + bad):
|
||||
return False
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
return False
|
||||
@@ -36,17 +36,30 @@ class ScraperService:
|
||||
response.raise_for_status()
|
||||
|
||||
# Check Content Type
|
||||
logger.debug(f"Response status: {response.status_code}")
|
||||
if response.headers is None:
|
||||
logger.error("Response headers is None!")
|
||||
return {"error": "No headers"}
|
||||
|
||||
content_type = response.headers.get('Content-Type', '').lower()
|
||||
if 'text/html' not in content_type:
|
||||
logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
|
||||
return {"error": "Not HTML"}
|
||||
|
||||
# Parse Main Page
|
||||
result = self._parse_html(response.content)
|
||||
try:
|
||||
result = self._parse_html(response.content)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in _parse_html: {e}", exc_info=True)
|
||||
return {"error": f"Parse error: {e}"}
|
||||
|
||||
# --- IMPRESSUM LOGIC ---
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
impressum_url = self._find_impressum_link(soup, url)
|
||||
try:
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
impressum_url = self._find_impressum_link(soup, url)
|
||||
except Exception as e:
|
||||
logger.error(f"Error finding impressum: {e}", exc_info=True)
|
||||
impressum_url = None
|
||||
|
||||
# FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
|
||||
if not impressum_url and url.count('/') > 3:
|
||||
@@ -160,7 +173,8 @@ class ScraperService:
|
||||
# LLM Extraction
|
||||
prompt = f"""
|
||||
Extract the official company details from this German 'Impressum' text.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
|
||||
If a field is missing, use null.
|
||||
|
||||
Text:
|
||||
@@ -184,40 +198,72 @@ class ScraperService:
|
||||
return None
|
||||
|
||||
def _parse_html(self, html_content: bytes) -> Dict[str, str]:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# 1. Cleanup Junk (Aggressive, matching legacy logic)
|
||||
# Removed 'a' tags to prevent menu links from polluting the text analysis
|
||||
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
|
||||
element.decompose()
|
||||
if not html_content:
|
||||
return {"title": "", "description": "", "text": "", "emails": []}
|
||||
|
||||
try:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# 1b. Remove common Cookie Banners / Popups by class/id heuristics
|
||||
for div in soup.find_all("div"):
|
||||
classes = str(div.get("class", "")).lower()
|
||||
ids = str(div.get("id", "")).lower()
|
||||
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
|
||||
div.decompose()
|
||||
# 1. Cleanup Junk
|
||||
# Safe removal of tags
|
||||
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
|
||||
if element: element.decompose()
|
||||
|
||||
# 1b. Remove common Cookie Banners (Defensive)
|
||||
try:
|
||||
for div in soup.find_all("div"):
|
||||
if not div: continue
|
||||
# .get can return None for attributes if not found? No, returns None if key not found.
|
||||
# But if div is somehow None (unlikely in loop), check first.
|
||||
|
||||
# Convert list of classes to string if needed
|
||||
cls_attr = div.get("class")
|
||||
classes = " ".join(cls_attr).lower() if isinstance(cls_attr, list) else str(cls_attr or "").lower()
|
||||
|
||||
id_attr = div.get("id")
|
||||
ids = str(id_attr or "").lower()
|
||||
|
||||
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
|
||||
div.decompose()
|
||||
except Exception as e:
|
||||
logger.warning(f"Error filtering divs: {e}")
|
||||
|
||||
# 2. Extract Title & Meta Description
|
||||
title = soup.title.string if soup.title else ""
|
||||
meta_desc = ""
|
||||
meta_tag = soup.find('meta', attrs={'name': 'description'})
|
||||
if meta_tag:
|
||||
meta_desc = meta_tag.get('content', '')
|
||||
# 2. Extract Title & Meta Description
|
||||
title = ""
|
||||
try:
|
||||
if soup.title and soup.title.string:
|
||||
title = soup.title.string
|
||||
except: pass
|
||||
|
||||
# 3. Extract Main Text
|
||||
# Prefer body, fallback to full soup
|
||||
body = soup.find('body')
|
||||
raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
|
||||
|
||||
cleaned_text = clean_text(raw_text)
|
||||
|
||||
# 4. Extract Emails (Basic Regex)
|
||||
emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text))
|
||||
|
||||
return {
|
||||
"title": clean_text(title),
|
||||
"description": clean_text(meta_desc),
|
||||
"text": cleaned_text[:25000], # Limit to avoid context overflow
|
||||
"emails": list(emails)[:5] # Limit to 5
|
||||
}
|
||||
meta_desc = ""
|
||||
try:
|
||||
meta_tag = soup.find('meta', attrs={'name': 'description'})
|
||||
if meta_tag:
|
||||
meta_desc = meta_tag.get('content', '') or ""
|
||||
except: pass
|
||||
|
||||
# 3. Extract Main Text
|
||||
try:
|
||||
body = soup.find('body')
|
||||
raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
|
||||
cleaned_text = clean_text(raw_text)
|
||||
except Exception as e:
|
||||
logger.warning(f"Text extraction failed: {e}")
|
||||
cleaned_text = ""
|
||||
|
||||
# 4. Extract Emails
|
||||
emails = []
|
||||
try:
|
||||
emails = list(set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', cleaned_text)))[:5]
|
||||
except: pass
|
||||
|
||||
return {
|
||||
"title": clean_text(title),
|
||||
"description": clean_text(meta_desc),
|
||||
"text": cleaned_text[:25000],
|
||||
"emails": emails
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
|
||||
return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}
|
||||
|
||||
@@ -352,7 +352,7 @@ class WikipediaService:
|
||||
extracted_country = region_to_country[suffix_in_klammer]
|
||||
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
||||
|
||||
if not extracted_country and ',' in temp_sitz:
|
||||
if not extracted_country and "," in temp_sitz:
|
||||
parts = [p.strip() for p in temp_sitz.split(',')]
|
||||
if len(parts) > 1:
|
||||
last_part_lower = parts[-1].lower()
|
||||
@@ -445,4 +445,4 @@ class WikipediaService:
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
except Exception as e:
|
||||
logger.error(f" -> Unexpected error extracting from '{str(url_or_page)[:100]}': {e}")
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
Reference in New Issue
Block a user