fix(ce): Resolve database schema mismatch and restore docs

- Fixed a critical schema mismatch error in the company-explorer by forcing a database re-initialization with a new database file. This ensures the application code is in sync with the database schema.
- Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md.

- Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering from the git history extensive, valuable content that had been accidentally deleted. The guide once again includes detailed troubleshooting steps and code templates for common migration pitfalls.
This commit is contained in:
2026-01-15 15:54:45 +00:00
parent 9770387505
commit 23c57987bc
13 changed files with 724 additions and 555 deletions

View File

@@ -36,17 +36,30 @@ class ScraperService:
response.raise_for_status()
# Check Content Type
logger.debug(f"Response status: {response.status_code}")
if response.headers is None:
logger.error("Response headers is None!")
return {"error": "No headers"}
content_type = response.headers.get('Content-Type', '').lower()
if 'text/html' not in content_type:
logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
return {"error": "Not HTML"}
# Parse Main Page
result = self._parse_html(response.content)
try:
result = self._parse_html(response.content)
except Exception as e:
logger.error(f"Error in _parse_html: {e}", exc_info=True)
return {"error": f"Parse error: {e}"}
# --- IMPRESSUM LOGIC ---
soup = BeautifulSoup(response.content, 'html.parser')
impressum_url = self._find_impressum_link(soup, url)
try:
soup = BeautifulSoup(response.content, 'html.parser')
impressum_url = self._find_impressum_link(soup, url)
except Exception as e:
logger.error(f"Error finding impressum: {e}", exc_info=True)
impressum_url = None
# FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
if not impressum_url and url.count('/') > 3:
@@ -160,7 +173,8 @@ class ScraperService:
# LLM Extraction
prompt = f"""
Extract the official company details from this German 'Impressum' text.
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'.
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
If a field is missing, use null.
Text:
@@ -184,40 +198,72 @@ class ScraperService:
return None
def _parse_html(self, html_content: bytes) -> Dict[str, str]:
soup = BeautifulSoup(html_content, 'html.parser')
# 1. Cleanup Junk (Aggressive, matching legacy logic)
# Removed 'a' tags to prevent menu links from polluting the text analysis
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
element.decompose()
if not html_content:
return {"title": "", "description": "", "text": "", "emails": []}
try:
soup = BeautifulSoup(html_content, 'html.parser')
# 1b. Remove common Cookie Banners / Popups by class/id heuristics
for div in soup.find_all("div"):
classes = str(div.get("class", "")).lower()
ids = str(div.get("id", "")).lower()
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
div.decompose()
# 1. Cleanup Junk
# Safe removal of tags
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
if element: element.decompose()
# 1b. Remove common Cookie Banners (Defensive)
try:
for div in soup.find_all("div"):
if not div: continue
# div.get() returns None when the attribute is absent, so both values are defaulted to "".
# A None div is unlikely inside this loop, but it is skipped above just in case.
# "class" may come back as a list of class names; join it into one string before matching.
cls_attr = div.get("class")
classes = " ".join(cls_attr).lower() if isinstance(cls_attr, list) else str(cls_attr or "").lower()
id_attr = div.get("id")
ids = str(id_attr or "").lower()
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
div.decompose()
except Exception as e:
logger.warning(f"Error filtering divs: {e}")
# 2. Extract Title & Meta Description
title = soup.title.string if soup.title else ""
meta_desc = ""
meta_tag = soup.find('meta', attrs={'name': 'description'})
if meta_tag:
meta_desc = meta_tag.get('content', '')
# 2. Extract Title & Meta Description
title = ""
try:
if soup.title and soup.title.string:
title = soup.title.string
except: pass
# 3. Extract Main Text
# Prefer body, fallback to full soup
body = soup.find('body')
raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
cleaned_text = clean_text(raw_text)
# 4. Extract Emails (Basic Regex)
emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text))
return {
"title": clean_text(title),
"description": clean_text(meta_desc),
"text": cleaned_text[:25000], # Limit to avoid context overflow
"emails": list(emails)[:5] # Limit to 5
}
meta_desc = ""
try:
meta_tag = soup.find('meta', attrs={'name': 'description'})
if meta_tag:
meta_desc = meta_tag.get('content', '') or ""
except: pass
# 3. Extract Main Text
try:
body = soup.find('body')
raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
cleaned_text = clean_text(raw_text)
except Exception as e:
logger.warning(f"Text extraction failed: {e}")
cleaned_text = ""
# 4. Extract Emails
emails = []
try:
emails = list(set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', cleaned_text)))[:5]
except: pass
return {
"title": clean_text(title),
"description": clean_text(meta_desc),
"text": cleaned_text[:25000],
"emails": emails
}
except Exception as e:
logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}