fix(ce): Resolve database schema mismatch and restore docs

- Fixed a critical schema-mismatch error in the company-explorer by forcing a database re-initialization with a new database file. This ensures the application code is in sync with the database schema.
- Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md.
- Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering, from the git history, extensive and valuable content that had been accidentally deleted. The guide once again includes detailed troubleshooting steps and code templates for common migration pitfalls.
2026-01-15 15:54:45 +00:00
parent 9770387505
commit 23c57987bc
13 changed files with 724 additions and 555 deletions
--- a/company-explorer/backend/services/scraping.py
+++ b/company-explorer/backend/services/scraping.py
@@ -36,17 +36,30 @@ class ScraperService:
            response.raise_for_status()
            
            # Check Content Type
+            logger.debug(f"Response status: {response.status_code}")
+            if response.headers is None:
+                logger.error("Response headers is None!")
+                return {"error": "No headers"}
+                
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type:
                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
                return {"error": "Not HTML"}

            # Parse Main Page
-            result = self._parse_html(response.content)
+            try:
+                result = self._parse_html(response.content)
+            except Exception as e:
+                logger.error(f"Error in _parse_html: {e}", exc_info=True)
+                return {"error": f"Parse error: {e}"}
            
            # --- IMPRESSUM LOGIC ---
-            soup = BeautifulSoup(response.content, 'html.parser')
-            impressum_url = self._find_impressum_link(soup, url)
+            try:
+                soup = BeautifulSoup(response.content, 'html.parser')
+                impressum_url = self._find_impressum_link(soup, url)
+            except Exception as e:
+                logger.error(f"Error finding impressum: {e}", exc_info=True)
+                impressum_url = None
            
            # FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
            if not impressum_url and url.count('/') > 3:
@@ -160,7 +173,8 @@ class ScraperService:
            # LLM Extraction
            prompt = f"""
            Extract the official company details from this German 'Impressum' text.
-            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'.
+            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
+            'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
            If a field is missing, use null.
            
            Text:
@@ -184,40 +198,72 @@ class ScraperService:
            return None

    def _parse_html(self, html_content: bytes) -> Dict[str, str]:
-        soup = BeautifulSoup(html_content, 'html.parser')
-        
-        # 1. Cleanup Junk (Aggressive, matching legacy logic)
-        # Removed 'a' tags to prevent menu links from polluting the text analysis
-        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
-            element.decompose()
+        if not html_content:
+            return {"title": "", "description": "", "text": "", "emails": []}
+
+        try:
+            soup = BeautifulSoup(html_content, 'html.parser')
            
-        # 1b. Remove common Cookie Banners / Popups by class/id heuristics
-        for div in soup.find_all("div"):
-            classes = str(div.get("class", "")).lower()
-            ids = str(div.get("id", "")).lower()
-            if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
-                div.decompose()
+            # 1. Cleanup Junk
+            # Safe removal of tags
+            for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
+                if element: element.decompose()
+                
+            # 1b. Remove common Cookie Banners (Defensive)
+            try:
+                for div in soup.find_all("div"):
+                    if not div: continue
+                    # .get can return None for attributes if not found? No, returns None if key not found.
+                    # But if div is somehow None (unlikely in loop), check first.
+                    
+                    # Convert list of classes to string if needed
+                    cls_attr = div.get("class")
+                    classes = " ".join(cls_attr).lower() if isinstance(cls_attr, list) else str(cls_attr or "").lower()
+                    
+                    id_attr = div.get("id")
+                    ids = str(id_attr or "").lower()
+                    
+                    if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
+                        div.decompose()
+            except Exception as e:
+                logger.warning(f"Error filtering divs: {e}")

-        # 2. Extract Title & Meta Description
-        title = soup.title.string if soup.title else ""
-        meta_desc = ""
-        meta_tag = soup.find('meta', attrs={'name': 'description'})
-        if meta_tag:
-            meta_desc = meta_tag.get('content', '')
+            # 2. Extract Title & Meta Description
+            title = ""
+            try:
+                if soup.title and soup.title.string:
+                    title = soup.title.string
+            except: pass

-        # 3. Extract Main Text
-        # Prefer body, fallback to full soup
-        body = soup.find('body')
-        raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
-        
-        cleaned_text = clean_text(raw_text)
-        
-        # 4. Extract Emails (Basic Regex)
-        emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text))
-        
-        return {
-            "title": clean_text(title),
-            "description": clean_text(meta_desc),
-            "text": cleaned_text[:25000], # Limit to avoid context overflow
-            "emails": list(emails)[:5] # Limit to 5
-        }
+            meta_desc = ""
+            try:
+                meta_tag = soup.find('meta', attrs={'name': 'description'})
+                if meta_tag:
+                    meta_desc = meta_tag.get('content', '') or ""
+            except: pass
+
+            # 3. Extract Main Text
+            try:
+                body = soup.find('body')
+                raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
+                cleaned_text = clean_text(raw_text)
+            except Exception as e:
+                logger.warning(f"Text extraction failed: {e}")
+                cleaned_text = ""
+            
+            # 4. Extract Emails
+            emails = []
+            try:
+                emails = list(set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', cleaned_text)))[:5]
+            except: pass
+            
+            return {
+                "title": clean_text(title),
+                "description": clean_text(meta_desc),
+                "text": cleaned_text[:25000],
+                "emails": emails
+            }
+
+        except Exception as e:
+            logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
+            return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}