feat(company-explorer): Initial Web UI & Backend with Enrichment Flow

This commit introduces the foundational elements for the new "Company Explorer" web application, marking a significant step away from the legacy Google Sheets / CLI system. Key changes include: - Project Structure: A new `company-explorer` directory with separate backend (FastAPI) and frontend (React/Vite) components. - Data Persistence: Migration from Google Sheets to a local SQLite database using SQLAlchemy. - Core Utilities: Extraction and cleanup of essential helper functions (LLM wrappers, text utilities) into the backend's `lib/core_utils` module. - Backend Services: Dedicated service modules (including scraping) for AI-powered analysis and the related enrichment logic. - Frontend UI: Basic React application with company table, import wizard, and dynamic inspector sidebar. - Docker Integration: Updated Docker configuration files for multi-stage builds and sideloading. - Deployment & Access: Integrated into the central Nginx proxy and dashboard, through which the application is accessible. Lessons Learned & Fixed during development: - Frontend Asset Loading: Addressed issues with Vite's path configuration and FastAPI's serving of the built assets. - TypeScript Configuration: Added the missing TypeScript configuration files. - Database Schema Evolution: Solved schema-related errors by forcing a new database file and correcting the related override. - Logging: Implemented robust file-based logging. This new foundation provides a powerful and maintainable platform for future B2B robotics lead generation.
2026-01-07 17:55:08 +00:00
parent e27cc995f6
commit c6a37a3c17
51 changed files with 3475 additions and 2 deletions
--- a/company-explorer/backend/services/scraping.py
+++ b/company-explorer/backend/services/scraping.py
@@ -0,0 +1,82 @@
+import logging
+import requests
+import random
+import re
+from bs4 import BeautifulSoup
+from typing import Optional, Dict
+from ..lib.core_utils import clean_text, retry_on_failure
+
+# Module-level logger, named after this module's import path.
+logger = logging.getLogger(__name__)
+
+# Desktop browser User-Agent strings (Chrome, Safari, Firefox). One entry is
+# chosen at random for each outgoing request in ScraperService.scrape_url.
+USER_AGENTS = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
+]
+
+class ScraperService:
+    def __init__(self, timeout: int = 15):
+        self.timeout = timeout
+
+    @retry_on_failure(max_retries=2)
+    def scrape_url(self, url: str) -> Dict[str, str]:
+        """
+        Fetches a URL and returns cleaned text content + meta info.
+        """
+        if not url.startswith("http"):
+            url = "https://" + url
+
+        try:
+            headers = {'User-Agent': random.choice(USER_AGENTS)}
+            # verify=False is risky but often needed for poorly configured corporate sites
+            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
+            response.raise_for_status()
+            
+            # Check Content Type
+            content_type = response.headers.get('Content-Type', '').lower()
+            if 'text/html' not in content_type:
+                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
+                return {"error": "Not HTML"}
+
+            return self._parse_html(response.content)
+
+        except requests.exceptions.SSLError:
+            # Retry with HTTP if HTTPS fails
+            if url.startswith("https://"):
+                logger.info(f"SSL failed for {url}, retrying with http://...")
+                return self.scrape_url(url.replace("https://", "http://"))
+            raise
+        except Exception as e:
+            logger.error(f"Scraping failed for {url}: {e}")
+            return {"error": str(e)}
+
+    def _parse_html(self, html_content: bytes) -> Dict[str, str]:
+        soup = BeautifulSoup(html_content, 'html.parser')
+        
+        # 1. Cleanup Junk
+        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
+            element.decompose()
+            
+        # 2. Extract Title & Meta Description
+        title = soup.title.string if soup.title else ""
+        meta_desc = ""
+        meta_tag = soup.find('meta', attrs={'name': 'description'})
+        if meta_tag:
+            meta_desc = meta_tag.get('content', '')
+
+        # 3. Extract Main Text
+        # Prefer body, fallback to full soup
+        body = soup.find('body')
+        raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
+        
+        cleaned_text = clean_text(raw_text)
+        
+        # 4. Extract Emails (Basic Regex)
+        emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text))
+        
+        return {
+            "title": clean_text(title),
+            "description": clean_text(meta_desc),
+            "text": cleaned_text[:25000], # Limit to avoid context overflow
+            "emails": list(emails)[:5] # Limit to 5
+        }