feat(company-explorer): Initial Web UI & Backend with Enrichment Flow
This commit introduces the foundational elements for the new "Company Explorer" web application, marking a significant step away from the legacy Google Sheets / CLI system.

Key changes include:
- Project Structure: A new `company-explorer/` directory with separate backend (FastAPI) and frontend (React/Vite) components.
- Data Persistence: Migration from Google Sheets to a local SQLite database using SQLAlchemy.
- Core Utilities: Extraction and cleanup of essential helper functions (LLM wrappers, text utilities) into a shared module (`lib/core_utils`).
- Backend Services: Service modules for scraping (`services/scraping.py`) and AI-powered analysis, plus supporting logic.
- Frontend UI: Basic React application with a company table, an import wizard, and a dynamic inspector sidebar.
- Docker Integration: Updated Docker configuration for multi-stage builds and sideloading.
- Deployment & Access: Integrated into the central Nginx proxy and dashboard, through which the application is accessible.

Lessons learned and fixed during development:
- Frontend Asset Loading: Addressed issues with Vite's asset path and FastAPI's static file serving.
- TypeScript Configuration: Added the missing TypeScript configuration files.
- Database Schema Evolution: Solved schema errors by forcing a new database file and correcting a configuration override.
- Logging: Implemented robust file-based logging.

This new foundation provides a powerful and maintainable platform for future B2B robotics lead generation.
This commit is contained in:
82
company-explorer/backend/services/scraping.py
Normal file
82
company-explorer/backend/services/scraping.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import logging
|
||||
import requests
|
||||
import random
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Optional, Dict
|
||||
from ..lib.core_utils import clean_text, retry_on_failure
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Desktop browser User-Agent strings (Chrome, Safari, Firefox); the scraper
# picks one at random for each outgoing request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]
|
||||
|
||||
class ScraperService:
    """Fetch a web page and reduce it to cleaned text plus basic metadata.

    Attributes:
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
    """

    # Tags stripped before text extraction: scripts/styles, page chrome and
    # interactive widgets that carry no descriptive content.
    _JUNK_TAGS = (
        'script', 'style', 'noscript', 'iframe', 'svg',
        'header', 'footer', 'nav', 'aside', 'form', 'button',
    )

    # Basic e-mail matcher; deliberately simple, not RFC-complete.
    _EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

    # Matches <meta name="description"> regardless of attribute-value casing.
    _META_DESCRIPTION_NAME = re.compile(r'^description$', re.IGNORECASE)

    # Content types that are parsed as HTML.
    _HTML_CONTENT_TYPES = ('text/html', 'application/xhtml+xml')

    # Upper bound on returned text length, to avoid context overflow downstream.
    MAX_TEXT_CHARS = 25000

    # Upper bound on the number of e-mail addresses returned.
    MAX_EMAILS = 5

    def __init__(self, timeout: int = 15):
        """Store the per-request timeout (seconds)."""
        self.timeout = timeout

    @retry_on_failure(max_retries=2)
    def scrape_url(self, url: str) -> Dict[str, str]:
        """Fetch a URL and return cleaned text content plus meta info.

        A URL without an explicit ``http://`` or ``https://`` scheme is
        treated as a bare host and prefixed with ``https://``.

        Args:
            url: Absolute URL or bare host name of the page to fetch.

        Returns:
            On success, the dict produced by ``_parse_html`` (keys ``title``,
            ``description``, ``text``, ``emails``). On failure, a dict with a
            single ``error`` key: ``"Not HTML"`` for non-HTML responses,
            otherwise the stringified exception.

        Raises:
            requests.exceptions.SSLError: Only when the failing URL is not an
                ``https://`` URL. Every other exception is converted into an
                ``error`` dict, so the retry decorator presumably only ever
                sees this case -- confirm against ``retry_on_failure``.
        """
        url = url.strip()
        # Require a real scheme: a plain startswith("http") check would treat
        # hosts such as "httpbin.org" as already having one.
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url

        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            # verify=False is risky but often needed for poorly configured corporate sites
            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
            response.raise_for_status()

            # Check Content Type
            content_type = response.headers.get('Content-Type', '').lower()
            if not any(kind in content_type for kind in self._HTML_CONTENT_TYPES):
                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
                return {"error": "Not HTML"}

            return self._parse_html(response.content)

        except requests.exceptions.SSLError:
            # Retry with HTTP if HTTPS fails
            if url.lower().startswith("https://"):
                logger.info(f"SSL failed for {url}, retrying with http://...")
                # Swap only the leading scheme; str.replace would also rewrite
                # any "https://" embedded later in the URL (e.g. in a query).
                return self.scrape_url("http://" + url[len("https://"):])
            raise
        except Exception as e:
            logger.error(f"Scraping failed for {url}: {e}")
            return {"error": str(e)}

    def _parse_html(self, html_content: bytes) -> Dict[str, str]:
        """Extract title, meta description, body text and e-mails from HTML.

        Args:
            html_content: Raw response body.

        Returns:
            Dict with ``title`` and ``description`` (passed through
            ``clean_text``), ``text`` (cleaned and truncated to
            ``MAX_TEXT_CHARS``) and ``emails`` (a list of up to
            ``MAX_EMAILS`` unique addresses in order of first appearance).
            NOTE(review): ``emails`` is a list, so the ``Dict[str, str]``
            annotation is looser than the actual value.
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        # 1. Cleanup Junk
        for element in soup(list(self._JUNK_TAGS)):
            element.decompose()

        # 2. Extract Title & Meta Description
        # get_text() always yields a str; .string is None for an empty <title>
        # or one containing nested markup.
        title = soup.title.get_text() if soup.title else ""
        meta_tag = soup.find('meta', attrs={'name': self._META_DESCRIPTION_NAME})
        meta_desc = (meta_tag.get('content') or "") if meta_tag else ""

        # 3. Extract Main Text
        # Prefer body, fallback to full soup
        body = soup.find('body')
        container = body if body is not None else soup
        raw_text = container.get_text(separator=' ', strip=True)

        # 4. Extract Emails (Basic Regex)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # truncated result is stable between runs (a set is not).
        # NOTE(review): header/footer were removed in step 1, so addresses
        # that appear only there are not found -- confirm this is intended.
        unique_emails = list(dict.fromkeys(self._EMAIL_PATTERN.findall(raw_text)))

        return {
            "title": clean_text(title),
            "description": clean_text(meta_desc),
            "text": clean_text(raw_text)[:self.MAX_TEXT_CHARS],
            "emails": unique_emails[:self.MAX_EMAILS],
        }
|
||||
Reference in New Issue
Block a user