This commit introduces the foundational elements for the new "Company Explorer" web application, marking a significant step away from the legacy Google Sheets / CLI system. Key changes include: - Project Structure: a new directory with separate backend (FastAPI) and frontend (React/Vite) components. - Data Persistence: migration from Google Sheets to a local SQLite database using SQLAlchemy. - Core Utilities: extraction and cleanup of essential helper functions (LLM wrappers, text utilities) into a shared utilities module. - Backend Services: new services for web scraping, AI-powered analysis, and data import logic. - Frontend UI: a basic React application with a company table, import wizard, and dynamic inspector sidebar. - Docker Integration: updated Docker configuration for multi-stage builds and sideloading. - Deployment & Access: integrated into the central Nginx proxy and dashboard. Lessons learned and fixed during development: - Frontend Asset Loading: addressed issues with Vite's base path and FastAPI's static file serving. - TypeScript Configuration: added the missing compiler configuration. - Database Schema Evolution: solved schema errors by forcing a new database file and correcting the model override. - Logging: implemented robust file-based logging. This new foundation provides a powerful and maintainable platform for future B2B robotics lead generation.
83 lines
3.2 KiB
Python
83 lines
3.2 KiB
Python
import logging
|
|
import requests
|
|
import random
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
from typing import Optional, Dict
|
|
from ..lib.core_utils import clean_text, retry_on_failure
|
|
|
|
# Module-level logger; handlers and level are expected to be configured by the
# application entry point (this module only emits records).
logger = logging.getLogger(__name__)

# Pool of realistic desktop-browser User-Agent strings. ScraperService.scrape_url
# picks one at random per request to reduce the chance of being blocked by
# naive bot filters that key on a single repeated UA.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
]
|
|
|
|
class ScraperService:
    """Fetches web pages and extracts cleaned text, metadata and email addresses.

    Uses a randomly rotated User-Agent (see module-level USER_AGENTS) and a
    bounded per-request timeout. TLS verification is intentionally disabled in
    scrape_url; see the comment there.
    """

    def __init__(self, timeout: int = 15):
        # Seconds to wait for a single HTTP request before giving up.
        self.timeout = timeout

    @retry_on_failure(max_retries=2)
    def scrape_url(self, url: str) -> Dict[str, str]:
        """
        Fetches a URL and returns cleaned text content + meta info.

        Returns the dict produced by _parse_html ("title", "description",
        "text", "emails") on success, or {"error": <reason>} when the content
        is not HTML or the request fails.
        """
        # Tolerate bare domains ("example.com") by defaulting to HTTPS.
        if not url.startswith("http"):
            url = "https://" + url

        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            # verify=False is risky but often needed for poorly configured corporate sites
            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
            response.raise_for_status()

            # Only parse HTML; skip PDFs, images and other binary responses.
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type:
                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
                return {"error": "Not HTML"}

            return self._parse_html(response.content)

        except requests.exceptions.SSLError:
            # Retry with plain HTTP if the HTTPS handshake fails outright.
            # NOTE: this recursive call passes through @retry_on_failure again,
            # so the downgraded request gets its own retry budget.
            if url.startswith("https://"):
                logger.info(f"SSL failed for {url}, retrying with http://...")
                return self.scrape_url(url.replace("https://", "http://"))
            raise
        except Exception as e:
            # Broad catch is deliberate: a single bad site must not abort a
            # batch scrape. The error is logged and surfaced to the caller.
            logger.error(f"Scraping failed for {url}: {e}")
            return {"error": str(e)}

    def _parse_html(self, html_content: bytes) -> Dict[str, str]:
        """Parses raw HTML bytes into title, meta description, body text and emails."""
        soup = BeautifulSoup(html_content, 'html.parser')

        # 1. Cleanup Junk: drop non-content elements so get_text() yields only
        #    the page's visible prose.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
            element.decompose()

        # 2. Extract Title & Meta Description.
        # FIX: soup.title.string is None for an empty <title></title>; coalesce
        # to "" so clean_text() never receives None.
        title = (soup.title.string or "") if soup.title else ""
        meta_desc = ""
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag:
            # FIX: .get('content', '') returns None (not the default) when the
            # attribute is present but valueless — coalesce explicitly.
            meta_desc = meta_tag.get('content') or ""

        # 3. Extract Main Text — prefer <body>, fall back to the full document.
        body = soup.find('body')
        raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)

        cleaned_text = clean_text(raw_text)

        # 4. Extract Emails (basic regex). Searched on raw_text, before
        #    clean_text() can alter the addresses.
        emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text))

        return {
            "title": clean_text(title),
            "description": clean_text(meta_desc),
            "text": cleaned_text[:25000],  # Limit to avoid context overflow
            "emails": list(emails)[:5]  # Limit to 5
        }
|