This commit introduces the foundational elements for the new "Company Explorer" web application, marking a significant step away from the legacy Google Sheets / CLI system. Key changes include: - Project Structure: a new directory with separate backend (FastAPI) and frontend (React/Vite) components. - Data Persistence: migration from Google Sheets to a local SQLite database using SQLAlchemy. - Core Utilities: extraction and cleanup of essential helper functions (LLM wrappers, text utilities) into a shared utilities module. - Backend Services: new service modules for AI-powered analysis and company-discovery logic. - Frontend UI: basic React application with a company table, import wizard, and dynamic inspector sidebar. - Docker Integration: updated build configuration for multi-stage builds and sideloading. - Deployment & Access: integrated into the central Nginx proxy and dashboard. Lessons Learned & Fixed during development: - Frontend Asset Loading: addressed issues with Vite's base path and FastAPI's static-file serving. - TypeScript Configuration: added the missing compiler and type-declaration settings. - Database Schema Evolution: solved schema errors by forcing a new database file and correcting the model override. - Logging: implemented robust file-based logging. This new foundation provides a powerful and maintainable platform for future B2B robotics lead generation.
127 lines
4.3 KiB
Python
127 lines
4.3 KiB
Python
import logging
|
||
import requests
|
||
import re
|
||
from typing import Optional, Dict, Tuple
|
||
from urllib.parse import urlparse
|
||
from ..config import settings
|
||
from ..lib.core_utils import retry_on_failure, normalize_string
|
||
|
||
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)

# Domains to ignore when looking for official company homepage.
# These are social networks, business registries/directories, job boards,
# and encyclopedic sites — results from them are never a company's own site.
BLACKLIST_DOMAINS = {
    "linkedin.com", "xing.com", "facebook.com", "instagram.com", "twitter.com",
    "northdata.de", "northdata.com", "firmenwissen.de", "creditreform.de",
    "dnb.com", "kompass.com", "wer-zu-wem.de", "kununu.com", "glassdoor.com",
    "stepstone.de", "indeed.com", "monster.de", "youtube.com", "wikipedia.org"
}
|
||
|
||
class DiscoveryService:
    """Finds official company websites and Wikipedia articles via SerpAPI.

    All public methods return the sentinel string "k.A." (German for
    "keine Angabe" / not available) instead of raising, so callers never
    need to handle exceptions or ``None``.
    """

    def __init__(self):
        # Fail soft: without a key the service can still be constructed,
        # but every lookup short-circuits to "k.A.".
        self.api_key = settings.SERP_API_KEY
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")

    @retry_on_failure(max_retries=2)
    def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
        """
        Uses Google Search via SerpAPI to find the most likely official homepage.

        Args:
            company_name: Company name to search for.
            city: Optional city used to narrow the query.

        Returns "k.A." if nothing credible is found.
        """
        if not self.api_key:
            return "k.A."

        query = f"{company_name} offizielle Website"
        if city:
            query += f" {city}"

        logger.info(f"Searching website for: {query}")

        try:
            # Take the first organic result that survives the blacklist filter.
            for result in self._serp_search(query, num=5):
                link = result.get("link", "")
                if self._is_credible_url(link):
                    return link
            return "k.A."
        except Exception as e:
            logger.error(f"SerpAPI Error: {e}")
            return "k.A."

    @retry_on_failure(max_retries=2)
    def find_wikipedia_url(self, company_name: str) -> str:
        """
        Searches for a specific German Wikipedia article.

        Returns "k.A." unless a de.wikipedia.org article with a title
        roughly matching the company name is found.
        """
        if not self.api_key:
            return "k.A."

        try:
            for result in self._serp_search(f"{company_name} Wikipedia", num=3):
                link = result.get("link", "")
                if "de.wikipedia.org/wiki/" in link:
                    # Basic validation: Is the title roughly the company?
                    title = result.get("title", "").replace(" – Wikipedia", "")
                    if self._check_name_similarity(company_name, title):
                        return link
            return "k.A."
        except Exception as e:
            logger.error(f"Wiki Search Error: {e}")
            return "k.A."

    def _serp_search(self, query: str, num: int) -> list:
        """Runs a German-locale Google query via SerpAPI.

        Shared by both public lookups (was duplicated inline in each).
        Returns the list of organic results (possibly empty); raises on
        HTTP/network errors so the callers' handlers and @retry_on_failure
        can react.
        """
        params = {
            "engine": "google",
            "q": query,
            "api_key": self.api_key,
            "num": num,
            "gl": "de",  # geolocation: Germany
            "hl": "de",  # interface language: German
        }
        response = requests.get("https://serpapi.com/search", params=params, timeout=15)
        response.raise_for_status()
        return response.json().get("organic_results", [])

    def _is_credible_url(self, url: str) -> bool:
        """Filters out social media, directories, and junk."""
        if not url:
            return False
        try:
            domain = urlparse(url).netloc.lower()
            # Strip only a *leading* "www." — str.replace("www.", "") would
            # also mangle hosts that merely contain "www." elsewhere.
            if domain.startswith("www."):
                domain = domain[4:]
            if domain in BLACKLIST_DOMAINS:
                return False
            # Also reject subdomains of blacklisted hosts (e.g. de.linkedin.com).
            return not any(domain.endswith("." + bad) for bad in BLACKLIST_DOMAINS)
        except ValueError:
            # urlparse raises ValueError on malformed URLs (was a bare except,
            # which also swallowed KeyboardInterrupt/SystemExit).
            return False

    def _check_name_similarity(self, name1: str, name2: str) -> bool:
        """Simple fuzzy check: True if one normalized name contains the other."""
        n1 = normalize_string(name1)
        n2 = normalize_string(name2)
        # Very permissive on purpose — used only as a sanity check on titles.
        return n1 in n2 or n2 in n1
|