Files
Brancheneinstufung2/company-explorer/backend/services/discovery.py
Floke 2c7bb262ef feat(company-explorer): Initial Web UI & Backend with Enrichment Flow
This commit introduces the foundational elements for the new "Company Explorer" web application, marking a significant step away from the legacy Google Sheets / CLI system.

Key changes include:
- Project Structure: A new `company-explorer` directory with separate backend (FastAPI) and frontend (React/Vite) components.
- Data Persistence: Migration from Google Sheets to a local SQLite database, managed via SQLAlchemy.
- Core Utilities: Extraction and cleanup of essential helper functions (LLM wrappers, text utilities) into `lib/core_utils`.
- Backend Services: dedicated service modules (e.g. discovery) for AI-powered analysis and enrichment logic.
- Frontend UI: Basic React application with company table, import wizard, and dynamic inspector sidebar.
- Docker Integration: Updated Docker build configuration for multi-stage builds and sideloading.
- Deployment & Access: Integrated into the central Nginx proxy and dashboard.

Lessons Learned & Fixed during development:
- Frontend Asset Loading: Addressed issues with Vite's asset base path and FastAPI's static file serving.
- TypeScript Configuration: Added the missing TypeScript configuration files.
- Database Schema Evolution: Solved schema-mismatch errors by forcing a new database file and correcting a settings override.
- Logging: Implemented robust file-based logging.

This new foundation provides a powerful and maintainable platform for future B2B robotics lead generation.
2026-01-07 17:55:08 +00:00

127 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import logging
import requests
import re
from typing import Optional, Dict, Tuple
from urllib.parse import urlparse
from ..config import settings
from ..lib.core_utils import retry_on_failure, normalize_string
logger = logging.getLogger(__name__)
# Domains that can never be an official company homepage: social networks,
# business directories, credit/registry databases, job boards, and reference
# sites. Subdomains (e.g. de.linkedin.com) are filtered as well by
# DiscoveryService._is_credible_url.
BLACKLIST_DOMAINS = {
    "linkedin.com", "xing.com", "facebook.com", "instagram.com", "twitter.com",
    "northdata.de", "northdata.com", "firmenwissen.de", "creditreform.de",
    "dnb.com", "kompass.com", "wer-zu-wem.de", "kununu.com", "glassdoor.com",
    "stepstone.de", "indeed.com", "monster.de", "youtube.com", "wikipedia.org"
}
class DiscoveryService:
    """Finds official company websites and Wikipedia articles via SerpAPI.

    All public methods degrade gracefully: instead of raising, they return
    the placeholder string "k.A." ("keine Angabe" / not available) when the
    API key is missing, the request fails, or no credible result is found.
    """

    def __init__(self):
        # Without an API key every lookup short-circuits to "k.A.".
        self.api_key = settings.SERP_API_KEY
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")

    @retry_on_failure(max_retries=2)
    def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
        """
        Uses Google Search via SerpAPI to find the most likely official homepage.

        Args:
            company_name: Name of the company to search for.
            city: Optional city used to disambiguate common company names.

        Returns:
            The first credible (non-blacklisted) organic result URL, or
            "k.A." if nothing credible is found or the request fails.
        """
        if not self.api_key:
            return "k.A."
        query = f"{company_name} offizielle Website"
        if city:
            query += f" {city}"
        logger.info(f"Searching website for: {query}")
        try:
            params = {
                "engine": "google",
                "q": query,
                "api_key": self.api_key,
                "num": 5,
                "gl": "de",  # geolocation bias: Germany
                "hl": "de",  # interface language: German
            }
            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
            response.raise_for_status()
            data = response.json()
            if "organic_results" not in data:
                return "k.A."
            for result in data["organic_results"]:
                link = result.get("link", "")
                if self._is_credible_url(link):
                    # Simple heuristic: If the company name is part of the domain, high confidence
                    # Otherwise, take the first credible result.
                    return link
            return "k.A."
        except Exception as e:
            # Best-effort lookup: network/HTTP/JSON errors degrade to "k.A.".
            logger.error(f"SerpAPI Error: {e}")
            return "k.A."

    @retry_on_failure(max_retries=2)
    def find_wikipedia_url(self, company_name: str) -> str:
        """
        Searches for a specific German Wikipedia article.

        Args:
            company_name: Name of the company to search for.

        Returns:
            The URL of a de.wikipedia.org article whose title roughly matches
            the company name, or "k.A." if none is found or the request fails.
        """
        if not self.api_key:
            return "k.A."
        query = f"{company_name} Wikipedia"
        try:
            params = {
                "engine": "google",
                "q": query,
                "api_key": self.api_key,
                "num": 3,
                "gl": "de",
                "hl": "de",
            }
            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
            response.raise_for_status()
            data = response.json()
            for result in data.get("organic_results", []):
                link = result.get("link", "")
                if "de.wikipedia.org/wiki/" in link:
                    # Basic validation: Is the title roughly the company?
                    title = result.get("title", "").replace(" Wikipedia", "")
                    if self._check_name_similarity(company_name, title):
                        return link
            return "k.A."
        except Exception as e:
            logger.error(f"Wiki Search Error: {e}")
            return "k.A."

    def _is_credible_url(self, url: str) -> bool:
        """Filters out social media, directories, and junk.

        Returns True only when the URL's host (minus a leading "www.") is
        neither a blacklisted domain nor a subdomain of one.
        """
        if not url:
            return False
        try:
            domain = urlparse(url).netloc.lower()
            # Strip only a leading "www." prefix. The previous
            # .replace("www.", "") removed the substring anywhere in the
            # host, mangling domains that merely contain "www.".
            if domain.startswith("www."):
                domain = domain[4:]
            if domain in BLACKLIST_DOMAINS:
                return False
            # Check for subdomains of blacklist (e.g. de.linkedin.com)
            for bad in BLACKLIST_DOMAINS:
                if domain.endswith("." + bad):
                    return False
            return True
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; malformed URLs are simply not credible.
            return False

    def _check_name_similarity(self, name1: str, name2: str) -> bool:
        """Simple fuzzy check for validation.

        Very permissive: True if either normalized name is contained in the
        other (e.g. "Siemens" matches "Siemens AG").
        """
        n1 = normalize_string(name1)
        n2 = normalize_string(name2)
        # Very permissive: if one is contained in the other
        return n1 in n2 or n2 in n1