Files
Brancheneinstufung2/company-explorer/backend/services/discovery.py
Floke 4a336f6374 fix(ce): Resolve database schema mismatch and restore docs
- Fixed a critical schema mismatch in the company-explorer by forcing a database re-initialization with a new database file. This ensures the application code is in sync with the database schema.
- Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md.

- Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that was accidentally deleted. The guide now again includes detailed troubleshooting steps and code templates for common migration pitfalls.
2026-01-15 15:54:45 +00:00

113 lines
3.8 KiB
Python

import logging
import requests
import re
from typing import Optional, Dict, Tuple, Any
from urllib.parse import urlparse
from ..config import settings
from ..lib.core_utils import retry_on_failure, normalize_string, normalize_company_name, simple_normalize_url
from .wikipedia_service import WikipediaService
logger = logging.getLogger(__name__)

# Domains to ignore when looking for the official company homepage:
# social networks, business directories/registries, job boards, and other
# aggregators that often outrank a small company's own website in search
# results. Matched exactly or as a parent domain of the result's host.
BLACKLIST_DOMAINS = {
    "linkedin.com", "xing.com", "facebook.com", "instagram.com", "twitter.com",
    "northdata.de", "northdata.com", "firmenwissen.de", "creditreform.de",
    "dnb.com", "kompass.com", "wer-zu-wem.de", "kununu.com", "glassdoor.com",
    "stepstone.de", "indeed.com", "monster.de", "youtube.com", "wikipedia.org"
}
class DiscoveryService:
    """Discovers company web presence (homepage, Wikipedia article) via external services."""

    def __init__(self):
        # SerpAPI key comes from application settings; without it,
        # find_company_website short-circuits to "k.A." (see below).
        self.api_key = settings.SERP_API_KEY
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")
        # Handles Wikipedia search, validation and data extraction.
        self.wiki_service = WikipediaService()
@retry_on_failure(max_retries=2)
def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
"""
Uses Google Search via SerpAPI to find the most likely official homepage.
Returns "k.A." if nothing credible is found.
"""
if not self.api_key:
return "k.A."
query = f"{company_name} offizielle Website"
if city:
query += f" {city}"
logger.info(f"Searching website for: {query}")
try:
params = {
"engine": "google",
"q": query,
"api_key": self.api_key,
"num": 5,
"gl": "de",
"hl": "de"
}
response = requests.get("https://serpapi.com/search", params=params, timeout=15)
response.raise_for_status()
data = response.json()
if "organic_results" not in data:
return "k.A."
for result in data["organic_results"]:
link = result.get("link", "")
if self._is_credible_url(link):
return link
return "k.A."
except Exception as e:
logger.error(f"SerpAPI Error: {e}", exc_info=True)
return "k.A."
@retry_on_failure(max_retries=2)
def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str:
"""
Searches for a specific German Wikipedia article using the robust WikipediaService.
Includes validation via website domain and city.
"""
# Pass all available info for robust search and validation
page = self.wiki_service.search_company_article(
company_name=company_name,
website=website,
crm_city=city
)
if page:
return page.url
return "k.A."
def extract_wikipedia_data(self, url: str) -> dict:
"""
Extracts full company data from a given Wikipedia URL.
"""
try:
return self.wiki_service.extract_company_data(url)
except Exception as e:
logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True)
return {"url": url, "error": str(e)}
def _is_credible_url(self, url: str) -> bool:
"""
Filters out social media, directories, and junk.
"""
if not url: return False
try:
domain = urlparse(url).netloc.lower().replace("www.", "")
if domain in BLACKLIST_DOMAINS:
return False
for bad in BLACKLIST_DOMAINS:
if domain.endswith("." + bad):
return False
return True
except:
return False