- Fixed a critical bug in the company-explorer by forcing a database re-initialization with a new database file. This ensures the application code is in sync with the database schema. - Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md. - Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that was accidentally deleted. The guide now again includes detailed troubleshooting steps and code templates for common migration pitfalls.
113 lines
3.8 KiB
Python
import logging
|
|
import requests
|
|
import re
|
|
from typing import Optional, Dict, Tuple, Any
|
|
from urllib.parse import urlparse
|
|
|
|
from ..config import settings
|
|
from ..lib.core_utils import retry_on_failure, normalize_string, normalize_company_name, simple_normalize_url
|
|
from .wikipedia_service import WikipediaService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Hosts that are never a company's own homepage: social networks,
# business registries/directories, job boards, and media/reference sites.
# Subdomains of these hosts are rejected as well (see _is_credible_url).
BLACKLIST_DOMAINS = {
    # Social / professional networks
    "linkedin.com", "xing.com", "facebook.com", "instagram.com", "twitter.com",
    # Company registries and credit/business directories
    "northdata.de", "northdata.com", "firmenwissen.de", "creditreform.de",
    "dnb.com", "kompass.com", "wer-zu-wem.de",
    # Employer review and job portals
    "kununu.com", "glassdoor.com", "stepstone.de", "indeed.com", "monster.de",
    # Media / reference
    "youtube.com", "wikipedia.org",
}
|
|
|
|
class DiscoveryService:
    """Discovers official company websites and Wikipedia articles.

    Homepage discovery goes through SerpAPI's Google engine; Wikipedia
    lookups are delegated to ``WikipediaService``. All lookup methods
    return the placeholder string ``"k.A."`` ("keine Angabe") when
    nothing credible is found.
    """

    def __init__(self):
        # The SerpAPI key comes from application settings; without it the
        # web-search based methods short-circuit to "k.A.".
        self.api_key = settings.SERP_API_KEY
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")

        self.wiki_service = WikipediaService()

    @retry_on_failure(max_retries=2)
    def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
        """
        Uses Google Search via SerpAPI to find the most likely official homepage.

        Args:
            company_name: Company name to search for.
            city: Optional city appended to the query to narrow results.

        Returns:
            The first organic result URL that passes ``_is_credible_url``,
            or "k.A." when the API key is missing, no results come back,
            or the request fails.
        """
        if not self.api_key:
            return "k.A."

        query = f"{company_name} offizielle Website"
        if city:
            query += f" {city}"

        logger.info(f"Searching website for: {query}")

        try:
            params = {
                "engine": "google",
                "q": query,
                "api_key": self.api_key,
                "num": 5,
                "gl": "de",  # geolocation: Germany
                "hl": "de",  # interface language: German
            }
            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
            response.raise_for_status()
            data = response.json()

            if "organic_results" not in data:
                return "k.A."

            # Take the first result that is not a blacklisted portal/directory.
            for result in data["organic_results"]:
                link = result.get("link", "")
                if self._is_credible_url(link):
                    return link

            return "k.A."

        except Exception as e:
            # NOTE(review): catching here means @retry_on_failure never sees
            # a failure, so transient network errors are not retried — confirm
            # whether the decorator is intended to apply. Behavior preserved.
            logger.error(f"SerpAPI Error: {e}", exc_info=True)
            return "k.A."

    @retry_on_failure(max_retries=2)
    def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str:
        """
        Searches for a specific German Wikipedia article using the robust WikipediaService.
        Includes validation via website domain and city.

        Args:
            company_name: Company name to search for.
            website: Optional known homepage, used for validation.
            city: Optional CRM city, used for validation.

        Returns:
            The article URL, or "k.A." if no matching article is found.
        """
        # Pass all available info for robust search and validation.
        page = self.wiki_service.search_company_article(
            company_name=company_name,
            website=website,
            crm_city=city,
        )

        if page:
            return page.url

        return "k.A."

    def extract_wikipedia_data(self, url: str) -> dict:
        """
        Extracts full company data from a given Wikipedia URL.

        Returns the extracted data dict on success; on failure returns a
        dict carrying the original URL and the error message instead of
        raising, so callers can persist the failure.
        """
        try:
            return self.wiki_service.extract_company_data(url)
        except Exception as e:
            logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True)
            return {"url": url, "error": str(e)}

    def _is_credible_url(self, url: str) -> bool:
        """
        Filters out social media, directories, and junk.

        A URL is credible when its host (minus a leading "www.") is neither
        a blacklisted domain nor a subdomain of one.
        """
        if not url:
            return False
        try:
            domain = urlparse(url).netloc.lower()
            # Strip only a LEADING "www." — the previous str.replace() removed
            # every occurrence and could mangle hosts containing "www." elsewhere.
            if domain.startswith("www."):
                domain = domain[len("www."):]
            if domain in BLACKLIST_DOMAINS:
                return False
            # Reject subdomains of blacklisted hosts too (e.g. de.linkedin.com).
            for bad in BLACKLIST_DOMAINS:
                if domain.endswith("." + bad):
                    return False
            return True
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit). Any parse failure => not credible.
            return False