feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides
- Ported robust Wikipedia extraction logic (categories, first paragraph) from the legacy system.
- Implemented database-driven Robotics Category configuration with a frontend settings UI.
- Updated the Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for the Wikipedia URL (with locking) and the Website URL (with a re-scrape trigger).
- Enhanced the Inspector UI with a Wikipedia profile, category tags, and action buttons.
This commit is contained in:
448
company-explorer/backend/services/wikipedia_service.py
Normal file
448
company-explorer/backend/services/wikipedia_service.py
Normal file
@@ -0,0 +1,448 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
wikipedia_service.py
|
||||
|
||||
Service class for interacting with Wikipedia, including search,
|
||||
validation, and extraction of company data.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import unquote
|
||||
|
||||
import requests
|
||||
import wikipedia
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Import settings and helpers
|
||||
from ..config import settings
|
||||
from ..lib.core_utils import (
|
||||
retry_on_failure,
|
||||
simple_normalize_url,
|
||||
normalize_company_name,
|
||||
extract_numeric_value,
|
||||
clean_text,
|
||||
fuzzy_similarity
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class WikipediaService:
    """
    Handles searching for Wikipedia articles and extracting relevant
    company data. Includes validation logic for articles.
    """

    def __init__(self, user_agent=None):
        """
        Set up the HTTP session, the infobox keyword map, and the
        wikipedia library defaults (language, rate limiting).

        Args:
            user_agent: Optional User-Agent string; a bot-style default
                is used when omitted.
        """
        self.user_agent = user_agent or 'Mozilla/5.0 (compatible; CompanyExplorer/1.0; +http://www.example.com/bot)'
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.user_agent})

        # Infobox header keywords (German/English) per logical target field;
        # consumed by _extract_infobox_value.
        self.keywords_map = {
            'branche': ['branche', 'wirtschaftszweig', 'industry', 'taetigkeit', 'sektor', 'produkte', 'leistungen'],
            'umsatz': ['umsatz', 'erloes', 'revenue', 'jahresumsatz', 'konzernumsatz', 'ergebnis'],
            'mitarbeiter': ['mitarbeiter', 'mitarbeiterzahl', 'beschaeftigte', 'employees', 'number of employees', 'personal', 'belegschaft'],
            'sitz': ['sitz', 'hauptsitz', 'unternehmenssitz', 'firmensitz', 'headquarters', 'standort', 'sitz des unternehmens', 'anschrift', 'adresse']
        }

        try:
            # Default to German for now, could be configurable
            language = 'de'
            wikipedia.set_lang(language)
            wikipedia.set_rate_limiting(False)
            logger.info(f"Wikipedia library language set to '{language}'. Rate limiting DISABLED.")
        except Exception as e:
            # Non-fatal: the service still works with the library's defaults.
            logger.warning(f"Error setting Wikipedia language or rate limiting: {e}")
@retry_on_failure(max_retries=3)
def serp_wikipedia_lookup(self, company_name: str, lang: str = 'de') -> str:
    """
    Searches for the best Wikipedia URL for a company using Google Search (via SerpAPI).
    Prioritizes Knowledge Graph hits and then organic results.

    Args:
        company_name (str): The name of the company to search for.
        lang (str): The language code for Wikipedia search (e.g., 'de').

    Returns:
        str: The URL of the best hit or None if nothing suitable was found.
    """
    logger.info(f"Starting SerpAPI Wikipedia search for '{company_name}'...")
    serp_key = settings.SERP_API_KEY
    if not serp_key:
        logger.warning("SerpAPI Key not configured. Skipping search.")
        return None

    # Restrict the search to the language-specific Wikipedia domain.
    query = f'site:{lang}.wikipedia.org "{company_name}"'
    params = {"engine": "google", "q": query, "api_key": serp_key, "hl": lang}

    try:
        # Use the shared session so the User-Agent configured in __init__ and
        # connection pooling apply (consistent with _get_page_soup).
        response = self.session.get("https://serpapi.com/search", params=params, timeout=15)
        response.raise_for_status()
        data = response.json()

        # 1. Check Knowledge Graph (highest priority)
        if "knowledge_graph" in data and "source" in data["knowledge_graph"]:
            source = data["knowledge_graph"]["source"]
            if "link" in source and f"{lang}.wikipedia.org" in source["link"]:
                url = source["link"]
                logger.info(f" -> Hit found in Knowledge Graph: {url}")
                return url

        # 2. Check organic results
        if "organic_results" in data:
            for result in data.get("organic_results", []):
                link = result.get("link")
                if link and f"{lang}.wikipedia.org/wiki/" in link:
                    logger.info(f" -> Best organic hit found: {link}")
                    return link

        logger.warning(f" -> No suitable Wikipedia URL found for '{company_name}' in SerpAPI results.")
        return None
    except Exception as e:
        # NOTE(review): this broad catch returns None instead of re-raising, so
        # @retry_on_failure never actually retries transient request errors here.
        logger.error(f"Error during SerpAPI request for '{company_name}': {e}")
        return None
@retry_on_failure(max_retries=3)
def _get_page_soup(self, url: str) -> BeautifulSoup:
    """
    Fetches HTML from a URL and returns a BeautifulSoup object.

    Returns None for a missing/non-string/non-HTTP(S) URL. Fetch or parse
    errors are logged and re-raised so @retry_on_failure can retry them.
    """
    if not url or not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
        logger.warning(f"_get_page_soup: Invalid URL '{str(url)[:100]}...'")
        return None
    try:
        response = self.session.get(url, timeout=15)
        response.raise_for_status()
        # Handle encoding: prefer the content-sniffed encoding over the
        # (often missing or wrong) header-declared one.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    except Exception as e:
        logger.error(f"_get_page_soup: Error fetching or parsing HTML from {str(url)[:100]}...: {e}")
        # Bare `raise` preserves the original traceback (was `raise e`).
        raise
def _extract_first_paragraph_from_soup(self, soup: BeautifulSoup) -> str:
    """
    Extracts the first meaningful paragraph from the Wikipedia article soup.

    Strips reference markers, hidden spans, and coordinate spans, then
    returns the first cleaned paragraph longer than 50 characters that is
    not a meta-paragraph (capped at 2000 characters). Returns "k.A." when
    nothing suitable is found. Mimics the legacy system's cleaning.
    """
    if not soup:
        return "k.A."
    result = "k.A."
    try:
        container = soup.find('div', class_='mw-parser-output')
        scope = container if container else soup
        # Prefer direct children of the content div; fall back to any <p>.
        candidates = scope.find_all('p', recursive=False) or scope.find_all('p')

        for candidate in candidates:
            # Remove references [1], [2], etc.
            for tag in candidate.find_all('sup', class_='reference'):
                tag.decompose()
            # Remove hidden spans
            for tag in candidate.find_all('span', style=lambda v: v and 'display:none' in v):
                tag.decompose()
            # Remove coordinates
            for tag in candidate.find_all('span', id='coordinates'):
                tag.decompose()

            cleaned = clean_text(candidate.get_text(separator=' ', strip=True))

            # Skip meta-paragraphs and paragraphs that are too short.
            if cleaned == "k.A." or len(cleaned) <= 50:
                continue
            if re.match(r'^(Datei:|Abbildung:|Siehe auch:|Einzelnachweise|Siehe auch|Literatur)', cleaned, re.IGNORECASE):
                continue
            result = cleaned[:2000]  # Limit length
            break
    except Exception as e:
        logger.error(f"Error extracting first paragraph: {e}")
    return result
def extract_categories(self, soup: BeautifulSoup) -> str:
    """
    Extracts Wikipedia categories from the soup object, filtering out meta-categories.

    Returns a comma-separated string of category labels, or "k.A." when
    none are found (or on error).
    """
    if not soup:
        return "k.A."
    collected = []
    try:
        catlinks = soup.find('div', id="mw-normal-catlinks")
        listing = catlinks.find('ul') if catlinks else None
        if listing:
            for item in listing.find_all('li'):
                label = clean_text(item.get_text())
                # Drop empties and the "Kategorien:" pseudo-entry.
                if label and isinstance(label, str) and label.strip() and "kategorien:" not in label.lower():
                    collected.append(label)
    except Exception as e:
        logger.error(f"Error extracting categories: {e}")
    return ", ".join(collected) if collected else "k.A."
def _validate_article(self, page, company_name: str, website: str, crm_city: str, parent_name: str = None) -> bool:
    """
    Validates fact-based whether a Wikipedia article matches the company.
    Prioritizes hard facts (Domain, City) over pure name similarity.

    Stages are checked in order; the first match wins:
      1. Company website domain appears in the article's external/infobox links.
      2. CRM city appears in the infobox "Sitz" value.
      3. Parent company name appears in the article title or summary.
      4. Fallback: fuzzy name similarity above a strict 0.85 threshold.

    Args:
        page: A wikipedia page object (must expose .html(), .title, .summary).
        company_name: Company name used for the similarity fallback.
        website: Company website; normalized for the domain check.
        crm_city: City from the CRM record, compared against the infobox.
        parent_name: Optional parent company name for stage 3.

    Returns:
        bool: True if any validation stage succeeds, else False.
    """
    if not page or not hasattr(page, 'html'):
        return False

    logger.debug(f"Validating article '{page.title}' for company '{company_name}'...")

    try:
        page_html = page.html()
        soup = BeautifulSoup(page_html, 'html.parser')
    except Exception as e:
        logger.error(f"Could not parse HTML for article '{page.title}': {e}")
        return False

    # --- Stage 1: Website Domain Validation (very strong signal) ---
    normalized_domain = simple_normalize_url(website)
    if normalized_domain != "k.A.":
        # Search for domain in "External links" section or infobox
        external_links = soup.select('.external, .infobox a[href*="."]')
        for link in external_links:
            href = link.get('href', '')
            if normalized_domain in href:
                logger.info(f" => VALIDATION SUCCESS (Domain Match): Domain '{normalized_domain}' found in links.")
                return True

    # --- Stage 2: City Validation (strong signal) ---
    # NOTE: _extract_infobox_value decomposes sup/span tags in matched cells,
    # so this mutates `soup` in place; soup is not reused after this point.
    if crm_city and crm_city.lower() != 'k.a.':
        infobox_sitz_raw = self._extract_infobox_value(soup, 'sitz')
        if infobox_sitz_raw and infobox_sitz_raw.lower() != 'k.a.':
            if crm_city.lower() in infobox_sitz_raw.lower():
                logger.info(f" => VALIDATION SUCCESS (City Match): CRM City '{crm_city}' found in Infobox City '{infobox_sitz_raw}'.")
                return True

    # --- Stage 3: Parent Validation ---
    normalized_parent = normalize_company_name(parent_name) if parent_name else None
    if normalized_parent:
        # Substring match against title + summary (both lower-cased).
        page_content_for_check = (page.title + " " + page.summary).lower()
        if normalized_parent in page_content_for_check:
            logger.info(f" => VALIDATION SUCCESS (Parent Match): Parent Name '{parent_name}' found in article.")
            return True

    # --- Stage 4: Name Similarity (Fallback with stricter rules) ---
    normalized_company = normalize_company_name(company_name)
    normalized_title = normalize_company_name(page.title)
    similarity = fuzzy_similarity(normalized_title, normalized_company)

    if similarity > 0.85: # Stricter threshold
        logger.info(f" => VALIDATION SUCCESS (High Similarity): High name similarity ({similarity:.2f}).")
        return True

    logger.debug(f" => VALIDATION FAILED: No hard fact (Domain, City, Parent) and similarity ({similarity:.2f}) too low.")
    return False
def search_company_article(self, company_name: str, website: str = None, crm_city: str = None, parent_name: str = None):
    """
    Searches and validates a matching Wikipedia article using the 'Google-First' strategy.

    Steps:
      1. Find the best URL candidate via SerpAPI.
      2. Load the article and validate it against hard facts
         (domain, city, parent name) before accepting it.

    Returns the validated wikipedia page object, or None.
    """
    if not company_name:
        return None

    logger.info(f"Starting 'Google-First' Wikipedia search for '{company_name}'...")

    # Step 1: best URL candidate via Google Search.
    candidate_url = self.serp_wikipedia_lookup(company_name)
    if not candidate_url:
        logger.warning(f" -> No URL found via SerpAPI. Search aborted.")
        return None

    # Step 2: load the article behind the candidate URL and validate it.
    try:
        title = unquote(candidate_url.split('/wiki/')[-1].replace('_', ' '))
        page = wikipedia.page(title=title, auto_suggest=False, redirect=True)

        # Fact-based validation (domain > city > parent > name similarity).
        if not self._validate_article(page, company_name, website, crm_city, parent_name):
            logger.warning(f" -> Article '{page.title}' could not be validated.")
            return None
        logger.info(f" -> Article '{page.title}' successfully validated.")
        return page
    except wikipedia.exceptions.PageError:
        logger.error(f" -> Error: Found URL '{candidate_url}' did not lead to a valid Wikipedia page.")
        return None
    except Exception as e:
        logger.error(f" -> Unexpected error processing page '{candidate_url}': {e}")
        return None
def _extract_infobox_value(self, soup: BeautifulSoup, target: str) -> str:
    """
    Targetedly extracts values (Industry, Revenue, etc.) from the infobox.

    Scans the first table whose class contains "infobox" row by row, matches
    row headers against self.keywords_map[target], and converts the matching
    cell's text according to the target kind.

    Args:
        soup: Parsed article HTML.
        target: One of the keys in self.keywords_map
            ('branche', 'umsatz', 'mitarbeiter', 'sitz').

    Returns:
        str: The extracted value, or "k.A." if not found or on error.

    Side effects:
        Decomposes <sup>/<span> tags inside the matched value cell, so the
        passed-in soup is mutated.
    """
    if not soup or target not in self.keywords_map:
        return "k.A."
    keywords = self.keywords_map[target]
    infobox = soup.select_one('table[class*="infobox"]')
    if not infobox: return "k.A."

    value_found = "k.A."
    try:
        rows = infobox.find_all('tr')
        for row in rows:
            cells = row.find_all(['th', 'td'], recursive=False)
            header_text, value_cell = None, None

            if len(cells) >= 2:
                if cells[0].name == 'th':
                    # Standard layout: <th>header</th><td>value</td>.
                    header_text, value_cell = cells[0].get_text(strip=True), cells[1]
                elif cells[0].name == 'td' and cells[1].name == 'td':
                    # Fallback for <td><td> rows: treat the first cell as a
                    # header if it is bold via inline style or <b>/<strong>.
                    style = cells[0].get('style', '').lower()
                    # Precedence: ('font-weight' and bold/700) OR a bold child tag.
                    is_header_like = 'font-weight' in style and ('bold' in style or '700' in style) or cells[0].find(['b', 'strong'], recursive=False)
                    if is_header_like:
                        header_text, value_cell = cells[0].get_text(strip=True), cells[1]

            if header_text and value_cell:
                if any(kw in header_text.lower() for kw in keywords):
                    # Strip footnote markers and auxiliary spans before reading
                    # the text (mutates the soup in place).
                    for sup in value_cell.find_all(['sup', 'span']):
                        sup.decompose()

                    raw_value_text = value_cell.get_text(separator=' ', strip=True)

                    # Target-specific conversion of the raw cell text.
                    if target == 'branche' or target == 'sitz':
                        value_found = clean_text(raw_value_text).split('\n')[0].strip()
                    elif target == 'umsatz':
                        value_found = extract_numeric_value(raw_value_text, is_umsatz=True)
                    elif target == 'mitarbeiter':
                        value_found = extract_numeric_value(raw_value_text, is_umsatz=False)

                    value_found = value_found if value_found else "k.A."
                    logger.info(f" --> Infobox '{target}' found: '{value_found}'")
                    # First matching row wins.
                    break
    except Exception as e:
        logger.error(f"Error iterating infobox rows for '{target}': {e}")
        return "k.A."

    return value_found
def _parse_sitz_string_detailed(self, raw_sitz_string_input: str) -> dict:
|
||||
"""
|
||||
Attempts to extract City and Country in detail from a raw Sitz string.
|
||||
"""
|
||||
sitz_stadt_val, sitz_land_val = "k.A.", "k.A."
|
||||
if not raw_sitz_string_input or not isinstance(raw_sitz_string_input, str):
|
||||
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
||||
|
||||
temp_sitz = raw_sitz_string_input.strip()
|
||||
if not temp_sitz or temp_sitz.lower() == "k.a.":
|
||||
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
||||
|
||||
known_countries_detailed = {
|
||||
"deutschland": "Deutschland", "germany": "Deutschland", "de": "Deutschland",
|
||||
"österreich": "Österreich", "austria": "Österreich", "at": "Österreich",
|
||||
"schweiz": "Schweiz", "switzerland": "Schweiz", "ch": "Schweiz", "suisse": "Schweiz",
|
||||
"usa": "USA", "u.s.": "USA", "united states": "USA", "vereinigte staaten": "USA",
|
||||
"vereinigtes königreich": "Vereinigtes Königreich", "united kingdom": "Vereinigtes Königreich", "uk": "Vereinigtes Königreich",
|
||||
}
|
||||
region_to_country = {
|
||||
"nrw": "Deutschland", "nordrhein-westfalen": "Deutschland", "bayern": "Deutschland", "hessen": "Deutschland",
|
||||
"zg": "Schweiz", "zug": "Schweiz", "zh": "Schweiz", "zürich": "Schweiz",
|
||||
"ca": "USA", "california": "USA", "ny": "USA", "new york": "USA",
|
||||
}
|
||||
|
||||
extracted_country = ""
|
||||
original_temp_sitz = temp_sitz
|
||||
|
||||
klammer_match = re.search(r'\(([^)]+)\)$', temp_sitz)
|
||||
if klammer_match:
|
||||
suffix_in_klammer = klammer_match.group(1).strip().lower()
|
||||
if suffix_in_klammer in known_countries_detailed:
|
||||
extracted_country = known_countries_detailed[suffix_in_klammer]
|
||||
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
||||
elif suffix_in_klammer in region_to_country:
|
||||
extracted_country = region_to_country[suffix_in_klammer]
|
||||
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
||||
|
||||
if not extracted_country and ',' in temp_sitz:
|
||||
parts = [p.strip() for p in temp_sitz.split(',')]
|
||||
if len(parts) > 1:
|
||||
last_part_lower = parts[-1].lower()
|
||||
if last_part_lower in known_countries_detailed:
|
||||
extracted_country = known_countries_detailed[last_part_lower]
|
||||
temp_sitz = ", ".join(parts[:-1]).strip(" ,")
|
||||
elif last_part_lower in region_to_country:
|
||||
extracted_country = region_to_country[last_part_lower]
|
||||
temp_sitz = ", ".join(parts[:-1]).strip(" ,")
|
||||
|
||||
sitz_land_val = extracted_country if extracted_country else "k.A."
|
||||
sitz_stadt_val = re.sub(r'^\d{4,8}\s*', '', temp_sitz).strip(" ,")
|
||||
|
||||
if not sitz_stadt_val:
|
||||
sitz_stadt_val = "k.A." if sitz_land_val != "k.A." else re.sub(r'^\d{4,8}\s*', '', original_temp_sitz).strip(" ,") or "k.A."
|
||||
|
||||
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
||||
|
||||
@retry_on_failure(max_retries=3)
def extract_company_data(self, url_or_page) -> dict:
    """
    Extracts structured company data from a Wikipedia article (URL or page object).

    Args:
        url_or_page: Either a wikipedia.org URL string or a wikipedia
            page object.

    Returns:
        dict: Keys url, title, sitz_stadt, sitz_land, first_paragraph,
            branche, umsatz, mitarbeiter, categories, full_text.
            Unavailable values are "k.A." (full_text defaults to '').
    """
    default_result = {
        'url': 'k.A.', 'title': 'k.A.', 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.',
        'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
        'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_text': ''
    }
    page = None

    try:
        if isinstance(url_or_page, str) and "wikipedia.org" in url_or_page:
            page_title = unquote(url_or_page.split('/wiki/')[-1].replace('_', ' '))
            page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True)
        elif not isinstance(url_or_page, str):  # Assumption: it is a page object
            page = url_or_page
        else:
            logger.warning(f"extract_company_data: Invalid Input '{str(url_or_page)[:100]}...")
            return default_result

        logger.info(f"Extracting data for Wiki Article: {page.title[:100]}...")

        # Basic data straight from the page object (also the fallback values).
        first_paragraph = page.summary.split('\n')[0] if page.summary else 'k.A.'
        categories = ", ".join(page.categories)
        full_text = page.content

        # BeautifulSoup needed for infobox and refined extraction
        soup = self._get_page_soup(page.url)
        if not soup:
            logger.warning(f" -> Could not load page for Soup parsing. Extracting basic data only.")
            # Reuse the basic values computed above instead of recomputing them.
            return {
                **default_result,
                'url': page.url, 'title': page.title,
                'first_paragraph': first_paragraph,
                'categories': categories, 'full_text': full_text
            }

        # Refined extraction from the parsed HTML supersedes the basic values.
        first_paragraph = self._extract_first_paragraph_from_soup(soup)
        categories = self.extract_categories(soup)

        # Extract infobox data
        branche_val = self._extract_infobox_value(soup, 'branche')
        umsatz_val = self._extract_infobox_value(soup, 'umsatz')
        mitarbeiter_val = self._extract_infobox_value(soup, 'mitarbeiter')
        raw_sitz_string = self._extract_infobox_value(soup, 'sitz')
        parsed_sitz = self._parse_sitz_string_detailed(raw_sitz_string)

        result = {
            'url': page.url,
            'title': page.title,
            'sitz_stadt': parsed_sitz['sitz_stadt'],
            'sitz_land': parsed_sitz['sitz_land'],
            'first_paragraph': first_paragraph,
            'branche': branche_val,
            'umsatz': umsatz_val,
            'mitarbeiter': mitarbeiter_val,
            'categories': categories,
            'full_text': full_text
        }

        logger.info(f" -> Extracted Data: City='{parsed_sitz['sitz_stadt']}', Country='{parsed_sitz['sitz_land']}', Rev='{umsatz_val}', Emp='{mitarbeiter_val}'")
        return result

    except wikipedia.exceptions.PageError:
        logger.error(f" -> Error: Wikipedia article for '{str(url_or_page)[:100]}' could not be found (PageError).")
        return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
    except Exception as e:
        logger.error(f" -> Unexpected error extracting from '{str(url_or_page)[:100]}': {e}")
        return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
Reference in New Issue
Block a user