- Fixed a critical bug in the company-explorer by forcing a database re-initialization with a new file. This ensures the application code is in sync with the database schema. - Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md. - Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that had been accidentally deleted. The guide now again includes detailed troubleshooting steps and code templates for common migration pitfalls.
448 lines
21 KiB
Python
448 lines
21 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
wikipedia_service.py
|
|
|
|
Service class for interacting with Wikipedia, including search,
|
|
validation, and extraction of company data.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from urllib.parse import unquote
|
|
|
|
import requests
|
|
import wikipedia
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Import settings and helpers
|
|
from ..config import settings
|
|
from ..lib.core_utils import (
|
|
retry_on_failure,
|
|
simple_normalize_url,
|
|
normalize_company_name,
|
|
extract_numeric_value,
|
|
clean_text,
|
|
fuzzy_similarity
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class WikipediaService:
    """
    Handles searching for Wikipedia articles and extracting relevant
    company data. Includes validation logic for articles.
    """

    def __init__(self, user_agent=None):
        """
        Initialize the scraper with a requests session.

        Args:
            user_agent (str, optional): Value for the HTTP User-Agent header;
                a descriptive bot default is used when omitted.
        """
        self.user_agent = user_agent or 'Mozilla/5.0 (compatible; CompanyExplorer/1.0; +http://www.example.com/bot)'
        # Shared session so the User-Agent header (and connection pooling)
        # applies to every request made through this service.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.user_agent})

        # German/English label keywords used to match infobox rows per logical
        # field; consumed by _extract_infobox_value.
        self.keywords_map = {
            'branche': ['branche', 'wirtschaftszweig', 'industry', 'taetigkeit', 'sektor', 'produkte', 'leistungen'],
            'umsatz': ['umsatz', 'erloes', 'revenue', 'jahresumsatz', 'konzernumsatz', 'ergebnis'],
            'mitarbeiter': ['mitarbeiter', 'mitarbeiterzahl', 'beschaeftigte', 'employees', 'number of employees', 'personal', 'belegschaft'],
            'sitz': ['sitz', 'hauptsitz', 'unternehmenssitz', 'firmensitz', 'headquarters', 'standort', 'sitz des unternehmens', 'anschrift', 'adresse']
        }

        try:
            # Default to German for now, could be configurable
            wiki_lang = 'de'
            wikipedia.set_lang(wiki_lang)
            # NOTE(review): rate limiting is deliberately disabled here —
            # confirm the expected call volume is acceptable for the API.
            wikipedia.set_rate_limiting(False)
            logger.info(f"Wikipedia library language set to '{wiki_lang}'. Rate limiting DISABLED.")
        except Exception as e:
            logger.warning(f"Error setting Wikipedia language or rate limiting: {e}")
|
|
|
|
@retry_on_failure(max_retries=3)
|
|
def serp_wikipedia_lookup(self, company_name: str, lang: str = 'de') -> str:
|
|
"""
|
|
Searches for the best Wikipedia URL for a company using Google Search (via SerpAPI).
|
|
Prioritizes Knowledge Graph hits and then organic results.
|
|
|
|
Args:
|
|
company_name (str): The name of the company to search for.
|
|
lang (str): The language code for Wikipedia search (e.g., 'de').
|
|
|
|
Returns:
|
|
str: The URL of the best hit or None if nothing suitable was found.
|
|
"""
|
|
logger.info(f"Starting SerpAPI Wikipedia search for '{company_name}'...")
|
|
serp_key = settings.SERP_API_KEY
|
|
if not serp_key:
|
|
logger.warning("SerpAPI Key not configured. Skipping search.")
|
|
return None
|
|
|
|
query = f'site:{lang}.wikipedia.org "{company_name}"'
|
|
params = {"engine": "google", "q": query, "api_key": serp_key, "hl": lang}
|
|
|
|
try:
|
|
response = requests.get("https://serpapi.com/search", params=params, timeout=15)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
# 1. Check Knowledge Graph (highest priority)
|
|
if "knowledge_graph" in data and "source" in data["knowledge_graph"]:
|
|
source = data["knowledge_graph"]["source"]
|
|
if "link" in source and f"{lang}.wikipedia.org" in source["link"]:
|
|
url = source["link"]
|
|
logger.info(f" -> Hit found in Knowledge Graph: {url}")
|
|
return url
|
|
|
|
# 2. Check organic results
|
|
if "organic_results" in data:
|
|
for result in data.get("organic_results", []):
|
|
link = result.get("link")
|
|
if link and f"{lang}.wikipedia.org/wiki/" in link:
|
|
logger.info(f" -> Best organic hit found: {link}")
|
|
return link
|
|
|
|
logger.warning(f" -> No suitable Wikipedia URL found for '{company_name}' in SerpAPI results.")
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Error during SerpAPI request for '{company_name}': {e}")
|
|
return None
|
|
|
|
@retry_on_failure(max_retries=3)
|
|
def _get_page_soup(self, url: str) -> BeautifulSoup:
|
|
"""
|
|
Fetches HTML from a URL and returns a BeautifulSoup object.
|
|
"""
|
|
if not url or not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
|
|
logger.warning(f"_get_page_soup: Invalid URL '{str(url)[:100]}...'")
|
|
return None
|
|
try:
|
|
response = self.session.get(url, timeout=15)
|
|
response.raise_for_status()
|
|
# Handle encoding
|
|
response.encoding = response.apparent_encoding
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
return soup
|
|
except Exception as e:
|
|
logger.error(f"_get_page_soup: Error fetching or parsing HTML from {str(url)[:100]}...: {e}")
|
|
raise e
|
|
|
|
    def _extract_first_paragraph_from_soup(self, soup: BeautifulSoup) -> str:
        """
        Extracts the first meaningful paragraph from the Wikipedia article soup.
        Mimics the sophisticated cleaning from the legacy system.

        Args:
            soup: Parsed article HTML, or None.

        Returns:
            str: Up to 2000 characters of the first substantial paragraph,
            or "k.A." when no suitable paragraph is found.
        """
        if not soup: return "k.A."
        paragraph_text = "k.A."
        try:
            # Restrict the search to the main article body when present.
            content_div = soup.find('div', class_='mw-parser-output')
            search_area = content_div if content_div else soup
            # Prefer direct <p> children; fall back to any <p> descendants.
            paragraphs = search_area.find_all('p', recursive=False)
            if not paragraphs: paragraphs = search_area.find_all('p')

            for p in paragraphs:
                # Remove references [1], [2], etc.
                for sup in p.find_all('sup', class_='reference'): sup.decompose()
                # Remove hidden spans
                for span in p.find_all('span', style=lambda v: v and 'display:none' in v): span.decompose()
                # Remove coordinates
                for span in p.find_all('span', id='coordinates'): span.decompose()

                text = clean_text(p.get_text(separator=' ', strip=True))

                # Filter out meta-paragraphs or too short ones
                if text != "k.A." and len(text) > 50 and not re.match(r'^(Datei:|Abbildung:|Siehe auch:|Einzelnachweise|Siehe auch|Literatur)', text, re.IGNORECASE):
                    paragraph_text = text[:2000]  # Limit length
                    break
        except Exception as e:
            logger.error(f"Error extracting first paragraph: {e}")
        return paragraph_text
|
|
|
|
def extract_categories(self, soup: BeautifulSoup) -> str:
|
|
"""
|
|
Extracts Wikipedia categories from the soup object, filtering out meta-categories.
|
|
"""
|
|
if not soup: return "k.A."
|
|
cats_filtered = []
|
|
try:
|
|
cat_div = soup.find('div', id="mw-normal-catlinks")
|
|
if cat_div:
|
|
ul = cat_div.find('ul')
|
|
if ul:
|
|
cats = [clean_text(li.get_text()) for li in ul.find_all('li')]
|
|
cats_filtered = [c for c in cats if c and isinstance(c, str) and c.strip() and "kategorien:" not in c.lower()]
|
|
except Exception as e:
|
|
logger.error(f"Error extracting categories: {e}")
|
|
return ", ".join(cats_filtered) if cats_filtered else "k.A."
|
|
|
|
    def _validate_article(self, page, company_name: str, website: str, crm_city: str, parent_name: str = None) -> bool:
        """
        Validates fact-based whether a Wikipedia article matches the company.
        Prioritizes hard facts (Domain, City) over pure name similarity.

        Args:
            page: A ``wikipedia`` page object (must expose ``html()``,
                ``title`` and ``summary``).
            company_name (str): Company name from the CRM.
            website (str): Company website used for the domain check.
            crm_city (str): CRM city, compared against the infobox "Sitz".
            parent_name (str, optional): Parent company name accepted as an
                alternative match.

        Returns:
            bool: True as soon as any validation stage succeeds, else False.
        """
        if not page or not hasattr(page, 'html'):
            return False

        logger.debug(f"Validating article '{page.title}' for company '{company_name}'...")

        try:
            page_html = page.html()
            soup = BeautifulSoup(page_html, 'html.parser')
        except Exception as e:
            logger.error(f"Could not parse HTML for article '{page.title}': {e}")
            return False

        # --- Stage 1: Website Domain Validation (very strong signal) ---
        normalized_domain = simple_normalize_url(website)
        if normalized_domain != "k.A.":
            # Search for domain in "External links" section or infobox
            external_links = soup.select('.external, .infobox a[href*="."]')
            for link in external_links:
                href = link.get('href', '')
                # Substring match: the normalized domain anywhere in the href.
                if normalized_domain in href:
                    logger.info(f" => VALIDATION SUCCESS (Domain Match): Domain '{normalized_domain}' found in links.")
                    return True

        # --- Stage 2: City Validation (strong signal) ---
        if crm_city and crm_city.lower() != 'k.a.':
            infobox_sitz_raw = self._extract_infobox_value(soup, 'sitz')
            if infobox_sitz_raw and infobox_sitz_raw.lower() != 'k.a.':
                if crm_city.lower() in infobox_sitz_raw.lower():
                    logger.info(f" => VALIDATION SUCCESS (City Match): CRM City '{crm_city}' found in Infobox City '{infobox_sitz_raw}'.")
                    return True

        # --- Stage 3: Parent Validation ---
        normalized_parent = normalize_company_name(parent_name) if parent_name else None
        if normalized_parent:
            # Substring check against title + summary only, not the full text.
            page_content_for_check = (page.title + " " + page.summary).lower()
            if normalized_parent in page_content_for_check:
                logger.info(f" => VALIDATION SUCCESS (Parent Match): Parent Name '{parent_name}' found in article.")
                return True

        # --- Stage 4: Name Similarity (Fallback with stricter rules) ---
        normalized_company = normalize_company_name(company_name)
        normalized_title = normalize_company_name(page.title)
        similarity = fuzzy_similarity(normalized_title, normalized_company)

        if similarity > 0.85:  # Stricter threshold
            logger.info(f" => VALIDATION SUCCESS (High Similarity): High name similarity ({similarity:.2f}).")
            return True

        logger.debug(f" => VALIDATION FAILED: No hard fact (Domain, City, Parent) and similarity ({similarity:.2f}) too low.")
        return False
|
|
|
|
    def search_company_article(self, company_name: str, website: str = None, crm_city: str = None, parent_name: str = None):
        """
        Searches and validates a matching Wikipedia article using the 'Google-First' strategy.
        1. Finds the best URL via SerpAPI.
        2. Validates the found article with hard facts.

        Args:
            company_name (str): Company name to look up.
            website (str, optional): Website passed to the validation step.
            crm_city (str, optional): CRM city passed to the validation step.
            parent_name (str, optional): Parent company name for validation.

        Returns:
            A validated ``wikipedia`` page object, or None when no article
            was found or validation failed.
        """
        if not company_name:
            return None

        logger.info(f"Starting 'Google-First' Wikipedia search for '{company_name}'...")

        # 1. Find the best URL candidate via Google Search
        url_candidate = self.serp_wikipedia_lookup(company_name)

        if not url_candidate:
            logger.warning(f" -> No URL found via SerpAPI. Search aborted.")
            return None

        # 2. Load and validate the found article
        try:
            # Derive the article title from the URL slug ("/wiki/<title>").
            page_title = unquote(url_candidate.split('/wiki/')[-1].replace('_', ' '))
            page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True)

            # Use the new fact-based validation
            if self._validate_article(page, company_name, website, crm_city, parent_name):
                logger.info(f" -> Article '{page.title}' successfully validated.")
                return page
            else:
                logger.warning(f" -> Article '{page.title}' could not be validated.")
                return None
        except wikipedia.exceptions.PageError:
            logger.error(f" -> Error: Found URL '{url_candidate}' did not lead to a valid Wikipedia page.")
            return None
        except Exception as e:
            logger.error(f" -> Unexpected error processing page '{url_candidate}': {e}")
            return None
|
|
|
|
    def _extract_infobox_value(self, soup: BeautifulSoup, target: str) -> str:
        """
        Targetedly extracts values (Industry, Revenue, etc.) from the infobox.

        Args:
            soup: Parsed article HTML, or None.
            target (str): One of the keys of ``self.keywords_map``
                ('branche', 'umsatz', 'mitarbeiter', 'sitz').

        Returns:
            str: The cleaned/numeric value of the first matching infobox row,
            or "k.A." when the infobox or a matching row is missing.
        """
        if not soup or target not in self.keywords_map:
            return "k.A."
        keywords = self.keywords_map[target]
        infobox = soup.select_one('table[class*="infobox"]')
        if not infobox: return "k.A."

        value_found = "k.A."
        try:
            rows = infobox.find_all('tr')
            for row in rows:
                cells = row.find_all(['th', 'td'], recursive=False)
                header_text, value_cell = None, None

                if len(cells) >= 2:
                    if cells[0].name == 'th':
                        # Classic layout: <th>label</th><td>value</td>.
                        header_text, value_cell = cells[0].get_text(strip=True), cells[1]
                    elif cells[0].name == 'td' and cells[1].name == 'td':
                        # Some infoboxes use two <td>s where the first is styled
                        # bold (or contains <b>/<strong>) and acts as the label.
                        style = cells[0].get('style', '').lower()
                        is_header_like = 'font-weight' in style and ('bold' in style or '700' in style) or cells[0].find(['b', 'strong'], recursive=False)
                        if is_header_like:
                            header_text, value_cell = cells[0].get_text(strip=True), cells[1]

                if header_text and value_cell:
                    if any(kw in header_text.lower() for kw in keywords):
                        # Drop footnote markers and auxiliary spans before reading.
                        for sup in value_cell.find_all(['sup', 'span']):
                            sup.decompose()

                        raw_value_text = value_cell.get_text(separator=' ', strip=True)

                        if target == 'branche' or target == 'sitz':
                            value_found = clean_text(raw_value_text).split('\n')[0].strip()
                        elif target == 'umsatz':
                            value_found = extract_numeric_value(raw_value_text, is_umsatz=True)
                        elif target == 'mitarbeiter':
                            value_found = extract_numeric_value(raw_value_text, is_umsatz=False)

                        value_found = value_found if value_found else "k.A."
                        logger.info(f" --> Infobox '{target}' found: '{value_found}'")
                        # First matching row wins.
                        break
        except Exception as e:
            logger.error(f"Error iterating infobox rows for '{target}': {e}")
            return "k.A."

        return value_found
|
|
|
|
def _parse_sitz_string_detailed(self, raw_sitz_string_input: str) -> dict:
|
|
"""
|
|
Attempts to extract City and Country in detail from a raw Sitz string.
|
|
"""
|
|
sitz_stadt_val, sitz_land_val = "k.A.", "k.A."
|
|
if not raw_sitz_string_input or not isinstance(raw_sitz_string_input, str):
|
|
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
|
|
|
temp_sitz = raw_sitz_string_input.strip()
|
|
if not temp_sitz or temp_sitz.lower() == "k.a.":
|
|
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
|
|
|
known_countries_detailed = {
|
|
"deutschland": "Deutschland", "germany": "Deutschland", "de": "Deutschland",
|
|
"österreich": "Österreich", "austria": "Österreich", "at": "Österreich",
|
|
"schweiz": "Schweiz", "switzerland": "Schweiz", "ch": "Schweiz", "suisse": "Schweiz",
|
|
"usa": "USA", "u.s.": "USA", "united states": "USA", "vereinigte staaten": "USA",
|
|
"vereinigtes königreich": "Vereinigtes Königreich", "united kingdom": "Vereinigtes Königreich", "uk": "Vereinigtes Königreich",
|
|
}
|
|
region_to_country = {
|
|
"nrw": "Deutschland", "nordrhein-westfalen": "Deutschland", "bayern": "Deutschland", "hessen": "Deutschland",
|
|
"zg": "Schweiz", "zug": "Schweiz", "zh": "Schweiz", "zürich": "Schweiz",
|
|
"ca": "USA", "california": "USA", "ny": "USA", "new york": "USA",
|
|
}
|
|
|
|
extracted_country = ""
|
|
original_temp_sitz = temp_sitz
|
|
|
|
klammer_match = re.search(r'\(([^)]+)\)$', temp_sitz)
|
|
if klammer_match:
|
|
suffix_in_klammer = klammer_match.group(1).strip().lower()
|
|
if suffix_in_klammer in known_countries_detailed:
|
|
extracted_country = known_countries_detailed[suffix_in_klammer]
|
|
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
|
elif suffix_in_klammer in region_to_country:
|
|
extracted_country = region_to_country[suffix_in_klammer]
|
|
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
|
|
|
if not extracted_country and "," in temp_sitz:
|
|
parts = [p.strip() for p in temp_sitz.split(',')]
|
|
if len(parts) > 1:
|
|
last_part_lower = parts[-1].lower()
|
|
if last_part_lower in known_countries_detailed:
|
|
extracted_country = known_countries_detailed[last_part_lower]
|
|
temp_sitz = ", ".join(parts[:-1]).strip(" ,")
|
|
elif last_part_lower in region_to_country:
|
|
extracted_country = region_to_country[last_part_lower]
|
|
temp_sitz = ", ".join(parts[:-1]).strip(" ,")
|
|
|
|
sitz_land_val = extracted_country if extracted_country else "k.A."
|
|
sitz_stadt_val = re.sub(r'^\d{4,8}\s*', '', temp_sitz).strip(" ,")
|
|
|
|
if not sitz_stadt_val:
|
|
sitz_stadt_val = "k.A." if sitz_land_val != "k.A." else re.sub(r'^\d{4,8}\s*', '', original_temp_sitz).strip(" ,") or "k.A."
|
|
|
|
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
|
|
|
@retry_on_failure(max_retries=3)
|
|
def extract_company_data(self, url_or_page) -> dict:
|
|
"""
|
|
Extracts structured company data from a Wikipedia article (URL or page object).
|
|
"""
|
|
default_result = {
|
|
'url': 'k.A.', 'title': 'k.A.', 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.',
|
|
'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
|
|
'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_text': ''
|
|
}
|
|
page = None
|
|
|
|
try:
|
|
if isinstance(url_or_page, str) and "wikipedia.org" in url_or_page:
|
|
page_title = unquote(url_or_page.split('/wiki/')[-1].replace('_', ' '))
|
|
page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True)
|
|
elif not isinstance(url_or_page, str): # Assumption: it is a page object
|
|
page = url_or_page
|
|
else:
|
|
logger.warning(f"extract_company_data: Invalid Input '{str(url_or_page)[:100]}...")
|
|
return default_result
|
|
|
|
logger.info(f"Extracting data for Wiki Article: {page.title[:100]}...")
|
|
|
|
# Extract basic data directly from page object
|
|
first_paragraph = page.summary.split('\n')[0] if page.summary else 'k.A.'
|
|
categories = ", ".join(page.categories)
|
|
full_text = page.content
|
|
|
|
# BeautifulSoup needed for infobox and refined extraction
|
|
soup = self._get_page_soup(page.url)
|
|
if not soup:
|
|
logger.warning(f" -> Could not load page for Soup parsing. Extracting basic data only.")
|
|
return {
|
|
'url': page.url, 'title': page.title, 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.',
|
|
'first_paragraph': page.summary.split('\n')[0] if page.summary else 'k.A.',
|
|
'branche': 'k.A.', 'umsatz': 'k.A.',
|
|
'mitarbeiter': 'k.A.', 'categories': ", ".join(page.categories), 'full_text': full_text
|
|
}
|
|
|
|
# Refined Extraction from Soup
|
|
first_paragraph = self._extract_first_paragraph_from_soup(soup)
|
|
categories = self.extract_categories(soup)
|
|
|
|
# Extract infobox data
|
|
branche_val = self._extract_infobox_value(soup, 'branche')
|
|
umsatz_val = self._extract_infobox_value(soup, 'umsatz')
|
|
mitarbeiter_val = self._extract_infobox_value(soup, 'mitarbeiter')
|
|
raw_sitz_string = self._extract_infobox_value(soup, 'sitz')
|
|
parsed_sitz = self._parse_sitz_string_detailed(raw_sitz_string)
|
|
sitz_stadt_val = parsed_sitz['sitz_stadt']
|
|
sitz_land_val = parsed_sitz['sitz_land']
|
|
|
|
result = {
|
|
'url': page.url,
|
|
'title': page.title,
|
|
'sitz_stadt': sitz_stadt_val,
|
|
'sitz_land': sitz_land_val,
|
|
'first_paragraph': first_paragraph,
|
|
'branche': branche_val,
|
|
'umsatz': umsatz_val,
|
|
'mitarbeiter': mitarbeiter_val,
|
|
'categories': categories,
|
|
'full_text': full_text
|
|
}
|
|
|
|
logger.info(f" -> Extracted Data: City='{sitz_stadt_val}', Country='{sitz_land_val}', Rev='{umsatz_val}', Emp='{mitarbeiter_val}'")
|
|
return result
|
|
|
|
except wikipedia.exceptions.PageError:
|
|
logger.error(f" -> Error: Wikipedia article for '{str(url_or_page)[:100]}' could not be found (PageError).")
|
|
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
|
except Exception as e:
|
|
logger.error(f" -> Unexpected error extracting from '{str(url_or_page)[:100]}': {e}")
|
|
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'} |