wikipedia_scraper.py aktualisiert
This commit is contained in:
@@ -62,28 +62,59 @@ class WikipediaScraper:
|
|||||||
"""Extrahiert die normalisierte Domain (ohne www, ohne Pfad) aus einer URL."""
|
"""Extrahiert die normalisierte Domain (ohne www, ohne Pfad) aus einer URL."""
|
||||||
return simple_normalize_url(website)
|
return simple_normalize_url(website)
|
||||||
|
|
||||||
def _generate_search_terms(self, company_name, website):
|
def _generate_search_terms(self, company_name, website=None):
|
||||||
"""
|
"""
|
||||||
Generiert eine Liste von Suchbegriffen fuer die Wikipedia-Suche.
|
Generiert eine Liste von potenziellen Wikipedia-Artikeltiteln.
|
||||||
|
v2.0: Mit verbesserter Logik für Namen, die Zahlen enthalten.
|
||||||
"""
|
"""
|
||||||
if not company_name: return []
|
if not company_name:
|
||||||
terms = set()
|
return []
|
||||||
original_name_cleaned = str(company_name).strip()
|
|
||||||
if original_name_cleaned: terms.add(original_name_cleaned)
|
|
||||||
|
|
||||||
normalized_name = normalize_company_name(company_name)
|
# Basis-Normalisierung
|
||||||
if normalized_name:
|
normalized = normalize_company_name(company_name)
|
||||||
terms.add(normalized_name)
|
|
||||||
name_parts = normalized_name.split()
|
|
||||||
if len(name_parts) > 0: terms.add(name_parts[0])
|
|
||||||
if len(name_parts) > 1: terms.add(" ".join(name_parts[:2]))
|
|
||||||
|
|
||||||
full_domain = self._get_full_domain(website)
|
# NEUE LOGIK: Speziell für Namen wie "11 88 0 Solutions"
|
||||||
if full_domain != "k.A.": terms.add(full_domain)
|
# Fügt eine Version hinzu, bei der Leerzeichen zwischen Zahlen entfernt werden.
|
||||||
|
if re.search(r'\d[\s\d]+\d', normalized):
|
||||||
|
condensed_normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', normalized)
|
||||||
|
# Normalisiere die kondensierte Version erneut mit normalize_company_name, um Reste zu entfernen
|
||||||
|
condensed_normalized = normalize_company_name(condensed_normalized)
|
||||||
|
else:
|
||||||
|
condensed_normalized = None
|
||||||
|
|
||||||
final_terms = [term for term in list(terms) if term][:getattr(Config, 'WIKIPEDIA_SEARCH_RESULTS', 5)]
|
search_terms = []
|
||||||
self.logger.debug(f"Generierte Suchbegriffe fuer '{company_name[:100]}...': {final_terms}")
|
|
||||||
return final_terms
|
# Füge die kondensierte Version mit höchster Priorität hinzu, falls sie existiert
|
||||||
|
if condensed_normalized and condensed_normalized not in search_terms:
|
||||||
|
search_terms.append(condensed_normalized)
|
||||||
|
|
||||||
|
# Füge den Originalnamen und die normalisierte Version hinzu
|
||||||
|
if company_name not in search_terms:
|
||||||
|
search_terms.append(company_name)
|
||||||
|
if normalized not in search_terms:
|
||||||
|
search_terms.append(normalized)
|
||||||
|
|
||||||
|
# Füge Teile des Namens hinzu
|
||||||
|
parts = normalized.split()
|
||||||
|
if len(parts) > 1:
|
||||||
|
if parts[0] not in search_terms: search_terms.append(parts[0])
|
||||||
|
first_two = " ".join(parts[:2])
|
||||||
|
if first_two not in search_terms: search_terms.append(first_two)
|
||||||
|
|
||||||
|
# Füge die Website-Domain als Suchbegriff hinzu
|
||||||
|
if website:
|
||||||
|
domain = simple_normalize_url(website)
|
||||||
|
if domain != "k.A." and domain not in search_terms:
|
||||||
|
search_terms.append(domain)
|
||||||
|
|
||||||
|
# Entferne Duplikate und behalte die Reihenfolge bei
|
||||||
|
unique_terms = []
|
||||||
|
for term in search_terms:
|
||||||
|
if term and term not in unique_terms:
|
||||||
|
unique_terms.append(term)
|
||||||
|
|
||||||
|
# Limitiere auf maximal 5 Suchbegriffe, um API-Calls zu sparen
|
||||||
|
return unique_terms[:5]
|
||||||
|
|
||||||
@retry_on_failure
|
@retry_on_failure
|
||||||
def _get_page_soup(self, url):
|
def _get_page_soup(self, url):
|
||||||
@@ -199,7 +230,7 @@ class WikipediaScraper:
|
|||||||
original_search_name_norm = normalize_company_name(company_name)
|
original_search_name_norm = normalize_company_name(company_name)
|
||||||
|
|
||||||
# KORRIGIERT: parent_name wird nun an die innere Funktion übergeben
|
# KORRIGIERT: parent_name wird nicht mehr als Parameter übergeben, sondern aus dem äußeren Scope übernommen
|
||||||
def check_page_recursive(title_to_check, current_depth, parent_name_for_validation):
|
def check_page_recursive(title_to_check, current_depth):
|
||||||
# KORRIGIERT: Sicherer Vergleich, falls max_recursion_depth None sein sollte
|
# KORRIGIERT: Sicherer Vergleich, falls max_recursion_depth None sein sollte
|
||||||
effective_max_depth = max_recursion_depth if max_recursion_depth is not None else 2
|
effective_max_depth = max_recursion_depth if max_recursion_depth is not None else 2
|
||||||
if title_to_check in processed_titles or current_depth > effective_max_depth:
|
if title_to_check in processed_titles or current_depth > effective_max_depth:
|
||||||
@@ -240,7 +271,7 @@ class WikipediaScraper:
|
|||||||
|
|
||||||
for option_to_check in relevant_options[:3]:
|
for option_to_check in relevant_options[:3]:
|
||||||
# KORRIGIERT: parent_name wird im rekursiven Aufruf weitergereicht
|
# KORRIGIERT: parent_name stammt aus dem äußeren Scope und wird im rekursiven Aufruf nicht mehr weitergereicht
|
||||||
validated_page = check_page_recursive(option_to_check, current_depth + 1, parent_name_for_validation)
|
validated_page = check_page_recursive(option_to_check, current_depth + 1)
|
||||||
if validated_page: return validated_page
|
if validated_page: return validated_page
|
||||||
return None
|
return None
|
||||||
except Exception as e_page:
|
except Exception as e_page:
|
||||||
@@ -252,18 +283,16 @@ class WikipediaScraper:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
# Hauptlogik der Suche
|
# Hauptlogik der Suche
|
||||||
self.logger.debug(f" -> Versuche direkten Match fuer '{company_name[:100]}...'")
|
# Die innere Funktion `check_page_recursive` erbt `parent_name` jetzt aus dem äußeren Scope.
|
||||||
# KORRIGIERT: parent_name wird an den ersten Aufruf übergeben
|
# Daher muss der Parameter hier nicht mehr übergeben werden.
|
||||||
page_found = check_page_recursive(company_name, 0, parent_name)
|
|
||||||
if page_found: return page_found
|
|
||||||
|
|
||||||
self.logger.debug(f" -> Kein direkter Treffer. Starte Suche mit generierten Begriffen...")
|
# Iteriere durch alle generierten Suchbegriffe, inklusive des Originalnamens
|
||||||
for term in search_terms:
|
for term in search_terms:
|
||||||
if term == company_name: continue
|
|
||||||
self.logger.debug(f" -> Versuche Suchbegriff: '{term[:100]}...'")
|
self.logger.debug(f" -> Versuche Suchbegriff: '{term[:100]}...'")
|
||||||
# KORRIGIERT: parent_name wird an die weiteren Aufrufe übergeben
|
page_found = check_page_recursive(term, 0)
|
||||||
page_found = check_page_recursive(term, 0, parent_name)
|
if page_found:
|
||||||
if page_found: return page_found
|
return page_found
|
||||||
|
|
||||||
|
|
||||||
self.logger.warning(f"Kein passender & validierter Wikipedia-Artikel fuer '{company_name[:100]}...' gefunden.")
|
self.logger.warning(f"Kein passender & validierter Wikipedia-Artikel fuer '{company_name[:100]}...' gefunden.")
|
||||||
return None
|
return None
|
||||||
|
|||||||
Reference in New Issue
Block a user