wikipedia_scraper.py aktualisiert
This commit is contained in:
@@ -179,11 +179,10 @@ class WikipediaScraper:
|
|||||||
self.logger.log(log_level, f" => Artikel '{page.title[:100]}...' {'VALIDIERT' if is_valid else 'NICHT validiert'} (Grund: {reason})")
|
self.logger.log(log_level, f" => Artikel '{page.title[:100]}...' {'VALIDIERT' if is_valid else 'NICHT validiert'} (Grund: {reason})")
|
||||||
return is_valid
|
return is_valid
|
||||||
|
|
||||||
def search_company_article(self, company_name, website=None, max_recursion_depth=1):
|
def search_company_article(self, company_name, website=None, parent_name=None, max_recursion_depth=1):
|
||||||
"""
|
"""
|
||||||
Sucht einen passenden Wikipedia-Artikel fuer das Unternehmen und gibt das
|
Sucht einen passenden Wikipedia-Artikel fuer das Unternehmen und gibt das
|
||||||
wikipedia.WikipediaPage Objekt zurueck, wenn ein relevanter und validierter
|
wikipedia.WikipediaPage Objekt zurueck. Berücksichtigt nun auch den parent_name.
|
||||||
Artikel gefunden wird. Behandelt explizit Begriffsklaerungsseiten.
|
|
||||||
"""
|
"""
|
||||||
if not company_name or str(company_name).strip() == "":
|
if not company_name or str(company_name).strip() == "":
|
||||||
self.logger.warning("Wikipedia search skipped: No company name provided.")
|
self.logger.warning("Wikipedia search skipped: No company name provided.")
|
||||||
@@ -199,8 +198,11 @@ class WikipediaScraper:
|
|||||||
processed_titles = set()
|
processed_titles = set()
|
||||||
original_search_name_norm = normalize_company_name(company_name)
|
original_search_name_norm = normalize_company_name(company_name)
|
||||||
|
|
||||||
def check_page_recursive(title_to_check, current_depth):
|
# KORRIGIERT: parent_name wird nun an die innere Funktion übergeben
|
||||||
if title_to_check in processed_titles or current_depth > max_recursion_depth:
|
def check_page_recursive(title_to_check, current_depth, parent_name_for_validation):
|
||||||
|
# KORRIGIERT: Sicherer Vergleich, falls max_recursion_depth None sein sollte
|
||||||
|
effective_max_depth = max_recursion_depth if max_recursion_depth is not None else 2
|
||||||
|
if title_to_check in processed_titles or current_depth > effective_max_depth:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
processed_titles.add(title_to_check)
|
processed_titles.add(title_to_check)
|
||||||
@@ -216,18 +218,18 @@ class WikipediaScraper:
|
|||||||
page = None
|
page = None
|
||||||
try:
|
try:
|
||||||
page = wikipedia.page(title_to_check, auto_suggest=False, preload=False, redirect=True)
|
page = wikipedia.page(title_to_check, auto_suggest=False, preload=False, redirect=True)
|
||||||
if self._validate_article(page, company_name, website):
|
# KORRIGIERT: parent_name wird an die Validierung übergeben
|
||||||
|
if self._validate_article(page, company_name, website, parent_name_for_validation):
|
||||||
self.logger.info(f" -> Titel '{page.title[:100]}...' erfolgreich validiert!")
|
self.logger.info(f" -> Titel '{page.title[:100]}...' erfolgreich validiert!")
|
||||||
return page
|
return page
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
# NEU: Fängt den "Page id ... does not match" Fehler gezielt ab
|
|
||||||
except wikipedia.exceptions.PageError:
|
except wikipedia.exceptions.PageError:
|
||||||
self.logger.debug(f" -> Artikel '{title_to_check[:100]}' nicht gefunden (PageError).")
|
self.logger.debug(f" -> Artikel '{title_to_check[:100]}' nicht gefunden (PageError).")
|
||||||
return None
|
return None
|
||||||
except wikipedia.exceptions.DisambiguationError as e_disamb:
|
except wikipedia.exceptions.DisambiguationError as e_disamb:
|
||||||
self.logger.info(f" -> Begriffsklaerung '{e_disamb.title}' gefunden (Tiefe {current_depth}). Pruefe Optionen...")
|
self.logger.info(f" -> Begriffsklaerung '{e_disamb.title}' gefunden (Tiefe {current_depth}). Pruefe Optionen...")
|
||||||
if current_depth >= max_recursion_depth: return None
|
if current_depth >= effective_max_depth: return None
|
||||||
|
|
||||||
relevant_options = []
|
relevant_options = []
|
||||||
for option in e_disamb.options:
|
for option in e_disamb.options:
|
||||||
@@ -237,7 +239,8 @@ class WikipediaScraper:
|
|||||||
relevant_options.append(option)
|
relevant_options.append(option)
|
||||||
|
|
||||||
for option_to_check in relevant_options[:3]:
|
for option_to_check in relevant_options[:3]:
|
||||||
validated_page = check_page_recursive(option_to_check, current_depth + 1)
|
# KORRIGIERT: parent_name wird im rekursiven Aufruf weitergereicht
|
||||||
|
validated_page = check_page_recursive(option_to_check, current_depth + 1, parent_name_for_validation)
|
||||||
if validated_page: return validated_page
|
if validated_page: return validated_page
|
||||||
return None
|
return None
|
||||||
except Exception as e_page:
|
except Exception as e_page:
|
||||||
@@ -250,14 +253,16 @@ class WikipediaScraper:
|
|||||||
|
|
||||||
# Hauptlogik der Suche
|
# Hauptlogik der Suche
|
||||||
self.logger.debug(f" -> Versuche direkten Match fuer '{company_name[:100]}...'")
|
self.logger.debug(f" -> Versuche direkten Match fuer '{company_name[:100]}...'")
|
||||||
page_found = check_page_recursive(company_name, 0)
|
# KORRIGIERT: parent_name wird an den ersten Aufruf übergeben
|
||||||
|
page_found = check_page_recursive(company_name, 0, parent_name)
|
||||||
if page_found: return page_found
|
if page_found: return page_found
|
||||||
|
|
||||||
self.logger.debug(f" -> Kein direkter Treffer. Starte Suche mit generierten Begriffen...")
|
self.logger.debug(f" -> Kein direkter Treffer. Starte Suche mit generierten Begriffen...")
|
||||||
for term in search_terms:
|
for term in search_terms:
|
||||||
if term == company_name: continue
|
if term == company_name: continue
|
||||||
self.logger.debug(f" -> Versuche Suchbegriff: '{term[:100]}...'")
|
self.logger.debug(f" -> Versuche Suchbegriff: '{term[:100]}...'")
|
||||||
page_found = check_page_recursive(term, 0)
|
# KORRIGIERT: parent_name wird an die weiteren Aufrufe übergeben
|
||||||
|
page_found = check_page_recursive(term, 0, parent_name)
|
||||||
if page_found: return page_found
|
if page_found: return page_found
|
||||||
|
|
||||||
self.logger.warning(f"Kein passender & validierter Wikipedia-Artikel fuer '{company_name[:100]}...' gefunden.")
|
self.logger.warning(f"Kein passender & validierter Wikipedia-Artikel fuer '{company_name[:100]}...' gefunden.")
|
||||||
|
|||||||
Reference in New Issue
Block a user