ChatGPT-Bugfixing: Infobox-Ausgabe

This commit is contained in:
2025-03-31 17:32:37 +00:00
parent db0d2ca29f
commit 849af75839

View File

@@ -171,19 +171,22 @@ class WikipediaScraper:
def extract_company_data(self, page_url):
    """Extract company data from the Wikipedia article at ``page_url``.

    Args:
        page_url: URL of the Wikipedia article; may be empty or None.

    Returns:
        A dict with the keys 'full_infobox', 'branche', 'umsatz' and 'url'.
        When no URL is given, or when fetching/parsing fails, the three
        data fields are set to the placeholder 'k.A.'.
    """
    # Placeholder values are literals on purpose: the fallback paths must not
    # touch ``soup``, which is unbound when no URL was given or when the
    # request itself failed (that used to raise UnboundLocalError).
    unknown = {'full_infobox': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.'}
    if not page_url:
        return {**unknown, 'url': ''}
    try:
        response = requests.get(page_url)
        soup = BeautifulSoup(response.text, Config.HTML_PARSER)
        return {
            'full_infobox': self.extract_full_infobox(soup),
            'branche': self._extract_infobox_value(soup, 'branche'),
            'umsatz': self._extract_infobox_value(soup, 'umsatz'),
            'url': page_url
        }
    except Exception as e:
        # Best-effort scraper: log the failure and keep the URL so the caller
        # can still record which article was attempted.
        debug_print(f"Extraktionsfehler: {str(e)}")
        return {**unknown, 'url': page_url}
def _extract_infobox_value(self, soup, target):
"""Extrahiert Werte aus der Infobox"""
@@ -238,6 +241,18 @@ class WikipediaScraper:
return "k.A."
def extract_full_infobox(self, soup):
    """Return the text of the whole infobox table, or "k.A." if none is found.

    The infobox is the first <table> whose class attribute contains one of
    the markers 'infobox', 'vcard' or 'unternehmen' (case-insensitive).
    Cell texts are joined with ' | ' and passed through ``clean_text``.
    """
    markers = ['infobox', 'vcard', 'unternehmen']

    def _has_infobox_class(css_class):
        # Tags without a class attribute are passed in as None.
        if not css_class:
            return False
        lowered = css_class.lower()
        return any(marker in lowered for marker in markers)

    table = soup.find('table', class_=_has_infobox_class)
    if table:
        raw_text = table.get_text(separator=' | ')
        return clean_text(raw_text)
    return "k.A."
# ==================== DATA PROCESSOR ====================
class DataProcessor:
"""Steuerung des Gesamtprozesses"""
@@ -266,10 +281,11 @@ class DataProcessor:
if article:
company_data = self.wiki_scraper.extract_company_data(article.url)
else:
company_data = {'branche': 'k.A.', 'umsatz': 'k.A.', 'url': ''}
company_data = {'branche': 'k.A.', 'umsatz': 'k.A.', 'url': '', 'full_infobox': 'k.A.'}
current_values = self.sheet_handler.sheet.row_values(row_num)
new_values = [
company_data.get('full_infobox', 'k.A.'),
company_data['branche'] if company_data['branche'] != "k.A." else current_values[6] if len(current_values) > 6 else "k.A.",
"k.A.",
company_data['umsatz'] if company_data['umsatz'] != "k.A." else current_values[8] if len(current_values) > 8 else "k.A.",