From 849af7583916ff2ec9c3f03fa8aceb3dcdab4819 Mon Sep 17 00:00:00 2001
From: Floke
Date: Mon, 31 Mar 2025 17:32:37 +0000
Subject: [PATCH] Chat GPT Bugfixing Infobox ausgabe

Add the complete Wikipedia infobox text to the scraped company data.

* WikipediaScraper.extract_full_infobox(soup) looks up a <table> whose
  CSS class contains 'infobox', 'vcard' or 'unternehmen', joins its text
  with ' | ' and passes it through clean_text(). It returns "k.A." when
  no such table is found.
* extract_company_data() stores that text under the new 'full_infobox'
  key on the success path.
* The early return (empty page_url) and the exception handler report
  'full_infobox' as 'k.A.'. No parsed page is guaranteed to exist on
  those paths, so they must not reference `soup`: doing so raises
  UnboundLocalError instead of returning the fallback dict.
* DataProcessor passes the value on as the first entry of new_values and
  uses 'k.A.' when no article was found.
---
Review notes (ignored by `git am`):

* Whitespace in this patch (indentation, blank context lines) was
  reconstructed from a copy whose line breaks had been collapsed. If a
  hunk does not apply, retry with `git apply --ignore-whitespace`.
* NOTE(review): prepending 'full_infobox' to new_values moves every
  following value one column to the right. Confirm that the range used
  for the sheet update was widened accordingly.

 brancheneinstufung.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index b45f1540..8ad64617 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -171,19 +171,22 @@ class WikipediaScraper:
     def extract_company_data(self, page_url):
         """Extrahiert Daten aus dem Wikipedia-Artikel"""
         if not page_url:
-            return {'branche': 'k.A.', 'umsatz': 'k.A.', 'url': ''}
+            return {
+                'full_infobox': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.', 'url': ''}
 
         try:
             response = requests.get(page_url)
             soup = BeautifulSoup(response.text, Config.HTML_PARSER)
             return {
+                'full_infobox': self.extract_full_infobox(soup),
                 'branche': self._extract_infobox_value(soup, 'branche'),
                 'umsatz': self._extract_infobox_value(soup, 'umsatz'),
                 'url': page_url
             }
         except Exception as e:
             debug_print(f"Extraktionsfehler: {str(e)}")
-            return {'branche': 'k.A.', 'umsatz': 'k.A.', 'url': page_url}
+            return {
+                'full_infobox': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.', 'url': page_url}
 
     def _extract_infobox_value(self, soup, target):
         """Extrahiert Werte aus der Infobox"""
@@ -238,6 +241,18 @@ class WikipediaScraper:
 
         return "k.A."
 
+
+    def extract_full_infobox(self, soup):
+        """Extract the complete infobox as text."""
+        infobox = soup.find('table', class_=lambda c: c and any(
+            kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']
+        ))
+
+        if not infobox:
+            return "k.A."
+
+        return clean_text(infobox.get_text(separator=' | '))
+
 # ==================== DATA PROCESSOR ====================
 class DataProcessor:
     """Steuerung des Gesamtprozesses"""
@@ -266,10 +281,11 @@ class DataProcessor:
             if article:
                 company_data = self.wiki_scraper.extract_company_data(article.url)
             else:
-                company_data = {'branche': 'k.A.', 'umsatz': 'k.A.', 'url': ''}
+                company_data = {'branche': 'k.A.', 'umsatz': 'k.A.', 'url': '', 'full_infobox': 'k.A.'}
 
             current_values = self.sheet_handler.sheet.row_values(row_num)
             new_values = [
+                company_data.get('full_infobox', 'k.A.'),
                 company_data['branche'] if company_data['branche'] != "k.A." else current_values[6] if len(current_values) > 6 else "k.A.",
                 "k.A.",
                 company_data['umsatz'] if company_data['umsatz'] != "k.A." else current_values[8] if len(current_values) > 8 else "k.A.",