ChatGPT-Bugfixing: Infobox-Ausgabe
This commit is contained in:
@@ -171,19 +171,22 @@ class WikipediaScraper:
|
||||
def extract_company_data(self, page_url):
    """Extract company data from a Wikipedia article.

    Args:
        page_url: URL of the Wikipedia article. A falsy value (``None`` or
            ``''``) means there is no article to fetch.

    Returns:
        A dict with the keys ``'full_infobox'``, ``'branche'``, ``'umsatz'``
        and ``'url'``. Values that could not be determined are the
        placeholder string ``'k.A.'``. Without a URL, ``'url'`` is ``''``;
        if fetching or parsing fails, ``'url'`` is the given ``page_url``.
    """
    if not page_url:
        # Nothing was fetched, so there is no parsed document to read an
        # infobox from. Referencing `soup` here would raise a NameError.
        return {
            'full_infobox': 'k.A.',
            'branche': 'k.A.',
            'umsatz': 'k.A.',
            'url': '',
        }

    try:
        response = requests.get(page_url)
        soup = BeautifulSoup(response.text, Config.HTML_PARSER)
        return {
            'full_infobox': self.extract_full_infobox(soup),
            'branche': self._extract_infobox_value(soup, 'branche'),
            'umsatz': self._extract_infobox_value(soup, 'umsatz'),
            'url': page_url,
        }
    except Exception as e:
        # Deliberate best-effort boundary: any failure is logged and turned
        # into placeholder values instead of aborting the caller.
        debug_print(f"Extraktionsfehler: {str(e)}")
        # `soup` is unbound if the request or the parser raised, and the
        # infobox extraction itself may be what failed, so the fallback
        # must not touch `soup` again.
        return {
            'full_infobox': 'k.A.',
            'branche': 'k.A.',
            'umsatz': 'k.A.',
            'url': page_url,
        }
|
||||
|
||||
def _extract_infobox_value(self, soup, target):
|
||||
"""Extrahiert Werte aus der Infobox"""
|
||||
@@ -238,6 +241,18 @@ class WikipediaScraper:
|
||||
|
||||
return "k.A."
|
||||
|
||||
|
||||
def extract_full_infobox(self, soup):
    """Return the complete infobox of a parsed page as one text string.

    Looks up the first ``<table>`` whose class contains 'infobox', 'vcard'
    or 'unternehmen' (case-insensitive substring match).

    Args:
        soup: Parsed document exposing ``find``; the caller in this file
            passes a BeautifulSoup object.

    Returns:
        The table's text with its pieces joined by ``' | '`` and passed
        through ``clean_text``, or ``"k.A."`` when no such table exists.
    """
    keywords = ('infobox', 'vcard', 'unternehmen')

    def _has_infobox_class(css_class):
        # A missing class attribute arrives as a falsy value; pass it
        # through unchanged so the filter rejects the tag.
        return css_class and any(kw in css_class.lower() for kw in keywords)

    table = soup.find('table', class_=_has_infobox_class)
    if not table:
        return "k.A."

    raw_text = table.get_text(separator=' | ')
    return clean_text(raw_text)
|
||||
|
||||
# ==================== DATA PROCESSOR ====================
|
||||
class DataProcessor:
|
||||
"""Steuerung des Gesamtprozesses"""
|
||||
@@ -266,10 +281,11 @@ class DataProcessor:
|
||||
if article:
|
||||
company_data = self.wiki_scraper.extract_company_data(article.url)
|
||||
else:
|
||||
company_data = {'branche': 'k.A.', 'umsatz': 'k.A.', 'url': ''}
|
||||
company_data = {'branche': 'k.A.', 'umsatz': 'k.A.', 'url': '', 'full_infobox': 'k.A.'}
|
||||
|
||||
current_values = self.sheet_handler.sheet.row_values(row_num)
|
||||
new_values = [
|
||||
company_data.get('full_infobox', 'k.A.'),
|
||||
company_data['branche'] if company_data['branche'] != "k.A." else current_values[6] if len(current_values) > 6 else "k.A.",
|
||||
"k.A.",
|
||||
company_data['umsatz'] if company_data['umsatz'] != "k.A." else current_values[8] if len(current_values) > 8 else "k.A.",
|
||||
|
||||
Reference in New Issue
Block a user