From 2bdcc5fa0927c3793cd30d7a20d85ee2b3d46f77 Mon Sep 17 00:00:00 2001
From: Floke
Date: Mon, 31 Mar 2025 14:25:01 +0000
Subject: [PATCH] Chat GPT bugfix for Deepseek
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 brancheneinstufung.py | 207 ++++++++++++++++++++++---------------------
 1 file changed, 103 insertions(+), 104 deletions(-)

diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index 3a179b93..954a9294 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -89,108 +89,4 @@ class GoogleSheetHandler:
         )
 
 # ==================== WIKIPEDIA SCRAPER ====================
-class WikipediaScraper:
-    """Handles the Wikipedia search and data extraction"""
-
-    def __init__(self):
-        wikipedia.set_lang(Config.LANG)
-
-    def _extract_domain_hint(self, website):
-        """Extracts the domain key from the website URL"""
-        if not website:
-            return ""
-        # Strip protocol and www, split into parts
-        clean_url = website.lower().replace("https://", "").replace("http://", "").replace("www.", "")
-        domain_parts = clean_url.split(".")
-        return domain_parts[0] if domain_parts else ""
-
-    def _generate_search_terms(self, company_name, website_hint=""):
-        """Generates search terms from the company name and website"""
-        search_terms = [company_name.strip()]
-
-        # Strip legal forms and special characters
-        clean_name = re.sub(
-            r'\s+(?:GmbH|AG|KG|OHG|e\.V\.|mbH|& Co\. KG| GmbH & Co\. KG).*$',
-            '',
-            company_name
-        ).strip()
-
-        # Add the cleaned name if it differs
-        if clean_name and clean_name != company_name:
-            search_terms.append(clean_name)
-
-        # Take the first two relevant words
-        name_words = [w for w in re.split(r'\W+', clean_name) if w]
-        if len(name_words) >= 2:
-            search_terms.append(" ".join(name_words[:2]))
-
-        # Add the domain hint
-        domain_hint = self._extract_domain_hint(website_hint)
-        if domain_hint and domain_hint not in ["de", "com", "org", "net"]:
-            search_terms.append(domain_hint)
-
-        debug_print(f"Generated search terms: {search_terms}")
-        return list(set(search_terms))  # remove duplicates
-
-    def _validate_article(self, page, company_name, domain_hint=""):
-        """Checks whether the article matches the company"""
-        # Normalize both names
-        page_title = re.sub(r'\(.*?\)', '', page.title).strip().lower()
-        search_name = re.sub(r'[^a-zA-Z0-9äöüß ]', '', company_name).strip().lower()
-
-        # Similarity check
-        similarity = SequenceMatcher(None, page_title, search_name).ratio()
-        debug_print(f"Similarity '{page_title}' vs '{search_name}': {similarity:.2f}")
-
-        # Additional domain check
-        if domain_hint:
-            html_content = requests.get(page.url).text.lower()
-            if domain_hint not in html_content:
-                debug_print(f"Domain hint '{domain_hint}' not found in article")
-                return False
-
-        return similarity >= Config.SIMILARITY_THRESHOLD
-
-    @retry_on_failure
-    def search_company_article(self, company_name, website_hint=""):
-        """Main entry point for the article search"""
-        search_terms = self._generate_search_terms(company_name, website_hint)
-        domain_hint = self._extract_domain_hint(website_hint)
-
-        for term in search_terms:
-            try:
-                results = wikipedia.search(term, results=Config.WIKIPEDIA_SEARCH_RESULTS)
-                debug_print(f"Search results for '{term}': {results}")
-
-                for title in results:
-                    try:
-                        page = wikipedia.page(title, auto_suggest=False)
-                        if self._validate_article(page, company_name, domain_hint):
-                            return page
-                    except wikipedia.exceptions.DisambiguationError:
-                        continue
-            except Exception as e:
-                debug_print(f"Error while searching for {term}: {str(e)}")
-                continue
-        return None
-
-    def extract_company_data(self, page_url):
-        """Extracts industry and revenue from the Wikipedia article"""
-        response = requests.get(page_url)
-        soup = BeautifulSoup(response.text, Config.HTML_PARSER)
-
-        return {
-            'branche': self._extract_infobox_value(soup, 'branche'),
-            'umsatz': self._extract_infobox_value(soup, 'umsatz'),
-            'url': page_url
-        }
-
-# ==================== WIKIPEDIA SCRAPER ====================
-class WikipediaScraper:
-    def _extract_infobox_value(self, soup, target):
-        """Extracts a specific value from the infobox using extended search patterns"""
-        # Extended infobox detection
-        infobox = soup.find('table', class_=lambda c: c and any(
-            kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']
-        ))
 
@@ -239,6 +135,109 @@ class WikipediaScraper:
 
         return "k.A."
 
+class WikipediaScraper:
+    """Handles the Wikipedia search and data extraction"""
+
+    def __init__(self):
+        wikipedia.set_lang(Config.LANG)
+
+    def _extract_domain_hint(self, website):
+        """Extracts the domain key from the website URL"""
+        if not website:
+            return ""
+        # Strip protocol and www, split into parts
+        clean_url = website.lower().replace("https://", "").replace("http://", "").replace("www.", "")
+        domain_parts = clean_url.split(".")
+        return domain_parts[0] if domain_parts else ""
+
+    def _generate_search_terms(self, company_name, website_hint=""):
+        """Generates search terms from the company name and website"""
+        search_terms = [company_name.strip()]
+
+        # Strip legal forms and special characters
+        clean_name = re.sub(
+            r'\s+(?:GmbH|AG|KG|OHG|e\.V\.|mbH|& Co\. KG| GmbH & Co\. KG).*$',
+            '',
+            company_name
+        ).strip()
+
+        # Add the cleaned name if it differs
+        if clean_name and clean_name != company_name:
+            search_terms.append(clean_name)
+
+        # Take the first two relevant words
+        name_words = [w for w in re.split(r'\W+', clean_name) if w]
+        if len(name_words) >= 2:
+            search_terms.append(" ".join(name_words[:2]))
+
+        # Add the domain hint
+        domain_hint = self._extract_domain_hint(website_hint)
+        if domain_hint and domain_hint not in ["de", "com", "org", "net"]:
+            search_terms.append(domain_hint)
+
+        debug_print(f"Generated search terms: {search_terms}")
+        return list(set(search_terms))  # remove duplicates
+
+    def _validate_article(self, page, company_name, domain_hint=""):
+        """Checks whether the article matches the company"""
+        # Normalize both names
+        page_title = re.sub(r'\(.*?\)', '', page.title).strip().lower()
+        search_name = re.sub(r'[^a-zA-Z0-9äöüß ]', '', company_name).strip().lower()
+
+        # Similarity check
+        similarity = SequenceMatcher(None, page_title, search_name).ratio()
+        debug_print(f"Similarity '{page_title}' vs '{search_name}': {similarity:.2f}")
+
+        # Additional domain check
+        if domain_hint:
+            html_content = requests.get(page.url).text.lower()
+            if domain_hint not in html_content:
+                debug_print(f"Domain hint '{domain_hint}' not found in article")
+                return False
+
+        return similarity >= Config.SIMILARITY_THRESHOLD
+
+    @retry_on_failure
+    def search_company_article(self, company_name, website_hint=""):
+        """Main entry point for the article search"""
+        search_terms = self._generate_search_terms(company_name, website_hint)
+        domain_hint = self._extract_domain_hint(website_hint)
+
+        for term in search_terms:
+            try:
+                results = wikipedia.search(term, results=Config.WIKIPEDIA_SEARCH_RESULTS)
+                debug_print(f"Search results for '{term}': {results}")
+
+                for title in results:
+                    try:
+                        page = wikipedia.page(title, auto_suggest=False)
+                        if self._validate_article(page, company_name, domain_hint):
+                            return page
+                    except wikipedia.exceptions.DisambiguationError:
+                        continue
+            except Exception as e:
+                debug_print(f"Error while searching for {term}: {str(e)}")
+                continue
+        return None
+
+    def extract_company_data(self, page_url):
+        """Extracts industry and revenue from the Wikipedia article"""
+        response = requests.get(page_url)
+        soup = BeautifulSoup(response.text, Config.HTML_PARSER)
+
+        return {
+            'branche': self._extract_infobox_value(soup, 'branche'),
+            'umsatz': self._extract_infobox_value(soup, 'umsatz'),
+            'url': page_url
+        }
+
+    def _extract_infobox_value(self, soup, target):
+        """Extracts a specific value from the infobox using extended search patterns"""
+        # Extended infobox detection
+        infobox = soup.find('table', class_=lambda c: c and any(
+            kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']
+        ))
+
 # ==================== DATA PROCESSOR ====================
 class DataProcessor:
     """Controls the overall process"""
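
search_company_article is wrapped in @retry_on_failure, which is defined
elsewhere in brancheneinstufung.py and is not visible in this patch. A minimal
sketch of what such a bare decorator could look like; the attempt count, delay,
and backoff strategy here are assumptions, not the project's actual values:

    import functools
    import time

    def retry_on_failure(func):
        """Retries the wrapped call a few times before re-raising."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            last_exc = None
            for attempt in range(1, 4):           # assumed: 3 attempts
                try:
                    return func(*args, **kwargs)
                except Exception as exc:          # network and parser errors alike
                    last_exc = exc
                    time.sleep(2.0 * attempt)     # assumed: linear backoff
            raise last_exc
        return wrapper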
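
The title check in _validate_article compares the cleaned page title against
the cleaned company name with difflib. A worked example of how the ratio
behaves for a full legal name; the 0.6 threshold is only an assumed stand-in
for Config.SIMILARITY_THRESHOLD:

    from difflib import SequenceMatcher

    page_title = "siemens"                      # "Siemens (Unternehmen)" after cleanup
    search_name = "siemens aktiengesellschaft"  # full legal name, lowercased

    ratio = SequenceMatcher(None, page_title, search_name).ratio()
    print(f"{ratio:.2f}")  # 0.42 -> rejected at an assumed threshold of 0.6

This is why _generate_search_terms strips legal forms before searching: the
shorter the residual name noise, the higher the ratio for a genuine match.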
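
End to end, the scraper is used in two steps: find a sufficiently similar
article, then pull industry and revenue from its infobox. A usage sketch,
assuming the Config values, debug_print helper, and imports defined earlier in
brancheneinstufung.py; the company name and URL are examples only:

    scraper = WikipediaScraper()
    page = scraper.search_company_article(
        "Siemens AG", website_hint="https://www.siemens.com")
    if page is not None:
        data = scraper.extract_company_data(page.url)
        print(data['branche'], data['umsatz'], data['url'])
    else:
        print("k.A.")  # no article passed the similarity and domain checks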