From 032a269c6f09ce24915e2b43f083d4dd9e3d3594 Mon Sep 17 00:00:00 2001 From: Floke Date: Mon, 29 Dec 2025 13:33:05 +0000 Subject: [PATCH] fix: Robustify Market Intel - Complete rewrite of orchestrator - market_intel_orchestrator.py: Overwrote file to enforce all fixes (URL auto-scheme, User-Agent, Graceful Fallback in Strategy and Audit). --- market_intel_orchestrator.py | 292 +++++++++++++++++++++++++++++++++-- 1 file changed, 281 insertions(+), 11 deletions(-) diff --git a/market_intel_orchestrator.py b/market_intel_orchestrator.py index 700d10e0..3d8551f7 100644 --- a/market_intel_orchestrator.py +++ b/market_intel_orchestrator.py @@ -59,6 +59,10 @@ def load_serp_api_key(file_path="serpapikey.txt"): return None def get_website_text(url): + # Auto-fix missing scheme + if url and not url.startswith('http'): + url = 'https://' + url + logger.info(f"Scraping URL: {url}") try: # Use a more realistic, modern User-Agent to avoid blocking @@ -80,7 +84,247 @@ def get_website_text(url): logger.error(f"Scraping failed for {url}: {e}") return None -# ... (omitted parts) ... +def serp_search(query, num_results=3): + """Führt eine Google-Suche über SerpAPI durch.""" + api_key = load_serp_api_key() + if not api_key: + logger.warning("SerpAPI Key fehlt. 
Suche übersprungen.")
+        return []
+
+    logger.info(f"SerpAPI Suche: {query}")
+    try:
+        params = {
+            "engine": "google",
+            "q": query,
+            "api_key": api_key,
+            "num": num_results,
+            "hl": "de",
+            "gl": "de"
+        }
+        response = requests.get("https://serpapi.com/search", params=params, timeout=20)
+        response.raise_for_status()
+        data = response.json()
+
+        results = []
+        if "organic_results" in data:
+            for result in data["organic_results"]:
+                results.append({
+                    "title": result.get("title"),
+                    "link": result.get("link"),
+                    "snippet": result.get("snippet")
+                })
+        return results
+    except Exception as e:
+        logger.error(f"SerpAPI Fehler: {e}")
+        return []
+
+def _extract_target_industries_from_context(context_content):
+    md = context_content
+    # Versuche verschiedene Muster für die Tabelle, falls das Format variiert
+    step2_match = re.search(r'##\s*Schritt\s*2:[\s\S]*?(?=\n##\s*Schritt\s*\d:|\n *$)', md, re.IGNORECASE)
+    if not step2_match:
+        # Fallback: Suche nach "Zielbranche" irgendwo im Text
+        match = re.search(r'Zielbranche\s*\|?\s*([^|\n]+)', md, re.IGNORECASE)
+        if match:
+            return [s.strip() for s in match.group(1).split(',')]
+        return []
+
+    table_lines = []
+    in_table = False
+    for line in step2_match.group(0).split('\n'):
+        if line.strip().startswith('|'):
+            in_table = True
+            table_lines.append(line.strip())
+        elif in_table:
+            break
+
+    if len(table_lines) < 3: return []
+    header = [s.strip() for s in table_lines[0].split('|') if s.strip()]
+    industry_col = next((h for h in header if re.search(r'zielbranche|segment|branche|industrie', h, re.IGNORECASE)), None)
+    if not industry_col: return []
+
+    col_idx = header.index(industry_col)
+    industries = []
+    for line in table_lines[2:]:
+        cells = [s.strip() for s in line.split('|') if s.strip()]
+        if len(cells) > col_idx: industries.append(cells[col_idx])
+    return list(set(industries))
+
+def _extract_json_from_text(text):
+    """
+    Versucht, ein JSON-Objekt aus einem Textstring zu extrahieren,
+    unabhängig von 
Markdown-Formatierung (```json ... ```). + """ + try: + # 1. Versuch: Direktersatz von Markdown-Tags (falls vorhanden) + clean_text = text.replace("```json", "").replace("```", "").strip() + return json.loads(clean_text) + except json.JSONDecodeError: + pass + + try: + # 2. Versuch: Regex Suche nach dem ersten { und letzten } + json_match = re.search(r"(\{[\s\S]*\})", text) + if json_match: + return json.loads(json_match.group(1)) + except json.JSONDecodeError: + pass + + logger.error(f"JSON Parsing fehlgeschlagen. Roher Text: {text[:500]}...") + return None + +def generate_search_strategy(reference_url, context_content): + logger.info(f"Generating strategy for {reference_url}") + api_key = load_gemini_api_key() + target_industries = _extract_target_industries_from_context(context_content) + + homepage_text = get_website_text(reference_url) + if not homepage_text: + logger.warning(f"Strategy Generation: Could not scrape {reference_url}. Relying on context.") + homepage_text = "[WEBSITE ACCESS DENIED] - The strategy must be developed based on the provided STRATEGIC CONTEXT and the URL name alone." + + # Switch to stable 2.5-pro model (which works for v1beta) + GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}" + + prompt = f""" + You are a B2B Market Intelligence Architect. + + --- ROLE DEFINITION --- + You are working for the company described in the "STRATEGIC CONTEXT" below (The "Hunter"). + Your goal is to find new potential customers who look exactly like the "REFERENCE CLIENT" described below (The "Seed" / "Prey"). + + --- STRATEGIC CONTEXT (YOUR COMPANY / THE OFFER) --- + {context_content} + + --- REFERENCE CLIENT HOMEPAGE (THE IDEAL CUSTOMER TO CLONE) --- + URL: {reference_url} + CONTENT: {homepage_text[:10000]} + + --- TASK --- + Develop a search strategy to find **Lookalikes of the Reference Client** who would be interested in **Your Company's Offer**. + + 1. 
**summaryOfOffer**: A 1-sentence summary of what the **REFERENCE CLIENT** does (NOT what your company does). We need this to search for similar companies. + 2. **idealCustomerProfile**: A concise definition of the Ideal Customer Profile (ICP) based on the Reference Client's characteristics. + 3. **searchStrategyICP**: A detailed description of the Ideal Customer Profile (ICP) based on the analysis. + 4. **digitalSignals**: Identification and description of relevant digital signals that indicate purchase interest or engagement for YOUR offer. + 5. **targetPages**: A list of the most important target pages on the company website relevant for marketing and sales activities. + 6. **signals**: Identify exactly 4 specific digital signals to check on potential lookalikes. + - **CRITICAL**: One signal MUST be "Technographic / Incumbent Search". It must look for existing competitor software or legacy systems that **YOUR COMPANY'S OFFER** replaces or complements. + - The other 3 signals should focus on business pains or strategic fit. + + --- SIGNAL DEFINITION --- + For EACH signal, you MUST provide: + - `id`: A unique ID (e.g., "sig_1"). + - `name`: A short, descriptive name. + - `description`: What does this signal indicate? + - `targetPageKeywords`: A list of 3-5 keywords to look for on a company's website (e.g., ["career", "jobs"] for a hiring signal). + - `proofStrategy`: An object containing: + - `likelySource`: Where on the website or web is this info found? (e.g., "Careers Page"). + - `searchQueryTemplate`: A Google search query to find this. Use `{{COMPANY}}` as a placeholder for the company name. + Example: `site:{{COMPANY}} "software engineer" OR "developer"` + + --- OUTPUT FORMAT --- + Return ONLY a valid JSON object. + {{ + "summaryOfOffer": "The Reference Client provides...", + "idealCustomerProfile": "...", + "searchStrategyICP": "...", + "digitalSignals": "...", + "targetPages": "...", + "signals": [ ... 
] + }} + """ + + payload = {"contents": [{"parts": [{"text": prompt}]}]} + logger.info("Sende Anfrage an Gemini API...") + try: + response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'}) + response.raise_for_status() + res_json = response.json() + logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).") + + text = res_json['candidates'][0]['content']['parts'][0]['text'] + + # DEBUG LOGGING FOR RAW JSON + logger.error(f"RAW GEMINI JSON RESPONSE: {text}") + + result = _extract_json_from_text(text) + + if not result: + raise ValueError("Konnte kein valides JSON extrahieren") + + return result + + except Exception as e: + logger.error(f"Strategy generation failed: {e}") + # Return fallback to avoid frontend crash + return { + "summaryOfOffer": "Error generating strategy. Please check logs.", + "idealCustomerProfile": "Error generating ICP. Please check logs.", + "searchStrategyICP": "Error generating Search Strategy ICP. Please check logs.", + "digitalSignals": "Error generating Digital Signals. Please check logs.", + "targetPages": "Error generating Target Pages. Please check logs.", + "signals": [] + } + +def identify_competitors(reference_url, target_market, industries, summary_of_offer=None): + logger.info(f"Identifying competitors for {reference_url}") + api_key = load_gemini_api_key() + # Switch to stable 2.5-pro model + GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}" + + prompt = f""" + You are a B2B Market Analyst. Find 3-5 direct competitors or highly similar companies (lookalikes) for the company at `{reference_url}`. + + --- CONTEXT --- + - Reference Client Business (What they do): {summary_of_offer} + - Target Market: {target_market} + - Relevant Industries: {', '.join(industries)} + + --- TASK --- + Identify companies that are **similar to the Reference Client** (i.e., Lookalikes). 
+ We are looking for other companies that do the same thing as `{reference_url}`. + + Categorize them into three groups: + 1. 'localCompetitors': Competitors in the same immediate region/city. + 2. 'nationalCompetitors': Competitors operating across the same country. + 3. 'internationalCompetitors': Global players. + + For EACH competitor, you MUST provide: + - `id`: A unique, URL-friendly identifier (e.g., "competitor-name-gmbh"). + - `name`: The official, full name of the company. + - `description`: A concise explanation of why they are a competitor. + + --- OUTPUT FORMAT --- + Return ONLY a valid JSON object with the following structure: + {{ + "localCompetitors": [ {{ "id": "...", "name": "...", "description": "..." }} ], + "nationalCompetitors": [ ... ], + "internationalCompetitors": [ ... ] + }} + """ + + payload = {"contents": [{"parts": [{"text": prompt}]}]} + logger.info("Sende Anfrage an Gemini API...") + # logger.debug(f"Rohe Gemini API-Anfrage (JSON): {json.dumps(payload, indent=2)}") + try: + response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'}) + response.raise_for_status() + res_json = response.json() + logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).") + + text = res_json['candidates'][0]['content']['parts'][0]['text'] + result = _extract_json_from_text(text) + + if not result: + raise ValueError("Konnte kein valides JSON extrahieren") + + return result + + except Exception as e: + logger.error(f"Competitor identification failed: {e}") + return {"localCompetitors": [], "nationalCompetitors": [], "internationalCompetitors": []} def analyze_company(company_name, strategy, target_market): logger.info(f"--- STARTING DEEP TECH AUDIT FOR: {company_name} ---") @@ -95,9 +339,35 @@ def analyze_company(company_name, strategy, target_market): logger.info(f"Website via SerpAPI gefunden: {url}") if not url: - # Fallback: Frage Gemini - # ... (Gemini URL fallback logic remains same) ... 
- pass + # Fallback: Frage Gemini (Low Confidence) + logger.info("Keine URL via SerpAPI, frage Gemini...") + prompt_url = f"What is the official homepage URL for the company '{company_name}' in the market '{target_market}'? Respond with ONLY the single, complete URL and nothing else." + payload_url = {"contents": [{"parts": [{"text": prompt_url}]}]} + logger.info("Sende Anfrage an Gemini API (URL Fallback)...") + # logger.debug(f"Rohe Gemini API-Anfrage (JSON): {json.dumps(payload_url, indent=2)}") + try: + res = requests.post(GEMINI_API_URL, json=payload_url, headers={'Content-Type': 'application/json'}, timeout=15) + res.raise_for_status() + res_json = res.json() + logger.info(f"Gemini API-Antwort erhalten (Status: {res.status_code}).") + + candidate = res_json.get('candidates', [{}])[0] + content = candidate.get('content', {}).get('parts', [{}])[0] + text_response = content.get('text', '').strip() + + url_match = re.search(r'(https?://[^\s"]+)', text_response) + if url_match: + url = url_match.group(1) + logger.info(f"Gemini Fallback hat URL gefunden: {url}") + else: + logger.warning(f"Keine gültige URL in Gemini-Antwort gefunden: '{text_response}'") + + except Exception as e: + logger.error(f"Gemini URL Fallback failed: {e}") + pass + + if not url or not url.startswith("http"): + return {"error": f"Could not find website for {company_name}"} # 2. Homepage Scraping with GRACEFUL FALLBACK homepage_text = "" @@ -116,8 +386,8 @@ def analyze_company(company_name, strategy, target_market): scraping_note = "(No URL found)" # --- ENHANCED: EXTERNAL TECHNOGRAPHIC INTELLIGENCE --- - # ... (remains same) ... - + # Suche aktiv nach Wettbewerbern, nicht nur auf der Firmenwebsite. 
+ 
     tech_evidence = []
     # Liste bekannter Wettbewerber / Incumbents
     known_incumbents = [
@@ -136,7 +406,7 @@ def analyze_company(company_name, strategy, target_market):
     tech_queries = [
         f'"{company_name}" ({group1})',
         f'"{company_name}" ({group2})',
-        f'"{company_name}" "supplier portal" login' # Suche nach dem Portal selbst
+        f'"{company_name}" "supplier portal" login' # Suche nach dem Portal selbst
     ]
 
     logger.info(f"Starte erweiterte Tech-Stack-Suche für {company_name}...")
@@ -155,7 +425,7 @@ def analyze_company(company_name, strategy, target_market):
 
     # Firmographics Search
     firmographics_results = serp_search(f"{company_name} Umsatz Mitarbeiterzahl 2023")
-    firmographics_context = "\n".join([f"- {r['snippet']} ({r['link']})" for r in firmographics_results])
+    firmographics_context = "\n".join([f"- {r['snippet']} ({r['link']})" for r in firmographics_results])
 
     # Signal Searches (Original Strategy)
     signals = strategy.get('signals', [])
@@ -182,7 +452,7 @@ def analyze_company(company_name, strategy, target_market):
             logger.info(f"Signal Search '{signal['name']}': {query}")
             results = serp_search(query, num_results=3)
             if results:
-                search_context = "\n".join([f" * Snippet: {r['snippet']}\n Source: {r['link']}" for r in results])
+                search_context = "\n".join([f" * Snippet: {r['snippet']}\n Source: {r['link']}" for r in results])
                 if search_context:
                     signal_evidence.append(f"SIGNAL '{signal['name']}':\n{search_context}")
@@ -214,7 +484,7 @@ def analyze_company(company_name, strategy, target_market):
     TASK:
     1. **Firmographics**: Estimate Revenue and Employees.
     2. **Technographic Audit**: Look for specific competitor software or legacy systems mentioned in EVIDENCE 1 (e.g., "Partner of SynerTrade", "Login to Jaggaer Portal").
-    3. **Status**:
+    3. **Status**: 
       - Set to "Nutzt Wettbewerber" if ANY competitor technology is found (Ariba, Jaggaer, SynerTrade, Coupa, etc.).
      - Set to "Greenfield" ONLY if absolutely no competitor tech is found. 
- Set to "Bestandskunde" if they already use our solution.
@@ -406,4 +676,4 @@ def main():
 
 if __name__ == "__main__":
-    main()
+    main()