fix: Robustify Market Intel - Complete rewrite of orchestrator
- market_intel_orchestrator.py: Overwrote file to enforce all fixes (URL auto-scheme, User-Agent, Graceful Fallback in Strategy and Audit).
This commit is contained in:
@@ -59,6 +59,10 @@ def load_serp_api_key(file_path="serpapikey.txt"):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def get_website_text(url):
|
def get_website_text(url):
|
||||||
|
# Auto-fix missing scheme
|
||||||
|
if url and not url.startswith('http'):
|
||||||
|
url = 'https://' + url
|
||||||
|
|
||||||
logger.info(f"Scraping URL: {url}")
|
logger.info(f"Scraping URL: {url}")
|
||||||
try:
|
try:
|
||||||
# Use a more realistic, modern User-Agent to avoid blocking
|
# Use a more realistic, modern User-Agent to avoid blocking
|
||||||
@@ -80,7 +84,247 @@ def get_website_text(url):
|
|||||||
logger.error(f"Scraping failed for {url}: {e}")
|
logger.error(f"Scraping failed for {url}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# ... (omitted parts) ...
|
def serp_search(query, num_results=3):
    """Run a Google search through SerpAPI and return the organic hits.

    Args:
        query: Search string sent to SerpAPI as the ``q`` parameter.
        num_results: Value sent to SerpAPI as the ``num`` parameter.

    Returns:
        A list of dicts with the keys ``title``, ``link`` and ``snippet``
        (a value is ``None`` when the result entry lacks that field).
        An empty list is returned when no API key could be loaded, when the
        response carries no ``organic_results``, or when the request fails.
    """
    api_key = load_serp_api_key()
    if not api_key:
        logger.warning("SerpAPI Key fehlt. Suche übersprungen.")
        return []

    logger.info(f"SerpAPI Suche: {query}")
    params = {
        "engine": "google",
        "q": query,
        "api_key": api_key,
        "num": num_results,
        "hl": "de",
        "gl": "de",
    }
    try:
        response = requests.get("https://serpapi.com/search", params=params, timeout=20)
        response.raise_for_status()
        data = response.json()

        # `or []` also covers an explicit null value for "organic_results".
        organic_results = data.get("organic_results") or []
        return [
            {
                "title": result.get("title"),
                "link": result.get("link"),
                "snippet": result.get("snippet"),
            }
            for result in organic_results
        ]
    except Exception as e:
        # Deliberately broad: the search is best-effort and callers expect a
        # list, so any network/parsing problem degrades to "no results".
        logger.error(f"SerpAPI Fehler: {e}")
        return []
|
||||||
|
|
||||||
|
def _extract_target_industries_from_context(context_content):
|
||||||
|
md = context_content
|
||||||
|
# Versuche verschiedene Muster für die Tabelle, falls das Format variiert
|
||||||
|
step2_match = re.search(r'##\s*Schritt\s*2:[\s\S]*?(?=\n##\s*Schritt\s*\d:|
|
||||||
|
*$)', md, re.IGNORECASE)
|
||||||
|
if not step2_match:
|
||||||
|
# Fallback: Suche nach "Zielbranche" irgendwo im Text
|
||||||
|
match = re.search(r'Zielbranche\s*\|?\s*([^|\n]+)', md, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
return [s.strip() for s in match.group(1).split(',')]
|
||||||
|
return []
|
||||||
|
|
||||||
|
table_lines = []
|
||||||
|
in_table = False
|
||||||
|
for line in step2_match.group(0).split('\n'):
|
||||||
|
if line.strip().startswith('|'):
|
||||||
|
in_table = True
|
||||||
|
table_lines.append(line.strip())
|
||||||
|
elif in_table:
|
||||||
|
break
|
||||||
|
|
||||||
|
if len(table_lines) < 3: return []
|
||||||
|
header = [s.strip() for s in table_lines[0].split('|') if s.strip()]
|
||||||
|
industry_col = next((h for h in header if re.search(r'zielbranche|segment|branche|industrie', h, re.IGNORECASE)), None)
|
||||||
|
if not industry_col: return []
|
||||||
|
|
||||||
|
col_idx = header.index(industry_col)
|
||||||
|
industries = []
|
||||||
|
for line in table_lines[2:]:
|
||||||
|
cells = [s.strip() for s in line.split('|') if s.strip()]
|
||||||
|
if len(cells) > col_idx: industries.append(cells[col_idx])
|
||||||
|
return list(set(industries))
|
||||||
|
|
||||||
|
def _extract_json_from_text(text):
|
||||||
|
"""
|
||||||
|
Versucht, ein JSON-Objekt aus einem Textstring zu extrahieren,
|
||||||
|
unabhängig von Markdown-Formatierung (```json ... ```).
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# 1. Versuch: Direktersatz von Markdown-Tags (falls vorhanden)
|
||||||
|
clean_text = text.replace("```json", "").replace("```", "").strip()
|
||||||
|
return json.loads(clean_text)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 2. Versuch: Regex Suche nach dem ersten { und letzten }
|
||||||
|
json_match = re.search(r"(\{[\s\S]*\})", text)
|
||||||
|
if json_match:
|
||||||
|
return json.loads(json_match.group(1))
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
logger.error(f"JSON Parsing fehlgeschlagen. Roher Text: {text[:500]}...")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def generate_search_strategy(reference_url, context_content):
    """Ask Gemini for a lookalike search strategy based on a reference client.

    The reference client's homepage is scraped and sent to the model together
    with the strategic context. If scraping fails, a placeholder text is sent
    instead so the model works from the context and the URL alone.

    Args:
        reference_url: Homepage URL of the reference client.
        context_content: Strategic context text describing the own company
            and its offer; inserted verbatim into the prompt.

    Returns:
        The JSON value parsed from the model response. On any failure
        (request error, unexpected response shape, unparseable JSON) a
        fallback dict with the same top-level keys, error messages as values
        and an empty ``signals`` list is returned instead of raising.
    """
    logger.info(f"Generating strategy for {reference_url}")
    api_key = load_gemini_api_key()

    homepage_text = get_website_text(reference_url)
    if not homepage_text:
        logger.warning(f"Strategy Generation: Could not scrape {reference_url}. Relying on context.")
        homepage_text = "[WEBSITE ACCESS DENIED] - The strategy must be developed based on the provided STRATEGIC CONTEXT and the URL name alone."

    # Switch to stable 2.5-pro model (which works for v1beta)
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"

    prompt = f"""
    You are a B2B Market Intelligence Architect.

    --- ROLE DEFINITION ---
    You are working for the company described in the "STRATEGIC CONTEXT" below (The "Hunter").
    Your goal is to find new potential customers who look exactly like the "REFERENCE CLIENT" described below (The "Seed" / "Prey").

    --- STRATEGIC CONTEXT (YOUR COMPANY / THE OFFER) ---
    {context_content}

    --- REFERENCE CLIENT HOMEPAGE (THE IDEAL CUSTOMER TO CLONE) ---
    URL: {reference_url}
    CONTENT: {homepage_text[:10000]}

    --- TASK ---
    Develop a search strategy to find **Lookalikes of the Reference Client** who would be interested in **Your Company's Offer**.

    1. **summaryOfOffer**: A 1-sentence summary of what the **REFERENCE CLIENT** does (NOT what your company does). We need this to search for similar companies.
    2. **idealCustomerProfile**: A concise definition of the Ideal Customer Profile (ICP) based on the Reference Client's characteristics.
    3. **searchStrategyICP**: A detailed description of the Ideal Customer Profile (ICP) based on the analysis.
    4. **digitalSignals**: Identification and description of relevant digital signals that indicate purchase interest or engagement for YOUR offer.
    5. **targetPages**: A list of the most important target pages on the company website relevant for marketing and sales activities.
    6. **signals**: Identify exactly 4 specific digital signals to check on potential lookalikes.
       - **CRITICAL**: One signal MUST be "Technographic / Incumbent Search". It must look for existing competitor software or legacy systems that **YOUR COMPANY'S OFFER** replaces or complements.
       - The other 3 signals should focus on business pains or strategic fit.

    --- SIGNAL DEFINITION ---
    For EACH signal, you MUST provide:
    - `id`: A unique ID (e.g., "sig_1").
    - `name`: A short, descriptive name.
    - `description`: What does this signal indicate?
    - `targetPageKeywords`: A list of 3-5 keywords to look for on a company's website (e.g., ["career", "jobs"] for a hiring signal).
    - `proofStrategy`: An object containing:
        - `likelySource`: Where on the website or web is this info found? (e.g., "Careers Page").
        - `searchQueryTemplate`: A Google search query to find this. Use `{{COMPANY}}` as a placeholder for the company name.
          Example: `site:{{COMPANY}} "software engineer" OR "developer"`

    --- OUTPUT FORMAT ---
    Return ONLY a valid JSON object.
    {{
        "summaryOfOffer": "The Reference Client provides...",
        "idealCustomerProfile": "...",
        "searchStrategyICP": "...",
        "digitalSignals": "...",
        "targetPages": "...",
        "signals": [ ... ]
    }}
    """

    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    try:
        # Without a timeout requests waits indefinitely on a stalled
        # connection; the limit is generous because generation can be slow.
        response = requests.post(
            GEMINI_API_URL,
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=120,
        )
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")

        text = res_json['candidates'][0]['content']['parts'][0]['text']

        # Raw response is diagnostic output, not an error condition.
        logger.debug(f"RAW GEMINI JSON RESPONSE: {text}")

        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result

    except Exception as e:
        logger.error(f"Strategy generation failed: {e}")
        # Return fallback to avoid frontend crash
        return {
            "summaryOfOffer": "Error generating strategy. Please check logs.",
            "idealCustomerProfile": "Error generating ICP. Please check logs.",
            "searchStrategyICP": "Error generating Search Strategy ICP. Please check logs.",
            "digitalSignals": "Error generating Digital Signals. Please check logs.",
            "targetPages": "Error generating Target Pages. Please check logs.",
            "signals": [],
        }
|
||||||
|
|
||||||
|
def identify_competitors(reference_url, target_market, industries, summary_of_offer=None):
    """Ask Gemini for competitors / lookalikes of the reference client.

    Args:
        reference_url: Homepage URL of the reference client.
        target_market: Target market description inserted into the prompt.
        industries: Iterable of relevant industry names; ``None`` is treated
            as empty and items are converted to strings.
        summary_of_offer: Optional summary of what the reference client does.
            When missing, the prompt says "Not provided" instead of "None".

    Returns:
        The JSON value parsed from the model response. On any failure a dict
        with empty ``localCompetitors``, ``nationalCompetitors`` and
        ``internationalCompetitors`` lists is returned instead of raising.
    """
    logger.info(f"Identifying competitors for {reference_url}")
    api_key = load_gemini_api_key()
    # Switch to stable 2.5-pro model
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"

    # Prepared up front: joining None or non-string items would raise before
    # the try block below and bypass the graceful fallback.
    industries_text = ', '.join(str(industry) for industry in (industries or []))
    offer_text = summary_of_offer or "Not provided"

    prompt = f"""
    You are a B2B Market Analyst. Find 3-5 direct competitors or highly similar companies (lookalikes) for the company at `{reference_url}`.

    --- CONTEXT ---
    - Reference Client Business (What they do): {offer_text}
    - Target Market: {target_market}
    - Relevant Industries: {industries_text}

    --- TASK ---
    Identify companies that are **similar to the Reference Client** (i.e., Lookalikes).
    We are looking for other companies that do the same thing as `{reference_url}`.

    Categorize them into three groups:
    1. 'localCompetitors': Competitors in the same immediate region/city.
    2. 'nationalCompetitors': Competitors operating across the same country.
    3. 'internationalCompetitors': Global players.

    For EACH competitor, you MUST provide:
    - `id`: A unique, URL-friendly identifier (e.g., "competitor-name-gmbh").
    - `name`: The official, full name of the company.
    - `description`: A concise explanation of why they are a competitor.

    --- OUTPUT FORMAT ---
    Return ONLY a valid JSON object with the following structure:
    {{
        "localCompetitors": [ {{ "id": "...", "name": "...", "description": "..." }} ],
        "nationalCompetitors": [ ... ],
        "internationalCompetitors": [ ... ]
    }}
    """

    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    try:
        # Without a timeout requests waits indefinitely on a stalled
        # connection; the limit is generous because generation can be slow.
        response = requests.post(
            GEMINI_API_URL,
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=120,
        )
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")

        text = res_json['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result

    except Exception as e:
        logger.error(f"Competitor identification failed: {e}")
        # Empty groups keep the response shape stable for callers.
        return {"localCompetitors": [], "nationalCompetitors": [], "internationalCompetitors": []}
|
||||||
|
|
||||||
def analyze_company(company_name, strategy, target_market):
|
def analyze_company(company_name, strategy, target_market):
|
||||||
logger.info(f"--- STARTING DEEP TECH AUDIT FOR: {company_name} ---")
|
logger.info(f"--- STARTING DEEP TECH AUDIT FOR: {company_name} ---")
|
||||||
@@ -95,9 +339,35 @@ def analyze_company(company_name, strategy, target_market):
|
|||||||
logger.info(f"Website via SerpAPI gefunden: {url}")
|
logger.info(f"Website via SerpAPI gefunden: {url}")
|
||||||
|
|
||||||
if not url:
|
if not url:
|
||||||
# Fallback: Frage Gemini
|
# Fallback: Frage Gemini (Low Confidence)
|
||||||
# ... (Gemini URL fallback logic remains same) ...
|
logger.info("Keine URL via SerpAPI, frage Gemini...")
|
||||||
pass
|
prompt_url = f"What is the official homepage URL for the company '{company_name}' in the market '{target_market}'? Respond with ONLY the single, complete URL and nothing else."
|
||||||
|
payload_url = {"contents": [{"parts": [{"text": prompt_url}]}]}
|
||||||
|
logger.info("Sende Anfrage an Gemini API (URL Fallback)...")
|
||||||
|
# logger.debug(f"Rohe Gemini API-Anfrage (JSON): {json.dumps(payload_url, indent=2)}")
|
||||||
|
try:
|
||||||
|
res = requests.post(GEMINI_API_URL, json=payload_url, headers={'Content-Type': 'application/json'}, timeout=15)
|
||||||
|
res.raise_for_status()
|
||||||
|
res_json = res.json()
|
||||||
|
logger.info(f"Gemini API-Antwort erhalten (Status: {res.status_code}).")
|
||||||
|
|
||||||
|
candidate = res_json.get('candidates', [{}])[0]
|
||||||
|
content = candidate.get('content', {}).get('parts', [{}])[0]
|
||||||
|
text_response = content.get('text', '').strip()
|
||||||
|
|
||||||
|
url_match = re.search(r'(https?://[^\s"]+)', text_response)
|
||||||
|
if url_match:
|
||||||
|
url = url_match.group(1)
|
||||||
|
logger.info(f"Gemini Fallback hat URL gefunden: {url}")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Keine gültige URL in Gemini-Antwort gefunden: '{text_response}'")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Gemini URL Fallback failed: {e}")
|
||||||
|
pass
|
||||||
|
|
||||||
|
if not url or not url.startswith("http"):
|
||||||
|
return {"error": f"Could not find website for {company_name}"}
|
||||||
|
|
||||||
# 2. Homepage Scraping with GRACEFUL FALLBACK
|
# 2. Homepage Scraping with GRACEFUL FALLBACK
|
||||||
homepage_text = ""
|
homepage_text = ""
|
||||||
@@ -116,8 +386,8 @@ def analyze_company(company_name, strategy, target_market):
|
|||||||
scraping_note = "(No URL found)"
|
scraping_note = "(No URL found)"
|
||||||
|
|
||||||
# --- ENHANCED: EXTERNAL TECHNOGRAPHIC INTELLIGENCE ---
|
# --- ENHANCED: EXTERNAL TECHNOGRAPHIC INTELLIGENCE ---
|
||||||
# ... (remains same) ...
|
# Suche aktiv nach Wettbewerbern, nicht nur auf der Firmenwebsite.
|
||||||
|
tech_evidence = []
|
||||||
|
|
||||||
# Liste bekannter Wettbewerber / Incumbents
|
# Liste bekannter Wettbewerber / Incumbents
|
||||||
known_incumbents = [
|
known_incumbents = [
|
||||||
@@ -136,7 +406,7 @@ def analyze_company(company_name, strategy, target_market):
|
|||||||
tech_queries = [
|
tech_queries = [
|
||||||
f'"{company_name}" ({group1})',
|
f'"{company_name}" ({group1})',
|
||||||
f'"{company_name}" ({group2})',
|
f'"{company_name}" ({group2})',
|
||||||
f'"{company_name}" "supplier portal" login' # Suche nach dem Portal selbst
|
f'"{company_name}" "supplier portal" login" # Suche nach dem Portal selbst
|
||||||
]
|
]
|
||||||
|
|
||||||
logger.info(f"Starte erweiterte Tech-Stack-Suche für {company_name}...")
|
logger.info(f"Starte erweiterte Tech-Stack-Suche für {company_name}...")
|
||||||
@@ -155,7 +425,7 @@ def analyze_company(company_name, strategy, target_market):
|
|||||||
|
|
||||||
# Firmographics Search
|
# Firmographics Search
|
||||||
firmographics_results = serp_search(f"{company_name} Umsatz Mitarbeiterzahl 2023")
|
firmographics_results = serp_search(f"{company_name} Umsatz Mitarbeiterzahl 2023")
|
||||||
firmographics_context = "\n".join([f"- {r['snippet']} ({r['link']})" for r in firmographics_results])
|
firmographics_context = "\n".join([f"- {r['snippet']} ({r['link']})") for r in firmographics_results])
|
||||||
|
|
||||||
# Signal Searches (Original Strategy)
|
# Signal Searches (Original Strategy)
|
||||||
signals = strategy.get('signals', [])
|
signals = strategy.get('signals', [])
|
||||||
@@ -182,7 +452,7 @@ def analyze_company(company_name, strategy, target_market):
|
|||||||
logger.info(f"Signal Search '{signal['name']}': {query}")
|
logger.info(f"Signal Search '{signal['name']}': {query}")
|
||||||
results = serp_search(query, num_results=3)
|
results = serp_search(query, num_results=3)
|
||||||
if results:
|
if results:
|
||||||
search_context = "\n".join([f" * Snippet: {r['snippet']}\n Source: {r['link']}" for r in results])
|
search_context = "\n".join([f" * Snippet: {r['snippet']}\n Source: {r['link']}") for r in results])
|
||||||
|
|
||||||
if search_context:
|
if search_context:
|
||||||
signal_evidence.append(f"SIGNAL '{signal['name']}':\n{search_context}")
|
signal_evidence.append(f"SIGNAL '{signal['name']}':\n{search_context}")
|
||||||
@@ -214,7 +484,7 @@ def analyze_company(company_name, strategy, target_market):
|
|||||||
TASK:
|
TASK:
|
||||||
1. **Firmographics**: Estimate Revenue and Employees.
|
1. **Firmographics**: Estimate Revenue and Employees.
|
||||||
2. **Technographic Audit**: Look for specific competitor software or legacy systems mentioned in EVIDENCE 1 (e.g., "Partner of SynerTrade", "Login to Jaggaer Portal").
|
2. **Technographic Audit**: Look for specific competitor software or legacy systems mentioned in EVIDENCE 1 (e.g., "Partner of SynerTrade", "Login to Jaggaer Portal").
|
||||||
3. **Status**:
|
3. **Status**:
|
||||||
- Set to "Nutzt Wettbewerber" if ANY competitor technology is found (Ariba, Jaggaer, SynerTrade, Coupa, etc.).
|
- Set to "Nutzt Wettbewerber" if ANY competitor technology is found (Ariba, Jaggaer, SynerTrade, Coupa, etc.).
|
||||||
- Set to "Greenfield" ONLY if absolutely no competitor tech is found.
|
- Set to "Greenfield" ONLY if absolutely no competitor tech is found.
|
||||||
- Set to "Bestandskunde" if they already use our solution.
|
- Set to "Bestandskunde" if they already use our solution.
|
||||||
@@ -406,4 +676,4 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
Reference in New Issue
Block a user