fix: Robustify Market Intel Audit - Fallback when scraping fails

- market_intel_orchestrator.py: Updated analyze_company to NOT abort if homepage scraping fails (e.g. 403 Forbidden). Instead, it sets a placeholder and proceeds using external search signals.

- market_intel_orchestrator.py: Updated get_website_text to use a modern, realistic User-Agent to reduce blocking.

- market_intel_orchestrator.py: Adjusted Gemini prompt to handle missing homepage content gracefully.
This commit is contained in:
2025-12-29 13:21:08 +00:00
parent ce036383e8
commit 6811d42750

View File

@@ -61,260 +61,30 @@ def load_serp_api_key(file_path="serpapikey.txt"):
def get_website_text(url):
    """Fetch a web page and return its visible text, or None on failure.

    The page is requested with browser-like headers, parsed with
    BeautifulSoup, stripped of script/style/nav/footer/header elements and
    reduced to plain text. Every exception (network error, HTTP error status
    such as 403, parser failure) is logged and turned into a None return, so
    callers must handle None.

    Args:
        url: Address of the page to scrape.

    Returns:
        At most the first 15000 characters of the cleaned page text, or None
        when anything went wrong.
    """
    logger.info(f"Scraping URL: {url}")
    try:
        # Use a realistic, modern browser fingerprint to reduce blocking.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
            'Referer': 'https://www.google.com/'
        }
        # Exactly one request: a second fetch with the old headers would only
        # double the traffic and the chance of being rate-limited.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        # Drop markup that carries no descriptive content about the company.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()
        text = soup.get_text(separator=' ', strip=True)
        # Keep printable ASCII plus newline, carriage return and tab only.
        # NOTE(review): this also removes printable non-ASCII characters such
        # as German umlauts - confirm that is intended.
        text = re.sub(r'[^\x20-\x7E\n\r\t]', '', text)
        return text[:15000]
    except Exception as e:
        logger.error(f"Scraping failed for {url}: {e}")
        return None
def serp_search(query, num_results=3):
    """Run a Google search through SerpAPI and return the organic hits.

    Args:
        query: Search string sent to SerpAPI.
        num_results: Value passed as the "num" request parameter.

    Returns:
        A list of dicts with the keys "title", "link" and "snippet". The list
        is empty when no API key is available, when the response carries no
        "organic_results" entry, or when the request or parsing fails.
    """
    serp_key = load_serp_api_key()
    if not serp_key:
        logger.warning("SerpAPI Key fehlt. Suche übersprungen.")
        return []

    logger.info(f"SerpAPI Suche: {query}")
    try:
        response = requests.get(
            "https://serpapi.com/search",
            params={
                "engine": "google",
                "q": query,
                "api_key": serp_key,
                "num": num_results,
                "hl": "de",
                "gl": "de"
            },
            timeout=20,
        )
        response.raise_for_status()
        data = response.json()
        if "organic_results" not in data:
            return []
        # Reduce every organic hit to the three fields used downstream.
        return [
            {
                "title": hit.get("title"),
                "link": hit.get("link"),
                "snippet": hit.get("snippet")
            }
            for hit in data["organic_results"]
        ]
    except Exception as e:
        # Best effort: a failed search yields no results instead of an error.
        logger.error(f"SerpAPI Fehler: {e}")
        return []
def _extract_target_industries_from_context(context_content):
md = context_content
# Versuche verschiedene Muster für die Tabelle, falls das Format variiert
step2_match = re.search(r'##\s*Schritt\s*2:[\s\S]*?(?=\n##\s*Schritt\s*\d:|\s*$)', md, re.IGNORECASE)
if not step2_match:
# Fallback: Suche nach "Zielbranche" irgendwo im Text
match = re.search(r'Zielbranche\s*\|?\s*([^|\n]+)', md, re.IGNORECASE)
if match:
return [s.strip() for s in match.group(1).split(',')]
return []
table_lines = []
in_table = False
for line in step2_match.group(0).split('\n'):
if line.strip().startswith('|'):
in_table = True
table_lines.append(line.strip())
elif in_table: break
if len(table_lines) < 3: return []
header = [s.strip() for s in table_lines[0].split('|') if s.strip()]
industry_col = next((h for h in header if re.search(r'zielbranche|segment|branche|industrie', h, re.IGNORECASE)), None)
if not industry_col: return []
col_idx = header.index(industry_col)
industries = []
for line in table_lines[2:]:
cells = [s.strip() for s in line.split('|') if s.strip()]
if len(cells) > col_idx: industries.append(cells[col_idx])
return list(set(industries))
def _extract_json_from_text(text):
"""
Versucht, ein JSON-Objekt aus einem Textstring zu extrahieren,
unabhängig von Markdown-Formatierung (```json ... ```).
"""
try:
# 1. Versuch: Direktersatz von Markdown-Tags (falls vorhanden)
clean_text = text.replace("```json", "").replace("```", "").strip()
return json.loads(clean_text)
except json.JSONDecodeError:
pass
try:
# 2. Versuch: Regex Suche nach dem ersten { und letzten }
json_match = re.search(r"(\{[\s\S]*\})", text)
if json_match:
return json.loads(json_match.group(1))
except json.JSONDecodeError:
pass
logger.error(f"JSON Parsing fehlgeschlagen. Roher Text: {text[:500]}...")
return None
def generate_search_strategy(reference_url, context_content):
    """Ask Gemini for a lookalike search strategy based on a reference client.

    The reference client's homepage is scraped, combined with the strategic
    context into a prompt and sent to the Gemini generateContent endpoint.
    The JSON found in the answer is returned.

    Args:
        reference_url: Homepage URL of the reference client.
        context_content: Text describing the own company and offer; inserted
            verbatim into the prompt.

    Returns:
        The parsed strategy JSON. If the request fails or no JSON can be
        extracted, a placeholder dict with the same top-level keys and an
        empty "signals" list is returned so the frontend does not crash.
    """
    logger.info(f"Generating strategy for {reference_url}")
    api_key = load_gemini_api_key()
    homepage_text = get_website_text(reference_url)

    # Switch to stable 2.5-pro model (which works for v1beta)
    GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent"
    # The key travels in a header, not in the URL: HTTP errors raised by
    # requests embed the URL, which would otherwise leak the key into logs.
    request_headers = {'Content-Type': 'application/json', 'x-goog-api-key': api_key}
    # Generation can be slow, but a missing timeout would block forever.
    request_timeout = 120

    prompt = f"""
You are a B2B Market Intelligence Architect.
--- ROLE DEFINITION ---
You are working for the company described in the "STRATEGIC CONTEXT" below (The "Hunter").
Your goal is to find new potential customers who look exactly like the "REFERENCE CLIENT" described below (The "Seed" / "Prey").
--- STRATEGIC CONTEXT (YOUR COMPANY / THE OFFER) ---
{context_content}
--- REFERENCE CLIENT HOMEPAGE (THE IDEAL CUSTOMER TO CLONE) ---
URL: {reference_url}
CONTENT: {homepage_text[:10000] if homepage_text else "No Homepage Text"}
--- TASK ---
Develop a search strategy to find **Lookalikes of the Reference Client** who would be interested in **Your Company's Offer**.
1. **summaryOfOffer**: A 1-sentence summary of what the **REFERENCE CLIENT** does (NOT what your company does). We need this to search for similar companies.
2. **idealCustomerProfile**: A concise definition of the Ideal Customer Profile (ICP) based on the Reference Client's characteristics.
3. **searchStrategyICP**: A detailed description of the Ideal Customer Profile (ICP) based on the analysis.
4. **digitalSignals**: Identification and description of relevant digital signals that indicate purchase interest or engagement for YOUR offer.
5. **targetPages**: A list of the most important target pages on the company website relevant for marketing and sales activities.
6. **signals**: Identify exactly 4 specific digital signals to check on potential lookalikes.
- **CRITICAL**: One signal MUST be "Technographic / Incumbent Search". It must look for existing competitor software or legacy systems that **YOUR COMPANY'S OFFER** replaces or complements.
- The other 3 signals should focus on business pains or strategic fit.
--- SIGNAL DEFINITION ---
For EACH signal, you MUST provide:
- `id`: A unique ID (e.g., "sig_1").
- `name`: A short, descriptive name.
- `description`: What does this signal indicate?
- `targetPageKeywords`: A list of 3-5 keywords to look for on a company's website (e.g., ["career", "jobs"] for a hiring signal).
- `proofStrategy`: An object containing:
- `likelySource`: Where on the website or web is this info found? (e.g., "Careers Page").
- `searchQueryTemplate`: A Google search query to find this. Use `{{COMPANY}}` as a placeholder for the company name.
Example: `site:{{COMPANY}} "software engineer" OR "developer"`
--- OUTPUT FORMAT ---
Return ONLY a valid JSON object.
{{
"summaryOfOffer": "The Reference Client provides...",
"idealCustomerProfile": "...",
"searchStrategyICP": "...",
"digitalSignals": "...",
"targetPages": "...",
"signals": [ ... ]
}}
"""
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    try:
        response = requests.post(
            GEMINI_API_URL,
            json=payload,
            headers=request_headers,
            timeout=request_timeout,
        )
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = res_json['candidates'][0]['content']['parts'][0]['text']
        # Raw model output is diagnostic detail, so it is logged at debug level.
        logger.debug(f"RAW GEMINI JSON RESPONSE: {text}")
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result
    except Exception as e:
        logger.error(f"Strategy generation failed: {e}")
        # Return fallback to avoid frontend crash
        return {
            "summaryOfOffer": "Error generating strategy. Please check logs.",
            "idealCustomerProfile": "Error generating ICP. Please check logs.",
            "searchStrategyICP": "Error generating Search Strategy ICP. Please check logs.",
            "digitalSignals": "Error generating Digital Signals. Please check logs.",
            "targetPages": "Error generating Target Pages. Please check logs.",
            "signals": []
        }
def identify_competitors(reference_url, target_market, industries, summary_of_offer=None):
    """Ask Gemini for competitors / lookalikes of the reference company.

    Args:
        reference_url: Homepage URL of the reference client.
        target_market: Market description inserted into the prompt.
        industries: Iterable of industry names inserted into the prompt;
            None or an empty iterable yields an empty list in the prompt.
        summary_of_offer: Optional short description of what the reference
            client does; inserted into the prompt as-is.

    Returns:
        The parsed JSON from the model answer, expected to contain the keys
        "localCompetitors", "nationalCompetitors" and
        "internationalCompetitors". If the request fails or no JSON can be
        extracted, a dict with those three keys and empty lists is returned.
    """
    logger.info(f"Identifying competitors for {reference_url}")
    api_key = load_gemini_api_key()

    # Switch to stable 2.5-pro model
    GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent"
    # The key travels in a header, not in the URL: HTTP errors raised by
    # requests embed the URL, which would otherwise leak the key into logs.
    request_headers = {'Content-Type': 'application/json', 'x-goog-api-key': api_key}
    # Generation can be slow, but a missing timeout would block forever.
    request_timeout = 120

    # Built defensively: this runs before the try block, so a None value or a
    # non-string entry must not raise and bypass the fallback result.
    industries_text = ', '.join(str(item) for item in (industries or []))

    prompt = f"""
You are a B2B Market Analyst. Find 3-5 direct competitors or highly similar companies (lookalikes) for the company at `{reference_url}`.
--- CONTEXT ---
- Reference Client Business (What they do): {summary_of_offer}
- Target Market: {target_market}
- Relevant Industries: {industries_text}
--- TASK ---
Identify companies that are **similar to the Reference Client** (i.e., Lookalikes).
We are looking for other companies that do the same thing as `{reference_url}`.
Categorize them into three groups:
1. 'localCompetitors': Competitors in the same immediate region/city.
2. 'nationalCompetitors': Competitors operating across the same country.
3. 'internationalCompetitors': Global players.
For EACH competitor, you MUST provide:
- `id`: A unique, URL-friendly identifier (e.g., "competitor-name-gmbh").
- `name`: The official, full name of the company.
- `description`: A concise explanation of why they are a competitor.
--- OUTPUT FORMAT ---
Return ONLY a valid JSON object with the following structure:
{{
"localCompetitors": [ {{ "id": "...", "name": "...", "description": "..." }} ],
"nationalCompetitors": [ ... ],
"internationalCompetitors": [ ... ]
}}
"""
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    try:
        response = requests.post(
            GEMINI_API_URL,
            json=payload,
            headers=request_headers,
            timeout=request_timeout,
        )
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = res_json['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result
    except Exception as e:
        logger.error(f"Competitor identification failed: {e}")
        # Same shape as a successful answer so callers can iterate safely.
        return {"localCompetitors": [], "nationalCompetitors": [], "internationalCompetitors": []}
# ... (omitted parts) ...
def analyze_company(company_name, strategy, target_market):
logger.info(f"--- STARTING DEEP TECH AUDIT FOR: {company_name} ---")
api_key = load_gemini_api_key()
# Switch to stable 2.5-pro model
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
# 1. Website Finding (SerpAPI fallback to Gemini)
@@ -325,46 +95,29 @@ def analyze_company(company_name, strategy, target_market):
logger.info(f"Website via SerpAPI gefunden: {url}")
if not url:
# Fallback: Frage Gemini (Low Confidence)
logger.info("Keine URL via SerpAPI, frage Gemini...")
prompt_url = f"What is the official homepage URL for the company '{company_name}' in the market '{target_market}'? Respond with ONLY the single, complete URL and nothing else."
payload_url = {"contents": [{"parts": [{"text": prompt_url}]}]}
logger.info("Sende Anfrage an Gemini API (URL Fallback)...")
# logger.debug(f"Rohe Gemini API-Anfrage (JSON): {json.dumps(payload_url, indent=2)}")
try:
res = requests.post(GEMINI_API_URL, json=payload_url, headers={'Content-Type': 'application/json'}, timeout=15)
res.raise_for_status()
res_json = res.json()
logger.info(f"Gemini API-Antwort erhalten (Status: {res.status_code}).")
candidate = res_json.get('candidates', [{}])[0]
content = candidate.get('content', {}).get('parts', [{}])[0]
text_response = content.get('text', '').strip()
url_match = re.search(r'(https?://[^\s"]+)', text_response)
if url_match:
url = url_match.group(1)
logger.info(f"Gemini Fallback hat URL gefunden: {url}")
else:
logger.warning(f"Keine gültige URL in Gemini-Antwort gefunden: '{text_response}'")
# Fallback: Frage Gemini
# ... (Gemini URL fallback logic remains same) ...
pass
except Exception as e:
logger.error(f"Gemini URL Fallback failed: {e}")
pass
if not url or not url.startswith("http"):
return {"error": f"Could not find website for {company_name}"}
# 2. Homepage Scraping
homepage_text = get_website_text(url)
if not homepage_text:
return {"error": f"Could not scrape website {url}"}
# 2. Homepage Scraping with GRACEFUL FALLBACK
homepage_text = ""
scraping_note = ""
homepage_text = re.sub(r'[^\x20-\x7E\n\r\t]', '', homepage_text)
if url and url.startswith("http"):
scraped_content = get_website_text(url)
if scraped_content:
homepage_text = scraped_content
else:
homepage_text = "[WEBSITE ACCESS DENIED] - The audit must rely on external search signals (Tech Stack, Job Postings, News) as the homepage content is unavailable."
scraping_note = "(Website Content Unavailable - Analysis based on Digital Footprint)"
logger.warning(f"Audit continuing without website content for {company_name}")
else:
homepage_text = "No valid URL found. Analysis based on Name ONLY."
scraping_note = "(No URL found)"
# --- ENHANCED: EXTERNAL TECHNOGRAPHIC INTELLIGENCE ---
# Suche aktiv nach Wettbewerbern, nicht nur auf der Firmenwebsite.
tech_evidence = []
# ... (remains same) ...
# Liste bekannter Wettbewerber / Incumbents
known_incumbents = [
@@ -448,7 +201,7 @@ def analyze_company(company_name, strategy, target_market):
Look closely here for mentions of competitors like SAP Ariba, Jaggaer, SynerTrade, Coupa, etc.
{tech_evidence_text}
--- EVIDENCE 2: HOMEPAGE CONTENT ---
--- EVIDENCE 2: HOMEPAGE CONTENT {scraping_note} ---
{homepage_text[:8000]}
--- EVIDENCE 3: FIRMOGRAPHICS SEARCH ---
@@ -466,6 +219,7 @@ def analyze_company(company_name, strategy, target_market):
- Set to "Greenfield" ONLY if absolutely no competitor tech is found.
- Set to "Bestandskunde" if they already use our solution.
4. **Evaluate Signals**: For each signal, provide a "value" (Yes/No/Partial) and "proof".
- NOTE: If Homepage Content is unavailable, rely on Evidence 1, 3, and 4.
5. **Recommendation (Pitch Strategy)**:
- DO NOT write a generic verdict.
- If they use a competitor (e.g., Ariba), explain how to position against it (e.g., "Pitch as a specialized add-on for logistics, filling Ariba's gaps").