[31388f42] Implement hierarchical search strategy for more robust role discovery

This commit is contained in:
2026-03-02 09:52:51 +00:00
parent b8597e068c
commit c753c2feab

View File

@@ -48,15 +48,17 @@ def extract_role_with_llm(name, company, search_results):
{context}
TASK:
Extract the exact Job Title / Role. Look for terms like "Geschäftsführer", "CEO", "CFO", "Leiter", "Head of", "Manager", "Inhaber", "Arzt".
Extract the professional Job Title / Role.
Look for:
- Management: "Geschäftsführer", "Vorstand", "CFO", "Mitglied der Klinikleitung"
- Department Heads: "Leiter", "Bereichsleitung", "Head of", "Pflegedienstleitung"
- Specialized: "Arzt", "Ingenieur", "Einkäufer"
RULES:
1. If multiple roles appear (e.g. "CFO & CEO"), pick the most senior one current role.
2. Return ONLY the role string. No full sentences.
3. If absolutely no role is mentioned in the snippets, return "Unbekannt".
Example Input: "Georg Stahl ... CFO at KLEMM..."
Example Output: CFO
1. Extract the most specific and senior current role.
2. Return ONLY the role string (e.g. "Bereichsleitung Patientenmanagement").
3. Maximum length: 60 characters.
4. If no role is found, return "Unbekannt".
"""
url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
@@ -64,8 +66,8 @@ def extract_role_with_llm(name, company, search_results):
response = requests.post(url, headers={'Content-Type': 'application/json'}, json={"contents": [{"parts": [{"text": prompt}]}]})
if response.status_code == 200:
role = response.json()['candidates'][0]['content']['parts'][0]['text'].strip()
# Cleanup: remove punctuation at the end
role = role.rstrip('.')
# Remove markdown formatting if any
role = role.replace('**', '').replace('"', '').rstrip('.')
return None if "Unbekannt" in role else role
else:
print(f"DEBUG: Gemini API Error {response.status_code}: {response.text}")
@@ -76,40 +78,52 @@ def extract_role_with_llm(name, company, search_results):
# Upper bound (seconds) for a single SerpAPI request. Without an explicit
# timeout, requests waits indefinitely on a stalled connection.
_SERP_TIMEOUT_SECONDS = 15


def lookup_person_role(name, company):
    """
    Search for a person's role via SerpAPI and extract it using the LLM.

    Runs a sequence of Google queries ordered from most specific to
    broadest and collects the organic results. As soon as at least three
    results have been gathered, the remaining (broader) queries are
    skipped. A failing query is logged and does not abort the fallbacks.

    Args:
        name: Full name of the person to look up.
        company: Company the person is associated with.

    Returns:
        Whatever ``extract_role_with_llm`` returns for the collected
        results, or ``None`` when the SerpAPI key is missing or no query
        produced any organic result.
    """
    if not SERP_API_KEY:
        print("Error: SERP_API key not found in .env")
        return None

    # Ordered fallback chain: LinkedIn-only exact match first, then an
    # exact-phrase web search, and finally an unquoted broad search.
    queries = [
        f'site:linkedin.com "{name}" "{company}"',
        f'"{name}" "{company}" position',
        f'{name} {company}',
    ]

    # Everything except the query text is identical for each request.
    base_params = {
        "engine": "google",
        "api_key": SERP_API_KEY,
        "num": 3,
        "hl": "de",
        "gl": "de",
    }

    all_results = []
    for query in queries:
        try:
            response = requests.get(
                "https://serpapi.com/search",
                params={**base_params, "q": query},
                timeout=_SERP_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Network/HTTP errors and undecodable JSON bodies: best effort,
            # move on to the next (broader) query.
            print(f"SerpAPI lookup failed for query '{query}': {e}")
            continue

        results = data.get("organic_results") if isinstance(data, dict) else None
        if results:
            all_results.extend(results)

        # If we have good results, we don't necessarily need more searches
        if len(all_results) >= 3:
            break

    if not all_results:
        return None

    # Delegate extraction to LLM with the best results found
    return extract_role_with_llm(name, company, all_results)
if __name__ == "__main__":
    # Manual smoke test: look up a few known (person, company) pairs and
    # print whatever role the lookup returns for each.
    sample_contacts = (
        ("Markus Drees", "Ärztehaus Rünthe"),
        ("Georg Stahl", "Klemm Bohrtechnik GmbH"),
        ("Steve Trüby", "RehaKlinikum Bad Säckingen GmbH"),
    )
    for person, employer in sample_contacts:
        print(f"{person}: {lookup_person_role(person, employer)}")