[31388f42] Implement hierarchical search strategy for more robust role discovery

This commit is contained in:
2026-03-02 09:52:51 +00:00
parent 3be69a3019
commit d08899a7a9

View File

@@ -48,15 +48,17 @@ def extract_role_with_llm(name, company, search_results):
{context} {context}
TASK: TASK:
Extract the exact Job Title / Role. Look for terms like "Geschäftsführer", "CEO", "CFO", "Leiter", "Head of", "Manager", "Inhaber", "Arzt". Extract the professional Job Title / Role.
Look for:
- Management: "Geschäftsführer", "Vorstand", "CFO", "Mitglied der Klinikleitung"
- Department Heads: "Leiter", "Bereichsleitung", "Head of", "Pflegedienstleitung"
- Specialized: "Arzt", "Ingenieur", "Einkäufer"
RULES: RULES:
1. If multiple roles appear (e.g. "CFO & CEO"), pick the most senior one current role. 1. Extract the most specific and senior current role.
2. Return ONLY the role string. No full sentences. 2. Return ONLY the role string (e.g. "Bereichsleitung Patientenmanagement").
3. If absolutely no role is mentioned in the snippets, return "Unbekannt". 3. Maximum length: 60 characters.
4. If no role is found, return "Unbekannt".
Example Input: "Georg Stahl ... CFO at KLEMM..."
Example Output: CFO
""" """
url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}" url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
@@ -64,8 +66,8 @@ def extract_role_with_llm(name, company, search_results):
response = requests.post(url, headers={'Content-Type': 'application/json'}, json={"contents": [{"parts": [{"text": prompt}]}]}) response = requests.post(url, headers={'Content-Type': 'application/json'}, json={"contents": [{"parts": [{"text": prompt}]}]})
if response.status_code == 200: if response.status_code == 200:
role = response.json()['candidates'][0]['content']['parts'][0]['text'].strip() role = response.json()['candidates'][0]['content']['parts'][0]['text'].strip()
# Cleanup: remove punctuation at the end # Remove markdown formatting if any
role = role.rstrip('.') role = role.replace('**', '').replace('"', '').rstrip('.')
return None if "Unbekannt" in role else role return None if "Unbekannt" in role else role
else: else:
print(f"DEBUG: Gemini API Error {response.status_code}: {response.text}") print(f"DEBUG: Gemini API Error {response.status_code}: {response.text}")
@@ -76,40 +78,52 @@ def extract_role_with_llm(name, company, search_results):
def lookup_person_role(name, company):
    """
    Search for a person's role via SerpAPI and extract it using the LLM.

    Runs an ordered list of Google queries through SerpAPI, from the most
    specific to the broadest, collecting organic results until at least
    three have been gathered. The collected results are then handed to
    ``extract_role_with_llm``.

    Args:
        name: Full name of the person to look up.
        company: Name of the company the person is associated with.

    Returns:
        Whatever ``extract_role_with_llm`` returns for the collected
        results, or ``None`` when the SerpAPI key is missing or no query
        produced any organic results.
    """
    if not SERP_API_KEY:
        print("Error: SERP_API key not found in .env")
        return None

    # Without a timeout, requests waits forever on a stalled connection.
    request_timeout_seconds = 15
    # Stop issuing further (paid) searches once this many results exist.
    enough_results = 3

    # Ordered from most specific to broadest; later queries act as
    # fallbacks when the earlier ones return too little.
    queries = [
        f'site:linkedin.com "{name}" "{company}"',
        f'"{name}" "{company}" position',
        f'{name} {company}',
    ]

    # Parameters shared by every query; only "q" changes per request.
    base_params = {
        "engine": "google",
        "api_key": SERP_API_KEY,
        "num": 3,
        "hl": "de",
        "gl": "de",
    }

    all_results = []
    for query in queries:
        try:
            response = requests.get(
                "https://serpapi.com/search",
                params={**base_params, "q": query},
                timeout=request_timeout_seconds,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Best effort: a failed query must not abort the remaining
            # fallbacks. ValueError covers an undecodable JSON body.
            print(f"SerpAPI lookup failed for query '{query}': {e}")
            continue

        results = data.get("organic_results") if isinstance(data, dict) else None
        if results:
            all_results.extend(results)

        # If we have good results, we don't necessarily need more searches
        if len(all_results) >= enough_results:
            break

    if not all_results:
        return None

    # Delegate extraction to LLM with the best results found
    return extract_role_with_llm(name, company, all_results)
if __name__ == "__main__":
    # Manual smoke test: look up a few known (person, company) pairs and
    # print whatever role the pipeline resolves for each of them.
    sample_lookups = (
        ('Markus Drees', 'Ärztehaus Rünthe'),
        ('Georg Stahl', 'Klemm Bohrtechnik GmbH'),
        ('Steve Trüby', 'RehaKlinikum Bad Säckingen GmbH'),
    )
    for person, employer in sample_lookups:
        resolved_role = lookup_person_role(person, employer)
        print(f"{person}: {resolved_role}")