[31388f42] Implement hierarchical search strategy for more robust role discovery

2026-03-02 09:52:51 +00:00
parent b8597e068c
commit c753c2feab
1 changed file with 46 additions and 32 deletions
--- a/lead-engine/lookup_role.py
+++ b/lead-engine/lookup_role.py
@@ -48,15 +48,17 @@ def extract_role_with_llm(name, company, search_results):
    {context}
    
    TASK:
-    Extract the exact Job Title / Role. Look for terms like "Geschäftsführer", "CEO", "CFO", "Leiter", "Head of", "Manager", "Inhaber", "Arzt".
+    Extract the professional Job Title / Role. 
+    Look for: 
+    - Management: "Geschäftsführer", "Vorstand", "CFO", "Mitglied der Klinikleitung"
+    - Department Heads: "Leiter", "Bereichsleitung", "Head of", "Pflegedienstleitung"
+    - Specialized: "Arzt", "Ingenieur", "Einkäufer"
    
    RULES:
-    1. If multiple roles appear (e.g. "CFO & CEO"), pick the most senior one current role.
-    2. Return ONLY the role string. No full sentences.
-    3. If absolutely no role is mentioned in the snippets, return "Unbekannt".
-    
-    Example Input: "Georg Stahl ... CFO at KLEMM..."
-    Example Output: CFO
+    1. Extract the most specific and senior current role.
+    2. Return ONLY the role string (e.g. "Bereichsleitung Patientenmanagement").
+    3. Maximum length: 60 characters.
+    4. If no role is found, return "Unbekannt".
    """
    
    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
@@ -64,8 +66,8 @@ def extract_role_with_llm(name, company, search_results):
        response = requests.post(url, headers={'Content-Type': 'application/json'}, json={"contents": [{"parts": [{"text": prompt}]}]})
        if response.status_code == 200:
            role = response.json()['candidates'][0]['content']['parts'][0]['text'].strip()
-            # Cleanup: remove punctuation at the end
-            role = role.rstrip('.')
+            # Remove markdown formatting if any
+            role = role.replace('**', '').replace('"', '').rstrip('.')
            return None if "Unbekannt" in role else role
        else:
            print(f"DEBUG: Gemini API Error {response.status_code}: {response.text}")
@@ -76,40 +78,52 @@ def extract_role_with_llm(name, company, search_results):
 def lookup_person_role(name, company):
    """
    Searches for a person's role via SerpAPI and extracts it using LLM.
+    Uses a multi-step search strategy to find the best snippets.
    """
    if not SERP_API_KEY:
        print("Error: SERP_API key not found in .env")
        return None

-    # Broad query to find role/position
-    query = f'{name} {company} Position Job'
+    # Step 1: Highly specific search
+    queries = [
+        f'site:linkedin.com "{name}" "{company}"',
+        f'"{name}" "{company}" position',
+        f'{name} {company}'
+    ]
    
-    params = {
-        "engine": "google",
-        "q": query,
-        "api_key": SERP_API_KEY,
-        "num": 5,
-        "hl": "de", # Force German UI
-        "gl": "de"  # Force German Location
-    }
+    all_results = []
+    for query in queries:
+        params = {
+            "engine": "google",
+            "q": query,
+            "api_key": SERP_API_KEY,
+            "num": 3,
+            "hl": "de",
+            "gl": "de"
+        }

-    try:
-        response = requests.get("https://serpapi.com/search", params=params)
-        response.raise_for_status()
-        data = response.json()
-        
-        organic_results = data.get("organic_results", [])
-        if not organic_results:
-            return None
+        try:
+            response = requests.get("https://serpapi.com/search", params=params)
+            response.raise_for_status()
+            data = response.json()
+            
+            results = data.get("organic_results", [])
+            if results:
+                all_results.extend(results)
+                # If we have good results, we don't necessarily need more searches
+                if len(all_results) >= 3:
+                    break
+        except Exception as e:
+            print(f"SerpAPI lookup failed for query '{query}': {e}")

-        # Delegate extraction to LLM
-        return extract_role_with_llm(name, company, organic_results)
-
-    except Exception as e:
-        print(f"SerpAPI lookup failed: {e}")
+    if not all_results:
        return None

+    # Delegate extraction to LLM with the best results found
+    return extract_role_with_llm(name, company, all_results)
+
 if __name__ == "__main__":
    # Test cases
    print(f"Markus Drees: {lookup_person_role('Markus Drees', 'Ärztehaus Rünthe')}")
    print(f"Georg Stahl: {lookup_person_role('Georg Stahl', 'Klemm Bohrtechnik GmbH')}")
+    print(f"Steve Trüby: {lookup_person_role('Steve Trüby', 'RehaKlinikum Bad Säckingen GmbH')}")