# Brancheneinstufung2/lead-engine/lookup_role.py

import os
import requests
import re
from dotenv import load_dotenv

# Local development keeps settings in a .env file one directory above this
# script. When that file is absent (e.g. inside Docker) the variables already
# present in the process environment are used instead.
env_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, ".env")
)
if os.path.exists(env_path):
    # override=True: values from the file win over pre-existing variables.
    load_dotenv(dotenv_path=env_path, override=True)

SERP_API_KEY = os.environ.get("SERP_API")

if not SERP_API_KEY:
    print("DEBUG: SERP_API not found in environment.")

import json

# --- Helper: Get Gemini Key ---
def get_gemini_key():
    """Return the Gemini API key, or None when no source provides one.

    Key files are probed in a fixed order: the current working directory,
    the Docker default location under ``/app``, this script's directory and
    its parent directory. The first file that can be read and contains a
    non-empty key (after stripping whitespace) wins. If no file yields a
    key, the ``GEMINI_API_KEY`` environment variable is returned, which is
    ``None`` when unset.
    """
    script_dir = os.path.dirname(__file__)
    candidates = [
        "gemini_api_key.txt",                                        # Current dir
        "/app/gemini_api_key.txt",                                   # Docker default
        os.path.join(script_dir, "gemini_api_key.txt"),              # Script dir
        os.path.join(os.path.dirname(script_dir), "gemini_api_key.txt"),  # Parent dir
    ]

    for path in candidates:
        # Just try to open the file: a missing, unreadable or undecodable
        # candidate is skipped instead of being checked with exists() first.
        try:
            with open(path, "r", encoding="utf-8") as f:
                key = f.read().strip()
        except (OSError, UnicodeDecodeError):
            continue
        # An empty or whitespace-only file must not shadow later sources.
        if key:
            return key

    return os.getenv("GEMINI_API_KEY")

def extract_role_with_llm(name, company, search_results, timeout=30):
    """Uses Gemini to identify the job title from search snippets.

    Args:
        name: Person's name; interpolated into the prompt.
        company: Company name; interpolated into the prompt.
        search_results: Iterable of dict-like search hits. Only the
            ``title`` and ``snippet`` keys are read from each entry.
        timeout: Seconds to wait for the Gemini HTTP request before giving
            up, so a stalled connection cannot block the caller forever.

    Returns:
        The cleaned role string, or ``None`` when no API key is available,
        the request fails, the answer is empty, or the model answered
        "Unbekannt". This function does not raise; failures are printed.
    """
    api_key = get_gemini_key()
    if not api_key:
        return None

    # Missing title/snippet values are rendered as empty text rather than
    # the literal string "None", which would only add noise to the prompt.
    context = "\n".join(
        f"- {r.get('title') or ''}: {r.get('snippet') or ''}" for r in search_results
    )

    prompt = f"""
    Analyze these Google Search results to identify the professional role of "{name}" at "{company}".

    SEARCH RESULTS:
    {context}

    TASK:
    Extract the professional Job Title / Role.
    Look for:
    - Management: "Geschäftsführer", "Vorstand", "CFO", "Mitglied der Klinikleitung"
    - Department Heads: "Leiter", "Bereichsleitung", "Head of", "Pflegedienstleitung"
    - Specialized: "Arzt", "Ingenieur", "Einkäufer"

    RULES:
    1. Extract the most specific and senior current role.
    2. Return ONLY the role string (e.g. "Bereichsleitung Patientenmanagement").
    3. Maximum length: 60 characters.
    4. If no role is found, return "Unbekannt".
    """

    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
    # The key travels in a header instead of the query string so that it does
    # not show up in the URL that requests embeds in exception messages,
    # which are printed below.
    headers = {
        'Content-Type': 'application/json',
        'x-goog-api-key': api_key,
    }
    payload = {"contents": [{"parts": [{"text": prompt}]}]}

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        if response.status_code != 200:
            print(f"DEBUG: Gemini API Error {response.status_code}: {response.text}")
            return None

        role = response.json()['candidates'][0]['content']['parts'][0]['text'].strip()
        # Remove markdown formatting if any
        role = role.replace('**', '').replace('"', '').rstrip('.').strip()
        # An empty answer is treated like "Unbekannt": no role found.
        if not role or "Unbekannt" in role:
            return None
        return role
    except Exception as e:
        # Best-effort boundary: network errors and unexpected response shapes
        # are reported and turned into "no role" instead of propagating.
        print(f"DEBUG: Gemini API Exception: {e}")
    return None

def lookup_person_role(name, company, timeout=30):
    """
    Searches for a person's role via SerpAPI and extracts it using LLM.
    Uses a multi-step search strategy to find the best snippets.

    Args:
        name: Person's name to search for.
        company: Company the person is associated with.
        timeout: Seconds to wait for each SerpAPI request, so a stalled
            connection cannot block the caller forever.

    Returns:
        Whatever ``extract_role_with_llm`` returns for the collected organic
        results, or ``None`` when the SerpAPI key is missing or no query
        produced any organic result. Failed queries are printed and skipped.
    """
    if not SERP_API_KEY:
        print("Error: SERP_API key not found in .env")
        return None

    # Ordered from most to least specific; later queries are only issued
    # while fewer than three results have been collected.
    queries = [
        f'site:linkedin.com "{name}" "{company}"',
        f'"{name}" "{company}" position',
        f'{name} {company}'
    ]

    all_results = []
    for query in queries:
        params = {
            "engine": "google",
            "q": query,
            "api_key": SERP_API_KEY,
            "num": 3,
            "hl": "de",
            "gl": "de"
        }

        try:
            response = requests.get(
                "https://serpapi.com/search", params=params, timeout=timeout
            )
            response.raise_for_status()
            data = response.json()

            results = data.get("organic_results", [])
            if results:
                all_results.extend(results)
                # If we have good results, we don't necessarily need more searches
                if len(all_results) >= 3:
                    break
        except Exception as e:
            # The request URL (including api_key) can appear in the exception
            # text; mask the key so it is not written to logs.
            safe_message = str(e).replace(SERP_API_KEY, "***")
            print(f"SerpAPI lookup failed for query '{query}': {safe_message}")

    if not all_results:
        return None

    # Delegate extraction to LLM with the best results found
    return extract_role_with_llm(name, company, all_results)

if __name__ == "__main__":
    # Manual smoke test: look up a few known (person, company) pairs and
    # print whatever role the pipeline returns for each.
    sample_lookups = (
        ("Markus Drees", "Ärztehaus Rünthe"),
        ("Georg Stahl", "Klemm Bohrtechnik GmbH"),
        ("Steve Trüby", "RehaKlinikum Bad Säckingen GmbH"),
    )
    for person, employer in sample_lookups:
        print(f"{person}: {lookup_person_role(person, employer)}")