# Brancheneinstufung2/company-explorer/backend/scripts/analyze_job_title_patterns.py

import sys
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from collections import Counter
import re

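# Add the directory two levels above this script (the parent of the `backend`
# package) to sys.path so that the `backend.*` imports below resolve.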
sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))

from backend.config import settings
from backend.database import Contact, JobRolePattern

def clean_text(text):
    """Normalize a job title for token-based frequency analysis.

    Every character that is not a letter, digit or whitespace is replaced by
    a space; the result is lower-cased and stripped of leading and trailing
    whitespace. Interior runs of spaces are left as-is (callers are expected
    to tokenize with ``str.split()``).

    Args:
        text: Raw title string. ``None`` and ``""`` are accepted.

    Returns:
        The normalized string, or ``""`` for falsy input.
    """
    if not text:
        return ""
    # ``\w`` also matches "_", so the underscore is listed explicitly;
    # otherwise "head_of_sales" would survive as one opaque token.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    return text.lower().strip()

def get_ngrams(tokens, n):
    """Return all runs of ``n`` consecutive tokens, each joined by a space.

    Args:
        tokens: Sequence of string tokens.
        n: Number of tokens per n-gram.

    Returns:
        A list of n-gram strings in order of appearance; an empty list when
        ``tokens`` holds fewer than ``n`` items.
    """
    # Number of sliding windows; zero or negative when the input is too
    # short, in which case the loop below simply does not run.
    window_count = len(tokens) - n + 1
    ngrams = []
    for start in range(window_count):
        window = tokens[start:start + n]
        ngrams.append(" ".join(window))
    return ngrams

def _print_role_report(role, titles):
    """Print keyword and bigram frequency statistics for one role's titles."""
    print(f"--- ROLE: {role} ({len(titles)} samples) ---")

    word_counts = Counter()
    bigram_counts = Counter()
    for title in titles:
        # Words of one or two characters are ignored as noise.
        tokens = [w for w in clean_text(title).split() if len(w) > 2]
        word_counts.update(tokens)
        bigram_counts.update(get_ngrams(tokens, 2))

    common_words = word_counts.most_common(15)
    common_bigrams = bigram_counts.most_common(10)

    print("Top Keywords:")
    for word, count in common_words:
        print(f"  - {word}: {count}")

    print("\nTop Bigrams (Word Pairs):")
    for bg, count in common_bigrams:
        print(f"  - \"{bg}\": {count}")

    print("\nSuggested Regex Components:")
    # The five most frequent keywords, joined as a regex alternation.
    top_5_words = [word for word, _ in common_words[:5]]
    print(f"  ({ '|'.join(top_5_words) })")
    print("\n" + "-"*30 + "\n")


def analyze_patterns():
    """Print a per-role frequency report of job-title keywords and bigrams.

    Reads every contact that has both a role and a job title from the
    database configured in ``settings.DATABASE_URL``, groups the titles by
    role, and prints the most common words, word pairs and a suggested regex
    alternation for each role.

    Errors raised while querying or reporting are printed rather than
    propagated; the session and engine are always released.
    """
    engine = create_engine(settings.DATABASE_URL)
    # Print the engine URL's repr rather than the raw setting: SQLAlchemy
    # masks the password there, so credentials do not end up in console logs.
    print(f"Connecting to database: {engine.url!r}")
    Session = sessionmaker(bind=engine)
    session = Session()

    try:
        # Only the two columns used below are fetched, instead of full
        # Contact entities. ``isnot(None)`` renders as ``IS NOT NULL``.
        rows = (
            session.query(Contact.role, Contact.job_title)
            .filter(Contact.role.isnot(None), Contact.job_title.isnot(None))
            .all()
        )
        print(f"Found {len(rows)} classified contacts to analyze.")

        role_groups = {}
        for role, job_title in rows:
            role_groups.setdefault(role, []).append(job_title)

        print("\n" + "="*60)
        print(" JOB TITLE PATTERN ANALYSIS REPORT")
        print("="*60 + "\n")

        for role, titles in role_groups.items():
            _print_role_report(role, titles)

    except Exception as e:
        # Top-level boundary of a reporting script: report and exit cleanly.
        print(f"Error: {e}")
    finally:
        session.close()
        # Release pooled connections held by the engine created above.
        engine.dispose()

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    analyze_patterns()