Files
Brancheneinstufung2/company-explorer/backend/scripts/analyze_job_title_patterns.py

83 lines
2.6 KiB
Python

import sys
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from collections import Counter
import re
# Add backend to path to import models
sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
from backend.config import settings
from backend.database import Contact, JobRolePattern
def clean_text(text):
    """Normalize a job title for tokenization.

    Every character that is not a letter, digit, or whitespace is replaced
    by a single space, then the result is lower-cased and stripped of
    leading/trailing whitespace. Interior whitespace is left as-is, so
    callers are expected to ``split()`` the result.

    Args:
        text: Raw job title; may be ``None`` or empty.

    Returns:
        The normalized string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    # ``\w`` also matches "_", so the underscore is listed explicitly:
    # otherwise "head_of_sales" would survive as one opaque token instead of
    # being split into words like any other punctuation-separated title.
    text = re.sub(r"[^\w\s]|_", " ", text)
    return text.lower().strip()
def get_ngrams(tokens, n):
    """Return all contiguous n-grams of *tokens* as space-joined strings.

    Args:
        tokens: List of word strings, in order.
        n: Size of each n-gram.

    Returns:
        A list with ``len(tokens) - n + 1`` strings, in input order. The
        list is empty when *n* is smaller than 1 or larger than the number
        of tokens.
    """
    # A non-positive n has no meaningful n-grams; without this guard n == 0
    # would yield a run of empty strings and negative n arbitrary slices.
    if n < 1 or len(tokens) < n:
        return []
    # zip over n progressively shifted views stops at the shortest one,
    # which produces exactly the sliding windows of width n.
    shifted = [tokens[offset:] for offset in range(n)]
    return [" ".join(window) for window in zip(*shifted)]
def _print_role_report(role, titles):
    """Print keyword and bigram frequency statistics for a single role.

    Args:
        role: The role value shared by all *titles*.
        titles: Job-title strings of the contacts classified with *role*.
    """
    print(f"--- ROLE: {role} ({len(titles)} samples) ---")

    word_counts = Counter()
    bigram_counts = Counter()
    for title in titles:
        # Tokens of one or two characters are dropped before counting, so
        # bigrams are built from the remaining (longer) words only.
        tokens = [w for w in clean_text(title).split() if len(w) > 2]
        word_counts.update(tokens)
        bigram_counts.update(get_ngrams(tokens, 2))

    common_words = word_counts.most_common(15)
    common_bigrams = bigram_counts.most_common(10)

    print("Top Keywords:")
    for word, count in common_words:
        print(f" - {word}: {count}")

    print("\nTop Bigrams (Word Pairs):")
    for bigram, count in common_bigrams:
        print(f' - "{bigram}": {count}')

    print("\nSuggested Regex Components:")
    top_5_words = [word for word, _ in common_words[:5]]
    print(f" ({'|'.join(top_5_words)})")
    print("\n" + "-" * 30 + "\n")


def analyze_patterns():
    """Print a per-role frequency report of words and bigrams in job titles.

    Reads every contact that has both a role and a job title from the
    database configured in ``settings.DATABASE_URL``, groups the job titles
    by role, and prints the most common keywords, the most common word
    pairs, and an alternation of the top five keywords for each role.

    Any exception raised while querying or reporting is caught and reported
    on stderr; the session is closed and the engine disposed in all cases.
    """
    engine = create_engine(settings.DATABASE_URL)
    # The configured URL may embed credentials; print it with the password
    # masked so the report can be pasted into logs or tickets safely.
    safe_url = engine.url.render_as_string(hide_password=True)
    print(f"Connecting to database: {safe_url}")

    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # Only the two columns used below are selected, which avoids
        # materializing a full ORM entity per contact.
        rows = (
            session.query(Contact.role, Contact.job_title)
            .filter(Contact.role.isnot(None), Contact.job_title.isnot(None))
            .all()
        )
        print(f"Found {len(rows)} classified contacts to analyze.")

        # Group titles by role; dict insertion order keeps roles in the
        # order they are first encountered in the result set.
        role_groups = {}
        for role, job_title in rows:
            role_groups.setdefault(role, []).append(job_title)

        print("\n" + "=" * 60)
        print(" JOB TITLE PATTERN ANALYSIS REPORT")
        print("=" * 60 + "\n")

        for role, titles in role_groups.items():
            _print_role_report(role, titles)
    except Exception as e:
        # Top-level boundary of a reporting script: report the failure
        # without a traceback, but on stderr so it is not mixed into the
        # report output.
        print(f"Error: {e}", file=sys.stderr)
    finally:
        session.close()
        engine.dispose()
# Script entry point: generate the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    analyze_patterns()