Docs: Aktualisierung der Dokumentation für Task [2ea88f42]
This commit is contained in:
@@ -0,0 +1,82 @@
|
||||
import sys
|
||||
import os
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from collections import Counter
|
||||
import re
|
||||
|
||||
# Add the directory two levels above this file to sys.path so that the
# ``backend`` package can be imported when this file is run as a script.
# The path is normalized to an absolute one so the entry does not depend
# on how the script was invoked or on later changes of the working dir.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
)
|
||||
|
||||
from backend.config import settings
|
||||
from backend.database import Contact, JobRolePattern
|
||||
|
||||
def clean_text(text):
    """Normalize a job title for tokenization.

    Every character that is not a letter, digit or whitespace is replaced
    by a single space; the result is lower-cased and stripped of leading
    and trailing whitespace. Inner whitespace is not collapsed.

    Args:
        text: Raw title string; ``None`` or an empty string is accepted.

    Returns:
        The normalized string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    # ``\w`` also matches "_", so the underscore is listed explicitly:
    # only alphanumerics and whitespace are meant to survive.
    text = re.sub(r"[^\w\s]|_", " ", text)
    return text.lower().strip()
|
||||
|
||||
def get_ngrams(tokens, n):
    """Return the contiguous n-grams of *tokens* as space-joined strings.

    Args:
        tokens: Ordered sequence of string tokens.
        n: Number of tokens per n-gram.

    Returns:
        A list of n-grams in order of appearance. The list is empty when
        *n* is not positive or when there are fewer than *n* tokens.
    """
    # A non-positive n would otherwise produce empty or truncated slices
    # instead of real n-grams, so it is rejected up front.
    if n < 1 or len(tokens) < n:
        return []
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
|
||||
|
||||
def _group_titles_by_role(contacts):
    """Map each role to the list of job titles recorded for it."""
    role_groups = {}
    for contact in contacts:
        role_groups.setdefault(contact.role, []).append(contact.job_title)
    return role_groups


def _print_role_report(role, titles):
    """Print top keywords, top bigrams and a regex suggestion for one role."""
    print(f"--- ROLE: {role} ({len(titles)} samples) ---")

    all_tokens = []
    all_bigrams = []
    for title in titles:
        # Words of one or two characters are ignored.
        tokens = [w for w in clean_text(title).split() if len(w) > 2]
        all_tokens.extend(tokens)
        # Bigrams are built per title, from the already filtered tokens,
        # so a pair never spans two different titles.
        all_bigrams.extend(get_ngrams(tokens, 2))

    common_words = Counter(all_tokens).most_common(15)
    common_bigrams = Counter(all_bigrams).most_common(10)

    print("Top Keywords:")
    for word, count in common_words:
        print(f" - {word}: {count}")

    print("\nTop Bigrams (Word Pairs):")
    for bigram, count in common_bigrams:
        print(f' - "{bigram}": {count}')

    print("\nSuggested Regex Components:")
    alternation = "|".join(word for word, _ in common_words[:5])
    print(f" ({alternation})")
    print("\n" + "-" * 30 + "\n")


def analyze_patterns():
    """Print a per-role frequency report of words found in job titles.

    Connects to the database named by ``settings.DATABASE_URL``, loads
    every ``Contact`` that has both a role and a job title, groups the
    titles by role and prints, for each role, the 15 most common words,
    the 10 most common word pairs and an alternation of the top 5 words.

    Errors raised while querying or reporting are printed, not re-raised.
    The session is closed and the engine disposed in every case.
    """
    engine = create_engine(settings.DATABASE_URL)
    # The URL may embed credentials; the repr of the engine URL masks the
    # password, unlike the raw settings string.
    print(f"Connecting to database: {engine.url!r}")
    session = sessionmaker(bind=engine)()

    try:
        # "!= None" is intentional: SQLAlchemy overloads the operator to
        # emit "IS NOT NULL"; a Python "is not None" would not reach SQL.
        contacts = (
            session.query(Contact)
            .filter(Contact.role != None, Contact.job_title != None)  # noqa: E711
            .all()
        )
        print(f"Found {len(contacts)} classified contacts to analyze.")

        role_groups = _group_titles_by_role(contacts)

        print("\n" + "=" * 60)
        print(" JOB TITLE PATTERN ANALYSIS REPORT")
        print("=" * 60 + "\n")

        for role, titles in role_groups.items():
            _print_role_report(role, titles)

    except Exception as e:
        # Top-level boundary of a reporting script: report and finish.
        print(f"Error: {e}")
    finally:
        session.close()
        # Release pooled connections held by the engine created above.
        engine.dispose()
|
||||
|
||||
# Generate the report only when this file is executed as a script, so
# that importing it does not open a database connection.
if __name__ == "__main__":
    analyze_patterns()
|
||||
Reference in New Issue
Block a user