Docs: Aktualisierung der Dokumentation für Task [2ea88f42]

2026-03-04 15:14:11 +00:00
parent 6b89c68edc
commit fdca0e5f54
6 changed files with 858 additions and 17 deletions
--- a/company-explorer/backend/scripts/analyze_job_title_patterns.py
+++ b/company-explorer/backend/scripts/analyze_job_title_patterns.py
@@ -0,0 +1,82 @@
+import sys
+import os
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+from collections import Counter
+import re
+
+# Append the directory two levels above this script's folder to sys.path so the `backend.*` imports below resolve when the file is run directly
+sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
+
+from backend.config import settings
+from backend.database import Contact, JobRolePattern
+
+def clean_text(text):  # Normalize a title: punctuation becomes spaces, result is lower-cased with outer whitespace stripped
+    if not text: return ""  # falsy input (None, "") yields an empty string
+    # Replace every character that is neither a word character (\w: letters, digits, underscore) nor whitespace with a space
+    text = re.sub(r'[^\w\s]', ' ', text)
+    return text.lower().strip()
+
+def get_ngrams(tokens, n):  # Return every run of n consecutive tokens, each joined with single spaces, in input order
+    if len(tokens) < n:  # fewer than n tokens: no n-gram can be formed
+        return []
+    return [" ".join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]  # one n-gram per start index i
+
+def analyze_patterns():  # Print, per role, the most frequent words and bigrams found in contact job titles read from the database
+    print(f"Connecting to database: {settings.DATABASE_URL}")  # NOTE(review): prints the full URL; if it embeds credentials they reach the console - confirm this is acceptable
+    engine = create_engine(settings.DATABASE_URL)
+    Session = sessionmaker(bind=engine)
+    session = Session()
+
+    try:
+        # Fetch every contact whose role and job_title are both set (`!= None` builds a query filter, presumably rendered as IS NOT NULL - do not rewrite as `is not None`)
+        contacts = session.query(Contact).filter(Contact.role != None, Contact.job_title != None).all()
+        print(f"Found {len(contacts)} classified contacts to analyze.")
+
+        role_groups = {}  # role -> list of job_title values
+        for c in contacts:
+            if c.role not in role_groups:
+                role_groups[c.role] = []
+            role_groups[c.role].append(c.job_title)
+
+        print("\n" + "="*60)
+        print(" JOB TITLE PATTERN ANALYSIS REPORT")
+        print("="*60 + "\n")
+
+        for role, titles in role_groups.items():
+            print(f"--- ROLE: {role} ({len(titles)} samples) ---")
+            
+            # Collect the tokens and bigrams of all titles belonging to this role
+            all_tokens = []
+            all_bigrams = []
+            
+            for t in titles:
+                cleaned = clean_text(t)
+                tokens = [w for w in cleaned.split() if len(w) > 2] # Drop words of one or two characters
+                all_tokens.extend(tokens)
+                all_bigrams.extend(get_ngrams(tokens, 2))  # built from the filtered tokens, so a pair may span a dropped short word
+
+            # Frequency ranking: the 15 most common words and the 10 most common bigrams
+            common_words = Counter(all_tokens).most_common(15)
+            common_bigrams = Counter(all_bigrams).most_common(10)
+
+            print("Top Keywords:")
+            for word, count in common_words:
+                print(f"  - {word}: {count}")
+            
+            print("\nTop Bigrams (Word Pairs):")
+            for bg, count in common_bigrams:
+                print(f"  - \"{bg}\": {count}")
+            
+            print("\nSuggested Regex Components:")
+            top_5_words = [w[0] for w in common_words[:5]]  # keep the word, drop its count
+            print(f"  ({ '|'.join(top_5_words) })")  # alternation of the top words, e.g. (a|b|c); prints "()" when there are none
+            print("\n" + "-"*30 + "\n")
+
+    except Exception as e:  # NOTE(review): any failure is only printed, not re-raised; the function then returns normally
+        print(f"Error: {e}")
+    finally:
+        session.close()  # always close the session, also after an error
+
+if __name__ == "__main__":  # run the analysis only when executed as a script, not when imported
+    analyze_patterns()