[31188f42] einfügen

einfügen
This commit is contained in:
2026-02-24 06:47:35 +00:00
parent 45fef41a6a
commit 5603d42574
21 changed files with 1575 additions and 152 deletions

View File

@@ -5,14 +5,14 @@ import os
# Setup Environment
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from backend.database import SessionLocal, JobRoleMapping
from backend.database import SessionLocal, JobRolePattern
def check_mappings():
db = SessionLocal()
count = db.query(JobRoleMapping).count()
print(f"Total JobRoleMappings: {count}")
count = db.query(JobRolePattern).count()
print(f"Total JobRolePatterns: {count}")
examples = db.query(JobRoleMapping).limit(5).all()
examples = db.query(JobRolePattern).limit(5).all()
for ex in examples:
print(f" - {ex.pattern} -> {ex.role}")

View File

@@ -0,0 +1,171 @@
import sys
import os
import argparse
import json
import logging
from sqlalchemy.orm import sessionmaker, declarative_base
from sqlalchemy import create_engine, Column, Integer, String, Boolean, DateTime
from datetime import datetime
# --- Standalone Configuration ---
# Add the project root to the Python path to find the LLM utility
sys.path.insert(0, '/app')
from company_explorer.backend.lib.core_utils import call_gemini_flash
# Hard-coded container paths: this script is expected to run inside Docker
# with the project mounted at /app — TODO confirm before running elsewhere.
DATABASE_URL = "sqlite:////app/companies_v3_fixed_2.db"
LOG_FILE = "/app/Log_from_docker/batch_classifier.log"
BATCH_SIZE = 50 # Number of titles to process in one LLM call
# --- Logging Setup ---
# Log to both a file (post-run inspection) and stdout (docker logs).
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
# --- SQLAlchemy Models (self-contained) ---
# Local copies of the backend models so this script has no import
# dependency on the backend package beyond the LLM utility above.
Base = declarative_base()
class RawJobTitle(Base):
    """Raw job title harvested from imports, with an occurrence count.

    Self-contained mirror of the ``raw_job_titles`` table so this standalone
    script can query it without importing the backend package.
    """

    __tablename__ = 'raw_job_titles'
    id = Column(Integer, primary_key=True)
    # Unique title string; indexed because the classifier looks titles up by value.
    title = Column(String, unique=True, index=True)
    count = Column(Integer, default=1)
    source = Column(String)
    # Flipped to True once a JobRolePattern exists for this title.
    is_mapped = Column(Boolean, default=False)
    # Use UTC timestamps for consistency with JobRolePattern and Persona in
    # this file (the original used local-time datetime.now here while the
    # sibling models use datetime.utcnow, mixing timezones in one database).
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
class JobRolePattern(Base):
    """Pattern mapping a job-title string to a persona role.

    Self-contained mirror of the ``job_role_patterns`` table; this script
    only ever creates rows with pattern_type='exact'.
    """

    __tablename__ = "job_role_patterns"
    id = Column(Integer, primary_key=True, index=True)
    # How pattern_value is matched by the backend ('exact' for rows created here).
    pattern_type = Column(String, default="exact", index=True)
    # The title text to match; unique so a title is never mapped twice.
    pattern_value = Column(String, unique=True)
    # Target role (persona name) the pattern resolves to.
    role = Column(String, index=True)
    # NOTE(review): ordering semantics of priority are decided by the backend
    # matcher, not visible here — this script writes 90, seeds use 100.
    priority = Column(Integer, default=100)
    is_active = Column(Boolean, default=True)
    # Provenance marker ('system', 'llm_batch', ...).
    created_by = Column(String, default="system")
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
class Persona(Base):
    """Persona (role archetype); its names form the label set for classification."""

    __tablename__ = "personas"
    id = Column(Integer, primary_key=True, index=True)
    # Role name, used verbatim as a classification label in the LLM prompt.
    name = Column(String, unique=True, index=True)
    # Free-text persona descriptions — read by the app, not by this script.
    pains = Column(String)
    gains = Column(String)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
# --- Database Connection ---
# Connects directly to the SQLite file named in DATABASE_URL; sessions are
# opened and closed per run by classify_and_store_titles().
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def build_classification_prompt(titles_to_classify, available_roles):
    """Render the LLM prompt that asks for a title -> role JSON mapping.

    Args:
        titles_to_classify: job-title strings for one batch.
        available_roles: persona names the model is allowed to choose from.

    Returns:
        The complete prompt string.
    """
    # Pre-render the two dynamic parts so the template itself stays static.
    roles_line = ', '.join(available_roles)
    titles_json = json.dumps(titles_to_classify, indent=2)
    return f"""
You are an expert in B2B contact segmentation. Your task is to classify a list of job titles into predefined roles.
Analyze the following list of job titles and assign each one to the most appropriate role from the list provided.
The available roles are:
- {roles_line}
RULES:
1. Respond ONLY with a valid JSON object. Do not include any text, explanations, or markdown code fences before or after the JSON.
2. The JSON object should have the original job title as the key and the assigned role as the value.
3. If a job title is ambiguous or you cannot confidently classify it, assign the value "Influencer". Use this as a fallback.
4. Do not invent new roles. Only use the roles from the provided list.
Here are the job titles to classify:
{titles_json}
Your JSON response:
"""
def _strip_markdown_fences(text):
    """Return *text* with a surrounding ``` / ```json markdown fence removed.

    LLMs often wrap JSON answers in code fences despite being told not to.
    This tolerates any language tag on the opening fence and a missing
    newline before the closing fence; the previous hard-coded
    ``strip()[7:-4]`` slice silently corrupted the payload in those cases.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the entire opening fence line ("```", "```json", ...).
        newline = cleaned.find("\n")
        cleaned = cleaned[newline + 1:] if newline != -1 else ""
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def classify_and_store_titles():
    """Classify unmapped RawJobTitle rows into persona roles via the LLM.

    Loads the allowed role names from the personas table, sends unmapped
    titles to the LLM in batches of BATCH_SIZE, and stores each confident
    classification as an exact-match JobRolePattern. Titles the model cannot
    classify stay unmapped and are retried on the next run. Commits once per
    batch; rolls back and logs on unexpected errors.
    """
    db = SessionLocal()
    try:
        # 1. Fetch available persona names (roles) to constrain the LLM output.
        available_roles = [p.name for p in db.query(Persona).all()]
        if not available_roles:
            logger.error("No Personas/Roles found in the database. Cannot classify. Please seed personas first.")
            return
        logger.info(f"Classifying based on these roles: {available_roles}")

        # 2. Fetch titles that have not yet been mapped to a pattern.
        unmapped_titles = db.query(RawJobTitle).filter(RawJobTitle.is_mapped == False).all()
        if not unmapped_titles:
            logger.info("No unmapped job titles found. Nothing to do.")
            return
        logger.info(f"Found {len(unmapped_titles)} unmapped job titles to process.")

        # 3. Process in batches to keep each prompt within a manageable size.
        total_batches = (len(unmapped_titles) + BATCH_SIZE - 1) // BATCH_SIZE
        for i in range(0, len(unmapped_titles), BATCH_SIZE):
            batch = unmapped_titles[i:i + BATCH_SIZE]
            title_strings = [item.title for item in batch]
            logger.info(f"Processing batch {i//BATCH_SIZE + 1} of {total_batches} with {len(title_strings)} titles...")

            # 4. Call the LLM and parse its JSON answer.
            prompt = build_classification_prompt(title_strings, available_roles)
            response_text = ""
            try:
                response_text = call_gemini_flash(prompt, json_mode=True)
                classifications = json.loads(_strip_markdown_fences(response_text))
            except Exception as e:
                # A bad batch must not abort the run; its titles stay
                # unmapped and will be retried on the next invocation.
                logger.error(f"Failed to get or parse LLM response for batch. Skipping. Error: {e}")
                logger.error(f"Raw response was: {response_text}")
                continue

            # 5. Persist one exact-match pattern per confidently classified title.
            new_patterns = 0
            for title_obj in batch:
                original_title = title_obj.title
                assigned_role = classifications.get(original_title)
                # Only accept roles from the allowed list; anything else
                # (including the model inventing a role) is treated as unmapped.
                if assigned_role and assigned_role in available_roles:
                    exists = db.query(JobRolePattern).filter(JobRolePattern.pattern_value == original_title).first()
                    if not exists:
                        new_pattern = JobRolePattern(
                            pattern_type='exact',
                            pattern_value=original_title,
                            role=assigned_role,
                            priority=90,  # below the 100 used for system-seeded patterns
                            created_by='llm_batch'
                        )
                        db.add(new_pattern)
                        new_patterns += 1
                    # Mark as handled even when a pattern already existed.
                    title_obj.is_mapped = True
                else:
                    logger.warning(f"Could not classify '{original_title}' or role '{assigned_role}' is invalid. It will be re-processed later.")
            db.commit()
            logger.info(f"Batch {i//BATCH_SIZE + 1} complete. Created {new_patterns} new mapping patterns.")
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}", exc_info=True)
        db.rollback()
    finally:
        db.close()
if __name__ == "__main__":
    # argparse is kept even though there are no options: it provides -h and
    # rejects unexpected arguments with a usage message.
    cli = argparse.ArgumentParser(description="Batch classify unmapped job titles using an LLM.")
    cli.parse_args()
    logger.info("--- Starting Batch Classification Script ---")
    classify_and_store_titles()
    logger.info("--- Batch Classification Script Finished ---")

View File

@@ -1,95 +1,66 @@
import sys
import os
import csv
from collections import Counter
import argparse
from datetime import datetime
# Setup Environment
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
# Add the 'backend' directory to the path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from backend.database import SessionLocal, RawJobTitle, init_db, engine, Base
from database import SessionLocal, RawJobTitle
from lib.logging_setup import setup_logging
import logging
def import_titles(file_path: str, delimiter: str = ';'):
print(f"🚀 Starting Import from {file_path}...")
# Ensure Table Exists
RawJobTitle.__table__.create(bind=engine, checkfirst=True)
setup_logging()
logger = logging.getLogger(__name__)
def import_job_titles_from_csv(file_path: str):
db = SessionLocal()
total_rows = 0
new_titles = 0
updated_titles = 0
try:
with open(file_path, 'r', encoding='utf-8-sig') as f: # utf-8-sig handles BOM from Excel
# Try to detect header
sample = f.read(1024)
has_header = csv.Sniffer().has_header(sample)
f.seek(0)
reader = csv.reader(f, delimiter=delimiter)
if has_header:
headers = next(reader)
print(f" Header detected: {headers}")
# Try to find the right column index
col_idx = 0
for i, h in enumerate(headers):
if h.lower() in ['funktion', 'jobtitle', 'title', 'position', 'rolle']:
col_idx = i
print(f" -> Using column '{h}' (Index {i})")
break
else:
col_idx = 0
print(" No header detected, using first column.")
logger.info(f"Starting import of job titles from {file_path}")
# Use Counter to get frequencies directly from the CSV
job_title_counts = Counter()
total_rows = 0
# Process Rows
with open(file_path, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
# Assuming the CSV contains only job titles, one per row
for row in reader:
if not row: continue
if len(row) <= col_idx: continue
raw_title = row[col_idx].strip()
if not raw_title: continue # Skip empty
total_rows += 1
# Check existance
existing = db.query(RawJobTitle).filter(RawJobTitle.title == raw_title).first()
if existing:
existing.count += 1
existing.updated_at = datetime.utcnow()
updated_titles += 1
else:
db.add(RawJobTitle(title=raw_title, count=1))
new_titles += 1
if total_rows % 100 == 0:
db.commit()
print(f" Processed {total_rows} rows...", end='\r')
if row and row[0].strip():
title = row[0].strip()
job_title_counts[title] += 1
total_rows += 1
logger.info(f"Read {total_rows} total job title entries. Found {len(job_title_counts)} unique titles.")
added_count = 0
updated_count = 0
for title, count in job_title_counts.items():
existing_title = db.query(RawJobTitle).filter(RawJobTitle.title == title).first()
if existing_title:
if existing_title.count != count:
existing_title.count = count
updated_count += 1
# If it exists and count is the same, do nothing.
else:
new_title = RawJobTitle(title=title, count=count, source="csv_import", is_mapped=False)
db.add(new_title)
added_count += 1
db.commit()
logger.info(f"Import complete. Added {added_count} new unique titles, updated {updated_count} existing titles.")
db.commit()
except Exception as e:
print(f"\n❌ Error: {e}")
logger.error(f"Error during job title import: {e}", exc_info=True)
db.rollback()
finally:
db.close()
print(f"\n✅ Import Complete.")
print(f" Total Processed: {total_rows}")
print(f" New Unique Titles: {new_titles}")
print(f" Updated Frequencies: {updated_titles}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Import Job Titles from CSV")
parser.add_argument("file", help="Path to CSV file")
parser.add_argument("--delimiter", default=";", help="CSV Delimiter (default: ';')")
parser = argparse.ArgumentParser(description="Import job titles from a CSV file into the RawJobTitle database table.")
parser.add_argument("file_path", type=str, help="Path to the CSV file containing job titles.")
args = parser.parse_args()
if not os.path.exists(args.file):
print(f"❌ File not found: {args.file}")
sys.exit(1)
import_titles(args.file, args.delimiter)
import_job_titles_from_csv(args.file_path)

View File

@@ -4,7 +4,7 @@ import json
# Setup Environment to import backend modules
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from backend.database import SessionLocal, Persona, JobRoleMapping
from backend.database import SessionLocal, Persona, JobRolePattern
def seed_archetypes():
db = SessionLocal()
@@ -87,33 +87,41 @@ def seed_archetypes():
db.commit()
# --- 2. Update JobRoleMappings to map to Archetypes ---
# --- 2. Update JobRolePatterns to map to Archetypes ---
# We map the patterns to the new 4 Archetypes
mapping_updates = [
# Wirtschaftlicher Entscheider
{"role": "Wirtschaftlicher Entscheider", "patterns": ["%geschäftsführer%", "%ceo%", "%director%", "%einkauf%", "%procurement%", "%finance%", "%cfo%"]},
{"role": "Wirtschaftlicher Entscheider", "patterns": ["geschäftsführer", "ceo", "director", "einkauf", "procurement", "finance", "cfo"]},
# Operativer Entscheider
{"role": "Operativer Entscheider", "patterns": ["%housekeeping%", "%hausdame%", "%hauswirtschaft%", "%reinigung%", "%restaurant%", "%f&b%", "%werksleiter%", "%produktionsleiter%", "%lager%", "%logistik%", "%operations%", "%coo%"]},
{"role": "Operativer Entscheider", "patterns": ["housekeeping", "hausdame", "hauswirtschaft", "reinigung", "restaurant", "f&b", "werksleiter", "produktionsleiter", "lager", "logistik", "operations", "coo"]},
# Infrastruktur-Verantwortlicher
{"role": "Infrastruktur-Verantwortlicher", "patterns": ["%facility%", "%technik%", "%instandhaltung%", "%it-leiter%", "%cto%", "%admin%", "%building%"]},
{"role": "Infrastruktur-Verantwortlicher", "patterns": ["facility", "technik", "instandhaltung", "it-leiter", "cto", "admin", "building"]},
# Innovations-Treiber
{"role": "Innovations-Treiber", "patterns": ["%innovation%", "%digital%", "%transformation%", "%business dev%", "%marketing%"]}
{"role": "Innovations-Treiber", "patterns": ["innovation", "digital", "transformation", "business dev", "marketing"]}
]
# Clear old mappings to prevent confusion
db.query(JobRoleMapping).delete()
db.query(JobRolePattern).delete()
db.commit()
print("Cleared old JobRoleMappings.")
print("Cleared old JobRolePatterns.")
for group in mapping_updates:
role_name = group["role"]
for pattern in group["patterns"]:
print(f"Mapping '{pattern}' -> '{role_name}'")
db.add(JobRoleMapping(pattern=pattern, role=role_name))
for pattern_text in group["patterns"]:
print(f"Mapping '{pattern_text}' -> '{role_name}'")
# All seeded patterns are regex contains checks
new_pattern = JobRolePattern(
pattern_type='regex',
pattern_value=pattern_text, # Stored without wildcards
role=role_name,
priority=100, # Default priority for seeded patterns
created_by='system'
)
db.add(new_pattern)
db.commit()
print("Archetypes and Mappings Seeded Successfully.")

View File

@@ -5,15 +5,15 @@ import os
# Setup Environment
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from backend.database import SessionLocal, JobRoleMapping, Persona
from backend.database import SessionLocal, JobRolePattern, Persona
def test_mapping(job_title):
db = SessionLocal()
print(f"\n--- Testing Mapping for '{job_title}' ---")
# 1. Find Role Name via JobRoleMapping
# 1. Find Role Name via JobRolePattern
role_name = None
mappings = db.query(JobRoleMapping).all()
mappings = db.query(JobRolePattern).all()
for m in mappings:
pattern_clean = m.pattern.replace("%", "").lower()
if pattern_clean in job_title.lower():

View File

@@ -6,7 +6,7 @@ import os
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
# Import everything to ensure metadata is populated
from backend.database import engine, Base, Company, Contact, Industry, JobRoleMapping, Persona, Signal, EnrichmentData, RoboticsCategory, ImportLog, ReportedMistake, MarketingMatrix
from backend.database import engine, Base, Company, Contact, Industry, JobRolePattern, Persona, Signal, EnrichmentData, RoboticsCategory, ImportLog, ReportedMistake, MarketingMatrix
def migrate():
print("Migrating Database Schema...")