[2ff88f42] einfügen

einfügen
2026-02-20 13:25:21 +00:00
parent 653bd79e1f
commit 101f67936a
5 changed files with 188 additions and 20 deletions
--- a/company-explorer/backend/app.py
+++ b/company-explorer/backend/app.py
@@ -454,6 +454,22 @@ def list_industries(db: Session = Depends(get_db), username: str = Depends(authe
 def list_job_roles(db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    return db.query(JobRoleMapping).order_by(JobRoleMapping.pattern.asc()).all()

+@app.get("/api/job_roles/raw")
+def list_raw_job_titles(
+    limit: int = Query(100, ge=0),
+    unmapped_only: bool = True,
+    db: Session = Depends(get_db),
+    username: str = Depends(authenticate_user),
+):
+    """
+    Return raw job titles from CRM imports, most frequent first.
+
+    Args:
+        limit: Maximum number of rows to return (default 100). Negative
+            values are rejected during request validation instead of being
+            passed on to the database LIMIT clause.
+        unmapped_only: When true (default), only rows whose ``is_mapped``
+            flag is false are returned.
+        db: Database session provided by the ``get_db`` dependency.
+        username: Value provided by the ``authenticate_user`` dependency.
+            Not used in the body; presumably declared so the route requires
+            an authenticated caller.
+
+    Returns:
+        A list of ``RawJobTitle`` rows ordered by ``count`` descending, with
+        ``title`` ascending as a tie-breaker so rows with equal counts come
+        back in a stable order.
+    """
+    query = db.query(RawJobTitle)
+    if unmapped_only:
+        # .is_(False) is the explicit SQLAlchemy spelling of the boolean test
+        # and avoids the `== False` comparison that linters flag.
+        query = query.filter(RawJobTitle.is_mapped.is_(False))
+
+    ranked = query.order_by(RawJobTitle.count.desc(), RawJobTitle.title.asc())
+    return ranked.limit(limit).all()
+
@app.get("/api/mistakes")
 def list_reported_mistakes(
    status: Optional[str] = Query(None),
--- a/company-explorer/backend/database.py
+++ b/company-explorer/backend/database.py
@@ -150,7 +150,7 @@ class Industry(Base):
    created_at = Column(DateTime, default=datetime.utcnow)


-class JobRoleMapping(Base):
+class JobRoleMapping(BaseModel):
    """
    Maps job title patterns (regex or simple string) to Roles.
    """
@@ -162,7 +162,25 @@ class JobRoleMapping(Base):
    
    created_at = Column(DateTime, default=datetime.utcnow)

-class Persona(Base):
+class RawJobTitle(BaseModel):
+    """
+    One distinct raw job title string imported from the CRM, with a frequency counter.
+
+    Intended to assist in pattern mining: ``count`` allows ranking titles by
+    how often they occur so high-impact patterns can be prioritized, and
+    ``is_mapped`` marks titles that are already covered by a pattern.
+
+    NOTE(review): neighbouring models previously derived from ``Base``;
+    confirm that ``BaseModel`` is the SQLAlchemy declarative base, since the
+    ``Column`` definitions below and ``__tablename__`` rely on it.
+    """
+    __tablename__ = "raw_job_titles"
+
+    id = Column(Integer, primary_key=True, index=True)
+    title = Column(String, unique=True, index=True) # The raw string, e.g. "Senior Sales Mgr."; unique, so one row per distinct title
+    count = Column(Integer, default=1) # How often this title appears in the CRM; a new row starts at 1
+    source = Column(String, default="import") # Origin label of the row; defaults to "import"
+    
+    # Status Flags
+    is_mapped = Column(Boolean, default=False) # True if a pattern currently covers this title; new rows start as unmapped
+    
+    # Timestamps are naive UTC values from datetime.utcnow; updated_at is also
+    # refreshed by SQLAlchemy (onupdate) when the row is updated.
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+class Persona(BaseModel):
    """
    Represents a generalized persona/role (e.g. 'Geschäftsführer', 'IT-Leiter')
    independent of the specific job title pattern.
--- a/company-explorer/backend/scripts/import_job_titles.py
+++ b/company-explorer/backend/scripts/import_job_titles.py
@@ -0,0 +1,95 @@
+import sys
+import os
+import csv
+import argparse
+from datetime import datetime
+
+# Setup Environment
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
+
+from backend.database import SessionLocal, RawJobTitle, init_db, engine, Base
+
+def _find_title_column(header_cells):
+    """Return the index of the first cell naming a job-title column, or None."""
+    known_names = {'funktion', 'jobtitle', 'title', 'position', 'rolle'}
+    for idx, cell in enumerate(header_cells):
+        # Strip padding so headers such as " Title " are still recognised.
+        if cell.strip().lower() in known_names:
+            return idx
+    return None
+
+
+def _sniff_has_header(sample: str) -> bool:
+    """Ask csv.Sniffer whether *sample* starts with a header row.
+
+    The sniffer raises csv.Error when it cannot infer a dialect (possible for
+    single-column files); that case is treated as "no header" instead of
+    aborting the whole import.
+    """
+    try:
+        return csv.Sniffer().has_header(sample)
+    except csv.Error:
+        return False
+
+
+def import_titles(file_path: str, delimiter: str = ';'):
+    """
+    Import raw job titles from a CSV file into the ``raw_job_titles`` table.
+
+    The file is read completely and occurrences are tallied per distinct
+    title first; afterwards each distinct title is either inserted or has its
+    ``count`` increased by the number of occurrences. Tallying up front needs
+    one query per distinct title instead of one per CSV row, and it never
+    adds the same new title twice before a flush, which would violate the
+    unique constraint on ``title`` if the session does not autoflush.
+
+    The title column is chosen by header name ('funktion', 'jobtitle',
+    'title', 'position', 'rolle', case-insensitive); otherwise the first
+    column is used and csv.Sniffer decides whether the first row is a header.
+
+    Args:
+        file_path: Path to the CSV file (UTF-8, a BOM is tolerated).
+        delimiter: Field delimiter used to parse the file.
+
+    Returns:
+        None. Progress and a summary are printed. On error the open
+        transaction is rolled back, the error is printed and no success
+        summary is shown; batches committed before the error stay in the
+        database.
+    """
+    # Local imports keep this function self-contained.
+    from collections import Counter
+    from itertools import chain
+
+    print(f"🚀 Starting Import from {file_path}...")
+
+    # Ensure Table Exists
+    RawJobTitle.__table__.create(bind=engine, checkfirst=True)
+
+    db = SessionLocal()
+    total_rows = 0
+    new_titles = 0
+    updated_titles = 0
+
+    try:
+        title_counts = Counter()
+
+        # utf-8-sig handles the BOM written by Excel; newline='' is what the
+        # csv module expects so quoted fields with line breaks parse correctly.
+        with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
+            sample = f.read(1024)
+            f.seek(0)
+            reader = csv.reader(f, delimiter=delimiter)
+
+            col_idx = 0
+            rows = reader
+            first_row = next(reader, None)  # None for an empty file
+
+            if first_row is not None:
+                named_idx = _find_title_column(first_row)
+                if named_idx is not None:
+                    # A known column name is stronger evidence than sniffing.
+                    col_idx = named_idx
+                    print(f"ℹ️ Header detected: {first_row}")
+                    print(f"  -> Using column '{first_row[col_idx]}' (Index {col_idx})")
+                elif _sniff_has_header(sample):
+                    print(f"ℹ️ Header detected: {first_row}")
+                else:
+                    print("ℹ️ No header detected, using first column.")
+                    # The first row is data, so put it back in front.
+                    rows = chain([first_row], reader)
+
+            for row in rows:
+                # Also skips empty rows, since col_idx is never negative.
+                if len(row) <= col_idx:
+                    continue
+                raw_title = row[col_idx].strip()
+                if raw_title:
+                    title_counts[raw_title] += 1
+
+        # Upsert one distinct title at a time, committing in batches.
+        for position, (raw_title, occurrences) in enumerate(title_counts.items(), start=1):
+            existing = db.query(RawJobTitle).filter(RawJobTitle.title == raw_title).first()
+
+            if existing is not None:
+                # `count` is nullable, so treat a missing value as zero.
+                existing.count = (existing.count or 0) + occurrences
+                existing.updated_at = datetime.utcnow()
+                updated_titles += occurrences
+            else:
+                db.add(RawJobTitle(title=raw_title, count=occurrences))
+                new_titles += 1
+                # Repeats of a new title within this file count as updates,
+                # matching a row-by-row import.
+                updated_titles += occurrences - 1
+
+            total_rows += occurrences
+
+            if position % 100 == 0:
+                db.commit()
+                print(f"  Processed {total_rows} rows...", end='\r')
+
+        db.commit()
+
+    except Exception as e:
+        print(f"\n❌ Error: {e}")
+        db.rollback()
+        # Do not fall through to the success summary; the counters would
+        # include work that was just rolled back.
+        return
+    finally:
+        db.close()
+
+    print("\n✅ Import Complete.")
+    print(f"   Total Processed: {total_rows}")
+    print(f"   New Unique Titles: {new_titles}")
+    print(f"   Updated Frequencies: {updated_titles}")
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the CSV path and delimiter, verify the
+    # path exists, then hand over to import_titles.
+    cli = argparse.ArgumentParser(description="Import Job Titles from CSV")
+    cli.add_argument("file", help="Path to CSV file")
+    cli.add_argument("--delimiter", default=";", help="CSV Delimiter (default: ';')")
+    options = cli.parse_args()
+
+    csv_path = options.file
+    if os.path.exists(csv_path):
+        import_titles(csv_path, options.delimiter)
+    else:
+        # Missing input is reported and signalled with a non-zero exit code.
+        print(f"❌ File not found: {csv_path}")
+        sys.exit(1)