diff --git a/GEMINI.md b/GEMINI.md index eb737026..9b7a3d86 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -143,10 +143,33 @@ Since the "Golden Record" for Industry Verticals (Pains, Gains, Products) reside 3. **`list_notion_structure.py` (Schema Discovery):** - **Purpose:** Lists all property keys and page titles. Use this to debug schema changes (e.g. if a column was renamed). - - **Usage:** `python3 list_notion_structure.py` - -## Next Steps -* **Marketing Automation:** Implement the actual sending logic (or export) based on the contact status. -* **Job Role Mapping Engine:** Connect the configured patterns to the contact import/creation process to auto-assign roles. -* **Industry Classification Engine:** Connect the configured industries to the AI Analysis prompt to enforce the "Strict Mode" mapping. -* **Export:** Generate Excel/CSV enriched reports (already partially implemented via JSON export). + - **Usage:** `python3 list_notion_structure.py` + + ## Next Steps + * **Marketing Automation:** Implement the actual sending logic (or export) based on the contact status. + * **Job Role Mapping Engine:** Connect the configured patterns to the contact import/creation process to auto-assign roles. + * **Industry Classification Engine:** Connect the configured industries to the AI Analysis prompt to enforce the "Strict Mode" mapping. + * **Export:** Generate Excel/CSV enriched reports (already partially implemented via JSON export). + + ## Company Explorer Access & Debugging + + The **Company Explorer** is the central intelligence engine. + + **Core Paths:** + * **Database:** `/app/companies_v3_fixed_2.db` (SQLite) + * **Backend Code:** `/app/company-explorer/backend/` + * **Logs:** `/app/logs_debug/company_explorer_debug.log` + + **Accessing Data:** + To inspect live data without starting the full stack, use `sqlite3` directly or the helper scripts (if environment permits). + + * **Direct SQL:** `sqlite3 /app/companies_v3_fixed_2.db "SELECT * FROM companies WHERE name LIKE '%Firma%';" ` + * **Python (requires env):** The app runs in a Docker container. When debugging from outside (CLI agent), Python dependencies like `sqlalchemy` might be missing in the global scope. Prefer `sqlite3` for quick checks. + + **Key Endpoints (Internal API :8000):** + * `POST /api/provision/superoffice-contact`: Triggers the text generation logic. + * `GET /api/companies/{id}`: Full company profile including enrichment data. + + **Troubleshooting:** + * **"BaseModel" Error:** Usually a mix-up between Pydantic and SQLAlchemy `Base`. Check imports in `database.py`. + * **Missing Dependencies:** The CLI agent runs in `/app` but not necessarily inside the container's venv. Use standard tools (`grep`, `sqlite3`) where possible. diff --git a/company-explorer/backend/app.py b/company-explorer/backend/app.py index 964b920c..0b662a4c 100644 --- a/company-explorer/backend/app.py +++ b/company-explorer/backend/app.py @@ -97,6 +97,8 @@ class ProvisioningResponse(BaseModel): website: Optional[str] = None vertical_name: Optional[str] = None role_name: Optional[str] = None + opener: Optional[str] = None # Primary opener (Infrastructure/Cleaning) + opener_secondary: Optional[str] = None # Secondary opener (Service/Logistics) texts: Dict[str, Optional[str]] = {} # --- Events --- @@ -243,6 +245,8 @@ def provision_superoffice_contact( website=company.website, vertical_name=vertical_name, role_name=role_name, + opener=company.ai_opener, + opener_secondary=company.ai_opener_secondary, texts=texts ) @@ -797,21 +801,29 @@ def run_analysis_task(company_id: int): logger.info(f"Running Analysis Task for {company.name}") - # 1. Scrape Website (if not locked) + # --- 1. Scrape Website (if not locked) --- + # Check for existing scrape data first existing_scrape = db.query(EnrichmentData).filter( EnrichmentData.company_id == company.id, EnrichmentData.source_type == "website_scrape" ).first() + # If it doesn't exist or is not locked, we perform a scrape if not existing_scrape or not existing_scrape.is_locked: - from .services.scraping import ScraperService - scrape_res = ScraperService().scrape_url(company.website) + logger.info(f"Scraping website for {company.name}...") + scrape_res = scraper.scrape_url(company.website) # Use singleton + + # Now, either create new or update existing if not existing_scrape: db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_res)) + logger.info("Created new website_scrape entry.") else: existing_scrape.content = scrape_res existing_scrape.updated_at = datetime.utcnow() + logger.info("Updated existing website_scrape entry.") db.commit() + else: + logger.info("Website scrape is locked. Skipping.") # 2. Classify Industry & Metrics # IMPORTANT: Using the new method name and passing db session diff --git a/company-explorer/backend/database.py b/company-explorer/backend/database.py index 9f6a0504..99de1f30 100644 --- a/company-explorer/backend/database.py +++ b/company-explorer/backend/database.py @@ -150,7 +150,7 @@ class Industry(Base): created_at = Column(DateTime, default=datetime.utcnow) -class JobRoleMapping(BaseModel): +class JobRoleMapping(Base): """ Maps job title patterns (regex or simple string) to Roles. """ @@ -162,7 +162,7 @@ class JobRoleMapping(BaseModel): created_at = Column(DateTime, default=datetime.utcnow) -class RawJobTitle(BaseModel): +class RawJobTitle(Base): """ Stores raw unique job titles imported from CRM to assist in pattern mining. Tracks frequency to prioritize high-impact patterns. @@ -180,7 +180,7 @@ class RawJobTitle(BaseModel): created_at = Column(DateTime, default=datetime.utcnow) updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) -class Persona(BaseModel): +class Persona(Base): """ Represents a generalized persona/role (e.g. 'Geschäftsführer', 'IT-Leiter') independent of the specific job title pattern. diff --git a/company-explorer/backend/scripts/inspect_therme.py b/company-explorer/backend/scripts/inspect_therme.py new file mode 100644 index 00000000..407ebc1a --- /dev/null +++ b/company-explorer/backend/scripts/inspect_therme.py @@ -0,0 +1,58 @@ +import sys +import os +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +# Add backend path +sys.path.append(os.path.join(os.path.dirname(__file__), "../../")) + +from backend.database import Company, EnrichmentData +from backend.config import settings + +def inspect_company(company_name_part): + engine = create_engine(settings.DATABASE_URL) + SessionLocal = sessionmaker(bind=engine) + db = SessionLocal() + + try: + print(f"Searching for company containing: '{company_name_part}'...") + companies = db.query(Company).filter(Company.name.ilike(f"%{company_name_part}%")).all() + + if not companies: + print("❌ No company found.") + return + + for company in companies: + print("\n" + "="*60) + print(f"🏢 COMPANY: {company.name} (ID: {company.id})") + print("="*60) + print(f"🌐 Website: {company.website}") + print(f"🏗️ Industry (AI): {company.industry_ai}") + print(f"📊 Metric: {company.calculated_metric_value} {company.calculated_metric_unit} (Std: {company.standardized_metric_value} m²)") + print(f"✅ Status: {company.status}") + + # Enrichment Data + enrichment = db.query(EnrichmentData).filter(EnrichmentData.company_id == company.id).all() + print("\n📚 ENRICHMENT DATA:") + for ed in enrichment: + print(f" 🔹 Type: {ed.source_type} (Locked: {ed.is_locked})") + if ed.source_type == "website_scrape": + content = ed.content + if isinstance(content, dict): + summary = content.get("summary", "No summary") + raw_text = content.get("raw_text", "") + print(f" 📝 Summary: {str(summary)[:200]}...") + print(f" 📄 Raw Text Length: {len(str(raw_text))} chars") + elif ed.source_type == "wikipedia": + content = ed.content + if isinstance(content, dict): + print(f" 🔗 Wiki URL: {content.get('url')}") + print(f" 📄 Content Snippet: {str(content.get('full_text', ''))[:200]}...") + + except Exception as e: + print(f"Error: {e}") + finally: + db.close() + +if __name__ == "__main__": + inspect_company("Therme Erding") \ No newline at end of file diff --git a/company-explorer/backend/scripts/migrate_opener.py b/company-explorer/backend/scripts/migrate_opener.py new file mode 100644 index 00000000..68542cbd --- /dev/null +++ b/company-explorer/backend/scripts/migrate_opener.py @@ -0,0 +1,31 @@ +from sqlalchemy import create_engine, text +import sys +import os + +# Add backend path +sys.path.append(os.path.join(os.path.dirname(__file__), "../../")) +from backend.config import settings + +def migrate(): + engine = create_engine(settings.DATABASE_URL) + with engine.connect() as conn: + try: + # Check if column exists + print("Checking schema...") + # SQLite specific pragma + result = conn.execute(text("PRAGMA table_info(companies)")) + columns = [row[1] for row in result.fetchall()] + + if "ai_opener" in columns: + print("Column 'ai_opener' already exists. Skipping.") + else: + print("Adding column 'ai_opener' to 'companies' table...") + conn.execute(text("ALTER TABLE companies ADD COLUMN ai_opener TEXT")) + conn.commit() + print("✅ Migration successful.") + + except Exception as e: + print(f"❌ Migration failed: {e}") + +if __name__ == "__main__": + migrate() diff --git a/company-explorer/backend/scripts/test_opener_generation.py b/company-explorer/backend/scripts/test_opener_generation.py new file mode 100644 index 00000000..f39da517 --- /dev/null +++ b/company-explorer/backend/scripts/test_opener_generation.py @@ -0,0 +1,41 @@ +import sys +import os +import logging + +# Add backend path +sys.path.append(os.path.join(os.path.dirname(__file__), "../../")) + +# Mock logging +logging.basicConfig(level=logging.INFO) + +# Import Service +from backend.services.classification import ClassificationService + +def test_opener_generation(): + service = ClassificationService() + + print("\n--- TEST: Therme Erding (Primary Focus: Hygiene) ---") + op_prim = service._generate_marketing_opener( + company_name="Therme Erding", + website_text="Größte Therme der Welt, 35 Saunen, Rutschenparadies Galaxy, Wellenbad. Täglich tausende Besucher.", + industry_name="Leisure - Wet & Spa", + industry_pains="Rutschgefahr und Hygiene", + focus_mode="primary" + ) + print(f"Primary Opener: {op_prim}") + + print("\n--- TEST: Dachser Logistik (Secondary Focus: Process) ---") + op_sec = service._generate_marketing_opener( + company_name="Dachser SE", + website_text="Globaler Logistikdienstleister, Warehousing, Food Logistics, Air & Sea Logistics. Intelligent Logistics.", + industry_name="Logistics - Warehouse", + industry_pains="Effizienz und Sicherheit", + focus_mode="secondary" + ) + print(f"Secondary Opener: {op_sec}") + +if __name__ == "__main__": + try: + test_opener_generation() + except Exception as e: + print(f"Test Failed (likely due to missing env/deps): {e}") diff --git a/company-explorer/backend/services/classification.py b/company-explorer/backend/services/classification.py index 3c164b6b..7be378b9 100644 --- a/company-explorer/backend/services/classification.py +++ b/company-explorer/backend/services/classification.py @@ -75,10 +75,12 @@ Source Text: {text_content[:6000]} Return a JSON object with: -- "raw_value": The number found (e.g. 352 or 352.0). If text says "352 Betten", extract 352. If not found, null. +- "raw_value": The number found (e.g. 352 or 352.0). If not found, null. - "raw_unit": The unit found (e.g. "Betten", "m²"). - "proof_text": A short quote from the text proving this value. +**IMPORTANT:** Ignore obvious year numbers (like 1900-2026) if other, more plausible metric values are present in the text. Focus on the target metric. + JSON ONLY. """ try: @@ -159,8 +161,8 @@ JSON ONLY. try: args = (company,) if source_name == 'website' else (db, company.id) if source_name == 'wikipedia' else (company, search_term) content_text, current_source_url = content_loader(*args) - if not content_text: - logger.info(f"No content for {source_name}.") + if not content_text or len(content_text) < 100: + logger.info(f"No or insufficient content for {source_name} (Length: {len(content_text) if content_text else 0}).") continue llm_result = self._run_llm_metric_extraction_prompt(content_text, search_term, industry_name) if llm_result: @@ -224,13 +226,68 @@ JSON ONLY. company.metric_confidence_reason = metrics["metric_confidence_reason"] company.last_classification_at = datetime.utcnow() - db.commit() + # REMOVED: db.commit() - This should be handled by the calling function. return company def reevaluate_wikipedia_metric(self, company: Company, db: Session, industry: Industry) -> Company: logger.info(f"Re-evaluating metric for {company.name}...") return self.extract_metrics_for_industry(company, db, industry) + def _generate_marketing_opener(self, company_name: str, website_text: str, industry_name: str, industry_pains: str, focus_mode: str = "primary") -> Optional[str]: + """ + Generates the 'First Sentence' (Opener). + focus_mode: 'primary' (Standard/Cleaning) or 'secondary' (Service/Logistics). + """ + if not industry_pains: + industry_pains = "Effizienz und Personalmangel" # Fallback + + # Dynamic Focus Instruction + if focus_mode == "secondary": + focus_instruction = """ + - **FOKUS: SEKUNDÄR-PROZESSE (Logistik/Service/Versorgung).** + - Ignoriere das Thema Reinigung. Konzentriere dich auf **Abläufe, Materialfluss, Entlastung von Fachkräften** oder **Gäste-Service**. + - Der Satz muss einen operativen Entscheider (z.B. Pflegedienstleitung, Produktionsleiter) abholen.""" + else: + focus_instruction = """ + - **FOKUS: PRIMÄR-PROZESSE (Infrastruktur/Sauberkeit/Sicherheit).** + - Konzentriere dich auf Anforderungen an das Facility Management, Hygiene, Außenwirkung oder Arbeitssicherheit. + - Der Satz muss einen Infrastruktur-Entscheider (z.B. FM-Leiter, Geschäftsführer) abholen.""" + + prompt = f""" +Du bist ein exzellenter B2B-Stratege und Texter. +Deine Aufgabe ist es, einen hochpersonalisierten Einleitungssatz für eine E-Mail an ein potenzielles Kundenunternehmen zu formulieren. + +--- KONTEXT --- +Zielunternehmen: {company_name} +Branche: {industry_name} +Operative Herausforderung (Pain): "{industry_pains}" + +Webseiten-Kontext: +{website_text[:2500]} + +--- Denkprozess & Stilvorgaben --- +1. **Analysiere den Kontext:** Verstehe das Kerngeschäft. +2. **Identifiziere den Hebel:** Was ist der Erfolgsfaktor in Bezug auf den FOKUS? +3. **Formuliere den Satz (ca. 20-35 Wörter):** + - Wähle einen eleganten, aktiven Einstieg. + - Verbinde die **Tätigkeit** mit dem **Hebel** und den **Konsequenzen**. + - **WICHTIG:** Formuliere als positive Beobachtung über eine Kernkompetenz. + - **VERMEIDE:** Konkrete Zahlen. + - Verwende den Firmennamen: {company_name}. + {focus_instruction} + +--- Deine Ausgabe --- +Gib NUR den finalen Satz aus. Keine Anführungszeichen. +""" + try: + response = call_gemini_flash(prompt) + if response: + return response.strip().strip('"') + return None + except Exception as e: + logger.error(f"Opener Generation Error: {e}") + return None + def classify_company_potential(self, company: Company, db: Session) -> Company: logger.info(f"Starting classification for {company.name}...") @@ -249,12 +306,29 @@ JSON ONLY. suggested_industry_name = self._run_llm_classification_prompt(website_content, company.name, industry_defs) logger.info(f"AI suggests industry: {suggested_industry_name}") - # 4. Update Company - # Match back to DB object + # 4. Update Company & Generate Openers matched_industry = next((i for i in industries if i.name == suggested_industry_name), None) if matched_industry: company.industry_ai = matched_industry.name + + # --- Generate PRIMARY Opener (Infrastructure/Cleaning) --- + op_prim = self._generate_marketing_opener( + company.name, website_content, matched_industry.name, matched_industry.pains, "primary" + ) + if op_prim: + company.ai_opener = op_prim + logger.info(f"Opener (Primary): {op_prim}") + + # --- Generate SECONDARY Opener (Service/Logistics) --- + # Only if relevant (could be optimized, but generating always is safer for "Dual Strategy") + op_sec = self._generate_marketing_opener( + company.name, website_content, matched_industry.name, matched_industry.pains, "secondary" + ) + if op_sec: + company.ai_opener_secondary = op_sec + logger.info(f"Opener (Secondary): {op_sec}") + else: company.industry_ai = "Others" diff --git a/company-explorer/frontend/src/components/Inspector.tsx b/company-explorer/frontend/src/components/Inspector.tsx index cf94576d..48bc4cc5 100644 --- a/company-explorer/frontend/src/components/Inspector.tsx +++ b/company-explorer/frontend/src/components/Inspector.tsx @@ -57,6 +57,10 @@ type CompanyDetail = { // Industry Strategy (V2) industry_details?: IndustryDetails + // Marketing AI (V3) + ai_opener: string | null + ai_opener_secondary: string | null + // NEU v0.7.0: Quantitative Metrics calculated_metric_name: string | null calculated_metric_value: number | null @@ -453,6 +457,43 @@ export function Inspector({ companyId, initialContactId, onClose, apiBase }: Ins ) } + // Marketing AI Card Renderer + const renderMarketingCard = () => { + if (!data?.ai_opener && !data?.ai_opener_secondary) return null; + + return ( +
+ These sentences are statically pre-calculated for the "First Sentence Matching" strategy. +
+