import os
import re
import html
import sqlite3
import time
from datetime import datetime
from langdetect import detect, DetectorFactory
from openai import OpenAI
from faster_whisper import WhisperModel
import weasyprint

DetectorFactory.seed = 0

# ============================================================
# KONFIGURATION
# ============================================================

# 🔍 ÄNDERE HIER DEN TEST-CHAT (SQL LIKE-Filter)
TARGET_CHAT = "%chat-94.txt%"  

DB_FILE = "case.db"
BASE_DIR = "/home/kai/Schreibtisch/Fall/test/chats_audio_photos"
REPORT_DIR = "reports"
LMSTUDIO_URL = "http://localhost:1234/v1"
LMSTUDIO_MODEL = "qwen/qwen3.6-35b-a3b"
BATCH_SIZE = 20

# Zeigt die absoluten Dateipfade im HTML-Bericht an?
SHOW_FILE_PATHS = True  

os.makedirs(REPORT_DIR, exist_ok=True)

# ============================================================
# LM STUDIO
# ============================================================

client = OpenAI(
    base_url=LMSTUDIO_URL,
    api_key="lm-studio"
)

# ============================================================
# WHISPER (Lazy Loading)
# ============================================================

whisper_model = None

def get_whisper():
    global whisper_model
    if whisper_model is None:
        print("[INFO] Lade Whisper (kann 10-30s dauern)...")
        whisper_model = WhisperModel(
            "large-v3-turbo",
            device="cuda",
            compute_type="float16"
        )
    return whisper_model

# ============================================================
# DATENBANK
# ============================================================

conn = sqlite3.connect(DB_FILE)
conn.row_factory = sqlite3.Row
cur = conn.cursor()

# ============================================================
# HILFSFUNKTIONEN
# ============================================================

def file_path_from_attachment(attachment):
    if not attachment: return None
    rel = attachment.replace("\\", "/").replace("chats/", "")
    return os.path.join(BASE_DIR, rel)

def exists(path): return path and os.path.exists(path)

def parse_timestamp(ts):
    try:
        ts = ts.replace("(UTC+0)", "").strip()
        return datetime.strptime(ts, "%d.%m.%Y %H:%M:%S")
    except:
        return datetime.min

# ✅ Zeilenumbrüche im HTML erhalten
def safe_html(text):
    return html.escape(text or "").replace("\n", "<br>")

# ✅ Nur text_language dynamisch anlegen (sicher)
def ensure_schema():
    cols = [r[1] for r in cur.execute("PRAGMA table_info(messages)")]
    if "text_language" not in cols:
        cur.execute("ALTER TABLE messages ADD COLUMN text_language TEXT")
        conn.commit()
        print("[INFO] Spalte 'text_language' wurde hinzugefügt.")

# ============================================================
# PIPELINE (Nur TARGET_CHAT)
# ============================================================

def run_single_chat_pipeline():
    print(f"\n{'='*40}")
    print(f"🎯 VERARBEITE EINZELNEN CHAT: {TARGET_CHAT}")
    print(f"{'='*40}")

    # --- 1. Audios transkribieren ---
    print("[1] Suche fehlende Audios...")
    cur.execute("""
        SELECT id, attachment FROM messages
        WHERE chat_file LIKE ? AND attachment IS NOT NULL
        AND (attachment LIKE '%.opus' OR attachment LIKE '%.mp3' OR attachment LIKE '%.wav' OR attachment LIKE '%.m4a' OR attachment LIKE '%.ogg')
        AND (transcript IS NULL OR transcript='')
    """, (TARGET_CHAT,))
    audio_rows = cur.fetchall()
    total_audios = len(audio_rows)
    print(f"    Finde {total_audios} Audios...")

    # ✅ FIX: Whisper nur laden, wenn tatsächlich Audios vorhanden sind
    if total_audios > 0:
        whisper = get_whisper()
        for idx, row in enumerate(audio_rows, 1):
            print(f"    [{idx}/{total_audios}] Audio: {row['id']}")
            path = file_path_from_attachment(row["attachment"])
            if not exists(path):
                print(f"    [WARN] Datei fehlt: {path}")
                continue
            try:
                segments, info = whisper.transcribe(path)
                text = " ".join(s.text.strip() for s in segments)
                cur.execute("UPDATE messages SET language=?, transcript=? WHERE id=?", (info.language, text, row["id"]))
            except Exception as e:
                print(f"    [FEHLER] {e}")
        conn.commit()
        print("    ✅ Transkription abgeschlossen.")
    else:
        print("    Keine Audios zu transkribieren.")

    # --- 2. Textsprachen erkennen ---
    print("[2] Erkenne Textsprachen (langdetect)...")
    cur.execute("""
        SELECT id, text FROM messages
        WHERE chat_file LIKE ? AND text IS NOT NULL AND text <> ''
        AND (text_language IS NULL OR text_language='')
    """, (TARGET_CHAT,))
    text_rows = cur.fetchall()
    for row in text_rows:
        text = (row["text"] or "").strip()
        try:
            if len(text) < 10:
                lang = "unknown"
            else:
                lang = detect(text)
        except Exception:
            lang = "unknown"
        cur.execute("UPDATE messages SET text_language=? WHERE id=?", (lang, row["id"]))
    conn.commit()
    print(f"    ✅ {len(text_rows)} Sprachen erkannt.")

    # --- 3. Text-Übersetzungen (Batch + Retry) ---
    print("[3] Übersetze Texte...")
    cur.execute("""
        SELECT id, text FROM messages
        WHERE chat_file LIKE ? AND text IS NOT NULL AND text <> ''
        AND (text_translation IS NULL OR text_translation='')
        AND (text_language IS NULL OR text_language != 'de')
    """, (TARGET_CHAT,))
    text_trans_rows = cur.fetchall()
    if text_trans_rows:
        total_batches = (len(text_trans_rows) + BATCH_SIZE - 1) // BATCH_SIZE
        for batch_idx, start in enumerate(range(0, len(text_trans_rows), BATCH_SIZE), 1):
            batch = text_trans_rows[start:start+BATCH_SIZE]
            items = [(r["id"], r["text"]) for r in batch]
            translations = {}
            for attempt in range(3):
                try:
                    translations = translate_batch(items)
                    break
                except Exception as e:
                    print(f"    [WARN] Qwen Batch fehlgeschlagen (Versuch {attempt+1}/3): {e}")
                    if attempt < 2: time.sleep(2)
                    else: print("    [FEHLER] Batch nach 3 Versuchen übersprungen.")
            for msg_id, german in translations.items():
                cur.execute("UPDATE messages SET text_translation=? WHERE id=?", (german, msg_id))
            conn.commit()
            print(f"    [{batch_idx}/{total_batches}] Text-Batch übersetzt.")

    # --- 4. Audio-Übersetzungen (Batch + Retry) ---
    print("[4] Übersetze Audio-Transkripte...")
    cur.execute("""
        SELECT id, transcript FROM messages
        WHERE chat_file LIKE ? AND transcript IS NOT NULL AND transcript <> ''
        AND (translation IS NULL OR translation='')
        AND (language IS NULL OR language != 'de')
    """, (TARGET_CHAT,))
    audio_trans_rows = cur.fetchall()
    if audio_trans_rows:
        total_batches = (len(audio_trans_rows) + BATCH_SIZE - 1) // BATCH_SIZE
        for batch_idx, start in enumerate(range(0, len(audio_trans_rows), BATCH_SIZE), 1):
            batch = audio_trans_rows[start:start+BATCH_SIZE]
            items = [(r["id"], r["transcript"]) for r in batch]
            translations = {}
            for attempt in range(3):
                try:
                    translations = translate_batch(items)
                    break
                except Exception as e:
                    print(f"    [WARN] Qwen Batch fehlgeschlagen (Versuch {attempt+1}/3): {e}")
                    if attempt < 2: time.sleep(2)
                    else: print("    [FEHLER] Batch nach 3 Versuchen übersprungen.")
            for msg_id, german in translations.items():
                cur.execute("UPDATE messages SET translation=? WHERE id=?", (german, msg_id))
            conn.commit()
            print(f"    [{batch_idx}/{total_batches}] Audio-Batch übersetzt.")

    # --- 5. Report generieren ---
    print("[5] Erzeuge PDF-Bericht...")
    build_pdf(TARGET_CHAT)
    print("    ✅ Bericht erstellt.")

# ============================================================
# QWEN BATCH ÜBERSETZER
# ============================================================

def translate_batch(items):
    if not items: return {}
    prompt = [f"###ID:{item_id}###\n{text}" for item_id, text in items]
    prompt = "\n\n".join(prompt)

    response = client.chat.completions.create(
        model=LMSTUDIO_MODEL, temperature=0,
        messages=[
            {"role":"system", "content":"Übersetze jeden Text ins Deutsche. Behalte die Marker exakt bei. Format: ###ID:123###\nÜbersetzung\n\nKeine Kommentare. Keine Erklärungen. Nur die Übersetzungen."},
            {"role":"user", "content":prompt}
        ]
    )
    result = response.choices[0].message.content

    if "###ID:" not in result:
        print("    [WARNUNG] Qwen hat kein ###ID: Format zurückgegeben. Batch übersprungen.")
        return {}

    translations = {}
    parts = re.split(r"###ID:(\d+)###", result)
    for i in range(1, len(parts), 2):
        try:
            translations[int(parts[i])] = parts[i + 1].strip()
        except:
            pass
    return translations

# ============================================================
# PDF REPORT (mit Dateipfad-Anzeige)
# ============================================================

def build_pdf(chat_file):
    cur.execute("SELECT * FROM messages WHERE chat_file LIKE ? ORDER BY id", (chat_file,))
    rows = cur.fetchall()
    rows = sorted(rows, key=lambda r: parse_timestamp(r["timestamp"]))

    safe_name = os.path.basename(chat_file).replace('/', '_').replace('\\', '_')
    html_path = os.path.join(REPORT_DIR, f"report_{safe_name}.html")
    pdf_path = os.path.join(REPORT_DIR, f"report_{safe_name}.pdf")

    # Erstelle HTML-Inhalt
    html_content = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Report: {html.escape(chat_file)}</title>
<style>
body{{font-family:Arial;background:#f0f0f0;margin:30px;}}
.entry{{background:white;margin-bottom:20px;padding:15px;border-radius:8px;}}
.time{{color:#666;}}
.sender{{font-weight:bold;margin-top:5px;margin-bottom:10px;}}
.original{{background:#eef7ff;padding:10px;border-radius:6px;margin-top:10px;}}
.translation{{background:#fff3c9;padding:10px;border-radius:6px;margin-top:10px;}}
.label{{font-weight:bold;margin-bottom:5px;}}
img{{max-width:700px;display:block;}}
video{{max-width:700px;}}
audio{{width:700px;}}
.path-info{{font-size:0.8em;color:#888;margin-top:4px;font-family:monospace;}}
</style>
</head>
<body>
<h1>Chronologischer Bericht: {html.escape(chat_file)}</h1>
"""
    for row in rows:
        html_content += "<div class='entry'>"
        html_content += f"<div class='time'>{safe_html(row['timestamp'])}</div>"
        html_content += f"<div class='sender'>{safe_html(row['sender'])}</div>"

        text = row["text"] or ""
        if text.strip():
            lang = row['text_language'] or '?'
            html_content += f"<div class='original'><div class='label'>Nachricht Original ({lang})</div>{safe_html(text)}</div>"
            german = row["text_translation"] or ""
            if german.strip():
                html_content += f"<div class='translation'><div class='label'>Deutsch</div>{safe_html(german)}</div>"

        transcript = row["transcript"] or ""
        if transcript.strip():
            lang_a = row['language'] or '?'
            html_content += f"<div class='original'><div class='label'>Audio Original ({lang_a})</div>{safe_html(transcript)}</div>"
            german_a = row["translation"] or ""
            if german_a.strip():
                html_content += f"<div class='translation'><div class='label'>Audio Deutsch</div>{safe_html(german_a)}</div>"

        attachment = row["attachment"] or ""
        if attachment:
            path = file_path_from_attachment(attachment)
            if exists(path):
                ext = os.path.splitext(path)[1].lower()
                media_tag = ""
                if ext in (".jpg",".jpeg",".png",".webp"):
                    # Verwende absolute Pfade für bessere Kompatibilität
                    abs_path = os.path.abspath(path)
                    media_tag = f'<img src="file://{html.escape(abs_path)}" alt="Bild">'
                elif ext in (".mp4",".mov",".webm"):
                    abs_path = os.path.abspath(path)
                    media_tag = f'<video controls><source src="file://{html.escape(abs_path)}"></video>'
                elif ext in (".opus",".mp3",".wav",".m4a",".ogg"):
                    abs_path = os.path.abspath(path)
                    media_tag = f'<audio controls><source src="file://{html.escape(abs_path)}"></audio>'
                
                if media_tag:
                    html_content += f"<br>{media_tag}"
                    if SHOW_FILE_PATHS:
                        html_content += f'<div class="path-info">📁 {html.escape(path)}</div>'
        html_content += "</div>"
    html_content += "</body></html>"

    # Speichere HTML-Datei
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(html_content)
    
    # Konvertiere HTML zu PDF
    try:
        weasyprint.HTML(string=html_content).write_pdf(pdf_path)
        print(f"    📄 PDF-Bericht: {pdf_path}")
    except Exception as e:
        print(f"    [FEHLER] PDF-Konvertierung fehlgeschlagen: {e}")
        print("    Versuche alternative Konvertierung...")
        # Fallback auf HTML-Datei
        print(f"    📄 HTML-Bericht (Fallback): {html_path}")

# ============================================================
# MAIN
# ============================================================

def main():
    print("\n🚀 Start Fallbericht V3 (Testmodus: Einzelner Chat)")
    ensure_schema()
    run_single_chat_pipeline()
    conn.close()
    print("\n✅ Testlauf abgeschlossen.")

if __name__ == "__main__":
    main()
