import os import re import html import sqlite3 import time from datetime import datetime from langdetect import detect, DetectorFactory from openai import OpenAI from faster_whisper import WhisperModel import weasyprint DetectorFactory.seed = 0 # ============================================================ # KONFIGURATION # ============================================================ # 🔍 ÄNDERE HIER DEN TEST-CHAT (SQL LIKE-Filter) TARGET_CHAT = "%chat-94.txt%" DB_FILE = "case.db" BASE_DIR = "/home/kai/Schreibtisch/Fall/test/chats_audio_photos" REPORT_DIR = "reports" LMSTUDIO_URL = "http://localhost:1234/v1" LMSTUDIO_MODEL = "qwen/qwen3.6-35b-a3b" BATCH_SIZE = 20 # Zeigt die absoluten Dateipfade im HTML-Bericht an? SHOW_FILE_PATHS = True os.makedirs(REPORT_DIR, exist_ok=True) # ============================================================ # LM STUDIO # ============================================================ client = OpenAI( base_url=LMSTUDIO_URL, api_key="lm-studio" ) # ============================================================ # WHISPER (Lazy Loading) # ============================================================ whisper_model = None def get_whisper(): global whisper_model if whisper_model is None: print("[INFO] Lade Whisper (kann 10-30s dauern)...") whisper_model = WhisperModel( "large-v3-turbo", device="cuda", compute_type="float16" ) return whisper_model # ============================================================ # DATENBANK # ============================================================ conn = sqlite3.connect(DB_FILE) conn.row_factory = sqlite3.Row cur = conn.cursor() # ============================================================ # HILFSFUNKTIONEN # ============================================================ def file_path_from_attachment(attachment): if not attachment: return None rel = attachment.replace("\\", "/").replace("chats/", "") return os.path.join(BASE_DIR, rel) def exists(path): return path and os.path.exists(path) def parse_timestamp(ts): try: ts = ts.replace("(UTC+0)", "").strip() return datetime.strptime(ts, "%d.%m.%Y %H:%M:%S") except: return datetime.min # ✅ Zeilenumbrüche im HTML erhalten def safe_html(text): return html.escape(text or "").replace("\n", "
") # ✅ Nur text_language dynamisch anlegen (sicher) def ensure_schema(): cols = [r[1] for r in cur.execute("PRAGMA table_info(messages)")] if "text_language" not in cols: cur.execute("ALTER TABLE messages ADD COLUMN text_language TEXT") conn.commit() print("[INFO] Spalte 'text_language' wurde hinzugefügt.") # ============================================================ # PIPELINE (Nur TARGET_CHAT) # ============================================================ def run_single_chat_pipeline(): print(f"\n{'='*40}") print(f"🎯 VERARBEITE EINZELNEN CHAT: {TARGET_CHAT}") print(f"{'='*40}") # --- 1. Audios transkribieren --- print("[1] Suche fehlende Audios...") cur.execute(""" SELECT id, attachment FROM messages WHERE chat_file LIKE ? AND attachment IS NOT NULL AND (attachment LIKE '%.opus' OR attachment LIKE '%.mp3' OR attachment LIKE '%.wav' OR attachment LIKE '%.m4a' OR attachment LIKE '%.ogg') AND (transcript IS NULL OR transcript='') """, (TARGET_CHAT,)) audio_rows = cur.fetchall() total_audios = len(audio_rows) print(f" Finde {total_audios} Audios...") # ✅ FIX: Whisper nur laden, wenn tatsächlich Audios vorhanden sind if total_audios > 0: whisper = get_whisper() for idx, row in enumerate(audio_rows, 1): print(f" [{idx}/{total_audios}] Audio: {row['id']}") path = file_path_from_attachment(row["attachment"]) if not exists(path): print(f" [WARN] Datei fehlt: {path}") continue try: segments, info = whisper.transcribe(path) text = " ".join(s.text.strip() for s in segments) cur.execute("UPDATE messages SET language=?, transcript=? WHERE id=?", (info.language, text, row["id"])) except Exception as e: print(f" [FEHLER] {e}") conn.commit() print(" ✅ Transkription abgeschlossen.") else: print(" Keine Audios zu transkribieren.") # --- 2. Textsprachen erkennen --- print("[2] Erkenne Textsprachen (langdetect)...") cur.execute(""" SELECT id, text FROM messages WHERE chat_file LIKE ? AND text IS NOT NULL AND text <> '' AND (text_language IS NULL OR text_language='') """, (TARGET_CHAT,)) text_rows = cur.fetchall() for row in text_rows: text = (row["text"] or "").strip() try: if len(text) < 10: lang = "unknown" else: lang = detect(text) except Exception: lang = "unknown" cur.execute("UPDATE messages SET text_language=? WHERE id=?", (lang, row["id"])) conn.commit() print(f" ✅ {len(text_rows)} Sprachen erkannt.") # --- 3. Text-Übersetzungen (Batch + Retry) --- print("[3] Übersetze Texte...") cur.execute(""" SELECT id, text FROM messages WHERE chat_file LIKE ? AND text IS NOT NULL AND text <> '' AND (text_translation IS NULL OR text_translation='') AND (text_language IS NULL OR text_language != 'de') """, (TARGET_CHAT,)) text_trans_rows = cur.fetchall() if text_trans_rows: total_batches = (len(text_trans_rows) + BATCH_SIZE - 1) // BATCH_SIZE for batch_idx, start in enumerate(range(0, len(text_trans_rows), BATCH_SIZE), 1): batch = text_trans_rows[start:start+BATCH_SIZE] items = [(r["id"], r["text"]) for r in batch] translations = {} for attempt in range(3): try: translations = translate_batch(items) break except Exception as e: print(f" [WARN] Qwen Batch fehlgeschlagen (Versuch {attempt+1}/3): {e}") if attempt < 2: time.sleep(2) else: print(" [FEHLER] Batch nach 3 Versuchen übersprungen.") for msg_id, german in translations.items(): cur.execute("UPDATE messages SET text_translation=? WHERE id=?", (german, msg_id)) conn.commit() print(f" [{batch_idx}/{total_batches}] Text-Batch übersetzt.") # --- 4. Audio-Übersetzungen (Batch + Retry) --- print("[4] Übersetze Audio-Transkripte...") cur.execute(""" SELECT id, transcript FROM messages WHERE chat_file LIKE ? AND transcript IS NOT NULL AND transcript <> '' AND (translation IS NULL OR translation='') AND (language IS NULL OR language != 'de') """, (TARGET_CHAT,)) audio_trans_rows = cur.fetchall() if audio_trans_rows: total_batches = (len(audio_trans_rows) + BATCH_SIZE - 1) // BATCH_SIZE for batch_idx, start in enumerate(range(0, len(audio_trans_rows), BATCH_SIZE), 1): batch = audio_trans_rows[start:start+BATCH_SIZE] items = [(r["id"], r["transcript"]) for r in batch] translations = {} for attempt in range(3): try: translations = translate_batch(items) break except Exception as e: print(f" [WARN] Qwen Batch fehlgeschlagen (Versuch {attempt+1}/3): {e}") if attempt < 2: time.sleep(2) else: print(" [FEHLER] Batch nach 3 Versuchen übersprungen.") for msg_id, german in translations.items(): cur.execute("UPDATE messages SET translation=? WHERE id=?", (german, msg_id)) conn.commit() print(f" [{batch_idx}/{total_batches}] Audio-Batch übersetzt.") # --- 5. Report generieren --- print("[5] Erzeuge PDF-Bericht...") build_pdf(TARGET_CHAT) print(" ✅ Bericht erstellt.") # ============================================================ # QWEN BATCH ÜBERSETZER # ============================================================ def translate_batch(items): if not items: return {} prompt = [f"###ID:{item_id}###\n{text}" for item_id, text in items] prompt = "\n\n".join(prompt) response = client.chat.completions.create( model=LMSTUDIO_MODEL, temperature=0, messages=[ {"role":"system", "content":"Übersetze jeden Text ins Deutsche. Behalte die Marker exakt bei. Format: ###ID:123###\nÜbersetzung\n\nKeine Kommentare. Keine Erklärungen. Nur die Übersetzungen."}, {"role":"user", "content":prompt} ] ) result = response.choices[0].message.content if "###ID:" not in result: print(" [WARNUNG] Qwen hat kein ###ID: Format zurückgegeben. Batch übersprungen.") return {} translations = {} parts = re.split(r"###ID:(\d+)###", result) for i in range(1, len(parts), 2): try: translations[int(parts[i])] = parts[i + 1].strip() except: pass return translations # ============================================================ # PDF REPORT (mit Dateipfad-Anzeige) # ============================================================ def build_pdf(chat_file): cur.execute("SELECT * FROM messages WHERE chat_file LIKE ? ORDER BY id", (chat_file,)) rows = cur.fetchall() rows = sorted(rows, key=lambda r: parse_timestamp(r["timestamp"])) safe_name = os.path.basename(chat_file).replace('/', '_').replace('\\', '_') html_path = os.path.join(REPORT_DIR, f"report_{safe_name}.html") pdf_path = os.path.join(REPORT_DIR, f"report_{safe_name}.pdf") # Erstelle HTML-Inhalt html_content = f""" Report: {html.escape(chat_file)}

Chronologischer Bericht: {html.escape(chat_file)}

""" for row in rows: html_content += "

" html_content += f"

{safe_html(row['timestamp'])}

" html_content += f"

{safe_html(row['sender'])}

" text = row["text"] or "" if text.strip(): lang = row['text_language'] or '?' html_content += f"

Nachricht Original ({lang})

{safe_html(text)}

" german = row["text_translation"] or "" if german.strip(): html_content += f"

Deutsch

{safe_html(german)}

" transcript = row["transcript"] or "" if transcript.strip(): lang_a = row['language'] or '?' html_content += f"

Audio Original ({lang_a})

{safe_html(transcript)}

" german_a = row["translation"] or "" if german_a.strip(): html_content += f"

Audio Deutsch

{safe_html(german_a)}

" attachment = row["attachment"] or "" if attachment: path = file_path_from_attachment(attachment) if exists(path): ext = os.path.splitext(path)[1].lower() media_tag = "" if ext in (".jpg",".jpeg",".png",".webp"): # Verwende absolute Pfade für bessere Kompatibilität abs_path = os.path.abspath(path) media_tag = f' Bild

Bild

' elif ext in (".mp4",".mov",".webm"): abs_path = os.path.abspath(path) media_tag = f'

' elif ext in (".opus",".mp3",".wav",".m4a",".ogg"): abs_path = os.path.abspath(path) media_tag = f'

' if media_tag: html_content += f"
{media_tag}" if SHOW_FILE_PATHS: html_content += f'

📁 {html.escape(path)}

' html_content += "

" html_content += "" # Speichere HTML-Datei with open(html_path, "w", encoding="utf-8") as f: f.write(html_content) # Konvertiere HTML zu PDF try: weasyprint.HTML(string=html_content).write_pdf(pdf_path) print(f" 📄 PDF-Bericht: {pdf_path}") except Exception as e: print(f" [FEHLER] PDF-Konvertierung fehlgeschlagen: {e}") print(" Versuche alternative Konvertierung...") # Fallback auf HTML-Datei print(f" 📄 HTML-Bericht (Fallback): {html_path}") # ============================================================ # MAIN # ============================================================ def main(): print("\n🚀 Start Fallbericht V3 (Testmodus: Einzelner Chat)") ensure_schema() run_single_chat_pipeline() conn.close() print("\n✅ Testlauf abgeschlossen.") if __name__ == "__main__": main()