import argparse
import math

import torch
from PIL import Image, ImageFilter, ImageOps
from transformers import CLIPModel, CLIPProcessor

# ============================================================
# PROMPTS
# ============================================================

REAL_LABELS_EN = [
    "a photo of a real scene in a physical environment",
    "a photo of a real indoor room",
    "a photo of a real outdoor scene",
    "a photo of real objects in a room",
    "a real photograph taken by a person in the real world",
]

RECAPTURE_LABELS_EN = [
    "a screenshot",
    "a photo that is entirely a computer screen",
    "a photo that is entirely a phone screen",
    "a photo of a printed photo",
    "a photo of printed paper",
    "a photo of a photo",
]

REAL_LABELS_DE = [
    "ein echtes foto in einer realen umgebung",
    "ein echtes foto in einem raum",
    "ein echtes foto draußen",
    "ein foto von echten objekten in einem raum",
    "ein echtes foto, das von einer person aufgenommen wurde",
]

RECAPTURE_LABELS_DE = [
    "ein screenshot",
    "ein foto, das vollständig einen computerbildschirm zeigt",
    "ein foto, das vollständig einen handybildschirm zeigt",
    "ein foto eines ausgedruckten fotos",
    "ein foto von bedrucktem papier",
    "ein foto von einem foto",
]

# Decision thresholds, compared against the highest single-label probability
# within the "real" and "recapture" prompt groups respectively.
REAL_THRESHOLD = 0.20
RECAPTURE_THRESHOLD = 0.40


# ============================================================
# UTILS
# ============================================================

def softmax(xs):
    """Return the softmax of *xs* as a list of floats.

    The maximum is subtracted before exponentiation so that large logits do
    not overflow ``math.exp``. Any iterable of numbers is accepted; an empty
    input yields an empty list instead of raising.
    """
    values = list(xs)
    if not values:
        return []
    peak = max(values)
    exps = [math.exp(v - peak) for v in values]
    total = sum(exps)
    return [e / total for e in exps]


def blur_score(image: Image.Image) -> float:
    """Return a simple sharpness measure based on edge strength.

    The image is converted to grayscale, run through Pillow's ``FIND_EDGES``
    filter, and the population variance of the resulting pixel values is
    returned. A low value means blurry, a higher value means sharper.

    Pillow's 3x3 kernel filters copy the outermost pixel ring through
    unfiltered, so that ring is cropped away before measuring; otherwise raw
    border brightness would inflate the score of a flat or blurry image.
    Images without any interior pixels (either side <= 2) score 0.0.
    """
    gray = image.convert("L")
    edges = gray.filter(ImageFilter.FIND_EDGES)

    width, height = edges.size
    if width <= 2 or height <= 2:
        # No pixel has a full 3x3 neighbourhood, so there is no edge response.
        return 0.0
    interior = edges.crop((1, 1, width - 1, height - 1))

    # Mode "L" yields a 256-bin histogram; computing the variance from it is
    # mathematically the same as iterating every pixel, but far cheaper.
    hist = interior.histogram()
    count = sum(hist)
    mean = sum(level * n for level, n in enumerate(hist)) / count
    return sum(n * (level - mean) ** 2 for level, n in enumerate(hist)) / count


def decide_verdict(
    best_real,
    best_recapture,
    real_threshold=REAL_THRESHOLD,
    recapture_threshold=RECAPTURE_THRESHOLD,
):
    """Map the best per-group probabilities to the verdict string.

    A sufficiently confident "real" label wins first; otherwise a confident
    "recapture" label gives a firm recapture verdict, and anything else is
    reported as a borderline recapture.
    """
    if best_real >= real_threshold:
        return "ECHTES FOTO (reale Umgebung vorhanden)"
    if best_recapture >= recapture_threshold:
        return "RECAPTURE (keine reale Umgebung, nur Reproduktion)"
    return "RECAPTURE (Grenzfall, eher fake)"


# ============================================================
# MAIN
# ============================================================

def main():
    """Classify one image as blurry, a real photo, or a recapture.

    Parses the command line, rejects blurry images early (before the CLIP
    model is loaded), then scores the image against the real/recapture text
    prompts and prints the ranking, the aggregate scores and the verdict.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--image", "-i", required=True)
    ap.add_argument("--lang", choices=["en", "de"], default="en")
    ap.add_argument("--model", default="openai/clip-vit-large-patch14")
    ap.add_argument("--blur-threshold", type=float, default=80.0)
    args = ap.parse_args()

    if args.lang == "en":
        real_labels = REAL_LABELS_EN
        recapture_labels = RECAPTURE_LABELS_EN
    else:
        real_labels = REAL_LABELS_DE
        recapture_labels = RECAPTURE_LABELS_DE

    labels = real_labels + recapture_labels

    # Close the file handle deterministically; convert() produces a fully
    # loaded copy, so the image stays usable after the file is closed.
    with Image.open(args.image) as raw:
        image = ImageOps.exif_transpose(raw).convert("RGB")

    # ========================================================
    # BLUR CHECK FIRST
    # ========================================================

    blur = blur_score(image)
    print(f"\nBlur score: {blur:.2f}")

    if blur < args.blur_threshold:
        print("\nVerdict: UNSCHARF\n")
        return

    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = CLIPModel.from_pretrained(
        args.model,
        local_files_only=True
    ).to(device)

    processor = CLIPProcessor.from_pretrained(
        args.model,
        local_files_only=True,
        use_fast=False
    )

    inputs = processor(
        text=labels,
        images=image,
        return_tensors="pt",
        padding=True
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits_per_image[0].cpu().tolist()

    probs = softmax(logits)

    # `labels` is real_labels followed by recapture_labels, so the two groups
    # can be separated by position instead of by per-label membership tests.
    split = len(real_labels)
    real_probs = probs[:split]
    recapture_probs = probs[split:]

    real_score = sum(real_probs)
    recapture_score = sum(recapture_probs)
    best_real = max(real_probs, default=0.0)
    best_recapture = max(recapture_probs, default=0.0)

    ranked = sorted(zip(labels, probs), key=lambda pair: pair[1], reverse=True)

    print("\nTop predictions:")
    for label, prob in ranked[:5]:
        print(f"  {prob:6.3f}  {label}")

    print("\nScores:")
    print(f"  real_score      = {real_score:.3f}")
    print(f"  recapture_score = {recapture_score:.3f}")
    print(f"  best_real       = {best_real:.3f}")
    print(f"  best_recapture  = {best_recapture:.3f}")

    # ========================================================
    # DECISION
    # ========================================================

    verdict = decide_verdict(best_real, best_recapture)

    print(f"\nVerdict: {verdict}\n")


if __name__ == "__main__":
    main()