#!/usr/bin/env python3
import base64
import io
import os
import pathlib
import re
import subprocess
import sys

import requests
from PIL import Image, ImageSequence
import gradio as gr

MODEL_DIR = pathlib.Path("model")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# Public mradermacher GGUF links (no tokens required)
PRIMARY_URL = "https://huggingface.co/mradermacher/llama-joycaption-beta-one-hf-llava-GGUF/resolve/main/llama-joycaption-q4_k_s.gguf"
FALLBACK_URL = "https://huggingface.co/mradermacher/llama-joycaption-beta-one-hf-llava-GGUF/resolve/main/llama-joycaption-q4_k_m.gguf"
PRIMARY_NAME = MODEL_DIR / "llama-joycaption-q4_k_s.gguf"
FALLBACK_NAME = MODEL_DIR / "llama-joycaption-q4_k_m.gguf"

# Generation parameters
MAX_TOKENS = 128
TEMPERATURE = 0.2
TOP_P = 0.95
STOP = ["\n"]


def download_file(url: str, dest: pathlib.Path, timeout: int = 120):
    """Stream a file to disk, printing a simple percentage progress line."""
    if dest.exists():
        return
    print("Downloading", url)
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        total = int(r.headers.get("content-length", 0) or 0)
        done = 0
        with open(dest, "wb") as f:
            for chunk in r.iter_content(8192):
                if not chunk:
                    continue
                f.write(chunk)
                done += len(chunk)
                if total:
                    pct = done * 100 // total
                    print(f"\r{dest.name}: {pct}% ", end="", flush=True)
    print()


def mp4_to_gif(mp4_bytes: bytes) -> bytes:
    """Convert an MP4 to a GIF by uploading it to ezgif.com and scraping the result page."""
    files = {"new-file": ("video.mp4", mp4_bytes, "video/mp4")}
    resp = requests.post("https://s.ezgif.com/video-to-gif",
                         files=files, data={"file": "video.mp4"}, timeout=120)
    resp.raise_for_status()
    m = (re.search(r'<img[^>]+src="([^"]+\.gif)"', resp.text)
         or re.search(r'src="([^"]+?/tmp/[^"]+\.gif)"', resp.text))
    if not m:
        raise RuntimeError("GIF URL not found")
    gif_url = m.group(1)
    if gif_url.startswith("//"):
        gif_url = "https:" + gif_url
    elif gif_url.startswith("/"):
        gif_url = "https://s.ezgif.com" + gif_url
    r2 = requests.get(gif_url, timeout=60)
    r2.raise_for_status()
    return r2.content


def load_first_frame(raw: bytes) -> Image.Image:
    """Open image bytes; for animated formats, keep only the first frame."""
    img = Image.open(io.BytesIO(raw))
    if getattr(img, "is_animated", False):
        img = next(ImageSequence.Iterator(img))
    if img.mode != "RGB":
        img = img.convert("RGB")
    return img


def rebuild_llama_cpp():
    """Reinstall llama-cpp-python from source (PIP_NO_BINARY forces a local build)."""
    env = os.environ.copy()
    env["PIP_NO_BINARY"] = "llama-cpp-python"
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip"], env=env)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade",
                           "cmake", "wheel", "setuptools"], env=env)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade",
                           "llama-cpp-python"], env=env)


_llama = None


def ensure_model():
    """Download a GGUF (primary, then fallback) and load it with llama-cpp-python.
    If both loads fail, rebuild llama-cpp-python from source and retry the primary."""
    global _llama
    if _llama is not None:
        return
    import importlib
    # Try the primary model first, then the fallback.
    for url, path in ((PRIMARY_URL, PRIMARY_NAME), (FALLBACK_URL, FALLBACK_NAME)):
        try:
            download_file(url, path)
            llama_cpp = importlib.import_module("llama_cpp")
            Llama = llama_cpp.Llama
            print("Loading", path)
            _llama = Llama(model_path=str(path), n_ctx=2048, n_gpu_layers=0, verbose=False)
            print("Loaded model:", path.name)
            return
        except Exception as e:
            print("Load failed for", path.name, ":", e)
    # Both loads failed: rebuild llama-cpp-python once.
    try:
        print("Rebuilding llama-cpp-python from source...")
        rebuild_llama_cpp()
    except Exception as e:
        raise RuntimeError("Rebuild failed: " + str(e))
    # Retry the primary model after the rebuild.
    try:
        download_file(PRIMARY_URL, PRIMARY_NAME)
        llama_cpp = importlib.reload(importlib.import_module("llama_cpp"))
        Llama = llama_cpp.Llama
        _llama = Llama(model_path=str(PRIMARY_NAME), n_ctx=2048, n_gpu_layers=0, verbose=False)
        print("Loaded after rebuild.")
        return
    except Exception as e:
        raise RuntimeError("Load after rebuild failed: " + str(e))
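

# --- Optional: true multimodal loading (sketch) ------------------------------
# The caption path below pastes base64 text into the prompt, which a LLaVA-style
# checkpoint cannot actually "see". llama-cpp-python supports real image
# conditioning via a CLIP projector (mmproj) plus a chat handler. This is a
# minimal sketch of that route; MMPROJ_URL and MMPROJ_NAME are illustrative,
# not verified filenames in the repo above.
MMPROJ_URL = "https://huggingface.co/mradermacher/llama-joycaption-beta-one-hf-llava-GGUF/resolve/main/mmproj-model.gguf"  # hypothetical
MMPROJ_NAME = MODEL_DIR / "mmproj-model.gguf"


def load_multimodal(model_path: pathlib.Path):
    """Sketch: load the GGUF with a LLaVA chat handler so images are encoded
    by the CLIP projector instead of being pasted into the text prompt."""
    from llama_cpp import Llama
    from llama_cpp.llama_chat_format import Llava15ChatHandler
    download_file(MMPROJ_URL, MMPROJ_NAME)  # assumes the mmproj file exists upstream
    handler = Llava15ChatHandler(clip_model_path=str(MMPROJ_NAME))
    return Llama(model_path=str(model_path), chat_handler=handler,
                 n_ctx=2048, n_gpu_layers=0, verbose=False)
    # Captioning would then go through create_chat_completion with an
    # image_url content part, e.g.:
    #   llm.create_chat_completion(messages=[{"role": "user", "content": [
    #       {"type": "image_url", "image_url": {"url": "data:image/png;base64," + b64}},
    #       {"type": "text", "text": "Describe the image."}]}])

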
failed: " + str(e)) def build_prompt(img_tag: str, user_prompt: str): # Minimal prompt: image placeholder and the user request return f"{img_tag}\n{user_prompt}\nAnswer:" def generate_caption_from_url(url: str, prompt: str="Describe the image."): if not url: return "No URL provided." try: r = requests.get(url, timeout=30); r.raise_for_status(); raw = r.content except Exception as e: return "Download error: " + str(e) try: lower = url.lower().split("?")[0] if lower.endswith(".mp4") or raw[:16].lower().find(b"ftyp") != -1: try: raw = mp4_to_gif(raw) except Exception as e: return "MP4→GIF conversion failed: " + str(e) img = load_first_frame(raw) except Exception as e: return "Image processing error: " + str(e) try: img = img.resize((512,512), resample=Image.BICUBIC) except Exception: pass # create a tiny base64 tag to signal image presence (model must understand this format) import base64 buf = io.BytesIO() img.save(buf, format="PNG") b64 = base64.b64encode(buf.getvalue()).decode() img_tag = b64 # minimal prompt_text = build_prompt(img_tag, prompt or "Describe the image.") try: ensure_model() # call llama-cpp model out = _llama(prompt_text, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stop=STOP) text = out.get("choices", [{}])[0].get("text", "") return text.strip() except Exception as e: return "Inference error: " + str(e) iface = gr.Interface( fn=generate_caption_from_url, inputs=[gr.Textbox(label="Image / GIF / MP4 URL"), gr.Textbox(label="Prompt", value="Describe the image.")], outputs=gr.Textbox(label="Generated caption"), title="JoyCaption (minimal GGUF, auto-rebuild)", description="No tokens required. Downloads a public GGUF and runs locally via llama-cpp." ) if __name__ == "__main__": iface.launch(server_name="0.0.0.0", server_port=7860)