Inference getting stuck

#31

by alteam123456 - opened 4 days ago

Script:
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

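# ---- Settings ----
# NOTE: the definitions of model_path, image_path and task are not shown in this post;
# the script below uses all three.
# ------------------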

# ---- Image Preprocessing For Spotting ----

# Open the input file, force RGB mode, and record its original dimensions.
image = Image.open(image_path).convert("RGB")
orig_w, orig_h = image.size
spotting_upscale_threshold = 1500

# For the "spotting" task only: when both edges are below the threshold,
# resize the image to twice its original width and height.
if task == "spotting" and max(orig_w, orig_h) < spotting_upscale_threshold:
    process_w, process_h = orig_w * 2, orig_h * 2
    # Prefer Image.Resampling.LANCZOS; fall back to Image.LANCZOS when the
    # Resampling namespace is not available on the Image module.
    resample_filter = getattr(Image, "Resampling", Image).LANCZOS
    image = image.resize((process_w, process_h), resample_filter)

# ---- Absolute Guardrail: Hard Reset Massive Dimensions ----

# If either edge exceeds MAX_SIZE, shrink the image so it fits inside a
# MAX_SIZE x MAX_SIZE box. thumbnail() is called for its in-place effect on
# `image`; its return value is not used.
MAX_SIZE = 1344
if any(edge > MAX_SIZE for edge in image.size):
    # Prefer Image.Resampling.LANCZOS; fall back to Image.LANCZOS when the
    # Resampling namespace is not available on the Image module.
    resample_filter = getattr(Image, "Resampling", Image).LANCZOS
    image.thumbnail((MAX_SIZE, MAX_SIZE), resample_filter)

# ----------------------------------------------------------

# Define the limits as total pixel counts rather than as raw edge lengths

min_pixels = 14 * 14 * 28 * 28 # = 153,664 pixels, lower limit (not ~112,896 as previously noted)
max_pixels = 2048 * 28 * 28 if task == "spotting" else 1280 * 28 * 28 # upper limit: 1,605,632 (~1.6M) for spotting, otherwise 1,003,520 (~1M)

# ---------------------------

# -------- Inference --------

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Text prompt used for each supported task name.
PROMPTS = dict(
    ocr="OCR:",
    table="Table Recognition:",
    formula="Formula Recognition:",
    chart="Chart Recognition:",
    spotting="Spotting:",
    seal="Seal Recognition:",
)

print(f"Loading model into {DEVICE} memory...")

# Load the weights in bfloat16, move the model to DEVICE, and switch it to
# evaluation mode.
# NOTE(review): the report lists a Tesla T4. bfloat16 may not be natively
# accelerated on that GPU generation, which could make inference look stuck;
# consider trying torch.float16 there. Confirm before changing.
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    dtype=torch.bfloat16,
)
model = model.to(DEVICE).eval()

# The processor is loaded from the same path as the model.
processor = AutoProcessor.from_pretrained(model_path)

# A single user turn: the image first, then the text prompt selected by `task`.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image=image),
            dict(type="text", text=PROMPTS[task]),
        ],
    ),
]

print("Extracting structured prompt text from chat template...")

# Render the chat template to text only (tokenize=False), so that the image
# arguments are supplied separately in the later processor call.
prompt_text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

print("Applying explicit image and text feature processing with root-level pixel limits...")

# Pass the pixel limits directly as keyword arguments of the processor call,
# together with the rendered prompt text and the image.
inputs = processor(
    text=[prompt_text],
    images=image,
    min_pixels=min_pixels,
    max_pixels=max_pixels,
    padding=True,
    return_tensors="pt",
)
# Move the prepared inputs to the device the model is on.
inputs = inputs.to(model.device)

print("Running model generation sequence...")
outputs = model.generate(**inputs, max_new_tokens=512)

print("\n--- OCR Target Output ---")
# Decode only the newly generated part of the first sequence: skip the
# prompt tokens at the front and drop the final token.
prompt_len = inputs["input_ids"].shape[-1]
generated_ids = outputs[0][prompt_len:-1]
result = processor.decode(generated_ids)
print(result)

# ---------------------------

Problem:
When I run this script, inference hangs at the following line:
File "/mnt/ppocr_env/lib/python3.10/site-packages/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py", line 720, in
torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)

Device Specs:
Tesla T4, Cuda 12.2
torch version: 2.5.1+cu121
transformers version: 5.9.0

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

· Sign up or log in to comment