Spaces:
Running on Zero
Running on Zero
remove dead code: AOTI export, inductor/triton cache, shared_results, deferred write
Browse files
README.md
CHANGED
|
@@ -44,7 +44,7 @@ Based on [CorridorKey](https://github.com/nikopueringer/CorridorKey) by Corridor
|
|
| 44 |
|
| 45 |
- **Full GPU pipeline**: preprocessing (resize + normalize) and postprocessing (despill, clean_matte, resize) stay on device — avoids CPU↔GPU round-trips per batch
|
| 46 |
- **TF32 tensor cores**: `torch.set_float32_matmul_precision('high')` for FP32 postprocessing ops
|
| 47 |
-
- **torch.
|
| 48 |
|
| 49 |
**Pipeline timing** (89 frames, batch 32 @ 1024px model res): CPU mask 22s → GPU load 5s → inference 29s → write 15s → stitch 9s ≈ 80s total, 49s GPU. Model always processes at 1024x1024 or 2048x2048 regardless of input resolution
|
| 50 |
|
|
|
|
| 44 |
|
| 45 |
- **Full GPU pipeline**: preprocessing (resize + normalize) and postprocessing (despill, clean_matte, resize) stay on device — avoids CPU↔GPU round-trips per batch
|
| 46 |
- **TF32 tensor cores**: `torch.set_float32_matmul_precision('high')` for FP32 postprocessing ops
|
| 47 |
+
- **AOTI compilation with torch.inductor + triton cudagraphs** (native CUDA kernels, fused ops, replays entire kernel sequence without CPU-GPU sync overhead) doesn't benefit GreenFormer: tested max-autotune (118s, 0 triton kernels) and reduce-overhead (36s compile + 48s graph recording = 84s for 5% speedup). Small feature maps (112-896ch) are cublas-optimal, not triton-friendly. Disabled on ZeroGPU — eager at 0.32s/frame beats 84s+ overhead. torch.compile still available for local GPU
|
| 48 |
|
| 49 |
**Pipeline timing** (89 frames, batch 32 @ 1024px model res): CPU mask 22s → GPU load 5s → inference 29s → write 15s → stitch 9s ≈ 80s total, 49s GPU. Model always processes at 1024x1024 or 2048x2048 regardless of input resolution
|
| 50 |
|
app.py
CHANGED
|
@@ -36,15 +36,6 @@ try:
|
|
| 36 |
except ImportError:
|
| 37 |
HAS_SPACES = False
|
| 38 |
|
| 39 |
-
# Probe for pre-compiled AOTI support (future: zero-cost triton loading)
|
| 40 |
-
HAS_AOTI = False
|
| 41 |
-
try:
|
| 42 |
-
from spaces.zero.torch.aoti import ZeroGPUCompiledModel
|
| 43 |
-
HAS_AOTI = True
|
| 44 |
-
logging.getLogger(__name__).info("spaces.zero.torch.aoti available — pre-compiled AOTI possible")
|
| 45 |
-
except (ImportError, AttributeError):
|
| 46 |
-
pass
|
| 47 |
-
|
| 48 |
# GPU perf: TF32 tensor cores for FP32 postprocessing ops
|
| 49 |
try:
|
| 50 |
import torch as _torch
|
|
@@ -52,10 +43,6 @@ try:
|
|
| 52 |
del _torch
|
| 53 |
except ImportError:
|
| 54 |
pass
|
| 55 |
-
# Persist compilation caches across ZeroGPU sessions
|
| 56 |
-
_inductor_cache = os.path.join(os.path.expanduser("~"), ".cache", "corridorkey", "inductor")
|
| 57 |
-
os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", _inductor_cache)
|
| 58 |
-
os.environ.setdefault("TRITON_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "corridorkey", "triton"))
|
| 59 |
|
| 60 |
# Workaround: Gradio cache_examples bug with None outputs.
|
| 61 |
_original_read_from_flag = gr.components.Component.read_from_flag
|
|
@@ -539,24 +526,6 @@ def _write_frame_fast(i, alpha, fg, w, h, bg_lin, comp_dir, matte_dir, fg_dir):
|
|
| 539 |
(np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8), _PNG_FAST)
|
| 540 |
|
| 541 |
|
| 542 |
-
def _write_frame_deferred(i, raw_path, w, h, bg_lin, fg_dir, processed_dir):
|
| 543 |
-
"""Deferred write: FG (JPEG) + Processed (RGBA PNG). Runs after GPU release."""
|
| 544 |
-
d = np.load(raw_path)
|
| 545 |
-
alpha, fg = d["alpha"], d["fg"]
|
| 546 |
-
if alpha.ndim == 2:
|
| 547 |
-
alpha = alpha[:, :, np.newaxis]
|
| 548 |
-
alpha_2d = alpha[:, :, 0]
|
| 549 |
-
cv2.imwrite(os.path.join(fg_dir, f"{i:05d}.jpg"),
|
| 550 |
-
(np.clip(fg, 0, 1) * 255).astype(np.uint8)[:, :, ::-1], _JPG_QUALITY)
|
| 551 |
-
fg_lin = srgb_to_linear(fg)
|
| 552 |
-
fg_premul = premultiply(fg_lin, alpha)
|
| 553 |
-
fg_premul_srgb = linear_to_srgb(fg_premul)
|
| 554 |
-
fg_u8 = (np.clip(fg_premul_srgb, 0, 1) * 255).astype(np.uint8)
|
| 555 |
-
a_u8 = (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8)
|
| 556 |
-
rgba = np.concatenate([fg_u8[:, :, ::-1], a_u8[:, :, np.newaxis]], axis=-1)
|
| 557 |
-
cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
|
| 558 |
-
os.remove(raw_path) # cleanup
|
| 559 |
-
|
| 560 |
|
| 561 |
def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir, processed_dir):
|
| 562 |
"""Full write: all 4 outputs. Used by CPU path."""
|
|
@@ -579,12 +548,6 @@ def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir
|
|
| 579 |
cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
|
| 580 |
|
| 581 |
|
| 582 |
-
# ---------------------------------------------------------------------------
|
| 583 |
-
# Shared storage: GPU function stores results here instead of returning them.
|
| 584 |
-
# This avoids ZeroGPU serializing gigabytes of numpy arrays on return.
|
| 585 |
-
# ---------------------------------------------------------------------------
|
| 586 |
-
_shared_results = {"data": None}
|
| 587 |
-
|
| 588 |
# ---------------------------------------------------------------------------
|
| 589 |
# Main pipeline
|
| 590 |
# ---------------------------------------------------------------------------
|
|
@@ -593,66 +556,6 @@ def _gpu_decorator(fn):
|
|
| 593 |
return spaces.GPU(duration=120)(fn)
|
| 594 |
return fn
|
| 595 |
|
| 596 |
-
def _export_decorator(fn):
|
| 597 |
-
if HAS_SPACES:
|
| 598 |
-
return spaces.GPU(duration=300)(fn)
|
| 599 |
-
return fn
|
| 600 |
-
|
| 601 |
-
@_export_decorator
|
| 602 |
-
def _export_compiled_model(resolution):
|
| 603 |
-
"""Compile model + export .pt2 artifact. Separate GPU call with 300s budget."""
|
| 604 |
-
import torch
|
| 605 |
-
img_size = int(resolution)
|
| 606 |
-
t0 = time.time()
|
| 607 |
-
model = get_pytorch_model(img_size)
|
| 608 |
-
logger.info("Export: model loaded in %.1fs", time.time() - t0)
|
| 609 |
-
|
| 610 |
-
pt2_path = f"/tmp/corridorkey_{img_size}.pt2"
|
| 611 |
-
cache_path = f"/tmp/corridorkey_{img_size}_cache.bin"
|
| 612 |
-
|
| 613 |
-
unwrapped = model._orig_mod if hasattr(model, '_orig_mod') else model
|
| 614 |
-
dummy = torch.zeros(1, 4, img_size, img_size, dtype=torch.float16, device="cuda")
|
| 615 |
-
|
| 616 |
-
# Try AOTInductor .pt2
|
| 617 |
-
try:
|
| 618 |
-
t1 = time.time()
|
| 619 |
-
exported = torch.export.export(unwrapped, (dummy,))
|
| 620 |
-
torch._inductor.aoti_compile_and_package(exported, package_path=pt2_path)
|
| 621 |
-
size_mb = os.path.getsize(pt2_path) / 1024 / 1024
|
| 622 |
-
logger.info("AOTInductor .pt2: %s (%.1f MB) in %.1fs", pt2_path, size_mb, time.time() - t1)
|
| 623 |
-
return pt2_path, f"Exported {pt2_path} ({size_mb:.1f} MB) in {time.time()-t0:.0f}s"
|
| 624 |
-
except Exception as e:
|
| 625 |
-
logger.info("AOTInductor failed: %s — trying mega-cache", e)
|
| 626 |
-
|
| 627 |
-
# Try mega-cache
|
| 628 |
-
try:
|
| 629 |
-
t1 = time.time()
|
| 630 |
-
artifact_bytes, cache_info = torch.compiler.save_cache_artifacts()
|
| 631 |
-
with open(cache_path, "wb") as f:
|
| 632 |
-
f.write(artifact_bytes)
|
| 633 |
-
size_mb = len(artifact_bytes) / 1024 / 1024
|
| 634 |
-
logger.info("Mega-cache: %s (%.1f MB) in %.1fs", cache_path, size_mb, time.time() - t1)
|
| 635 |
-
return cache_path, f"Mega-cache {cache_path} ({size_mb:.1f} MB) in {time.time()-t0:.0f}s"
|
| 636 |
-
except Exception as e:
|
| 637 |
-
logger.info("Mega-cache failed: %s", e)
|
| 638 |
-
|
| 639 |
-
# Try tarring the inductor+triton cache
|
| 640 |
-
try:
|
| 641 |
-
import tarfile
|
| 642 |
-
tar_path = f"/tmp/corridorkey_{img_size}_inductor_cache.tar.gz"
|
| 643 |
-
t1 = time.time()
|
| 644 |
-
with tarfile.open(tar_path, "w:gz") as tar:
|
| 645 |
-
if os.path.isdir(_inductor_cache):
|
| 646 |
-
tar.add(_inductor_cache, arcname="inductor")
|
| 647 |
-
triton_cache = os.environ.get("TRITON_CACHE_DIR", "")
|
| 648 |
-
if triton_cache and os.path.isdir(triton_cache):
|
| 649 |
-
tar.add(triton_cache, arcname="triton")
|
| 650 |
-
size_mb = os.path.getsize(tar_path) / 1024 / 1024
|
| 651 |
-
logger.info("Cache tar: %s (%.1f MB) in %.1fs", tar_path, size_mb, time.time() - t1)
|
| 652 |
-
return tar_path, f"Cache tar {tar_path} ({size_mb:.1f} MB) in {time.time()-t0:.0f}s"
|
| 653 |
-
except Exception as e:
|
| 654 |
-
return None, f"All export methods failed. Last error: {e}"
|
| 655 |
-
|
| 656 |
|
| 657 |
@_gpu_decorator
|
| 658 |
def _gpu_phase(video_path, resolution, despill_val, mask_mode,
|
|
@@ -1075,14 +978,6 @@ with gr.Blocks(title="CorridorKey") as demo:
|
|
| 1075 |
matte_video = gr.Video(label="Alpha Matte")
|
| 1076 |
download_zip = gr.File(label="Download Full Package (Comp + FG + Matte + Processed)")
|
| 1077 |
status_text = gr.Textbox(label="Status", interactive=False)
|
| 1078 |
-
with gr.Accordion("Compiled Model Artifacts", open=False):
|
| 1079 |
-
gr.Markdown("Export compiled .pt2 for pre-loaded speed. Takes ~2 min GPU time (separate from inference).")
|
| 1080 |
-
export_res = gr.Radio(choices=["1024", "2048"], value="1024", label="Export Resolution")
|
| 1081 |
-
export_btn = gr.Button("Export .pt2", variant="secondary", size="sm")
|
| 1082 |
-
export_status = gr.Textbox(label="Export Status", interactive=False)
|
| 1083 |
-
pt2_download = gr.File(label="Download .pt2", interactive=False)
|
| 1084 |
-
export_btn.click(fn=_export_compiled_model, inputs=[export_res],
|
| 1085 |
-
outputs=[pt2_download, export_status])
|
| 1086 |
|
| 1087 |
gr.Examples(
|
| 1088 |
examples=[
|
|
|
|
| 36 |
except ImportError:
|
| 37 |
HAS_SPACES = False
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
# GPU perf: TF32 tensor cores for FP32 postprocessing ops
|
| 40 |
try:
|
| 41 |
import torch as _torch
|
|
|
|
| 43 |
del _torch
|
| 44 |
except ImportError:
|
| 45 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# Workaround: Gradio cache_examples bug with None outputs.
|
| 48 |
_original_read_from_flag = gr.components.Component.read_from_flag
|
|
|
|
| 526 |
(np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8), _PNG_FAST)
|
| 527 |
|
| 528 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 529 |
|
| 530 |
def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir, processed_dir):
|
| 531 |
"""Full write: all 4 outputs. Used by CPU path."""
|
|
|
|
| 548 |
cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
|
| 549 |
|
| 550 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
# ---------------------------------------------------------------------------
|
| 552 |
# Main pipeline
|
| 553 |
# ---------------------------------------------------------------------------
|
|
|
|
| 556 |
return spaces.GPU(duration=120)(fn)
|
| 557 |
return fn
|
| 558 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
|
| 560 |
@_gpu_decorator
|
| 561 |
def _gpu_phase(video_path, resolution, despill_val, mask_mode,
|
|
|
|
| 978 |
matte_video = gr.Video(label="Alpha Matte")
|
| 979 |
download_zip = gr.File(label="Download Full Package (Comp + FG + Matte + Processed)")
|
| 980 |
status_text = gr.Textbox(label="Status", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 981 |
|
| 982 |
gr.Examples(
|
| 983 |
examples=[
|