Spaces:

Luminia
/

CorridorKey

Running on Zero

App Files Files Community

Nekochu committed on May 2

Commit

2a4471f

1 Parent(s): f4a2965

remove dead code: AOTI export, inductor/triton cache, shared_results, deferred write

Browse files

Files changed (2) hide show

README.md +1 -1
app.py +0 -105

README.md CHANGED Viewed

@@ -44,7 +44,7 @@ Based on [CorridorKey](https://github.com/nikopueringer/CorridorKey) by Corridor
 - **Full GPU pipeline**: preprocessing (resize + normalize) and postprocessing (despill, clean_matte, resize) stay on device — avoids CPU↔GPU round-trips per batch
 - **TF32 tensor cores**: `torch.set_float32_matmul_precision('high')` for FP32 postprocessing ops
-- **torch.compile on local GPU only**: tested max-autotune (118s, 0 triton kernels) and reduce-overhead (36s compile + 48s CUDA graph recording = 84s overhead, 5% speedup). GreenFormer's small feature maps (112-896ch) are cublas-optimal, not triton-friendly. Disabled on ZeroGPU — eager mode at 0.32s/frame is better than 84s+ overhead
 **Pipeline timing** (89 frames, batch 32 @ 1024px model res): CPU mask 22s → GPU load 5s → inference 29s → write 15s → stitch 9s ≈ 80s total, 49s GPU. Model always processes at 1024x1024 or 2048x2048 regardless of input resolution

 - **Full GPU pipeline**: preprocessing (resize + normalize) and postprocessing (despill, clean_matte, resize) stay on device — avoids CPU↔GPU round-trips per batch
 - **TF32 tensor cores**: `torch.set_float32_matmul_precision('high')` for FP32 postprocessing ops
+- **AOTI compilation with torch.inductor + triton cudagraphs** (native CUDA kernels, fused ops, replaying the entire kernel sequence without CPU-GPU sync overhead) doesn't benefit GreenFormer: tested max-autotune (118s, 0 triton kernels) and reduce-overhead (36s compile + 48s graph recording = 84s for 5% speedup). Small feature maps (112-896ch) are cublas-optimal, not triton-friendly. Disabled on ZeroGPU — eager at 0.32s/frame beats 84s+ overhead. torch.compile is still available for local GPU
 **Pipeline timing** (89 frames, batch 32 @ 1024px model res): CPU mask 22s → GPU load 5s → inference 29s → write 15s → stitch 9s ≈ 80s total, 49s GPU. Model always processes at 1024x1024 or 2048x2048 regardless of input resolution

app.py CHANGED Viewed

@@ -36,15 +36,6 @@ try:
 except ImportError:
     HAS_SPACES = False
-# Probe for pre-compiled AOTI support (future: zero-cost triton loading)
-HAS_AOTI = False
-try:
-    from spaces.zero.torch.aoti import ZeroGPUCompiledModel
-    HAS_AOTI = True
-    logging.getLogger(__name__).info("spaces.zero.torch.aoti available — pre-compiled AOTI possible")
-except (ImportError, AttributeError):
-    pass
 # GPU perf: TF32 tensor cores for FP32 postprocessing ops
 try:
     import torch as _torch
@@ -52,10 +43,6 @@ try:
     del _torch
 except ImportError:
     pass
-# Persist compilation caches across ZeroGPU sessions
-_inductor_cache = os.path.join(os.path.expanduser("~"), ".cache", "corridorkey", "inductor")
-os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", _inductor_cache)
-os.environ.setdefault("TRITON_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "corridorkey", "triton"))
 # Workaround: Gradio cache_examples bug with None outputs.
 _original_read_from_flag = gr.components.Component.read_from_flag
@@ -539,24 +526,6 @@ def _write_frame_fast(i, alpha, fg, w, h, bg_lin, comp_dir, matte_dir, fg_dir):
                 (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8), _PNG_FAST)
-def _write_frame_deferred(i, raw_path, w, h, bg_lin, fg_dir, processed_dir):
-    """Deferred write: FG (JPEG) + Processed (RGBA PNG). Runs after GPU release."""
-    d = np.load(raw_path)
-    alpha, fg = d["alpha"], d["fg"]
-    if alpha.ndim == 2:
-        alpha = alpha[:, :, np.newaxis]
-    alpha_2d = alpha[:, :, 0]
-    cv2.imwrite(os.path.join(fg_dir, f"{i:05d}.jpg"),
-                (np.clip(fg, 0, 1) * 255).astype(np.uint8)[:, :, ::-1], _JPG_QUALITY)
-    fg_lin = srgb_to_linear(fg)
-    fg_premul = premultiply(fg_lin, alpha)
-    fg_premul_srgb = linear_to_srgb(fg_premul)
-    fg_u8 = (np.clip(fg_premul_srgb, 0, 1) * 255).astype(np.uint8)
-    a_u8 = (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8)
-    rgba = np.concatenate([fg_u8[:, :, ::-1], a_u8[:, :, np.newaxis]], axis=-1)
-    cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
-    os.remove(raw_path)  # cleanup
 def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir, processed_dir):
     """Full write: all 4 outputs. Used by CPU path."""
@@ -579,12 +548,6 @@ def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir
     cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
-# ---------------------------------------------------------------------------
-# Shared storage: GPU function stores results here instead of returning them.
-# This avoids ZeroGPU serializing gigabytes of numpy arrays on return.
-# ---------------------------------------------------------------------------
-_shared_results = {"data": None}
 # ---------------------------------------------------------------------------
 # Main pipeline
 # ---------------------------------------------------------------------------
@@ -593,66 +556,6 @@ def _gpu_decorator(fn):
         return spaces.GPU(duration=120)(fn)
     return fn
-def _export_decorator(fn):
-    if HAS_SPACES:
-        return spaces.GPU(duration=300)(fn)
-    return fn
-@_export_decorator
-def _export_compiled_model(resolution):
-    """Compile model + export .pt2 artifact. Separate GPU call with 300s budget."""
-    import torch
-    img_size = int(resolution)
-    t0 = time.time()
-    model = get_pytorch_model(img_size)
-    logger.info("Export: model loaded in %.1fs", time.time() - t0)
-    pt2_path = f"/tmp/corridorkey_{img_size}.pt2"
-    cache_path = f"/tmp/corridorkey_{img_size}_cache.bin"
-    unwrapped = model._orig_mod if hasattr(model, '_orig_mod') else model
-    dummy = torch.zeros(1, 4, img_size, img_size, dtype=torch.float16, device="cuda")
-    # Try AOTInductor .pt2
-    try:
-        t1 = time.time()
-        exported = torch.export.export(unwrapped, (dummy,))
-        torch._inductor.aoti_compile_and_package(exported, package_path=pt2_path)
-        size_mb = os.path.getsize(pt2_path) / 1024 / 1024
-        logger.info("AOTInductor .pt2: %s (%.1f MB) in %.1fs", pt2_path, size_mb, time.time() - t1)
-        return pt2_path, f"Exported {pt2_path} ({size_mb:.1f} MB) in {time.time()-t0:.0f}s"
-    except Exception as e:
-        logger.info("AOTInductor failed: %s — trying mega-cache", e)
-    # Try mega-cache
-    try:
-        t1 = time.time()
-        artifact_bytes, cache_info = torch.compiler.save_cache_artifacts()
-        with open(cache_path, "wb") as f:
-            f.write(artifact_bytes)
-        size_mb = len(artifact_bytes) / 1024 / 1024
-        logger.info("Mega-cache: %s (%.1f MB) in %.1fs", cache_path, size_mb, time.time() - t1)
-        return cache_path, f"Mega-cache {cache_path} ({size_mb:.1f} MB) in {time.time()-t0:.0f}s"
-    except Exception as e:
-        logger.info("Mega-cache failed: %s", e)
-    # Try tarring the inductor+triton cache
-    try:
-        import tarfile
-        tar_path = f"/tmp/corridorkey_{img_size}_inductor_cache.tar.gz"
-        t1 = time.time()
-        with tarfile.open(tar_path, "w:gz") as tar:
-            if os.path.isdir(_inductor_cache):
-                tar.add(_inductor_cache, arcname="inductor")
-            triton_cache = os.environ.get("TRITON_CACHE_DIR", "")
-            if triton_cache and os.path.isdir(triton_cache):
-                tar.add(triton_cache, arcname="triton")
-        size_mb = os.path.getsize(tar_path) / 1024 / 1024
-        logger.info("Cache tar: %s (%.1f MB) in %.1fs", tar_path, size_mb, time.time() - t1)
-        return tar_path, f"Cache tar {tar_path} ({size_mb:.1f} MB) in {time.time()-t0:.0f}s"
-    except Exception as e:
-        return None, f"All export methods failed. Last error: {e}"
 @_gpu_decorator
 def _gpu_phase(video_path, resolution, despill_val, mask_mode,
@@ -1075,14 +978,6 @@ with gr.Blocks(title="CorridorKey") as demo:
                 matte_video = gr.Video(label="Alpha Matte")
             download_zip = gr.File(label="Download Full Package (Comp + FG + Matte + Processed)")
             status_text = gr.Textbox(label="Status", interactive=False)
-            with gr.Accordion("Compiled Model Artifacts", open=False):
-                gr.Markdown("Export compiled .pt2 for pre-loaded speed. Takes ~2 min GPU time (separate from inference).")
-                export_res = gr.Radio(choices=["1024", "2048"], value="1024", label="Export Resolution")
-                export_btn = gr.Button("Export .pt2", variant="secondary", size="sm")
-                export_status = gr.Textbox(label="Export Status", interactive=False)
-                pt2_download = gr.File(label="Download .pt2", interactive=False)
-                export_btn.click(fn=_export_compiled_model, inputs=[export_res],
-                                 outputs=[pt2_download, export_status])
     gr.Examples(
         examples=[

 except ImportError:
     HAS_SPACES = False
 # GPU perf: TF32 tensor cores for FP32 postprocessing ops
 try:
     import torch as _torch
     del _torch
 except ImportError:
     pass
 # Workaround: Gradio cache_examples bug with None outputs.
 _original_read_from_flag = gr.components.Component.read_from_flag
                 (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8), _PNG_FAST)
 def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir, processed_dir):
     """Full write: all 4 outputs. Used by CPU path."""
     cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
 # ---------------------------------------------------------------------------
 # Main pipeline
 # ---------------------------------------------------------------------------
         return spaces.GPU(duration=120)(fn)
     return fn
 @_gpu_decorator
 def _gpu_phase(video_path, resolution, despill_val, mask_mode,
                 matte_video = gr.Video(label="Alpha Matte")
             download_zip = gr.File(label="Download Full Package (Comp + FG + Matte + Processed)")
             status_text = gr.Textbox(label="Status", interactive=False)
     gr.Examples(
         examples=[