Nekochu committed on
Commit
2a4471f
·
1 Parent(s): f4a2965

remove dead code: AOTI export, inductor/triton cache, shared_results, deferred write

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +0 -105
README.md CHANGED
@@ -44,7 +44,7 @@ Based on [CorridorKey](https://github.com/nikopueringer/CorridorKey) by Corridor
44
 
45
  - **Full GPU pipeline**: preprocessing (resize + normalize) and postprocessing (despill, clean_matte, resize) stay on device — avoids CPU↔GPU round-trips per batch
46
  - **TF32 tensor cores**: `torch.set_float32_matmul_precision('high')` for FP32 postprocessing ops
47
- - **torch.compile on local GPU only**: tested max-autotune (118s, 0 triton kernels) and reduce-overhead (36s compile + 48s CUDA graph recording = 84s overhead, 5% speedup). GreenFormer's small feature maps (112-896ch) are cublas-optimal, not triton-friendly. Disabled on ZeroGPU — eager mode at 0.32s/frame is better than 84s+ overhead
48
 
49
  **Pipeline timing** (89 frames, batch 32 @ 1024px model res): CPU mask 22s → GPU load 5s → inference 29s → write 15s → stitch 9s ≈ 80s total, 49s GPU. Model always processes at 1024x1024 or 2048x2048 regardless of input resolution
50
 
 
44
 
45
  - **Full GPU pipeline**: preprocessing (resize + normalize) and postprocessing (despill, clean_matte, resize) stay on device — avoids CPU↔GPU round-trips per batch
46
  - **TF32 tensor cores**: `torch.set_float32_matmul_precision('high')` for FP32 postprocessing ops
47
+ - **AOTI compilation with torch.inductor + triton cudagraphs** (native CUDA kernels, fused ops, replaying the entire kernel sequence without CPU-GPU sync overhead) doesn't benefit GreenFormer: tested max-autotune (118s, 0 triton kernels) and reduce-overhead (36s compile + 48s graph recording = 84s for 5% speedup). Small feature maps (112-896ch) are cublas-optimal, not triton-friendly. Disabled on ZeroGPU — eager at 0.32s/frame beats 84s+ overhead. torch.compile is still available for local GPU
48
 
49
  **Pipeline timing** (89 frames, batch 32 @ 1024px model res): CPU mask 22s → GPU load 5s → inference 29s → write 15s → stitch 9s ≈ 80s total, 49s GPU. Model always processes at 1024x1024 or 2048x2048 regardless of input resolution
50
 
app.py CHANGED
@@ -36,15 +36,6 @@ try:
36
  except ImportError:
37
  HAS_SPACES = False
38
 
39
- # Probe for pre-compiled AOTI support (future: zero-cost triton loading)
40
- HAS_AOTI = False
41
- try:
42
- from spaces.zero.torch.aoti import ZeroGPUCompiledModel
43
- HAS_AOTI = True
44
- logging.getLogger(__name__).info("spaces.zero.torch.aoti available — pre-compiled AOTI possible")
45
- except (ImportError, AttributeError):
46
- pass
47
-
48
  # GPU perf: TF32 tensor cores for FP32 postprocessing ops
49
  try:
50
  import torch as _torch
@@ -52,10 +43,6 @@ try:
52
  del _torch
53
  except ImportError:
54
  pass
55
- # Persist compilation caches across ZeroGPU sessions
56
- _inductor_cache = os.path.join(os.path.expanduser("~"), ".cache", "corridorkey", "inductor")
57
- os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", _inductor_cache)
58
- os.environ.setdefault("TRITON_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "corridorkey", "triton"))
59
 
60
  # Workaround: Gradio cache_examples bug with None outputs.
61
  _original_read_from_flag = gr.components.Component.read_from_flag
@@ -539,24 +526,6 @@ def _write_frame_fast(i, alpha, fg, w, h, bg_lin, comp_dir, matte_dir, fg_dir):
539
  (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8), _PNG_FAST)
540
 
541
 
542
- def _write_frame_deferred(i, raw_path, w, h, bg_lin, fg_dir, processed_dir):
543
- """Deferred write: FG (JPEG) + Processed (RGBA PNG). Runs after GPU release."""
544
- d = np.load(raw_path)
545
- alpha, fg = d["alpha"], d["fg"]
546
- if alpha.ndim == 2:
547
- alpha = alpha[:, :, np.newaxis]
548
- alpha_2d = alpha[:, :, 0]
549
- cv2.imwrite(os.path.join(fg_dir, f"{i:05d}.jpg"),
550
- (np.clip(fg, 0, 1) * 255).astype(np.uint8)[:, :, ::-1], _JPG_QUALITY)
551
- fg_lin = srgb_to_linear(fg)
552
- fg_premul = premultiply(fg_lin, alpha)
553
- fg_premul_srgb = linear_to_srgb(fg_premul)
554
- fg_u8 = (np.clip(fg_premul_srgb, 0, 1) * 255).astype(np.uint8)
555
- a_u8 = (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8)
556
- rgba = np.concatenate([fg_u8[:, :, ::-1], a_u8[:, :, np.newaxis]], axis=-1)
557
- cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
558
- os.remove(raw_path) # cleanup
559
-
560
 
561
  def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir, processed_dir):
562
  """Full write: all 4 outputs. Used by CPU path."""
@@ -579,12 +548,6 @@ def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir
579
  cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
580
 
581
 
582
- # ---------------------------------------------------------------------------
583
- # Shared storage: GPU function stores results here instead of returning them.
584
- # This avoids ZeroGPU serializing gigabytes of numpy arrays on return.
585
- # ---------------------------------------------------------------------------
586
- _shared_results = {"data": None}
587
-
588
  # ---------------------------------------------------------------------------
589
  # Main pipeline
590
  # ---------------------------------------------------------------------------
@@ -593,66 +556,6 @@ def _gpu_decorator(fn):
593
  return spaces.GPU(duration=120)(fn)
594
  return fn
595
 
596
- def _export_decorator(fn):
597
- if HAS_SPACES:
598
- return spaces.GPU(duration=300)(fn)
599
- return fn
600
-
601
- @_export_decorator
602
- def _export_compiled_model(resolution):
603
- """Compile model + export .pt2 artifact. Separate GPU call with 300s budget."""
604
- import torch
605
- img_size = int(resolution)
606
- t0 = time.time()
607
- model = get_pytorch_model(img_size)
608
- logger.info("Export: model loaded in %.1fs", time.time() - t0)
609
-
610
- pt2_path = f"/tmp/corridorkey_{img_size}.pt2"
611
- cache_path = f"/tmp/corridorkey_{img_size}_cache.bin"
612
-
613
- unwrapped = model._orig_mod if hasattr(model, '_orig_mod') else model
614
- dummy = torch.zeros(1, 4, img_size, img_size, dtype=torch.float16, device="cuda")
615
-
616
- # Try AOTInductor .pt2
617
- try:
618
- t1 = time.time()
619
- exported = torch.export.export(unwrapped, (dummy,))
620
- torch._inductor.aoti_compile_and_package(exported, package_path=pt2_path)
621
- size_mb = os.path.getsize(pt2_path) / 1024 / 1024
622
- logger.info("AOTInductor .pt2: %s (%.1f MB) in %.1fs", pt2_path, size_mb, time.time() - t1)
623
- return pt2_path, f"Exported {pt2_path} ({size_mb:.1f} MB) in {time.time()-t0:.0f}s"
624
- except Exception as e:
625
- logger.info("AOTInductor failed: %s — trying mega-cache", e)
626
-
627
- # Try mega-cache
628
- try:
629
- t1 = time.time()
630
- artifact_bytes, cache_info = torch.compiler.save_cache_artifacts()
631
- with open(cache_path, "wb") as f:
632
- f.write(artifact_bytes)
633
- size_mb = len(artifact_bytes) / 1024 / 1024
634
- logger.info("Mega-cache: %s (%.1f MB) in %.1fs", cache_path, size_mb, time.time() - t1)
635
- return cache_path, f"Mega-cache {cache_path} ({size_mb:.1f} MB) in {time.time()-t0:.0f}s"
636
- except Exception as e:
637
- logger.info("Mega-cache failed: %s", e)
638
-
639
- # Try tarring the inductor+triton cache
640
- try:
641
- import tarfile
642
- tar_path = f"/tmp/corridorkey_{img_size}_inductor_cache.tar.gz"
643
- t1 = time.time()
644
- with tarfile.open(tar_path, "w:gz") as tar:
645
- if os.path.isdir(_inductor_cache):
646
- tar.add(_inductor_cache, arcname="inductor")
647
- triton_cache = os.environ.get("TRITON_CACHE_DIR", "")
648
- if triton_cache and os.path.isdir(triton_cache):
649
- tar.add(triton_cache, arcname="triton")
650
- size_mb = os.path.getsize(tar_path) / 1024 / 1024
651
- logger.info("Cache tar: %s (%.1f MB) in %.1fs", tar_path, size_mb, time.time() - t1)
652
- return tar_path, f"Cache tar {tar_path} ({size_mb:.1f} MB) in {time.time()-t0:.0f}s"
653
- except Exception as e:
654
- return None, f"All export methods failed. Last error: {e}"
655
-
656
 
657
  @_gpu_decorator
658
  def _gpu_phase(video_path, resolution, despill_val, mask_mode,
@@ -1075,14 +978,6 @@ with gr.Blocks(title="CorridorKey") as demo:
1075
  matte_video = gr.Video(label="Alpha Matte")
1076
  download_zip = gr.File(label="Download Full Package (Comp + FG + Matte + Processed)")
1077
  status_text = gr.Textbox(label="Status", interactive=False)
1078
- with gr.Accordion("Compiled Model Artifacts", open=False):
1079
- gr.Markdown("Export compiled .pt2 for pre-loaded speed. Takes ~2 min GPU time (separate from inference).")
1080
- export_res = gr.Radio(choices=["1024", "2048"], value="1024", label="Export Resolution")
1081
- export_btn = gr.Button("Export .pt2", variant="secondary", size="sm")
1082
- export_status = gr.Textbox(label="Export Status", interactive=False)
1083
- pt2_download = gr.File(label="Download .pt2", interactive=False)
1084
- export_btn.click(fn=_export_compiled_model, inputs=[export_res],
1085
- outputs=[pt2_download, export_status])
1086
 
1087
  gr.Examples(
1088
  examples=[
 
36
  except ImportError:
37
  HAS_SPACES = False
38
 
 
 
 
 
 
 
 
 
 
39
  # GPU perf: TF32 tensor cores for FP32 postprocessing ops
40
  try:
41
  import torch as _torch
 
43
  del _torch
44
  except ImportError:
45
  pass
 
 
 
 
46
 
47
  # Workaround: Gradio cache_examples bug with None outputs.
48
  _original_read_from_flag = gr.components.Component.read_from_flag
 
526
  (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8), _PNG_FAST)
527
 
528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
 
530
  def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir, processed_dir):
531
  """Full write: all 4 outputs. Used by CPU path."""
 
548
  cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
549
 
550
 
 
 
 
 
 
 
551
  # ---------------------------------------------------------------------------
552
  # Main pipeline
553
  # ---------------------------------------------------------------------------
 
556
  return spaces.GPU(duration=120)(fn)
557
  return fn
558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
 
560
  @_gpu_decorator
561
  def _gpu_phase(video_path, resolution, despill_val, mask_mode,
 
978
  matte_video = gr.Video(label="Alpha Matte")
979
  download_zip = gr.File(label="Download Full Package (Comp + FG + Matte + Processed)")
980
  status_text = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
 
 
 
981
 
982
  gr.Examples(
983
  examples=[