unknown committed on
Commit
b83a586
·
1 Parent(s): 7273fb2
Files changed (1) hide show
  1. app.py +226 -220
app.py CHANGED
@@ -628,257 +628,263 @@
628
  # if __name__ == "__main__":
629
  # demo.launch()
630
 
631
- import re
632
- import tempfile
633
- from dataclasses import dataclass
634
- from typing import List, Dict
635
 
 
636
  import gradio as gr
637
- from huggingface_hub import list_repo_files, hf_hub_download
638
- from pydub import AudioSegment
639
-
640
-
641
- # =========================================================
642
- # 基本配置
643
- # =========================================================
644
- MEDIA_EXTS = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg", ".aac", ".mov", ".avi")
645
- VTT_EXTS = (".vtt",)
646
- DEFAULT_MAX_MID_DIFF = 1.5
647
-
648
-
649
- # =========================================================
650
- # 数据结构
651
- # =========================================================
652
- @dataclass
653
- class Cue:
654
- start: float
655
- end: float
656
- text: str
657
-
658
-
659
- # =========================================================
660
- # VTT 解析(只保留纯字幕)
661
- # =========================================================
662
- _TAG_RE = re.compile(r"</?[^>]+?>", re.IGNORECASE)
663
- _VTT_TIME_RE = re.compile(
664
- r"(?P<start>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})\s*-->\s*"
665
- r"(?P<end>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})"
666
- )
667
-
668
-
669
- def _strip_tags(text: str) -> str:
670
- return _TAG_RE.sub("", text).strip()
671
-
672
-
673
- def _time_to_seconds(t: str) -> float:
674
- parts = t.split(":")
675
- if len(parts) == 3:
676
- return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
677
- return int(parts[0]) * 60 + float(parts[1])
678
-
679
-
680
- def parse_vtt_file(path: str) -> List[Cue]:
681
- with open(path, "r", encoding="utf-8") as f:
682
- content = f.read()
683
-
684
- content = content.replace("\ufeff", "")
685
- content = re.sub(r"^\s*WEBVTT.*?\n", "", content, flags=re.IGNORECASE)
686
-
687
- blocks = re.split(r"\r?\n\r?\n", content.strip())
688
- cues: List[Cue] = []
689
-
690
- for block in blocks:
691
- lines = [l.strip() for l in block.splitlines() if l.strip()]
692
- if not lines:
693
- continue
694
-
695
- time_idx = None
696
- for i, line in enumerate(lines):
697
- if "-->" in line:
698
- time_idx = i
699
- break
700
- if time_idx is None:
701
- continue
702
-
703
- m = _VTT_TIME_RE.search(lines[time_idx])
704
- if not m:
705
- continue
706
-
707
- start = _time_to_seconds(m.group("start"))
708
- end = _time_to_seconds(m.group("end"))
709
- if end <= start:
710
- continue
711
-
712
- text_lines = lines[time_idx + 1 :]
713
- if not text_lines:
714
- continue
715
-
716
- text = _strip_tags("\n".join(text_lines))
717
- if text:
718
- cues.append(Cue(start=start, end=end, text=text))
719
-
720
- return sorted(cues, key=lambda x: x.start)
721
-
722
-
723
- # =========================================================
724
- # 字幕对齐(按时间中点)
725
- # =========================================================
726
- def align_by_time(a: List[Cue], b: List[Cue], th: float) -> List[Dict]:
727
- out, i, j, idx = [], 0, 0, 1
728
- while i < len(a) and j < len(b):
729
- ma = (a[i].start + a[i].end) / 2
730
- mb = (b[j].start + b[j].end) / 2
731
- if abs(ma - mb) <= th:
732
- out.append(
733
- {
734
- "idx": idx,
735
- "start": min(a[i].start, b[j].start),
736
- "end": max(a[i].end, b[j].end),
737
- "a_text": a[i].text,
738
- "b_text": b[j].text,
739
- }
740
- )
741
- idx += 1
742
- i += 1
743
- j += 1
744
- elif ma < mb:
745
- i += 1
746
- else:
747
- j += 1
748
- return out
749
-
750
-
751
- # =========================================================
752
- # 音频切片
753
- # =========================================================
754
- def export_segment(audio: AudioSegment, start: float, end: float) -> str:
755
- seg = audio[int(start * 1000) : int(end * 1000)]
756
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
757
- seg.export(tmp.name, format="wav")
758
- return tmp.name
759
-
760
-
761
- # =========================================================
762
- # Gradio 回调
763
- # =========================================================
764
- def scan_dataset(repo_id: str, repo_type: str):
765
- if not repo_id:
766
- raise gr.Error("请填写 Dataset / Repo 名称。")
767
-
768
- files = list_repo_files(repo_id, repo_type=repo_type)
769
- media_files = [f for f in files if f.lower().endswith(MEDIA_EXTS)]
770
- vtt_files = [f for f in files if f.lower().endswith(VTT_EXTS)]
771
-
772
- if not media_files or not vtt_files:
773
- raise gr.Error("Dataset 中未找到媒体文件或 VTT 文件。")
774
-
775
- return (
776
- gr.update(choices=media_files, value=media_files[0]),
777
- gr.update(choices=media_files, value=media_files[0]),
778
- gr.update(choices=vtt_files, value=vtt_files[0]),
779
- gr.update(choices=vtt_files, value=vtt_files[0]),
780
- )
781
 
782
 
783
- def load_and_align(repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th):
784
- audio_a = AudioSegment.from_file(hf_hub_download(repo_id, media_a, repo_type=repo_type))
785
- audio_b = AudioSegment.from_file(hf_hub_download(repo_id, media_b, repo_type=repo_type))
 
 
 
786
 
787
- cues_a = parse_vtt_file(hf_hub_download(repo_id, vtt_a, repo_type=repo_type))
788
- cues_b = parse_vtt_file(hf_hub_download(repo_id, vtt_b, repo_type=repo_type))
789
 
790
- aligned = align_by_time(cues_a, cues_b, th)
791
- if not aligned:
792
- raise gr.Error("未对齐到任何字幕片段。")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
793
 
794
  rows = [
795
- [x["idx"], f'{x["start"]:.2f}-{x["end"]:.2f}', x["a_text"], x["b_text"]]
796
- for x in aligned
 
 
 
 
797
  ]
798
 
 
 
 
 
 
 
 
 
799
  state = {
800
- "aligned": aligned,
801
- "audio_a": audio_a,
802
- "audio_b": audio_b,
 
803
  }
804
 
805
- return rows, state, None, None
806
-
807
-
808
- def play_on_select(evt: gr.SelectData, df_value, state):
809
- if not state:
810
- raise gr.Error("请先加载并对齐。")
811
-
812
- row = evt.index[0] if isinstance(evt.index, (tuple, list)) else evt.index
813
- seg_idx = int(df_value[row][0])
814
- seg = state["aligned"][seg_idx - 1]
815
-
816
- a_wav = export_segment(state["audio_a"], seg["start"], seg["end"])
817
- b_wav = export_segment(state["audio_b"], seg["start"], seg["end"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
 
819
- info = {
820
- "segment": seg_idx,
821
- "time": f'{seg["start"]:.2f} - {seg["end"]:.2f}',
822
- }
823
 
824
- return a_wav, b_wav, info
 
825
 
826
 
827
- # =========================================================
828
  # UI
829
- # =========================================================
830
- with gr.Blocks(title="双语音频字幕对齐(点击即播放)") as demo:
831
- gr.Markdown("# 🎧 双语音频字幕对齐(点击表格即播放)")
832
-
833
- state = gr.State()
834
-
835
- with gr.Row():
836
- repo_id = gr.Textbox(label="Dataset / Repo 名称", placeholder="org/dataset")
837
- repo_type = gr.Radio(["dataset", "model"], value="dataset", label="Repo 类型")
838
-
839
- btn_scan = gr.Button("扫描 Dataset", variant="primary")
840
 
841
- with gr.Row():
842
- media_a = gr.Dropdown(label="Track A 媒体")
843
- media_b = gr.Dropdown(label="Track B 媒体")
844
 
845
- with gr.Row():
846
- vtt_a = gr.Dropdown(label="Track A 字幕")
847
- vtt_b = gr.Dropdown(label="Track B 字幕")
848
 
849
- btn_scan.click(
850
- scan_dataset,
851
- inputs=[repo_id, repo_type],
852
- outputs=[media_a, media_b, vtt_a, vtt_b],
853
  )
854
 
855
- th = gr.Slider(0.3, 5.0, value=DEFAULT_MAX_MID_DIFF, step=0.1, label="对齐阈值(秒)")
856
- btn_align = gr.Button("加载并对齐", variant="primary")
857
 
858
  df = gr.Dataframe(
859
- headers=["#", "Time", "Track A", "Track B"],
860
- interactive=True,
 
 
 
861
  wrap=True,
862
- max_height=520,
863
- )
864
-
865
- btn_align.click(
866
- load_and_align,
867
- inputs=[repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th],
868
- outputs=[df, state, gr.Audio(), gr.Audio()],
869
  )
870
 
871
  with gr.Row():
872
- a_out = gr.Audio(label="Track A 片段")
873
- b_out = gr.Audio(label="Track B 片段")
874
 
875
- play_info = gr.JSON(label="当前片段")
 
 
 
 
 
 
876
 
877
  df.select(
878
- play_on_select,
879
- inputs=[df, state],
880
- outputs=[a_out, b_out, play_info],
881
  )
882
 
883
- if __name__ == "__main__":
884
- demo.launch()
 
628
  # if __name__ == "__main__":
629
  # demo.launch()
630
 
631
+ import json
632
+ import os
633
+ from typing import List, Dict, Any, Tuple
 
634
 
635
+ import numpy as np
636
  import gradio as gr
637
+
638
+ from huggingface_hub import hf_hub_download, list_repo_files
639
+
640
+ # Audio backends
641
+ import soundfile as sf
642
+ try:
643
+ from pydub import AudioSegment # fallback (ffmpeg)
644
+ except Exception:
645
+ AudioSegment = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
 
647
 
648
# =====================
# Fixed configuration (the data served by this Space)
# =====================
# Hub repository that holds both the recordings and the reference JSON files.
REPO_ID = "AlexTYJ/Multilingual-ASR-Benchmark"

# The audio tree and the reference-transcript tree share this sub-path.
_SUBSET = "testbatch/ARE"
AUDIO_DIR = "audio/" + _SUBSET
JSON_DIR = "text/ref/" + _SUBSET
654
 
 
 
655
 
656
+ # =====================
657
+ # 工具函数
658
+ # =====================
659
def list_are_audio_files() -> List[str]:
    """Return the sorted repo paths of the audio files stored under ``AUDIO_DIR``.

    Lists every file of ``REPO_ID`` on the Hub and keeps the ones that live
    inside the ``AUDIO_DIR`` directory and whose extension (compared
    case-insensitively) is ``.wav``, ``.mp3``, ``.flac`` or ``.ogg``.

    Returns:
        Repo-relative file paths in ascending lexicographic order.
    """
    # Match on a directory boundary. A bare ``startswith(AUDIO_DIR)`` would also
    # accept sibling directories that merely share the prefix
    # (e.g. "<AUDIO_DIR>_v2/...").
    prefix = AUDIO_DIR.rstrip("/") + "/" if AUDIO_DIR else ""
    audio_exts = (".wav", ".mp3", ".flac", ".ogg")

    # NOTE(review): no ``repo_type`` is passed, so the hub client's default
    # applies -- confirm that REPO_ID is not a dataset repository.
    return sorted(
        path
        for path in list_repo_files(REPO_ID)
        if path.startswith(prefix) and path.lower().endswith(audio_exts)
    )
667
+
668
+
669
def _load_audio_np(local_audio_path: str) -> Tuple[np.ndarray, int]:
    """Decode an audio file into a mono float32 array and its sample rate.

    ``soundfile`` is tried first. If anything in that path raises, the file is
    decoded with ``pydub`` instead and the integer samples are scaled by the
    full-scale value implied by the sample width.

    Args:
        local_audio_path: Path of the audio file on the local filesystem.

    Returns:
        ``(audio, sr)`` where ``audio`` is a 1-D ``float32`` array (multi-channel
        input is averaged across channels) and ``sr`` is the sample rate in Hz.

    Raises:
        RuntimeError: ``soundfile`` failed and ``pydub`` is not importable.
            The soundfile error is attached as ``__cause__``.
        Exception: whatever ``pydub`` raises when it cannot decode the file,
            with the soundfile error attached as ``__cause__``.
    """
    # 1) soundfile
    try:
        audio, sr = sf.read(local_audio_path, always_2d=False)
        if audio.ndim == 2:
            # (frames, channels) -> mono
            audio = audio.mean(axis=1)
        audio = np.asarray(audio, dtype=np.float32)
        return audio, int(sr)
    except Exception as exc:
        # Best-effort: fall through to pydub, but keep the reason so it is not
        # lost if the fallback fails as well.
        sf_error = exc

    # 2) pydub fallback
    if AudioSegment is None:
        raise RuntimeError(
            "soundfile 读取失败,且未安装/不可用 pydub。请确认 requirements.txt 包含 pydub 且 Space 有 ffmpeg。"
        ) from sf_error

    try:
        seg = AudioSegment.from_file(local_audio_path)
    except Exception as pydub_error:
        # Same exception type as before; only the cause chain is added.
        raise pydub_error from sf_error

    sr = int(seg.frame_rate)
    samples = np.array(seg.get_array_of_samples())

    if seg.channels > 1:
        # pydub returns interleaved samples: de-interleave, then average.
        samples = samples.reshape((-1, seg.channels)).mean(axis=1)

    # Normalize by sample width: full scale is 2 ** (bits - 1).
    max_val = float(1 << (8 * seg.sample_width - 1))
    audio = (samples / max_val).astype(np.float32)
    return audio, sr
699
+
700
+
701
def _as_seconds(value) -> float:
    """Convert a JSON time value to float seconds, treating ``null`` as 0.0."""
    return 0.0 if value is None else float(value)


def load_audio_and_json(repo_audio_path: str):
    """Download one recording and its reference JSON, and return both parsed.

    The JSON path is derived from the audio file name:
    ``<JSON_DIR>/<audio file name without extension>.json``.

    Args:
        repo_audio_path: Path of the audio file inside ``REPO_ID``.

    Returns:
        ``(audio, sr, segments, audio_name)``:
        ``audio``/``sr`` come from ``_load_audio_np``; ``segments`` is a list of
        dicts with the keys ``row_id``, ``start``, ``end``, ``dur``, ``status``,
        ``speaker``, ``gender``, ``age_group``, ``emotion`` and ``text``;
        ``audio_name`` is the JSON's ``audio_name`` field, or the audio file
        name when that field is absent.
    """
    # ---- derive the JSON path ----
    filename = os.path.basename(repo_audio_path)
    base, _ext = os.path.splitext(filename)
    repo_json_path = f"{JSON_DIR}/{base}.json"

    # ---- download ----
    local_audio = hf_hub_download(REPO_ID, repo_audio_path)
    local_json = hf_hub_download(REPO_ID, repo_json_path)

    # ---- read audio ----
    audio, sr = _load_audio_np(local_audio)

    # ---- read JSON ----
    with open(local_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    segments = []
    # ``or []`` also covers an explicit ``"segments": null``.
    for i, s in enumerate(data.get("segments") or []):
        # A key that is present but null must not crash float(); the ``text``
        # field below is already tolerant in the same way.
        start = _as_seconds(s.get("start"))
        end = _as_seconds(s.get("end"))
        segments.append({
            "row_id": s.get("index", i),
            "start": start,
            "end": end,
            "dur": max(0.0, end - start),
            "status": s.get("status", ""),
            "speaker": s.get("speaker", ""),
            "gender": s.get("gender", ""),
            "age_group": s.get("age_group", ""),
            "emotion": s.get("emotion", ""),
            "text": s.get("text", "") or "",
        })

    audio_name = data.get("audio_name", filename)
    return audio, sr, segments, audio_name
737
+
738
+
739
def slice_audio(audio: np.ndarray, sr: int, start: float, end: float) -> Tuple[int, np.ndarray]:
    """Cut the ``[start, end)`` time window (in seconds) out of ``audio``.

    Both bounds are converted to sample indices (rounded to the nearest
    sample) and clamped to ``[0, len(audio)]``.

    Args:
        audio: Sample array; slicing happens along its first axis.
        sr: Sample rate in Hz.
        start: Window start in seconds.
        end: Window end in seconds.

    Returns:
        ``(sr, segment)`` with ``segment`` as ``float32``. The segment is an
        empty 1-D array when the window is empty, inverted, or when either
        bound is NaN.
    """
    n = int(audio.shape[0])
    empty = np.zeros((0,), dtype=np.float32)

    def _to_sample(seconds: float):
        """Map a time to a clamped sample index, or None for NaN."""
        pos = float(seconds) * sr
        if np.isnan(pos):
            return None
        # Clamp before rounding so +/-inf cannot reach round(), which would
        # raise. For finite values this equals rounding first and clamping
        # afterwards, because 0 and n are fixed points of round().
        return int(round(min(max(pos, 0.0), float(n))))

    start_i = _to_sample(start)
    end_i = _to_sample(end)

    if start_i is None or end_i is None or end_i <= start_i:
        return sr, empty

    return sr, np.asarray(audio[start_i:end_i], dtype=np.float32)
753
+
754
+
755
+ # =====================
756
+ # Gradio 交互逻辑
757
+ # =====================
758
def on_select_file(repo_audio_path: str):
    """Load the chosen recording and produce the table, summary and state.

    Args:
        repo_audio_path: Repo path of the audio file picked in the dropdown.

    Returns:
        ``(state, table_update, info_update)``: the per-session state dict
        (``audio``, ``sr``, ``segments``, ``rows``), a ``gr.update`` carrying
        the table rows, and a ``gr.update`` carrying the Markdown summary.

    Raises:
        gr.Error: if no file is selected.
    """
    if not repo_audio_path:
        raise gr.Error("请选择音频文件。")

    audio, sr, segments, audio_name = load_audio_and_json(repo_audio_path)

    # Column order of the table; one row per segment.
    columns = (
        "row_id", "start", "end", "dur",
        "status", "speaker", "gender",
        "age_group", "emotion", "text",
    )
    rows = [[segment[key] for key in columns] for segment in segments]

    # Markdown summary; every line but the last ends with " \n".
    summary_lines = [
        f"**Repo**: `{REPO_ID}`",
        f"**Audio**: `{audio_name}`",
        f"**Segments**: {len(segments)}",
        f"**Sample rate**: {sr} Hz",
        f"**Duration**: {len(audio)/sr:.2f} s",
    ]
    info = " \n".join(summary_lines)

    # ``rows`` is kept in the state so the select handler can look a row up by
    # index instead of relying on the event payload.
    state = dict(audio=audio, sr=sr, segments=segments, rows=rows)

    return state, gr.update(value=rows), gr.update(value=info)
789
+
790
+
791
def _selected_row_index(evt):
    """Return the row number carried by a select event, or None if unavailable.

    ``evt.index`` is accepted both as a ``(row, col)`` sequence and as a bare
    row number.
    """
    index = getattr(evt, "index", None)
    if index is None:
        return None
    if isinstance(index, (tuple, list)):
        if not index:
            return None
        index = index[0]
    try:
        return int(index)
    except (TypeError, ValueError):
        return None


def on_select_segment(evt: gr.SelectData, state: Dict[str, Any]):
    """Play the segment of the clicked table row and show its metadata.

    The row is looked up in ``state["rows"]`` by the event's row index. When
    the event carries no usable index, ``evt.row_value`` is used instead.

    Row schema:
        [row_id, start, end, dur, status, speaker, gender, age_group, emotion, text]

    Args:
        evt: Select event emitted by the Dataframe.
        state: Session state containing ``audio``, ``sr`` and ``rows``.

    Returns:
        ``(audio_value, meta_update, text_update)``. ``audio_value`` is
        ``(sr, float32 array)`` for ``gr.Audio(type="numpy")``, or ``None``
        when the selected window contains no samples.

    Raises:
        gr.Error: if nothing is loaded yet, the row cannot be determined, or
            the row index is out of range.
    """
    if not state or "rows" not in state:
        raise gr.Error("请先加载音频。")

    row_idx = _selected_row_index(evt)

    if row_idx is None:
        # Fallback: try row_value.
        row_value = getattr(evt, "row_value", None)
        # A row shorter than the schema would fail below with a bare
        # IndexError, so reject it here with the same user-facing message.
        if row_value is None or len(row_value) < 10:
            raise gr.Error("无法从 Dataframe 选择事件中读取行数据。请升级/锁定 gradio 版本或使用本代码的 index 方案。")
        row = row_value
    else:
        rows = state["rows"]
        if row_idx < 0 or row_idx >= len(rows):
            raise gr.Error("选中行越界,请重新点击。")
        row = rows[row_idx]

    start, end = float(row[1]), float(row[2])
    sr, audio_seg = slice_audio(state["audio"], int(state["sr"]), start, end)

    meta = (
        f"### 选中片段\n"
        f"- **time**: `{start:.3f}s` → `{end:.3f}s` (dur≈{max(0.0, end-start):.3f}s)\n"
        f"- **status**: `{row[4]}`\n"
        f"- **speaker**: `{row[5]}`\n"
        f"- **gender**: `{row[6]}`\n"
        f"- **age_group**: `{row[7]}`\n"
        f"- **emotion**: `{row[8]}`\n"
    )

    text = row[9] if row[9] is not None else ""
    if not str(text).strip():
        text = "(empty)"

    # gr.Audio(type="numpy") expects (sr, np.ndarray). A zero-length array is
    # replaced by None (clears the player): Gradio peak-normalises float audio
    # and that step presumably cannot handle an empty array -- verify against
    # the pinned gradio version.
    audio_value = None
    if len(audio_seg) > 0:
        audio_value = (sr, np.asarray(audio_seg, dtype=np.float32))

    return audio_value, gr.update(value=meta), gr.update(value=str(text))
836
 
837
 
838
+ # =====================
839
  # UI
840
+ # =====================
841
# Build the UI: file picker -> segment table -> player and metadata.
with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
    gr.Markdown(
        "# ARE 音频 & 字幕可视化(Hugging Face Repo)\n"
        f"数据来源:`{REPO_ID}`"
    )

    # Per-session data filled by on_select_file and read by on_select_segment.
    state = gr.State(value={})

    # NOTE: this queries the Hub while the module is being loaded.
    audio_files = list_are_audio_files()

    audio_selector = gr.Dropdown(
        choices=audio_files,
        label="选择音频文件(ARE)",
        value=audio_files[0] if audio_files else None
    )

    load_btn = gr.Button("加载", variant="primary")
    info = gr.Markdown()

    # Read-only table; the column order must match the row schema used by
    # on_select_file / on_select_segment.
    df = gr.Dataframe(
        headers=[
            "row_id", "start", "end", "dur",
            "status", "speaker", "gender",
            "age_group", "emotion", "text"
        ],
        wrap=True,
        interactive=False,
        max_height=420,
    )

    with gr.Row():
        audio_out = gr.Audio(label="播放", type="numpy")
        meta = gr.Markdown()

    text = gr.Textbox(label="字幕文本", lines=4)

    load_btn.click(
        fn=on_select_file,
        inputs=[audio_selector],
        outputs=[state, df, info],
    )

    # Clicking a table cell plays the corresponding segment.
    df.select(
        fn=on_select_segment,
        inputs=[state],
        outputs=[audio_out, meta, text],
    )

# Start the server only when executed as a script, so importing this module
# (tests, reload tooling) does not block on a running server.
if __name__ == "__main__":
    demo.launch()
890
+