Spaces:

ZTXRiley
/

AUDIO_Inspection

Sleeping

App Files Files Community

unknown committed on Jan 25

Commit

b83a586

1 Parent(s): 7273fb2

play4

Browse files

Files changed (1) hide show

app.py +226 -220

app.py CHANGED Viewed

@@ -628,257 +628,263 @@
 # if __name__ == "__main__":
 #     demo.launch()
-import re
-import tempfile
-from dataclasses import dataclass
-from typing import List, Dict
 import gradio as gr
-from huggingface_hub import list_repo_files, hf_hub_download
-from pydub import AudioSegment
-# =========================================================
-# 基本配置
-# =========================================================
-MEDIA_EXTS = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg", ".aac", ".mov", ".avi")
-VTT_EXTS = (".vtt",)
-DEFAULT_MAX_MID_DIFF = 1.5
-# =========================================================
-# 数据结构
-# =========================================================
-@dataclass
-class Cue:
-    start: float
-    end: float
-    text: str
-# =========================================================
-# VTT 解析（只保留纯字幕）
-# =========================================================
-_TAG_RE = re.compile(r"</?[^>]+?>", re.IGNORECASE)
-_VTT_TIME_RE = re.compile(
-    r"(?P<start>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})\s*-->\s*"
-    r"(?P<end>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})"
-)
-def _strip_tags(text: str) -> str:
-    return _TAG_RE.sub("", text).strip()
-def _time_to_seconds(t: str) -> float:
-    parts = t.split(":")
-    if len(parts) == 3:
-        return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
-    return int(parts[0]) * 60 + float(parts[1])
-def parse_vtt_file(path: str) -> List[Cue]:
-    with open(path, "r", encoding="utf-8") as f:
-        content = f.read()
-    content = content.replace("\ufeff", "")
-    content = re.sub(r"^\s*WEBVTT.*?\n", "", content, flags=re.IGNORECASE)
-    blocks = re.split(r"\r?\n\r?\n", content.strip())
-    cues: List[Cue] = []
-    for block in blocks:
-        lines = [l.strip() for l in block.splitlines() if l.strip()]
-        if not lines:
-            continue
-        time_idx = None
-        for i, line in enumerate(lines):
-            if "-->" in line:
-                time_idx = i
-                break
-        if time_idx is None:
-            continue
-        m = _VTT_TIME_RE.search(lines[time_idx])
-        if not m:
-            continue
-        start = _time_to_seconds(m.group("start"))
-        end = _time_to_seconds(m.group("end"))
-        if end <= start:
-            continue
-        text_lines = lines[time_idx + 1 :]
-        if not text_lines:
-            continue
-        text = _strip_tags("\n".join(text_lines))
-        if text:
-            cues.append(Cue(start=start, end=end, text=text))
-    return sorted(cues, key=lambda x: x.start)
-# =========================================================
-# 字幕对齐（按时间中点）
-# =========================================================
-def align_by_time(a: List[Cue], b: List[Cue], th: float) -> List[Dict]:
-    out, i, j, idx = [], 0, 0, 1
-    while i < len(a) and j < len(b):
-        ma = (a[i].start + a[i].end) / 2
-        mb = (b[j].start + b[j].end) / 2
-        if abs(ma - mb) <= th:
-            out.append(
-                {
-                    "idx": idx,
-                    "start": min(a[i].start, b[j].start),
-                    "end": max(a[i].end, b[j].end),
-                    "a_text": a[i].text,
-                    "b_text": b[j].text,
-                }
-            )
-            idx += 1
-            i += 1
-            j += 1
-        elif ma < mb:
-            i += 1
-        else:
-            j += 1
-    return out
-# =========================================================
-# 音频切片
-# =========================================================
-def export_segment(audio: AudioSegment, start: float, end: float) -> str:
-    seg = audio[int(start * 1000) : int(end * 1000)]
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    seg.export(tmp.name, format="wav")
-    return tmp.name
-# =========================================================
-# Gradio 回调
-# =========================================================
-def scan_dataset(repo_id: str, repo_type: str):
-    if not repo_id:
-        raise gr.Error("请填写 Dataset / Repo 名称。")
-    files = list_repo_files(repo_id, repo_type=repo_type)
-    media_files = [f for f in files if f.lower().endswith(MEDIA_EXTS)]
-    vtt_files = [f for f in files if f.lower().endswith(VTT_EXTS)]
-    if not media_files or not vtt_files:
-        raise gr.Error("Dataset 中未找到媒体文件或 VTT 文件。")
-    return (
-        gr.update(choices=media_files, value=media_files[0]),
-        gr.update(choices=media_files, value=media_files[0]),
-        gr.update(choices=vtt_files, value=vtt_files[0]),
-        gr.update(choices=vtt_files, value=vtt_files[0]),
-    )
-def load_and_align(repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th):
-    audio_a = AudioSegment.from_file(hf_hub_download(repo_id, media_a, repo_type=repo_type))
-    audio_b = AudioSegment.from_file(hf_hub_download(repo_id, media_b, repo_type=repo_type))
-    cues_a = parse_vtt_file(hf_hub_download(repo_id, vtt_a, repo_type=repo_type))
-    cues_b = parse_vtt_file(hf_hub_download(repo_id, vtt_b, repo_type=repo_type))
-    aligned = align_by_time(cues_a, cues_b, th)
-    if not aligned:
-        raise gr.Error("未对齐到任何字幕片段。")
     rows = [
-        [x["idx"], f'{x["start"]:.2f}-{x["end"]:.2f}', x["a_text"], x["b_text"]]
-        for x in aligned
     ]
     state = {
-        "aligned": aligned,
-        "audio_a": audio_a,
-        "audio_b": audio_b,
     }
-    return rows, state, None, None
-def play_on_select(evt: gr.SelectData, df_value, state):
-    if not state:
-        raise gr.Error("请先加载并对齐。")
-    row = evt.index[0] if isinstance(evt.index, (tuple, list)) else evt.index
-    seg_idx = int(df_value[row][0])
-    seg = state["aligned"][seg_idx - 1]
-    a_wav = export_segment(state["audio_a"], seg["start"], seg["end"])
-    b_wav = export_segment(state["audio_b"], seg["start"], seg["end"])
-    info = {
-        "segment": seg_idx,
-        "time": f'{seg["start"]:.2f} - {seg["end"]:.2f}',
-    }
-    return a_wav, b_wav, info
-# =========================================================
 # UI
-# =========================================================
-with gr.Blocks(title="双语音频字幕对齐（点击即播放）") as demo:
-    gr.Markdown("# 🎧 双语音频字幕对齐（点击表格即播放）")
-    state = gr.State()
-    with gr.Row():
-        repo_id = gr.Textbox(label="Dataset / Repo 名称", placeholder="org/dataset")
-        repo_type = gr.Radio(["dataset", "model"], value="dataset", label="Repo 类型")
-    btn_scan = gr.Button("扫描 Dataset", variant="primary")
-    with gr.Row():
-        media_a = gr.Dropdown(label="Track A 媒体")
-        media_b = gr.Dropdown(label="Track B 媒体")
-    with gr.Row():
-        vtt_a = gr.Dropdown(label="Track A 字幕")
-        vtt_b = gr.Dropdown(label="Track B 字幕")
-    btn_scan.click(
-        scan_dataset,
-        inputs=[repo_id, repo_type],
-        outputs=[media_a, media_b, vtt_a, vtt_b],
     )
-    th = gr.Slider(0.3, 5.0, value=DEFAULT_MAX_MID_DIFF, step=0.1, label="对齐阈值（秒）")
-    btn_align = gr.Button("加载并对齐", variant="primary")
     df = gr.Dataframe(
-        headers=["#", "Time", "Track A", "Track B"],
-        interactive=True,
         wrap=True,
-        max_height=520,
-    )
-    btn_align.click(
-        load_and_align,
-        inputs=[repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th],
-        outputs=[df, state, gr.Audio(), gr.Audio()],
     )
     with gr.Row():
-        a_out = gr.Audio(label="Track A 片段")
-        b_out = gr.Audio(label="Track B 片段")
-    play_info = gr.JSON(label="当前片段")
     df.select(
-        play_on_select,
-        inputs=[df, state],
-        outputs=[a_out, b_out, play_info],
     )
-if __name__ == "__main__":
-    demo.launch()

 # if __name__ == "__main__":
 #     demo.launch()
+import json
+import os
+from typing import List, Dict, Any, Tuple
+import numpy as np
 import gradio as gr
+from huggingface_hub import hf_hub_download, list_repo_files
+# Audio backends
+import soundfile as sf
+try:
+    from pydub import AudioSegment  # fallback (ffmpeg)
+except Exception:
+    AudioSegment = None
+# =====================
+# 固定配置（你的数据）
+# =====================
+REPO_ID = "AlexTYJ/Multilingual-ASR-Benchmark"
+AUDIO_DIR = "audio/testbatch/ARE"
+JSON_DIR = "text/ref/testbatch/ARE"
+# =====================
+# 工具函数
+# =====================
+def list_are_audio_files() -> List[str]:
+    """List the audio files stored under ``AUDIO_DIR`` in ``REPO_ID``.
+
+    Returns:
+        Repo-relative paths, sorted, whose extension (compared
+        case-insensitively) is .wav, .mp3, .flac or .ogg.
+    """
+    # NOTE(review): no repo_type is passed, so the hub client's default repo
+    # type applies - confirm that matches how REPO_ID is hosted.
+    # Anchor on "<AUDIO_DIR>/" instead of the bare prefix so that a sibling
+    # directory whose name merely starts with AUDIO_DIR is not picked up.
+    dir_prefix = AUDIO_DIR.rstrip("/") + "/"
+    audio_exts = (".wav", ".mp3", ".flac", ".ogg")
+    return sorted(
+        path
+        for path in list_repo_files(REPO_ID)
+        if path.startswith(dir_prefix) and path.lower().endswith(audio_exts)
+    )
+def _load_audio_np(local_audio_path: str) -> Tuple[np.ndarray, int]:
+    """
+    Decode an audio file into a mono float32 array plus its sample rate.
+
+    soundfile is tried first; if it raises for any reason, pydub is used as
+    a fallback. Multi-channel audio is down-mixed by averaging the channels.
+
+    Args:
+        local_audio_path: Path of the audio file on the local filesystem.
+
+    Returns:
+        ``(audio, sample_rate)`` where ``audio`` is a 1-D float32 array.
+
+    Raises:
+        RuntimeError: soundfile failed and pydub is not importable. The
+            soundfile error is attached as ``__cause__``.
+    """
+    # 1) soundfile
+    try:
+        audio, sr = sf.read(local_audio_path, always_2d=False)
+        if audio.ndim == 2:
+            audio = audio.mean(axis=1)
+        audio = np.asarray(audio, dtype=np.float32)
+        return audio, int(sr)
+    except Exception as exc:
+        # Deliberate best-effort: any soundfile failure falls through to
+        # pydub. Keep the error so it is not lost if the fallback is
+        # unavailable too.
+        sf_error = exc
+    # 2) pydub fallback
+    if AudioSegment is None:
+        raise RuntimeError("soundfile 读取失败，且未安装/不可用 pydub。请确认 requirements.txt 包含 pydub 且 Space 有 ffmpeg。") from sf_error
+    seg = AudioSegment.from_file(local_audio_path)
+    sr = int(seg.frame_rate)
+    samples = np.array(seg.get_array_of_samples())
+    if seg.channels > 1:
+        # Samples are interleaved per frame: one row per frame, then average.
+        samples = samples.reshape((-1, seg.channels)).mean(axis=1)
+    # Scale integer PCM by 2**(bits - 1), derived from the sample width.
+    max_val = float(1 << (8 * seg.sample_width - 1))
+    audio = (samples / max_val).astype(np.float32)
+    return audio, sr
+def load_audio_and_json(repo_audio_path: str):
+    """Download one audio file and its reference JSON, then parse both.
+
+    The JSON path is derived from the audio file's base name:
+    ``<JSON_DIR>/<base>.json``.
+
+    Args:
+        repo_audio_path: Path of the audio file inside ``REPO_ID``.
+
+    Returns:
+        ``(audio, sr, segments, audio_name)``: the decoded mono audio, its
+        sample rate, one dict per entry of the JSON ``"segments"`` list, and
+        the JSON ``"audio_name"`` (the audio file name when absent).
+    """
+    # ---- derive the JSON path ----
+    filename = os.path.basename(repo_audio_path)
+    base, _ext = os.path.splitext(filename)
+    repo_json_path = f"{JSON_DIR}/{base}.json"
+    # ---- download ----
+    # NOTE(review): no repo_type is passed to the hub client - confirm the
+    # default matches how REPO_ID is hosted.
+    local_audio = hf_hub_download(REPO_ID, repo_audio_path)
+    local_json = hf_hub_download(REPO_ID, repo_json_path)
+    # ---- read audio ----
+    audio, sr = _load_audio_np(local_audio)
+    # ---- read JSON ----
+    with open(local_json, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    segments = []
+    # `or` fallbacks treat an explicit JSON null like a missing key, the
+    # same way "text" is handled below; float(None) would raise otherwise.
+    for i, s in enumerate(data.get("segments") or []):
+        start = float(s.get("start") or 0.0)
+        end = float(s.get("end") or 0.0)
+        segments.append({
+            # Fall back to the list position when the entry has no "index".
+            "row_id": s.get("index", i),
+            "start": start,
+            "end": end,
+            # Clamp so an inverted interval shows as zero, not negative.
+            "dur": max(0.0, end - start),
+            "status": s.get("status", ""),
+            "speaker": s.get("speaker", ""),
+            "gender": s.get("gender", ""),
+            "age_group": s.get("age_group", ""),
+            "emotion": s.get("emotion", ""),
+            "text": s.get("text", "") or "",
+        })
+    audio_name = data.get("audio_name", filename)
+    return audio, sr, segments, audio_name
+def slice_audio(audio: np.ndarray, sr: int, start: float, end: float) -> Tuple[int, np.ndarray]:
+    """Cut the samples between two timestamps out of ``audio``.
+
+    Args:
+        audio: Sample array, sliced along its first axis.
+        sr: Sample rate used to convert seconds to sample indices.
+        start: Segment start, in seconds.
+        end: Segment end, in seconds.
+
+    Returns:
+        ``(sr, samples)`` with ``samples`` as float32. Both bounds are
+        clamped to the array; an empty or inverted range yields a
+        zero-length array.
+    """
+    total = int(audio.shape[0])
+
+    def _to_index(seconds: float) -> int:
+        # Seconds -> nearest sample index, clamped into [0, total].
+        return max(0, min(total, int(round(float(seconds) * sr))))
+
+    lo, hi = _to_index(start), _to_index(end)
+    if hi > lo:
+        return sr, np.asarray(audio[lo:hi], dtype=np.float32)
+    return sr, np.zeros((0,), dtype=np.float32)
+# =====================
+# Gradio 交互逻辑
+# =====================
+def on_select_file(repo_audio_path: str):
+    """Load the chosen audio file and build the values shown for it.
+
+    Args:
+        repo_audio_path: Repo-relative path of the audio file to load.
+
+    Returns:
+        ``(state, rows_update, info_update)``: a state dict holding the
+        audio, sample rate, segments and table rows; an update carrying the
+        table rows; and an update carrying a Markdown summary.
+
+    Raises:
+        gr.Error: ``repo_audio_path`` is empty.
+    """
+    if not repo_audio_path:
+        raise gr.Error("请选择音频文件。")
+    audio, sr, segments, audio_name = load_audio_and_json(repo_audio_path)
+    # One table row per segment; the column order here is the row schema.
     rows = [
+        [
+            s["row_id"], s["start"], s["end"], s["dur"],
+            s["status"], s["speaker"], s["gender"],
+            s["age_group"], s["emotion"], s["text"]
+        ]
+        for s in segments
     ]
+    # Markdown summary; the two trailing spaces before each "\n" are Markdown
+    # hard line breaks.
+    info = (
+        f"**Repo**: `{REPO_ID}`  \n"
+        f"**Audio**: `{audio_name}`  \n"
+        f"**Segments**: {len(segments)}  \n"
+        f"**Sample rate**: {sr} Hz  \n"
+        f"**Duration**: {len(audio)/sr:.2f} s"
+    )
     state = {
+        "audio": audio,
+        "sr": sr,
+        "segments": segments,
+        "rows": rows,  # Key point: keep rows so a select event can fetch the row data reliably by index
     }
+    return state, gr.update(value=rows), gr.update(value=info)
+def on_select_segment(evt: gr.SelectData, state: Dict[str, Any]):
+    """Return the audio slice, metadata and text for a clicked table row.
+
+    Args:
+        evt: Selection event; ``evt.index[0]`` is read as the row index,
+            with ``evt.row_value`` as a fallback.
+        state: Dict with ``"rows"``, ``"audio"`` and ``"sr"`` entries.
+
+    Returns:
+        ``(audio_value, meta_update, text_update)``. ``audio_value`` is
+        ``(sr, float32 samples)``, or ``None`` when the slice is empty.
+
+    Raises:
+        gr.Error: nothing is loaded yet, the event carries no usable row, or
+            the row index is out of range.
+    """
+    if not state or "rows" not in state:
+        raise gr.Error("请先加载音频。")
+    # evt.row_value is not reliably supported across Gradio versions, so
+    # evt.index[0] is preferred.
+    # NOTE(review): this assumes the displayed row order matches
+    # state["rows"] - confirm the table cannot be re-sorted in the UI.
+    row_idx = None
+    if getattr(evt, "index", None) is not None:
+        try:
+            row_idx = int(evt.index[0])
+        except (TypeError, ValueError, IndexError, KeyError):
+            row_idx = None
+    if row_idx is None:
+        # Fallback: try row_value.
+        row_value = getattr(evt, "row_value", None)
+        if row_value is None:
+            raise gr.Error("无法从 Dataframe 选择事件中读取行数据。请升级/锁定 gradio 版本或使用本代码的 index 方案。")
+        row = row_value
+    else:
+        rows = state["rows"]
+        if row_idx < 0 or row_idx >= len(rows):
+            raise gr.Error("选中行越界，请重新点击。")
+        row = rows[row_idx]
+    # row schema:
+    # [row_id, start, end, dur, status, speaker, gender, age_group, emotion, text]
+    start, end = float(row[1]), float(row[2])
+    sr, audio_seg = slice_audio(state["audio"], int(state["sr"]), start, end)
+    meta = (
+        f"### 选中片段\n"
+        f"- **time**: `{start:.3f}s` → `{end:.3f}s` (dur≈{max(0.0, end-start):.3f}s)\n"
+        f"- **status**: `{row[4]}`\n"
+        f"- **speaker**: `{row[5]}`\n"
+        f"- **gender**: `{row[6]}`\n"
+        f"- **age_group**: `{row[7]}`\n"
+        f"- **emotion**: `{row[8]}`\n"
+    )
+    text = row[9] if row[9] is not None else ""
+    if not str(text).strip():
+        text = "(empty)"
+    # gr.Audio(type="numpy") expects (sr, np.ndarray).
+    # An empty or inverted segment gives a zero-length slice. Return None for
+    # the audio output in that case, while the metadata and text still update.
+    # NOTE(review): Gradio's float-to-wav conversion is expected to fail on a
+    # zero-length array - confirm against the pinned version.
+    audio_seg = np.asarray(audio_seg, dtype=np.float32)
+    audio_value = (sr, audio_seg) if audio_seg.size else None
+    return audio_value, gr.update(value=meta), gr.update(value=str(text))
+# =====================
 # UI
+# =====================
+# Build the page: file picker + load button, summary, segment table, and a
+# player/metadata/text area that is filled in when a table row is selected.
+with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
+    gr.Markdown(
+        "# ARE 音频 & 字幕可视化（Hugging Face Repo）\n"
+        f"数据来源：`{REPO_ID}`"
+    )
+    # State shared between callbacks; starts as an empty dict.
+    state = gr.State(value={})
+    # The repo listing is fetched here, while the layout is being built.
+    audio_files = list_are_audio_files()
+    audio_selector = gr.Dropdown(
+        choices=audio_files,
+        label="选择音频文件（ARE）",
+        value=audio_files[0] if audio_files else None
     )
+    load_btn = gr.Button("加载", variant="primary")
+    info = gr.Markdown()
+    # Column order matches the rows produced by on_select_file.
     df = gr.Dataframe(
+        headers=[
+            "row_id", "start", "end", "dur",
+            "status", "speaker", "gender",
+            "age_group", "emotion", "text"
+        ],
         wrap=True,
+        interactive=False,
+        max_height=420,
     )
     with gr.Row():
+        audio_out = gr.Audio(label="分段播放", type="numpy")
+        meta = gr.Markdown()
+    text = gr.Textbox(label="字幕文本", lines=4)
+    # "Load" button: fetch the selected file and fill state, table and summary.
+    load_btn.click(
+        fn=on_select_file,
+        inputs=[audio_selector],
+        outputs=[state, df, info],
+    )
+    # Selecting a table cell: play that row's segment and show its details.
     df.select(
+        fn=on_select_segment,
+        inputs=[state],
+        outputs=[audio_out, meta, text],
     )
+# Launched unconditionally (no __main__ guard), so importing this module
+# starts the app.
+demo.launch()