Spaces:
Sleeping
Sleeping
unknown committed on
Commit ·
b83a586
1
Parent(s): 7273fb2
play4
Browse files
app.py
CHANGED
|
@@ -628,257 +628,263 @@
|
|
| 628 |
# if __name__ == "__main__":
|
| 629 |
# demo.launch()
|
| 630 |
|
| 631 |
-
import
|
| 632 |
-
import
|
| 633 |
-
from
|
| 634 |
-
from typing import List, Dict
|
| 635 |
|
|
|
|
| 636 |
import gradio as gr
|
| 637 |
-
|
| 638 |
-
from
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
#
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
DEFAULT_MAX_MID_DIFF = 1.5
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
# =========================================================
|
| 650 |
-
# 数据结构
|
| 651 |
-
# =========================================================
|
| 652 |
-
@dataclass
class Cue:
    """One subtitle cue: a time span plus the text shown during it."""

    # Start of the cue; parse_vtt_file fills this with a value in seconds.
    start: float
    # End of the cue, in the same unit as ``start``.
    end: float
    # Cue text; parse_vtt_file stores it with markup tags already stripped.
    text: str
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
# =========================================================
|
| 660 |
-
# VTT 解析(只保留纯字幕)
|
| 661 |
-
# =========================================================
|
| 662 |
-
# Matches any markup tag such as <b>, </i> or <c.red>; used to strip tags
# from cue text.
_TAG_RE = re.compile(r"</?[^>]+?>", re.IGNORECASE)
# Matches a cue timing line "start --> end". Each timestamp is either
# HH:MM:SS.mmm or M:SS.mmm / MM:SS.mmm; the two sides are exposed as the
# named groups "start" and "end".
_VTT_TIME_RE = re.compile(
    r"(?P<start>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})\s*-->\s*"
    r"(?P<end>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})"
)
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
def _strip_tags(text: str) -> str:
    """Drop every tag matched by ``_TAG_RE`` and trim surrounding whitespace."""
    without_tags = _TAG_RE.sub("", text)
    return without_tags.strip()
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
def _time_to_seconds(t: str) -> float:
|
| 674 |
-
parts = t.split(":")
|
| 675 |
-
if len(parts) == 3:
|
| 676 |
-
return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
|
| 677 |
-
return int(parts[0]) * 60 + float(parts[1])
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
def parse_vtt_file(path: str) -> List[Cue]:
    """Read a UTF-8 WebVTT file and return its cues ordered by start time.

    Blocks are separated by blank lines. Within a block, the first line
    containing ``-->`` is taken as the timing line and every following
    non-empty line as cue text. A block is dropped when it has no timing
    line, when the timing line does not match ``_VTT_TIME_RE``, when the end
    is not after the start, or when no text remains after tag stripping.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw = handle.read()

    # Remove byte-order marks and the leading "WEBVTT ..." header line.
    raw = raw.replace("\ufeff", "")
    raw = re.sub(r"^\s*WEBVTT.*?\n", "", raw, flags=re.IGNORECASE)

    parsed: List[Cue] = []
    for chunk in re.split(r"\r?\n\r?\n", raw.strip()):
        rows = [
            stripped
            for stripped in (ln.strip() for ln in chunk.splitlines())
            if stripped
        ]

        # Position of the first timing line, or None for header/note blocks.
        timing_at = next(
            (pos for pos, row in enumerate(rows) if "-->" in row), None
        )
        if timing_at is None:
            continue

        match = _VTT_TIME_RE.search(rows[timing_at])
        if not match:
            continue

        begin = _time_to_seconds(match.group("start"))
        finish = _time_to_seconds(match.group("end"))
        if finish <= begin:
            continue

        payload = rows[timing_at + 1:]
        if not payload:
            continue

        caption = _strip_tags("\n".join(payload))
        if caption:
            parsed.append(Cue(start=begin, end=finish, text=caption))

    parsed.sort(key=lambda cue: cue.start)
    return parsed
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
# =========================================================
|
| 724 |
-
# 字幕对齐(按时间中点)
|
| 725 |
-
# =========================================================
|
| 726 |
-
def align_by_time(a: List[Cue], b: List[Cue], th: float) -> List[Dict]:
    """Pair cues from two tracks whose time midpoints lie within ``th``.

    Both lists are walked in step. When the midpoints of the current cues
    differ by at most ``th`` they are paired and both cursors advance;
    otherwise only the cursor whose midpoint is earlier advances. Each result
    dict carries a 1-based ``idx``, the union span (``start``/``end``) and
    the two texts (``a_text``/``b_text``).
    """
    pairs: List[Dict] = []
    pos_a = pos_b = 0

    while pos_a < len(a) and pos_b < len(b):
        cue_a, cue_b = a[pos_a], b[pos_b]
        mid_a = (cue_a.start + cue_a.end) / 2
        mid_b = (cue_b.start + cue_b.end) / 2

        within = abs(mid_a - mid_b) <= th
        if not within:
            # Skip whichever cue sits earlier on the timeline.
            if mid_a < mid_b:
                pos_a += 1
            else:
                pos_b += 1
            continue

        pairs.append(
            {
                "idx": len(pairs) + 1,
                "start": min(cue_a.start, cue_b.start),
                "end": max(cue_a.end, cue_b.end),
                "a_text": cue_a.text,
                "b_text": cue_b.text,
            }
        )
        pos_a += 1
        pos_b += 1

    return pairs
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
# =========================================================
|
| 752 |
-
# 音频切片
|
| 753 |
-
# =========================================================
|
| 754 |
-
def export_segment(audio: AudioSegment, start: float, end: float) -> str:
    """Write the ``[start, end)`` span of ``audio`` to a temporary WAV file.

    Args:
        audio: Source audio; it is sliced by millisecond offsets.
        start: Span start in seconds.
        end: Span end in seconds.

    Returns:
        Path of the newly created ``.wav`` file. The file is not deleted
        automatically (``delete=False``); removing it is up to the caller.
    """
    clip = audio[int(start * 1000): int(end * 1000)]

    # Only a unique path is needed from NamedTemporaryFile. Close its handle
    # right away so no descriptor leaks and the exporter can reopen the path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        out_path = tmp.name

    # export() hands back the file object it wrote to; close it as well.
    written = clip.export(out_path, format="wav")
    if written is not None:
        written.close()
    return out_path
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
# =========================================================
|
| 762 |
-
# Gradio 回调
|
| 763 |
-
# =========================================================
|
| 764 |
-
def scan_dataset(repo_id: str, repo_type: str):
    """List a Hub repo and fill the four track dropdowns.

    Files are split by extension into media files (``MEDIA_EXTS``) and
    subtitle files (``VTT_EXTS``). Returns four ``gr.update`` objects, in the
    order media A, media B, VTT A, VTT B, each preselecting the first match.

    Raises:
        gr.Error: If ``repo_id`` is empty, or if either category is empty.
    """
    if not repo_id:
        raise gr.Error("请填写 Dataset / Repo 名称。")

    media_paths, vtt_paths = [], []
    for path in list_repo_files(repo_id, repo_type=repo_type):
        lowered = path.lower()
        if lowered.endswith(MEDIA_EXTS):
            media_paths.append(path)
        if lowered.endswith(VTT_EXTS):
            vtt_paths.append(path)

    if not (media_paths and vtt_paths):
        raise gr.Error("Dataset 中未找到媒体文件或 VTT 文件。")

    def _dropdown(options):
        # Offer every option and preselect the first one.
        return gr.update(choices=options, value=options[0])

    return (
        _dropdown(media_paths),
        _dropdown(media_paths),
        _dropdown(vtt_paths),
        _dropdown(vtt_paths),
    )
|
| 781 |
|
| 782 |
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
|
|
|
|
|
|
|
|
|
| 786 |
|
| 787 |
-
cues_a = parse_vtt_file(hf_hub_download(repo_id, vtt_a, repo_type=repo_type))
|
| 788 |
-
cues_b = parse_vtt_file(hf_hub_download(repo_id, vtt_b, repo_type=repo_type))
|
| 789 |
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 793 |
|
| 794 |
rows = [
|
| 795 |
-
[
|
| 796 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 797 |
]
|
| 798 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 799 |
state = {
|
| 800 |
-
"
|
| 801 |
-
"
|
| 802 |
-
"
|
|
|
|
| 803 |
}
|
| 804 |
|
| 805 |
-
return
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
def
|
| 809 |
-
if not state:
|
| 810 |
-
raise gr.Error("请先加载
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
}
|
| 823 |
|
| 824 |
-
|
|
|
|
| 825 |
|
| 826 |
|
| 827 |
-
# =====================
|
| 828 |
# UI
|
| 829 |
-
# =====================
|
| 830 |
-
with gr.Blocks(title="
|
| 831 |
-
gr.Markdown(
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
with gr.Row():
|
| 836 |
-
repo_id = gr.Textbox(label="Dataset / Repo 名称", placeholder="org/dataset")
|
| 837 |
-
repo_type = gr.Radio(["dataset", "model"], value="dataset", label="Repo 类型")
|
| 838 |
-
|
| 839 |
-
btn_scan = gr.Button("扫描 Dataset", variant="primary")
|
| 840 |
|
| 841 |
-
|
| 842 |
-
media_a = gr.Dropdown(label="Track A 媒体")
|
| 843 |
-
media_b = gr.Dropdown(label="Track B 媒体")
|
| 844 |
|
| 845 |
-
|
| 846 |
-
vtt_a = gr.Dropdown(label="Track A 字幕")
|
| 847 |
-
vtt_b = gr.Dropdown(label="Track B 字幕")
|
| 848 |
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
)
|
| 854 |
|
| 855 |
-
|
| 856 |
-
|
| 857 |
|
| 858 |
df = gr.Dataframe(
|
| 859 |
-
headers=[
|
| 860 |
-
|
|
|
|
|
|
|
|
|
|
| 861 |
wrap=True,
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
btn_align.click(
|
| 866 |
-
load_and_align,
|
| 867 |
-
inputs=[repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th],
|
| 868 |
-
outputs=[df, state, gr.Audio(), gr.Audio()],
|
| 869 |
)
|
| 870 |
|
| 871 |
with gr.Row():
|
| 872 |
-
|
| 873 |
-
|
| 874 |
|
| 875 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 876 |
|
| 877 |
df.select(
|
| 878 |
-
|
| 879 |
-
inputs=[
|
| 880 |
-
outputs=[
|
| 881 |
)
|
| 882 |
|
| 883 |
-
|
| 884 |
-
|
|
|
|
| 628 |
# if __name__ == "__main__":
|
| 629 |
# demo.launch()
|
| 630 |
|
| 631 |
+
import json
|
| 632 |
+
import os
|
| 633 |
+
from typing import List, Dict, Any, Tuple
|
|
|
|
| 634 |
|
| 635 |
+
import numpy as np
|
| 636 |
import gradio as gr
|
| 637 |
+
|
| 638 |
+
from huggingface_hub import hf_hub_download, list_repo_files
|
| 639 |
+
|
| 640 |
+
# Audio backends
|
| 641 |
+
import soundfile as sf
|
| 642 |
+
try:
|
| 643 |
+
from pydub import AudioSegment # fallback (ffmpeg)
|
| 644 |
+
except Exception:
|
| 645 |
+
AudioSegment = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 646 |
|
| 647 |
|
| 648 |
+
# =====================
|
| 649 |
+
# 固定配置(你的数据)
|
| 650 |
+
# =====================
|
| 651 |
+
# Hub repository every file in this app is listed in and downloaded from.
REPO_ID = "AlexTYJ/Multilingual-ASR-Benchmark"
# Repo-relative directory whose audio files are offered in the selector.
AUDIO_DIR = "audio/testbatch/ARE"
# Repo-relative directory holding one "<audio basename>.json" per audio file.
JSON_DIR = "text/ref/testbatch/ARE"
|
| 654 |
|
|
|
|
|
|
|
| 655 |
|
| 656 |
+
# =====================
|
| 657 |
+
# 工具函数
|
| 658 |
+
# =====================
|
| 659 |
+
def list_are_audio_files() -> List[str]:
    """Return the sorted repo paths of all audio files under ``AUDIO_DIR``.

    A path qualifies when it lies inside the ``AUDIO_DIR`` directory of
    ``REPO_ID`` and ends, case-insensitively, in ``.wav``, ``.mp3``,
    ``.flac`` or ``.ogg``.

    Returns:
        Matching repo-relative paths in ascending order; empty if none match.
    """
    # Anchor the prefix on a directory boundary so a sibling directory such
    # as "<AUDIO_DIR>_old/" is not picked up by a plain startswith().
    prefix = AUDIO_DIR.rstrip("/") + "/"
    audio_exts = (".wav", ".mp3", ".flac", ".ogg")

    # NOTE(review): list_repo_files is called without repo_type, so the
    # library default applies - confirm REPO_ID resolves that way.
    return sorted(
        path
        for path in list_repo_files(REPO_ID)
        if path.startswith(prefix) and path.lower().endswith(audio_exts)
    )
|
| 667 |
+
|
| 668 |
+
|
| 669 |
+
def _load_audio_np(local_audio_path: str) -> Tuple[np.ndarray, int]:
    """Decode an audio file into a mono float32 array and its sample rate.

    soundfile is tried first. If it raises for any reason, pydub is used
    instead, provided it could be imported (module-level ``AudioSegment`` is
    not ``None``). Multi-channel audio is downmixed by averaging channels.
    In the pydub path, integer samples are scaled by ``2 ** (bits - 1)``.

    Args:
        local_audio_path: Path of the audio file on local disk.

    Returns:
        ``(audio, sample_rate)`` where ``audio`` is a 1-D float32 array.

    Raises:
        RuntimeError: soundfile failed and pydub is unavailable. The
            soundfile exception is attached as ``__cause__`` so the real
            reason is not lost.
    """
    # 1) soundfile
    sf_error = None
    try:
        audio, sr = sf.read(local_audio_path, always_2d=False)
        if audio.ndim == 2:
            audio = audio.mean(axis=1)
        return np.asarray(audio, dtype=np.float32), int(sr)
    except Exception as err:
        # Deliberately broad: any soundfile failure triggers the fallback.
        # Keep the error so it can be reported if the fallback is missing.
        sf_error = err

    # 2) pydub fallback
    if AudioSegment is None:
        raise RuntimeError(
            "soundfile 读取失败,且未安装/不可用 pydub。请确认 requirements.txt 包含 pydub 且 Space 有 ffmpeg。"
        ) from sf_error

    seg = AudioSegment.from_file(local_audio_path)
    sr = int(seg.frame_rate)
    samples = np.array(seg.get_array_of_samples())

    if seg.channels > 1:
        # Samples are interleaved per frame; average across channels.
        samples = samples.reshape((-1, seg.channels)).mean(axis=1)

    # Normalize by sample width (bytes per sample -> full-scale magnitude).
    max_val = float(1 << (8 * seg.sample_width - 1))
    audio = (samples / max_val).astype(np.float32)
    return audio, sr
|
| 699 |
+
|
| 700 |
+
|
| 701 |
+
def load_audio_and_json(repo_audio_path: str):
    """Download one audio file and its annotation JSON, then normalise both.

    The JSON path is derived from the audio file name:
    ``<JSON_DIR>/<audio basename without extension>.json``.

    Args:
        repo_audio_path: Repo-relative path of the audio file in ``REPO_ID``.

    Returns:
        ``(audio, sr, segments, audio_name)``. ``audio`` and ``sr`` come from
        ``_load_audio_np``. ``segments`` is a list of dicts with the keys
        ``row_id``, ``start``, ``end``, ``dur``, ``status``, ``speaker``,
        ``gender``, ``age_group``, ``emotion`` and ``text``. ``audio_name``
        is the JSON's ``audio_name`` or, failing that, the audio file name.
    """
    # ---- derive the JSON path ----
    filename = os.path.basename(repo_audio_path)
    base, _ext = os.path.splitext(filename)
    repo_json_path = f"{JSON_DIR}/{base}.json"

    # ---- download ----
    local_audio = hf_hub_download(REPO_ID, repo_audio_path)
    local_json = hf_hub_download(REPO_ID, repo_json_path)

    # ---- read audio ----
    audio, sr = _load_audio_np(local_audio)

    # ---- read JSON ----
    with open(local_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    segments = []
    # "or" fallbacks treat an explicit JSON null like a missing key;
    # previously a null start/end raised TypeError inside float().
    for i, s in enumerate(data.get("segments") or []):
        start = float(s.get("start") or 0.0)
        end = float(s.get("end") or 0.0)
        segments.append({
            "row_id": s.get("index", i),
            "start": start,
            "end": end,
            "dur": max(0.0, end - start),
            "status": s.get("status", ""),
            "speaker": s.get("speaker", ""),
            "gender": s.get("gender", ""),
            "age_group": s.get("age_group", ""),
            "emotion": s.get("emotion", ""),
            "text": s.get("text", "") or "",
        })

    audio_name = data.get("audio_name") or filename
    return audio, sr, segments, audio_name
|
| 737 |
+
|
| 738 |
+
|
| 739 |
+
def slice_audio(audio: np.ndarray, sr: int, start: float, end: float) -> Tuple[int, np.ndarray]:
    """Cut the ``[start, end)`` span, given in seconds, out of ``audio``.

    Bounds are converted to sample indices by rounding ``seconds * sr`` and
    are clamped to ``[0, len(audio)]``.

    Args:
        audio: Samples, indexed along axis 0.
        sr: Sample rate used to turn seconds into sample indices.
        start: Span start in seconds. ``-inf`` means "from the beginning".
        end: Span end in seconds. ``+inf`` means "to the end".

    Returns:
        ``(sr, segment)`` with ``segment`` as float32. The segment is an
        empty 1-D array when the span is empty, inverted, or when either
        bound is NaN.
    """
    n = int(audio.shape[0])
    empty = np.zeros((0,), dtype=np.float32)

    start_pos = float(start) * sr
    end_pos = float(end) * sr

    # NaN cannot be placed on the timeline; treat it as "nothing selected"
    # instead of letting round() raise ValueError.
    if np.isnan(start_pos) or np.isnan(end_pos):
        return sr, empty

    # Clamp in the float domain first so +/-inf never reaches round(), which
    # would raise OverflowError. For finite values this gives the same index
    # as rounding first and clamping afterwards.
    start_i = int(round(min(float(n), max(0.0, start_pos))))
    end_i = int(round(min(float(n), max(0.0, end_pos))))

    if end_i <= start_i:
        return sr, empty

    return sr, np.asarray(audio[start_i:end_i], dtype=np.float32)
|
| 753 |
+
|
| 754 |
+
|
| 755 |
+
# =====================
|
| 756 |
+
# Gradio 交互逻辑
|
| 757 |
+
# =====================
|
| 758 |
+
def on_select_file(repo_audio_path: str):
    """Load the chosen audio file and prepare table, summary and state.

    Returns ``(state, dataframe_update, info_update)``. ``state`` holds the
    decoded audio, its sample rate, the segment dicts, and the table rows so
    a later row click can be resolved by index.

    Raises:
        gr.Error: If no audio file is selected.
    """
    if not repo_audio_path:
        raise gr.Error("请选择音频文件。")

    audio, sr, segments, audio_name = load_audio_and_json(repo_audio_path)

    # Column order of the table; must match the Dataframe headers.
    columns = (
        "row_id", "start", "end", "dur",
        "status", "speaker", "gender",
        "age_group", "emotion", "text",
    )
    rows = [[seg[col] for col in columns] for seg in segments]

    summary_lines = [
        f"**Repo**: `{REPO_ID}`",
        f"**Audio**: `{audio_name}`",
        f"**Segments**: {len(segments)}",
        f"**Sample rate**: {sr} Hz",
        f"**Duration**: {len(audio)/sr:.2f} s",
    ]
    info = " \n".join(summary_lines)

    state = dict(audio=audio, sr=sr, segments=segments, rows=rows)

    return state, gr.update(value=rows), gr.update(value=info)
|
| 789 |
+
|
| 790 |
+
|
| 791 |
+
def on_select_segment(evt: gr.SelectData, state: Dict[str, Any]):
    """Play and describe the segment behind the clicked table row.

    The row is looked up in ``state["rows"]`` via ``evt.index[0]``. If no
    usable index is available, ``evt.row_value`` is used instead.

    Returns ``((sr, samples), meta_update, text_update)``.

    Raises:
        gr.Error: If nothing has been loaded yet, the row cannot be
            determined, or the index is outside the stored rows.
    """
    if not state or "rows" not in state:
        raise gr.Error("请先加载音频。")

    # Prefer the positional index; row_value support varies across versions.
    selected = getattr(evt, "index", None)
    row_idx = None
    if selected is not None:
        try:
            row_idx = int(selected[0])
        except Exception:
            row_idx = None

    if row_idx is not None:
        table = state["rows"]
        if not 0 <= row_idx < len(table):
            raise gr.Error("选中行越界,请重新点击。")
        row = table[row_idx]
    else:
        # Fallback: take the row contents straight from the event.
        row = getattr(evt, "row_value", None)
        if row is None:
            raise gr.Error("无法从 Dataframe 选择事件中读取行数据。请升级/锁定 gradio 版本或使用本代码的 index 方案。")

    # Row layout:
    # [row_id, start, end, dur, status, speaker, gender, age_group, emotion, text]
    start = float(row[1])
    end = float(row[2])
    sr, audio_seg = slice_audio(state["audio"], int(state["sr"]), start, end)

    meta_lines = [
        f"### 选中片段",
        f"- **time**: `{start:.3f}s` → `{end:.3f}s` (dur≈{max(0.0, end-start):.3f}s)",
        f"- **status**: `{row[4]}`",
        f"- **speaker**: `{row[5]}`",
        f"- **gender**: `{row[6]}`",
        f"- **age_group**: `{row[7]}`",
        f"- **emotion**: `{row[8]}`",
    ]
    meta = "\n".join(meta_lines) + "\n"

    caption = "" if row[9] is None else row[9]
    if not str(caption).strip():
        caption = "(empty)"

    # gr.Audio(type="numpy") expects (sr, np.ndarray)
    samples = np.asarray(audio_seg, dtype=np.float32)
    return (sr, samples), gr.update(value=meta), gr.update(value=str(caption))
|
| 836 |
|
| 837 |
|
| 838 |
+
# =====================
|
| 839 |
# UI
|
| 840 |
+
# =====================
|
| 841 |
+
# Build the page: file selector -> segment table -> per-segment playback.
# NOTE(review): nesting was reconstructed from a flattened diff view; it is
# assumed that only audio_out and meta sit inside the Row - confirm.
with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
    gr.Markdown(
        "# ARE 音频 & 字幕可视化(Hugging Face Repo)\n"
        f"数据来源:`{REPO_ID}`"
    )

    # Per-session store filled by on_select_file and read by on_select_segment.
    state = gr.State(value={})

    # Listed once while the UI is being built, i.e. at app start-up.
    audio_files = list_are_audio_files()

    audio_selector = gr.Dropdown(
        choices=audio_files,
        label="选择音频文件(ARE)",
        value=audio_files[0] if audio_files else None
    )

    load_btn = gr.Button("加载", variant="primary")
    info = gr.Markdown()

    # Read-only segment table; column order matches the rows built in
    # on_select_file.
    df = gr.Dataframe(
        headers=[
            "row_id", "start", "end", "dur",
            "status", "speaker", "gender",
            "age_group", "emotion", "text"
        ],
        wrap=True,
        interactive=False,
        max_height=420,
    )

    with gr.Row():
        audio_out = gr.Audio(label="分段播放", type="numpy")
        meta = gr.Markdown()

    text = gr.Textbox(label="字幕文本", lines=4)

    # "Load" fetches the selected file and fills state, table and summary.
    load_btn.click(
        fn=on_select_file,
        inputs=[audio_selector],
        outputs=[state, df, info],
    )

    # Clicking a table cell plays that row's segment and shows its metadata.
    df.select(
        fn=on_select_segment,
        inputs=[state],
        outputs=[audio_out, meta, text],
    )

demo.launch()
|
| 890 |
+
|