| | from datasets import load_dataset |
| | from transformers import pipeline |
| | import evaluate |
| | import numpy as np |
| | from tqdm import tqdm |
| |
|
# Stream the "clean" validation split of LibriSpeech and cap it at 100 samples
# so the evaluation run stays small.
ds = load_dataset(
    "openslr/librispeech_asr",
    "clean",
    split="validation",
    streaming=True,
).take(100)
| |
|
# Maps the display label used in the printed results to the Hugging Face
# checkpoint that is loaded for it.
model_name = {
    "whisper-tiny": "openai/whisper-tiny.en",
    # The label must name the checkpoint that is actually evaluated: this is
    # the *base* wav2vec2 model, so it must not be reported as "large".
    "wav2vec2-base-960h": "facebook/wav2vec2-base-960h",
    "distill-whisper-small": "distil-whisper/distil-small.en",
}
| |
|
def evaluate_model(ds, pipe, wer_metric):
    """Transcribe every sample in ``ds`` and score each transcription with WER.

    Args:
        ds: Iterable of samples. Each sample is read as
            ``sample["audio"]["array"]`` (the audio passed to ``pipe``) and
            ``sample["text"]`` (the reference transcript).
        pipe: Callable that takes the audio array and returns a mapping with a
            ``"text"`` entry holding the transcription.
        wer_metric: Object whose ``compute(predictions=..., references=...)``
            returns the score for the given prediction/reference lists.

    Returns:
        A ``(wer_scores, wer_results)`` tuple. ``wer_scores`` is the list of
        per-sample scores in iteration order; ``wer_results`` is a list of
        dicts with the keys ``"index"``, ``"transcription"``, ``"reference"``
        and ``"wer"`` for the same samples.
    """
    # Ask for a length only when the dataset can report one directly.
    # Counting via ``len(list(ds))`` would consume the whole dataset once just
    # for the progress-bar total: a streaming dataset would be fetched and
    # decoded twice, and a one-shot iterator would be left empty for the loop.
    try:
        total = len(ds)
    except TypeError:
        total = None  # tqdm then shows a running count instead of a bar

    # One translation table removes the punctuation marks , . ! ? in one pass.
    strip_punctuation = str.maketrans("", "", ",.!?")

    wer_scores = []
    wer_results = []
    for idx, sample in enumerate(tqdm(ds, desc="Evaluating", total=total)):
        # NOTE(review): only the raw array is forwarded, so any sampling rate
        # stored with the sample is ignored -- presumably the audio already
        # matches the rate the model expects; confirm.
        transcription = pipe(sample["audio"]["array"])["text"]

        # Upper-case both sides so the comparison is case-insensitive.
        hypothesis = transcription.translate(strip_punctuation).upper()
        reference = sample["text"].upper()

        wer = wer_metric.compute(predictions=[hypothesis], references=[reference])
        wer_scores.append(wer)
        wer_results.append({
            "index": idx,
            "transcription": hypothesis,
            "reference": reference,
            "wer": wer,
        })
    return wer_scores, wer_results
| |
|
| | |
wer_metric = evaluate.load("wer")

# ``ds`` is a streaming dataset, so every pass over it fetches and decodes the
# audio again. Materialize the samples once and reuse them for every model
# instead of re-streaming the same data per model.
samples = list(ds)

results = {}  # model label -> mean of the per-sample WER scores
model_wer_results = {}  # model label -> per-sample result dicts

for model, checkpoint in model_name.items():
    pipe = pipeline("automatic-speech-recognition", model=checkpoint)
    wer_scores, wer_results = evaluate_model(samples, pipe, wer_metric)
    # NOTE: this is the unweighted mean of per-utterance WERs, which is not the
    # same as a corpus-level WER (that one weights utterances by their length).
    results[model] = np.mean(wer_scores)
    model_wer_results[model] = wer_results

for model, score in results.items():
    print(f"Model: {model}, WER: {score}")