
I'm trying to fine-tune whisper-medium for the Korean language.

Here is the tutorial that I followed.

And here is my experiment setup:

python==3.9.16
transformers==4.27.4
tokenizers==0.13.3
torch==2.0.0
torchaudio==2.0.0
torchmetrics==0.11.4
torchvision==0.15.0
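
A quick sanity check to confirm these are actually the versions installed (not part of the training code, just for verification):

import tokenizers
import torch
import torchaudio
import torchmetrics
import torchvision
import transformers

# print the installed version of each package listed above
for module in (transformers, tokenizers, torch, torchaudio, torchmetrics, torchvision):
    print(module.__name__, module.__version__)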

Here is the data preprocessing code:

from transformers import WhisperFeatureExtractor, WhisperTokenizer
from datasets import load_dataset, Audio

def prepare_dataset(batch, tokenizer, feature_extractor):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from the input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch

def main():
    dataset = load_dataset("Bingsu/zeroth-korean")
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

    feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium")
    tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="ko", task="transcribe")

    mapped_dataset = dataset.map(
        lambda x: prepare_dataset(x, tokenizer, feature_extractor),
        remove_columns=dataset.column_names["train"],
        num_proc=16,
    )
    mapped_dataset.save_to_disk('./data/Bingsu_zeroth-korean', num_proc=4)

if __name__ == '__main__':
    main()
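
The training script below also imports compute_metrics and DataCollatorSpeechSeq2SeqWithPadding from a local utils module that is not shown; it follows the data collator and WER metric from the tutorial, roughly like this sketch (details of my actual utils.py may differ):

import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # pad the log-Mel features to fixed-size tensors
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # pad the label sequences and replace padding with -100 so it is ignored by the loss
        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if a BOS token was prepended during tokenization, cut it here since it is added again later
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

def compute_metrics(pred, metric, tokenizer):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad token id so the labels can be decoded
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}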

And here is the training code:

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_from_disk
from utils import compute_metrics, DataCollatorSpeechSeq2SeqWithPadding
import os
import argparse
import evaluate

def define_argparser():
    """Function to define the command line arguments

    Returns:
        argparse.Namespace: Command line arguments
    """
    p = argparse.ArgumentParser()
    p.add_argument('--data_path', type=str, default='./data/Bingsu_zeroth-korean/')
    p.add_argument('--model_address', type=str, default='openai/whisper-medium')
    p.add_argument('--model_save_path', type=str, default='./models_zoo/openai_whisper-medium/')
    p.add_argument('--gradient_accumulation_steps', type=int, default=1)
    p.add_argument('--batch_size_per_device', type=int, default=24)
    # p.add_argument('--n_epochs', type=int, default=5)
    p.add_argument('--total_step', type=int, default=500)
    p.add_argument('--warmup_ratio', type=float, default=.2)
    p.add_argument('--max_length', type=int, default=225)
    config = p.parse_args()
    return config

def main(config):
    """Main function to train the language model

    Args:
        config (argparse.Namespace): Command line arguments
    """
    dataset = load_from_disk(config.data_path)

    processor = WhisperProcessor.from_pretrained(config.model_address, language="ko", task="transcribe")
    data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

    model = WhisperForConditionalGeneration.from_pretrained(config.model_address)
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []

    metric = evaluate.load("wer")

    print(
        '|train| =', len(dataset['train']),
        '|valid| =', len(dataset['test']),
    )

    # total_batch_size = config.batch_size_per_device * torch.cuda.device_count()
    # n_total_iterations = int(len(dataset['train']) / total_batch_size * config.n_epochs)
    n_total_iterations = config.total_step
    n_warmup_steps = int(n_total_iterations * config.warmup_ratio)

    print(
        '#total_iters =', n_total_iterations,
        '#warmup_iters =', n_warmup_steps,
    )

    training_args = Seq2SeqTrainingArguments(
        output_dir=os.path.join(config.model_save_path, 'checkpoints'),
        # num_train_epochs=config.n_epochs,
        max_steps=n_total_iterations,
        per_device_train_batch_size=config.batch_size_per_device,
        per_device_eval_batch_size=config.batch_size_per_device,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        warmup_steps=n_warmup_steps,
        fp16=True,
        learning_rate=5e-6,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        save_strategy='steps',
        report_to=["tensorboard"],
        logging_steps=25,
        save_steps=n_total_iterations // 5,
        eval_steps=n_total_iterations // 5,
        predict_with_generate=True,
        generation_max_length=config.max_length,
        load_best_model_at_end=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        dataloader_num_workers=16,
    )

    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        data_collator=data_collator,
        compute_metrics=lambda x: compute_metrics(x, metric, processor.tokenizer),
        tokenizer=processor.feature_extractor,
    )
    trainer.train()

    trainer.model.save_pretrained(os.path.join(config.model_save_path, 'model_weights'))
    # tokenizer.save_pretrained(os.path.join(config.model_save_path, 'tokenizer'))

if __name__ == '__main__':
    config = define_argparser()
    main(config)

During training, the training loss and evaluation loss go down, but the WER stays at 100.

And after training, the model always predicts <|startoftranscript|><|endoftext|> for all samples.
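
This is roughly how I check the predictions after training (a simplified sketch, assuming the default paths from the scripts above):

import torch
from datasets import load_from_disk
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# decode one validation sample with the fine-tuned model
processor = WhisperProcessor.from_pretrained("openai/whisper-medium", language="ko", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("./models_zoo/openai_whisper-medium/model_weights").eval()

dataset = load_from_disk("./data/Bingsu_zeroth-korean")
sample = dataset["test"][0]

input_features = torch.tensor(sample["input_features"]).unsqueeze(0)
with torch.no_grad():
    predicted_ids = model.generate(input_features)

# every sample I try prints ['<|startoftranscript|><|endoftext|>']
print(processor.batch_decode(predicted_ids, skip_special_tokens=False))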

How should I fix this?

asked Apr 20, 2023 at 2:26
  • Add the code below to the model config; it works for me: model.config.forced_decoder_ids = processor.get_decoder_prompt_ids() and model.config.suppress_tokens = [] – Commented Apr 25, 2023 at 0:42
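
Applied to the training script in the question, that suggestion would replace the two model.config lines with something like this (untested sketch):

model = WhisperForConditionalGeneration.from_pretrained(config.model_address)
# keep the forced decoder prompt so generation starts with the Korean transcribe tokens
# instead of emitting <|endoftext|> right after <|startoftranscript|>
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ko", task="transcribe")
model.config.suppress_tokens = []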

