@@ -3831,19 +3831,25 @@ struct ggml_cgraph * vocoder_graph(
38313831
38323832 cur = ggml_cpy (ctx0, cur, ggml_new_tensor (ctx0, GGML_TYPE_F32,4 ,cur->ne ));
38333833
3834- int strides[] = {8 ,4 ,4 };
3835- int paddings[] = {4 ,2 ,2 };
3834+ int strides[] = {8 ,8 ,4 };
3835+ int paddings[] = {4 ,4 ,2 };
38363836 int hop_sizes[] = {8 ,64 ,256 };
3837- 3837+ // const int kernel_size = 3;
3838+ // const int dilation = 1;
3839+ const int padding_length = 1 ;
38383840
38393841
38403842 struct ggml_tensor * conditioning;
38413843
38423844 // graph tether
38433845 // res blocks
3844- for (int i =0 ; i < 1 ; i++)
3846+ for (int i =0 ; i < 3 ; i++)
38453847 {
38463848
3849+ 3850+ 3851+ 3852+ 38473853 ggml_tensor * float_32_conv_transpose_1d_weight= ggml_cont (ctx0,ggml_cpy (ctx0, model.residual_stack [i].convolution_t_pre_weight , ggml_new_tensor (ctx0, GGML_TYPE_F32,4 ,model.residual_stack [i].convolution_t_pre_weight ->ne )));
38483854
38493855 cur = ggml_cont (ctx0,ggml_leaky_relu (ctx0, cur, 0.2 , false ));
@@ -3854,7 +3860,7 @@ struct ggml_cgraph * vocoder_graph(
38543860
38553861 cur = ggml_cont (ctx0,ggml_transpose (ctx0,ggml_add (ctx0, ggml_cont (ctx0, ggml_transpose (ctx0, cur)), model.residual_stack [i].convolution_t_pre_bias )));
38563862
3857-
3863+ 38583864
38593865 conditioning = ggml_cpy (ctx0, padded_mel, ggml_new_tensor (ctx0, GGML_TYPE_F32,4 ,padded_mel->ne ));
38603866
@@ -3962,21 +3968,26 @@ struct ggml_cgraph * vocoder_graph(
39623968
39633969 for (int c = 0 ; c < 4 ; c++)
39643970 {
3971+ 39653972 output = ggml_leaky_relu (ctx0, cur, 0.2 , false );
39663973
3974+ 39673975 float_16_conv_1d_weight= ggml_cpy (ctx0, model.residual_stack [i].conv_blocks [c].conv_block_1_weight , ggml_new_tensor (ctx0, GGML_TYPE_F16,4 ,model.residual_stack [i].conv_blocks [c].conv_block_1_weight ->ne ));
39683976
39693977 output = ggml_cont (ctx0,ggml_conv_1d (ctx0, float_16_conv_1d_weight, output, 1 ,conv_block_paddings[c],conv_block_dilations[c]));
39703978
39713979
39723980 output = ggml_cpy (ctx0, output, ggml_new_tensor (ctx0, GGML_TYPE_F32,4 ,output->ne ));
39733981
3982+ 39743983
39753984 output = ggml_cont (ctx0,ggml_transpose (ctx0,ggml_add (ctx0, ggml_cont (ctx0, ggml_transpose (ctx0, output)), model.residual_stack [i].conv_blocks [c].conv_block_1_bias )));
39763985
39773986
39783987 output = ggml_cpy (ctx0, output, ggml_new_tensor (ctx0, GGML_TYPE_F32,4 ,output->ne ));
39793988
3989+ 3990+ 39803991 output = ggml_leaky_relu (ctx0, output, 0.2 , false );
39813992
39823993 k = ggml_cont (ctx0,ggml_view_3d (ctx0, kernels, kernels->ne [0 ], 6144 , 1 , kernels->nb [1 ], kernels->nb [2 ], c * kernels->ne [0 ] * 6144 * sizeof (float )));
@@ -3987,7 +3998,13 @@ struct ggml_cgraph * vocoder_graph(
39873998
39883999 output = ggml_pad_ext (ctx0, output, 1 ,1 ,0 ,0 ,0 ,0 ,0 ,0 );
39894000
3990- output = ggml_unfold_1d (ctx0, output, 10 , 8 );
4001+ std::cout << " output shape" << std::endl;
4002+ std::cout << output->ne [0 ] << " ," << output->ne [1 ] << " ," << output->ne [2 ] << " ," << output->ne [3 ] << std::endl;
4003+ std::cout << hop_sizes[i] + 2 * padding_length << " ," << hop_sizes[i] << std::endl;
4004+ 4005+ output = ggml_unfold_1d (ctx0, output, hop_sizes[i] + 2 * padding_length, hop_sizes[i]);
4006+ 4007+ 39914008
39924009 output = ggml_unfold_1d (ctx0, output, 1 , 1 );
39934010
@@ -3996,49 +4013,85 @@ struct ggml_cgraph * vocoder_graph(
39964013
39974014 const int output_length = output->ne [2 ];
39984015
3999- output = ggml_reshape_3d (ctx0, output, 10 , 1 , output_length * 32 );
4016+ 4017+ 4018+ output = ggml_reshape_3d (ctx0, output, hop_sizes[i] + 2 * padding_length, 1 , output_length * 32 );
40004019
40014020 output = ggml_unfold_1d (ctx0, output, 3 ,1 );
40024021
4022+ 4023+ 4024+ 40034025
40044026 // o = torch.einsum('bildsk,biokl->bolsd', x, kernel)
40054027
40064028
4007- output = ggml_reshape_4d (ctx0, output, 3 , 8 , output_length, 32 );
4029+ output = ggml_reshape_4d (ctx0, output, 3 , hop_sizes[i] , output_length, 32 );
40084030
40094031 reshaped_kernel = ggml_reshape_4d (ctx0, k, output_length, 3 , 64 , 32 );
40104032
40114033 reshaped_kernel = ggml_cont (ctx0, ggml_permute (ctx0, reshaped_kernel, 2 ,0 ,1 ,3 ));
40124034
4035+ 4036+ 4037+ 40134038 output = ggml_mul_mat (ctx0, reshaped_kernel, output);
40144039
4015- output_accumulator = ggml_cont (ctx0,ggml_view_4d (ctx0, output, 64 , 8 , output_length,
4016- 1 , output->nb [1 ], output->nb [2 ], output->nb [3 ], 0 * output_length * 64 * 8 * sizeof (float ) ));
4040+ 4041+ 4042+ 4043+ output_accumulator = ggml_cont (ctx0,ggml_view_4d (ctx0, output, 64 , hop_sizes[i], output_length,
4044+ 1 , output->nb [1 ], output->nb [2 ], output->nb [3 ], 0 * output_length * 64 * hop_sizes[i] * sizeof (float ) ));
40174045 for (int j = 1 ; j < 32 ; j++ )
40184046 {
4019- output_accumulator = ggml_add (ctx0, output_accumulator, ggml_cont (ctx0,ggml_view_4d (ctx0, output, 64 , 8 , output_length,
4020- 1 , output->nb [1 ], output->nb [2 ], output->nb [3 ], j * output_length * 64 * 8 * sizeof (float ) )));
4047+ output_accumulator = ggml_add (ctx0, output_accumulator, ggml_cont (ctx0,ggml_view_4d (ctx0, output, 64 , hop_sizes[i] , output_length,
4048+ 1 , output->nb [1 ], output->nb [2 ], output->nb [3 ], j * output_length * 64 * hop_sizes[i] * sizeof (float ) )));
40214049 }
40224050
4051+ 4052+ 40234053
40244054 output = ggml_cont (ctx0,ggml_permute (ctx0, output_accumulator, 3 ,1 ,2 ,0 ));
40254055
4056+ 4057+ 40264058 output = ggml_add (ctx0, output, ggml_reshape_4d (ctx0, b, 1 ,1 , output_length, 64 ));
40274059
4028- output = ggml_reshape_3d (ctx0, output,1 , 8 *output_length, 64 );
4060+ output = ggml_reshape_3d (ctx0, output,1 , hop_sizes[i] *output_length, 64 );
40294061
4062+ /*
4063+ if (i >= 1)
4064+ {
4065+ cur = output;
4066+ goto end;
4067+ }*/
40304068
4031- output_half_1 = ggml_sigmoid (ctx0,ggml_cont (ctx0,ggml_view_3d (ctx0, output, 1 , 8 *output_length,32 , output->nb [1 ], output->nb [2 ], 0 * 1 * 8 * output_length * 32 * sizeof (float ) )));
4032- output_half_2 = ggml_tanh (ctx0,ggml_cont (ctx0,ggml_view_3d (ctx0, output, 1 , 8 *output_length,32 , output->nb [1 ], output->nb [2 ], 1 * 1 * 8 * output_length * 32 * sizeof (float ) )));
4069+ output_half_1 = ggml_sigmoid (ctx0,ggml_cont (ctx0,ggml_view_3d (ctx0, output, 1 , hop_sizes[i] *output_length,32 , output->nb [1 ], output->nb [2 ], 0 * 1 * hop_sizes[i] * output_length * 32 * sizeof (float ) )));
4070+ output_half_2 = ggml_tanh (ctx0,ggml_cont (ctx0,ggml_view_3d (ctx0, output, 1 , hop_sizes[i] *output_length,32 , output->nb [1 ], output->nb [2 ], 1 * 1 * hop_sizes[i] * output_length * 32 * sizeof (float ) )));
40334071
4034- cur = ggml_add (ctx0, cur, ggml_reshape_4d (ctx0,ggml_mul (ctx0,output_half_1, output_half_2), output_length * 8 , 32 ,1 ,1 ));
4072+ cur = ggml_add (ctx0, cur, ggml_reshape_4d (ctx0,ggml_mul (ctx0,output_half_1, output_half_2), output_length * hop_sizes[i] , 32 ,1 ,1 ));
40354073
40364074
40374075 }
40384076
40394077
40404078 }
40414079
4080+ cur = ggml_leaky_relu (ctx0, cur, 0.2 , false );
4081+ 4082+ float_16_conv_1d_weight= ggml_cpy (ctx0, model.convolution_post_weight , ggml_new_tensor (ctx0, GGML_TYPE_F16,4 ,model.convolution_post_weight ->ne ));
4083+ cur = ggml_cont (ctx0,ggml_conv_1d (ctx0, float_16_conv_1d_weight, cur, 1 ,0 ,1 ));
4084+ 4085+ 4086+ cur = ggml_cpy (ctx0, cur, ggml_new_tensor (ctx0, GGML_TYPE_F32,4 ,cur->ne ));
4087+ 4088+ 4089+ cur = ggml_cont (ctx0,ggml_transpose (ctx0,ggml_add (ctx0, ggml_cont (ctx0, ggml_transpose (ctx0, cur)), model.convolution_post_bias )));
4090+ 4091+ 4092+ cur = ggml_cpy (ctx0, cur, ggml_new_tensor (ctx0, GGML_TYPE_F32,4 ,cur->ne ));
4093+ 4094+ 40424095
40434096 ggml_build_forward_expand (gf, cur);
40444097 ggml_set_name (cur, " vocoder_output" );
@@ -4439,6 +4492,54 @@ std::vector<int> process_logits_and_sample(ggml_cgraph * gf, std::vector<int> &
44394492}
44404493
44414494
// Write a mono 32-bit IEEE-float WAV file.
//
// @param filename    path of the output file (overwritten if it exists)
// @param data        audio samples, nominally in [-1.0, 1.0]
// @param sampleRate  sample rate in Hz (e.g. 24000)
//
// The header is written with fixed-width types (int16_t/uint32_t) so each
// field has exactly the byte width the RIFF spec requires, instead of
// dumping the low bytes of a plain `int` (which only happened to work on
// little-endian targets). NOTE(review): field VALUES are still emitted in
// host byte order; WAV requires little-endian, so a big-endian host would
// need explicit byte swapping — same limitation as the original code.
void writeWav (const char * filename, const std::vector<float>& data, int sampleRate) {
    // Format parameters: mono, 32-bit float PCM (WAVE_FORMAT_IEEE_FLOAT).
    const int16_t numChannels   = 1;
    const int16_t bitsPerSample = 32;
    const int16_t blockAlign    = numChannels * bitsPerSample / 8;
    const int32_t byteRate      = sampleRate * blockAlign;

    // Use unsigned 32-bit sizes so large buffers don't overflow a signed int.
    const uint32_t dataSize = static_cast<uint32_t>(data.size() * sizeof(float));
    const uint32_t fileSize = 36 + dataSize; // total file size minus the 8-byte RIFF preamble

    std::ofstream outFile(filename, std::ios::binary);
    if (!outFile.is_open()) {
        std::cerr << "Error opening output file." << std::endl;
        return;
    }

    // RIFF header.
    outFile.write("RIFF", 4);
    outFile.write(reinterpret_cast<const char *>(&fileSize), 4);
    outFile.write("WAVE", 4);

    // "fmt " subchunk (16 bytes, format tag 3 = IEEE float).
    outFile.write("fmt ", 4);
    const int32_t fmtSize     = 16;
    const int16_t audioFormat = 3;
    outFile.write(reinterpret_cast<const char *>(&fmtSize), 4);
    outFile.write(reinterpret_cast<const char *>(&audioFormat), 2);
    outFile.write(reinterpret_cast<const char *>(&numChannels), 2);
    const int32_t sampleRate32 = sampleRate;
    outFile.write(reinterpret_cast<const char *>(&sampleRate32), 4);
    outFile.write(reinterpret_cast<const char *>(&byteRate), 4);
    outFile.write(reinterpret_cast<const char *>(&blockAlign), 2);
    outFile.write(reinterpret_cast<const char *>(&bitsPerSample), 2);

    // "data" subchunk followed by the raw samples.
    outFile.write("data", 4);
    outFile.write(reinterpret_cast<const char *>(&dataSize), 4);
    outFile.write(reinterpret_cast<const char *>(data.data()), dataSize);

    outFile.close();
    // Surface write failures instead of silently reporting success.
    if (!outFile) {
        std::cerr << "Error writing WAV data." << std::endl;
        return;
    }

    std::cout << "WAV file saved successfully." << std::endl;
}
44424543
44434544
44444545
@@ -5869,7 +5970,7 @@ int main(int argc, char ** argv) {
58695970
58705971
58715972
5872- /*
5973+ 58735974 gpt_vocab vocab;
58745975 gpt_vocab_init (" ../models/tokenizer.json" , vocab);
58755976
@@ -5879,7 +5980,8 @@ int main(int argc, char ** argv) {
58795980 // exit(0);
58805981
58815982
5882-
5983+ // std::vector<gpt_vocab::id> tokens = ::parse_tokens_from_string("255,147,2,54,2,14,2,33,218,2,26,61,150,112,0,0", ','); // "This is a test message"
5984+ 58835985 std::vector<gpt_vocab::id> tokens = ::parse_tokens_from_string (" 255,15,55,49,9,9,9,2,134,16,51,31,2,19,46,18,176,13,0,0" , ' ,' ); // "Based... Dr. Freeman?"
58845986
58855987
@@ -5899,9 +6001,9 @@ int main(int argc, char ** argv) {
58996001 save_f32_vector (" ./logs/mel.bin" , mel);
59006002 std::cout << mel.size () <<std::endl;
59016003
5902- */
6004+ 59036005
5904- std::vector<float > mel = load_f32_vector (" ./logs/mel.bin" , 187 * 100 * sizeof (float ));
6006+ // std::vector<float> mel = load_f32_vector("./logs/mel.bin", 187 * 100 * sizeof(float));
59056007
59066008
59076009 std::string vocoder_model_file_path = " ../models/ggml-vocoder-model.bin" ;
@@ -5979,6 +6081,13 @@ int main(int argc, char ** argv) {
59796081 print_all_tensors (vocoder_gf, false , true , " vocoder_output" );
59806082 print_all_tensors (vocoder_gf, true , true , " vocoder_output" );
59816083
6084+ std::vector<float > audio = std::vector<float >();
6085+ 6086+ extract_tensor_to_vector ( vocoder_gf->nodes [vocoder_gf->n_nodes -1 ] , audio);
6087+ 6088+ 6089+ writeWav (" based?.wav" , audio , 24000 );
6090+ 59826091
59836092
59846093 ggml_gallocr_free (vocoder_allocr);
0 commit comments