Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 11c8e38

Browse files
vocoder complete
1 parent a74ca1d commit 11c8e38

File tree

1 file changed

+129
-20
lines changed

1 file changed

+129
-20
lines changed

‎main.cpp‎

Lines changed: 129 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3831,19 +3831,25 @@ struct ggml_cgraph * vocoder_graph(
38313831

38323832
cur = ggml_cpy(ctx0, cur, ggml_new_tensor(ctx0, GGML_TYPE_F32,4,cur->ne));
38333833

3834-
int strides[] = {8,4,4};
3835-
int paddings[] = {4,2,2};
3834+
int strides[] = {8,8,4};
3835+
int paddings[] = {4,4,2};
38363836
int hop_sizes[] = {8,64,256};
3837-
3837+
//const int kernel_size = 3;
3838+
//const int dilation = 1;
3839+
const int padding_length = 1;
38383840

38393841

38403842
struct ggml_tensor * conditioning;
38413843

38423844
//graph tether
38433845
//res blocks
3844-
for (int i =0; i < 1; i++)
3846+
for (int i =0; i < 3; i++)
38453847
{
38463848

3849+
3850+
3851+
3852+
38473853
ggml_tensor * float_32_conv_transpose_1d_weight= ggml_cont(ctx0,ggml_cpy(ctx0, model.residual_stack[i].convolution_t_pre_weight, ggml_new_tensor(ctx0, GGML_TYPE_F32,4,model.residual_stack[i].convolution_t_pre_weight->ne)));
38483854

38493855
cur = ggml_cont(ctx0,ggml_leaky_relu(ctx0, cur, 0.2, false));
@@ -3854,7 +3860,7 @@ struct ggml_cgraph * vocoder_graph(
38543860

38553861
cur = ggml_cont(ctx0,ggml_transpose(ctx0,ggml_add(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), model.residual_stack[i].convolution_t_pre_bias)));
38563862

3857-
3863+
38583864

38593865
conditioning = ggml_cpy(ctx0, padded_mel, ggml_new_tensor(ctx0, GGML_TYPE_F32,4,padded_mel->ne));
38603866

@@ -3962,21 +3968,26 @@ struct ggml_cgraph * vocoder_graph(
39623968

39633969
for (int c = 0; c < 4; c++)
39643970
{
3971+
39653972
output = ggml_leaky_relu(ctx0, cur, 0.2, false);
39663973

3974+
39673975
float_16_conv_1d_weight= ggml_cpy(ctx0, model.residual_stack[i].conv_blocks[c].conv_block_1_weight, ggml_new_tensor(ctx0, GGML_TYPE_F16,4,model.residual_stack[i].conv_blocks[c].conv_block_1_weight->ne));
39683976

39693977
output = ggml_cont(ctx0,ggml_conv_1d(ctx0, float_16_conv_1d_weight, output, 1,conv_block_paddings[c],conv_block_dilations[c]));
39703978

39713979

39723980
output = ggml_cpy(ctx0, output, ggml_new_tensor(ctx0, GGML_TYPE_F32,4,output->ne));
39733981

3982+
39743983

39753984
output = ggml_cont(ctx0,ggml_transpose(ctx0,ggml_add(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, output)), model.residual_stack[i].conv_blocks[c].conv_block_1_bias)));
39763985

39773986

39783987
output = ggml_cpy(ctx0, output, ggml_new_tensor(ctx0, GGML_TYPE_F32,4,output->ne));
39793988

3989+
3990+
39803991
output = ggml_leaky_relu(ctx0, output, 0.2, false);
39813992

39823993
k = ggml_cont(ctx0,ggml_view_3d(ctx0, kernels, kernels->ne[0], 6144, 1, kernels->nb[1], kernels->nb[2], c * kernels->ne[0] * 6144 * sizeof(float)));
@@ -3987,7 +3998,13 @@ struct ggml_cgraph * vocoder_graph(
39873998

39883999
output = ggml_pad_ext(ctx0, output, 1,1,0,0,0,0,0,0);
39894000

3990-
output = ggml_unfold_1d(ctx0, output, 10, 8);
4001+
std::cout << "output shape" << std::endl;
4002+
std::cout << output->ne[0] << "," << output->ne[1] << "," << output->ne[2] << "," << output->ne[3] << std::endl;
4003+
std::cout << hop_sizes[i] + 2 * padding_length << "," << hop_sizes[i] << std::endl;
4004+
4005+
output = ggml_unfold_1d(ctx0, output, hop_sizes[i] + 2 * padding_length, hop_sizes[i]);
4006+
4007+
39914008

39924009
output = ggml_unfold_1d(ctx0, output, 1, 1);
39934010

@@ -3996,49 +4013,85 @@ struct ggml_cgraph * vocoder_graph(
39964013

39974014
const int output_length = output->ne[2];
39984015

3999-
output = ggml_reshape_3d(ctx0, output, 10, 1, output_length * 32);
4016+
4017+
4018+
output = ggml_reshape_3d(ctx0, output, hop_sizes[i] + 2 * padding_length, 1, output_length * 32);
40004019

40014020
output = ggml_unfold_1d(ctx0, output, 3,1);
40024021

4022+
4023+
4024+
40034025

40044026
// o = torch.einsum('bildsk,biokl->bolsd', x, kernel)
40054027

40064028

4007-
output = ggml_reshape_4d(ctx0, output, 3, 8 , output_length, 32);
4029+
output = ggml_reshape_4d(ctx0, output, 3, hop_sizes[i] , output_length, 32);
40084030

40094031
reshaped_kernel = ggml_reshape_4d(ctx0, k, output_length, 3, 64, 32);
40104032

40114033
reshaped_kernel = ggml_cont(ctx0, ggml_permute(ctx0, reshaped_kernel, 2,0,1,3));
40124034

4035+
4036+
4037+
40134038
output = ggml_mul_mat(ctx0, reshaped_kernel, output);
40144039

4015-
output_accumulator = ggml_cont(ctx0,ggml_view_4d(ctx0, output, 64, 8, output_length,
4016-
1, output->nb[1], output->nb[2], output->nb[3], 0 * output_length * 64 * 8 * sizeof(float) ));
4040+
4041+
4042+
4043+
output_accumulator = ggml_cont(ctx0,ggml_view_4d(ctx0, output, 64, hop_sizes[i], output_length,
4044+
1, output->nb[1], output->nb[2], output->nb[3], 0 * output_length * 64 * hop_sizes[i] * sizeof(float) ));
40174045
for (int j = 1; j < 32; j++ )
40184046
{
4019-
output_accumulator = ggml_add(ctx0, output_accumulator, ggml_cont(ctx0,ggml_view_4d(ctx0, output, 64, 8, output_length,
4020-
1, output->nb[1], output->nb[2], output->nb[3], j * output_length * 64 * 8 * sizeof(float) )));
4047+
output_accumulator = ggml_add(ctx0, output_accumulator, ggml_cont(ctx0,ggml_view_4d(ctx0, output, 64, hop_sizes[i], output_length,
4048+
1, output->nb[1], output->nb[2], output->nb[3], j * output_length * 64 * hop_sizes[i] * sizeof(float) )));
40214049
}
40224050

4051+
4052+
40234053

40244054
output = ggml_cont(ctx0,ggml_permute(ctx0, output_accumulator, 3,1,2,0));
40254055

4056+
4057+
40264058
output = ggml_add(ctx0, output, ggml_reshape_4d(ctx0, b, 1,1, output_length, 64));
40274059

4028-
output = ggml_reshape_3d(ctx0, output,1, 8*output_length, 64);
4060+
output = ggml_reshape_3d(ctx0, output,1, hop_sizes[i]*output_length, 64);
40294061

4062+
/*
4063+
if (i >= 1)
4064+
{
4065+
cur = output;
4066+
goto end;
4067+
}*/
40304068

4031-
output_half_1 = ggml_sigmoid(ctx0,ggml_cont(ctx0,ggml_view_3d(ctx0, output, 1, 8*output_length,32, output->nb[1], output->nb[2], 0 * 1 * 8 * output_length * 32 * sizeof(float) )));
4032-
output_half_2 = ggml_tanh(ctx0,ggml_cont(ctx0,ggml_view_3d(ctx0, output, 1, 8*output_length,32, output->nb[1], output->nb[2], 1 * 1 * 8 * output_length * 32 * sizeof(float) )));
4069+
output_half_1 = ggml_sigmoid(ctx0,ggml_cont(ctx0,ggml_view_3d(ctx0, output, 1, hop_sizes[i]*output_length,32, output->nb[1], output->nb[2], 0 * 1 * hop_sizes[i] * output_length * 32 * sizeof(float) )));
4070+
output_half_2 = ggml_tanh(ctx0,ggml_cont(ctx0,ggml_view_3d(ctx0, output, 1, hop_sizes[i]*output_length,32, output->nb[1], output->nb[2], 1 * 1 * hop_sizes[i] * output_length * 32 * sizeof(float) )));
40334071

4034-
cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0,ggml_mul(ctx0,output_half_1, output_half_2), output_length * 8, 32,1,1));
4072+
cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0,ggml_mul(ctx0,output_half_1, output_half_2), output_length * hop_sizes[i], 32,1,1));
40354073

40364074

40374075
}
40384076

40394077

40404078
}
40414079

4080+
cur = ggml_leaky_relu(ctx0, cur, 0.2, false);
4081+
4082+
float_16_conv_1d_weight= ggml_cpy(ctx0, model.convolution_post_weight, ggml_new_tensor(ctx0, GGML_TYPE_F16,4,model.convolution_post_weight->ne));
4083+
cur = ggml_cont(ctx0,ggml_conv_1d(ctx0, float_16_conv_1d_weight, cur, 1,0,1 ));
4084+
4085+
4086+
cur = ggml_cpy(ctx0, cur, ggml_new_tensor(ctx0, GGML_TYPE_F32,4,cur->ne));
4087+
4088+
4089+
cur = ggml_cont(ctx0,ggml_transpose(ctx0,ggml_add(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), model.convolution_post_bias)));
4090+
4091+
4092+
cur = ggml_cpy(ctx0, cur, ggml_new_tensor(ctx0, GGML_TYPE_F32,4,cur->ne));
4093+
4094+
40424095

40434096
ggml_build_forward_expand(gf, cur);
40444097
ggml_set_name(cur, "vocoder_output");
@@ -4439,6 +4492,54 @@ std::vector<int> process_logits_and_sample(ggml_cgraph * gf, std::vector<int> &
44394492
}
44404493

44414494

4495+
// thanks gpt3.5
4496+
// Function to write a WAV file from floating-point data
4497+
/// @brief Write mono 32-bit floating-point samples to a WAV file.
///
/// The file consists of the 44-byte RIFF/WAVE header (16-byte "fmt " chunk with
/// format tag 3, i.e. IEEE float) followed by the raw sample data.
///
/// @param filename   Path of the file to create or overwrite. Must not be null.
/// @param data       Samples to store, one channel.
/// @param sampleRate Sampling frequency in Hz. Must be positive.
///
/// Failures (bad arguments, payload too large for a 32-bit RIFF size, the file
/// cannot be opened, a write fails) are reported on std::cerr and the function
/// returns without printing the success message.
void writeWav(const char* filename, const std::vector<float>& data, int sampleRate) {
    static_assert(sizeof(float) == 4, "WAV format tag 3 as written here requires 32-bit floats");

    // WAV file parameters
    const unsigned long long numChannels = 1;    // Mono
    const unsigned long long bitsPerSample = 32; // Float (32-bit)
    const unsigned long long blockAlign = numChannels * bitsPerSample / 8;
    const unsigned long long audioFormat = 3;    // Floating point PCM
    const unsigned long long fmtSize = 16;
    const unsigned long long headerBytesAfterRiffSize = 36; // header bytes that follow the RIFF size field
    const unsigned long long maxChunkSize = 0xFFFFFFFFull;  // RIFF sizes are 32-bit fields

    if (filename == nullptr) {
        std::cerr << "Error: no output file name given." << std::endl;
        return;
    }
    if (sampleRate <= 0) {
        std::cerr << "Error: sample rate must be positive." << std::endl;
        return;
    }

    // Compute sizes in a wide unsigned type so large buffers cannot overflow,
    // then refuse anything the 32-bit header fields cannot represent.
    const unsigned long long dataSize = static_cast<unsigned long long>(data.size()) * sizeof(float);
    if (dataSize > maxChunkSize - headerBytesAfterRiffSize) {
        std::cerr << "Error: audio data too large for a WAV file." << std::endl;
        return;
    }
    const unsigned long long fileSize = headerBytesAfterRiffSize + dataSize; // Size of the entire file minus 8 bytes
    const unsigned long long byteRate = static_cast<unsigned long long>(sampleRate) * blockAlign;

    // Open the output file
    std::ofstream outFile(filename, std::ios::binary);
    if (!outFile.is_open()) {
        std::cerr << "Error opening output file." << std::endl;
        return;
    }

    // WAV header fields are little-endian; emit them byte by byte so the
    // result does not depend on the host's integer width or byte order.
    auto putLittleEndian = [&outFile](unsigned long long value, int byteCount) {
        for (int i = 0; i < byteCount; ++i) {
            outFile.put(static_cast<char>((value >> (8 * i)) & 0xFFu));
        }
    };

    // RIFF header
    outFile.write("RIFF", 4);
    putLittleEndian(fileSize, 4);
    outFile.write("WAVE", 4);

    // Format subchunk
    outFile.write("fmt ", 4);
    putLittleEndian(fmtSize, 4);
    putLittleEndian(audioFormat, 2);
    putLittleEndian(numChannels, 2);
    putLittleEndian(static_cast<unsigned long long>(sampleRate), 4);
    putLittleEndian(byteRate, 4);
    putLittleEndian(blockAlign, 2);
    putLittleEndian(bitsPerSample, 2);

    // Data subchunk
    outFile.write("data", 4);
    putLittleEndian(dataSize, 4);

    // Write the audio data.
    // NOTE(review): samples are copied in host byte order, which matches the
    // WAV layout only on little-endian hosts -- confirm if other targets matter.
    if (!data.empty()) {
        outFile.write(reinterpret_cast<const char*>(data.data()),
                      static_cast<std::streamsize>(dataSize));
    }

    // Close explicitly so buffered-write failures surface before reporting success.
    outFile.close();
    if (outFile.fail()) {
        std::cerr << "Error writing output file." << std::endl;
        return;
    }

    std::cout << "WAV file saved successfully." << std::endl;
}
44424543

44434544

44444545

@@ -5869,7 +5970,7 @@ int main(int argc, char ** argv) {
58695970

58705971

58715972

5872-
/*
5973+
58735974
gpt_vocab vocab;
58745975
gpt_vocab_init("../models/tokenizer.json", vocab);
58755976

@@ -5879,7 +5980,8 @@ int main(int argc, char ** argv) {
58795980
//exit(0);
58805981

58815982

5882-
5983+
//std::vector<gpt_vocab::id> tokens = ::parse_tokens_from_string("255,147,2,54,2,14,2,33,218,2,26,61,150,112,0,0", ','); // "This is a test message"
5984+
58835985
std::vector<gpt_vocab::id> tokens = ::parse_tokens_from_string("255,15,55,49,9,9,9,2,134,16,51,31,2,19,46,18,176,13,0,0", ','); //"Based... Dr. Freeman?"
58845986

58855987

@@ -5899,9 +6001,9 @@ int main(int argc, char ** argv) {
58996001
save_f32_vector("./logs/mel.bin", mel);
59006002
std::cout << mel.size() <<std::endl;
59016003

5902-
*/
6004+
59036005

5904-
std::vector<float> mel = load_f32_vector("./logs/mel.bin", 187 * 100 * sizeof(float));
6006+
//std::vector<float> mel = load_f32_vector("./logs/mel.bin", 187 * 100 * sizeof(float));
59056007

59066008

59076009
std::string vocoder_model_file_path = "../models/ggml-vocoder-model.bin";
@@ -5979,6 +6081,13 @@ int main(int argc, char ** argv) {
59796081
print_all_tensors(vocoder_gf, false, true, "vocoder_output");
59806082
print_all_tensors(vocoder_gf, true, true, "vocoder_output");
59816083

6084+
std::vector<float> audio = std::vector<float>();
6085+
6086+
extract_tensor_to_vector( vocoder_gf->nodes[vocoder_gf->n_nodes -1] , audio);
6087+
6088+
6089+
writeWav("based?.wav", audio , 24000);
6090+
59826091

59836092

59846093
ggml_gallocr_free(vocoder_allocr);

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /