测试阶段:为方便快速接入测试,可以使用离在线混合授权,详见:授权介绍
正式接入阶段:使用在线激活授权,详见:授权介绍
用于加载输入音频
// Resolve input/output paths and open the input wav with dr_wav.
// On failure, report to stdout and return the error string to the caller.
string in_wav_path = input_file;
string out_wav_path = output_file;
drwav in_wav;
drwav_bool32 drwav_ret = drwav_init_file(&in_wav, in_wav_path.c_str(), NULL);
if(drwav_ret == false){
    cout << "open input_file error!!!" << endl;
    res_str = "open input_file error";
    return res_str;
}
// Dump the stream properties so rate/channel/format mismatches are easy to spot.
cout << "===input file info====" << endl;
cout << "ch:" << in_wav.channels <<endl;
cout << "sample rate:" << in_wav.sampleRate << endl;
cout << "bitsPerSample:" << in_wav.bitsPerSample << endl;
cout << "translatedFormatTag:" << in_wav.translatedFormatTag << endl;
cout << "===input file info====" << endl;
用于保存处理后音频
// Create the output wav writer: 16-bit PCM, same channel count and sample
// rate as the input (the processed floats are converted to S16 before writing).
drwav out_wav;
drwav_data_format format;
format.container = drwav_container_riff;
format.format = DR_WAVE_FORMAT_PCM;
format.channels = in_wav.channels;
format.sampleRate = in_wav.sampleRate;
format.bitsPerSample = 16;
drwav_ret = drwav_init_file_write(&out_wav, out_wav_path.c_str(), &format, NULL);
if(drwav_ret == false){
    cout << "open output_file error!!!" << endl;
    drwav_uninit(&in_wav); // release the already-opened input handle
    // Fixed: the original returned "open input_file error" here, which
    // misreported an output-file failure as an input-file failure.
    res_str = "open output_file error";
    return res_str;
}
std::vector<uint8_t> encoder_model_buffer = loadFileAsBinary(encoder_model_file); if (encoder_model_buffer.empty()) { drwav_uninit(&in_wav); res_str = "loadFileAsBinary fail:" + encoder_model_file; cout << res_str << endl; return res_str; } std::vector<uint8_t> timbre_model_buffer = loadFileAsBinary(timbre_model_file); if (timbre_model_buffer.empty()) { drwav_uninit(&in_wav); res_str = "loadFileAsBinary fail:" + timbre_model_file; cout << res_str << endl; return res_str; } std::vector<uint8_t> vad_model_buffer = loadFileAsBinary(vad_model_file); if (vad_model_buffer.empty()) { drwav_uninit(&in_wav); res_str = "loadFileAsBinary fail:" + vad_model_file; cout << res_str << endl; return res_str; } const int block_size = in_wav.sampleRate/100; //10ms const int sample_rate = in_wav.sampleRate; const int num_channels = in_wav.channels; SAMICoreHandle handle_; SAMICoreExecutorContextCreateVCParameter create_param; memset(&create_param,0,sizeof(create_param)); create_param.maxBlockSize = block_size; create_param.sampleRate = sample_rate; create_param.numChannel = num_channels; create_param.modelBuffer = (char *)encoder_model_buffer.data(); create_param.modelLen = encoder_model_buffer.size(); create_param.timbreModelBuffer = (char *)timbre_model_buffer.data(); create_param.timbreModelLen = timbre_model_buffer.size(); create_param.vadModelBuffer = (char *)vad_model_buffer.data(); create_param.vadModelLen = vad_model_buffer.size(); int ret = SAMICoreCreateHandleByIdentify(&handle_, SAMICoreIdentify_VOICE_CONVERTER, &create_param); if (ret != SAMI_OK) { drwav_uninit(&in_wav); res_str = "sami_core handler create fail:" + to_string(ret); cout << res_str << endl; return res_str; }
SAMICoreExecutorContextCreateVCParameter 参数解释
参数 | 介绍 |
---|---|
maxBlockSize | 每次调用process送的最大帧数,超过会报错,建议是使用比实际大一点的数据即可(内部会根据这个数字申请内部缓冲队列,对延时和内存占用都有影响),一般是10ms的帧数,比如16k的采样率,建议160个采样点 |
sampleRate | 采样率 |
numChannel | 通道数,目前仅支持一个通道 |
modelBuffer | 声音转换模型的数据 |
modelLen | 声音转换模型的数据长度 |
timbreModelBuffer | 音色模型的数据 |
timbreModelLen | 音色模型的数据长度 |
vadModelBuffer | VAD模型的数据 |
vadModelLen | VAD模型的数据长度 |
//3. allocate the per-block process buffers (planar layout, one 10ms block at a time)
int num_frame = in_wav.sampleRate/100; //10ms; matches maxBlockSize used at handle creation
int f32_buf_num = num_frame * in_wav.channels;
vector<float> f32_buffer;
f32_buffer.resize(f32_buf_num);
// scratch for the float -> S16 conversion before writing
// (sized sizeof(float)/sizeof(short) == 2x larger than strictly needed)
vector<signed short> f32_to_s16_buffer;
f32_to_s16_buffer.resize(f32_buf_num*(sizeof(float )/sizeof(signed short)));
SAMICoreAudioBuffer in_audio_buffer;
in_audio_buffer.numberChannels = num_channels;
in_audio_buffer.numberSamples = block_size;
in_audio_buffer.data = new float*[num_channels]; // data[c] points at one channel's samples
in_audio_buffer.isInterleave = 0;                // 0 => planar (non-interleaved) layout
SAMICoreAudioBuffer out_audio_buffer;
out_audio_buffer.numberChannels = num_channels;
out_audio_buffer.numberSamples = block_size;
out_audio_buffer.data = new float*[num_channels];
out_audio_buffer.isInterleave = 0;
// one sample array per channel, freed again in the release step
for(int c = 0; c < int(num_channels); ++c) {
    in_audio_buffer.data[c] = new float[block_size];
    out_audio_buffer.data[c] = new float[block_size];
}
// wrap the audio buffers in the block structs SAMICoreProcess expects
SAMICoreBlock in_block;
memset(&in_block, 0, sizeof(SAMICoreBlock));
in_block.numberAudioData = 1;
in_block.dataType = SAMICoreDataType::SAMICoreDataType_AudioBuffer;
in_block.audioData = &in_audio_buffer;
SAMICoreBlock out_block;
memset(&out_block, 0, sizeof(SAMICoreBlock));
out_block.numberAudioData = 1;
out_block.dataType = SAMICoreDataType::SAMICoreDataType_AudioBuffer;
out_block.audioData = &out_audio_buffer;
demo展示了处理的流程
将数据排布方式从Interleave转成Planar
处理
将数据排布方式从Planar转成Interleave
从float转成S16
cout << "process start" << endl; int readed_frames = 0; do { //read from file readed_frames = drwav_read_pcm_frames_f32(&in_wav, num_frame, f32_buffer.data()); if (readed_frames<=0){ break; } in_audio_buffer.numberSamples = num_frame; out_audio_buffer.numberSamples = num_frame; //Interleave to Planar interleaveToPlanarFloat(f32_buffer.data(),in_audio_buffer.data, readed_frames, num_channels); //process int ret = SAMICoreProcess(handle_, &in_block, &out_block); if (ret!=SAMI_OK){ res_str = "SAMICoreProcess error:" + to_string(ret) ; cout << res_str << endl; break; } //Planar to Interleave planarToInterleaveFloat(const_cast<const float **>(out_audio_buffer.data),f32_buffer.data(),readed_frames,num_channels); //f32->s16 floatArrayToShortArray(f32_buffer.data(),f32_to_s16_buffer.data(),readed_frames*num_channels); //write to out file drwav_uint64 writed_frames = drwav_write_pcm_frames(&out_wav, readed_frames, f32_to_s16_buffer.data()); }while (readed_frames!=0);
//5. release: close both wav handles, destroy the SDK handle, free planar buffers
cout << "release" << endl;
drwav_uninit(&in_wav);
drwav_uninit(&out_wav);
ret = SAMICoreDestroyHandle(handle_);
if(ret!=SAMI_OK){
    res_str = "SAMICoreDestroyHandle error:" + to_string(ret) ;
    cout << res_str << endl;
}
// per-channel sample arrays first, then the channel-pointer tables
for(int c = 0; c < int(num_channels); ++c) {
    delete[] in_audio_buffer.data[c];
    delete[] out_audio_buffer.data[c];
}
delete[] in_audio_buffer.data;
delete[] out_audio_buffer.data;
本例子是从demo中的例子,建议结合demo工程使用
// // Created by sami on 2021/9/28. // #include <string> #include <vector> #include <chrono> #include <iostream> #include <fstream> #include "sami_core.h" #include "help_function.h" #include "dr_libs/dr_wav.h" #include "voice_conversion.h" using namespace std; static std::vector<uint8_t> loadFileAsBinary(const std::string& path) { std::ifstream file(path, std::ios::binary | std::ios::ate); std::streamsize size = file.tellg(); file.seekg(0, std::ios::beg); std::vector<uint8_t> buffer(size); if(file.read((char*)buffer.data(), size)) { return buffer; } return {}; } string voice_conversion_fun(string input_file,string output_file,string res_path,string timbre){ string res_str = "OK"; cout << "input_file:" << input_file << endl; cout << "output_file:" << output_file << endl; string encoder_model_file = res_path + "model/voice_conversion/voice_conversion_online_encoder_v1.0.model"; string timbre_model_file = res_path + "model/voice_conversion/" + timbre + ".model"; string vad_model_file = res_path + "/model/voice_conversion/vad_online_v1.1.model"; cout << "encoder_model_file:" << encoder_model_file << endl; cout << "timbre_model_file:" << timbre_model_file << endl; cout << "vad_model_file:" << vad_model_file << endl; //1. 输入输出文件准备 =================================== string in_wav_path = input_file; string out_wav_path = output_file; drwav in_wav; drwav_bool32 drwav_ret = drwav_init_file(&in_wav, in_wav_path.c_str(), NULL); if(drwav_ret == false){ cout << "open input_file error!!!" 
<< endl; res_str = "open input_file error"; return res_str; } cout << "===input file info====" << endl; cout << "ch:" << in_wav.channels <<endl; cout << "sample rate:" << in_wav.sampleRate << endl; cout << "bitsPerSample:" << in_wav.bitsPerSample << endl; cout << "translatedFormatTag:" << in_wav.translatedFormatTag << endl; cout << "===input file info====" << endl; if (in_wav.channels!=1 ){ cout << "nonsupport channels" << endl; res_str = "nonsupport channels"; return res_str; } drwav out_wav; drwav_data_format format; format.container = drwav_container_riff; format.format = DR_WAVE_FORMAT_PCM; format.channels = in_wav.channels; format.sampleRate = in_wav.sampleRate; format.bitsPerSample = 16; drwav_ret = drwav_init_file_write(&out_wav, out_wav_path.c_str(), &format, NULL); if(drwav_ret == false){ cout << "open output_file error!!!" << endl; drwav_uninit(&in_wav); res_str = "open input_file error"; return res_str; } std::vector<uint8_t> encoder_model_buffer = loadFileAsBinary(encoder_model_file); if (encoder_model_buffer.empty()) { drwav_uninit(&in_wav); res_str = "loadFileAsBinary fail:" + encoder_model_file; cout << res_str << endl; return res_str; } std::vector<uint8_t> timbre_model_buffer = loadFileAsBinary(timbre_model_file); if (timbre_model_buffer.empty()) { drwav_uninit(&in_wav); res_str = "loadFileAsBinary fail:" + timbre_model_file; cout << res_str << endl; return res_str; } std::vector<uint8_t> vad_model_buffer = loadFileAsBinary(vad_model_file); if (vad_model_buffer.empty()) { drwav_uninit(&in_wav); res_str = "loadFileAsBinary fail:" + vad_model_file; cout << res_str << endl; return res_str; } const int block_size = in_wav.sampleRate/100; //10ms const int sample_rate = in_wav.sampleRate; const int num_channels = in_wav.channels; SAMICoreHandle handle_; SAMICoreExecutorContextCreateVCParameter create_param; memset(&create_param,0,sizeof(create_param)); create_param.maxBlockSize = block_size; create_param.sampleRate = sample_rate; create_param.numChannel 
= num_channels; create_param.modelBuffer = (char *)encoder_model_buffer.data(); create_param.modelLen = encoder_model_buffer.size(); create_param.timbreModelBuffer = (char *)timbre_model_buffer.data(); create_param.timbreModelLen = timbre_model_buffer.size(); create_param.vadModelBuffer = (char *)vad_model_buffer.data(); create_param.vadModelLen = vad_model_buffer.size(); int ret = SAMICoreCreateHandleByIdentify(&handle_, SAMICoreIdentify_VOICE_CONVERTER, &create_param); if (ret != SAMI_OK) { drwav_uninit(&in_wav); res_str = "sami_core handler create fail:" + to_string(ret); cout << res_str << endl; return res_str; } //3. init process buffer int num_frame = block_size; int f32_buf_num = num_frame * in_wav.channels; vector<float> f32_buffer; f32_buffer.resize(f32_buf_num); vector<signed short> f32_to_s16_buffer; f32_to_s16_buffer.resize(f32_buf_num*(sizeof(float )/sizeof(signed short))); SAMICoreAudioBuffer in_audio_buffer; in_audio_buffer.numberChannels = num_channels; in_audio_buffer.numberSamples = block_size; in_audio_buffer.data = new float*[num_channels]; in_audio_buffer.isInterleave = 0; SAMICoreAudioBuffer out_audio_buffer; out_audio_buffer.numberChannels = num_channels; out_audio_buffer.numberSamples = block_size; out_audio_buffer.data = new float*[num_channels]; out_audio_buffer.isInterleave = 0; for(int c = 0; c < int(num_channels); ++c) { in_audio_buffer.data[c] = new float[block_size]; out_audio_buffer.data[c] = new float[block_size]; } SAMICoreBlock in_block; memset(&in_block, 0, sizeof(SAMICoreBlock)); in_block.numberAudioData = 1; in_block.dataType = SAMICoreDataType::SAMICoreDataType_AudioBuffer; in_block.audioData = &in_audio_buffer; SAMICoreBlock out_block; memset(&out_block, 0, sizeof(SAMICoreBlock)); out_block.numberAudioData = 1; out_block.dataType = SAMICoreDataType::SAMICoreDataType_AudioBuffer; out_block.audioData = &out_audio_buffer; //4. 
process =================================== cout << "process start" << endl; auto start = std::chrono::steady_clock::now(); int readed_frames = 0; int process_frame = 0; do { //read from file readed_frames = drwav_read_pcm_frames_f32(&in_wav, num_frame, f32_buffer.data()); if (readed_frames<=0){ break; } process_frame = process_frame + readed_frames; in_audio_buffer.numberSamples = readed_frames; out_audio_buffer.numberSamples = readed_frames; //Interleave to Planar interleaveToPlanarFloat(f32_buffer.data(),in_audio_buffer.data, readed_frames, num_channels); //process ret = SAMICoreProcess(handle_, &in_block, &out_block); if (ret!=SAMI_OK){ res_str = "SAMICoreProcess error:" + to_string(ret) ; cout << res_str << endl; break; } //Planar to Interleave planarToInterleaveFloat(const_cast<const float **>(out_audio_buffer.data),f32_buffer.data(),readed_frames,num_channels); //f32->s16 floatArrayToShortArray(f32_buffer.data(),f32_to_s16_buffer.data(),readed_frames*num_channels); //write to out file drwav_uint64 writed_frames = drwav_write_pcm_frames(&out_wav, readed_frames, f32_to_s16_buffer.data()); }while (readed_frames!=0); cout << "process end" << endl; auto end = std::chrono::steady_clock::now(); auto process_cost_ms = std::chrono::duration<double, std::milli>(end - start).count(); double rtf = process_cost_ms/(1.0 * process_frame / sample_rate * 1000); //process_cost_ms/process_frame_ms cout << "rtf:" << rtf << endl; //get tail data SAMICoreProperty outProperty; ret = SAMICoreGetPropertyById(handle_, SAMICorePropertyID_VOICE_CONVERTER_ExtraData, &outProperty); if(ret == SAMI_OK){ unsigned int outLen = outProperty.dataLen; if(outLen > 0) { auto* data = (float *)outProperty.data; unsigned int block_size = outLen/num_channels; cout << "dump tail data,block_size:" << block_size << endl; vector<short> interleave_s16; interleave_s16.resize(block_size*num_channels); //f32->s16 floatArrayToShortArray(data,interleave_s16.data(),block_size*num_channels); 
drwav_write_pcm_frames(&out_wav, block_size, interleave_s16.data()); } } SAMICoreDestroyProperty(&outProperty); //5. release cout << "release" << endl; drwav_uninit(&in_wav); drwav_uninit(&out_wav); ret = SAMICoreDestroyHandle(handle_); if(ret!=SAMI_OK){ res_str = "SAMICoreDestroyHandle error:" + to_string(ret) ; cout << res_str << endl; } for(int c = 0; c < int(num_channels); ++c) { delete[] in_audio_buffer.data[c]; delete[] out_audio_buffer.data[c]; } delete[] in_audio_buffer.data; delete[] out_audio_buffer.data; return res_str; }