实时语音翻译API集成语音识别、智能断句、文本翻译等技术能力,可以将实时语音流、音频文件识别成文字并翻译成目标语言,达到“边说边译”的效果。
支持语向:可在「语言支持」列表中查看实时语音翻译支持的源语言及目标语言。
音频参数要求
采样率:16000hz
采样位:16
单声道
格式:wav或pcm(格式错误会导致识别效果差,返回时间戳错误等一系列问题,所以一定要确保自己发送的格式正确。)
音频内容需使用base64编码
wss://translate.volces.com/api/translate/speech/v1/
WEBSOCKET
参数名 | 值 |
---|---|
Version | 2020-06-01 |
Action | SpeechTranslate |
Path | /api/translate/speech/v1/ |
服务使用websocket协议
Configuration
建立websocket之后,第一包需发送配置包,字段如下:

字段 | 类型 | 是否为必须项 | 说明 | 备注 |
---|---|---|---|---|
SourceLanguage | String | 是 | 源语言 | 仅支持zh ja en |
TargetLanguages | [String] | 是 | 目标语言 | 可在语言支持中查询对应的语言代码 |
HotWordList | [Object] | 否 | 热词 | 在语音识别时,接口更倾向于识别成热词的内容 |
{ "Configuration": { "SourceLanguage": "zh", "TargetLanguages": [ "en" ], "HotWordList": [ { "Word": "hello", "Scale": 1 } ] } }
AudioData
发送Configuration之后可以发送二进制数组的音频数据包,采样率为16000hz,单声道,使用base64编码,推荐每100-200ms发送一包。

字段 | 类型 | 是否为必须项 | 说明 | 备注 |
---|---|---|---|---|
AudioData | String | 是 | 音频包的base64编码 | 无 |
{ "AudioData": "YQ==" }
End
bool值,只要发送了就表示音频发送结束。

字段 | 类型 | 是否为必须项 | 说明 | 备注 |
---|---|---|---|---|
End | Bool | 是 | 音频包发送完成时发 | 不论true,false ,server接收到包后会处理完剩余的音频包以及相关的翻译,全部结束后会断开websocket |
{ "End": true }
字段 | 类型 | 说明 | 备注 |
---|---|---|---|
ResponseMetadata | ResponseMetadata | 通用字段 | 无 |
Subtitle | Object | 包含翻译结果、时间、语言等信息的结果 | 无 |
Subtitle:
字段 | 类型 | 说明 | 备注 |
---|---|---|---|
Text | String | 返回的结果 | 当某两个结果的Sequence和Definite一致时,结果中的Text分别代表原文及其译文 |
BeginTime | Int | 该文本识别的开始时间 | 无 |
EndTime | Int | 该文本识别的结束时间 | 无 |
Definite | Bool | 文本是否确定 | 为false时表示正在说话,文本还未确定 |
Language | String | 语言 | 无 |
Sequence | Int | 序列号 | 文本片段的序号 |
{ "Subtitle": { "Text": "i'm elise hugh this is ted talk daily",//返回的结果 "BeginTime": 4070,//该文本识别的开始时间 "EndTime": 6955,//该文本的识别的结束时间 "Definite": false,//正在说话,文本还未确定 "Language": "en",//语言 "Sequence": 8 //序列号 sequence和definite一致的就是一个语言的不同文本 }, "ResponseMetaData": { "RequestId": "7088303501725501476",//每次连接有不同的reqId "Action": "SpeechTranslate", "Version": "2020-06-01", "Service": "translate", "Region": "cn-north-1" } }
在接口错误时,错误信息会在 `ResponseMetadata.Error` 中,包含了错误类型 `Code: String` 和错误信息 `Message: String` 两个字段,其中错误码列表为:
错误码 | 说明 |
---|---|
-301 | 音频间隔过长 |
-400 | 请求参数错误,具体错误可参考Message信息 |
-401 | 未授权用户 |
-403 | 无权限 |
-429 | 请求过于频繁 |
-5xx | 翻译引擎内部错误 |
100001-100021 | 通用错误 |
# Requires https://github.com/volcengine/volc-sdk-python
import base64
import json
import threading
import time

import websocket
from volcengine.ApiInfo import ApiInfo
from volcengine.Credentials import Credentials
from volcengine.ServiceInfo import ServiceInfo
from volcengine.base.Service import Service

k_access_key = "k_access_key"  # https://console.volcengine.com/iam/keymanage/
k_secret_key = "k_secret_key"
k_host = 'translate.volces.com'
k_path = '/api/translate/speech/v1/'
k_timeout = 5  # second
k_service_info = ServiceInfo(
    k_host,
    {'Content-Type': 'application/json'},
    Credentials(k_access_key, k_secret_key, 'translate', 'cn-north-1'),
    5,
    5)
k_query = {
    'Action': 'SpeechTranslate',
    'Version': '2020-06-01'
}
k_api_info = {
    'SpeechTranslate': ApiInfo('GET', k_path, k_query, {}, {})
}
file_path = './audio.wav'  # path of the audio file to stream


def get_websocket_url():
    """Build the signed wss:// URL for the SpeechTranslate action."""
    service = Service(k_service_info, k_api_info)
    url = 'wss://' + k_host + k_path + '?' + service.get_sign_url("SpeechTranslate", {})
    return url


def send_audio(ws):
    """Stream the audio file to the server, then send the End packet.

    Each packet carries 200 * 32 = 6400 bytes, i.e. 200 ms of 16 kHz,
    16-bit, mono audio, and packets are paced 200 ms apart.
    """
    # The context manager guarantees the file handle is released even if a
    # send fails part-way through the stream.
    with open(file_path, 'rb') as file:
        while True:
            chunk = file.read(200 * 32)
            if not chunk:
                break
            audio = {
                "AudioData": base64.b64encode(chunk).decode('utf-8')
            }
            ws.send(json.dumps(audio))
            time.sleep(0.2)
    end = {
        "End": True
    }
    ws.send(json.dumps(end))


def recv(ws):
    """Print every message from the server until the connection is closed."""
    while ws.connected:
        try:
            message = ws.recv()
        except websocket.WebSocketConnectionClosedException:
            # The server disconnects once everything after End is processed.
            break
        if not message:
            # recv() yields an empty value for control frames such as close;
            # there is nothing to print.
            continue
        # recv() returns bytes for binary frames and str for text frames.
        if isinstance(message, bytes):
            message = message.decode("utf-8")
        print(message)


if __name__ == '__main__':
    websocket.enableTrace(False)
    ws = websocket.WebSocket()
    ws.connect(get_websocket_url(), header=["Content-Type: application/json"])
    configuration = {
        "Configuration": {
            "SourceLanguage": "zh",
            "TargetLanguages": [
                "en"
            ],
            "HotWordList": [
                {
                    "Word": "hello",
                    "Scale": 1
                }
            ]
        }
    }
    # The configuration packet must be the first packet on the connection.
    ws.send(json.dumps(configuration))
    t2 = threading.Thread(target=recv, args=(ws,))
    t1 = threading.Thread(target=send_audio, args=(ws,))
    t2.start()
    t1.start()
package main import ( "bytes" "encoding/json" "io" "io/ioutil" "net/http" "net/url" "os" "sync" "time" "github.com/gorilla/websocket" "github.com/volcengine/volc-sdk-golang/base" ) const ( accessKey = "accessKey" // https://console.volcengine.com/iam/keymanage/ secretKey = "secretKey" ) const ( kServiceVersion20200601 = "2020-06-01" kSpeechTranslateAction = "SpeechTranslate" kHost = "translate.volces.com" ) func fromLocal(path string) (*bytes.Reader, error) { f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() data, err := ioutil.ReadAll(f) if err != nil { return nil, err } return bytes.NewReader(data), nil } func newServiceInfo(host string) *base.ServiceInfo { return &base.ServiceInfo{ Timeout: 5 * time.Second, Host: host, Header: http.Header{}, Credentials: base.Credentials{Region: base.RegionCnNorth1, Service: "translate"}, } } func newClient(host string, accessKey string, secretKey string) *base.Client { client := base.NewClient(newServiceInfo(host), map[string]*base.ApiInfo{ kSpeechTranslateAction: { Method: http.MethodGet, Path: "/api/translate/speech/v1/", Query: url.Values{ "Action": []string{kSpeechTranslateAction}, "Version": []string{kServiceVersion20200601}, }, }, }) client.SetAccessKey(accessKey) client.SetSecretKey(secretKey) return client } func newAudioData(data []byte) []byte { type AudioDataReq struct { AudioData []byte `json:"AudioData"` } audioData := &AudioDataReq{ AudioData: data, } b, err := json.Marshal(audioData) if err != nil { println(err.Error()) return []byte{0} } return b } func getWsSignUrl(host string, accessKey string, secretKey string) (string, error) { client := newClient(host, accessKey, secretKey) signUrl, err := client.GetSignUrl(kSpeechTranslateAction, nil) if err != nil { return "", err } clientUrl := url.URL{ Scheme: "wss", Path: client.ApiInfoList[kSpeechTranslateAction].Path, Host: client.ServiceInfo.Host, RawQuery: signUrl, } return clientUrl.String(), nil } func main() { var ( group = 
&sync.WaitGroup{} buffer = make([]byte, 200*32) dialer = &websocket.Dialer{} header = http.Header{} ) r, err := fromLocal("audio.wav") if err != nil { panic(err) return } signUrl, err := getWsSignUrl(kHost, accessKey, secretKey) if err != nil { panic(err) return } group.Add(2) header.Set("content-type", "application/json") conn, resp, err := dialer.Dial(signUrl, header) if err != nil { println(resp.Status) panic(err) } if err = conn.WriteMessage(websocket.BinaryMessage, []byte(`{"Configuration":{"SourceLanguage":"en","TargetLanguages":["zh"]}}`)); err != nil { panic(err) } go func() { defer group.Done() for { _, data, err := conn.ReadMessage() if err != nil { println(err.Error()) return } println(string(data)) } }() go func() { defer group.Done() for { _, err = r.Read(buffer) if err != nil { println(err.Error()) if err == io.EOF { if err = conn.WriteMessage(websocket.BinaryMessage, []byte(`{"End":true}`)); err != nil { panic(err) } } return } err = conn.WriteMessage(websocket.BinaryMessage, newAudioData(buffer)) if err != nil { panic(err) return } time.Sleep(200 * time.Millisecond) } }() group.Wait() }
<!-- Maven dependencies for the Java sample; replace each {version} with a released version. -->
<dependencies>
    <dependency>
        <groupId>javax.websocket</groupId>
        <artifactId>javax.websocket-client-api</artifactId>
        <version>{version}</version>
    </dependency>
    <dependency>
        <groupId>org.java-websocket</groupId>
        <artifactId>Java-WebSocket</artifactId>
        <version>{version}</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-simple</artifactId>
        <version>{version}</version>
        <scope>runtime</scope>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>{version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.volcengine/volc-sdk-java -->
    <dependency>
        <groupId>com.volcengine</groupId>
        <artifactId>volc-sdk-java</artifactId>
        <version>{version}</version>
    </dependency>
</dependencies>
//src/main/java/translate/TranslateConfig.java package translate; import com.volcengine.helper.Const; import com.volcengine.model.ApiInfo; import com.volcengine.model.Credentials; import com.volcengine.model.ServiceInfo; import org.apache.http.Header; import org.apache.http.NameValuePair; import org.apache.http.message.BasicHeader; import org.apache.http.message.BasicNameValuePair; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; public class TranslateConfig { public static String accessKey = "your access key"; public static String secretKey = "your secret key"; public static String api = "SpeechTranslate"; public static String path = "/api/translate/speech/v1/"; public static String host = "translate.volces.com"; public static ServiceInfo serviceInfo = new ServiceInfo( new HashMap<>() { { put(Const.CONNECTION_TIMEOUT, 5000); put(Const.SOCKET_TIMEOUT, 5000); put(Const.Host, host); put(Const.Header, new ArrayList<Header>() { { add(new BasicHeader("Accept", "application/json")); } }); put(Const.Credentials, new Credentials(Const.REGION_CN_NORTH_1, "translate")); } } ); public static Map<String, ApiInfo> apiInfoList = new HashMap<>() { { put(api, new ApiInfo( new HashMap<>() { { put(Const.Method, "GET"); put(Const.Path, path); put(Const.Query, new ArrayList<NameValuePair>() { { add(new BasicNameValuePair("Action", api)); add(new BasicNameValuePair("Version", "2020-06-01")); } }); } } )); } }; }
// src/main/java/translate/TranslateService.java package translate; import com.volcengine.model.ApiInfo; import com.volcengine.model.ServiceInfo; import com.volcengine.service.BaseServiceImpl; import java.util.Map; public class TranslateService extends BaseServiceImpl { public TranslateService(ServiceInfo info, Map<String, ApiInfo> apiInfoList) { super(info, apiInfoList); } }
// src/main/java/websocket/Client.java package websocket; import org.java_websocket.client.WebSocketClient; import org.java_websocket.handshake.ServerHandshake; import java.net.URI; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; public class Client extends WebSocketClient { public Client(URI serverUri) { super(serverUri); } @Override public void onOpen(ServerHandshake serverHandshake) { System.out.println("onopen"); } @Override public void onMessage(String s) { System.out.println("onmessage"); System.out.println(s); } @Override public void onMessage(ByteBuffer message) { System.out.println("onmessage"); System.out.println(new String(message.array(), StandardCharsets.UTF_8)); } @Override public void onClose(int i, String s, boolean b) { System.out.println("onclose"); } @Override public void onError(Exception e) { e.printStackTrace(); } }
//src/main/java/Main.java import translate.TranslateConfig; import translate.TranslateService; import websocket.Client; import java.io.File; import java.io.FileInputStream; import java.net.URI; import java.util.Base64; public class Main { public static void main(String[] args) throws Exception { File input = new File("audio.wav"); TranslateService translateService = new TranslateService(TranslateConfig.serviceInfo, TranslateConfig.apiInfoList); translateService.setAccessKey(TranslateConfig.accessKey); translateService.setSecretKey(TranslateConfig.secretKey); String signUrl = translateService.getSignUrl(TranslateConfig.api, null); URI url = new URI("wss://" + TranslateConfig.host + TranslateConfig.path + "?" + signUrl); System.out.println(url); // open websocket Client client = new Client(url); client.connectBlocking(); client.send("{\n" + " \"Configuration\": {\n" + " \"SourceLanguage\": \"zh\",\n" + " \"TargetLanguages\": [\n" + " \"en\"\n" + " ],\n" + " \"HotWordList\": [\n" + " {\n" + " \"Word\": \"hello\",\n" + " \"Scale\": 1\n" + " }\n" + " ]\n" + " }\n" + "}"); byte[] buffer = new byte[200 * 32]; int bytesLeft = 100 * 1024 * 1024; try (FileInputStream fis = new FileInputStream(input)) { while (bytesLeft > 0) { int read = fis.read(buffer, 0, Math.min(bytesLeft, buffer.length)); if (read == -1) { break; } client.send(bytesToMessage(buffer)); Thread.sleep(200); bytesLeft -= read; } } finally { client.send("{\n" + " \"End\": true\n" + "}"); } } static String bytesToMessage(byte[] data) { String base64Data = Base64.getEncoder().encodeToString(data); return "{\n" + " \"AudioData\": \"" + base64Data + "\"\n" + "}"; } }
依赖 vcloud-sdk-nodejs(示例代码中引入的 npm 包为 `@volcengine/openapi`)
依赖 ws
import {Signer} from "@volcengine/openapi";
import {Credentials, RequestObj} from "@volcengine/openapi/lib/base/types";
import WebSocket from 'ws';
import * as fs from 'fs';
import {open} from 'node:fs/promises';
import {sleep} from "@volcengine/openapi/lib/services/rocketmq/utils/common";

const host = "translate.volces.com"
const path = "/api/translate/speech/v1/"

/** Bytes per audio packet: 200 ms of 16 kHz, 16-bit, mono audio. */
const chunkSize = 200 * 32;

/**
 * Builds the signed wss:// URL for the SpeechTranslate action.
 *
 * @returns the endpoint URL with the signed query string appended
 */
function getWebsocketUrl(): string {
    const openApiRequestData: RequestObj = {
        method: "GET",
        region: "cn-north-1",
        params: {
            Action: "SpeechTranslate",
            Version: "2020-06-01",
        },
        pathname: path
    };
    const credentials: Credentials = {
        accessKeyId: "your access key",
        secretKey: "your secret key",
    };
    const signer = new Signer(openApiRequestData, "translate");
    return 'wss://' + host + path + '?' + signer.getSignUrl(credentials);
}

// Sign once and reuse the URL for both the log line and the connection.
const websocketUrl = getWebsocketUrl();
console.log(websocketUrl);
const ws = new WebSocket(websocketUrl);

ws.on('open', async () => {
    console.log("onopen")
    const configuration = {
        "Configuration": {
            "SourceLanguage": "en",
            "TargetLanguages": [
                "zh"
            ],
            "HotWordList": [
                {
                    "Word": "hello",
                    "Scale": 1
                }
            ],
            "Extra": {}
        }
    };
    // The configuration packet must be the first packet on the connection.
    ws.send(JSON.stringify(configuration));

    try {
        // highWaterMark caps each chunk at one packet. Async iteration
        // consumes the stream strictly in order and finishes only at end of
        // file, so End is sent exactly once, after the last audio packet.
        // (A 'readable' listener fires on every buffer refill and must not be
        // used to detect the end of the file.)
        const readStream = fs.createReadStream('audio.wav', {highWaterMark: chunkSize});
        for await (const chunk of readStream) {
            await sleep(200);
            const audio = {
                "AudioData": Buffer.from(chunk).toString("base64")
            };
            ws.send(JSON.stringify(audio));
        }
        ws.send(JSON.stringify({
            "End": true
        }));
    } catch (e) {
        // Reading or sending failed: report it and drop the connection rather
        // than leaving an unhandled rejection.
        console.error(e);
        ws.close();
    }
});

ws.on('message', (data) => {
    console.log("%s", data);
});

ws.on("close", () => {
    console.log("onclose")
});

ws.on("error", (err) => {
    console.error(err)
});

ws.on("ping", () => {
    console.log("headers")
});

ws.on("pong", () => {
});