> ## Documentation Index
> Fetch the complete documentation index at: https://docs.siliconflow.cn/llms.txt
> Use this file to discover all available pages before exploring further.

# 创建文本转语音请求

> Generate audio from input text. The endpoint returns the audio as raw binary data, which the caller must handle themselves (for example, by writing it to a file). Reference: https://docs.siliconflow.cn/capabilities/text-to-speech#5


## OpenAPI

````yaml post /audio/speech
# Document header: OpenAPI version, API identity, base URL, and the
# authentication requirement that applies to operations by default.
openapi: '3.0.0'
info:
  title: SiliconFlow API
  version: '1.0.0'
  description: The SiliconFlow REST API
  license:
    name: MIT
    url: 'https://github.com/siliconflow/siliconcloud/blob/main/LICENSE'
  contact:
    name: SiliconFlow Support
    url: 'https://www.siliconflow.cn/'
# Paths in this document are resolved relative to this base URL.
servers:
  - url: 'https://api.siliconflow.cn/v1'
# Requests authenticate with the bearerAuth scheme declared under
# components.securitySchemes.
security:
  - bearerAuth: []
# Operations exposed by the API. This document describes a single one:
# POST /audio/speech (text-to-speech).
paths:
  /audio/speech:
    post:
      tags:
        - Audio
      summary: Create Speech
      # A space follows "Reference:" so Markdown renderers can autolink the
      # URL; without it the link is treated as part of the preceding word.
      description: >-
        Generate audio from input text. The data generated by the interface is
        the binary data of the audio, which requires the user to handle it
        themselves.
        Reference: https://docs.siliconflow.cn/capabilities/text-to-speech#5
      operationId: createSpeech
      requestBody:
        required: true
        content:
          application/json:
            schema:
              # The body must validate against exactly one of the per-model
              # request schemas.
              oneOf:
                - $ref: '#/components/schemas/MOSS-TTSD-v0.5'
                - $ref: '#/components/schemas/CosyVoice2-0.5B'
      # Possible outcomes of the request. 200 carries the audio bytes; every
      # error status reuses a shared definition from components.responses.
      responses:
        '200':
          description: >-
            Generate audio based on the input text. The data generated by the
            interface is in binary format and requires the user to process it
            themselves.
            Reference: https://docs.siliconflow.cn/capabilities/text-to-speech#5

            The response header contains the x-siliconcloud-trace-id field,
            which serves as a unique identifier for tracing requests,
            facilitating log queries and issue troubleshooting.
          headers:
            Transfer-Encoding:
              schema:
                type: string
              description: chunked
          content:
            # NOTE(review): application/audio is not an IANA-registered media
            # type; kept for backward compatibility. Confirm the Content-Type
            # the server actually sends.
            application/audio:
              schema:
                type: string
                format: binary
                example: 音频的二进制数据
            # mp3 is the default response_format of both request schemas, so
            # its media type is listed alongside wav and opus.
            # NOTE(review): no media type is documented for pcm output —
            # confirm what the server returns for it.
            audio/mpeg:
              schema:
                type: string
                format: binary
                example: 音频的二进制数据
            audio/wav:
              schema:
                type: string
                format: binary
                example: 音频的二进制数据
            audio/opus:
              schema:
                type: string
                format: binary
                example: 音频的二进制数据
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
        '429':
          $ref: '#/components/responses/RateLimit'
        '503':
          $ref: '#/components/responses/Overloaded'
        '504':
          $ref: '#/components/responses/Timeout'
      x-codeSamples:
        # Python sample (requests): POSTs a MOSS-TTSD request with
        # "stream": True and writes the streamed response to output.mp3 in
        # 1024-byte chunks; on a non-200 status it prints the status code and
        # the response text. "sk-xx" is a placeholder for a real API key.
        - lang: python
          label: Default
          source: >
            import requests


            url = "https://api.siliconflow.cn/v1/audio/speech"


            payload = {
                "model": "fnlp/MOSS-TTSD-v0.5",
                "input": "你站在桥上看风景，看风景的人在楼上看你。明月装饰了你的窗子，你装饰了别人的梦",
                "voice": "fnlp/MOSS-TTSD-v0.5:alex",
                "response_format": "mp3",
                "stream": True
            }


            headers = {
                "Authorization": "Bearer sk-xx",  # 请替换为您的真实 API Key
                "Content-Type": "application/json"
            }


            response = requests.post(url, json=payload, headers=headers,
            stream=True)


            if response.status_code == 200:
                # 将流式响应写入文件
                with open("output.mp3", "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                print("音频已成功保存为 output.mp3")
            else:
                print(f"请求失败，状态码: {response.status_code}")
                print(f"错误信息: {response.text}")
        # curl sample: sends the same MOSS-TTSD request as the other samples.
        # The response body is binary audio, so --output saves it to
        # output.mp3 instead of writing it to stdout (curl refuses to print
        # binary data to a terminal). "sk-xx" is a placeholder API key.
        - lang: curl
          label: Default
          source: |
            curl --location 'https://api.siliconflow.cn/v1/audio/speech' \
            --header 'Authorization: Bearer sk-xx' \
            --header 'Content-Type: application/json' \
            --output output.mp3 \
            --data '{
              "model": "fnlp/MOSS-TTSD-v0.5",
              "input": "你站在桥上看风景，看风景的人在楼上看你。明月装饰了你的窗子，你装饰了别人的梦",
              "voice": "fnlp/MOSS-TTSD-v0.5:alex",
              "response_format": "mp3",
              "stream": true
            }'
        # Node.js sample (axios + fs): POSTs a MOSS-TTSD request with
        # responseType 'stream' and pipes the response into output.mp3,
        # logging on finish, on write error, and on request failure.
        # "sk-xx" is a placeholder for a real API key.
        - lang: javaScript
          label: Default
          source: |
            const axios = require('axios');
            const fs = require('fs');
            const url = 'https://api.siliconflow.cn/v1/audio/speech';
            const data = {
                model: "fnlp/MOSS-TTSD-v0.5",
                input: "你站在桥上看风景，看风景的人在楼上看你。明月装饰了你的窗子，你装饰了别人的梦",
                voice: "fnlp/MOSS-TTSD-v0.5:alex",
                response_format: "mp3",
                stream: true
            };

            const config = {
                method: 'post',
                url: url,
                headers: {
                    'Authorization': 'Bearer sk-xx', // 请替换为您的真实 API Key
                    'Content-Type': 'application/json'
                },
                data: data,
                responseType: 'stream' // 重要：设置响应类型为流
            };
            axios(config)
                .then(function (response) {
                    // 将流式数据写入文件
                    const writer = fs.createWriteStream('output.mp3');
                    response.data.pipe(writer);

                    writer.on('finish', () => {
                        console.log('音频已成功保存为 output.mp3');
                    });
                    
                    writer.on('error', (err) => {
                        console.error('写入文件时出错:', err);
                    });
                })
                .catch(function (error) {
                    console.error('请求失败:', error.message);
                    if (error.response) {
                        console.error('状态码:', error.response.status);
                    }
                });
components:
  schemas:
    # Request body for the fnlp/MOSS-TTSD-v0.5 spoken-dialogue model.
    # Only `model` and `input` are required; unknown properties are rejected.
    MOSS-TTSD-v0.5:
      title: MOSS-TTSD-v0.5
      type: object
      required:
        - model
        - input
      additionalProperties: false
      properties:
        model:
          type: string
          enum:
            - fnlp/MOSS-TTSD-v0.5
          description: >
            MOSS-TTSD (text to spoken dialogue) is an open-source bilingual
            spoken dialogue synthesis model that supports both Chinese and
            English. It can transform dialogue scripts between two speakers into
            natural, expressive conversational speech. MOSS-TTSD supports voice
            cloning and long single-session speech generation, making it ideal
            for AI podcast production.


            To better enhance service quality, we will make periodic changes to
            the models provided by this service, including but not limited to
            model on/offlining and adjustments to model service capabilities. We
            will notify you of such changes through appropriate means such as
            announcements or message pushes where feasible.
        input:
          type: string
          description: |
            The dialogue text uses speaker tags to indicate turns:
            [S1]: Indicates Speaker 1 is speaking
            [S2]: Indicates Speaker 2 is speaking
          example: >-
            [S1]Hello, how are you today?[S2]I'm doing great, thanks for
            asking![S1]That's wonderful to hear
          maxLength: 128000
          minLength: 1
        max_tokens:
          type: integer
          description: >-
            The maximum number of tokens to generate. The input + output does
            not exceed 32k tokens.
          default: 2048
          example: 4096
        references:
          description: >-
            The voice field and references field are mutually exclusive. If you
            want to use scripted dialogue, you need to pass two voice tones
            through the references field. Scripted dialogue is only available
            for the moss model.
          type: array
          items:
            type: object
            properties:
              audio:
                description: >-
                  The reference audio, which can be either a URL pointing to
                  an audio file or a base64-encoded audio string.
                oneOf:
                  - type: string
                    # `uri` is the defined string format (`url` is not), and
                    # matches the CosyVoice2-0.5B schema.
                    format: uri
                    description: >-
                      A URL pointing to an audio file (e.g.,
                      `https://example.com/audio.mp3`).
                  - type: string
                    pattern: ^data:audio\/\w+;base64,[A-Za-z0-9+/=]+$
                    description: >-
                      A base64-encoded audio string (e.g.,
                      `data:audio/mp3;base64,ABC123...`).
              text:
                # NOTE(review): this description used to be a copy of the
                # audio description. The wording assumes `text` is the
                # transcript of `audio` — confirm.
                description: >-
                  The text content corresponding to the reference audio (its
                  transcript).
                type: string
        voice:
          description: >-
            The "voice" field currently does not support two timbres. If you
            need to upload two timbres, please use "references".
          type: string
          enum:
            - fnlp/MOSS-TTSD-v0.5:alex
            - fnlp/MOSS-TTSD-v0.5:anna
            - fnlp/MOSS-TTSD-v0.5:bella
            - fnlp/MOSS-TTSD-v0.5:benjamin
            - fnlp/MOSS-TTSD-v0.5:charles
            - fnlp/MOSS-TTSD-v0.5:claire
            - fnlp/MOSS-TTSD-v0.5:david
            - fnlp/MOSS-TTSD-v0.5:diana
        response_format:
          description: >-
            The audio output format. Supported formats are `mp3`, `opus`,
            `wav`, `pcm`.
          default: mp3
          type: string
          enum:
            - mp3
            - opus
            - wav
            - pcm
        sample_rate:
          description: >-
            Controls the output sample rate. The supported values and the
            default differ for different audio output formats, as follows:
            opus: Supports 48000 Hz.
            wav, pcm: Supports 8000, 16000, 24000, 32000, 44100 Hz, with a
            default of 44100 Hz. mp3: Supports 32000, 44100 Hz, with a default
            of 44100 Hz.
          type: number
          # NOTE(review): 32000 conflicts with the 44100 Hz default stated in
          # the description for mp3/wav/pcm and is not listed for opus —
          # confirm the real server-side default.
          default: 32000
        stream:
          description: streaming or not
          type: boolean
          default: true
        speed:
          type: number
          description: >-
            The speed of the generated audio. Select a value from `0.25` to
            `4.0`. `1.0` is the default.
          format: float
          minimum: 0.25
          maximum: 4
          default: 1
        gain:
          type: number
          format: float
          minimum: -10
          maximum: 10
          default: 0
    # Request body for the FunAudioLLM/CosyVoice2-0.5B model.
    # Only `model` and `input` are required; unknown properties are rejected.
    CosyVoice2-0.5B:
      title: CosyVoice2-0.5B
      type: object
      required:
        - model
        - input
      additionalProperties: false
      properties:
        model:
          type: string
          enum:
            - FunAudioLLM/CosyVoice2-0.5B
          description: >-
            Corresponding Model Name. To better enhance service quality, we will
            make periodic changes to the models provided by this service,
            including but not limited to model on/offlining and adjustments to
            model service capabilities. We will notify you of such changes
            through appropriate means such as announcements or message pushes
            where feasible.
        input:
          type: string
          # The marker is described as coming AFTER the instruction, which is
          # where the example and default values below place it.
          description: >-
            For natural language instructions, add the special end marker
            "<|endofprompt|>" after the natural language description. These
            descriptions cover aspects such as emotion, speaking speed,
            role-playing, and dialects. For detailed instructions, insert pitch
            bursts between text markers, using markers like "[laughter]" and
            "[breath]." Additionally, we apply pitch feature markers to phrases;
            for example: Can you say it with a happy emotion? <|endofprompt|>
            Today is really happy, Spring Festival is coming! I’m so happy,
            Spring Festival is coming! [laughter] [breath].
          example: >-
            Can you say it with a happy emotion? <|endofprompt|>I'm so happy,
            Spring Festival is coming!
          default: >-
            Can you say it with a happy emotion? <|endofprompt|>I'm so happy,
            Spring Festival is coming!
          maxLength: 128000
          minLength: 1
        voice:
          type: string
          enum:
            - FunAudioLLM/CosyVoice2-0.5B:alex
            - FunAudioLLM/CosyVoice2-0.5B:anna
            - FunAudioLLM/CosyVoice2-0.5B:bella
            - FunAudioLLM/CosyVoice2-0.5B:benjamin
            - FunAudioLLM/CosyVoice2-0.5B:charles
            - FunAudioLLM/CosyVoice2-0.5B:claire
            - FunAudioLLM/CosyVoice2-0.5B:david
            - FunAudioLLM/CosyVoice2-0.5B:diana
        references:
          description: The voice field and references field are mutually exclusive.
          type: array
          items:
            type: object
            properties:
              audio:
                description: >-
                  The reference audio, which can be either a URL pointing to
                  an audio file or a base64-encoded audio string.
                oneOf:
                  - type: string
                    format: uri
                    description: >-
                      A URL pointing to an audio file (e.g.,
                      `https://example.com/audio.mp3`).
                  - type: string
                    pattern: ^data:audio\/\w+;base64,[A-Za-z0-9+/=]+$
                    description: >-
                      A base64-encoded audio string (e.g.,
                      `data:audio/mp3;base64,ABC123...`).
              text:
                # NOTE(review): this description used to be a copy of the
                # audio description. The wording assumes `text` is the
                # transcript of `audio` — confirm.
                description: >-
                  The text content corresponding to the reference audio (its
                  transcript).
                type: string
        response_format:
          description: >-
            The audio output format. Supported formats are `mp3`, `opus`,
            `wav`, `pcm`.
          default: mp3
          type: string
          enum:
            - mp3
            - opus
            - wav
            - pcm
        sample_rate:
          description: >-
            Controls the output sample rate. The supported values and the
            default differ for different audio output formats, as follows:
            opus: Supports 48000 Hz.
            wav, pcm: Supports 8000, 16000, 24000, 32000, 44100 Hz, with a
            default of 44100 Hz. mp3: Supports 32000, 44100 Hz, with a default
            of 44100 Hz.
          type: number
          # NOTE(review): 32000 conflicts with the 44100 Hz default stated in
          # the description for mp3/wav/pcm and is not listed for opus —
          # confirm the real server-side default.
          default: 32000
        stream:
          # NOTE(review): the MOSS-TTSD-v0.5 schema declares `default: true`
          # for this field; no default is declared here — confirm whether the
          # two models really differ.
          description: streaming or not
          type: boolean
        speed:
          type: number
          description: >-
            The speed of the generated audio. Select a value from `0.25` to
            `4.0`. `1.0` is the default.
          format: float
          minimum: 0.25
          maximum: 4
          default: 1
        gain:
          type: number
          format: float
          minimum: -10
          maximum: 10
          default: 0
    # JSON body of the BadRequest (400) response.
    # NOTE: the key is misspelled ("BadRquestData"); it is kept unchanged
    # because components.responses.BadRequest references it by this name.
    BadRquestData:
      type: object
      required:
        - message
        - data
        - code
      properties:
        code:
          # The former `default: false` was removed: a boolean is not a valid
          # default for an integer property.
          type: integer
          nullable: true
          example: 20012
        message:
          type: string
          nullable: false
        data:
          type: string
          nullable: false
    # Plain-string body of the Unauthorized (401) response.
    # The former `default: false` was removed: a boolean is not a valid
    # default for a string schema.
    UnauthorizedData:
      type: string
      example: Invalid token
    # Plain-string body of the Forbidden (403) response.
    # The former `default: false` was removed: a boolean is not a valid
    # default for a string schema.
    ForbiddenData:
      type: string
      example: Forbidden
    # Plain-string body of the NotFound (404) response.
    # The former `default: false` was removed: a boolean is not a valid
    # default for a string schema.
    NotFoundData:
      type: string
      example: 404 page not found
    # JSON body of the RateLimit (429) response: a human-readable message
    # and a data string, both required.
    RateLimitData:
      type: object
      properties:
        message:
          example: >-
            Request was rejected due to rate limiting. If you want more,
            please contact contact@siliconflow.cn. Details:TPM limit reached.
          type: string
        data:
          type: string
      required:
        - message
        - data
    # JSON body of the Overloaded (503) response: numeric code, message and
    # data string, all required. The key spelling ("OverloadedtData") is the
    # name components.responses.Overloaded references.
    OverloadedtData:
      type: object
      properties:
        code:
          example: 50505
          type: integer
        message:
          example: Model service overloaded. Please try again later.
          type: string
        data:
          nullable: false
          type: string
      required:
        - code
        - message
        - data
    # Plain-string body of the Timeout response (mapped to HTTP 504 by
    # POST /audio/speech).
    TimeoutData:
      type: string
  # Shared error responses. Each one returns application/json whose body is
  # described by the matching *Data schema under components.schemas.
  responses:
    # 400
    BadRequest:
      description: BadRequest
      content:
        application/json:
          schema: { $ref: '#/components/schemas/BadRquestData' }
    # 401
    Unauthorized:
      description: Unauthorized
      content:
        application/json:
          schema: { $ref: '#/components/schemas/UnauthorizedData' }
    # 403
    Forbidden:
      description: Forbidden
      content:
        application/json:
          schema: { $ref: '#/components/schemas/ForbiddenData' }
    # 404
    NotFound:
      description: NotFound
      content:
        application/json:
          schema: { $ref: '#/components/schemas/NotFoundData' }
    # 429
    RateLimit:
      description: RateLimit
      content:
        application/json:
          schema: { $ref: '#/components/schemas/RateLimitData' }
    # 503
    Overloaded:
      description: Overloaded
      content:
        application/json:
          schema: { $ref: '#/components/schemas/OverloadedtData' }
    # 504
    Timeout:
      description: Timeout
      content:
        application/json:
          schema: { $ref: '#/components/schemas/TimeoutData' }
  # Authentication: HTTP bearer scheme; the token is the caller's API key.
  securitySchemes:
    bearerAuth:
      description: >-
        Use the following format for authentication:
        Bearer [<your api key>](https://cloud.siliconflow.cn/account/ak)
      type: http
      scheme: bearer
      bearerFormat: your api key

````