# OpenAPI 3.0 description of the Sarvam AI language endpoints served from the
# API.market gateway URL listed under `servers`.
# Quoted so the spec version is always read as a string.
openapi: "3.0.1"
servers:
  - url: https://prod.api.market/api/v1/sarvam/ai-models
info:
  title: "Sarvam AI API Suite for Language Processing: Multilingual Translation,
    Speech-to-Text, and TTS"
  description: Seamlessly translate, transcribe, and synthesize voice across
    Indian languages.
  # Quoted so the document version is always read as a string.
  version: "1.0.1"
components:
  # No reusable security schemes are declared in this document.
  securitySchemes: {}
# `security` must be a list of Security Requirement Objects; a mapping (`{}`)
# fails OpenAPI schema validation. An empty list declares no global
# security requirement.
security: []
paths:
  # Text translation endpoint: takes a JSON body and responds with the
  # translated string.
  /translate:
    post:
      operationId: translateText
      summary: Translate Text
      description: Translates input text to the specified target language.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              properties:
                input:
                  description: Input text to be translated
                  type: string
                  example: This is a sample string to translate to Hindi
                source_language_code:
                  description: Source language code
                  type: string
                  enum: [en-IN]
                  example: en-IN
                target_language_code:
                  description: Target language code
                  type: string
                  enum: [hi-IN, bn-IN, kn-IN, ml-IN, mr-IN, od-IN, pa-IN, ta-IN,
                    te-IN, gu-IN]
                  example: hi-IN
                speaker_gender:
                  description: >
                    Gender of the speaker.
                    Only supported for code-mixed translation models.
                  type: string
                  enum: [Male, Female]
                  default: Female
                  example: Male
                mode:
                  description: Mode of translation, either formal or code-mixed.
                  type: string
                  enum: [formal, code-mixed]
                  default: formal
                  example: formal
                model:
                  description: Model used for translation
                  type: string
                  enum: ['mayura:v1']
                  default: 'mayura:v1'
                  example: 'mayura:v1'
                enable_preprocessing:
                  description: >
                    Enable custom preprocessing
                    for potentially better translations.
                  type: boolean
                  default: true
                  example: true
      responses:
        '200':
          description: Successful translation
          content:
            application/json:
              schema:
                type: object
                properties:
                  translated_text:
                    description: Translated text result
                    type: string
                    example: '<Translated string in Hindi>'
    # Path-level header parameter carrying the API key. It is anchored as `a1`
    # so that the other paths in this document can alias it.
    # NOTE(review): `value` is not a field of the OpenAPI Parameter Object;
    # presumably it is consumed by API.market tooling. Verify before removing.
    parameters:
      - &a1
        name: x-api-market-key
        in: header
        required: true
        description: API.market API Key
        value: Please Login/Signup to get an API Key
        schema:
          type: string
  # Same-language transcription endpoint: takes a multipart upload and responds
  # with a transcript plus optional per-word timing arrays.
  /speech-to-text:
    post:
      operationId: speechToText
      summary: Speech to Text
      description: >
        Transcribes audio input to text in the same language using Sarvam
        speech-to-text models.
        Suitable for use cases requiring transcripts in the native language.

        The ideal input audio size is below 5 minutes.
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  description: >-
                    The audio file to transcribe.
                    Supported formats are .wav and .mp3 (16kHz preferred).
                    Multiple channels will be merged.
                  type: string
                  format: binary
                language_code:
                  description: Language code for transcription
                  type: string
                  enum: [hi-IN, bn-IN, kn-IN, ml-IN, mr-IN, od-IN, pa-IN, ta-IN,
                    te-IN, gu-IN]
                  example: bn-IN
                model:
                  description: Model to be used for speech-to-text
                  type: string
                  enum: ['saarika:v1']
                  default: 'saarika:v1'
                  example: 'saarika:v1'
                with_timestamps:
                  description: Enable word-level timestamps in the transcript
                  type: boolean
                  default: false
                  example: true
      responses:
        '200':
          description: Successful transcription
          content:
            application/json:
              schema:
                type: object
                properties:
                  transcript:
                    description: Transcript of the provided speech.
                    type: string
                    example: '<string>'
                  # Three arrays: words, their start times and their end times.
                  timestamps:
                    type: object
                    properties:
                      words:
                        description: List of words in the transcript.
                        type: array
                        items:
                          type: string
                        example: ['<string>']
                      start_time_seconds:
                        description: List of start times of words in seconds.
                        type: array
                        items:
                          type: number
                        example: [123]
                      end_time_seconds:
                        description: List of end times of words in seconds.
                        type: array
                        items:
                          type: number
                        example: [123]
    # Reuses the API-key header parameter anchored as `a1` earlier in the file.
    parameters:
      - *a1
  # Speech-to-English endpoint: takes a multipart upload and responds with the
  # transcript and the detected source language.
  /speech-to-text-translate:
    post:
      summary: Speech To Text Translate
      description: >
        Transcribes audio input and directly translates the text to English
        using a single model that can automatically detect the language.
        Suitable for voice-based LLM applications.

        The ideal input audio size is below 5 minutes.
      operationId: speechToTextTranslate
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  type: string
                  format: binary
                  description: The audio file to transcribe. Supported formats are .wav and .mp3
                    (16kHz preferred). Multiple channels will be merged.
                prompt:
                  type: string
                  nullable: true
                  description: >
                    Conversation context to boost model accuracy. This is an
                    experimental feature and may not match the performance of
                    large language models.
                  example: Create a nice translation like a normal person
                model:
                  type: string
                  enum:
                    - saaras:v1
                  default: saaras:v1
                  description: Model used for converting speech to text in the target language
                  example: saaras:v1
      responses:
        "200":
          description: Successful speech-to-text translation
          content:
            application/json:
              schema:
                type: object
                properties:
                  transcript:
                    type: string
                    description: Transcript of the provided speech.
                    example: <string>
                  language_code:
                    type: string
                    nullable: true
                    description: >
                      BCP-47 code of the language detected in the input. If
                      multiple languages are detected, returns the most
                      predominant language. Returns `null` if no language is
                      detected.
                    enum:
                      - hi-IN
                      - bn-IN
                      - kn-IN
                      - ml-IN
                      - mr-IN
                      - od-IN
                      - pa-IN
                      - ta-IN
                      - te-IN
                      - gu-IN
                      - en-IN
                      # `nullable: true` does not relax `enum`: without this
                      # entry the documented `null` result fails validation.
                      - null
                    example: hi-IN
    # Reuses the API-key header parameter anchored as `a1` earlier in the file.
    parameters:
      - *a1
  # Call analysis endpoint: takes a multipart upload with an audio file and a
  # set of questions, and responds with the transcript and per-question answers.
  /call-analytics:
    post:
      operationId: callAnalytics
      summary: Call Analytics
      description: >
        Analyzes an audio file of a call between two parties and returns
        the transcript, along with answers to specified questions.
        Each response includes reasoning and direct phrases from the
        transcript.
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  description: >
                    The audio file to be analyzed.
                    Supported formats are .wav and .mp3 (16kHz preferred).
                    Multi-channel audio will be merged to mono.
                    File size must be <10MB and duration <600 seconds (10 mins).
                  type: string
                  format: binary
                # Declared as a plain string form field.
                questions:
                  description: >
                    List of questions to answer based on the call content.
                    Each question should be a JSON object with fields `id`,
                    `text`, `description`, `type`, and `properties`.
                  type: string
                hotwords:
                  description: >
                    Comma-separated keywords specific to your domain,
                    preserved as-is in the transcript.
                  type: string
                  nullable: true
                  example: 'sales, product, ai, API'
      responses:
        '200':
          description: Successful call analysis
          content:
            application/json:
              schema:
                type: object
                properties:
                  file_name:
                    description: Unique identifier for the analyzed audio file.
                    type: string
                    nullable: true
                    example: call_20230901_123456.mp3
                  transcript:
                    description: >-
                      Full transcript of the call generated by Sarvam's
                      in-house speech-to-text model.
                    type: string
                    example: >-
                      Agent: Thank you for calling customer support.
                      How may I assist you today?...
                  answers:
                    type: array
                    items:
                      type: object
                      properties:
                        id:
                          description: >-
                            Unique identifier for the question-answer pair,
                            matching the question ID.
                          type: string
                          example: q1
                        question:
                          description: The original question text that was asked and analyzed.
                          type: string
                          example: What was the main issue discussed in the call?
                        reasoning:
                          description: Explanation justifying the answer.
                          type: string
                          example: >-
                            The customer repeatedly mentioned issues with
                            their internet connection.
                        response:
                          description: Answer to the question based on the call analysis.
                          type: string
                          example: INTERNET_ISSUES
    # Reuses the API-key header parameter anchored as `a1` earlier in the file.
    parameters:
      - *a1
  # Text-to-speech endpoint: takes a JSON body with up to three texts and
  # responds with base64-encoded wave audio, per the descriptions below.
  /text-to-speech:
    post:
      summary: Text to Speech
      description: >
        Converts text into spoken audio. The output is a wave file encoded as a
        base64 string.
      operationId: textToSpeech
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              properties:
                inputs:
                  type: array
                  # Limits below mirror the ones stated in the description so
                  # that validators and generated clients enforce them.
                  maxItems: 3
                  items:
                    type: string
                    maxLength: 500
                  description: The text to be spoken. Maximum limit of 500 characters per text; up
                    to 3 texts in a single request.
                  example:
                    - The yellow fox jumped over the something something
                    - and then some Another string
                target_language_code:
                  type: string
                  enum:
                    - hi-IN
                    - bn-IN
                    - kn-IN
                    - ml-IN
                    - mr-IN
                    - od-IN
                    - pa-IN
                    - ta-IN
                    - te-IN
                    - en-IN
                    - gu-IN
                  description: The language of the text in BCP-47 format
                  example: hi-IN
                speaker:
                  type: string
                  enum:
                    - meera
                    - pavithra
                    - maitreyi
                    - arvind
                    - amol
                    - amartya
                  default: meera
                  description: The speaker or voice to use
                  example: meera
                pitch:
                  type: number
                  nullable: true
                  # Range taken from the description; bounds are inclusive.
                  minimum: -0.75
                  maximum: 0.75
                  description: Control the pitch of the audio, with a range from -0.75 to 0.75.
                  example: 0
                pace:
                  type: number
                  nullable: true
                  # Range taken from the description; bounds are inclusive.
                  minimum: 0.5
                  maximum: 2
                  description: Control the speed of the audio, with a range from 0.5 to 2.
                  example: 1
                loudness:
                  type: number
                  nullable: true
                  default: 1
                  # Range taken from the description; bounds are inclusive.
                  minimum: 0.3
                  maximum: 3.0
                  description: Control the loudness of the audio, with a range from 0.3 to 3.0.
                  example: 1
                speech_sample_rate:
                  type: integer
                  enum:
                    - 8000
                    - 16000
                    - 22050
                  default: 22050
                  description: The sample rate of the output audio; default is 22050.
                  example: 16000
                enable_preprocessing:
                  type: boolean
                  default: true
                  description: >
                    Controls whether normalization of English words and numeric
                    entities is applied.
                  example: true
                model:
                  type: string
                  enum:
                    - bulbul:v1
                  default: bulbul:v1
                  description: Model used for converting text inputs to speech.
                  example: bulbul:v1
      responses:
        "200":
          description: Successful text-to-speech conversion
          content:
            application/json:
              schema:
                type: object
                properties:
                  audios:
                    type: array
                    items:
                      type: string
                    description: Array of base64-encoded wave (.wav) file outputs.
                    example:
                      - <base64-encoded audio string>
    # Reuses the API-key header parameter anchored as `a1` earlier in the file.
    parameters:
      - *a1
# Tag definitions declared for this document.
tags:
  - description: 'Operations related to artificial-intelligence'
    name: 'artificial-intelligence'