The Live API enables low-latency, two-way voice and video interactions
with Gemini. This guide covers how to set up a two-way interactive
conversation, adjust audio settings, manage sessions, and more.

Supported models

You can use the Live API with the following models:
Model version | Availability level
gemini-live-2.5-flash | Private GA*
gemini-live-2.5-flash-preview-native-audio | Public preview

* Reach out to your Google account team representative to request access.

Start a conversation

Console

To end the session, click Stop session.

Python

To enable real-time, interactive conversations, run the following example on a local computer with microphone and speaker access (not a Colab notebook). This example sets up a conversation with the API, allowing you to send audio prompts and receive audio responses:
"""
# Installation
# on linux
sudo apt-get install portaudio19-dev
# on mac
brew install portaudio
python3 -m venv env
source env/bin/activate
pip install google-genai
"""
import asyncio
import pyaudio
from google import genai
from google.genai import types
CHUNK=4200
FORMAT=pyaudio.paInt16
CHANNELS=1
RECORD_SECONDS=5
MODEL = 'gemini-live-2.5-flash'
INPUT_RATE=16000
OUTPUT_RATE=24000
client = genai.Client(
vertexai=True,
project=GOOGLE_CLOUD_PROJECT,
location=GOOGLE_CLOUD_LOCATION,
)
config = {
"response_modalities": ["AUDIO"],
"input_audio_transcription": {}, # Configure input transcription
"output_audio_transcription": {}, # Configure output transcription
}
async def main():
print(MODEL)
p = pyaudio.PyAudio()
async with client.aio.live.connect(model=MODEL, config=config) as session:
#exit()
async def send():
stream = p.open(
format=FORMAT, channels=CHANNELS, rate=INPUT_RATE, input=True, frames_per_buffer=CHUNK)
while True:
frame = stream.read(CHUNK)
await session.send(input={"data": frame, "mime_type": "audio/pcm"})
await asyncio.sleep(10**-12)
async def receive():
output_stream = p.open(
format=FORMAT, channels=CHANNELS, rate=OUTPUT_RATE, output=True, frames_per_buffer=CHUNK)
async for message in session.receive():
if message.server_content.input_transcription:
print(message.server_content.model_dump(mode="json", exclude_none=True))
if message.server_content.output_transcription:
print(message.server_content.model_dump(mode="json", exclude_none=True))
if message.server_content.model_turn:
for part in message.server_content.model_turn.parts:
if part.inline_data.data:
audio_data=part.inline_data.data
output_stream.write(audio_data)
await asyncio.sleep(10**-12)
send_task = asyncio.create_task(send())
receive_task = asyncio.create_task(receive())
await asyncio.gather(send_task, receive_task)
asyncio.run(main())

Python

Set up a conversation with the API that lets you send text prompts and receive audio responses:
import base64
import json

import numpy as np
from IPython.display import Audio, Markdown, display
from websockets.asyncio.client import connect

# SERVICE_URL and bearer_token are assumed to be defined by earlier setup
# steps (not shown here).

# Set model generation_config
CONFIG = {"response_modalities": ["AUDIO"]}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {bearer_token[0]}",
}

async def main() -> None:
    # Connect to the server
    async with connect(SERVICE_URL, additional_headers=headers) as ws:

        # Setup the session
        async def setup() -> None:
            await ws.send(
                json.dumps(
                    {
                        "setup": {
                            "model": "gemini-live-2.5-flash",
                            "generation_config": CONFIG,
                        }
                    }
                )
            )

            # Receive setup response
            raw_response = await ws.recv(decode=False)
            setup_response = json.loads(raw_response.decode("ascii"))
            print(f"Connected: {setup_response}")
            return

        # Send text message
        async def send() -> bool:
            text_input = input("Input > ")
            if text_input.lower() in ("q", "quit", "exit"):
                return False

            msg = {
                "client_content": {
                    "turns": [{"role": "user", "parts": [{"text": text_input}]}],
                    "turn_complete": True,
                }
            }

            await ws.send(json.dumps(msg))
            return True

        # Receive server response
        async def receive() -> None:
            responses = []

            # Receive chunks of server response
            async for raw_response in ws:
                response = json.loads(raw_response.decode())
                server_content = response.pop("serverContent", None)
                if server_content is None:
                    break

                model_turn = server_content.pop("modelTurn", None)
                if model_turn is not None:
                    parts = model_turn.pop("parts", None)
                    if parts is not None:
                        for part in parts:
                            pcm_data = base64.b64decode(part["inlineData"]["data"])
                            responses.append(np.frombuffer(pcm_data, dtype=np.int16))

                # End of turn
                turn_complete = server_content.pop("turnComplete", None)
                if turn_complete:
                    break

            # Play the returned audio message
            display(Markdown("**Response >**"))
            display(Audio(np.concatenate(responses), rate=24000, autoplay=True))
            return

        await setup()

        while True:
            if not await send():
                break
            await receive()

await main()

Start the conversation, input your prompts, or type q, quit, or exit to exit.

Change language and voice settings

The Live API uses Chirp 3 to support synthesized speech responses in a variety of HD voices and languages. For a full list and demos of each voice, see Chirp 3: HD voices.

To set the response voice and language:

Console

Python
from google.genai.types import (
    LiveConnectConfig,
    PrebuiltVoiceConfig,
    SpeechConfig,
    VoiceConfig,
)

config = LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=SpeechConfig(
        voice_config=VoiceConfig(
            prebuilt_voice_config=PrebuiltVoiceConfig(
                voice_name=voice_name,
            )
        ),
        language_code="en-US",
    ),
)
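
The config object is then passed to the connect call in the same way as in the conversation examples. The following is a minimal sketch, assuming the client setup from the first example, the config defined above, and that voice_name has been set beforehand (for example to "Aoede", one of the Chirp 3 HD voices):

Python

import asyncio

from google import genai
from google.genai import types

client = genai.Client(
    vertexai=True,
    project=GOOGLE_CLOUD_PROJECT,
    location=GOOGLE_CLOUD_LOCATION,
)
MODEL_ID = "gemini-live-2.5-flash"

async def main():
    # The voice and language settings apply to all audio the session generates.
    async with client.aio.live.connect(model=MODEL_ID, config=config) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello!")])
        )
        async for message in session.receive():
            # Audio bytes arrive in model_turn parts, as in the first example.
            if message.server_content and message.server_content.turn_complete:
                break

asyncio.run(main())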

Change voice activity detection settings

Voice activity detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time.

The model automatically performs VAD on a continuous audio input stream. You can configure the VAD settings using the realtimeInputConfig.automaticActivityDetection field of the setup message. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a message to report the interruption.

If the audio stream pauses for more than a second (for example, if the user turns off the microphone), send an audioStreamEnd event to flush any cached audio. The client can resume sending audio data at any time.

Alternatively, disable automatic VAD by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. With this configuration, the client detects user speech and sends activityStart and activityEnd messages at the appropriate times. An audioStreamEnd event isn't sent. Interruptions are marked by activityEnd. A manual-mode sketch follows the automatic example below.

Python
from pathlib import Path

from google.genai.types import (
    AutomaticActivityDetection,
    Blob,
    EndSensitivity,
    LiveConnectConfig,
    RealtimeInputConfig,
    StartSensitivity,
)

config = LiveConnectConfig(
    response_modalities=["TEXT"],
    realtime_input_config=RealtimeInputConfig(
        automatic_activity_detection=AutomaticActivityDetection(
            disabled=False,  # default
            start_of_speech_sensitivity=StartSensitivity.START_SENSITIVITY_LOW,  # Either START_SENSITIVITY_LOW or START_SENSITIVITY_HIGH
            end_of_speech_sensitivity=EndSensitivity.END_SENSITIVITY_LOW,  # Either END_SENSITIVITY_LOW or END_SENSITIVITY_HIGH
            prefix_padding_ms=20,
            silence_duration_ms=100,
        )
    ),
)

async with client.aio.live.connect(
    model=MODEL_ID,
    config=config,
) as session:
    audio_bytes = Path("sample.pcm").read_bytes()

    await session.send_realtime_input(
        media=Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
    )

    # if stream gets paused, send:
    # await session.send_realtime_input(audio_stream_end=True)

    response = []
    async for message in session.receive():
        if message.server_content.interrupted is True:
            # The model generation was interrupted
            response.append("The session was interrupted")

        if message.text:
            response.append(message.text)

    display(Markdown(f"**Response >** {''.join(response)}"))

Extend a session

The default maximum length of a conversation session is 10 minutes. A goAway notification (BidiGenerateContentServerMessage.goAway) is sent to the client 60 seconds before the session ends.
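
The notification arrives as a regular server message, so the client can watch for it in its receive loop and reconnect or resume before the deadline. The following is a minimal sketch, assuming an open session as in the earlier examples; the go_away field name follows the Gen AI SDK's server message type:

Python

async for message in session.receive():
    # The server warns the client shortly before the connection is closed.
    if message.go_away is not None:
        # time_left reports how long the connection will remain open.
        print(f"Session ending in {message.go_away.time_left}")
        # This is the point to reconnect, for example with a session
        # resumption handle (see Enable and disable session resumption).
        break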

You can extend the session length in 10-minute increments using the Gen AI SDK. There's no limit to the number of times you can extend a session. For an example, see Enable and disable session resumption.

Context window

The Live API context window is used to store real-time streamed data (25 tokens per second (TPS) for audio and 258 TPS for video) and other content, including text inputs and model outputs.

If the context window exceeds the maximum length (set using the Max content size slider in Vertex AI Studio, or trigger_tokens in the API), the oldest turns are truncated using context window compression to prevent abrupt session termination. Context window compression triggers once the context window hits its maximum length, and deletes the oldest parts of the conversation until the total token count is back down to the target size (set either in Vertex AI Studio using the Target context size slider, or using target_tokens in the API).

For example, suppose your maximum context length is set to 32,000 tokens, your target size is set to 16,000 tokens, and the conversation has grown to three turns whose combined size exceeds the 32,000 token maximum. Context window compression triggers: the system goes back to the beginning of the conversation and starts deleting old turns until the total token size is less than the 16,000 token target. The result could be that only Turn 3 remains in active memory, and the conversation continues from that point.
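
To make the deletion order concrete, here is a small standalone sketch (plain Python, not an API call) that simulates sliding-window compression; the per-turn token counts are hypothetical:

Python

def compress(turn_token_counts: list[int], trigger_tokens: int, target_tokens: int) -> list[int]:
    # Simulates sliding-window context compression: once the total exceeds
    # the trigger, delete the oldest turns until the total is at or below
    # the target.
    turns = list(turn_token_counts)
    if sum(turns) > trigger_tokens:
        while turns and sum(turns) > target_tokens:
            turns.pop(0)  # the oldest turn is deleted first
    return turns

# Three turns of 12,000, 10,000, and 12,000 tokens exceed a 32,000 trigger;
# compression deletes Turn 1 and Turn 2, and only Turn 3 remains.
print(compress([12_000, 10_000, 12_000], trigger_tokens=32_000, target_tokens=16_000))
# [12000]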

The minimum and maximum values for the context length and target size are:

Setting (API flag) | Minimum value | Maximum value
Maximum context length (trigger_tokens) | 5,000 | 128,000
Target context size (target_tokens) | 0 | 128,000

To set the context window:

Console

Python

Set the context_window_compression.trigger_tokens and context_window_compression.sliding_window.target_tokens fields in the setup message:
config = types.LiveConnectConfig(
    temperature=0.7,
    response_modalities=['TEXT'],
    system_instruction=types.Content(
        parts=[types.Part(text='test instruction')], role='user'
    ),
    context_window_compression=types.ContextWindowCompressionConfig(
        trigger_tokens=1000,
        sliding_window=types.SlidingWindow(target_tokens=10),
    ),
)

Concurrent sessions

You can have up to 5,000 concurrent sessions per project.

Update system instructions during a session

The Live API lets you update the system instructions during an active session. Use this to adapt the model's responses, such as changing the response language or modifying the tone.

To update the system instructions mid-session, send text content with the system role. The updated system instruction remains in effect for the rest of the session.

Python
await session.send_client_content(
    turns=types.Content(
        role="system", parts=[types.Part(text="new system instruction")]
    ),
    turn_complete=False,
)

Enable and disable session resumption

Session resumption lets you reconnect to a previous session within 24 hours. This is achieved by storing cached data, including text, video, and audio prompts, and model outputs. Project-level privacy is enforced for this cached data.

By default, session resumption is disabled. To enable it, set the sessionResumption field of the BidiGenerateContentSetup message. If enabled, the server periodically takes a snapshot of the current cached session context and stores it in internal storage.

When a snapshot is successfully taken, a resumptionUpdate is returned with the handle ID that you can record and use later to resume the session from the snapshot.

Here's an example of enabling session resumption and retrieving the handle ID:

Python
import asyncio

from google import genai
from google.genai import types

client = genai.Client(
    vertexai=True,
    project=GOOGLE_CLOUD_PROJECT,
    location=GOOGLE_CLOUD_LOCATION,
)
model = "gemini-live-2.5-flash"

# Assumed placeholder: set this to a handle ID from a previous resumptionUpdate
# message to resume that session, or leave as None to start a new session.
previous_session_handle = None

async def main():
    print(f"Connecting to the service with handle {previous_session_handle}...")
    async with client.aio.live.connect(
        model=model,
        config=types.LiveConnectConfig(
            response_modalities=["AUDIO"],
            session_resumption=types.SessionResumptionConfig(
                # The handle of the session to resume is passed here,
                # or else None to start a new session.
                handle=previous_session_handle
            ),
        ),
    ) as session:
        while True:
            await session.send_client_content(
                turns=types.Content(
                    role="user", parts=[types.Part(text="Hello world!")]
                )
            )
            async for message in session.receive():
                # Periodically, the server will send update messages that may
                # contain a handle for the current state of the session.
                if message.session_resumption_update:
                    update = message.session_resumption_update
                    if update.resumable and update.new_handle:
                        # The handle should be retained and linked to the session.
                        return update.new_handle

                # For the purposes of this example, placeholder input is
                # continually fed to the model. In non-sample code, the model
                # inputs would come from the user.
                if message.server_content and message.server_content.turn_complete:
                    break

if __name__ == "__main__":
    asyncio.run(main())

To achieve seamless session resumption, enable transparent mode:

Python
types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    session_resumption=types.SessionResumptionConfig(
        transparent=True,
    ),
)

After transparent mode is enabled, the index of the client message that corresponds with the context snapshot is explicitly returned. This helps identify which client message you need to send again when you resume the session from the resumption handle.
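
The following is a minimal sketch of reading that index, assuming an open session as in the earlier examples; the last_consumed_client_message_index field name follows the API's session resumption update message:

Python

async for message in session.receive():
    if message.session_resumption_update:
        update = message.session_resumption_update
        if update.new_handle:
            # Retain the handle for reconnecting later.
            previous_session_handle = update.new_handle
        if update.last_consumed_client_message_index is not None:
            # Client messages after this index are not covered by the
            # snapshot and must be re-sent when resuming.
            print(f"Snapshot covers client messages up to index "
                  f"{update.last_consumed_client_message_index}")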

More information

For more information on using the Live API, see: Interactive conversations with the Live API