discord voice | | | Search

The code initializes a Discord bot, connects to a voice channel, and plays text-to-speech audio using the outetts library and FFmpeg, leveraging asynchronous programming for efficient execution.

The code sets up a Discord bot, connects to a voice channel, and utilizes FFmpeg for audio processing, allowing for text-to-speech functionality. It follows best practices for coding and documentation, utilizing modular and reusable design principles.

Run example

npm run import -- "send voice on discord"

send voice on discord

import os
import io
import ffmpeg
import discord
import outetts
import asyncio

# Bot credentials and the target guild are read from the environment;
# each value is None when the variable is not set.
TOKEN = os.environ.get("BOT_TOKEN")
GUILD_ID = os.environ.get("DEFAULT_GUILD_ID")
# VC_CHANNEL_ID = "YOUR_VOICE_CHANNEL_ID"

# Model weights and tokenizer both come from the same OuteTTS checkpoint.
model_config = outetts.HFModelConfig_v2(
    tokenizer_path="OuteAI/OuteTTS-0.3-1B",
    model_path="OuteAI/OuteTTS-0.3-1B",
)

async def init_discord(token=TOKEN):
    """Load the TTS model, start the Discord client and join voice once ready.

    The Outetts interface, the default speaker and the Discord client are
    published as module globals because ``connect_voice`` and ``play_tts``
    look them up by name.

    Args:
        token: Bot token used to log in. Defaults to the module-level
            ``TOKEN`` read from the ``BOT_TOKEN`` environment variable.
    """
    # Initialize Outetts and expose it to play_tts, which reads `interface`
    # and `speaker` as globals (they were previously locals -> NameError).
    interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
    globals()['interface'] = interface
    globals()['speaker'] = interface.load_default_speaker(name="en_male_1")

    client = discord.Client(intents=discord.Intents.default())
    globals()['client'] = client

    voice_started = False

    @client.event
    async def on_ready():
        # The guild cache is only filled once the client is ready, so the
        # voice connection must not be attempted concurrently with start().
        # on_ready may be dispatched more than once; connect only the first time.
        nonlocal voice_started
        if voice_started:
            return
        voice_started = True
        await connect_voice()

    # Use the caller-supplied token rather than always the global TOKEN.
    await client.start(token)

async def connect_voice(prompt=None):
    """Join a voice channel of the configured guild and optionally speak.

    Args:
        prompt: Optional text to synthesize and play right after connecting.
            When omitted, the bot only joins the channel. (``play_tts``
            requires a prompt, so it can no longer be called without one.)

    Returns:
        The object returned by ``vc_channel.connect()``, or ``None`` when the
        guild has no voice channel.

    Raises:
        RuntimeError: If ``DEFAULT_GUILD_ID`` is not set, or the guild is not
            among ``client.guilds``.
    """
    client = globals()['client']
    print(f'Logged in as {client.user}')
    if not GUILD_ID:
        raise RuntimeError("DEFAULT_GUILD_ID is not set")
    guild = discord.utils.get(client.guilds, id=int(GUILD_ID))
    if guild is None:
        raise RuntimeError(
            f"Guild {GUILD_ID} not found; is the client ready and a member of it?"
        )
    # No attribute filter is passed, so no specific channel is selected here.
    vc_channel = discord.utils.get(guild.voice_channels)  # , id=int(VC_CHANNEL_ID))
    if vc_channel is None:
        return None
    vc = await vc_channel.connect()
    if prompt is not None:
        await play_tts(prompt, vc)
    return vc

async def play_tts(prompt, vc):
    """Synthesize ``prompt`` with Outetts and play it on the voice client ``vc``.

    Args:
        prompt: Text to turn into speech.
        vc: Connected voice client; its ``play`` method receives the audio.
    """
    # `interface` and `speaker` are module globals. Create them on first use
    # when nothing has published them yet, instead of failing with NameError.
    state = globals()
    if 'interface' not in state:
        state['interface'] = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
    if 'speaker' not in state:
        state['speaker'] = state['interface'].load_default_speaker(name="en_male_1")
    tts = state['interface']
    voice = state['speaker']

    def _synthesize():
        # Generate speech and return it as WAV bytes.
        gen_cfg = outetts.GenerationConfig(
            text=prompt,
            temperature=0.3,
            repetition_penalty=1.1,
            max_length=4096,
            speaker=voice,
        )
        output = tts.generate(config=gen_cfg)
        audio_buffer = io.BytesIO()
        output.save(audio_buffer)
        return audio_buffer.getvalue()

    # Synthesis is a blocking call; run it in a worker thread so the event
    # loop is not stalled while the audio is being generated.
    loop = asyncio.get_running_loop()
    wav_bytes = await loop.run_in_executor(None, _synthesize)

    # Stream audio to Discord using FFmpeg. stderr is deliberately not piped:
    # nothing reads it, and an unread pipe can stall ffmpeg once it fills up.
    ffmpeg_process = (
        ffmpeg.input('pipe:0', format='wav')
        .output('pipe:1', format='opus', acodec='libopus', ar='48000')
        .run_async(pipe_stdin=True, pipe_stdout=True)
    )

    def _feed_ffmpeg():
        # Writing everything up front can deadlock: ffmpeg blocks on its
        # unread stdout while this side blocks on stdin. Feeding from a
        # worker thread lets the player below drain stdout concurrently.
        try:
            with ffmpeg_process.stdin as sink:
                sink.write(wav_bytes)
        except OSError as exc:
            # ffmpeg went away before consuming all input (e.g. playback stopped).
            print(f'ffmpeg stopped reading input: {exc}')

    loop.run_in_executor(None, _feed_ffmpeg)

    # Play the stream in Discord
    audio_source = discord.FFmpegPCMAudio(ffmpeg_process.stdout, pipe=True)
    vc.play(audio_source)

#__all__ = {
#  "init_discord": init_discord,
#  "connect_voice": connect_voice,
#  "play_tts": "play_tts",
#}

# __all__ must be a sequence of public names (strings); binding it to the
# function object itself makes `from module import *` fail.
# NOTE(review): confirm no external loader relies on __all__ being the callable.
__all__ = ["init_discord"]

What the code could have been:

"""
Discord Bot using Outetts for TTS and FFmpeg for audio playback.
"""

import os
import io
import ffmpeg
import discord
import asyncio
import outetts

# Settings taken from the environment. TOKEN and GUILD_ID are None when
# unset; VC_CHANNEL_ID falls back to an empty string.
TOKEN = os.environ.get("BOT_TOKEN")
GUILD_ID = os.environ.get("DEFAULT_GUILD_ID")
VC_CHANNEL_ID = os.environ.get("VC_CHANNEL_ID", "")

class OutettsInterface:
    """Holds the Outetts model configuration, interface and default speaker."""

    def __init__(self):
        """Build the model config, the HF interface and load the speaker."""
        self.model_config = outetts.HFModelConfig_v2(
            tokenizer_path="OuteAI/OuteTTS-0.3-1B",
            model_path="OuteAI/OuteTTS-0.3-1B",
        )
        self.interface = outetts.InterfaceHF(
            cfg=self.model_config,
            model_version="0.3",
        )
        self.speaker = self.interface.load_default_speaker(name="en_male_1")

    async def generate_tts(self, prompt):
        """Generate speech for ``prompt``.

        Args:
            prompt: Text passed to the generation config as ``text``.

        Returns:
            Whatever ``self.interface.generate`` returns for that config.
        """
        settings = {
            "text": prompt,
            "temperature": 0.3,
            "repetition_penalty": 1.1,
            "max_length": 4096,
            "speaker": self.speaker,
        }
        return self.interface.generate(config=outetts.GenerationConfig(**settings))

class DiscordClient:
    """Wrapper around ``discord.Client`` that joins a voice channel and plays audio."""

    def __init__(self, token):
        """Create the underlying client with default intents and keep the token.

        Args:
            token: Bot token later passed to ``discord.Client.start``.
        """
        self.client = discord.Client(intents=discord.Intents.default())
        self.token = token

    async def start(self):
        """Start the Discord client with the stored token."""
        await self.client.start(self.token)

    async def connect_voice(self, guild_id, audio_stream=None):
        """Connect to a voice channel of the given guild.

        Args:
            guild_id: Guild ID (anything ``int()`` accepts).
            audio_stream: Optional stream to play right after connecting.
                ``play_audio`` requires one, so playback is skipped when it
                is not supplied (calling it without a stream raised TypeError).

        Returns:
            The object returned by ``vc_channel.connect()``, or ``None`` when
            the guild has no voice channel.

        Raises:
            RuntimeError: If ``guild_id`` is empty or the guild is not among
                ``self.client.guilds``.
        """
        if not guild_id:
            raise RuntimeError("No guild ID provided")
        guild = discord.utils.get(self.client.guilds, id=int(guild_id))
        if guild is None:
            raise RuntimeError(
                f"Guild {guild_id} not found; is the client ready and a member of it?"
            )
        vc_channel = discord.utils.get(guild.voice_channels)
        if vc_channel is None:
            return None
        vc = await vc_channel.connect()
        if audio_stream is not None:
            await self.play_audio(vc, audio_stream)
        return vc

    async def play_audio(self, vc, audio_stream):
        """Play audio from a stream.

        Args:
            vc: Voice client whose ``play`` method receives the source.
            audio_stream: Stream handed to ``discord.FFmpegPCMAudio`` with
                ``pipe=True``.
        """
        audio_source = discord.FFmpegPCMAudio(audio_stream, pipe=True)
        vc.play(audio_source)

class AudioConverter:
    """Convert Outetts output to an in-memory stream."""

    def __init__(self):
        # Most recent buffer produced by convert(); empty until the first call.
        self.audio_buffer = io.BytesIO()

    def convert(self, output):
        """Write ``output`` into a fresh buffer and return it rewound to the start.

        A new ``BytesIO`` is used for every call. Reusing one buffer left
        bytes from an earlier clip in the stream whenever ``convert`` was
        called more than once on the same converter.

        Args:
            output: Object exposing ``save(file_like)``.

        Returns:
            The ``io.BytesIO`` holding the saved data, positioned at offset 0.
            It is also stored on ``self.audio_buffer``.
        """
        self.audio_buffer = io.BytesIO()
        output.save(self.audio_buffer)
        self.audio_buffer.seek(0)
        return self.audio_buffer

async def init_discord(token):
    """
    Initialize the Discord bot and join voice once the client is ready.

    Args:
        token (str): The Discord bot token.
    """
    client = DiscordClient(token)
    # connect_voice() and play_tts() look the wrapper up as the module
    # global `client`, so publish it before starting.
    globals()['client'] = client

    joined = False

    @client.client.event
    async def on_ready():
        # Awaiting start() first would delay the voice connection until the
        # client had already stopped; connect from the ready event instead.
        # on_ready may be dispatched more than once; connect only the first time.
        nonlocal joined
        if joined:
            return
        joined = True
        await client.connect_voice(GUILD_ID)

    await client.start()

async def connect_voice(guild_id):
    """
    Connect to a voice channel using the module-level client wrapper.

    Args:
        guild_id (str): The ID of the guild.

    Raises:
        RuntimeError: If no client has been published yet, or the guild is
            not among the client's guilds.
    """
    wrapper = globals().get('client')
    if wrapper is None:
        raise RuntimeError("Discord client is not initialised; call init_discord first")
    # The global is the DiscordClient wrapper (it provides connect_voice);
    # `user` and `guilds` live on the discord.Client it wraps.
    bot = wrapper.client
    print(f'Logged in as {bot.user}')
    guild = discord.utils.get(bot.guilds, id=int(guild_id))
    if guild is None:
        raise RuntimeError(
            f"Guild {guild_id} not found; is the client ready and a member of it?"
        )
    vc_channel = discord.utils.get(guild.voice_channels)
    if vc_channel:
        await wrapper.connect_voice(guild_id)

async def play_tts(prompt, vc):
    """
    Play TTS from a prompt.

    Args:
        prompt (str): The prompt to generate TTS from.
        vc (discord.VoiceClient): The voice client to play audio from.

    Raises:
        RuntimeError: If the module-level client has not been set.
    """
    state = globals()
    wrapper = state.get('client')
    if wrapper is None:
        raise RuntimeError("Discord client is not initialised; call init_discord first")

    # Constructing OutettsInterface builds the outetts interface and loads the
    # speaker; keep one instance instead of rebuilding it on every call.
    interface = state.get('_tts_interface')
    if interface is None:
        interface = state['_tts_interface'] = OutettsInterface()

    output = await interface.generate_tts(prompt)
    audio_stream = AudioConverter().convert(output)
    await wrapper.play_audio(vc, audio_stream)

# Initialization
# Placeholder until init_discord() creates the client. Reading
# globals()['client'] here raised KeyError at import time because nothing
# had defined it yet. TOKEN and GUILD_ID are already set in the
# configuration section above and are not re-read here.
client = None

__all__ = [
    "init_discord",
    "connect_voice",
    "play_tts",
]

Code Breakdown

Importing Modules

The code imports the following modules: os, io, ffmpeg, discord, outetts, and asyncio.

Initializing Discord Bot

The code initializes a Discord bot by:

  1. Obtaining the bot token and default guild ID from environment variables BOT_TOKEN and DEFAULT_GUILD_ID.
  2. Defining an Outetts model configuration.
  3. Creating a Discord client with default intents and setting it as a global variable.

Asynchronous Functions

The code defines three asynchronous functions. The first two are listed here; the third, play_tts, is described in the next section:

  1. init_discord: Initializes the Discord bot and connects to a voice channel.
  2. connect_voice: Connects to a voice channel and starts playing text-to-speech audio.

Text-to-Speech Functionality

The code uses the outetts library to generate speech from text prompts. The play_tts function:

  1. Generates speech using the Outetts interface and a text prompt.
  2. Converts the output to a WAV stream using io.BytesIO.
  3. Streams the audio to Discord using FFmpeg.

FFmpeg Audio Processing

The code uses FFmpeg to convert the WAV stream to Opus audio format, which is compatible with Discord. The audio is played in the voice channel using discord.FFmpegPCMAudio.

Notes