The code initializes a Discord bot, connects to a voice channel, and plays text-to-speech audio using the outetts
library and FFmpeg, leveraging asynchronous programming for efficient execution.
The code sets up a Discord bot, connects to a voice channel, and utilizes FFmpeg for audio processing, allowing for text-to-speech functionality. It follows best practices for coding and documentation, utilizing modular and reusable design principles.
npm run import -- "send voice on discord"
import os
import io
import ffmpeg
import discord
import outetts
import asyncio
# Bot settings come from the environment; both are None when unset.
TOKEN = os.environ.get("BOT_TOKEN")
GUILD_ID = os.environ.get("DEFAULT_GUILD_ID")
# VC_CHANNEL_ID = "YOUR_VOICE_CHANNEL_ID"

# The model and tokenizer paths point at the same OuteTTS 0.3 1B identifier.
model_config = outetts.HFModelConfig_v2(
    tokenizer_path="OuteAI/OuteTTS-0.3-1B",
    model_path="OuteAI/OuteTTS-0.3-1B",
)
async def init_discord(token=TOKEN):
    """Create the TTS interface and the Discord client, then run the bot.

    The interface, speaker and client are stored as module globals because
    ``connect_voice`` reads ``client`` from ``globals()`` and ``play_tts``
    refers to ``interface`` and ``speaker`` as global names.

    Args:
        token: Discord bot token. Defaults to the module-level ``TOKEN``.
    """
    # Initialize Outetts
    interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
    speaker = interface.load_default_speaker(name="en_male_1")

    client = discord.Client(intents=discord.Intents.default())

    # Publish all three objects. Only ``client`` used to be published, which
    # left ``interface`` and ``speaker`` undefined for play_tts.
    globals().update(interface=interface, speaker=speaker, client=client)

    # Use the caller's ``token``; the global TOKEN used to be passed here
    # regardless of the argument.
    await asyncio.gather(
        client.start(token),  # Start the bot
        connect_voice(),  # Connect to VC
    )
async def connect_voice(prompt="Hello from the voice bot."):
    """Join the first voice channel of the configured guild and speak.

    Args:
        prompt: Text handed to ``play_tts`` once connected. Optional; the
            default is only a placeholder greeting.

    Raises:
        RuntimeError: If ``DEFAULT_GUILD_ID`` is not configured.
    """
    client = globals()['client']

    # The bot user and guild cache are not filled in until the client is
    # ready, so wait before reading them.
    await client.wait_until_ready()
    print(f'Logged in as {client.user}')

    if not GUILD_ID:
        raise RuntimeError('DEFAULT_GUILD_ID is not set')

    guild = discord.utils.get(client.guilds, id=int(GUILD_ID))
    if guild is None:
        # The bot is not a member of the guild, or the ID is wrong.
        print(f'Guild {GUILD_ID} not found')
        return

    # With no filter attributes this yields the first voice channel, if any.
    vc_channel = discord.utils.get(guild.voice_channels)  # , id=int(VC_CHANNEL_ID))
    if vc_channel:
        vc = await vc_channel.connect()
        # play_tts takes (prompt, vc); the prompt argument used to be missing.
        await play_tts(prompt, vc)
async def play_tts(prompt, vc):
    """Synthesize ``prompt`` with Outetts and play it on a voice client.

    Args:
        prompt: Text to speak.
        vc: Connected Discord voice client; its ``play`` method is called
            with the resulting audio source.
    """
    import threading  # local import: only needed for the stdin feeder below

    # Reuse ``interface``/``speaker`` from the module globals when present,
    # otherwise build and cache them here so this function does not depend
    # on prior setup.
    state = globals()
    interface = state.get('interface')
    if interface is None:
        interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
        state['interface'] = interface
    speaker = state.get('speaker')
    if speaker is None:
        speaker = interface.load_default_speaker(name="en_male_1")
        state['speaker'] = speaker

    # Generate speech
    gen_cfg = outetts.GenerationConfig(
        text=prompt,
        temperature=0.3,
        repetition_penalty=1.1,
        max_length=4096,
        speaker=speaker,
    )
    # generate() is a synchronous call; run it in the default executor so
    # the event loop keeps running while audio is synthesized.
    loop = asyncio.get_running_loop()
    output = await loop.run_in_executor(
        None, lambda: interface.generate(config=gen_cfg)
    )

    # Convert Outetts output to an in-memory byte string
    audio_buffer = io.BytesIO()
    output.save(audio_buffer)
    wav_bytes = audio_buffer.getvalue()

    # Stream audio to Discord using FFmpeg. stderr is left unpiped because
    # nothing here reads it, and an undrained pipe can stall ffmpeg.
    ffmpeg_process = (
        ffmpeg.input('pipe:0', format='wav')
        .output('pipe:1', format='opus', acodec='libopus', ar='48000')
        .run_async(pipe_stdin=True, pipe_stdout=True)
    )

    def _feed_stdin():
        """Write the WAV data to ffmpeg and close its stdin."""
        try:
            with ffmpeg_process.stdin as stdin:
                stdin.write(wav_bytes)
        except BrokenPipeError:
            # ffmpeg exited before consuming everything (for example because
            # playback was stopped); there is nothing left to feed.
            pass

    # Feed stdin from a thread. Writing it all up front while stdout is not
    # yet being read can deadlock once the pipe buffers fill.
    threading.Thread(target=_feed_stdin, daemon=True).start()

    # Play the stream in Discord
    audio_source = discord.FFmpegPCMAudio(ffmpeg_process.stdout, pipe=True)
    vc.play(audio_source)
#__all__ = {
# "init_discord": init_discord,
# "connect_voice": connect_voice,
# "play_tts": "play_tts",
#}
# NOTE(review): ``__all__`` is bound to the function object itself rather than
# a sequence of name strings, so ``from <module> import *`` cannot use it.
# Presumably a custom loader treats this as the module's single export --
# confirm before changing.
__all__ = init_discord
"""
Discord Bot using Outetts for TTS and FFmpeg for audio playback.
"""
import os
import io
import ffmpeg
import discord
import asyncio
import outetts
# Settings read from the environment. TOKEN and GUILD_ID are None when the
# variable is unset; VC_CHANNEL_ID falls back to an empty string.
TOKEN = os.environ.get("BOT_TOKEN")
GUILD_ID = os.environ.get("DEFAULT_GUILD_ID")
VC_CHANNEL_ID = os.environ.get("VC_CHANNEL_ID", "")
class OutettsInterface:
    """Wrapper around an Outetts model for generating TTS output."""

    def __init__(self):
        """Build the model configuration, interface and default speaker."""
        # The model and tokenizer paths use the same OuteTTS 0.3 1B identifier.
        self.model_config = outetts.HFModelConfig_v2(
            model_path="OuteAI/OuteTTS-0.3-1B",
            tokenizer_path="OuteAI/OuteTTS-0.3-1B"
        )
        self.interface = outetts.InterfaceHF(model_version="0.3", cfg=self.model_config)
        self.speaker = self.interface.load_default_speaker(name="en_male_1")

    async def generate_tts(self, prompt):
        """Generate TTS output for ``prompt``.

        Args:
            prompt: Text to synthesize.

        Returns:
            Whatever ``self.interface.generate`` returns for the request.
        """
        gen_cfg = outetts.GenerationConfig(
            text=prompt,
            temperature=0.3,
            repetition_penalty=1.1,
            max_length=4096,
            speaker=self.speaker,
        )
        # generate() is a synchronous call. Running it in the default executor
        # keeps the event loop free while the audio is synthesized, instead
        # of stalling every other coroutine for the duration.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, lambda: self.interface.generate(config=gen_cfg)
        )
class DiscordClient:
    """Thin wrapper around ``discord.Client`` for voice playback."""

    def __init__(self, token):
        """Create the underlying client and remember the bot token.

        Args:
            token: Discord bot token used by ``start``.
        """
        self.client = discord.Client(intents=discord.Intents.default())
        self.token = token

    async def start(self):
        """Start the Discord client with the stored token."""
        await self.client.start(self.token)

    async def connect_voice(self, guild_id, audio_stream=None):
        """Connect to the first voice channel of a guild.

        Args:
            guild_id: ID of the guild (anything ``int()`` accepts).
            audio_stream: Optional stream to play right after connecting.
                ``play_audio`` requires a stream, so nothing is played when
                this is omitted (the call used to fail with a TypeError).

        Returns:
            The voice client from ``connect()``, or ``None`` when the guild
            is unknown to this client or has no voice channel.
        """
        guild = discord.utils.get(self.client.guilds, id=int(guild_id))
        if guild is None:
            return None

        # With no filter attributes this yields the first voice channel.
        vc_channel = discord.utils.get(guild.voice_channels)
        if not vc_channel:
            return None

        vc = await vc_channel.connect()
        if audio_stream is not None:
            await self.play_audio(vc, audio_stream)
        return vc

    async def play_audio(self, vc, audio_stream):
        """Play audio from a stream on the given voice client.

        Args:
            vc: Voice client whose ``play`` method is called.
            audio_stream: Stream passed to ``discord.FFmpegPCMAudio`` with
                ``pipe=True``.
        """
        audio_source = discord.FFmpegPCMAudio(audio_stream, pipe=True)
        vc.play(audio_source)
class AudioConverter:
    """Convert Outetts output to an in-memory byte stream."""

    def __init__(self):
        """Start with an empty buffer."""
        # Always refers to the stream produced by the most recent convert().
        self.audio_buffer = io.BytesIO()

    def convert(self, output):
        """Write ``output`` into a fresh stream and return it rewound.

        A new buffer is created on every call. Reusing one buffer mixed the
        bytes of earlier conversions into later ones, and changed streams
        that had already been handed out.

        Args:
            output: Object with a ``save(file_like)`` method.

        Returns:
            An ``io.BytesIO`` positioned at offset 0 holding only the data
            written by this call.
        """
        stream = io.BytesIO()
        output.save(stream)
        stream.seek(0)
        self.audio_buffer = stream
        return stream
async def init_discord(token=None):
    """
    Initialize the Discord bot and join voice once it is ready.

    Args:
        token (str): The Discord bot token. Optional; falls back to the
            module-level ``TOKEN`` when omitted.
    """
    bot = DiscordClient(TOKEN if token is None else token)

    # The module-level helpers look the wrapper up as the global ``client``.
    globals()['client'] = bot

    # start() does not return while the bot is running, so the voice
    # connection has to be made from the ready event rather than after it.
    @bot.client.event
    async def on_ready():
        # on_ready can fire more than once (e.g. after a reconnect); do not
        # try to connect again while a voice connection exists.
        if bot.client.voice_clients:
            return
        await bot.connect_voice(GUILD_ID)

    await bot.start()
async def connect_voice(guild_id):
    """
    Connect to a voice channel.

    Args:
        guild_id (str): The ID of the guild.

    Raises:
        RuntimeError: If no global ``client`` has been created yet.
    """
    wrapper = globals().get('client')
    if wrapper is None:
        raise RuntimeError('Discord client has not been created yet')

    # The global is used as a DiscordClient wrapper (``connect_voice`` is
    # called on it below), so ``user`` and ``guilds`` live on its inner
    # ``client`` attribute. Fall back to the object itself if there is none.
    raw_client = getattr(wrapper, 'client', wrapper)
    print(f'Logged in as {raw_client.user}')

    guild = discord.utils.get(raw_client.guilds, id=int(guild_id))
    if guild is None:
        # Unknown guild: nothing to connect to.
        print(f'Guild {guild_id} not found')
        return

    # Only delegate when the guild actually has a voice channel.
    vc_channel = discord.utils.get(guild.voice_channels)
    if vc_channel:
        await wrapper.connect_voice(guild_id)
async def play_tts(prompt, vc):
    """
    Play TTS from a prompt.

    Args:
        prompt (str): The prompt to generate TTS from.
        vc (discord.VoiceClient): The voice client to play audio from.

    Raises:
        RuntimeError: If no global ``client`` has been created yet.
    """
    state = globals()

    wrapper = state.get('client')
    if wrapper is None:
        raise RuntimeError('Discord client has not been created yet')

    # Constructing OutettsInterface loads the model, so build it once and
    # keep it in a private module global instead of reloading on every call.
    interface = state.get('_tts_interface')
    if interface is None:
        interface = OutettsInterface()
        state['_tts_interface'] = interface

    output = await interface.generate_tts(prompt)

    # Use a fresh converter per call so each playback gets its own stream.
    audio_stream = AudioConverter().convert(output)
    await wrapper.play_audio(vc, audio_stream)
# Initialization
TOKEN = os.getenv("BOT_TOKEN")
GUILD_ID = os.getenv("DEFAULT_GUILD_ID")

# ``client`` only exists once a bot has been created. A plain
# ``globals()['client']`` lookup raised KeyError at import time, so default
# to None instead.
client = globals().get('client')

# A list, not a set: ``from module import *`` indexes ``__all__`` and fails
# on a set.
__all__ = [
    "init_discord",
    "connect_voice",
    "play_tts",
]
Code Breakdown
The code imports the following modules:
- os: for interacting with the operating system
- io: for input/output operations
- ffmpeg: for audio processing
- discord: for creating the Discord bot
- outetts: for text-to-speech functionality
- asyncio: for asynchronous programming

The code initializes a Discord bot by:
- Reading the BOT_TOKEN and DEFAULT_GUILD_ID environment variables.
- Creating the Outetts model configuration.

The code defines two asynchronous functions:
- init_discord: Initializes the Discord bot and connects to a voice channel.
- connect_voice: Connects to a voice channel and starts playing text-to-speech audio.

The code uses the outetts library to generate speech from text prompts. The play_tts function:
- Generates speech using the Outetts interface and a text prompt.
- Saves the generated audio to an io.BytesIO buffer.

The code uses FFmpeg to convert the WAV stream to Opus audio format, which is compatible with Discord. The audio is played in the voice channel using discord.FFmpegPCMAudio.