# Streamlit Multimodal Chat Input

English | 日本語

A multimodal chat input component for Streamlit that supports text input, image upload, and voice input.

> **Note:** Voice and image features require HTTPS or a localhost environment to function properly.

## Demo
## Features

- 📝 **Text Input**: Same usability as `st.chat_input`
- 🖼️ **Image File Upload**: Supports jpg, png, gif, and webp
- 🎤 **Voice Input**: Web Speech API / OpenAI Whisper API support
- 🎨 **Streamlit Standard Theme**: Fully compatible design
- 🔄 **Drag & Drop**: Drag-and-drop file support
- ⌨️ **Ctrl+V**: Paste images from the clipboard
- ⚙️ **Customizable**: Rich configuration options
## Installation

```bash
pip install st-chat-input-multimodal
```
## Basic Usage

```python
import streamlit as st
import base64
from st_chat_input_multimodal import multimodal_chat_input

result = multimodal_chat_input()

if result:
    # Text entered by the user
    if result['text']:
        st.write(f"Text: {result['text']}")

    # Uploaded files are returned as base64-encoded data URLs
    if result['files']:
        for file in result['files']:
            base64_data = file['data'].split(',')[1]
            image_bytes = base64.b64decode(base64_data)
            st.image(image_bytes, caption=file['name'])

    # Metadata about voice input, if it was used
    if result.get('audio_metadata'):
        st.write(f"Voice input used: {result['audio_metadata']['used_voice_input']}")
```
## Advanced Usage

### Voice Input Features

Use the browser's built-in Web Speech API for recognition:

```python
result = multimodal_chat_input(
    enable_voice_input=True,
    voice_recognition_method="web_speech",
    voice_language="ja-JP",
    max_recording_time=60
)
```

Or transcribe recordings with the OpenAI Whisper API (requires an API key):

```python
result = multimodal_chat_input(
    enable_voice_input=True,
    voice_recognition_method="openai_whisper",
    openai_api_key="sk-your-api-key",
    voice_language="ja-JP"
)
```
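Rather than hardcoding the key, it can be pulled from Streamlit's secrets management. A minimal sketch, assuming the key is stored as `OPENAI_API_KEY` in `.streamlit/secrets.toml` (the secret name is an example, not required by the component):

```python
import streamlit as st
from st_chat_input_multimodal import multimodal_chat_input

result = multimodal_chat_input(
    enable_voice_input=True,
    voice_recognition_method="openai_whisper",
    openai_api_key=st.secrets["OPENAI_API_KEY"],  # read from .streamlit/secrets.toml
    voice_language="ja-JP"
)
```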
### Custom Configuration

```python
result = multimodal_chat_input(
    placeholder="Please enter your message...",
    max_chars=500,
    accepted_file_types=["jpg", "png", "gif", "webp"],
    max_file_size_mb=10,
    disabled=False,
    key="custom_chat_input"
)
```
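The `disabled` flag can also be driven from session state, for example to lock the input while the previous message is still being handled. A hypothetical pattern, not something the component requires:

```python
import streamlit as st
from st_chat_input_multimodal import multimodal_chat_input

# Track whether the app is busy with the last message
if "processing" not in st.session_state:
    st.session_state.processing = False

result = multimodal_chat_input(
    placeholder="Please enter your message...",
    disabled=st.session_state.processing,  # input stays locked while processing is True
    key="locked_chat_input"
)
```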
## Chat Usage

```python
import streamlit as st
import base64
from st_chat_input_multimodal import multimodal_chat_input

st.set_page_config(
    page_title="Multimodal Chat Input Demo",
    page_icon="💬",
    layout="wide"
)

st.subheader("💭 Multimodal Chat Input Demo")
st.markdown("Simulate a chat application with voice input and file upload.")

# Keep the conversation in session state so it survives reruns
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

chat_result = multimodal_chat_input(
    placeholder="Enter chat message...",
    enable_voice_input=True,
    key="chat_input"
)

if chat_result:
    st.session_state.chat_history.append(chat_result)

# Render the conversation
if st.session_state.chat_history:
    for message in st.session_state.chat_history:
        with st.chat_message("user"):
            if message.get("text"):
                st.write(message["text"])
            if message.get("files"):
                for file in message["files"]:
                    try:
                        base64_data = file['data'].split(',')[1] if ',' in file['data'] else file['data']
                        image_bytes = base64.b64decode(base64_data)
                        st.image(image_bytes, caption=file['name'], width=200)
                    except Exception:
                        st.write(f"📎 {file['name']}")
            if message.get("audio_metadata") and message["audio_metadata"]["used_voice_input"]:
                st.caption(f"🎤 Voice input ({message['audio_metadata']['transcription_method']})")

    if st.button("Clear History"):
        st.session_state.chat_history = []
        st.rerun()
```
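The demo above only stores user turns. To make it read as a two-sided conversation, an assistant reply can be appended after each message. A minimal, self-contained sketch that simply echoes the text (swap the echo for a real model call in practice; the key and state names are illustrative):

```python
import streamlit as st
from st_chat_input_multimodal import multimodal_chat_input

if "messages" not in st.session_state:
    st.session_state.messages = []

result = multimodal_chat_input(placeholder="Say something...", key="echo_chat_input")

if result and result.get("text"):
    st.session_state.messages.append({"role": "user", "text": result["text"]})
    # Placeholder "assistant" turn; replace the echo with an actual LLM call
    st.session_state.messages.append({"role": "assistant", "text": f"You said: {result['text']}"})

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["text"])
```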
## License

MIT License

## Author

tsuzukia21