xr-ai-accelerator

XrAiSpeechToTextHelper

The XrAiSpeechToTextHelper class is a MonoBehaviour component that simplifies audio recording and conversion for speech-to-text operations. It handles microphone input, audio encoding, and provides callbacks for processing recorded audio data.

Class Declaration

public class XrAiSpeechToTextHelper : MonoBehaviour

Methods

StartRecording

Begins recording audio from the specified microphone device.

public void StartRecording(string device, Action<byte[]> onRecordingComplete = null, int recordingMax = 5)

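Parameters:

- device: Name of the microphone device to record from (for example, an entry of Microphone.devices).
- onRecordingComplete: Optional callback that receives the recorded audio as a byte array once recording completes. Defaults to null.
- recordingMax: Maximum recording length (presumably in seconds). Defaults to 5.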

StopRecording

Manually stops the current recording session.

public void StopRecording()

Usage Example

/// <summary>
/// Example component that records a clip from the first available microphone,
/// sends it to a speech-to-text provider, and hands the transcript to a command handler.
/// </summary>
public class VoiceCommandSystem : MonoBehaviour
{
    private XrAiSpeechToTextHelper recorder;
    private IXrAiSpeechToText speechToText;

    void Start()
    {
        recorder = GetComponent<XrAiSpeechToTextHelper>();
        if (recorder == null)
        {
            // Fail loudly here rather than with a NullReferenceException in StartListening.
            Debug.LogError("XrAiSpeechToTextHelper component is missing on this GameObject");
        }

        // "your-api-key" is a placeholder; load the real key from configuration
        // instead of committing it to source control.
        speechToText = XrAiFactory.LoadSpeechToText("OpenAI", new Dictionary<string, string>
        {
            { "apiKey", "your-api-key" }
        });
    }

    /// <summary>
    /// Starts a recording on the first microphone reported by Unity.
    /// Logs an error and does nothing when no recorder or no microphone is available.
    /// </summary>
    public void StartListening()
    {
        if (recorder == null)
        {
            Debug.LogError("Cannot start listening: no XrAiSpeechToTextHelper available");
            return;
        }

        // Get the default microphone
        string[] devices = Microphone.devices;
        string defaultMic = devices.Length > 0 ? devices[0] : null;

        if (defaultMic == null)
        {
            Debug.LogError("No microphone devices found");
            return;
        }

        recorder.StartRecording(defaultMic, OnAudioRecorded, 10);
        Debug.Log("Started recording...");
    }

    /// <summary>
    /// Recording callback: transcribes the audio and forwards the text to
    /// <see cref="ProcessVoiceCommand"/>.
    /// </summary>
    /// <param name="audioData">Recorded audio bytes supplied by the helper.</param>
    // async void is needed because the helper's callback is an Action<byte[]>.
    // Nothing can await this method, so every failure must be handled in here.
    private async void OnAudioRecorded(byte[] audioData)
    {
        if (audioData == null || audioData.Length == 0)
        {
            Debug.LogError("Recording failed or produced no audio data");
            return;
        }

        Debug.Log($"Recording complete. Audio data size: {audioData.Length} bytes");

        try
        {
            // Process the audio with speech-to-text
            var result = await speechToText.Execute(audioData, new Dictionary<string, string>
            {
                { "model", "whisper-1" },
                { "language", "en" }
            });

            if (result.IsSuccess)
            {
                Debug.Log($"Transcribed: {result.Data}");
                ProcessVoiceCommand(result.Data);
            }
            else
            {
                Debug.LogError($"Speech recognition failed: {result.ErrorMessage}");
            }
        }
        catch (System.Exception ex)
        {
            Debug.LogError($"Speech recognition threw an exception: {ex}");
        }
    }

    /// <summary>Handles a transcribed voice command.</summary>
    /// <param name="command">The transcript returned by the speech-to-text provider.</param>
    private void ProcessVoiceCommand(string command)
    {
        // Handle the voice command
        Debug.Log($"Processing command: {command}");
    }
}

Microphone Device Selection

/// <summary>
/// Example component that lists the microphones Unity reports and starts a
/// recording on a device chosen by index.
/// </summary>
public class MicrophoneManager : MonoBehaviour
{
    private XrAiSpeechToTextHelper recorder;

    void Start()
    {
        recorder = GetComponent<XrAiSpeechToTextHelper>();
        ListAvailableMicrophones();
    }

    /// <summary>Logs every available microphone together with its supported frequency range.</summary>
    private void ListAvailableMicrophones()
    {
        // Read the device list once so the count and the loop use the same snapshot.
        string[] devices = Microphone.devices;
        Debug.Log($"Available microphones: {devices.Length}");

        for (int i = 0; i < devices.Length; i++)
        {
            string deviceName = devices[i];
            Debug.Log($"{i}: {deviceName}");

            // Get device capabilities
            Microphone.GetDeviceCaps(deviceName, out int minFreq, out int maxFreq);

            if (minFreq == 0 && maxFreq == 0)
            {
                // Unity reports 0/0 when the device supports any sampling frequency.
                Debug.Log("  Frequency range: any");
            }
            else
            {
                Debug.Log($"  Frequency range: {minFreq}Hz - {maxFreq}Hz");
            }
        }
    }

    /// <summary>
    /// Starts a recording on the microphone at <paramref name="deviceIndex"/>.
    /// Logs a warning and does nothing when the index is out of range.
    /// </summary>
    /// <param name="deviceIndex">Index into <c>Microphone.devices</c>.</param>
    public void StartRecordingWithDevice(int deviceIndex)
    {
        string[] devices = Microphone.devices;

        if (deviceIndex < 0 || deviceIndex >= devices.Length)
        {
            Debug.LogWarning($"Microphone index {deviceIndex} is out of range (0-{devices.Length - 1})");
            return;
        }

        recorder.StartRecording(devices[deviceIndex], OnRecordingComplete, 8);
    }

    /// <summary>Recording callback passed to the helper by <see cref="StartRecordingWithDevice"/>.</summary>
    /// <param name="audioData">Recorded audio bytes supplied by the helper.</param>
    private void OnRecordingComplete(byte[] audioData)
    {
        if (audioData == null || audioData.Length == 0)
        {
            Debug.LogError("Recording failed or produced no audio data");
            return;
        }

        Debug.Log($"Recording complete. Audio data size: {audioData.Length} bytes");
    }
}

Real-time Voice Recognition

/// <summary>
/// Example component that listens continuously by chaining short recordings:
/// each finished segment is transcribed and the next segment is started.
/// </summary>
public class ContinuousVoiceRecognition : MonoBehaviour
{
    // Segments at or below this many bytes are skipped as too short to transcribe.
    private const int MinAudioBytes = 1000;

    // Value passed as recordingMax for each segment.
    private const int SegmentLength = 3;

    // Upper bound on the command history kept in recentCommands.
    private const int MaxRecentCommands = 10;

    private XrAiSpeechToTextHelper recorder;
    private IXrAiSpeechToText speechToText;
    private bool isListening = false;
    private Queue<string> recentCommands = new Queue<string>();

    void Start()
    {
        recorder = GetComponent<XrAiSpeechToTextHelper>();

        // "your-api-key" is a placeholder; load the real key from configuration.
        speechToText = XrAiFactory.LoadSpeechToText("OpenAI", new Dictionary<string, string>
        {
            { "apiKey", "your-api-key" }
        });
    }

    /// <summary>Starts the listening loop. Does nothing if it is already running.</summary>
    public void StartContinuousListening()
    {
        if (!isListening)
        {
            isListening = true;
            StartNextRecording();
        }
    }

    /// <summary>Stops the listening loop and the recording currently in progress.</summary>
    public void StopContinuousListening()
    {
        isListening = false;
        recorder.StopRecording();
    }

    /// <summary>Starts the next segment on the first microphone while listening is enabled.</summary>
    private void StartNextRecording()
    {
        if (!isListening)
        {
            return;
        }

        string[] devices = Microphone.devices;
        if (devices.Length == 0)
        {
            // Leave the listening state so a later StartContinuousListening call can retry.
            Debug.LogError("No microphone devices found");
            isListening = false;
            return;
        }

        recorder.StartRecording(devices[0], OnContinuousRecording, SegmentLength);
    }

    /// <summary>
    /// Recording callback: transcribes the segment, records the command, and
    /// schedules the next segment.
    /// </summary>
    /// <param name="audioData">Recorded audio bytes supplied by the helper.</param>
    // async void is needed because the helper's callback is an Action<byte[]>.
    // Nothing can await this method, so failures are handled in here.
    private async void OnContinuousRecording(byte[] audioData)
    {
        if (!isListening)
        {
            return;
        }

        try
        {
            // Ignore very short recordings
            if (audioData != null && audioData.Length > MinAudioBytes)
            {
                var result = await speechToText.Execute(audioData);

                if (result.IsSuccess && !string.IsNullOrWhiteSpace(result.Data))
                {
                    string command = result.Data.Trim();
                    recentCommands.Enqueue(command);

                    // Keep only recent commands
                    if (recentCommands.Count > MaxRecentCommands)
                    {
                        recentCommands.Dequeue();
                    }

                    ProcessCommand(command);
                }
            }
        }
        catch (System.Exception ex)
        {
            Debug.LogError($"Speech recognition threw an exception: {ex}");
        }

        // Continue listening even when this segment was skipped or failed;
        // otherwise one short or failed segment would stall the loop while
        // isListening stays true. StartNextRecording re-checks isListening.
        StartNextRecording();
    }

    /// <summary>Handles a transcribed command.</summary>
    /// <param name="command">Trimmed transcript of one segment.</param>
    private void ProcessCommand(string command)
    {
        Debug.Log($"Processing command: {command}");
    }
}

Audio Quality Configuration

/// <summary>
/// Groups named recording presets. Each preset is a
/// (sampleRate, recordingLength) pair.
/// </summary>
public class AudioQualitySettings
{
    /// <summary>Ready-made presets, from smallest to largest values.</summary>
    public static class Presets
    {
        /// <summary>Preset with sampleRate 22050 and recordingLength 3.</summary>
        public static (int sampleRate, int recordingLength) LowQuality
        {
            get { return (22050, 3); }
        }

        /// <summary>Preset with sampleRate 44100 and recordingLength 5.</summary>
        public static (int sampleRate, int recordingLength) StandardQuality
        {
            get { return (44100, 5); }
        }

        /// <summary>Preset with sampleRate 48000 and recordingLength 10.</summary>
        public static (int sampleRate, int recordingLength) HighQuality
        {
            get { return (48000, 10); }
        }
    }
}

/// <summary>
/// Starts a recording on the first microphone using the HighQuality preset.
/// Logs an error and does nothing when no microphone is available.
/// </summary>
public void StartHighQualityRecording()
{
    var settings = AudioQualitySettings.Presets.HighQuality;

    string[] devices = Microphone.devices;
    if (devices.Length == 0)
    {
        Debug.LogError("No microphone devices found");
        return;
    }

    // Only recordingLength is applied here: this StartRecording call has no
    // sample-rate argument, so settings.sampleRate goes unused.
    recorder.StartRecording(
        devices[0],
        OnHighQualityRecording,
        settings.recordingLength
    );
}

Integration with UI

/// <summary>
/// Example UI binding: one button toggles recording, while an indicator image
/// and a status label show the current state.
/// </summary>
public class VoiceInputUI : MonoBehaviour
{
    [SerializeField] private Button recordButton;
    [SerializeField] private Image recordingIndicator;
    [SerializeField] private Text statusText;

    private XrAiSpeechToTextHelper recorder;
    private bool isRecording = false;

    void Start()
    {
        recorder = GetComponent<XrAiSpeechToTextHelper>();
        recordButton.onClick.AddListener(ToggleRecording);
    }

    void OnDestroy()
    {
        // Unsubscribe so the button does not keep calling into a destroyed component.
        if (recordButton != null)
        {
            recordButton.onClick.RemoveListener(ToggleRecording);
        }
    }

    /// <summary>Button handler: starts a recording when idle, stops it when recording.</summary>
    private void ToggleRecording()
    {
        if (isRecording)
        {
            StopRecording();
        }
        else
        {
            StartRecording();
        }
    }

    /// <summary>Switches the UI to the recording state and starts recording on the first microphone.</summary>
    private void StartRecording()
    {
        string[] devices = Microphone.devices;
        if (devices.Length == 0)
        {
            // Check before touching the UI so it never gets stuck showing "Recording...".
            Debug.LogError("No microphone devices found");
            statusText.text = "No microphone found";
            return;
        }

        isRecording = true;
        recordingIndicator.color = Color.red;
        statusText.text = "Recording...";

        recorder.StartRecording(devices[0], OnRecordingComplete, 10);
    }

    /// <summary>Switches the UI to the processing state and stops the current recording.</summary>
    private void StopRecording()
    {
        isRecording = false;
        recordingIndicator.color = Color.gray;
        statusText.text = "Processing...";

        recorder.StopRecording();
    }

    /// <summary>Recording callback: resets the UI once the audio has been handled.</summary>
    /// <param name="audioData">Recorded audio bytes supplied by the helper.</param>
    // Not marked async because nothing is awaited yet; add async (and a
    // try/catch around the awaited call) once real processing is added below.
    private void OnRecordingComplete(byte[] audioData)
    {
        statusText.text = "Converting to text...";

        // Process audio data
        // Update UI with results

        // Reset the state here as well, in case the callback runs without
        // StopRecording having been called first.
        isRecording = false;
        recordingIndicator.color = Color.gray;
        statusText.text = "Ready";
    }
}

Audio Data Processing

The helper automatically converts Unity AudioClip to WAV format:

WAV Format Specifications

Manual Audio Conversion

// If you need to process AudioClip manually
/// <summary>
/// Converts an AudioClip by delegating to the helper's own conversion routine.
/// </summary>
/// <param name="clip">The clip to convert.</param>
/// <returns>The byte array produced by <c>XrAiSpeechToTextHelper.Convert</c>.</returns>
// Manual conversion is only for custom processing; the helper already does
// this internally. Convert would need to be made public for this to compile.
private byte[] ConvertAudioClipToWav(AudioClip clip) =>
    XrAiSpeechToTextHelper.Convert(clip); // This method would need to be made public

Performance Considerations

Error Handling

/// <summary>
/// Recording callback that validates the audio before it is processed.
/// Missing or empty audio is logged as an error and dropped; very short
/// audio only produces a warning and is still processed.
/// </summary>
/// <param name="audioData">Recorded audio bytes; may be null or empty on failure.</param>
private void OnRecordingComplete(byte[] audioData)
{
    // Treat a null buffer as zero bytes so both failure cases share one check.
    int byteCount = audioData?.Length ?? 0;

    if (byteCount == 0)
    {
        Debug.LogError("Recording failed or produced no audio data");
        return;
    }

    bool isVeryShort = byteCount < 1000;
    if (isVeryShort)
    {
        Debug.LogWarning("Recording may be too short for accurate transcription");
    }

    // Process audio data...
}

Implementation Notes