using System;

namespace Adrenak.UniVoice {
    /// <summary>
    /// A minimal, adaptive voice activity detector operating on time-domain PCM.
    /// Supports float [-1,1] and 16-bit samples, with per-call adaptation to
    /// input frequency and channel count. Multi-channel input is downmixed to mono.
    /// </summary>
    /// <remarks>
    /// The detector raises <see cref="OnVadChanged"/> when the speaking state toggles.
    /// Timings (attack, release, gaps) are maintained in milliseconds and remain
    /// stable across sample-rate changes.
    /// </remarks>
    public class SimpleVad {
        /// <summary>
        /// Configuration for the <see cref="SimpleVad"/> voice activity detector.
        /// All time-based parameters are expressed in milliseconds.
        /// </summary>
        [Serializable]
        public class Config {
            /// <summary>
            /// Target analysis frame duration in milliseconds. The frame sample count
            /// is computed from the input frequency whenever the frequency changes.
            /// Values below 1 are treated as 1.
            /// </summary>
            public int TargetFrameMs = 20;

            /// <summary>
            /// Minimum continuous speech duration required to enter the speaking state.
            /// </summary>
            public int AttackMs = 20;

            /// <summary>
            /// Minimum continuous silence duration required to exit the speaking state.
            /// </summary>
            public int ReleaseMs = 1000;

            /// <summary>
            /// SNR threshold in decibels used to enter the speaking state.
            /// Higher values make entry stricter.
            /// </summary>
            public float SnrEnterDb = 8f;

            /// <summary>
            /// SNR threshold in decibels used to remain in the speaking state.
            /// </summary>
            public float SnrExitDb = 4f;

            /// <summary>
            /// Maximum tolerated duration of consecutive quiet frames while already speaking.
            /// </summary>
            public int MaxGapMs = 300;

            /// <summary>
            /// Grace period after speech onset during which release is disallowed.
            /// </summary>
            public int NoDropWindowMs = 400;

            /// <summary>
            /// Noise-floor update rate (EMA alpha) applied to frames classified as non-speech.
            /// </summary>
            public float NonSpeechNoiseUpdateRate = 0.01f;

            /// <summary>
            /// Noise-floor update rate (EMA alpha) applied to frames classified as speech.
            /// </summary>
            public float SpeechNoiseUpdateRate = 0.002f;

            /// <summary>
            /// The minimum allowed value for the estimated noise level (RMS).
            /// Prevents the noise estimate from collapsing toward zero, which
            /// would make SNR calculations unstable or excessively large.
            /// </summary>
            public float MinNoiseRms = 1e-5f;

            /// <summary>
            /// Energy floor used to clamp extremely low RMS values. A frame whose RMS
            /// does not exceed this floor is never classified as speech.
            /// </summary>
            public float EnergyFloor = 1e-5f;
        }

        /// <summary>
        /// Raised when the VAD speaking state changes.
        /// The event argument is true when entering the speaking state,
        /// and false when exiting.
        /// </summary>
        public event Action<bool> OnVadChanged;

        /// <summary>
        /// Current speaking state.
        /// </summary>
        public bool IsSpeaking { get; private set; }

        // Sample rate substituted when a caller passes a non-positive frequency.
        private const int DefaultSampleRate = 16000;

        // Lower bound on the analysis frame length, in samples.
        private const int MinFrameSamples = 80;

        // Starting value (and post-rate-change lower bound) for the noise estimate.
        private const float InitialNoiseRms = 5e-3f;

        // Approximate duration of the noise-learning warm-up after a rate change.
        private const double WarmupMs = 200.0;

        // Small constant added to numerator/denominator to avoid log(0) or division by zero.
        private const float Eps = 1e-12f;

        private readonly Config _config;

        // Temporary buffer used to collect samples until one full frame is ready.
        private float[] _frameBuf;

        // Current fill position within the frame buffer.
        private int _frameFill;

        // Sample rate currently used for frame geometry (-1 until first Process call).
        private int _curSampleRate = -1;

        // Cached frame size in samples for the current sample rate.
        private int _frameSamples = 0;

        // Duration (ms) of a single analysis frame at the current sample rate.
        private float _frameDurationMs = 0f;

        // Current adaptive noise level estimate (RMS). Updated every frame via EMA.
        private float _noiseRms;

        // Time (ms) of continuous speech detected so far.
        private float _speechMs;

        // Time (ms) of continuous silence detected so far.
        private float _silenceMs;

        // Time (ms) since the most recent transition into the speaking state.
        private float _sinceOnsetMs;

        // Accumulated quiet period (ms) while still considered speaking.
        private float _gapMs;

        // Warm-up frames during which we only learn noise and never change state.
        private int _warmupFrames = 0;

        /// <summary>
        /// Initializes a new instance of <see cref="SimpleVad"/>.
        /// </summary>
        /// <param name="config">Optional configuration. If null, defaults are used.</param>
        public SimpleVad(Config config = null) {
            _config = config ?? new Config();
            _noiseRms = StartingNoiseRms();
            IsSpeaking = false;
        }

        /// <summary>
        /// Resets internal state and timers. Frame geometry and any remaining
        /// warm-up frames are left untouched.
        /// </summary>
        /// <param name="isSpeaking">Initial speaking state after reset.</param>
        public void Reset(bool isSpeaking = false) {
            if (_frameBuf != null) {
                Array.Clear(_frameBuf, 0, _frameBuf.Length);
            }

            _frameFill = 0;
            _speechMs = 0f;
            _silenceMs = 0f;
            _sinceOnsetMs = 0f;
            _gapMs = 0f;

            // Same starting point as the constructor, so a configured MinNoiseRms
            // above the built-in initial value is honoured here as well.
            _noiseRms = StartingNoiseRms();
            IsSpeaking = isSpeaking;
        }

        /// <summary>
        /// Processes interleaved float PCM in the range [-1, 1] with adaptive
        /// handling of frequency and channels. Multi-channel input is downmixed
        /// to mono via averaging.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz. If 0 or negative, a default of 16000 Hz is used.</param>
        /// <param name="channels">Number of interleaved channels. If 0 or negative, treated as 1 (mono).</param>
        /// <param name="samples">Buffer containing interleaved sample data.</param>
        /// <param name="count">
        /// Number of elements from <paramref name="samples"/> to process.
        /// Values larger than the buffer length are clamped to the buffer length.
        /// </param>
        public void Process(int frequency, int channels, float[] samples, int count) {
            if (samples == null || count <= 0) {
                return;
            }
            if (channels <= 0) {
                channels = 1;
            }

            // Never read past the end of the caller's buffer.
            count = Math.Min(count, samples.Length);

            // Reconfigure frame geometry if sample rate changed or not initialized.
            EnsureGeometry(frequency);

            // Only whole interleaved groups are consumed; a trailing partial group is ignored.
            int usable = (count / channels) * channels;
            for (int idx = 0; idx < usable; idx += channels) {
                float sum = 0f;
                for (int c = 0; c < channels; c++) {
                    sum += samples[idx + c];
                }
                PushMonoSample(sum / channels);
            }
        }

        /// <summary>
        /// Processes interleaved 16-bit PCM with adaptive handling of frequency and channels.
        /// Samples are scaled by 1/32768 and multi-channel input is downmixed to mono via averaging.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz. If 0 or negative, a default of 16000 Hz is used.</param>
        /// <param name="channels">Number of interleaved channels. If 0 or negative, treated as 1 (mono).</param>
        /// <param name="samples">Buffer containing interleaved sample data.</param>
        /// <param name="count">
        /// Number of elements from <paramref name="samples"/> to process.
        /// Values larger than the buffer length are clamped to the buffer length.
        /// </param>
        public void Process(int frequency, int channels, short[] samples, int count) {
            if (samples == null || count <= 0) {
                return;
            }
            if (channels <= 0) {
                channels = 1;
            }

            // Never read past the end of the caller's buffer.
            count = Math.Min(count, samples.Length);

            EnsureGeometry(frequency);

            // Only whole interleaved groups are consumed; a trailing partial group is ignored.
            int usable = (count / channels) * channels;
            for (int idx = 0; idx < usable; idx += channels) {
                float sum = 0f;
                for (int c = 0; c < channels; c++) {
                    sum += samples[idx + c] / 32768f;
                }
                PushMonoSample(sum / channels);
            }
        }

        /// <summary>
        /// Convenience overload for processing fully-filled float buffers.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz.</param>
        /// <param name="channels">Number of interleaved channels.</param>
        /// <param name="samples">Buffer containing interleaved sample data.</param>
        public void Process(int frequency, int channels, float[] samples)
            => Process(frequency, channels, samples, samples?.Length ?? 0);

        /// <summary>
        /// Convenience overload for processing fully-filled 16-bit buffers.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz.</param>
        /// <param name="channels">Number of interleaved channels.</param>
        /// <param name="samples">Buffer containing interleaved sample data.</param>
        public void Process(int frequency, int channels, short[] samples)
            => Process(frequency, channels, samples, samples?.Length ?? 0);

        // Noise estimate used at construction and after Reset: the built-in initial
        // value, but never below the configured minimum.
        private float StartingNoiseRms() {
            return Math.Max(_config.MinNoiseRms, InitialNoiseRms);
        }

        /// <summary>
        /// Ensures internal frame geometry matches the provided frequency.
        /// When the frequency differs from the current one (or nothing is initialized yet),
        /// recomputes the frame sample count, re-arms the warm-up, discards any partially
        /// collected frame and resets the streaming timers.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz. Non-positive values fall back to 16000 Hz.</param>
        private void EnsureGeometry(int frequency) {
            // A non-positive rate would make the frame duration infinite/NaN and
            // break every millisecond-based timer, so substitute a default.
            if (frequency <= 0) {
                frequency = DefaultSampleRate;
            }

            if (frequency == _curSampleRate && _frameBuf != null) {
                return;
            }

            _curSampleRate = frequency;

            // Guard against a zero/negative configured frame duration.
            int targetMs = Math.Max(1, _config.TargetFrameMs);

            // Choose frame sample count from target frame duration.
            // 64-bit intermediate: rate * ms can exceed the int range.
            long wanted = (long)_curSampleRate * targetMs / 1000;
            int frameSamples = (int)Math.Max(MinFrameSamples, Math.Min(wanted, int.MaxValue));

            // Recompute warm-up frames for the new rate: ~200 ms of noise learning.
            _warmupFrames = Math.Max(1, (int)Math.Ceiling(WarmupMs / targetMs));

            if (frameSamples != _frameSamples || _frameBuf == null) {
                _frameSamples = frameSamples;
                _frameBuf = new float[_frameSamples];
            }

            // Samples collected at the previous rate must not be mixed into a frame
            // at the new rate, even when the frame size happens to be identical.
            _frameFill = 0;

            _frameDurationMs = 1000f * _frameSamples / (float)_curSampleRate;

            // When geometry changes, reset streaming timers so old partials don't leak across rates.
            _speechMs = 0f;
            _silenceMs = 0f;
            if (IsSpeaking) {
                _sinceOnsetMs = 0f;
            }
            _gapMs = 0f;

            // Keep the learned noise level, but not below the starting estimate.
            _noiseRms = Math.Max(_noiseRms, StartingNoiseRms());
        }

        // Appends one mono sample to the frame buffer and analyses the frame once it is full.
        private void PushMonoSample(float mono) {
            _frameBuf[_frameFill++] = mono;
            if (_frameFill < _frameBuf.Length) {
                return;
            }

            // Rewind before analysing: ProcessOneFrame may raise OnVadChanged, and an
            // exception from a handler must not leave the fill index past the buffer end.
            _frameFill = 0;
            ProcessOneFrame(_frameBuf);
        }

        /// <summary>
        /// Processes a single analysis frame and updates the speaking state.
        /// </summary>
        /// <param name="frame">Mono frame of length equal to the current frame size.</param>
        private void ProcessOneFrame(float[] frame) {
            // --- Energy / RMS ---
            double sumSq = 0;
            for (int i = 0; i < frame.Length; i++) {
                float s = frame[i];
                sumSq += (double)s * s;
            }
            float rms = (float)Math.Sqrt(sumSq / frame.Length);
            rms = Math.Max(rms, _config.EnergyFloor);

            // --- SNR(dB) vs noise floor ---
            float noise = Math.Max(_noiseRms, _config.MinNoiseRms);
            float snrDb = 20f * (float)Math.Log10((rms + Eps) / (noise + Eps));

            // Pick threshold depending on current state (hysteresis).
            float threshold = IsSpeaking ? _config.SnrExitDb : _config.SnrEnterDb;
            bool rawSpeech = (snrDb >= threshold) && (rms > _config.EnergyFloor);

            // Noise EMA: slow during speech, faster during non-speech.
            float alpha = rawSpeech ? _config.SpeechNoiseUpdateRate : _config.NonSpeechNoiseUpdateRate;
            _noiseRms = (1f - alpha) * _noiseRms + alpha * rms;
            _noiseRms = Math.Max(_noiseRms, _config.MinNoiseRms);

            // During warm-up the speaking state never changes. The EMA update above has
            // already run (with the rate selected by rawSpeech), so the noise estimate
            // keeps adapting; the frame itself is counted as silence for the timers.
            if (_warmupFrames > 0) {
                _warmupFrames--;
                _silenceMs += _frameDurationMs;
                _speechMs = 0f;
                return;
            }

            // --- Gap filling: allow brief quiet while speaking ---
            if (IsSpeaking && !rawSpeech) {
                _gapMs += _frameDurationMs;
            } else {
                _gapMs = 0f;
            }

            // Effective speech used for timers/state.
            bool effectiveSpeech = rawSpeech || (IsSpeaking && _gapMs <= _config.MaxGapMs);

            // --- Time-based hangover & no-drop window ---
            if (effectiveSpeech) {
                _speechMs += _frameDurationMs;
                _silenceMs = 0f;
            } else {
                _silenceMs += _frameDurationMs;
                _speechMs = 0f;
            }

            bool newIsSpeaking = IsSpeaking;

            // Enter speaking after AttackMs of continuous effective speech.
            if (!IsSpeaking && _speechMs >= _config.AttackMs) {
                newIsSpeaking = true;
                _sinceOnsetMs = 0f;
                _gapMs = 0f;
            }

            // Advance the onset timer while speaking.
            if (newIsSpeaking) {
                _sinceOnsetMs += _frameDurationMs;
            }

            // Exit only if:
            //  1) we've accumulated ReleaseMs of effective silence AND
            //  2) we're past the initial NoDropWindow.
            if (IsSpeaking
                && _silenceMs >= _config.ReleaseMs
                && _sinceOnsetMs >= _config.NoDropWindowMs) {
                newIsSpeaking = false;
            }

            if (newIsSpeaking == IsSpeaking) {
                return;
            }

            IsSpeaking = newIsSpeaking;

            // Bring the timers in line with the new state BEFORE notifying listeners,
            // so an exception thrown by a handler cannot leave them half-updated.
            if (newIsSpeaking) {
                _sinceOnsetMs = 0f;
                _gapMs = 0f;
            } else {
                _silenceMs = 0f;
                _speechMs = 0f;
                _gapMs = 0f;
            }

            OnVadChanged?.Invoke(newIsSpeaking);
        }
    }
}