Source: speech-to-text/webaudio-l16-stream.js

'use strict';
var Transform = require('stream').Transform;
var util = require('util');
var defaults = require('defaults');

var TARGET_SAMPLE_RATE = 16000;
/**
 * Transforms Buffers or AudioBuffers into a binary stream of l16 (raw wav) audio, downsampling in the process.
 *
 * The watson speech-to-text service works on 16kHz and internally downsamples audio received at higher samplerates.
 * WebAudio is usually 44.1kHz or 48kHz, so downsampling here reduces bandwidth usage by ~2/3.
 *
 * Format event + stream can be combined with https://www.npmjs.com/package/wav to generate a wav file with a proper header
 *
 * Todo: support multi-channel audio (for use with <audio>/<video> elements) - will require interleaving audio channels
 *
 * @param {Object} options
 * @constructor
 */
function WebAudioL16Stream(options) {
  options = this.options = defaults(options, {
    sourceSampleRate: 48000,
    downsample: true
  });

  Transform.call(this, options);

  this.bufferUnusedSamples = [];

  if (options.objectMode || options.writableObjectMode) {
    this._transform = this.handleFirstAudioBuffer;
  } else {
    this._transform = this.transformBuffer;
    process.nextTick(this.emitFormat.bind(this));
  }
}
util.inherits(WebAudioL16Stream, Transform);

WebAudioL16Stream.prototype.emitFormat = function emitFormat() {
  this.emit('format', {
    channels: 1,
    bitDepth: 16,
    sampleRate: this.options.downsample ? TARGET_SAMPLE_RATE : this.options.sourceSampleRate,
    signed: true,
    float: false
  });
};

/**
 * Downsamples WebAudio to 16 kHz.
 *
 * Browsers can downsample WebAudio natively with OfflineAudioContext's but it was designed for non-streaming use and
 * requires a new context for each AudioBuffer. Firefox can handle this, but chrome (v47) crashes after a few minutes.
 * So, we'll do it in JS for now.
 *
 * This really belongs in it's own stream, but there's no way to create new AudioBuffer instances from JS, so its
 * fairly coupled to the wav conversion code.
 *
 * @param  {AudioBuffer} bufferNewSamples Microphone/MediaElement audio chunk
 * @return {Float32Array} 'audio/l16' chunk
 */
WebAudioL16Stream.prototype.downsample = function downsample(bufferNewSamples) {
  var buffer = null;
  var newSamples = bufferNewSamples.length;
  var unusedSamples = this.bufferUnusedSamples.length;
  var i;
  var offset;

  if (unusedSamples > 0) {
    buffer = new Float32Array(unusedSamples + newSamples);
    for (i = 0; i < unusedSamples; ++i) {
      buffer[i] = this.bufferUnusedSamples[i];
    }
    for (i = 0; i < newSamples; ++i) {
      buffer[unusedSamples + i] = bufferNewSamples[i];
    }
  } else {
    buffer = bufferNewSamples;
  }

  // Downsampling and low-pass filter:
  // Input audio is typically 44.1kHz or 48kHz, this downsamples it to 16kHz.
  // It uses a FIR (finite impulse response) Filter to remove (or, at least attinuate) 
  // audio frequencies > ~8kHz because sampled audio cannot accurately represent  
  // frequiencies greater than half of the sample rate. 
  // (Human voice tops out at < 4kHz, so nothing important is lost for transcription.)
  // See http://dsp.stackexchange.com/a/37475/26392 for a good explination of this code.
  var filter = [
    -0.037935,
    -0.00089024,
    0.040173,
    0.019989,
    0.0047792,
    -0.058675,
    -0.056487,
    -0.0040653,
    0.14527,
    0.26927,
    0.33913,
    0.26927,
    0.14527,
    -0.0040653,
    -0.056487,
    -0.058675,
    0.0047792,
    0.019989,
    0.040173,
    -0.00089024,
    -0.037935
  ];
  var samplingRateRatio = this.options.sourceSampleRate / TARGET_SAMPLE_RATE;
  var nOutputSamples = Math.floor((buffer.length - filter.length) / samplingRateRatio) + 1;
  var outputBuffer = new Float32Array(nOutputSamples);

  for (i = 0; i + filter.length - 1 < buffer.length; i++) {
    offset = Math.round(samplingRateRatio * i);
    var sample = 0;
    for (var j = 0; j < filter.length; ++j) {
      sample += buffer[offset + j] * filter[j];
    }
    outputBuffer[i] = sample;
  }

  var indexSampleAfterLastUsed = Math.round(samplingRateRatio * i);
  var remaining = buffer.length - indexSampleAfterLastUsed;
  if (remaining > 0) {
    this.bufferUnusedSamples = new Float32Array(remaining);
    for (i = 0; i < remaining; ++i) {
      this.bufferUnusedSamples[i] = buffer[indexSampleAfterLastUsed + i];
    }
  } else {
    this.bufferUnusedSamples = new Float32Array(0);
  }

  return outputBuffer;
};

/**
 * Accepts a Float32Array of audio data and converts it to a Buffer of l16 audio data (raw wav)
 *
 * Explanation for the math: The raw values captured from the Web Audio API are
 * in 32-bit Floating Point, between -1 and 1 (per the specification).
 * The values for 16-bit PCM range between -32768 and +32767 (16-bit signed integer).
 * Filter & combine samples to reduce frequency, then multiply to by 0x7FFF (32767) to convert.
 * Store in little endian.
 *
 * @param {Float32Array} input
 * @return {Buffer}
 */
WebAudioL16Stream.prototype.floatTo16BitPCM = function(input) {
  var output = new DataView(new ArrayBuffer(input.length * 2)); // length is in bytes (8-bit), so *2 to get 16-bit length
  for (var i = 0; i < input.length; i++) {
    var multiplier = input[i] < 0 ? 0x8000 : 0x7fff; // 16-bit signed range is -32768 to 32767
    output.setInt16(i * 2, input[i] * multiplier | 0, true); // index, value, little edian
  }
  return Buffer.from(output.buffer);
};

/**
 * Does some one-time setup to grab sampleRate and emit format, then sets _transform to the actual audio buffer handler and calls it.
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.handleFirstAudioBuffer = function handleFirstAudioBuffer(audioBuffer, encoding, next) {
  this.options.sourceSampleRate = audioBuffer.sampleRate;
  this.emitFormat();
  this._transform = this.transformAudioBuffer;
  this._transform(audioBuffer, encoding, next);
};

/**
 * Accepts an AudioBuffer (for objectMode), then downsamples to 16000 and converts to a 16-bit pcm
 *
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformAudioBuffer = function(audioBuffer, encoding, next) {
  var source = audioBuffer.getChannelData(0);
  if (this.options.downsample) {
    source = this.downsample(source);
  }
  this.push(this.floatTo16BitPCM(source));
  next();
};

/**
 * Accepts a Buffer (for binary mode), then downsamples to 16000 and converts to a 16-bit pcm
 *
 * @param {Buffer} nodebuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformBuffer = function(nodebuffer, encoding, next) {
  var source = new Float32Array(nodebuffer.buffer);
  if (this.options.downsample) {
    source = this.downsample(source);
  }
  this.push(this.floatTo16BitPCM(source));
  next();
};
// new Float32Array(nodebuffer.buffer)

module.exports = WebAudioL16Stream;