Source: speech-to-text/format-stream.js

'use strict';

var Transform = require('stream').Transform;
var util = require('util');
var clone = require('clone');
var defaults = require('defaults');

/**
 * Transform stream that tidies up speech-to-text transcriptions:
 *  - capitalizes the first word of each sentence
 *  - appends a period to the end of a sentence
 *  - strips "cruft" (hesitation markers, junk tokens) from the text
 *  - removes the spaces between words for Japanese and Chinese models
 *
 * The formatting methods on the prototype may also be called directly, without piping
 * anything through the stream.
 *
 * @param {Object} opts - also handed to the underlying Transform stream constructor
 * @param {String} [opts.model=''] - model name; names starting with `ja-JP` or `zh-CN` get special handling
 * @param {String} [opts.hesitation=''] - what to put down for a "hesitation" event, also consider \u2026 (ellipsis: ...)
 * @param {Boolean} [opts.objectMode=false] - emit `result` objects instead of string Buffers for the `data` events.
 * @constructor
 */
function FormatStream(opts) {
  var fallbackOptions = {
    model: '', // some models should have all spaces removed
    hesitation: '',
    decodeStrings: false // false = don't convert strings to buffers before passing to _write
  };
  this.options = defaults(opts, fallbackOptions);
  Transform.call(this, this.options);

  // only the 5-character language/region prefix of the model name matters here
  var languagePrefix = this.options.model.substring(0, 5);
  this.isJaCn = languagePrefix === 'ja-JP' || languagePrefix === 'zh-CN';

  // choose the _transform implementation once, up front, based on the stream mode
  if (this.options.objectMode) {
    this._transform = this.transformObject;
  } else {
    this._transform = this.transformString;
  }
}
util.inherits(FormatStream, Transform);

// Matches every "%HESITATION" marker plus one optional trailing space, so the marker can be swapped for the configured hesitation text.
// See http://www.ibm.com/watson/developercloud/doc/speech-to-text/output.shtml#hesitation - D_ words are handled by reDUnderscoreWords below
var reHesitation = /%HESITATION ?/g;
// Matches the same letter (a-z, case-insensitive) repeated three or more times in a row; clean() strips these runs as junk
var reRepeatedCharacter = /([a-z])\1{2,}/ig;
// Matches "D_" followed by one or more non-whitespace characters; clean() strips these tokens as junk
var reDUnderscoreWords = /D_[^\s]+/g;

/**
 * Formats one or more words, removing special symbols, junk, and spacing for some languages.
 *
 * In order:
 *  1. runs of the same letter repeated three or more times are removed
 *  2. "D_..." tokens are removed
 *  3. "%HESITATION" markers are replaced with the configured hesitation text
 *  4. for Japanese and Chinese models, all spaces are removed
 *
 * The hesitation text is inserted after the junk filters and through a replacer function,
 * so it is never altered by those filters (e.g. "hmmm") and `$` sequences in it
 * (e.g. "$&") are kept literally instead of being expanded by String#replace.
 *
 * @param {String} text
 * @return {String} the cleaned text, trimmed and followed by exactly one space
 *   (a single space if nothing is left after cleaning)
 */
FormatStream.prototype.clean = function clean(text) {
  // a non-empty hesitation is normalized to "<text> " so it doesn't run into the next word
  var hesitation = this.options.hesitation;
  if (hesitation) {
    hesitation = hesitation.trim() + ' ';
  }

  // clean out "junk" first, then drop in the hesitation text
  text = text
    .replace(reRepeatedCharacter, '')
    .replace(reDUnderscoreWords, '')
    .replace(reHesitation, function() {
      // a function replacer is used so the text is inserted as-is, with no `$` pattern expansion
      return hesitation;
    });

  // remove spaces for Japanese and Chinese
  if (this.isJaCn) {
    text = text.replace(/ /g, '');
  }

  return text.trim() + ' '; // we want exactly 1 space at the end
};

/**
 * Upper-cases the first character of the given text and leaves the rest untouched.
 * @param {String} text
 * @return {string} the capitalized text; an empty string is returned unchanged
 */
FormatStream.prototype.capitalize = function capitalize(text) {
  var firstCharacter = text.substring(0, 1); // '' when the text is empty
  var remainder = text.substring(1);
  return firstCharacter.toUpperCase() + remainder;
};

/**
 * Terminates a sentence: trims it and appends a period (a full-width one, with no trailing
 * space, for Japanese and Chinese models).
 * @param {String} text
 * @return {string} the terminated sentence, or a single space if the text was blank
 */
FormatStream.prototype.period = function period(text) {
  var sentence = text.trim();

  // don't put a period down if the clean stage removed all of the text
  if (sentence.length === 0) {
    return ' ';
  }

  var terminator;
  if (sentence.charAt(sentence.length - 1) === '\u2026') {
    // the sentence already ends in an ellipsis, so only a space is added
    terminator = ' ';
  } else if (this.isJaCn) {
    terminator = '。';
  } else {
    terminator = '. ';
  }
  return sentence + terminator;
};

/**
 * `_transform` implementation used when the stream is not in object mode: converts the chunk
 * to a string, formats it as a complete sentence, and pushes the result.
 *
 * @param {Buffer|String} chunk
 * @param {String} encoding - not used
 * @param {Function} next - invoked after the formatted text has been pushed
 */
FormatStream.prototype.transformString = function(chunk, encoding, next) {
  var rawText = chunk.toString();
  var formattedText = this.formatString(rawText);
  this.push(formattedText);
  next();
};

/**
 * `_transform` implementation used in object mode: formats the incoming result object and
 * pushes the formatted copy.
 *
 * @param {Object} result - passed to formatResult()
 * @param {String} encoding - not used
 * @param {Function} next - invoked after the formatted result has been pushed
 */
FormatStream.prototype.transformObject = function formatResult(result, encoding, next) {
  var formattedResult = this.formatResult(result);
  this.push(formattedResult);
  next();
};

/**
 * Formats a single transcript string: cleans it, capitalizes it and, unless the text is
 * interim, ends it with a period.
 *
 * May be used outside of Node.js streams
 *
 * @param {String} str - text to format
 * @param {bool} [isInterim=false] - set to true to prevent adding a period to the end of the sentence
 * @return {String}
 */
FormatStream.prototype.formatString = function(str, isInterim) {
  var cleaned = this.clean(str);
  var capitalized = this.capitalize(cleaned);
  if (isInterim) {
    // interim text may still grow, so it is left without a period
    return capitalized;
  }
  return this.period(capitalized);
};

/**
 * Creates a new result with all transcriptions formatted. The input is cloned first, so the
 * object passed in is left untouched.
 *
 * For every entry in `data.results`, each alternative gets:
 *  - its `transcript` run through formatString()
 *  - its `timestamps` (if present) cleaned word by word; entries whose word is empty after
 *    cleaning are dropped, then the first remaining word is capitalized and, for final
 *    text, the last remaining word gets a period
 *
 * Data without a `results` array is returned as an unmodified clone.
 *
 * May be used outside of Node.js streams
 *
 * @param {Object} data
 * @return {Object}
 */
FormatStream.prototype.formatResult = function formatResult(data) {
  data = clone(data);
  if (Array.isArray(data.results)) {
    data.results.forEach(function(result, i) {
      // if there are multiple interim results (as produced by the speaker stream),
      // treat the text as final in all but the last result
      var textFinal = result.final || i !== data.results.length - 1;

      result.alternatives = result.alternatives.map(function(alt) {
        alt.transcript = this.formatString(alt.transcript, !textFinal);
        if (alt.timestamps) {
          // timestamps is an array of arrays, each sub-array is in the form ["word", startTime, endTime]
          var timestamps = alt.timestamps
            .map(function(ts) {
              ts[0] = this.clean(ts[0]);
              return ts;
            }, this)
            .filter(function(ts) {
              // clean() always leaves a trailing space, so a junk-only word comes back as ' ';
              // compare the trimmed word to actually remove timestamps without a word
              return ts[0].trim() !== '';
            });

          // capitalize / punctuate after filtering so the formatting lands on words that
          // are kept, even when the original first or last word was junk
          if (timestamps.length > 0) {
            var firstWord = timestamps[0];
            firstWord[0] = this.capitalize(firstWord[0]);

            if (textFinal) {
              var lastWord = timestamps[timestamps.length - 1];
              lastWord[0] = this.period(lastWord[0]);
            }
          }
          alt.timestamps = timestamps;
        }
        return alt;
      }, this);
    }, this);
  }
  return data;
};

// Attach the shared helper exported by ./to-promise as a `promise` method on FormatStream instances.
// NOTE(review): the helper's behavior lives in ./to-promise, which is not visible here - presumably it
// returns a Promise for the stream's collected output; confirm against that module.
FormatStream.prototype.promise = require('./to-promise');

// The FormatStream constructor is this module's only export.
module.exports = FormatStream;