'use strict';
var Transform = require('stream').Transform;
var util = require('util');
var clone = require('clone');
var defaults = require('defaults');
/**
 * Applies some basic formatting to transcriptions:
 *  - Capitalize the first word of each sentence
 *  - Add a period to the end
 *  - Fix any "cruft" in the transcription
 *  - etc.
 *
 * May be used as either a Stream, or a standalone helper.
 *
 * The merged options object is stored on `this.options` and is also handed to the
 * Transform constructor, so stream options may be supplied alongside the ones below.
 *
 * @param {Object} opts
 * @param {String} [opts.model=''] - some models / languages need special handling
 * @param {String} [opts.hesitation=''] - what to put down for a "hesitation" event, also consider \u2026 (ellipsis: ...)
 * @param {Boolean} [opts.objectMode=false] - emit `result` objects instead of string Buffers for the `data` events.
 * @param {Boolean} [opts.decodeStrings=false] - false = don't convert strings to buffers before passing to _write
 * @constructor
 */
function FormatStream(opts) {
  var fallbackOptions = {
    model: '', // some models should have all spaces removed
    hesitation: '',
    decodeStrings: false
  };
  this.options = defaults(opts, fallbackOptions);
  Transform.call(this, this.options);

  // only the first five characters of the model name (e.g. "ja-JP") are compared
  var modelPrefix = this.options.model.substring(0, 5);
  this.isJaCn = modelPrefix === 'ja-JP' || modelPrefix === 'zh-CN';

  // choose the _transform implementation once, based on the stream mode
  if (this.options.objectMode) {
    this._transform = this.transformObject;
  } else {
    this._transform = this.transformString;
  }
}
// Set up FormatStream's prototype chain so that it inherits from Transform
util.inherits(FormatStream, Transform);
// Matches every literal "%HESITATION" marker together with one optional trailing space.
// See http://www.ibm.com/watson/developercloud/doc/speech-to-text/output.shtml#hesitation - "D_" tokens are matched by reDUnderscoreWords below
var reHesitation = /%HESITATION ?/g;
// Matches the same letter (compared case-insensitively) repeated three or more times in a row; clean() removes every match
var reRepeatedCharacter = /([a-z])\1{2,}/ig;
// Matches "D_" followed by one or more non-whitespace characters; clean() removes every match
var reDUnderscoreWords = /D_[^\s]+/g;
/**
 * Formats one or more words, removing special symbols, junk, and spacing for some languages
 *
 * Hesitation markers are swapped for the configured `hesitation` text, runs of three or
 * more identical letters and "D_..." tokens are dropped, and for Japanese / Chinese
 * models every space is removed.
 *
 * @param {String} text - raw word(s) from a transcription
 * @return {String} the cleaned text, trimmed and followed by exactly one space
 */
FormatStream.prototype.clean = function clean(text) {
  var hesitation = this.options.hesitation;
  // a non-empty hesitation is normalized to end in a single space; a falsy one is used unchanged
  var hesitationText = hesitation ? hesitation.trim() + ' ' : hesitation;

  // clean out "junk", one pattern at a time
  var cleaned = text.replace(reHesitation, hesitationText);
  cleaned = cleaned.replace(reRepeatedCharacter, '');
  cleaned = cleaned.replace(reDUnderscoreWords, '');

  // Japanese and Chinese are written without spaces between words
  if (this.isJaCn) {
    cleaned = cleaned.split(' ').join('');
  }

  // we want exactly 1 space at the end
  return cleaned.trim() + ' ';
};
/**
 * Capitalizes the first word of a sentence
 *
 * Only the first character is upper-cased; the rest of the text is left untouched.
 *
 * @param {String} text
 * @return {string} the text with its first character upper-cased ('' for an empty string)
 */
FormatStream.prototype.capitalize = function capitalize(text) {
  var firstCharacter = text.slice(0, 1);
  var remainder = text.slice(1);
  return firstCharacter.toUpperCase() + remainder;
};
/**
 * Puts a period on the end of a sentence
 *
 * @param {String} text
 * @return {string} the trimmed text followed by '. ' (or '。' for Japanese / Chinese models);
 *   a single space if the text is blank; the text plus a space if it already ends in an ellipsis
 */
FormatStream.prototype.period = function period(text) {
  var sentence = text.trim();

  // don't put a period down if the clean stage removed all of the text
  if (sentence === '') {
    return ' ';
  }

  // just add a space if the sentence ends in an ellipsis
  var lastCharacter = sentence.charAt(sentence.length - 1);
  if (lastCharacter === '\u2026') {
    return sentence + ' ';
  }

  var terminator = this.isJaCn ? '。' : '. ';
  return sentence + terminator;
};
/**
 * `_transform` implementation used when the stream is not in object mode:
 * converts the chunk to a string, formats it, and pushes the result downstream.
 *
 * @param {Buffer|String} chunk - text to format (converted with `toString()`)
 * @param {String} encoding - not used
 * @param {Function} next - called with no arguments once the formatted text has been pushed
 */
FormatStream.prototype.transformString = function(chunk, encoding, next) {
  var rawText = chunk.toString();
  var formattedText = this.formatString(rawText);
  this.push(formattedText);
  next();
};
/**
 * `_transform` implementation used in object mode:
 * formats a whole result object and pushes the formatted copy downstream.
 *
 * @param {Object} result - result object to format
 * @param {String} encoding - not used
 * @param {Function} next - called with no arguments once the formatted result has been pushed
 */
FormatStream.prototype.transformObject = function formatResult(result, encoding, next) {
  var formattedResult = this.formatResult(result);
  this.push(formattedResult);
  next();
};
/**
 * Formats a single string result.
 *
 * The text is cleaned and capitalized; unless it is interim, a period is then added.
 *
 * May be used outside of Node.js streams
 *
 * @param {String} str - text to format
 * @param {bool} [isInterim=false] - set to true to prevent adding a period to the end of the sentence
 * @return {String}
 */
FormatStream.prototype.formatString = function(str, isInterim) {
  var cleaned = this.clean(str);
  var sentence = this.capitalize(cleaned);
  if (isInterim) {
    // interim text may still grow, so it is left without closing punctuation
    return sentence;
  }
  return this.period(sentence);
};
/**
 * Creates a new result with all transcriptions formatted
 *
 * The input is deep-cloned first, so the caller's object is never modified.
 * For every alternative of every result the transcript is run through
 * `formatString`, and, when word timestamps are present, each word is cleaned,
 * words that are blank after cleaning are dropped, the first remaining word is
 * capitalized and (for final text) the last remaining word gets a period.
 *
 * May be used outside of Node.js streams
 *
 * @param {Object} data - message to format; returned as an unformatted clone if `data.results` is not an array
 * @return {Object} a formatted copy of `data`
 */
FormatStream.prototype.formatResult = function formatResult(data) {
  var self = this;
  data = clone(data);
  if (!Array.isArray(data.results)) {
    return data;
  }

  var lastResultIndex = data.results.length - 1;
  data.results.forEach(function(result, i) {
    // if there are multiple interim results (as produced by the speaker stream),
    // treat the text as final in all but the last result
    var textFinal = result.final || i !== lastResultIndex;

    result.alternatives = result.alternatives.map(function(alt) {
      alt.transcript = self.formatString(alt.transcript, !textFinal);

      if (alt.timestamps) {
        // timestamps is an array of arrays, each sub-array is in the form ["word", startTime, endTime]
        var words = alt.timestamps
          .map(function(ts) {
            ts[0] = self.clean(ts[0]);
            return ts;
          })
          .filter(function(ts) {
            // clean() always leaves at least one trailing space, so a plain truthiness check
            // would keep junk words; compare the trimmed text instead
            return ts[0].trim() !== '';
          });

        // capitalize / punctuate only after junk has been removed, so that the formatting
        // lands on real words and matches the formatted transcript
        if (words.length > 0) {
          words[0][0] = self.capitalize(words[0][0]);
          if (textFinal) {
            var lastWord = words[words.length - 1];
            lastWord[0] = self.period(lastWord[0]);
          }
        }
        alt.timestamps = words;
      }
      return alt;
    });
  });
  return data;
};
// Expose whatever ./to-promise exports as the `promise` method of every FormatStream instance
// NOTE(review): presumably it collects the stream's output into a Promise - confirm against ./to-promise
FormatStream.prototype.promise = require('./to-promise');
// Export the constructor as this module's value
module.exports = FormatStream;