UNPKG

4.27 kBJavaScriptView Raw
1var fs = require('fs');
2var path = require("path");
3var readline = require('readline');
4var stream = require('stream');
5var str = require("string");
6var async = require("async");
7var debug = require('debug')("Normalizer");
8
// Substitution files to load. Each `file` is resolved against the ../data
// directory by readSubstitutes, and each `key` names the rule set that the
// file's entries are stored under in `reSet`.
// TODO: make the data directory configurable instead of relying on ../data.
var tasks = [
  {'key':'_sys','file':'systemessentials.txt'},
  {'key':'_extra','file':'substitutes.txt'},
  {'key':'_contractions','file':'contractions.txt'},
  {'key':'_interjections','file':'interjections.txt'},
  {'key':'_british','file':'british.txt'},
  {'key':'_spellfix','file':'spellfix.txt'},
  {'key':'_texting','file':'texting.txt'}
];

// Compiled substitution rules, filled in by loadData and read by clean:
//   reSet[<task key>][<entry key>] = [ { re: RegExp, r: replacement }, ... ]
var reSet = {};
21
/**
 * Read a substitution file from ../data (relative to this module) line by line.
 *
 * Each non-empty line has the form `key [replacer]`. Everything from the first
 * '#' to the end of the line is treated as a comment and ignored, and lines
 * that are blank or contain only a comment are skipped.
 *
 * @param {string} file - File name inside the ../data directory.
 * @param {function(string, string)} lineHandle - Called once per entry with the
 *   key and the replacer; the replacer is "" when the line has no second field.
 * @param {function()} closeHandle - Called when the whole file has been read.
 */
var readSubstitutes = function(file, lineHandle, closeHandle) {
  var p = path.join(__dirname, "../data/", file);
  var instream = fs.createReadStream(p);

  // Only the input side is needed to read lines, so no output stream is passed.
  // NOTE: read errors (e.g. a missing file) are not handled here; they surface
  // as an unhandled 'error' event on the read stream.
  var rl = readline.createInterface({ input: instream, crlfDelay: Infinity });

  rl.on('line', function(line) {
    // Allow comments with '#': keep only the text in front of the marker.
    var pos = line.indexOf('#');
    var content = (pos === -1 ? line : line.substring(0, pos)).trim();

    if (content === "") {
      return;
    }

    // Fields may be separated by any run of whitespace.
    var parts = content.split(/\s+/);

    // A missing replacer always becomes "", with or without a trailing comment,
    // so callers never end up concatenating `undefined` into a replacement.
    lineHandle(parts[0], parts[1] === undefined ? "" : parts[1]);
  });

  rl.on('close', closeHandle);
};
50
51exports.loadData = function(cb){
52
53 var itor = function(item, cb) {
54 debug("Loaded File", item);
55
56 var lineHandle = function(key, replacer) {
57
58 if (reSet[item.key] === undefined) {
59 reSet[item.key] = {};
60 }
61
62 if (reSet[item.key][key] === undefined) {
63 reSet[item.key][key] = [];
64 }
65
66 // Add RegEx
67 var startM, endM, lookup = key;
68 if (key[0] == '<') {
69 startM = true;
70 lookup = key.substring(1);
71 }
72
73 if (key.slice(-1) == '>') {
74 endM = true;
75 lookup = lookup.substring(0, lookup.length - 1);
76 }
77
78 lookup = lookup.replace(/_/g," ");
79 var qm = quotemeta(lookup);
80
81 if (startM && endM) {
82 reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
83 } else if (startM) {
84 reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
85 reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1"});
86 } else if (endM) {
87
88 reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer });
89 if (item.key == "_sys") {
90 reSet[item.key][key].push({re: new RegExp(qm + "$", "gi"), r: replacer });
91 } else {
92 // reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" });
93 }
94 } else {
95 reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
96 reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1" });
97 reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" });
98 reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer });
99 }
100 }
101
102 readSubstitutes(item.file, lineHandle, function() {
103 cb(null);
104 });
105 }
106
107 async.map(tasks, itor, function(){
108 debug("Done Loading files");
109 cb();
110 });
111};
112
113exports.clean = function(msg){
114
115 msg = msg.replace(/\+/g, "<plus>");
116 msg = msg.replace(new RegExp("\t", "g"), " ");
117 msg = msg.replace(/\s+/g, " ");
118
119 var fileItor = function(item1, next1) {
120
121 var itemItor = function(item2, next2) {
122 var reArray = reSet[item1][item2];
123 var reItor = function(item3, next3) {
124 var pm = msg;
125 msg = msg.replace(item3.re, item3.r);
126 next3(null);
127 };
128
129 async.map(reArray, reItor, function(){
130 next2(null);
131 });
132 }
133
134 async.each(Object.keys(reSet[item1]), itemItor, function(){
135 next1(null)
136 });
137 }
138
139 async.mapSeries(Object.keys(reSet), fileItor, function() {
140 msg = msg.replace(new RegExp("[\+]{1}", "g"), " ");
141 msg = msg.replace(new RegExp("<plus>", "g"), "+");
142 msg = msg.replace(/\d,\d/g, function(v) { return v.replace(",",""); });
143 });
144
145 return msg.trim();
146}
147
/**
 * Escape regular-expression metacharacters so the text can be embedded in a
 * RegExp source string and matched literally.
 *
 * The characters escaped are: \ . + * ? [ ^ ] $ ( ) { } = ! < > | :
 *
 * @param {string} string - Text to escape.
 * @returns {string} The text with each listed character prefixed by a backslash.
 */
var quotemeta = function (string) {
  // One pass over the input: "$&" re-inserts the matched character after the
  // added backslash, so backslashes that are inserted are never escaped again.
  return string.replace(/[\\.+*?\[\]^$(){}=!<>|:]/g, "\\$&");
};
\No newline at end of file