1 | var fs = require('fs');
|
2 | var path = require("path");
|
3 | var readline = require('readline');
|
4 | var stream = require('stream');
|
5 | var str = require("string");
|
6 | var async = require("async");
|
7 | var debug = require('debug')("Normalizer");
|
8 |
|
9 |
|
// Substitution data files handed to loadData(), listed as [key, file] pairs.
// `key` names the bucket inside `reSet` that the file's rules are stored under;
// `file` is the name readSubstitutes() resolves inside the ../data directory.
// NOTE: the '_britsh' spelling is kept as-is; within this file the key is only
// used as an internal index into `reSet`.
var tasks = [
  ['_sys', 'systemessentials.txt'],
  ['_extra', 'substitutes.txt'],
  ['_contractions', 'contractions.txt'],
  ['_interjections', 'interjections.txt'],
  ['_britsh', 'british.txt'],
  ['_spellfix', 'spellfix.txt'],
  ['_texting', 'texting.txt']
].map(function(pair) {
  return { key: pair[0], file: pair[1] };
});

// Compiled rules, filled in by loadData():
//   reSet[taskKey][patternKey] -> array of { re: RegExp, r: replacementString }
var reSet = {};
|
21 |
|
/**
 * Reads one substitution data file line by line and reports each entry.
 *
 * Line format, as parsed here: leading whitespace is ignored, everything from
 * the first '#' onward is a comment, and the remaining text is split on single
 * spaces into `key` (first field) and `replacer` (second field).
 *
 * @param {string} file - File name, resolved relative to "../data/" from this module's directory.
 * @param {function(string, string)} lineHandle - Called as lineHandle(key, replacer)
 *   for every entry. `replacer` is always a string; it is "" when the line has no
 *   second field.
 * @param {function} closeHandle - Registered as the readline 'close' listener.
 */
var readSubstitutes = function(file, lineHandle, closeHandle) {
  var p = path.join(__dirname, "../data/", file);
  var instream = fs.createReadStream(p);
  var outstream = new stream;
  var rl = readline.createInterface(instream, outstream);

  rl.on('line', function(line){
    var text = str(line).trimLeft().s;

    var pos = text.indexOf('#');
    if (pos === 0) {
      // The whole line is a comment.
      return;
    }
    if (pos > 0) {
      // Drop the trailing comment, keep what precedes it.
      text = text.substring(0, pos);
    }
    if (text === "") {
      // Blank line: an empty key would only produce rules that change nothing.
      return;
    }

    var parts = text.split(" ");
    // Never hand `undefined` to lineHandle: used as a replacement it would be
    // inserted into messages as the literal text "undefined".
    var replacer = (parts[1] === undefined) ? "" : parts[1];
    lineHandle(parts[0], replacer);
  });

  rl.on('close', closeHandle);
};
|
50 |
|
51 | exports.loadData = function(cb){
|
52 |
|
53 | var itor = function(item, cb) {
|
54 | debug("Loaded File", item);
|
55 |
|
56 | var lineHandle = function(key, replacer) {
|
57 |
|
58 | if (reSet[item.key] === undefined) {
|
59 | reSet[item.key] = {};
|
60 | }
|
61 |
|
62 | if (reSet[item.key][key] === undefined) {
|
63 | reSet[item.key][key] = [];
|
64 | }
|
65 |
|
66 |
|
67 | var startM, endM, lookup = key;
|
68 | if (key[0] == '<') {
|
69 | startM = true;
|
70 | lookup = key.substring(1);
|
71 | }
|
72 |
|
73 | if (key.slice(-1) == '>') {
|
74 | endM = true;
|
75 | lookup = lookup.substring(0, lookup.length - 1);
|
76 | }
|
77 |
|
78 | lookup = lookup.replace(/_/g," ");
|
79 | var qm = quotemeta(lookup);
|
80 |
|
81 | if (startM && endM) {
|
82 | reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
|
83 | } else if (startM) {
|
84 | reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
|
85 | reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1"});
|
86 | } else if (endM) {
|
87 |
|
88 | reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer });
|
89 | if (item.key == "_sys") {
|
90 | reSet[item.key][key].push({re: new RegExp(qm + "$", "gi"), r: replacer });
|
91 | } else {
|
92 |
|
93 | }
|
94 | } else {
|
95 | reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
|
96 | reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1" });
|
97 | reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" });
|
98 | reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer });
|
99 | }
|
100 | }
|
101 |
|
102 | readSubstitutes(item.file, lineHandle, function() {
|
103 | cb(null);
|
104 | });
|
105 | }
|
106 |
|
107 | async.map(tasks, itor, function(){
|
108 | debug("Done Loading files");
|
109 | cb();
|
110 | });
|
111 | };
|
112 |
|
113 | exports.clean = function(msg){
|
114 |
|
115 | msg = msg.replace(/\+/g, "<plus>");
|
116 | msg = msg.replace(new RegExp("\t", "g"), " ");
|
117 | msg = msg.replace(/\s+/g, " ");
|
118 |
|
119 | var fileItor = function(item1, next1) {
|
120 |
|
121 | var itemItor = function(item2, next2) {
|
122 | var reArray = reSet[item1][item2];
|
123 | var reItor = function(item3, next3) {
|
124 | var pm = msg;
|
125 | msg = msg.replace(item3.re, item3.r);
|
126 | next3(null);
|
127 | };
|
128 |
|
129 | async.map(reArray, reItor, function(){
|
130 | next2(null);
|
131 | });
|
132 | }
|
133 |
|
134 | async.each(Object.keys(reSet[item1]), itemItor, function(){
|
135 | next1(null)
|
136 | });
|
137 | }
|
138 |
|
139 | async.mapSeries(Object.keys(reSet), fileItor, function() {
|
140 | msg = msg.replace(new RegExp("[\+]{1}", "g"), " ");
|
141 | msg = msg.replace(new RegExp("<plus>", "g"), "+");
|
142 | msg = msg.replace(/\d,\d/g, function(v) { return v.replace(",",""); });
|
143 | });
|
144 |
|
145 | return msg.trim();
|
146 | }
|
147 |
|
/**
 * Escapes regular-expression metacharacters so the text can be embedded in a
 * RegExp source and matched literally.
 *
 * Each of the characters \ . + * ? [ ^ ] $ ( ) { } = ! < > | : is prefixed
 * with a backslash. This is done in a single pass with one pattern instead of
 * compiling a new RegExp per character on every call.
 *
 * @param {string} string - Text to escape.
 * @returns {string} The escaped text.
 */
var quotemeta = function (string) {
  // "$&" in the replacement stands for the matched character.
  return string.replace(/[\\.+*?\[\^\]$(){}=!<>|:]/g, "\\$&");
};
\ | No newline at end of file |