UNPKG

12.6 kBJavaScriptView Raw
1'use strict';
2Object.defineProperty(exports, "__esModule", { value: true });
3exports.init = exports.EmailOptimizer = exports.EMAILCHAR = exports._EMAILCHAR = void 0;
4const mod_1 = require("../mod");
5/**
6 * 邮箱地址中允许出现的字符
7 * 参考:http://www.cs.tut.fi/~jkorpela/rfc/822addr.html
8 */
9exports._EMAILCHAR = '!"#$%&\'*+-/0123456789=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~.'.split('');
10exports.EMAILCHAR = {};
11for (let i in exports._EMAILCHAR)
12 exports.EMAILCHAR[exports._EMAILCHAR[i]] = 1;
/**
 * Optimizer module that recognises e-mail addresses.
 *
 * Runs of adjacent words that together look like an e-mail address are
 * collapsed into a single word tagged `POSTAG.URL`.
 *
 * @author 老雷<leizongmin@gmail.com>
 */
class EmailOptimizer extends mod_1.SubSModuleOptimizer {
    /**
     * Merge the words that may form an e-mail address.
     *
     * The array is modified in place (via `splice`) and is also returned.
     *
     * @param {array} words word array; each entry is read as `{ w, p }`
     * @return {array} the same `words` array
     */
    doOptimize(words) {
        const POSTAG = this.segment.POSTAG;
        const charTable = exports.EMAILCHAR;
        // Own-property lookup, so inherited names such as "constructor" or
        // "toString" are never mistaken for e-mail characters.
        const isEmailChar = (text) => Object.prototype.hasOwnProperty.call(charTable, text);
        // A word can be part of an address when it is tagged A_NX, or tagged
        // A_M (number) and starts with an ASCII character.
        const isAsciiWord = (word) => word.p === POSTAG.A_NX
            || (word.p === POSTAG.A_M && word.w.charCodeAt(0) < 128);
        // Replace words[from, to) with one URL-tagged word and report how
        // many words were consumed.
        const mergeRange = (from, to) => {
            const parts = words.slice(from, to);
            words.splice(from, parts.length, {
                w: this.toEmailAddress(parts),
                p: POSTAG.URL,
            });
            return parts.length;
        };
        let i = 0;
        // The loop stops before the last word; that word is handled by the
        // tail check after the loop.
        let ie = words.length - 1;
        // Index of the first word of the current candidate, or false.
        let addr_start = false;
        // Whether the current candidate already contains an '@'.
        let has_at = false;
        while (i < ie) {
            const word = words[i];
            const is_ascii = isAsciiWord(word);
            // A foreign-text or numeric word can open an address.
            if (addr_start === false && is_ascii) {
                addr_start = i;
                i++;
                continue;
            }
            // An '@' only counts once an address has started: an address
            // needs a non-empty part before the '@'.
            if (has_at === false && addr_start !== false && word.w === '@') {
                has_at = true;
                i++;
                continue;
            }
            // An '@' has been seen and this word cannot belong to an address:
            // cut the address off just before it.
            if (has_at !== false && words[i - 1].w !== '@' && !is_ascii && !isEmailChar(word.w)) {
                const consumed = mergeRange(addr_start, i);
                // Resume at the word that ended the address; the array is
                // now shorter by `consumed - 1` entries.
                i = addr_start + 1;
                ie -= consumed - 1;
                addr_start = false;
                has_at = false;
                continue;
            }
            // Still inside a candidate address.
            if (addr_start !== false && (is_ascii || isEmailChar(word.w))) {
                i++;
                continue;
            }
            // Not part of an address: drop the candidate and move on.
            addr_start = false;
            has_at = false;
            i++;
        }
        // Tail check. Compare against false explicitly: index 0 is a valid
        // start position.
        if (addr_start !== false && has_at && words[ie]) {
            const last = words[ie];
            if (isAsciiWord(last)) {
                // The address runs through the final word.
                mergeRange(addr_start, words.length);
            }
            else if (words[ie - 1].w !== '@' && !isEmailChar(last.w)) {
                // The final word terminates the address, exactly as a
                // terminating word inside the loop would.
                mergeRange(addr_start, ie);
            }
        }
        return words;
    }
    /**
     * Build the address string from a group of words by concatenating the
     * `w` text of every word in order.
     *
     * @param {array} words word array
     * @return {string}
     */
    toEmailAddress(words) {
        let address = '';
        for (const word of words) {
            address += word.w;
        }
        return address;
    }
}
// Named export of the optimizer class.
exports.EmailOptimizer = EmailOptimizer;
// `init` is `EmailOptimizer.init` bound to the class, so it can be called as
// a bare function. The static `init` is not defined in this file — presumably
// it is inherited from `SubSModuleOptimizer`; verify in ../mod.
exports.init = EmailOptimizer.init.bind(EmailOptimizer);
// Default export: the same class.
exports.default = EmailOptimizer;
110//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"EmailOptimizer.js","sourceRoot":"","sources":["EmailOptimizer.ts"],"names":[],"mappings":"AAAA,YAAY,CAAC;;;AAEb,gCAA8E;AAI9E;;;GAGG;AACU,QAAA,UAAU,GAAG,uFAAuF,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;AAC/G,QAAA,SAAS,GAAkB,EAAE,CAAC;AAC3C,KAAK,IAAI,CAAC,IAAI,kBAAU;IAAE,iBAAS,CAAC,kBAAU,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;AAEvD;;;;GAIG;AACH,MAAa,cAAe,SAAQ,yBAAmB;IAGtD;;;;;OAKG;IACH,UAAU,CAAC,KAAK;QAEf,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;QACnC,eAAe;QAEf,IAAI,CAAC,GAAG,CAAC,CAAC;QACV,IAAI,EAAE,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;QAC1B,IAAI,UAAU,GAAqB,KAAK,CAAC;QACzC,IAAI,MAAM,GAAG,KAAK,CAAC;QAEnB,OAAO,CAAC,GAAG,EAAE,EACb;YACC,IAAI,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACpB,IAAI,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,MAAM,CAAC,IAAI,CAAC;gBACtC,CAAC,IAAI,CAAC,CAAC,IAAI,MAAM,CAAC,GAAG,IAAI,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;gBACrD,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC;YAEhB,4BAA4B;YAC5B,aAAa;YACb,IAAI,UAAU,KAAK,KAAK,IAAI,QAAQ,EACpC;gBACC,UAAU,GAAG,CAAC,CAAC;gBACf,CAAC,EAAE,CAAC;gBACJ,SAAS;aACT;iBAED;gBACC,kBAAkB;gBAClB,IAAI,MAAM,KAAK,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI,GAAG,EACrC;oBACC,MAAM,GAAG,IAAI,CAAC;oBACd,CAAC,EAAE,CAAC;oBACJ,SAAS;iBACT;gBACD,8BAA8B;gBAC9B,IAAI,MAAM,KAAK,KAAK,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,GAAG,IAAI,QAAQ,KAAK,KAAK,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,iBAAS,CAAC,EAC7F;oBACC,IAAI,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;oBACxC,gCAAgC;oBAChC,KAAK,CAAC,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE;wBACvC,CAAC,EAAE,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC;wBAC9B,CAAC,EAAE,MAAM,CAAC,GAAG;qBACb,CAAC,CAAC;oBACH,CAAC,GAAW,UAAU,GAAG,CAAC,CAAC;oBAC3B,EAAE,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC;oBACxB,UAAU,GAAG,KAAK,CAAC;oBACnB,MAAM,GAAG,KAAK,CAAC;oBACf,SAAS;iBACT;gBACD,SAAS;gBACT,IAAI,UAAU,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,CAAC,IAAI,iBAAS,CAAC,EAC7D;oBACC,CAAC,EAAE,CAAC;oBACJ,SAAS;iBACT;aACD;YAED,SAAS;YACT,UAAU,GAAG,KAAK,CAAC;YACnB,MAAM,GAAG,KAAK,CAAC;YACf,
CAAC,EAAE,CAAC;SACJ;QAED,SAAS;QACT,IAAI,UAAU,IAAI,MAAM,IAAI,KAAK,CAAC,EAAE,CAAC,EACrC;YACC,IAAI,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC,CAAC;YACrB,IAAI,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,MAAM,CAAC,IAAI,CAAC;gBACtC,CAAC,IAAI,CAAC,CAAC,IAAI,MAAM,CAAC,GAAG,IAAI,IAAI,CAAC,CAAC,IAAI,iBAAS,CAAC,CAAC;gBAC9C,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC;YAChB,IAAI,QAAQ,EACZ;gBACC,IAAI,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,UAAU,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;gBACnD,gCAAgC;gBAChC,KAAK,CAAC,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE;oBACvC,CAAC,EAAE,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC;oBAC9B,CAAC,EAAE,MAAM,CAAC,GAAG;iBACb,CAAC,CAAC;aACH;SACD;QAED,OAAO,KAAK,CAAC;IACd,CAAC;IAED;;;;;OAKG;IACH,cAAc,CAAC,KAAc;QAE5B,IAAI,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,EAAE,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAC1C;YACC,GAAG,IAAI,IAAI,CAAC,CAAC,CAAC;SACd;QACD,OAAO,GAAG,CAAC;IACZ,CAAC;CAED;AA7GD,wCA6GC;AAEY,QAAA,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,cAAc,CAAwC,CAAC;AAEpG,kBAAe,cAAc,CAAC","sourcesContent":["'use strict';\n\nimport { SubSModule, SubSModuleOptimizer, ISubOptimizerCreate } from '../mod';\nimport { Segment, IWord, IDICT } from '../Segment';\nimport UString from 'uni-string';\n\n/**\n * 邮箱地址中允许出现的字符\n * 参考：http://www.cs.tut.fi/~jkorpela/rfc/822addr.html\n */\nexport const _EMAILCHAR = '!\"#$%&\\'*+-/0123456789=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~.'.split('');\nexport const EMAILCHAR: IDICT<number> = {};\nfor (let i in _EMAILCHAR) EMAILCHAR[_EMAILCHAR[i]] = 1;\n\n/**\n * 邮箱地址识别优化模块\n *\n * @author 老雷<leizongmin@gmail.com>\n */\nexport class EmailOptimizer extends SubSModuleOptimizer\n{\n\n\t/**\n\t * 对可能是邮箱地址的单词进行优化\n\t *\n\t * @param {array} words 单词数组\n\t * @return {array}\n\t */\n\tdoOptimize(words)\n\t{\n\t\tconst POSTAG = this.segment.POSTAG;\n\t\t//debug(words);\n\n\t\tlet i = 0;\n\t\tlet ie = words.length - 1;\n\t\tlet addr_start: boolean | number = false;\n\t\tlet has_at = false;\n\n\t\twhile (i < ie)\n\t\t{\n\t\t\tlet 
word = words[i];\n\t\t\tlet is_ascii = ((word.p == POSTAG.A_NX) ||\n\t\t\t\t(word.p == POSTAG.A_M && word.w.charCodeAt(0) < 128))\n\t\t\t\t? true : false;\n\n\t\t\t// 如果是外文字符或者数字，符合电子邮件地址开头的条件\n\t\t\t// @ts-ignore\n\t\t\tif (addr_start === false && is_ascii)\n\t\t\t{\n\t\t\t\taddr_start = i;\n\t\t\t\ti++;\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\t// 如果遇到@符号，符合第二个条件\n\t\t\t\tif (has_at === false && word.w == '@')\n\t\t\t\t{\n\t\t\t\t\thas_at = true;\n\t\t\t\t\ti++;\n\t\t\t\t\tcontinue;\n\t\t\t\t}\n\t\t\t\t// 如果已经遇到过@符号，且出现了其他字符，则截取邮箱地址\n\t\t\t\tif (has_at !== false && words[i - 1].w != '@' && is_ascii === false && !(word.w in EMAILCHAR))\n\t\t\t\t{\n\t\t\t\t\tlet mailws = words.slice(addr_start, i);\n\t\t\t\t\t//debug(toEmailAddress(mailws));\n\t\t\t\t\twords.splice(addr_start, mailws.length, {\n\t\t\t\t\t\tw: this.toEmailAddress(mailws),\n\t\t\t\t\t\tp: POSTAG.URL\n\t\t\t\t\t});\n\t\t\t\t\ti = <number>addr_start + 1;\n\t\t\t\t\tie -= mailws.length - 1;\n\t\t\t\t\taddr_start = false;\n\t\t\t\t\thas_at = false;\n\t\t\t\t\tcontinue;\n\t\t\t\t}\n\t\t\t\t// 如果已经开头\n\t\t\t\tif (addr_start !== false && (is_ascii || word.w in EMAILCHAR))\n\t\t\t\t{\n\t\t\t\t\ti++;\n\t\t\t\t\tcontinue;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t// 移到下一个词\n\t\t\taddr_start = false;\n\t\t\thas_at = false;\n\t\t\ti++;\n\t\t}\n\n\t\t// 检查剩余部分\n\t\tif (addr_start && has_at && words[ie])\n\t\t{\n\t\t\tlet word = words[ie];\n\t\t\tlet is_ascii = ((word.p == POSTAG.A_NX) ||\n\t\t\t\t(word.p == POSTAG.A_M && word.w in EMAILCHAR))\n\t\t\t\t? 
true : false;\n\t\t\tif (is_ascii)\n\t\t\t{\n\t\t\t\tlet mailws = words.slice(addr_start, words.length);\n\t\t\t\t//debug(toEmailAddress(mailws));\n\t\t\t\twords.splice(addr_start, mailws.length, {\n\t\t\t\t\tw: this.toEmailAddress(mailws),\n\t\t\t\t\tp: POSTAG.URL\n\t\t\t\t});\n\t\t\t}\n\t\t}\n\n\t\treturn words;\n\t}\n\n\t/**\n\t * 根据一组单词生成邮箱地址\n\t *\n\t * @param {array} words 单词数组\n\t * @return {string}\n\t */\n\ttoEmailAddress(words: IWord[])\n\t{\n\t\tlet ret = words[0].w;\n\t\tfor (let i = 1, word; word = words[i]; i++)\n\t\t{\n\t\t\tret += word.w;\n\t\t}\n\t\treturn ret;\n\t}\n\n}\n\nexport const init = EmailOptimizer.init.bind(EmailOptimizer) as ISubOptimizerCreate<EmailOptimizer>;\n\nexport default EmailOptimizer;\n"]}
\No newline at end of file