1 | ;
|
2 | /**
|
3 | * URL识别模块
|
4 | *
|
5 | * @author 老雷<leizongmin@gmail.com>
|
6 | */
|
// Alias of console.log, available for ad-hoc debug output.
var debug = console.log;
/** Module type: this module registers itself as a tokenizer. */
exports.type = 'tokenizer';
|
/**
 * Module initialisation.
 *
 * Stores the given segmenter on `exports.segment`; `exports.split` later
 * reads `exports.segment.POSTAG` from it.
 *
 * @param {Segment} segment segmenter interface
 */
exports.init = function (segment) {
  exports.segment = segment;
};
|
18 | /**
|
19 | * 对未识别的单词进行分词
|
20 | *
|
21 | * @param {array} words 单词数组
|
22 | * @return {array}
|
23 | */
|
24 | exports.split = function (words) {
|
25 | var POSTAG = exports.segment.POSTAG;
|
26 | var ret = [];
|
27 | for (var i = 0, word; word = words[i]; i++) {
|
28 | if (word.p > 0) {
|
29 | ret.push(word);
|
30 | continue;
|
31 | }
|
32 | // 仅对未识别的词进行匹配
|
33 | var urlinfo = matchURL(word.w);
|
34 | if (urlinfo.length < 1) {
|
35 | ret.push(word);
|
36 | continue;
|
37 | }
|
38 | // 分离出URL
|
39 | var lastc = 0;
|
40 | for (var ui = 0, url; url = urlinfo[ui]; ui++) {
|
41 | if (url.c > lastc) {
|
42 | ret.push({ w: word.w.substr(lastc, url.c - lastc) });
|
43 | }
|
44 | ret.push({ w: url.w, p: POSTAG.URL });
|
45 | lastc = url.c + url.w.length;
|
46 | }
|
47 | var lasturl = urlinfo[urlinfo.length - 1];
|
48 | if (lasturl.c + lasturl.w.length < word.w.length) {
|
49 | ret.push({ w: word.w.substr(lasturl.c + lasturl.w.length) });
|
50 | }
|
51 | }
|
52 | // debug(ret);
|
53 | return ret;
|
54 | };
|
55 | // =================================================================
|
// Protocol prefixes that mark the beginning of a URL.
var PROTOTAL = ['http://', 'https://', 'ftp://', 'news://', 'telnet://'];
// Length of the shortest protocol prefix.
// Derived with `reduce` instead of `for...in`: `for...in` on an array also
// visits enumerable properties added to Array.prototype, which would corrupt
// the minimum.
var MIN_PROTOTAL_LEN = PROTOTAL.reduce(function (shortest, prefix) {
  return Math.min(shortest, prefix.length);
}, Infinity);
|
// Characters that may appear inside a URL.
// The ASCII apostrophe (') is listed explicitly: the list follows ASCII order
// and previously carried only the typographic quote (‘) in that position, so
// URLs were cut at an apostrophe. The typographic quote is kept so that text
// accepted before is still accepted.
var _URLCHAR = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
  '!', '#', '$', '%', '&', '\'', '‘', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '=', '?', '@', '[', '\\', ']', '^', '_', '`', '|', '~'];
// Lookup table: URLCHAR[ch] is 1 for every allowed character, so membership
// can be tested with `ch in URLCHAR`.
var URLCHAR = {};
// forEach visits array elements only, unlike `for...in`, which would also pick
// up enumerable properties added to Array.prototype.
_URLCHAR.forEach(function (ch) {
  URLCHAR[ch] = 1;
});
|
74 | // =================================================================
|
/**
 * Finds the URLs contained in a piece of text.
 *
 * A URL starts at one of the PROTOTAL prefixes and runs until the first
 * character that is not listed in URLCHAR, or until the end of the text.
 *
 * @param {string} text text to scan
 * @param {int} cur start position (0 when omitted or not a number)
 * @return {array} list of matches, each shaped as {w: 'url', c: start position}
 */
var matchURL = function (text, cur) {
  var pos = isNaN(cur) ? 0 : cur;
  var found = [];
  // `false` while outside a URL, otherwise the start position of the URL
  // that is currently being scanned.
  var start = false;

  // Records the URL spanning from `start` up to (not including) `pos` and
  // leaves "inside a URL" mode.
  function flush() {
    found.push({
      w: text.substr(start, pos - start),
      c: start
    });
    start = false;
  }

  while (pos < text.length) {
    if (start === false) {
      // A prefix is only looked for while more than MIN_PROTOTAL_LEN
      // characters remain after the cursor.
      if (pos < text.length - MIN_PROTOTAL_LEN) {
        for (var k = 0; k < PROTOTAL.length; k++) {
          var prefix = PROTOTAL[k];
          if (text.substr(pos, prefix.length) === prefix) {
            start = pos;
            // Land on the last prefix character; the shared increment below
            // then moves to the first character after the prefix.
            pos += prefix.length - 1;
            break;
          }
        }
      }
    } else if (!(text.charAt(pos) in URLCHAR)) {
      // A character that cannot be part of a URL terminates the current one.
      flush();
    }
    pos++;
  }

  // A URL that runs to the very end of the text is still open here.
  if (start !== false) {
    flush();
  }
  return found;
};
|
117 | //# sourceMappingURL=data:application/json;base64,{"version":3,"file":"URLTokenizer.js","sourceRoot":"","sources":["URLTokenizer.ts"],"names":[],"mappings":"AAAA,YAAY,CAAC;AAEb;;;;GAIG;AAEH,IAAI,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC;AAExB,WAAW;AACX,OAAO,CAAC,IAAI,GAAG,WAAW,CAAC;AAE3B;;;;GAIG;AACH,OAAO,CAAC,IAAI,GAAG,UAAU,OAAO;IAC9B,OAAO,CAAC,OAAO,GAAG,OAAO,CAAC;AAC5B,CAAC,CAAC;AAEF;;;;;GAKG;AACH,OAAO,CAAC,KAAK,GAAG,UAAU,KAAK;IAC7B,IAAI,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC;IACpC,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,EAAE,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC1C,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,EAAE;YACd,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACf,SAAS;SACV;QACD,cAAc;QACd,IAAI,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE;YACtB,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACf,SAAS;SACV;QACD,SAAS;QACT,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,GAAG,GAAG,OAAO,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,EAAE;YAC7C,IAAI,GAAG,CAAC,CAAC,GAAG,KAAK,EAAE;gBACjB,GAAG,CAAC,IAAI,CAAC,EAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,GAAG,KAAK,CAAC,EAAC,CAAC,CAAC;aACpD;YACD,GAAG,CAAC,IAAI,CAAC,EAAC,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,GAAG,EAAC,CAAC,CAAC;YACpC,KAAK,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;SAC9B;QACD,IAAI,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAC1C,IAAI,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE;YAChD,GAAG,CAAC,IAAI,CAAC,EAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,EAAC,CAAC,CAAC;SAC5D;KACF;IACD,cAAc;IACd,OAAO,GAAG,CAAC;AACb,CAAC,CAAC;AAEF,oEAAoE;AACpE,SAAS;AACT,IAAI,QAAQ,GAAG,CAAC,SAAS,EAAE,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;AACzE,UAAU;AACV,IAAI,gBAAgB,GAAG,GAAG,CAAC;AAC3B,KAAK,IAAI,CAAC,IAAI,QAAQ,EAAE;IACtB,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,gBAAgB,EAAE;QACzC,gBAAgB,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;KACvC;CACF;AACD,
eAAe;AACf,IAAI,QAAQ,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG;IACxI,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG;IAChI,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG;IAChD,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;AAChJ,IAAI,OAAO,GAAG,EAAE,CAAC;AACjB,KAAK,IAAI,CAAC,IAAI,QAAQ,EAAE;IACtB,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;CAC1B;AACD,oEAAoE;AAEpE;;;;;;GAMG;AACH,IAAI,QAAQ,GAAG,UAAU,IAAI,EAAE,GAAG;IAChC,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,GAAG,GAAG,CAAC,CAAC;IACxB,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,IAAI,CAAC,GAAG,KAAK,CAAC;IACd,OAAO,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE;QACxB,wBAAwB;QACxB,IAAI,CAAC,KAAK,KAAK,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,GAAG,gBAAgB,EAAE;YACvD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,EAAE,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;gBAC7C,IAAI,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE;oBACzC,CAAC,GAAG,GAAG,CAAC;oBACR,GAAG,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;oBACvB,MAAM;iBACP;aACF;SACF;aAAM,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,OAAO,CAAC,EAAE;YACxD,+BAA+B;YAC/B,GAAG,CAAC,IAAI,CAAC;gBACP,CAAC,EAAG,IAAI,CAAC,MAAM,CAAC,CAAC,EAAE,GAAG,GAAG,CAAC,CAAC;gBAC3B,CAAC,EAAG,CAAC;aACN,CAAC,CAAC;YACH,CAAC,GAAG,KAAK,CAAC;SACX;QACD,GAAG,EAAE,CAAC;KACP;IACD,SAAS;IACT,IAAI,CAAC,KAAK,KAAK,EAAE;QACf,GAAG,CAAC,IAAI,CAAC;YACP,C
AAC,EAAG,IAAI,CAAC,MAAM,CAAC,CAAC,EAAE,GAAG,GAAG,CAAC,CAAC;YAC3B,CAAC,EAAG,CAAC;SACN,CAAC,CAAC;KACJ;IAED,OAAO,GAAG,CAAC;AACb,CAAC,CAAC","sourcesContent":["'use strict';\n\n/**\n * URL识别模块\n *\n * @author 老雷<leizongmin@gmail.com>\n */\n\nvar debug = console.log;\n\n/** 模块类型 */\nexports.type = 'tokenizer';\n\n/**\n * 模块初始化\n *\n * @param {Segment} segment 分词接口\n */\nexports.init = function (segment) {\n  exports.segment = segment;\n};\n\n/**\n * 对未识别的单词进行分词\n *\n * @param {array} words 单词数组\n * @return {array}\n */\nexports.split = function (words) {\n  var POSTAG = exports.segment.POSTAG;\n  var ret = [];\n  for (var i = 0, word; word = words[i]; i++) {\n    if (word.p > 0) {\n      ret.push(word);\n      continue;\n    }\n    // 仅对未识别的词进行匹配\n    var urlinfo = matchURL(word.w);\n    if (urlinfo.length < 1) {\n      ret.push(word);\n      continue;\n    }\n    // 分离出URL\n    var lastc = 0;\n    for (var ui = 0, url; url = urlinfo[ui]; ui++) {\n      if (url.c > lastc) {\n        ret.push({w: word.w.substr(lastc, url.c - lastc)});\n      }\n      ret.push({w: url.w, p: POSTAG.URL});\n      lastc = url.c + url.w.length;\n    }\n    var lasturl = urlinfo[urlinfo.length - 1];\n    if (lasturl.c + lasturl.w.length < word.w.length) {\n      ret.push({w: word.w.substr(lasturl.c + lasturl.w.length)});\n    }\n  }\n  // debug(ret);\n  return ret;\n};\n\n// =================================================================\n// 协议URL头\nvar PROTOTAL = ['http://', 'https://', 'ftp://', 'news://', 'telnet://'];\n// 协议头最小长度\nvar MIN_PROTOTAL_LEN = 100;\nfor (var i in PROTOTAL) {\n  if (PROTOTAL[i].length < MIN_PROTOTAL_LEN) {\n    MIN_PROTOTAL_LEN = PROTOTAL[i].length;\n  }\n}\n// 允许出现在URL中的字符\nvar _URLCHAR = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',\n        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',\n        
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',\n        '!', '#', '$', '%', '&', '‘', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '=', '?', '@', '[', '\\\\', ']', '^', '_', '`', '|', '~'];\nvar URLCHAR = {};\nfor (var i in _URLCHAR) {\n  URLCHAR[_URLCHAR[i]] = 1;\n}\n// =================================================================\n\n/**\n * 匹配包含的网址，返回相关信息\n *\n * @param {string} text 文本\n * @param {int} cur 开始位置\n * @return {array}  返回格式   {w: '网址', c: 开始位置}\n */\nvar matchURL = function (text, cur) {\n  if (isNaN(cur)) cur = 0;\n  var ret = [];\n  var s = false;\n  while (cur < text.length) {\n    // 判断是否为 http:// 之类的文本开头\n    if (s === false && cur < text.length - MIN_PROTOTAL_LEN) {\n      for (var i = 0, prot; prot = PROTOTAL[i]; i++) {\n        if (text.substr(cur, prot.length) == prot) {\n          s = cur;\n          cur += prot.length - 1;\n          break;\n        }\n      }\n    } else if (s !== false && !(text.charAt(cur) in URLCHAR)) {\n      // 如果以http://之类开头，遇到了非URL字符，则结束\n      ret.push({\n        w:  text.substr(s, cur - s),\n        c:  s\n      });\n      s = false;\n    }\n    cur++;\n  }\n  // 检查剩余部分\n  if (s !== false) {\n    ret.push({\n      w:  text.substr(s, cur - s),\n      c:  s\n    });\n  }\n\n  return ret;\n};\n// debug(matchURL('http://www.baidu.com哈啊http://哇fdgggghttp://baidu.com/ss/'));\n"]} |
\ | No newline at end of file |