UNPKG

7.58 kBSource Map (JSON)View Raw
1{"version":3,"file":"URLTokenizer.js","sourceRoot":"","sources":["URLTokenizer.ts"],"names":[],"mappings":"AAAA,YAAY,CAAC;AAEb;;;;GAIG;AAEH,IAAI,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC;AAExB,WAAW;AACX,OAAO,CAAC,IAAI,GAAG,WAAW,CAAC;AAE3B;;;;GAIG;AACH,OAAO,CAAC,IAAI,GAAG,UAAU,OAAO;IAC9B,OAAO,CAAC,OAAO,GAAG,OAAO,CAAC;AAC5B,CAAC,CAAC;AAEF;;;;;GAKG;AACH,OAAO,CAAC,KAAK,GAAG,UAAU,KAAK;IAC7B,IAAI,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC;IACpC,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,EAAE,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC1C,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,EAAE;YACd,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACf,SAAS;SACV;QACD,cAAc;QACd,IAAI,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE;YACtB,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACf,SAAS;SACV;QACD,SAAS;QACT,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,GAAG,GAAG,OAAO,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,EAAE;YAC7C,IAAI,GAAG,CAAC,CAAC,GAAG,KAAK,EAAE;gBACjB,GAAG,CAAC,IAAI,CAAC,EAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,GAAG,KAAK,CAAC,EAAC,CAAC,CAAC;aACpD;YACD,GAAG,CAAC,IAAI,CAAC,EAAC,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,GAAG,EAAC,CAAC,CAAC;YACpC,KAAK,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;SAC9B;QACD,IAAI,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAC1C,IAAI,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE;YAChD,GAAG,CAAC,IAAI,CAAC,EAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,EAAC,CAAC,CAAC;SAC5D;KACF;IACD,cAAc;IACd,OAAO,GAAG,CAAC;AACb,CAAC,CAAC;AAEF,oEAAoE;AACpE,SAAS;AACT,IAAI,QAAQ,GAAG,CAAC,SAAS,EAAE,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;AACzE,UAAU;AACV,IAAI,gBAAgB,GAAG,GAAG,CAAC;AAC3B,KAAK,IAAI,CAAC,IAAI,QAAQ,EAAE;IACtB,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,gBAAgB,EAAE;QACzC,gBAAgB,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;KACvC;CACF;AACD,eAAe;AACf,IAAI,QAAQ,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG;IACxI,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG;IAChI,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG;IAChD,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;AAChJ,IAAI,OAAO,GAAG,EAAE,CAAC;AACjB,KAAK,IAAI,CAAC,IAAI,QAAQ,EAAE;IACtB,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;CAC1B;AACD,oEAAoE;AAEpE;;;;;;GAMG;AACH,IAAI,QAAQ,GAAG,UAAU,IAAI,EAAE,GAAG;IAChC,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,GAAG,GAAG,CAAC,CAAC;IACxB,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,IAAI,CAAC,GAAG,KAAK,CAAC;IACd,OAAO,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE;QACxB,wBAAwB;QACxB,IAAI,CAAC,KAAK,KAAK,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,GAAG,gBAAgB,EAAE;YACvD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,EAAE,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;gBAC7C,IAAI,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE;oBACzC,CAAC,GAAG,GAAG,CAAC;oBACR,GAAG,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;oBACvB,MAAM;iBACP;aACF;SACF;aAAM,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,OAAO,CAAC,EAAE;YACxD,+BAA+B;YAC/B,GAAG,CAAC,IAAI,CAAC;gBACP,CAAC,EAAG,IAAI,CAAC,MAAM,CAAC,CAAC,EAAE,GAAG,GAAG,CAAC,CAAC;gBAC3B,CAAC,EAAG,CAAC;aACN,CAAC,CAAC;YACH,CAAC,GAAG,KAAK,CAAC;SACX;QACD,GAAG,EAAE,CAAC;KACP;IACD,SAAS;IACT,IAAI,CAAC,KAAK,KAAK,EAAE;QACf,GAAG,CAAC,IAAI,CAAC;YACP,CAAC,EAAG,IAAI,CAAC,MAAM,CAAC,CAAC,EAAE,GAAG,GAAG,CAAC,CAAC;YAC3B,CAAC,EAAG,CAAC;SACN,CAAC,CAAC;KACJ;IAED,OAAO,GAAG,CAAC;AACb,CAAC,CAAC","sourcesContent":["'use strict';\n\n/**\n * URL识别模块\n *\n * @author 老雷<leizongmin@gmail.com>\n */\n\nvar debug = console.log;\n\n/** 模块类型 */\nexports.type = 'tokenizer';\n\n/**\n * 模块初始化\n *\n * @param {Segment} segment 分词接口\n */\nexports.init = function (segment) {\n exports.segment = segment;\n};\n\n/**\n * 对未识别的单词进行分词\n *\n * @param {array} words 单词数组\n * @return {array}\n */\nexports.split = function (words) {\n var POSTAG = exports.segment.POSTAG;\n var ret = [];\n for (var i = 0, word; word = words[i]; i++) {\n if (word.p > 0) {\n ret.push(word);\n continue;\n }\n // 仅对未识别的词进行匹配\n var urlinfo = matchURL(word.w);\n if (urlinfo.length < 1) {\n ret.push(word);\n continue;\n }\n // 分离出URL\n var lastc = 0;\n for (var ui = 0, url; url = urlinfo[ui]; ui++) {\n if (url.c > lastc) {\n ret.push({w: word.w.substr(lastc, url.c - lastc)});\n }\n ret.push({w: url.w, p: POSTAG.URL});\n lastc = url.c + url.w.length;\n }\n var lasturl = urlinfo[urlinfo.length - 1];\n if (lasturl.c + lasturl.w.length < word.w.length) {\n ret.push({w: word.w.substr(lasturl.c + lasturl.w.length)});\n }\n }\n // debug(ret);\n return ret;\n};\n\n// =================================================================\n// 协议URL头\nvar PROTOTAL = ['http://', 'https://', 'ftp://', 'news://', 'telnet://'];\n// 协议头最小长度\nvar MIN_PROTOTAL_LEN = 100;\nfor (var i in PROTOTAL) {\n if (PROTOTAL[i].length < MIN_PROTOTAL_LEN) {\n MIN_PROTOTAL_LEN = PROTOTAL[i].length;\n }\n}\n// 允许出现在URL中的字符\nvar _URLCHAR = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',\n 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',\n '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',\n '!', '#', '$', '%', '&', '‘', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '=', '?', '@', '[', '\\\\', ']', '^', '_', '`', '|', '~'];\nvar URLCHAR = {};\nfor (var i in _URLCHAR) {\n URLCHAR[_URLCHAR[i]] = 1;\n}\n// =================================================================\n\n/**\n * 匹配包含的网址,返回相关信息\n *\n * @param {string} text 文本\n * @param {int} cur 开始位置\n * @return {array} 返回格式 {w: '网址', c: 开始位置}\n */\nvar matchURL = function (text, cur) {\n if (isNaN(cur)) cur = 0;\n var ret = [];\n var s = false;\n while (cur < text.length) {\n // 判断是否为 http:// 之类的文本开头\n if (s === false && cur < text.length - MIN_PROTOTAL_LEN) {\n for (var i = 0, prot; prot = PROTOTAL[i]; i++) {\n if (text.substr(cur, prot.length) == prot) {\n s = cur;\n cur += prot.length - 1;\n break;\n }\n }\n } else if (s !== false && !(text.charAt(cur) in URLCHAR)) {\n // 如果以http://之类开头,遇到了非URL字符,则结束\n ret.push({\n w: text.substr(s, cur - s),\n c: s\n });\n s = false;\n }\n cur++;\n }\n // 检查剩余部分\n if (s !== false) {\n ret.push({\n w: text.substr(s, cur - s),\n c: s\n });\n }\n\n return ret;\n};\n// debug(matchURL('http://www.baidu.com哈啊http://哇fdgggghttp://baidu.com/ss/'));\n"]}
\No newline at end of file