// Retrieved from UNPKG (3.6 kB, JavaScript).
'use strict';
/**
 * URL recognition module.
 *
 * @author 老雷<leizongmin@gmail.com>
 */
// Debug output helper; only referenced from debugging code.
var debug = console.log;
/** Module type: this module registers itself as a tokenizer. */
exports.type = 'tokenizer';
10/**
11 * 模块初始化
12 *
13 * @param {Segment} segment 分词接口
14 */
15exports.init = function (segment) {
16 exports.segment = segment;
17};
18/**
19 * 对未识别的单词进行分词
20 *
21 * @param {array} words 单词数组
22 * @return {array}
23 */
24exports.split = function (words) {
25 var POSTAG = exports.segment.POSTAG;
26 var ret = [];
27 for (var i = 0, word; word = words[i]; i++) {
28 if (word.p > 0) {
29 ret.push(word);
30 continue;
31 }
32 // 仅对未识别的词进行匹配
33 var urlinfo = matchURL(word.w);
34 if (urlinfo.length < 1) {
35 ret.push(word);
36 continue;
37 }
38 // 分离出URL
39 var lastc = 0;
40 for (var ui = 0, url; url = urlinfo[ui]; ui++) {
41 if (url.c > lastc) {
42 ret.push({ w: word.w.substr(lastc, url.c - lastc) });
43 }
44 ret.push({ w: url.w, p: POSTAG.URL });
45 lastc = url.c + url.w.length;
46 }
47 var lasturl = urlinfo[urlinfo.length - 1];
48 if (lasturl.c + lasturl.w.length < word.w.length) {
49 ret.push({ w: word.w.substr(lasturl.c + lasturl.w.length) });
50 }
51 }
52 // debug(ret);
53 return ret;
54};
// =================================================================
// URL scheme prefixes that mark the start of a URL.
var PROTOTAL = ['http://', 'https://', 'ftp://', 'news://', 'telnet://'];
// Length of the shortest scheme prefix.
var MIN_PROTOTAL_LEN = Math.min.apply(null, PROTOTAL.map(function (prot) {
  return prot.length;
}));
// Characters allowed to appear inside a URL: ASCII letters, digits and
// punctuation. The typographic quote (U+2018) is kept for backward
// compatibility; the ASCII apostrophe it apparently stood in for is included
// as well.
var _URLCHAR = ('abcdefghijklmnopqrstuvwxyz' +
  'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
  '0123456789' +
  '!#$%&\'‘()*+,-./:;=?@[\\]^_`|~').split('');
// Lookup table: character -> 1. Created without a prototype so that the `in`
// test only ever sees the characters listed above.
var URLCHAR = Object.create(null);
for (var ci = 0; ci < _URLCHAR.length; ci++) {
  URLCHAR[_URLCHAR[ci]] = 1;
}
// =================================================================
/**
 * Finds the URLs contained in a text and returns information about them.
 *
 * A URL starts at one of the prefixes in `PROTOTAL` and extends until the
 * first character that is not in `URLCHAR`, or the end of the text.
 *
 * @param {string} text text to scan
 * @param {int} cur start position; missing, NaN or negative values mean 0
 * @return {array} list of {w: 'url text', c: start offset in text}
 */
var matchURL = function (text, cur) {
  // Normalize the start offset so that it is always a number; a null or
  // string offset would otherwise leak into the returned `c` values.
  var pos = Number(cur);
  if (isNaN(pos) || pos < 0) {
    pos = 0;
  }
  var ret = [];
  // Start offset of the URL currently being read, or false when outside a URL.
  var start = false;
  while (pos < text.length) {
    if (start === false && pos < text.length - MIN_PROTOTAL_LEN) {
      // Outside a URL: check whether a scheme prefix such as http:// begins here.
      for (var i = 0; i < PROTOTAL.length; i++) {
        var prot = PROTOTAL[i];
        if (text.slice(pos, pos + prot.length) === prot) {
          start = pos;
          // Jump to the last character of the prefix; the increment at the
          // bottom of the loop then moves past it.
          pos += prot.length - 1;
          break;
        }
      }
    } else if (start !== false && !(text.charAt(pos) in URLCHAR)) {
      // Inside a URL and a non-URL character was reached: the URL ends here.
      ret.push({
        w: text.substring(start, pos),
        c: start
      });
      start = false;
    }
    pos++;
  }
  // A URL that runs to the end of the text.
  if (start !== false) {
    ret.push({
      w: text.substring(start, pos),
      c: start
    });
  }
  return ret;
};
//# sourceMappingURL=URLTokenizer.js.map