1 | 'use strict';
|
2 |
|
3 |
|
4 |
|
5 |
|
6 |
|
// Logging helper. It is bound to `console` so the reference stays callable
// in runtimes where a detached `console.log` throws when invoked without
// its receiver.
var debug = console.log.bind(console);

// Kind identifier exported by this module.
// NOTE(review): presumably read by the module loader to decide how this
// module is registered; confirm against the caller.
exports.type = 'tokenizer';
|
10 |
|
11 |
|
12 |
|
13 |
|
14 |
|
15 | exports.init = function (segment) {
|
16 | exports.segment = segment;
|
17 | };
|
18 |
|
19 |
|
20 |
|
21 |
|
22 |
|
23 |
|
24 | exports.split = function (words) {
|
25 | var POSTAG = exports.segment.POSTAG;
|
26 | var ret = [];
|
27 | for (var i = 0, word; word = words[i]; i++) {
|
28 | if (word.p > 0) {
|
29 | ret.push(word);
|
30 | continue;
|
31 | }
|
32 |
|
33 | var urlinfo = matchURL(word.w);
|
34 | if (urlinfo.length < 1) {
|
35 | ret.push(word);
|
36 | continue;
|
37 | }
|
38 |
|
39 | var lastc = 0;
|
40 | for (var ui = 0, url; url = urlinfo[ui]; ui++) {
|
41 | if (url.c > lastc) {
|
42 | ret.push({ w: word.w.substr(lastc, url.c - lastc) });
|
43 | }
|
44 | ret.push({ w: url.w, p: POSTAG.URL });
|
45 | lastc = url.c + url.w.length;
|
46 | }
|
47 | var lasturl = urlinfo[urlinfo.length - 1];
|
48 | if (lasturl.c + lasturl.w.length < word.w.length) {
|
49 | ret.push({ w: word.w.substr(lasturl.c + lasturl.w.length) });
|
50 | }
|
51 | }
|
52 |
|
53 | return ret;
|
54 | };
|
55 |
|
56 |
|
// Protocol prefixes that mark the beginning of a URL.
var PROTOTAL = ['http://', 'https://', 'ftp://', 'news://', 'telnet://'];

// Length of the shortest prefix above. It is derived from the list itself,
// so it stays correct when prefixes are added or removed.
var MIN_PROTOTAL_LEN = Math.min.apply(null, PROTOTAL.map(function (prot) {
  return prot.length;
}));
|
65 |
|
// Characters that may appear inside a URL, as an array of one-character
// strings: ASCII letters, digits and punctuation. The punctuation run
// includes the ASCII apostrophe (U+0027), not the typographic quote U+2018.
var _URLCHAR = (
  'abcdefghijklmnopqrstuvwxyz' +
  'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
  '0123456789' +
  '!#$%&\'()*+,-./:;=?@[\\]^_`|~'
).split('');

// Lookup table keyed by character, used for membership tests (`ch in URLCHAR`).
var URLCHAR = {};
_URLCHAR.forEach(function (ch) {
  URLCHAR[ch] = 1;
});
|
74 |
|
75 |
|
76 |
|
77 |
|
78 |
|
79 |
|
80 |
|
81 |
|
/**
 * Finds every URL in `text` that begins with one of the PROTOTAL prefixes.
 *
 * A URL starts at a prefix and extends until the first character that is
 * not a key of URLCHAR, or until the end of the text. A prefix is only
 * looked for while more than MIN_PROTOTAL_LEN characters remain.
 *
 * @param {string} text - text to scan
 * @param {number} [cur=0] - index at which scanning starts; values that are
 *   not a positive number (undefined, NaN, negatives, ...) are treated as 0
 *   and numeric strings are converted to numbers
 * @returns {Array<{w: string, c: number}>} matches in order of appearance,
 *   where `w` is the URL text and `c` its start index in `text`
 */
var matchURL = function (text, cur) {
  // Normalise the offset once. Without this a numeric string would pass the
  // NaN check and later turn `pos += n` into string concatenation.
  var pos = Number(cur);
  if (!(pos > 0)) {
    pos = 0;
  }
  pos = Math.floor(pos);

  var length = text.length;
  var found = [];
  // Start index of the URL currently being collected, or -1 when outside one.
  var start = -1;

  while (pos < length) {
    if (start < 0) {
      if (pos < length - MIN_PROTOTAL_LEN) {
        for (var i = 0; i < PROTOTAL.length; i++) {
          var prot = PROTOTAL[i];
          if (text.slice(pos, pos + prot.length) === prot) {
            start = pos;
            // Jump to the last character of the prefix; the increment at the
            // bottom of the loop then moves past it.
            pos += prot.length - 1;
            break;
          }
        }
      }
    } else if (!(text.charAt(pos) in URLCHAR)) {
      // First non-URL character: the URL ended just before it.
      found.push({ w: text.slice(start, pos), c: start });
      start = -1;
    }
    pos++;
  }

  // A URL that runs to the end of the text is still open here.
  if (start >= 0) {
    found.push({ w: text.slice(start, pos), c: start });
  }
  return found;
};
|
117 |
|
\ | No newline at end of file |