1 | 'use strict';
|
2 | Object.defineProperty(exports, "__esModule", { value: true });
|
3 | exports.init = exports.ChsNameTokenizer = void 0;
|
4 |
|
5 |
|
6 |
|
7 |
|
8 |
|
9 | const CHS_NAMES_1 = require("../mod/CHS_NAMES");
|
10 | const mod_1 = require("../mod");
|
/**
 * Tokenizer pass that searches words which have no part-of-speech tag yet for
 * Chinese personal names and splits each detected name out as its own token.
 *
 * Detection is purely table driven. It uses the FAMILY_NAME_1, FAMILY_NAME_2,
 * SINGLE_NAME, DOUBLE_NAME_1 and DOUBLE_NAME_2 lookup objects exported by
 * "../mod/CHS_NAMES"; membership is tested with the `in` operator.
 */
class ChsNameTokenizer extends mod_1.SubSModuleTokenizer {
    /**
     * Passes every constructor argument through to the base class unchanged,
     * then sets the module name.
     */
    constructor(...ctorArgs) {
        super(...ctorArgs);
        this.name = 'ChsNameTokenizer';
    }

    /**
     * Runs the base-class caching step, then stores two values taken from
     * `this.segment` on the instance: the 'TABLE' dictionary and the POSTAG
     * constants.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        this._POSTAG = this.segment.POSTAG;
    }

    /**
     * Splits personal names out of the words that are still untagged.
     *
     * Words whose `p` is truthy, and words in which `matchName` finds nothing,
     * are copied to the result as they are. Every other word is replaced by a
     * sequence of new tokens: the text before/between/after the names (without
     * a `p`), and one token per name tagged with `POSTAG.A_NR`.
     *
     * @param {Array<{w: string, p?: number}>} words - tokens to process
     * @returns {Array} the new token list
     */
    split(words) {
        const tags = this._POSTAG;
        const output = [];
        let index = 0;
        let word;
        // The scan ends at the first falsy entry rather than at words.length.
        while ((word = words[index++])) {
            if (word.p) {
                // Already tagged: keep as is.
                output.push(word);
                continue;
            }
            const hits = this.matchName(word.w);
            if (hits.length < 1) {
                output.push(word);
                continue;
            }
            // Offset in word.w up to which text has already been emitted.
            let consumed = 0;
            for (let k = 0; hits[k]; k++) {
                const hit = hits[k];
                if (hit.c > consumed) {
                    // Plain text sitting in front of this name.
                    const gap = { w: word.w.substr(consumed, hit.c - consumed) };
                    output.push(this.debugToken(gap, { [this.name]: false }, true));
                }
                const person = { w: hit.w, p: tags.A_NR };
                output.push(this.debugToken(person, { [this.name]: true }, true));
                consumed = hit.c + hit.w.length;
            }
            // Whatever follows the last name is emitted as one more plain token.
            const last = hits[hits.length - 1];
            const tailStart = last.c + last.w.length;
            if (tailStart < word.w.length) {
                const tail = { w: word.w.substr(tailStart) };
                output.push(this.debugToken(tail, { [this.name]: false }, true));
            }
        }
        return output;
    }

    /**
     * Scans `text` from left to right and collects every personal name found.
     *
     * At each position a two-character family name is tried first, then a
     * one-character family name. The given name that must follow is either a
     * pair (first character in DOUBLE_NAME_1, second in DOUBLE_NAME_2) or one
     * character in SINGLE_NAME, which is taken twice when the next character
     * repeats it. After a match the scan resumes right behind the name;
     * otherwise it advances by one character.
     *
     * @param {string} text - text to scan
     * @param {number} [cur=0] - start offset; a value for which isNaN() is true counts as 0
     * @returns {Array<{w: string, c: number}>} matched names (`w`) with their start offsets (`c`)
     */
    matchName(text, cur = 0) {
        let pos = isNaN(cur) ? 0 : cur;
        const found = [];

        // Builds the full name when a given name starts at `at`, else yields null.
        const completeName = (family, at) => {
            const first = text.charAt(at);
            const second = text.charAt(at + 1);
            if (first in CHS_NAMES_1.DOUBLE_NAME_1 && second in CHS_NAMES_1.DOUBLE_NAME_2) {
                return family + first + second;
            }
            if (first in CHS_NAMES_1.SINGLE_NAME) {
                // A repeated character counts as part of the same given name.
                return family + first + (first === second ? second : '');
            }
            return null;
        };

        while (pos < text.length) {
            let name = null;

            // Two-character family names take precedence.
            const familyPair = text.substr(pos, 2);
            if (familyPair in CHS_NAMES_1.FAMILY_NAME_2) {
                name = completeName(familyPair, pos + 2);
            }

            // Fall back to a one-character family name.
            const familyChar = text.charAt(pos);
            if (name === null && familyChar in CHS_NAMES_1.FAMILY_NAME_1) {
                name = completeName(familyChar, pos + 1);
            }

            if (name === null) {
                pos++;
                continue;
            }
            found.push({ w: name, c: pos });
            pos += name.length;
        }
        return found;
    }
}
|
// Named export of the tokenizer class.
exports.ChsNameTokenizer = ChsNameTokenizer;

// `init` is looked up on the class (it is not defined in the class body here)
// and bound to it, so the exported function can be called without a receiver.
const boundInit = ChsNameTokenizer.init.bind(ChsNameTokenizer);
exports.init = boundInit;

// The class is also the default export.
exports.default = ChsNameTokenizer;
|
126 |
|
\ | No newline at end of file |