UNPKG

14.9 kBJavaScriptView Raw
1// Generated by CoffeeScript 1.9.1
2(function() {
3 var CND, alert, badge, binary_interval_search, character_sets_and_ranges, debug, dec, decG, echo, help, hex, hexG, info, log, name, nameO, nameOG, rpr, urge, warn, whisper;
4
/* Helper library; supplies `rpr`, `echo`, the loggers and the type checks used throughout this module. */
CND = require('cnd');

/* Label handed to every logger obtained below. */
badge = 'coffeenode-chr';

/* Bound convenience helpers. */
rpr = CND.rpr.bind(CND);
echo = CND.echo.bind(CND);

/* One logger per CND logger kind, all created with the same badge. */
alert = CND.get_logger('alert', badge);
debug = CND.get_logger('debug', badge);
help = CND.get_logger('help', badge);
info = CND.get_logger('info', badge);
log = CND.get_logger('plain', badge);
urge = CND.get_logger('urge', badge);
warn = CND.get_logger('warn', badge);
whisper = CND.get_logger('whisper', badge);

/* Range tables: looked up by CSG (used by `_as_rsg`, `_as_range_name`, `validate_is_csg`) and by RSG
(used by `cid_range_from_rsg`). */
character_sets_and_ranges = require('./character-sets-and-ranges');
this._names_and_ranges_by_csg = character_sets_and_ranges['names-and-ranges-by-csg'];
this._ranges_by_rsg = character_sets_and_ranges['ranges-by-rsg'];

/* Interval lookup helper used by `_as_rsg` and `_as_range_name`. */
binary_interval_search = require('./binary-interval-search');
36
this.chrs_from_text = function(text, options) {

  /* Split `text` into a list of its constituent characters. `options[ 'input' ]` selects the splitter:
  `plain` (default), `ncr`, or `xncr`; any other value raises an error. An empty text yields an empty
  list without looking at the options.
  */
  var requested_mode, mode, splitter, pieces;
  if (text.length === 0) {
    return [];
  }
  requested_mode = options != null ? options['input'] : void 0;
  mode = requested_mode != null ? requested_mode : 'plain';
  if (mode === 'plain') {
    splitter = this._plain_splitter;
  } else if (mode === 'ncr') {
    splitter = this._ncr_splitter;
  } else if (mode === 'xncr') {
    splitter = this._xncr_splitter;
  } else {
    throw new Error("unknown input mode: " + (rpr(mode)));
  }

  /* The splitters capture what they match, so `split` keeps the matched characters in its result,
  interleaved with empty strings that are dropped here. */
  pieces = text.split(splitter);
  return pieces.filter(function(piece) {
    return piece.length !== 0;
  });
};
59
this._new_chunk = function(csg, rsg, chrs) {

  /* Build a `CHR/chunk` record from a CSG, an RSG and a list of character texts; the texts are
  concatenated into the `text` member. */
  return {
    '~isa': 'CHR/chunk',
    'csg': csg,
    'rsg': rsg,
    'text': chrs.join('')
  };
};
70
this.chunks_from_text = function(text, options) {

  /* Given a `text` and `options`, return a list of `CHR/chunk` objects (as returned by
  `CHR._new_chunk`), one per stretch of consecutive characters that share the same RSG.

  * `options` is passed on unchanged to `CHR.chrs_from_text` and `CHR.analyze`.

  * `options[ 'output' ]` may be `plain` (default; characters are copied as-is) or `html` (the
  characters `&`, `<` and `>` are replaced by `&amp;`, `&lt;` and `&gt;`). Any other value raises an
  error.

  * An empty `text` yields an empty list before the output mode is checked.
  */
  var R, chr, chrs, csg, description, i, last_csg, last_rsg, len, output_mode, ref, ref1, rsg, transform_output;
  R = [];
  if (text.length === 0) {
    return R;
  }
  last_csg = 'u';
  last_rsg = null;
  chrs = [];
  output_mode = (ref = options != null ? options['output'] : void 0) != null ? ref : 'plain';
  switch (output_mode) {
    case 'plain':
      transform_output = function(chr) {
        return chr;
      };
      break;
    case 'html':
      transform_output = function(chr) {

        /* Only single-character texts can equal one of these cases, so character references such as
        `&#x4e00;` pass through unescaped. */
        switch (chr) {
          case '&':
            return '&amp;';
          case '<':
            return '&lt;';
          case '>':
            return '&gt;';
          default:
            return chr;
        }
      };
      break;
    default:
      throw new Error("unknown output mode: " + (rpr(output_mode)));
  }
  ref1 = this.chrs_from_text(text, options);
  for (i = 0, len = ref1.length; i < len; i++) {
    description = this.analyze(ref1[i], options);
    csg = description.csg;
    rsg = description.rsg;

    /* Unicode characters are emitted literally, all others as NCRs. */
    chr = description[csg === 'u' ? 'chr' : 'ncr'];

    /* A change of RSG closes the current chunk (if it has content) and opens a new one. */
    if (rsg !== last_rsg) {
      if (chrs.length > 0) {
        R.push(this._new_chunk(last_csg, last_rsg, chrs));
      }
      last_csg = csg;
      last_rsg = rsg;
      chrs = [];
    }
    chrs.push(transform_output(chr));
  }

  /* Flush the last, still open chunk. */
  if (chrs.length > 0) {
    R.push(this._new_chunk(last_csg, last_rsg, chrs));
  }
  return R;
};
129
this.html_from_text = function(text, options) {

  /* Render `text` as a sequence of `<span>` elements, one per chunk returned by
  `CHR.chunks_from_text` in `html` output mode; the chunk's RSG becomes the `class` attribute. Of
  `options`, only `input` (default `plain`) is forwarded. */
  var requested_input, chunk_options;
  requested_input = options != null ? options['input'] : void 0;
  chunk_options = {
    input: requested_input != null ? requested_input : 'plain',
    output: 'html'
  };
  return this.chunks_from_text(text, chunk_options).map(function(chunk) {
    return "<span class=\"" + chunk['rsg'] + "\">" + chunk['text'] + "</span>";
  }).join('');
};
144
this.cid_from_chr = function(chr, options) {

  /* Return the CID of the first character of `chr`; `options[ 'input' ]` defaults to `plain`. */
  var requested_input, chr_csg_cid;
  requested_input = options != null ? options['input'] : void 0;
  chr_csg_cid = this._chr_csg_cid_from_chr(chr, requested_input != null ? requested_input : 'plain');
  return chr_csg_cid[2];
};
150
this.csg_cid_from_chr = function(chr, options) {

  /* Return `[ csg, cid ]` for the first character of `chr`; `options[ 'input' ]` defaults to `plain`. */
  var requested_input, chr_csg_cid;
  requested_input = options != null ? options['input'] : void 0;
  chr_csg_cid = this._chr_csg_cid_from_chr(chr, requested_input != null ? requested_input : 'plain');

  /* Drop the leading character text, keep CSG and CID. */
  return chr_csg_cid.slice(1);
};
156
this._chr_csg_cid_from_chr = function(chr, input_mode) {

  /* Given a text with one or more characters, return a list with the first character, its CSG, and
  its CID (as a non-negative integer). An input mode may be given as either `plain` (the default),
  `ncr`, or `xncr`. Raises an error for an empty text, an unknown input mode, or a text whose start
  is not matched by the matcher for the chosen mode.
  */
  var matcher, match, first_chr, high, low, csg, hex_digits, dec_digits, cid;
  if (chr.length === 0) {
    throw new Error("unable to obtain CID from empty string");
  }
  if (input_mode == null) {
    input_mode = 'plain';
  }
  if (input_mode === 'plain') {
    matcher = this._first_chr_matcher_plain;
  } else if (input_mode === 'ncr') {
    matcher = this._first_chr_matcher_ncr;
  } else if (input_mode === 'xncr') {
    matcher = this._first_chr_matcher_xncr;
  } else {
    throw new Error("unknown input mode: " + (rpr(input_mode)));
  }
  match = chr.match(matcher);
  if (match == null) {
    throw new Error("illegal character sequence in " + (rpr(chr)));
  }
  first_chr = match[0];

  /* A single code unit is its own CID. */
  if (first_chr.length === 1) {
    return [first_chr, 'u', first_chr.charCodeAt(0)];
  }

  /* Two code units: combine the surrogate pair into one codepoint. */
  if (first_chr.length === 2) {

    /* thx to http://perldoc.perl.org/Encode/Unicode.html */
    high = first_chr.charCodeAt(0);
    low = first_chr.charCodeAt(1);
    return [first_chr, 'u', (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000];
  }

  /* Anything longer is a character reference; the matcher's groups hold the CSG and the hexadecimal
  or decimal digits. */
  csg = match[1];
  hex_digits = match[2];
  dec_digits = match[3];
  if (hex_digits != null) {
    cid = parseInt(hex_digits, 16);
  } else {
    cid = parseInt(dec_digits, 10);
  }
  if (csg.length === 0) {
    csg = 'u';
  }
  return [first_chr, csg, cid];
};
206
this.as_csg = function(cid_hint, O) {

  /* Return the CSG that `_csg_cid_from_hint` derives from the CID hint and options. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return csg_and_cid[0];
};
210
this.as_cid = function(cid_hint, O) {

  /* Return the CID that `_csg_cid_from_hint` derives from the CID hint and options. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return csg_and_cid[1];
};
214
this.as_chr = function(cid_hint, O) {

  /* Resolve the CID hint to CSG and CID, then delegate to `_as_chr`. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return this._as_chr(csg_and_cid[0], csg_and_cid[1]);
};
218
this.as_uchr = function(cid_hint, O) {

  /* Resolve the CID hint to CSG and CID, then delegate to `_as_uchr`. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return this._as_uchr(csg_and_cid[0], csg_and_cid[1]);
};
222
this.as_fncr = function(cid_hint, O) {

  /* Resolve the CID hint to CSG and CID, then delegate to `_as_fncr`. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return this._as_fncr(csg_and_cid[0], csg_and_cid[1]);
};
226
this.as_sfncr = function(cid_hint, O) {

  /* Resolve the CID hint to CSG and CID, then delegate to `_as_sfncr`. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return this._as_sfncr(csg_and_cid[0], csg_and_cid[1]);
};
230
this.as_xncr = function(cid_hint, O) {

  /* Resolve the CID hint to CSG and CID, then delegate to `_as_xncr`. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return this._as_xncr(csg_and_cid[0], csg_and_cid[1]);
};
234
this.as_ncr = function(cid_hint, O) {

  /* Return the plain NCR (`&#x...;`, no CSG sigil) for the CID derived from the CID hint and options.
  The hint is still resolved and validated through `_csg_cid_from_hint`, but the resulting CSG is not
  written into the reference: rendering with `u` keeps the result in agreement with the `ncr` member
  produced by `_analyze` and with the shape accepted by `_ncr_matcher`. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return this._as_xncr('u', csg_and_cid[1]);
};
238
this.as_rsg = function(cid_hint, O) {

  /* Resolve the CID hint to CSG and CID, then delegate to `_as_rsg`. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return this._as_rsg(csg_and_cid[0], csg_and_cid[1]);
};
242
this.as_range_name = function(cid_hint, O) {

  /* Resolve the CID hint to CSG and CID, then delegate to `_as_range_name`. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return this._as_range_name(csg_and_cid[0], csg_and_cid[1]);
};
246
this.analyze = function(cid_hint, O) {

  /* Resolve the CID hint to CSG and CID, then return the `CHR/info` description built by `_analyze`. */
  var csg_and_cid = this._csg_cid_from_hint(cid_hint, O);
  return this._analyze(csg_and_cid[0], csg_and_cid[1]);
};
250
this._analyze = function(csg, cid) {

  /* Return a `CHR/info` object describing the character identified by `csg` and `cid`. For the `u`
  CSG, `chr` is the literal Unicode character; for any other CSG, `chr` is the XNCR. `ncr` never
  carries a CSG sigil, `xncr` carries it unless the CSG is `u`. */
  var is_unicode, uchr, xncr, ncr;
  is_unicode = csg === 'u';
  uchr = this._unicode_chr_from_cid(cid);
  xncr = this._as_xncr(csg, cid);

  /* `_as_xncr` omits the sigil for `u`, so this equals `xncr` in the Unicode case. */
  ncr = this._as_xncr('u', cid);
  return {
    '~isa': 'CHR/info',
    'chr': is_unicode ? uchr : xncr,
    'uchr': uchr,
    'csg': csg,
    'cid': cid,
    'fncr': this._as_fncr(csg, cid),
    'sfncr': this._as_sfncr(csg, cid),
    'ncr': ncr,
    'xncr': xncr,
    'rsg': this._as_rsg(csg, cid)
  };
};
275
this._as_chr = function(csg, cid) {

  /* Return the character text: the literal character for the `u` CSG, otherwise the `chr` member of
  the `_analyze` result. */
  return csg === 'u' ? this._unicode_chr_from_cid(cid) : (this._analyze(csg, cid))['chr'];
};
282
this._as_uchr = function(csg, cid) {

  /* Return the Unicode character with the given CID; `csg` is accepted but not used. */
  var uchr = this._unicode_chr_from_cid(cid);
  return uchr;
};
286
this._unicode_chr_from_cid = function(cid) {

  /* Return the string for codepoint `cid`: a single code unit up to 0xffff, a surrogate pair above. */
  var offset, high, low;
  if (cid <= 0xffff) {
    return String.fromCharCode(cid);
  }

  /* thx to http://perldoc.perl.org/Encode/Unicode.html */
  offset = cid - 0x10000;
  high = 0xD800 + Math.floor(offset / 0x400);
  low = 0xDC00 + offset % 0x400;
  return String.fromCharCode(high, low);
};
298
this._as_fncr = function(csg, cid) {

  /* Return `<rsg>-<hex cid>`; when no RSG is found for the CID, the CSG is used as prefix instead. */
  var prefix = this._as_rsg(csg, cid);
  if (prefix == null) {
    prefix = csg;
  }
  return prefix + "-" + cid.toString(16);
};
304
this._as_sfncr = function(csg, cid) {

  /* Return `<csg>-<hex cid>`. */
  var hex_cid = cid.toString(16);
  return csg + "-" + hex_cid;
};
308
this._as_xncr = function(csg, cid) {

  /* Return `&<csg>#x<hex cid>;`; the sigil is left out when `csg` is `u`, `null` or `undefined`. */
  var sigil = (csg === 'u' || csg == null) ? '' : csg;
  return "&" + sigil + "#x" + cid.toString(16) + ";";
};
315
this._as_rsg = function(csg, cid) {

  /* Look up the `rsg` member of the range entry of `csg` whose `first-cid`..`last-cid` interval
  contains `cid`. */
  var ranges = this._names_and_ranges_by_csg[csg];
  return binary_interval_search(ranges, 'first-cid', 'last-cid', 'rsg', cid);
};
319
this._as_range_name = function(csg, cid) {

  /* Look up the `range-name` member of the range entry of `csg` whose `first-cid`..`last-cid`
  interval contains `cid`. */
  var ranges = this._names_and_ranges_by_csg[csg];
  return binary_interval_search(ranges, 'first-cid', 'last-cid', 'range-name', cid);
};
323
this._csg_cid_from_hint = function(cid_hint, options) {

  /* Derive the CSG and CID from arguments as accepted by the `as_*` family of methods (`CHR.as_fncr`,
  `CHR.as_rsg` and so on). The result is a list `[ csg, cid ]` that can be applied directly to the
  private namesake method (`CHR._as_fncr`, `CHR._as_rsg` and so on). Rules for the arguments:

  * There may be one or two arguments; the first is the 'CID hint', the second the 'options'.

  * The CID hint may be a number or a text. A number is taken to be the CID; a text is interpreted
    according to `options[ 'input' ]`.

  * Options must be a POD (or `null` / `undefined`) with the optional members `input` and `csg`.

  * `options[ 'input' ]` is *only* observed if the CID hint is a text; it governs which kinds of
    character references are recognized in the text. It may be one of `plain`, `ncr`, or `xncr` and
    defaults to `plain` (no character references will be recognized).

  * `options[ 'csg' ]` sets the character set sigil. When set, it overrides whatever CSG
    `CHR.csg_cid_from_chr` found—in other words, `CHR.as_sfncr '&jzr#xe100', input: 'xncr', csg: 'u'`
    gives `u-e100`, the numerically equivalent codepoint from the `u` (Unicode) character set.

  * CSG and CID are validated before they are returned.
  */
  var options_type, hint_type, csg_from_options, csg_from_hint, input_mode, csg_and_cid, csg, cid;
  options_type = CND.type_of(options);
  if (options_type === 'null' || options_type === 'jsundefined') {
    csg_from_options = null;
    input_mode = null;
  } else if (options_type === 'pod') {
    csg_from_options = options['csg'];
    input_mode = options['input'];
  } else {
    throw new Error("expected a POD as second argument, got a " + options_type);
  }
  hint_type = CND.type_of(cid_hint);
  if (hint_type === 'number') {
    csg_from_hint = null;
    cid = cid_hint;
  } else if (hint_type === 'text') {
    csg_and_cid = this.csg_cid_from_chr(cid_hint, {
      input: input_mode
    });
    csg_from_hint = csg_and_cid[0];
    cid = csg_and_cid[1];
  } else {
    throw new Error("expected a text or a number as first argument, got a " + hint_type);
  }

  /* Precedence: explicit option, then the CSG found in the hint, then Unicode. */
  csg = csg_from_options != null ? csg_from_options : (csg_from_hint != null ? csg_from_hint : 'u');
  this.validate_is_csg(csg);
  this.validate_is_cid(cid);
  return [csg, cid];
};
388
/* Pattern fragments; the variants ending in `G` capture, those ending in `O` also match the empty
string: */
name = '(?:[a-z][a-z0-9]*)';
nameO = '(?:(?:[a-z][a-z0-9]*)|)';
nameOG = '((?:[a-z][a-z0-9]*)|)';
hex = '(?:x[a-fA-F0-9]+)';
hexG = '(?:x([a-fA-F0-9]+))';
dec = '(?:[0-9]+)';
decG = '(?:([0-9]+))';

/* Matcher for a complete CSG: */
this._csg_matcher = new RegExp('^' + name + '$');

/* Matchers for NCRs and XNCRs, without capturing groups: */
this._ncr_matcher = new RegExp('(?:&\\#(?:' + [hex, dec].join('|') + ');)');
this._xncr_matcher = new RegExp('(?:&' + nameO + '\\#(?:' + [hex, dec].join('|') + ');)');

/* The same with groups for CSG, hexadecimal digits and decimal digits; the NCR variant has an empty
first group so that both yield the same group positions: */
this._ncr_csg_cid_matcher = new RegExp('(?:&()\\#(?:' + [hexG, decG].join('|') + ');)');
this._xncr_csg_cid_matcher = new RegExp('(?:&' + nameOG + '\\#(?:' + [hexG, decG].join('|') + ');)');

/* Matchers for surrogate sequences and non-surrogate, 'ordinary' characters: */
this._surrogate_matcher = /(?:[\ud800-\udbff][\udc00-\udfff])/;
this._nonsurrogate_matcher = /[^\ud800-\udbff\udc00-\udfff]/;

/* Matchers for the first character of a string, in three modes (`plain`, `ncr`, `xncr`): */
this._first_chr_matcher_plain = new RegExp('^(?:' + [
  this._surrogate_matcher.source,
  this._nonsurrogate_matcher.source
].join('|') + ')');
this._first_chr_matcher_ncr = new RegExp('^(?:' + [
  this._surrogate_matcher.source,
  this._ncr_csg_cid_matcher.source,
  this._nonsurrogate_matcher.source
].join('|') + ')');
this._first_chr_matcher_xncr = new RegExp('^(?:' + [
  this._surrogate_matcher.source,
  this._xncr_csg_cid_matcher.source,
  this._nonsurrogate_matcher.source
].join('|') + ')');

/* Splitters for the same three modes; each has a single capturing group around the whole match: */
this._plain_splitter = new RegExp('(' + [
  this._surrogate_matcher.source,
  this._nonsurrogate_matcher.source
].join('|') + ')');
this._ncr_splitter = new RegExp('(' + [
  this._ncr_matcher.source,
  this._surrogate_matcher.source,
  this._nonsurrogate_matcher.source
].join('|') + ')');
this._xncr_splitter = new RegExp('(' + [
  this._xncr_matcher.source,
  this._surrogate_matcher.source,
  this._nonsurrogate_matcher.source
].join('|') + ')');
434
this.cid_range_from_rsg = function(rsg) {

  /* Return the entry registered for `rsg` in `_ranges_by_rsg`; raise an error when there is none. */
  var range = this._ranges_by_rsg[rsg];
  if (range == null) {
    throw new Error("unknown RSG: " + (rpr(rsg)));
  }
  return range;
};
442
this.validate_is_csg = function(x) {

  /* Raise an error unless `x` is a text that has the form of a CSG and is a key of
  `_names_and_ranges_by_csg`; return `null` otherwise. */
  var form_match, known_ranges;
  CND.validate_isa_text(x);
  form_match = x.match(this._csg_matcher);
  if (form_match == null) {
    throw new Error("not a valid CSG: " + (rpr(x)));
  }
  known_ranges = this._names_and_ranges_by_csg[x];
  if (known_ranges == null) {
    throw new Error("unknown CSG: " + (rpr(x)));
  }
  return null;
};
453
this.validate_is_cid = function(x) {

  /* Raise an error unless `x` is a number that is an integer in the range 0x0 .. 0xffffffff; return
  `null` otherwise. The error message names the bound that is actually enforced. */
  CND.validate_isa_number(x);

  /* `Math.floor( x ) !== x` rejects fractions and NaN without the number-to-text round trip of
  `parseInt`. */
  if (x < 0 || x > 0xffffffff || Math.floor(x) !== x) {
    throw new Error("expected an integer between 0x0 and 0xffffffff, got 0x" + (x.toString(16)));
  }
  return null;
};
461
462}).call(this);