1 |
|
2 |
|
3 |
|
4 |
|
5 | ############################################################################################################
|
6 | CND = require 'cnd'
|
7 | rpr = CND.rpr.bind CND
|
8 | badge = 'coffeenode-chr'
|
9 | log = CND.get_logger 'plain', badge
|
10 | info = CND.get_logger 'info', badge
|
11 | alert = CND.get_logger 'alert', badge
|
12 | debug = CND.get_logger 'debug', badge
|
13 | warn = CND.get_logger 'warn', badge
|
14 | urge = CND.get_logger 'urge', badge
|
15 | whisper = CND.get_logger 'whisper', badge
|
16 | help = CND.get_logger 'help', badge
|
17 | echo = CND.echo.bind CND
|
18 | #...........................................................................................................
|
19 | character_sets_and_ranges = require './character-sets-and-ranges'
|
20 | @_names_and_ranges_by_csg = character_sets_and_ranges[ 'names-and-ranges-by-csg' ]
|
21 | @_ranges_by_rsg = character_sets_and_ranges[ 'ranges-by-rsg' ]
|
22 | binary_interval_search = require './binary-interval-search'
|
23 |
|
24 |
|
25 |
|
26 | #===========================================================================================================
|
27 | # SPLIT TEXT INTO CHARACTERS
|
28 | #-----------------------------------------------------------------------------------------------------------
|
@chrs_from_text = ( text, options ) ->
  ### Split `text` into a list of 'characters' and return that list. `options[ 'input' ]` selects the
  splitter used: `plain` (the default), `ncr`, or `xncr`; any other value raises an error. An empty text
  yields an empty list. ###
  return [] if text.length is 0
  #.........................................................................................................
  input_mode = options?[ 'input' ] ? 'plain'
  splitter = switch input_mode
    when 'plain' then @_plain_splitter
    when 'ncr'   then @_ncr_splitter
    when 'xncr'  then @_xncr_splitter
    else throw new Error "unknown input mode: #{rpr input_mode}"
  #.........................................................................................................
  # The splitters capture what they match, so `split` returns the matches interleaved with empty
  # strings; only the non-empty parts are kept.
  R = []
  for part in text.split splitter
    R.push part if part.length isnt 0
  return R
|
39 |
|
40 | #-----------------------------------------------------------------------------------------------------------
|
@_new_chunk = ( csg, rsg, chrs ) ->
  ### Build a `CHR/chunk` object from a CSG, an RSG, and a list of character strings; the list is
  concatenated into the chunk's `text` member. ###
  text = chrs.join ''
  return { '~isa': 'CHR/chunk', 'csg': csg, 'rsg': rsg, 'text': text }
|
50 |
|
51 | #-----------------------------------------------------------------------------------------------------------
|
@chunks_from_text = ( text, options ) ->
  ### Given a `text` and `options`, return a list of `CHR/chunk` objects (as returned by
  `CHR._new_chunk`), each describing a stretch of consecutive characters that share the same RSG.

  * `options[ 'input' ]` is passed on to `CHR.chrs_from_text` and `CHR.analyze`.
  * `options[ 'output' ]` may be `plain` (the default; characters are copied verbatim) or `html` (the
    single characters `&`, `<` and `>` are replaced by `&amp;`, `&lt;` and `&gt;`); any other value
    raises an error.

  Characters whose CSG is not `u` are represented in the chunk text by their `ncr` form as reported by
  `CHR.analyze`. An empty text yields an empty list.
  ###
  R = []
  return R if text.length is 0
  last_csg = 'u'
  last_rsg = null
  chrs = []
  #.........................................................................................................
  switch output_mode = options?[ 'output' ] ? 'plain'
    when 'plain'
      transform_output = ( chr ) ->
        return chr
    when 'html'
      # Only exact single-character matches are escaped; multi-character NCR strings (which already
      # are character references) pass through unchanged.
      transform_output = ( chr ) ->
        return switch chr
          when '&' then '&amp;'
          when '<' then '&lt;'
          when '>' then '&gt;'
          else chr
    else
      throw new Error "unknown output mode: #{rpr output_mode}"
  #.........................................................................................................
  for chr in @chrs_from_text text, options
    description = @analyze chr, options
    { csg
      rsg } = description
    chr = description[ if csg is 'u' then 'chr' else 'ncr' ]
    # A change of RSG closes the current chunk (if it has any characters) and opens a new one:
    if rsg isnt last_rsg
      R.push @_new_chunk last_csg, last_rsg, chrs if chrs.length > 0
      last_csg = csg
      last_rsg = rsg
      chrs = []
    #.......................................................................................................
    chrs.push transform_output chr
  #.........................................................................................................
  # Flush the chunk that was still open when the text ended:
  R.push @_new_chunk last_csg, last_rsg, chrs if chrs.length > 0
  return R
|
92 |
|
93 | #-----------------------------------------------------------------------------------------------------------
|
@html_from_text = ( text, options ) ->
  ### Render `text` as a string of HTML `<span>` elements, one per chunk as delivered by
  `CHR.chunks_from_text` in `html` output mode; each span carries the chunk's RSG as its `class`
  attribute. Only `options[ 'input' ]` is observed (default: `plain`). ###
  input_mode = options?[ 'input' ] ? 'plain'
  chunks = @chunks_from_text text, { 'input': input_mode, 'output': 'html', }
  spans = ( "<span class=\"#{chunk[ 'rsg' ]}\">#{chunk[ 'text' ]}</span>" for chunk in chunks )
  return spans.join ''
|
103 |
|
104 | #===========================================================================================================
|
105 | # CONVERTING TO CID
|
106 | #-----------------------------------------------------------------------------------------------------------
|
@cid_from_chr = ( chr, options ) ->
  ### Return the CID of the first character of `chr`; `options[ 'input' ]` selects the input mode
  (default: `plain`). ###
  [ first_chr, csg, cid ] = @_chr_csg_cid_from_chr chr, ( options?[ 'input' ] ? 'plain' )
  return cid
|
110 |
|
111 | #-----------------------------------------------------------------------------------------------------------
|
@csg_cid_from_chr = ( chr, options ) ->
  ### Return a list `[ csg, cid ]` for the first character of `chr`; `options[ 'input' ]` selects the
  input mode (default: `plain`). ###
  chr_csg_cid = @_chr_csg_cid_from_chr chr, ( options?[ 'input' ] ? 'plain' )
  # Drop the leading character, keep CSG and CID:
  return chr_csg_cid.slice 1
|
115 |
|
116 | #-----------------------------------------------------------------------------------------------------------
|
@_chr_csg_cid_from_chr = ( chr, input_mode ) ->
  ### Given a text `chr` of one or more characters, return a list `[ first_chr, csg, cid ]` with the
  first character, its CSG, and its CID (a number). `input_mode` may be `plain` (the default), `ncr`, or
  `xncr`. Raises an error for an empty text, an unknown input mode, or a text that the selected matcher
  does not recognize.
  ###
  throw new Error "unable to obtain CID from empty string" if chr.length is 0
  #.........................................................................................................
  input_mode ?= 'plain'
  matcher = switch input_mode
    when 'plain' then @_first_chr_matcher_plain
    when 'ncr'   then @_first_chr_matcher_ncr
    when 'xncr'  then @_first_chr_matcher_xncr
    else throw new Error "unknown input mode: #{rpr input_mode}"
  #.........................................................................................................
  match = chr.match matcher
  throw new Error "illegal character sequence in #{rpr chr}" unless match?
  first_chr = match[ 0 ]
  #.........................................................................................................
  # A match of length one is a single code unit:
  if first_chr.length is 1
    return [ first_chr, 'u', first_chr.charCodeAt 0 ]
  #.........................................................................................................
  # A match of length two is a surrogate pair, which is combined into one codepoint:
  if first_chr.length is 2
    ### thx to http://perldoc.perl.org/Encode/Unicode.html ###
    hi = first_chr.charCodeAt 0
    lo = first_chr.charCodeAt 1
    return [ first_chr, 'u', ( hi - 0xD800 ) * 0x400 + ( lo - 0xDC00 ) + 0x10000 ]
  #.........................................................................................................
  # Anything longer is a character reference; the capture groups hold CSG, hex digits, decimal digits:
  csg     = match[ 1 ]
  cid_hex = match[ 2 ]
  cid_dec = match[ 3 ]
  cid     = if cid_hex? then parseInt cid_hex, 16 else parseInt cid_dec, 10
  csg     = 'u' if csg.length is 0
  return [ first_chr, csg, cid ]
|
155 |
|
156 |
|
157 | # #-----------------------------------------------------------------------------------------------------------
|
158 | # @cid_from_ncr = ( ) ->
|
159 |
|
160 | # #-----------------------------------------------------------------------------------------------------------
|
161 | # @cid_from_xncr = ( ) ->
|
162 |
|
163 | # #-----------------------------------------------------------------------------------------------------------
|
164 | # @cid_from_fncr = ( ) ->
|
165 |
|
166 |
|
167 | #===========================================================================================================
|
168 | # CONVERTING FROM CID &c
|
169 | #-----------------------------------------------------------------------------------------------------------
|
# Public conversion API. Each method accepts a CID hint (a number or a text) and optional options `O`;
# the arguments are normalized to a `[ csg, cid ]` pair by `@_csg_cid_from_hint`, which is then handed to
# the respective private method.
@as_csg         = ( cid_hint, O ) -> return ( @_csg_cid_from_hint cid_hint, O )[ 0 ]
@as_cid         = ( cid_hint, O ) -> return ( @_csg_cid_from_hint cid_hint, O )[ 1 ]
#...........................................................................................................
@as_chr         = ( cid_hint, O ) -> return @_as_chr.apply        @, @_csg_cid_from_hint cid_hint, O
@as_uchr        = ( cid_hint, O ) -> return @_as_uchr.apply       @, @_csg_cid_from_hint cid_hint, O
@as_fncr        = ( cid_hint, O ) -> return @_as_fncr.apply       @, @_csg_cid_from_hint cid_hint, O
@as_sfncr       = ( cid_hint, O ) -> return @_as_sfncr.apply      @, @_csg_cid_from_hint cid_hint, O
@as_xncr        = ( cid_hint, O ) -> return @_as_xncr.apply       @, @_csg_cid_from_hint cid_hint, O
# The NCR is always rendered with CSG `u` (i.e. without a character set sigil), in agreement with the
# `ncr` member produced by `@_analyze`; only the XNCR carries the sigil:
@as_ncr         = ( cid_hint, O ) -> return @_as_xncr 'u', ( @_csg_cid_from_hint cid_hint, O )[ 1 ]
@as_rsg         = ( cid_hint, O ) -> return @_as_rsg.apply        @, @_csg_cid_from_hint cid_hint, O
@as_range_name  = ( cid_hint, O ) -> return @_as_range_name.apply @, @_csg_cid_from_hint cid_hint, O
#...........................................................................................................
@analyze        = ( cid_hint, O ) -> return @_analyze.apply       @, @_csg_cid_from_hint cid_hint, O
|
183 |
|
184 | #-----------------------------------------------------------------------------------------------------------
|
@_analyze = ( csg, cid ) ->
  ### Return a `CHR/info` object that collects all representations of the character identified by `csg`
  and `cid`. For CSG `u`, `chr` is the Unicode character itself and `ncr` equals `xncr`; for any other
  CSG, `chr` is the XNCR and `ncr` is the reference rendered with CSG `u`. ###
  uchr = @_unicode_chr_from_cid cid
  xncr = @_as_xncr csg, cid
  #.........................................................................................................
  if csg is 'u'
    chr = uchr
    ncr = xncr
  else
    chr = xncr
    ncr = @_as_xncr 'u', cid
  #.........................................................................................................
  return {
    '~isa':   'CHR/info'
    'chr':    chr
    'uchr':   uchr
    'csg':    csg
    'cid':    cid
    'fncr':   @_as_fncr  csg, cid
    'sfncr':  @_as_sfncr csg, cid
    'ncr':    ncr
    'xncr':   xncr
    'rsg':    @_as_rsg   csg, cid }
|
207 |
|
208 | #-----------------------------------------------------------------------------------------------------------
|
@_as_chr = ( csg, cid ) ->
  ### Return the character for `csg` and `cid`: the Unicode character when the CSG is `u`, otherwise the
  `chr` member of the `@_analyze` result. ###
  if csg is 'u'
    return @_unicode_chr_from_cid cid
  info = @_analyze csg, cid
  return info[ 'chr' ]
|
212 |
|
213 | #-----------------------------------------------------------------------------------------------------------
|
@_as_uchr = ( csg, cid ) ->
  ### Return the Unicode character with codepoint `cid`; `csg` is accepted for signature parity with the
  other `_as_*` methods but is not used. ###
  uchr = @_unicode_chr_from_cid cid
  return uchr
|
216 |
|
217 | #-----------------------------------------------------------------------------------------------------------
|
@_unicode_chr_from_cid = ( cid ) ->
  ### Return the string for codepoint `cid`: a single code unit for CIDs up to 0xffff, a surrogate pair
  for anything above. ###
  return String.fromCharCode cid if cid <= 0xffff
  ### thx to http://perldoc.perl.org/Encode/Unicode.html ###
  offset  = cid - 0x10000
  hi      = 0xD800 + Math.floor( offset / 0x400 )
  lo      = 0xDC00 + offset % 0x400
  return String.fromCharCode hi, lo
|
224 |
|
225 | #-----------------------------------------------------------------------------------------------------------
|
@_as_fncr = ( csg, cid ) ->
  ### Return `cid` as lower-case hex digits, prefixed with the RSG for `csg` and `cid` and a hyphen; when
  `@_as_rsg` yields no RSG, the CSG is used as prefix instead. ###
  prefix  = @_as_rsg csg, cid
  prefix  = csg unless prefix?
  cid_hex = cid.toString 16
  return "#{prefix}-#{cid_hex}"
|
229 |
|
230 | #-----------------------------------------------------------------------------------------------------------
|
@_as_sfncr = ( csg, cid ) ->
  ### Return `cid` as lower-case hex digits, prefixed with `csg` and a hyphen. ###
  cid_hex = cid.toString 16
  return "#{csg}-#{cid_hex}"
|
233 |
|
234 | #-----------------------------------------------------------------------------------------------------------
|
@_as_xncr = ( csg, cid ) ->
  ### Return an extended numeric character reference for `csg` and `cid`, with the CID in lower-case
  hex; the sigil is left out when `csg` is `u` or not given. ###
  sigil   = if ( not csg? ) or ( csg is 'u' ) then '' else csg
  cid_hex = cid.toString 16
  return "&#{sigil}#x#{cid_hex};"
|
238 |
|
239 | #-----------------------------------------------------------------------------------------------------------
|
@_as_rsg = ( csg, cid ) ->
  ### Look up `cid` in the range table registered for `csg` and return the `rsg` member of the matching
  entry, as delivered by `binary_interval_search`. ###
  ranges = @_names_and_ranges_by_csg[ csg ]
  return binary_interval_search ranges, 'first-cid', 'last-cid', 'rsg', cid
|
242 |
|
243 | #-----------------------------------------------------------------------------------------------------------
|
@_as_range_name = ( csg, cid ) ->
  ### Look up `cid` in the range table registered for `csg` and return the `range-name` member of the
  matching entry, as delivered by `binary_interval_search`. ###
  ranges = @_names_and_ranges_by_csg[ csg ]
  return binary_interval_search ranges, 'first-cid', 'last-cid', 'range-name', cid
|
246 |
|
247 |
|
248 | #===========================================================================================================
|
249 | # ANALYZE ARGUMENTS
|
250 | #-----------------------------------------------------------------------------------------------------------
|
@_csg_cid_from_hint = ( cid_hint, options ) ->
  ### Derive a `[ csg, cid ]` pair from the arguments accepted by the `as_*` family of methods
  (`CHR.as_fncr`, `CHR.as_rsg` and so on); the result can be applied directly to the namesake private
  method (`CHR._as_fncr`, `CHR._as_rsg` and so on). Rules for the arguments:

  * The first argument is the 'CID hint', the optional second one the 'options'.

  * The CID hint is either a number, which is taken to be the CID itself, or a text, which is
    interpreted according to `options[ 'input' ]`. Any other type raises an error.

  * Options, when given, must be a POD; its members `input` and `csg` are read. Any other type raises an
    error.

  * `options[ 'input' ]` only matters when the CID hint is a text; it is handed to
    `CHR.csg_cid_from_chr`, where it selects one of `plain` (the default), `ncr`, or `xncr`.

  * `options[ 'csg' ]`, when set, takes precedence over the CSG found in the CID hint. For example,
    `CHR.as_sfncr '&jzr#xe100;', input: 'xncr', csg: 'u'` gives `u-e100`. When neither the options nor
    the hint provide a CSG, `u` is used.

  * CSG and CID are validated with `CHR.validate_is_csg` and `CHR.validate_is_cid` before they are
    returned.
  ###
  #.........................................................................................................
  options_type = CND.type_of options
  if options_type is 'pod'
    csg_of_options  = options[ 'csg'   ]
    input_mode      = options[ 'input' ]
  else if options_type is 'null' or options_type is 'jsundefined'
    csg_of_options  = null
    input_mode      = null
  else
    throw new Error "expected a POD as second argument, got a #{options_type}"
  #.........................................................................................................
  hint_type = CND.type_of cid_hint
  if hint_type is 'number'
    csg_of_cid_hint = null
    cid             = cid_hint
  else if hint_type is 'text'
    [ csg_of_cid_hint, cid ] = @csg_cid_from_chr cid_hint, input: input_mode
  else
    throw new Error "expected a text or a number as first argument, got a #{hint_type}"
  #.........................................................................................................
  # Precedence: CSG from options, then CSG from the hint, then Unicode:
  csg = csg_of_options ? csg_of_cid_hint ? 'u'
  #.........................................................................................................
  @validate_is_csg csg
  @validate_is_cid cid
  return [ csg, cid, ]
|
308 |
|
309 |
|
310 | #===========================================================================================================
|
311 | # PATTERNS
|
312 | #-----------------------------------------------------------------------------------------------------------
|
313 | # G: grouped
|
314 | # O: optional
|
# Regex source fragments; suffix `G` means the fragment contains a capturing group, suffix `O` means the
# fragment may match the empty string:
name    = '(?:[a-z][a-z0-9]*)'
nameO   = "(?:#{name}|)"
nameOG  = "(#{name}|)"
hex     = '(?:x[a-fA-F0-9]+)'
hexG    = '(?:x([a-fA-F0-9]+))'
dec     = '(?:[0-9]+)'
decG    = '(?:([0-9]+))'
#...........................................................................................................
### Matchers for CSGs and for character references without (`ncr`) and with optional (`xncr`) CSG; the
`*_csg_cid_matcher` variants capture CSG, hex digits and decimal digits, in that order: ###
@_csg_matcher           = /// ^ #{name} $ ///
@_ncr_matcher           = /// (?: &           \# (?: #{hex}  | #{dec}  ) ; ) ///
@_xncr_matcher          = /// (?: & #{nameO}  \# (?: #{hex}  | #{dec}  ) ; ) ///
@_ncr_csg_cid_matcher   = /// (?: & ()        \# (?: #{hexG} | #{decG} ) ; ) ///
@_xncr_csg_cid_matcher  = /// (?: & #{nameOG} \# (?: #{hexG} | #{decG} ) ; ) ///
#...........................................................................................................
### Matchers for surrogate pairs and for single code units outside the surrogate ranges: ###
@_surrogate_matcher     = /// (?: [ \ud800-\udbff ] [ \udc00-\udfff ] ) ///
@_nonsurrogate_matcher  = /// [^ \ud800-\udbff \udc00-\udfff ] ///
surrogate_src           = @_surrogate_matcher.source
nonsurrogate_src        = @_nonsurrogate_matcher.source
#...........................................................................................................
### Matchers anchored at the start of a string, one per input mode (`plain`, `ncr`, `xncr`): ###
@_first_chr_matcher_plain = /// ^ (?: #{surrogate_src} | #{nonsurrogate_src} ) ///
@_first_chr_matcher_ncr   = /// ^ (?: #{surrogate_src} | #{@_ncr_csg_cid_matcher.source}  | #{nonsurrogate_src} ) ///
@_first_chr_matcher_xncr  = /// ^ (?: #{surrogate_src} | #{@_xncr_csg_cid_matcher.source} | #{nonsurrogate_src} ) ///
#...........................................................................................................
### Splitters, one per input mode; each one captures what it matches so that `String::split` keeps it: ###
@_plain_splitter  = /// ( #{surrogate_src} | #{nonsurrogate_src} ) ///
@_ncr_splitter    = /// ( #{@_ncr_matcher.source}  | #{surrogate_src} | #{nonsurrogate_src} ) ///
@_xncr_splitter   = /// ( #{@_xncr_matcher.source} | #{surrogate_src} | #{nonsurrogate_src} ) ///
|
352 |
|
353 |
|
354 | #===========================================================================================================
|
355 | #
|
356 | #-----------------------------------------------------------------------------------------------------------
|
@cid_range_from_rsg = ( rsg ) ->
  ### Return the entry registered for `rsg` in the ranges-by-RSG table; raise an error when there is no
  such entry. ###
  range = @_ranges_by_rsg[ rsg ]
  return range if range?
  throw new Error "unknown RSG: #{rpr rsg}"
|
362 |
|
363 |
|
364 | #===========================================================================================================
|
365 | # VALIDATION
|
366 | #-----------------------------------------------------------------------------------------------------------
|
@validate_is_csg = ( x ) ->
  ### Raise an error unless `x` is a text that is spelled like a CSG (see `@_csg_matcher`) and has an
  entry in the names-and-ranges table; return `null` otherwise. ###
  CND.validate_isa_text x
  unless @_csg_matcher.test x
    throw new Error "not a valid CSG: #{rpr x}"
  unless @_names_and_ranges_by_csg[ x ]?
    throw new Error "unknown CSG: #{rpr x}"
  return null
|
372 |
|
373 | #-----------------------------------------------------------------------------------------------------------
|
@validate_is_cid = ( x ) ->
  ### Raise an error unless `x` is a number that is an integer between 0x0 and 0xffffffff (inclusive);
  return `null` otherwise. ###
  CND.validate_isa_number x
  # NOTE(review): the upper bound is deliberately wider than the Unicode maximum 0x10ffff, presumably to
  # accommodate character sets other than `u`; confirm. The message below states the bound that is
  # actually enforced.
  if x < 0 or x > 0xffffffff or ( parseInt x ) != x
    throw new Error "expected an integer between 0x0 and 0xffffffff, got 0x#{x.toString 16}"
  return null
|
380 |
|
381 |
|
382 |
|
383 |
|
384 |
|
385 |
|
386 | # console.log name for name of @
|
387 | # console.log String.fromCharCode 0x61
|
388 | # console.log String.fromCharCode 0x24563
|
389 |
|
390 |
|
391 |
|