UNPKG

20 kBtext/coffeescriptView Raw
1
2
3
4
5############################################################################################################
6CND = require 'cnd'
7rpr = CND.rpr.bind CND
8badge = 'coffeenode-chr'
9log = CND.get_logger 'plain', badge
10info = CND.get_logger 'info', badge
11alert = CND.get_logger 'alert', badge
12debug = CND.get_logger 'debug', badge
13warn = CND.get_logger 'warn', badge
14urge = CND.get_logger 'urge', badge
15whisper = CND.get_logger 'whisper', badge
16help = CND.get_logger 'help', badge
17echo = CND.echo.bind CND
18#...........................................................................................................
19character_sets_and_ranges = require './character-sets-and-ranges'
20@_names_and_ranges_by_csg = character_sets_and_ranges[ 'names-and-ranges-by-csg' ]
21@_ranges_by_rsg = character_sets_and_ranges[ 'ranges-by-rsg' ]
22binary_interval_search = require './binary-interval-search'
23
24
25
26#===========================================================================================================
27# SPLIT TEXT INTO CHARACTERS
28#-----------------------------------------------------------------------------------------------------------
@chrs_from_text = ( text, options ) ->
  ### Split `text` into a list of non-empty parts using the splitter pattern selected by
  `options[ 'input' ]` (`plain`, `ncr`, or `xncr`; defaults to `plain`). An empty `text` yields an empty
  list; an unrecognized input mode throws an error. ###
  return [] if text.length is 0
  #.........................................................................................................
  input_mode = options?[ 'input' ] ? 'plain'
  if input_mode is 'plain'
    splitter = @_plain_splitter
  else if input_mode is 'ncr'
    splitter = @_ncr_splitter
  else if input_mode is 'xncr'
    splitter = @_xncr_splitter
  else
    throw new Error "unknown input mode: #{rpr input_mode}"
  #.........................................................................................................
  # The splitter has a capturing group, so `split` returns the matched parts interleaved with the (mostly
  # empty) texts between them; the empty ones are dropped here.
  return ( part for part in text.split splitter when part.length isnt 0 )
39
40#-----------------------------------------------------------------------------------------------------------
@_new_chunk = ( csg, rsg, chrs ) ->
  ### Return a new `CHR/chunk` object holding the given `csg` and `rsg` together with the elements of the
  list `chrs` joined into a single text. ###
  text = chrs.join ''
  return { '~isa': 'CHR/chunk', 'csg': csg, 'rsg': rsg, 'text': text }
50
51#-----------------------------------------------------------------------------------------------------------
@chunks_from_text = ( text, options ) ->
  ### Given a `text` and `options`, return a list of `CHR/chunk` objects (as returned by
  `CHR._new_chunk`) that describe stretches of consecutive characters sharing the same RSG.

  * `options` is passed on unchanged to `CHR.chrs_from_text` and `CHR.analyze`.

  * `options[ 'output' ]` selects how characters are written into the chunk texts: `plain` (the default)
    copies them verbatim, `html` replaces `&`, `<` and `>` by `&amp;`, `&lt;` and `&gt;`. Any other value
    throws an error.

  An empty `text` yields an empty list.
  ###
  R = []
  return R if text.length is 0
  last_csg = 'u'
  last_rsg = null
  chrs = []
  #.........................................................................................................
  switch output_mode = options?[ 'output' ] ? 'plain'
    when 'plain'
      transform_output = ( chr ) ->
        return chr
    when 'html'
      transform_output = ( chr ) ->
        return switch chr
          # the ampersand must become an entity, too, or the output is not valid HTML text
          when '&' then '&amp;'
          when '<' then '&lt;'
          when '>' then '&gt;'
          else chr
    else
      throw new Error "unknown output mode: #{rpr output_mode}"
  #.........................................................................................................
  for chr in @chrs_from_text text, options
    description = @analyze chr, options
    { csg, rsg } = description
    # characters whose CSG is not `u` are written as the `ncr` entry of their description
    chr = description[ if csg is 'u' then 'chr' else 'ncr' ]
    # a change of RSG closes the current chunk (if it has any characters) and opens a new one
    if rsg isnt last_rsg
      R.push @_new_chunk last_csg, last_rsg, chrs if chrs.length > 0
      last_csg = csg
      last_rsg = rsg
      chrs = []
    #.......................................................................................................
    chrs.push transform_output chr
  #.........................................................................................................
  # flush the last open chunk
  R.push @_new_chunk last_csg, last_rsg, chrs if chrs.length > 0
  return R
92
93#-----------------------------------------------------------------------------------------------------------
@html_from_text = ( text, options ) ->
  ### Return `text` as a sequence of HTML `<span>` elements, one per chunk as returned by
  `CHR.chunks_from_text` in `html` output mode; each span carries the chunk's RSG as its `class`. Only
  `options[ 'input' ]` (default: `plain`) is observed. ###
  input_mode = options?[ 'input' ] ? 'plain'
  chunks = @chunks_from_text text, input: input_mode, output: 'html'
  #.........................................................................................................
  spans = for chunk in chunks
    """<span class="#{chunk[ 'rsg' ]}">#{chunk[ 'text' ]}</span>"""
  #.........................................................................................................
  return spans.join ''
103
104#===========================================================================================================
105# CONVERTING TO CID
106#-----------------------------------------------------------------------------------------------------------
@cid_from_chr = ( chr, options ) ->
  ### Return the CID of the first character of `chr`; `options[ 'input' ]` (default: `plain`) selects the
  input mode. ###
  [ _first_chr, _csg, cid ] = @_chr_csg_cid_from_chr chr, ( options?[ 'input' ] ? 'plain' )
  return cid
110
111#-----------------------------------------------------------------------------------------------------------
@csg_cid_from_chr = ( chr, options ) ->
  ### Return a list `[ csg, cid ]` for the first character of `chr`; `options[ 'input' ]` (default:
  `plain`) selects the input mode. ###
  [ _first_chr, csg, cid ] = @_chr_csg_cid_from_chr chr, ( options?[ 'input' ] ? 'plain' )
  return [ csg, cid ]
115
116#-----------------------------------------------------------------------------------------------------------
@_chr_csg_cid_from_chr = ( chr, input_mode ) ->
  ### Given a text `chr` with one or more characters, return a list with the first character, its CSG,
  and its CID (as an integer). `input_mode` may be `plain` (the default), `ncr`, or `xncr`. Throws an error
  when `chr` is empty, when the input mode is unknown, or when the matcher for the input mode does not
  match at the start of `chr`.
  ###
  throw new Error "unable to obtain CID from empty string" if chr.length is 0
  #.........................................................................................................
  input_mode ?= 'plain'
  if input_mode is 'plain'
    matcher = @_first_chr_matcher_plain
  else if input_mode is 'ncr'
    matcher = @_first_chr_matcher_ncr
  else if input_mode is 'xncr'
    matcher = @_first_chr_matcher_xncr
  else
    throw new Error "unknown input mode: #{rpr input_mode}"
  #.........................................................................................................
  hit = chr.match matcher
  throw new Error "illegal character sequence in #{rpr chr}" unless hit?
  first_chr = hit[ 0 ]
  #.........................................................................................................
  # A match of one code unit is an ordinary character:
  if first_chr.length is 1
    return [ first_chr, 'u', first_chr.charCodeAt 0 ]
  #.........................................................................................................
  # A match of two code units is a surrogate pair:
  if first_chr.length is 2
    ### thx to http://perldoc.perl.org/Encode/Unicode.html ###
    hi = first_chr.charCodeAt 0
    lo = first_chr.charCodeAt 1
    return [ first_chr, 'u', ( hi - 0xD800 ) * 0x400 + ( lo - 0xDC00 ) + 0x10000 ]
  #.........................................................................................................
  # Anything longer is a character reference; the groups hold the CSG and the hexadecimal or decimal CID:
  [ _whole, csg, cid_hex, cid_dec ] = hit
  cid = if cid_hex? then parseInt cid_hex, 16 else parseInt cid_dec, 10
  csg = 'u' if csg.length is 0
  return [ first_chr, csg, cid ]
155
156
157# #-----------------------------------------------------------------------------------------------------------
158# @cid_from_ncr = ( ) ->
159
160# #-----------------------------------------------------------------------------------------------------------
161# @cid_from_xncr = ( ) ->
162
163# #-----------------------------------------------------------------------------------------------------------
164# @cid_from_fncr = ( ) ->
165
166
167#===========================================================================================================
168# CONVERTING FROM CID &c
169#-----------------------------------------------------------------------------------------------------------
# Public `as_*` methods: each one resolves its arguments with `@_csg_cid_from_hint` (which returns a list
# `[ csg, cid ]`) and hands the result to the private method of the same name.
@as_csg = ( cid_hint, O ) -> return ( @_csg_cid_from_hint cid_hint, O )[ 0 ]
@as_cid = ( cid_hint, O ) -> return ( @_csg_cid_from_hint cid_hint, O )[ 1 ]
#...........................................................................................................
@as_chr = ( cid_hint, O ) -> return @_as_chr.apply @, @_csg_cid_from_hint cid_hint, O
@as_uchr = ( cid_hint, O ) -> return @_as_uchr.apply @, @_csg_cid_from_hint cid_hint, O
@as_fncr = ( cid_hint, O ) -> return @_as_fncr.apply @, @_csg_cid_from_hint cid_hint, O
@as_sfncr = ( cid_hint, O ) -> return @_as_sfncr.apply @, @_csg_cid_from_hint cid_hint, O
@as_xncr = ( cid_hint, O ) -> return @_as_xncr.apply @, @_csg_cid_from_hint cid_hint, O
#...........................................................................................................
@as_ncr = ( cid_hint, O ) ->
  ### Return the NCR for the given CID hint. The reference is always written without a CSG, which is how
  `CHR._analyze` builds its `ncr` entry; with a CSG it would be an XNCR (see `CHR.as_xncr`). ###
  [ csg, cid ] = @_csg_cid_from_hint cid_hint, O
  return @_as_xncr 'u', cid
#...........................................................................................................
@as_rsg = ( cid_hint, O ) -> return @_as_rsg.apply @, @_csg_cid_from_hint cid_hint, O
@as_range_name = ( cid_hint, O ) -> return @_as_range_name.apply @, @_csg_cid_from_hint cid_hint, O
#...........................................................................................................
@analyze = ( cid_hint, O ) -> return @_analyze.apply @, @_csg_cid_from_hint cid_hint, O
183
184#-----------------------------------------------------------------------------------------------------------
@_analyze = ( csg, cid ) ->
  ### Return a `CHR/info` object with all representations of the character identified by `csg` and `cid`.
  For the `u` character set, `chr` is the character itself and `ncr` equals `xncr`; for any other
  character set, `chr` is the XNCR and `ncr` is the reference written without a CSG. ###
  uchr = @_unicode_chr_from_cid cid
  xncr = @_as_xncr csg, cid
  #.........................................................................................................
  if csg is 'u'
    chr = uchr
    ncr = xncr
  else
    chr = xncr
    ncr = @_as_xncr 'u', cid
  #.........................................................................................................
  return {
    '~isa': 'CHR/info'
    'chr': chr
    'uchr': uchr
    'csg': csg
    'cid': cid
    'fncr': @_as_fncr csg, cid
    'sfncr': @_as_sfncr csg, cid
    'ncr': ncr
    'xncr': xncr
    'rsg': @_as_rsg csg, cid }
207
208#-----------------------------------------------------------------------------------------------------------
@_as_chr = ( csg, cid ) ->
  ### Return the character for `csg` and `cid`: the Unicode character when `csg` is `u`, otherwise the
  `chr` entry of `CHR._analyze`. ###
  if csg is 'u'
    return @_unicode_chr_from_cid cid
  else
    return ( @_analyze csg, cid )[ 'chr' ]
212
213#-----------------------------------------------------------------------------------------------------------
@_as_uchr = ( csg, cid ) ->
  ### Return the Unicode character with codepoint `cid`; `csg` is not used. ###
  uchr = @_unicode_chr_from_cid cid
  return uchr
216
217#-----------------------------------------------------------------------------------------------------------
@_unicode_chr_from_cid = ( cid ) ->
  ### Return the text for codepoint `cid`: a single code unit for CIDs up to `0xffff`, a surrogate pair
  for anything above. ###
  return String.fromCharCode cid if cid <= 0xffff
  ### thx to http://perldoc.perl.org/Encode/Unicode.html ###
  offset = cid - 0x10000
  hi = 0xD800 + Math.floor offset / 0x400
  lo = 0xDC00 + offset % 0x400
  return String.fromCharCode hi, lo
224
225#-----------------------------------------------------------------------------------------------------------
@_as_fncr = ( csg, cid ) ->
  ### Return the FNCR for `csg` and `cid`: the RSG (or, when `CHR._as_rsg` yields none, the CSG), a
  hyphen, and the CID in hexadecimal. ###
  prefix = @_as_rsg csg, cid
  prefix = csg unless prefix?
  return "#{prefix}-#{cid.toString 16}"
229
230#-----------------------------------------------------------------------------------------------------------
@_as_sfncr = ( csg, cid ) ->
  ### Return the short FNCR for `csg` and `cid`: the CSG, a hyphen, and the CID in hexadecimal. ###
  cid_hex = cid.toString 16
  return "#{csg}-#{cid_hex}"
233
234#-----------------------------------------------------------------------------------------------------------
@_as_xncr = ( csg, cid ) ->
  ### Return the XNCR for `csg` and `cid`, with the CID in hexadecimal. The CSG is left out when it is
  `u` or not given. ###
  sigil = if csg? and csg isnt 'u' then csg else ''
  cid_hex = cid.toString 16
  return "&#{sigil}#x#{cid_hex};"
238
239#-----------------------------------------------------------------------------------------------------------
@_as_rsg = ( csg, cid ) ->
  ### Look up `cid` in the ranges registered for `csg` and return the `rsg` entry of the hit, as delivered
  by `binary_interval_search`. ###
  ranges = @_names_and_ranges_by_csg[ csg ]
  return binary_interval_search ranges, 'first-cid', 'last-cid', 'rsg', cid
242
243#-----------------------------------------------------------------------------------------------------------
@_as_range_name = ( csg, cid ) ->
  ### Look up `cid` in the ranges registered for `csg` and return the `range-name` entry of the hit, as
  delivered by `binary_interval_search`. ###
  ranges = @_names_and_ranges_by_csg[ csg ]
  return binary_interval_search ranges, 'first-cid', 'last-cid', 'range-name', cid
246
247
248#===========================================================================================================
249# ANALYZE ARGUMENTS
250#-----------------------------------------------------------------------------------------------------------
@_csg_cid_from_hint = ( cid_hint, options ) ->
  ### Derive the CSG and CID from the arguments accepted by the `as_*` family of methods (`CHR.as_fncr`,
  `CHR.as_rsg` and so on). The result is a list `[ csg, cid ]` that can be applied directly to the private
  method of the same name (`CHR._as_fncr`, `CHR._as_rsg` and so on).

  * `cid_hint` must be a number or a text. A number is taken as the CID; a text is analyzed with
    `CHR.csg_cid_from_chr`, subject to `options[ 'input' ]`.

  * `options` may be left out; when given, it must be a POD. Its members `input` and `csg` are read.

  * `options[ 'input' ]` is only used when the CID hint is a text; it is passed on as the input mode
    (`plain`, `ncr`, or `xncr`).

  * `options[ 'csg' ]`, when set, takes precedence over the CSG found in the CID hint; for example,
    `CHR.as_sfncr '&jzr#xe100;', input: 'xncr', csg: 'u'` gives `u-e100`. When neither is available, the
    CSG defaults to `u`.

  * CSG and CID are validated with `CHR.validate_is_csg` and `CHR.validate_is_cid` before they are
    returned.
  ###
  #.........................................................................................................
  options_type = CND.type_of options
  if options_type in [ 'null', 'jsundefined' ]
    csg_of_options = null
    input_mode = null
  else if options_type is 'pod'
    csg_of_options = options[ 'csg' ]
    input_mode = options[ 'input' ]
  else
    throw new Error "expected a POD as second argument, got a #{options_type}"
  #.........................................................................................................
  hint_type = CND.type_of cid_hint
  if hint_type is 'number'
    csg_of_cid_hint = null
    cid = cid_hint
  else if hint_type is 'text'
    [ csg_of_cid_hint, cid ] = @csg_cid_from_chr cid_hint, input: input_mode
  else
    throw new Error "expected a text or a number as first argument, got a #{hint_type}"
  #.........................................................................................................
  # precedence: CSG from the options, then CSG from the hint, then Unicode
  csg = csg_of_options ? ( csg_of_cid_hint ? 'u' )
  #.........................................................................................................
  @validate_is_csg csg
  @validate_is_cid cid
  return [ csg, cid ]
308
309
310#===========================================================================================================
311# PATTERNS
312#-----------------------------------------------------------------------------------------------------------
# Building blocks for the matchers below. Each one is kept as regular expression source text so that it
# can be interpolated into larger patterns. A trailing `G` marks a variant with a capturing group, a
# trailing `O` a variant that also matches the empty text.
name = ( /// (?: [a-z][a-z0-9]* ) /// ).source
nameO = ( /// (?: (?: [a-z][a-z0-9]* ) | ) /// ).source
nameOG = ( /// ( (?: [a-z][a-z0-9]* ) | ) /// ).source
hex = ( /// (?: x [a-fA-F0-9]+ ) /// ).source
hexG = ( /// (?: x ([a-fA-F0-9]+) ) /// ).source
dec = ( /// (?: [ 0-9]+ ) /// ).source
decG = ( /// (?: ([ 0-9]+) ) /// ).source

#...........................................................................................................
### Matchers for a CSG and for character references, without and with groups for CSG and CID: ###
@_csg_matcher = /// ^ #{name} $ ///
@_ncr_matcher = /// (?: & \# (?: #{hex} | #{dec} ) ; ) ///
@_xncr_matcher = /// (?: & #{nameO} \# (?: #{hex} | #{dec} ) ; ) ///
@_ncr_csg_cid_matcher = /// (?: & () \# (?: #{hexG} | #{decG} ) ; ) ///
@_xncr_csg_cid_matcher = /// (?: & #{nameOG} \# (?: #{hexG} | #{decG} ) ; ) ///

#...........................................................................................................
### Matchers for surrogate sequences and non-surrogate, 'ordinary' characters: ###
@_surrogate_matcher = /// (?: [ \ud800-\udbff ] [ \udc00-\udfff ] ) ///
@_nonsurrogate_matcher = /// [^ \ud800-\udbff \udc00-\udfff ] ///

#...........................................................................................................
### Matchers for the first character of a string, in three modes (`plain`, `ncr`, `xncr`): ###
@_first_chr_matcher_plain = /// ^ (?: #{@_surrogate_matcher.source} |
  #{@_nonsurrogate_matcher.source} ) ///
@_first_chr_matcher_ncr = /// ^ (?: #{@_surrogate_matcher.source} |
  #{@_ncr_csg_cid_matcher.source} |
  #{@_nonsurrogate_matcher.source} ) ///
@_first_chr_matcher_xncr = /// ^ (?: #{@_surrogate_matcher.source} |
  #{@_xncr_csg_cid_matcher.source} |
  #{@_nonsurrogate_matcher.source} ) ///

#...........................................................................................................
### Splitters used to cut a text into characters, in the same three modes: ###
@_plain_splitter = /// ( #{@_surrogate_matcher.source} |
  #{@_nonsurrogate_matcher.source} ) ///
@_ncr_splitter = /// ( #{@_ncr_matcher.source} |
  #{@_surrogate_matcher.source} |
  #{@_nonsurrogate_matcher.source} ) ///
@_xncr_splitter = /// ( #{@_xncr_matcher.source} |
  #{@_surrogate_matcher.source} |
  #{@_nonsurrogate_matcher.source} ) ///
352
353
354#===========================================================================================================
355#
356#-----------------------------------------------------------------------------------------------------------
@cid_range_from_rsg = ( rsg ) ->
  ### Return the entry registered for `rsg` in `@_ranges_by_rsg`; throws an error when there is none. ###
  range = @_ranges_by_rsg[ rsg ]
  return range if range?
  throw new Error "unknown RSG: #{rpr rsg}"
362
363
364#===========================================================================================================
365# VALIDATION
366#-----------------------------------------------------------------------------------------------------------
@validate_is_csg = ( x ) ->
  ### Throw an error unless `x` is a text that matches `@_csg_matcher` and has an entry in
  `@_names_and_ranges_by_csg`; return `null` otherwise. ###
  CND.validate_isa_text x
  #.........................................................................................................
  is_wellformed = ( x.match @_csg_matcher )?
  throw new Error "not a valid CSG: #{rpr x}" unless is_wellformed
  #.........................................................................................................
  is_known = @_names_and_ranges_by_csg[ x ]?
  throw new Error "unknown CSG: #{rpr x}" unless is_known
  #.........................................................................................................
  return null
372
373#-----------------------------------------------------------------------------------------------------------
@validate_is_cid = ( x ) ->
  ### Throw an error unless `x` is a number (as checked by `CND.validate_isa_number`) and an integer
  between `0x0` and `0xffffffff`, inclusive; return `null` otherwise.

  The upper limit is deliberately higher than the highest Unicode codepoint (`0x10ffff`); the error
  message names the limit that is actually enforced. ###
  CND.validate_isa_number x
  # `Math.floor x` differs from `x` for fractions and for `NaN`
  if x < 0 or x > 0xffffffff or ( Math.floor x ) isnt x
    throw new Error "expected an integer between 0x0 and 0xffffffff, got 0x#{x.toString 16}"
  return null
380
381
382
383
384
385
386# console.log name for name of @
387# console.log String.fromCharCode 0x61
388# console.log String.fromCharCode 0x24563
389
390
391