UNPKG

3.83 kBJavaScriptView Raw
// Load the sanitized HTML at module load time.
// `path` and `scroll` are intentionally module-scoped (`var`): normalize()
// further down in this file reads both of them.
var path = require('path');
var scroll;

try {
  const fs = require('fs');
  scroll = fs.readFileSync(path.join('interim', 'sanitized.html'));
} catch (e) {
  // Anything other than a missing input file is unexpected: let it propagate.
  if (e.code !== 'ENOENT') {
    throw e;
  }

  // A missing input file is reported and ends the process with status 1.
  console.log('File not found!');
  process.exit(1);
}
16
17
/**
 * Normalize paragraph lengths in the loaded HTML and save the result.
 *
 * Walks the direct children of <body> in the module-level `scroll` HTML.
 * A <p> whose word count, divided by PARA_WORD_LIMIT and rounded, is greater
 * than 1 is re-flowed into several shorter <p> elements split on sentence
 * boundaries; every other element is serialized unchanged. The pieces are
 * wrapped in <body> and passed to saver.save() under the name "normalized".
 *
 * Split paragraphs are rebuilt from their text content, so inline markup
 * inside a paragraph that gets split is not preserved.
 *
 * @returns {void}
 */
function normalize() {
  const PARA_WORD_LIMIT = 40; // Fine tune according to style or literature.
  const wordcount = require('wordcount');
  const cheerio = require('cheerio');
  const $ = cheerio.load(scroll);

  // Abbreviations whose trailing period must not be treated as a sentence end.
  const HONORIFICS = ['Mr.', 'Mrs.', 'Ms.', 'Jr.', 'Sr.', 'St.', 'Dr.', 'Fr.', 'Br.', 'Mx.'];

  // Sentence separator used both to split a paragraph and to stitch it back.
  const SENTENCE_SEPARATOR = '. ';

  const pieces = [];
  $('body').children().each((_, elem) => {
    pieces.push(renderElement(elem));
  });

  outPutNormalizedBook(pieces.join(''));

  /**
   * Serialize one top-level element, splitting it first if it is a long <p>.
   *
   * @param {object} elem - DOM node handed out by cheerio's each().
   * @returns {string} HTML for the element (or for its replacement paragraphs).
   */
  function renderElement(elem) {
    if (elem.name === 'p') {
      const para = $(elem).text();
      const ratio = Math.round(wordcount(para) / PARA_WORD_LIMIT);

      if (ratio > 1) {
        return splitPara(para);
      }
    }
    // Explicit outer-HTML rendering instead of relying on string coercion.
    return $.html(elem);
  }

  /**
   * Re-flow a long paragraph into several <p> elements.
   *
   * Sentences are appended to the current paragraph until it exceeds
   * PARA_WORD_LIMIT words, at which point the paragraph is closed.
   *
   * @param {string} para - Plain text of the paragraph.
   * @returns {string} One or more <p> elements joined by a single space.
   */
  function splitPara(para) {
    const sentences = mutateHonorifics(para).trim().split(SENTENCE_SEPARATOR);
    const lastIndex = sentences.length - 1;

    const paragraphs = [];
    let current = '';

    sentences.forEach((sentence, index) => {
      // split() consumed exactly one separator after every fragment except
      // the last, so restoring it there reproduces the text without adding
      // or dropping punctuation.
      current += index < lastIndex ? sentence + SENTENCE_SEPARATOR : sentence;

      if (wordcount(current) > PARA_WORD_LIMIT) {
        paragraphs.push(toParagraph(current));
        current = '';
      }
    });

    // Flush whatever is left; otherwise the tail of the paragraph is lost.
    if (current.trim() !== '') {
      paragraphs.push(toParagraph(current));
    }

    return reverseHonorifics(paragraphs.join(' '));
  }

  /**
   * Wrap plain text in a <p> element.
   *
   * @param {string} text - Decoded paragraph text.
   * @returns {string} The <p> element as an HTML string.
   */
  function toParagraph(text) {
    return `<p>${escapeHtml(text.trim())}</p>`;
  }

  /**
   * Escape the characters that are significant in HTML text content.
   * The input comes from .text(), i.e. it is already entity-decoded, so it
   * has to be encoded again before being placed back into markup.
   *
   * @param {string} text - Plain text.
   * @returns {string} Text safe to embed between tags.
   */
  function escapeHtml(text) {
    return text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }

  /**
   * Build the temporary stand-in for an honorific ("Mr." -> "Mr~").
   *
   * @param {string} honorific - Abbreviation ending in a period.
   * @returns {string} The same abbreviation ending in "~".
   */
  function toPlaceholder(honorific) {
    return `${honorific.slice(0, -1)}~`;
  }

  /**
   * Replace every honorific with its placeholder so that its period is not
   * mistaken for a sentence boundary.
   *
   * @param {string} para - Paragraph text.
   * @returns {string} Text with honorifics masked.
   */
  function mutateHonorifics(para) {
    return HONORIFICS.reduce(
      (text, honorific) => text.split(honorific).join(toPlaceholder(honorific)),
      para
    );
  }

  /**
   * Undo mutateHonorifics(): turn every placeholder back into its honorific.
   *
   * @param {string} paras - Text (or HTML) containing placeholders.
   * @returns {string} Text with the original honorifics restored.
   */
  function reverseHonorifics(paras) {
    return HONORIFICS.reduce(
      (text, honorific) => text.split(toPlaceholder(honorific)).join(honorific),
      paras
    );
  }

  /**
   * Wrap the normalized markup in <body> and hand it to the saver module.
   *
   * @param {string} book - Concatenated HTML of all top-level elements.
   * @returns {void}
   */
  function outPutNormalizedBook(book) {
    // Quotes are written out literally rather than as &quot; entities.
    const unquoted = book.replace(/&quot;/g, '"');
    const wrapped = `<body>${unquoted}</body>`;

    const saver = require(path.join('..', 'lib', 'saver.js'));

    const filename = 'normalized';
    saver.save(wrapped, filename);
  }
}
141
// Public API of this module: expose normalize() to callers via require().
module.exports.normalize = normalize;