// Load the sanitized book once, when this module is first required.
// normalize() later reads `scroll` (and `path`) from module scope.
const fs = require('fs');
const path = require('path');

// No encoding is passed, so readFileSync hands back a Buffer.
let scroll;

try {
  scroll = fs.readFileSync(path.join('interim', 'sanitized.html'));
} catch (err) {
  // Only a missing input file is handled here; any other failure propagates.
  if (err.code !== 'ENOENT') {
    throw err;
  }

  console.log('File not found!');
  process.exit(1);
}
|
16 |
|
17 |
|
/**
 * Breaks over-long paragraphs of the loaded book into shorter ones and hands
 * the rebuilt document to the saver module.
 *
 * Reads the module-level `scroll` HTML, walks every direct child of <body>,
 * and re-emits each element. A <p> whose word count rounds to more than one
 * multiple of PARA_WORD_LIMIT is replaced by several shorter <p> elements;
 * every other element is serialized unchanged.
 *
 * Depends on the module-level `scroll` and `path` bindings and on the
 * `wordcount`, `cheerio` and `../lib/saver.js` modules.
 *
 * @returns {undefined} Nothing; the result is passed to `saver.save`.
 */
function normalize() {
  const PARA_WORD_LIMIT = 40;
  const wordcount = require('wordcount');
  const cheerio = require('cheerio');
  const $ = cheerio.load(scroll);

  // Abbreviations whose trailing period must not be mistaken for a sentence end.
  const HONORIFICS = ['Mr', 'Mrs', 'Ms', 'Jr', 'Sr', 'St', 'Dr', 'Fr', 'Br', 'Mx'];

  // Final characters after which no period is re-appended when sentences are
  // glued back together.
  const NO_PERIOD_AFTER = new Set([':', '"', ';', '-', '—', '.', ',']);

  let normalizedHTML = '';

  $('body').children().each(function (index, elem) {
    // Default: keep the element as it is (serialized by string concatenation).
    let newElem = $(this);

    if (elem.name === 'p') {
      const para = newElem.text();

      // Math.round means splitting only starts at 1.5x the limit (60 words).
      const ratio = Math.round(wordcount(para) / PARA_WORD_LIMIT);

      if (ratio > 1) {
        newElem = splitPara(para);
      }
    }

    normalizedHTML += newElem;
  });

  outPutNormalizedBook(normalizedHTML);

  /**
   * Splits one paragraph's text into several <p> elements, closing a
   * paragraph as soon as it grows past PARA_WORD_LIMIT words.
   *
   * NOTE(review): this works on the paragraph's plain text, so inline markup
   * inside the original <p> is not carried over and the text is not
   * re-escaped before being wrapped in <p> tags.
   *
   * @param {string} para - Plain text of the paragraph to split.
   * @returns {string} The replacement paragraphs as an HTML string.
   */
  function splitPara(para) {
    // Mask "Mr." and friends so their period is not used as a split point.
    const text = mutateHonorifics(para).trim();
    const sentences = text.split('. ');

    const newHTML = [];
    let newParagraph = '';

    for (const sentence of sentences) {
      // split() removed the ". " separator; restore it unless the sentence
      // already ends in punctuation that should not be followed by a period.
      const lastCharacter = sentence.slice(-1);
      const punctuation = NO_PERIOD_AFTER.has(lastCharacter) ? ' ' : '. ';

      newParagraph += sentence + punctuation;

      if (wordcount(newParagraph) > PARA_WORD_LIMIT) {
        newHTML.push(`<p>${newParagraph.trim()}</p>`);
        newParagraph = '';
      }
    }

    // Emit whatever is left over; without this the tail of the paragraph
    // (anything after the last full chunk) would be lost.
    const remainder = newParagraph.trim();
    if (remainder !== '') {
      newHTML.push(`<p>${remainder}</p>`);
    }

    return reverseHonorifics(newHTML.join(' '));
  }

  /**
   * Replaces the period of every known honorific with "~" (e.g. "Mr." becomes
   * "Mr~") so that sentence splitting ignores it.
   *
   * @param {string} para - Text to mask.
   * @returns {string} The masked text.
   */
  function mutateHonorifics(para) {
    return HONORIFICS.reduce(
      (str, title) => str.split(`${title}.`).join(`${title}~`),
      para
    );
  }

  /**
   * Undoes mutateHonorifics: turns every "Mr~"-style placeholder back into
   * the honorific with its period.
   *
   * @param {string} paras - Text containing masked honorifics.
   * @returns {string} The text with the original honorifics restored.
   */
  function reverseHonorifics(paras) {
    return HONORIFICS.reduce(
      (str, title) => str.split(`${title}~`).join(`${title}.`),
      paras
    );
  }

  /**
   * Wraps the rebuilt markup in a <body> element and saves it under the name
   * "normalized" via the project's saver module.
   *
   * @param {string} book - Concatenated HTML of all processed elements.
   * @returns {undefined}
   */
  function outPutNormalizedBook(book) {
    // Turn &quot; entities back into literal double quotes — presumably
    // undoing the encoding applied when the elements were serialized.
    const unquoted = book.replace(/&quot;/g, '"');
    const wrapped = `<body>${unquoted}</body>`;

    const saver = require(path.join('..', 'lib', 'saver.js'));

    const filename = 'normalized';
    saver.save(wrapped, filename);
  }
}
|
141 |
|
142 | module.exports.normalize = normalize;
|