all files / htmlcs/lib/ parse.js

100% Statements 38/38
100% Branches 8/8
100% Functions 5/5
100% Lines 38/38
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147                          3744×   3744× 3570×     3744×                 174×   174×   174× 792×   792× 792×   792× 792×     174×     174×   174×                   174×           174×         174×     174×     174×                                   174×                                 174×                     174×     174×     174×     174×   174×        
/**
 * @file parse code
 * @author nighca<nighca@live.cn>
 */
 
var htmlparser2 = require('htmlparser2');
var Parser = htmlparser2.Parser;
var DomHandler = htmlparser2.DomHandler;
 
var util = require('./util');
var Node = require('./node');
 
/**
 * Initialise the given node through Node.init, then do the same for every
 * node reachable via `childNodes` (parents are initialised before children).
 *
 * @param {Object} node - root of the subtree to transform
 * @return {Node} the same node that was passed in
 */
var transformRecursively = function (node) {
    Node.init(node);

    // read childNodes once, after the node itself has been initialised
    var children = node.childNodes;

    for (var idx = 0, total = children.length; idx < total; idx++) {
        transformRecursively(children[idx]);
    }

    return node;
};
 
/**
 * Wrap a list of top-level nodes in a synthetic <document> node.
 *
 * The given nodes are modified in place: their sibling links (`prev` / `next`)
 * are rebuilt from their order in the list, `root` is pointed at the new
 * document and `parent` is reset to null. The list itself becomes the
 * document's `children`.
 *
 * @param {Array} arr - node list
 * @return {Node} document node
 */
var wrapDocument = function (arr) {
    var document = htmlparser2.parseDOM('<document></document>')[0];

    document.children = arr;

    for (var i = 0; i < arr.length; i++) {
        var node = arr[i];

        node.prev = arr[i - 1] || null;
        node.next = arr[i + 1] || null;

        node.root = document;
        node.parent = null;
    }

    transformRecursively(document);

    // fix startIndex missing, because <document> is parsed separately:
    // take it from documentElement and fall back to 0.
    // The parentheses matter: `|` binds tighter than `&&`, so without them a
    // missing documentElement would leave startIndex as a non-number.
    var documentElement = document.documentElement;
    document.startIndex = (documentElement && documentElement.startIndex) | 0;

    return document;
};
 
/**
 * Build an HTML parser whose handler and tokenizer are reachable from the
 * returned object, and whose parser / tokenizer are passed through
 * util.emittable together with the event names listed below.
 *
 * @param {Object} options - parser options, merged over the defaults
 * @return {Parser} HTML parser
 */
var getParser = function (options) {
    // names handed to util.emittable for the tokenizer
    var tokenizerEvents = [
        'attribdata',
        'opentagname',
        'opentagend',
        'selfclosingtag',
        'attribname',
        'attribend',
        'closetag',
        'declaration',
        'processinginstruction',
        'comment',
        'cdata',
        'text',
        'error',
        'end'
    ];

    // names handed to util.emittable for the parser itself
    var parserEvents = [
        'processinginstruction',
        'comment',
        'commentend',
        'cdatastart',
        'text',
        'cdataend',
        'error',
        'closetag',
        'end',
        'reset',
        'parserinit',
        'opentagname',
        'opentag',
        'attribute'
    ];

    // defaults first, caller-supplied options on top
    var settings = util.extend({
        lowerCaseAttributeNames: false,
        recognizeCDATA: true
    }, options);

    var domHandler = new DomHandler({
        withStartIndices: true
    });

    var rawParser = new Parser(domHandler, settings);

    // expose the handler and the (wrapped) tokenizer on the parser
    rawParser.handler = domHandler;
    rawParser.tokenizer = util.emittable(rawParser._tokenizer, tokenizerEvents);

    return util.emittable(rawParser, parserEvents);
};
 
/**
 * Parse HTML source into a document node.
 *
 * "\r\n" sequences are converted to "\n" before the code is fed to the
 * parser; the resulting DOM is then wrapped with wrapDocument.
 *
 * @param {string} htmlCode - HTML code content
 * @param {Parser=} parser - parser to use; one is created via getParser when omitted
 * @return {Node} document node
 */
var parse = function (htmlCode, parser) {
    var activeParser = parser ? parser : getParser();

    // normalise Windows line endings
    var normalizedCode = htmlCode.replace(/\r\n/g, '\n');

    // feed everything in one go and finish parsing
    activeParser.end(normalizedCode);

    // the handler now holds the parsed top-level nodes
    return wrapDocument(activeParser.handler.dom);
};
 
// expose the parser factory as a property of the exported function
parse.getParser = getParser;
 
// the module's export is the parse function itself
module.exports = parse;