Вставить HTML-элементы в строку текста, чтобы она соответствовала другой HTML-строке

Есть два файла: PDF и HTML. Оба считываются в строки — простой текст (после извлечения текста из PDF) и HTML. Теперь нужно добавить в простой текст те же HTML-теги, что и в HTML-строке, а затем сравнить обе строки, чтобы найти различия.

Последняя правка: простой пример, который в настоящее время не работает

При сравнении (diff) этих двух строк text2 показывает разницу, поскольку в неё вставлено слово "другой"

Ответы

Ответ 1

Это простое решение вашей задачи. Оно динамическое: обрабатывает любые найденные теги и сравнивает только текстовое содержимое. findDiff() находит различия и вызывает функцию обратного вызова, передавая ей в качестве параметров результирующую строку и массив отличающихся слов.

JSFiddle: https://jsfiddle.net/9svuc7om/18/

/**
 * Split extracted PDF text into an array of word tokens.
 *
 * The text is split on single space characters; the empty strings produced
 * by leading, trailing or consecutive spaces are discarded.
 *
 * @param {string} text   The PDF text to be parsed
 * @return {string[]}     The non-empty words, in their original order
 */
function parsePDFText(text) {
    // filter() is used instead of splice() inside an index loop: removing an
    // element while walking forward shifts the next one into the current slot,
    // so it is never inspected and runs of 3+ spaces leave '' tokens behind.
    return text.split(' ').filter(function (word) {
        return word !== '';
    });
}

/**
 * Return the smallest index among the arguments, ignoring "not found" markers.
 *
 * Intended for combining several indexOf() results: every argument equal to
 * -1 is skipped, and the smallest remaining value is returned.
 *
 * @param {...number} index  Results of indexOf() calls
 * @return {number}          The minimum index, or -1 if there are no
 *                           arguments or all of them are -1
 */
function findMinIndex() {
    // -1 doubles as the "nothing found yet" sentinel, so a genuine minimum of
    // 0 is returned as 0 rather than being mistaken for a falsy "no result".
    var min = -1;
    for (var i = 0, l = arguments.length; i < l; i++) {
        var candidate = arguments[i];
        // indexOf() returns -1 if not found
        if (candidate === -1) {
            continue;
        }
        if (min === -1 || candidate < min) {
            min = candidate;
        }
    }
    return min;
}

/**
 * Break an HTML string into a flat array of tokens.
 *
 * Three token types are produced:
 *   - 'tag'   : everything from '<' up to and including the next '>'
 *   - 'space' : a single space character
 *   - 'text'  : a run of characters delimited by '<', ' ' or the end of input
 *
 * Every token starts with `valid: true`. Only '<' and ' ' are treated as
 * delimiters; tabs and newlines end up inside 'text' tokens.
 *
 * @param {string} text   The HTML text to be parsed
 * @return {object[]}     Array of { type, content, valid } tokens
 */
function parseHTMLText(text) {
    var currentIndex = 0,
        tl = text.length,
        tokens = [],
        token, firstChar, endPos;
    while (currentIndex < tl) {
        // determine the next token type from its first character
        firstChar = text.charAt(currentIndex);
        if (firstChar === '<') {
            // a tag: runs up to and including the closing '>'
            endPos = text.indexOf('>', currentIndex + 1);
            // An unterminated tag swallows the rest of the input. Without this
            // guard the -1 result would move the cursor back to 0 and the loop
            // would never terminate.
            endPos = endPos === -1 ? tl : endPos + 1;
            token = {
                type: 'tag',
                content: text.slice(currentIndex, endPos),
                valid: true
            };
            currentIndex = endPos;
        } else if (firstChar === ' ') {
            // a single space
            token = {
                type: 'space',
                content: ' ',
                valid: true
            };
            currentIndex++;
        } else {
            // a character, possibly part of a word;
            // a word is delimited either by a tag or by a space
            endPos = findMinIndex(text.indexOf('<', currentIndex), text.indexOf(' ', currentIndex));
            // -1 means no delimiter is left: the word runs to the end of input
            if (endPos === -1) {
                endPos = tl;
            }
            token = {
                type: 'text',
                content: text.slice(currentIndex, endPos),
                valid: true
            };
            currentIndex = endPos;
        }
        tokens.push(token);
    }
    return tokens;
}

/**
 * Compare PDF text against HTML text and report what the HTML has in excess.
 *
 * The PDF text is the reference: its words are expected to appear in the HTML
 * in the same order. HTML words that do not line up with the next expected
 * PDF word are marked invalid, collected into `diff`, and left out of the
 * output; tags and spaces are always kept.
 *
 * @param {string} pdfText     The pdf text
 * @param {string} htmlText    The html text
 * @param {function} callback  Called once, synchronously, as
 *                             callback(output, diff), where `output` is the
 *                             HTML with the rejected words removed and `diff`
 *                             is the array of rejected words in document order
 */
function findDiff(pdfText, htmlText, callback) {
    var output = '', // the final output
        diff = [], // the array of different words
        pdfTokens = parsePDFText(pdfText),
        htmlTokens = parseHTMLText(htmlText),
        j = 0, hl = htmlTokens.length;
    // `j` is shared across iterations on purpose: each pdf word is searched
    // for only after the position where the previous one was matched
    for (var i = 0, pl = pdfTokens.length; i < pl; i++) {
        for (; j < hl; j++) {
            if (htmlTokens[j].type !== 'text') {
                // tags and spaces never take part in the comparison
                continue;
            }
            if (htmlTokens[j].content === pdfTokens[i]) {
                // match found: step past it and move on to the next pdf word
                j++;
                break;
            }
            // an html word with no counterpart at this point in the pdf text
            diff.push(htmlTokens[j].content);
            htmlTokens[j].valid = false;
        }
    }
    // Words left over after the last pdf word are extra as well; they are
    // reported in `diff` just like the ones rejected in the loop above, so
    // that every word removed from the output is accounted for.
    for (; j < hl; j++) {
        if (htmlTokens[j].type === 'text') {
            diff.push(htmlTokens[j].content);
            htmlTokens[j].valid = false;
        }
    }
    // concatenate the surviving tokens into the final string
    for (j = 0; j < hl; j++) {
        if (htmlTokens[j].valid) {
            output += htmlTokens[j].content;
        }
    }
    callback(output, diff);
}

И вы можете вызвать функцию, используя

// Compare the two strings and log the filtered HTML, then the rejected words.
findDiff(text1, text2, function (filteredHtml, rejectedWords) {
    console.log(filteredHtml);
    console.log(rejectedWords);
});

Однако в этом решении есть некоторые ограничения

Предполагается, что все содержимое в формате pdf присутствует в тексте HTML
Оно обрабатывает только символы < > и пробел; если возможны другие разделители, например символы табуляции, потребуется дополнительный код
Предполагается, что все теги корректно сформированы и что символы < и > не встречаются внутри текстового содержимого (при необходимости используйте вместо них &gt; и &lt;)
Функция является упрощенным решением и не полностью протестирована. Вы не можете рассчитывать на какую-либо гарантию, и необходимы некоторые адаптации. Я предлагаю предоставить только контент внутри body или даже более узкий диапазон вместо всего HTML файла (если это возможно в вашем случае), поскольку в содержимом HTML файла будет слишком много изменений.

Ответ 2

Самый простой способ -

// Split the sentence into words, then assemble the HTML piece by piece with
// the second word wrapped in a <span>.
var s = "Hello everyone on stackoverflow";
var s_split = s.split(' ');
var y = [
    '<html><head></head><body><div>',
    s_split[0],
    '<span>', s_split[1], '</span>',
    s_split[2], ' ', s_split[3],
    '</div></body></html>'
].join('');

Проверьте jsfiddle

Ответ 3

Почему бы просто не удалить HTML-теги и не сравнить текст?

var s = "Hello everyone on stackoverflow";

var y = "<html><head></head><body><div>Hello<span>everyone</span>on stackoverflow</div></body></html>";

// Replace every HTML tag with a space so that words separated only by a tag
// do not run together, collapse the resulting runs of whitespace into a single
// space, and trim both ends.
var z = y.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();

// If the stripped HTML carries the same text as the plain string (whitespace
// normalised the same way on both sides), adopt the HTML version.
if (z === s.replace(/\s+/g, ' ').trim()) {
    s = y;
}
alert(s);

fiddle

Ответ 4

Если вам нужно обернуть определенное слово или текст, тогда выполните поиск и замените его примерно так:

var f = "Hello everyone on stackoverflow";
var o = "Hello";
var e = "everyone on";
var s = "stackoverflow";

// Wrap the first occurrence of each phrase in its own tag. replace() returns
// the string unchanged when the phrase is absent, so no separate presence
// check is needed before each step.
var h = f
    .replace(e, "<strong>" + e + "</strong>")
    .replace(s, "<em>" + s + "</em>")
    .replace(o, "<u>" + o + "</u>");

$('body').append('<div>' + h + '</div>');

Пример здесь: https://jsfiddle.net/jwqrgsL1/1/