Skip to content
文档章节

比较HTML

说明:本算法开发時部分参考过 github 上代码 https://github.com/zhoujq/htmldiff, 特别感谢。

背景

在做富文本内容编辑的时候,有时需要比较两次编辑内容差异,需要通过前端页面显示出来。

目前在比较两个字符串的差异,可以递归获取最长子串,然后组合成一个新的字符串。而富文本不同于字符串比较,其有html标签,不能按照字符串的方案进行比较。

思路

参考 字符串比较算法

  1. html 字符串比较中可以将 html 字符串进行拆分,其拆分的每一个单字或块,可以看做字符串中的一个字符,比如 <a href="" />, <p> 中文单个字等,可以获得两个数组,比如 [<a href="" />, <p>, 中, 文, 单, 字, </p>][<a href="" />, <p>, 中, 文, 单, 字, </p>]
  2. 递归比较两个序列,获取所有相同最长子串列表
  3. 新的html可以看做一系列操作,比如 删除, 插入, 复制 等操作
  4. 在两个相同子串之间,旧的区间可以看做是删除的部分,新的区间可以看做是插入的部分

特别需要处理的是 img, <table></table> 这种元素占用位置,需要特殊处理

下面的比较算法中,比较相同可以优化,比如 对于 html标签中 <div><div class="aaa"> 其实是相等的,只是显示不同,对于文字而言,没有任何变化

代码

js
/**
 * @typedef {insert|del|equal} action
 */

/**
 * html1 and html2 same parts info
 * @typedef {Object} Match
 * @property {number} html1Start
 * @property {number} html1End
 * @property {number} html2Start
 * @property {number} html2End
 * @property {string} html1PartStr
 * @property {string} html2PartStr
 */

/**
 * Operation Part
 * @typedef {Object} Operation
 * @property {action} action
 * @property {string} text
 * @property {Word[]} list
 */

/**
 * Word
 * @typedef {Object} Word
 * @property {string} value
 * @property {boolean} isOpeningTag
 * @property {boolean} isClosingTag
 * @property {boolean} isHtmlTag
 * @property {string} [tagName]
 * @property {string} [content]
 */

const cssStrMap = {
  insert: "diff-added",
  del: "diff-removed",
  equal: "diff-equal",
};

/**
 * @class
 * @classdesc HtmlDiff, a way to compare two html
 */
class HtmlDiff {
  /**
   * HtmlDiff Constructor
   * @constructor
   * @param {string} html1
   * @param {string} html2
   */
  constructor(html1, html2) {
    this.html1 = this.preParseHtml(html1);
    this.html2 = this.preParseHtml(html2);

    /**
     * @type {string}
     */
    this.html = "";

    /**
     * html1 split words
     * @type {Word[]}
     */
    this.html1Words = [];

    /**
     * html2 split words
     * @type {Word[]}
     */
    this.html2Words = [];

    /** @type {Match[]} */
    this.matches = [];

    /** @type {Operation[]} */
    this.operations = [];

    this.contents = [];

    this.build();
  }

  build() {
    this.html1Words = this.splitHtmlToWords(this.html1);
    this.html2Words = this.splitHtmlToWords(this.html2);

    this.matches = this.findCommonParts(
      0,
      this.html1Words.length - 1,
      0,
      this.html2Words.length - 1
    );

    this.computeOperations();
    this.performOperations();
  }

  /**
   * pre parse html
   * @param {string} html
   * @returns
   */
  // eslint-disable-next-line class-methods-use-this
  preParseHtml(html) {
    html = html || "<p></br></p>";
    html = html.trim();

    if (html && (html[0] !== "<" || html[html.length - 1] !== ">")) {
      html = `<p>${html}</p>`;
    }

    return html;
  }

  /**
   * splitHtmlToWords
   * @param {string} html
   */
  // eslint-disable-next-line class-methods-use-this
  splitHtmlToWords(html) {
    const wordValues = html.match(/<[^>]+>|[^<|>|\w]|\w+\b|\s+/gm);
    const tagNameReg = /\w+\b/;
    const contentReg = /(href|src)="[^"]*"/;

    const hasContentTagNames = ["img", "a", "video"];

    const words = [];

    // eslint-disable-next-line no-unused-vars
    for (const value of wordValues) {
      const isOpeningTag = this.isOpeningTag(value);
      const isClosingTag = this.isClosingTag(value);
      const isHtmlTag = isOpeningTag || isClosingTag;

      let content = value;
      let tagName = "";

      if (isHtmlTag) {
        // eslint-disable-next-line prefer-destructuring
        tagName = value.match(tagNameReg)[0];

        if (hasContentTagNames.includes(tagName)) {
          // eslint-disable-next-line prefer-destructuring
          content = value.match(contentReg)[0];
        }
      }

      const word = {
        value,
        isOpeningTag,
        isClosingTag,
        isHtmlTag,
        tagName,
        content,
      };

      words.push(word);
    }
    return words;
  }

  /**
   * findCommonParts
   * @param {number} html1Start
   * @param {number} html1End
   * @param {number} html2Start
   * @param {number} html2End
   * @param {Match[]} matches
   */
  findCommonParts(html1Start, html1End, html2Start, html2End, matches = []) {
    const match = this.findMatch(html1Start, html1End, html2Start, html2End);

    if (!match) return;

    if (html1Start < match.html1Start && html2Start < match.html2Start) {
      this.findCommonParts(
        html1Start,
        match.html1Start - 1,
        html2Start,
        match.html2Start - 1,
        matches
      );
    }

    matches.push(match);

    if (match.html1End < html1End && match.html2End < html2End) {
      this.findCommonParts(
        match.html1End + 1,
        html1End,
        match.html2End + 1,
        html2End,
        matches
      );
    }

    return matches;
  }

  /**
   * findMatch
   * @param {number} html1Start
   * @param {number} html1End
   * @param {number} html2Start
   * @param {number} html2End
   * @return {Match}
   */
  findMatch(html1Start, html1End, html2Start, html2End) {
    const m = html1End - html1Start + 1;
    const n = html2End - html2Start + 1;

    const html1WordsParts = this.html1Words.slice(html1Start, html1End + 1);
    const html2WordsParts = this.html2Words.slice(html2Start, html2End + 1);

    const dp = new Array(m + 1).fill(0).map(() => new Array(n + 1).fill(0));

    let maxLength = 0;

    let html1EndPos = 0;
    let html2EndPos = 0;

    for (let i = 1; i <= m; i++) {
      for (let j = 1; j <= n; j++) {
        // <img src="aaa" />
        // <img src="bbb" />
        // <div class="a1">
        // <div class="a2">
        // <hr /> <table>
        // </table>
        // <br /> <br/>
        // html1WordsParts[i - 1] === html2WordsParts[j - 1]
        if (this.isWordEqual(html1WordsParts[i - 1], html2WordsParts[j - 1])) {
          dp[i][j] = dp[i - 1][j - 1] + 1;

          if (dp[i][j] > maxLength) {
            maxLength = dp[i][j];

            html1EndPos = i - 1;
            html2EndPos = j - 1;
          }
        } else {
          dp[i][j] = 0;
        }
      }
    }

    if (!maxLength) return null;

    const html1StartPos = html1EndPos - maxLength + 1;
    const html2StartPos = html2EndPos - maxLength + 1;

    const html1PartStr = html1WordsParts
      .slice(html1StartPos, html1EndPos + 1)
      .map((item) => item.value)
      .join("");
    const html2PartStr = html2WordsParts
      .slice(html2StartPos, html2EndPos + 1)
      .map((item) => item.value)
      .join("");

    const match = {
      html1Start: html1Start + html1StartPos,
      html1End: html1Start + html1EndPos,
      html2Start: html2Start + html2StartPos,
      html2End: html2Start + html2EndPos,
      html1PartStr,
      html2PartStr,
    };

    return match;
  }

  /**
   * isWordEqual
   * @param {Word} word1
   * @param {Word} word2
   * @returns {boolean}
   */
  // eslint-disable-next-line class-methods-use-this
  isWordEqual(word1, word2) {
    // <div> 和 <div class="aaa"> 相同
    // <a href="a" /> 跟 <a href="b" /> 不同
    // <img src="a" /> 跟 <img src="b" /> 不同

    const hasContentTagNames = ["img", "a", "video"];

    if (word1.value == word2.value) return true;

    // 同时是html标签可以比较
    if (
      (word1.isOpeningTag && word2.isOpeningTag) ||
      (word1.isClosingTag && word2.isClosingTag)
    ) {
      const isSameTag = word1.tagName == word2.tagName;

      if (isSameTag && hasContentTagNames.includes(word1.tagName)) {
        return word1.content == word2.content;
      }

      return isSameTag;
    }

    return false;
  }

  /**
   * get operations
   */
  computeOperations() {
    let p = -1;
    let q = -1;

    // eslint-disable-next-line no-unused-vars
    for (const match of this.matches) {
      if (p < match.html1Start - 1) {
        const list = this.html1Words.slice(p + 1, match.html1Start);
        const operation1 = {
          action: "del",
          list,
          text: list.map((item) => item.value).join(""),
        };

        this.operations.push(operation1);
      }

      if (q < match.html2Start - 1) {
        const list = this.html2Words.slice(q + 1, match.html2Start);
        const operation2 = {
          action: "insert",
          list,
          text: list.map((item) => item.value).join(""),
        };

        this.operations.push(operation2);
      }

      const list = this.html2Words.slice(match.html2Start, match.html2End + 1);
      const operation = {
        action: "equal",
        list,
        text: list.map((item) => item.value).join(""),
      };
      this.operations.push(operation);

      p = match.html1End;
      q = match.html2End;
    }

    if (p < this.html1Words.length - 1) {
      const list = this.html1Words.slice(p + 1, this.html1Words.length);
      const operation1 = {
        action: "del",
        list,
        text: list.map((item) => item.value).join(""),
      };

      this.operations.push(operation1);
    }

    if (q < this.html2Words.length - 1) {
      const list = this.html2Words.slice(q + 1, this.html2Words.length);
      const operation2 = {
        action: "insert",
        list,
        text: list.map((item) => item.value).join(""),
      };

      this.operations.push(operation2);
    }
  }

  /**
   * perform operation in sequence
   */
  performOperations() {
    this.operations.forEach((operation) => {
      this.insertContent(operation);
    });

    this.html = this.contents.join("");
  }

  /**
   * insert operation string to content
   * @param {Operation} operation
   */
  insertContent(operation) {
    const { action, list } = operation;
    const cssStr = cssStrMap[action] || "";

    let text = list.map((item) => item.value).join("");
    if (["insert", "del"].includes(action)) {
      const newList = this.wrapTextList(list, cssStr, action);
      text = newList.join("");

      text = this.wrapBlockContent(text, cssStr);
    }

    this.contents.push(text);
  }

  /**
   * 查找非标签字符列表
   * @param {Word[]} list
   * @param {string} cssStr
   * @param {string} action
   * @returns {string[]}
   */
  // eslint-disable-next-line class-methods-use-this
  wrapTextList(list, cssStr, action) {
    const parts = [];
    let newList = [];
    let hasContent = false;

    // eslint-disable-next-line no-unused-vars
    for (const word of list) {
      const { value, isHtmlTag } = word;
      if (isHtmlTag) {
        if (parts.length) {
          let textPart = parts.join("");
          textPart = `<span class="${cssStr}">${textPart}</span>`;
          newList.push(textPart);
          parts.length = 0;
        }

        // 下面的标签,即使删除也要占位
        if (
          !hasContent &&
          (value.indexOf("<img") > -1 ||
            value.indexOf("<a") > -1 ||
            value.indexOf("<table")) > -1
        ) {
          hasContent = true;
        }

        newList.push(value);
        continue;
      }

      hasContent = true;
      parts.push(value);
    }

    if (parts.length) {
      let textPart = parts.join("");
      textPart = `<span class="${cssStr}">${textPart}</span>`;
      newList.push(textPart);
    }

    if (!hasContent && action === "del") {
      newList = [];
    }
    return newList;
  }

  /**
   * 包装块状结构,比如 img, table 等
   * @param {string} text
   * @param {string} cssStr
   */
  // eslint-disable-next-line class-methods-use-this
  wrapBlockContent(text, cssStr) {
    // 图片处理
    const imgReg = /<img[^>]*>/gi;
    if (imgReg.test(text)) {
      const list = text.match(imgReg);
      const set = new Set();
      // eslint-disable-next-line no-unused-vars
      for (const item of list) {
        if (set.has(item)) continue;
        const regex2 = new RegExp(item, "g");

        text = text.replace(
          regex2,
          `<div class="${cssStr} diff-block">${item}</div>`
        );
        set.add(item);
      }
    }

    // 表格处理
    const tableStartReg = /<table[^>]*>/gi;
    const tableEndReg = /<\/table>/gi;

    if (tableStartReg.test(text)) {
      const list = text.match(tableStartReg);
      const set = new Set();
      // eslint-disable-next-line no-unused-vars
      for (const item of list) {
        if (set.has(item)) continue;
        const regex2 = new RegExp(item, "g");

        text = text.replace(regex2, `<div class="${cssStr}">${item}`);
        set.add(item);
      }
    }

    if (tableEndReg.test(text)) {
      const list = text.match(tableEndReg);
      const set = new Set();
      // eslint-disable-next-line no-unused-vars
      for (const item of list) {
        if (set.has(item)) continue;
        const regex2 = new RegExp(item, "g");

        text = text.replace(regex2, `${item}</div>`);
        set.add(item);
      }
    }

    return text;
  }

  /**
   * isOpeningTag
   * @param {string} str
   * @returns
   */
  // eslint-disable-next-line class-methods-use-this
  isOpeningTag(str) {
    return /^\s*<\s*[a-zA-Z]+[^>]*[/]?\s*>\s*$/gi.test(str);
  }

  /**
   * isClosingTag
   * @param {string} str
   * @returns
   */
  // eslint-disable-next-line class-methods-use-this
  isClosingTag(str) {
    return /^\s*<\s*\/[^>]+>\s*/gi.test(str);
  }

  /**
   * isHtmlTag
   * @param {string} str
   * @returns {boolean}
   */
  isHtmlTag(str) {
    return this.isOpeningTag(str) || this.isClosingTag(str);
  }
}

export default HtmlDiff;