html 转义处理

比如要把 <span>test</span> 这段代码当做文本原样输出在页面上，如果按照正常的方式直接写进页面，它肯定会被浏览器当作标签解析，在页面上只能看到 test。那么要想达到预想的效果，应该怎么办呢？

在学习 html 标签时，知道如果要把代码原样输出，可以用标签 pre + code 处理。但这种方式对 html 标签无效：pre、code 里的标签仍然会被浏览器解析。

1. 借助 vue 框架，可以这样实现：

<template>
     <!-- Mustache interpolation escapes the value, so the markup is shown as
          literal text. v-html would insert it as real HTML instead, and only
          "test" would be visible on the page. -->
     <p>{{ html }}</p>
</template>

<script>
    export default {
        data () {
            return {
                // Markup that should appear on the page verbatim, tags included.
                html: '<span>test</span>'
            }
        }
    }
</script>

2. 借助 react 框架，可以这么实现：

import React, { PureComponent } from 'react'

class Test extends PureComponent {

  constructor (props) {

      super(props)

      this.state = {

         html: '<span>test</span>'

      }

  }

  render () {

   const { content } = this.state

    return (

      <div>

        <p dangerouslySetInnerHTML={{__html: content}} />

      </div>

    )

  }

}

3. 如果想只用一个 html 标签就实现，可能吗？答案是可能的，可以用 xmp 标签。这个标签的作用：会将内容当做字符串输出。

<!-- The content of xmp is output as a plain string, so the span tag is shown as text. -->
<xmp><span>test</span></xmp>

不过，这个标签被W3C废弃了，但各大浏览器依然支持该标签。为什么被废弃呢？被废弃，肯定有被废弃的缘由的。如果一定要用这个标签，需注意：

- 若内容中包含 xmp 的结束标签（</xmp>），会造成标签结束符混乱的问题，因此通过该方式存放模板时，内容中不能包含该结束标签；
- xmp 元素必须作为 body 的子孙元素。

4. 那么不借助xmp标签或框架的能力，如何自己实现呢？

/**
 * Escapes the characters that are significant in HTML so that the text can
 * be inserted into markup and displayed literally.
 *
 * @param {*} text Value to escape; it is converted to a string first.
 * @return {string} The escaped string, or "" when text is null or undefined.
 */
function htmlEncode (text) {
    // Replacement for every character the pattern below can match.
    // NUL maps to the empty string, i.e. it is removed from the output.
    var charEntities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\x00": "",
        "'": "&#39;",
        '"': "&quot;",
        "`": "&#96;"
    };
    var specialChars = /[\x00`><"'&]/g;

    if (text == null) {
        return "";
    }

    // Look the entity up directly: a truthiness check on the table entry
    // would treat the empty NUL replacement as "missing".
    return String(text).replace(specialChars, function (ch) {
        return charEntities[ch];
    });
}

// Escape the markup before assigning it to innerHTML, so the encoded
// string is what gets inserted into the #html_con element.
let htmlCon = '<span>test</span>';

const htmlConTarget = document.querySelector('#html_con');
htmlConTarget.innerHTML = htmlEncode(htmlCon);

5. 特地去看了 vue，react 源码，想看看它们都是怎么实现的。但只找到 react 中是怎么实现的，vue 目录太多也比较绕，没找到。

node_modules/react-dom/cjs/react-dom-server.browser.development.js 第 631 行。

// code copied and modified from escape-html

/**

 * Module variables.

 * @private

 */

// Matches the first character that needs escaping: " ' & < >
var matchHtmlRegExp = /["'&<>]/;

/**
 * Escapes special characters and HTML entities in a given html string.
 *
 * @param  {string} string HTML string to escape for later insertion
 * @return {string}
 * @public
 */
function escapeHtml(string) {
  var str = '' + string;
  var match = matchHtmlRegExp.exec(str);

  // Nothing to escape: return the stringified input untouched.
  if (!match) {
    return str;
  }

  var escape = void 0;
  var html = '';
  var index = 0;
  // Start of the pending run of characters that need no escaping.
  var lastIndex = 0;

  // Scanning starts at the first special character found by the regex.
  for (index = match.index; index < str.length; index++) {
    switch (str.charCodeAt(index)) {
      case 34:
        // "
        escape = '&quot;';
        break;
      case 38:
        // &
        escape = '&amp;';
        break;
      case 39:
        // '
        escape = '&#x27;'; // modified from escape-html; used to be '&#39'
        break;
      case 60:
        // <
        escape = '&lt;';
        break;
      case 62:
        // >
        escape = '&gt;';
        break;
      default:
        continue;
    }

    // Flush the unescaped run that precedes this special character.
    if (lastIndex !== index) {
      html += str.substring(lastIndex, index);
    }

    lastIndex = index + 1;
    html += escape;
  }

  // Append whatever plain text remains after the last escaped character.
  return lastIndex !== index ? html + str.substring(lastIndex, index) : html;
}

// end code copied and modified from escape-html

/**
 * Produces a string that is safe to emit as text, guarding against
 * scripting attacks.
 *
 * @param {*} text Text value to escape.
 * @return {string} An escaped string.
 */
function escapeTextForBrowser(text) {
  switch (typeof text) {
    case 'boolean':
    case 'number':
      // Booleans and numbers can never contain special characters, so they
      // are stringified directly; this keeps the common numeric-id case cheap.
      return String(text);
    default:
      return escapeHtml(text);
  }
}

/**
 * Escapes an attribute value and wraps it in double quotes, guarding
 * against scripting attacks.
 *
 * @param {*} value Value to escape.
 * @return {string} An escaped string.
 */
function quoteAttributeValueForBrowser(value) {
  const escapedValue = escapeTextForBrowser(value);

  return `"${escapedValue}"`;
}

这部分源码，还是比较容易看懂的。

总结：html 转义，主要就是将 "'&<> 这几个特殊字符转换为 html 实体。

延伸：预防 xss 攻击：

- 对用户输入进行转义
- 获取内容后，反转义并 domParse，过滤不安全标签及属性，进行 xss 拦截
  - 不安全标签：style、link、script、iframe、frame、img
  - 不安全属性：onerror、onclick 等

相关文章