fix(hashHTMLBlock): fix issue with html breaking markdown parsing

Closes #220
2024-03-22 13:30:55 +08:00 · 2016-01-01 23:33:33 +00:00 · 2016-01-01 23:33:33 +00:00 · 2746949d7d
commit 2746949d7d
parent 1011c0f7ac
12 changed files with 299 additions and 250 deletions
--- a/dist/showdown.js
+++ b/dist/showdown.js
@ -1,4 +1,4 @@
-;/*! showdown 22-12-2015 */
+;/*! showdown 01-01-2016 */
 (function(){
 /**
 * Created by Tivie on 13-07-2015.
@ -431,6 +431,18 @@ showdown.helper.isString = function isString(a) {
  return (typeof a === 'string' || a instanceof String);
 };

+/**
+ * Check if var is a function
+ * @static
+ * @param {*} a Value to test (any type is accepted)
+ * @returns {boolean} `true` only when the internal class tag of `a` is `[object Function]`
+ */
+showdown.helper.isFunction = function isFunction(a) {
+  'use strict';
+  // Object.prototype.toString copes with null/undefined without throwing, so
+  // no truthiness guard is required and the result is always a real boolean
+  // (a falsy argument used to be returned as-is instead of `false`).
+  return Object.prototype.toString.call(a) === '[object Function]';
+};
+
 /**
 * ForEach helper function
 * @static
@ -520,6 +532,38 @@ showdown.helper.escapeCharacters = function escapeCharacters(text, charsToEscape
  return text;
 };

+/**
+ * Locate balanced `left` ... `right` regions in a string, honouring nesting.
+ *
+ * @param {string} str Text to scan
+ * @param {string} left Regex source matching an opening delimiter
+ * @param {string} right Regex source matching a closing delimiter
+ * @param {string} flags Regex flags; without `g` only the first region is returned
+ * @returns {Array<{start: number, end: number}>} Offsets of each outermost
+ *   region: `start` is the index of the opener's first character, `end` is the
+ *   index just past the closer
+ */
+var rgxFindMatchPos = function (str, left, right, flags) {
+  'use strict';
+  var f = flags || '',
+    g = f.indexOf('g') > -1,
+    // the scanner is always global so that exec() advances through the string
+    x = new RegExp(left + '|' + right, 'g' + f.replace(/g/g, '')),
+    // non-global copy: only used to classify a hit as opener or closer
+    l = new RegExp(left, f.replace(/g/g, '')),
+    pos = [],
+    t, s, m, start, end;
+
+  do {
+    // t is the current nesting depth
+    t = 0;
+    while ((m = x.exec(str))) {
+      if (l.test(m[0])) {
+        // opener: remember where the outermost one sits
+        if (!(t++)) {
+          s = x.lastIndex;
+          start = s - m[0].length;
+        }
+      } else if (t) {
+        // closer: only meaningful while inside a region
+        if (!--t) {
+          end = m.index + m[0].length;
+          pos.push({start: start, end: end});
+          if (!g) {
+            return pos;
+          }
+        }
+      }
+    }
+    // the scan ended inside an unclosed region: resume just after that
+    // unmatched opener so later balanced regions are still found
+  } while (t && (x.lastIndex = s));
+
+  return pos;
+};
+
 /**
 * matchRecursiveRegExp
 *
@ -553,7 +597,7 @@ showdown.helper.matchRecursiveRegExp = function (str, left, right, flags) {
  'use strict';
  var	f = flags || '',
    g = f.indexOf('g') > -1,
-    x = new RegExp(left + '|' + right, f),
+    x = new RegExp(left + '|' + right, 'g' + f.replace(/g/g, '')),
    l = new RegExp(left, f.replace(/g/g, '')),
    a = [],
    t, s, m, start, end;
@ -582,6 +626,48 @@ showdown.helper.matchRecursiveRegExp = function (str, left, right, flags) {
  return a;
 };

+/**
+ * Replace every balanced `left` ... `right` region of a string. Regions are
+ * located with rgxFindMatchPos; text outside of them is copied unchanged.
+ *
+ * @param {string} str Text to process
+ * @param {string|function} replacement Replacement value, or a callback that
+ *   receives the matched region (delimiters included) and returns its substitute
+ * @param {string} left Regex source matching an opening delimiter
+ * @param {string} right Regex source matching a closing delimiter
+ * @param {string} flags Regex flags forwarded to rgxFindMatchPos
+ * @returns {string} The rebuilt text, or `str` itself when nothing matched
+ */
+showdown.helper.replaceRecursiveRegExp = function (str, replacement, left, right, flags) {
+  'use strict';
+
+  // normalise the replacement into a callback
+  var replacer = replacement;
+  if (!showdown.helper.isFunction(replacement)) {
+    replacer = function () {
+      return replacement;
+    };
+  }
+
+  var regions = rgxFindMatchPos(str, left, right, flags);
+  if (regions.length === 0) {
+    return str;
+  }
+
+  var pieces = [],
+      cursor = 0;
+  for (var k = 0; k < regions.length; ++k) {
+    var region = regions[k];
+    // untouched text between the previous region (or string start) and this one
+    if (region.start > cursor) {
+      pieces.push(str.slice(cursor, region.start));
+    }
+    pieces.push(replacer(str.slice(region.start, region.end)));
+    cursor = region.end;
+  }
+  // trailing text after the last region
+  if (cursor < str.length) {
+    pieces.push(str.slice(cursor));
+  }
+  return pieces.join('');
+};
+
 /**
 * POLYFILLS
 */
@ -1591,135 +1677,63 @@ showdown.subParser('hashElement', function (text, options, globals) {
 showdown.subParser('hashHTMLBlocks', function (text, options, globals) {
  'use strict';

-  // attacklab: Double up blank lines to reduce lookaround
-  text = text.replace(/\n/g, '\n\n');
+  // Names of the HTML tags treated as block-level. The loop that follows
+  // processes them one at a time, in array order.
+  // NOTE(review): 'p' is listed last, presumably so paragraphs nested inside
+  // another block are hashed together with their container - confirm.
+  var blockTags = [
+      'pre',
+      'div',
+      'h1',
+      'h2',
+      'h3',
+      'h4',
+      'h5',
+      'h6',
+      'blockquote',
+      'table',
+      'dl',
+      'ol',
+      'ul',
+      'script',
+      'noscript',
+      'form',
+      'fieldset',
+      'iframe',
+      'math',
+      'style',
+      'section',
+      'header',
+      'footer',
+      'nav',
+      'article',
+      'aside',
+      'address',
+      'audio',
+      'canvas',
+      'figure',
+      'hgroup',
+      'output',
+      'video',
+      'p'
+    ],
+    // Stores the matched HTML in globals.gHtmlBlocks and returns a ~K<n>K
+    // placeholder (n = index of the stored block) surrounded by blank lines.
+    repFunc = function (match) {
+      return '\n\n~K' + (globals.gHtmlBlocks.push(match) - 1) + 'K\n\n';
+    };

-  // Hashify HTML blocks:
-  // We only want to do this for block-level HTML tags, such as headers,
-  // lists, and tables. That's because we still want to wrap <p>s around
-  // "paragraphs" that are wrapped in non-block-level tags, such as anchors,
-  // phrase emphasis, and spans. The list of tags we're looking for is
-  // hard-coded:
-  //var block_tags_a =
-  // 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del|style|section|header|footer|nav|article|aside';
-  // var block_tags_b =
-  // 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|style|section|header|footer|nav|article|aside';
+  // For each block tag, swap every balanced <tag ...> ... </tag> region for a
+  // placeholder. The opening tag must begin a line (the `m` flag makes `^`
+  // match at line starts), preceded by at most three spaces/tabs; matching is
+  // global and case-insensitive.
+  for (var i = 0; i < blockTags.length; ++i) {
+    text = showdown.helper.replaceRecursiveRegExp(text, repFunc, '^(?: |\\t){0,3}<' + blockTags[i] + '\\b[^>]*>', '</' + blockTags[i] + '>', 'gim');
+  }

-  // First, look for nested blocks, e.g.:
-  //   <div>
-  //     <div>
-  //     tags for inner block must be indented.
-  //     </div>
-  //   </div>
-  //
-  // The outermost tags must start at the left margin for this to match, and
-  // the inner nested divs must be indented.
-  // We need to do this before the next, more liberal match, because the next
-  // match will start at the first `<div>` and stop at the first `</div>`.
-
-  // attacklab: This regex can be expensive when it fails.
-  /*
-   var text = text.replace(/
-   (						// save in $1
-   ^					// start of line  (with /m)
-   <($block_tags_a)	// start tag = $2
-   \b					// word break
-   // attacklab: hack around khtml/pcre bug...
-   [^\r]*?\n			// any number of lines, minimally matching
-   </\2>				// the matching end tag
-   [ \t]*				// trailing spaces/tabs
-   (?=\n+)				// followed by a newline
-   )						// attacklab: there are sentinel newlines at end of document
-   /gm,function(){...}};
-   */
-  text = text.replace(/^(<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del)\b[^\r]*?\n<\/\2>[ \t]*(?=\n+))/gm,
-                      showdown.subParser('hashElement')(text, options, globals));
-
-  //
-  // Now match more liberally, simply from `\n<tag>` to `</tag>\n`
-  //
-
-  /*
-   var text = text.replace(/
-   (						// save in $1
-   ^					// start of line  (with /m)
-   <($block_tags_b)	// start tag = $2
-   \b					// word break
-   // attacklab: hack around khtml/pcre bug...
-   [^\r]*?				// any number of lines, minimally matching
-   </\2>				// the matching end tag
-   [ \t]*				// trailing spaces/tabs
-   (?=\n+)				// followed by a newline
-   )						// attacklab: there are sentinel newlines at end of document
-   /gm,function(){...}};
-   */
-  text = text.replace(/^(<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|style|section|header|footer|nav|article|aside|address|audio|canvas|figure|hgroup|output|video)\b[^\r]*?<\/\2>[ \t]*(?=\n+)\n)/gm,
-                      showdown.subParser('hashElement')(text, options, globals));
-
-  // Special case just for <hr />. It was easier to make a special case than
-  // to make the other regex more complicated.
-
-  /*
-   text = text.replace(/
-   (						// save in $1
-   \n\n				// Starting after a blank line
-   [ ]{0,3}
-   (<(hr)				// start tag = $2
-   \b					// word break
-   ([^<>])*?			//
-   \/?>)				// the matching end tag
-   [ \t]*
-   (?=\n{2,})			// followed by a blank line
-   )
-   /g,showdown.subParser('hashElement')(text, options, globals));
-   */
+  // HR SPECIAL CASE
  text = text.replace(/(\n[ ]{0,3}(<(hr)\b([^<>])*?\/?>)[ \t]*(?=\n{2,}))/g,
-                      showdown.subParser('hashElement')(text, options, globals));
+    showdown.subParser('hashElement')(text, options, globals));

  // Special case for standalone HTML comments:
-
-  /*
-   text = text.replace(/
-   (						// save in $1
-   \n\n				// Starting after a blank line
-   [ ]{0,3}			// attacklab: g_tab_width - 1
-   <!
-   (--[^\r]*?--\s*)+
-   >
-   [ \t]*
-   (?=\n{2,})			// followed by a blank line
-   )
-   /g,showdown.subParser('hashElement')(text, options, globals));
-   */
-  text = text.replace(/(\n\n[ ]{0,3}<!(--[^\r]*?--\s*)+>[ \t]*(?=\n{2,}))/g,
-                      showdown.subParser('hashElement')(text, options, globals));
+  text = text.replace(/(<!(--[^\r]*?--\s*)+>[ \t]*(?=\n{2,}))/g,
+    showdown.subParser('hashElement')(text, options, globals));

  // PHP and ASP-style processor instructions (<?...?> and <%...%>)
-
-  /*
-   text = text.replace(/
-   (?:
-   \n\n				// Starting after a blank line
-   )
-   (						// save in $1
-   [ ]{0,3}			// attacklab: g_tab_width - 1
-   (?:
-   <([?%])			// $2
-   [^\r]*?
-   \2>
-   )
-   [ \t]*
-   (?=\n{2,})			// followed by a blank line
-   )
-   /g,showdown.subParser('hashElement')(text, options, globals));
-   */
  text = text.replace(/(?:\n\n)([ ]{0,3}(?:<([?%])[^\r]*?\2>)[ \t]*(?=\n{2,}))/g,
-                      showdown.subParser('hashElement')(text, options, globals));
+    showdown.subParser('hashElement')(text, options, globals));

-  // attacklab: Undo double lines (see comment at top of this function)
-  text = text.replace(/\n\n/g, '\n');
  return text;
-
 });

 /**
--- a/dist/showdown.js.map
+++ b/dist/showdown.js.map
--- a/dist/showdown.min.js
+++ b/dist/showdown.min.js
--- a/dist/showdown.min.js.map
+++ b/dist/showdown.min.js.map
--- a/src/helpers.js
+++ b/src/helpers.js
@ -17,6 +17,18 @@ showdown.helper.isString = function isString(a) {
  return (typeof a === 'string' || a instanceof String);
 };

+/**
+ * Check if var is a function
+ * @static
+ * @param {*} a Value to test (any type is accepted)
+ * @returns {boolean} `true` only when the internal class tag of `a` is `[object Function]`
+ */
+showdown.helper.isFunction = function isFunction(a) {
+  'use strict';
+  // Object.prototype.toString copes with null/undefined without throwing, so
+  // no truthiness guard is required and the result is always a real boolean
+  // (a falsy argument used to be returned as-is instead of `false`).
+  return Object.prototype.toString.call(a) === '[object Function]';
+};
+
 /**
 * ForEach helper function
 * @static
@ -106,6 +118,38 @@ showdown.helper.escapeCharacters = function escapeCharacters(text, charsToEscape
  return text;
 };

+/**
+ * Locate balanced `left` ... `right` regions in a string, honouring nesting.
+ *
+ * @param {string} str Text to scan
+ * @param {string} left Regex source matching an opening delimiter
+ * @param {string} right Regex source matching a closing delimiter
+ * @param {string} flags Regex flags; without `g` only the first region is returned
+ * @returns {Array<{start: number, end: number}>} Offsets of each outermost
+ *   region: `start` is the index of the opener's first character, `end` is the
+ *   index just past the closer
+ */
+var rgxFindMatchPos = function (str, left, right, flags) {
+  'use strict';
+  var f = flags || '',
+    g = f.indexOf('g') > -1,
+    // the scanner is always global so that exec() advances through the string
+    x = new RegExp(left + '|' + right, 'g' + f.replace(/g/g, '')),
+    // non-global copy: only used to classify a hit as opener or closer
+    l = new RegExp(left, f.replace(/g/g, '')),
+    pos = [],
+    t, s, m, start, end;
+
+  do {
+    // t is the current nesting depth
+    t = 0;
+    while ((m = x.exec(str))) {
+      if (l.test(m[0])) {
+        // opener: remember where the outermost one sits
+        if (!(t++)) {
+          s = x.lastIndex;
+          start = s - m[0].length;
+        }
+      } else if (t) {
+        // closer: only meaningful while inside a region
+        if (!--t) {
+          end = m.index + m[0].length;
+          pos.push({start: start, end: end});
+          if (!g) {
+            return pos;
+          }
+        }
+      }
+    }
+    // the scan ended inside an unclosed region: resume just after that
+    // unmatched opener so later balanced regions are still found
+  } while (t && (x.lastIndex = s));
+
+  return pos;
+};
+
 /**
 * matchRecursiveRegExp
 *
@ -139,7 +183,7 @@ showdown.helper.matchRecursiveRegExp = function (str, left, right, flags) {
  'use strict';
  var	f = flags || '',
    g = f.indexOf('g') > -1,
-    x = new RegExp(left + '|' + right, f),
+    x = new RegExp(left + '|' + right, 'g' + f.replace(/g/g, '')),
    l = new RegExp(left, f.replace(/g/g, '')),
    a = [],
    t, s, m, start, end;
@ -168,6 +212,48 @@ showdown.helper.matchRecursiveRegExp = function (str, left, right, flags) {
  return a;
 };

+/**
+ * Replace every balanced `left` ... `right` region of a string. Regions are
+ * located with rgxFindMatchPos; text outside of them is copied unchanged.
+ *
+ * @param {string} str Text to process
+ * @param {string|function} replacement Replacement value, or a callback that
+ *   receives the matched region (delimiters included) and returns its substitute
+ * @param {string} left Regex source matching an opening delimiter
+ * @param {string} right Regex source matching a closing delimiter
+ * @param {string} flags Regex flags forwarded to rgxFindMatchPos
+ * @returns {string} The rebuilt text, or `str` itself when nothing matched
+ */
+showdown.helper.replaceRecursiveRegExp = function (str, replacement, left, right, flags) {
+  'use strict';
+
+  // normalise the replacement into a callback
+  var replacer = replacement;
+  if (!showdown.helper.isFunction(replacement)) {
+    replacer = function () {
+      return replacement;
+    };
+  }
+
+  var regions = rgxFindMatchPos(str, left, right, flags);
+  if (regions.length === 0) {
+    return str;
+  }
+
+  var pieces = [],
+      cursor = 0;
+  for (var k = 0; k < regions.length; ++k) {
+    var region = regions[k];
+    // untouched text between the previous region (or string start) and this one
+    if (region.start > cursor) {
+      pieces.push(str.slice(cursor, region.start));
+    }
+    pieces.push(replacer(str.slice(region.start, region.end)));
+    cursor = region.end;
+  }
+  // trailing text after the last region
+  if (cursor < str.length) {
+    pieces.push(str.slice(cursor));
+  }
+  return pieces.join('');
+};
+
 /**
 * POLYFILLS
 */
--- a/src/subParsers/hashHTMLBlocks.js
+++ b/src/subParsers/hashHTMLBlocks.js
@ -1,133 +1,61 @@
 showdown.subParser('hashHTMLBlocks', function (text, options, globals) {
  'use strict';

-  // attacklab: Double up blank lines to reduce lookaround
-  text = text.replace(/\n/g, '\n\n');
+  // Names of the HTML tags treated as block-level. The loop that follows
+  // processes them one at a time, in array order.
+  // NOTE(review): 'p' is listed last, presumably so paragraphs nested inside
+  // another block are hashed together with their container - confirm.
+  var blockTags = [
+      'pre',
+      'div',
+      'h1',
+      'h2',
+      'h3',
+      'h4',
+      'h5',
+      'h6',
+      'blockquote',
+      'table',
+      'dl',
+      'ol',
+      'ul',
+      'script',
+      'noscript',
+      'form',
+      'fieldset',
+      'iframe',
+      'math',
+      'style',
+      'section',
+      'header',
+      'footer',
+      'nav',
+      'article',
+      'aside',
+      'address',
+      'audio',
+      'canvas',
+      'figure',
+      'hgroup',
+      'output',
+      'video',
+      'p'
+    ],
+    // Stores the matched HTML in globals.gHtmlBlocks and returns a ~K<n>K
+    // placeholder (n = index of the stored block) surrounded by blank lines.
+    repFunc = function (match) {
+      return '\n\n~K' + (globals.gHtmlBlocks.push(match) - 1) + 'K\n\n';
+    };

-  // Hashify HTML blocks:
-  // We only want to do this for block-level HTML tags, such as headers,
-  // lists, and tables. That's because we still want to wrap <p>s around
-  // "paragraphs" that are wrapped in non-block-level tags, such as anchors,
-  // phrase emphasis, and spans. The list of tags we're looking for is
-  // hard-coded:
-  //var block_tags_a =
-  // 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del|style|section|header|footer|nav|article|aside';
-  // var block_tags_b =
-  // 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|style|section|header|footer|nav|article|aside';
+  // For each block tag, swap every balanced <tag ...> ... </tag> region for a
+  // placeholder. The opening tag must begin a line (the `m` flag makes `^`
+  // match at line starts), preceded by at most three spaces/tabs; matching is
+  // global and case-insensitive.
+  for (var i = 0; i < blockTags.length; ++i) {
+    text = showdown.helper.replaceRecursiveRegExp(text, repFunc, '^(?: |\\t){0,3}<' + blockTags[i] + '\\b[^>]*>', '</' + blockTags[i] + '>', 'gim');
+  }

-  // First, look for nested blocks, e.g.:
-  //   <div>
-  //     <div>
-  //     tags for inner block must be indented.
-  //     </div>
-  //   </div>
-  //
-  // The outermost tags must start at the left margin for this to match, and
-  // the inner nested divs must be indented.
-  // We need to do this before the next, more liberal match, because the next
-  // match will start at the first `<div>` and stop at the first `</div>`.
-
-  // attacklab: This regex can be expensive when it fails.
-  /*
-   var text = text.replace(/
-   (						// save in $1
-   ^					// start of line  (with /m)
-   <($block_tags_a)	// start tag = $2
-   \b					// word break
-   // attacklab: hack around khtml/pcre bug...
-   [^\r]*?\n			// any number of lines, minimally matching
-   </\2>				// the matching end tag
-   [ \t]*				// trailing spaces/tabs
-   (?=\n+)				// followed by a newline
-   )						// attacklab: there are sentinel newlines at end of document
-   /gm,function(){...}};
-   */
-  text = text.replace(/^(<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del)\b[^\r]*?\n<\/\2>[ \t]*(?=\n+))/gm,
-                      showdown.subParser('hashElement')(text, options, globals));
-
-  //
-  // Now match more liberally, simply from `\n<tag>` to `</tag>\n`
-  //
-
-  /*
-   var text = text.replace(/
-   (						// save in $1
-   ^					// start of line  (with /m)
-   <($block_tags_b)	// start tag = $2
-   \b					// word break
-   // attacklab: hack around khtml/pcre bug...
-   [^\r]*?				// any number of lines, minimally matching
-   </\2>				// the matching end tag
-   [ \t]*				// trailing spaces/tabs
-   (?=\n+)				// followed by a newline
-   )						// attacklab: there are sentinel newlines at end of document
-   /gm,function(){...}};
-   */
-  text = text.replace(/^(<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|style|section|header|footer|nav|article|aside|address|audio|canvas|figure|hgroup|output|video)\b[^\r]*?<\/\2>[ \t]*(?=\n+)\n)/gm,
-                      showdown.subParser('hashElement')(text, options, globals));
-
-  // Special case just for <hr />. It was easier to make a special case than
-  // to make the other regex more complicated.
-
-  /*
-   text = text.replace(/
-   (						// save in $1
-   \n\n				// Starting after a blank line
-   [ ]{0,3}
-   (<(hr)				// start tag = $2
-   \b					// word break
-   ([^<>])*?			//
-   \/?>)				// the matching end tag
-   [ \t]*
-   (?=\n{2,})			// followed by a blank line
-   )
-   /g,showdown.subParser('hashElement')(text, options, globals));
-   */
+  // HR SPECIAL CASE
  text = text.replace(/(\n[ ]{0,3}(<(hr)\b([^<>])*?\/?>)[ \t]*(?=\n{2,}))/g,
-                      showdown.subParser('hashElement')(text, options, globals));
+    showdown.subParser('hashElement')(text, options, globals));

  // Special case for standalone HTML comments:
-
-  /*
-   text = text.replace(/
-   (						// save in $1
-   \n\n				// Starting after a blank line
-   [ ]{0,3}			// attacklab: g_tab_width - 1
-   <!
-   (--[^\r]*?--\s*)+
-   >
-   [ \t]*
-   (?=\n{2,})			// followed by a blank line
-   )
-   /g,showdown.subParser('hashElement')(text, options, globals));
-   */
-  text = text.replace(/(\n\n[ ]{0,3}<!(--[^\r]*?--\s*)+>[ \t]*(?=\n{2,}))/g,
-                      showdown.subParser('hashElement')(text, options, globals));
+  text = text.replace(/(<!(--[^\r]*?--\s*)+>[ \t]*(?=\n{2,}))/g,
+    showdown.subParser('hashElement')(text, options, globals));

  // PHP and ASP-style processor instructions (<?...?> and <%...%>)
-
-  /*
-   text = text.replace(/
-   (?:
-   \n\n				// Starting after a blank line
-   )
-   (						// save in $1
-   [ ]{0,3}			// attacklab: g_tab_width - 1
-   (?:
-   <([?%])			// $2
-   [^\r]*?
-   \2>
-   )
-   [ \t]*
-   (?=\n{2,})			// followed by a blank line
-   )
-   /g,showdown.subParser('hashElement')(text, options, globals));
-   */
  text = text.replace(/(?:\n\n)([ ]{0,3}(?:<([?%])[^\r]*?\2>)[ \t]*(?=\n{2,}))/g,
-                      showdown.subParser('hashElement')(text, options, globals));
+    showdown.subParser('hashElement')(text, options, globals));

-  // attacklab: Undo double lines (see comment at top of this function)
-  text = text.replace(/\n\n/g, '\n');
  return text;
-
 });
--- a/test/cases/list-with-code.html
+++ b/test/cases/list-with-code.html
@ -2,5 +2,6 @@
  <li><p>A list item with code:</p>

    <pre><code>alert('Hello world!');
-    </code></pre></li>
+    </code></pre>
+  </li>
 </ul>
--- a/test/ghost/underscore.md
+++ b/test/ghost/underscore.md
@ -63,6 +63,7 @@ Another [example][wiki] of a link
 <p><code>foo_bar_baz foo_bar_baz_bar_foo _foo_bar baz_bar_ baz_foo</code></p>

 <!-- These two cases still have bad <ems> because showdown handles them incorrectly -->
+
 <code>foo_bar_baz foo_bar_baz_bar_foo _foo_bar baz_bar_ baz_foo</code>

 ![foo_bar_baz foo_bar_baz_bar_foo _foo_bar baz_bar_ baz_foo](http://myurl.com/foo_bar_baz_bar_foo)
--- a/test/issues/#183.gh-code-blocks-within-lists-do-not-render-properly.html
+++ b/test/issues/#183.gh-code-blocks-within-lists-do-not-render-properly.html
@ -5,12 +5,14 @@
        <pre><code class="sh language-sh">$ git clone thing.git

 dfgdfg
-</code></pre></li>
+        </code></pre>
+    </li>
    <li>
        <p>I am another thing!</p>

        <pre><code class="sh language-sh">$ git clone other-thing.git

 foobar
-</code></pre></li>
-</ol>
+        </code></pre>
+    </li>
+</ol>
--- a/test/issues/#220.html-breaks-markdown-parsing.html
+++ b/test/issues/#220.html-breaks-markdown-parsing.html
@ -0,0 +1,5 @@
+<h2 id="title1">Title 1</h2>
+<div></div>
+<h1 id="title2">Title 2</h1>
+<div>
+</div>
--- a/test/issues/#220.html-breaks-markdown-parsing.md
+++ b/test/issues/#220.html-breaks-markdown-parsing.md
@ -0,0 +1,11 @@
+Title 1
+-------
+
+<div></div>
+
+
+# Title 2
+
+
+<div>
+</div>
--- a/test/karlcow/list-code.html
+++ b/test/karlcow/list-code.html
@ -3,5 +3,6 @@

 <pre><code>10 PRINT HELLO INFINITE
 20 GOTO 10
-</code></pre></li>
+</code></pre>
+</li>
 </ul>