Avoid raw mode parsing so that raw mode tags like <script> don't cause issues.

Certain tags like <script>, but also <title> and others, switch an HTML5 parser into raw mode, which causes the rest of the HTML string to always be parsed as text, including any elements or entities that we do want to support (e.g. <p>). As we're going to escape all of the raw text elements anyway (e.g. script, style, title, xmp, noframes, and a couple of others), we can simply switch off raw text parsing by disabling it after each start tag.
2014-05-03 12:58:25 +02:00 · 2014-05-03 12:58:25 +02:00 · 11e042f6c1
parent 50b8e0370b
commit 11e042f6c1
2 changed files with 20 additions and 8 deletions
--- a/inline_test.go
+++ b/inline_test.go
@ -135,7 +135,7 @@ func TestRawHtmlTag(t *testing.T) {
 		"<p><a>xss link</a></p>\n",

 		`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
-		"<p><img>&lt;script&gt;alert(&amp;quot;XSS&amp;quot;)&lt;/script&gt;&#34;&gt;</p>\n",
+		"<p><img>&lt;script&gt;alert(&#34;XSS&#34;)&lt;/script&gt;&#34;&gt;</p>\n",

 		"<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>",
 		"<p><img></p>\n",
@ -182,18 +182,14 @@ func TestRawHtmlTag(t *testing.T) {
 		`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
 		"<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",

-		// HTML5 interprets the <script> tag contents as raw test, thus the end
-		// result has double-escaped &amp;quot;
 		`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
-		"<p>&lt;&lt;script&gt;alert(&amp;quot;XSS&amp;quot;);//&amp;lt;&lt;/script&gt;</p>\n",
+		"<p>&lt;&lt;script&gt;alert(&#34;XSS&#34;);//&lt;&lt;/script&gt;</p>\n",

-		// HTML5 parses the </p> within an unclosed <script> tag as text.
-		// Same for the following tests.
 		"<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >",
-		"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;&lt;/p&gt;\n",
+		"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;</p>\n",

 		"<SCRIPT SRC=//ha.ckers.org/.j>",
-		"<p>&lt;script SRC=//ha.ckers.org/.j&gt;&lt;/p&gt;\n",
+		"<p>&lt;script SRC=//ha.ckers.org/.j&gt;</p>\n",

 		`<IMG SRC="javascript:alert('XSS')"`,
 		"<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n",
@ -220,11 +216,23 @@ func TestRawHtmlTag(t *testing.T) {

 func TestQuoteEscaping(t *testing.T) {
 	tests := []string{
+		// Make sure quotes are transported correctly (different entities or
+		// unicode, but correct semantics)
 		"<p>Here are some &quot;quotes&quot;.</p>\n",
 		"<p>Here are some &#34;quotes&#34;.</p>\n",

 		"<p>Here are some &ldquo;quotes&rdquo;.</p>\n",
 		"<p>Here are some \u201Cquotes\u201D.</p>\n",
+
+		// Within a <script> tag, content gets parsed by the raw text parsing rules.
+		// This test makes sure we correctly disable those parsing rules and do not
+		// escape e.g. the closing </p>.
+		`Here are <script> some "quotes".`,
+		"<p>Here are &lt;script&gt; some &#34;quotes&#34;.</p>\n",
+
+		// Same test for an unknown element that does not switch into raw mode.
+		`Here are <eviltag> some "quotes".`,
+		"<p>Here are &lt;eviltag&gt; some &#34;quotes&#34;.</p>\n",
 	}
 	doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT)
 }
--- a/sanitize.go
+++ b/sanitize.go
@ -107,6 +107,10 @@ func sanitizeHtmlSafe(input []byte) []byte {
 			} else {
 				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
 			}
+			// Make sure that tags like <script> that switch the parser into raw mode
+			// do not destroy the parse mode for following HTML text (the point is to
+			// escape them anyway). For that, switch off raw mode in the tokenizer.
+			tokenizer.NextIsNotRawText()
 		case html.EndTagToken:
 			// Whitelisted tokens can be written in raw.
 			tag, _ := tokenizer.TagName()