Rip out all of blackfriday's HTML sanitization effort

As per discussion in issue #90.
pull/117/head
Vytautas Saltenis 2014-09-19 20:30:00 +03:00 committed by Vytautas Šaltenis
parent 44a39c16c6
commit cf6bfc9d6d
5 changed files with 3 additions and 366 deletions

View File

@ -29,7 +29,6 @@ const (
HTML_SKIP_STYLE // skip embedded <style> elements
HTML_SKIP_IMAGES // skip embedded images
HTML_SKIP_LINKS // skip all links
HTML_SANITIZE_OUTPUT // strip output of everything that's not known to be safe
HTML_SAFELINK // only link to trusted protocols
HTML_NOFOLLOW_LINKS // only link with rel="nofollow"
HTML_HREF_TARGET_BLANK // add a blank target

View File

@ -425,15 +425,12 @@ func TestNofollowLink(t *testing.T) {
var tests = []string{
"[foo](http://bar.com/foo/)\n",
"<p><a href=\"http://bar.com/foo/\" rel=\"nofollow\">foo</a></p>\n",
}
doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS|HTML_SANITIZE_OUTPUT,
HtmlRendererParameters{})
// HTML_SANITIZE_OUTPUT won't allow relative links, so test that separately:
tests = []string{
"[foo](/bar/)\n",
"<p><a href=\"/bar/\">foo</a></p>\n",
}
doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS, HtmlRendererParameters{})
doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS,
HtmlRendererParameters{})
}
func TestHrefTargetBlank(t *testing.T) {

View File

@ -238,7 +238,6 @@ func MarkdownCommon(input []byte) []byte {
htmlFlags |= HTML_USE_SMARTYPANTS
htmlFlags |= HTML_SMARTYPANTS_FRACTIONS
htmlFlags |= HTML_SMARTYPANTS_LATEX_DASHES
htmlFlags |= HTML_SANITIZE_OUTPUT
renderer := HtmlRenderer(htmlFlags, "", "")
// set up the parser
@ -298,11 +297,6 @@ func Markdown(input []byte, renderer Renderer, extensions int) []byte {
first := firstPass(p, input)
second := secondPass(p, first)
if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 {
second = sanitizeHtmlSafe(second)
}
return second
}

View File

@ -1,154 +0,0 @@
package blackfriday
import (
"bufio"
"bytes"
"code.google.com/p/go.net/html"
"fmt"
"io"
)
// Whitelisted element tags, attributes on particular tags, attributes that are
// interpreted as protocols (again on particular tags), and allowed protocols.
// All four are populated once in init and read (never mutated) by
// sanitizeHtmlSafe and protocolAllowed.
var (
	whitelistTags      map[string]bool
	whitelistAttrs     map[string]map[string]bool
	protocolAttrs      map[string]map[string]bool
	whitelistProtocols [][]byte
)
// init populates the tag, attribute, and protocol whitelists consulted by
// sanitizeHtmlSafe. Anything not listed here is escaped or stripped.
func init() {
	whitelistTags = toSet([]string{
		// Headings
		"h1", "h2", "h3", "h4", "h5", "h6",
		// Block elements
		"p", "pre", "blockquote", "hr", "div", "header", "article", "aside", "footer",
		"section", "main", "mark", "figure", "figcaption",
		// Inline elements
		"a", "br", "cite", "code", "img",
		// Lists
		"ol", "ul", "li",
		// Tables
		"table", "tbody", "td", "tfoot", "th", "thead", "tr", "colgroup", "col", "caption",
		// Formatting
		"u", "i", "em", "small", "strike", "b", "strong", "sub", "sup", "q",
		// Definition lists
		"dd", "dl", "dt",
	})
	// Attributes permitted only on the given tag; all other attributes are dropped.
	whitelistAttrs = map[string]map[string]bool{
		"a":   toSet([]string{"href", "title", "rel"}),
		"img": toSet([]string{"src", "alt", "title"}),
		"td":  toSet([]string{"align"}),
		"th":  toSet([]string{"align"}),
	}
	// Attributes whose values are URLs and therefore need protocol checking
	// (see protocolAllowed / isRelativeLink in sanitizeHtmlSafe).
	protocolAttrs = map[string]map[string]bool{
		"a":   toSet([]string{"href"}),
		"img": toSet([]string{"src"}),
	}
	// URL prefixes accepted for the protocol-checked attributes above.
	whitelistProtocols = [][]byte{
		[]byte("http://"),
		[]byte("https://"),
		[]byte("ftp://"),
		[]byte("mailto:"),
	}
}
// toSet builds a membership set from keys, mapping every key to true.
// Duplicate keys collapse into a single entry.
func toSet(keys []string) map[string]bool {
	set := make(map[string]bool, len(keys))
	for _, key := range keys {
		set[key] = true
	}
	return set
}
// Sanitizes the given input by parsing it as HTML5, then whitelisting known to
// be safe elements and attributes. All other HTML is escaped, unsafe attributes
// are stripped. The input is treated as a fragment; the sanitized bytes are
// returned as a new slice. Panics on unexpected token types or tokenizer
// errors other than io.EOF.
func sanitizeHtmlSafe(input []byte) []byte {
	r := bytes.NewReader(input)
	var w bytes.Buffer
	tokenizer := html.NewTokenizer(r)
	wr := bufio.NewWriter(&w)
	// Iterate through all tokens in the input stream and sanitize them.
	for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
		switch t {
		case html.TextToken:
			// Text is written escaped.
			wr.WriteString(tokenizer.Token().String())
		case html.SelfClosingTagToken, html.StartTagToken:
			// HTML tags are escaped unless whitelisted.
			tag, hasAttributes := tokenizer.TagName()
			tagName := string(tag)
			if whitelistTags[tagName] {
				// Re-emit the tag manually so that only whitelisted
				// attributes survive.
				wr.WriteString("<")
				wr.Write(tag)
				for hasAttributes {
					var key, val []byte
					key, val, hasAttributes = tokenizer.TagAttr()
					attrName := string(key)
					// Only include whitelisted attributes for the given tagName.
					tagWhitelistedAttrs, ok := whitelistAttrs[tagName]
					if ok && tagWhitelistedAttrs[attrName] {
						// For whitelisted attributes, if it's an attribute that requires
						// protocol checking, do so and strip it if it's not known to be safe.
						tagProtocolAttrs, ok := protocolAttrs[tagName]
						if ok && tagProtocolAttrs[attrName] {
							if !isRelativeLink(val) && !protocolAllowed(val) {
								// Unsafe protocol: drop the attribute entirely.
								continue
							}
						}
						// Attribute values are re-escaped on output.
						wr.WriteByte(' ')
						wr.Write(key)
						wr.WriteString(`="`)
						wr.WriteString(html.EscapeString(string(val)))
						wr.WriteByte('"')
					}
				}
				if t == html.SelfClosingTagToken {
					wr.WriteString("/>")
				} else {
					wr.WriteString(">")
				}
			} else {
				// Non-whitelisted tag: emit the raw token text, escaped.
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}
			// Make sure that tags like <script> that switch the parser into raw mode
			// do not destroy the parse mode for following HTML text (the point is to
			// escape them anyway). For that, switch off raw mode in the tokenizer.
			tokenizer.NextIsNotRawText()
		case html.EndTagToken:
			// Whitelisted tokens can be written in raw.
			tag, _ := tokenizer.TagName()
			if whitelistTags[string(tag)] {
				wr.Write(tokenizer.Raw())
			} else {
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}
		case html.CommentToken:
			// Comments are not really expected, but harmless.
			wr.Write(tokenizer.Raw())
		case html.DoctypeToken:
			// Escape DOCTYPES, entities etc can be dangerous
			wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
		default:
			// No other token type should be produced here; consume it and fail loudly.
			tokenizer.Token()
			panic(fmt.Errorf("Unexpected token type %v", t))
		}
	}
	// ErrorToken ended the loop: io.EOF is normal end-of-input, anything
	// else is a real tokenizer failure.
	err := tokenizer.Err()
	if err != nil && err != io.EOF {
		panic(tokenizer.Err())
	}
	wr.Flush()
	return w.Bytes()
}
// protocolAllowed reports whether attr begins with one of the
// whitelisted protocol prefixes (e.g. "http://", "mailto:").
func protocolAllowed(attr []byte) bool {
	for i := range whitelistProtocols {
		if bytes.HasPrefix(attr, whitelistProtocols[i]) {
			return true
		}
	}
	return false
}

View File

@ -1,199 +0,0 @@
package blackfriday
import (
"testing"
)
func doTestsSanitize(t *testing.T, tests []string) {
doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT, HtmlRendererParameters{})
}
// TestSanitizeRawHtmlTag feeds classic raw-HTML and XSS attack vectors
// (script/style injection, event handlers, javascript: URLs, entity-encoded
// payloads) through the sanitizer. Entries alternate: input, expected output.
func TestSanitizeRawHtmlTag(t *testing.T) {
	tests := []string{
		// <style>/<script> tags are escaped, case-insensitively.
		"zz <style>p {}</style>\n",
		"<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n",
		"zz <STYLE>p {}</STYLE>\n",
		"<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n",
		"<SCRIPT>alert()</SCRIPT>\n",
		"<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n",
		"zz <SCRIPT>alert()</SCRIPT>\n",
		"<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n",
		"zz <script>alert()</script>\n",
		"<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n",
		" <script>alert()</script>\n",
		"<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n",
		"<script>alert()</script>\n",
		"&lt;script&gt;alert()&lt;/script&gt;\n",
		"<script src='foo'></script>\n",
		"&lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;\n",
		"<script src='a>b'></script>\n",
		"&lt;script src=&#39;a&gt;b&#39;&gt;&lt;/script&gt;\n",
		"zz <script src='foo'></script>\n",
		"<p>zz &lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;</p>\n",
		"zz <script src=foo></script>\n",
		"<p>zz &lt;script src=foo&gt;&lt;/script&gt;</p>\n",
		`<script><script src="http://example.com/exploit.js"></SCRIPT></script>`,
		"&lt;script&gt;&lt;script src=&#34;http://example.com/exploit.js&#34;&gt;&lt;/script&gt;&lt;/script&gt;\n",
		// XSS vectors in the style of RSnake's cheat sheet (ha.ckers.org).
		`'';!--"<XSS>=&{()}`,
		"<p>&#39;&#39;;!--&#34;&lt;xss&gt;=&amp;{()}</p>\n",
		"<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>",
		"<p>&lt;script SRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n",
		"<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>",
		"<p>&lt;script \nSRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n",
		// javascript: URLs (and obfuscated variants) are stripped from <img src>.
		`<IMG SRC="javascript:alert('XSS');">`,
		"<p><img></p>\n",
		"<IMG SRC=javascript:alert('XSS')>",
		"<p><img></p>\n",
		"<IMG SRC=JaVaScRiPt:alert('XSS')>",
		"<p><img></p>\n",
		"<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>",
		"<p><img></p>\n",
		// Event-handler attributes are dropped from whitelisted tags.
		`<a onmouseover="alert(document.cookie)">xss link</a>`,
		"<p><a>xss link</a></p>\n",
		"<a onmouseover=alert(document.cookie)>xss link</a>",
		"<p><a>xss link</a></p>\n",
		`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
		"<p><img>&lt;script&gt;alert(&#34;XSS&#34;)&lt;/script&gt;&#34;&gt;</p>\n",
		"<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>",
		"<p><img></p>\n",
		`<IMG SRC=# onmouseover="alert('xxs')">`,
		"<p><img src=\"#\"></p>\n",
		`<IMG SRC= onmouseover="alert('xxs')">`,
		"<p><img></p>\n",
		`<IMG onmouseover="alert('xxs')">`,
		"<p><img></p>\n",
		// Entity-encoded javascript: payloads (decimal, zero-padded, hex).
		"<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>",
		"<p><img></p>\n",
		"<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>",
		"<p><img></p>\n",
		"<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>",
		"<p><img></p>\n",
		`<IMG SRC="javascriptascript:alert('XSS');">`,
		"<p><img></p>\n",
		// Embedded whitespace/control characters inside the protocol.
		`<IMG SRC="jav&#x09;ascript:alert('XSS');">`,
		"<p><img></p>\n",
		`<IMG SRC="jav&#x0A;ascript:alert('XSS');">`,
		"<p><img></p>\n",
		`<IMG SRC="jav&#x0D;ascript:alert('XSS');">`,
		"<p><img></p>\n",
		`<IMG SRC=" &#14; javascript:alert('XSS');">`,
		"<p><img></p>\n",
		`<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
		"<p>&lt;script/XSS SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",
		"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>",
		"<p>&lt;body onload!#$%&amp;()*~+-_.,:;?@[/|\\]^`=alert(&#34;XSS&#34;)&gt;</p>\n",
		`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
		"<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",
		`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
		"<p>&lt;&lt;script&gt;alert(&#34;XSS&#34;);//&lt;&lt;/script&gt;</p>\n",
		// Malformed/unclosed tags must still come out escaped.
		"<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >",
		"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;</p>\n",
		"<SCRIPT SRC=//ha.ckers.org/.j>",
		"<p>&lt;script SRC=//ha.ckers.org/.j&gt;</p>\n",
		`<IMG SRC="javascript:alert('XSS')"`,
		"<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n",
		"<iframe src=http://ha.ckers.org/scriptlet.html <",
		// The hyperlink gets linkified, the <iframe> gets escaped
		"<p>&lt;iframe src=<a href=\"http://ha.ckers.org/scriptlet.html\">http://ha.ckers.org/scriptlet.html</a> &lt;</p>\n",
		// Additional token types: SelfClosing, Comment, DocType.
		"<br/>",
		"<p><br/></p>\n",
		"<!-- Comment -->",
		"<!-- Comment -->\n",
		"<!DOCTYPE test>",
		"<p>&lt;!DOCTYPE test&gt;</p>\n",
	}
	doTestsSanitize(t, tests)
}
// TestSanitizeQuoteEscaping checks that quote characters pass through the
// sanitizer with correct semantics, even if re-encoded as different entities.
// Entries alternate: input, expected output.
func TestSanitizeQuoteEscaping(t *testing.T) {
	tests := []string{
		// Make sure quotes are transported correctly (different entities or
		// unicode, but correct semantics)
		"<p>Here are some &quot;quotes&quot;.</p>\n",
		"<p>Here are some &#34;quotes&#34;.</p>\n",
		"<p>Here are some &ldquo;quotes&rdquo;.</p>\n",
		"<p>Here are some \u201Cquotes\u201D.</p>\n",
		// Within a <script> tag, content gets parsed by the raw text parsing rules.
		// This test makes sure we correctly disable those parsing rules and do not
		// escape e.g. the closing </p>.
		`Here are <script> some "quotes".`,
		"<p>Here are &lt;script&gt; some &#34;quotes&#34;.</p>\n",
		// Same test for an unknown element that does not switch into raw mode.
		`Here are <eviltag> some "quotes".`,
		"<p>Here are &lt;eviltag&gt; some &#34;quotes&#34;.</p>\n",
	}
	doTestsSanitize(t, tests)
}
// TestSanitizeSelfClosingTag verifies that whitelisted self-closing tags
// survive sanitization and that evil attributes on them are stripped.
// Entries alternate: input, expected output.
func TestSanitizeSelfClosingTag(t *testing.T) {
	var cases []string
	cases = append(cases,
		"<hr>\n", "<hr>\n",
		"<hr/>\n", "<hr/>\n",
	)
	// Make sure that evil attributes are stripped for self closing tags.
	cases = append(cases,
		"<hr onclick=\"evil()\"/>\n", "<hr/>\n",
	)
	doTestsSanitize(t, cases)
}
// TestSanitizeInlineLink verifies that unsafe link protocols are stripped
// while relative links survive sanitization. Entries alternate: input,
// expected output.
func TestSanitizeInlineLink(t *testing.T) {
	cases := []string{
		"[link](javascript:evil)",
		"<p><a>link</a></p>\n",
		"[link](/abc)",
		"<p><a href=\"/abc\">link</a></p>\n",
	}
	doTestsSanitize(t, cases)
}