Merge pull request #50 from rtfb/master

Better protection against JavaScript injection
2024-03-22 13:40:34 +08:00 · 2014-03-30 19:52:13 +03:00 · 2014-03-30 19:52:13 +03:00 · d643453f1e
commit d643453f1e
parent e078bb8ec3 84ee8e62f6
5 changed files with 228 additions and 48 deletions
--- a/README.md
+++ b/README.md
@ -89,6 +89,11 @@ All features of upskirt are supported, including:
    known inputs that make it crash.  If you find one, please let me
    know and send me the input that does it.

+    NOTE: "safety" in this context means *runtime safety only*. It is
+    not bulletproof against JavaScript injections, though we're working
+    on it (https://github.com/russross/blackfriday/issues/11 tracks the
+    progress).
+
 *   **Fast processing**. It is fast enough to render on-demand in
    most web applications without having to cache the output.

--- a/html.go
+++ b/html.go
@ -18,6 +18,7 @@ package blackfriday
 import (
 	"bytes"
 	"fmt"
+	"regexp"
 	"strconv"
 	"strings"
 )
@ -28,7 +29,7 @@ const (
 	HTML_SKIP_STYLE                           // skip embedded <style> elements
 	HTML_SKIP_IMAGES                          // skip embedded images
 	HTML_SKIP_LINKS                           // skip all links
-	HTML_SKIP_SCRIPT                          // skip embedded <script> elements
+	HTML_SANITIZE_OUTPUT                      // strip output of everything that's not known to be safe
 	HTML_SAFELINK                             // only link to trusted protocols
 	HTML_NOFOLLOW_LINKS                       // only link with rel="nofollow"
 	HTML_TOC                                  // generate a table of contents
@ -41,6 +42,41 @@ const (
 	HTML_SMARTYPANTS_LATEX_DASHES             // enable LaTeX-style dashes (with HTML_USE_SMARTYPANTS)
 )

+var (
+	tags  = []string{
+		"b",
+		"blockquote",
+		"code",
+		"del",
+		"dd",
+		"dl",
+		"dt",
+		"em",
+		"h1",
+		"h2",
+		"h3",
+		"h4",
+		"h5",
+		"h6",
+		"i",
+		"kbd",
+		"li",
+		"ol",
+		"p",
+		"pre",
+		"s",
+		"sup",
+		"sub",
+		"strong",
+		"strike",
+		"ul",
+	}
+	urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
+	tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`)
+	anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
+	imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
+)
+
 // Html is a type that implements the Renderer interface for HTML output.
 //
 // Do not create this directly, instead use the HtmlRenderer function.
@ -138,6 +174,10 @@ func attrEscape(out *bytes.Buffer, src []byte) {
 	}
 }

+func (options *Html) GetFlags() int {
+	return options.flags
+}
+
 func (options *Html) Header(out *bytes.Buffer, text func() bool, level int) {
 	marker := out.Len()
 	doubleSpace(out)
@ -169,32 +209,10 @@ func (options *Html) BlockHtml(out *bytes.Buffer, text []byte) {
 	}

 	doubleSpace(out)
-	if options.flags&HTML_SKIP_SCRIPT != 0 {
-		out.Write(stripTag(string(text), "script", "p"))
-	} else {
 	out.Write(text)
-	}
 	out.WriteByte('\n')
 }

-func stripTag(text, tag, newTag string) []byte {
-	closeNewTag := fmt.Sprintf("</%s>", newTag)
-	i := 0
-	for i < len(text) && text[i] != '<' {
-		i++
-	}
-	if i == len(text) {
-		return []byte(text)
-	}
-	found, end := findHtmlTagPos([]byte(text[i:]), tag)
-	closeTag := fmt.Sprintf("</%s>", tag)
-	noOpen := text
-	if found {
-		noOpen = text[0:i+1] + newTag + text[end:]
-	}
-	return []byte(strings.Replace(noOpen, closeTag, closeNewTag, -1))
-}
-
 func (options *Html) HRule(out *bytes.Buffer) {
 	doubleSpace(out)
 	out.WriteString("<hr")
@ -522,9 +540,6 @@ func (options *Html) RawHtmlTag(out *bytes.Buffer, text []byte) {
 	if options.flags&HTML_SKIP_IMAGES != 0 && isHtmlTag(text, "img") {
 		return
 	}
-	if options.flags&HTML_SKIP_SCRIPT != 0 && isHtmlTag(text, "script") {
-		return
-	}
 	out.Write(text)
 }

@ -726,6 +741,29 @@ func isHtmlTag(tag []byte, tagname string) bool {
 	return found
 }

+// Look for a character, but ignore it when it's inside any kind of quotes,
+// because the quoted text might be JavaScript
+func skipUntilCharIgnoreQuotes(html []byte, start int, char byte) int {
+	inSingleQuote := false
+	inDoubleQuote := false
+	inGraveQuote := false
+	i := start
+	for i < len(html) {
+		switch {
+		case html[i] == char && !inSingleQuote && !inDoubleQuote && !inGraveQuote:
+			return i
+		case html[i] == '\'':
+			inSingleQuote = !inSingleQuote
+		case html[i] == '"':
+			inDoubleQuote = !inDoubleQuote
+		case html[i] == '`':
+			inGraveQuote = !inGraveQuote
+		}
+		i++
+	}
+	return start
+}
+
 func findHtmlTagPos(tag []byte, tagname string) (bool, int) {
 	i := 0
 	if i < len(tag) && tag[0] != '<' {
@ -754,28 +792,54 @@ func findHtmlTagPos(tag []byte, tagname string) (bool, int) {
 		return false, -1
 	}

-	// Now look for closing '>', but ignore it when it's in any kind of quotes,
-	// it might be JavaScript
-	inSingleQuote := false
-	inDoubleQuote := false
-	inGraveQuote := false
-	for i < len(tag) {
-		switch {
-		case tag[i] == '>' && !inSingleQuote && !inDoubleQuote && !inGraveQuote:
-			return true, i
-		case tag[i] == '\'':
-			inSingleQuote = !inSingleQuote
-		case tag[i] == '"':
-			inDoubleQuote = !inDoubleQuote
-		case tag[i] == '`':
-			inGraveQuote = !inGraveQuote
-		}
-		i++
+	rightAngle := skipUntilCharIgnoreQuotes(tag, i, '>')
+	if rightAngle > i {
+		return true, rightAngle
 	}

 	return false, -1
 }

+func sanitizeHtml(html []byte) []byte {
+	var result []byte
+	for string(html) != "" {
+		skip, tag, rest := findHtmlTag(html)
+		html = rest
+		result = append(result, skip...)
+		result = append(result, sanitizeTag(tag)...)
+	}
+	return append(result, []byte("\n")...)
+}
+
+func sanitizeTag(tag []byte) []byte {
+	if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) {
+		return tag
+	} else {
+		return []byte("")
+	}
+}
+
+func skipUntilChar(text []byte, start int, char byte) int {
+	i := start
+	for i < len(text) && text[i] != char {
+		i++
+	}
+	return i
+}
+
+func findHtmlTag(html []byte) (skip, tag, rest []byte) {
+	start := skipUntilChar(html, 0, '<')
+	rightAngle := skipUntilCharIgnoreQuotes(html, start, '>')
+	if rightAngle > start {
+		skip = html[0:start]
+		tag = html[start : rightAngle+1]
+		rest = html[rightAngle+1:]
+		return
+	}
+
+	return []byte(""), []byte(""), []byte("")
+}
+
 func skipSpace(tag []byte, i int) int {
 	for i < len(tag) && isspace(tag[i]) {
 		i++
--- a/inline_test.go
+++ b/inline_test.go
@ -90,18 +90,119 @@ func TestRawHtmlTag(t *testing.T) {
 		"<p>alert()</p>\n",

 		"<script>alert()</script>\n",
-		"<p>alert()</p>\n",
+		"alert()\n",

 		"<script src='foo'></script>\n",
-		"<p></p>\n",
+		"\n",
+
+		"<script src='a>b'></script>\n",
+		"\n",

 		"zz <script src='foo'></script>\n",
 		"<p>zz </p>\n",

 		"zz <script src=foo></script>\n",
 		"<p>zz </p>\n",
+
+		`<script><script src="http://example.com/exploit.js"></SCRIPT></script>`,
+		"\n",
+
+		`'';!--"<XSS>=&{()}`,
+		"<p>'';!--&quot;=&amp;{()}</p>\n",
+
+		"<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>",
+		"<p></p>\n",
+
+		"<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>",
+		"<p></p>\n",
+
+		`<IMG SRC="javascript:alert('XSS');">`,
+		"<p></p>\n",
+
+		"<IMG SRC=javascript:alert('XSS')>",
+		"<p></p>\n",
+
+		"<IMG SRC=JaVaScRiPt:alert('XSS')>",
+		"<p></p>\n",
+
+		"<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>",
+		"<p></p>\n",
+
+		`<a onmouseover="alert(document.cookie)">xss link</a>`,
+		"<p>xss link</a></p>\n",
+
+		"<a onmouseover=alert(document.cookie)>xss link</a>",
+		"<p>xss link</a></p>\n",
+
+		// XXX: this doesn't pass yet
+		//`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
+		//"<p></p>\n",
+
+		"<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>",
+		"<p></p>\n",
+
+		`<IMG SRC=# onmouseover="alert('xxs')">`,
+		"<p></p>\n",
+
+		`<IMG SRC= onmouseover="alert('xxs')">`,
+		"<p></p>\n",
+
+		`<IMG onmouseover="alert('xxs')">`,
+		"<p></p>\n",
+
+		"<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>",
+		"<p></p>\n",
+
+		"<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>",
+		"<p></p>\n",
+
+		"<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>",
+		"<p></p>\n",
+
+		`<IMG SRC="javascriptascript:alert('XSS');">`,
+		"<p></p>\n",
+
+		`<IMG SRC="jav&#x09;ascript:alert('XSS');">`,
+		"<p></p>\n",
+
+		`<IMG SRC="jav&#x0A;ascript:alert('XSS');">`,
+		"<p></p>\n",
+
+		`<IMG SRC="jav&#x0D;ascript:alert('XSS');">`,
+		"<p></p>\n",
+
+		`<IMG SRC=" &#14;  javascript:alert('XSS');">`,
+		"<p></p>\n",
+
+		`<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
+		"<p></p>\n",
+
+		// XXX: this doesn't pass yet
+		//"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>",
+		//"\n",
+
+		`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
+		"<p></p>\n",
+
+		// XXX: this doesn't pass yet
+		//`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
+		//"",
+
+		"<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >",
+		"<p></p>\n",
+
+		"<SCRIPT SRC=//ha.ckers.org/.j>",
+		"<p></p>\n",
+
+		// XXX: this doesn't pass yet
+		//`<IMG SRC="javascript:alert('XSS')"`,
+		//"",
+
+		// XXX: this doesn't pass yet
+		//"<iframe src=http://ha.ckers.org/scriptlet.html <",
+		//"",
 	}
-	doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SKIP_SCRIPT)
+	doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT)
 }

 func TestEmphasis(t *testing.T) {
--- a/latex.go
+++ b/latex.go
@ -34,6 +34,10 @@ func LatexRenderer(flags int) Renderer {
 	return &Latex{}
 }

+func (options *Latex) GetFlags() int {
+	return 0
+}
+
 // render code chunks using verbatim, or listings if we have a language
 func (options *Latex) BlockCode(out *bytes.Buffer, text []byte, lang string) {
 	if lang == "" {
--- a/markdown.go
+++ b/markdown.go
@ -165,6 +165,8 @@ type Renderer interface {
 	// Header and footer
 	DocumentHeader(out *bytes.Buffer)
 	DocumentFooter(out *bytes.Buffer)
+
+	GetFlags() int
 }

 // Callback functions for inline parsing. One such function is defined
@ -231,7 +233,7 @@ func MarkdownCommon(input []byte) []byte {
 	htmlFlags |= HTML_USE_SMARTYPANTS
 	htmlFlags |= HTML_SMARTYPANTS_FRACTIONS
 	htmlFlags |= HTML_SMARTYPANTS_LATEX_DASHES
-	htmlFlags |= HTML_SKIP_SCRIPT
+	htmlFlags |= HTML_SANITIZE_OUTPUT
 	renderer := HtmlRenderer(htmlFlags, "", "")

 	// set up the parser
@ -291,6 +293,10 @@ func Markdown(input []byte, renderer Renderer, extensions int) []byte {
 	first := firstPass(p, input)
 	second := secondPass(p, first)

+	if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 {
+		second = sanitizeHtml(second)
+	}
+
 	return second
 }