Merge pull request #117 from rtfb/remove-sanitization

Remove sanitization
Vytautas Šaltenis 2014-09-20 14:54:26 +03:00
commit 64fbfbbadf
6 changed files with 28 additions and 370 deletions

View File

@@ -48,6 +48,28 @@ feature set, use this instead:
output := blackfriday.MarkdownCommon(input)
### Sanitize untrusted content
Blackfriday itself does nothing to protect against malicious content. If you are
dealing with user-supplied markdown, we recommend running blackfriday's output
through an HTML sanitizer such as
[Bluemonday](https://github.com/microcosm-cc/bluemonday).
Here's an example of simple usage of blackfriday together with bluemonday:
``` go
import (
"github.com/microcosm-cc/bluemonday"
"github.com/russross/blackfriday"
)
// ...
unsafe := blackfriday.MarkdownCommon(input)
html := bluemonday.UGCPolicy().SanitizeBytes(unsafe)
```
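For readers who want to run this end to end, here is a self-contained sketch of the same pipeline; the `main` wrapper, the hard-coded sample input, and the printing are illustrative additions on top of the README snippet above:

``` go
package main

import (
	"fmt"

	"github.com/microcosm-cc/bluemonday"
	"github.com/russross/blackfriday"
)

func main() {
	// Sample untrusted input; in a real application this would come from a user.
	input := []byte("Hello, <script>alert('xss')</script> *world*!")

	// Render markdown first, then sanitize the resulting HTML.
	unsafe := blackfriday.MarkdownCommon(input)
	html := bluemonday.UGCPolicy().SanitizeBytes(unsafe)

	fmt.Println(string(html))
}
```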
### Custom options
If you want to customize the set of options, first get a renderer
(currently either the HTML or LaTeX output engines), then use it to
call the more general `Markdown` function. For examples, see the
@@ -93,10 +115,9 @@ All features of upskirt are supported, including:
known inputs that make it crash. If you find one, please let me
know and send me the input that does it.
NOTE: "safety" in this context means *runtime safety only*. It is
not bullet proof against JavaScript injections, though we're working
on it (https://github.com/russross/blackfriday/issues/11 tracks the
progress).
NOTE: "safety" in this context means *runtime safety only*. In order to
protect yourself against JavaScript injection in untrusted content, see
[this example](https://github.com/russross/blackfriday#sanitize-untrusted-content).
* **Fast processing**. It is fast enough to render on-demand in
most web applications without having to cache the output.

View File

@@ -29,7 +29,6 @@ const (
HTML_SKIP_STYLE // skip embedded <style> elements
HTML_SKIP_IMAGES // skip embedded images
HTML_SKIP_LINKS // skip all links
HTML_SANITIZE_OUTPUT // strip output of everything that's not known to be safe
HTML_SAFELINK // only link to trusted protocols
HTML_NOFOLLOW_LINKS // only link with rel="nofollow"
HTML_HREF_TARGET_BLANK // add a blank target

View File

@@ -425,15 +425,12 @@ func TestNofollowLink(t *testing.T) {
var tests = []string{
"[foo](http://bar.com/foo/)\n",
"<p><a href=\"http://bar.com/foo/\" rel=\"nofollow\">foo</a></p>\n",
}
doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS|HTML_SANITIZE_OUTPUT,
HtmlRendererParameters{})
// HTML_SANITIZE_OUTPUT won't allow relative links, so test that separately:
tests = []string{
"[foo](/bar/)\n",
"<p><a href=\"/bar/\">foo</a></p>\n",
}
doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS, HtmlRendererParameters{})
doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS,
HtmlRendererParameters{})
}
func TestHrefTargetBlank(t *testing.T) {

View File

@@ -238,7 +238,6 @@ func MarkdownCommon(input []byte) []byte {
htmlFlags |= HTML_USE_SMARTYPANTS
htmlFlags |= HTML_SMARTYPANTS_FRACTIONS
htmlFlags |= HTML_SMARTYPANTS_LATEX_DASHES
htmlFlags |= HTML_SANITIZE_OUTPUT
renderer := HtmlRenderer(htmlFlags, "", "")
// set up the parser
@@ -298,11 +297,6 @@ func Markdown(input []byte, renderer Renderer, extensions int) []byte {
first := firstPass(p, input)
second := secondPass(p, first)
if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 {
second = sanitizeHtmlSafe(second)
}
return second
}
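Since `MarkdownCommon` no longer appends a sanitization pass, callers that need the "Custom options" path from the README build a renderer and call `Markdown` directly, then sanitize the result themselves if the input is untrusted. A minimal caller-side sketch under that assumption (the specific flags and extensions below are illustrative choices, not the library's defaults):

``` go
package main

import (
	"fmt"

	"github.com/russross/blackfriday"
)

func main() {
	input := []byte("# Title\n\nSome *markdown* text.")

	// Illustrative flag and extension choices; pick whatever your application needs.
	htmlFlags := blackfriday.HTML_USE_SMARTYPANTS | blackfriday.HTML_SAFELINK
	renderer := blackfriday.HtmlRenderer(htmlFlags, "", "")

	extensions := blackfriday.EXTENSION_FENCED_CODE | blackfriday.EXTENSION_TABLES
	output := blackfriday.Markdown(input, renderer, extensions)

	// With HTML_SANITIZE_OUTPUT removed, sanitizing untrusted output is now the
	// caller's responsibility (e.g. via bluemonday, as shown in the README).
	fmt.Println(string(output))
}
```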

View File

@@ -1,154 +0,0 @@
package blackfriday
import (
"bufio"
"bytes"
"code.google.com/p/go.net/html"
"fmt"
"io"
)
// Whitelisted element tags, attributes on particular tags, attributes that are
// interpreted as protocols (again on particular tags), and allowed protocols.
var (
whitelistTags map[string]bool
whitelistAttrs map[string]map[string]bool
protocolAttrs map[string]map[string]bool
whitelistProtocols [][]byte
)
func init() {
whitelistTags = toSet([]string{
// Headings
"h1", "h2", "h3", "h4", "h5", "h6",
// Block elements
"p", "pre", "blockquote", "hr", "div", "header", "article", "aside", "footer",
"section", "main", "mark", "figure", "figcaption",
// Inline elements
"a", "br", "cite", "code", "img",
// Lists
"ol", "ul", "li",
// Tables
"table", "tbody", "td", "tfoot", "th", "thead", "tr", "colgroup", "col", "caption",
// Formatting
"u", "i", "em", "small", "strike", "b", "strong", "sub", "sup", "q",
// Definition lists
"dd", "dl", "dt",
})
whitelistAttrs = map[string]map[string]bool{
"a": toSet([]string{"href", "title", "rel"}),
"img": toSet([]string{"src", "alt", "title"}),
"td": toSet([]string{"align"}),
"th": toSet([]string{"align"}),
}
protocolAttrs = map[string]map[string]bool{
"a": toSet([]string{"href"}),
"img": toSet([]string{"src"}),
}
whitelistProtocols = [][]byte{
[]byte("http://"),
[]byte("https://"),
[]byte("ftp://"),
[]byte("mailto:"),
}
}
func toSet(keys []string) map[string]bool {
m := make(map[string]bool, len(keys))
for _, k := range keys {
m[k] = true
}
return m
}
// Sanitizes the given input by parsing it as HTML5, then whitelisting elements
// and attributes known to be safe. All other HTML is escaped, and unsafe
// attributes are stripped.
func sanitizeHtmlSafe(input []byte) []byte {
r := bytes.NewReader(input)
var w bytes.Buffer
tokenizer := html.NewTokenizer(r)
wr := bufio.NewWriter(&w)
// Iterate through all tokens in the input stream and sanitize them.
for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
switch t {
case html.TextToken:
// Text is written escaped.
wr.WriteString(tokenizer.Token().String())
case html.SelfClosingTagToken, html.StartTagToken:
// HTML tags are escaped unless whitelisted.
tag, hasAttributes := tokenizer.TagName()
tagName := string(tag)
if whitelistTags[tagName] {
wr.WriteString("<")
wr.Write(tag)
for hasAttributes {
var key, val []byte
key, val, hasAttributes = tokenizer.TagAttr()
attrName := string(key)
// Only include whitelisted attributes for the given tagName.
tagWhitelistedAttrs, ok := whitelistAttrs[tagName]
if ok && tagWhitelistedAttrs[attrName] {
// For whitelisted attributes, if it's an attribute that requires
// protocol checking, do so and strip it if it's not known to be safe.
tagProtocolAttrs, ok := protocolAttrs[tagName]
if ok && tagProtocolAttrs[attrName] {
if !isRelativeLink(val) && !protocolAllowed(val) {
continue
}
}
wr.WriteByte(' ')
wr.Write(key)
wr.WriteString(`="`)
wr.WriteString(html.EscapeString(string(val)))
wr.WriteByte('"')
}
}
if t == html.SelfClosingTagToken {
wr.WriteString("/>")
} else {
wr.WriteString(">")
}
} else {
wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
}
// Make sure that tags like <script> that switch the parser into raw mode
// do not destroy the parse mode for following HTML text (the point is to
// escape them anyway). For that, switch off raw mode in the tokenizer.
tokenizer.NextIsNotRawText()
case html.EndTagToken:
// Whitelisted tokens can be written in raw.
tag, _ := tokenizer.TagName()
if whitelistTags[string(tag)] {
wr.Write(tokenizer.Raw())
} else {
wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
}
case html.CommentToken:
// Comments are not really expected, but harmless.
wr.Write(tokenizer.Raw())
case html.DoctypeToken:
// Escape DOCTYPEs; entities etc. can be dangerous
wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
default:
tokenizer.Token()
panic(fmt.Errorf("Unexpected token type %v", t))
}
}
err := tokenizer.Err()
if err != nil && err != io.EOF {
panic(tokenizer.Err())
}
wr.Flush()
return w.Bytes()
}
func protocolAllowed(attr []byte) bool {
for _, prefix := range whitelistProtocols {
if bytes.HasPrefix(attr, prefix) {
return true
}
}
return false
}
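For downstream users who relied on the deleted whitelist, roughly the same policy can be expressed with bluemonday's builder API. The sketch below is an approximation for illustration, not a drop-in replacement; the abbreviated element list and the sample input are assumptions:

``` go
package main

import (
	"fmt"

	"github.com/microcosm-cc/bluemonday"
)

func main() {
	// Rough analogue of the removed whitelist, expressed as a bluemonday policy.
	// The element list is abbreviated; extend it to match your needs.
	p := bluemonday.NewPolicy()
	p.AllowElements("h1", "h2", "h3", "h4", "h5", "h6",
		"p", "pre", "blockquote", "hr",
		"a", "br", "cite", "code", "img",
		"ol", "ul", "li",
		"em", "strong", "b", "i", "u", "sub", "sup", "q")
	p.AllowAttrs("href", "title", "rel").OnElements("a")
	p.AllowAttrs("src", "alt", "title").OnElements("img")
	p.AllowAttrs("align").OnElements("td", "th")

	// Mirror the removed protocol checks: a fixed set of schemes plus relative links.
	p.AllowURLSchemes("http", "https", "ftp", "mailto")
	p.AllowRelativeURLs(true)

	unsafe := []byte(`<a href="javascript:alert('XSS')">click</a>`)
	fmt.Println(string(p.SanitizeBytes(unsafe)))
}
```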

View File

@@ -1,199 +0,0 @@
package blackfriday
import (
"testing"
)
func doTestsSanitize(t *testing.T, tests []string) {
doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT, HtmlRendererParameters{})
}
func TestSanitizeRawHtmlTag(t *testing.T) {
tests := []string{
"zz <style>p {}</style>\n",
"<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n",
"zz <STYLE>p {}</STYLE>\n",
"<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n",
"<SCRIPT>alert()</SCRIPT>\n",
"<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n",
"zz <SCRIPT>alert()</SCRIPT>\n",
"<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n",
"zz <script>alert()</script>\n",
"<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n",
" <script>alert()</script>\n",
"<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n",
"<script>alert()</script>\n",
"&lt;script&gt;alert()&lt;/script&gt;\n",
"<script src='foo'></script>\n",
"&lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;\n",
"<script src='a>b'></script>\n",
"&lt;script src=&#39;a&gt;b&#39;&gt;&lt;/script&gt;\n",
"zz <script src='foo'></script>\n",
"<p>zz &lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;</p>\n",
"zz <script src=foo></script>\n",
"<p>zz &lt;script src=foo&gt;&lt;/script&gt;</p>\n",
`<script><script src="http://example.com/exploit.js"></SCRIPT></script>`,
"&lt;script&gt;&lt;script src=&#34;http://example.com/exploit.js&#34;&gt;&lt;/script&gt;&lt;/script&gt;\n",
`'';!--"<XSS>=&{()}`,
"<p>&#39;&#39;;!--&#34;&lt;xss&gt;=&amp;{()}</p>\n",
"<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>",
"<p>&lt;script SRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n",
"<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>",
"<p>&lt;script \nSRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n",
`<IMG SRC="javascript:alert('XSS');">`,
"<p><img></p>\n",
"<IMG SRC=javascript:alert('XSS')>",
"<p><img></p>\n",
"<IMG SRC=JaVaScRiPt:alert('XSS')>",
"<p><img></p>\n",
"<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>",
"<p><img></p>\n",
`<a onmouseover="alert(document.cookie)">xss link</a>`,
"<p><a>xss link</a></p>\n",
"<a onmouseover=alert(document.cookie)>xss link</a>",
"<p><a>xss link</a></p>\n",
`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
"<p><img>&lt;script&gt;alert(&#34;XSS&#34;)&lt;/script&gt;&#34;&gt;</p>\n",
"<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>",
"<p><img></p>\n",
`<IMG SRC=# onmouseover="alert('xxs')">`,
"<p><img src=\"#\"></p>\n",
`<IMG SRC= onmouseover="alert('xxs')">`,
"<p><img></p>\n",
`<IMG onmouseover="alert('xxs')">`,
"<p><img></p>\n",
"<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>",
"<p><img></p>\n",
"<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>",
"<p><img></p>\n",
"<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>",
"<p><img></p>\n",
`<IMG SRC="javascriptascript:alert('XSS');">`,
"<p><img></p>\n",
`<IMG SRC="jav&#x09;ascript:alert('XSS');">`,
"<p><img></p>\n",
`<IMG SRC="jav&#x0A;ascript:alert('XSS');">`,
"<p><img></p>\n",
`<IMG SRC="jav&#x0D;ascript:alert('XSS');">`,
"<p><img></p>\n",
`<IMG SRC=" &#14; javascript:alert('XSS');">`,
"<p><img></p>\n",
`<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
"<p>&lt;script/XSS SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",
"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>",
"<p>&lt;body onload!#$%&amp;()*~+-_.,:;?@[/|\\]^`=alert(&#34;XSS&#34;)&gt;</p>\n",
`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
"<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",
`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
"<p>&lt;&lt;script&gt;alert(&#34;XSS&#34;);//&lt;&lt;/script&gt;</p>\n",
"<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >",
"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;</p>\n",
"<SCRIPT SRC=//ha.ckers.org/.j>",
"<p>&lt;script SRC=//ha.ckers.org/.j&gt;</p>\n",
`<IMG SRC="javascript:alert('XSS')"`,
"<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n",
"<iframe src=http://ha.ckers.org/scriptlet.html <",
// The hyperlink gets linkified, the <iframe> gets escaped
"<p>&lt;iframe src=<a href=\"http://ha.ckers.org/scriptlet.html\">http://ha.ckers.org/scriptlet.html</a> &lt;</p>\n",
// Additional token types: SelfClosing, Comment, DocType.
"<br/>",
"<p><br/></p>\n",
"<!-- Comment -->",
"<!-- Comment -->\n",
"<!DOCTYPE test>",
"<p>&lt;!DOCTYPE test&gt;</p>\n",
}
doTestsSanitize(t, tests)
}
func TestSanitizeQuoteEscaping(t *testing.T) {
tests := []string{
// Make sure quotes are transported correctly (different entities or
// unicode, but correct semantics)
"<p>Here are some &quot;quotes&quot;.</p>\n",
"<p>Here are some &#34;quotes&#34;.</p>\n",
"<p>Here are some &ldquo;quotes&rdquo;.</p>\n",
"<p>Here are some \u201Cquotes\u201D.</p>\n",
// Within a <script> tag, content gets parsed by the raw text parsing rules.
// This test makes sure we correctly disable those parsing rules and do not
// escape e.g. the closing </p>.
`Here are <script> some "quotes".`,
"<p>Here are &lt;script&gt; some &#34;quotes&#34;.</p>\n",
// Same test for an unknown element that does not switch into raw mode.
`Here are <eviltag> some "quotes".`,
"<p>Here are &lt;eviltag&gt; some &#34;quotes&#34;.</p>\n",
}
doTestsSanitize(t, tests)
}
func TestSanitizeSelfClosingTag(t *testing.T) {
tests := []string{
"<hr>\n",
"<hr>\n",
"<hr/>\n",
"<hr/>\n",
// Make sure that evil attributes are stripped for self closing tags.
"<hr onclick=\"evil()\"/>\n",
"<hr/>\n",
}
doTestsSanitize(t, tests)
}
func TestSanitizeInlineLink(t *testing.T) {
tests := []string{
"[link](javascript:evil)",
"<p><a>link</a></p>\n",
"[link](/abc)",
"<p><a href=\"/abc\">link</a></p>\n",
}
doTestsSanitize(t, tests)
}