Merge pull request #55 from rtfb/master

Autolink fixes
This commit is contained in:
Vytautas Šaltenis 2014-03-30 19:58:39 +03:00
commit 55bb56bf9b
3 changed files with 107 additions and 40 deletions

80
html.go
View File

@ -43,7 +43,7 @@ const (
) )
var ( var (
tags = []string{ tags = []string{
"b", "b",
"blockquote", "blockquote",
"code", "code",
@ -71,10 +71,12 @@ var (
"strike", "strike",
"ul", "ul",
} }
urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`) tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`)
anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`) anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
// TODO: improve this regexp to catch all possible entities:
htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`)
) )
// Html is a type that implements the Renderer interface for HTML output. // Html is a type that implements the Renderer interface for HTML output.
@ -128,45 +130,36 @@ func HtmlRenderer(flags int, title string, css string) Renderer {
} }
} }
// escapeSingleChar maps one byte to its HTML entity. It returns the entity
// and true for '"', '&', '<' and '>', and "" and false for every other byte.
//
// The lookup is deliberately written with plain if tests. According to the
// original author's notes, ifs were a bit faster than a switch statement and
// a map-based lookup was roughly 3x slower; this only matters because
// attrEscape is the single largest CPU user in normal use. As the compiler
// improves, the distinction should become unnecessary.
func escapeSingleChar(char byte) (string, bool) {
	entity := ""
	if char == '"' {
		entity = "&quot;"
	} else if char == '&' {
		entity = "&amp;"
	} else if char == '<' {
		entity = "&lt;"
	} else if char == '>' {
		entity = "&gt;"
	}
	// Every recognised byte yields a non-empty entity, so emptiness doubles
	// as the "not escaped" signal.
	return entity, entity != ""
}
func attrEscape(out *bytes.Buffer, src []byte) { func attrEscape(out *bytes.Buffer, src []byte) {
org := 0 org := 0
for i, ch := range src { for i, ch := range src {
// using if statements is a bit faster than a switch statement. if entity, ok := escapeSingleChar(ch); ok {
// as the compiler improves, this should be unnecessary
// this is only worthwhile because attrEscape is the single
// largest CPU user in normal use
if ch == '"' {
if i > org { if i > org {
// copy all the normal characters since the last escape // copy all the normal characters since the last escape
out.Write(src[org:i]) out.Write(src[org:i])
} }
org = i + 1 org = i + 1
out.WriteString("&quot;") out.WriteString(entity)
continue
}
if ch == '&' {
if i > org {
out.Write(src[org:i])
}
org = i + 1
out.WriteString("&amp;")
continue
}
if ch == '<' {
if i > org {
out.Write(src[org:i])
}
org = i + 1
out.WriteString("&lt;")
continue
}
if ch == '>' {
if i > org {
out.Write(src[org:i])
}
org = i + 1
out.WriteString("&gt;")
continue
} }
} }
if org < len(src) { if org < len(src) {
@ -174,6 +167,16 @@ func attrEscape(out *bytes.Buffer, src []byte) {
} }
} }
// entityEscapeWithSkip writes src to out, running the bytes that lie outside
// skipRanges through attrEscape and copying the bytes inside them verbatim.
//
// Each element of skipRanges is read as a [start, end) index pair into src.
// The loop walks the pairs in order and relies on them being ascending and
// non-overlapping, which is what the AutoLink caller gets from
// htmlEntity.FindAllIndex.
func entityEscapeWithSkip(out *bytes.Buffer, src []byte, skipRanges [][]int) {
	cursor := 0
	for _, span := range skipRanges {
		start := span[0]
		// Escape the stretch between the previous skipped span and this one.
		attrEscape(out, src[cursor:start])
		// Emit the skipped span untouched and continue right after it.
		cursor = span[1]
		out.Write(src[start:cursor])
	}
	// Whatever follows the last skipped span still needs escaping.
	attrEscape(out, src[cursor:])
}
func (options *Html) GetFlags() int { func (options *Html) GetFlags() int {
return options.flags return options.flags
} }
@ -418,10 +421,11 @@ func (options *Html) Paragraph(out *bytes.Buffer, text func() bool) {
} }
func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) { func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
skipRanges := htmlEntity.FindAllIndex(link, -1)
if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL { if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL {
// mark it but don't link it if it is not a safe link: no smartypants // mark it but don't link it if it is not a safe link: no smartypants
out.WriteString("<tt>") out.WriteString("<tt>")
attrEscape(out, link) entityEscapeWithSkip(out, link, skipRanges)
out.WriteString("</tt>") out.WriteString("</tt>")
return return
} }
@ -430,7 +434,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
if kind == LINK_TYPE_EMAIL { if kind == LINK_TYPE_EMAIL {
out.WriteString("mailto:") out.WriteString("mailto:")
} }
attrEscape(out, link) entityEscapeWithSkip(out, link, skipRanges)
out.WriteString("\">") out.WriteString("\">")
// Pretty print: if we get an email address as // Pretty print: if we get an email address as
@ -442,7 +446,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
case bytes.HasPrefix(link, []byte("mailto:")): case bytes.HasPrefix(link, []byte("mailto:")):
attrEscape(out, link[len("mailto:"):]) attrEscape(out, link[len("mailto:"):])
default: default:
attrEscape(out, link) entityEscapeWithSkip(out, link, skipRanges)
} }
out.WriteString("</a>") out.WriteString("</a>")

View File

@ -15,9 +15,14 @@ package blackfriday
import ( import (
"bytes" "bytes"
"regexp"
"strconv" "strconv"
) )
// anchorRe matches, anchored at the start of the input, a complete anchor
// element: an opening <a href="..."> tag whose href matches urlRe, an
// optional title attribute, link text that itself matches urlRe, and the
// closing </a>.
var anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)
// Functions to parse text within a block // Functions to parse text within a block
// Each function returns the number of chars taken care of // Each function returns the number of chars taken care of
// data is the complete block being rendered // data is the complete block being rendered
@ -612,12 +617,34 @@ func entity(p *parser, out *bytes.Buffer, data []byte, offset int) int {
return end return end
} }
// linkEndsWithEntity reports whether the last htmlEntity match inside
// data[:linkEnd] finishes exactly at linkEnd, i.e. whether the candidate link
// text ends with something that looks like an HTML entity.
func linkEndsWithEntity(data []byte, linkEnd int) bool {
	matches := htmlEntity.FindAllIndex(data[:linkEnd], -1)
	if len(matches) == 0 {
		// No entity anywhere in the link text.
		return false
	}
	// Only the final match can possibly touch the end of the link.
	last := matches[len(matches)-1]
	return last[1] == linkEnd
}
func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
// quick check to rule out most false hits on ':' // quick check to rule out most false hits on ':'
if p.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' { if p.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' {
return 0 return 0
} }
// Now a more expensive check to see if we're not inside an anchor element
anchorStart := offset
offsetFromAnchor := 0
for anchorStart > 0 && data[anchorStart] != '<' {
anchorStart--
offsetFromAnchor++
}
anchorStr := anchorRe.Find(data[anchorStart:])
if anchorStr != nil {
out.Write(anchorStr[offsetFromAnchor:])
return len(anchorStr) - offsetFromAnchor
}
// scan backward for a word boundary // scan backward for a word boundary
rewind := 0 rewind := 0
for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) { for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) {
@ -635,12 +662,17 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
} }
linkEnd := 0 linkEnd := 0
for linkEnd < len(data) && !isspace(data[linkEnd]) { for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) {
linkEnd++ linkEnd++
} }
// Skip punctuation at the end of the link // Skip punctuation at the end of the link
if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' { if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' {
linkEnd--
}
// But don't skip semicolon if it's a part of escaped entity:
if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) {
linkEnd-- linkEnd--
} }
@ -718,6 +750,10 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
return linkEnd - rewind return linkEnd - rewind
} }
// isEndOfLink reports whether char stops the forward scan for the end of an
// autolink: any byte accepted by isspace, or a '<'.
func isEndOfLink(char byte) bool {
	if isspace(char) {
		return true
	}
	return char == '<'
}
var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://"), []byte("/")} var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://"), []byte("/")}
func isSafeLink(link []byte) bool { func isSafeLink(link []byte) bool {

View File

@ -682,6 +682,33 @@ func TestAutoLink(t *testing.T) {
"even a > can be escaped <http://new.com?q=\\>&etc>\n", "even a > can be escaped <http://new.com?q=\\>&etc>\n",
"<p>even a &gt; can be escaped <a href=\"http://new.com?q=&gt;&amp;etc\">" + "<p>even a &gt; can be escaped <a href=\"http://new.com?q=&gt;&amp;etc\">" +
"http://new.com?q=&gt;&amp;etc</a></p>\n", "http://new.com?q=&gt;&amp;etc</a></p>\n",
"<a href=\"http://fancy.com\">http://fancy.com</a>\n",
"<p><a href=\"http://fancy.com\">http://fancy.com</a></p>\n",
"<a href=\"http://fancy.com\">This is a link</a>\n",
"<p><a href=\"http://fancy.com\">This is a link</a></p>\n",
"<a href=\"http://www.fancy.com/A_B.pdf\">http://www.fancy.com/A_B.pdf</a>\n",
"<p><a href=\"http://www.fancy.com/A_B.pdf\">http://www.fancy.com/A_B.pdf</a></p>\n",
"(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (\n",
"<p>(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (</p>\n",
"(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).\n",
"<p>(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).</p>\n",
"http://www.foo.com<br />\n",
"<p><a href=\"http://www.foo.com\">http://www.foo.com</a><br /></p>\n",
"http://foo.com/viewtopic.php?f=18&amp;t=297",
"<p><a href=\"http://foo.com/viewtopic.php?f=18&amp;t=297\">http://foo.com/viewtopic.php?f=18&amp;t=297</a></p>\n",
"http://foo.com/viewtopic.php?param=&quot;18&quot;zz",
"<p><a href=\"http://foo.com/viewtopic.php?param=&quot;18&quot;zz\">http://foo.com/viewtopic.php?param=&quot;18&quot;zz</a></p>\n",
"http://foo.com/viewtopic.php?param=&quot;18&quot;",
"<p><a href=\"http://foo.com/viewtopic.php?param=&quot;18&quot;\">http://foo.com/viewtopic.php?param=&quot;18&quot;</a></p>\n",
} }
doTestsInline(t, tests) doTestsInline(t, tests)
} }