diff --git a/html.go b/html.go index 5a928b2..41220f5 100644 --- a/html.go +++ b/html.go @@ -43,7 +43,7 @@ const ( ) var ( - tags = []string{ + tags = []string{ "b", "blockquote", "code", @@ -71,10 +71,12 @@ var ( "strike", "ul", } - urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` + urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`) - anchorClean = regexp.MustCompile(`^(]+")?\s?>|<\/a>)$`) - imgClean = regexp.MustCompile(`^(]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) + anchorClean = regexp.MustCompile(`^(]+")?\s?>|<\/a>)$`) + imgClean = regexp.MustCompile(`^(]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) + // TODO: improve this regexp to catch all possible entities: + htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`) ) // Html is a type that implements the Renderer interface for HTML output. @@ -128,45 +130,36 @@ func HtmlRenderer(flags int, title string, css string) Renderer { } } +// escapeSingleChar returns the replacement string for char and true when char +// is one of '"', '&', '<' or '>'; for any other byte it returns "" and false. +// +// Using if statements is a bit faster than a switch statement. As the compiler +// improves, this should become unnecessary; it is only worthwhile because +// attrEscape is the single largest CPU user in normal use. +// Also tried using a map, but that gave a ~3x slowdown. +// +// NOTE(review): the replacement literals below appear entity-decoded in this +// copy of the patch (presumably &quot;, &amp;, &lt; and &gt;); confirm against +// the repository. +func escapeSingleChar(char byte) (string, bool) { + if char == '"' { + return """, true + } + if char == '&' { + return "&", true + } + if char == '<' { + return "<", true + } + if char == '>' { + return ">", true + } + return "", false +} + func attrEscape(out *bytes.Buffer, src []byte) { org := 0 for i, ch := range src { - // using if statements is a bit faster than a switch statement. 
- // as the compiler improves, this should be unnecessary - // this is only worthwhile because attrEscape is the single - // largest CPU user in normal use - if ch == '"' { + if entity, ok := escapeSingleChar(ch); ok { if i > org { // copy all the normal characters since the last escape out.Write(src[org:i]) } org = i + 1 - out.WriteString(""") - continue - } - if ch == '&' { - if i > org { - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString("&") - continue - } - if ch == '<' { - if i > org { - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString("<") - continue - } - if ch == '>' { - if i > org { - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString(">") - continue + out.WriteString(entity) } } if org < len(src) { @@ -174,6 +167,16 @@ func attrEscape(out *bytes.Buffer, src []byte) { } } +// entityEscapeWithSkip writes src to out, passing the bytes outside skipRanges +// through attrEscape and copying the bytes inside each range unchanged. Each +// range is used as a [start, end) index pair into src; the loop assumes the +// ranges are ordered and non-overlapping (AutoLink passes the result of +// htmlEntity.FindAllIndex). +func entityEscapeWithSkip(out *bytes.Buffer, src []byte, skipRanges [][]int) { + end := 0 + for _, rang := range skipRanges { + attrEscape(out, src[end:rang[0]]) + out.Write(src[rang[0]:rang[1]]) + end = rang[1] + } + attrEscape(out, src[end:]) +} + func (options *Html) GetFlags() int { return options.flags } @@ -418,10 +421,11 @@ func (options *Html) Paragraph(out *bytes.Buffer, text func() bool) { } func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) { + skipRanges := htmlEntity.FindAllIndex(link, -1) if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL { // mark it but don't link it if it is not a safe link: no smartypants out.WriteString("") - attrEscape(out, link) + entityEscapeWithSkip(out, link, skipRanges) out.WriteString("") return } @@ -430,7 +434,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) { if kind == LINK_TYPE_EMAIL { out.WriteString("mailto:") } - attrEscape(out, link) + entityEscapeWithSkip(out, link, skipRanges) out.WriteString("\">") // Pretty print: if we get an email address as @@ -442,7 +446,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, 
kind int) { case bytes.HasPrefix(link, []byte("mailto:")): attrEscape(out, link[len("mailto:"):]) default: - attrEscape(out, link) + entityEscapeWithSkip(out, link, skipRanges) } out.WriteString("") diff --git a/inline.go b/inline.go index 0348dbf..41225ce 100644 --- a/inline.go +++ b/inline.go @@ -15,9 +15,14 @@ package blackfriday import ( "bytes" + "regexp" "strconv" ) +var ( + anchorRe = regexp.MustCompile(`^(]+")?\s?>` + urlRe + `<\/a>)`) +) + // Functions to parse text within a block // Each function returns the number of chars taken care of // data is the complete block being rendered @@ -612,12 +617,34 @@ func entity(p *parser, out *bytes.Buffer, data []byte, offset int) int { return end } +// linkEndsWithEntity reports whether the last htmlEntity match found in +// data[:linkEnd] ends exactly at linkEnd, i.e. whether the candidate link +// ends with text matching the htmlEntity pattern. +func linkEndsWithEntity(data []byte, linkEnd int) bool { + entityRanges := htmlEntity.FindAllIndex(data[:linkEnd], -1) + if entityRanges != nil && entityRanges[len(entityRanges)-1][1] == linkEnd { + return true + } + return false +} + func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { // quick check to rule out most false hits on ':' if p.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' { return 0 } + // Now a more expensive check for being inside an anchor element: walk back + // to the nearest '<' (or the start of data) and, if anchorRe matches from + // there, copy the remainder of that match to out unchanged. + anchorStart := offset + offsetFromAnchor := 0 + for anchorStart > 0 && data[anchorStart] != '<' { + anchorStart-- + offsetFromAnchor++ + } + + anchorStr := anchorRe.Find(data[anchorStart:]) + if anchorStr != nil { + out.Write(anchorStr[offsetFromAnchor:]) + return len(anchorStr) - offsetFromAnchor + } + // scan backward for a word boundary rewind := 0 for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) { @@ -635,12 +662,17 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { } linkEnd := 0 - for linkEnd < len(data) && !isspace(data[linkEnd]) { + for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) { linkEnd++ } // Skip punctuation at the end of the link - if (data[linkEnd-1] == '.' 
|| data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' { + if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' { + linkEnd-- + } + + // But don't strip a trailing semicolon when it is part of an escaped entity: + if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) { linkEnd-- } @@ -718,6 +750,10 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { return linkEnd - rewind } +// isEndOfLink reports whether char terminates an autolink candidate: either +// whitespace (per isspace) or '<'. +func isEndOfLink(char byte) bool { + return isspace(char) || char == '<' +} + var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://"), []byte("/")} func isSafeLink(link []byte) bool { diff --git a/inline_test.go b/inline_test.go index feba3a4..478fbe0 100644 --- a/inline_test.go +++ b/inline_test.go @@ -682,6 +682,33 @@ func TestAutoLink(t *testing.T) { "even a > can be escaped &etc>\n", "

even a > can be escaped " + "http://new.com?q=>&etc

\n", + + "http://fancy.com\n", + "

http://fancy.com

\n", + + "This is a link\n", + "

This is a link

\n", + + "http://www.fancy.com/A_B.pdf\n", + "

http://www.fancy.com/A_B.pdf

\n", + + "(http://www.fancy.com/A_B (\n", + "

(http://www.fancy.com/A_B (

\n", + + "(http://www.fancy.com/A_B (part two: http://www.fancy.com/A_B)).\n", + "

(http://www.fancy.com/A_B (part two: http://www.fancy.com/A_B)).

\n", + + "http://www.foo.com
\n", + "

http://www.foo.com

\n", + + "http://foo.com/viewtopic.php?f=18&t=297", + "

http://foo.com/viewtopic.php?f=18&t=297

\n", + + "http://foo.com/viewtopic.php?param="18"zz", + "

http://foo.com/viewtopic.php?param="18"zz

\n", + + "http://foo.com/viewtopic.php?param="18"", + "

http://foo.com/viewtopic.php?param="18"

\n", } doTestsInline(t, tests) }