From 9fc8c9d8660c52c8390ba80e50556577e179b984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= Date: Sat, 25 Jan 2014 21:42:34 +0200 Subject: [PATCH 1/6] Fix bug with overzealous autolink processing When the source Markdown contains an anchor tag with URL as link text (i.e. http://foo.bar), autolink converts that link text into another anchor tag, which is nonsense. Detect this situation with regexp and early exit autolink processing. --- inline.go | 19 +++++++++++++++++++ inline_test.go | 15 +++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/inline.go b/inline.go index 0348dbf..d83404e 100644 --- a/inline.go +++ b/inline.go @@ -15,9 +15,14 @@ package blackfriday import ( "bytes" + "regexp" "strconv" ) +var ( + anchorRe = regexp.MustCompile(`^(]+")?\s?>` + urlRe + `<\/a>)`) +) + // Functions to parse text within a block // Each function returns the number of chars taken care of // data is the complete block being rendered @@ -618,6 +623,20 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { return 0 } + // Now a more expensive check to see if we're not inside an anchor element + anchorStart := offset + offsetFromAnchor := 0 + for anchorStart > 0 && data[anchorStart] != '<' { + anchorStart-- + offsetFromAnchor++ + } + + anchorStr := anchorRe.Find(data[anchorStart:]) + if anchorStr != nil { + out.Write(anchorStr[offsetFromAnchor:]) + return len(anchorStr) - offsetFromAnchor + } + // scan backward for a word boundary rewind := 0 for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) { diff --git a/inline_test.go b/inline_test.go index 57bef81..7cb3cdf 100644 --- a/inline_test.go +++ b/inline_test.go @@ -674,6 +674,21 @@ func TestAutoLink(t *testing.T) { "even a > can be escaped &etc>\n", "

even a > can be escaped " + "http://new.com?q=>&etc

\n", + + "http://fancy.com\n", + "

http://fancy.com

\n", + + "This is a link\n", + "

This is a link

\n", + + "http://www.fancy.com/A_B.pdf\n", + "

http://www.fancy.com/A_B.pdf

\n", + + "(http://www.fancy.com/A_B (\n", + "

(http://www.fancy.com/A_B (

\n", + + "(http://www.fancy.com/A_B (part two: http://www.fancy.com/A_B)).\n", + "

(http://www.fancy.com/A_B (part two: http://www.fancy.com/A_B)).

\n", } doTestsInline(t, tests) } From f2d43f69a49fc75aa738627de774597242072fc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= Date: Sat, 25 Jan 2014 21:59:38 +0200 Subject: [PATCH 2/6] Fix bug in autolink termination Detect the end of link when it is immediately followed by an element. --- inline.go | 6 +++++- inline_test.go | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/inline.go b/inline.go index d83404e..952aa63 100644 --- a/inline.go +++ b/inline.go @@ -654,7 +654,7 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { } linkEnd := 0 - for linkEnd < len(data) && !isspace(data[linkEnd]) { + for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) { linkEnd++ } @@ -737,6 +737,10 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { return linkEnd - rewind } +func isEndOfLink(char byte) bool { + return isspace(char) || char == '<' +} + var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://"), []byte("/")} func isSafeLink(link []byte) bool { diff --git a/inline_test.go b/inline_test.go index 7cb3cdf..a4be3ba 100644 --- a/inline_test.go +++ b/inline_test.go @@ -689,6 +689,9 @@ func TestAutoLink(t *testing.T) { "(http://www.fancy.com/A_B (part two: http://www.fancy.com/A_B)).\n", "

(http://www.fancy.com/A_B (part two: http://www.fancy.com/A_B)).

\n", + + "http://www.foo.com
\n", + "

http://www.foo.com

\n", } doTestsInline(t, tests) } From 31a96c6ce76bbd2f8482560385986d927eb43b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= Date: Sun, 26 Jan 2014 21:21:25 +0200 Subject: [PATCH 3/6] go fmt --- html.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/html.go b/html.go index 596fa35..bce2809 100644 --- a/html.go +++ b/html.go @@ -42,7 +42,7 @@ const ( ) var ( - tags = []string{ + tags = []string{ "b", "blockquote", "code", @@ -70,10 +70,10 @@ var ( "strike", "ul", } - urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` + urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`) - anchorClean = regexp.MustCompile(`^(]+")?\s?>|<\/a>)$`) - imgClean = regexp.MustCompile(`^(]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) + anchorClean = regexp.MustCompile(`^(]+")?\s?>|<\/a>)$`) + imgClean = regexp.MustCompile(`^(]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) ) // Html is a type that implements the Renderer interface for HTML output. From cc0d56d0920643c43d4fb60b26d62ac10893c6dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= Date: Sun, 26 Jan 2014 21:27:34 +0200 Subject: [PATCH 4/6] Extract a chain of ifs into separate func This gives a ~10% slowdown of a full test run, which is tolerable. Switch statement is still slightly slower (~5%). Using map turned out to be unacceptably slow (~3x slowdown). --- html.go | 53 ++++++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/html.go b/html.go index bce2809..2032b7d 100644 --- a/html.go +++ b/html.go @@ -127,45 +127,36 @@ func HtmlRenderer(flags int, title string, css string) Renderer { } } +// Using if statements is a bit faster than a switch statement. As the compiler +// improves, this should be unnecessary this is only worthwhile because +// attrEscape is the single largest CPU user in normal use. +// Also tried using map, but that gave a ~3x slowdown. +func escapeSingleChar(char byte) (string, bool) { + if char == '"' { + return """, true + } + if char == '&' { + return "&", true + } + if char == '<' { + return "<", true + } + if char == '>' { + return ">", true + } + return "", false +} + func attrEscape(out *bytes.Buffer, src []byte) { org := 0 for i, ch := range src { - // using if statements is a bit faster than a switch statement. - // as the compiler improves, this should be unnecessary - // this is only worthwhile because attrEscape is the single - // largest CPU user in normal use - if ch == '"' { + if entity, ok := escapeSingleChar(ch); ok { if i > org { // copy all the normal characters since the last escape out.Write(src[org:i]) } org = i + 1 - out.WriteString(""") - continue - } - if ch == '&' { - if i > org { - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString("&") - continue - } - if ch == '<' { - if i > org { - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString("<") - continue - } - if ch == '>' { - if i > org { - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString(">") - continue + out.WriteString(entity) } } if org < len(src) { From b0bdfbec4ceab22844aa766b3856aa95753ffde8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= Date: Sun, 26 Jan 2014 21:39:38 +0200 Subject: [PATCH 5/6] Fix bug in autolink overescaping html entities If autolink encounters a link which already has an escaped html entity, it would escape the ampersand again, producing things like these: & --> &amp; " --> &quot; This commit solves that by first looking for all entity-looking things in the link and copying those ranges verbatim, only considering the rest of the string for escaping. Doesn't seem to have considerable performance impact. The mailto: links are processed the old way. --- html.go | 19 ++++++++++++++++--- inline_test.go | 6 ++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/html.go b/html.go index 2032b7d..ac62d8b 100644 --- a/html.go +++ b/html.go @@ -74,6 +74,8 @@ var ( tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`) anchorClean = regexp.MustCompile(`^(]+")?\s?>|<\/a>)$`) imgClean = regexp.MustCompile(`^(]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) + // TODO: improve this regexp to catch all possible entities: + htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`) ) // Html is a type that implements the Renderer interface for HTML output. @@ -164,6 +166,16 @@ func attrEscape(out *bytes.Buffer, src []byte) { } } +func entityEscapeWithSkip(out *bytes.Buffer, src []byte, skipRanges [][]int) { + end := 0 + for _, rang := range skipRanges { + attrEscape(out, src[end:rang[0]]) + out.Write(src[rang[0]:rang[1]]) + end = rang[1] + } + attrEscape(out, src[end:]) +} + func (options *Html) GetFlags() int { return options.flags } @@ -408,10 +420,11 @@ func (options *Html) Paragraph(out *bytes.Buffer, text func() bool) { } func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) { + skipRanges := htmlEntity.FindAllIndex(link, -1) if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL { // mark it but don't link it if it is not a safe link: no smartypants out.WriteString("") - attrEscape(out, link) + entityEscapeWithSkip(out, link, skipRanges) out.WriteString("") return } @@ -420,7 +433,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) { if kind == LINK_TYPE_EMAIL { out.WriteString("mailto:") } - attrEscape(out, link) + entityEscapeWithSkip(out, link, skipRanges) out.WriteString("\">") // Pretty print: if we get an email address as @@ -432,7 +445,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) { case bytes.HasPrefix(link, []byte("mailto:")): attrEscape(out, link[len("mailto:"):]) default: - attrEscape(out, link) + entityEscapeWithSkip(out, link, skipRanges) } out.WriteString("") diff --git a/inline_test.go b/inline_test.go index a4be3ba..aea6bf1 100644 --- a/inline_test.go +++ b/inline_test.go @@ -692,6 +692,12 @@ func TestAutoLink(t *testing.T) { "http://www.foo.com
\n", "

http://www.foo.com

\n", + + "http://foo.com/viewtopic.php?f=18&t=297", + "

http://foo.com/viewtopic.php?f=18&t=297

\n", + + "http://foo.com/viewtopic.php?param="18"zz", + "

http://foo.com/viewtopic.php?param="18"zz

\n", } doTestsInline(t, tests) } From e5937643a93cc2e494ee8ba93d03948e221333f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= Date: Sun, 26 Jan 2014 23:40:26 +0200 Subject: [PATCH 6/6] Fix bug in autolink with trailing semicolon In case the link ends with escaped html entity, the semicolon is a part of the link and should not be interpreted as punctuation. --- inline.go | 15 ++++++++++++++- inline_test.go | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/inline.go b/inline.go index 952aa63..41225ce 100644 --- a/inline.go +++ b/inline.go @@ -617,6 +617,14 @@ func entity(p *parser, out *bytes.Buffer, data []byte, offset int) int { return end } +func linkEndsWithEntity(data []byte, linkEnd int) bool { + entityRanges := htmlEntity.FindAllIndex(data[:linkEnd], -1) + if entityRanges != nil && entityRanges[len(entityRanges)-1][1] == linkEnd { + return true + } + return false +} + func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { // quick check to rule out most false hits on ':' if p.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' { @@ -659,7 +667,12 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { } // Skip punctuation at the end of the link - if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' { + if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' { + linkEnd-- + } + + // But don't skip semicolon if it's a part of escaped entity: + if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) { linkEnd-- } diff --git a/inline_test.go b/inline_test.go index aea6bf1..c6e8d10 100644 --- a/inline_test.go +++ b/inline_test.go @@ -698,6 +698,9 @@ func TestAutoLink(t *testing.T) { "http://foo.com/viewtopic.php?param="18"zz", "

http://foo.com/viewtopic.php?param="18"zz

\n", + + "http://foo.com/viewtopic.php?param="18"", + "

http://foo.com/viewtopic.php?param="18"

\n", } doTestsInline(t, tests) }