From 9fc8c9d8660c52c8390ba80e50556577e179b984 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= <vytas@rtfb.lt>
Date: Sat, 25 Jan 2014 21:42:34 +0200
Subject: [PATCH 1/6] Fix bug with overzealous autolink processing

When the source Markdown contains an anchor tag with URL as link text
(i.e. <a href=...>http://foo.bar</a>), autolink converts that link text
into another anchor tag, which is nonsense. Detect this situation with
regexp and early exit autolink processing.
---
 inline.go      | 19 +++++++++++++++++++
 inline_test.go | 15 +++++++++++++++
 2 files changed, 34 insertions(+)
diff --git a/inline.go b/inline.go
index 0348dbf..d83404e 100644
--- a/inline.go
+++ b/inline.go
@@ -15,9 +15,14 @@ package blackfriday
 
 import (
 	"bytes"
+	"regexp"
 	"strconv"
 )
 
+var (
+	anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)
+)
+
 // Functions to parse text within a block
 // Each function returns the number of chars taken care of
 // data is the complete block being rendered
@@ -618,6 +623,20 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 		return 0
 	}
 
+	// Now a more expensive check to see if we're not inside an anchor element
+	anchorStart := offset
+	offsetFromAnchor := 0
+	for anchorStart > 0 && data[anchorStart] != '<' {
+		anchorStart--
+		offsetFromAnchor++
+	}
+
+	anchorStr := anchorRe.Find(data[anchorStart:])
+	if anchorStr != nil {
+		out.Write(anchorStr[offsetFromAnchor:])
+		return len(anchorStr) - offsetFromAnchor
+	}
+
 	// scan backward for a word boundary
 	rewind := 0
 	for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) {
diff --git a/inline_test.go b/inline_test.go
index 57bef81..7cb3cdf 100644
--- a/inline_test.go
+++ b/inline_test.go
@@ -674,6 +674,21 @@ func TestAutoLink(t *testing.T) {
 		"even a > can be escaped <http://new.com?q=\\>&etc>\n",
 		"<p>even a &gt; can be escaped <a href=\"http://new.com?q=&gt;&amp;etc\">" +
 			"http://new.com?q=&gt;&amp;etc</a></p>\n",
+
+		"<a href=\"http://fancy.com\">http://fancy.com</a>\n",
+		"<p><a href=\"http://fancy.com\">http://fancy.com</a></p>\n",
+
+		"<a href=\"http://fancy.com\">This is a link</a>\n",
+		"<p><a href=\"http://fancy.com\">This is a link</a></p>\n",
+
+		"<a href=\"http://www.fancy.com/A_B.pdf\">http://www.fancy.com/A_B.pdf</a>\n",
+		"<p><a href=\"http://www.fancy.com/A_B.pdf\">http://www.fancy.com/A_B.pdf</a></p>\n",
+
+		"(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (\n",
+		"<p>(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (</p>\n",
+
+		"(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).\n",
+		"<p>(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).</p>\n",
 	}
 	doTestsInline(t, tests)
 }

From f2d43f69a49fc75aa738627de774597242072fc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= <vytas@rtfb.lt>
Date: Sat, 25 Jan 2014 21:59:38 +0200
Subject: [PATCH 2/6] Fix bug in autolink termination

Detect the end of link when it is immediately followed by an element.
---
 inline.go      | 6 +++++-
 inline_test.go | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/inline.go b/inline.go
index d83404e..952aa63 100644
--- a/inline.go
+++ b/inline.go
@@ -654,7 +654,7 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 	}
 
 	linkEnd := 0
-	for linkEnd < len(data) && !isspace(data[linkEnd]) {
+	for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) {
 		linkEnd++
 	}
 
@@ -737,6 +737,10 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 	return linkEnd - rewind
 }
 
+func isEndOfLink(char byte) bool {
+	return isspace(char) || char == '<'
+}
+
 var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://"), []byte("/")}
 
 func isSafeLink(link []byte) bool {
diff --git a/inline_test.go b/inline_test.go
index 7cb3cdf..a4be3ba 100644
--- a/inline_test.go
+++ b/inline_test.go
@@ -689,6 +689,9 @@ func TestAutoLink(t *testing.T) {
 
 		"(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).\n",
 		"<p>(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).</p>\n",
+
+		"http://www.foo.com<br />\n",
+		"<p><a href=\"http://www.foo.com\">http://www.foo.com</a><br /></p>\n",
 	}
 	doTestsInline(t, tests)
 }

From 31a96c6ce76bbd2f8482560385986d927eb43b03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= <vytas@rtfb.lt>
Date: Sun, 26 Jan 2014 21:21:25 +0200
Subject: [PATCH 3/6] go fmt

---
 html.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/html.go b/html.go
index 596fa35..bce2809 100644
--- a/html.go
+++ b/html.go
@@ -42,7 +42,7 @@ const (
 )
 
 var (
-	tags  = []string{
+	tags = []string{
 		"b",
 		"blockquote",
 		"code",
@@ -70,10 +70,10 @@ var (
 		"strike",
 		"ul",
 	}
-	urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
+	urlRe        = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
 	tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`)
-	anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
-	imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
+	anchorClean  = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
+	imgClean     = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
 )
 
 // Html is a type that implements the Renderer interface for HTML output.

From cc0d56d0920643c43d4fb60b26d62ac10893c6dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= <vytas@rtfb.lt>
Date: Sun, 26 Jan 2014 21:27:34 +0200
Subject: [PATCH 4/6] Extract a chain of ifs into separate func

This gives a ~10% slowdown of a full test run, which is tolerable.
Switch statement is still slightly slower (~5%). Using map turned out to
be unacceptably slow (~3x slowdown).
---
 html.go | 53 ++++++++++++++++++++++-------------------------------
 1 file changed, 22 insertions(+), 31 deletions(-)

diff --git a/html.go b/html.go
index bce2809..2032b7d 100644
--- a/html.go
+++ b/html.go
@@ -127,45 +127,36 @@ func HtmlRenderer(flags int, title string, css string) Renderer {
 	}
 }
 
+// Using if statements is a bit faster than a switch statement. As the compiler
+// improves, this should be unnecessary this is only worthwhile because
+// attrEscape is the single largest CPU user in normal use.
+// Also tried using map, but that gave a ~3x slowdown.
+func escapeSingleChar(char byte) (string, bool) {
+	if char == '"' {
+		return "&quot;", true
+	}
+	if char == '&' {
+		return "&amp;", true
+	}
+	if char == '<' {
+		return "&lt;", true
+	}
+	if char == '>' {
+		return "&gt;", true
+	}
+	return "", false
+}
+
 func attrEscape(out *bytes.Buffer, src []byte) {
 	org := 0
 	for i, ch := range src {
-		// using if statements is a bit faster than a switch statement.
-		// as the compiler improves, this should be unnecessary
-		// this is only worthwhile because attrEscape is the single
-		// largest CPU user in normal use
-		if ch == '"' {
+		if entity, ok := escapeSingleChar(ch); ok {
 			if i > org {
 				// copy all the normal characters since the last escape
 				out.Write(src[org:i])
 			}
 			org = i + 1
-			out.WriteString("&quot;")
-			continue
-		}
-		if ch == '&' {
-			if i > org {
-				out.Write(src[org:i])
-			}
-			org = i + 1
-			out.WriteString("&amp;")
-			continue
-		}
-		if ch == '<' {
-			if i > org {
-				out.Write(src[org:i])
-			}
-			org = i + 1
-			out.WriteString("&lt;")
-			continue
-		}
-		if ch == '>' {
-			if i > org {
-				out.Write(src[org:i])
-			}
-			org = i + 1
-			out.WriteString("&gt;")
-			continue
+			out.WriteString(entity)
 		}
 	}
 	if org < len(src) {

From b0bdfbec4ceab22844aa766b3856aa95753ffde8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= <vytas@rtfb.lt>
Date: Sun, 26 Jan 2014 21:39:38 +0200
Subject: [PATCH 5/6] Fix bug in autolink overescaping html entities

If autolink encounters a link which already has an escaped html entity,
it would escape the ampersand again, producing things like these:
    &amp;  --> &amp;amp;
    &quot; --> &amp;quot;
This commit solves that by first looking for all entity-looking things
in the link and copying those ranges verbatim, only considering the rest
of the string for escaping.
Doesn't seem to have considerable performance impact.
The mailto: links are processed the old way.
---
 html.go        | 19 ++++++++++++++++---
 inline_test.go |  6 ++++++
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/html.go b/html.go
index 2032b7d..ac62d8b 100644
--- a/html.go
+++ b/html.go
@@ -74,6 +74,8 @@ var (
 	tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`)
 	anchorClean  = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
 	imgClean     = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
+	// TODO: improve this regexp to catch all possible entities:
+	htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`)
 )
 
 // Html is a type that implements the Renderer interface for HTML output.
@@ -164,6 +166,16 @@ func attrEscape(out *bytes.Buffer, src []byte) {
 	}
 }
 
+func entityEscapeWithSkip(out *bytes.Buffer, src []byte, skipRanges [][]int) {
+	end := 0
+	for _, rang := range skipRanges {
+		attrEscape(out, src[end:rang[0]])
+		out.Write(src[rang[0]:rang[1]])
+		end = rang[1]
+	}
+	attrEscape(out, src[end:])
+}
+
 func (options *Html) GetFlags() int {
 	return options.flags
 }
@@ -408,10 +420,11 @@ func (options *Html) Paragraph(out *bytes.Buffer, text func() bool) {
 }
 
 func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
+	skipRanges := htmlEntity.FindAllIndex(link, -1)
 	if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL {
 		// mark it but don't link it if it is not a safe link: no smartypants
 		out.WriteString("<tt>")
-		attrEscape(out, link)
+		entityEscapeWithSkip(out, link, skipRanges)
 		out.WriteString("</tt>")
 		return
 	}
@@ -420,7 +433,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
 	if kind == LINK_TYPE_EMAIL {
 		out.WriteString("mailto:")
 	}
-	attrEscape(out, link)
+	entityEscapeWithSkip(out, link, skipRanges)
 	out.WriteString("\">")
 
 	// Pretty print: if we get an email address as
@@ -432,7 +445,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
 	case bytes.HasPrefix(link, []byte("mailto:")):
 		attrEscape(out, link[len("mailto:"):])
 	default:
-		attrEscape(out, link)
+		entityEscapeWithSkip(out, link, skipRanges)
 	}
 
 	out.WriteString("</a>")
diff --git a/inline_test.go b/inline_test.go
index a4be3ba..aea6bf1 100644
--- a/inline_test.go
+++ b/inline_test.go
@@ -692,6 +692,12 @@ func TestAutoLink(t *testing.T) {
 
 		"http://www.foo.com<br />\n",
 		"<p><a href=\"http://www.foo.com\">http://www.foo.com</a><br /></p>\n",
+
+		"http://foo.com/viewtopic.php?f=18&amp;t=297",
+		"<p><a href=\"http://foo.com/viewtopic.php?f=18&amp;t=297\">http://foo.com/viewtopic.php?f=18&amp;t=297</a></p>\n",
+
+		"http://foo.com/viewtopic.php?param=&quot;18&quot;zz",
+		"<p><a href=\"http://foo.com/viewtopic.php?param=&quot;18&quot;zz\">http://foo.com/viewtopic.php?param=&quot;18&quot;zz</a></p>\n",
 	}
 	doTestsInline(t, tests)
 }

From e5937643a93cc2e494ee8ba93d03948e221333f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vytautas=20=C5=A0altenis?= <vytas@rtfb.lt>
Date: Sun, 26 Jan 2014 23:40:26 +0200
Subject: [PATCH 6/6] Fix bug in autolink with trailing semicolon

In case the link ends with escaped html entity, the semicolon is a part
of the link and should not be interpreted as punctuation.
---
 inline.go      | 15 ++++++++++++++-
 inline_test.go |  3 +++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/inline.go b/inline.go
index 952aa63..41225ce 100644
--- a/inline.go
+++ b/inline.go
@@ -617,6 +617,14 @@ func entity(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 	return end
 }
 
+func linkEndsWithEntity(data []byte, linkEnd int) bool {
+	entityRanges := htmlEntity.FindAllIndex(data[:linkEnd], -1)
+	if entityRanges != nil && entityRanges[len(entityRanges)-1][1] == linkEnd {
+		return true
+	}
+	return false
+}
+
 func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 	// quick check to rule out most false hits on ':'
 	if p.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' {
@@ -659,7 +667,12 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 	}
 
 	// Skip punctuation at the end of the link
-	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' {
+	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' {
+		linkEnd--
+	}
+
+	// But don't skip semicolon if it's a part of escaped entity:
+	if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) {
 		linkEnd--
 	}
 
diff --git a/inline_test.go b/inline_test.go
index aea6bf1..c6e8d10 100644
--- a/inline_test.go
+++ b/inline_test.go
@@ -698,6 +698,9 @@ func TestAutoLink(t *testing.T) {
 
 		"http://foo.com/viewtopic.php?param=&quot;18&quot;zz",
 		"<p><a href=\"http://foo.com/viewtopic.php?param=&quot;18&quot;zz\">http://foo.com/viewtopic.php?param=&quot;18&quot;zz</a></p>\n",
+
+		"http://foo.com/viewtopic.php?param=&quot;18&quot;",
+		"<p><a href=\"http://foo.com/viewtopic.php?param=&quot;18&quot;\">http://foo.com/viewtopic.php?param=&quot;18&quot;</a></p>\n",
 	}
 	doTestsInline(t, tests)
 }