Merge pull request #55 from rtfb/master

Autolink fixes
2024-03-22 13:40:34 +08:00 · 2014-03-30 19:58:39 +03:00 · 2014-03-30 19:58:39 +03:00 · 55bb56bf9b
commit 55bb56bf9b
parent d643453f1e e5937643a9
3 changed files with 107 additions and 40 deletions
--- a/html.go
+++ b/html.go
@ -43,7 +43,7 @@ const (
 )

 var (
-	tags  = []string{
+	tags = []string{
 		"b",
 		"blockquote",
 		"code",
@ -71,10 +71,12 @@ var (
 		"strike",
 		"ul",
 	}
-	urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
+	urlRe        = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
 	tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`)
-	anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
-	imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
+	anchorClean  = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
+	imgClean     = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
+	// TODO: improve this regexp to catch all possible entities:
+	htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`)
 )

 // Html is a type that implements the Renderer interface for HTML output.
@ -128,45 +130,36 @@ func HtmlRenderer(flags int, title string, css string) Renderer {
 	}
 }

+// Using if statements is a bit faster than a switch statement. As the compiler
+// improves, this should be unnecessary this is only worthwhile because
+// attrEscape is the single largest CPU user in normal use.
+// Also tried using map, but that gave a ~3x slowdown.
+func escapeSingleChar(char byte) (string, bool) {
+	if char == '"' {
+		return "&quot;", true
+	}
+	if char == '&' {
+		return "&amp;", true
+	}
+	if char == '<' {
+		return "&lt;", true
+	}
+	if char == '>' {
+		return "&gt;", true
+	}
+	return "", false
+}
+
 func attrEscape(out *bytes.Buffer, src []byte) {
 	org := 0
 	for i, ch := range src {
-		// using if statements is a bit faster than a switch statement.
-		// as the compiler improves, this should be unnecessary
-		// this is only worthwhile because attrEscape is the single
-		// largest CPU user in normal use
-		if ch == '"' {
+		if entity, ok := escapeSingleChar(ch); ok {
 			if i > org {
 				// copy all the normal characters since the last escape
 				out.Write(src[org:i])
 			}
 			org = i + 1
-			out.WriteString("&quot;")
-			continue
-		}
-		if ch == '&' {
-			if i > org {
-				out.Write(src[org:i])
-			}
-			org = i + 1
-			out.WriteString("&amp;")
-			continue
-		}
-		if ch == '<' {
-			if i > org {
-				out.Write(src[org:i])
-			}
-			org = i + 1
-			out.WriteString("&lt;")
-			continue
-		}
-		if ch == '>' {
-			if i > org {
-				out.Write(src[org:i])
-			}
-			org = i + 1
-			out.WriteString("&gt;")
-			continue
+			out.WriteString(entity)
 		}
 	}
 	if org < len(src) {
@ -174,6 +167,16 @@ func attrEscape(out *bytes.Buffer, src []byte) {
 	}
 }

+func entityEscapeWithSkip(out *bytes.Buffer, src []byte, skipRanges [][]int) {
+	end := 0
+	for _, rang := range skipRanges {
+		attrEscape(out, src[end:rang[0]])
+		out.Write(src[rang[0]:rang[1]])
+		end = rang[1]
+	}
+	attrEscape(out, src[end:])
+}
+
 func (options *Html) GetFlags() int {
 	return options.flags
 }
@ -418,10 +421,11 @@ func (options *Html) Paragraph(out *bytes.Buffer, text func() bool) {
 }

 func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
+	skipRanges := htmlEntity.FindAllIndex(link, -1)
 	if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL {
 		// mark it but don't link it if it is not a safe link: no smartypants
 		out.WriteString("<tt>")
-		attrEscape(out, link)
+		entityEscapeWithSkip(out, link, skipRanges)
 		out.WriteString("</tt>")
 		return
 	}
@ -430,7 +434,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
 	if kind == LINK_TYPE_EMAIL {
 		out.WriteString("mailto:")
 	}
-	attrEscape(out, link)
+	entityEscapeWithSkip(out, link, skipRanges)
 	out.WriteString("\">")

 	// Pretty print: if we get an email address as
@ -442,7 +446,7 @@ func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
 	case bytes.HasPrefix(link, []byte("mailto:")):
 		attrEscape(out, link[len("mailto:"):])
 	default:
-		attrEscape(out, link)
+		entityEscapeWithSkip(out, link, skipRanges)
 	}

 	out.WriteString("</a>")
--- a/inline.go
+++ b/inline.go
@ -15,9 +15,14 @@ package blackfriday

 import (
 	"bytes"
+	"regexp"
 	"strconv"
 )

+var (
+	anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)
+)
+
 // Functions to parse text within a block
 // Each function returns the number of chars taken care of
 // data is the complete block being rendered
@ -612,12 +617,34 @@ func entity(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 	return end
 }

+func linkEndsWithEntity(data []byte, linkEnd int) bool {
+	entityRanges := htmlEntity.FindAllIndex(data[:linkEnd], -1)
+	if entityRanges != nil && entityRanges[len(entityRanges)-1][1] == linkEnd {
+		return true
+	}
+	return false
+}
+
 func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 	// quick check to rule out most false hits on ':'
 	if p.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' {
 		return 0
 	}

+	// Now a more expensive check to see if we're not inside an anchor element
+	anchorStart := offset
+	offsetFromAnchor := 0
+	for anchorStart > 0 && data[anchorStart] != '<' {
+		anchorStart--
+		offsetFromAnchor++
+	}
+
+	anchorStr := anchorRe.Find(data[anchorStart:])
+	if anchorStr != nil {
+		out.Write(anchorStr[offsetFromAnchor:])
+		return len(anchorStr) - offsetFromAnchor
+	}
+
 	// scan backward for a word boundary
 	rewind := 0
 	for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) {
@ -635,12 +662,17 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 	}

 	linkEnd := 0
-	for linkEnd < len(data) && !isspace(data[linkEnd]) {
+	for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) {
 		linkEnd++
 	}

 	// Skip punctuation at the end of the link
-	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' {
+	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' {
+		linkEnd--
+	}
+
+	// But don't skip semicolon if it's a part of escaped entity:
+	if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) {
 		linkEnd--
 	}

@ -718,6 +750,10 @@ func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
 	return linkEnd - rewind
 }

+func isEndOfLink(char byte) bool {
+	return isspace(char) || char == '<'
+}
+
 var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://"), []byte("/")}

 func isSafeLink(link []byte) bool {
--- a/inline_test.go
+++ b/inline_test.go
@ -682,6 +682,33 @@ func TestAutoLink(t *testing.T) {
 		"even a > can be escaped <http://new.com?q=\\>&etc>\n",
 		"<p>even a &gt; can be escaped <a href=\"http://new.com?q=&gt;&amp;etc\">" +
 			"http://new.com?q=&gt;&amp;etc</a></p>\n",
+
+		"<a href=\"http://fancy.com\">http://fancy.com</a>\n",
+		"<p><a href=\"http://fancy.com\">http://fancy.com</a></p>\n",
+
+		"<a href=\"http://fancy.com\">This is a link</a>\n",
+		"<p><a href=\"http://fancy.com\">This is a link</a></p>\n",
+
+		"<a href=\"http://www.fancy.com/A_B.pdf\">http://www.fancy.com/A_B.pdf</a>\n",
+		"<p><a href=\"http://www.fancy.com/A_B.pdf\">http://www.fancy.com/A_B.pdf</a></p>\n",
+
+		"(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (\n",
+		"<p>(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (</p>\n",
+
+		"(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).\n",
+		"<p>(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).</p>\n",
+
+		"http://www.foo.com<br />\n",
+		"<p><a href=\"http://www.foo.com\">http://www.foo.com</a><br /></p>\n",
+
+		"http://foo.com/viewtopic.php?f=18&amp;t=297",
+		"<p><a href=\"http://foo.com/viewtopic.php?f=18&amp;t=297\">http://foo.com/viewtopic.php?f=18&amp;t=297</a></p>\n",
+
+		"http://foo.com/viewtopic.php?param=&quot;18&quot;zz",
+		"<p><a href=\"http://foo.com/viewtopic.php?param=&quot;18&quot;zz\">http://foo.com/viewtopic.php?param=&quot;18&quot;zz</a></p>\n",
+
+		"http://foo.com/viewtopic.php?param=&quot;18&quot;",
+		"<p><a href=\"http://foo.com/viewtopic.php?param=&quot;18&quot;\">http://foo.com/viewtopic.php?param=&quot;18&quot;</a></p>\n",
 	}
 	doTestsInline(t, tests)
 }