From f16de74d8b530942abf12fe4d3fee8e03c07cbb6 Mon Sep 17 00:00:00 2001 From: Russ Ross Date: Fri, 27 May 2011 13:38:10 -0600 Subject: [PATCH] fixing link parsing --- markdown.go | 284 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 184 insertions(+), 100 deletions(-) diff --git a/markdown.go b/markdown.go index ac1e1e0..0403e3c 100644 --- a/markdown.go +++ b/markdown.go @@ -364,7 +364,7 @@ func char_emphasis(ob *bytes.Buffer, rndr *render, data []byte, offset int) int if len(data) > 2 && data[1] != c { // whitespace cannot follow an opening emphasis; // strikethrough only takes two characters '~~' - if c == '~' || unicode.IsSpace(int(data[1])) { + if c == '~' || isspace(data[1]) { return 0 } if ret = parse_emph1(ob, rndr, data[1:], c); ret == 0 { @@ -375,7 +375,7 @@ func char_emphasis(ob *bytes.Buffer, rndr *render, data []byte, offset int) int } if len(data) > 3 && data[1] == c && data[2] != c { - if unicode.IsSpace(int(data[2])) { + if isspace(data[2]) { return 0 } if ret = parse_emph2(ob, rndr, data[2:], c); ret == 0 { @@ -386,7 +386,7 @@ func char_emphasis(ob *bytes.Buffer, rndr *render, data []byte, offset int) int } if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c { - if c == '~' || unicode.IsSpace(int(data[3])) { + if c == '~' || isspace(data[3]) { return 0 } if ret = parse_emph3(ob, rndr, data, 3, c); ret == 0 { @@ -507,6 +507,9 @@ func char_link(ob *bytes.Buffer, rndr *render, data []byte, offset int) int { case data[i] == ']': level-- + if level <= 0 { + i-- // compensate for extra i++ in for loop + } } } @@ -518,8 +521,8 @@ func char_link(ob *bytes.Buffer, rndr *render, data []byte, offset int) int { i++ // skip any amount of whitespace or newline - // (this is much more laxist than original markdown syntax) - for i < len(data) && unicode.IsSpace(int(data[i])) { + // (this is much more lax than original markdown syntax) + for i < len(data) && isspace(data[i]) { i++ } @@ -529,7 +532,7 @@ func char_link(ob *bytes.Buffer, rndr *render, data []byte, offset int) int { // skipping initial whitespace i++ - for i < len(data) && unicode.IsSpace(int(data[i])) { + for i < len(data) && isspace(data[i]) { i++ } @@ -575,7 +578,7 @@ func char_link(ob *bytes.Buffer, rndr *render, data []byte, offset int) int { // skipping whitespaces after title title_e = i - 1 - for title_e > title_b && unicode.IsSpace(int(data[title_e])) { + for title_e > title_b && isspace(data[title_e]) { title_e-- } @@ -587,7 +590,7 @@ func char_link(ob *bytes.Buffer, rndr *render, data []byte, offset int) int { } // remove whitespace at the end of the link - for link_e > link_b && unicode.IsSpace(int(data[link_e-1])) { + for link_e > link_b && isspace(data[link_e-1]) { link_e-- } @@ -648,7 +651,9 @@ func char_link(ob *bytes.Buffer, rndr *render, data []byte, offset int) int { } // find the link_ref with matching id - index := sort.Search(len(rndr.refs), func(i int) bool { return !byteslice_less(rndr.refs[i].id, id) }) + index := sort.Search(len(rndr.refs), func(i int) bool { + return !byteslice_less(rndr.refs[i].id, id) + }) if index >= len(rndr.refs) || !bytes.Equal(rndr.refs[index].id, id) { return 0 } @@ -682,7 +687,9 @@ func char_link(ob *bytes.Buffer, rndr *render, data []byte, offset int) int { } // find the link_ref with matching id - index := sort.Search(len(rndr.refs), func(i int) bool { return !byteslice_less(rndr.refs[i].id, id) }) + index := sort.Search(len(rndr.refs), func(i int) bool { + return !byteslice_less(rndr.refs[i].id, id) + }) if index >= len(rndr.refs) || !bytes.Equal(rndr.refs[index].id, id) { return 0 } @@ -789,7 +796,7 @@ func char_entity(ob *bytes.Buffer, rndr *render, data []byte, offset int) int { end++ } - for end < len(data) && (unicode.IsDigit(int(data[end])) || unicode.IsLetter(int(data[end]))) { + for end < len(data) && isalnum(data[end]) { end++ } @@ -813,7 +820,7 @@ func char_autolink(ob *bytes.Buffer, rndr *render, data []byte, offset int) int data = data[offset:] if offset > 0 { - if !unicode.IsSpace(int(orig_data[offset-1])) && !ispunct(int(orig_data[offset-1])) { + if !isspace(orig_data[offset-1]) && !ispunct(orig_data[offset-1]) { return 0 } } @@ -823,7 +830,7 @@ func char_autolink(ob *bytes.Buffer, rndr *render, data []byte, offset int) int } link_end := 0 - for link_end < len(data) && !unicode.IsSpace(int(data[link_end])) { + for link_end < len(data) && !isspace(data[link_end]) { link_end++ } @@ -905,7 +912,7 @@ var valid_uris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://" func is_safe_link(link []byte) bool { for _, prefix := range valid_uris { - if len(link) > len(prefix) && !byteslice_less(link[:len(prefix)], prefix) && !byteslice_less(prefix, link[:len(prefix)]) && (unicode.IsLetter(int(link[len(prefix)])) || unicode.IsDigit(int(link[len(prefix)]))) { + if len(link) > len(prefix) && !byteslice_less(link[:len(prefix)], prefix) && !byteslice_less(prefix, link[:len(prefix)]) && isalnum(link[len(prefix)]) { return true } } @@ -915,8 +922,8 @@ func is_safe_link(link []byte) bool { // taken from regexp in the stdlib -func ispunct(c int) bool { - for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" { +func ispunct(c byte) bool { + for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") { if c == r { return true } @@ -924,6 +931,14 @@ func ispunct(c int) bool { return false } +func isspace(c byte) bool { + return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v' +} + +func isalnum(c byte) bool { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +} + // return the length of the given tag, or 0 is it's not valid func tag_length(data []byte, autolink *int) int { var i, j int @@ -943,7 +958,7 @@ func tag_length(data []byte, autolink *int) int { i = 1 } - if !unicode.IsDigit(int(data[i])) && !unicode.IsLetter(int(data[i])) { + if !isalnum(data[i]) { return 0 } @@ -951,7 +966,7 @@ func tag_length(data []byte, autolink *int) int { *autolink = MKDA_NOT_AUTOLINK // try to find the beggining of an URI - for i < len(data) && ((unicode.IsLetter(int(data[i])) || unicode.IsDigit(int(data[i]))) || data[i] == '.' || data[i] == '+' || data[i] == '-') { + for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') { i++ } @@ -978,7 +993,7 @@ func tag_length(data []byte, autolink *int) int { if data[i] == '\\' { i += 2 } else { - if data[i] == '>' || data[i] == '\'' || data[i] == '"' || unicode.IsSpace(int(data[i])) { + if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) { break } else { i++ @@ -1015,7 +1030,7 @@ func is_mail_autolink(data []byte) int { // address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@' for i := 0; i < len(data); i++ { - if unicode.IsLetter(int(data[i])) || unicode.IsDigit(int(data[i])) { + if isalnum(data[i]) { continue } @@ -1145,10 +1160,10 @@ func parse_emph1(ob *bytes.Buffer, rndr *render, data []byte, c byte) int { continue } - if data[i] == c && !unicode.IsSpace(int(data[i-1])) { + if data[i] == c && !isspace(data[i-1]) { if rndr.ext_flags&MKDEXT_NO_INTRA_EMPHASIS != 0 { - if !(i+1 == len(data) || unicode.IsSpace(int(data[i+1])) || ispunct(int(data[i+1]))) { + if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) { continue } } @@ -1186,7 +1201,7 @@ func parse_emph2(ob *bytes.Buffer, rndr *render, data []byte, c byte) int { } i += length - if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !unicode.IsSpace(int(data[i-1])) { + if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) { work := bytes.NewBuffer(nil) parse_inline(work, rndr, data[:i]) r := render_method(ob, work.Bytes(), rndr.mk.opaque) @@ -1214,7 +1229,7 @@ func parse_emph3(ob *bytes.Buffer, rndr *render, data []byte, offset int, c byte i += length // skip whitespace preceded symbols - if data[i] != c || unicode.IsSpace(int(data[i-1])) { + if data[i] != c || isspace(data[i-1]) { continue } @@ -1649,18 +1664,18 @@ func is_codefence(data []byte, syntax **string) int { // string all whitespace at the beginning and the end // of the {} block - for syn > 0 && unicode.IsSpace(int(data[syntax_start])) { + for syn > 0 && isspace(data[syntax_start]) { syntax_start++ syn-- } - for syn > 0 && unicode.IsSpace(int(data[syntax_start+syn-1])) { + for syn > 0 && isspace(data[syntax_start+syn-1]) { syn-- } i++ } else { - for i < len(data) && !unicode.IsSpace(int(data[i])) { + for i < len(data) && !isspace(data[i]) { syn++ i++ } @@ -1671,7 +1686,7 @@ func is_codefence(data []byte, syntax **string) int { } for i < len(data) && data[i] != '\n' { - if !unicode.IsSpace(int(data[i])) { + if !isspace(data[i]) { return 0 } i++ @@ -1853,7 +1868,7 @@ func parse_table_row(ob *bytes.Buffer, rndr *render, data []byte, columns int, c } for col = 0; col < columns && i < len(data); col++ { - for i < len(data) && unicode.IsSpace(int(data[i])) { + for i < len(data) && isspace(data[i]) { i++ } @@ -1865,7 +1880,7 @@ func parse_table_row(ob *bytes.Buffer, rndr *render, data []byte, columns int, c cell_end := i - 1 - for cell_end > cell_start && unicode.IsSpace(int(data[cell_end])) { + for cell_end > cell_start && isspace(data[cell_end]) { cell_end-- } @@ -2371,13 +2386,13 @@ func rndr_blockcode(ob *bytes.Buffer, text []byte, lang string, opaque interface ob.WriteString("
\n")
 }
 
-func rndr_autolink(ob *bytes.Buffer, text []byte, kind int, opaque interface{}) int {
+func rndr_autolink(ob *bytes.Buffer, link []byte, kind int, opaque interface{}) int {
+	options := opaque.(*html_renderopts)
+
+	if len(link) == 0 {
+		return 0
+	}
+	if options.flags&HTML_SAFELINK != 0 && !is_safe_link(link) && kind != MKDA_EMAIL {
+		return 0
+	}
+
+	ob.WriteString("")
+
+	/*
+	 * Pretty printing: if we get an email address as
+	 * an actual URI, e.g. `mailto:foo@bar.com`, we don't
+	 * want to print the `mailto:` prefix
+	 */
+	if bytes.HasPrefix(link, []byte("mailto:")) {
+		attr_escape(ob, link[7:])
+	} else {
+		attr_escape(ob, link)
+	}
+
+	ob.WriteString("")
+
 	return 1
 }
 
@@ -2554,6 +2598,23 @@ func rndr_emphasis(ob *bytes.Buffer, text []byte, opaque interface{}) int {
 }
 
 func rndr_image(ob *bytes.Buffer, link []byte, title []byte, alt []byte, opaque interface{}) int {
+	options := opaque.(*html_renderopts)
+	if len(link) == 0 {
+		return 0
+	}
+	ob.WriteString("\"") 0 {
+		attr_escape(ob, alt)
+	}
+	if len(title) > 0 {
+		ob.WriteString("\" title=\"")
+		attr_escape(ob, title)
+	}
+
+	ob.WriteByte('"')
+	ob.WriteString(options.close_tag)
 	return 1
 }
 
@@ -2565,6 +2626,25 @@ func rndr_linebreak(ob *bytes.Buffer, opaque interface{}) int {
 }
 
 func rndr_link(ob *bytes.Buffer, link []byte, title []byte, content []byte, opaque interface{}) int {
+	options := opaque.(*html_renderopts)
+
+	if options.flags&HTML_SAFELINK != 0 && !is_safe_link(link) {
+		return 0
+	}
+
+	ob.WriteString(" 0 {
+		ob.Write(link)
+	}
+	if len(title) > 0 {
+		ob.WriteString("\" title=\"")
+		attr_escape(ob, title)
+	}
+	ob.WriteString("\">")
+	if len(content) > 0 {
+		ob.Write(content)
+	}
+	ob.WriteString("")
 	return 1
 }
 
@@ -2616,7 +2696,7 @@ func is_html_tag(tag []byte, tagname string) bool {
 		return false
 	}
 	i++
-	for i < len(tag) && unicode.IsSpace(int(tag[i])) {
+	for i < len(tag) && isspace(tag[i]) {
 		i++
 	}
 
@@ -2624,7 +2704,7 @@ func is_html_tag(tag []byte, tagname string) bool {
 		i++
 	}
 
-	for i < len(tag) && unicode.IsSpace(int(tag[i])) {
+	for i < len(tag) && isspace(tag[i]) {
 		i++
 	}
 
@@ -2643,7 +2723,7 @@ func is_html_tag(tag []byte, tagname string) bool {
 		return false
 	}
 
-	return unicode.IsSpace(int(tag[i])) || tag[i] == '>'
+	return isspace(tag[i]) || tag[i] == '>'
 }
 
 
@@ -2656,68 +2736,72 @@ func is_html_tag(tag []byte, tagname string) bool {
 func main() {
 	ob := bytes.NewBuffer(nil)
 	input := ""
-	input += "##Header##\n"
-	input += "\n"
-	input += "----------\n"
-	input += "\n"
-	input += "Underlined header\n"
-	input += "-----------------\n"
-	input += "\n"
-	input += "

Some block html\n" - input += "

\n" - input += "\n" - input += "Score | Grade\n" - input += "------|------\n" - input += "94 | A\n" - input += "85 | B\n" - input += "74 | C\n" - input += "65 | D\n" - input += "\n" - input += "``` go\n" - input += "func fib(n int) int {\n" - input += " if n <= 1 {\n" - input += " return n\n" - input += " }\n" - input += " return n * fib(n-1)\n" - input += "}\n" - input += "```\n" - input += "\n" - input += "> A blockquote\n" - input += "> or something like that\n" - input += "> With a table | of two columns\n" - input += "> -------------|---------------\n" - input += "> key | value \n" - input += "\n" - input += "\n" + // input += "##Header##\n" + // input += "\n" + // input += "----------\n" + // input += "\n" + // input += "Underlined header\n" + // input += "-----------------\n" + // input += "\n" + // input += "

Some block html\n" + // input += "

\n" + // input += "\n" + // input += "Score | Grade\n" + // input += "------|------\n" + // input += "94 | A\n" + // input += "85 | B\n" + // input += "74 | C\n" + // input += "65 | D\n" + // input += "\n" + // input += "``` go\n" + // input += "func fib(n int) int {\n" + // input += " if n <= 1 {\n" + // input += " return n\n" + // input += " }\n" + // input += " return n * fib(n-1)\n" + // input += "}\n" + // input += "```\n" + // input += "\n" + // input += "> A blockquote\n" + // input += "> or something like that\n" + // input += "> With a table | of two columns\n" + // input += "> -------------|---------------\n" + // input += "> key | value \n" + // input += "\n" + // input += "\n" input += "Some **bold** Some *italic* and [a link][1] \n" - input += "\n" - input += "A little code sample\n" - input += "\n" - input += " \n" - input += " Web Page Title\n" - input += " \n" - input += "\n" - input += "A picture\n" - input += "\n" - input += "![alt text][2]\n" - input += "\n" - input += "A list\n" - input += "\n" - input += "- apples\n" - input += "- oranges\n" - input += "- eggs\n" - input += "\n" - input += "A numbered list\n" - input += "\n" - input += "1. a\n" - input += "2. b\n" - input += "3. c\n" - input += "\n" - input += "A little quote\n" - input += "\n" - input += "> It is now time for all good men to come to the aid of their country. \n" - input += "\n" - input += "A final paragraph. `code this` fool\n" + // input += "\n" + // input += "A little code sample\n" + // input += "\n" + // input += " \n" + // input += " Web Page Title\n" + // input += " \n" + // input += "\n" + // input += "A picture\n" + // input += "\n" + // input += "![alt text][2]\n" + // input += "\n" + // input += "A list\n" + // input += "\n" + // input += "- apples\n" + // input += "- oranges\n" + // input += "- eggs\n" + // input += "\n" + // input += "A numbered list\n" + // input += "\n" + // input += "1. a\n" + // input += "2. b\n" + // input += "3. c\n" + // input += "\n" + // input += "A little quote\n" + // input += "\n" + // input += "> It is now time for all good men to come to the aid of their country. \n" + // input += "\n" + // input += "A final paragraph. `code this` fool\n" + // input += "\n" + // input += "Click [here](http:google.com)\n" + // input += "\n" + // input += "\n" input += "\n" input += " [1]: http://www.google.com\n" input += " [2]: http://www.google.com/intl/en_ALL/images/logo.gif\n" @@ -2749,9 +2833,9 @@ func main() { rndrer.normal_text = rndr_normal_text - rndrer.opaque = &html_renderopts{close_tag: " />"} + rndrer.opaque = &html_renderopts{close_tag: ">\n"} - var extensions uint32 = MKDEXT_FENCED_CODE | MKDEXT_TABLES + var extensions uint32 = MKDEXT_NO_INTRA_EMPHASIS | MKDEXT_TABLES | MKDEXT_FENCED_CODE | MKDEXT_AUTOLINK | MKDEXT_STRIKETHROUGH | MKDEXT_LAX_HTML_BLOCKS | MKDEXT_SPACE_HEADERS // call the main rendered function Markdown(ob, ib, rndrer, extensions)