mirror of
https://github.com/russross/blackfriday.git
synced 2024-03-22 13:40:34 +08:00
Fixed HTML entity regex (#453)
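The old regex missed a lot of HTML entities, such as long named references (from 6-character entities like `&approx;`, i.e. ≈, to the somewhat rarer 31-character `&CounterClockwiseContourIntegral;`, i.e. ∳) as well as numeric references (decimal, e.g. `&#1234;` for Ӓ, or hexadecimal, e.g. the `&#x…;` reference that renders as 𓫶). This fixes that.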
This commit is contained in:
parent
8c0d4cca94
commit
1bb1d0171c
18
inline.go
18
inline.go
|
@ -23,8 +23,22 @@ var (
|
|||
urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
|
||||
anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)
|
||||
|
||||
// TODO: improve this regexp to catch all possible entities:
|
||||
htmlEntityRe = regexp.MustCompile(`&[a-z]{2,5};`)
|
||||
// https://www.w3.org/TR/html5/syntax.html#character-references
|
||||
// highest Unicode code point across the 17 planes (17 * 2^16 code points): U+10FFFF = 1,114,111d =
|
||||
// 7 dec digits or 6 hex digits
|
||||
// named entity references can be 2-31 characters with stuff like &lt;
|
||||
// at one end and &CounterClockwiseContourIntegral; at the other. There
|
||||
// are also sometimes numbers at the end, although this isn't inherent
|
||||
// in the specification; there are never numbers anywhere else in
|
||||
// current character references, though; see &frac34; and &blk12;, etc.
|
||||
// https://www.w3.org/TR/html5/syntax.html#named-character-references
|
||||
//
|
||||
// entity := "&" (named group | number ref) ";"
|
||||
// named group := [a-zA-Z]{2,31}[0-9]{0,2}
|
||||
// number ref := "#" (dec ref | hex ref)
|
||||
// dec ref := [0-9]{1,7}
|
||||
// hex ref := ("x" | "X") [0-9a-fA-F]{1,6}
|
||||
htmlEntityRe = regexp.MustCompile(`&([a-zA-Z]{2,31}[0-9]{0,2}|#([0-9]{1,7}|[xX][0-9a-fA-F]{1,6}));`)
|
||||
)
|
||||
|
||||
// Functions to parse text within a block
|
||||
|
|
Loading…
Reference in New Issue
Block a user