mirror of
https://github.com/qTox/qTox.git
synced 2024-03-22 14:00:36 +08:00
fix(chatform): Broaden URL matching to include unicode
Fix #4853 Fix #4295 * Instead of searching for strictly valid URIs, allow any characters following the scheme. This allows for UTF-8 characters used in other languages, as well as parentheses and other ASCII characters. This will over-match some invalid URLs. * Ignore characters surrounding URIs, as well as ending punctuation. * Fix www-only links by adding the http scheme to the href.
This commit is contained in:
parent
d3d81bbdf3
commit
e564b85e3c
|
@ -56,7 +56,7 @@ ChatMessage::Ptr ChatMessage::createChatMessage(const QString& sender, const QSt
|
|||
|
||||
// quotes (green text)
|
||||
text = detectQuotes(text, type);
|
||||
text = highlightURL(text);
|
||||
text = highlightURI(text);
|
||||
|
||||
// text styling
|
||||
Settings::StyleType styleType = Settings::getInstance().getStylePreference();
|
||||
|
|
|
@ -20,40 +20,42 @@
|
|||
#include "textformatter.h"
|
||||
|
||||
#include <QRegularExpression>
|
||||
#include <QVector>
|
||||
|
||||
// clang-format off
|
||||
|
||||
static const QString SINGLE_SIGN_PATTERN = QStringLiteral("(?<=^|[\\s\\n])"
|
||||
// Note: escaping of '\' is only needed because QStringLiteral is broken by linebreak
|
||||
static const QString SINGLE_SIGN_PATTERN = QStringLiteral("(?<=^|\\s)"
|
||||
"[%1]"
|
||||
"(?!\\s)"
|
||||
"([^%1\\n]+?)"
|
||||
"(?<!\\s)"
|
||||
"[%1]"
|
||||
"(?=$|[\\s\\n])");
|
||||
"(?=$|\\s)");
|
||||
|
||||
static const QString SINGLE_SLASH_PATTERN = QStringLiteral("(?<=^|[\\s\\n])"
|
||||
static const QString SINGLE_SLASH_PATTERN = QStringLiteral("(?<=^|\\s)"
|
||||
"/"
|
||||
"(?!\\s)"
|
||||
"([^/\\n]+?)"
|
||||
"(?<!\\s)"
|
||||
"/"
|
||||
"(?=$|[\\s\\n])");
|
||||
"(?=$|\\s)");
|
||||
|
||||
static const QString DOUBLE_SIGN_PATTERN = QStringLiteral("(?<=^|[\\s\\n])"
|
||||
static const QString DOUBLE_SIGN_PATTERN = QStringLiteral("(?<=^|\\s)"
|
||||
"[%1]{2}"
|
||||
"(?!\\s)"
|
||||
"([^\\n]+?)"
|
||||
"(?<!\\s)"
|
||||
"[%1]{2}"
|
||||
"(?=$|[\\s\\n])");
|
||||
"(?=$|\\s)");
|
||||
|
||||
static const QString MULTILINE_CODE = QStringLiteral("(?<=^|[\\s\\n])"
|
||||
static const QString MULTILINE_CODE = QStringLiteral("(?<=^|\\s)"
|
||||
"```"
|
||||
"(?!`)"
|
||||
"((.|\\n)+?)"
|
||||
"(?<!`)"
|
||||
"```"
|
||||
"(?=$|[\\s\\n])");
|
||||
"(?=$|\\s)");
|
||||
|
||||
#define REGEXP_WRAPPER_PAIR(pattern, wrapper)\
|
||||
{QRegularExpression(pattern,QRegularExpression::UseUnicodePropertiesOption),QStringLiteral(wrapper)}
|
||||
|
@ -74,41 +76,131 @@ static const QPair<QRegularExpression, QString> REGEX_TO_WRAPPER[] {
|
|||
#undef REGEXP_WRAPPER_PAIR
|
||||
|
||||
static const QString HREF_WRAPPER = QStringLiteral(R"(<a href="%1">%1</a>)");
|
||||
static const QString WWW_WRAPPER = QStringLiteral(R"(<a href="http://%1">%1</a>)");
|
||||
|
||||
// based on this: https://tools.ietf.org/html/rfc3986#section-2
|
||||
static const QString URL_PATH_PATTERN = QStringLiteral("[\\w:/?#\\[\\]@!$&'{}*+,;.~%=-]+");
|
||||
|
||||
static const QRegularExpression URL_PATTERNS[] = {
|
||||
QRegularExpression(QStringLiteral(R"(\b(www\.|((http[s]?)|ftp)://)%1)").arg(URL_PATH_PATTERN)),
|
||||
QRegularExpression(QStringLiteral(R"(\b(file|smb)://([\S| ]*))")),
|
||||
QRegularExpression(QStringLiteral(R"(\btox:[a-zA-Z\\d]{76})")),
|
||||
QRegularExpression(QStringLiteral(R"(\bmailto:\S+@\S+\.\S+)")),
|
||||
QRegularExpression(QStringLiteral(R"(\btox:\S+@\S+)")),
|
||||
static const QVector<QRegularExpression> WWW_WORD_PATTERN = {
|
||||
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((www)\S+))"))
|
||||
};
|
||||
|
||||
static const QVector<QRegularExpression> URI_WORD_PATTERNS = {
|
||||
// Note: This does not match only strictly valid URLs, but we broaden search to any string following scheme to
|
||||
// allow UTF-8 "IRI"s instead of ASCII-only URLs
|
||||
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((((http[s]?)|ftp)://)\S+))")),
|
||||
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((file|smb)://([\S| ]*)))")),
|
||||
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(tox:[a-zA-Z\d]{76}))")),
|
||||
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(mailto:\S+@\S+\.\S+))")),
|
||||
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(tox:\S+@\S+))")),
|
||||
};
|
||||
|
||||
|
||||
// clang-format on
|
||||
|
||||
// Result of trimming a matched word down to the bare URI it contains.
struct MatchingUri
{
    // true when a bare URI could be isolated from the matched word
    bool valid = false;
    // number of characters belonging to the bare URI; 0 until a URI is found
    int length = 0;
};
|
||||
|
||||
// pairs of characters that are ignored when surrounding a URI
|
||||
static const QPair<QString, QString> URI_WRAPPING_CHARS[] = {
|
||||
{QString("("), QString(")")},
|
||||
{QString("["), QString("]")},
|
||||
{QString("""), QString(""")},
|
||||
{QString("'"), QString("'")}
|
||||
};
|
||||
|
||||
// characters which are ignored from the end of URI
|
||||
static const QChar URI_ENDING_CHARS[] = {
|
||||
QChar::fromLatin1('?'),
|
||||
QChar::fromLatin1('.'),
|
||||
QChar::fromLatin1('!'),
|
||||
QChar::fromLatin1(':'),
|
||||
QChar::fromLatin1(',')
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Strips wrapping characters and ending punctuation from URI
|
||||
* @param QRegularExpressionMatch of a word containing a URI
|
||||
* @return MatchingUri containing info on the stripped URI
|
||||
*/
|
||||
MatchingUri stripSurroundingChars(const QStringRef wrappedUri, const int startOfBareUri)
|
||||
{
|
||||
bool matchFound;
|
||||
int curValidationStartPos = 0;
|
||||
int curValidationEndPos = wrappedUri.length();
|
||||
do {
|
||||
matchFound = false;
|
||||
for (auto const& surroundChars : URI_WRAPPING_CHARS)
|
||||
{
|
||||
const int openingCharLength = surroundChars.first.length();
|
||||
const int closingCharLength = surroundChars.second.length();
|
||||
if (surroundChars.first == wrappedUri.mid(curValidationStartPos, openingCharLength) &&
|
||||
surroundChars.second == wrappedUri.mid(curValidationEndPos - closingCharLength, closingCharLength)) {
|
||||
curValidationStartPos += openingCharLength;
|
||||
curValidationEndPos -= closingCharLength;
|
||||
matchFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (QChar const endChar : URI_ENDING_CHARS) {
|
||||
const int charLength = 1;
|
||||
if (endChar == wrappedUri.at(curValidationEndPos - charLength)) {
|
||||
curValidationEndPos -= charLength;
|
||||
matchFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (matchFound);
|
||||
MatchingUri strippedMatch;
|
||||
if (startOfBareUri != curValidationStartPos) {
|
||||
strippedMatch.valid = false;
|
||||
} else {
|
||||
strippedMatch.valid = true;
|
||||
strippedMatch.length = curValidationEndPos - startOfBareUri;
|
||||
}
|
||||
return strippedMatch;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Wrap substrings matching "patterns" with "wrapper" in "message"
|
||||
* @param message Where search for patterns
|
||||
* @param patterns Array of regex patterns to find strings to wrap
|
||||
* @param wrapper Surrounds the matched strings
|
||||
* @note done separately from URI since the link must have a scheme added to be valid
|
||||
* @return Copy of message with highlighted URLs
|
||||
*/
|
||||
QString highlight(const QString& message, const QVector<QRegularExpression>& patterns, const QString& wrapper)
|
||||
{
|
||||
QString result = message;
|
||||
for (const QRegularExpression& exp : patterns) {
|
||||
const int startLength = result.length();
|
||||
int offset = 0;
|
||||
QRegularExpressionMatchIterator iter = exp.globalMatch(result);
|
||||
while (iter.hasNext()) {
|
||||
const QRegularExpressionMatch match = iter.next();
|
||||
const int uriWithWrapMatch{0};
|
||||
const int uriWithoutWrapMatch{1};
|
||||
MatchingUri matchUri = stripSurroundingChars(match.capturedRef(uriWithWrapMatch),
|
||||
match.capturedStart(uriWithoutWrapMatch) - match.capturedStart(uriWithWrapMatch));
|
||||
if (!matchUri.valid) {
|
||||
continue;
|
||||
}
|
||||
const QString wrappedURL = wrapper.arg(match.captured(uriWithoutWrapMatch).left(matchUri.length));
|
||||
result.replace(match.capturedStart(uriWithoutWrapMatch) + offset, matchUri.length, wrappedURL);
|
||||
offset = result.length() - startLength;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Highlights URLs within passed message string
|
||||
* @param message Where search for URLs
|
||||
* @return Copy of message with highlighted URLs
|
||||
*/
|
||||
QString highlightURL(const QString& message)
|
||||
QString highlightURI(const QString& message)
|
||||
{
|
||||
QString result = message;
|
||||
for (const QRegularExpression& exp : URL_PATTERNS) {
|
||||
const int startLength = result.length();
|
||||
int offset = 0;
|
||||
QRegularExpressionMatchIterator iter = exp.globalMatch(result);
|
||||
while (iter.hasNext()) {
|
||||
const QRegularExpressionMatch match = iter.next();
|
||||
const int startPos = match.capturedStart() + offset;
|
||||
const int length = match.capturedLength();
|
||||
const QString wrappedURL = HREF_WRAPPER.arg(match.captured());
|
||||
result.replace(startPos, length, wrappedURL);
|
||||
offset = result.length() - startLength;
|
||||
}
|
||||
}
|
||||
QString result = highlight(message, URI_WORD_PATTERNS, HREF_WRAPPER);
|
||||
result = highlight(result, WWW_WORD_PATTERN, WWW_WRAPPER);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,7 +22,7 @@
|
|||
|
||||
#include <QString>
|
||||
|
||||
QString highlightURL(const QString& message);
|
||||
QString highlightURI(const QString& message);
|
||||
|
||||
QString applyMarkdown(const QString& message, bool showFormattingSymbols);
|
||||
|
||||
|
|
|
@ -168,11 +168,12 @@ static const QVector<StringPair> MIXED_FORMATTING_SPECIAL_CASES {
|
|||
};
|
||||
|
||||
#define MAKE_LINK(url) "<a href=\"" url "\">" url "</a>"
|
||||
#define MAKE_WWW_LINK(url) "<a href=\"http://" url "\">" url "</a>"
|
||||
|
||||
static const QVector<QPair<QString, QString>> URL_CASES {
|
||||
PAIR_FORMAT("https://github.com/qTox/qTox/issues/4233",
|
||||
MAKE_LINK("https://github.com/qTox/qTox/issues/4233")),
|
||||
PAIR_FORMAT("www.youtube.com", MAKE_LINK("www.youtube.com")),
|
||||
PAIR_FORMAT("www.youtube.com", MAKE_WWW_LINK("www.youtube.com")),
|
||||
PAIR_FORMAT("https://url.com/some*url/some*more*url/",
|
||||
MAKE_LINK("https://url.com/some*url/some*more*url/")),
|
||||
PAIR_FORMAT("https://url.com/some_url/some_more_url/",
|
||||
|
@ -191,7 +192,7 @@ static const QVector<QPair<QString, QString>> URL_CASES {
|
|||
"www.site.com/part1/part2",
|
||||
MAKE_LINK("http://site.com/part1/part2") " "
|
||||
MAKE_LINK("http://site.com/part3") " and one more time "
|
||||
MAKE_LINK("www.site.com/part1/part2")),
|
||||
MAKE_WWW_LINK("www.site.com/part1/part2")),
|
||||
PAIR_FORMAT("https://127.0.0.1/asd\n"
|
||||
"https://ABCD:EF01:2345:6789:ABCD:EF01:2345:6789/\n"
|
||||
"ftp://2001:DB8::8:800:200C:417A/\n"
|
||||
|
@ -213,6 +214,26 @@ static const QVector<QPair<QString, QString>> URL_CASES {
|
|||
MAKE_LINK("http://[::1]:22/") " "
|
||||
MAKE_LINK("http://[::]:20/") " "
|
||||
),
|
||||
// Test case from issue #4853 (include unicode, ending brackets that are part of URL)
|
||||
PAIR_FORMAT("https://ja.wikipedia.org/wiki/印章",
|
||||
MAKE_LINK("https://ja.wikipedia.org/wiki/印章")),
|
||||
PAIR_FORMAT("https://en.wikipedia.org/wiki/Seal_(East_Asia)",
|
||||
MAKE_LINK("https://en.wikipedia.org/wiki/Seal_(East_Asia)")),
|
||||
// Test cases from issue #4295 (exclude surrounding quotes, brackets, ending punctuation)
|
||||
PAIR_FORMAT("(http://www.google.com)",
|
||||
"(" MAKE_LINK("http://www.google.com") ")"),
|
||||
PAIR_FORMAT(""http://www.google.com"",
|
||||
""" MAKE_LINK("http://www.google.com") """),
|
||||
PAIR_FORMAT("http://www.google.com.",
|
||||
MAKE_LINK("http://www.google.com") "."),
|
||||
PAIR_FORMAT("http://www.google.com,",
|
||||
MAKE_LINK("http://www.google.com") ","),
|
||||
PAIR_FORMAT("http://www.google.com?",
|
||||
MAKE_LINK("http://www.google.com") "?"),
|
||||
PAIR_FORMAT("https://google.com?gfe_rd=cr",
|
||||
MAKE_LINK("https://google.com?gfe_rd=cr")),
|
||||
PAIR_FORMAT("["https://en.wikipedia.org/wiki/Seal_(East_Asia)"]?",
|
||||
"["" MAKE_LINK("https://en.wikipedia.org/wiki/Seal_(East_Asia)") ""]?")
|
||||
};
|
||||
|
||||
#undef PAIR_FORMAT
|
||||
|
@ -325,7 +346,7 @@ private slots:
|
|||
void urlTest();
|
||||
private:
|
||||
const MarkdownFunction markdownFunction = applyMarkdown;
|
||||
UrlHighlightFunction urlHighlightFunction = highlightURL;
|
||||
UrlHighlightFunction urlHighlightFunction = highlightURI;
|
||||
};
|
||||
|
||||
static QString commonWorkCasesProcessInput(const QString& str, const MarkdownToTags& mtt)
|
||||
|
|
Loading…
Reference in New Issue
Block a user