mirror of
https://github.com/qTox/qTox.git
synced 2024-03-22 14:00:36 +08:00
fix(chatform): Broaden URL matching to include unicode
Fix #4853 Fix #4295 * Instead of searching for strictly valid URIs, allow any characters following the scheme. This allows for UTF-8 characters used in other languages, as well as parentheses and other ASCII characters. This will over-match some invalid URLs. * Ignore characters surrounding URIs and ending punctuation * Fix www-only links by adding the http scheme to the href
This commit is contained in:
parent
d3d81bbdf3
commit
e564b85e3c
|
@ -56,7 +56,7 @@ ChatMessage::Ptr ChatMessage::createChatMessage(const QString& sender, const QSt
|
||||||
|
|
||||||
// quotes (green text)
|
// quotes (green text)
|
||||||
text = detectQuotes(text, type);
|
text = detectQuotes(text, type);
|
||||||
text = highlightURL(text);
|
text = highlightURI(text);
|
||||||
|
|
||||||
// text styling
|
// text styling
|
||||||
Settings::StyleType styleType = Settings::getInstance().getStylePreference();
|
Settings::StyleType styleType = Settings::getInstance().getStylePreference();
|
||||||
|
|
|
@ -20,40 +20,42 @@
|
||||||
#include "textformatter.h"
|
#include "textformatter.h"
|
||||||
|
|
||||||
#include <QRegularExpression>
|
#include <QRegularExpression>
|
||||||
|
#include <QVector>
|
||||||
|
|
||||||
// clang-format off
|
// clang-format off
|
||||||
|
|
||||||
static const QString SINGLE_SIGN_PATTERN = QStringLiteral("(?<=^|[\\s\\n])"
|
// Note: escaping of '\' is only needed because QStringLiteral is broken by linebreak
|
||||||
|
static const QString SINGLE_SIGN_PATTERN = QStringLiteral("(?<=^|\\s)"
|
||||||
"[%1]"
|
"[%1]"
|
||||||
"(?!\\s)"
|
"(?!\\s)"
|
||||||
"([^%1\\n]+?)"
|
"([^%1\\n]+?)"
|
||||||
"(?<!\\s)"
|
"(?<!\\s)"
|
||||||
"[%1]"
|
"[%1]"
|
||||||
"(?=$|[\\s\\n])");
|
"(?=$|\\s)");
|
||||||
|
|
||||||
static const QString SINGLE_SLASH_PATTERN = QStringLiteral("(?<=^|[\\s\\n])"
|
static const QString SINGLE_SLASH_PATTERN = QStringLiteral("(?<=^|\\s)"
|
||||||
"/"
|
"/"
|
||||||
"(?!\\s)"
|
"(?!\\s)"
|
||||||
"([^/\\n]+?)"
|
"([^/\\n]+?)"
|
||||||
"(?<!\\s)"
|
"(?<!\\s)"
|
||||||
"/"
|
"/"
|
||||||
"(?=$|[\\s\\n])");
|
"(?=$|\\s)");
|
||||||
|
|
||||||
static const QString DOUBLE_SIGN_PATTERN = QStringLiteral("(?<=^|[\\s\\n])"
|
static const QString DOUBLE_SIGN_PATTERN = QStringLiteral("(?<=^|\\s)"
|
||||||
"[%1]{2}"
|
"[%1]{2}"
|
||||||
"(?!\\s)"
|
"(?!\\s)"
|
||||||
"([^\\n]+?)"
|
"([^\\n]+?)"
|
||||||
"(?<!\\s)"
|
"(?<!\\s)"
|
||||||
"[%1]{2}"
|
"[%1]{2}"
|
||||||
"(?=$|[\\s\\n])");
|
"(?=$|\\s)");
|
||||||
|
|
||||||
static const QString MULTILINE_CODE = QStringLiteral("(?<=^|[\\s\\n])"
|
static const QString MULTILINE_CODE = QStringLiteral("(?<=^|\\s)"
|
||||||
"```"
|
"```"
|
||||||
"(?!`)"
|
"(?!`)"
|
||||||
"((.|\\n)+?)"
|
"((.|\\n)+?)"
|
||||||
"(?<!`)"
|
"(?<!`)"
|
||||||
"```"
|
"```"
|
||||||
"(?=$|[\\s\\n])");
|
"(?=$|\\s)");
|
||||||
|
|
||||||
#define REGEXP_WRAPPER_PAIR(pattern, wrapper)\
|
#define REGEXP_WRAPPER_PAIR(pattern, wrapper)\
|
||||||
{QRegularExpression(pattern,QRegularExpression::UseUnicodePropertiesOption),QStringLiteral(wrapper)}
|
{QRegularExpression(pattern,QRegularExpression::UseUnicodePropertiesOption),QStringLiteral(wrapper)}
|
||||||
|
@ -74,41 +76,131 @@ static const QPair<QRegularExpression, QString> REGEX_TO_WRAPPER[] {
|
||||||
#undef REGEXP_WRAPPER_PAIR
|
#undef REGEXP_WRAPPER_PAIR
|
||||||
|
|
||||||
static const QString HREF_WRAPPER = QStringLiteral(R"(<a href="%1">%1</a>)");
|
static const QString HREF_WRAPPER = QStringLiteral(R"(<a href="%1">%1</a>)");
|
||||||
|
static const QString WWW_WRAPPER = QStringLiteral(R"(<a href="http://%1">%1</a>)");
|
||||||
|
|
||||||
// based in this: https://tools.ietf.org/html/rfc3986#section-2
|
static const QVector<QRegularExpression> WWW_WORD_PATTERN = {
|
||||||
static const QString URL_PATH_PATTERN = QStringLiteral("[\\w:/?#\\[\\]@!$&'{}*+,;.~%=-]+");
|
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((www)\S+))"))
|
||||||
|
|
||||||
static const QRegularExpression URL_PATTERNS[] = {
|
|
||||||
QRegularExpression(QStringLiteral(R"(\b(www\.|((http[s]?)|ftp)://)%1)").arg(URL_PATH_PATTERN)),
|
|
||||||
QRegularExpression(QStringLiteral(R"(\b(file|smb)://([\S| ]*))")),
|
|
||||||
QRegularExpression(QStringLiteral(R"(\btox:[a-zA-Z\\d]{76})")),
|
|
||||||
QRegularExpression(QStringLiteral(R"(\bmailto:\S+@\S+\.\S+)")),
|
|
||||||
QRegularExpression(QStringLiteral(R"(\btox:\S+@\S+)")),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static const QVector<QRegularExpression> URI_WORD_PATTERNS = {
|
||||||
|
// Note: This does not match only strictly valid URLs, but we broaden search to any string following scheme to
|
||||||
|
// allow UTF-8 "IRI"s instead of ASCII-only URLs
|
||||||
|
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((((http[s]?)|ftp)://)\S+))")),
|
||||||
|
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((file|smb)://([\S| ]*)))")),
|
||||||
|
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(tox:[a-zA-Z\d]{76}))")),
|
||||||
|
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(mailto:\S+@\S+\.\S+))")),
|
||||||
|
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(tox:\S+@\S+))")),
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
|
||||||
|
// Result of stripping surrounding characters from a word that contains a URI.
struct MatchingUri {
    // true when the word's leading characters could be stripped exactly up to the URI start
    bool valid{false};
    // length of the URI once trailing wrappers/punctuation are removed; only set when valid
    int length{0};
};
|
||||||
|
|
||||||
|
// pairs of characters that are ignored when surrounding a URI
|
||||||
|
static const QPair<QString, QString> URI_WRAPPING_CHARS[] = {
|
||||||
|
{QString("("), QString(")")},
|
||||||
|
{QString("["), QString("]")},
|
||||||
|
{QString("""), QString(""")},
|
||||||
|
{QString("'"), QString("'")}
|
||||||
|
};
|
||||||
|
|
||||||
|
// characters which are ignored from the end of URI
|
||||||
|
static const QChar URI_ENDING_CHARS[] = {
|
||||||
|
QChar::fromLatin1('?'),
|
||||||
|
QChar::fromLatin1('.'),
|
||||||
|
QChar::fromLatin1('!'),
|
||||||
|
QChar::fromLatin1(':'),
|
||||||
|
QChar::fromLatin1(',')
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Strips wrapping characters and ending punctuation from URI
|
||||||
|
* @param QRegularExpressionMatch of a word containing a URI
|
||||||
|
* @return MatchingUri containing info on the stripped URI
|
||||||
|
*/
|
||||||
|
MatchingUri stripSurroundingChars(const QStringRef wrappedUri, const int startOfBareUri)
|
||||||
|
{
|
||||||
|
bool matchFound;
|
||||||
|
int curValidationStartPos = 0;
|
||||||
|
int curValidationEndPos = wrappedUri.length();
|
||||||
|
do {
|
||||||
|
matchFound = false;
|
||||||
|
for (auto const& surroundChars : URI_WRAPPING_CHARS)
|
||||||
|
{
|
||||||
|
const int openingCharLength = surroundChars.first.length();
|
||||||
|
const int closingCharLength = surroundChars.second.length();
|
||||||
|
if (surroundChars.first == wrappedUri.mid(curValidationStartPos, openingCharLength) &&
|
||||||
|
surroundChars.second == wrappedUri.mid(curValidationEndPos - closingCharLength, closingCharLength)) {
|
||||||
|
curValidationStartPos += openingCharLength;
|
||||||
|
curValidationEndPos -= closingCharLength;
|
||||||
|
matchFound = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (QChar const endChar : URI_ENDING_CHARS) {
|
||||||
|
const int charLength = 1;
|
||||||
|
if (endChar == wrappedUri.at(curValidationEndPos - charLength)) {
|
||||||
|
curValidationEndPos -= charLength;
|
||||||
|
matchFound = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (matchFound);
|
||||||
|
MatchingUri strippedMatch;
|
||||||
|
if (startOfBareUri != curValidationStartPos) {
|
||||||
|
strippedMatch.valid = false;
|
||||||
|
} else {
|
||||||
|
strippedMatch.valid = true;
|
||||||
|
strippedMatch.length = curValidationEndPos - startOfBareUri;
|
||||||
|
}
|
||||||
|
return strippedMatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Wrap substrings matching "patterns" with "wrapper" in "message"
|
||||||
|
* @param message Where search for patterns
|
||||||
|
* @param patterns Array of regex patterns to find strings to wrap
|
||||||
|
* @param wrapper Surrounds the matched strings
|
||||||
|
* @note done separately from URI since the link must have a scheme added to be valid
|
||||||
|
* @return Copy of message with highlighted URLs
|
||||||
|
*/
|
||||||
|
QString highlight(const QString& message, const QVector<QRegularExpression>& patterns, const QString& wrapper)
|
||||||
|
{
|
||||||
|
QString result = message;
|
||||||
|
for (const QRegularExpression& exp : patterns) {
|
||||||
|
const int startLength = result.length();
|
||||||
|
int offset = 0;
|
||||||
|
QRegularExpressionMatchIterator iter = exp.globalMatch(result);
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
const QRegularExpressionMatch match = iter.next();
|
||||||
|
const int uriWithWrapMatch{0};
|
||||||
|
const int uriWithoutWrapMatch{1};
|
||||||
|
MatchingUri matchUri = stripSurroundingChars(match.capturedRef(uriWithWrapMatch),
|
||||||
|
match.capturedStart(uriWithoutWrapMatch) - match.capturedStart(uriWithWrapMatch));
|
||||||
|
if (!matchUri.valid) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const QString wrappedURL = wrapper.arg(match.captured(uriWithoutWrapMatch).left(matchUri.length));
|
||||||
|
result.replace(match.capturedStart(uriWithoutWrapMatch) + offset, matchUri.length, wrappedURL);
|
||||||
|
offset = result.length() - startLength;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Highlights URLs within passed message string
|
* @brief Highlights URLs within passed message string
|
||||||
* @param message Where search for URLs
|
* @param message Where search for URLs
|
||||||
* @return Copy of message with highlighted URLs
|
* @return Copy of message with highlighted URLs
|
||||||
*/
|
*/
|
||||||
QString highlightURL(const QString& message)
|
QString highlightURI(const QString& message)
|
||||||
{
|
{
|
||||||
QString result = message;
|
QString result = highlight(message, URI_WORD_PATTERNS, HREF_WRAPPER);
|
||||||
for (const QRegularExpression& exp : URL_PATTERNS) {
|
result = highlight(result, WWW_WORD_PATTERN, WWW_WRAPPER);
|
||||||
const int startLength = result.length();
|
|
||||||
int offset = 0;
|
|
||||||
QRegularExpressionMatchIterator iter = exp.globalMatch(result);
|
|
||||||
while (iter.hasNext()) {
|
|
||||||
const QRegularExpressionMatch match = iter.next();
|
|
||||||
const int startPos = match.capturedStart() + offset;
|
|
||||||
const int length = match.capturedLength();
|
|
||||||
const QString wrappedURL = HREF_WRAPPER.arg(match.captured());
|
|
||||||
result.replace(startPos, length, wrappedURL);
|
|
||||||
offset = result.length() - startLength;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,7 +22,7 @@
|
||||||
|
|
||||||
#include <QString>
|
#include <QString>
|
||||||
|
|
||||||
QString highlightURL(const QString& message);
|
QString highlightURI(const QString& message);
|
||||||
|
|
||||||
QString applyMarkdown(const QString& message, bool showFormattingSymbols);
|
QString applyMarkdown(const QString& message, bool showFormattingSymbols);
|
||||||
|
|
||||||
|
|
|
@ -168,11 +168,12 @@ static const QVector<StringPair> MIXED_FORMATTING_SPECIAL_CASES {
|
||||||
};
|
};
|
||||||
|
|
||||||
#define MAKE_LINK(url) "<a href=\"" url "\">" url "</a>"
|
#define MAKE_LINK(url) "<a href=\"" url "\">" url "</a>"
|
||||||
|
#define MAKE_WWW_LINK(url) "<a href=\"http://" url "\">" url "</a>"
|
||||||
|
|
||||||
static const QVector<QPair<QString, QString>> URL_CASES {
|
static const QVector<QPair<QString, QString>> URL_CASES {
|
||||||
PAIR_FORMAT("https://github.com/qTox/qTox/issues/4233",
|
PAIR_FORMAT("https://github.com/qTox/qTox/issues/4233",
|
||||||
MAKE_LINK("https://github.com/qTox/qTox/issues/4233")),
|
MAKE_LINK("https://github.com/qTox/qTox/issues/4233")),
|
||||||
PAIR_FORMAT("www.youtube.com", MAKE_LINK("www.youtube.com")),
|
PAIR_FORMAT("www.youtube.com", MAKE_WWW_LINK("www.youtube.com")),
|
||||||
PAIR_FORMAT("https://url.com/some*url/some*more*url/",
|
PAIR_FORMAT("https://url.com/some*url/some*more*url/",
|
||||||
MAKE_LINK("https://url.com/some*url/some*more*url/")),
|
MAKE_LINK("https://url.com/some*url/some*more*url/")),
|
||||||
PAIR_FORMAT("https://url.com/some_url/some_more_url/",
|
PAIR_FORMAT("https://url.com/some_url/some_more_url/",
|
||||||
|
@ -191,7 +192,7 @@ static const QVector<QPair<QString, QString>> URL_CASES {
|
||||||
"www.site.com/part1/part2",
|
"www.site.com/part1/part2",
|
||||||
MAKE_LINK("http://site.com/part1/part2") " "
|
MAKE_LINK("http://site.com/part1/part2") " "
|
||||||
MAKE_LINK("http://site.com/part3") " and one more time "
|
MAKE_LINK("http://site.com/part3") " and one more time "
|
||||||
MAKE_LINK("www.site.com/part1/part2")),
|
MAKE_WWW_LINK("www.site.com/part1/part2")),
|
||||||
PAIR_FORMAT("https://127.0.0.1/asd\n"
|
PAIR_FORMAT("https://127.0.0.1/asd\n"
|
||||||
"https://ABCD:EF01:2345:6789:ABCD:EF01:2345:6789/\n"
|
"https://ABCD:EF01:2345:6789:ABCD:EF01:2345:6789/\n"
|
||||||
"ftp://2001:DB8::8:800:200C:417A/\n"
|
"ftp://2001:DB8::8:800:200C:417A/\n"
|
||||||
|
@ -213,6 +214,26 @@ static const QVector<QPair<QString, QString>> URL_CASES {
|
||||||
MAKE_LINK("http://[::1]:22/") " "
|
MAKE_LINK("http://[::1]:22/") " "
|
||||||
MAKE_LINK("http://[::]:20/") " "
|
MAKE_LINK("http://[::]:20/") " "
|
||||||
),
|
),
|
||||||
|
// Test case from issue #4853 (include unicode, ending brackets that are part of URL)
|
||||||
|
PAIR_FORMAT("https://ja.wikipedia.org/wiki/印章",
|
||||||
|
MAKE_LINK("https://ja.wikipedia.org/wiki/印章")),
|
||||||
|
PAIR_FORMAT("https://en.wikipedia.org/wiki/Seal_(East_Asia)",
|
||||||
|
MAKE_LINK("https://en.wikipedia.org/wiki/Seal_(East_Asia)")),
|
||||||
|
// Test cases from issue #4295 (exclude surrounding quotes, brackets, ending punctuation)
|
||||||
|
PAIR_FORMAT("(http://www.google.com)",
|
||||||
|
"(" MAKE_LINK("http://www.google.com") ")"),
|
||||||
|
PAIR_FORMAT(""http://www.google.com"",
|
||||||
|
""" MAKE_LINK("http://www.google.com") """),
|
||||||
|
PAIR_FORMAT("http://www.google.com.",
|
||||||
|
MAKE_LINK("http://www.google.com") "."),
|
||||||
|
PAIR_FORMAT("http://www.google.com,",
|
||||||
|
MAKE_LINK("http://www.google.com") ","),
|
||||||
|
PAIR_FORMAT("http://www.google.com?",
|
||||||
|
MAKE_LINK("http://www.google.com") "?"),
|
||||||
|
PAIR_FORMAT("https://google.com?gfe_rd=cr",
|
||||||
|
MAKE_LINK("https://google.com?gfe_rd=cr")),
|
||||||
|
PAIR_FORMAT("["https://en.wikipedia.org/wiki/Seal_(East_Asia)"]?",
|
||||||
|
"["" MAKE_LINK("https://en.wikipedia.org/wiki/Seal_(East_Asia)") ""]?")
|
||||||
};
|
};
|
||||||
|
|
||||||
#undef PAIR_FORMAT
|
#undef PAIR_FORMAT
|
||||||
|
@ -325,7 +346,7 @@ private slots:
|
||||||
void urlTest();
|
void urlTest();
|
||||||
private:
|
private:
|
||||||
const MarkdownFunction markdownFunction = applyMarkdown;
|
const MarkdownFunction markdownFunction = applyMarkdown;
|
||||||
UrlHighlightFunction urlHighlightFunction = highlightURL;
|
UrlHighlightFunction urlHighlightFunction = highlightURI;
|
||||||
};
|
};
|
||||||
|
|
||||||
static QString commonWorkCasesProcessInput(const QString& str, const MarkdownToTags& mtt)
|
static QString commonWorkCasesProcessInput(const QString& str, const MarkdownToTags& mtt)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user