From e564b85e3c485b283855bfdf00dfc0ec5427fad4 Mon Sep 17 00:00:00 2001 From: Anthony Bilinski Date: Sat, 24 Feb 2018 20:35:47 -0800 Subject: [PATCH] fix(chatform): Broaden URL matching to include unicode Fix #4853 Fix #4295 *Instead of searching for strictly valid URIs, allow any characters following the scheme. This allows for UTF-8 characters used in other languages, as well as parentheses and other ASCII characters. This will over-match some invalid URLs. *Ignore surrounding characters of URIs and ending punctuation *Fix www-only links by adding http scheme to href --- src/chatlog/chatmessage.cpp | 2 +- src/chatlog/textformatter.cpp | 156 ++++++++++++++++++++++------ src/chatlog/textformatter.h | 2 +- test/chatlog/textformatter_test.cpp | 27 ++++- 4 files changed, 150 insertions(+), 37 deletions(-) diff --git a/src/chatlog/chatmessage.cpp b/src/chatlog/chatmessage.cpp index bc7316b2e..a53b79766 100644 --- a/src/chatlog/chatmessage.cpp +++ b/src/chatlog/chatmessage.cpp @@ -56,7 +56,7 @@ ChatMessage::Ptr ChatMessage::createChatMessage(const QString& sender, const QSt // quotes (green text) text = detectQuotes(text, type); - text = highlightURL(text); + text = highlightURI(text); // text styling Settings::StyleType styleType = Settings::getInstance().getStylePreference(); diff --git a/src/chatlog/textformatter.cpp b/src/chatlog/textformatter.cpp index 408eb91ac..4cd858f01 100644 --- a/src/chatlog/textformatter.cpp +++ b/src/chatlog/textformatter.cpp @@ -20,40 +20,42 @@ #include "textformatter.h" #include +#include // clang-format off -static const QString SINGLE_SIGN_PATTERN = QStringLiteral("(?<=^|[\\s\\n])" +// Note: escaping of '\' is only needed because QStringLiteral is broken by linebreak +static const QString SINGLE_SIGN_PATTERN = QStringLiteral("(?<=^|\\s)" "[%1]" "(?!\\s)" "([^%1\\n]+?)" "(? 
REGEX_TO_WRAPPER[] { #undef REGEXP_WRAPPER_PAIR static const QString HREF_WRAPPER = QStringLiteral(R"(%1)"); +static const QString WWW_WRAPPER = QStringLiteral(R"(%1)"); -// based in this: https://tools.ietf.org/html/rfc3986#section-2 -static const QString URL_PATH_PATTERN = QStringLiteral("[\\w:/?#\\[\\]@!$&'{}*+,;.~%=-]+"); - -static const QRegularExpression URL_PATTERNS[] = { - QRegularExpression(QStringLiteral(R"(\b(www\.|((http[s]?)|ftp)://)%1)").arg(URL_PATH_PATTERN)), - QRegularExpression(QStringLiteral(R"(\b(file|smb)://([\S| ]*))")), - QRegularExpression(QStringLiteral(R"(\btox:[a-zA-Z\\d]{76})")), - QRegularExpression(QStringLiteral(R"(\bmailto:\S+@\S+\.\S+)")), - QRegularExpression(QStringLiteral(R"(\btox:\S+@\S+)")), +static const QVector WWW_WORD_PATTERN = { + QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((www)\S+))")) }; +static const QVector URI_WORD_PATTERNS = { + // Note: This does not match only strictly valid URLs, but we broaden search to any string following scheme to + // allow UTF-8 "IRI"s instead of ASCII-only URLs + QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((((http[s]?)|ftp)://)\S+))")), + QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((file|smb)://([\S| ]*)))")), + QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(tox:[a-zA-Z\d]{76}))")), + QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(mailto:\S+@\S+\.\S+))")), + QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(tox:\S+@\S+))")), +}; + + // clang-format on +struct MatchingUri { + bool valid{false}; + int length{0}; +}; + +// pairs of characters that are ignored when surrounding a URI +static const QPair URI_WRAPPING_CHARS[] = { + {QString("("), QString(")")}, + {QString("["), QString("]")}, + {QString("""), QString(""")}, + {QString("'"), QString("'")} +}; + +// characters which are ignored from the end of URI +static const QChar URI_ENDING_CHARS[] = { + QChar::fromLatin1('?'), + QChar::fromLatin1('.'), + QChar::fromLatin1('!'), + QChar::fromLatin1(':'), + 
QChar::fromLatin1(',') +}; + +/** + * @brief Strips wrapping characters and ending punctuation from URI + * @param QRegularExpressionMatch of a word containing a URI + * @return MatchingUri containing info on the stripped URI + */ +MatchingUri stripSurroundingChars(const QStringRef wrappedUri, const int startOfBareUri) +{ + bool matchFound; + int curValidationStartPos = 0; + int curValidationEndPos = wrappedUri.length(); + do { + matchFound = false; + for (auto const& surroundChars : URI_WRAPPING_CHARS) + { + const int openingCharLength = surroundChars.first.length(); + const int closingCharLength = surroundChars.second.length(); + if (surroundChars.first == wrappedUri.mid(curValidationStartPos, openingCharLength) && + surroundChars.second == wrappedUri.mid(curValidationEndPos - closingCharLength, closingCharLength)) { + curValidationStartPos += openingCharLength; + curValidationEndPos -= closingCharLength; + matchFound = true; + break; + } + } + for (QChar const endChar : URI_ENDING_CHARS) { + const int charLength = 1; + if (endChar == wrappedUri.at(curValidationEndPos - charLength)) { + curValidationEndPos -= charLength; + matchFound = true; + break; + } + } + } while (matchFound); + MatchingUri strippedMatch; + if (startOfBareUri != curValidationStartPos) { + strippedMatch.valid = false; + } else { + strippedMatch.valid = true; + strippedMatch.length = curValidationEndPos - startOfBareUri; + } + return strippedMatch; +} + +/** + * @brief Wrap substrings matching "patterns" with "wrapper" in "message" + * @param message Where search for patterns + * @param patterns Array of regex patterns to find strings to wrap + * @param wrapper Surrounds the matched strings + * @note done separately from URI since the link must have a scheme added to be valid + * @return Copy of message with highlighted URLs + */ +QString highlight(const QString& message, const QVector& patterns, const QString& wrapper) +{ + QString result = message; + for (const QRegularExpression& exp : 
patterns) { + const int startLength = result.length(); + int offset = 0; + QRegularExpressionMatchIterator iter = exp.globalMatch(result); + while (iter.hasNext()) { + const QRegularExpressionMatch match = iter.next(); + const int uriWithWrapMatch{0}; + const int uriWithoutWrapMatch{1}; + MatchingUri matchUri = stripSurroundingChars(match.capturedRef(uriWithWrapMatch), + match.capturedStart(uriWithoutWrapMatch) - match.capturedStart(uriWithWrapMatch)); + if (!matchUri.valid) { + continue; + } + const QString wrappedURL = wrapper.arg(match.captured(uriWithoutWrapMatch).left(matchUri.length)); + result.replace(match.capturedStart(uriWithoutWrapMatch) + offset, matchUri.length, wrappedURL); + offset = result.length() - startLength; + } + } + return result; +} + /** * @brief Highlights URLs within passed message string * @param message Where search for URLs * @return Copy of message with highlighted URLs */ -QString highlightURL(const QString& message) +QString highlightURI(const QString& message) { - QString result = message; - for (const QRegularExpression& exp : URL_PATTERNS) { - const int startLength = result.length(); - int offset = 0; - QRegularExpressionMatchIterator iter = exp.globalMatch(result); - while (iter.hasNext()) { - const QRegularExpressionMatch match = iter.next(); - const int startPos = match.capturedStart() + offset; - const int length = match.capturedLength(); - const QString wrappedURL = HREF_WRAPPER.arg(match.captured()); - result.replace(startPos, length, wrappedURL); - offset = result.length() - startLength; - } - } + QString result = highlight(message, URI_WORD_PATTERNS, HREF_WRAPPER); + result = highlight(result, WWW_WORD_PATTERN, WWW_WRAPPER); return result; } diff --git a/src/chatlog/textformatter.h b/src/chatlog/textformatter.h index 301da9c7e..9127d5ab0 100644 --- a/src/chatlog/textformatter.h +++ b/src/chatlog/textformatter.h @@ -22,7 +22,7 @@ #include -QString highlightURL(const QString& message); +QString highlightURI(const QString& 
message); QString applyMarkdown(const QString& message, bool showFormattingSymbols); diff --git a/test/chatlog/textformatter_test.cpp b/test/chatlog/textformatter_test.cpp index ffd89a550..91f0802e5 100644 --- a/test/chatlog/textformatter_test.cpp +++ b/test/chatlog/textformatter_test.cpp @@ -168,11 +168,12 @@ static const QVector MIXED_FORMATTING_SPECIAL_CASES { }; #define MAKE_LINK(url) "" url "" +#define MAKE_WWW_LINK(url) "" url "" static const QVector> URL_CASES { PAIR_FORMAT("https://github.com/qTox/qTox/issues/4233", MAKE_LINK("https://github.com/qTox/qTox/issues/4233")), - PAIR_FORMAT("www.youtube.com", MAKE_LINK("www.youtube.com")), + PAIR_FORMAT("www.youtube.com", MAKE_WWW_LINK("www.youtube.com")), PAIR_FORMAT("https://url.com/some*url/some*more*url/", MAKE_LINK("https://url.com/some*url/some*more*url/")), PAIR_FORMAT("https://url.com/some_url/some_more_url/", @@ -191,7 +192,7 @@ static const QVector> URL_CASES { "www.site.com/part1/part2", MAKE_LINK("http://site.com/part1/part2") " " MAKE_LINK("http://site.com/part3") " and one more time " - MAKE_LINK("www.site.com/part1/part2")), + MAKE_WWW_LINK("www.site.com/part1/part2")), PAIR_FORMAT("https://127.0.0.1/asd\n" "https://ABCD:EF01:2345:6789:ABCD:EF01:2345:6789/\n" "ftp://2001:DB8::8:800:200C:417A/\n" @@ -213,6 +214,26 @@ static const QVector> URL_CASES { MAKE_LINK("http://[::1]:22/") " " MAKE_LINK("http://[::]:20/") " " ), + // Test case from issue #4853 (include unicode, ending brackets that are part of URL) + PAIR_FORMAT("https://ja.wikipedia.org/wiki/印章", + MAKE_LINK("https://ja.wikipedia.org/wiki/印章")), + PAIR_FORMAT("https://en.wikipedia.org/wiki/Seal_(East_Asia)", + MAKE_LINK("https://en.wikipedia.org/wiki/Seal_(East_Asia)")), + // Test cases from issue #4295 (exclude surrounding quotes, brackets, ending punctuation) + PAIR_FORMAT("(http://www.google.com)", + "(" MAKE_LINK("http://www.google.com") ")"), + PAIR_FORMAT(""http://www.google.com"", + """ MAKE_LINK("http://www.google.com") """), + 
PAIR_FORMAT("http://www.google.com.", + MAKE_LINK("http://www.google.com") "."), + PAIR_FORMAT("http://www.google.com,", + MAKE_LINK("http://www.google.com") ","), + PAIR_FORMAT("http://www.google.com?", + MAKE_LINK("http://www.google.com") "?"), + PAIR_FORMAT("https://google.com?gfe_rd=cr", + MAKE_LINK("https://google.com?gfe_rd=cr")), + PAIR_FORMAT("["https://en.wikipedia.org/wiki/Seal_(East_Asia)"]?", + "["" MAKE_LINK("https://en.wikipedia.org/wiki/Seal_(East_Asia)") ""]?") }; #undef PAIR_FORMAT @@ -325,7 +346,7 @@ private slots: void urlTest(); private: const MarkdownFunction markdownFunction = applyMarkdown; - UrlHighlightFunction urlHighlightFunction = highlightURL; + UrlHighlightFunction urlHighlightFunction = highlightURI; }; static QString commonWorkCasesProcessInput(const QString& str, const MarkdownToTags& mtt)