From c7bf5f79f8a205a91bda0c8cf40e9486b51c81fa Mon Sep 17 00:00:00 2001 From: Nicolas Constant Date: Mon, 1 Feb 2021 21:48:47 -0500 Subject: [PATCH] testing new Hashtag regex --- .../Regexes/HashtagRegexes.cs | 3 +- .../Tools/StatusExtractor.cs | 46 ++++-- .../Tools/StatusExtractorTests.cs | 137 ++++++++++++++---- 3 files changed, 144 insertions(+), 42 deletions(-) diff --git a/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs b/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs index c5e8ed7..99b2f32 100644 --- a/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs +++ b/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs @@ -4,6 +4,7 @@ namespace BirdsiteLive.Common.Regexes { public class HashtagRegexes { - public static readonly Regex Hashtag = new Regex(@"(.)(#[a-zA-Z0-9]+)(\s|$|[.,;:!?/|-])"); + public static readonly Regex HashtagName = new Regex(@"^[a-zA-Z0-9_]+$"); + public static readonly Regex Hashtag = new Regex(@"(.?)#([a-zA-Z0-9_]+)(\s|$|[<.,;:!?/|-])"); } } \ No newline at end of file diff --git a/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs b/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs index a181ac2..9429096 100644 --- a/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs +++ b/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs @@ -4,6 +4,8 @@ using System.Text.RegularExpressions; using BirdsiteLive.ActivityPub.Models; using BirdsiteLive.Common.Regexes; using BirdsiteLive.Common.Settings; +using BirdsiteLive.Twitter; +using Microsoft.Extensions.Logging; namespace BirdsiteLive.Domain.Tools { @@ -14,7 +16,7 @@ namespace BirdsiteLive.Domain.Tools public class StatusExtractor : IStatusExtractor { - private readonly Regex _hastagRegex = new Regex(@"\W(\#[a-zA-Z0-9_ー]+\b)(?!;)"); + //private readonly Regex _hastagRegex = new Regex(@"\W(\#[a-zA-Z0-9_ー]+\b)(?!;)"); //private readonly Regex _hastagRegex = new Regex(@"#\w+"); //private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+\w*)\b(?!;)"); //private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+)\b(?!;)"); @@ -27,29 +29,31 @@ namespace BirdsiteLive.Domain.Tools private readonly Regex _urlRegex = new Regex(@"((http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)"); private readonly InstanceSettings _instanceSettings; + private readonly ILogger _logger; #region Ctor - public StatusExtractor(InstanceSettings instanceSettings) + public StatusExtractor(InstanceSettings instanceSettings, ILogger logger) { _instanceSettings = instanceSettings; + _logger = logger; } #endregion public (string content, Tag[] tags) Extract(string messageContent, bool extractMentions = true) { var tags = new List(); - messageContent = $" {messageContent} "; + //messageContent = $" {messageContent} "; // Replace return lines - messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "

"); - messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "
"); - messageContent = Regex.Replace(messageContent, @"\(@", "( @"); - messageContent = Regex.Replace(messageContent, @"\(#", "( #"); + messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "

"); + messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "
"); + //messageContent = Regex.Replace(messageContent, @"\(@", "( @"); + //messageContent = Regex.Replace(messageContent, @"\(#", "( #"); - // Secure emojis - var emojiMatch = EmojiRegexes.Emoji.Matches(messageContent); - foreach (Match m in emojiMatch) - messageContent = Regex.Replace(messageContent, m.ToString(), $" {m} "); + //// Secure emojis + //var emojiMatch = EmojiRegexes.Emoji.Matches(messageContent); + //foreach (Match m in emojiMatch) + // messageContent = Regex.Replace(messageContent, m.ToString(), $" {m} "); // Extract Urls var urlMatch = _urlRegex.Matches(messageContent); @@ -83,12 +87,19 @@ namespace BirdsiteLive.Domain.Tools } // Extract Hashtags - var hashtagMatch = OrderByLength(_hastagRegex.Matches(messageContent)); + var hashtagMatch = OrderByLength(HashtagRegexes.Hashtag.Matches(messageContent)); foreach (Match m in hashtagMatch.OrderByDescending(x => x.Length)) { - var tag = m.ToString().Replace("#", string.Empty).Replace("\n", string.Empty).Trim(); - var url = $"https://{_instanceSettings.Domain}/tags/{tag}"; + var tag = m.Groups[2].ToString(); + //var tag = m.ToString().Replace("#", string.Empty).Replace("\n", string.Empty).Trim(); + if (!HashtagRegexes.HashtagName.IsMatch(tag)) + { + _logger.LogError("Parsing Hashtag failed: {Tag} on {Content}", tag, messageContent); + continue; + } + + var url = $"https://{_instanceSettings.Domain}/tags/{tag}"; tags.Add(new Tag { name = $"#{tag}", @@ -96,8 +107,11 @@ namespace BirdsiteLive.Domain.Tools type = "Hashtag" }); - messageContent = Regex.Replace(messageContent, m.ToString(), - $@" #{tag}"); + //messageContent = Regex.Replace(messageContent, m.ToString(), + // $@" #{tag}"); + + messageContent = Regex.Replace(messageContent, m.Groups[0].ToString(), + $@"{m.Groups[1]}#{tag}{m.Groups[3]}"); } // Extract Mentions diff --git a/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs b/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs index f5dc91a..f126b12 100644 --- a/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs +++ b/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs @@ -3,7 +3,9 @@ using System.Linq; using BirdsiteLive.Common.Settings; using BirdsiteLive.Domain.Tools; using BirdsiteLive.Twitter.Models; +using Microsoft.Extensions.Logging; using Microsoft.VisualStudio.TestTools.UnitTesting; +using Moq; namespace BirdsiteLive.Domain.Tests.Tools { @@ -28,11 +30,16 @@ namespace BirdsiteLive.Domain.Tests.Tools #region Stubs var message = "Bla.\n\n@Mention blo. https://t.co/pgtrJi9600"; #endregion - - var service = new StatusExtractor(_settings); + + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.IsTrue(result.content.Contains("Bla.")); Assert.IsTrue(result.content.Contains("

")); #endregion @@ -45,10 +52,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = "Bla.\n@Mention blo. https://t.co/pgtrJi9600"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.IsTrue(result.content.Contains("Bla.")); Assert.IsTrue(result.content.Contains("
")); #endregion @@ -61,10 +73,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = $"Bla!{Environment.NewLine}https://t.co/L8BpyHgg25"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(0, result.tags.Length); Assert.IsTrue(result.content.Contains("Bla!")); @@ -79,10 +96,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = $"Bla!{Environment.NewLine}https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(0, result.tags.Length); Assert.IsTrue(result.content.Contains("Bla!")); @@ -97,10 +119,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = $"Bla!{Environment.NewLine}https://www.eff.org/deeplinks/2020/07/pact"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(0, result.tags.Length); Assert.IsTrue(result.content.Contains("Bla!")); @@ -115,10 +142,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = $"https://t.co/L8BpyHgg25 Bla!{Environment.NewLine}https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(0, result.tags.Length); Assert.IsTrue(result.content.Contains("Bla!")); @@ -132,13 +164,18 @@ namespace BirdsiteLive.Domain.Tests.Tools public void Extract_SingleHashTag_Test() { #region Stubs - var message = $"Bla!{Environment.NewLine}#mytag⁠"; + var message = $"Bla!{Environment.NewLine}#mytag"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(1, result.tags.Length); Assert.AreEqual("#mytag", result.tags.First().name); Assert.AreEqual("Hashtag", result.tags.First().type); @@ -153,13 +190,18 @@ namespace BirdsiteLive.Domain.Tests.Tools public void Extract_SingleHashTag_AtStart_Test() { #region Stubs - var message = $"#mytag⁠ Bla!"; + var message = "#mytag Bla!"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(1, result.tags.Length); Assert.AreEqual("#mytag", result.tags.First().name); Assert.AreEqual("Hashtag", result.tags.First().type); @@ -174,20 +216,25 @@ namespace BirdsiteLive.Domain.Tests.Tools public void Extract_SingleHashTag_SpecialChar_Test() { #region Stubs - var message = $"Bla!{Environment.NewLine}#COVIDー19⁠"; + var message = $"Bla!{Environment.NewLine}#COVID_19"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(1, result.tags.Length); - Assert.AreEqual("#COVIDー19", result.tags.First().name); + Assert.AreEqual("#COVID_19", result.tags.First().name); Assert.AreEqual("Hashtag", result.tags.First().type); - Assert.AreEqual("https://domain.name/tags/COVIDー19", result.tags.First().href); + Assert.AreEqual("https://domain.name/tags/COVID_19", result.tags.First().href); Assert.IsTrue(result.content.Contains("Bla!")); - Assert.IsTrue(result.content.Contains(@"#COVIDー19")); + Assert.IsTrue(result.content.Contains(@"#COVID_19")); #endregion } @@ -195,13 +242,18 @@ namespace BirdsiteLive.Domain.Tests.Tools public void Extract_MultiHashTags_Test() { #region Stubs - var message = $"Bla!{Environment.NewLine}#mytag #mytag2 #mytag3⁠{Environment.NewLine}Test #bal Test"; + var message = $"Bla!{Environment.NewLine}#mytag #mytag2 #mytag3{Environment.NewLine}Test #bal Test"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(4, result.tags.Length); Assert.IsTrue(result.content.Contains("Bla!")); Assert.IsTrue(result.content.Contains(@"#mytag")); @@ -218,10 +270,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = $"Bla!{Environment.NewLine}@mynickname⁠"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(1, result.tags.Length); Assert.AreEqual("@mynickname@domain.name", result.tags.First().name); Assert.AreEqual("Mention", result.tags.First().type); @@ -239,10 +296,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = $"Bla!{Environment.NewLine}@my___nickname⁠"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(1, result.tags.Length); Assert.AreEqual("@my___nickname@domain.name", result.tags.First().name); Assert.AreEqual("Mention", result.tags.First().type); @@ -260,10 +322,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = $"@mynickname Bla!"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(1, result.tags.Length); Assert.AreEqual("@mynickname@domain.name", result.tags.First().name); Assert.AreEqual("Mention", result.tags.First().type); @@ -281,10 +348,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = $"Bla!{Environment.NewLine}@mynickname⁠ @mynickname2 @mynickname3{Environment.NewLine}Test @dada Test"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(4, result.tags.Length); Assert.IsTrue(result.content.Contains("Bla!")); Assert.IsTrue(result.content.Contains(@"@mynickname")); @@ -301,10 +373,15 @@ namespace BirdsiteLive.Domain.Tests.Tools var message = $"Bla!{Environment.NewLine}@mynickname⁠ #mytag2 @mynickname3{Environment.NewLine}Test @dada #dada Test"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(5, result.tags.Length); Assert.IsTrue(result.content.Contains("Bla!")); Assert.IsTrue(result.content.Contains(@"@mynickname")); @@ -324,10 +401,15 @@ namespace BirdsiteLive.Domain.Tests.Tools //var message = $"tests@mynickname"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(1, result.tags.Length); Assert.IsTrue(result.content.Contains( @"😤 @mynickname")); @@ -344,10 +426,15 @@ namespace BirdsiteLive.Domain.Tests.Tools //var message = $"tests@mynickname"; #endregion - var service = new StatusExtractor(_settings); + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); var result = service.Extract(message); #region Validations + logger.VerifyAll(); Assert.AreEqual(1, result.tags.Length); Assert.IsTrue(result.content.Equals(@"bla ( @mynickname test)")); #endregion