testing new Hashtag regex

This commit is contained in:
Nicolas Constant 2021-02-01 21:48:47 -05:00
parent 6fac0ceffa
commit c7bf5f79f8
No known key found for this signature in database
GPG key ID: 1E9F677FB01A5688
3 changed files with 144 additions and 42 deletions

View file

@ -4,6 +4,7 @@ namespace BirdsiteLive.Common.Regexes
{
// Shared, precompiled-once regular expressions used to locate hashtags in
// message text and to validate an extracted hashtag name.
// NOTE(review): this view shows two declarations named `Hashtag`; presumably
// one is the pre-change line and the other the post-change line of the diff —
// confirm against the actual file, since both cannot coexist in one class.
public class HashtagRegexes
{
// Pattern with three capture groups:
//   1: exactly one character preceding the '#' (so a '#' at the very start
//      of the input has nothing to bind to and cannot match);
//   2: '#' followed by one or more ASCII letters or digits (no underscore);
//   3: the terminator — whitespace, end of input, or one of . , ; : ! ? / | -
public static readonly Regex Hashtag = new Regex(@"(.)(#[a-zA-Z0-9]+)(\s|$|[.,;:!?/|-])");
// Matches a string made up entirely of ASCII letters, digits and underscores
// (at least one character); anchored at both ends.
public static readonly Regex HashtagName = new Regex(@"^[a-zA-Z0-9_]+$");
// Pattern with three capture groups:
//   1: zero or one character preceding the '#' (optional, so a '#' at the
//      start of the input can match with an empty group 1);
//   2: the tag name WITHOUT the leading '#': one or more ASCII letters,
//      digits or underscores;
//   3: the terminator — whitespace, end of input, or one of < . , ; : ! ? / | -
public static readonly Regex Hashtag = new Regex(@"(.?)#([a-zA-Z0-9_]+)(\s|$|[<.,;:!?/|-])");
}
}

View file

@ -4,6 +4,8 @@ using System.Text.RegularExpressions;
using BirdsiteLive.ActivityPub.Models;
using BirdsiteLive.Common.Regexes;
using BirdsiteLive.Common.Settings;
using BirdsiteLive.Twitter;
using Microsoft.Extensions.Logging;
namespace BirdsiteLive.Domain.Tools
{
@ -14,7 +16,7 @@ namespace BirdsiteLive.Domain.Tools
public class StatusExtractor : IStatusExtractor
{
private readonly Regex _hastagRegex = new Regex(@"\W(\#[a-zA-Z0-9_ー]+\b)(?!;)");
//private readonly Regex _hastagRegex = new Regex(@"\W(\#[a-zA-Z0-9_ー]+\b)(?!;)");
//private readonly Regex _hastagRegex = new Regex(@"#\w+");
//private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+\w*)\b(?!;)");
//private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+)\b(?!;)");
@ -27,29 +29,31 @@ namespace BirdsiteLive.Domain.Tools
private readonly Regex _urlRegex = new Regex(@"((http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?)");
private readonly InstanceSettings _instanceSettings;
private readonly ILogger<StatusExtractor> _logger;
#region Ctor
public StatusExtractor(InstanceSettings instanceSettings)
public StatusExtractor(InstanceSettings instanceSettings, ILogger<StatusExtractor> logger)
{
_instanceSettings = instanceSettings;
_logger = logger;
}
#endregion
public (string content, Tag[] tags) Extract(string messageContent, bool extractMentions = true)
{
var tags = new List<Tag>();
messageContent = $" {messageContent} ";
//messageContent = $" {messageContent} ";
// Replace return lines
messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "</p><p> ");
messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "<br/> ");
messageContent = Regex.Replace(messageContent, @"\(@", "( @");
messageContent = Regex.Replace(messageContent, @"\(#", "( #");
messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "</p><p>");
messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "<br/>");
//messageContent = Regex.Replace(messageContent, @"\(@", "( @");
//messageContent = Regex.Replace(messageContent, @"\(#", "( #");
// Secure emojis
var emojiMatch = EmojiRegexes.Emoji.Matches(messageContent);
foreach (Match m in emojiMatch)
messageContent = Regex.Replace(messageContent, m.ToString(), $" {m} ");
//// Secure emojis
//var emojiMatch = EmojiRegexes.Emoji.Matches(messageContent);
//foreach (Match m in emojiMatch)
// messageContent = Regex.Replace(messageContent, m.ToString(), $" {m} ");
// Extract Urls
var urlMatch = _urlRegex.Matches(messageContent);
@ -83,12 +87,19 @@ namespace BirdsiteLive.Domain.Tools
}
// Extract Hashtags
var hashtagMatch = OrderByLength(_hastagRegex.Matches(messageContent));
var hashtagMatch = OrderByLength(HashtagRegexes.Hashtag.Matches(messageContent));
foreach (Match m in hashtagMatch.OrderByDescending(x => x.Length))
{
var tag = m.ToString().Replace("#", string.Empty).Replace("\n", string.Empty).Trim();
var url = $"https://{_instanceSettings.Domain}/tags/{tag}";
var tag = m.Groups[2].ToString();
//var tag = m.ToString().Replace("#", string.Empty).Replace("\n", string.Empty).Trim();
if (!HashtagRegexes.HashtagName.IsMatch(tag))
{
_logger.LogError("Parsing Hashtag failed: {Tag} on {Content}", tag, messageContent);
continue;
}
var url = $"https://{_instanceSettings.Domain}/tags/{tag}";
tags.Add(new Tag
{
name = $"#{tag}",
@ -96,8 +107,11 @@ namespace BirdsiteLive.Domain.Tools
type = "Hashtag"
});
messageContent = Regex.Replace(messageContent, m.ToString(),
$@" <a href=""{url}"" class=""mention hashtag"" rel=""tag"">#<span>{tag}</span></a>");
//messageContent = Regex.Replace(messageContent, m.ToString(),
// $@" <a href=""{url}"" class=""mention hashtag"" rel=""tag"">#<span>{tag}</span></a>");
messageContent = Regex.Replace(messageContent, m.Groups[0].ToString(),
$@"{m.Groups[1]}<a href=""{url}"" class=""mention hashtag"" rel=""tag"">#<span>{tag}</span></a>{m.Groups[3]}");
}
// Extract Mentions

View file

@ -3,7 +3,9 @@ using System.Linq;
using BirdsiteLive.Common.Settings;
using BirdsiteLive.Domain.Tools;
using BirdsiteLive.Twitter.Models;
using Microsoft.Extensions.Logging;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using Moq;
namespace BirdsiteLive.Domain.Tests.Tools
{
@ -28,11 +30,16 @@ namespace BirdsiteLive.Domain.Tests.Tools
#region Stubs
var message = "Bla.\n\n@Mention blo. https://t.co/pgtrJi9600";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.IsTrue(result.content.Contains("Bla."));
Assert.IsTrue(result.content.Contains("</p><p>"));
#endregion
@ -45,10 +52,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = "Bla.\n@Mention blo. https://t.co/pgtrJi9600";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.IsTrue(result.content.Contains("Bla."));
Assert.IsTrue(result.content.Contains("<br/>"));
#endregion
@ -61,10 +73,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = $"Bla!{Environment.NewLine}https://t.co/L8BpyHgg25";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(0, result.tags.Length);
Assert.IsTrue(result.content.Contains("Bla!"));
@ -79,10 +96,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = $"Bla!{Environment.NewLine}https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(0, result.tags.Length);
Assert.IsTrue(result.content.Contains("Bla!"));
@ -97,10 +119,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = $"Bla!{Environment.NewLine}https://www.eff.org/deeplinks/2020/07/pact";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(0, result.tags.Length);
Assert.IsTrue(result.content.Contains("Bla!"));
@ -115,10 +142,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = $"https://t.co/L8BpyHgg25 Bla!{Environment.NewLine}https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(0, result.tags.Length);
Assert.IsTrue(result.content.Contains("Bla!"));
@ -132,13 +164,18 @@ namespace BirdsiteLive.Domain.Tests.Tools
public void Extract_SingleHashTag_Test()
{
#region Stubs
var message = $"Bla!{Environment.NewLine}#mytag";
var message = $"Bla!{Environment.NewLine}#mytag";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(1, result.tags.Length);
Assert.AreEqual("#mytag", result.tags.First().name);
Assert.AreEqual("Hashtag", result.tags.First().type);
@ -153,13 +190,18 @@ namespace BirdsiteLive.Domain.Tests.Tools
public void Extract_SingleHashTag_AtStart_Test()
{
#region Stubs
var message = $"#mytag Bla!";
var message = "#mytag Bla!";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(1, result.tags.Length);
Assert.AreEqual("#mytag", result.tags.First().name);
Assert.AreEqual("Hashtag", result.tags.First().type);
@ -174,20 +216,25 @@ namespace BirdsiteLive.Domain.Tests.Tools
public void Extract_SingleHashTag_SpecialChar_Test()
{
#region Stubs
var message = $"Bla!{Environment.NewLine}#COVIDー19";
var message = $"Bla!{Environment.NewLine}#COVID_19";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(1, result.tags.Length);
Assert.AreEqual("#COVID19", result.tags.First().name);
Assert.AreEqual("#COVID_19", result.tags.First().name);
Assert.AreEqual("Hashtag", result.tags.First().type);
Assert.AreEqual("https://domain.name/tags/COVID19", result.tags.First().href);
Assert.AreEqual("https://domain.name/tags/COVID_19", result.tags.First().href);
Assert.IsTrue(result.content.Contains("Bla!"));
Assert.IsTrue(result.content.Contains(@"<a href=""https://domain.name/tags/COVIDー19"" class=""mention hashtag"" rel=""tag"">#<span>COVIDー19</span></a>"));
Assert.IsTrue(result.content.Contains(@"<a href=""https://domain.name/tags/COVID_19"" class=""mention hashtag"" rel=""tag"">#<span>COVID_19</span></a>"));
#endregion
}
@ -195,13 +242,18 @@ namespace BirdsiteLive.Domain.Tests.Tools
public void Extract_MultiHashTags_Test()
{
#region Stubs
var message = $"Bla!{Environment.NewLine}#mytag #mytag2 #mytag3{Environment.NewLine}Test #bal Test";
var message = $"Bla!{Environment.NewLine}#mytag #mytag2 #mytag3{Environment.NewLine}Test #bal Test";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(4, result.tags.Length);
Assert.IsTrue(result.content.Contains("Bla!"));
Assert.IsTrue(result.content.Contains(@"<a href=""https://domain.name/tags/mytag"" class=""mention hashtag"" rel=""tag"">#<span>mytag</span></a>"));
@ -218,10 +270,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = $"Bla!{Environment.NewLine}@mynickname";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(1, result.tags.Length);
Assert.AreEqual("@mynickname@domain.name", result.tags.First().name);
Assert.AreEqual("Mention", result.tags.First().type);
@ -239,10 +296,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = $"Bla!{Environment.NewLine}@my___nickname";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(1, result.tags.Length);
Assert.AreEqual("@my___nickname@domain.name", result.tags.First().name);
Assert.AreEqual("Mention", result.tags.First().type);
@ -260,10 +322,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = $"@mynickname Bla!";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(1, result.tags.Length);
Assert.AreEqual("@mynickname@domain.name", result.tags.First().name);
Assert.AreEqual("Mention", result.tags.First().type);
@ -281,10 +348,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = $"Bla!{Environment.NewLine}@mynickname @mynickname2 @mynickname3{Environment.NewLine}Test @dada Test";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(4, result.tags.Length);
Assert.IsTrue(result.content.Contains("Bla!"));
Assert.IsTrue(result.content.Contains(@"<span class=""h-card""><a href=""https://domain.name/@mynickname"" class=""u-url mention"">@<span>mynickname</span></a></span>"));
@ -301,10 +373,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
var message = $"Bla!{Environment.NewLine}@mynickname #mytag2 @mynickname3{Environment.NewLine}Test @dada #dada Test";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(5, result.tags.Length);
Assert.IsTrue(result.content.Contains("Bla!"));
Assert.IsTrue(result.content.Contains(@"<span class=""h-card""><a href=""https://domain.name/@mynickname"" class=""u-url mention"">@<span>mynickname</span></a></span>"));
@ -324,10 +401,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
//var message = $"tests@mynickname";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(1, result.tags.Length);
Assert.IsTrue(result.content.Contains(
@"😤 <span class=""h-card""><a href=""https://domain.name/@mynickname"" class=""u-url mention"">@<span>mynickname</span></a></span>"));
@ -344,10 +426,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
//var message = $"tests@mynickname";
#endregion
var service = new StatusExtractor(_settings);
#region Mocks
var logger = new Mock<ILogger<StatusExtractor>>();
#endregion
var service = new StatusExtractor(_settings, logger.Object);
var result = service.Extract(message);
#region Validations
logger.VerifyAll();
Assert.AreEqual(1, result.tags.Length);
Assert.IsTrue(result.content.Equals(@"bla ( <span class=""h-card""><a href=""https://domain.name/@mynickname"" class=""u-url mention"">@<span>mynickname</span></a></span> test)"));
#endregion