testing new Hashtag regex
This commit is contained in:
parent
6fac0ceffa
commit
c7bf5f79f8
3 changed files with 144 additions and 42 deletions
|
@ -4,6 +4,7 @@ namespace BirdsiteLive.Common.Regexes
|
|||
{
|
||||
/// <summary>
/// Shared regular expressions used to locate and validate hashtags in status text.
/// </summary>
public class HashtagRegexes
{
    /// <summary>
    /// Validates a bare hashtag name (without the leading '#'): the whole string must
    /// consist of one or more ASCII letters, digits or underscores.
    /// </summary>
    public static readonly Regex HashtagName = new Regex(@"^[a-zA-Z0-9_]+$");

    /// <summary>
    /// Locates a hashtag occurrence inside a larger text.
    /// Capture groups:
    ///   1 - the optional single character immediately before '#' (empty when the
    ///       hashtag starts the input); any character except a newline is accepted;
    ///   2 - the tag name without '#', made of ASCII letters, digits or underscores;
    ///   3 - the terminator: a whitespace character, the end of the input, or one of
    ///       the punctuation/markup characters listed in the trailing character class.
    /// Callers that rebuild the text around the match are expected to re-emit
    /// groups 1 and 3 so that the surrounding characters are not lost.
    /// </summary>
    public static readonly Regex Hashtag = new Regex(@"(.?)#([a-zA-Z0-9_]+)(\s|$|[<.,;:!?/|-])");
}
|
||||
}
|
|
@ -4,6 +4,8 @@ using System.Text.RegularExpressions;
|
|||
using BirdsiteLive.ActivityPub.Models;
|
||||
using BirdsiteLive.Common.Regexes;
|
||||
using BirdsiteLive.Common.Settings;
|
||||
using BirdsiteLive.Twitter;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace BirdsiteLive.Domain.Tools
|
||||
{
|
||||
|
@ -14,7 +16,7 @@ namespace BirdsiteLive.Domain.Tools
|
|||
|
||||
public class StatusExtractor : IStatusExtractor
|
||||
{
|
||||
private readonly Regex _hastagRegex = new Regex(@"\W(\#[a-zA-Z0-9_ー]+\b)(?!;)");
|
||||
//private readonly Regex _hastagRegex = new Regex(@"\W(\#[a-zA-Z0-9_ー]+\b)(?!;)");
|
||||
//private readonly Regex _hastagRegex = new Regex(@"#\w+");
|
||||
//private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+\w*)\b(?!;)");
|
||||
//private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+)\b(?!;)");
|
||||
|
@ -27,29 +29,31 @@ namespace BirdsiteLive.Domain.Tools
|
|||
private readonly Regex _urlRegex = new Regex(@"((http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)");
|
||||
|
||||
private readonly InstanceSettings _instanceSettings;
|
||||
private readonly ILogger<StatusExtractor> _logger;
|
||||
|
||||
#region Ctor
|
||||
/// <summary>
/// Creates a status extractor.
/// </summary>
/// <param name="instanceSettings">Instance settings; stored for later use when building URLs.</param>
/// <param name="logger">Logger; stored for reporting problems found while extracting.</param>
public StatusExtractor(InstanceSettings instanceSettings, ILogger<StatusExtractor> logger)
{
    _instanceSettings = instanceSettings;
    _logger = logger;
}
|
||||
#endregion
|
||||
|
||||
public (string content, Tag[] tags) Extract(string messageContent, bool extractMentions = true)
|
||||
{
|
||||
var tags = new List<Tag>();
|
||||
messageContent = $" {messageContent} ";
|
||||
//messageContent = $" {messageContent} ";
|
||||
|
||||
// Replace return lines
|
||||
messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "</p><p> ");
|
||||
messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "<br/> ");
|
||||
messageContent = Regex.Replace(messageContent, @"\(@", "( @");
|
||||
messageContent = Regex.Replace(messageContent, @"\(#", "( #");
|
||||
messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "</p><p>");
|
||||
messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "<br/>");
|
||||
//messageContent = Regex.Replace(messageContent, @"\(@", "( @");
|
||||
//messageContent = Regex.Replace(messageContent, @"\(#", "( #");
|
||||
|
||||
// Secure emojis
|
||||
var emojiMatch = EmojiRegexes.Emoji.Matches(messageContent);
|
||||
foreach (Match m in emojiMatch)
|
||||
messageContent = Regex.Replace(messageContent, m.ToString(), $" {m} ");
|
||||
//// Secure emojis
|
||||
//var emojiMatch = EmojiRegexes.Emoji.Matches(messageContent);
|
||||
//foreach (Match m in emojiMatch)
|
||||
// messageContent = Regex.Replace(messageContent, m.ToString(), $" {m} ");
|
||||
|
||||
// Extract Urls
|
||||
var urlMatch = _urlRegex.Matches(messageContent);
|
||||
|
@ -83,12 +87,19 @@ namespace BirdsiteLive.Domain.Tools
|
|||
}
|
||||
|
||||
// Extract Hashtags
|
||||
var hashtagMatch = OrderByLength(_hastagRegex.Matches(messageContent));
|
||||
var hashtagMatch = OrderByLength(HashtagRegexes.Hashtag.Matches(messageContent));
|
||||
foreach (Match m in hashtagMatch.OrderByDescending(x => x.Length))
|
||||
{
|
||||
var tag = m.ToString().Replace("#", string.Empty).Replace("\n", string.Empty).Trim();
|
||||
var url = $"https://{_instanceSettings.Domain}/tags/{tag}";
|
||||
var tag = m.Groups[2].ToString();
|
||||
//var tag = m.ToString().Replace("#", string.Empty).Replace("\n", string.Empty).Trim();
|
||||
|
||||
if (!HashtagRegexes.HashtagName.IsMatch(tag))
|
||||
{
|
||||
_logger.LogError("Parsing Hashtag failed: {Tag} on {Content}", tag, messageContent);
|
||||
continue;
|
||||
}
|
||||
|
||||
var url = $"https://{_instanceSettings.Domain}/tags/{tag}";
|
||||
tags.Add(new Tag
|
||||
{
|
||||
name = $"#{tag}",
|
||||
|
@ -96,8 +107,11 @@ namespace BirdsiteLive.Domain.Tools
|
|||
type = "Hashtag"
|
||||
});
|
||||
|
||||
messageContent = Regex.Replace(messageContent, m.ToString(),
|
||||
$@" <a href=""{url}"" class=""mention hashtag"" rel=""tag"">#<span>{tag}</span></a>");
|
||||
//messageContent = Regex.Replace(messageContent, m.ToString(),
|
||||
// $@" <a href=""{url}"" class=""mention hashtag"" rel=""tag"">#<span>{tag}</span></a>");
|
||||
|
||||
messageContent = Regex.Replace(messageContent, m.Groups[0].ToString(),
|
||||
$@"{m.Groups[1]}<a href=""{url}"" class=""mention hashtag"" rel=""tag"">#<span>{tag}</span></a>{m.Groups[3]}");
|
||||
}
|
||||
|
||||
// Extract Mentions
|
||||
|
|
|
@ -3,7 +3,9 @@ using System.Linq;
|
|||
using BirdsiteLive.Common.Settings;
|
||||
using BirdsiteLive.Domain.Tools;
|
||||
using BirdsiteLive.Twitter.Models;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.VisualStudio.TestTools.UnitTesting;
|
||||
using Moq;
|
||||
|
||||
namespace BirdsiteLive.Domain.Tests.Tools
|
||||
{
|
||||
|
@ -28,11 +30,16 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
#region Stubs
|
||||
var message = "Bla.\n\n@Mention blo. https://t.co/pgtrJi9600";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.IsTrue(result.content.Contains("Bla."));
|
||||
Assert.IsTrue(result.content.Contains("</p><p>"));
|
||||
#endregion
|
||||
|
@ -45,10 +52,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = "Bla.\n@Mention blo. https://t.co/pgtrJi9600";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.IsTrue(result.content.Contains("Bla."));
|
||||
Assert.IsTrue(result.content.Contains("<br/>"));
|
||||
#endregion
|
||||
|
@ -61,10 +73,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = $"Bla!{Environment.NewLine}https://t.co/L8BpyHgg25";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(0, result.tags.Length);
|
||||
|
||||
Assert.IsTrue(result.content.Contains("Bla!"));
|
||||
|
@ -79,10 +96,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = $"Bla!{Environment.NewLine}https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(0, result.tags.Length);
|
||||
|
||||
Assert.IsTrue(result.content.Contains("Bla!"));
|
||||
|
@ -97,10 +119,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = $"Bla!{Environment.NewLine}https://www.eff.org/deeplinks/2020/07/pact";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(0, result.tags.Length);
|
||||
|
||||
Assert.IsTrue(result.content.Contains("Bla!"));
|
||||
|
@ -115,10 +142,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = $"https://t.co/L8BpyHgg25 Bla!{Environment.NewLine}https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(0, result.tags.Length);
|
||||
|
||||
Assert.IsTrue(result.content.Contains("Bla!"));
|
||||
|
@ -132,13 +164,18 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
public void Extract_SingleHashTag_Test()
|
||||
{
|
||||
#region Stubs
|
||||
var message = $"Bla!{Environment.NewLine}#mytag";
|
||||
var message = $"Bla!{Environment.NewLine}#mytag";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(1, result.tags.Length);
|
||||
Assert.AreEqual("#mytag", result.tags.First().name);
|
||||
Assert.AreEqual("Hashtag", result.tags.First().type);
|
||||
|
@ -153,13 +190,18 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
public void Extract_SingleHashTag_AtStart_Test()
|
||||
{
|
||||
#region Stubs
|
||||
var message = $"#mytag Bla!";
|
||||
var message = "#mytag Bla!";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(1, result.tags.Length);
|
||||
Assert.AreEqual("#mytag", result.tags.First().name);
|
||||
Assert.AreEqual("Hashtag", result.tags.First().type);
|
||||
|
@ -174,20 +216,25 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
// Checks that a hashtag containing an underscore is extracted as a single tag
// and rendered as a hashtag link in the resulting content.
public void Extract_SingleHashTag_SpecialChar_Test()
{
    #region Stubs
    var message = $"Bla!{Environment.NewLine}#COVID_19";
    #endregion

    #region Mocks
    var logger = new Mock<ILogger<StatusExtractor>>();
    #endregion

    var service = new StatusExtractor(_settings, logger.Object);
    var result = service.Extract(message);

    #region Validations
    // No expectations are set up on the logger in this test.
    logger.VerifyAll();

    // Exactly one tag, carrying the full name including the underscore.
    Assert.AreEqual(1, result.tags.Length);
    Assert.AreEqual("#COVID_19", result.tags.First().name);
    Assert.AreEqual("Hashtag", result.tags.First().type);
    Assert.AreEqual("https://domain.name/tags/COVID_19", result.tags.First().href);

    // The surrounding text is preserved and the hashtag is replaced by its link markup.
    Assert.IsTrue(result.content.Contains("Bla!"));
    Assert.IsTrue(result.content.Contains(@"<a href=""https://domain.name/tags/COVID_19"" class=""mention hashtag"" rel=""tag"">#<span>COVID_19</span></a>"));
    #endregion
}
|
||||
|
||||
|
@ -195,13 +242,18 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
public void Extract_MultiHashTags_Test()
|
||||
{
|
||||
#region Stubs
|
||||
var message = $"Bla!{Environment.NewLine}#mytag #mytag2 #mytag3{Environment.NewLine}Test #bal Test";
|
||||
var message = $"Bla!{Environment.NewLine}#mytag #mytag2 #mytag3{Environment.NewLine}Test #bal Test";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(4, result.tags.Length);
|
||||
Assert.IsTrue(result.content.Contains("Bla!"));
|
||||
Assert.IsTrue(result.content.Contains(@"<a href=""https://domain.name/tags/mytag"" class=""mention hashtag"" rel=""tag"">#<span>mytag</span></a>"));
|
||||
|
@ -218,10 +270,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = $"Bla!{Environment.NewLine}@mynickname";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(1, result.tags.Length);
|
||||
Assert.AreEqual("@mynickname@domain.name", result.tags.First().name);
|
||||
Assert.AreEqual("Mention", result.tags.First().type);
|
||||
|
@ -239,10 +296,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = $"Bla!{Environment.NewLine}@my___nickname";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(1, result.tags.Length);
|
||||
Assert.AreEqual("@my___nickname@domain.name", result.tags.First().name);
|
||||
Assert.AreEqual("Mention", result.tags.First().type);
|
||||
|
@ -260,10 +322,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = $"@mynickname Bla!";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(1, result.tags.Length);
|
||||
Assert.AreEqual("@mynickname@domain.name", result.tags.First().name);
|
||||
Assert.AreEqual("Mention", result.tags.First().type);
|
||||
|
@ -281,10 +348,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = $"Bla!{Environment.NewLine}@mynickname @mynickname2 @mynickname3{Environment.NewLine}Test @dada Test";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(4, result.tags.Length);
|
||||
Assert.IsTrue(result.content.Contains("Bla!"));
|
||||
Assert.IsTrue(result.content.Contains(@"<span class=""h-card""><a href=""https://domain.name/@mynickname"" class=""u-url mention"">@<span>mynickname</span></a></span>"));
|
||||
|
@ -301,10 +373,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
var message = $"Bla!{Environment.NewLine}@mynickname #mytag2 @mynickname3{Environment.NewLine}Test @dada #dada Test";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(5, result.tags.Length);
|
||||
Assert.IsTrue(result.content.Contains("Bla!"));
|
||||
Assert.IsTrue(result.content.Contains(@"<span class=""h-card""><a href=""https://domain.name/@mynickname"" class=""u-url mention"">@<span>mynickname</span></a></span>"));
|
||||
|
@ -324,10 +401,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
//var message = $"tests@mynickname";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(1, result.tags.Length);
|
||||
Assert.IsTrue(result.content.Contains(
|
||||
@"😤 <span class=""h-card""><a href=""https://domain.name/@mynickname"" class=""u-url mention"">@<span>mynickname</span></a></span>"));
|
||||
|
@ -344,10 +426,15 @@ namespace BirdsiteLive.Domain.Tests.Tools
|
|||
//var message = $"tests@mynickname";
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings);
|
||||
#region Mocks
|
||||
var logger = new Mock<ILogger<StatusExtractor>>();
|
||||
#endregion
|
||||
|
||||
var service = new StatusExtractor(_settings, logger.Object);
|
||||
var result = service.Extract(message);
|
||||
|
||||
#region Validations
|
||||
logger.VerifyAll();
|
||||
Assert.AreEqual(1, result.tags.Length);
|
||||
Assert.IsTrue(result.content.Equals(@"bla ( <span class=""h-card""><a href=""https://domain.name/@mynickname"" class=""u-url mention"">@<span>mynickname</span></a></span> test)"));
|
||||
#endregion
|
||||
|
|
Reference in a new issue