using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using BirdsiteLive.ActivityPub.Models;
using BirdsiteLive.Common.Settings;

namespace BirdsiteLive.Domain.Tools
{
    /// <summary>
    /// Turns the raw text of a status into processed content plus the list of
    /// hashtag and mention tags found in it.
    /// </summary>
    public interface IStatusExtractor
    {
        /// <summary>
        /// Processes <paramref name="messageContent"/> and collects its hashtags and mentions.
        /// </summary>
        /// <param name="messageContent">Raw status text.</param>
        /// <returns>The processed, trimmed content and one <see cref="Tag"/> per hashtag/mention match.</returns>
        (string content, Tag[] tags) ExtractTags(string messageContent);
    }

    /// <summary>
    /// Regex based implementation of <see cref="IStatusExtractor"/>. Tag links are built
    /// from the domain configured in <see cref="InstanceSettings"/>.
    /// </summary>
    public class StatusExtractor : IStatusExtractor
    {
        // Number of characters of the protocol-less URL that go into its first part;
        // anything beyond that goes into the second part.
        private const int UrlFirstPartLength = 30;

        // A hashtag must be preceded by a non-word character; capture group 1 holds
        // "#name" without that character. The (?!;) look-ahead rejects a match that is
        // immediately followed by ';'.
        private readonly Regex _hashtagRegex = new Regex(@"\W(\#[a-zA-Z0-9_ー]+\b)(?!;)");
        // Alternative hashtag patterns kept for reference:
        //private readonly Regex _hastagRegex = new Regex(@"#\w+");
        //private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+\w*)\b(?!;)");
        //private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+)\b(?!;)");

        // Same shape as the hashtag pattern, for "@name"; capture group 1 holds "@name".
        private readonly Regex _mentionRegex = new Regex(@"\W(\@[a-zA-Z0-9_ー]+\b)(?!;)");
        // Alternative mention patterns kept for reference:
        //private readonly Regex _mentionRegex = new Regex(@"@\w+");
        //private readonly Regex _mentionRegex = new Regex(@"(?<=[\s>]|^)@(\w*[a-zA-Z0-9_ー]+\w*)\b(?!;)");
        //private readonly Regex _mentionRegex = new Regex(@"(?<=[\s>]|^)@(\w*[a-zA-Z0-9_ー]+)\b(?!;)");

        // Matches http, https and ftp URLs.
        private readonly Regex _urlRegex = new Regex(@"((http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)");

        private readonly InstanceSettings _instanceSettings;

        #region Ctor
        public StatusExtractor(InstanceSettings instanceSettings)
        {
            _instanceSettings = instanceSettings;
        }
        #endregion

        /// <summary>
        /// Normalizes line breaks, rewrites URLs, hashtags and mentions in the text and
        /// returns the resulting content together with the collected tags.
        /// </summary>
        /// <param name="messageContent">Raw status text.</param>
        /// <returns>
        /// <c>content</c>: the processed text, trimmed.
        /// <c>tags</c>: one entry per hashtag match (type "Hashtag", href
        /// <c>https://{Domain}/tags/{tag}</c>) followed by one entry per mention match
        /// (type "Mention", name <c>@{mention}@{Domain}</c>, href
        /// <c>https://{Domain}/users/{mention}</c>).
        /// </returns>
        public (string content, Tag[] tags) ExtractTags(string messageContent)
        {
            var tags = new List<Tag>();

            // The hashtag/mention patterns require a preceding non-word character, so pad
            // the text to let a tag at the very start match. The padding is trimmed on return.
            messageContent = $" {messageContent} ";

            // Replace return lines
            // NOTE(review): in the copy of this file under review, the replacement literals
            // here (and the literals in the clean-up step at the end of this method) contain
            // only line breaks. It looks like markup may have been lost from them; confirm
            // the intended values against version control.
            messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "\n\n");
            messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "\n");

            // Extract Urls
            foreach (Match m in _urlRegex.Matches(messageContent))
            {
                var url = m.Value.Replace("\n", string.Empty).Trim();

                var protocol = "https://";
                if (url.StartsWith("http://", StringComparison.Ordinal))
                {
                    protocol = "http://";
                }
                else if (url.StartsWith("ftp://", StringComparison.Ordinal))
                {
                    protocol = "ftp://";
                }

                // Strip only the leading protocol / "www." so that a URL embedded later in
                // the path or query string is left untouched.
                var truncatedUrl = url.StartsWith(protocol, StringComparison.Ordinal)
                    ? url.Substring(protocol.Length)
                    : url;

                if (truncatedUrl.StartsWith("www.", StringComparison.Ordinal))
                {
                    protocol += "www.";
                    truncatedUrl = truncatedUrl.Substring("www.".Length);
                }

                var firstPart = truncatedUrl;
                var secondPart = string.Empty;

                if (truncatedUrl.Length > UrlFirstPartLength)
                {
                    firstPart = truncatedUrl.Substring(0, UrlFirstPartLength);
                    secondPart = truncatedUrl.Substring(UrlFirstPartLength);
                }

                // Literal replacement: the matched text is not a valid/faithful regex pattern
                // ('?', '+', '.' are common in URLs), so it must not be passed to Regex.Replace.
                messageContent = messageContent.Replace(m.Value, $@" {protocol}{firstPart}{secondPart}");
            }

            // Extract Hashtags
            foreach (Match m in OrderByLength(_hashtagRegex.Matches(messageContent)))
            {
                // Group 1 is "#name"; using it keeps the preceding non-word character
                // (e.g. '(' or '.') out of the tag name.
                var tag = m.Groups[1].Value.Substring(1);
                var url = $"https://{_instanceSettings.Domain}/tags/{tag}";

                tags.Add(new Tag
                {
                    name = $"#{tag}",
                    href = url,
                    type = "Hashtag"
                });

                // Literal replacement: the match may start with a regex metacharacter such
                // as '(' which would make Regex.Replace throw on an invalid pattern.
                messageContent = messageContent.Replace(m.Value, $@" #{tag}");
            }

            // Extract Mentions
            foreach (Match m in OrderByLength(_mentionRegex.Matches(messageContent)))
            {
                // Group 1 is "@name" without the preceding non-word character.
                var mention = m.Groups[1].Value.Substring(1);
                var url = $"https://{_instanceSettings.Domain}/users/{mention}";
                var name = $"@{mention}@{_instanceSettings.Domain}";

                tags.Add(new Tag
                {
                    name = name,
                    href = url,
                    type = "Mention"
                });

                messageContent = messageContent.Replace(m.Value, $@" @{mention}");
            }

            // Clean up return lines
            // NOTE(review): as written these two replacements map a line-break sequence to
            // itself and therefore change nothing; see the note on the literals above.
            messageContent = Regex.Replace(messageContent, "\n\n", "\n\n");
            messageContent = Regex.Replace(messageContent, "\n", "\n");

            return (messageContent.Trim(), tags.ToArray());
        }

        /// <summary>
        /// Returns the matches sorted from longest to shortest.
        /// </summary>
        /// <remarks>
        /// Replacements in <see cref="ExtractTags"/> affect every occurrence of the matched
        /// text, so a shorter match can also occur inside a longer one; presumably the
        /// longest-first order is meant to handle the longer tags first.
        /// </remarks>
        private IEnumerable<Match> OrderByLength(MatchCollection matches)
        {
            return matches
                .Cast<Match>()
                .OrderByDescending(x => x.Length)
                .ToList();
        }
    }
}