Skip to content

Commit

Permalink
Remove URLs & standard unicode emojis in message before translation request
Browse files Browse the repository at this point in the history
  • Loading branch information
austins committed Feb 3, 2024
1 parent 7bf8a0b commit 607f52e
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ namespace DiscordTranslationBot.Tests.Handlers;
public sealed class FlagEmojiReactionHandlerTests
{
private const string Content = """
👍 test<:disdainsam:630009232128868353> _test_*test*
> test
__test__
""";
👍 test<:disdainsam:630009232128868353> _test_*test*
> test
__test__
""";

private const string ExpectedSanitizedMessage = """
👍 test testtest
test
test
""";
test testtest
test
test
""";

private const ulong BotUserId = 1UL;
private const ulong MessageUserId = 2UL;
Expand Down
17 changes: 11 additions & 6 deletions DiscordTranslationBot.Tests/Utilities/FormatUtilityTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,31 @@ public sealed class FormatUtilityTests
{
[Theory]
[InlineData(" textThatShouldBeTrimmed ", "textThatShouldBeTrimmed")]
[InlineData("text with unicode 👻 emoji 🤔", "text with unicode emoji")]
[InlineData("<@000000000000000000> test", "test")]
[InlineData("test <@!000000000000000000>", "test")]
[InlineData("<:emote:123000000000000000>", "")]
[InlineData("<:emote1:000000000000000000>", "")]
[InlineData("<:1234:000000000000000123>", "")]
[InlineData("test <a:test_emote:100000000000000123>", "test")]
[InlineData("test <> testing", "test < testing")]
[InlineData("<a:1A1A1A1A1A1A1A1A1A1A1A1A1A1A1A1A:100000000000000123>", "")]
[InlineData("text with links http://example.com https://example.com test", "text with links test")]
[InlineData(
"""
_markdown_ *markdown* `markdown` <a:1A1A1A1A1A1A1A1A1A1A1A1A1A1A1A1A:100000000000000123>
```json
{ "test": "test" }
```
```
<p>test</p>
<p>test</p>
```
```test```
[link](http://example.com)
""",
"""
markdown markdown markdown
json
{ "test": "test" }
""")]
"markdown markdown markdown")]
public void SanitizeText_Returns_AsExpected(string text, string expected)
{
// Act
Expand Down
21 changes: 11 additions & 10 deletions DiscordTranslationBot/DiscordTranslationBot.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="AsyncAwaitBestPractices" Version="7.0.0" />
<PackageReference Include="Discord.Net" Version="3.13.0" />
<PackageReference Include="FluentValidation.DependencyInjectionExtensions" Version="11.9.0" />
<PackageReference Include="Humanizer.Core" Version="2.14.1" />
<PackageReference Include="MediatR" Version="12.2.0" />
<PackageReference Include="Microsoft.Extensions.Hosting" Version="8.0.0" />
<PackageReference Include="Microsoft.Extensions.Http.Polly" Version="8.0.1" />
<PackageReference Include="Polly.Contrib.WaitAndRetry" Version="1.1.1" />
<PackageReference Include="Refit.HttpClientFactory" Version="7.0.0" />
<PackageReference Include="Unicode.net" Version="2.0.0" />
<PackageReference Include="AsyncAwaitBestPractices" Version="7.0.0"/>
<PackageReference Include="Discord.Net" Version="3.13.0"/>
<PackageReference Include="FluentValidation.DependencyInjectionExtensions" Version="11.9.0"/>
<PackageReference Include="Humanizer.Core" Version="2.14.1"/>
<PackageReference Include="Markdig.Signed" Version="0.34.0"/>
<PackageReference Include="MediatR" Version="12.2.0"/>
<PackageReference Include="Microsoft.Extensions.Hosting" Version="8.0.0"/>
<PackageReference Include="Microsoft.Extensions.Http.Polly" Version="8.0.1"/>
<PackageReference Include="Polly.Contrib.WaitAndRetry" Version="1.1.1"/>
<PackageReference Include="Refit.HttpClientFactory" Version="7.0.0"/>
<PackageReference Include="Unicode.net" Version="2.0.0"/>
</ItemGroup>
</Project>
1 change: 0 additions & 1 deletion DiscordTranslationBot/Handlers/FlagEmojiReactionHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ await notification.Message.RemoveReactionAsync(
}

var sanitizedMessage = FormatUtility.SanitizeText(notification.Message.Content);

if (string.IsNullOrWhiteSpace(sanitizedMessage))
{
_log.EmptySourceMessage();
Expand Down
3 changes: 0 additions & 3 deletions DiscordTranslationBot/Program.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
#pragma warning disable CA1852 // Seal internal types
#pragma warning disable SA1210 // Using directives should be ordered alphabetically by namespace
global using MediatR;
#pragma warning restore SA1210 // Using directives should be ordered alphabetically by namespace
using Discord;
using Discord.WebSocket;
using DiscordTranslationBot;
Expand Down
1 change: 1 addition & 0 deletions DiscordTranslationBot/Usings.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
global using MediatR;
49 changes: 45 additions & 4 deletions DiscordTranslationBot/Utilities/FormatUtility.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
using System.Text;
using System.Text.RegularExpressions;
using Discord;
using Markdig;
using NeoSmart.Unicode;
using Emoji = NeoSmart.Unicode.Emoji;

namespace DiscordTranslationBot.Utilities;

Expand All @@ -9,16 +12,54 @@ namespace DiscordTranslationBot.Utilities;
public static partial class FormatUtility
{
    /// <summary>
    /// Remove special Discord syntax, emojis, and Markdown to only translate what is necessary
    /// and reduce translation providers' character quota usage.
    /// </summary>
    /// <param name="text">Text to sanitize.</param>
    /// <returns>Sanitized text with leading and trailing whitespace trimmed.</returns>
    public static string SanitizeText(string text)
    {
        // Remove all user/role mentions, channel mentions, and custom Discord emotes.
        var result = DiscordSyntaxRegex().Replace(text, string.Empty);

        // Remove fenced code blocks together with their content.
        result = MarkdownFencedCodeBlockRegex().Replace(result, string.Empty);

        // Remove Markdown links before the plain-text conversion so that the link text
        // is dropped instead of being kept as plain text.
        result = MarkdownLinkRegex().Replace(result, string.Empty);

        // Strip the remaining Markdown formatting.
        result = Markdown.ToPlainText(result);

        // Remove bare URLs starting with "http://" or "https://".
        result = UrlRegex().Replace(result, string.Empty);

        // Remove all unicode emoji by keeping only the letters that are not emoji.
        result = string.Concat(result.Letters().Where(letter => !Emoji.IsEmoji(letter)));

        // Trim and return sanitized text.
        return result.Trim();
    }

    /// <summary>
    /// Regex for Discord-specific syntax wrapped in angle brackets: user/role mentions
    /// (<c>&lt;@id&gt;</c>, <c>&lt;@!id&gt;</c>, <c>&lt;@&amp;id&gt;</c>), channel mentions
    /// (<c>&lt;#id&gt;</c>), and custom emotes (<c>&lt;:name:id&gt;</c>, <c>&lt;a:name:id&gt;</c>).
    /// </summary>
    /// <returns>Regex.</returns>
    [GeneratedRegex(@"<((@!?&?\d+|#\d+)|(a?:.+?:\d+))>")]
    private static partial Regex DiscordSyntaxRegex();

    /// <summary>
    /// Regex for Markdown fenced code blocks: an opening triple backtick, any content
    /// (including line breaks, matched lazily), and the next closing triple backtick.
    /// </summary>
    /// <returns>Regex.</returns>
    [GeneratedRegex(@"\`\`\`(?:.|[\r\n])*?\`\`\`")]
    private static partial Regex MarkdownFencedCodeBlockRegex();

    /// <summary>
    /// Regex for inline Markdown links in the form <c>[text](target)</c>.
    /// </summary>
    /// <returns>Regex.</returns>
    [GeneratedRegex(@"\[([^\]]+)\]\(([^)]+)\)")]
    private static partial Regex MarkdownLinkRegex();

    /// <summary>
    /// Regex for bare URLs that start with <c>http://</c> or <c>https://</c>.
    /// </summary>
    /// <returns>Regex.</returns>
    [GeneratedRegex(@"\b(?:https?://)\S+\b")]
    private static partial Regex UrlRegex();
}

0 comments on commit 607f52e

Please sign in to comment.