From 0e1653ae5e0d88969809cb667fd0a189e2a67247 Mon Sep 17 00:00:00 2001
From: SimpleStation14
Date: Tue, 11 Jun 2024 15:24:01 -0400
Subject: [PATCH] Cherry-picked commit bf98a6a8bb2a57fb149459d6b053eaaf6abc8cd7
 from space-wizards/space-station-14/master

---
 .../Chat/V2/Moderation/ChatCensor.cs          |  80 ++++
 .../Chat/V2/Moderation/RegexCensor.cs         |  19 +
 .../Chat/V2/Moderation/SimpleCensor.cs        | 396 ++++++++++++++++++
 .../Shared/Chat/V2/Moderation/SimpleCensor.cs | 171 ++++++++
 4 files changed, 666 insertions(+)
 create mode 100644 Content.Shared/Chat/V2/Moderation/ChatCensor.cs
 create mode 100644 Content.Shared/Chat/V2/Moderation/RegexCensor.cs
 create mode 100644 Content.Shared/Chat/V2/Moderation/SimpleCensor.cs
 create mode 100644 Content.Tests/Shared/Chat/V2/Moderation/SimpleCensor.cs

diff --git a/Content.Shared/Chat/V2/Moderation/ChatCensor.cs b/Content.Shared/Chat/V2/Moderation/ChatCensor.cs
new file mode 100644
index 00000000000..b5d6aa03441
--- /dev/null
+++ b/Content.Shared/Chat/V2/Moderation/ChatCensor.cs
@@ -0,0 +1,80 @@
+using System.Linq;
+
+namespace Content.Shared.Chat.V2.Moderation;
+
+/// <summary>
+/// A censor that can be applied to a chat message.
+/// </summary>
+public interface IChatCensor
+{
+    /// <summary>
+    /// Censors the input string.
+    /// </summary>
+    /// <param name="input">The input string</param>
+    /// <param name="output">The output string</param>
+    /// <param name="replaceWith">The character to replace with</param>
+    /// <returns>True if the input was changed by the censor</returns>
+    public bool Censor(string input, out string output, char replaceWith = '*');
+}
+
+/// <summary>
+/// Applies a sequence of censors one after another, feeding each censor the output of the previous ones.
+/// </summary>
+public sealed class CompoundChatCensor(IEnumerable<IChatCensor> censors) : IChatCensor
+{
+    /// <inheritdoc />
+    public bool Censor(string input, out string output, char replaceWith = '*')
+    {
+        var censored = false;
+        output = input;
+
+        foreach (var censor in censors)
+        {
+            // Chain the censors: each one works on what the previous ones produced, so no censoring is lost.
+            if (censor.Censor(output, out var next, replaceWith))
+            {
+                output = next;
+                censored = true;
+            }
+        }
+
+        return censored;
+    }
+}
+
+/// <summary>
+/// Collects censors and builds a single censor that combines them.
+/// </summary>
+public sealed class ChatCensorFactory
+{
+    private List<IChatCensor> _censors = new();
+
+    /// <summary>
+    /// Adds a censor to be included in the next censor(s) built.
+    /// </summary>
+    public void With(IChatCensor censor)
+    {
+        _censors.Add(censor);
+    }
+
+    /// <summary>
+    /// Builds a ChatCensor that combines all the censors that have been added to this.
+    /// </summary>
+    public IChatCensor Build()
+    {
+        return new CompoundChatCensor(_censors.ToArray());
+    }
+
+    /// <summary>
+    /// Resets the build state to zero, allowing for different rules to be provided to the next censor(s) built.
+    /// </summary>
+    /// <returns>True if the builder had any setup prior to the reset.</returns>
+    public bool Reset()
+    {
+        var notEmpty = _censors.Count > 0;
+
+        _censors = new List<IChatCensor>();
+
+        return notEmpty;
+    }
+}
diff --git a/Content.Shared/Chat/V2/Moderation/RegexCensor.cs b/Content.Shared/Chat/V2/Moderation/RegexCensor.cs
new file mode 100644
index 00000000000..cd47bf0c33c
--- /dev/null
+++ b/Content.Shared/Chat/V2/Moderation/RegexCensor.cs
@@ -0,0 +1,19 @@
+using System.Text.RegularExpressions;
+
+namespace Content.Shared.Chat.V2.Moderation;
+
+/// <summary>
+/// A censor that replaces every match of a regular expression with a single replacement character.
+/// </summary>
+public sealed class RegexCensor(Regex censorInstruction) : IChatCensor
+{
+    private readonly Regex _censorInstruction = censorInstruction;
+
+    /// <inheritdoc />
+    public bool Censor(string input, out string output, char replaceWith = '*')
+    {
+        output = _censorInstruction.Replace(input, replaceWith.ToString());
+
+        return !string.Equals(input, output);
+    }
+}
diff --git a/Content.Shared/Chat/V2/Moderation/SimpleCensor.cs b/Content.Shared/Chat/V2/Moderation/SimpleCensor.cs
new file mode 100644
index 00000000000..a6bb70dd9f6
--- /dev/null
+++ b/Content.Shared/Chat/V2/Moderation/SimpleCensor.cs
@@ -0,0 +1,396 @@
+using System.Collections.Frozen;
+using System.Linq;
+using System.Text;
+using System.Text.Unicode;
+
+namespace Content.Shared.Chat.V2.Moderation;
+
+/// <summary>
+/// A basic censor. Not bullet-proof.
+/// </summary>
+public sealed class SimpleCensor : IChatCensor
+{
+    // Common substitution symbols are replaced with one of the characters they commonly substitute.
+    private bool _shouldSanitizeLeetspeak;
+    private FrozenDictionary<char, char> _leetspeakReplacements = FrozenDictionary<char, char>.Empty;
+
+    // Special characters are stripped out before matching and put back afterwards.
+    private bool _shouldSanitizeSpecialCharacters;
+    private HashSet<char> _specialCharacterReplacements = [];
+
+    // Censored words are removed unless they're a false positive (e.g. Scunthorpe)
+    private string[] _censoredWords = Array.Empty<string>();
+    private string[] _falsePositives = Array.Empty<string>();
+
+    // False negatives are censored words that contain a false positive.
+    private string[] _falseNegatives = Array.Empty<string>();
+
+    // What unicode ranges are allowed? If this array is empty, don't filter by range.
+    private UnicodeRange[] _allowedUnicodeRanges = Array.Empty<UnicodeRange>();
+
+    /// <summary>
+    /// Censors the input string.
+    /// </summary>
+    /// <param name="input">The input string</param>
+    /// <param name="output">The output string</param>
+    /// <param name="replaceWith">The character to replace with</param>
+    /// <returns>True if the input was changed by the censor</returns>
+    public bool Censor(string input, out string output, char replaceWith = '*')
+    {
+        output = Censor(input, replaceWith);
+
+        return !string.Equals(input, output);
+    }
+
+    /// <summary>
+    /// Censors the input string.
+    /// </summary>
+    /// <param name="input">The input string</param>
+    /// <param name="replaceWith">The character to replace with</param>
+    /// <returns>The censored string</returns>
+    public string Censor(string input, char replaceWith = '*')
+    {
+        // We flat-out ban anything not in the allowed unicode ranges, stripping them
+        input = SanitizeOutBlockedUnicode(input);
+
+        var originalInput = input.ToCharArray();
+
+        input = SanitizeInput(input);
+
+        var censored = input.ToList();
+
+        // Remove false negatives
+        input = CheckProfanity(input, censored, _falseNegatives, replaceWith);
+
+        // Get false positives
+        var falsePositives = FindFalsePositives(censored, replaceWith);
+
+        // Remove censored words
+        CheckProfanity(input, censored, _censoredWords, replaceWith);
+
+        // Reconstruct
+        // Reconstruct false positives
+        for (var i = 0; i < falsePositives.Length; i++)
+        {
+            if (falsePositives[i] != replaceWith)
+            {
+                censored[i] = falsePositives[i];
+            }
+        }
+
+        for (var i = 0; i < originalInput.Length; i++)
+        {
+            if (originalInput[i] == ' ')
+            {
+                censored.Insert(i, ' ');
+
+                continue;
+            }
+
+            if (_shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(originalInput[i]))
+            {
+                censored.Insert(i, originalInput[i]);
+
+                continue;
+            }
+
+            if (_shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters)
+            {
+                // detect "()"
+                if (originalInput[i] == '(' && i != originalInput.Length - 1 && originalInput[i+1] == ')')
+                {
+                    // "()" was sanitized to a single "o", so pad censored by one character to line both strings up again.
+                    censored.Insert(i+1, censored[i] != replaceWith ? ')' : replaceWith);
+                }
+            }
+
+            if (censored[i] != replaceWith)
+            {
+                censored[i] = originalInput[i];
+            }
+        }
+
+        // SO says this is fast...
+        return string.Concat(censored);
+    }
+
+    /// <summary>
+    /// Adds a l33tsp34k sanitization rule
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithSanitizeLeetSpeak()
+    {
+        _shouldSanitizeLeetspeak = true;
+
+        return BuildCharacterReplacements();
+    }
+
+    /// <summary>
+    /// Adds a special character sanitization rule
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithSanitizeSpecialCharacters()
+    {
+        _shouldSanitizeSpecialCharacters = true;
+
+        return BuildCharacterReplacements();
+    }
+
+    /// <summary>
+    /// Sets the allowed unicode ranges. Anything outside of them is stripped; an empty array disables the filter.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithRanges(UnicodeRange[] ranges)
+    {
+        _allowedUnicodeRanges = ranges;
+
+        return this;
+    }
+
+    /// <summary>
+    /// Sets the words to censor
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithCustomDictionary(string[] naughtyWords)
+    {
+        _censoredWords = naughtyWords;
+
+        return this;
+    }
+
+    /// <summary>
+    /// Sets the words to leave alone even though they contain a censored word (e.g. Scunthorpe)
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithFalsePositives(string[] falsePositives)
+    {
+        _falsePositives = falsePositives;
+
+        return this;
+    }
+
+    /// <summary>
+    /// Sets the words to censor even though they contain a false positive
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithFalseNegatives(string[] falseNegatives)
+    {
+        _falseNegatives = falseNegatives;
+
+        return this;
+    }
+
+    /// <summary>
+    /// Sets the l33tsp34k symbols and the characters they stand for. Calling a WithSanitize* method afterwards may reset them to the defaults.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithLeetspeakReplacements(Dictionary<char, char> replacements)
+    {
+        _leetspeakReplacements = replacements.ToFrozenDictionary();
+
+        return this;
+    }
+
+    /// <summary>
+    /// Sets the special characters to strip out, taken from the keys of the given dictionary. Calling a WithSanitize* method afterwards may reset them to the defaults.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithSpecialCharacterReplacements(Dictionary<char, char> replacements)
+    {
+        _specialCharacterReplacements = replacements.Keys.ToHashSet();
+
+        return this;
+    }
+
+    /// <summary>
+    /// Overwrites every occurrence of the given words in censored with the replacement character.
+    /// </summary>
+    /// <returns>The input, unchanged</returns>
+    private string CheckProfanity(string input, List<char> censored, string[] words, char replaceWith = '*')
+    {
+        foreach (var word in words)
+        {
+            // An empty word matches at every index without advancing the search, which would loop forever.
+            if (string.IsNullOrEmpty(word))
+            {
+                continue;
+            }
+
+            var wordLength = word.Length;
+            var endOfFoundWord = 0;
+            var foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+
+            while(foundIndex > -1)
+            {
+                endOfFoundWord = foundIndex + wordLength;
+
+                for (var i = 0; i < wordLength; i++)
+                {
+                    censored[foundIndex+i] = replaceWith;
+                }
+
+                foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+            }
+        }
+
+        return input;
+    }
+
+    /// <summary>
+    /// Finds every occurrence of a false positive in the given characters.
+    /// </summary>
+    /// <returns>An array holding the characters of the false positives, and the replacement character everywhere else</returns>
+    private char[] FindFalsePositives(List<char> chars, char replaceWith = '*')
+    {
+        var input = string.Concat(chars);
+
+        var output = Enumerable.Repeat(replaceWith, input.Length).ToArray();
+        var inputAsARr = input.ToArray();
+
+        foreach (var word in _falsePositives)
+        {
+            // An empty word matches at every index without advancing the search, which would loop forever.
+            if (string.IsNullOrEmpty(word))
+            {
+                continue;
+            }
+
+            var wordLength = word.Length;
+            var endOfFoundWord = 0;
+            var foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+
+            while(foundIndex > -1)
+            {
+                endOfFoundWord = foundIndex + wordLength;
+
+                for (var i = foundIndex; i < endOfFoundWord; i++)
+                {
+                    output[i] = inputAsARr[i];
+                }
+
+                foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+            }
+        }
+
+        return output;
+    }
+
+    /// <summary>
+    /// Strips spaces and special characters and undoes l33tsp34k substitutions so that words can be matched.
+    /// </summary>
+    private string SanitizeInput(string input)
+    {
+        // "()" is a broad enough trick to beat censors that we should check for it broadly.
+        if (_shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters)
+        {
+            input = input.Replace("()", "o");
+        }
+
+        var sb = new StringBuilder();
+
+        // ReSharper disable once ForeachCanBePartlyConvertedToQueryUsingAnotherGetEnumerator
+        foreach (var character in input)
+        {
+            if (character == ' ' || _shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(character))
+            {
+                continue;
+            }
+
+            if (_shouldSanitizeLeetspeak && _leetspeakReplacements.TryGetValue(character, out var leetRepl))
+            {
+                sb.Append(leetRepl);
+
+                continue;
+            }
+
+            sb.Append(character);
+        }
+
+        return sb.ToString();
+    }
+
+    /// <summary>
+    /// Returns a string with all characters outside of the allowed unicode ranges stripped out
+    /// </summary>
+    private string SanitizeOutBlockedUnicode(string input)
+    {
+        if (_allowedUnicodeRanges.Length <= 0)
+        {
+            return input;
+        }
+
+        var sb = new StringBuilder();
+
+        foreach (var symbol in input.EnumerateRunes())
+        {
+            // ReSharper disable once LoopCanBeConvertedToQuery
+            foreach (var range in _allowedUnicodeRanges)
+            {
+                if (symbol.Value < range.FirstCodePoint || symbol.Value >= range.FirstCodePoint + range.Length)
+                    continue;
+
+                sb.Append(symbol);
+
+                break;
+            }
+        }
+
+        return sb.ToString();
+    }
+
+    /// <summary>
+    /// Installs the default replacement tables for whichever sanitization rules are enabled.
+    /// </summary>
+    private SimpleCensor BuildCharacterReplacements()
+    {
+        if (_shouldSanitizeSpecialCharacters)
+        {
+            _specialCharacterReplacements =
+            [
+                '-',
+                '_',
+                '|',
+                '.',
+                ',',
+                '(',
+                ')',
+                '<',
+                '>',
+                '"',
+                '`',
+                '~',
+                '*',
+                '&',
+                '%',
+                '$',
+                '#',
+                '@',
+                '!',
+                '?',
+                '+'
+            ];
+        }
+
+        if (_shouldSanitizeLeetspeak)
+        {
+            _leetspeakReplacements = new Dictionary<char, char>
+            {
+                ['4'] = 'a',
+                ['$'] = 's',
+                ['!'] = 'i',
+                ['+'] = 't',
+                ['#'] = 'h',
+                ['@'] = 'a',
+                ['0'] = 'o',
+                ['1'] = 'i', // also obviously can be l; gamer-words need i's more though.
+                ['7'] = 'l',
+                ['3'] = 'e',
+                ['5'] = 's',
+                ['9'] = 'g',
+                ['<'] = 'c'
+            }.ToFrozenDictionary();
+        }
+
+        return this;
+    }
+}
diff --git a/Content.Tests/Shared/Chat/V2/Moderation/SimpleCensor.cs b/Content.Tests/Shared/Chat/V2/Moderation/SimpleCensor.cs
new file mode 100644
index 00000000000..09870af317c
--- /dev/null
+++ b/Content.Tests/Shared/Chat/V2/Moderation/SimpleCensor.cs
@@ -0,0 +1,171 @@
+using System.Text.Unicode;
+using Content.Shared.Chat.V2.Moderation;
+using NUnit.Framework;
+
+namespace Content.Tests.Shared.Chat.V2.Moderation;
+
+public sealed class SimpleCensorTests
+{
+    [Test]
+    public void CanCensorASingleWord()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus"]);
+        var output = sut.Censor("hello amogus");
+
+        Assert.That(output, Is.EqualTo("hello ******"));
+    }
+
+    // Basics - use custom dictionary
+
+    [Test]
+    public void CanCensorMultipleWordInstances()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus"]);
+        var output = sut.Censor("amogus hello amogus");
+
+        Assert.That(output, Is.EqualTo("****** hello ******"));
+    }
+
+    [Test]
+    public void CanCensorMultipleWords()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+        var output = sut.Censor("amogus hello sus");
+
+        Assert.That(output, Is.EqualTo("****** hello ***"));
+    }
+
+    [Test]
+    public void CanUseDifferentCensorSymbols()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+        var output = sut.Censor("amogus hello sus", '#');
+
+        Assert.That(output, Is.EqualTo("###### hello ###"));
+    }
+
+    [Test]
+    public void CanCatchCapitalizedWords()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+        var output = sut.Censor("AMOGUS hello SUS");
+
+        Assert.That(output, Is.EqualTo("****** hello ***"));
+    }
+
+    [Test]
+    public void CanCatchWordsWithSomeCapitalsInThem()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+        var output = sut.Censor("AmoGuS hello SuS");
+
+        Assert.That(output, Is.EqualTo("****** hello ***"));
+    }
+
+    [Test]
+    public void CanCatchWordsHiddenInsideOtherWords()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+        var output = sut.Censor("helamoguslo suspicious");
+
+        Assert.That(output, Is.EqualTo("hel******lo ***picious"));
+    }
+
+    [Test]
+    public void IgnoresEmptyWordsInTheDictionary()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["", "sus"]);
+        var output = sut.Censor("hello sus");
+
+        Assert.That(output, Is.EqualTo("hello ***"));
+    }
+
+    // Sanitizing Leetspeak
+
+    [Test]
+    public void CanSanitizeLeetspeak()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeLeetSpeak();
+        var output = sut.Censor("am0gu5 hello 5u5");
+
+        Assert.That(output, Is.EqualTo("****** hello ***"));
+    }
+
+    [Test]
+    public void SanitizingLeetspeakOnlyOccursWhenTheWordIsBlocked()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeLeetSpeak();
+        var output = sut.Censor("he110");
+
+        Assert.That(output, Is.EqualTo("he110"));
+    }
+
+    [Test]
+    public void CanCatchLeetspeakReplacementsWithMoreThanOneLetter()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeLeetSpeak();
+        var output = sut.Censor("am()gu5 hello 5u5");
+
+        Assert.That(output, Is.EqualTo("******* hello ***"));
+    }
+
+    // Sanitizing special characters
+
+    [Test]
+    public void DoesNotSanitizeOutUncensoredSpecialCharacters()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeSpecialCharacters();
+        var output = sut.Censor("amogus!hello!sus");
+
+        Assert.That(output, Is.EqualTo("******!hello!***"));
+    }
+
+    [Test]
+    public void DoesSanitizeOutCensoredSpecialCharacters()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeSpecialCharacters();
+        var output = sut.Censor("amo!gus hello s?us");
+
+        Assert.That(output, Is.EqualTo("***!*** hello *?**"));
+    }
+
+    // Unicode ranges
+
+    [Test]
+    public void SanitizesOutNonLatinCharacters()
+    {
+        var sut = new SimpleCensor().WithRanges([UnicodeRanges.BasicLatin, UnicodeRanges.Latin1Supplement]);
+        var output = sut.Censor("amogus Україна sus 日本");
+
+        Assert.That(output, Is.EqualTo("amogus  sus "));
+    }
+
+    [Test]
+    public void SanitizesOutNonLatinOrCyrillicCharacters()
+    {
+        var sut = new SimpleCensor().WithRanges([UnicodeRanges.BasicLatin, UnicodeRanges.Latin1Supplement, UnicodeRanges.Cyrillic]);
+        var output = sut.Censor("amogus Україна sus 日本");
+
+        Assert.That(output, Is.EqualTo("amogus Україна sus "));
+    }
+
+    // False positives
+    [Test]
+    public void CanHandleFalsePositives()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithFalsePositives(["amogusus"]);
+        var output = sut.Censor("amogusus hello amogus hello sus");
+
+        Assert.That(output, Is.EqualTo("amogusus hello ****** hello ***"));
+    }
+
+    // False negatives
+    [Test]
+    public void CanHandleFalseNegatives()
+    {
+        var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithFalsePositives(["amogusus"]).WithFalseNegatives(["susamogusus"]);
+        var output = sut.Censor("susamogusus hello amogus hello sus amogusus");
+
+        Assert.That(output, Is.EqualTo("*********** hello ****** hello *** ********"));
+    }
+}