Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Temporary thread safety fix for ICUTokenizer/Locking patches for AttributeSource #328

Merged
merged 9 commits into from
Aug 24, 2020
Merged
54 changes: 40 additions & 14 deletions src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,27 @@ namespace Lucene.Net.Analysis.Th
/// </summary>
public class ThaiTokenizer : SegmentingTokenizerBase
{
private static readonly object syncLock = new object(); // LUCENENET specific - workaround until BreakIterator is made thread safe (LUCENENET TODO: TO REVERT)

// LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
private static readonly BreakIterator proto = (BreakIterator)BreakIterator.GetWordInstance(new CultureInfo("th")).Clone();
private static readonly BreakIterator proto = LoadProto();

/// <summary>
/// used for breaking the text into sentences
/// </summary>
private static readonly BreakIterator sentenceProto = (BreakIterator)BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture).Clone();
private static readonly BreakIterator sentenceProto = LoadSentenceProto();

private static BreakIterator LoadProto()
{
lock (syncLock)
return BreakIterator.GetWordInstance(new CultureInfo("th"));
}

private static BreakIterator LoadSentenceProto()
{
lock (syncLock)
return BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
}

private readonly ThaiWordBreaker wordBreaker;
private readonly CharArrayIterator wrapper = Analysis.Util.CharArrayIterator.NewWordInstance();
Expand All @@ -58,8 +72,6 @@ public class ThaiTokenizer : SegmentingTokenizerBase
private readonly ICharTermAttribute termAtt;
private readonly IOffsetAttribute offsetAtt;

private readonly object syncLock = new object();

/// <summary>
/// Creates a new <see cref="ThaiTokenizer"/> </summary>
public ThaiTokenizer(TextReader reader)
Expand All @@ -70,20 +82,36 @@ public ThaiTokenizer(TextReader reader)
/// <summary>
/// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> </summary>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
: base(factory, reader, (BreakIterator)sentenceProto.Clone())
: base(factory, reader, CreateSentenceClone())
{
// LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
lock (syncLock)
wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}

private static BreakIterator CreateSentenceClone()
{
lock (syncLock)
return (BreakIterator)sentenceProto.Clone();
}

public override void Reset()
{
lock (syncLock)
base.Reset();
}

public override State CaptureState()
{
lock (syncLock)
return base.CaptureState();
}

protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
{
// LUCENENET TODO: This class isn't passing thread safety checks.
// Adding locking and extra cloning of BreakIterator seems to help, but
// it is not a complete fix.
lock (syncLock)
{
this.sentenceStart = sentenceStart;
Expand All @@ -95,19 +123,17 @@ protected override void SetNextSentence(int sentenceStart, int sentenceEnd)

protected override bool IncrementWord()
{
// LUCENENET TODO: This class isn't passing thread safety checks.
// Adding locking and extra cloning of BreakIterator seems to help, but
// it is not a complete fix.
int start, end;
lock (syncLock)
{
int start = wordBreaker.Current;
start = wordBreaker.Current;
if (start == BreakIterator.Done)
{
return false; // BreakIterator exhausted
}

// find the next set of boundaries, skipping over non-tokens
int end = wordBreaker.Next();
end = wordBreaker.Next();
while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
{
start = end;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// Lucene version compatibility level 7.1.0
using ICU4N;
using ICU4N.Support.Text;
// Lucene version compatibility level 8.6.1
using ICU4N.Text;
using J2N;

namespace Lucene.Net.Analysis.Icu.Segmentation
{
Expand All @@ -23,146 +22,88 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
*/

/// <summary>
/// Contain all the issues surrounding BreakIterators in ICU in one place.
/// Basically this boils down to the fact that they aren't very friendly to any
/// sort of OO design.
/// <para/>
/// http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
/// BreakIterator from <see cref="RuleBasedBreakIterator"/>
/// <para/>
/// DictionaryBasedBreakIterator is a subclass of <see cref="RuleBasedBreakIterator"/>, but
/// doesn't actually behave as a subclass: it always returns 0 for
/// getRuleStatus():
/// http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
/// tags
/// Wraps <see cref="RuleBasedBreakIterator"/>, making object reuse convenient and
/// emitting a rule status for emoji sequences.
/// <para/>
/// @lucene.experimental
/// </summary>
internal abstract class BreakIteratorWrapper
internal sealed class BreakIteratorWrapper
{
protected readonly CharArrayIterator m_textIterator = new CharArrayIterator();
protected char[] m_text;
protected int m_start;
protected int m_length;

public abstract int Next();
public abstract int Current { get; }
public abstract int RuleStatus { get; }
public abstract void SetText(CharacterIterator text);
private readonly CharArrayIterator textIterator = new CharArrayIterator();
private readonly RuleBasedBreakIterator rbbi;
private char[] text;
private int start;
private int status;

public void SetText(char[] text, int start, int length)
internal BreakIteratorWrapper(RuleBasedBreakIterator rbbi)
{
this.m_text = text;
this.m_start = start;
this.m_length = length;
m_textIterator.SetText(text, start, length);
SetText(m_textIterator);
this.rbbi = rbbi;
}

/// <summary>
/// If its a <see cref="RuleBasedBreakIterator"/>, the rule status can be used for token type. If it's
/// any other <see cref="BreakIterator"/>, the rulestatus method is not available, so treat
/// it like a generic <see cref="BreakIterator"/>.
/// </summary>
/// <param name="breakIterator"></param>
/// <returns></returns>
public static BreakIteratorWrapper Wrap(BreakIterator breakIterator)
public int Current => rbbi.Current;

public int RuleStatus => status;

public int Next()
{
if (breakIterator is RuleBasedBreakIterator)
return new RBBIWrapper((RuleBasedBreakIterator)breakIterator);
else
return new BIWrapper(breakIterator);
int current = rbbi.Current;
int next = rbbi.Next();
status = CalcStatus(current, next);
return next;
}

/// <summary>
/// <see cref="RuleBasedBreakIterator"/> wrapper: <see cref="RuleBasedBreakIterator"/> (as long as it's not
/// a DictionaryBasedBreakIterator) behaves correctly.
/// </summary>
private sealed class RBBIWrapper : BreakIteratorWrapper
/// <summary>Returns current rule status for the text between breaks. (determines token type)</summary>
private int CalcStatus(int current, int next)
{
private readonly RuleBasedBreakIterator rbbi;

internal RBBIWrapper(RuleBasedBreakIterator rbbi)
{
this.rbbi = rbbi;
}

public override int Current => rbbi.Current;

public override int RuleStatus => rbbi.RuleStatus;

public override int Next()
// to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
// https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
if (next != BreakIterator.Done && IsEmoji(current, next))
{
return rbbi.Next();
return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
}

public override void SetText(CharacterIterator text)
else
{
rbbi.SetText(text);
return rbbi.RuleStatus;
}
}

/// <summary>
/// Generic <see cref="BreakIterator"/> wrapper: Either the rulestatus method is not
/// available or always returns 0. Calculate a rulestatus here so it behaves
/// like <see cref="RuleBasedBreakIterator"/>.
/// </summary>
/// <remarks>
/// Note: This is slower than <see cref="RuleBasedBreakIterator"/>.
/// </remarks>
private sealed class BIWrapper : BreakIteratorWrapper
{
private readonly BreakIterator bi;
private int status;

internal BIWrapper(BreakIterator bi)
{
this.bi = bi;
}

public override int Current => bi.Current;

public override int RuleStatus => status;
// See unicode doc L2/16-315 for rationale.
// basically for us the ambiguous cases (keycap/etc) as far as types go.
internal static readonly UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").Freeze();
// faster than doing hasBinaryProperty() checks, at the cost of 1KB ram
//internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").Freeze(); // LUCENENET: Extended_Pictographic wasn't added until ICU 62
internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:]]").Freeze();

public override int Next()
{
int current = bi.Current;
int next = bi.Next();
status = CalcStatus(current, next);
return next;
}

private int CalcStatus(int current, int next)
/// <summary>Returns <c>true</c> if the current text represents emoji character or sequence.</summary>
private bool IsEmoji(int current, int next)
{
int begin = start + current;
int end = start + next;
int codepoint = UTF16.CharAt(text, 0, end, begin);
if (EMOJI.Contains(codepoint))
{
if (current == BreakIterator.Done || next == BreakIterator.Done)
return BreakIterator.WordNone;

int begin = m_start + current;
int end = m_start + next;

int codepoint;
for (int i = begin; i < end; i += UTF16.GetCharCount(codepoint))
if (EMOJI_RK.Contains(codepoint))
{
codepoint = UTF16.CharAt(m_text, 0, end, begin);

if (UChar.IsDigit(codepoint))
return BreakIterator.WordNumber;
else if (UChar.IsLetter(codepoint))
{
// TODO: try to separately specify ideographic, kana?
// [currently all bundled as letter for this case]
return BreakIterator.WordLetter;
}
// if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
// an emoji presentation selector or keycap follows.
int trailer = begin + Character.CharCount(codepoint);
return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
}
else
{
return true;
}

return BreakIterator.WordNone;
}
return false;
}

public override void SetText(CharacterIterator text)
{
bi.SetText(text);
status = BreakIterator.WordNone;
}
public void SetText(char[] text, int start, int length)
{
this.text = text;
this.start = start;
textIterator.SetText(text, start, length);
rbbi.SetText(textIterator);
status = RuleBasedBreakIterator.WordNone;
}
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Lucene version compatibility level 7.1.0
// Lucene version compatibility level 8.6.1
#if FEATURE_BREAKITERATOR
using ICU4N.Support.Text;
using Lucene.Net.Support;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Lucene version compatibility level 7.1.0
// Lucene version compatibility level 8.6.1
using ICU4N;
using ICU4N.Globalization;
using ICU4N.Text;
Expand Down Expand Up @@ -124,8 +124,8 @@ public void SetText(char[] text, int start, int length)

private BreakIteratorWrapper GetBreakIterator(int scriptCode)
{
if (wordBreakers[scriptCode] == null)
wordBreakers[scriptCode] = BreakIteratorWrapper.Wrap(config.GetBreakIterator(scriptCode));
if (wordBreakers[scriptCode] is null)
wordBreakers[scriptCode] = new BreakIteratorWrapper(config.GetBreakIterator(scriptCode));
return wordBreakers[scriptCode];
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Lucene version compatibility level 7.1.0
// Lucene version compatibility level 8.6.1
using ICU4N.Globalization;
using ICU4N.Text;
using J2N;
Expand Down Expand Up @@ -53,6 +53,8 @@ public class DefaultICUTokenizerConfig : ICUTokenizerConfig
public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/// <summary>Token type for words that appear to be numbers</summary>
public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/// <summary>Token type for words that appear to be emoji sequences</summary>
public static readonly string WORD_EMOJI = "<EMOJI>"; //StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; // LUCENENET: 4.8.1 StandardTokenizer doesn't contain EMOJI

/// <summary>
/// the default breakiterators in use. these can be expensive to
Expand Down Expand Up @@ -90,21 +92,21 @@ public DefaultICUTokenizerConfig(bool cjkAsWords, bool myanmarAsWords)

public override bool CombineCJ => cjkAsWords;

public override BreakIterator GetBreakIterator(int script)
public override RuleBasedBreakIterator GetBreakIterator(int script)
{
switch (script)
{
case UScript.Japanese: return (BreakIterator)cjkBreakIterator.Clone();
case UScript.Japanese: return (RuleBasedBreakIterator)cjkBreakIterator.Clone();
case UScript.Myanmar:
if (myanmarAsWords)
{
return (BreakIterator)defaultBreakIterator.Clone();
return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
}
else
{
return (BreakIterator)myanmarSyllableIterator.Clone();
return (RuleBasedBreakIterator)myanmarSyllableIterator.Clone();
}
default: return (BreakIterator)defaultBreakIterator.Clone();
default: return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
}
}

Expand All @@ -120,6 +122,8 @@ public override string GetType(int script, int ruleStatus)
return script == UScript.Hangul ? WORD_HANGUL : WORD_LETTER;
case BreakIterator.WordNumber: //RuleBasedBreakIterator.WORD_NUMBER:
return WORD_NUMBER;
case EMOJI_SEQUENCE_STATUS:
return WORD_EMOJI;
default: /* some other custom code */
return "<OTHER>";
}
Expand Down
Loading