Skip to content

Commit

Permalink
BUG: Fixed surrogate pair and culture-sensitivity issues with many analyzers. (see apache#296)
Browse files Browse the repository at this point in the history
  • Loading branch information
NightOwl888 committed Jul 29, 2020
1 parent 7622fbf commit 22a1e22
Show file tree
Hide file tree
Showing 15 changed files with 43 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace Lucene.Net.Analysis.Core
/// <summary>
/// A <see cref="LetterTokenizer"/> is a tokenizer that divides text at non-letters. That's to
/// say, it defines tokens as maximal strings of adjacent letters, as defined by
/// <see cref="char.IsLetter(char)"/> predicate.
/// <see cref="Character.IsLetter(int)"/> predicate.
/// <para>
/// Note: this does a decent job for most European languages, but does a terrible
/// job for some Asian languages, where words are not separated by spaces.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using Lucene.Net.Analysis.Util;
using J2N;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System.IO;

Expand Down Expand Up @@ -65,7 +66,7 @@ public WhitespaceTokenizer(LuceneVersion matchVersion, AttributeFactory factory,
/// </summary>
protected override bool IsTokenChar(int c)
{
return !char.IsWhiteSpace((char)c);
return !Character.IsWhiteSpace(c);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System.Globalization;

namespace Lucene.Net.Analysis.El
{
Expand Down Expand Up @@ -37,6 +38,8 @@ public sealed class GreekLowerCaseFilter : TokenFilter
private readonly ICharTermAttribute termAtt;
private readonly CharacterUtils charUtils;

private static readonly CultureInfo culture = new CultureInfo("el"); // LUCENENET specific - use Greek culture when lowercasing.

/// <summary>
/// Create a <see cref="GreekLowerCaseFilter"/> that normalizes Greek token text.
/// </summary>
Expand Down Expand Up @@ -127,7 +130,7 @@ private int LowerCase(int codepoint)
return '\u03C2'; // small final sigma

default:
return Character.ToLower(codepoint);
return Character.ToLower(codepoint, culture); // LUCENENET specific - need to use specific culture to override current thread
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -841,9 +841,9 @@ public virtual bool Stem(int i0)
// ch = buffer[offset++];
// }

// if (char.IsLetter((char)ch))
// if (Character.IsLetter(ch))
// {
// s.Add(char.ToLowerInvariant((char)ch));
// s.Add(Character.ToLower(ch, CultureInfo.InvariantCulture));
// }
// else
// {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N;
using Lucene.Net.Analysis.TokenAttributes;
using System.Globalization;

namespace Lucene.Net.Analysis.Ga
{
Expand Down Expand Up @@ -28,6 +29,8 @@ public sealed class IrishLowerCaseFilter : TokenFilter
{
private readonly ICharTermAttribute termAtt;

private static readonly CultureInfo culture = new CultureInfo("ga"); // LUCENENET specific - use Irish culture when lowercasing.

/// <summary>
/// Create an <see cref="IrishLowerCaseFilter"/> that normalises Irish token text.
/// </summary>
Expand Down Expand Up @@ -60,7 +63,7 @@ public override bool IncrementToken()

for (int i = idx; i < chLen;)
{
i += Character.ToChars(Character.ToLower(chArray[i]), chArray, i);
i += Character.ToChars(Character.ToLower(chArray[i], culture), chArray, i); // LUCENENET specific - use Irish culture when lowercasing
}
return true;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Globalization;
using J2N;
using System.Globalization;

namespace Lucene.Net.Analysis.Miscellaneous
{
Expand Down Expand Up @@ -84,17 +85,17 @@ private static byte[] LoadDefaultWordDelimTable() // LUCENENET: Avoid static con
for (int i = 0; i < 256; i++)
{
byte code = 0;
if (char.IsLower((char)i))
if (Character.IsLower(i))
{
code |= (byte)WordDelimiterFilter.LOWER;
code |= WordDelimiterFilter.LOWER;
}
else if (char.IsUpper((char)i))
else if (Character.IsUpper(i))
{
code |= (byte)WordDelimiterFilter.UPPER;
code |= WordDelimiterFilter.UPPER;
}
else if (char.IsDigit((char)i))
else if (Character.IsDigit(i))
{
code |= (byte)WordDelimiterFilter.DIGIT;
code |= WordDelimiterFilter.DIGIT;
}
if (code == 0)
{
Expand Down Expand Up @@ -318,7 +319,7 @@ private int CharType(int ch)
/// <returns> Type of the character </returns>
public static byte GetType(int ch)
{
switch (CharUnicodeInfo.GetUnicodeCategory((char)ch))
switch (Character.GetType(ch))
{
case UnicodeCategory.UppercaseLetter:
return WordDelimiterFilter.UPPER;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ protected override bool IncrementWord()

// find the next set of boundaries, skipping over non-tokens
int end = wordBreaker.Next();
while (end != BreakIterator.Done && !char.IsLetterOrDigit((char)Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
{
start = end;
end = wordBreaker.Next();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using J2N;
using J2N.Globalization;
using Lucene.Net.Analysis.TokenAttributes;
using System;
using System.Globalization;
Expand Down Expand Up @@ -35,12 +34,13 @@ namespace Lucene.Net.Analysis.Tr
public sealed class TurkishLowerCaseFilter : TokenFilter
{
private const int LATIN_CAPITAL_LETTER_I = '\u0049';
private const int LATIN_CAPITAL_LETTER_DOTTED_I = '\u0130';
private const int LATIN_SMALL_LETTER_I = '\u0069';
private const int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
private const int COMBINING_DOT_ABOVE = '\u0307';
private readonly ICharTermAttribute termAtt;

private static readonly CultureInfo culture = new CultureInfo("tr"); // LUCENENET specific - we need to do a culture-sensitive lowercase operation in Turkish

/// <summary>
/// Create a new <see cref="TurkishLowerCaseFilter"/>, that normalizes Turkish token text
/// to lower case.
Expand All @@ -64,7 +64,7 @@ public override sealed bool IncrementToken()
{
int ch = Character.CodePointAt(buffer, i, length);

iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && CharUnicodeInfo.GetUnicodeCategory((char)ch) == UnicodeCategory.NonSpacingMark));
iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && Character.GetType(ch) == UnicodeCategory.NonSpacingMark));

if (iOrAfter) // all the special I turkish handling happens here.
{
Expand Down Expand Up @@ -93,32 +93,8 @@ public override sealed bool IncrementToken()
}
}

using (var culture = new CultureContext("tr"))
{
switch (ch)
{
// LUCENENET: The .NET char.ToLower() function works correctly in
// Turkish as long as the current thread is set to tr-TR (well, technically the
// culture change is only required for the LATIN_CAPITAL_LETTER_I case). .NET does
// not split these characters into separate letter/non-spacing mark characters,
// but the user might still input them that way so we still need the above
// block to handle that case.
//
// LUCENENET TODO: Oddly, the Character.ToLowerCase() function below does not work right
// for Turkish. Which begs the question, should this special case be there so Turkish works
// everywhere? Or should we leave it a special case here because that is the way it works in Java?
//
// References:
// http://haacked.com/archive/2012/07/05/turkish-i-problem-and-why-you-should-care.aspx/
// http://www.i18nguy.com/unicode/turkish-i18n.html
case LATIN_CAPITAL_LETTER_I:
case LATIN_CAPITAL_LETTER_DOTTED_I:
i += Character.ToChars(char.ToLower((char)ch), buffer, i);
continue;
}
}

i += Character.ToChars(Character.ToLower(ch), buffer, i);
// LUCENENET specific - need to pass Turkish culture to get the correct lowercase results
i += Character.ToChars(Character.ToLower(ch, culture), buffer, i);
}

termAtt.Length = length;
Expand All @@ -139,8 +115,7 @@ private bool IsBeforeDot(char[] s, int pos, int len)
for (int i = pos; i < len;)
{
int ch = Character.CodePointAt(s, i, len);
//if (char.getType(ch) != char.NON_SPACING_MARK)
if (CharUnicodeInfo.GetUnicodeCategory((char)ch) != UnicodeCategory.NonSpacingMark)
if (Character.GetType(ch) != UnicodeCategory.NonSpacingMark)
{
return false;
}
Expand All @@ -161,9 +136,7 @@ private bool IsBeforeDot(char[] s, int pos, int len)
private int Delete(char[] s, int pos, int len)
{
if (pos < len)
{
Array.Copy(s, pos + 1, s, pos, len - pos - 1);
}

return len - 1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ public static CharacterBuffer NewCharacterBuffer(int bufferSize)


/// <summary>
/// Converts each unicode codepoint to lowerCase via <see cref="Character.ToLower(int)"/> starting
/// Converts each unicode codepoint to lowerCase via <see cref="TextInfo.ToLower(string)"/> in the invariant culture starting
/// at the given offset. </summary>
/// <param name="buffer"> the char buffer to lowercase </param>
/// <param name="offset"> the offset to start at </param>
Expand Down Expand Up @@ -199,7 +199,7 @@ public static CharacterBuffer NewCharacterBuffer(int bufferSize)
}

/// <summary>
/// Converts each unicode codepoint to UpperCase via <see cref="Character.ToUpper(int)"/> starting
/// Converts each unicode codepoint to UpperCase via <see cref="TextInfo.ToUpper(string)"/> in the invariant culture starting
/// at the given offset. </summary>
/// <param name="buffer"> the char buffer to UPPERCASE </param>
/// <param name="offset"> the offset to start at </param>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using Morfologik.Stemming.Polish;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;

Expand Down Expand Up @@ -55,6 +56,8 @@ public class MorfologikFilter : TokenFilter

private int lemmaListIndex;

private static readonly CultureInfo culture = new CultureInfo("pl"); // LUCENENET specific - do lowercasing in Polish culture

/// <summary>
/// Creates a filter with the default (Polish) dictionary.
/// </summary>
Expand Down Expand Up @@ -166,7 +169,7 @@ private string ToLowercase(string chs)
for (int i = 0; i < length;)
{
i += Character.ToChars(
Character.ToLower(Character.CodePointAt(chs, i)), buffer, i);
Character.ToLower(Character.CodePointAt(chs, i), culture), buffer, i); // LUCENENET specific - need to use explicit culture to override current thread
}

return scratch.ToString();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public virtual void TestRandomRealisticWhiteSpace()
for (int j = 0; j < charArray.Length;)
{
int cp = Character.CodePointAt(charArray, j, charArray.Length);
if (!char.IsWhiteSpace((char)cp))
if (!Character.IsWhiteSpace(cp))
{
sb.AppendCodePoint(cp);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using Lucene.Net.Analysis.TokenAttributes;
using NUnit.Framework;
using System;
using System.Globalization;
using System.IO;

namespace Lucene.Net.Analysis.Payloads
Expand Down Expand Up @@ -39,7 +40,7 @@ public virtual void Test()
nptf.Reset();
while (nptf.IncrementToken())
{
assertTrue(typeAtt.Type + " is not null and it should be", typeAtt.Type.Equals(char.ToUpper(termAtt.Buffer[0]).ToString(), StringComparison.Ordinal));
assertTrue(typeAtt.Type + " is not null and it should be", typeAtt.Type.Equals(char.ToUpper(termAtt.Buffer[0]).ToString(), StringComparison.Ordinal)); // LUCENENET specific - intentionally using current culture
assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.Payload != null);
string type = payloadAtt.Payload.Utf8ToString();
assertTrue(type + " is not equal to " + typeAtt.Type, type.Equals(typeAtt.Type, StringComparison.Ordinal));
Expand Down Expand Up @@ -67,7 +68,7 @@ public override bool IncrementToken()
{
if (m_input.IncrementToken())
{
typeAtt.Type = char.ToUpper(termAtt.Buffer[0]).ToString();
typeAtt.Type = char.ToUpper(termAtt.Buffer[0]).ToString(); // LUCENENET specific - intentionally using current culture
return true;
}
else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ public override object Format(Passage[] passages, String content)
assertEquals(matchStart + 1, matchEnd);
// and the offsets must be correct...
assertEquals(1, term.Length);
assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart]));
assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart], CultureInfo.InvariantCulture));
}
// record just the start/end offset for simplicity
seen.Add(new Pair(p.StartOffset, p.EndOffset));
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ public void Test_Dictionary_Set_Null()
public void Test_Dictionary_AddReplace()
{
string key = "A";
string key2 = "a".ToUpper();
string key2 = "a".ToUpperInvariant();

dictionary.Add(key, "value");
dictionary[key2] = "value2";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ public override object Format(Passage[] passages, String content)
assertEquals(matchStart + 1, matchEnd);
// and the offsets must be correct...
assertEquals(1, term.Length);
assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart]));
assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart], CultureInfo.InvariantCulture)); // LUCENENET specific - need to use invariant culture to match Java
}
// record just the start/end offset for simplicity
seen.Add(new Pair(p.StartOffset, p.EndOffset));
Expand Down

0 comments on commit 22a1e22

Please sign in to comment.