diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs index 695e817552..c3b8f41e6b 100644 --- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs +++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs @@ -79,7 +79,7 @@ public static void ReadLine(DataInput input, BytesRef scratch) { break; } - + scratch.Bytes[upto++] = b; } } @@ -106,8 +106,9 @@ public static void CheckFooter(ChecksumIndexInput input) if (StringHelper.StartsWith(scratch, CHECKSUM) == false) { + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes throw new CorruptIndexException("SimpleText failure: expected checksum line but got " + - scratch.Utf8ToString() + " (resource=" + input + ")"); + scratch.Utf8ToStringWithFallback() + " (resource=" + input + ")"); } var actualChecksum = (new BytesRef(scratch.Bytes, CHECKSUM.Length, scratch.Length - CHECKSUM.Length)).Utf8ToString(); @@ -124,4 +125,4 @@ public static void CheckFooter(ChecksumIndexInput input) } } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs index 91d4c355ab..3ca21e13f5 100644 --- a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs +++ b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs @@ -41,7 +41,7 @@ public class DefaultSortedSetDocValuesReaderState : SortedSetDocValuesReaderStat /// /// Creates this, pulling doc values from the specified - /// field. + /// field. /// public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME) { @@ -79,7 +79,8 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F string[] components = FacetsConfig.StringToPath(spare.Utf8ToString()); if (components.Length != 2) { - throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToString()); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToStringWithFallback()); } if (!components[0].Equals(lastDim, StringComparison.Ordinal)) { @@ -101,7 +102,7 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F /// /// Return top-level doc values. /// - public override SortedSetDocValues GetDocValues() + public override SortedSetDocValues GetDocValues() { return topReader.GetSortedSetDocValues(field); } @@ -132,4 +133,4 @@ public override OrdRange GetOrdRange(string dim) /// public override int Count => valueCount; } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs index 8abb12520e..89599b49a6 100644 --- a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs +++ b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs @@ -275,7 +275,7 @@ public override int GetHashCode() public override string ToString() { return "FacetEntry{" + - "value=" + value.Utf8ToString() + + "value=" + value.Utf8ToStringWithFallback() + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes ", count=" + count + '}'; } diff --git a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs index 9d91aa64c1..9a5c9c1476 100644 --- a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs +++ b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs @@ -134,7 +134,7 @@ public WeightAnonymousClass(TermsIncludingScoreQuery outerInstance, Weight origi private TermsEnum segmentTermsEnum; - + public override Explanation Explain(AtomicReaderContext context, int doc) { SVInnerScorer scorer = (SVInnerScorer) GetBulkScorer(context, false, null); @@ -161,7 +161,7 @@ public override void Normalize(float norm, float topLevelBoost) { originalWeight.Normalize(norm, topLevelBoost*outerInstance.Boost); } - + public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) { Terms terms = context.AtomicReader.GetTerms(outerInstance._field); @@ -181,7 +181,7 @@ public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) return new SVInOrderScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost); } - + public override BulkScorer GetBulkScorer(AtomicReaderContext context, bool scoreDocsInOrder, IBits acceptDocs) { if (scoreDocsInOrder) @@ -236,7 +236,7 @@ internal SVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, //_cost = cost; // LUCENENET: Never read _doc = -1; } - + public override bool Score(ICollector collector, int max) { FakeScorer fakeScorer = new FakeScorer(); @@ -285,12 +285,12 @@ private int NextDocOutOfOrder() } } } - + protected virtual int DocsEnumNextDoc() { return docsEnum.NextDoc(); } - + internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibility from private to internal { int docId; @@ -314,7 +314,7 @@ internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibilit } while (docId != DocIdSetIterator.NO_MORE_DOCS); return new ComplexExplanation(true, outerInstance._scores[outerInstance._ords[_scoreUpto]], - "Score based on join value " + _termsEnum.Term.Utf8ToString()); + "Score based on join value " + _termsEnum.Term.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes } } @@ -326,13 +326,13 @@ internal class MVInnerScorer : SVInnerScorer internal readonly FixedBitSet alreadyEmittedDocs; internal MVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, // LUCENENET: Never read */ - IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */) + IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */) : base(outerInstance, /*weight, // LUCENENET: Never read */ acceptDocs, termsEnum /*, cost // LUCENENET: Never read */) { alreadyEmittedDocs = new FixedBitSet(maxDoc); } - + protected override int DocsEnumNextDoc() { while (true) @@ -360,11 +360,11 @@ internal class SVInOrderScorer : Scorer internal readonly long cost; internal int currentDoc = -1; - + [SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")] [SuppressMessage("CodeQuality", "S1699:Constructors should only call non-overridable methods", Justification = "Internal class")] internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, IBits acceptDocs, - TermsEnum termsEnum, int maxDoc, long cost) + TermsEnum termsEnum, int maxDoc, long cost) : base(weight) { this.m_outerInstance = outerInstance; @@ -374,7 +374,7 @@ internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, matchingDocsIterator = matchingDocs.GetIterator(); this.cost = cost; } - + protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs, TermsEnum termsEnum) { @@ -398,12 +398,12 @@ protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptD } } } - + public override float GetScore() { return scores[currentDoc]; } - + public override int Freq => 1; public override int DocID => currentDoc; @@ -412,7 +412,7 @@ public override int NextDoc() { return currentDoc = matchingDocsIterator.NextDoc(); } - + public override int Advance(int target) { return currentDoc = matchingDocsIterator.Advance(target); @@ -432,7 +432,7 @@ internal MVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, : base(outerInstance, weight, acceptDocs, termsEnum, maxDoc, cost) { } - + protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs, TermsEnum termsEnum) { @@ -465,4 +465,4 @@ protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits accept } } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Misc/Misc/TermStats.cs b/src/Lucene.Net.Misc/Misc/TermStats.cs index 7ac5fbc918..c2d1664870 100644 --- a/src/Lucene.Net.Misc/Misc/TermStats.cs +++ b/src/Lucene.Net.Misc/Misc/TermStats.cs @@ -45,7 +45,8 @@ internal string GetTermText() public override string ToString() { - return ("TermStats: Term=" + TermText.Utf8ToString() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return "TermStats: Term=" + TermText.Utf8ToStringWithFallback() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq; } } } diff --git a/src/Lucene.Net.Queries/TermsFilter.cs b/src/Lucene.Net.Queries/TermsFilter.cs index 3aae8295f3..ced20eed59 100644 --- a/src/Lucene.Net.Queries/TermsFilter.cs +++ b/src/Lucene.Net.Queries/TermsFilter.cs @@ -319,7 +319,7 @@ public override string ToString() } first = false; builder.Append(current.field).Append(':'); - builder.Append(spare.Utf8ToString()); + builder.Append(spare.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes } } diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs index 06587b33ca..ae71302ae0 100644 --- a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs +++ b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs @@ -59,7 +59,8 @@ internal Completion(BytesRef key, int bucket) public override string ToString() { - return Utf8.Utf8ToString() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return Utf8.Utf8ToStringWithFallback() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture); } /// diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs index cdace9c1cc..e771023d59 100644 --- a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs +++ b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs @@ -440,7 +440,8 @@ public PendingTerm(BytesRef term, BlockTermState state) public override string ToString() { - return Term.Utf8ToString(); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return Term.Utf8ToStringWithFallback(); } } @@ -468,7 +469,8 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f public override string ToString() { - return $"BLOCK: {Prefix.Utf8ToString()}"; + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return $"BLOCK: {Prefix.Utf8ToStringWithFallback()}"; } #nullable enable diff --git a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs index 26a5b54a7a..fc941fab63 100644 --- a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs +++ b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs @@ -48,7 +48,7 @@ namespace Lucene.Net.Codecs.Lucene3x /// /// Exposes flex API on a pre-flex index, as a codec. /// - /// @lucene.experimental + /// @lucene.experimental /// [Obsolete("(4.0)")] internal class Lucene3xFields : FieldsProducer @@ -344,7 +344,8 @@ private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) if (DEBUG_SURROGATES) { - Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString())); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback())); } // Seek "back": @@ -488,7 +489,8 @@ private bool DoPop() if (DEBUG_SURROGATES) { - Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString()); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString()); } // TODO: more efficient seek? can we simply swap @@ -599,10 +601,11 @@ private void SurrogateDance() if (DEBUG_SURROGATES) { + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes Console.WriteLine(" dance"); - Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString())); + Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToStringWithFallback())); Console.WriteLine(" " + prevTerm.ToString()); - Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString())); + Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback())); Console.WriteLine(" " + scratchTerm.ToString()); } @@ -679,7 +682,8 @@ private void DoPushes() if (DEBUG_SURROGATES) { - Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length); } // Seek "forward": @@ -777,7 +781,7 @@ internal virtual void Reset(FieldInfo fieldInfo) { //System.out.println("pff.reset te=" + termEnum); this.fieldInfo = fieldInfo; - + internedFieldName = fieldInfo.Name.Intern(); Term term = new Term(internedFieldName); @@ -832,7 +836,8 @@ public override SeekStatus SeekCeil(BytesRef term) { if (DEBUG_SURROGATES) { - Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString())); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback())); } skipNext = false; TermInfosReader tis = outerInstance.TermsDict; @@ -1232,4 +1237,4 @@ public override void CheckIntegrity() { } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs index a7c339f7c9..0124269216 100644 --- a/src/Lucene.Net/Util/BytesRef.cs +++ b/src/Lucene.Net/Util/BytesRef.cs @@ -248,9 +248,9 @@ public string Utf8ToString() /// resulting . /// /// - /// LUCENENET specific version that does not throw exceptions, - /// primarily for use in ToString() and other methods that - /// should not throw exceptions. + /// LUCENENET specific version that does not throw exceptions on invalid UTF-8, + /// primarily for use in ToString() and other cases that should not throw exceptions, + /// such as when building a message for another exception. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public string Utf8ToStringWithFallback() @@ -604,11 +604,11 @@ public override string ToString() switch (format) { case BytesRefFormat.UTF8: - try + if (bytesRef.TryUtf8ToString(out var utf8String)) { - return bytesRef.Utf8ToString(); + return utf8String; } - catch (Exception e) when (e.IsIndexOutOfBoundsException()) + else { return bytesRef.ToString(); }