diff --git a/SparqlForHumans.CLI/Program.cs b/SparqlForHumans.CLI/Program.cs index 16cfc679..268ec1b0 100644 --- a/SparqlForHumans.CLI/Program.cs +++ b/SparqlForHumans.CLI/Program.cs @@ -25,7 +25,7 @@ private static void Main(string[] args) //FilterReorderSortAll(); //FilterReorderSort500(); //CreateEntitiesIndex(@"C:\Users\admin\Desktop\DCC\SparqlforHumans\SparqlForHumans.CLI\bin\x64\Debug\netcoreapp2.1\filtered-All.Sorted.nt", true); - CreatePropertiesIndex(@"C:\Users\admin\Desktop\DCC\SparqlforHumans\SparqlForHumans.CLI\bin\x64\Debug\netcoreapp2.1\filtered-All.Sorted.nt", true); + //CreatePropertiesIndex(@"C:\Users\admin\Desktop\DCC\SparqlforHumans\SparqlForHumans.CLI\bin\x64\Debug\netcoreapp2.1\filtered-All.Sorted.nt", true); //ReorderAll(); //FilterAll(); //Filter5k(); @@ -69,7 +69,7 @@ public static void CreatePropertiesIndex(string filename, bool overwrite = false { var propertyOutputPath = LuceneDirectoryDefaults.PropertyIndexPath; propertyOutputPath.DeleteIfExists(overwrite); - new SimplePropertiesIndexer(filename, LuceneDirectoryDefaults.PropertyIndexPath).Index(); + new SimplePropertiesIndexer(filename, propertyOutputPath).Index(); } private static void Filter2MM() diff --git a/SparqlForHumans.Logger/BaseNotifier.cs b/SparqlForHumans.Logger/BaseNotifier.cs index 14ce3a1c..482e85b0 100644 --- a/SparqlForHumans.Logger/BaseNotifier.cs +++ b/SparqlForHumans.Logger/BaseNotifier.cs @@ -3,7 +3,7 @@ public abstract class BaseNotifier { private readonly NLog.Logger _logger = SparqlForHumans.Logger.Logger.Init(); - public int NotifyTicks { get; set; } = 100000; + public int NotifyTicks { get; } = 100000; public abstract string NotifyMessage { get; } public virtual void LogProgress(long Ticks, bool overrideCheck = false) diff --git a/SparqlForHumans.Lucene/Index/SimplePropertiesIndexer.cs b/SparqlForHumans.Lucene/Index/SimplePropertiesIndexer.cs index 181147ae..b9968cf1 100644 --- a/SparqlForHumans.Lucene/Index/SimplePropertiesIndexer.cs +++ b/SparqlForHumans.Lucene/Index/SimplePropertiesIndexer.cs @@ -1,4 +1,5 @@ using System; +using System.Collections; using System.Collections.Generic; using System.Linq; using Lucene.Net.Documents; @@ -15,7 +16,7 @@ namespace SparqlForHumans.Lucene.Index { - public class SimplePropertiesIndexer: BaseNotifier, IIndexer + public class SimplePropertiesIndexer : BaseNotifier, IIndexer { public SimplePropertiesIndexer(string inputFilename, string outputDirectory) { @@ -39,80 +40,18 @@ public bool FilterGroups(SubjectGroup tripleGroup) { return tripleGroup.IsEntityP(); } - private Dictionary FrequencyDictionary { get; set; } = new Dictionary(); - private Dictionary> DomainDictionary { get; set; } = new Dictionary>(); - private Dictionary> RangeDictionary { get; set; } = new Dictionary>(); + private Hashtable FrequencyHashTable { get; set; } = new Hashtable(); + private Dictionary> DomainDictionary { get; set; } = new Dictionary>(); + private Dictionary> RangeDictionary { get; set; } = new Dictionary>(); private static string FrequencyFieldName => Labels.Rank.ToString(); private static string DomainFieldName => Labels.DomainType.ToString(); private static string RangeFieldName => Labels.Range.ToString(); - internal void FrequencyParseTripleGroup(Dictionary dictionary, IEnumerable triples) - { - foreach (var triple in triples) - { - // Filter Properties Only - if (!triple.Predicate.IsProperty()) continue; - - var predicateIntId = triple.Predicate.GetIntId(); - - if (!dictionary.ContainsKey(predicateIntId)) - dictionary.Add(predicateIntId, 0); - - dictionary[predicateIntId]++; - } - } - - internal void DomainParseTripleGroup(Dictionary> dictionary, IEnumerable triples) - { - // Filter those the triples that are properties only (Exclude description, label, etc.) - var propertiesTriples = triples.Where(x => x.Predicate.IsProperty()); - - var (instanceOfSlice, otherPropertiesSlice) = propertiesTriples.SliceBy(x => x.Predicate.IsInstanceOf()); - - // InstanceOf Ids (Domain Types) and Properties - var propertyIds = otherPropertiesSlice.Select(x => x.Predicate.GetIntId()).Distinct().ToArray(); - var instanceOfIds = instanceOfSlice.Select(x => x.Object.GetIntId()).Distinct().ToArray(); - var instanceOfPropertyIds = instanceOfSlice.Select(x => x.Predicate.GetIntId()); - - foreach (var instanceOfId in instanceOfPropertyIds) - { - dictionary.AddSafe(instanceOfId, instanceOfIds); - } - - foreach (var propertyId in propertyIds) - { - dictionary.AddSafe(propertyId, instanceOfIds); - } - } - - internal void RangeParseTripleGroup(Dictionary> dictionary, IEnumerable triples) - { - // Filter those the triples that are properties only (Exclude description, label, etc.) - var propertiesTriples = triples.Where(x => x.Predicate.IsReverseProperty() - || x.Predicate.IsInstanceOf() - || x.Predicate.IsReverseInstanceOf()).ToArray(); - - var instanceOf = propertiesTriples.Where(x => x.Predicate.IsInstanceOf()); - var reverseInstanceOf = propertiesTriples.Where(x => x.Predicate.IsReverseInstanceOf()); - var reverseProperties = propertiesTriples.Where(x => x.Predicate.IsReverseProperty() && !x.Predicate.IsReverseInstanceOf()); - var instanceOfIds = instanceOf.Select(x => x.Object.GetIntId()); - var reverseInstanceOfIds = reverseInstanceOf.Select(x => x.Predicate.GetIntId()); - var reversePropertyIds = reverseProperties.Select(x => x.Predicate.GetIntId()); - - foreach (var reversePropertyId in reversePropertyIds) { - dictionary.AddSafe(reversePropertyId, instanceOfIds); - } - - foreach (var reverseInstanceOfId in reverseInstanceOfIds) { - dictionary.AddSafe(reverseInstanceOfId, instanceOfIds); - } - } - public IEnumerable FrequencyGetField(SubjectGroup subjectGroup) { var subjectId = subjectGroup.Id.ToNumbers(); - return FrequencyDictionary.ContainsKey(subjectId) - ? new List { new DoubleField(FrequencyFieldName, FrequencyDictionary[subjectId], Field.Store.YES) } + return FrequencyHashTable.ContainsKey(subjectId) + ? new List { new DoubleField(FrequencyFieldName, (int)FrequencyHashTable[subjectId], Field.Store.YES) } : new List(); } @@ -134,6 +73,8 @@ public IEnumerable DomainGetField(SubjectGroup subjectGroup) public void Index() { + var indexConfig = LuceneIndexDefaults.CreateStandardIndexWriterConfig(); + long readCount = 0; // Read All lines in the file (IEnumerable, yield) @@ -141,58 +82,78 @@ public void Index() var subjectGroups = FileHelper.GetInputLines(InputFilename) .GroupBySubject(); - NotifyTicks = 10000; - + //First Pass: foreach (var subjectGroup in subjectGroups.Where(x => x.IsEntityQ())) { - var subjectGroupArray = subjectGroup.ToArray(); - FrequencyParseTripleGroup(FrequencyDictionary, subjectGroupArray); - DomainParseTripleGroup(DomainDictionary, subjectGroupArray); - RangeParseTripleGroup(RangeDictionary, subjectGroupArray); + + var validTriples = subjectGroup.Where(x => + x.Predicate.IsProperty() || + (x.Predicate.IsReverseProperty() && !x.Predicate.IsReverseInstanceOf())).ToArray(); + + var properties = validTriples.Where(x => x.Predicate.IsProperty()).ToArray(); + + //FREQUENCY + foreach (var triple in properties) + { + var propertyIntId = triple.Predicate.GetIntId(); + + if (!FrequencyHashTable.ContainsKey(propertyIntId)) + FrequencyHashTable.Add(propertyIntId, 0); + + FrequencyHashTable[propertyIntId] = ((int)FrequencyHashTable[propertyIntId]) + 1; + } + + //DOMAIN: + var (instanceOf, otherProperties) = properties.SliceBy(x => x.Predicate.IsInstanceOf()); + var propertyIds = otherProperties.Select(x => x.Predicate.GetIntId()); + var instanceOfIds = instanceOf.Select(x => x.Object.GetIntId()).ToArray(); + DomainDictionary.AddSafe(31, instanceOfIds); + foreach (var propertyId in propertyIds) + DomainDictionary.AddSafe(propertyId, instanceOfIds); + + //RANGE: + var reverseProperties = validTriples.Where(x => x.Predicate.IsReverseProperty() && !x.Predicate.IsReverseInstanceOf()); + var reversePropertyIds = reverseProperties.Select(x => x.Predicate.GetIntId()); + RangeDictionary.AddSafe(31, instanceOfIds); + foreach (var reversePropertyId in reversePropertyIds) + RangeDictionary.AddSafe(reversePropertyId, instanceOfIds); + LogProgress(readCount++); } - NotifyTicks = 100000; - readCount = 0; - var indexConfig = LuceneIndexDefaults.CreateStandardIndexWriterConfig(); - + //Second Pass: using (var indexDirectory = FSDirectory.Open(OutputDirectory.GetOrCreateDirectory())) using (var writer = new IndexWriter(indexDirectory, indexConfig)) { - foreach (var subjectGroup in subjectGroups.Where(FilterGroups)) { + foreach (var subjectGroup in subjectGroups.Where(FilterGroups)) + { var document = new Document(); - FrequencyGetField(subjectGroup).ToList().ForEach(x => document.Add(x)); - DomainGetField(subjectGroup).ToList().ForEach(x => document.Add(x)); - RangeGetField(subjectGroup).ToList().ForEach(x => document.Add(x)); + foreach (var field in FrequencyGetField(subjectGroup)) + document.Add(field); + + foreach (var field in DomainGetField(subjectGroup)) + document.Add(field); + + foreach (var field in RangeGetField(subjectGroup)) + document.Add(field); var boostField = document.Fields.FirstOrDefault(x => x.Name.Equals(Labels.Rank.ToString())); var boost = 0.0; if (boostField != null) - { boost = (double)boostField.GetDoubleValue(); - } foreach (var fieldIndexer in FieldIndexers) - { fieldIndexer.Boost = boost; - } foreach (var fieldIndexer in FieldIndexers) - { foreach (var field in fieldIndexer.GetField(subjectGroup)) - { document.Add(field); - } - } LogProgress(readCount++); - //if (FilterGroups(subjectGroup)) - //{ - writer.AddDocument(document); - //} + writer.AddDocument(document); } } diff --git a/SparqlForHumans.UnitTests/Index/PropertiesIndexerTests.cs b/SparqlForHumans.UnitTests/Index/PropertiesIndexerTests.cs index 44a8daa2..4154a32d 100644 --- a/SparqlForHumans.UnitTests/Index/PropertiesIndexerTests.cs +++ b/SparqlForHumans.UnitTests/Index/PropertiesIndexerTests.cs @@ -187,8 +187,9 @@ public void TestAddRangeToIndex_InstanceOf() var property31WithRange = properties.FirstOrDefault(x => x.Id.Equals("P31")).Range; Assert.NotEmpty(property31WithRange); - Assert.Equal(100, property31WithRange[0]); - Assert.Equal(200, property31WithRange[1]); + Assert.Contains(100, property31WithRange); + Assert.Contains(200, property31WithRange); + Assert.Contains(5, property31WithRange); outputPath.DeleteIfExists(); } diff --git a/SparqlForHumans.Utilities/DictionaryExtensions.cs b/SparqlForHumans.Utilities/DictionaryExtensions.cs index 2856f220..e1afb28d 100644 --- a/SparqlForHumans.Utilities/DictionaryExtensions.cs +++ b/SparqlForHumans.Utilities/DictionaryExtensions.cs @@ -33,6 +33,18 @@ public static void AddSafe(this Dictionary> dictionary, T1 dictionary.Add(key, values.Distinct().ToList()); } } + public static void AddSafe(this Dictionary> dictionary, T1 key, IEnumerable values) + { + if (!values.Any()) + return; + + if (!dictionary.ContainsKey(key)) + dictionary.Add(key, new HashSet()); + + foreach (var value in values) + dictionary[key].Add(value); + } + public static Dictionary> InvertDictionary(this Dictionary> dictionary) {