From 042432376ed2691715f190f78c85c78acc8258c1 Mon Sep 17 00:00:00 2001 From: vjalili Date: Mon, 18 May 2020 20:18:54 -0700 Subject: [PATCH 1/2] Add a delimiter for the hash seed. --- GeUtilities/Intervals/Functions/HashFunctions.cs | 11 +++++++++++ GeUtilities/Intervals/Model/GeneralFeature.cs | 3 ++- GeUtilities/Intervals/Model/Interval.cs | 2 +- GeUtilities/Intervals/Model/Peak.cs | 3 ++- GeUtilities/Intervals/Model/RefSeqGene.cs | 3 ++- GeUtilities/Intervals/Model/Variant.cs | 5 +++-- GeUtilities/Intervals/Parsers/Parser.cs | 2 +- 7 files changed, 22 insertions(+), 7 deletions(-) diff --git a/GeUtilities/Intervals/Functions/HashFunctions.cs b/GeUtilities/Intervals/Functions/HashFunctions.cs index 56b527f..2e0d89f 100644 --- a/GeUtilities/Intervals/Functions/HashFunctions.cs +++ b/GeUtilities/Intervals/Functions/HashFunctions.cs @@ -9,6 +9,12 @@ public static class HashFunctions private const uint _FNVPrime_32 = 16777619; private const uint _FNVOffsetBasis_32 = 2166136261; + /// + /// Sets and gets a string used as a delimiter separating + /// properties used as hash seed. + /// + public const string HashSeedDelimiter = ";;"; + public static uint FNVHashFunction(string bytes) { uint hash = _FNVOffsetBasis_32; @@ -20,5 +26,10 @@ public static uint FNVHashFunction(string bytes) return hash; } + + public static string GetHashSeed(params string[] properties) + { + return string.Join(HashSeedDelimiter, properties); + } } } diff --git a/GeUtilities/Intervals/Model/GeneralFeature.cs b/GeUtilities/Intervals/Model/GeneralFeature.cs index de4e9f5..d730571 100644 --- a/GeUtilities/Intervals/Model/GeneralFeature.cs +++ b/GeUtilities/Intervals/Model/GeneralFeature.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using Genometric.GeUtilities.IGenomics; +using Genometric.GeUtilities.Intervals.Functions; namespace Genometric.GeUtilities.Intervals.Model { @@ -10,7 +11,7 @@ public class GeneralFeature : Interval, IGeneralFeature { public GeneralFeature(int left, int right, string source, string feature, double score, string frame, string attribute, string hashSeed = "") : - base(left, right, source + feature + score.ToString() + frame + attribute + hashSeed) + base(left, right, HashFunctions.GetHashSeed(source, feature, score.ToString(), frame, attribute, hashSeed)) { Source = source; Feature = feature; diff --git a/GeUtilities/Intervals/Model/Interval.cs b/GeUtilities/Intervals/Model/Interval.cs index 1733a91..b43b6fd 100644 --- a/GeUtilities/Intervals/Model/Interval.cs +++ b/GeUtilities/Intervals/Model/Interval.cs @@ -16,7 +16,7 @@ public Interval(int left, int right, string hashSeed = "") unchecked { - _hashKey = (int)HashFunctions.FNVHashFunction(left.ToString() + right.ToString() + hashSeed); + _hashKey = (int)HashFunctions.FNVHashFunction(HashFunctions.GetHashSeed(left.ToString(), right.ToString(), hashSeed)); } } diff --git a/GeUtilities/Intervals/Model/Peak.cs b/GeUtilities/Intervals/Model/Peak.cs index 06b9b33..76ec0c5 100644 --- a/GeUtilities/Intervals/Model/Peak.cs +++ b/GeUtilities/Intervals/Model/Peak.cs @@ -3,13 +3,14 @@ // See the LICENSE file in the project root for more information. using Genometric.GeUtilities.IGenomics; +using Genometric.GeUtilities.Intervals.Functions; namespace Genometric.GeUtilities.Intervals.Model { public class Peak : Interval, IPeak { public Peak(int left, int right, double value, string name = null, int summit = -1, string hashSeed = "") : - base(left, right, value.ToString() + summit.ToString() + name + hashSeed) + base(left, right, HashFunctions.GetHashSeed(value.ToString(), summit.ToString(), name, hashSeed)) { Value = value; Summit = summit != -1 ? summit : (right - left) / 2; diff --git a/GeUtilities/Intervals/Model/RefSeqGene.cs b/GeUtilities/Intervals/Model/RefSeqGene.cs index 3934541..286daf2 100644 --- a/GeUtilities/Intervals/Model/RefSeqGene.cs +++ b/GeUtilities/Intervals/Model/RefSeqGene.cs @@ -3,13 +3,14 @@ // See the LICENSE file in the project root for more information. using Genometric.GeUtilities.IGenomics; +using Genometric.GeUtilities.Intervals.Functions; namespace Genometric.GeUtilities.Intervals.Model { public class RefSeqGene : Interval, IRefSeqGene { public RefSeqGene(int left, int right, string refSeqID, string geneSymbol, string hashSeed = "") : - base(left, right, refSeqID + geneSymbol + hashSeed) + base(left, right, HashFunctions.GetHashSeed(refSeqID, geneSymbol, hashSeed)) { RefSeqID = refSeqID; GeneSymbol = geneSymbol; diff --git a/GeUtilities/Intervals/Model/Variant.cs b/GeUtilities/Intervals/Model/Variant.cs index e83ce32..3944ded 100644 --- a/GeUtilities/Intervals/Model/Variant.cs +++ b/GeUtilities/Intervals/Model/Variant.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using Genometric.GeUtilities.IGenomics; +using Genometric.GeUtilities.Intervals.Functions; namespace Genometric.GeUtilities.Intervals.Model { @@ -10,8 +11,8 @@ public class Variant : Interval, IVariant { public Variant(int left, int right, string id, Base[] refBase, Base[] altBase, double quality, string filter, string info, string hashSeed = "") : - base(left, right, id + (refBase == null ? "" : refBase.ToString()) - + (altBase == null ? "" : altBase.ToString()) + quality.ToString() + filter + info + hashSeed) + base(left, right, HashFunctions.GetHashSeed(id, (refBase == null ? "" : refBase.ToString()), + (altBase == null ? "" : altBase.ToString()), quality.ToString(), filter, info, hashSeed)) { ID = id; RefBase = refBase; diff --git a/GeUtilities/Intervals/Parsers/Parser.cs b/GeUtilities/Intervals/Parsers/Parser.cs index d8d90fd..3e740ba 100644 --- a/GeUtilities/Intervals/Parsers/Parser.cs +++ b/GeUtilities/Intervals/Parsers/Parser.cs @@ -262,7 +262,7 @@ private void Parse() continue; } - I readingInterval = BuildInterval(left, right, splittedLine, lineCounter, _data.FileHashKey + lineCounter.ToString()); + I readingInterval = BuildInterval(left, right, splittedLine, lineCounter, HashFunctions.GetHashSeed(_data.FileHashKey.ToString(), lineCounter.ToString())); if (DropReadingPeak) continue; From 9a2afb4f4f3ad9d6666eef163e1034636f5d7d67 Mon Sep 17 00:00:00 2001 From: vjalili Date: Mon, 18 May 2020 20:28:19 -0700 Subject: [PATCH 2/2] Record if an interval is dropped. --- GeUtilities/Intervals/Genome/Chromosome.cs | 10 +++++++++- .../Intervals/Parsers/Model/ParsedIntervals.cs | 13 ++++++++++--- GeUtilities/Intervals/Parsers/Parser.cs | 9 +++++++-- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/GeUtilities/Intervals/Genome/Chromosome.cs b/GeUtilities/Intervals/Genome/Chromosome.cs index cb0eaa9..69d1999 100644 --- a/GeUtilities/Intervals/Genome/Chromosome.cs +++ b/GeUtilities/Intervals/Genome/Chromosome.cs @@ -20,12 +20,20 @@ public Chromosome() Strands = new Dictionary>(); } - public void Add(I interval, char strand) + public bool TryAdd(I interval, char strand) { if (!Strands.ContainsKey(strand)) Strands.Add(strand, new Strand()); + if (Strands[strand].TryAdd(interval)) + { Statistics.Update(interval); + return true; + } + else + { + return false; + } } } } diff --git a/GeUtilities/Intervals/Parsers/Model/ParsedIntervals.cs b/GeUtilities/Intervals/Parsers/Model/ParsedIntervals.cs index 41df14d..15b4f17 100644 --- a/GeUtilities/Intervals/Parsers/Model/ParsedIntervals.cs +++ b/GeUtilities/Intervals/Parsers/Model/ParsedIntervals.cs @@ -29,12 +29,19 @@ protected ParsedIntervals() Statistics = new S(); } - public void Add(I interval, string chr, char strand) + public bool TryAdd(I interval, string chr, char strand) { if (!Chromosomes.ContainsKey(chr)) Chromosomes.Add(chr, new Chromosome()); - Chromosomes[chr].Add(interval, strand); - Statistics.Update(interval); + if (Chromosomes[chr].TryAdd(interval, strand)) + { + Statistics.Update(interval); + return true; + } + else + { + return false; + } } } } diff --git a/GeUtilities/Intervals/Parsers/Parser.cs b/GeUtilities/Intervals/Parsers/Parser.cs index 3e740ba..fc76391 100644 --- a/GeUtilities/Intervals/Parsers/Parser.cs +++ b/GeUtilities/Intervals/Parsers/Parser.cs @@ -291,8 +291,13 @@ private void Parse() (char.TryParse(splittedLine[_strandColumn], out strand) && strand != '+' && strand != '-' && strand != UnspecifiedStrandChar)) strand = UnspecifiedStrandChar; - _data.Add(readingInterval, chrName, strand); - _data.IntervalsCount++; + if (_data.TryAdd(readingInterval, chrName, strand)) + _data.IntervalsCount++; + else + { + DropLine("\tLine " + lineCounter.ToString() + "\t:\tPossibly Hash key collision."); + continue; + } } } }