From 92b9bbd8429df25e9c024c0f680779616aace57a Mon Sep 17 00:00:00 2001 From: Luca Liechti Date: Mon, 23 Jan 2017 23:31:01 +0100 Subject: [PATCH] calculating NULLs and legacy values --- bin/driver/Driver.class | Bin 4190 -> 4333 bytes src/datastructures/Lattice.java | 95 +++++++++++++++++++++++++++++++- src/driver/ContextCleanser.java | 13 +++-- src/driver/Driver.java | 14 ++--- src/driver/LatticeBuilder.java | 1 + 5 files changed, 107 insertions(+), 16 deletions(-) diff --git a/bin/driver/Driver.class b/bin/driver/Driver.class index 3c7949a4e45f73de72a06f1025384b1e2dd1dea6..7efbb81b08476c5863b1d15691cc2a3f50b4593d 100644 GIT binary patch delta 1960 zcmZ`&YgANa6y4vv?!7Ztu`$2_jKQ=(d73L_Op=EPIy?+Y5fbmf1xJS&x^odUbuy=78hoBwfT6=U`V_nQq#0dZj*+CE`~8h zUQeyu)!_36RZq6Z>)~-9LrU~^wWp%6woFHKORDN^lJyKL5>O!Ge#~azy{fE8YCzG( z8k3zxDB)1dU|%k+lyW*$uP>)O8W%AKrCOzNf~8DEIp%3Ajk7Fn5f!M^28>NJicl?L z0UluBmPx@%xl3g*re`b`P>Y8+JV;C(sKo)lN+Z?=N~tY}r?fF#lC2xhh%n<>hBPf? zNz~k&$amv85zpfVZ7G+1+e;)#uGFZr)I_{Y;5)hT6MC?f!xn~wW=WNTs?yP=f|^av4P5@VaYM zNeyz?rB(6CwqCrY^V`jkAK|x&pIF|5y(0F(!eDOl$&$k3ZQ9_}DG@{-@92y@jaKyH zposVIK7&Q|`d7$GWVx>B zpI(yTzV3pOK!?XC&(twaiue$xv~Ho?_OXa5_=GH+UaqY%Bqi}UL#T~BK4q}B%SuZ? zX_K3sJkA}7vxb?ln?~a49bM8c+Nwf>NU59Bk30G>(LOzlj4-mo$XnKFy{7Sz8?H{9H;MD8uTJEVWaFaaQX~b3W>f(T!05nAhmH0&2%zA zpnRE5M%ZzJ@LH))GLB*yyrjTEjK*?`TCjrZMXDUm@`Wg^jjo(93}F{|4t@@)v0nlQ z2X!t2eXROhra~fO%pgfcfSsYibmi;;5_ED*o%*cF19-#$7pdA=*N0Uuv%?gXE8&`4 zt4S`iBSc!QxhB+Ahk3*zbeP8sh{0|gM0cIT+=r)ip~KkZ;vAfwo*7-}iek+|c8(OY()AOasssf)Mu^|U(;zFn~QIVmLB;F-{E`k zMCt&3z>g4!+CKb*pQ(M!;lsd}qXsX7c?eT=gI{yE!hsClNq}T<`;cB7i%z1XXqZDc WhaL{SM&kcBYBT5!{6a$+@auoTy4A!0 delta 1800 zcmZuxYj6`)6#j1WxVxJLG|)n9YbpxUKp((3LIYF^r4%S_u|+JPB~7;_kS6VBDFuXB zz_&p4swfpiL;+E;plP9k_=ch)zR?-+kN$AR@vj4;^=uvpBQr_%=HByt=R4W1mlwZ?aLuUpWYPF7K5r1<*L!d!H2BxSef>-!0W9`#W%rRYTw~Vyh zi5cRcWwd>!idiTX=PlFiWh&;NT#T^#r%b~<74uQSz*qPpm0E|vV9E6?kx_|ijw*&5 zR{Gj~g+X7axv*w!#L!w)EW};nur;TkmO(BF26Gz2A+12c-MEKiF~jgUA`mXD)&n7< z#?Un%f$EO_Z5_hgOC_vi7*g%iBY{wJPE#PLiYQxQh1D6xC3IzyJ& zqhvW0v_a-r&5%j|)5S%_u9$mkRCHjiIBTCM1+iYm{n#L^yv*K)sEP+fHNVNai6LG5 zsiaeUC$^}t;X$#LkCi&{u!=|UsOaYfH$F~fPw>^7O17%lMl6?jw!I5aaXcxqrA)WL zSdyo8w1#z~!eM1IWKi2@1vOtNqUj2{vD*~+B14hr;%@Pmlojp9ODgulPB2|- z3AbxC)V!c(ghN#gD>c6nF>~!x@roH@^AndYVLuIZ()hR(2?xyr8|qE^Ln>a!8w_?M z5L%^Ci)rAc3fwrXVl>{OcID3XR9rQ7L$u0#4Kk@?7~n72|Qr9O`m;geXvyQ6CU%y@Zb#lvYh|3hOOe zV~&JRj#eu%CRb7;By#;0F-)y0iD9yr??<+aAH!6JQ*e9jE_+_S%jUAjP|}Cl=E+=! zor4#TV1ctHhDAMuwnCxbNl?Y_YG%}p1@OR+abzD4){~&k$b*9-%E8TCSc0YWw2)nv z;a*Uquo3ssDh)El*VD=ZCv+@_kGL}tKm+mF=sOgRbZUo2c$JWes3wYPrkxz54I;W2 z62}S-cj}YD;iAmbKo7;dnFJ&umPu5)3a|mHr5{;!&Ok428yABohJ30lf7~&wN`u$h zkG8sAM7%baHQAwzD;+YZL-Fpkr&<#o{LN)E>BjfrW+F6c)}_%f6SN!q(OKuR^lgNm5Gav&8G-F+7)IUDS_mVs-IeM0a^5mlVSu zGvMW9iWKnG{{gQ}e&9OEhZV`zI~@D5$z*-4NWPkPlst6DZWJf!(;aS<|BGG5VtFfuS~n9#aEx=G#S@T$TM-{=CZvQz}YA{x0np~ z89pcHu3TL4;He8^h((vQI^5CMq diff --git a/src/datastructures/Lattice.java b/src/datastructures/Lattice.java index fc5edc8..111c965 100644 --- a/src/datastructures/Lattice.java +++ b/src/datastructures/Lattice.java @@ -20,6 +20,7 @@ public class Lattice { private int currentNodeNumber; private Dictionary dic; private BitSet lastMergedInto; //used to keep track of which node has last been merged into in the tinker algorithm + private HashMap> bookkeeping; //used to calculate the number of NULLs and legacy values public Lattice(Dictionary _dic) { this.nodes = new ArrayList(); @@ -28,6 +29,7 @@ public Lattice(Dictionary _dic) { this.dic = _dic; this.nodesByLevel = new HashMap>(); this.lastMergedInto = null; + this.bookkeeping = null; } public void clear() { @@ -40,7 +42,8 @@ public void clear() { public String latticeStats() { // return "Nodes: " + nodes.size() + "\twith own objects: " + nodesWithOwnObjects() + "\tedges: " + edges.size() // + "\tclusterIndex: " + String.format("%.3f", clusterIndex()) + "\tcleanliness: " + String.format("%.1f", cleanliness()) + "%"; - return nodes.size() + "\t" + nodesWithOwnObjects() + "\t" + edges.size() + "\t" + String.format("%.3f", clusterIndex()) + "\t" + String.format("%.1f", cleanliness()); + return nodes.size() + "\t" + nodesWithOwnObjects() + "\t" + edges.size() + "\t" + String.format("%.3f", clusterIndex()) + "\t" + String.format("%.3f", cleanliness()) + + "\t" + String.format("%.3f", nullPercentage()) + "\t" + String.format("%.3f", legacyPercentage()); } public void addNode(LatticeNode node) { @@ -65,8 +68,8 @@ public void exportLatticeToFile(String outputFile){ latticeString += "digraph d{\n"; for(LatticeNode node : nodes) latticeString += node.getNodeNumber() - + " [label=\"" + node.getNiceAttributes() + node.getIntent() - + "\next.: " + node.numberOfObjects() + " (" + node.typesOfExtent() + ") " + + " [label=\"" + node.getNiceAttributes() //+ node.getIntent() ---------------------excluding intent for the moment + + "ext.: " + node.numberOfObjects() + " (" + node.typesOfExtent() + ") " + "\nown: " + node.numberOfOwnObjects() + " (" + node.typesOfOwnObjects() + ") " // + "\n merges into : " + node.mergesInto() + "\"" + peripheries(node) + color(node) + "]\n"; @@ -306,4 +309,90 @@ public double cleanliness() { } return ((double)majority/(double)total)*100; } + + public void initialiseBookkeeping() { + if(bookkeeping == null){ + bookkeeping = new HashMap>(); + for(LatticeNode node : nodes){ + if(node.hasOwnObjects()){ + for(FormalObject ownObject : node.ownObjects()){ + if(bookkeeping.containsKey(ownObject.getIntent().hashCode())) + bookkeeping.get(ownObject.getIntent().hashCode()).add(ownObject); + else { + ArrayList newList = new ArrayList(); + newList.add(ownObject); + bookkeeping.put(ownObject.getIntent().hashCode(), newList); + } + } + } + } + } + } + + //when all objects of one node (the mergee) are merged into another node (the merger), + //we keep track of this in the bookkeeping datastructure used to calculate NULLs and legacies + public void updateBookkeeping(LatticeNode mergee, LatticeNode merger) { + int mergeeHash = mergee.getIntent().hashCode(); + int mergerHash = merger.getIntent().hashCode(); + ArrayList mergedObjects = bookkeeping.get(mergeeHash); + for(FormalObject obj : mergedObjects){ + FormalObject copy = new FormalObject(); + copy.setIntent((BitSet)obj.getIntent().clone()); + bookkeeping.get(mergerHash).add(copy); + } + bookkeeping.remove(mergeeHash); + } + + private int totalCardinality() { + int card = 0; + for(LatticeNode node : nodes) + card += node.getIntent().cardinality()*node.numberOfOwnObjects(); + return card; + } + + private int nulls() { + int nulls = 0; + for(int hash : bookkeeping.keySet()){ + ArrayList nodeObjects = bookkeeping.get(hash); + BitSet archetype = findArchetype(hash, nodeObjects); + for(FormalObject comp : nodeObjects){ + BitSet nullSet = (BitSet)archetype.clone(); + nullSet.xor(comp.getIntent()); + nullSet.and(archetype); + nulls += nullSet.cardinality(); + } + } + return nulls; + } + + private int legacies() { + int legacies = 0; + for(int hash : bookkeeping.keySet()){ + ArrayList nodeObjects = bookkeeping.get(hash); + BitSet archetype = findArchetype(hash, nodeObjects); + for(FormalObject comp : nodeObjects){ + BitSet legSet = (BitSet)archetype.clone(); + legSet.xor(comp.getIntent()); + legSet.and(comp.getIntent()); + legacies += legSet.cardinality(); + } + } + return legacies; + } + + private BitSet findArchetype(int hash, ArrayList objectArray) { + for(FormalObject obj : objectArray){ + if(obj.getIntent().hashCode() == hash) + return (BitSet)obj.getIntent().clone(); + } + return null; + } + + private double nullPercentage() { + return (double)nulls()/(double)totalCardinality()*100; + } + + private double legacyPercentage() { + return (double)legacies()/(double)totalCardinality()*100; + } } diff --git a/src/driver/ContextCleanser.java b/src/driver/ContextCleanser.java index a2daa81..b8a6472 100644 --- a/src/driver/ContextCleanser.java +++ b/src/driver/ContextCleanser.java @@ -58,13 +58,13 @@ public double tinker() { mergeCandidates.addAll(node.upperNeighbours()); mergeCandidates.addAll(node.lowerNeighbours()); //nodes from same level with at least one shared parent - for(LatticeNode parent : node.upperNeighbours()){ - for(LatticeNode child : parent.lowerNeighbours()){ - if(child != node) mergeCandidates.add(child); - } - } +// for(LatticeNode parent : node.upperNeighbours()){ +// for(LatticeNode child : parent.lowerNeighbours()){ +// if(child != node) mergeCandidates.add(child); +// } +// } for(LatticeNode candidate : mergeCandidates) { - if(mergeScore(node, candidate) > highScore) { + if(mergeScore(node, candidate) >= highScore) { highScore = mergeScore(node, candidate); firstNode = node; secondNode = candidate; @@ -88,6 +88,7 @@ private double mergeScore(LatticeNode node, LatticeNode candidate) { private void mergeInto(LatticeNode firstNode, LatticeNode secondNode) { BitSet mergedIntent = (BitSet)secondNode.getIntent().clone(); + lattice.updateBookkeeping(firstNode, secondNode); for(FormalObject obj : firstNode.ownObjects()) obj.setIntent(mergedIntent); lattice.setLastMergedInto((BitSet)secondNode.getIntent().clone()); diff --git a/src/driver/Driver.java b/src/driver/Driver.java index b93007b..a57a660 100644 --- a/src/driver/Driver.java +++ b/src/driver/Driver.java @@ -22,15 +22,15 @@ public static void main(String[] args){ // docs.add(repoFolder + "XML\\ebay.xml"); docs.add(repoFolder + "XML\\DBLP\\1000Lattice.xml"); // // docs.add(repoFolder + "XML\\DBLP\\316NoSql.xml"); -// docs.add(repoFolder + "XML\\DBLP\\1000FCA.xml"); -// docs.add(repoFolder + "XML\\DBLP\\1000Schema.xml"); + docs.add(repoFolder + "XML\\DBLP\\1000FCA.xml"); + docs.add(repoFolder + "XML\\DBLP\\1000Schema.xml"); //add BibTex repos // docs.add(repoFolder + "BibTex\\BordatTest.bib"); // docs.add(repoFolder + "BibTex\\BordatTest3.bib"); - docs.add(repoFolder + "BibTex\\scg.bib"); -// docs.add(repoFolder + "BibTex\\listb.bib"); // -// docs.add(repoFolder + "BibTex\\zbMATH\\100Lattice.bib"); // +// docs.add(repoFolder + "BibTex\\scg.bib"); + docs.add(repoFolder + "BibTex\\listb.bib"); // + docs.add(repoFolder + "BibTex\\zbMATH\\100Lattice.bib"); // // docs.add(repoFolder + "BibTex\\zbMATH\\100Schema.bib"); // docs.add(repoFolder + "BibTex\\zbMATH\\100Algebra.bib"); // docs.add(repoFolder + "BibTex\\zbMATH\\100Groups.bib"); @@ -56,10 +56,10 @@ private static void parseDocument(String doc, String outputFolder, String graphv Lattice lattice = lb.buildLattice(); lattice.exportLatticeToFile(graphvizFolder + "0a_" + parser.getTargetLatticeFilename(doc)); - System.out.println("Nr\tScore\tNodes\tWithOwn\tedges\tindex\tclean"); + System.out.println("Nr\tScore\tNodes\tWithOwn\tedges\tindex\tclean\tnull\tleg"); System.out.println("orig\t---" + "\t" + lattice.latticeStats()); ContextCleanser cc = new ContextCleanser(fc, lattice); - cc.removeSingletonObjects(); +// cc.removeSingletonObjects(); // cc.removeRareAttributes(0); lattice.clear(); lattice = lb.buildLattice(); diff --git a/src/driver/LatticeBuilder.java b/src/driver/LatticeBuilder.java index 6eab75c..4ed9599 100644 --- a/src/driver/LatticeBuilder.java +++ b/src/driver/LatticeBuilder.java @@ -29,6 +29,7 @@ public Lattice buildLattice(){ computeExtents(); lattice.computeEdges(); lattice.computeAttributes(); + lattice.initialiseBookkeeping(); alreadyAddedObjects.clear(); return lattice; }