From fb2be20f0f1ae5dd94cc39bcb7744d6ce539be62 Mon Sep 17 00:00:00 2001 From: Luca Liechti Date: Fri, 13 Jan 2017 17:33:23 +0100 Subject: [PATCH] automatizing the merging --- bin/driver/Driver.class | Bin 3264 -> 3756 bytes src/datastructures/Lattice.java | 4 +-- src/driver/ContextCleanser.java | 42 ++++++++++++++++++++++---------- src/driver/Driver.java | 22 +++++++++++------ src/driver/LatticeBuilder.java | 1 + 5 files changed, 46 insertions(+), 23 deletions(-) diff --git a/bin/driver/Driver.class b/bin/driver/Driver.class index 6084b012003f60e37ff31960505e2a31c4ba43bf..2f2c23444608ad263734cd3059e0d1bb9eaceb82 100644 GIT binary patch delta 846 zcmZuv+e;Kt9RAMi<;>0urGlGl=FWqKR+@r{Znc#eA(byiC0V+=j<$zW*@F+I zY1UM0nPoTaVi&c>%a&;cfe`c-fkeoUCh4i$_pnt!1sy-Bq=>8&q&_7WtthJ?yxWFJ3H^^u~w}OktIqr7!qL1SegVXSF zr$xbKW1eS~i!f-k*Qe{jfTl)tpB_>DT623?^Ef2*;+lf%C}vPXNlI;;!O?Fl^3`G# zHx<}0U@UR(p(q9w+{PUSyB=&iu7w#a`2|&Rle-G;!Oma{gfw4R!~=%x<&nN-T??zN zTKJfzO31;mf(;m9$jPrZ&Is9MA|BI@JgSJN46H%KGY08|7N(-DS`&?5Fr+8-5jEJR z>g~EORAkhMzcOu+0PTiURkVV%su@a+Mw@eDpJN?kZbO%bC1Ds=C;&TX}zjuBQIf#t^M@*<5@QH$39X3Wv2!9zl~!G;yoK z5vLBh>?S%rJY{j8EHY&i4>KN-$#OFnkJVAk#wp|`Se-J96QBI2QI=yxbZ}`Pm+-I~K76KT*jzcCr$9 zSvhvG1E_{f`qRl1f**(Ji_Qx5gjfO{AtWTwON_w+Hz~)_K*{-R6DIJA?ui#1uVL1} zj+6L-NlXzsi*J~wI-KhCA%-^)sZS*Zi zAZDG@v?FRI+_AJ4enRrgp7PuE)ZFZM9_tKA4w)NL|4H+z6L5@gY^P1f9P+#`vwkc& z=ZUDBkz}n+nl20MbAhnb*GAnQ*SRK1%w&bM5#@yR5xwLUJ`yx&u`0$H9?|BtY;hig yH@qdtea`TX_d*YG+2ezBx{^Nf36p|&Y_!8;$!C!irE~&a getNodes() { } public void exportLatticeToFile(String outputFile){ - System.out.print("Writing lattice to file... "); +// System.out.print("Writing lattice to file... "); String latticeString = ""; latticeString += "digraph d{\n"; for(LatticeNode node : nodes) @@ -69,7 +69,7 @@ public void exportLatticeToFile(String outputFile){ latticeString += edge.getLowerNodeNumber() + "->" + edge.getUpperNodeNumber() + ";\n"; latticeString += "}"; writeToFile(latticeString, outputFile); - System.out.println("done."); +// System.out.println("done."); } private String peripheries(LatticeNode node) { diff --git a/src/driver/ContextCleanser.java b/src/driver/ContextCleanser.java index 2689f09..fcf2401 100644 --- a/src/driver/ContextCleanser.java +++ b/src/driver/ContextCleanser.java @@ -47,7 +47,7 @@ public void removeRareAttributes(int treshold) { } //changes the intents of objects that are very close to other objects. - //after running this, the lattice has to be recomputed :/ + //after running this, the lattice has to be recomputed public void mergeNodes(int factor, int attrDiff, int percent) { System.out.println("---BEGIN CLEANSING---"); System.out.println("Merging all nodes with their biggest neighbours if they\n" @@ -103,35 +103,45 @@ private Boolean isMergeCandidateFor(LatticeNode mergeCandidate, LatticeNode smal && mergeCandidate.getIntent().cardinality() <= (smallNode.getIntent().cardinality() + attributeDifference)); } - public void tinker() { - System.out.println("tinkering..."); + public double tinker() { +// System.out.println("tinkering..."); HashMap> latticeLevelNodes = lattice.nodesByLevel(); int[] levelArray = lattice.levelArray(); double highScore = 0.0; String highScoreMerge = ""; + LatticeNode firstNode = null; + LatticeNode secondNode = null; for(int i = 0; i < levelArray.length; i++) { ArrayList thisLevelNodes = latticeLevelNodes.get(levelArray[i]); for(LatticeNode node : thisLevelNodes) { for(LatticeNode upper : node.upperNeighbours()) { - if(editDistance(node, upper) > highScore) { - highScore = editDistance(node, upper); - highScoreMerge = node.getIntent() + " -> " + upper.getIntent() + - " (up, score = " + new DecimalFormat("#.##").format(editDistance(node, upper)) + ")"; + if(mergeScore(node, upper) > highScore) { + highScore = mergeScore(node, upper); + highScoreMerge = "merged " /*+ node.getIntent() + " -> " + upper.getIntent()*/ + + " (up, score = " + new DecimalFormat("#.##").format(mergeScore(node, upper)) + ")"; + firstNode = node; + secondNode = upper; } } for(LatticeNode lower : node.lowerNeighbours()) { - if(editDistance(node, lower) > highScore) { - highScore = editDistance(node, lower); - highScoreMerge = node.getIntent() + " -> " + lower.getIntent() + - " (down, score = " + new DecimalFormat("#.##").format(editDistance(node, lower)) + ")"; + if(mergeScore(node, lower) > highScore) { + highScore = mergeScore(node, lower); + highScoreMerge = "merged " /*+ node.getIntent() + " -> " + lower.getIntent()*/ + + " (down, score = " + new DecimalFormat("#.##").format(mergeScore(node, lower)) + ")"; + firstNode = node; + secondNode = lower; } } } } - System.out.println(highScoreMerge); + if(highScore > 0.0) { + mergeInto(firstNode, secondNode); + System.out.println(highScoreMerge); + } + return highScore; } - private double editDistance(LatticeNode node, LatticeNode candidate) { + private double mergeScore(LatticeNode node, LatticeNode candidate) { if(!node.hasOwnObjects() || !candidate.hasOwnObjects() || candidate.numberOfOwnObjects() <= node.numberOfOwnObjects()) return 0.0; double ownObjectRatio = candidate.numberOfOwnObjects()/(double)node.numberOfOwnObjects(); @@ -140,4 +150,10 @@ private double editDistance(LatticeNode node, LatticeNode candidate) { return 2*(ownObjectRatio/percentOfObjects); return ownObjectRatio/percentOfObjects; } + + private void mergeInto(LatticeNode firstNode, LatticeNode secondNode) { + BitSet mergedIntent = (BitSet)secondNode.getIntent().clone(); + for(FormalObject obj : firstNode.ownObjects()) + obj.setIntent(mergedIntent); + } } diff --git a/src/driver/Driver.java b/src/driver/Driver.java index 5fdc2a7..9c9d699 100644 --- a/src/driver/Driver.java +++ b/src/driver/Driver.java @@ -20,7 +20,7 @@ public static void main(String[] args){ // docs.add(repoFolder + "XML\\mondial.xml"); // docs.add(repoFolder + "XML\\SigmodRecord.xml"); // docs.add(repoFolder + "XML\\ebay.xml"); - docs.add(repoFolder + "XML\\DBLP\\1000Lattice.xml"); +// docs.add(repoFolder + "XML\\DBLP\\1000Lattice.xml"); // docs.add(repoFolder + "XML\\DBLP\\316NoSql.xml"); // docs.add(repoFolder + "XML\\DBLP\\1000FCA.xml"); // docs.add(repoFolder + "XML\\DBLP\\1000Schema.xml"); @@ -28,7 +28,7 @@ public static void main(String[] args){ //add BibTex repos // docs.add(repoFolder + "BibTex\\BordatTest.bib"); // docs.add(repoFolder + "BibTex\\scg.bib"); -// docs.add(repoFolder + "BibTex\\listb.bib"); + docs.add(repoFolder + "BibTex\\listb.bib"); // docs.add(repoFolder + "BibTex\\zbMATH\\100Lattice.bib"); // docs.add(repoFolder + "BibTex\\zbMATH\\100Schema.bib"); // docs.add(repoFolder + "BibTex\\zbMATH\\100Algebra.bib"); @@ -53,16 +53,22 @@ private static void parseDocument(String doc, String outputFolder, String graphv LatticeBuilder lb = new LatticeBuilder(fc); Lattice lattice = lb.buildLattice(); - lattice.exportLatticeToFile(graphvizFolder + parser.getTargetLatticeFilename(doc)); + lattice.exportLatticeToFile(graphvizFolder + "0_" + parser.getTargetLatticeFilename(doc)); + System.out.println("Lattice stats before:\t" + lattice.latticeStats()); ContextCleanser cc = new ContextCleanser(fc, lattice); - cc.tinker(); + double score = 1d; + int i = 1; + while(score > 0d) { + score = cc.tinker(); + lattice.clear(); + lattice = lb.buildLattice(); + System.out.println("Lattice stats after merge " + i + ": " + lattice.latticeStats()); + lattice.exportLatticeToFile(graphvizFolder + (i++) + "_" + parser.getTargetLatticeFilename(doc)); + } + System.out.println("performed " + (i-2) + " merges in total."); // cc.mergeNodes(10, 1, 5); // cc.removeRareAttributes(2); -// LatticeBuilder lb2 = new LatticeBuilder(fc); -// Lattice lattice2 = lb2.buildLattice(); -// System.out.println("Lattice stats after:\t" + lattice.latticeStats()); // System.out.println("---END CLEANSING---"); -// lattice.exportLatticeToFile(graphvizFolder + "edit_" + parser.getTargetLatticeFilename(doc)); } } \ No newline at end of file diff --git a/src/driver/LatticeBuilder.java b/src/driver/LatticeBuilder.java index 19d89fb..6eab75c 100644 --- a/src/driver/LatticeBuilder.java +++ b/src/driver/LatticeBuilder.java @@ -29,6 +29,7 @@ public Lattice buildLattice(){ computeExtents(); lattice.computeEdges(); lattice.computeAttributes(); + alreadyAddedObjects.clear(); return lattice; }