diff --git a/bin/datastructures/FormalContext.class b/bin/datastructures/FormalContext.class index 4ab36b0..1824ac4 100644 Binary files a/bin/datastructures/FormalContext.class and b/bin/datastructures/FormalContext.class differ diff --git a/bin/driver/Driver.class b/bin/driver/Driver.class index 500267a..fbecbcf 100644 Binary files a/bin/driver/Driver.class and b/bin/driver/Driver.class differ diff --git a/bin/parsers/BibTexParser.class b/bin/parsers/BibTexParser.class index cc201a9..844394c 100644 Binary files a/bin/parsers/BibTexParser.class and b/bin/parsers/BibTexParser.class differ diff --git a/bin/parsers/JSONParser.class b/bin/parsers/JSONParser.class index cd93a7d..c7cd900 100644 Binary files a/bin/parsers/JSONParser.class and b/bin/parsers/JSONParser.class differ diff --git a/bin/parsers/NoSQLParser.class b/bin/parsers/NoSQLParser.class index 9b25ee1..9e1a7ed 100644 Binary files a/bin/parsers/NoSQLParser.class and b/bin/parsers/NoSQLParser.class differ diff --git a/bin/parsers/XMLParser.class b/bin/parsers/XMLParser.class index fb8c373..d07b033 100644 Binary files a/bin/parsers/XMLParser.class and b/bin/parsers/XMLParser.class differ diff --git a/src/datastructures/FormalContext.java b/src/datastructures/FormalContext.java index 3c7b990..c0ae329 100644 --- a/src/datastructures/FormalContext.java +++ b/src/datastructures/FormalContext.java @@ -24,7 +24,7 @@ public FormalContext() { } //the BitSet is created at the time of the object being added to the context - public void addObject(FormalObject object){ + public void createAndAddObject(FormalObject object){ BitSet intent = new BitSet(); for(String attribute : object.getAttributes()){ if(!dic.containsAttribute(attribute)) @@ -36,6 +36,10 @@ public void addObject(FormalObject object){ objects.add(object); } + public void addObject(FormalObject obj) { + objects.add(obj); + } + private void countAttribute(String attribute) { if(!attributeSupport.containsKey(attribute)) attributeSupport.put(attribute, 1); diff --git a/src/datastructures/Lattice.java b/src/datastructures/Lattice.java index cf237e3..5fba0cb 100644 --- a/src/datastructures/Lattice.java +++ b/src/datastructures/Lattice.java @@ -18,6 +18,7 @@ public class Lattice { private ArrayList nodes; private ArrayList edges; private HashMap> nodesByLevel; + private FormalContext context; private int currentNodeNumber; private Dictionary dic; private BitSet lastMergedInto; //used to keep track of which node has last been merged into in the tinker algorithm @@ -26,9 +27,10 @@ public class Lattice { private ContextCleanser cc; private long time; - public Lattice(Dictionary _dic) { + public Lattice(Dictionary _dic, FormalContext _context) { this.nodes = new ArrayList(); this.edges = new ArrayList(); + this.context = _context; this.currentNodeNumber = 0; this.dic = _dic; this.nodesByLevel = new HashMap>(); @@ -49,8 +51,9 @@ public void clear() { public String latticeStats() { // return "Nodes: " + nodes.size() + "\twith own objects: " + nodesWithOwnObjects() + "\tedges: " + edges.size() // + "\tclusterIndex: " + String.format("%.3f", clusterIndex()) + "\tcleanliness: " + String.format("%.1f", cleanliness()) + "%"; - return numberOfAttributes() + "\t" + nodes.size() + "\t" + nodesWithOwnObjects() + "\t" + edges.size() + "\t" + String.format("%.3f", clusterIndex()) + "\t" + String.format("%.1f", cleanliness()) - + "\t" + String.format("%.1f", nullPercentage()) + "\t" + String.format("%.1f", legacyPercentage()) + "\t" + time; + return context.getObjects().size() + "\t" + types() + "\t" + numberOfAttributes() + "\t" + nodes.size() + "\t" + nodesWithOwnObjects() + + "\t" + edges.size() + "\t" + String.format("%.3f", clusterIndex()) + "\t" + String.format("%.1f", inMajority()) + + "\t" + String.format("%.1f", inCleanNodes()) + "\t" + String.format("%.1f", nullPercentage()) + "\t" + String.format("%.1f", legacyPercentage()) + "\t" + time; } private int numberOfAttributes() { @@ -315,7 +318,7 @@ public void setLastMergedInto(BitSet intent) { this.lastMergedInto = intent; } - public double cleanliness() { + public double inMajority() { int majority = 0; int total = 0; for(LatticeNode node : nodes) { @@ -327,6 +330,19 @@ public double cleanliness() { return ((double)majority/(double)total)*100; } + public double inCleanNodes() { + int inClean = 0; + int total = 0; + for(LatticeNode node : nodes) { + if(node.hasOwnObjects()){ + if(node.typesOfFormalObjects(node.ownObjects()).substring(0,4).equals("100%")) + inClean += node.numberOfOwnObjects(); + total += node.numberOfOwnObjects(); + } + } + return ((double)inClean/(double)total)*100; + } + public Boolean bookkeepingIsNull() { return bookkeeping == null; } @@ -436,6 +452,7 @@ public void retrofitSingletons() { bestFit.addToOwnObjects(single); //update the bookkeeping datastructure, ie. add the formalObject to the hash of the closest node. Do not re-compute anything. bookkeeping.get(cc.bitsetHash(bestFit.getIntent())).add(single); + context.addObject(single); } } @@ -459,6 +476,13 @@ private LatticeNode findBestNodeFit(FormalObject single) { // System.out.println("Retrofitting " + single.getIntent() + " into " + bestFit.getIntent() + " (score = " + bestFitScore + ", own = " + bestFitOwnObjects + ")"); return bestFit; } + + public int types() { + HashSet types = new HashSet(); + for(FormalObject obj : context.getObjects()) + types.add(obj.getName()); + return types.size(); + } public void setTime(long timeElapsed) { this.time = timeElapsed; diff --git a/src/datastructures/LatticeNode.java b/src/datastructures/LatticeNode.java index a06ee7e..332996a 100644 --- a/src/datastructures/LatticeNode.java +++ b/src/datastructures/LatticeNode.java @@ -167,7 +167,7 @@ public int majority() { return counts.get(majorityType); } - private String typesOfFormalObjects(HashSet set) { + protected String typesOfFormalObjects(HashSet set) { if(set.size() > 0) { HashMap counts = countObjectTypes(set); if(counts.keySet().size() == 1) diff --git a/src/driver/Driver.java b/src/driver/Driver.java index f68d854..9e76bb1 100644 --- a/src/driver/Driver.java +++ b/src/driver/Driver.java @@ -11,69 +11,64 @@ public class Driver { public static void main(String[] args){ - String folder = "C:\\Users\\Luca Liechti\\Desktop\\IESL"; + String folder200 = "C:\\Users\\Luca Liechti\\Desktop\\IESL200"; + String folder2000 = "C:\\Users\\Luca Liechti\\Desktop\\IESL2000"; String repoFolder = "C:\\Users\\Luca Liechti\\Dropbox\\Uni\\!BSc\\NoSQL repos\\"; - String ieslFolder = "C:\\Users\\Luca Liechti\\Desktop\\IESL\\"; String outputFolder = "C:\\Users\\Luca Liechti\\Dropbox\\Uni\\!BSc\\context files\\"; String graphvizFolder = "C:\\Users\\Luca Liechti\\Dropbox\\Uni\\!BSc\\graphviz files\\"; - ArrayList docs = new ArrayList(); + ArrayList docs200 = new ArrayList(); + ArrayList docs2000 = new ArrayList(); ParserFactory factory = new ParserFactory(); //CONFIGURE HERE double mergeStop = 0d; - Boolean deleteRareAttributes = true; - Boolean retroFitSingletons = true; + Boolean deleteRareAttributes = false; + Boolean retroFitSingletons = false; // //add XML repos -// docs.add(repoFolder + "XML\\mondial.xml"); -// docs.add(repoFolder + "XML\\SigmodRecord.xml"); -// docs.add(repoFolder + "XML\\ebay.xml"); - docs.add(repoFolder + "XML\\DBLP\\1000Lattice.xml"); // - docs.add(repoFolder + "XML\\DBLP\\316NoSql.xml"); // - docs.add(repoFolder + "XML\\DBLP\\1000FCA.xml"); // - docs.add(repoFolder + "XML\\DBLP\\1000Schema.xml"); // - - //add IESL repos -// docs.add(ieslFolder + "gp-bibliography.bib"); -// docs.add(ieslFolder + "visinfo.zib.de#EVlib#Bibliography#EVL-1998.bib"); +// docs2000.add(repoFolder + "XML\\DBLP\\1000Lattice.xml"); +// docs2000.add(repoFolder + "XML\\DBLP\\316NoSql.xml"); +// docs2000.add(repoFolder + "XML\\DBLP\\1000FCA.xml"); +// docs2000.add(repoFolder + "XML\\DBLP\\1000Schema.xml"); //add BibTex repos -// docs.add(repoFolder + "BibTex\\BordatTest.bib"); -// docs.add(repoFolder + "BibTex\\Test2.bib"); - docs.add(repoFolder + "BibTex\\scg.bib"); // - docs.add(repoFolder + "BibTex\\listb.bib"); // - docs.add(repoFolder + "BibTex\\zbMATH\\100Lattice.bib"); // - docs.add(repoFolder + "BibTex\\zbMATH\\100Schema.bib"); // - docs.add(repoFolder + "BibTex\\zbMATH\\100Algebra.bib"); // - docs.add(repoFolder + "BibTex\\zbMATH\\100Groups.bib"); // +// docs200.add(repoFolder + "BibTex\\scg.bib"); +// docs200.add(repoFolder + "BibTex\\listb.bib"); +// docs2000.add(repoFolder + "BibTex\\zbMATH\\100Lattice.bib"); +// docs2000.add(repoFolder + "BibTex\\zbMATH\\100Schema.bib"); +// docs2000.add(repoFolder + "BibTex\\zbMATH\\100Algebra.bib"); +// docs2000.add(repoFolder + "BibTex\\zbMATH\\100Groups.bib"); // //add JSON repos - docs.add(repoFolder + "JSON\\SIRA\\alle.js"); // + docs2000.add(repoFolder + "JSON\\SIRA\\alle.js"); //PARSING SINGLE FILES - for(String doc : docs) - parseDocument(doc, outputFolder, graphvizFolder, factory.makeParser(doc), retroFitSingletons, deleteRareAttributes, mergeStop); + for(String doc : docs200) + parseDocument(doc, outputFolder, graphvizFolder, factory.makeParser(doc), retroFitSingletons, deleteRareAttributes, mergeStop, 200); + for(String doc : docs2000) + parseDocument(doc, outputFolder, graphvizFolder, factory.makeParser(doc), retroFitSingletons, deleteRareAttributes, mergeStop, 2000); //PARSING ALL FILES IN FOLDER - parseFolder(folder, outputFolder, graphvizFolder, factory, retroFitSingletons, deleteRareAttributes, mergeStop); +// parseFolder(folder200, outputFolder, graphvizFolder, factory, retroFitSingletons, deleteRareAttributes, mergeStop, 200); +// parseFolder(folder2000, outputFolder, graphvizFolder, factory, retroFitSingletons, deleteRareAttributes, mergeStop, 200); System.out.println("All done."); } - private static void parseDocument(String doc, String outputFolder, String graphvizFolder, NoSQLParser parser, Boolean retroFitSingletons, Boolean deleteRareAttributes, double mergeStop){ + private static void parseDocument(String doc, String outputFolder, String graphvizFolder, NoSQLParser parser, Boolean retroFitSingletons, Boolean deleteRareAttributes, double mergeStop, int obj){ System.out.println("Parsing file " + doc); - ArrayList importedContext = parser.parseFile(doc); + ArrayList importedContext = parser.parseFile(doc, obj); FormalContext fc = new FormalContext(); for(FormalObject object : importedContext) - fc.addObject(object); + fc.createAndAddObject(object); fc.exportContextToFile(outputFolder + parser.getTargetContextFilename(doc)); LatticeBuilder lb = new LatticeBuilder(fc); Lattice lattice = lb.buildLattice(); lattice.exportLatticeToFile(graphvizFolder + "0a_original_" + parser.getTargetLatticeFilename(doc)); - System.out.println("\nNr\tScore\tAttr\tNodes\tWithOwn\tedges\tindex\tclean\tnull\tleg\ttime"); - System.out.println("-----------------------------------------------------------------------------------"); + System.out.println("\nNr\tScore\tObjects\tTypes\tAttr\tNodes\tWithOwn\tedges\tindex\tmajor\tinClean\tnull\tleg\ttime"); + System.out.println("------------------------------------------------------------------------------------------------------------"); System.out.println("orig\t---" + "\t" + lattice.latticeStats()); ContextCleanser cc = new ContextCleanser(fc, lattice); @@ -102,11 +97,11 @@ private static void parseDocument(String doc, String outputFolder, String graphv while(score > mergeStop) { lattice.clear(); lattice = lb.buildLattice(); - System.out.println(i + "\t" + String.format("%.2f", score) + "\t" + lattice.latticeStats()); +// System.out.println(i + "\t" + String.format("%.2f", score) + "\t" + lattice.latticeStats()); lattice.exportLatticeToFile(graphvizFolder + (i++) + "_" + parser.getTargetLatticeFilename(doc)); score = cc.tinker(); } - System.out.println("final (" + (--i) + ")\t" + lattice.latticeStats()); + if(!retroFitSingletons) System.out.println("final (" + (--i) + ")\t" + lattice.latticeStats()); ///SINGLETONS PT. 2/// if(retroFitSingletons) { @@ -115,14 +110,14 @@ private static void parseDocument(String doc, String outputFolder, String graphv lattice.exportLatticeToFile(graphvizFolder + i + "_retroFit_" + parser.getTargetLatticeFilename(doc)); } - System.out.println("-----------------------------------------------------------------------------------\n\n"); + System.out.println("------------------------------------------------------------------------------------------------------------\n\n"); } - private static void parseFolder(String inFolder, String outFolder, String gvFolder, ParserFactory fac, Boolean retroFitSingletons, Boolean deleteRareAttributes, double mergeStop) { + private static void parseFolder(String inFolder, String outFolder, String gvFolder, ParserFactory fac, Boolean retroFitSingletons, Boolean deleteRareAttributes, double mergeStop, int obj) { File fold = new File(inFolder); assert(fold.isDirectory()); String[] inFiles = fold.list(); for(int i = 0; i < inFiles.length; i++) - parseDocument(inFolder + "\\" + inFiles[i], outFolder, gvFolder, fac.makeParser(inFiles[i]), retroFitSingletons, deleteRareAttributes, mergeStop); + parseDocument(inFolder + "\\" + inFiles[i], outFolder, gvFolder, fac.makeParser(inFiles[i]), retroFitSingletons, deleteRareAttributes, mergeStop, obj); } } \ No newline at end of file diff --git a/src/driver/LatticeBuilder.java b/src/driver/LatticeBuilder.java index 50b2c4b..6ad3ead 100644 --- a/src/driver/LatticeBuilder.java +++ b/src/driver/LatticeBuilder.java @@ -20,7 +20,7 @@ public class LatticeBuilder { public LatticeBuilder(FormalContext _context) { this.context = _context; - this.lattice = new Lattice(_context.getDictionary()); + this.lattice = new Lattice(_context.getDictionary(), _context); this.alreadyAddedObjects = new ArrayList(); this.maximalConcept = null; } @@ -30,10 +30,15 @@ public Lattice buildLattice(){ //Norris algorithm for(FormalObject g : context.getObjects()) add(g); lattice.setTime(timer.timeElapsed()); +// System.out.println("Added all objects in " + timer.timeElapsed() + " ms."); timer.reset(); addNodeWithAllAttributes(); +// System.out.println("Added node with all attributes in " + timer.timeElapsed() + " ms."); timer.reset(); computeExtents(); +// System.out.println("Computed extents in " + timer.timeElapsed() + " ms."); timer.reset(); lattice.computeEdges(); +// System.out.println("Computed edges in " + timer.timeElapsed() + " ms."); timer.reset(); lattice.computeAttributes(); +// System.out.println("Computed which attribute enters where in " + timer.timeElapsed() + " ms."); if(lattice.bookkeepingIsNull()) lattice.initialiseBookkeeping(); alreadyAddedObjects.clear(); return lattice; diff --git a/src/parsers/BibTexParser.java b/src/parsers/BibTexParser.java index 413dd54..b36362f 100644 --- a/src/parsers/BibTexParser.java +++ b/src/parsers/BibTexParser.java @@ -10,9 +10,9 @@ public class BibTexParser implements NoSQLParser { - public ArrayList parseFile(String file){ + public ArrayList parseFile(String file, int MAX_OBJECTS){ ArrayList splitObjects = splitFile(file); //split the input file - return createFormalObjects(splitObjects); //extract attributes from split objects + return createFormalObjects(splitObjects, MAX_OBJECTS); //extract attributes from split objects } private ArrayList splitFile(String file) { @@ -45,7 +45,7 @@ private ArrayList splitFile(String file) { return splitString; } - private ArrayList createFormalObjects(ArrayList splitObjects) { + private ArrayList createFormalObjects(ArrayList splitObjects, int MAX_OBJECTS) { ArrayList parsedObjects = new ArrayList(); System.out.print("Parsing objects to context... "); //extract the attributes from each object @@ -67,10 +67,11 @@ else if(lines[i].matches("@.*\\{.*")){ } currentObject.setAttributes(attributes); currentObject.setName(name); - if(++k <= 200) //Comment in/out to look at all/n objects + if(MAX_OBJECTS == 0 || ++k <= MAX_OBJECTS) parsedObjects.add(currentObject); } System.out.println("done."); + assert (parsedObjects.size() <= MAX_OBJECTS); return parsedObjects; } diff --git a/src/parsers/JSONParser.java b/src/parsers/JSONParser.java index e589592..586919e 100644 --- a/src/parsers/JSONParser.java +++ b/src/parsers/JSONParser.java @@ -17,9 +17,9 @@ public class JSONParser implements NoSQLParser { private String nameAttribute = "file"; @Override - public ArrayList parseFile(String file) { + public ArrayList parseFile(String file, int MAX_OBJECTS) { JSONArray jarray = extractJSONarray(file); - return(createFormalObjects(jarray)); + return(createFormalObjects(jarray, MAX_OBJECTS)); } private JSONArray extractJSONarray(String file) { @@ -34,11 +34,12 @@ private JSONArray extractJSONarray(String file) { return array; } - private ArrayList createFormalObjects(JSONArray jarray) { + private ArrayList createFormalObjects(JSONArray jarray, int MAX_OBJECTS) { ArrayList parsedObjects = new ArrayList(); System.out.print("Parsing objects to context... "); + if(MAX_OBJECTS == 0) MAX_OBJECTS = jarray.length(); //declare how many objects we want. If all, just parse the whole array try{ - for(int i = 0; i < jarray.length(); i++) { + for(int i = 0; i < MAX_OBJECTS; i++) { FormalObject formalObj = new FormalObject(); ArrayList formalAttr = new ArrayList(); JSONObject obj = jarray.getJSONObject(i); @@ -53,6 +54,7 @@ private ArrayList createFormalObjects(JSONArray jarray) { } catch(JSONException jsone) { jsone.printStackTrace(); } System.out.print("Done."); + assert (parsedObjects.size() <= MAX_OBJECTS); return parsedObjects; } diff --git a/src/parsers/NoSQLParser.java b/src/parsers/NoSQLParser.java index 59c00c7..47dfc83 100644 --- a/src/parsers/NoSQLParser.java +++ b/src/parsers/NoSQLParser.java @@ -6,7 +6,7 @@ public interface NoSQLParser { - public ArrayList parseFile(String file); + public ArrayList parseFile(String file, int nrObj); public String getTargetContextFilename(String doc); diff --git a/src/parsers/XMLParser.java b/src/parsers/XMLParser.java index 5fe0c6a..36749f5 100644 --- a/src/parsers/XMLParser.java +++ b/src/parsers/XMLParser.java @@ -19,14 +19,15 @@ public class XMLParser implements NoSQLParser { private String wantedObjects = "info"; private String nameAttribute = "type"; - public ArrayList parseFile(String file){ - ArrayList wantedElements = extractElements(file); //split the input file + public ArrayList parseFile(String file, int MAX_OBJECTS){ + ArrayList wantedElements = extractElements(file, MAX_OBJECTS); //split the input file return createFormalObjects(wantedElements); //extract attributes from split objects } - private ArrayList extractElements(String file) { + private ArrayList extractElements(String file, int MAX_OBJECTS) { ArrayList wantedElements = new ArrayList(); ElementFilter ef = new ElementFilter(); + int numberOfParsedObjects = 0; try { File inputFile = new File(file); SAXBuilder saxBuilder = new SAXBuilder(); @@ -35,11 +36,15 @@ private ArrayList extractElements(String file) { Iterator allElementsIterator = rootElement.getDescendants(ef); while(allElementsIterator.hasNext()) { Element currentElement = allElementsIterator.next(); - if(currentElement.getName().equals(wantedObjects)) wantedElements.add(currentElement); + if(currentElement.getName().equals(wantedObjects) && (MAX_OBJECTS == 0 || MAX_OBJECTS < numberOfParsedObjects)) { + wantedElements.add(currentElement); + numberOfParsedObjects++; + } } } catch (JDOMException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } + assert ((MAX_OBJECTS == 0 || numberOfParsedObjects <= MAX_OBJECTS) && wantedElements.size() <= MAX_OBJECTS); return wantedElements; }