package zingg.common.core.block.blockingTree;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import zingg.common.client.FieldDefinition;
import zingg.common.client.ZinggClientException;
import zingg.common.core.block.Block;
import zingg.common.core.block.Canopy;
import zingg.common.core.block.Tree;

import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Builds a blocking tree by depth-first search, tracking the
 * (hash function, field) pairs already applied along the current
 * root-to-node path so that the same combination is not reused on that path.
 */
public abstract class OptimizedBlockingTreeBuilder<D, R, C, T> extends Block<D, R, C, T> implements IBlockingTreeBuilder<D, R, C, T> {

    public static final Log LOG = LogFactory.getLog(OptimizedBlockingTreeBuilder.class);

    @Override
    public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R> parent, Canopy<R> node,
            List<FieldDefinition> fieldsOfInterest, Block<D, R, C, T> cblock)
            throws Exception, ZinggClientException {
        LOG.info("--------- using optimized blocking tree builder ---------");
        // start with a fresh path: no hash function has been applied yet
        cblock.setHashFunctionsInCurrentNodePath(new HashSet<>());
        return dfsToGetBlockingTree(tree, parent, node, fieldsOfInterest, cblock.getHashFunctionsInCurrentNodePath(), cblock);
    }

    private Tree<Canopy<R>> dfsToGetBlockingTree(Tree<Canopy<R>> tree, Canopy<R> parent, Canopy<R> node, List<FieldDefinition> fieldsOfInterest,
            Set<String> hashFunctionsInCurrentNodePath, Block<D, R, C, T> cblock) throws ZinggClientException, Exception {
        long size = node.getTrainingSize();
        // split the node further only if it is too large and still has matching pairs to learn from
        if (size > cblock.getMaxSize() && node.getDupeN() != null && !node.getDupeN().isEmpty()) {
            Canopy<R> best = cblock.getBestNode(tree, parent, node, fieldsOfInterest);
            if (best != null) {
                // first call: the root node becomes the root of the tree
                if (tree == null && parent == null) {
                    tree = new Tree<>(node);
                }
                traverseThroughCanopies(best, tree, node, fieldsOfInterest, hashFunctionsInCurrentNodePath, cblock);
            } else {
                // no hash function improves this node; keep it as a leaf
                node.clearBeforeSaving();
            }
        } else {
            if ((node.getDupeN() == null) || (node.getDupeN().isEmpty())) {
                LOG.warn("Ran out of training at size " + size + " for node " + node);
            } else if (tree == null) {
                // the root itself is already below the size threshold, so no tree can be built
                throw new ZinggClientException("Unable to create Zingg models due to insufficient data. Please run Zingg after adding more data");
            }
            node.clearBeforeSaving();
        }
        return tree;
    }

    private void traverseThroughCanopies(Canopy<R> best, Tree<Canopy<R>> tree, Canopy<R> node, List<FieldDefinition> fieldsOfInterest,
            Set<String> hashFunctionsInCurrentNodePath, Block<D, R, C, T> cblock) throws ZinggClientException, Exception {
        // mark this (hash function, field) pair as used on the current path
        hashFunctionsInCurrentNodePath.add(best.getFunction().getName() + ":" + best.getContext().fieldName);
        best.copyTo(node);
        List<Canopy<R>> canopies = node.getCanopies();
        for (Canopy<R> n : canopies) {
            node.clearBeforeSaving();
            tree.addLeaf(node, n);
            // recurse into each child canopy produced by the chosen hash function
            dfsToGetBlockingTree(tree, node, n, fieldsOfInterest, hashFunctionsInCurrentNodePath, cblock);
        }
        // backtrack: the pair may be reused on other paths through the tree
        hashFunctionsInCurrentNodePath.remove(best.getFunction().getName() + ":" + best.getContext().fieldName);
    }
}
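
/*
 * Usage sketch (illustrative only, not part of this change). It shows how a
 * concrete, engine-specific subclass might drive the DFS: tree and parent
 * start as null, and the builder itself can serve as the Block (cblock) since
 * it extends Block. "MyBlockingTreeBuilder", "rootCanopy", and "fieldDefs"
 * are hypothetical names for this example; the real wiring depends on how the
 * surrounding trainer constructs its Block instance.
 *
 *   OptimizedBlockingTreeBuilder<D, R, C, T> builder = new MyBlockingTreeBuilder<>();
 *   // rootCanopy holds the training sample; fieldDefs lists the fields to block on
 *   Tree<Canopy<R>> blockingTree =
 *           builder.getBlockingTree(null, null, rootCanopy, fieldDefs, builder);
 */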