Skip to content

Commit

Permalink
HBOS dynamic binwidth stack overflow fixed
Browse files Browse the repository at this point in the history
parameter changed to number of bins
  • Loading branch information
JohannGebhardt committed Sep 16, 2013
1 parent f981026 commit 9dcf17e
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 19 deletions.
2 changes: 1 addition & 1 deletion build.properties
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
extension.version=2
extension.revision=1
extension.update=000
extension.update=001
Original file line number Diff line number Diff line change
Expand Up @@ -289,11 +289,15 @@
column in the Example Set. There are two modes, one with a static
and one with a dynamic binwidth. In the static mode every bin has
the same binwidth equally distributed over the value range. In the
dynamic mode the bindwidth can vary, but you can specify a minimum
number of examples contained in a bin. The default values for either
the number of bins or the minimum number of examples per bin is the
square root of the number of total examples (column properties set
to -1). To compute the outlier
dynamic mode the binwidth can vary, but you can specify a minimum
number of examples contained in a bin. The parameter number of bins sets
the total number of bins used for either mode. The binwidth / minimum number
of values per bin is then calculated automatically.
In the dynamic mode it is possible that there are fewer bins than specified if
some bins contain more than the minimum number of values.
The default value for
the number of bins is the square root of the total number of examples
(number of bins set to -1). To compute the outlier
score, the histograms are normalized to one in height first. Then,
the score is inverted, so that anomalies have a high score and
normal examples a low score. It is also possible to apply a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ public class HistogramEvaluator {
public HistogramEvaluator(Operator logger) {
this.logger = logger;
}

static int asdf = 0;
private ArrayList<HistogramBin>[] histogram;
@SuppressWarnings("unchecked")
public ExampleSet evaluate (ExampleSet exampleSet, boolean log_scale, boolean ranked, HashMap<String,Integer> bin_info_help, HashMap<String,String> mode_help) {

Expand Down Expand Up @@ -87,7 +88,7 @@ public ExampleSet evaluate (ExampleSet exampleSet, boolean log_scale, boolean ra

// initialize histogram, one histogram for every dimension
// list of bins for every histogram
ArrayList<HistogramBin>[] histogram = new ArrayList[number_of_features];
histogram = new ArrayList[number_of_features];
for(int i = 0; i < number_of_features; i++) {
histogram[i] = new ArrayList<HistogramBin>();
}
Expand Down Expand Up @@ -118,21 +119,38 @@ public ExampleSet evaluate (ExampleSet exampleSet, boolean log_scale, boolean ra

// create histograms
for (int i = 0; i < number_of_features; i++) {
int last = 0;
double bin_start = data[0][i];
if(mode[i].equals("dynamic binwidth")){
// For nominal values every value gets its own bin. RapidMiner handles nominal values as integers => binwidth 1
if (nominal[i]) {
createDynamicHistogram(histogram,data,0,1,i,true);
while(last<data.length-1){
last = createDynamicHistogram(histogram,data,last,1,i,true);
}
}
else {
createDynamicHistogram(histogram,data,0,bin_info[i],i,false);
int length = data.length;
int binwidth = bin_info[i];
while(last<data.length-1){
int values_per_bin = (int) Math.floor(data.length/bin_info[i]);
last = createDynamicHistogram(histogram,data,last,values_per_bin,i,false);
if(binwidth > 1) {
length = length - histogram[i].get(histogram[i].size()-1).get_quantity();
binwidth = binwidth -1;
values_per_bin = (int) Math.floor(length/binwidth);
}
}
}
}
else {
double binwidth = (data[items-1][i] - data[0][i]) / bin_info[i];
if(nominal[i] || binwidth == 0) {
binwidth = 1.0;
}
createStaticHistogram(histogram,data,0,binwidth,i,data[0][i]);
while(last<data.length-1) {
last = createStaticHistogram(histogram,data,last,binwidth,i,bin_start);
bin_start = bin_start+binwidth;
}
}
}

Expand Down Expand Up @@ -248,7 +266,9 @@ else if(ranked) {
* @param n
* @param row
*/
public static void createDynamicHistogram(ArrayList<HistogramBin>[] histogram_array, double[][] data, int first, int n, int feature, boolean nominal) {

public static int createDynamicHistogram(ArrayList<HistogramBin>[] histogram_array, double[][] data, int first, int n, int feature, boolean nominal) {

int last = first;
int end = 0;
// create new bin
Expand Down Expand Up @@ -327,9 +347,12 @@ else if (histogram_array[feature].size() == 0) {
/*
* if end of that file isn't reached start over with the last unused value as first value
*/
return last+1;
/*System.out.println(asdf);
if(last < data.length-1) {
asdf++;
createDynamicHistogram(histogram_array,data,last+1,n,feature,nominal);
}
}*/
}
/** Create histogram with static binWidth
* @param histogram_array
Expand All @@ -339,17 +362,18 @@ else if (histogram_array[feature].size() == 0) {
* @param feature
* @param binStart
*/
public static void createStaticHistogram(ArrayList<HistogramBin>[] histogram_array, double[][] data, int first, double binWidth, int feature, double binStart){
public static int createStaticHistogram(ArrayList<HistogramBin>[] histogram_array, double[][] data, int first, double binWidth, int feature, double binStart){
HistogramBin bin = new HistogramBin(binStart,binStart+binWidth,0,0);
int last = first-1;
for(int i = first; i < data.length&&data[i][feature] <= bin.get_range_to(); i++) {
bin.add_quantity(1);
last = i;
}
histogram_array[feature].add(bin);
if(last < data.length - 1) {
return last+1;
/*if(last < data.length - 1) {
createStaticHistogram(histogram_array,data,last+1,binWidth,feature,binStart+binWidth);
}
}*/
}

/** Sort the rows of an multidimensional array independently.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ public class HistogramOperator extends Operator {
private static final String PARAMETER_PROPERTIES_LIST = "histogram properties";
private static String[] CONDITION_NAMES = new String[] { "all", "single"};
private static final String PARAMETER_FILTER_TYPE = "parameter mode";
private static final String PARAMETER_BIN_INFO ="bin_info";
private static final String PARAMETER_BIN_INFO ="number of bins";
private static final String PARAMETER_MODE="select mode";
private static final String PARAMETER_COLUMN_PROPERTIES = "column properties";
private static final String PARAMETER_ATTRIBUTE_NAME = "attribute name";
Expand Down Expand Up @@ -221,10 +221,13 @@ public List<ParameterType> getParameterTypes() {
String[] mode = new String[2];
mode[0] = "fixed binwidth";
mode[1] = "dynamic binwidth";
ParameterTypeString type_int= new ParameterTypeString(PARAMETER_BIN_INFO,"Specifies how many bins or how many values per bins are used. Set to -1 for default value (sqrt(N)).","-1");
ParameterTypeString type_int= new ParameterTypeString(PARAMETER_BIN_INFO,"Specifies the number of bins. " +
"When using static binwidth the binwidth is set to (range of values)/(number of bins)."+
"When using dynamic binwidth the minimum number of bins is set to (number of examples)/(number of bins)." +
"In this case it is possible that there are less bins than specified if some bins contain more than the minimum number of values. Set to -1 for default value (sqrt(N)).","-1");
ParameterTypeStringCategory type_category = new ParameterTypeStringCategory(PARAMETER_MODE,"Select dynamic or fixed binwidth mode",mode,"fixed binwidth");
type_category.setEditable(false);
ParameterTypeList typeList = new ParameterTypeList(PARAMETER_PROPERTIES_LIST, "properties for every column - select mode and number of bins/number of values per bin for every column (set binwidth to -1 for default value or to nominal for categorical data)",
ParameterTypeList typeList = new ParameterTypeList(PARAMETER_PROPERTIES_LIST, "properties for every column - select mode and number of bins for every column (set binwidth to -1 for default value or to nominal for categorical data)",
new ParameterTypeAttribute(PARAMETER_ATTRIBUTE_NAME, "The index of the column whose properties should be changed.",getExampleSetInputPort()),
new ParameterTypeTupel(PARAMETER_COLUMN_PROPERTIES, "properties",
type_category,
Expand Down

0 comments on commit 9dcf17e

Please sign in to comment.