Skip to content

Commit

Permalink
number of words used for creating patterns can now be adjusted (indep…
Browse files Browse the repository at this point in the history
…endent of context size)
  • Loading branch information
bolandka committed May 2, 2016
1 parent 3d82465 commit d3d507d
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public FrequencyBasedBootstrapping(DataStoreClient inputDataStoreClient, DataSto
private static final org.slf4j.Logger log = LoggerFactory.getLogger(FrequencyBasedBootstrapping.class);

public PatternInducer getPatternInducer() {
return new StandardPatternInducer();
return new StandardPatternInducer(getExecution().getWindowsize());
}

public PatternRanker getPatternRanker() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public ReliabilityBasedBootstrapping(DataStoreClient inputDataStoreClient, DataS
private Reliability r = new Reliability();

public PatternInducer getPatternInducer() {
return new StandardPatternInducer();
return new StandardPatternInducer(getExecution().getWindowsize());
}

public PatternRanker getPatternRanker() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,16 @@
public class StandardPatternInducer extends Bootstrapping.PatternInducer {

private static final org.slf4j.Logger log = LoggerFactory.getLogger(StandardPatternInducer.class);
// TODO derive from windowsize
public final static int patternsPerContext = 9;

private int windowsize;;
private static int patternsPerContext;
Pattern leadingWildcards = Pattern.compile("\"(\\*\\s)+");
Pattern trailingWildcards = Pattern.compile("(\\s\\*)+\"");

public StandardPatternInducer() {}
/**
 * Creates an inducer that builds patterns from the given number of words.
 *
 * NOTE(review): {@code patternsPerContext} is declared {@code static} in this
 * class, so the value derived here is shared by every instance and is
 * overwritten each time a new inducer is constructed. Confirm that inducers
 * with different window sizes are never used side by side.
 *
 * @param windowsize number of words used for creating patterns; no range
 *                   check is performed here
 */
public StandardPatternInducer(int windowsize) {
    // One general pattern plus a left/right pair for each of the
    // remaining (windowsize - 1) positions.
    patternsPerContext = 2 * windowsize - 1;
    this.windowsize = windowsize;
}

public final int getPatternsPerContext() {
return patternsPerContext;
Expand Down Expand Up @@ -104,8 +108,6 @@ public String apply(String s) {
String delimiter_left = leftWords.get(0);
String delimiter_right = rightWords.get(0);

// TODO make configurable
int windowsize = 5;
// set default values in case the context of a term contains fewer elements than the given windowsize
List<InfolisPattern> inducedPatternsLeft = Stream.generate(InfolisPattern::new)
.limit(windowsize)
Expand Down Expand Up @@ -150,7 +152,13 @@ public String apply(String s) {
}
// order is important here: patterns are listed in ascending order with regard to their generality
// type2left and type2right etc. have equal generality
return (Arrays.asList(typeGeneral, inducedPatternsLeft.get(0), inducedPatternsRight.get(0), inducedPatternsLeft.get(1), inducedPatternsRight.get(1), inducedPatternsLeft.get(2), inducedPatternsRight.get(2), inducedPatternsLeft.get(3), inducedPatternsRight.get(3)));
List<InfolisPattern> patterns = new ArrayList<>();
patterns.add(typeGeneral);
for (int i = 0; i < windowsize -1; i++) {
patterns.add(inducedPatternsLeft.get(i));
patterns.add(inducedPatternsRight.get(i));
}
return patterns;
}

}
Expand Down
31 changes: 28 additions & 3 deletions src/main/java/io/github/infolis/model/Execution.java
Original file line number Diff line number Diff line change
Expand Up @@ -243,23 +243,33 @@ public Algorithm instantiateAlgorithm(

/**
* Any kind of search query that can be used within the algorithms.
* For example, it represtens the search query which is used
* For example, it represents the search query which is used
* to perform a search in different repositories to find
* fitting research data.
*
* {@link LuceneSearcher} {@link FederatedSearcher} {@link ApplyPatternAndResolve}
* {@link FederatedSearcher} {@link SearchPatternsAndCreateLinks}
*/
private String searchQuery;

/**
* Group numbers to use for RegexSearcher.
* Group numbers to use for RegexSearcher: group of reference term.
*
* {@link RegexSearcher}
*/
private int referenceGroup = RegexUtils.doiGroupNum;

/**
* Group numbers to use for RegexSearcher: group of left context.
*
* {@link RegexSearcher}
*/
private int leftContextGroup = RegexUtils.doiLeftContextGroupNum;

/**
* Group numbers to use for RegexSearcher: group of right context.
*
* {@link RegexSearcher}
*/
private int rightContextGroup = RegexUtils.doiRightContextGroupNum;

/**
Expand Down Expand Up @@ -309,6 +319,13 @@ public Algorithm instantiateAlgorithm(
* {@link Bootstrapping}
*/
private int maxIterations = 10;

/**
* Number of words used for creation of patterns.
*
* {@link StandardPatternInducer}
*/
private int windowsize = 3;


//TODO: also used for frequencyBasedBootstrapping, should we just name
Expand Down Expand Up @@ -682,6 +699,14 @@ public int getMaxIterations() {
/**
 * Sets the {@code maxIterations} field of this execution.
 * The value is stored as given; no validation is performed here.
 *
 * @param maxIterations new value for the field (the field's initial value is 10)
 */
public void setMaxIterations(int maxIterations) {
this.maxIterations = maxIterations;
}

/**
 * Returns the number of words used for the creation of patterns.
 *
 * @return the currently configured window size (the field's initial value is 3)
 */
public int getWindowsize() {
    return windowsize;
}

/**
 * Sets the number of words used for the creation of patterns.
 * The value is stored as given; no validation is performed here.
 *
 * @param windowsize new window size (the field's initial value is 3)
 */
public void setWindowsize(int windowsize) {
this.windowsize = windowsize;
}

public double getReliabilityThreshold() {
return reliabilityThreshold;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public class StandardPatternInducerTest {

@Test
public void testInduce() {
StandardPatternInducer inducer = new StandardPatternInducer();
StandardPatternInducer inducer = new StandardPatternInducer(5);
Double[] thresholds = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};

TextualReference ref = new TextualReference("15757 41727 5743 10877 10014 30850 Sozialstaatssurvey/", "ALLBUS", " .", "textfile", "pattern", "mentionsReference");
Expand Down

0 comments on commit d3d507d

Please sign in to comment.