Skip to content

Commit

Permalink
initial import
Browse files Browse the repository at this point in the history
  • Loading branch information
krivard committed Mar 22, 2018
0 parents commit 1da503e
Show file tree
Hide file tree
Showing 15 changed files with 299 additions and 0 deletions.
67 changes: 67 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Base name shared by the data files this Makefile reads and writes.
PROJECT := trivia

# Directory holding the jars and the tagger model. CP is the classpath
# wildcard over that directory; recipes pass it to java inside double quotes.
LIB := lib
CP := $(LIB)/*

# Dataset flavour built by the default goal (pagewise or sentencewise).
MODE := pagewise

# ClueWebClient produces the query -> {HTML pages} mapping we use as a
# prerequisite; email krivard@cs for details.
# Files matching %-context-batched.tsv are also searched for in the
# ClueWebClient checkout.
CLUEWEBCLIENT := ../clueweb-fetch
CONTEXT := $(PROJECT)-context-batched.tsv
vpath %-context-batched.tsv $(CLUEWEBCLIENT)

# Default goal: the complete output set for the chosen SAMPLE prefix and MODE.
default: $(SAMPLE)$(MODE)-output.set

# A ".set" groups the raw output with all of its filtered variants.
# The recipe is a no-op and does not create the .set file itself.
%.set: \
  %.txt \
  %.1tok.txt \
  %.yes-answer.txt \
  %.1tok.yes-answer.txt \
  %.no-answer.ids.txt
	true

# Run edu.cmu.ml.ssquad.TriviaSentenceExport on the context file and write $@.
# The context file is declared as a prerequisite so that the vpath search for
# %-context-batched.tsv applies to it; the recipe uses $< to receive the
# resolved path (a bare ${CONTEXT} in the recipe is never looked up via vpath).
# Combined stdout/stderr is logged to make-$@.log (appended when APPEND is -a)
# and echoed to the console minus three known-noisy message types.
${PROJECT}-export.xml: ${CONTEXT}
	java -cp "${CP}" ${JAVARGS} -Dssquad.taggerModel=${LIB}/models/english-left3words-distsim.tagger edu.cmu.ml.ssquad.TriviaSentenceExport $(SKIP) $< $@ 2>&1 | tee $(APPEND) make-$@.log | grep -ve "PTBLexer next" -ve "WARNING: Untokenizable:" -ve "ERROR net.htmlparser.jericho"

# Raw sentencewise dataset: run TriviaSentencewiseDataset over the context
# file. Output is logged to make-$@.log and echoed without the noisy lines.
# NOTE(review): the recipe's exit status is that of the final grep, not java.
$(SAMPLE)sentencewise-output.txt.pre: $(SAMPLE)$(CONTEXT)
	java -cp "$(CP)" $(JAVARGS) \
	    -Dssquad.taggerModel=$(LIB)/models/english-left3words-distsim.tagger \
	    edu.cmu.ml.ssquad.TriviaSentencewiseDataset $(SKIP) $< $@ 2>&1 \
	  | tee $(APPEND) make-$@.log \
	  | grep -v -e "PTBLexer next" -e "WARNING: Untokenizable:" -e "ERROR net.htmlparser.jericho"

# Final sentencewise dataset: the raw file passed through postprocess.awk.
$(SAMPLE)sentencewise-output.txt: $(SAMPLE)sentencewise-output.txt.pre
	awk -f postprocess.awk $< > $@


# When SKIP is non-empty, pass -a to tee in the java recipes so the existing
# make-*.log file is appended to rather than overwritten.
ifneq "$(SKIP)" ""
APPEND := -a
endif
# Raw pagewise dataset: run TriviaPagewiseDataset over the context file.
# Output is logged to make-$@.log and echoed without the noisy lines.
# NOTE(review): the recipe's exit status is that of the final grep, not java.
$(SAMPLE)pagewise-output.txt.pre: $(SAMPLE)$(CONTEXT)
	java -cp "$(CP)" $(JAVARGS) \
	    -Dssquad.taggerModel=$(LIB)/models/english-left3words-distsim.tagger \
	    edu.cmu.ml.ssquad.TriviaPagewiseDataset $(SKIP) $< $@ 2>&1 \
	  | tee $(APPEND) make-$@.log \
	  | grep -v -e "PTBLexer next" -e "WARNING: Untokenizable:" -e "ERROR net.htmlparser.jericho"

# Final pagewise dataset: the raw file passed through postprocess.awk.
$(SAMPLE)pagewise-output.txt: $(SAMPLE)pagewise-output.txt.pre
	awk -f postprocess.awk $< > $@

# IDs of the entries that filter-noAnswer.awk reports for the dataset file.
# Earlier log-based alternative, kept for reference:
#   grep "answer not found in text" make-$<.log | awk '{print $$2}' > $@
%.no-answer.ids.txt : %.txt
	awk -f filter-noAnswer.awk $(<) > $(@)

# The dataset filtered by filter-yesAnswer.pl, which receives the ID list
# first and the dataset second (the order of the prerequisites).
%.yes-answer.txt : %.no-answer.ids.txt %.txt
	perl filter-yesAnswer.pl $(^) > $(@)

# The dataset as filtered by filter-1tok.awk.
%.1tok.txt : %.txt
	awk -f filter-1tok.awk $(<) > $(@)

# The 1tok dataset filtered by filter-yesAnswer.pl using the ID list
# computed from the unfiltered dataset.
%.1tok.yes-answer.txt : %.no-answer.ids.txt %.1tok.txt
	perl filter-yesAnswer.pl $(^) > $(@)

# A literal tab character, used as the join(1) field separator below.
# printf is used instead of echo because whether echo expands "\t" depends on
# the shell that /bin/sh points to; := evaluates the shell call only once.
TAB := $(shell printf '\t')
# Queries that do not contain "of these" (presumably the fill-in-the-blank
# ones, going by the target name), restricted to ids starting with "s".
# Ids have the form s<sheet>q<query>; the awk step extracts both numbers so
# the result can be sorted numerically by sheet, then query, before the two
# helper columns are cut away again.
# The queries file is a declared prerequisite so the target is rebuilt when
# it changes. Side effects: writes ${PROJECT}-queries.sorted.tsv and
# ${PROJECT}-queries-multiplechoice.sorted.tsv.
${PROJECT}-queries-fitb.tsv: ${PROJECT}-queries.tsv
	sort -b $< > ${PROJECT}-queries.sorted.tsv
	grep "of these" $< | sort -b > ${PROJECT}-queries-multiplechoice.sorted.tsv
	comm -23 ${PROJECT}-queries.sorted.tsv ${PROJECT}-queries-multiplechoice.sorted.tsv | grep "^s" | \
	awk 'BEGIN{FS=OFS="\t"}{sheet=$$1;sub("s","",sheet);sub("q.*","",sheet); query=$$1; sub(".*q","",query); print sheet,query,$$1,$$2}' | \
	sort -k 1n,1 -k 2n,2 | \
	cut -f 3,4 > $@

# Context rows restricted to the ids listed in the fitb queries file.
# The ids (column 1 of the queries file) are joined against column 2 of the
# context file; the awk step reorders the columns and appends the sheet and
# query numbers parsed from the id so rows can be sorted numerically, after
# which only the first four columns are kept.
# Side effects: writes fitb-ids.txt and <context file>.sorted.
# NOTE(review): the second prerequisite is spelled "context_batched"
# (underscore) while CONTEXT and the vpath pattern use "context-batched"
# (hyphen) -- confirm which name is intended.
$(PROJECT)-context-fitb_batched.tsv: $(PROJECT)-queries-fitb.tsv $(PROJECT)-context_batched.tsv
	cut -f 1 $< | grep "^s" | sort -k 1b,1 > fitb-ids.txt
	sort -k 2b,2 $(word 2,$^) > $(word 2,$^).sorted
	join -t "$(TAB)" -2 2 fitb-ids.txt $(word 2,$^).sorted \
	  | awk 'BEGIN{FS=OFS="\t"}{sheet=$$1;sub("s","",sheet);sub("q.*","",sheet); query=$$1; sub(".*q","",query); print $$2,$$1,$$3,$$4,sheet,query}' \
	  | sort -k 5n,5 -k 6n,6 \
	  | cut -f 1,2,3,4 > $@


# With no prerequisites, .SECONDARY treats every target as secondary, so the
# intermediate files built along the pattern-rule chains are not auto-deleted.
.SECONDARY:
20 changes: 20 additions & 0 deletions experiment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# Builds the sentencewise and pagewise output sets through make.
# Any arguments given to this script are forwarded unchanged to both make
# invocations ("$@" is quoted so arguments containing spaces or glob
# characters are not re-split or expanded).

# SAM: Sample run, or not?
# "sample-" to run for just the first 100 queries
# "" to run all 40k queries
SAM=sample-

# N: Dry run, or not?
# "n" to just print the command text to the console
# "" to actually run commands
N=

# K: How many pseudodocuments to retrieve
K=100

set -x
# comment out to disable (don't delete)
SAMPLE=${SAM} MODE=sentencewise JAVARGS=-Dssquad.topk=${K} make -${N}Be ${SAM}sentencewise-output.set "$@"
SAMPLE=${SAM} MODE=pagewise JAVARGS=-Dssquad.topk=${K} make -${N}Be ${SAM}pagewise-output.set "$@"
set +x
38 changes: 38 additions & 0 deletions filter-1tok.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Return `thing` without its leading and trailing runs of space characters.
# Only the space character is trimmed; tabs and other whitespace are kept.
function strip(thing) {
    sub(/^ */, "", thing)
    sub(/ *$/, "", thing)
    return thing
}
# Emit one entry if its answer is a single token and its doc is non-empty.
#
# An entry is accepted when the stripped answer contains no space and the
# stripped doc is not empty. Accepted ids are logged to accepted.ids.txt and
# the entry is printed as id, doc, question, answer, candidates. All other
# ids are logged to rejected.ids.txt and nothing is printed.
# The candidates are split on FS, stripped, and kept only when they contain
# no space and differ from the element immediately before them.
# NOTE(review): FS is "\n" here while postprocess.awk treats candidates as
# tab-separated -- confirm the split is on the intended separator.
# A, n, i and kept are locals.
function finish(id, doc, question, answer, candidates,    A, n, i, kept) {
    doc = strip(doc)
    answer = strip(answer)

    if (index(answer, " ") != 0 || length(doc) == 0) {
        printf "%s\n", id > "rejected.ids.txt"
        return
    }

    printf "%s\n", id > "accepted.ids.txt"
    print id ORS doc ORS question ORS answer

    n = split(candidates, A)
    kept = ""
    for (i = 1; i <= n; i++) {
        A[i] = strip(A[i])
        if (index(A[i], " ") != 0) continue
        if (i > 1 && A[i] == A[i-1]) continue
        kept = kept FS A[i]
    }
    # drop the separator that precedes the first kept candidate
    print substr(kept, length(FS) + 1)
    return 0
}
# Records are separated by blank lines and arrive in a repeating cycle of
# five: id, doc, question, answer, candidates. An entry is handed to finish()
# when the next id record arrives, and the last one is flushed in END.
BEGIN {
    RS = ORS = "\n\n"
    FS = OFS = "\n"
}
{
    slot = (NR - 1) % 5
    if (slot == 0) {
        if (id) {
            finish(id, doc, question, answer, candidates)
        }
        # the id is the first line not starting with '#'
        # (if every line starts with '#', the last line is used)
        for (i = 1; i <= NF; i++) {
            id = $i
            if (id !~ /^[#]/) break
        }
    } else if (slot == 1) {
        doc = $0
    } else if (slot == 2) {
        question = $0
    } else if (slot == 3) {
        answer = $0
    } else {
        candidates = $0
    }
}
END { finish(id, doc, question, answer, candidates) }
30 changes: 30 additions & 0 deletions filter-noAnswer.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Return `thing` without its leading and trailing runs of space characters.
# Only the space character is trimmed; tabs and other whitespace are kept.
function strip(thing) {
    sub(/^ */, "", thing)
    sub(/ *$/, "", thing)
    return thing
}
# Print the id of an entry whose answer cannot be found in its doc.
#
# The comparison is a case-insensitive substring test on the stripped doc and
# answer. If the answer is not found as written, it is retried with every
# "-" replaced by a space; the id is printed only when both attempts fail.
# question and candidates are accepted but not used; A, n, i are locals.
function finish(id, doc, question, answer, candidates,    A, n, i) {
    doc = tolower(strip(doc))
    answer = tolower(strip(answer))

    if (index(doc, answer) != 0) return

    gsub("-", " ", answer)
    if (index(doc, answer) == 0) print id
}
# Records are separated by blank lines and arrive in a repeating cycle of
# five: id, doc, question, answer, candidates. An entry is handed to finish()
# when the next id record arrives, and the last one is flushed in END.
# ORS is left at its default, so each printed id ends in a single newline.
BEGIN {
    RS = "\n\n"
    FS = OFS = "\n"
}
{
    slot = (NR - 1) % 5
    if (slot == 0) {
        if (id) {
            finish(id, doc, question, answer, candidates)
        }
        # the id is the first line not starting with '#'
        # (if every line starts with '#', the last line is used)
        for (i = 1; i <= NF; i++) {
            id = $i
            if (id !~ /^[#]/) break
        }
    } else if (slot == 1) {
        doc = $0
    } else if (slot == 2) {
        question = $0
    } else if (slot == 3) {
        answer = $0
    } else {
        candidates = $0
    }
}
END { finish(id, doc, question, answer, candidates) }
81 changes: 81 additions & 0 deletions filter-yesAnswer.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/perl
# syntax:
#   perl filter-yesAnswer.pl no-answer.ids.txt output.txt > output.yes-answer.txt
#
# Copies output.txt to stdout, leaving out every entry whose id equals an id
# listed in no-answer.ids.txt. An entry is five sections (the first one being
# the id), each terminated by a blank line.
# Both files are read strictly forward, once, so the ids in both are assumed
# to be in the (sheet, query) order that idlt() compares by.
use strict;
use warnings;

my $idfn=shift;
my $outputfn=shift;
(defined $idfn && defined $outputfn)
    or die "usage: perl filter-yesAnswer.pl no-answer.ids.txt output.txt\n";

# Three-argument open: the file name is never interpreted as a mode or pipe.
open(my $idf, "<", $idfn) or die "Couldn't open $idfn for reading:\n$!\n";
open(my $outputf, "<", $outputfn) or die "Couldn't open $outputfn for reading:\n$!\n";

my $state=0;   # blank-line separators seen so far in the current entry (0..4)
my $buf="";    # text of the entry currently being read
my $pass=1;    # idlt(current id, current skip id): 1 keep, 0 drop, -1 need next skip id
my $id="";     # id of the entry currently being read

while(<$idf>) {
    chomp;
    my $skip=$_;
    if (length($id)>0) {
        # An entry is already pending from the previous skip id; re-compare.
        $pass=idlt($id,$skip);
        if ($pass<0) {
            # The pending entry is still past this skip id: fetch another.
            next;
        }
    }
    while(<$outputf>) {
        if ($_ =~ /^\n$/) {
            # A blank line ends the current section.
            if ($state == 0) {
                $state++;
                # The id section just ended. Check it and set the pass value.
                $id=$buf;
                chomp($id);
                $pass=idlt($id,$skip);
                $buf = $buf . $_;
                if ($pass<0) {
                    # break to outer loop to fetch another skip id
                    last;
                }
            } elsif ($state == 4) {
                # The last section just ended: emit or drop the whole entry.
                $buf = $buf . $_;
                $state=0;
                if ($pass==1) { print $buf; }
                $buf="";
            } else {
                $buf = $buf . $_;
                $state++;
            }
        } else {
            $buf = $buf . $_;
        }
    }
}
# No skip ids remain: flush the pending text and copy the rest unchanged.
if (length($buf)>0) { print $buf; }
while(<$outputf>) {
    print $_;
}
close($idf);
close($outputf);

# Compare two ids of the form <char><sheet>q<query> numerically, by sheet
# first and query second. The first character of each id is ignored.
# Returns  1 when $id sorts before $skip,
#          0 when both are equal,
#         -1 when $id sorts after $skip.
sub idlt {
    my ($id, $skip) = @_;
    my ($id_sheet, $id_query)     = split("q", substr($id, 1));
    my ($skip_sheet, $skip_query) = split("q", substr($skip, 1));

    if ($id_sheet == $skip_sheet) {
        return 0 if $id_query == $skip_query;
        return $id_query < $skip_query ? 1 : -1;
    }
    return $id_sheet < $skip_sheet ? 1 : -1;
}



Binary file added lib/SSquad.jar
Binary file not shown.
Binary file added lib/jericho-html-3.4.jar
Binary file not shown.
Binary file added lib/lucene-core-6.2.1.jar
Binary file not shown.
Binary file added lib/models/english-left3words-distsim.tagger
Binary file not shown.
35 changes: 35 additions & 0 deletions lib/models/english-left3words-distsim.tagger.props
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
## tagger training invoked at Tue Feb 25 04:12:25 PST 2014 with arguments:
model = english-left3words-distsim.tagger
arch = left3words,naacl2003unknowns,wordshapes(-1,1),distsim(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1),distsimconjunction(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1)
wordFunction = edu.stanford.nlp.process.AmericanizeFunction
trainFile = /u/nlp/data/pos-tagger/english/train-wsj-0-18;/u/nlp/data/pos-tagger/english/train-extra-english;/u/nlp/data/pos-tagger/english/train-tech-english
closedClassTags =
closedClassTagThreshold = 40
curWordMinFeatureThresh = 2
debug = false
debugPrefix =
tagSeparator = _
encoding = UTF-8
iterations = 100
lang = english
learnClosedClassTags = false
minFeatureThresh = 2
openClassTags =
rareWordMinFeatureThresh = 10
rareWordThresh = 5
search = owlqn
sgml = false
sigmaSquared = 0.0
regL1 = 0.75
tagInside =
tokenize = true
tokenizerFactory =
tokenizerOptions =
verbose = false
verboseResults = true
veryCommonWordThresh = 250
xmlInput =
outputFile =
outputFormat = slashTags
outputFormatOptions =
nthreads = 1
Binary file added lib/slf4j-api.jar
Binary file not shown.
Binary file added lib/slf4j-simple.jar
Binary file not shown.
Binary file added lib/stanford-parser.jar
Binary file not shown.
Binary file added lib/stanford-postagger-3.5.2.jar
Binary file not shown.
28 changes: 28 additions & 0 deletions postprocess.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Return `thing` without its leading and trailing runs of space characters.
# Only the space character is trimmed; tabs and other whitespace are kept.
function strip(thing) {
    sub(/^ */, "", thing)
    sub(/ *$/, "", thing)
    return thing
}
# Print one entry as five records: id, doc, question, answer, candidates.
# doc and answer are space-stripped first; the other fields are printed
# unchanged. Each record is terminated by ORS. A, n, i are unused locals.
function finish(id, doc, question, answer, candidates,    A, n, i) {
    print id ORS strip(doc) ORS question ORS strip(answer) ORS candidates
}
# Records are separated by blank lines and arrive in a repeating cycle of
# five: id, doc, question, answer, candidates. An entry is handed to finish()
# when the next id record arrives, and the last one is flushed in END.
#
# A trailing "." (with any spaces before it) is removed from the answer, and
# a "." directly before a tab is removed from the candidates.
# The patterns are regex literals on purpose: inside a string constant some
# awks (gawk, BWK awk) drop the backslash of "\.", which turns the pattern
# into "any character" and cuts the last character off every answer.
BEGIN {
    RS = ORS = "\n\n"
    FS = OFS = "\n"
    state = "id"
}
{
    if (state == "id") {
        if (id) {
            finish(id, doc, question, answer, candidates)
        }
        # the id is the first line not starting with '#'
        # (if every line starts with '#', the last line is used)
        for (i = 1; i <= NF; i++) {
            id = $i
            if (id !~ /^[#]/) break
        }
        state = "doc"
    }
    else if (state == "doc") { doc = $0; state = "question" }
    else if (state == "question") { question = $0; state = "answer" }
    else if (state == "answer") {
        answer = $0
        sub(/ *\.$/, "", answer)
        state = "candidates"
    }
    else if (state == "candidates") {
        candidates = $0
        gsub(/ *\.\t/, "\t", candidates)
        state = "id"
    }
}
END { finish(id, doc, question, answer, candidates) }

0 comments on commit 1da503e

Please sign in to comment.