initial import

TeamCohen · Mar 22, 2018 · 1da503e · 1da503e
commit 1da503e
Show file tree

Hide file tree

Showing 15 changed files with 299 additions and 0 deletions.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,67 @@
+# Dataset name; prefix of the project-level input and output files.
+PROJECT := trivia
+# Directory holding the jars and the tagger model.
+LIB := lib
+# Java classpath; the wildcard is handed to java unexpanded (recipes quote it).
+CP := $(LIB)/*
+# Flavour built by the `default` goal.
+MODE := pagewise
+
+# ClueWebClient produces the query -> {HTML pages} mapping we use as a
+# prerequisite; email krivard@cs for details.
+# Context files that are not in this directory are searched for in the
+# ClueWebClient checkout via vpath.
+CONTEXT := $(PROJECT)-context-batched.tsv
+CLUEWEBCLIENT := ../clueweb-fetch
+vpath %-context-batched.tsv $(CLUEWEBCLIENT)
+
+# `default` names a command, not a file: declare it phony so that a stray
+# file called "default" can never make the goal look up to date.
+.PHONY: default
+default: $(SAMPLE)$(MODE)-output.set
+
+# Aggregate goal: requesting X.set builds X.txt plus every filtered variant.
+# The recipe creates no X.set file, so the goal is re-evaluated on each run.
+%.set: %.txt %.1tok.txt %.yes-answer.txt %.1tok.yes-answer.txt %.no-answer.ids.txt
+	true
+
+# Run TriviaSentenceExport over the context file, writing $@.
+# The context file is a declared prerequisite and is passed as $< so that
+# (a) make locates it through the vpath for %-context-batched.tsv, exactly
+# like the *-output.txt.pre rules do, and (b) the export is rebuilt when the
+# context file changes.
+# Console output drops known-noisy lines; the full output is kept in
+# make-$@.log (appended to rather than truncated when APPEND is -a).
+# NOTE(review): the recipe's exit status is that of the final grep, not java.
+${PROJECT}-export.xml: ${CONTEXT}
+	java -cp "${CP}" ${JAVARGS} -Dssquad.taggerModel=${LIB}/models/english-left3words-distsim.tagger edu.cmu.ml.ssquad.TriviaSentenceExport $(SKIP) $< $@ 2>&1 | tee $(APPEND) make-$@.log  | grep -ve "PTBLexer next" -ve "WARNING: Untokenizable:" -ve "ERROR net.htmlparser.jericho"
+
+# Stage 1: run TriviaSentencewiseDataset on the context file (found through
+# vpath when it is not local).  Everything java prints goes to make-$@.log;
+# the console copy has the known-noisy tokenizer/parser lines removed.
+$(SAMPLE)sentencewise-output.txt.pre: $(SAMPLE)$(CONTEXT)
+	java -cp "$(CP)" $(JAVARGS) -Dssquad.taggerModel=$(LIB)/models/english-left3words-distsim.tagger edu.cmu.ml.ssquad.TriviaSentencewiseDataset $(SKIP) $< $@ 2>&1 | tee $(APPEND) make-$@.log  | grep -ve "PTBLexer next" -ve "WARNING: Untokenizable:" -ve "ERROR net.htmlparser.jericho"
+
+# Stage 2: tidy the raw dataset with postprocess.awk.
+$(SAMPLE)sentencewise-output.txt: $(SAMPLE)sentencewise-output.txt.pre
+	awk -f postprocess.awk $< > $@
+
+
+# A non-empty SKIP makes tee append (-a) to the existing make-*.log files
+# instead of truncating them.
+ifneq "$(SKIP)" ""
+APPEND := -a
+endif
+
+# Stage 1: run TriviaPagewiseDataset on the context file (found through vpath
+# when it is not local).  Everything java prints goes to make-$@.log; the
+# console copy has the known-noisy tokenizer/parser lines removed.
+$(SAMPLE)pagewise-output.txt.pre: $(SAMPLE)$(CONTEXT)
+	java -cp "$(CP)" $(JAVARGS) -Dssquad.taggerModel=$(LIB)/models/english-left3words-distsim.tagger edu.cmu.ml.ssquad.TriviaPagewiseDataset $(SKIP) $< $@ 2>&1 | tee $(APPEND) make-$@.log | grep -ve "PTBLexer next" -ve "WARNING: Untokenizable:" -ve "ERROR net.htmlparser.jericho"
+
+# Stage 2: tidy the raw dataset with postprocess.awk.
+$(SAMPLE)pagewise-output.txt: $(SAMPLE)pagewise-output.txt.pre
+	awk -f postprocess.awk $< > $@
+
+# Filtered views of a dataset X.txt.
+# NOTE(review): a target such as X.1tok.yes-answer.txt matches both
+# %.yes-answer.txt (stem X.1tok) and %.1tok.yes-answer.txt (stem X); which
+# rule wins depends on make's pattern selection (newer GNU make prefers the
+# shortest stem, older versions the first rule listed), so keep the rule
+# order unless that has been checked.
+
+# Ids of entries whose answer does not occur in the document text.
+%.no-answer.ids.txt: %.txt
+	awk -f filter-noAnswer.awk $< > $@
+# Earlier log-based alternative, kept for reference:
+#	grep "answer not found in text" make-$<.log | awk '{print $$2}' > $@
+
+# X.txt minus the entries listed in X.no-answer.ids.txt.  $^ passes the id
+# list first and the dataset second, the order filter-yesAnswer.pl expects.
+%.yes-answer.txt: %.no-answer.ids.txt %.txt 
+	perl filter-yesAnswer.pl $^ > $@
+
+# Entries whose answer contains no space (single token) and whose document
+# is non-empty.
+%.1tok.txt: %.txt
+	awk -f filter-1tok.awk $< > $@
+
+# Single-token entries minus the no-answer ids; the id list is the one
+# computed from the unfiltered X.txt.
+%.1tok.yes-answer.txt: %.no-answer.ids.txt %.1tok.txt
+	perl filter-yesAnswer.pl $^ > $@
+
+# Literal TAB character, used as the join(1) field delimiter below.
+# printf expands \t in every POSIX shell, whereas `echo "\t"` prints a
+# backslash followed by "t" under shells such as bash.  The := assignment
+# runs the shell once instead of on every expansion.
+TAB:=$(shell printf '\t')
+# Query list without the multiple-choice questions ("fitb" presumably means
+# fill-in-the-blank).
+#  1. sort all queries;
+#  2. sort the queries containing "of these" (the multiple-choice ones);
+#  3. keep the lines only in the first list whose id starts with "s", order
+#     them numerically by the sheet and query numbers parsed out of ids
+#     shaped like s<sheet>q<query>, and keep the original two columns.
+# The query file is a declared prerequisite so the list is rebuilt whenever
+# it changes.  Side files written: ${PROJECT}-queries.sorted.tsv and
+# ${PROJECT}-queries-multiplechoice.sorted.tsv.
+${PROJECT}-queries-fitb.tsv: ${PROJECT}-queries.tsv
+	sort -b $< > ${PROJECT}-queries.sorted.tsv
+	grep "of these" $< | sort -b > ${PROJECT}-queries-multiplechoice.sorted.tsv
+	comm -23 ${PROJECT}-queries.sorted.tsv ${PROJECT}-queries-multiplechoice.sorted.tsv | grep "^s" | \
+	awk 'BEGIN{FS=OFS="\t"}{sheet=$$1;sub("s","",sheet);sub("q.*","",sheet); query=$$1; sub(".*q","",query); print sheet,query,$$1,$$2}' | \
+	sort -k 1n,1 -k 2n,2 | \
+	cut -f 3,4 > $@
+
+# Restrict the context file to the queries listed in the fitb query file:
+# take the query ids (column 1, starting with "s"), join them against column
+# 2 of the context file, then order the rows numerically by the sheet and
+# query numbers parsed from the id and keep four columns.
+# Side files written: fitb-ids.txt and <context file>.sorted.
+# NOTE(review): the prerequisite is spelled "-context_batched.tsv" while
+# CONTEXT and the vpath pattern use "-context-batched.tsv" -- confirm which
+# file name is intended.
+${PROJECT}-context-fitb_batched.tsv: ${PROJECT}-queries-fitb.tsv ${PROJECT}-context_batched.tsv
+	cut -f 1 $< | grep "^s" | sort -k 1b,1 > fitb-ids.txt
+	sort -k 2b,2 $(word 2,$^) > $(word 2,$^).sorted
+	join -t "${TAB}" -2 2 fitb-ids.txt $(word 2,$^).sorted | \
+	awk 'BEGIN{FS=OFS="\t"}{sheet=$$1;sub("s","",sheet);sub("q.*","",sheet); query=$$1; sub(".*q","",query); print $$2,$$1,$$3,$$4,sheet,query}' | \
+	sort -k 5n,5 -k 6n,6 | \
+	cut -f 1,2,3,4 > $@
+
+
+# With no prerequisites, every target is treated as secondary: intermediate
+# files (e.g. the *.pre outputs) are not deleted automatically.
+.SECONDARY:
diff --git a/experiment.sh b/experiment.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Build the sentencewise and pagewise datasets through the Makefile.
+# Extra command-line arguments are forwarded to make unchanged.
+
+# SAM: Sample run, or not?
+# "sample-" to run for just the first 100 queries
+# ""        to run all 40k queries
+SAM=sample-
+
+# N: Dry run, or not?
+# "n" to just print the command text to the console
+# ""  to actually run commands
+N=
+
+# K: How many pseudodocuments to retrieve
+K=100
+
+set -x
+# comment out to disable (don't delete)
+# make flags: -B rebuilds every target unconditionally; -e lets the
+# SAMPLE/MODE/JAVARGS environment values override the Makefile's own
+# assignments.  "$@" is quoted so that forwarded arguments containing spaces
+# (e.g. SKIP='...') reach make as single words.
+SAMPLE=${SAM} MODE=sentencewise JAVARGS=-Dssquad.topk=${K} make -${N}Be ${SAM}sentencewise-output.set "$@"
+SAMPLE=${SAM} MODE=pagewise     JAVARGS=-Dssquad.topk=${K} make -${N}Be ${SAM}pagewise-output.set "$@"
+set +x
diff --git a/filter-1tok.awk b/filter-1tok.awk
@@ -0,0 +1,38 @@
+# Return `thing` without leading and trailing space characters.
+# Only spaces are removed; tabs and newlines are left alone.
+function strip( thing ) {
+    gsub(/^ +| +$/, "", thing);
+    return thing;
+}
+# Emit one entry if it passes the single-token filter.
+#   id, doc, question, answer, candidates: the five sections of an entry.
+#   A, n, i: locals.
+# An entry is accepted when its stripped answer contains no space and its
+# stripped document is non-empty.  Accepted ids are logged to
+# accepted.ids.txt and the entry is printed with the candidate list reduced
+# to the space-free items, dropping an item equal to the one just before it.
+# Every other id is logged to rejected.ids.txt and nothing is printed.
+# NOTE(review): split() uses the current FS (newline), so "item" means one
+# line of the candidates section -- confirm that is the intended unit.
+function finish( id,doc,question,answer,candidates,    A,n,i ) {
+    answer = strip(answer);
+    doc = strip(doc);
+
+    if (index(answer, " ") != 0 || length(doc) <= 0) {
+        printf "%s\n", id > "rejected.ids.txt"
+        return;
+    }
+
+    printf "%s\n", id > "accepted.ids.txt"
+    print id;
+    print doc;
+    print question;
+    print answer;
+
+    n = split(candidates, A);
+    candidates = "";
+    for (i = 1; i <= n; i++) {
+        A[i] = strip(A[i]);
+        if (index(A[i], " ") != 0) continue;      # multi-token item
+        if (i > 1 && A[i] == A[i-1]) continue;    # repeats the previous item
+        candidates = candidates FS A[i];
+    }
+    # Remove the separator that precedes the first kept item.
+    print substr(candidates, length(FS) + 1);
+    return 0;
+}
+# Driver.  Records are separated by blank lines and fields by newlines.  An
+# entry spans five consecutive records -- id, doc, question, answer,
+# candidates -- and `state` names the record expected next.  A completed
+# entry is handed to finish() when the next id record arrives, or at end of
+# input.
+BEGIN {
+    RS = ORS = "\n\n";
+    FS = OFS = "\n";
+    state = "id";
+}
+state == "id" {
+    if (id) finish(id,doc,question,answer,candidates);
+    # The id is the first line not starting with "#"; if every line starts
+    # with "#", it is the last line.
+    for (i = 1; i <= NF; i++) {
+        id = $i;
+        if (id !~ /^[#]/) break;
+    }
+    state = "doc";
+    next;
+}
+state == "doc"        { doc = $0;        state = "question";   next; }
+state == "question"   { question = $0;   state = "answer";     next; }
+state == "answer"     { answer = $0;     state = "candidates"; next; }
+state == "candidates" { candidates = $0; state = "id";         next; }
+END { finish(id,doc,question,answer,candidates); }
diff --git a/filter-noAnswer.awk b/filter-noAnswer.awk
@@ -0,0 +1,30 @@
+# Return `thing` without leading and trailing space characters.
+# Only spaces are removed; tabs and newlines are left alone.
+function strip( thing ) {
+    gsub(/^ +| +$/, "", thing);
+    return thing;
+}
+# Print the id of an entry whose answer does not occur in its document.
+#   id, doc, question, answer, candidates: the five sections of an entry.
+#   A, n, i: unused locals.
+# The comparison is case-insensitive substring search on the stripped text.
+# If the answer is not found as written, it is retried with every "-"
+# replaced by a space before the id is reported.
+function finish( id,doc,question,answer,candidates,    A,n,i ) {
+    answer = tolower(strip(answer));
+    doc = tolower(strip(doc));
+
+    if (index(doc, answer) != 0) return;
+
+    gsub("-", " ", answer);
+    if (index(doc, answer) != 0) return;
+
+    print id;
+}
+# Driver.  Records are separated by blank lines and fields by newlines;
+# output lines end in a single newline (ORS is left at its default).  An
+# entry spans five consecutive records -- id, doc, question, answer,
+# candidates -- and `state` names the record expected next.  A completed
+# entry is handed to finish() when the next id record arrives, or at end of
+# input.
+BEGIN {
+    RS = "\n\n";
+    FS = OFS = "\n";
+    state = "id";
+}
+state == "id" {
+    if (id) finish(id,doc,question,answer,candidates);
+    # The id is the first line not starting with "#"; if every line starts
+    # with "#", it is the last line.
+    for (i = 1; i <= NF; i++) {
+        id = $i;
+        if (id !~ /^[#]/) break;
+    }
+    state = "doc";
+    next;
+}
+state == "doc"        { doc = $0;        state = "question";   next; }
+state == "question"   { question = $0;   state = "answer";     next; }
+state == "answer"     { answer = $0;     state = "candidates"; next; }
+state == "candidates" { candidates = $0; state = "id";         next; }
+END { finish(id,doc,question,answer,candidates); }
diff --git a/filter-yesAnswer.pl b/filter-yesAnswer.pl
@@ -0,0 +1,81 @@
+#!/usr/bin/perl
+# syntax:
+# perl filter-yesAnswer.pl no-answer.ids.txt output.txt > output.yes-answer.txt
+
+# Command-line arguments: the sorted list of ids to drop, then the dataset.
+my $idfn=shift;
+my $outputfn=shift;
+defined($outputfn)
+    or die "usage: perl filter-yesAnswer.pl no-answer.ids.txt output.txt > output.yes-answer.txt\n";
+
+# Three-argument open keeps the mode separate from the file name, so a name
+# with leading/trailing whitespace or mode characters (<, >, |) is opened
+# literally instead of being interpreted.
+open(my $idf,"<",$idfn) or die "Couldn't open $idfn for reading:\n$!\n";
+open(my $outputf,"<",$outputfn) or die "Couldn't open $outputfn for reading:\n$!\n";
+
+
+# Merge-style pass over the two inputs: copy every dataset entry to stdout
+# except those whose id equals an id from the skip list.
+# NOTE(review): this appears to assume both files are ordered by
+# (sheet, query) as compared by idlt() -- confirm.
+#
+# $state  index (0..4) of the blank-line-terminated section being read
+#         within the current entry; section 0 is the id
+# $buf    text of the current entry accumulated so far
+# $pass   idlt() result for the current entry id versus the current skip id
+# $id     id of the current entry
+# $n      number of dataset lines read (only used by the disabled check)
+my $state=0;
+my $buf="";
+my $pass=1;
+my $id="";
+my $n=0;
+
+while(<$idf>) {
+    chomp;
+    my $skip=$_;
+    # An entry id is already pending from the previous round: compare it
+    # against the new skip id first, and fetch the next skip id while the
+    # pending entry still sorts after it.
+    if (length($id)>0) {
+	$pass=idlt($id,$skip);
+	#print "### Reading new skip: $id, $skip => $pass\n";
+	if ($pass<0) {
+	    next;
+	}
+    }
+    while(<$outputf>) {
+	$n++;
+	#print "# '$_'\n";
+	# A blank line ends the current section.
+	if ($_ =~ /^\n$/) {
+	    #print "## state transition: $state\n";
+	    # state transition
+	    if ($state == 0) {
+		$state++;
+		# then we just finished id. Check it and set pass bit
+		$id=$buf;
+		chomp($id);
+		$pass=idlt($id,$skip);
+		$buf = $buf . $_;
+		#print "### Current pass: $id, $skip => $pass\n";
+		if ($pass<0) {
+		    # break to outer loop to fetch another skip id
+		    last;
+		}
+	    } elsif ($state == 4) { 
+		# Last section finished: emit the entry only when its id sorts
+		# before the skip id; an entry equal to the skip id is dropped.
+		$buf = $buf . $_;
+		$state=0;
+		if ($pass==1) { print $buf; }
+		$buf="";
+	    } else { 
+		$buf = $buf . $_;
+		$state++; 
+	    }
+	} else { 	
+	    $buf = $buf . $_;
+	}
+
+	#($n<20000) or die "failed to parse";
+    }
+}
+# Skip list exhausted: flush the partially buffered entry, then copy the
+# rest of the dataset through unchanged.
+if (length($buf)>0) { print $buf; }
+while(<$outputf>) {
+    print $_;
+}
+close($idf);
+close($outputf);
+
+# Compare two ids shaped like "s<sheet>q<query>" numerically, first by sheet
+# and then by query.
+# Returns 1 when $id sorts before $skip, 0 when both parts are equal and
+# -1 otherwise.
+sub idlt{
+    my ($id, $skip) = @_;
+    # Drop the leading letter, then split "<sheet>q<query>" at the "q".
+    my ($sheet, $query)         = split("q", substr($id, 1));
+    my ($skipsheet, $skipquery) = split("q", substr($skip, 1));
+
+    return  1 if $sheet < $skipsheet;
+    return -1 if $sheet > $skipsheet;
+    return  1 if $query < $skipquery;
+    return  0 if $query == $skipquery;
+    return -1;
+}
+
+
+
diff --git a/lib/SSquad.jar b/lib/SSquad.jar
diff --git a/lib/jericho-html-3.4.jar b/lib/jericho-html-3.4.jar
diff --git a/lib/lucene-core-6.2.1.jar b/lib/lucene-core-6.2.1.jar
diff --git a/lib/models/english-left3words-distsim.tagger b/lib/models/english-left3words-distsim.tagger
diff --git a/lib/models/english-left3words-distsim.tagger.props b/lib/models/english-left3words-distsim.tagger.props
@@ -0,0 +1,35 @@
+## tagger training invoked at Tue Feb 25 04:12:25 PST 2014 with arguments:
+                   model = english-left3words-distsim.tagger
+                    arch = left3words,naacl2003unknowns,wordshapes(-1,1),distsim(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1),distsimconjunction(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1)
+            wordFunction = edu.stanford.nlp.process.AmericanizeFunction
+               trainFile = /u/nlp/data/pos-tagger/english/train-wsj-0-18;/u/nlp/data/pos-tagger/english/train-extra-english;/u/nlp/data/pos-tagger/english/train-tech-english
+         closedClassTags = 
+ closedClassTagThreshold = 40
+ curWordMinFeatureThresh = 2
+                   debug = false
+             debugPrefix = 
+            tagSeparator = _
+                encoding = UTF-8
+              iterations = 100
+                    lang = english
+    learnClosedClassTags = false
+        minFeatureThresh = 2
+           openClassTags = 
+rareWordMinFeatureThresh = 10
+          rareWordThresh = 5
+                  search = owlqn
+                    sgml = false
+            sigmaSquared = 0.0
+                   regL1 = 0.75
+               tagInside = 
+                tokenize = true
+        tokenizerFactory = 
+        tokenizerOptions = 
+                 verbose = false
+          verboseResults = true
+    veryCommonWordThresh = 250
+                xmlInput = 
+              outputFile = 
+            outputFormat = slashTags
+     outputFormatOptions = 
+                nthreads = 1
diff --git a/lib/slf4j-api.jar b/lib/slf4j-api.jar
diff --git a/lib/slf4j-simple.jar b/lib/slf4j-simple.jar
diff --git a/lib/stanford-parser.jar b/lib/stanford-parser.jar
diff --git a/lib/stanford-postagger-3.5.2.jar b/lib/stanford-postagger-3.5.2.jar
diff --git a/postprocess.awk b/postprocess.awk
@@ -0,0 +1,28 @@
+# Return `thing` without leading and trailing space characters.
+# Only spaces are removed; tabs and newlines are left alone.
+function strip( thing ) {
+    gsub(/^ +| +$/, "", thing);
+    return thing;
+}
+# Print one entry as five output records: id, doc, question, answer,
+# candidates.  The document and answer have surrounding spaces removed; the
+# other sections are printed unchanged.
+#   A, n, i: unused locals.
+function finish( id,doc,question,answer,candidates,    A,n,i ) {
+    # Joining with ORS and letting print append the final ORS gives the same
+    # bytes as five separate print statements.
+    print id ORS strip(doc) ORS question ORS strip(answer) ORS candidates;
+}
+# Driver.  Records are separated by blank lines and fields by newlines.  An
+# entry spans five consecutive records -- id, doc, question, answer,
+# candidates -- and `state` names the record expected next.  A completed
+# entry is handed to finish() when the next id record arrives, or at end of
+# input.
+#
+# Cleanup applied while reading:
+#   answer      a final "." preceded by one or more spaces is removed
+#   candidates  every "." preceded by spaces and followed by a TAB is
+#               removed, keeping the TAB
+# The patterns are regex literals on purpose: inside a string constant the
+# backslash of "\." is dropped by gawk, which turns the dot into "any
+# character" and would also strip a trailing single-character token.
+BEGIN { RS=ORS="\n\n";FS=OFS="\n";state="id"; }
+{
+    if (state == "id") {
+        if (id) {
+            finish(id,doc,question,answer,candidates);
+        }
+        # The id is the first line not starting with "#"; if every line
+        # starts with "#", it is the last line.
+        for(i=1;i<=NF;i++) {id=$i; if (id !~ /^[#]/) break;}
+        state="doc";
+    }
+    else if (state == "doc") { doc=$0; state="question"; }
+    else if (state == "question") { question=$0; state="answer"; }
+    else if (state == "answer") { answer=$0; sub(/  *\.$/,"",answer); state="candidates"; }
+    else if (state == "candidates") { candidates=$0; gsub(/  *\.\t/,"\t",candidates); state="id"; }
+}
+END { finish(id,doc,question,answer,candidates); }