Skip to content

Commit

Permalink
Accept "pos" as an alias for ctag and default certainty to 1 for TreeTagger
Browse files Browse the repository at this point in the history

Fixes CoNLL-U-TreeTagger compatibility.

Change-Id: I6301b3d826da8330ee33d83a286f765b08af04b6
  • Loading branch information
kupietz committed Mar 20, 2024
1 parent b8c5382 commit 7fe9cd9
Show file tree
Hide file tree
Showing 9 changed files with 374 additions and 6 deletions.
4 changes: 4 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
0.53 2023-03-20
- Added Spacy support. (kupietz)
- Support 'pos' as an alternative to 'ctag'
in TreeTagger. (kupietz)
- Change default certainty value in TreeTagger
from 0 to 1.

0.52 2023-01-23
- Introduced 'quiet' flag.
Expand Down
4 changes: 2 additions & 2 deletions lib/KorAP/XML/Annotation/TreeTagger/Morpho.pm
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ sub parse {
$content = $fs->{fs}->{f};

my @val;
my $certainty = 0;
my $certainty = 1;
foreach (@$content) {
if ($_->{-name} eq 'certainty') {

Expand Down Expand Up @@ -54,7 +54,7 @@ sub parse {
};

# pos
if (($_->{-name} eq 'ctag') && ($found = $_->{'#text'})) {
if (($_->{-name} eq 'ctag' || $_->{-name} eq 'pos') && ($found = $_->{'#text'})) {
$pos{$found} += $certainty // 1;
};
};
Expand Down
7 changes: 7 additions & 0 deletions t/annotation/corpus/doc/0003/data.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>

<raw_text docid="Corpus_Doc.0003" xmlns="http://ids-mannheim.de/ns/KorAP">
<metadata file="metadata.xml" />
<text>Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, bevor der Betrieb Ende Schuljahr eingestellt wird.</text>
</raw_text>
66 changes: 66 additions & 0 deletions t/annotation/corpus/doc/0003/header.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
<idsHeader type="text" pattern="text" status="new" version="1.1" TEIform="teiHeader">
<fileDesc>
<titleStmt>
<textSigle>Corpus/Doc.Text</textSigle>
<t.title assemblage="regular"/>
</titleStmt>
<publicationStmt>
<distributor/>
<pubAddress/>
<availability region="world" status="unknown"/>
<pubDate/>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<h.title type="main">Beispiel Text</h.title>
<h.title type="sub">Beispiel Text Untertitel</h.title>
<h.author>Mustermann, Max</h.author>
<editor>Monika Mustermann</editor>
<imprint/>
<biblScope type="pp"/>
<biblScope type="suppl"/>
<biblScope type="suppltitle"/>
<biblNote n="1"/>
</analytic>
<monogr>
<h.title type="main">Beispiel Text</h.title>
<h.title type="sub">Best of!</h.title>
<h.author>Mustermann, Max</h.author>
<editor>Monika Mustermann</editor>
<imprint>
<publisher>Artificial articles Inc.</publisher>
<pubDate type="year">2001</pubDate>
<pubDate type="month">04</pubDate>
<pubDate type="day">02</pubDate>
<pubPlace>Mannheim</pubPlace>
</imprint>
<biblScope type="issue"/>
<biblScope type="issueplace"/>
</monogr>
</biblStruct>
<reference type="complete" assemblage="regular"/>
<reference type="short" assemblage="regular"/>
</sourceDesc>
</fileDesc>
<profileDesc>
<creation>
<creatDate>1999.06.01</creatDate>
</creation>
<textClass>
<catRef n="1" target="topic.freizeit-unterhaltung.vereine-veranstaltungen" scheme="topic"/>
<h.keywords>
<keyTerm/>
</h.keywords>
</textClass>
<textDesc>
<textType>Zeitung: Tageszeitung</textType>
<textTypeArt>Bericht</textTypeArt>
<textDomain/>
<column/>
</textDesc>
</profileDesc>
</idsHeader>
24 changes: 24 additions & 0 deletions t/annotation/corpus/doc/0003/opennlp/tokens.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="Corpus_Doc.0003" VERSION="KorAP-0.4">
<spanList>
<span id="s_7" from="0" to="3"/>
<span id="s_8" from="4" to="11"/>
<span id="s_9" from="12" to="23"/>
<span id="s_10" from="24" to="30"/>
<span id="s_11" from="31" to="35"/>
<span id="s_12" from="36" to="39"/>
<span id="s_13" from="40" to="47"/>
<span id="s_14" from="48" to="51"/>
<span id="s_15" from="52" to="63"/>
<span id="s_16" from="64" to="73"/>
<span id="s_17" from="74" to="77"/>
<span id="s_18" from="77" to="78"/>
<span id="s_19" from="79" to="84"/>
<span id="s_20" from="85" to="88"/>
<span id="s_21" from="89" to="96"/>
<span id="s_22" from="97" to="101"/>
<span id="s_23" from="102" to="111"/>
<span id="s_24" from="112" to="123"/>
<span id="s_25" from="124" to="128"/>
<span id="s_26" from="128" to="129"/>
</spanList>
</layer>
206 changes: 206 additions & 0 deletions t/annotation/corpus/doc/0003/tree_tagger/morpho.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<layer docid="Corpus_Doc.0003" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
<spanList>
<span id="s1_n1" from="0" to="3">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">APPRART</f>
<f name="lemma">zu+die</f>
</fs>
</f>
</fs>
</span>
<span id="s1_n2" from="4" to="11">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">ADJA</f>
<f name="lemma">letzt</f>
</fs>
</f>
</fs>
</span>
<span id="s1_n3" from="12" to="23">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">ADJA</f>
<f name="lemma">kulturell</f>
</fs>
</f>
</fs>
</span>
<span id="s1_n4" from="24" to="30">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NN</f>
<f name="lemma">Anlass</f>
</fs>
</f>
</fs>
</span>
<span id="s2_n1" from="31" to="35">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">VVFIN</f>
<f name="lemma">laden</f>
</fs>
</f>
</fs>
</span>
<span id="s2_n2" from="36" to="39">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">ART</f>
<f name="lemma">die</f>
</fs>
</f>
</fs>
</span>
<span id="s2_n3" from="" to="">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NN</f>
<f name="lemma">Leitung</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n1" from="48" to="51">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">ART</f>
<f name="lemma">die</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n2" from="52" to="63">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NN</f>
<f name="lemma">Schulheim</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n3" from="64" to="73">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NN</f>
<f name="lemma">&lt;unknown&gt;</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n4" from="74" to="77">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">PTKVZ</f>
<f name="lemma">ein</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n5" from="77" to="78">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">$,</f>
<f name="lemma">,</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n6" from="79" to="84">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">KOUS</f>
<f name="lemma">bevor</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n7" from="85" to="88">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">ART</f>
<f name="lemma">die</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n8" from="89" to="96">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NN</f>
<f name="lemma">Betrieb</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n9" from="97" to="101">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NN</f>
<f name="lemma">Ende</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n10" from="102" to="111">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NN</f>
<f name="lemma">Schuljahr</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n11" from="112" to="123">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">VVPP</f>
<f name="lemma">einstellen</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n12" from="124" to="128">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">VAFIN</f>
<f name="lemma">werden</f>
</fs>
</f>
</fs>
</span>
<span id="s4_n1" from="48" to="51">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">$.</f>
<f name="lemma">.</f>
</fs>
</f>
</fs>
</span>
</spanList>
</layer>
24 changes: 24 additions & 0 deletions t/annotation/corpus/doc/0003/tree_tagger/tokens.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="Corpus_Doc.0003" VERSION="KorAP-0.4">
<spanList>
<span id="s_7" from="0" to="3"/>
<span id="s_8" from="4" to="11"/>
<span id="s_9" from="12" to="23"/>
<span id="s_10" from="24" to="30"/>
<span id="s_11" from="31" to="35"/>
<span id="s_12" from="36" to="39"/>
<span id="s_13" from="40" to="47"/>
<span id="s_14" from="48" to="51"/>
<span id="s_15" from="52" to="63"/>
<span id="s_16" from="64" to="73"/>
<span id="s_17" from="74" to="77"/>
<span id="s_18" from="77" to="78"/>
<span id="s_19" from="79" to="84"/>
<span id="s_20" from="85" to="88"/>
<span id="s_21" from="89" to="96"/>
<span id="s_22" from="97" to="101"/>
<span id="s_23" from="102" to="111"/>
<span id="s_24" from="112" to="123"/>
<span id="s_25" from="124" to="128"/>
<span id="s_26" from="128" to="129"/>
</spanList>
</layer>
Loading

0 comments on commit 7fe9cd9

Please sign in to comment.