Skip to content

Commit

Permalink
Add simple Spacy support
Browse files Browse the repository at this point in the history
Change-Id: I37ec0dce14ca456c8a4804dc9dd198c3d153b359
  • Loading branch information
kupietz authored and Akron committed Mar 20, 2024
1 parent a351837 commit b8c5382
Show file tree
Hide file tree
Showing 10 changed files with 361 additions and 7 deletions.
3 changes: 3 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
0.53 2024-03-20
- Added Spacy support. (kupietz)

0.52 2023-01-23
- Introduced 'quiet' flag.

Expand Down
5 changes: 4 additions & 1 deletion Readme.pod
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,9 @@ L<Krill|https://github.com/KorAP/Krill>.
#Lemma
#Morpho

Spacy
#Morpho

Talismane
#Dependency
#Morpho
Expand Down Expand Up @@ -613,7 +616,7 @@ Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>

Author: L<Nils Diewald|https://www.nils-diewald.de/>

Contributor: Eliza Margaretha
Contributors: Eliza Margaretha, Marc Kupietz

L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
Expand Down
47 changes: 47 additions & 0 deletions lib/KorAP/XML/Annotation/Spacy/Morpho.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package KorAP::XML::Annotation::Spacy::Morpho;
use KorAP::XML::Annotation::Base;

# Parse the spaCy morpho layer and add part-of-speech and lemma terms
# to the token stream.
#
# The callback passed to add_tokendata() receives the token stream and
# one annotated token. It reads the nested feature structure
# (fs -> f -> fs -> f) of the token and adds
#   spacy/p:<value>  for a feature named "pos"
#   spacy/l:<value>  for a feature named "lemma"
# to the multi-term token at the position of the annotated token.
# Features with any other name, without a name, or without text are
# ignored.
#
# Returns 1 if add_tokendata() returned a true value, nothing otherwise.
sub parse {
  my $self = shift;

  # Term prefix for every feature name that is indexed
  my %prefix_of = (
    pos   => 'spacy/p:',
    lemma => 'spacy/l:'
  );

  $$self->add_tokendata(
    foundry => 'spacy',
    layer   => 'morpho',
    cb      => sub {
      my ($stream, $token) = @_;
      my $mtt = $stream->pos($token->get_pos);

      my $content = $token->get_hash->{fs}->{f};

      my $array = $content->{fs}->{f} or return;

      # In case there is only a lemma/pos ...
      $array = [$array] if ref $array ne 'ARRAY';

      foreach my $f (@$array) {
        my $name   = $f->{-name}       or next;
        my $prefix = $prefix_of{$name} or next;

        # Test for defined and non-empty instead of plain truth,
        # so that a value like "0" (e.g. the lemma of the token "0")
        # is indexed and not silently dropped
        my $value = $f->{'#text'};
        next unless defined $value && length $value;

        $mtt->add_by_term($prefix . $value);
      }
    }
  ) or return;

  return 1;
}

# Layer descriptors contributed by this annotation:
# lemma (l) and part-of-speech (p) terms, both attached to tokens.
# Returns a fresh array reference on every call.
sub layer_info {
  return [qw(spacy/l=tokens spacy/p=tokens)];
}

1;
2 changes: 1 addition & 1 deletion lib/KorAP/XML/Krill.pm
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use Exporter 'import';

our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);

our $VERSION = '0.52';
our $VERSION = '0.53';

has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
Expand Down
12 changes: 11 additions & 1 deletion script/korapxml2krill
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,12 @@ use Fcntl qw(:flock SEEK_END);
# 2023/02/13
# - Fix temporary-extract handling from configuration file.
#
# 2024/03/20
# - Added Spacy support.
#
# ----------------------------------------------------------

our $LAST_CHANGE = '2023/05/16';
our $LAST_CHANGE = '2024/03/20';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
Expand Down Expand Up @@ -514,6 +517,10 @@ push(@layers,
['Sgbr', 'Lemma'],
['Sgbr', 'Morpho']);

# Spacy
push(@layers,
['Spacy', 'Morpho']);

# Talismane
push(@layers,
['Talismane', 'Dependency'],
Expand Down Expand Up @@ -1521,6 +1528,9 @@ L<Krill|https://github.com/KorAP/Krill>.
#Lemma
#Morpho
Spacy
#Morpho
Talismane
#Dependency
#Morpho
Expand Down
206 changes: 206 additions & 0 deletions t/annotation/corpus/doc/0001/spacy/morpho.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<layer docid="Corpus_Doc.0001" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
<spanList>
<span id="s1_n1" from="0" to="3">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">ADP</f>
<f name="lemma">zu</f>
</fs>
</f>
</fs>
</span>
<span id="s1_n2" from="4" to="11">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">ADJ</f>
<f name="lemma">letzter</f>
</fs>
</f>
</fs>
</span>
<span id="s1_n3" from="12" to="23">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">ADJ</f>
<f name="lemma">kulturell</f>
</fs>
</f>
</fs>
</span>
<span id="s1_n4" from="24" to="30">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NOUN</f>
<f name="lemma">Anlass</f>
</fs>
</f>
</fs>
</span>
<span id="s2_n1" from="31" to="35">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">VERB</f>
<f name="lemma">laden</f>
</fs>
</f>
</fs>
</span>
<span id="s2_n2" from="36" to="39">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">DET</f>
<f name="lemma">der</f>
</fs>
</f>
</fs>
</span>
<!-- NOTE(review): this span has empty from/to offsets, presumably on purpose to exercise handling of spans without a position; confirm before "fixing" the fixture. -->
<span id="s2_n3" from="" to="">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NOUN</f>
<f name="lemma">Leitung</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n1" from="48" to="51">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">DET</f>
<f name="lemma">der</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n2" from="52" to="63">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NOUN</f>
<f name="lemma">Schulheim</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n3" from="64" to="73">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">PROPN</f>
<f name="lemma">Hofbergli</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n4" from="74" to="77">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">ADV</f>
<f name="lemma">ein</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n5" from="77" to="78">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">PUNCT</f>
<f name="lemma">--</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n6" from="79" to="84">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">SCONJ</f>
<f name="lemma">bevor</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n7" from="85" to="88">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">DET</f>
<f name="lemma">der</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n8" from="89" to="96">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NOUN</f>
<f name="lemma">Betrieb</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n9" from="97" to="101">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NOUN</f>
<f name="lemma">Ende</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n10" from="102" to="111">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">NOUN</f>
<f name="lemma">Schuljahr</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n11" from="112" to="123">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">VERB</f>
<f name="lemma">einstellen</f>
</fs>
</f>
</fs>
</span>
<span id="s3_n12" from="124" to="128">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">AUX</f>
<f name="lemma">werden</f>
</fs>
</f>
</fs>
</span>
<!-- NOTE(review): this span uses offsets 48-51, the same as s3_n1 above, while carrying a PUNCT annotation; presumably a deliberate edge case (second annotation on an already annotated position), to be confirmed. -->
<span id="s4_n1" from="48" to="51">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">PUNCT</f>
<f name="lemma">--</f>
</fs>
</f>
</fs>
</span>
</spanList>
</layer>
55 changes: 55 additions & 0 deletions t/annotation/spacy_morpho.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env perl
# Tests for KorAP::XML::Annotation::Spacy::Morpho.
#
# Loads the tokens of test document '0001' via TestInit, adds the
# Spacy/Morpho annotation and checks the foundry and layer information
# as well as selected terms of the resulting token stream.
use strict;
use warnings;
use utf8;
use Test::More;
use KorAP::XML::Annotation::Spacy::Morpho;
use lib 't/annotation';
use TestInit;

ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');

ok($tokens->add('Spacy', 'Morpho'), 'Add Spacy morpho');

my $data = $tokens->to_data->{data};

# Foundry and layer information
like($data->{foundries}, qr!spacy/morpho!, 'Foundry');
like($data->{layerInfos}, qr!spacy/p=tokens!, 'Layer info (POS)');
like($data->{layerInfos}, qr!spacy/l=tokens!, 'Layer info (lemma)');

# Lemma and POS terms at selected stream positions.
# The first token is checked at term indices 5 and 6,
# all other tokens at term indices 3 and 4.
is($data->{stream}->[0]->[5], 'spacy/l:zu', 'Lemma');
is($data->{stream}->[0]->[6], 'spacy/p:ADP', 'POS');

is($data->{stream}->[3]->[3], 'spacy/l:Anlass', 'Lemma');
is($data->{stream}->[3]->[4], 'spacy/p:NOUN', 'POS');

is($data->{stream}->[10]->[3], 'spacy/l:ein', 'Lemma');
is($data->{stream}->[10]->[4], 'spacy/p:ADV', 'POS');

is($data->{stream}->[13]->[3], 'spacy/l:Betrieb', 'Lemma');

is($data->{stream}->[-1]->[3], 'spacy/l:werden', 'Lemma');
is($data->{stream}->[-1]->[4], 'spacy/p:AUX', 'POS');

is($data->{stream}->[11]->[3], 'spacy/l:bevor', 'Lemma');
is($data->{stream}->[11]->[4], 'spacy/p:SCONJ', 'POS');

# Surface forms of the final tokens
is($data->{stream}->[12]->[1], 'i:der', 'Surface');
is($data->{stream}->[13]->[1], 'i:betrieb', 'Surface');
is($data->{stream}->[14]->[1], 'i:ende', 'Surface');
is($data->{stream}->[15]->[1], 'i:schuljahr', 'Surface');
is($data->{stream}->[16]->[1], 'i:eingestellt', 'Surface');
is($data->{stream}->[17]->[1], 'i:wird', 'Surface');

# The stream ends after the 18th token
ok(!$data->{stream}->[18], 'Nothing');

is(scalar(@{$data->{stream}}), 18, 'Length');

done_testing;

__END__
Binary file added t/corpus/archives/wpd15-single.spacy.zip
Binary file not shown.
2 changes: 1 addition & 1 deletion t/script/archive.t
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ my ($json_1, $json_2);
ok(($json_1 = decode_json $file), 'decode json');

is($json_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
is($json_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');

ok(-f $json_2, 'Json file exists');
Expand Down
Loading

0 comments on commit b8c5382

Please sign in to comment.