Skip to content

Commit

Permalink
Introduce support for CorpusExplorer
Browse files Browse the repository at this point in the history
Change-Id: I2133463dbf5e851b371e46f4b9c76bba71611532
  • Loading branch information
Akron committed Jun 5, 2024
1 parent 24ad3c0 commit 5530a55
Show file tree
Hide file tree
Showing 20 changed files with 26,924 additions and 2 deletions.
3 changes: 3 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
0.56 2024-06-05
- Add support for CorpusExplorer.

0.55 2024-06-04
- Add support for xenodata to i5.

Expand Down
5 changes: 5 additions & 0 deletions Readme.pod
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
__END__

=pod

=encoding utf8
Expand Down Expand Up @@ -393,6 +395,9 @@ L<Krill|https://github.com/KorAP/Krill>.
#NamedEntities
#Sentences

CorpusExplorer
#Morpho

CMC
#Morpho

Expand Down
60 changes: 60 additions & 0 deletions lib/KorAP/XML/Annotation/CorpusExplorer/Morpho.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package KorAP::XML::Annotation::CorpusExplorer::Morpho;
use KorAP::XML::Annotation::Base;
use Mojo::Util qw'trim';

# Parse the CorpusExplorer morpho layer and add its annotations to
# the token stream.
#
# Registers a callback via add_tokendata() for the foundry
# 'corpusexplorer' and the layer 'morpho'. For every token the
# callback walks the nested feature structures and adds one term
# per supported feature to the token's position in the stream:
#
#   ctag   -> cex/p:<value>
#   lemma  -> cex/l:<value>
#   phrase -> cex/phrase:<value>
#
# Returns 1 on success and nothing if add_tokendata() returns a
# false value.
sub parse {
  my $self = shift;

  $$self->add_tokendata(
    foundry => 'corpusexplorer',
    layer   => 'morpho',
    cb      => sub {
      my ($stream, $token) = @_;
      my $mtt = $stream->pos($token->get_pos);

      my $content = $token->get_hash->{fs}->{f} or return;

      # A single child is not wrapped in an array - normalize
      $content = [$content] if ref $content ne 'ARRAY';

      # Iterate over feature structures
      foreach my $fs (@$content) {
        my $features = $fs->{fs}->{f} or next;

        # The nested feature list needs the same normalization,
        # otherwise a single feature would be dereferenced as an array
        $features = [$features] if ref $features ne 'ARRAY';

        foreach my $feature (@$features) {

          # NOTE(review): Values that are false in Perl (e.g. '0')
          # are skipped here - confirm that this is intended
          next unless $feature->{'#text'};
          my $value = trim($feature->{'#text'}) or next;

          # Features without a name are ignored
          my $name = $feature->{-name} // '';

          # POS
          if ($name eq 'ctag') {
            $mtt->add_by_term('cex/p:' . $value);
          }

          # Lemma
          elsif ($name eq 'lemma') {
            $mtt->add_by_term('cex/l:' . $value);
          }

          # Phrase
          elsif ($name eq 'phrase') {
            $mtt->add_by_term('cex/phrase:' . $value);
          };
        };
      };
    }) or return;

  return 1;
};

# List the layers provided by this annotation module.
#
# Returns a reference to a newly created array of layer
# descriptor strings, one each for the cex/p, cex/l and
# cex/phrase layers.
sub layer_info {
  my @layers = (
    'cex/p=tokens',
    'cex/l=tokens',
    'cex/phrase=tokens',
  );
  return \@layers;
};

1;
2 changes: 1 addition & 1 deletion lib/KorAP/XML/Krill.pm
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use Exporter 'import';

our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);

our $VERSION = '0.55';
our $VERSION = '0.56';

has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
Expand Down
4 changes: 4 additions & 0 deletions lib/KorAP/XML/Meta/I5.pm
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,10 @@ sub parse {
$key = 'A_';
my $title = $_->att('desc');
$value = $self->korap_data_uri($value, title => ($title // $value));
} elsif ($xtype eq 'number') {
$self->log->warn('Number currently not supported as xenodata type, treated as string');
$key = 'S_';
# Maybe render as Integer - but it's understood as a string
} else {
$self->log->warn('Unknown xenodata type: ' . $xtype);
return;
Expand Down
9 changes: 8 additions & 1 deletion script/korapxml2krill
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ use Fcntl qw(:flock SEEK_END);
# - Improve core count logging.
# ----------------------------------------------------------

our $LAST_CHANGE = '2024/06/04';
our $LAST_CHANGE = '2024/06/05';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
Expand Down Expand Up @@ -460,6 +460,10 @@ push(@layers,
['CoreNLP', 'Morpho'],
['CoreNLP', 'Constituency']);

# CorpusExplorer
push(@layers,
['CorpusExplorer', 'Morpho']);

# CMC
push(@layers, ['CMC', 'Morpho']);

Expand Down Expand Up @@ -1501,6 +1505,9 @@ L<Krill|https://github.com/KorAP/Krill>.
#NamedEntities
#Sentences
CorpusExplorer
#Morpho
CMC
#Morpho
Expand Down
Loading

0 comments on commit 5530a55

Please sign in to comment.