-
Notifications
You must be signed in to change notification settings - Fork 1
/
join_lines.pl
38 lines (33 loc) · 971 Bytes
/
join_lines.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/perl
# Joins the lines of an XML-tagged text stream read from STDIN and writes the
# result to STDOUT.
#
# This file takes as input the output of filter_metainfo_from_cclines.pl
#
# What the script does with each input line:
#   * Non-printing characters are stripped.
#   * A line that is exactly the XML declaration is echoed verbatim, with no
#     line break after it.
#   * A line that still contains non-whitespace text once its XML tags are
#     removed starts a new output line.
#   * A line made up only of XML tags and whitespace is appended to the
#     current output line.
# The output always ends with a single newline, even for empty input.
use strict;
use warnings;
use utf8;
use open qw(:std :utf8); # This also works for the Diamond Operator
binmode STDOUT, ':utf8';

# Only a line that is exactly this string is treated as the XML declaration.
my $XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>';

# Text of the previously printed line. Initialised to the empty string so the
# pattern match in the first loop iteration does not operate on undef.
my $lastx = '';

# True until the first line with real text content has been seen.
my $firstrun = 1;

while (my $x = <STDIN>) {
    chomp $x;
    $x =~ s/[^[:print:]]//g; # Remove non-printing characters

    if ($x eq $XML_DECLARATION) {
        print $x;
        next;
    }

    # $y is the line with every XML tag removed; it is used only to decide
    # whether the line carries text of its own.
    my $y = $x;
    $y =~ s/<.*?>//g;

    # A line with something other than XML tags and whitespace begins a new
    # output line. The very first such line gets no leading newline, so the
    # output does not start with a line break and any tag-only lines seen
    # before it end up on the same output line as that first text.
    if ($y =~ /\S/) {
        print "\n" unless $firstrun;
        $firstrun = 0;
    }

    ### FIX FOR ERROR IN CORENLP 3.7.0 discussed here: https://github.com/stanfordnlp/CoreNLP/issues/401
    # Keep a space between an underscore and a directly following '<',
    # both across two joined lines and inside a single line.
    if (($lastx =~ /_$/) && ($x =~ /^</)) {
        print " ";
    }
    $x =~ s/_</_ </g;
    ### END FIX (and $lastx becomes unnecessary when the fix is removed)

    print $x;
    $lastx = $x;
}

print "\n";