-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtalbanken_xml_morphsplit.rb
62 lines (49 loc) · 1.56 KB
/
talbanken_xml_morphsplit.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
### Applying the MorphSplit to talbanken.xml. Aleksandrs Berdicevskis, 2020-06-09 ###
# Usage reminder, printed to standard error on every run (the script takes no
# command-line arguments).
STDERR.puts "Usage: ruby talbanken_xml_morphsplit.rb. The files talbanken_morphsplit.tsv and talbanken.xml have to be in the same directory."

# The gem's entry file is the lowercase "nokogiri". The capitalised spelling
# only resolves on case-insensitive filesystems and raises LoadError elsewhere.
require "nokogiri"

# Treebank to split; looked up relative to the current working directory.
inputfilename = "talbanken.xml"

# Size figures. None of them is referenced by the visible code below; they
# presumably record total = dev + test (6160 = 3080 + 3080) -- TODO confirm.
@tbsize = 6160
devsize = 3080
testsize = 3080

# Step switches. Only split_tb is consulted by the visible code; the other two
# have no corresponding step in this file.
list_ids_from_xml = false
split_ids = false
split_tb = true
# Reads the sentence-id => dataset assignment from a tab-separated file.
#
# Every non-blank line is stripped of surrounding whitespace and split on
# tabs. The first field becomes the key and the second field the value (nil
# when the line has only one field). If a key occurs more than once, the last
# occurrence wins. Blank lines are ignored.
#
# @param filename [String] path of the TSV file; defaults to the file name the
#   script has always used, resolved against the current working directory
# @return [Hash] first field => second field for every non-blank line
# @raise [Errno::ENOENT] if the file does not exist
def readdatasplit(filename = "talbanken_morphsplit.tsv")
  datasplit = {}
  # Block form guarantees the handle is closed, even if reading raises.
  File.open(filename, "r:utf-8") do |ids|
    ids.each_line do |line|
      fields = line.strip.split("\t")
      # A blank line produces no fields; storing it would add a nil => nil entry.
      next if fields.empty?

      datasplit[fields[0]] = fields[1]
    end
  end
  datasplit
end
# Parses the XML file at +filename+ and returns the result of querying the
# first <text> element for its <sentence> descendants. Progress messages go to
# standard error.
#
# Only the first <text> element is searched. If the document contains none,
# the lookup yields nil and the subsequent query raises NoMethodError.
def extract_sentences(filename)
  STDERR.puts "Parsing xml..."
  document = Nokogiri::XML(File.read(filename))

  STDERR.puts "Searching xml..."
  first_text = document.css("text").first
  first_text.css("sentence")
end
# Main step: write every sentence of the treebank into talbanken_dev.xml or
# talbanken_test.xml, according to the assignment returned by readdatasplit.
# Each output file wraps its sentences in <corpus><text> ... </text></corpus>.
if split_tb
  sentences = extract_sentences(inputfilename)

  STDERR.puts "Creating the sets..."
  # Dataset name => open output file. One hash is enough: the name is needed
  # for the corpus id and the file for writing.
  outputs = {}
  begin
    %w[dev test].each do |name|
      outputs[name] = File.open("talbanken_#{name}.xml", "w:utf-8")
    end

    outputs.each do |name, file|
      file.puts "<corpus id = \"talbanken_#{name}\">"
      file.puts "<text>"
    end

    STDERR.puts "Reading the split data..."
    datasplit = readdatasplit

    STDERR.puts "Splitting..."
    sentences.each do |sentence|
      id = sentence["id"].to_s
      assigned = datasplit[id]
      # Fail with a message naming the sentence instead of an anonymous
      # NoMethodError on nil when the id is unassigned or has an unknown label.
      target = outputs.fetch(assigned) do
        raise KeyError, "sentence #{id.inspect} is assigned to #{assigned.inspect}; expected \"dev\" or \"test\""
      end
      target.puts sentence
    end

    outputs.each_value do |file|
      file.puts "</text>"
      file.puts "</corpus>"
    end
  ensure
    # Close (and thereby flush) whatever was opened, even after an error.
    outputs.each_value(&:close)
  end
end