-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_wiki.rb
65 lines (56 loc) · 2.21 KB
/
parse_wiki.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
require 'nokogiri'
require 'open-uri'
# Fetch and parse the Wikipedia page.
url = 'https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes'
# The block form of URI.open closes the response stream as soon as the block
# returns, and yields the block's value (the parsed document) as its result.
doc = URI.open(url) { |page| Nokogiri::HTML(page) }
# Locate the first table with class 'wikitable'
table = doc.at('table.wikitable')
# `at` returns nil when nothing matches; stop here with a clear message rather
# than failing later with a NoMethodError on nil.
raise "No table.wikitable found at #{url}" unless table
# Build the language records: one hash per data row of the table.
#
# Keys are listed in the order their cells are read from each row (cell 0 is
# :name, cell 9 is :notes). Every value is a String: the stripped cell text, or
# '' when the row has fewer cells than expected.
columns = %i[
  name iso_639_1 iso_639_2t iso_639_2b iso_639_3
  scope type_of endonyms other_names notes
]

# Escapes text for embedding between single quotes in generated Ruby source.
# Inside a single-quoted literal only \ and ' are special, so both must be
# prefixed with a backslash. The block form of gsub is used so the replacement
# is taken literally instead of being scanned for backreferences such as \'.
escape_for_single_quotes = ->(text) { text.gsub(/[\\']/) { |char| "\\#{char}" } }

languages = table.css('tr')
                 .map { |row| row.css('td') }
                 .reject(&:empty?) # skip header or empty rows (no <td> cells)
                 .map do |cells|
  columns.each_with_index.map do |column, index|
    text = cells[index]&.text&.strip || ''
    # Take only the first 3 characters of the ISO 639-3 cell
    text = text.slice(0, 3) if column == :iso_639_3
    [column, escape_for_single_quotes.call(text)]
  end.to_h
end
# Generate languages.seeds.rb: a Ruby array literal named `languages` holding
# one hash per record, followed by a loop that passes each hash to
# Language.create!. Values are interpolated between single quotes exactly as
# stored, so they are expected to be escaped already.
seed_keys = %i[
  name iso_639_1 iso_639_2t iso_639_2b iso_639_3
  scope type_of endonyms other_names notes
]

File.open('languages.seeds.rb', 'w') do |seeds|
  seeds << "languages = [\n"
  languages.each do |entry|
    pairs = seed_keys.map { |key| "#{key}: '#{entry[key]}'" }
    seeds << " { #{pairs.join(', ')} },\n"
  end
  seeds << "]\n\n"
  seeds << "languages.each do |language|\n"
  seeds << " Language.create!(language)\n"
  seeds << "end\n"
end

puts 'languages.seeds.rb file has been generated.'