-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse-language-data.js
114 lines (101 loc) · 3.68 KB
/
parse-language-data.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
const fs = require('fs')
const CsvParser = require('csv-parse/lib/sync')
// Shared accumulators used throughout this script. They are declared with
// `const` because the bindings are never reassigned; only their contents grow.
//   output:        country code (`short2` column) -> { countryName, languages, populationCount }
//   countryLookup: lower-cased, trimmed country name -> { code }
//   langLookUp:    lower-cased English language name -> language code
const output = {}
const countryLookup = {}
const langLookUp = {}

// Read the country-code table. Every row seeds one record in `output` and one
// name -> code entry in `countryLookup`, so that other data sets, which refer
// to countries by name, can be matched back to a code.
const countryCodesCSV = fs.readFileSync('./research/country-codes.csv', 'utf8')
CsvParser(countryCodesCSV, {
  // Rows come back as objects keyed by the CSV header names.
  columns: true,
  skip_empty_lines: true,
}).forEach(items => {
  output[items['short2']] = {
    countryName: items.fullname,
    languages: {},
    // Stays null unless a matching population row is found later.
    populationCount: null,
  }
  // Keys are normalised (lower case, trimmed) so lookups are case-insensitive.
  countryLookup[items.fullname.toLowerCase().trim()] = {
    code: items.short2,
  }
})
// Fill langLookUp from the language-code table:
// lower-cased English language name ('lang en' column) -> code ('short2' column).
const langCodesCSV = fs.readFileSync('./research/language-codes.csv', 'utf8')
const parsedCodes = CsvParser(langCodesCSV, {
  columns: true,
  skip_empty_lines: true,
})
for (const row of parsedCodes) {
  // e.g. the row for English ends up as langLookUp['english'] = 'en'
  langLookUp[row['lang en'].toLowerCase()] = row['short2']
}
// Add population counts per country in output.
// Each line is split on ','; only lines with exactly two fields
// (`<country name>,<count>`) contribute a count. The parsed number is
// multiplied by 1000 before being stored.
// NOTE(review): presumably the source figures are in thousands - confirm.
const populations = fs.readFileSync('./research/refined-data/world-populations.csv', 'utf8').split('\n')
populations.forEach(function (element) {
  const pairing = element.split(',')
  const country = pairing[0].trim().toLowerCase()
  let count = null
  if (pairing.length === 2) {
    // Strip all whitespace, then take the trailing run of digits.
    const digits = pairing[1].replace(/[\s]+/g, '').match(/[\d]+$/)
    // match() yields null when the field has no trailing digits. Keep `null`
    // ("unknown") in that case instead of letting parseInt produce NaN.
    if (digits !== null) {
      count = parseInt(digits[0], 10) * 1000
    }
  }
  // Rows whose country name is not in the country-code table are ignored.
  if (countryLookup[country]) {
    output[countryLookup[country].code]['populationCount'] = count
  }
})
/**
 * CIA language census.
 *
 * The text file is read line by line. A line that does not start with a space
 * is taken as a country heading; a line that does start with a space is a
 * language entry belonging to the most recent heading.
 */
const ciaLangCensus = fs.readFileSync('./research/refined-data/cia-language-census.txt', 'utf8').split('\n')

// Index of every heading line (first ':' removed), each mapped to an empty object.
// NOTE(review): nothing in the visible code reads this lookup afterwards.
const ciaLangCensusLookup = {}
for (const heading of ciaLangCensus) {
  if (heading[0] === ' ') {
    continue
  }
  ciaLangCensusLookup[heading.replace(':', '')] = {}
}

// State carried from a heading line to the language lines that follow it.
let countryName = ''
let countryCode = null
for (const line of ciaLangCensus) {
  if (line[0] !== ' ') {
    // New country heading: remember its name and resolve its code (null if unknown).
    countryName = line.replace(':', '').trim().toLowerCase()
    const known = countryLookup[countryName]
    countryCode = known ? known.code : null
    continue
  }
  if (!countryCode) {
    // Language line under a country we could not resolve: skip it.
    continue
  }
  // Collapse every whitespace run to ':' and split, giving [language, percentage, ...].
  const languageSet = line.trim().replace(/[\s]+/g, ':').split(':')
  const [language, rawPercentage] = languageSet
  const langCode = langLookUp[language.toLowerCase()]
  if (!langCode) {
    continue
  }
  const percentage = rawPercentage ? parseFloat(rawPercentage) : null
  // Any earlier entry for this language is replaced by a fresh one holding only CIA data.
  output[countryCode].languages[langCode] = { CIA: { percentage } }
  if (percentage === null) {
    // Report entries that carried no percentage value.
    console.log(languageSet)
  }
}
/**
 * United Nations language census.
 *
 * Only rows with Area === 'Total' and Sex === 'Both Sexes' are kept. For each
 * kept row whose country and language both resolve to known codes, the parsed
 * 'Value' column is stored as the UN speaker count for that country/language.
 */
const UNdata = fs.readFileSync('./research/refined-data/UNdata_Export_20180311_154013997.csv', 'utf8')
const isTotalBothSexes = rec => rec['Area'] === 'Total' && rec['Sex'] === 'Both Sexes'
const UNrecords = CsvParser(UNdata, {
  columns: true,
  skip_empty_lines: true,
}).filter(isTotalBothSexes)
for (const record of UNrecords) {
  // Resolve both names through the lower-cased lookup tables; null means "unknown".
  const countryEntry = countryLookup[record['Country or Area'].toLowerCase()]
  const countryCode = countryEntry ? countryEntry.code : null
  const langCode = langLookUp[record['Language'].toLowerCase()] || null
  if (!(langCode && countryCode)) {
    continue
  }
  const count = parseInt(record['Value'], 10)
  const languages = output[countryCode].languages
  // Keep data already recorded for this language; only create the entry if missing.
  if (!languages[langCode]) {
    languages[langCode] = {}
  }
  languages[langCode]['UN'] = { count }
}
// Persist the merged data set as JSON. A failed write must not pass silently,
// so the error is rethrown from the callback and surfaces as an uncaught exception.
fs.writeFile('./research/refined-data/world-languages.json', JSON.stringify(output), 'utf8', (err) => {
  if (err) throw err
})