-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.js
executable file
·113 lines (103 loc) · 3.69 KB
/
extractor.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
let fs = require( 'fs');
let R = require('ramda');
let nexcel = require("node-xlsx");
let results = {};
let allValues = {};
/**
 * Builds an object whose property names are `keys` and whose values are the
 * result of calling `fn` on each key. Curried via Ramda, so it can be called
 * as `objFromKeys(fn, keys)` or `objFromKeys(fn)(keys)`.
 *
 * @param {Function} fn - Called with each key; its return value becomes the property value.
 * @param {string[]} keys - Property names of the resulting object.
 * @returns {Object} Object mapping every key to `fn(key)`.
 */
const objFromKeys = R.curry(function (fn, keys) {
  const computedValues = R.map(fn, keys);
  return R.zipObj(keys, computedValues);
});
/**
 * Recursively walks `dir` and, for every file whose path ends in ".json",
 * stores the parsed file's `events` property in `results[chap]`, keyed by the
 * file path (built as `dir + '/' + entryName`).
 *
 * Directory entries are processed one at a time, in the order `fs.readdir`
 * returns them. Files are read synchronously.
 *
 * Failures on individual entries (stat error, unreadable file, invalid JSON,
 * unreadable sub-directory) are logged with `console.error` and skipped, so
 * one bad entry does not abort the whole walk. Only a failure to list `dir`
 * itself is reported through `done`.
 *
 * @param {string} chap - Key under which collected events are grouped in `results`.
 * @param {string} dir - Directory to scan.
 * @param {function(?Error, Object=)} done - Called as `done(err)` when `dir`
 *   cannot be listed, otherwise as `done(null, results[chap])` after every
 *   entry has been handled.
 */
const walk = function(chap, dir, done){
  if (results[chap] == null) {
    results[chap] = {};
  }
  fs.readdir(dir, function(err, list){
    if (err) {
      return done(err);
    }
    let index = 0;
    const next = function(){
      if (index >= list.length) {
        return done(null, results[chap]);
      }
      const file = dir + '/' + list[index++];
      fs.stat(file, function(statErr, stat){
        if (statErr) {
          // Without stat info we cannot tell file from directory; skip the entry.
          console.error(statErr);
          return next();
        }
        if (stat.isDirectory()) {
          return walk(chap, file, function(subErr){
            if (subErr) {
              console.error(subErr);
            }
            next();
          });
        }
        // Match on the extension only, so ".json" appearing elsewhere in the
        // path (e.g. in a directory name) does not make other files look like JSON.
        if (file.endsWith(".json")) {
          console.log(file);
          try {
            // Reading and parsing stay inside the try so a malformed file is
            // reported and skipped rather than crashing the process.
            const parsed = JSON.parse(fs.readFileSync(file, 'utf8'));
            results[chap][file] = parsed == null ? undefined : parsed.events;
          } catch (e) {
            console.error(e);
          }
        }
        next();
      });
    };
    next();
  });
};
// Matches strings containing at least one character from the kanji
// (一-龠), hiragana (ぁ-ゔ) or katakana (ァ-ヴ, ー) ranges, or one of 々〆〤.
const JAPANESE_TEXT_PATTERN = /[一-龠]+|[ぁ-ゔ]+|[ァ-ヴー]+|[々〆〤]+/u;

// Cleanup passes, applied in this order; every match is removed.
// NOTE(review): the second and third patterns are character classes, so they
// strip each listed character individually (every digit, every whitespace
// character, 'i', 'C', 'L', 'A', 'O', 'N', '|') rather than whole tokens such
// as "L3A1". Because all digits are already gone, /c10/ can no longer match
// and /c[0-9|l]/ can only match "cl". The patterns are kept unchanged so the
// output stays identical to previous runs; confirm the intended
// behavior before tightening them.
const CLEANUP_PATTERNS = [
  /[`~!@#$%^&*()_|+\-=?;:'",.<>\{\}\[\]\\\/]/gi,
  /[i\d\d\d|i\d\d]/g,
  /[CL|L3A1|41|41\s103\s1\sON|L5A1|L1A2]/g,
  /c10/g,
  /c[0-9|l]/g,
];

/**
 * Recursively collects every string value inside `result` that contains
 * Japanese text, cleans it with CLEANUP_PATTERNS, trims it, and appends it to
 * `allValues[chap]` (created as an empty array if missing).
 *
 * Nested objects and arrays are both traversed; numbers, booleans and null are
 * ignored. Duplicates are kept, so the list can later be used for counting
 * occurrences.
 *
 * @param {string} chap - Key of the bucket in `allValues` to append to.
 * @param {Object|Array|null|undefined} result - Value tree to scan; a nullish
 *   value only initializes the bucket.
 */
const inserAllValues = function(chap, result){
  allValues[chap] = allValues[chap] || [];
  if (result === null || result === undefined) {
    return;
  }
  const bucket = allValues[chap];
  for (const val of Object.values(result)) {
    if (val !== null && typeof val === "object") {
      // Covers plain objects and arrays alike.
      inserAllValues(chap, val);
    } else if (typeof val === "string" && JAPANESE_TEXT_PATTERN.test(val)) {
      const cleaned = CLEANUP_PATTERNS
        .reduce((text, pattern) => text.replace(pattern, ""), val)
        .trim();
      bucket.push(cleaned);
    }
  }
};
// Number of collected strings that had no row in the per-chapter count table.
let totalError = 0;

/*
 * Report driver: for the chapter path given as the first CLI argument
 * (relative to this script's directory), collects all Japanese strings from
 * the chapter's JSON files, counts how often each distinct string occurs, and
 * writes "<chapName>_Info.xlsx" into the "dest" directory next to this script.
 *
 * Each data row is: [string, occurrences, string length, occurrences * string length].
 * The total length of all distinct strings is also logged as "<chap>:<total>".
 */
[process.argv[2]].forEach((chap) => {
  if (typeof chap !== "string" || chap.length === 0) {
    console.error("Usage: node <script> <chapter path relative to the script directory>");
    process.exitCode = 1;
    return;
  }

  // The chapter name is the third path segment (e.g. "a/b/name"); for
  // shorter paths fall back to the last segment so the output is never
  // named "undefined_Info.xlsx".
  const segments = chap.split("/");
  const chapName = segments[2] !== undefined ? segments[2] : segments[segments.length - 1];

  walk(chap, __dirname+"/"+chap, function(err, result){
    if (err) {
      // The chapter directory could not be listed; do not emit a misleading
      // header-only workbook.
      console.error(err);
      process.exitCode = 1;
      return;
    }

    inserAllValues(chap, result);
    const uniqValues = R.uniq(allValues[chap]);

    // Row layout per distinct string: [occurrences, length, occurrences * length].
    const countObj = objFromKeys((key) => {
      const len = key.trim().length;
      return [0, len, len];
    }, uniqValues);

    allValues[chap].forEach((str) => {
      const row = countObj[str];
      if (!row) {
        totalError++;
        console.log("Str : "+str);
        return;
      }
      row[0]++;
      row[2] = row[1] * row[0];
    });

    const rows = Object.keys(countObj).map((key) => [key].concat(countObj[key]));
    const totalLength = uniqValues.reduce((acc, val) => acc + val.length, 0);
    console.log(chap+":"+totalLength);

    const buffer = nexcel.build([{name: chapName, data: [["원문","출현수","문장길이","출현수*문장길이"]].concat(rows)}]); // Returns a buffer

    const destDir = __dirname + "/dest";
    if (!fs.existsSync(destDir)) {
      fs.mkdirSync(destDir);
    }
    fs.writeFileSync(destDir + "/" + chapName + "_Info.xlsx", buffer);
  });
});
//,"noel_s3","noel_s4","noel_s5","noel_s6",
/*
Tokens c0 ~ c10; are excluded.
Sentences are always trimmed.
Output columns: sentence | occurrences | sentence length | occurrences * sentence length
*/