-
Notifications
You must be signed in to change notification settings - Fork 0
/
show_age_distr2009.rb
94 lines (78 loc) · 3.16 KB
/
show_age_distr2009.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Counts, per author, the tokens they posted in 2008-2010 across the listed
# subforums and writes one "author<TAB>yob<TAB>ntokens" row per author.
#
# Each input file is read line by line:
#   * a blank line resets the per-sentence metadata,
#   * a line starting with "#" is a "# key = value" header; only yob,
#     username and post_date are used (other headers are ignored),
#   * any other line is counted as one token of the current author.

subforums = ["adoption","allmanna-ekonomi","allmanna-familjeliv","allmanna-fritid","allmanna-husdjur","allmanna-hushem","allmanna-kropp","allmanna-noje","allmanna-samhalle","allmanna-sandladan","anglarum","foralder","gravid","kansliga","medlem-allmanna","medlem-foraldrar","medlem-planerarbarn","medlem-vantarbarn","pappagrupp","planerarbarn","sexsamlevnad","svartattfabarn","expert"]
PATH = "C:\\Sasha\\D\\DGU\\CassandraMy\\SMCorpora\\familjeliv-age\\"
#PATH = "D:\\D\\DGU\\CassandraMy\\SMCorpora\\familjeliv-age\\"

# Both values are only used to build the input file names.
token_threshold = 10000
firstage = 18
#total_threshold = 10

# Posting years whose tokens are counted.
COUNTED_YEARS = [2008, 2009, 2010].freeze
# Authors with this year of birth are skipped.
# NOTE(review): presumably 1970 is a placeholder/default yob - confirm.
EXCLUDED_YOB = 1970

# Token count per author. The float default is kept on purpose so the
# ntokens column is still written as e.g. "12.0", exactly as before.
authorhash = Hash.new(0.0)
# First year of birth seen for each counted author.
author_yob = {}

subforums.each do |subforum|
  STDERR.puts subforum
  input_path = "#{PATH}familjeliv-#{subforum}_sentence_age#{token_threshold}_#{firstage}.conllu"

  # Block form closes the file even if parsing raises.
  File.open(input_path, "r:utf-8") do |f|
    current_username = ""
    current_year = ""
    yob = ""

    f.each_line do |line|
      line1 = line.strip

      if line1.empty?
        # Sentence boundary: forget the previous sentence's metadata so it
        # cannot leak into a sentence that lacks one of the headers.
        current_username = ""
        current_year = ""
        yob = ""
      elsif line1.start_with?("#")
        # Limit of 2 keeps values that themselves contain " = " intact.
        value = line1.split(" = ", 2)[1]

        # Keys are matched at the start of the line so that other comment
        # lines merely containing e.g. "# username" are not misread.
        if line1.start_with?("# yob")
          yob = value.to_i
        elsif line1.start_with?("# username")
          current_username = value
        elsif line1.start_with?("# post_date")
          # Year is the part of the date before the first "-".
          current_year = value.split("-")[0].to_i
        end
      elsif yob != EXCLUDED_YOB && COUNTED_YEARS.include?(current_year)
        authorhash[current_username] += 1
        # yob is never nil/false here, so ||= keeps the first value seen.
        author_yob[current_username] ||= yob
      end
    end
  end
end

# Block form guarantees the report is flushed and closed.
File.open("age\\stats2008-2010.tsv", "w:utf-8") do |o|
  o.puts "author\tyob\tntokens"
  authorhash.each_pair do |author, ntokens|
    o.puts "#{author}\t#{author_yob[author]}\t#{ntokens}"
  end
end