-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathworld-cup.jl
315 lines (254 loc) · 10.7 KB
/
world-cup.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
using Plots, CSV, DataFrames
plotly()
Plots.default(show=true)
histWorldCup = CSV.read("data/world-cup/WorldCupsSummary.csv", DataFrame)
#=
1. 历年现场观众人数变化趋势
2. 参赛队伍数变化趋势
3. 历年进球数变化趋势
4. 历史上夺冠次数最多的国家队是哪支?
5. 夺冠队伍所在洲分析
6. 哪些国家队能经常打入决赛/半决赛?
7. 进入决赛的队伍夺冠概率是多少?
8. 东道主(主办国)进入决赛/半决赛大吗?
=#
# 历年现场观众人数变化趋势
# 数据预处理
let columns = names(histWorldCup)
for column in columns
replace!(histWorldCup[!, column], "Germany FR" => "Germany")
end
end
let title = "Attendance Number"
x = histWorldCup.Attendance
y = histWorldCup.Year
xticks = [500000, 1000000, 1500000, 2000000, 250000000, 3000000, 35000000, 4000000]
yticks = histWorldCup.Year
scatter(x, y, xticks=(:all, xticks), yticks=yticks, title=title) |> display
end
# 参赛队伍数变化趋势
let title = "QUalifiedTeams Number"
x = histWorldCup.QualifiedTeams
y = histWorldCup.Year
xticks = [0, 16, 24, 32, 48]
yticks = histWorldCup.Year
scatter(x, y, xticks=(:all, xticks), yticks=yticks, title=title) |> display
end
# 历年进球数变化趋势
let title = "Goals Number"
x = histWorldCup.GoalsScored
y = histWorldCup.Year
xticks = [50, 75, 100, 125, 150, 175, 200]
yticks = histWorldCup.Year
scatter(x, y, xticks=(:all, xticks), yticks=yticks, title=title) |> display
end
# 夺冠次数分析
import StatsBase: countmap
let title = "Champion Number Statistic"
x = histWorldCup.Winner
cmap = countmap(x)
xs = convert(Vector{String}, collect(keys(cmap)))
ys = collect(values(cmap))
bar(xs, ys, title=title) |> display
end
# 半决赛(4强)队伍次数统计
cmap1 = countmap(histWorldCup.Winner)
cmap2 = countmap(histWorldCup.Second)
cmap3 = countmap(histWorldCup.Third)
cmap4 = countmap(histWorldCup.Fourth)
countries = DataFrame()
_countries = cat(collect(keys(cmap1)),
collect(keys(cmap2)),
collect(keys(cmap3)),
collect(keys(cmap4)), dims=1) |> unique |> sort
countries[!, :Index] = _countries
fn(country::AbstractString, cmap::Dict{<:AbstractString,Int}) = begin
if !haskey(cmap, country)
return 0
else
return cmap[country]
end
end
countries[!, :Winner] = map(x -> fn(x, cmap1), countries[!, :Index])
countries[!, :Second] = map(x -> fn(x, cmap2), countries[!, :Index])
countries[!, :Third] = map(x -> fn(x, cmap3), countries[!, :Index])
countries[!, :Fourth] = map(x -> fn(x, cmap4), countries[!, :Index])
countries[!, :SemiFinal] = countries[!, :Winner] .+ countries[!, :Second] .+ countries[!, :Third] .+ countries[!, :Fourth]
countries[!, :Final] = countries[!, :Winner] .+ countries[!, :Second]
let
x = countries[!, :Index]
y = countries[!, :SemiFinal]
title = "SemiFinal Statistic"
bar(x, y, title=title, xrotation=45, xticks=:all) |> display
end
# 决赛队伍次数统计
filterfn = [:Winner, :Second] => (winner, second) -> !((winner == 0) & (second == 0) != 0)
finalist = filter(filterfn, countries)
let x = finalist[!, :Index]
y = finalist[!, :Final]
title = "Final Statistic"
bar(x, y, title = title, xticks = :all, xrotation = 45) |> display
end
# 进入决赛后夺冠以来分析
let
finalist[!, :ChampionProb] = finalist[!, :Winner] ./ finalist[:, :Final]
indexs = (finalist[!, :Second] .> 0) .| (finalist[!, :Winner] .> 0)
ratios = finalist[indexs, :]
x = ratios[!, :Index]
y = ratios[!, :ChampionProb]
title = "Percentage of winning reaching the final"
bar(x, y, title = title, xticks = :all, xrotation = 45) |> display
end
# 夺冠队伍所在大洲分布
let cmap = countmap(histWorldCup.WinnerContinent)
xs = collect(keys(cmap))
ys = collect(values(cmap))
title = "Champion Continent Numbers"
bar(xs, ys, title = title, xticks = :all) |> display
summary = reduce(+, ys)
_values = map(x -> x / summary, ys)
title = "Champion Continent Ratios"
pie(xs, _values, title = title) |> display
end
# 东道主进入半决赛/决赛/夺冠概率统计
let hostTop4 = map(row -> in(row.HostCountry, [row.Winner, row.Second, row.Third, row.Fourth]) ? 1 : 0, eachrow(histWorldCup))
cmap = countmap(hostTop4)
x = collect(keys(cmap))
y = collect(values(cmap))
title = "Host in Top4"
bar(x, y, title = title, xticks = :all) |> display
summary = reduce(+, y)
_values = map(x -> x / summary, y)
title = "Percentage"
pie(x, _values, title = title) |> display
end
# 东道主进入决赛概率
let hostTop2 = map(row -> in(row.HostCountry, [row.Winner, row.Second]) ? 1 : 0, eachrow(histWorldCup))
cmap = countmap(hostTop2)
x = collect(keys(cmap))
y = collect(values(cmap))
title = "Host in Top2"
bar(x, y, title = title, xticks = :all) |> display
summary = reduce(+, y)
_values = map(x -> x / summary, y)
title = "Percentage"
pie(x, _values, title = title) |> display
end
# 东道主夺冠概率
let hostWinner = map(row -> row.HostCountry == row.Winner ? 1 : 0, eachrow(histWorldCup))
cmap = countmap(hostWinner)
x = collect(keys(cmap))
y = collect(values(cmap))
title = "Host in Winner"
bar(x, y, title = title, xticks = :all) |> display
summary = reduce(+, y)
_values = map(x -> x / summary, y)
title = "Percentage"
pie(x, _values, title = title) |> display
end
# 分析世界杯比赛信息表
# PROBLEM 在 Attendance 字段有 missing 数据,然而在教程中我没有找到相关 fillmissing 的语句
matches = CSV.read("data/world-cup/WorldCupMatches.csv", DataFrame)
# 中国队参加的比赛
let indexs = (matches[!, Symbol("Away Team Name")] .== "China PR") .| (matches[!, Symbol("Home Team Name")] .== "China PR")
matches[indexs, :]
end
# 统一联邦德国和德国
let columns = names(matches)
for column in columns
replace!(matches[!, column], "Germany FR" => "Germany")
end
end
# 类型转化
matches[!, "Home Team Goals"] = convert(Vector{Int}, matches[!, "Home Team Goals"])
matches[!, "Away Team Goals"] = convert(Vector{Int}, matches[!, "Away Team Goals"])
matches[!, "Result"] = map((x, y) -> "$(x) - $(y)", matches[!, "Home Team Goals"], matches[!, "Away Team Goals"])
# 现场观赛人数分析
# PROBLEM there is missing data in `attendance` column
let top5Attendance = first(sort(matches, [order(:Attendance, rev=true)]), 5)
top5Attendance[!, :VS] = map((x, y) -> "$(x) VS $(y)", top5Attendance[!, "Home Team Name"], top5Attendance[!, "Away Team Name"])
x = top5Attendance[!, :VS]
y = top5Attendance[!, :Attendance]
bar(x, y) |> display
end
# 比赛进球数分析
matches[!, :TotalGoals] = matches[!, "Home Team Goals"] .+ matches[!, "Away Team Goals"]
matches[!, :VS] = map((x, y) -> "$(x) VS $(y)", matches[!, "Home Team Name"], matches[!, "Away Team Name"])
let top10Goals = first(sort(matches, [order(:TotalGoals, rev=true)]), 10)
top10Goals[!, :VS] = map((x, y) -> "$(x) VS $(y)", top10Goals[!, "Home Team Name"], top10Goals[!, "Away Team Name"])
top10Goals[!, :TotalGoalsStr] = map(x -> "$(x) goals scored", top10Goals[!, "TotalGoals"])
top10Goals[!, "Home Team Goals"] = convert(Vector{Int}, top10Goals[!, "Home Team Goals"])
top10Goals[!, "Away Team Goals"] = convert(Vector{Int}, top10Goals[!, "Away Team Goals"])
top10Goals[!, "Result"] = map((x, y) -> "$(x) - $(y)", top10Goals[!, "Home Team Goals"], top10Goals[!, "Away Team Goals"])
x = top10Goals[!, "VS"]
y = top10Goals[!, "TotalGoals"]
title = "Top10 Goals Match"
bar(x, y, title = title, xrotation = 90, xticks = :all) |> display
end
# 我们再来分析比赛分差最大的比赛
let
matches[!, "DifferenceGoals"] = abs.(matches[!, "Home Team Goals"] .- matches[!, "Away Team Goals"])
top10Difference = first(sort(matches, [order("DifferenceGoals", rev = true)]), 10)
top10Difference[!, "DifferenceGoals"] = convert(Vector{Int}, top10Difference[!, "DifferenceGoals"])
top10Difference[!, "DifferenceGoalsStr"] = map(x -> "$(x) goals difference", top10Difference[!, "DifferenceGoals"])
top10Difference[!, "Result"] = map((x, y) -> "$(x) - $(y)", top10Difference[!, "Home Team Goals"], top10Difference[!, "Away Team Goals"])
x = top10Difference[!, "VS"]
y = top10Difference[!, "DifferenceGoals"]
title = "Top10 Biggest Difference Matches"
bar(x, y, title = title, xrotation = 90) |> display
end
# 进球数分析
using StatsPlots
let columns = names(matches)
for column in columns
replace!(matches[!, column], "Germany FR" => "Germany")
end
listCountries = unique(matches[!, "Home Team Name"])
listHome = Int[]
listAway = Int[]
for country in listCountries
indexs = matches[!, "Home Team Name"] .== country
goalsHome = reduce(+, matches[!, "Home Team Goals"][indexs])
push!(listHome, goalsHome)
indexs = matches[!, "Away Team Name"] .== country
goalsAway = reduce(+, matches[!, "Away Team Goals"][indexs])
push!(listAway, goalsAway)
end
df = DataFrame(Country = listCountries, TotalHomeGoals = listHome, TotalAwayGoals = listAway)
df[!, "TotalGoals"] = df[!, "TotalHomeGoals"] .+ df[!, "TotalAwayGoals"]
mostGoals = first(sort(df, [order(:TotalGoals, rev=true)]), 10)
x = mostGoals[!, "Country"]
y = select(mostGoals, ["TotalHomeGoals", "TotalAwayGoals", "TotalGoals"]) |> Matrix
groupedbar(x, y, xrotation = 90, labels = ["TotalHomeGoals" "TotalAwayGoals" "TotalGoals"]) |> display
end
# 失球数分析
let finalista = finalist[!, :Index]
goalsConcededHome = Int[]
goalsConcededAway = Int[]
match1 = Int[]
match2 = Int[]
for country in finalista
indexs = matches[!, "Home Team Name"] .== country
goalsConcHome = reduce(+, matches[!, "Away Team Goals"][indexs])
push!(goalsConcededHome, goalsConcHome)
counted1 = reduce(+, indexs)
indexs = matches[!, "Away Team Name"] .== country
goalsConcAway = reduce(+, matches[!, "Home Team Goals"][indexs])
push!(goalsConcededAway, goalsConcAway)
counted2 = reduce(+, indexs)
push!(match1, counted1)
push!(match2, counted2)
end
df = DataFrame(Country = finalista, GoalsConcededHome = goalsConcededHome, GoalsConcededAway = goalsConcededAway, MatchesHome = match1, MatchesAway = match2)
df[!, "TotalMatches"] = df[!, "MatchesHome"] .+ df[!, "MatchesAway"]
df[!, "TotalGoalsConceded"] = df[!, "GoalsConcededHome"] .+ df[!, "GoalsConcededAway"]
df[!, "GoalMatchRate"] = round.(df[!, "TotalGoalsConceded"] ./ df[!, "TotalMatches"], digits = 2)
goalsConceded = first(sort(df, [order("GoalMatchRate", rev=true)]), 10)
x = goalsConceded[!, "Country"]
y = goalsConceded[!, "TotalGoalsConceded"]
bp1 = bar(x, y, xrotation = 45)
x = goalsConceded[!, "Country"]
y = goalsConceded[!, "GoalMatchRate"]
bp2 = bar(x, y, xrotation = 45)
plot(bp1, bp2) |> display
end