-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOPM.jl
86 lines (76 loc) · 3.45 KB
/
OPM.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# OPM - Load / Create a dataset from flat text files or r datasource
# Set this to true to force a refresh from source datafiles
refresh = false
# Set this to true to install packages from the internet
install = false
# Hardcode a location to the sctfile here.
sctfile = raw"C:\Users\ME\Desktop\data\1973-09-to-2014-06\SCTFILE.TXT"
# This is the location of the root of the files
fileroot = raw"C:\Users\ME\Desktop\data\1973-09-to-2014-06"
# Location of files of interest within our root
subdir = "non-dod/status"
path = joinpath(fileroot, subdir)
# Filename to write our results to disk or read from disk.
result_filename = joinpath(path, "../non-dod-summary.rds")
filtered_filename = joinpath(path, "../non-dod-filtered.rds")
model_filename = joinpath(path, "../non-dod-model.rds")
# List of fields to pull from the files.
fields_of_interest = vcat(:PSEUDO_ID, :FILE_DATE, :AGENCY, :OCCUPATION, :LOS_LEVEL, :PAY_PLAN, :GRADE, :ADJUSTED_BASIC_PAY, :WORK_SCHEDULE)
### Executed code from here on ###
# Get our filelist and parse sctdata
files = [files for (root, dirs, files) in walkdir(path)]
files = [joinpath(path, file) for file in files[1]]
# This loads all files and includes depencenies
include(raw"C:\Users\ME\OneDrive\Documents\julia\FWF\src\FWF.jl")
include("OPMInstallDeps.jl")
sctdata = opm_parse_sctfile_fwf(sctfile)
# For now, load ad-hoc here - julia
files = files[endswith.(files, "_12.txt")]
data = opm_load_filelist(files, sctdata, columns = fields_of_interest, silent=false)
# If refresh...delete our files.
#if (refresh) {
# file.remove(c(result_filename, filtered_filename, model_filename))
#}
# Load and prepare data...
# otherwise load from source files and sleep for a few hours.
##if (!file.exists(result_filename)) {
# print(paste(Sys.time(), "No Basic Result File, Reparsing sources."))
# result <- opm_generate_dataset_from_fwfs(files, sctdata = sctdata, columns = fields_of_interest)
# write_rds(result, result_filename)
# # If we wrote this file remove the filtered file, just to be sure
# file.remove(filtered_filename)
# print(paste(Sys.time(), "Result file generated"))
#}
#if (!file.exists(filtered_filename)) {
# print(paste(Sys.time(), "No Filtered File, Performing filtering and grouping"))
# rm(result)
# gc()
# This just always reads from the rds to hopefully allow the above to cleanup memory.
# result <- read_rds(result_filename) %>% ungroup() %>% opm_filter_unusable_rows() %>% opm_add_years() %>% group_by(`PSEUDO-ID`)
# Sneak in a gc to eek out a bit more memory.
# gc()
# result <- result %>% opm_filter_unusable_ids() %>% opm_number_year_runs() %>% group_by(`PSEUDO-ID`, `group-id`)
# write_rds(result, filtered_filename)
# print(paste(Sys.time(), "Filtered file generated"))
#} else if (!file.exists(model_filename)) {
# print(paste(Sys.time(), "Filtered File found. Loading for model building."))
# result <- read_rds(filtered_filename)
#}
#if (!file.exists(model_filename)) {
# print(paste(Sys.time(), "Model file not found, building model"))
# model <- opm_generate_model_tibble(result)
# write_rds(model, model_filename)
# rm(result)
# gc()
# print(paste(Sys.time(), "Model file generated"))
#} else {
# print(paste(Sys.time(), "Loading model file."))
# model <- read_rds(model_filename)
#}
# Interesting cases to look at...
# 8,289, 21, 14, 10357722, 166
# 21 is kelli
# 8 departs after 3 years of no increases
# 166, 289, 14 fragmented employment
#
#72738 - termination + long service