-
Notifications
You must be signed in to change notification settings - Fork 0
/
format_PRSCSX_files.sh
61 lines (55 loc) · 2.68 KB
/
format_PRSCSX_files.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/bin/bash
#SBATCH --export=NONE # required when using 'module'
#SBATCH --job-name=format_PRSCSX_files
#SBATCH --nodes=1
#SBATCH --ntasks=20
#SBATCH --mem=8G
#SBATCH --time=1-00:00:00
#SBATCH --output=/krummellab/data1/zachsten/logs/%x-%j.out
module load CBI miniconda3
conda activate scipy_env
echo "scipy_env loaded correctly"
# Change the number of threads to use
num_threads=10
N_THREADS=2
export MKL_NUM_THREADS=$N_THREADS
export NUMEXPR_NUM_THREADS=$N_THREADS
export OMP_NUM_THREADS=$N_THREADS
# Change the number of threads to use
num_threads=10
N_THREADS=2
export MKL_NUM_THREADS=$N_THREADS
export NUMEXPR_NUM_THREADS=$N_THREADS
export OMP_NUM_THREADS=$N_THREADS
process_chromosome() {
pop=$1
chr=$2
echo ${pop} ${chr}
# Assign the proper directories for everything so that we can process all the information.
bfile_dir="/krummellab/data1/zachsten/PRS/individual_analysis/data/bfiles/"
association_dir="/krummellab/data1/zachsten/PRS/individual_analysis/data/association/"
#format the association files using awk into the proper format for PRS-CSx
input_bim_path=$bfile_dir${pop}/${pop}_chr${chr}.bim
target_bim_path=$association_dir${pop}/${pop}_chr${chr}.qassoc
output_path=$association_dir${pop}/${pop}_chr${chr}.formatted.qassoc
# Use awk to get everything in the right position and create a temporary file:
# This is good but it's missing the headers and is in the wrong order at the end file. The end file currently looks like: rs12255619 -0.1151 1.209 C A
awk -v OFS='\t' 'NR==FNR {if ($2 != ".") {a[$2]=$5; b[$2]=$6;} next} FNR==1 {print "SNP", "A1", "A2", "BETA", "P";} FNR>1 && $2 in a {snp=$2; a1=a[snp]; a2=b[snp]; beta=$5; p=$6; print snp, a1, a2, beta, p;}' "$target_bim_path" "$input_bim_path" > "$output_path"
awk -v OFS='\t' 'BEGIN {print "SNP", "A1", "A2", "BETA", "P"} NR>1 && $4 != "NA" && $5 != "NA" {print $1, $4, $5, $2, $3}' "$output_path" > "${output_path}.temp" && mv "${output_path}.temp" "$output_path"
awk 'FNR==NR{a[$2]=$9;next} $5!="NA"{OFS="\t"; $5=a[$1]; print}' "$target_bim_path" "$output_path" > "${output_path}.temp" && mv "${output_path}.temp" "$output_path"
echo "Processing $pop chromosome $chr"
}
export -f process_chromosome
# This is an intermediate file, change the file path to your working dir, keep the file name.
cmd_file="/krummellab/data1/zachsten/PRS/process_commands_creating_bfiles.txt"
> ${cmd_file}
for pop in AFR EUR EAS AMR SAS ; do
for chr in {1..22}; do
# if [[ "$pop" == "AFR" && "$chr" -eq 22 ]]; then
# continue
# else
echo "$pop $chr" >> "$cmd_file"
# fi
done
done
cat "$cmd_file" | xargs -n 2 -P $num_threads bash -c 'process_chromosome "$@"' _