Add files via upload
mtcicero26 authored Oct 25, 2024
1 parent 9759f8b commit 85aaa8e
Showing 11 changed files with 16,319 additions and 81 deletions.
4,098 changes: 4,098 additions & 0 deletions Example datasets/input files/accessible_probs.tsv

Large diffs are not rendered by default.

4,000 changes: 4,000 additions & 0 deletions Example datasets/input files/dm6_example.bed

Large diffs are not rendered by default.

4,098 changes: 4,098 additions & 0 deletions Example datasets/input files/inaccessible_probs.tsv

Large diffs are not rendered by default.

3,843 changes: 3,843 additions & 0 deletions Example datasets/output files/dm6_example_fp.bed

Large diffs are not rendered by default.

Binary file added Example datasets/output files/dm6_example_model.pickle
Binary file not shown.
151 changes: 151 additions & 0 deletions Example datasets/output files/dm6_example_training-reads.tsv
@@ -0,0 +1,151 @@
,rid
0,m64018_201129_132425/59835693/ccs
1,m64018_201129_132425/50528955/ccs
2,m64018_201129_132425/177998575/ccs
3,m64018_201129_132425/78776755/ccs
4,m64018_201129_132425/113771036/ccs
5,m64018_201129_132425/146212081/ccs
6,m64018_201129_132425/80874284/ccs
7,m64018_201129_132425/105121275/ccs
8,m64018_201129_132425/115936000/ccs
9,m64018_201129_132425/155256051/ccs
10,m64018_201129_132425/19662360/ccs
11,m64018_201129_132425/116393866/ccs
12,m64018_201129_132425/179766291/ccs
13,m64018_201129_132425/86114563/ccs
14,m64018_201129_132425/31591011/ccs
15,m64018_201129_132425/74055921/ccs
16,m64018_201129_132425/112002570/ccs
17,m64018_201129_132425/15992373/ccs
18,m64018_201129_132425/148834350/ccs
19,m64018_201129_132425/173670992/ccs
20,m64018_201129_132425/119079490/ccs
21,m64018_201129_132425/20055102/ccs
22,m64018_201129_132425/19399253/ccs
23,m64018_201129_132425/29820385/ccs
24,m64018_201129_132425/20120996/ccs
25,m64018_201129_132425/133431430/ccs
26,m64018_201129_132425/174719910/ccs
27,m64018_201129_132425/142739317/ccs
28,m64018_201129_132425/72417739/ccs
29,m64018_201129_132425/138151583/ccs
30,m64018_201129_132425/49940922/ccs
31,m64018_201129_132425/122093743/ccs
32,m64018_201129_132425/104268626/ccs
33,m64018_201129_132425/161154439/ccs
34,m64018_201129_132425/51053082/ccs
35,m64018_201129_132425/93783185/ccs
36,m64018_201129_132425/145491164/ccs
37,m64018_201129_132425/177867046/ccs
38,m64018_201129_132425/97518542/ccs
39,m64018_201129_132425/144441948/ccs
40,m64018_201129_132425/81331160/ccs
41,m64018_201129_132425/154995479/ccs
42,m64018_201129_132425/33030399/ccs
43,m64018_201129_132425/177537919/ccs
44,m64018_201129_132425/111937334/ccs
45,m64018_201129_132425/78709428/ccs
46,m64018_201129_132425/125961911/ccs
47,m64018_201129_132425/170722639/ccs
48,m64018_201129_132425/104399311/ccs
49,m64018_201129_132425/122882691/ccs
50,m64018_201129_132425/113443387/ccs
51,m64018_201129_132425/33228025/ccs
52,m64018_201129_132425/173539402/ccs
53,m64018_201129_132425/76874120/ccs
54,m64018_201129_132425/53740271/ccs
55,m64018_201129_132425/13501637/ccs
56,m64018_201129_132425/61605509/ccs
57,m64018_201129_132425/75105210/ccs
58,m64018_201129_132425/82575805/ccs
59,m64018_201129_132425/38733539/ccs
60,m64018_201129_132425/63833774/ccs
61,m64018_201129_132425/75958547/ccs
62,m64018_201129_132425/157877601/ccs
63,m64018_201129_132425/100010828/ccs
64,m64018_201129_132425/40961377/ccs
65,m64018_201129_132425/154143072/ccs
66,m64018_201129_132425/111476736/ccs
67,m64018_201129_132425/114100509/ccs
68,m64018_201129_132425/150341080/ccs
69,m64018_201129_132425/160236717/ccs
70,m64018_201129_132425/107676246/ccs
71,m64018_201129_132425/118161751/ccs
72,m64018_201129_132425/175769046/ccs
73,m64018_201129_132425/19138344/ccs
74,m64018_201129_132425/108137099/ccs
75,m64018_201129_132425/87688704/ccs
76,m64018_201129_132425/36767582/ccs
77,m64018_201129_132425/90112892/ccs
78,m64018_201129_132425/5637872/ccs
79,m64018_201129_132425/115541525/ccs
80,m64018_201129_132425/95355262/ccs
81,m64018_201129_132425/16845195/ccs
82,m64018_201129_132425/134285959/ccs
83,m64018_201129_132425/16058776/ccs
84,m64018_201129_132425/7144203/ccs
85,m64018_201129_132425/52758489/ccs
86,m64018_201129_132425/54200132/ccs
87,m64018_201129_132425/27984098/ccs
88,m64018_201129_132425/86443033/ccs
89,m64018_201129_132425/41420380/ccs
90,m64018_201129_132425/77006461/ccs
91,m64018_201129_132425/142542538/ccs
92,m64018_201129_132425/146669784/ccs
93,m64018_201129_132425/27330408/ccs
94,m64018_201129_132425/87359613/ccs
95,m64018_201129_132425/2689191/ccs
96,m64018_201129_132425/102629548/ccs
97,m64018_201129_132425/163513803/ccs
98,m64018_201129_132425/852612/ccs
99,m64018_201129_132425/26085411/ccs
100,m64018_201129_132425/24576528/ccs
101,m64018_201129_132425/137759476/ccs
102,m64018_201129_132425/35914220/ccs
103,m64018_201129_132425/101778531/ccs
104,m64018_201129_132425/97125814/ccs
105,m64018_201129_132425/70845169/ccs
106,m64018_201129_132425/87493196/ccs
107,m64018_201129_132425/87493196/ccs
108,m64018_201129_132425/44433866/ccs
109,m64018_201129_132425/59441460/ccs
110,m64018_201129_132425/100860177/ccs
111,m64018_201129_132425/8913286/ccs
112,m64018_201129_132425/58394634/ccs
113,m64018_201129_132425/100205421/ccs
114,m64018_201129_132425/19268899/ccs
115,m64018_201129_132425/33818500/ccs
116,m64018_201129_132425/44108008/ccs
117,m64018_201129_132425/109511077/ccs
118,m64018_201129_132425/163449133/ccs
119,m64018_201129_132425/24185046/ccs
120,m64018_201129_132425/124258490/ccs
121,m64018_201129_132425/76613822/ccs
122,m64018_201129_132425/57671992/ccs
123,m64018_201129_132425/50136773/ccs
124,m64018_201129_132425/134678842/ccs
125,m64018_201129_132425/156697166/ccs
126,m64018_201129_132425/95551542/ccs
127,m64018_201129_132425/159057316/ccs
128,m64018_201129_132425/36046930/ccs
129,m64018_201129_132425/88278674/ccs
130,m64018_201129_132425/178913817/ccs
131,m64018_201129_132425/22087156/ccs
132,m64018_201129_132425/158206485/ccs
133,m64018_201129_132425/8979707/ccs
134,m64018_201129_132425/139855570/ccs
135,m64018_201129_132425/152699751/ccs
136,m64018_201129_132425/81726277/ccs
137,m64018_201129_132425/108137099/ccs
138,m64018_201129_132425/169739238/ccs
139,m64018_201129_132425/16779652/ccs
140,m64018_201129_132425/130221613/ccs
141,m64018_201129_132425/43384891/ccs
142,m64018_201129_132425/174456929/ccs
143,m64018_201129_132425/144706087/ccs
144,m64018_201129_132425/148441221/ccs
145,m64018_201129_132425/7407615/ccs
146,m64018_201129_132425/76089528/ccs
147,m64018_201129_132425/154732065/ccs
148,m64018_201129_132425/155123719/ccs
149,m64018_201129_132425/31852035/ccs
Binary file added Example models/dm6_model.pickle
Binary file not shown.
29 changes: 25 additions & 4 deletions README.md
@@ -11,7 +11,7 @@
- **pickle**
- **tqdm**
- **h5py**
- **tempfile**
- **multiprocessing** (for multiprocess version)
- **threading** (for multiprocess version)
- **logging** (for multiprocess version)
@@ -29,7 +29,7 @@
- `-r` Minimum read length (default: 1000).
- `-s` Number of reads processed between checkpoint outputs (default: 10000).
- **Purpose:** Generates emission probabilities if you want to use your own control datasets.
- **Note:** Pre-generated accessible and inaccessible probability TSV files are provided, based on experimental data described in our manuscript. Even with different organisms, changes to PacBio technology, and improvements to methylation calling, we haven't found major changes in these control datasets. It's also not necessary to run this on an entire dataset: the default total read count is more than enough to get stable probabilities. You will need to run this on both an accessible and an inaccessible dataset (see our manuscript).
- **Note:** Pre-generated accessible and inaccessible probability TSV files are provided, based on experimental data described in our manuscript. Even with different organisms, changes to PacBio technology, and improvements to methylation calling, we haven't found major changes in these control datasets. It's also not necessary to run this on an entire dataset: the default total read count is more than enough to get stable probabilities. You will need to run this on both an accessible and an inaccessible dataset (see our manuscript). These probabilities are also encoded in the model, so if you aren't training a new model you don't need to worry about these files.
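
If you do regenerate these controls, each TSV conceptually holds one methylation probability per hexamer context. Below is a minimal sketch of the idea only; the column names and file layout here are assumptions, not the actual FiberHMM format:

```python
import pandas as pd

def estimate_probs(calls_tsv, out_tsv):
    # one row per called base: its hexamer context and a 0/1 methylation call
    calls = pd.read_csv(calls_tsv, sep="\t")
    # per-context methylation frequency, used as an emission probability estimate
    probs = calls.groupby("hexamer")["methylated"].mean()
    probs.to_csv(out_tsv, sep="\t", header=["prob"])

# run once on an accessible control and once on an inaccessible control
estimate_probs("accessible_calls.tsv", "accessible_probs.tsv")
estimate_probs("inaccessible_calls.tsv", "inaccessible_probs.tsv")
```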

### 2. **Encode context**
- **Command:** `encode_context.py -i path_to/genome_name.fa -o path_to/genome_name.h5`
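
The h5 file stores an integer sequence-context code for each genomic position. Here is a minimal sketch of one plausible encoding, written to be consistent with the sentinel values (4096, and a +4097 offset) used in apply_model.py; the actual scheme in encode_context.py may differ:

```python
import itertools

# all 4**6 = 4096 hexamers, mapped to indices 0..4095
HEX_INDEX = {"".join(h): i for i, h in enumerate(itertools.product("ACGT", repeat=6))}
NON_A = 4096  # assumed sentinel for positions that cannot be methylated

def encode_hexamer(hexamer):
    return HEX_INDEX.get(hexamer.upper(), NON_A)

print(encode_hexamer("AAAAAA"))  # 0
print(encode_hexamer("TTTTTT"))  # 4095
```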
@@ -43,20 +43,41 @@
- `-r` Total number of reads to use across all datasets.
- `-s` Random seed for reproducibility.
- `-b` Column number (0-based) in BED files with methylation starts (e.g., 11 for m6A output from fibertools, 27 by default for full output).
- `-e` Number of bases at both ends of the read to mask as 0% methylation probability (default: 10). Required because fibertools needs a sequence window to call methylations.
- `-o` Directory path for storing output files (models, list of reads used in training).
- **Purpose:** Trains the model on a set of reads from specified datasets. Outputs the best model, a list of models in pickle format, and a TSV of reads used in training.
- **Usage:** In general, run once per organism. Model parameters should remain consistent across similar datasets.
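
The outputs are ordinary pickle and TSV files, so they are easy to reuse downstream. A minimal sketch of loading them, assuming the training-reads TSV has an unnamed index column plus a `rid` column (as in `dm6_example_training-reads.tsv`):

```python
import pickle

import pandas as pd

# load the best model produced by training
with open("path_to/best_model.pickle", "rb") as fh:
    model = pickle.load(fh)

# reads used in training, to be excluded when applying the model
train_reads = pd.read_csv("path_to/training-reads.tsv", index_col=0)
train_rids = set(train_reads["rid"])
print(f"{len(train_rids)} training reads loaded")
```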

### 4. **Apply model**
- **Command:**
- Single-core version: `apply_model.py -i path_to/dataset_1.bed -m path_to/best_model.pickle -t path_to/training-reads.tsv -g path_to/genome_name.h5 -p path_to/accessible_probs.tsv,path_to/inaccessible_probs.tsv -o path_to_output_directory`
- Multiprocess version: `apply_model_multiprocess.py -i path_to/dataset.bed -m path_to/best_model.pickle -t path_to/training-reads.tsv -g path_to/genome_name.h5 -p path_to/accessible_probs.tsv,path_to/inaccessible_probs.tsv -o path_to_output_directory`
- Single-core version: `apply_model.py -i path_to/dataset.bed -m path_to/best_model.pickle -t path_to/training-reads.tsv -g path_to/genome_name.h5 -o path_to_output_directory`
- Multiprocess version: `apply_model_multiprocess.py -i path_to/dataset.bed -m path_to/best_model.pickle -t path_to/training-reads.tsv -g path_to/genome_name.h5 -o path_to_output_directory`
- **Optional Parameters:**
- `-l` Minimum footprints allowed per read (default: 0).
- `-r` Enable circular mode (default: off).
- `-s` Chunk size (default: 50000).
- `-e` Number of bases at both ends of the read to mask as 0% methylation probability (default: 10). Required because fibertools needs a sequence window to call methylations.
- Multiprocess-specific:
- `-c` Core count for parallel processing (default: all available CPU cores; I typically recommend 4-8 for stability).
- `-x` Timeout in seconds before restarting the pool (default: core count * 100).
- **Purpose:** Applies the trained model to the remaining data. Outputs results in BED12 format, including footprint starts and lengths. The multiprocess version offers faster execution but may require tuning for stability.
- **Note:** When using circular mode, reads are tiled 3x, affecting footprint count/length; further downstream processing is required.
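
Conceptually, the model assigns each base an HMM state, and runs of the footprint state become BED12 blocks. Below is a hedged sketch of that conversion; `gaps_to_lengths()` in apply_model.py plays this role, though it works from gap positions, and the trained model's state labels may not be the 0/1 assumed here:

```python
import numpy as np

def states_to_footprints(states, fp_state=1):
    # pad with non-footprint so diff() catches runs at the read edges
    is_fp = np.concatenate(([0], (states == fp_state).astype(int), [0]))
    diffs = np.diff(is_fp)
    starts = np.where(diffs == 1)[0]   # transitions into the footprint state
    ends = np.where(diffs == -1)[0]    # transitions out of it
    return starts, ends - starts       # BED12-style block starts and sizes

starts, lengths = states_to_footprints(np.array([0, 1, 1, 0, 0, 1, 1, 1, 0]))
print(starts, lengths)  # [1 5] [2 3]
```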

# **Example Files**

The repository includes a folder `Example datasets` containing the files needed to run FiberHMM on a short sample *Drosophila* dataset, so you can test each step of the workflow with default parameters:

### Input Files
- **accessible_probs.tsv** and **inaccessible_probs.tsv** – probability files for accessible and inaccessible regions (`-p` parameter).
- **dm6.fa** – *Drosophila melanogaster* (dm6) reference genome in FASTA format. This file is too large for GitHub, so please download it yourself.
- **dm6_example.bed** – example m6A-only fibertools output used as FiberHMM input. Specify `-b 11` so the correct methylation column is used.

### Expected Output Files
- **dm6_example_model.pickle** – trained model for the example dataset.
- **dm6.h5** – genome context database. This file is too large to upload, so you will need to generate it from dm6.fa (see step 2).
- **dm6_example_training-reads.tsv** – reads used in model training.
- **dm6_example_fp.bed** – final footprint output.
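
To sanity-check a run against the expected output, the footprint BED loads directly with pandas; a header line is assumed here because apply_model.py writes the file with `to_csv()`:

```python
import pandas as pd

fp = pd.read_csv("Example datasets/output files/dm6_example_fp.bed", sep="\t")
# blockStarts/blockSizes carry the per-read footprint offsets and lengths
print(fp[["chrom", "start", "end", "name", "blockCount"]].head())
```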

# **Model Sharing**

The `Example models` folder is intended to house pre-trained models for the community. Currently, it contains only the dm6 model, but I’ll continue adding more as I generate them. Community contributions are welcome—if you’ve trained a model using FiberHMM, feel free to share it here for others to use!
60 changes: 33 additions & 27 deletions apply_model.py
@@ -22,13 +22,14 @@ def options():
train_rids = []
min_len = 0
circle=False
edge_trim = 10

optlist, args = getopt.getopt(sys.argv[1:],'i:g:m:t:o:b:s:l:r',
['infile=','context=','model=','train_reads=', 'outdir=', 'me_col=', 'min_len=','circular'])
optlist, args = getopt.getopt(sys.argv[1:],'i:g:m:t:o:b:s:l:re:',
['infile=','context=','model=','train_reads=', 'outdir=', 'me_col=', 'min_len=','circular', 'edge_trim='])

for o, a in optlist:
#comma-separated list of paths to input files
if o=='-i' or o=='--infiles':
#path to input file
if o=='-i' or o=='--infile':
infile=a
#path to context hdf5
elif o=='-g' or o=='--context':
@@ -56,30 +57,37 @@ def options():
#the output is a nonstandard bed12, which has 3x the footprints
elif o == '-r' or o == '--circular':
circle = True
elif o == '-e' or o == '--edge_trim':
edge_trim = int(a)

return infile, context, model, train_rids, outdir, me_col, chunk_size, min_len, circle
return infile, context, model, train_rids, outdir, me_col, chunk_size, min_len, circle, edge_trim


def encode_me(rid, read, read_info, context, circle):
def encode_me(rid, read, read_info, context, circle, edge_trim):
#grab info, read
chrom = read_info.loc[rid]['chrom']
me = np.array(read.dropna())
me = me[1:-1]
me = me.astype(int)
start = read_info.loc[rid, 'start']
end = read_info.loc[rid, 'end']

me = np.array(read.dropna()).astype(int)
#remove any methylations in the trim region
me = me[np.where(
(me>edge_trim)&(me<((end-start)-edge_trim))
)]

#make sure within range, find positions with no methylation
me = me[np.where(me < (end - start))[0]]
no_me = np.arange(end - start)
no_me = np.delete(no_me, me)

#grab sequence context info from context file
with h5py.File(context, 'r', swmr=True) as f:
hexamers = f[chrom]['table'][start:end]
hexamers = f[chrom]['table'][(start+edge_trim):(end-edge_trim)]

#encode methylations and no methylations
me_encode = np.array([item[1] for item in hexamers])
#add non-A (so, 0% probability of methylation) to edge bases
me_encode = np.pad(me_encode, pad_width=((edge_trim, edge_trim), (0, 0)), mode='constant', constant_values=4096)
no_me_encode = me_encode + 4097
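#inferred from the constants above (not documented): codes 0-4095 are methylated
#hexamer contexts, 4096 is non-A padding, and +4097 shifts to the unmethylated versions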

#zero out me/no me positions
@@ -108,7 +116,7 @@ def gaps_to_lengths(fps):

return starts, lengths

def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_size, min_len, tmp_dir, circle):
def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_size, min_len, tmp_dir, circle, edge_trim):

dataset=f.split('/')[-1].split('.')[0]
sys.stdout.flush()
@@ -170,7 +178,7 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
pbar2.set_description(f"Applying model to chunk {i}")

for index, read in chunk.iterrows():
read_encode = encode_me(index, read, b12, context, circle)
read_encode = encode_me(index, read, b12, context, circle, edge_trim)
read_encode = read_encode.astype(int).reshape(-1, 1)
pos = model.predict(read_encode)

@@ -204,19 +212,17 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_

#combine, sort bed12s
b12 = b12.rename(columns={'rid': 'name'})
b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockSizes', 'blockStarts']
b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockStarts', 'blockSizes']
b12 = pd.concat([b12, no_me_b12])
b12 = b12.sort_values(by=['chrom', 'start'])

# Write to a temporary file (split by chromosome if necessary)
for chrom in b12['chrom'].unique():
tmp_b12=b12.loc[b12['chrom']==chrom]
tmp_file = os.path.join(tmp_dir, f"{dataset}_{chrom}_{i}.bed")
tmp_b12.to_csv(tmp_file, sep='\t', index=False)
tmp_file = os.path.join(tmp_dir, f"{dataset}_{i}.bed")
b12.to_csv(tmp_file, sep='\t', index=False)



f, context, model, train_rids, outdir, me_col, chunk_size, min_len, circle = options()
f, context, model, train_rids, outdir, me_col, chunk_size, min_len, circle, edge_trim = options()

#grab list of chromosomes from context file
tmp=pd.HDFStore(context, 'r')
@@ -238,19 +244,19 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
dataset=f.split('/')[-1].split('.')[0]

#run model
apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_size, min_len, tmp_dir, circle)
apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_size, min_len, tmp_dir, circle, edge_trim)

#need to pause for 1s for small datasets
time.sleep(1)

#combine temp files, remove
for chrom in tqdm(chromlist, desc='Combining temporary files'):
tmp_files = glob.glob(os.path.join(tmp_dir, f"{dataset}_{chrom}_*.bed"))
final_file = os.path.join(outdir, f"{dataset}-{chrom}_fp.bed")
with open(final_file, 'w') as fout:
for tmp_file in tmp_files:
with open(tmp_file, 'r') as fin:
fout.write(fin.read())
os.remove(tmp_file)
print('Combining temporary files')
tmp_files = glob.glob(os.path.join(tmp_dir, f"{dataset}_*.bed"))
final_file = os.path.join(outdir, f"{dataset}_fp.bed")
with open(final_file, 'w') as fout:
for tmp_file in tmp_files:
with open(tmp_file, 'r') as fin:
fout.write(fin.read())
os.remove(tmp_file)

os.rmdir(tmp_dir)
