-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #32 from ENCODE-DCC/dev14
Dev14
- Loading branch information
Showing
3 changed files
with
286 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,231 @@ | ||
#!/usr/bin/env python3 | ||
"""dictTool: merge/split/flatten/unflatten dict | ||
Author: | ||
Jin Lee ([email protected]) at ENCODE-DCC | ||
""" | ||
|
||
import re | ||
from collections import defaultdict | ||
try:
    from collections.abc import MutableMapping
except ImportError:
    # Interpreters without a ``collections.abc`` module signal the failed
    # import with ImportError (never AttributeError), so that is the
    # exception the fallback has to catch.
    from collections import MutableMapping
|
||
|
||
def merge_dict(a, b):
    """Recursively fold the contents of ``b`` into ``a`` (in place).

    ``a`` is mutated and nothing is returned. When both sides hold a dict
    under the same key, the two dicts are merged recursively. For any
    other conflict the value from ``b`` replaces the value in ``a``;
    values that already compare equal are left untouched.

    Ref: https://stackoverflow.com/questions/7204805/dictionaries
        -of-dictionaries-merge/7205107#7205107
    """
    for name, incoming in b.items():
        if name not in a:
            # Brand-new key: simply adopt the value from b.
            a[name] = incoming
            continue

        existing = a[name]
        if isinstance(existing, dict) and isinstance(incoming, dict):
            merge_dict(existing, incoming)
        elif not (existing == incoming):
            # Real conflict: b wins.
            a[name] = incoming
|
||
|
||
def flatten_dict(d, parent_key=()):
    """Collapse a nested mapping into a one-level mapping keyed by tuples.

    Each key of the result is the tuple of keys leading from the root to
    a leaf value. A key that is itself a tuple is spliced into the path
    instead of being nested as a single element. Nested mappings that
    are empty contribute no entries.

    Args:
        d:
            Mapping to flatten.
        parent_key:
            Tuple of keys prepended to every path (used by the recursion).

    Returns:
        A mapping of the same type as ``d``:
        {
            (key_lvl1, key_lvl2, key_lvl3, ...): value
        }
    """
    pairs = []
    for key, value in d.items():
        suffix = key if isinstance(key, tuple) else (key,)
        path = parent_key + suffix
        if not isinstance(value, MutableMapping):
            pairs.append((path, value))
            continue
        # Descend into the sub-mapping, carrying the path built so far.
        nested = flatten_dict(value, parent_key=path)
        pairs.extend(nested.items())
    return type(d)(pairs)
|
||
|
||
def unflatten_dict(d_flat):
    """Rebuild a nested mapping from a one-level tuple-keyed mapping.

    Each key of ``d_flat`` is read as a path: every element but the last
    names an intermediate mapping (created on demand, with the same type
    as ``d_flat``) and the last element names the slot receiving the
    value. Entries whose key is empty are skipped.
    """
    container = type(d_flat)
    result = container()
    for path, value in d_flat.items():
        if not path:
            # An empty path names no slot, so there is nothing to store.
            continue
        node = result
        for step in path[:-1]:
            if step not in node:
                node[step] = container()
            node = node[step]
        node[path[-1]] = value
    return result
|
||
|
||
def split_dict(d, rules=None):
    r"""Splits a nested dict into several dicts according to "rules".

    Every key path of ``d`` is checked against each rule. When a key in
    the path matches a rule's REGEX, the value is moved into a separate
    dict dedicated to that (RULE_NAME, matched key) pair, with the
    matching key removed from the path. Values whose path matches no
    rule stay together in one "original" dict.

    Args:
        d:
            Nested dict to split. It is not modified.
        rules:
            One of:
                - None: no splitting, ``[d]`` is returned as is.
                - a single tuple (RULE_NAME, REGEX)
                - a list of tuples (RULE_NAME, REGEX)
                - a dict of {RULE_NAME: REGEX}
            If a key name in a JSON object matches REGEX then ALL
            objects with the same key will be separated from the
            original root JSON object while keeping their hierarchy.
            RULE_NAME will be added to the root of each new JSON object,
            with the matched key as its value. Keys that are not
            strings never match.

            For example, we have a JSON object like the following

            {
                "flagstat_qc": {
                    "rep1": {
                        "read1": 100,
                        "read2": 200
                    },
                    "rep2": {
                        "read1": 300,
                        "read2": 400
                    }
                },
                "etc": {
                    "samstat_qc": {
                        "rep1": {
                            "unmapped": 500,
                            "mapped": 600
                        },
                        "rep2": {
                            "unmapped": 700,
                            "mapped": 800
                        }
                    }
                },
                "idr_qc": {
                    "qc_test1" : 900
                }
            }

            with the rule ("replicate", r"^rep\d+$"), this JSON object
            will be split into three (original, rep1, rep2) JSON
            objects.

            [
                # original
                {
                    "idr_qc": {
                        "qc_test1" : 900
                    }
                },
                # rep1
                {
                    "replicate": "rep1",
                    "flagstat_qc": {
                        "read1": 100,
                        "read2": 200
                    },
                    "etc": {
                        "samstat_qc": {
                            "unmapped": 500,
                            "mapped": 600
                        }
                    }
                },
                # rep2
                {
                    "replicate": "rep2",
                    "flagstat_qc": {
                        "read1": 300,
                        "read2": 400
                    },
                    "etc": {
                        "samstat_qc": {
                            "unmapped": 700,
                            "mapped": 800
                        }
                    }
                },
            ]

    Returns:
        List of split dicts: the dict of unmatched values first (left
        out when it would be empty), followed by one dict per
        (RULE_NAME, matched key) pair in the order the pairs were first
        encountered.

    Note:
        If several keys of one path match the same rule, all of them
        are removed from the path and the last one is used as the value
        of RULE_NAME. A value whose whole path is consumed by matches is
        keyed by an empty path before unflattening.
    """
    if rules is None:
        return [d]
    if isinstance(rules, tuple):
        rules = [rules]
    elif isinstance(rules, MutableMapping):
        # Iterating a mapping yields bare keys, which cannot be unpacked
        # into (RULE_NAME, REGEX); use its items instead.
        rules = list(rules.items())

    d_flat = flatten_dict(d)
    keys_matched_regex = set()
    d_each_rule = defaultdict(type(d))
    for rule_name, rule_regex in rules:
        # Compile once per rule instead of once per inspected key.
        pattern = re.compile(rule_regex)
        for k_tuple, v in d_flat.items():
            new_k_tuple = ()
            pattern_matched_k = None
            for k in k_tuple:
                if _key_matches(pattern, k):
                    pattern_matched_k = (rule_name, k)
                else:
                    new_k_tuple += (k,)
            if pattern_matched_k is not None:
                d_each_rule[pattern_matched_k][new_k_tuple] = v
                keys_matched_regex.add(k_tuple)

    result = []
    for (rule_name, k), d_each_matched in d_each_rule.items():
        d_ = unflatten_dict(d_each_matched)
        d_[rule_name] = k
        result.append(d_)

    # Everything that matched no rule stays together and goes first.
    d_others = type(d)()
    for k_tuple, v in d_flat.items():
        if k_tuple not in keys_matched_regex:
            d_others[k_tuple] = v
    if d_others:
        result = [unflatten_dict(d_others)] + result
    return result


def _key_matches(pattern, key):
    """Return True if compiled ``pattern`` is found in ``key``."""
    try:
        return pattern.search(key) is not None
    except TypeError:
        # A key the pattern cannot be applied to (e.g. an int) is
        # treated as non-matching instead of aborting the whole split.
        return False
|
||
|
||
def test():
    """Prints a sample nested dict and the result of splitting it.

    The sample is split on keys matching ``^rep\\d+$`` under the rule
    name "replicate"; both the input and the list of split dicts are
    printed as indented JSON.

    Returns:
        0, always.
    """
    import json
    from collections import OrderedDict

    d = OrderedDict({
        "flagstat_qc": {
            "rep1": {
                "read1": 100,
                "read2": 200
            },
            "rep2": {
                "read1": 300,
                "read2": 400
            }
        },
        "etc": {
            "samstat_qc": {
                "rep1": {
                    "unmapped": 500,
                    "mapped": 600
                },
                "rep2": {
                    "unmapped": 700,
                    "mapped": 800
                }
            }
        },
        "idr_qc": {
            "qc_test1" : 900
        }
    })
    print(json.dumps(d, indent=4))

    # The rule is given in the documented (RULE_NAME, REGEX) tuple form.
    jsons_split = split_dict(d, ('replicate', r'^rep\d+$'))
    print(json.dumps(jsons_split, indent=4))
    return 0
|
||
|
||
if __name__ == '__main__':
    # test() returns an integer status; hand it to the interpreter as the
    # process exit code instead of discarding it.
    raise SystemExit(test())