From c623fd71b2afa4b06fd1d1eddb07b3480afe0fc4 Mon Sep 17 00:00:00 2001 From: HorusZhang Date: Tue, 6 Jun 2023 16:52:24 +0800 Subject: [PATCH] organization --- README.md | 25 ++++----- experiments/code_repair.py | 9 +-- experiments/data/README.md | 2 +- experiments/show/README.md | 2 +- experiments/show_main.py | 67 ++++++++++++----------- experiments/show_supp.py | 109 ++++++++++++++++++++++--------------- experiments/sort_data.py | 25 +++++++-- setup.py | 32 +++++------ 8 files changed, 148 insertions(+), 123 deletions(-) diff --git a/README.md b/README.md index e340ca9..fd87265 100644 --- a/README.md +++ b/README.md @@ -7,21 +7,16 @@ [![License](https://img.shields.io/badge/License-BGI_Research-orange.svg)](https://github.com/HaolingZHANG/DNASpiderWeb/blob/main/LICENSE.pdf) -DNA has been considered a promising medium for storing digital information. -As an essential step in the DNA-based data storage workflow, -coding scheme is responsible to implement functions including bit-to-base transcoding, error correction, etc. -In previous studies, these functions are normally realized by introducing independent coding algorithms. -Here, we report a graph-based architecture, named SPIDER-WEB, -providing an all-in-one coding solution by generating customized algorithms automatically. -SPIDER-WEB supports correcting at maximum 4% edit errors in the DNA sequences -including substitution and insertion/deletion (indel), -while the required logical redundancy at only 5.5%. -Since no DNA sequence pretreatment is required for correcting and decoding processes, -SPIDER-WEB offers the function of real-time information retrieval, -which is 305.08 times faster than the speed of single-molecule sequencing techniques. -Our retrieval process can improve 2 orders of magnitude faster compared to conventional one -under megabyte-level data and can be scalable to fit exabyte-level data. 
-Therefore, SPIDER-WEB holds the potential to improve the practicability in large-scale data storage applications. +DNA has been considered a promising medium for storing digital information. +Previously, functions including bit-to-base transcoding and error correction are implemented by independent algorithms. +It would result in either increased computational complexity or compromised error tolerance +when attempting to correct various types of errors, especially for insertions/deletions (indels). +To address this issue, we report a graph-based architecture, named SPIDER-WEB, providing an all-in-one coding solution +by generating customized algorithms with an efficient built-in error correction function. +SPIDER-WEB is able to correct a maximum of 4% edit errors in the DNA sequences including substitution and indel, +with only 5.5% logical redundancy. In addition, SPIDER-WEB enables real-time retrieval of megabyte-level data with +a 100x execution speed improvement over conventional methods and hold the potential of practicability for +large-scale data storage applications at the exabyte-level. 
## Installation You can install this package using pip: diff --git a/experiments/code_repair.py b/experiments/code_repair.py index 460a449..f2a160b 100644 --- a/experiments/code_repair.py +++ b/experiments/code_repair.py @@ -1,11 +1,9 @@ from itertools import product from time import time -from networkx import DiGraph -from numpy import random, array, arange, ones, zeros, fromfile, unpackbits, hstack, expand_dims -from numpy import abs, all, sum, min, max, log, where, longlong, uint8 +from numpy import random, array, arange, zeros, abs, min, max, log, where, all, longlong from warnings import filterwarnings -from dsw import encode, decode, set_vt, repair_dna, obtain_vertices, bit_to_number, dna_to_number, number_to_bit +from dsw import set_vt, repair_dna, obtain_vertices, bit_to_number from dsw import Monitor, accessor_to_latter_map, approximate_capacity, remove_nasty_arc filterwarnings("ignore", category=RuntimeWarning) @@ -412,7 +410,7 @@ def next(self, nucleotide_index, current_score): records = zeros(shape=(repeats,), dtype=int) for repeat in range(repeats): print("Run test " + str(repeat + 1) + "/" + str(repeats) + " with " + str(errors) + " errors.") - used_message, found = random.randint(0, 2, size=(dna_length // 3 + 1,)), False + used_message = random.randint(0, 2, size=(dna_length // 3 + 1,)) right_dna_sequence = hedges_encode(binary_message=used_message, strand_index=repeat, pattern=[1, 0, 0]) wrong_dna_sequence = introduce_errors(right_dna_sequence=right_dna_sequence, errors=errors) availables, size, step, process = hedges_decode(dna_sequence=wrong_dna_sequence, strand_index=repeat, @@ -420,7 +418,6 @@ def next(self, nucleotide_index, current_score): correct_penalty=-0.324) for available in availables: if all(available[:-1] == used_message[:-1]): - found = True break records[repeat] = step diff --git a/experiments/data/README.md b/experiments/data/README.md index 6c652b4..4d0b8db 100644 --- a/experiments/data/README.md +++ b/experiments/data/README.md @@ 
-7,4 +7,4 @@ The simplified supporting data support all figures and tables in the main text a | file name | MD5 | file size | | ---- | ---- | ---- | -| supplementary data.xlsx | 81FE364BDB95A5A86B3519A161401DEB | 13,576 KB | \ No newline at end of file +| supplementary data.xlsx | DF3A194087004E4203D968E781E5AD33 | 13,579 KB | \ No newline at end of file diff --git a/experiments/show/README.md b/experiments/show/README.md index f176cb3..1dd143c 100644 --- a/experiments/show/README.md +++ b/experiments/show/README.md @@ -1,5 +1,5 @@ After obtaining [the raw data](https://github.com/HaolingZHANG/DNASpiderWeb/blob/main/experiments/raw), -you can generate all the figures (13) and tables (8) through +you can generate all the figures (13) and tables (9) through [show_main.py](https://github.com/HaolingZHANG/DNASpiderWeb/blob/main/experiments/show_main.py) and [show_supp.py](https://github.com/HaolingZHANG/DNASpiderWeb/blob/main/experiments/show_supp.py). \ No newline at end of file diff --git a/experiments/show_main.py b/experiments/show_main.py index 4dae2b7..521cb60 100644 --- a/experiments/show_main.py +++ b/experiments/show_main.py @@ -22,9 +22,10 @@ def main01(): pyplot.figure(figsize=(10, 7), tight_layout=True) ax = pyplot.subplot(3, 1, 1) + pyplot.text(0.03, 0.98, "a", va="center", ha="center", fontsize=14) pyplot.text(0.03, 0.5, "generating process", color="#0070C0", - va="center", ha="center", rotation="vertical", fontsize=12) - pyplot.vlines(0.1, 0, 1, color="#0070C0", lw=1.5) + va="center", ha="center", rotation="vertical", fontsize=11) + pyplot.vlines(0.1, 0.05, 0.95, color="#0070C0", lw=1.5) ax.add_patch(patches.Circle(xy=(0.5, 0.5), radius=0.200, facecolor="#EEEEEE", edgecolor="#000000", lw=0.75)) pyplot.text(0.5, 0.5, r"$\mathcal{D}_4^k$", va="center", ha="center", fontsize=14) pyplot.text(0.5, 0.1, "quaternary de Bruijn graph\nof order " + r"$k$", @@ -88,10 +89,11 @@ def main01(): pyplot.ylim(0, 1) pyplot.subplot(3, 1, 2) + pyplot.text(0.03, 0.98, "b", 
va="center", ha="center", fontsize=14) pyplot.text(0.03, 0.5, "coding process", color="#0070C0", - va="center", ha="center", rotation="vertical", fontsize=12) - pyplot.vlines(0.1, 0, 1, color="#0070C0", lw=1.5) - pyplot.text(0.5, 0.92, "part of coding digraph", va="center", ha="center", fontsize=10) + va="center", ha="center", rotation="vertical", fontsize=11) + pyplot.vlines(0.1, 0.05, 0.95, color="#0070C0", lw=1.5) + pyplot.text(0.5, 0.92, "walk of coding digraph", va="center", ha="center", fontsize=10) points = array([[0.5, 0.2, 0.4, 0.6, 0.8, 0.5, 0.7, 0.4, 0.6, 0.4, 0.6, 0.8], [0.9, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.3, 0.3, 0.1, 0.1, 0.1]]) points[1] -= 0.05 @@ -188,9 +190,10 @@ def main01(): pyplot.ylim(0, 1) pyplot.subplot(3, 1, 3) + pyplot.text(0.03, 0.98, "c", va="center", ha="center", fontsize=14) pyplot.text(0.03, 0.5, "error correcting process", color="#0070C0", - va="center", ha="center", rotation="vertical", fontsize=12) - pyplot.vlines(0.1, 0, 1, color="#0070C0", lw=1.5) + va="center", ha="center", rotation="vertical", fontsize=11) + pyplot.vlines(0.1, 0.05, 0.95, color="#0070C0", lw=1.5) pyplot.fill_between([0.15, 3.50], 0.05, 0.95, color="#EFEFEF", lw=0) points = array([[0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 0.4, 0.6, 0.8, 1.0, 1.2, 0.4, 0.6, 0.8, 1.0, 0.4, 0.6, 0.8], [0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.55, 0.55, 0.55, 0.55, 0.55, 0.4, 0.4, 0.4, 0.4, 0.25, 0.25, 0.25]]) @@ -419,7 +422,7 @@ def draw_graph(ps, s, w, se=None, h=None): arrowprops=dict(arrowstyle="<|-", color="red", lw=1)) pyplot.text(0.35, 0.14, "normal arc", va="center", ha="left", fontsize=8) pyplot.text(0.35, 0.06, "access arc (error found)", va="center", ha="left", fontsize=8) - pyplot.fill_between([0.7 - 0.017, 0.7 + 0.017], 0.14 - 0.015, 0.14 + 0.018, lw=0, color="lightblue") + pyplot.fill_between([0.66 - 0.017, 0.66 + 0.017], 0.14 - 0.015, 0.14 + 0.018, lw=0, color="lightblue") pyplot.text(0.66, 0.14, "N", va="center", ha="center", fontsize=8) pyplot.text(0.70, 0.14, "adjust 
(substitute/insert/delete) by N", va="center", ha="left", fontsize=8) pyplot.text(0.94, 0.06, "N", va="center", ha="center", color="r", fontsize=8) @@ -632,7 +635,7 @@ def main03(): pyplot.plot(arange(len(values)) + 0.5, values + 0.5, lw=0.75, ls="--", color="k", zorder=1) pyplot.legend(loc="upper right", ncol=2, framealpha=1, title="% of edit errors", fontsize=9, title_fontsize=9) pyplot.xlabel("reads number", fontsize=10) - pyplot.ylabel("maximum loss after retrieval", fontsize=10) + pyplot.ylabel("maximum number of\nuncorrectable sequences", fontsize=10) pyplot.xticks(arange(10) + 0.5, (arange(10) + 1) * 5, fontsize=10) pyplot.yticks(arange(6) + 0.5, ["1E+" + str(v) for v in arange(6)], fontsize=10) pyplot.xlim(0, 10) @@ -650,18 +653,18 @@ def main03(): show_data[show_data <= 0] = nan # noinspection PyUnresolvedReferences - ax.add_patch(patches.FancyBboxPatch((0.25, 5.85), width=3.3, height=1.9, + ax.add_patch(patches.FancyBboxPatch((0.25, 5.85), width=4.6, height=1.9, fc="w", ec="silver", boxstyle="Round,pad=0.1", lw=0.75)) - pyplot.text(1.9, 7.3, "smallest gap between\ncorrect-incorrect reads", va="center", ha="center", fontsize=9, - zorder=2) + pyplot.text(2.55, 7.3, "smallest frequency gap between\ncorrect-incorrect sequences", + va="center", ha="center", fontsize=9, zorder=2) for index, color in enumerate(pyplot.get_cmap("gist_earth_r")(linspace(0, 0.5, 40))): - pyplot.fill_between([0.4 + index * (3.0 / 40.0), 0.4 + (index + 1) * (3.0 / 40.0)], 6.5, 6.8, + pyplot.fill_between([0.4 + index * (4.25 / 40.0), 0.4 + (index + 1) * (4.25 / 40.0)], 6.5, 6.8, fc=color, lw=0, zorder=2) for index, number in enumerate([0, 10, 20, 30, 40]): - pyplot.vlines(0.4 + 0.75 * index, 6.5, 6.3, lw=0.75, zorder=2) - pyplot.text(0.4 + 0.75 * index, 6.0, str(number), va="center", ha="center", fontsize=9, zorder=2) - pyplot.plot([0.4, 3.4, 3.4, 0.4, 0.4], [6.8, 6.8, 6.5, 6.5, 6.8], color="k", lw=0.75, zorder=3) + pyplot.vlines(0.4 + 1.0625 * index, 6.5, 6.3, lw=0.75, zorder=2) + 
pyplot.text(0.4 + 1.0625 * index, 6.0, str(number), va="center", ha="center", fontsize=9, zorder=2) + pyplot.plot([0.40, 4.65, 4.65, 0.40, 0.40], [6.8, 6.8, 6.5, 6.5, 6.8], color="k", lw=0.75, zorder=3) pyplot.pcolormesh(arange(11), arange(9), show_data.T, vmin=0, vmax=70, cmap="gist_earth_r") for x in range(10): @@ -670,7 +673,7 @@ def main03(): pyplot.text(x + 0.5, y + 0.5, matrix[x, y], va="center", ha="center", fontsize=9) pyplot.xlabel("reads number", fontsize=10) - pyplot.ylabel("error rate", fontsize=10) + pyplot.ylabel("error rate", fontsize=10, labelpad=12) pyplot.xticks(arange(10) + 0.5, (arange(10) + 1) * 5, fontsize=10) pyplot.yticks(arange(8) + 0.5, ["%.1f" % ((v + 1) / 2) + "%" for v in arange(8)], fontsize=10) pyplot.xlim(0, 10) @@ -691,14 +694,14 @@ def main03(): for retrieval_rate, style in zip([1.0, 0.999, 0.99], used_styles): x_values, y_values = data[(error_rate, retrieval_rate)] info_1 = "%.1f" % (error_rate * 100) + "%" - info_2 = "%.1f" % (retrieval_rate * 100) + "%" if retrieval_rate < 1 else "lossless" + info_2 = "%.1f" % (retrieval_rate * 100) + "%" if retrieval_rate < 1 else " 100%" pyplot.plot(log10(x_values), y_values, color=color, lw=2, ls=style, zorder=1, label=info_1 + " | " + info_2) pyplot.legend(loc="upper left", ncol=2, fontsize=9, title_fontsize=9, title=" minimum reads number\nfor \"error rate | retrieval rate\"") pyplot.xlabel("sequence diversity", fontsize=10) - pyplot.ylabel("reads number", fontsize=10) + pyplot.ylabel("reads number", fontsize=10, labelpad=12) pyplot.xticks(arange(0, 13), ["1E+" + str(value).zfill(2) if value % 3 == 0 else "" for value in arange(0, 13)], fontsize=10) pyplot.yticks(arange(0, 281, 40), arange(0, 281, 40), fontsize=10) @@ -735,9 +738,9 @@ def function_0_5(diversity, depth): pyplot.plot(x_values, y_values, color=used_colors[1], lw=2, ls=style, label="0.5% | " + diversity_name) pyplot.legend(loc="upper left", ncol=2, fontsize=9, title_fontsize=9, - title=" non-blocking reads threshold\nfor 
\"error rate | sequence diversity\"") + title=" non-blocking threshold\nfor \"error rate | sequence diversity\"") pyplot.xlabel("reads number", fontsize=10) - pyplot.ylabel("reads threshold", fontsize=10) + pyplot.ylabel("maximum frequency of\nincorrect DNA sequences", fontsize=10) pyplot.xticks(arange(0, 281, 40), arange(0, 281, 40), fontsize=10) pyplot.yticks(arange(0, 141, 20), arange(0, 141, 20), fontsize=10) pyplot.xlim(0, 280) @@ -748,10 +751,10 @@ def function_0_5(diversity, depth): ax.spines["right"].set_visible(False) figure.align_labels() - figure.text(0.020, 0.99, "a", va="center", ha="center", fontsize=14) - figure.text(0.518, 0.99, "b", va="center", ha="center", fontsize=14) - figure.text(0.020, 0.49, "c", va="center", ha="center", fontsize=14) - figure.text(0.518, 0.49, "d", va="center", ha="center", fontsize=14) + figure.text(0.024, 0.99, "a", va="center", ha="center", fontsize=14) + figure.text(0.523, 0.99, "b", va="center", ha="center", fontsize=14) + figure.text(0.024, 0.49, "c", va="center", ha="center", fontsize=14) + figure.text(0.523, 0.49, "d", va="center", ha="center", fontsize=14) pyplot.savefig("./show/main03.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() @@ -773,17 +776,17 @@ def main04(): pyplot.bar([-1], [-1], fc="#E2F0D9", ec="k", lw=0.75, width=0.3, label="high-throughput sequencing") for index, speed in enumerate(correction_speeds): - height = (speed - 100000) / 250 + 1600 + height = (speed * 144 - 10000000) / 50000 + 1600 pyplot.vlines(index - 0.3, 0, 1300, lw=0.75, zorder=0) pyplot.vlines(index, 0, 1300, lw=0.75, zorder=0) pyplot.plot([index - 0.3, index - 0.3, index, index], [1500, height, height, 1500], lw=0.75, color="k", zorder=0) - pyplot.text(index - 0.15, height + 20, int(speed), va="bottom", ha="center", fontsize=9) + pyplot.text(index - 0.15, height + 20, int(speed * 144), va="bottom", ha="center", fontsize=9) pyplot.fill_between([index - 0.3, index], 0, 1300, fc="#E2F0D9", ec="w", lw=0, zorder=-1) 
pyplot.fill_between([index - 0.3, index], 1500, height, fc="#E2F0D9", ec="w", lw=0, zorder=-1) pyplot.text(-0.5, 1400, " ", bbox=dict(fc="w", ec="w"), va="center", ha="center", fontsize=1, zorder=5) - figure.add_artist(lines.Line2D([0.100, 0.108], [0.703, 0.713], lw=1, color="k")) - figure.add_artist(lines.Line2D([0.100, 0.108], [0.728, 0.738], lw=1, color="k")) + figure.add_artist(lines.Line2D([0.115, 0.123], [0.722, 0.732], lw=1, color="k")) + figure.add_artist(lines.Line2D([0.115, 0.123], [0.746, 0.756], lw=1, color="k")) pyplot.bar(arange(len(correction_speeds)) + 0.15, correction_speeds / 450, fc="#A9D18E", ec="k", lw=0.75, width=0.3, label="single-molecule sequencing") for h_location, value in enumerate(correction_speeds / 450): @@ -793,10 +796,10 @@ def main04(): pyplot.ylabel("performance\n(correction speed / sequencing speed)", fontsize=10) pyplot.xticks(arange(len(correction_speeds)), ["%.1f" % (value / 2) + "%" for value in arange(len(correction_speeds))], fontsize=10) - pyplot.yticks([0, 400, 800, 1200, 1600, 2000, 2400, 2800, 3200, 3600], - [0, 400, 800, 1200, 100000, 200000, 300000, 400000, 500000, 600000], fontsize=10) + pyplot.yticks([0, 400, 800, 1200, 1600, 2000, 2400, 2800, 3200], + [0, 400, 800, 1200, 10000000, 30000000, 50000000, 70000000, 90000000], fontsize=10) pyplot.xlim(-0.5, len(correction_speeds) - 0.5) - pyplot.ylim(0, 3600) + pyplot.ylim(0, 3200) # noinspection PyUnresolvedReferences ax.spines["top"].set_visible(False) # noinspection PyUnresolvedReferences diff --git a/experiments/show_supp.py b/experiments/show_supp.py index 3dd5690..fb07420 100644 --- a/experiments/show_supp.py +++ b/experiments/show_supp.py @@ -24,12 +24,32 @@ def supp01(): + book = Workbook() + sheet = book.create_sheet(title="supp01", index=0) + + sheet.append(["micro-organism", "undesired motifs"]) + sheet.append(["Bacillus subtilis", "GTCGAC"]) + sheet.append(["Bifidobacterium bifidum", "CTGCAG"]) + sheet.append(["Escherichia coli", "CAGCAG and GATATC"]) + 
sheet.append(["Gluconobacter dioxyacetonicus", "AGGCCT"]) + sheet.append(["Haemophilus gallinarum ", "GACGC"]) + sheet.append(["Klebsiella pneumoniae", "GGTACC"]) + sheet.append(["Oerskovia xanthineolytica", "AGCT"]) + sheet.append(["Streptomyces achromogenes", "ACTAGT"]) + sheet.append(["Streptomyces caespitosus", "AGTACT"]) + sheet.append(["Streptomyces phaeochromogenes", "GCATGC"]) + sheet.append(["Streptomyces stanford", "GAGCTC"]) + sheet.append(["Xanthomonas badrii", "TCTAGA"]) + + book.save("./show/supp01.xlsx") + + +def supp02(): capacities, _ = load_data(load_path="./raw/capacity_reference.pkl") book = Workbook() - sheet = book.create_sheet(title="supp01", index=0) - sheet.append(["set index", "homopolymer run-length", "regionalized GC content", "undesired motifs", - "capacity"]) + sheet = book.create_sheet(title="supp02", index=0) + sheet.append(["set index", "homopolymer run-length", "regionalized GC content", "undesired motifs", "capacity"]) for constraint_index, bio_filter in local_bio_filters.items(): save_data = [constraint_index] @@ -65,10 +85,10 @@ def supp01(): sheet.append(save_data) - book.save("./show/supp01.xlsx") + book.save("./show/supp02.xlsx") -def supp02(): +def supp03(): filter_indices = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] colors = pyplot.get_cmap("Set3")(linspace(0, 1, 12)) record = load_data(load_path="./raw/generation_evaluation.pkl") @@ -93,11 +113,11 @@ def supp02(): # noinspection PyUnresolvedReferences ax.spines["right"].set_visible(False) - pyplot.savefig("./show/supp02.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp03.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp03(): +def supp04(): filter_indices = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] colors = pyplot.get_cmap("Set3")(linspace(0, 1, 12)) record = load_data(load_path="./raw/generation_evaluation.pkl") @@ -123,16 +143,16 @@ def supp03(): # 
noinspection PyUnresolvedReferences ax.spines["right"].set_visible(False) - pyplot.savefig("./show/supp03.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp04.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp04(): +def supp05(): record = load_data(load_path="./raw/generation_evaluation.pkl") change_data = record["change"] book = Workbook() - sheet = book.create_sheet(title="supp04", index=0) + sheet = book.create_sheet(title="supp05", index=0) sheet.append(["trimming cycle", "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10", "C11", "C12"]) for cycle_index in range(13): @@ -145,10 +165,10 @@ def supp04(): values.append("") sheet.append(values) - book.save("./show/supp04.xlsx") + book.save("./show/supp05.xlsx") -def supp05(): +def supp06(): random_seed, param_number = 2021, 100 random.seed(random_seed) @@ -167,7 +187,7 @@ def supp05(): hedges_params = array(hedges_params).tolist() book = Workbook() - sheet = book.create_sheet(title="supp05", index=0) + sheet = book.create_sheet(title="supp06", index=0) sheet.append(["param index", "DNA Fountain", "", "Yin-Yang Code", "HEDGES", ""]) sheet.append(["", "c", "delta", "rule id", "pattern", "correct penalty"]) for param_index in range(param_number): @@ -184,10 +204,10 @@ def supp05(): sheet.append(record) - book.save("./show/supp05.xlsx") + book.save("./show/supp06.xlsx") -def supp06(): +def supp07(): random_seed, param_number = 2021, 100 spiderweb_params = [] @@ -201,7 +221,7 @@ def supp06(): spiderweb_params = array(spiderweb_params).T.tolist() book = Workbook() - sheet = book.create_sheet(title="supp08", index=0) + sheet = book.create_sheet(title="supp07", index=0) sheet.append(["param index", "SPIDER-WEB", "", "", "", "", "", "", "", "", "", "", ""]) sheet.append(["", "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10", "C11", "C12"]) for param_index in range(param_number): @@ -211,16 +231,16 @@ def supp06(): sheet.append(record) 
- book.save("./show/supp06.xlsx") + book.save("./show/supp07.xlsx") -def supp07(): +def supp08(): display_data = [[[] for _ in range(4)] for _ in range(12)] record = load_data(load_path="./raw/coding_evaluation_1.pkl") for algorithm_index, filter_index, _, _, code_rate in record: display_data[int(filter_index - 1)][int(algorithm_index - 1)].append(code_rate) book = Workbook() - sheet = book.create_sheet(title="supp07", index=0) + sheet = book.create_sheet(title="supp08", index=0) sheet.append(["set index", "DNA Fountain", "Yin-Yang Code", "HEDGES", "SPIDER-WEB"]) for index, data in enumerate(display_data): record = [str(index + 1).zfill(2)] @@ -235,17 +255,17 @@ def supp07(): record.append("%.2f ~ %.2f" % (min(sub_data), max(sub_data))) sheet.append(record) - book.save("./show/supp07.xlsx") + book.save("./show/supp08.xlsx") -def supp08(): +def supp09(): display_data = [[[] for _ in range(4)] for _ in range(12)] record = load_data(load_path="./raw/coding_evaluation_1.pkl") for algorithm_index, filter_index, _, _, code_rate in record: display_data[int(filter_index) - 1][int(algorithm_index) - 1].append(code_rate) book = Workbook() - sheet = book.create_sheet(title="supp08", index=0) + sheet = book.create_sheet(title="supp09", index=0) sheet.append(["set index", "DNA Fountain", "Yin-Yang Code", "HEDGES", "SPIDER-WEB"]) for index, data in enumerate(display_data): record = [str(index + 1).zfill(2)] @@ -260,10 +280,10 @@ def supp08(): record.append("%.4f" % std(sub_data)) sheet.append(record) - book.save("./show/supp08.xlsx") + book.save("./show/supp09.xlsx") -def supp09(): +def supp10(): counts = ones(shape=(12, 6)) * 100 record = load_data(load_path="./raw/coding_evaluation_1.pkl") for algorithm_index, filter_index, param_index, _, code_rate in record: @@ -272,7 +292,7 @@ def supp09(): counts /= 100.0 book = Workbook() - sheet = book.create_sheet(title="supp09", index=0) + sheet = book.create_sheet(title="supp10", index=0) sheet.append(["set index", "pattern 1", 
"pattern 2", "pattern 3", "pattern 4", "pattern 5", "pattern 6"]) for index, data in enumerate(counts): record = [str(index + 1).zfill(2)] @@ -280,10 +300,10 @@ def supp09(): record.append(str(int(value * 100)) + "%") sheet.append(record) - book.save("./show/supp09.xlsx") + book.save("./show/supp10.xlsx") -def supp10(): +def supp11(): filter_indices = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] capacities, _ = load_data(load_path="./raw/capacity_reference.pkl") @@ -319,11 +339,11 @@ def supp10(): pyplot.yticks([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0], ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9", "2.0"], fontsize=10) pyplot.ylim(0.95, 2.05) - pyplot.savefig("./show/supp10.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp11.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp11(): +def supp12(): data = load_data(load_path="./raw/correction_evaluation_1.pkl") pyplot.figure(figsize=(10, 6), tight_layout=True) @@ -365,11 +385,11 @@ def supp11(): # noinspection PyUnresolvedReferences ax.spines["right"].set_visible(False) - pyplot.savefig("./show/supp11.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp12.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp12(): +def supp13(): data = load_data(load_path="./raw/correction_evaluation_1.pkl") pyplot.figure(figsize=(10, 6), tight_layout=True) @@ -407,11 +427,11 @@ def supp12(): pyplot.yticks([0, 2, 4, 6, 8, 10], [1, 4, 16, 64, 256, 1024], fontsize=10) pyplot.xlim(-0.8, 7.8) pyplot.ylim(-0.5, 10.5) - pyplot.savefig("./show/supp12.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp13.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp13(): +def supp14(): data = load_data(load_path="./raw/correction_evaluation_4.pkl") figure = pyplot.figure(figsize=(10, 5)) @@ -448,11 +468,11 @@ def supp13(): 
bar.set_label("minimum reads number to reach the given retrieval rate", fontsize=10) bar.ax.tick_params(labelsize=10) - pyplot.savefig("./show/supp13.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp14.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp14(): +def supp15(): data = load_data(load_path="./raw/correction_evaluation_6.pkl") figure = pyplot.figure(figsize=(10, 6)) @@ -487,25 +507,25 @@ def supp14(): bar.set_label("maximum frequency of incorrect reads", fontsize=10) bar.ax.tick_params(labelsize=10) - pyplot.savefig("./show/supp14.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp15.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp15(): +def supp16(): data = load_data(load_path="./raw/correction_evaluation_7.pkl") book = Workbook() - sheet = book.create_sheet(title="supp15", index=0) + sheet = book.create_sheet(title="supp16", index=0) sheet.append(["error rate", "SPIDER-WEB (average)", "SPIDER-WEB (median)", "HEDGES (average)", "HEDGES (median)"]) for value in range(9): value_x = data["spiderweb"][value / 200.0] value_y = data["hedges"][value / 200.0] sheet.append([value / 200.0, mean(value_x), median(value_x), mean(value_y), median(value_y)]) - book.save("./show/supp15.xlsx") + book.save("./show/supp16.xlsx") -def supp16(): +def supp17(): filter_indices = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] coding_graphs = load_data(load_path="./raw/graph_coding.pkl") follow_ups = zeros(shape=(12, 3)) @@ -517,7 +537,7 @@ def supp16(): counts /= sum(counts) follow_ups[index] = counts - figure = pyplot.figure(figsize=(10, 6), tight_layout=True) + pyplot.figure(figsize=(10, 6), tight_layout=True) gradient_colors = pyplot.get_cmap("RdYlGn")(linspace(0, 1, 12)) for index, filter_index in enumerate(filter_indices): lengths = linspace(0, 256, 257) @@ -531,11 +551,11 @@ def supp16(): pyplot.yticks([0, 64, 128, 192, 256], ["2^0", "2^64", 
"2^128", "2^192", "2^256"], fontsize=10) pyplot.xlim(0, 256) pyplot.ylim(0, 256) - pyplot.savefig("./show/supp16.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp17.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp17(): +def supp18(): record = load_data(load_path="./raw/capacity_evaluation.pkl") data, errors = record["detail"] @@ -661,7 +681,7 @@ def supp17(): figure.text(0.024, 0.66, "b", va="center", ha="center", fontsize=14) figure.text(0.024, 0.34, "c", va="center", ha="center", fontsize=14) - pyplot.savefig("./show/supp17.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp18.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() @@ -683,3 +703,4 @@ def supp17(): supp15() supp16() supp17() + supp18() diff --git a/experiments/sort_data.py b/experiments/sort_data.py index 93a2b91..f903a38 100644 --- a/experiments/sort_data.py +++ b/experiments/sort_data.py @@ -109,6 +109,21 @@ def propose05(diversity): propose40(sequence_diversity), propose05(sequence_diversity)]) sheet = book.create_sheet(title="Table S1") + sheet.append(["micro-organism", "undesired motifs"]) + sheet.append(["Bacillus subtilis", "GTCGAC"]) + sheet.append(["Bifidobacterium bifidum", "CTGCAG"]) + sheet.append(["Escherichia coli", "CAGCAG and GATATC"]) + sheet.append(["Gluconobacter dioxyacetonicus", "AGGCCT"]) + sheet.append(["Haemophilus gallinarum ", "GACGC"]) + sheet.append(["Klebsiella pneumoniae", "GGTACC"]) + sheet.append(["Oerskovia xanthineolytica", "AGCT"]) + sheet.append(["Streptomyces achromogenes", "ACTAGT"]) + sheet.append(["Streptomyces caespitosus", "AGTACT"]) + sheet.append(["Streptomyces phaeochromogenes", "GCATGC"]) + sheet.append(["Streptomyces stanford", "GAGCTC"]) + sheet.append(["Xanthomonas badrii", "TCTAGA"]) + + sheet = book.create_sheet(title="Table S2") sheet.append(["set index", "homopolymer run-length", "regionalized GC content", "undesired motifs", "capacity"]) record, _ = 
load_data(load_path="./raw/capacity_reference.pkl") for constraint_index, bio_filter in local_bio_filters.items(): @@ -149,7 +164,7 @@ def propose05(diversity): for index in range(12): sheet.append([str(index + 1).zfill(2), record[str(index + 1).zfill(2)][0]]) - sheet = book.create_sheet(title="Table S2") + sheet = book.create_sheet(title="Table S3") sheet.append(["trimming cycle", "set 01", "set 02", "set 03", "set 04", "set 05", "set 06", "set 07", "set 08", "set 09", "set 10", "set 11", "set 12"]) @@ -164,7 +179,7 @@ def propose05(diversity): values.append("N/A") sheet.append(values) - sheet = book.create_sheet(title="Table S3") + sheet = book.create_sheet(title="Table S4") sheet.append(["param index", "c in DNA Fountain", "delta in DNA Fountain", "rule index in Yin-Yang Code", "pattern in HEDGES", "correct penalty in HEDGES"]) random_seed, param_number = 2021, 100 @@ -193,7 +208,7 @@ def propose05(diversity): info.append("N/A") sheet.append(info) - sheet = book.create_sheet(title="Table S4") + sheet = book.create_sheet(title="Table S5") sheet.append(["param index", *["set " + str(idx + 1).zfill(2) for idx in range(12)]]) random_seed, param_number, spiderweb_params = 2021, 100, [] record = load_data(load_path="./raw/graph_coding.pkl") @@ -210,7 +225,7 @@ def propose05(diversity): info.append(vertex_index) sheet.append(info) - sheet = book.create_sheet(title="Table S5,S6,S7") + sheet = book.create_sheet(title="Table S6,S7,S8") sheet.append(["set index", "parameter index", "data index", "code rate of DNA Fountain", "code rate of Yin-Yang Code", "code rate of HEDGES", "code rate of SPIDER-WEB"]) @@ -266,7 +281,7 @@ def propose05(diversity): for index_2, sequence_diversity in enumerate([1e1, 1e2, 1e3, 1e4, 1e5]): sheet.append([error_rate, reads_number, sequence_diversity, record[error_rate][index_1, index_2]]) - sheet = book.create_sheet(title="Table S8") + sheet = book.create_sheet(title="Table S9") sheet.append(["error rate", "test index", "vertex access frequency 
of SPIDER-WEB", "vertex access frequency of HEDGES"]) record = load_data(load_path="./raw/correction_evaluation_7.pkl") diff --git a/setup.py b/setup.py index 1166081..75ee8c9 100644 --- a/setup.py +++ b/setup.py @@ -6,30 +6,24 @@ description="SPIDER-WEB generates coding algorithms with " "superior error tolerance and real-time information retrieval capacity", long_description="DNA has been considered a promising medium for storing digital information. " - "As an essential step in the DNA-based data storage workflow, " - "coding scheme is responsible to implement functions " - "including bit-to-base transcoding, error correction, etc. " - "In previous studies, these functions are normally realized by " - "introducing independent coding algorithms. " - "Here, we report a graph-based architecture, named SPIDER-WEB, " - "providing an all-in-one coding solution by generating customized algorithms automatically. " - "SPIDER-WEB supports correcting at maximum 4% edit errors in the DNA sequences " - "including substitution and insertion/deletion (indel), " - "while the required logical redundancy at only 5.5%. " - "Since no DNA sequence pretreatment is required for correcting and decoding processes, " - "SPIDER-WEB offers the function of real-time information retrieval, " - "which is 305.08 times faster than the speed of single-molecule sequencing techniques." - " Our retrieval process can improve 2 orders of magnitude faster compared to conventional one " - "under megabyte-level data and can be scalable to fit exabyte-level data. " - "Therefore, SPIDER-WEB holds the potential to improve the practicability " - "in large-scale data storage applications.", + "Previously, functions including bit-to-base transcoding and error correction are implemented by " + "independent algorithms. 
It would result in either increased computational complexity " + "or compromised error tolerance when attempting to correct various types of errors, " + "especially for insertions/deletions (indels). To address this issue, " + "we report a graph-based architecture, named SPIDER-WEB, providing an all-in-one coding solution " + "by generating customized algorithms with an efficient built-in error correction function. " + "SPIDER-WEB is able to correct a maximum of 4% edit errors in the DNA sequences " + "including substitution and indel, with only 5.5% logical redundancy. In addition, " + "SPIDER-WEB enables real-time retrieval of megabyte-level data with a 100x execution speed " + "improvement over conventional methods and holds the potential of practicability " + "for large-scale data storage applications at the exabyte-level.", author="Haoling ZHANG", author_email="zhanghaoling@genomics.cn", url="https://github.com/HaolingZHANG/DNASpiderWeb", packages=["dsw", "tests"], install_requires=["numpy", "networkx"], - license="GPL", - classifiers=["License :: BGI Research", + license="BGI-Research", + classifiers=["License :: BGI-Research", "Programming Language :: Python :: 3", "Operating System :: OS Independent"], keywords="DNA-based data storage, coding algorithm, automatic algorithm generator, probabilistic error correction",