From c623fd71b2afa4b06fd1d1eddb07b3480afe0fc4 Mon Sep 17 00:00:00 2001 From: HorusZhang Date: Tue, 6 Jun 2023 16:52:24 +0800 Subject: [PATCH] organization --- README.md | 25 ++++----- experiments/code_repair.py | 9 +-- experiments/data/README.md | 2 +- experiments/show/README.md | 2 +- experiments/show_main.py | 67 ++++++++++++----------- experiments/show_supp.py | 109 ++++++++++++++++++++++--------------- experiments/sort_data.py | 25 +++++++-- setup.py | 32 +++++------ 8 files changed, 148 insertions(+), 123 deletions(-) diff --git a/README.md b/README.md index e340ca9..fd87265 100644 --- a/README.md +++ b/README.md @@ -7,21 +7,16 @@ [![License](https://img.shields.io/badge/License-BGI_Research-orange.svg)](https://github.com/HaolingZHANG/DNASpiderWeb/blob/main/LICENSE.pdf) -DNA has been considered a promising medium for storing digital information. -As an essential step in the DNA-based data storage workflow, -coding scheme is responsible to implement functions including bit-to-base transcoding, error correction, etc. -In previous studies, these functions are normally realized by introducing independent coding algorithms. -Here, we report a graph-based architecture, named SPIDER-WEB, -providing an all-in-one coding solution by generating customized algorithms automatically. -SPIDER-WEB supports correcting at maximum 4% edit errors in the DNA sequences -including substitution and insertion/deletion (indel), -while the required logical redundancy at only 5.5%. -Since no DNA sequence pretreatment is required for correcting and decoding processes, -SPIDER-WEB offers the function of real-time information retrieval, -which is 305.08 times faster than the speed of single-molecule sequencing techniques. -Our retrieval process can improve 2 orders of magnitude faster compared to conventional one -under megabyte-level data and can be scalable to fit exabyte-level data. 
-Therefore, SPIDER-WEB holds the potential to improve the practicability in large-scale data storage applications. +DNA has been considered a promising medium for storing digital information. +Previously, functions including bit-to-base transcoding and error correction are implemented by independent algorithms. +It would result in either increased computational complexity or compromised error tolerance +when attempting to correct various types of errors, especially for insertions/deletions (indels). +To address this issue, we report a graph-based architecture, named SPIDER-WEB, providing an all-in-one coding solution +by generating customized algorithms with an efficient built-in error correction function. +SPIDER-WEB is able to correct a maximum of 4% edit errors in the DNA sequences including substitution and indel, +with only 5.5% logical redundancy. In addition, SPIDER-WEB enables real-time retrieval of megabyte-level data with +a 100x execution speed improvement over conventional methods and hold the potential of practicability for +large-scale data storage applications at the exabyte-level. 
## Installation You can install this package using pip: diff --git a/experiments/code_repair.py b/experiments/code_repair.py index 460a449..f2a160b 100644 --- a/experiments/code_repair.py +++ b/experiments/code_repair.py @@ -1,11 +1,9 @@ from itertools import product from time import time -from networkx import DiGraph -from numpy import random, array, arange, ones, zeros, fromfile, unpackbits, hstack, expand_dims -from numpy import abs, all, sum, min, max, log, where, longlong, uint8 +from numpy import random, array, arange, zeros, abs, min, max, log, where, all, longlong from warnings import filterwarnings -from dsw import encode, decode, set_vt, repair_dna, obtain_vertices, bit_to_number, dna_to_number, number_to_bit +from dsw import set_vt, repair_dna, obtain_vertices, bit_to_number from dsw import Monitor, accessor_to_latter_map, approximate_capacity, remove_nasty_arc filterwarnings("ignore", category=RuntimeWarning) @@ -412,7 +410,7 @@ def next(self, nucleotide_index, current_score): records = zeros(shape=(repeats,), dtype=int) for repeat in range(repeats): print("Run test " + str(repeat + 1) + "/" + str(repeats) + " with " + str(errors) + " errors.") - used_message, found = random.randint(0, 2, size=(dna_length // 3 + 1,)), False + used_message = random.randint(0, 2, size=(dna_length // 3 + 1,)) right_dna_sequence = hedges_encode(binary_message=used_message, strand_index=repeat, pattern=[1, 0, 0]) wrong_dna_sequence = introduce_errors(right_dna_sequence=right_dna_sequence, errors=errors) availables, size, step, process = hedges_decode(dna_sequence=wrong_dna_sequence, strand_index=repeat, @@ -420,7 +418,6 @@ def next(self, nucleotide_index, current_score): correct_penalty=-0.324) for available in availables: if all(available[:-1] == used_message[:-1]): - found = True break records[repeat] = step diff --git a/experiments/data/README.md b/experiments/data/README.md index 6c652b4..4d0b8db 100644 --- a/experiments/data/README.md +++ b/experiments/data/README.md @@ 
-7,4 +7,4 @@ The simplified supporting data support all figures and tables in the main text a | file name | MD5 | file size | | ---- | ---- | ---- | -| supplementary data.xlsx | 81FE364BDB95A5A86B3519A161401DEB | 13,576 KB | \ No newline at end of file +| supplementary data.xlsx | DF3A194087004E4203D968E781E5AD33 | 13,579 KB | \ No newline at end of file diff --git a/experiments/show/README.md b/experiments/show/README.md index f176cb3..1dd143c 100644 --- a/experiments/show/README.md +++ b/experiments/show/README.md @@ -1,5 +1,5 @@ After obtaining [the raw data](https://github.com/HaolingZHANG/DNASpiderWeb/blob/main/experiments/raw), -you can generate all the figures (13) and tables (8) through +you can generate all the figures (13) and tables (9) through [show_main.py](https://github.com/HaolingZHANG/DNASpiderWeb/blob/main/experiments/show_main.py) and [show_supp.py](https://github.com/HaolingZHANG/DNASpiderWeb/blob/main/experiments/show_supp.py). \ No newline at end of file diff --git a/experiments/show_main.py b/experiments/show_main.py index 4dae2b7..521cb60 100644 --- a/experiments/show_main.py +++ b/experiments/show_main.py @@ -22,9 +22,10 @@ def main01(): pyplot.figure(figsize=(10, 7), tight_layout=True) ax = pyplot.subplot(3, 1, 1) + pyplot.text(0.03, 0.98, "a", va="center", ha="center", fontsize=14) pyplot.text(0.03, 0.5, "generating process", color="#0070C0", - va="center", ha="center", rotation="vertical", fontsize=12) - pyplot.vlines(0.1, 0, 1, color="#0070C0", lw=1.5) + va="center", ha="center", rotation="vertical", fontsize=11) + pyplot.vlines(0.1, 0.05, 0.95, color="#0070C0", lw=1.5) ax.add_patch(patches.Circle(xy=(0.5, 0.5), radius=0.200, facecolor="#EEEEEE", edgecolor="#000000", lw=0.75)) pyplot.text(0.5, 0.5, r"$\mathcal{D}_4^k$", va="center", ha="center", fontsize=14) pyplot.text(0.5, 0.1, "quaternary de Bruijn graph\nof order " + r"$k$", @@ -88,10 +89,11 @@ def main01(): pyplot.ylim(0, 1) pyplot.subplot(3, 1, 2) + pyplot.text(0.03, 0.98, "b", 
va="center", ha="center", fontsize=14) pyplot.text(0.03, 0.5, "coding process", color="#0070C0", - va="center", ha="center", rotation="vertical", fontsize=12) - pyplot.vlines(0.1, 0, 1, color="#0070C0", lw=1.5) - pyplot.text(0.5, 0.92, "part of coding digraph", va="center", ha="center", fontsize=10) + va="center", ha="center", rotation="vertical", fontsize=11) + pyplot.vlines(0.1, 0.05, 0.95, color="#0070C0", lw=1.5) + pyplot.text(0.5, 0.92, "walk of coding digraph", va="center", ha="center", fontsize=10) points = array([[0.5, 0.2, 0.4, 0.6, 0.8, 0.5, 0.7, 0.4, 0.6, 0.4, 0.6, 0.8], [0.9, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.3, 0.3, 0.1, 0.1, 0.1]]) points[1] -= 0.05 @@ -188,9 +190,10 @@ def main01(): pyplot.ylim(0, 1) pyplot.subplot(3, 1, 3) + pyplot.text(0.03, 0.98, "c", va="center", ha="center", fontsize=14) pyplot.text(0.03, 0.5, "error correcting process", color="#0070C0", - va="center", ha="center", rotation="vertical", fontsize=12) - pyplot.vlines(0.1, 0, 1, color="#0070C0", lw=1.5) + va="center", ha="center", rotation="vertical", fontsize=11) + pyplot.vlines(0.1, 0.05, 0.95, color="#0070C0", lw=1.5) pyplot.fill_between([0.15, 3.50], 0.05, 0.95, color="#EFEFEF", lw=0) points = array([[0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 0.4, 0.6, 0.8, 1.0, 1.2, 0.4, 0.6, 0.8, 1.0, 0.4, 0.6, 0.8], [0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.55, 0.55, 0.55, 0.55, 0.55, 0.4, 0.4, 0.4, 0.4, 0.25, 0.25, 0.25]]) @@ -419,7 +422,7 @@ def draw_graph(ps, s, w, se=None, h=None): arrowprops=dict(arrowstyle="<|-", color="red", lw=1)) pyplot.text(0.35, 0.14, "normal arc", va="center", ha="left", fontsize=8) pyplot.text(0.35, 0.06, "access arc (error found)", va="center", ha="left", fontsize=8) - pyplot.fill_between([0.7 - 0.017, 0.7 + 0.017], 0.14 - 0.015, 0.14 + 0.018, lw=0, color="lightblue") + pyplot.fill_between([0.66 - 0.017, 0.66 + 0.017], 0.14 - 0.015, 0.14 + 0.018, lw=0, color="lightblue") pyplot.text(0.66, 0.14, "N", va="center", ha="center", fontsize=8) pyplot.text(0.70, 0.14, "adjust 
(substitute/insert/delete) by N", va="center", ha="left", fontsize=8) pyplot.text(0.94, 0.06, "N", va="center", ha="center", color="r", fontsize=8) @@ -632,7 +635,7 @@ def main03(): pyplot.plot(arange(len(values)) + 0.5, values + 0.5, lw=0.75, ls="--", color="k", zorder=1) pyplot.legend(loc="upper right", ncol=2, framealpha=1, title="% of edit errors", fontsize=9, title_fontsize=9) pyplot.xlabel("reads number", fontsize=10) - pyplot.ylabel("maximum loss after retrieval", fontsize=10) + pyplot.ylabel("maximum number of\nuncorrectable sequences", fontsize=10) pyplot.xticks(arange(10) + 0.5, (arange(10) + 1) * 5, fontsize=10) pyplot.yticks(arange(6) + 0.5, ["1E+" + str(v) for v in arange(6)], fontsize=10) pyplot.xlim(0, 10) @@ -650,18 +653,18 @@ def main03(): show_data[show_data <= 0] = nan # noinspection PyUnresolvedReferences - ax.add_patch(patches.FancyBboxPatch((0.25, 5.85), width=3.3, height=1.9, + ax.add_patch(patches.FancyBboxPatch((0.25, 5.85), width=4.6, height=1.9, fc="w", ec="silver", boxstyle="Round,pad=0.1", lw=0.75)) - pyplot.text(1.9, 7.3, "smallest gap between\ncorrect-incorrect reads", va="center", ha="center", fontsize=9, - zorder=2) + pyplot.text(2.55, 7.3, "smallest frequency gap between\ncorrect-incorrect sequences", + va="center", ha="center", fontsize=9, zorder=2) for index, color in enumerate(pyplot.get_cmap("gist_earth_r")(linspace(0, 0.5, 40))): - pyplot.fill_between([0.4 + index * (3.0 / 40.0), 0.4 + (index + 1) * (3.0 / 40.0)], 6.5, 6.8, + pyplot.fill_between([0.4 + index * (4.25 / 40.0), 0.4 + (index + 1) * (4.25 / 40.0)], 6.5, 6.8, fc=color, lw=0, zorder=2) for index, number in enumerate([0, 10, 20, 30, 40]): - pyplot.vlines(0.4 + 0.75 * index, 6.5, 6.3, lw=0.75, zorder=2) - pyplot.text(0.4 + 0.75 * index, 6.0, str(number), va="center", ha="center", fontsize=9, zorder=2) - pyplot.plot([0.4, 3.4, 3.4, 0.4, 0.4], [6.8, 6.8, 6.5, 6.5, 6.8], color="k", lw=0.75, zorder=3) + pyplot.vlines(0.4 + 1.0625 * index, 6.5, 6.3, lw=0.75, zorder=2) + 
pyplot.text(0.4 + 1.0625 * index, 6.0, str(number), va="center", ha="center", fontsize=9, zorder=2) + pyplot.plot([0.40, 4.65, 4.65, 0.40, 0.40], [6.8, 6.8, 6.5, 6.5, 6.8], color="k", lw=0.75, zorder=3) pyplot.pcolormesh(arange(11), arange(9), show_data.T, vmin=0, vmax=70, cmap="gist_earth_r") for x in range(10): @@ -670,7 +673,7 @@ def main03(): pyplot.text(x + 0.5, y + 0.5, matrix[x, y], va="center", ha="center", fontsize=9) pyplot.xlabel("reads number", fontsize=10) - pyplot.ylabel("error rate", fontsize=10) + pyplot.ylabel("error rate", fontsize=10, labelpad=12) pyplot.xticks(arange(10) + 0.5, (arange(10) + 1) * 5, fontsize=10) pyplot.yticks(arange(8) + 0.5, ["%.1f" % ((v + 1) / 2) + "%" for v in arange(8)], fontsize=10) pyplot.xlim(0, 10) @@ -691,14 +694,14 @@ def main03(): for retrieval_rate, style in zip([1.0, 0.999, 0.99], used_styles): x_values, y_values = data[(error_rate, retrieval_rate)] info_1 = "%.1f" % (error_rate * 100) + "%" - info_2 = "%.1f" % (retrieval_rate * 100) + "%" if retrieval_rate < 1 else "lossless" + info_2 = "%.1f" % (retrieval_rate * 100) + "%" if retrieval_rate < 1 else " 100%" pyplot.plot(log10(x_values), y_values, color=color, lw=2, ls=style, zorder=1, label=info_1 + " | " + info_2) pyplot.legend(loc="upper left", ncol=2, fontsize=9, title_fontsize=9, title=" minimum reads number\nfor \"error rate | retrieval rate\"") pyplot.xlabel("sequence diversity", fontsize=10) - pyplot.ylabel("reads number", fontsize=10) + pyplot.ylabel("reads number", fontsize=10, labelpad=12) pyplot.xticks(arange(0, 13), ["1E+" + str(value).zfill(2) if value % 3 == 0 else "" for value in arange(0, 13)], fontsize=10) pyplot.yticks(arange(0, 281, 40), arange(0, 281, 40), fontsize=10) @@ -735,9 +738,9 @@ def function_0_5(diversity, depth): pyplot.plot(x_values, y_values, color=used_colors[1], lw=2, ls=style, label="0.5% | " + diversity_name) pyplot.legend(loc="upper left", ncol=2, fontsize=9, title_fontsize=9, - title=" non-blocking reads threshold\nfor 
\"error rate | sequence diversity\"") + title=" non-blocking threshold\nfor \"error rate | sequence diversity\"") pyplot.xlabel("reads number", fontsize=10) - pyplot.ylabel("reads threshold", fontsize=10) + pyplot.ylabel("maximum frequency of\nincorrect DNA sequences", fontsize=10) pyplot.xticks(arange(0, 281, 40), arange(0, 281, 40), fontsize=10) pyplot.yticks(arange(0, 141, 20), arange(0, 141, 20), fontsize=10) pyplot.xlim(0, 280) @@ -748,10 +751,10 @@ def function_0_5(diversity, depth): ax.spines["right"].set_visible(False) figure.align_labels() - figure.text(0.020, 0.99, "a", va="center", ha="center", fontsize=14) - figure.text(0.518, 0.99, "b", va="center", ha="center", fontsize=14) - figure.text(0.020, 0.49, "c", va="center", ha="center", fontsize=14) - figure.text(0.518, 0.49, "d", va="center", ha="center", fontsize=14) + figure.text(0.024, 0.99, "a", va="center", ha="center", fontsize=14) + figure.text(0.523, 0.99, "b", va="center", ha="center", fontsize=14) + figure.text(0.024, 0.49, "c", va="center", ha="center", fontsize=14) + figure.text(0.523, 0.49, "d", va="center", ha="center", fontsize=14) pyplot.savefig("./show/main03.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() @@ -773,17 +776,17 @@ def main04(): pyplot.bar([-1], [-1], fc="#E2F0D9", ec="k", lw=0.75, width=0.3, label="high-throughput sequencing") for index, speed in enumerate(correction_speeds): - height = (speed - 100000) / 250 + 1600 + height = (speed * 144 - 10000000) / 50000 + 1600 pyplot.vlines(index - 0.3, 0, 1300, lw=0.75, zorder=0) pyplot.vlines(index, 0, 1300, lw=0.75, zorder=0) pyplot.plot([index - 0.3, index - 0.3, index, index], [1500, height, height, 1500], lw=0.75, color="k", zorder=0) - pyplot.text(index - 0.15, height + 20, int(speed), va="bottom", ha="center", fontsize=9) + pyplot.text(index - 0.15, height + 20, int(speed * 144), va="bottom", ha="center", fontsize=9) pyplot.fill_between([index - 0.3, index], 0, 1300, fc="#E2F0D9", ec="w", lw=0, zorder=-1) 
pyplot.fill_between([index - 0.3, index], 1500, height, fc="#E2F0D9", ec="w", lw=0, zorder=-1) pyplot.text(-0.5, 1400, " ", bbox=dict(fc="w", ec="w"), va="center", ha="center", fontsize=1, zorder=5) - figure.add_artist(lines.Line2D([0.100, 0.108], [0.703, 0.713], lw=1, color="k")) - figure.add_artist(lines.Line2D([0.100, 0.108], [0.728, 0.738], lw=1, color="k")) + figure.add_artist(lines.Line2D([0.115, 0.123], [0.722, 0.732], lw=1, color="k")) + figure.add_artist(lines.Line2D([0.115, 0.123], [0.746, 0.756], lw=1, color="k")) pyplot.bar(arange(len(correction_speeds)) + 0.15, correction_speeds / 450, fc="#A9D18E", ec="k", lw=0.75, width=0.3, label="single-molecule sequencing") for h_location, value in enumerate(correction_speeds / 450): @@ -793,10 +796,10 @@ def main04(): pyplot.ylabel("performance\n(correction speed / sequencing speed)", fontsize=10) pyplot.xticks(arange(len(correction_speeds)), ["%.1f" % (value / 2) + "%" for value in arange(len(correction_speeds))], fontsize=10) - pyplot.yticks([0, 400, 800, 1200, 1600, 2000, 2400, 2800, 3200, 3600], - [0, 400, 800, 1200, 100000, 200000, 300000, 400000, 500000, 600000], fontsize=10) + pyplot.yticks([0, 400, 800, 1200, 1600, 2000, 2400, 2800, 3200], + [0, 400, 800, 1200, 10000000, 30000000, 50000000, 70000000, 90000000], fontsize=10) pyplot.xlim(-0.5, len(correction_speeds) - 0.5) - pyplot.ylim(0, 3600) + pyplot.ylim(0, 3200) # noinspection PyUnresolvedReferences ax.spines["top"].set_visible(False) # noinspection PyUnresolvedReferences diff --git a/experiments/show_supp.py b/experiments/show_supp.py index 3dd5690..fb07420 100644 --- a/experiments/show_supp.py +++ b/experiments/show_supp.py @@ -24,12 +24,32 @@ def supp01(): + book = Workbook() + sheet = book.create_sheet(title="supp01", index=0) + + sheet.append(["micro-organism", "undesired motifs"]) + sheet.append(["Bacillus subtilis", "GTCGAC"]) + sheet.append(["Bifidobacterium bifidum", "CTGCAG"]) + sheet.append(["Escherichia coli", "CAGCAG and GATATC"]) + 
sheet.append(["Gluconobacter dioxyacetonicus", "AGGCCT"]) + sheet.append(["Haemophilus gallinarum ", "GACGC"]) + sheet.append(["Klebsiella pneumoniae", "GGTACC"]) + sheet.append(["Oerskovia xanthineolytica", "AGCT"]) + sheet.append(["Streptomyces achromogenes", "ACTAGT"]) + sheet.append(["Streptomyces caespitosus", "AGTACT"]) + sheet.append(["Streptomyces phaeochromogenes", "GCATGC"]) + sheet.append(["Streptomyces stanford", "GAGCTC"]) + sheet.append(["Xanthomonas badrii", "TCTAGA"]) + + book.save("./show/supp01.xlsx") + + +def supp02(): capacities, _ = load_data(load_path="./raw/capacity_reference.pkl") book = Workbook() - sheet = book.create_sheet(title="supp01", index=0) - sheet.append(["set index", "homopolymer run-length", "regionalized GC content", "undesired motifs", - "capacity"]) + sheet = book.create_sheet(title="supp02", index=0) + sheet.append(["set index", "homopolymer run-length", "regionalized GC content", "undesired motifs", "capacity"]) for constraint_index, bio_filter in local_bio_filters.items(): save_data = [constraint_index] @@ -65,10 +85,10 @@ def supp01(): sheet.append(save_data) - book.save("./show/supp01.xlsx") + book.save("./show/supp02.xlsx") -def supp02(): +def supp03(): filter_indices = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] colors = pyplot.get_cmap("Set3")(linspace(0, 1, 12)) record = load_data(load_path="./raw/generation_evaluation.pkl") @@ -93,11 +113,11 @@ def supp02(): # noinspection PyUnresolvedReferences ax.spines["right"].set_visible(False) - pyplot.savefig("./show/supp02.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp03.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp03(): +def supp04(): filter_indices = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] colors = pyplot.get_cmap("Set3")(linspace(0, 1, 12)) record = load_data(load_path="./raw/generation_evaluation.pkl") @@ -123,16 +143,16 @@ def supp03(): # 
noinspection PyUnresolvedReferences ax.spines["right"].set_visible(False) - pyplot.savefig("./show/supp03.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp04.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp04(): +def supp05(): record = load_data(load_path="./raw/generation_evaluation.pkl") change_data = record["change"] book = Workbook() - sheet = book.create_sheet(title="supp04", index=0) + sheet = book.create_sheet(title="supp05", index=0) sheet.append(["trimming cycle", "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10", "C11", "C12"]) for cycle_index in range(13): @@ -145,10 +165,10 @@ def supp04(): values.append("") sheet.append(values) - book.save("./show/supp04.xlsx") + book.save("./show/supp05.xlsx") -def supp05(): +def supp06(): random_seed, param_number = 2021, 100 random.seed(random_seed) @@ -167,7 +187,7 @@ def supp05(): hedges_params = array(hedges_params).tolist() book = Workbook() - sheet = book.create_sheet(title="supp05", index=0) + sheet = book.create_sheet(title="supp06", index=0) sheet.append(["param index", "DNA Fountain", "", "Yin-Yang Code", "HEDGES", ""]) sheet.append(["", "c", "delta", "rule id", "pattern", "correct penalty"]) for param_index in range(param_number): @@ -184,10 +204,10 @@ def supp05(): sheet.append(record) - book.save("./show/supp05.xlsx") + book.save("./show/supp06.xlsx") -def supp06(): +def supp07(): random_seed, param_number = 2021, 100 spiderweb_params = [] @@ -201,7 +221,7 @@ def supp06(): spiderweb_params = array(spiderweb_params).T.tolist() book = Workbook() - sheet = book.create_sheet(title="supp08", index=0) + sheet = book.create_sheet(title="supp07", index=0) sheet.append(["param index", "SPIDER-WEB", "", "", "", "", "", "", "", "", "", "", ""]) sheet.append(["", "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10", "C11", "C12"]) for param_index in range(param_number): @@ -211,16 +231,16 @@ def supp06(): sheet.append(record) 
- book.save("./show/supp06.xlsx") + book.save("./show/supp07.xlsx") -def supp07(): +def supp08(): display_data = [[[] for _ in range(4)] for _ in range(12)] record = load_data(load_path="./raw/coding_evaluation_1.pkl") for algorithm_index, filter_index, _, _, code_rate in record: display_data[int(filter_index - 1)][int(algorithm_index - 1)].append(code_rate) book = Workbook() - sheet = book.create_sheet(title="supp07", index=0) + sheet = book.create_sheet(title="supp08", index=0) sheet.append(["set index", "DNA Fountain", "Yin-Yang Code", "HEDGES", "SPIDER-WEB"]) for index, data in enumerate(display_data): record = [str(index + 1).zfill(2)] @@ -235,17 +255,17 @@ def supp07(): record.append("%.2f ~ %.2f" % (min(sub_data), max(sub_data))) sheet.append(record) - book.save("./show/supp07.xlsx") + book.save("./show/supp08.xlsx") -def supp08(): +def supp09(): display_data = [[[] for _ in range(4)] for _ in range(12)] record = load_data(load_path="./raw/coding_evaluation_1.pkl") for algorithm_index, filter_index, _, _, code_rate in record: display_data[int(filter_index) - 1][int(algorithm_index) - 1].append(code_rate) book = Workbook() - sheet = book.create_sheet(title="supp08", index=0) + sheet = book.create_sheet(title="supp09", index=0) sheet.append(["set index", "DNA Fountain", "Yin-Yang Code", "HEDGES", "SPIDER-WEB"]) for index, data in enumerate(display_data): record = [str(index + 1).zfill(2)] @@ -260,10 +280,10 @@ def supp08(): record.append("%.4f" % std(sub_data)) sheet.append(record) - book.save("./show/supp08.xlsx") + book.save("./show/supp09.xlsx") -def supp09(): +def supp10(): counts = ones(shape=(12, 6)) * 100 record = load_data(load_path="./raw/coding_evaluation_1.pkl") for algorithm_index, filter_index, param_index, _, code_rate in record: @@ -272,7 +292,7 @@ def supp09(): counts /= 100.0 book = Workbook() - sheet = book.create_sheet(title="supp09", index=0) + sheet = book.create_sheet(title="supp10", index=0) sheet.append(["set index", "pattern 1", 
"pattern 2", "pattern 3", "pattern 4", "pattern 5", "pattern 6"]) for index, data in enumerate(counts): record = [str(index + 1).zfill(2)] @@ -280,10 +300,10 @@ def supp09(): record.append(str(int(value * 100)) + "%") sheet.append(record) - book.save("./show/supp09.xlsx") + book.save("./show/supp10.xlsx") -def supp10(): +def supp11(): filter_indices = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] capacities, _ = load_data(load_path="./raw/capacity_reference.pkl") @@ -319,11 +339,11 @@ def supp10(): pyplot.yticks([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0], ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9", "2.0"], fontsize=10) pyplot.ylim(0.95, 2.05) - pyplot.savefig("./show/supp10.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp11.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp11(): +def supp12(): data = load_data(load_path="./raw/correction_evaluation_1.pkl") pyplot.figure(figsize=(10, 6), tight_layout=True) @@ -365,11 +385,11 @@ def supp11(): # noinspection PyUnresolvedReferences ax.spines["right"].set_visible(False) - pyplot.savefig("./show/supp11.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp12.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp12(): +def supp13(): data = load_data(load_path="./raw/correction_evaluation_1.pkl") pyplot.figure(figsize=(10, 6), tight_layout=True) @@ -407,11 +427,11 @@ def supp12(): pyplot.yticks([0, 2, 4, 6, 8, 10], [1, 4, 16, 64, 256, 1024], fontsize=10) pyplot.xlim(-0.8, 7.8) pyplot.ylim(-0.5, 10.5) - pyplot.savefig("./show/supp12.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp13.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp13(): +def supp14(): data = load_data(load_path="./raw/correction_evaluation_4.pkl") figure = pyplot.figure(figsize=(10, 5)) @@ -448,11 +468,11 @@ def supp13(): 
bar.set_label("minimum reads number to reach the given retrieval rate", fontsize=10) bar.ax.tick_params(labelsize=10) - pyplot.savefig("./show/supp13.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp14.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp14(): +def supp15(): data = load_data(load_path="./raw/correction_evaluation_6.pkl") figure = pyplot.figure(figsize=(10, 6)) @@ -487,25 +507,25 @@ def supp14(): bar.set_label("maximum frequency of incorrect reads", fontsize=10) bar.ax.tick_params(labelsize=10) - pyplot.savefig("./show/supp14.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp15.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp15(): +def supp16(): data = load_data(load_path="./raw/correction_evaluation_7.pkl") book = Workbook() - sheet = book.create_sheet(title="supp15", index=0) + sheet = book.create_sheet(title="supp16", index=0) sheet.append(["error rate", "SPIDER-WEB (average)", "SPIDER-WEB (median)", "HEDGES (average)", "HEDGES (median)"]) for value in range(9): value_x = data["spiderweb"][value / 200.0] value_y = data["hedges"][value / 200.0] sheet.append([value / 200.0, mean(value_x), median(value_x), mean(value_y), median(value_y)]) - book.save("./show/supp15.xlsx") + book.save("./show/supp16.xlsx") -def supp16(): +def supp17(): filter_indices = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] coding_graphs = load_data(load_path="./raw/graph_coding.pkl") follow_ups = zeros(shape=(12, 3)) @@ -517,7 +537,7 @@ def supp16(): counts /= sum(counts) follow_ups[index] = counts - figure = pyplot.figure(figsize=(10, 6), tight_layout=True) + pyplot.figure(figsize=(10, 6), tight_layout=True) gradient_colors = pyplot.get_cmap("RdYlGn")(linspace(0, 1, 12)) for index, filter_index in enumerate(filter_indices): lengths = linspace(0, 256, 257) @@ -531,11 +551,11 @@ def supp16(): pyplot.yticks([0, 64, 128, 192, 256], ["2^0", "2^64", 
"2^128", "2^192", "2^256"], fontsize=10) pyplot.xlim(0, 256) pyplot.ylim(0, 256) - pyplot.savefig("./show/supp16.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp17.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() -def supp17(): +def supp18(): record = load_data(load_path="./raw/capacity_evaluation.pkl") data, errors = record["detail"] @@ -661,7 +681,7 @@ def supp17(): figure.text(0.024, 0.66, "b", va="center", ha="center", fontsize=14) figure.text(0.024, 0.34, "c", va="center", ha="center", fontsize=14) - pyplot.savefig("./show/supp17.pdf", format="pdf", bbox_inches="tight", dpi=600) + pyplot.savefig("./show/supp18.pdf", format="pdf", bbox_inches="tight", dpi=600) pyplot.close() @@ -683,3 +703,4 @@ def supp17(): supp15() supp16() supp17() + supp18() diff --git a/experiments/sort_data.py b/experiments/sort_data.py index 93a2b91..f903a38 100644 --- a/experiments/sort_data.py +++ b/experiments/sort_data.py @@ -109,6 +109,21 @@ def propose05(diversity): propose40(sequence_diversity), propose05(sequence_diversity)]) sheet = book.create_sheet(title="Table S1") + sheet.append(["micro-organism", "undesired motifs"]) + sheet.append(["Bacillus subtilis", "GTCGAC"]) + sheet.append(["Bifidobacterium bifidum", "CTGCAG"]) + sheet.append(["Escherichia coli", "CAGCAG and GATATC"]) + sheet.append(["Gluconobacter dioxyacetonicus", "AGGCCT"]) + sheet.append(["Haemophilus gallinarum ", "GACGC"]) + sheet.append(["Klebsiella pneumoniae", "GGTACC"]) + sheet.append(["Oerskovia xanthineolytica", "AGCT"]) + sheet.append(["Streptomyces achromogenes", "ACTAGT"]) + sheet.append(["Streptomyces caespitosus", "AGTACT"]) + sheet.append(["Streptomyces phaeochromogenes", "GCATGC"]) + sheet.append(["Streptomyces stanford", "GAGCTC"]) + sheet.append(["Xanthomonas badrii", "TCTAGA"]) + + sheet = book.create_sheet(title="Table S2") sheet.append(["set index", "homopolymer run-length", "regionalized GC content", "undesired motifs", "capacity"]) record, _ = 
load_data(load_path="./raw/capacity_reference.pkl") for constraint_index, bio_filter in local_bio_filters.items(): @@ -149,7 +164,7 @@ def propose05(diversity): for index in range(12): sheet.append([str(index + 1).zfill(2), record[str(index + 1).zfill(2)][0]]) - sheet = book.create_sheet(title="Table S2") + sheet = book.create_sheet(title="Table S3") sheet.append(["trimming cycle", "set 01", "set 02", "set 03", "set 04", "set 05", "set 06", "set 07", "set 08", "set 09", "set 10", "set 11", "set 12"]) @@ -164,7 +179,7 @@ def propose05(diversity): values.append("N/A") sheet.append(values) - sheet = book.create_sheet(title="Table S3") + sheet = book.create_sheet(title="Table S4") sheet.append(["param index", "c in DNA Fountain", "delta in DNA Fountain", "rule index in Yin-Yang Code", "pattern in HEDGES", "correct penalty in HEDGES"]) random_seed, param_number = 2021, 100 @@ -193,7 +208,7 @@ def propose05(diversity): info.append("N/A") sheet.append(info) - sheet = book.create_sheet(title="Table S4") + sheet = book.create_sheet(title="Table S5") sheet.append(["param index", *["set " + str(idx + 1).zfill(2) for idx in range(12)]]) random_seed, param_number, spiderweb_params = 2021, 100, [] record = load_data(load_path="./raw/graph_coding.pkl") @@ -210,7 +225,7 @@ def propose05(diversity): info.append(vertex_index) sheet.append(info) - sheet = book.create_sheet(title="Table S5,S6,S7") + sheet = book.create_sheet(title="Table S6,S7,S8") sheet.append(["set index", "parameter index", "data index", "code rate of DNA Fountain", "code rate of Yin-Yang Code", "code rate of HEDGES", "code rate of SPIDER-WEB"]) @@ -266,7 +281,7 @@ def propose05(diversity): for index_2, sequence_diversity in enumerate([1e1, 1e2, 1e3, 1e4, 1e5]): sheet.append([error_rate, reads_number, sequence_diversity, record[error_rate][index_1, index_2]]) - sheet = book.create_sheet(title="Table S8") + sheet = book.create_sheet(title="Table S9") sheet.append(["error rate", "test index", "vertex access frequency 
of SPIDER-WEB", "vertex access frequency of HEDGES"]) record = load_data(load_path="./raw/correction_evaluation_7.pkl") diff --git a/setup.py b/setup.py index 1166081..75ee8c9 100644 --- a/setup.py +++ b/setup.py @@ -6,30 +6,24 @@ description="SPIDER-WEB generates coding algorithms with " "superior error tolerance and real-time information retrieval capacity", long_description="DNA has been considered a promising medium for storing digital information. " - "As an essential step in the DNA-based data storage workflow, " - "coding scheme is responsible to implement functions " - "including bit-to-base transcoding, error correction, etc. " - "In previous studies, these functions are normally realized by " - "introducing independent coding algorithms. " - "Here, we report a graph-based architecture, named SPIDER-WEB, " - "providing an all-in-one coding solution by generating customized algorithms automatically. " - "SPIDER-WEB supports correcting at maximum 4% edit errors in the DNA sequences " - "including substitution and insertion/deletion (indel), " - "while the required logical redundancy at only 5.5%. " - "Since no DNA sequence pretreatment is required for correcting and decoding processes, " - "SPIDER-WEB offers the function of real-time information retrieval, " - "which is 305.08 times faster than the speed of single-molecule sequencing techniques." - " Our retrieval process can improve 2 orders of magnitude faster compared to conventional one " - "under megabyte-level data and can be scalable to fit exabyte-level data. " - "Therefore, SPIDER-WEB holds the potential to improve the practicability " - "in large-scale data storage applications.", + "Previously, functions including bit-to-base transcoding and error correction are implemented by " + "independent algorithms. 
It would result in either increased computational complexity " + "or compromised error tolerance when attempting to correct various types of errors, " + "especially for insertions/deletions (indels). To address this issue, " + "we report a graph-based architecture, named SPIDER-WEB, providing an all-in-one coding solution " + "by generating customized algorithms with an efficient built-in error correction function. " + "SPIDER-WEB is able to correct a maximum of 4% edit errors in the DNA sequences " + "including substitution and indel, with only 5.5% logical redundancy. In addition, " + "SPIDER-WEB enables real-time retrieval of megabyte-level data with a 100x execution speed " + "improvement over conventional methods and holds the potential of practicability " + "for large-scale data storage applications at the exabyte-level.", author="Haoling ZHANG", author_email="zhanghaoling@genomics.cn", url="https://github.com/HaolingZHANG/DNASpiderWeb", packages=["dsw", "tests"], install_requires=["numpy", "networkx"], - license="GPL", - classifiers=["License :: BGI Research", + license="BGI-Research", + classifiers=["License :: BGI-Research", "Programming Language :: Python :: 3", "Operating System :: OS Independent"], keywords="DNA-based data storage, coding algorithm, automatic algorithm generator, probabilistic error correction",