Skip to content

Commit

Permalink
Merge pull request #190 from brain-bican/preserve_annot
Browse files Browse the repository at this point in the history
reload data with preserving annotations
  • Loading branch information
hkir-dev authored Aug 19, 2024
2 parents ddd9bf7 + cd66be2 commit 075f9b6
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 5 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ RUN python3 -m pip install deepmerge==1.1.0
RUN python3 -m pip install numpy==1.26.4
RUN python3 -m pip install marshmallow==3.21.1
RUN python3 -m pip install python-dateutil==2.9.0
RUN python3 -m pip install --no-deps cas-tools==1.0.3
RUN python3 -m pip install cap-anndata==0.2.1
RUN python3 -m pip install --no-deps cas-tools==1.0.7
RUN python3 -m pip install --no-deps tdta==0.1.0.dev17

#RUN Rscript $WORKSPACE/dendR/install_packages.R
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ load_data:
# Forcefully loads data. Drops and recreates tables.
.PHONY: reload_data
reload_data: clean
python3 $(IMPORT) import-data --input input_data/ --schema src/schema/ --curation_tables curation_tables/ --force
python3 $(IMPORT) import-data --input input_data/ --schema src/schema/ --curation_tables curation_tables/ --force --preserve

.PHONY: runR
runR:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ openpyxl
urlp
mkdocs
# installed from Dockerfile
# cas-tools==0.0.1.dev38
# cas-tools==1.0.7
# tdta==0.1.0.dev5
cell-annotation-schema==0.2b0
pandas
Expand Down
39 changes: 37 additions & 2 deletions scripts/import.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,17 @@ def cli():
@click.option('-i', '--input', type=click.Path(exists=True), help='Data folder path.')
@click.option('-s', '--schema', type=click.Path(exists=True), help='Nanobot schema folder path.')
@click.option('-ct', '--curation_tables', type=click.Path(exists=True), help='TDT curation tables folder path.')
@click.option('--force', '-f', is_flag=True, default=False, help="Forcefully drops and recreates tables.")
def import_data(input, schema, curation_tables, force):
@click.option('-f', '--force', is_flag=True, default=False, help="Forcefully drops and recreates tables.")
@click.option('-p', '--preserve', is_flag=True, default=False, help='Preserve existing annotations.')
def import_data(input, schema, curation_tables, force, preserve):
"""
Imports user data to the system
Parameters:
input (str): Path to the input data folder
schema: Nanobot schema folder path.
curation_tables: TDT curation tables folder path.
force: Forcefully drops and recreates tables.
preserve: Preserve existing annotations.
"""
global last_accession_id, accession_ids
last_accession_id = 0
Expand Down Expand Up @@ -84,6 +86,9 @@ def import_data(input, schema, curation_tables, force):
new_files.append(user_data_ct_path)
else:
raise Exception("Couldn't find the config data files (with yaml or yml extension) in folder: " + input)
if preserve:
print("Preserving existing annotations.")
preserve_existing_annotations(std_data, curation_tables)

project_config = retrieve_project_config(Path(input).parent.absolute())
std_tables = serialize_to_tables(std_data, user_file_name, input, project_config)
Expand All @@ -106,6 +111,36 @@ def import_data(input, schema, curation_tables, force):
add_new_files_to_git(project_folder, new_files)


def preserve_existing_annotations(cas_obj, curation_tables_folder):
    """
    Copies existing annotations to the new ingested data.

    Reads "annotation.tsv" from the curation tables folder and, for every row
    whose cell_label also exists in the newly ingested data, copies the curated
    columns onto the matching annotation object. A value that is already set on
    the new annotation is never overwritten. The annotation objects of cas_obj
    are modified in place; nothing is returned.

    Parameters:
        cas_obj: cas python object instance
        curation_tables_folder: curation tables folder path
    """
    columns_to_preserve = ["cell_ontology_term_id", "cell_ontology_term", "rationale", "rationale_dois"]

    existing_annotations_table = os.path.join(curation_tables_folder, "annotation.tsv")
    if not os.path.isfile(existing_annotations_table):
        print("Existing annotations couldn't be found in the folder to copy: " + curation_tables_folder)
        return

    _, data = read_csv_to_dict(existing_annotations_table, delimiter="\t", id_column_name="cell_label")

    # Index the freshly ingested annotations by label for direct lookup.
    new_annotations = {annotation.cell_label: annotation for annotation in cas_obj.annotations}

    for cell_label, records in data.items():
        for column in columns_to_preserve:
            preserved_value = records.get(column)
            if not preserved_value:
                # Nothing was curated for this column, so there is nothing to carry over.
                continue

            if cell_label not in new_annotations:
                print("Couldn't transferred the '{}' column of the '{}'.".format(column, cell_label))
                continue

            annotation = new_annotations[cell_label]
            # Don't overwrite existing values. The check is on the attribute value
            # rather than on key membership in asdict(): assuming asdict is
            # dataclasses.asdict, it lists every declared field even when the field
            # is unset, so a membership test would never allow a copy. Annotations
            # are accessed by attribute (see cell_label above), hence setattr
            # instead of item assignment.
            if not getattr(annotation, column, None):
                # NOTE(review): the value is copied as the raw TSV cell text. If a
                # column such as rationale_dois is list-valued on the cas object,
                # it may need to be split first - confirm against the table format.
                setattr(annotation, column, preserved_value)


def add_new_files_to_git(project_folder, new_files):
"""
Runs git add command to add imported files to the version control.
Expand Down

0 comments on commit 075f9b6

Please sign in to comment.