Skip to content

Commit

Permalink
Merge pull request #120 from CBIIT/CCDIDC-1301_neo4j_bug
Browse files: browse the repository at this point in the history
CCDIDC-1301 neo4j bug
  • Loading branch information
svburke authored Aug 22, 2024
2 parents 4b10aee + 1ad1f55 commit 73f1288
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions src/neo4j_data_tools.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,7 +63,7 @@ class Neo4jCypherQuery:
)
all_nodes_entries_study_cypher_query: str = (
"""
MATCH (study:study {{study_id: "{study_id}"}})-[*0..7]-(node)
MATCH (study:study {{study_id: "{study_id}"}})-[*1..7]-(node)
RETURN labels(node) AS NodeLabel, COUNT(node) AS NodeCount
"""
)
Expand Down Expand Up @@ -282,12 +282,17 @@ def export_node_ids_a_study(tx, study_id: str, node: str, output_dir: str) -> No
# run the cypher query with specified study and node
result = tx.run(cypher_query)
db_id_list = [record["id"] for record in result]
# print(f"study {study_id} node {node} has ids: {*db_id_list,}")
study_node_id_df = pd.DataFrame(columns=["study_id", "node", "id"])
study_node_id_df["id"] = db_id_list
if len(db_id_list) == 0:
study_node_id_df["id"] = [pd.NA]
else:
study_node_id_df["id"] = db_id_list
study_node_id_df["study_id"] = study_id
study_node_id_df["node"] = node
output_filepath = os.path.join(output_dir, f"{study_id}_{node}_id_list.csv")
study_node_id_df.to_csv(output_filepath, index=False)

return None


Expand Down Expand Up @@ -339,7 +344,6 @@ def parse_tsv_files(filelist: list) -> DataFrame:
return return_df


@task
def compare_id_input_db(
db_id_pulled_dict: dict, parsed_tsv_file_df: DataFrame, logger
) -> DataFrame:
Expand Down Expand Up @@ -408,7 +412,7 @@ def pull_node_ids_all_studies_write(
return temp_folder_name


@flow
@flow(log_prints=True)
def pull_node_ids_all_studies(driver, studies_dataframe: DataFrame, logger) -> Dict:
"""Returns a dictionary of db id list using study id and node name
Expand All @@ -429,7 +433,7 @@ def pull_node_ids_all_studies(driver, studies_dataframe: DataFrame, logger) -> D
else:
pass
file_node = file_df["node"].unique().tolist()[0]
ids_dict[file_study][file_node] = file_df["id"].tolist()
ids_dict[file_study][file_node] = file_df["id"].dropna().tolist()
return ids_dict


Expand Down Expand Up @@ -530,20 +534,22 @@ def validate_DB_with_input_tsvs(
tsv_files = list_type_files(file_dir=tsv_folder, file_type=".tsv")
ingested_studies_dataframe = parse_tsv_files(tsv_files)

logger.info("pulled all ids based off the nodes and studies from tsv provided")
db_id_list_all_studies = pull_node_ids_all_studies(
driver=driver,
studies_dataframe=ingested_studies_dataframe[["study_id", "node"]],
logger=logger,
)

logger.info("Start comparing db pulled id with ids in tsv files")
comparison_df = compare_id_input_db(
db_id_pulled_dict=db_id_list_all_studies,
parsed_tsv_file_df=ingested_studies_dataframe,
logger=logger,
)

merged_summary_table = pd.merge(
studies_dataframe, comparison_df, on=["study_id", "node"], how="left"
studies_dataframe, comparison_df, on=["study_id", "node"], how="outer"
)
merged_summary_table.drop(columns=["tsv_id"], inplace=True)

Expand Down

0 comments on commit 73f1288

Please sign in to comment.