From 81ec7854aa43dc12c84c84476078ee02de1ceab5 Mon Sep 17 00:00:00 2001 From: Christine Cunningham <5705329+ccunningham101@users.noreply.github.com> Date: Tue, 5 Dec 2023 16:41:15 +0000 Subject: [PATCH] Allow different filters for sponsor/site in sankey They can be manual, ror, or country Provide the correct country column --- query_ror.py | 39 ++++++++++++++++++++++++++++++++++----- utils.py | 9 +++++---- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/query_ror.py b/query_ror.py index a69981e..0f56957 100644 --- a/query_ror.py +++ b/query_ror.py @@ -528,17 +528,17 @@ def site_sponsor(args): for index, name in enumerate(counts.sponsor_who_region.unique()) } link = dict( - source=list(counts.who_region.map(who_map)), - target=list(counts.sponsor_who_region.map(who_sponsor_map)), + source=list(counts.sponsor_who_region.map(who_sponsor_map)), + target=list(counts.who_region.map(who_map)), value=list(counts.trial_id), ) data = go.Sankey( link=link, node=dict(label=list(who_map.keys()) + list(who_sponsor_map.keys())) ) fig = go.Figure(data) - sources = sorted(set(merged.source_x).intersection(set(merged.source_y))) + sources = sorted(set(merged.source)) fig.update_layout( - title=f"Mapping Trials Sites Country to Sponsor Country by WHO Region (data from: {' '.join(sources)})", + title=f"Mapping Sponsor to Trial Site by WHO Region (data from: {' '.join(sources)})", ) fig.write_html("sankey.html") @@ -725,6 +725,12 @@ def multisite(args): map_parser.set_defaults(func=make_map) org_parser = subparsers.add_parser("sponsor-org", parents=[results]) + org_parser.add_argument( + "--country-column", + type=str, + help="Name of country column to use", + default="country", + ) org_parser.set_defaults(func=org_region) flowchart_parser = subparsers.add_parser("flowchart", parents=[results]) @@ -750,11 +756,34 @@ def multisite(args): help="One or more glob patterns for matching input files", ) site_sponsor_parser.add_argument( - "--file-filter", + "--sponsor-filter", choices=["manual", "ror", "country"], default="country", help="Filter registry data", ) + site_sponsor_parser.add_argument( + "--site-filter", + choices=["manual", "ror", "country"], + default="country", + help="Filter registry data", + ) + site_sponsor_parser.add_argument( + "--sponsor-country-column", + type=str, + help="Name of sponsor country column to use", + default="country", + ) + site_sponsor_parser.add_argument( + "--site-country-column", + type=str, + help="Name of site country column to use", + default="country", + ) + site_sponsor_parser.add_argument( + "--exclude-indiv-company", + action="store_true", + help="Exclude individuals and companies", + ) site_sponsor_parser.set_defaults(func=site_sponsor) args = ror_parser.parse_args() diff --git a/utils.py b/utils.py index a834239..0fa1998 100644 --- a/utils.py +++ b/utils.py @@ -134,7 +134,8 @@ def load_glob(filenames, file_filter, exclude_indiv_company=False): if "country" not in df.columns: logging.info(f"Skipping {input_file}: has no country data") continue - + # One entry per trial/source/country + df = df.groupby(["trial_id", "source", "country"]).first().reset_index() # TODO: do we need to merge so they have the same columns? Fillna logging.info(f"Adding {input_file}") frames.append(df) @@ -842,17 +843,17 @@ def region_map(counts, country_column="country", legend_title="Number of Trials" legend=True, legend_kwds={"label": f"{legend_title}"}, ) - ax.set_title(f"{region_name} Trial Sites") + ax.set_title(f"{region_name}") ax.set_xticklabels([]) ax.set_yticklabels([]) -def region_pie(df, legend_title="Number of Trials"): +def region_pie(df, country_column, legend_title="Number of Trials"): """ Counts is a series indexed by iso2 country """ # TODO: which country- country_ror? - df["who_region"] = map_who(df["country"]) + df["who_region"] = map_who(df[country_column]) grouped = df.groupby("who_region") orgs = df.organization_type.unique()