Output more underlying data
Write a single file for sponsors (excluding companies but keeping individuals) and a
single file for sites. The merged site-sponsor file excludes both companies and
individuals, and every column except source and trial_id carries a site_ or
sponsor_ prefix.
ccunningham101 committed Dec 12, 2023
1 parent cd66550 commit 69de18a
Showing 2 changed files with 75 additions and 8 deletions.
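
Before the diff itself, the column prefixing named in the commit message deserves a word: sites and sponsors share column names (country_normalized and the like), and without prefixes a merge would leave pandas to disambiguate clashing columns with its default _x/_y suffixes. A minimal sketch of the pattern, using hypothetical toy frames in place of load_glob output:

import pandas as pd

# Hypothetical one-row inputs; the real frames come from load_glob.
site_df = pd.DataFrame(
    {"source": ["ictrp"], "trial_id": ["T1"], "country_normalized": ["France"]}
)
sponsor_df = pd.DataFrame(
    {"source": ["ictrp"], "trial_id": ["T1"], "country_normalized": ["Japan"]}
)

# Prefix every column except the merge keys, so identically named
# site and sponsor columns survive the merge unambiguously.
keys = {"source", "trial_id"}
site_df.columns = site_df.columns.map(lambda x: x if x in keys else "site_" + x)
sponsor_df.columns = sponsor_df.columns.map(
    lambda x: x if x in keys else "sponsor_" + x
)

merged = site_df.merge(sponsor_df, on=["source", "trial_id"], how="inner")
print(merged.columns.tolist())
# ['source', 'trial_id', 'site_country_normalized', 'sponsor_country_normalized']
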
77 changes: 70 additions & 7 deletions query_ror.py
@@ -520,34 +520,45 @@ def site_sponsor(args):
     exclude_same = args.exclude_same
 
     site_df = load_glob(site_files, site_filter, True, False, False)
+    site_df.columns = site_df.columns.map(
+        lambda x: "site_" + x if x != "source" and x != "trial_id" else x
+    )
     sponsor_df = load_glob(sponsor_files, sponsor_filter, True, True, True)
+    sponsor_df.columns = sponsor_df.columns.map(
+        lambda x: "sponsor_" + x if x != "source" and x != "trial_id" else x
+    )
     # Only use sites with same source as sponsor
     site_df = site_df[site_df.source.isin(sponsor_df.source.unique())]
-    site_df["who_region"] = map_who(site_df[site_country_column])
-    sponsor_df["sponsor_who_region"] = map_who(sponsor_df[sponsor_country_column])
+    site_df["site_who_region"] = map_who(site_df["site_" + site_country_column])
+    sponsor_df["sponsor_who_region"] = map_who(
+        sponsor_df["sponsor_" + sponsor_country_column]
+    )
     merged = site_df.merge(
         sponsor_df,
         left_on=["source", "trial_id"],
         right_on=["source", "trial_id"],
-        how="left",
+        how="inner",
     )
+    merged.to_csv("site_sponsor.csv", index=False)
     counts = (
-        merged.groupby(["who_region", "sponsor_who_region"])
+        merged.groupby(["site_who_region", "sponsor_who_region"])
         .trial_id.count()
         .reset_index()
     )
 
     if exclude_same:
-        counts = counts.loc[counts.who_region != counts.sponsor_who_region]
+        counts = counts.loc[counts.site_who_region != counts.sponsor_who_region]
     # Map nodes to node ids
-    who_map = {name: index for index, name in enumerate(counts.who_region.unique())}
+    who_map = {
+        name: index for index, name in enumerate(counts.site_who_region.unique())
+    }
     who_sponsor_map = {
         name: index + len(who_map)
         for index, name in enumerate(counts.sponsor_who_region.unique())
     }
     link = dict(
         source=list(counts.sponsor_who_region.map(who_sponsor_map)),
-        target=list(counts.who_region.map(who_map)),
+        target=list(counts.site_who_region.map(who_map)),
         value=list(counts.trial_id),
     )
     data = go.Sankey(
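
The node-id bookkeeping in this hunk is the part worth pausing on: plotly's Sankey addresses nodes by integer index into a single label list, so sponsor regions are offset by len(who_map) to sit after the site regions. A minimal runnable sketch with hypothetical region names and counts (the real label list is built in the collapsed go.Sankey call):

import plotly.graph_objects as go

# Hypothetical regions and counts; real values come from the groupby above.
site_regions = ["EURO", "AFRO"]      # node ids 0..1 (who_map)
sponsor_regions = ["AMRO", "WPRO"]   # node ids 2..3 (offset by len(who_map))

fig = go.Figure(
    go.Sankey(
        node=dict(label=site_regions + sponsor_regions),
        link=dict(
            source=[2, 2, 3],  # sponsor node ids
            target=[0, 1, 0],  # site node ids
            value=[10, 4, 7],  # trial counts per region pair
        ),
    )
)
fig.write_html("sankey.html")
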
@@ -563,6 +574,41 @@ def site_sponsor(args):
     fig.write_html("sankey.html")
 
 
+def combine_datasets(args):
+    sponsor_files = args.sponsor_files
+    site_files = args.site_files
+    # Do not exclude individuals (exclude companies)
+    sponsor_df = load_glob(sponsor_files, "manual", True, False, True)
+    sponsor_df["sponsor_who_region"] = map_who(sponsor_df["country_normalized"])
+    sponsor_counts = sponsor_df.groupby(
+        ["sponsor_who_region", "country_normalized"]
+    ).trial_id.count()
+    sponsor_counts = (
+        sponsor_counts.reset_index()
+        .sort_values(by=["sponsor_who_region", "trial_id"], ascending=[True, False])
+        .set_index(["sponsor_who_region", "country_normalized"])
+    )
+    sponsor_counts.to_csv("sponsor_counts.csv")
+    import code
+
+    code.interact(local=locals())
+
+    # This will just get one trial/source/country
+    site_df = load_glob(site_files, "country", True, False, False)
+    site_df["who_region"] = map_who(site_df["country"])
+    site_counts = site_df.groupby(["who_region", "country"]).trial_id.count()
+    site_counts = (
+        site_counts.reset_index()
+        .sort_values(by=["who_region", "trial_id"], ascending=[True, False])
+        .set_index(["who_region", "country"])
+    )
+    site_counts.to_csv("site_counts.csv")
+
+    # TODO: include an output dir
+    sponsor_df.to_csv("sponsor_combined.csv", index=False)
+    site_df.to_csv("site_combined.csv", index=False)
+
+
 def flowchart(args):
     input_files = args.input_files
     df = load_glob(input_files, "manual")
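
combine_datasets repeats one aggregation pattern twice: count trials per (region, country), then sort countries by count, descending, within each region. (Note that the committed code also drops into code.interact, an interactive debugging shell, between the sponsor and site passes.) A toy sketch of the aggregation with made-up rows:

import pandas as pd

df = pd.DataFrame(
    {
        "who_region": ["EURO", "EURO", "EURO", "AFRO"],
        "country": ["France", "France", "Spain", "Kenya"],
        "trial_id": ["T1", "T2", "T3", "T4"],
    }
)
# Trials per (region, country), largest countries first within each region.
counts = df.groupby(["who_region", "country"]).trial_id.count()
counts = (
    counts.reset_index()
    .sort_values(by=["who_region", "trial_id"], ascending=[True, False])
    .set_index(["who_region", "country"])
)
print(counts)
# AFRO/Kenya: 1; EURO/France: 2; EURO/Spain: 1
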
@@ -810,6 +856,23 @@ def multisite(args):
 )
 site_sponsor_parser.set_defaults(func=site_sponsor)
 
+combine_datasets_parser = subparsers.add_parser("combine-datasets", parents=[verb])
+combine_datasets_parser.add_argument(
+    "--site-files",
+    required=True,
+    action="append",
+    type=match_paths,
+    help="One or more glob patterns for matching input files",
+)
+combine_datasets_parser.add_argument(
+    "--sponsor-files",
+    required=True,
+    action="append",
+    type=match_paths,
+    help="One or more glob patterns for matching input files",
+)
+combine_datasets_parser.set_defaults(func=combine_datasets)
+
 args = ror_parser.parse_args()
 if hasattr(args, "func"):
     setup_logger(args.verbosity)
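
Both new flags combine action="append" with type=match_paths, so each repeated --site-files or --sponsor-files flag contributes one glob's worth of matched paths and the parsed attribute ends up as a list of lists. A standalone sketch of that behavior, with a plain glob.glob standing in for the repo's match_paths (defined elsewhere in query_ror.py):

import argparse
import glob

def match_paths(pattern):
    # Stand-in for query_ror.py's match_paths: expand one glob to paths.
    return glob.glob(pattern)

parser = argparse.ArgumentParser()
parser.add_argument("--site-files", required=True, action="append", type=match_paths)

# Each occurrence of the flag is converted by type, then appended.
args = parser.parse_args(["--site-files", "*.py", "--site-files", "*.csv"])
print(args.site_files)  # e.g. [['query_ror.py', 'utils.py'], ['site_counts.csv']]
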
6 changes: 5 additions & 1 deletion utils.py
@@ -152,12 +152,16 @@ def load_glob(
             logging.info(f"Skipping {input_file}: has no country data")
             continue
         # One entry per trial/source/country
+        # NOTE: this is to match any ICTRP data
         df = df.groupby(["trial_id", "source", "country"]).first().reset_index()
         # TODO: do we need to merge so they have the same columns? Fillna
         logging.info(f"Adding {input_file}")
         frames.append(df)
     if len(frames) > 0:
-        return pandas.concat(frames, ignore_index=True)
+        joined = pandas.concat(frames, ignore_index=True)
+        return joined.drop(
+            joined.columns[joined.columns.str.contains("unnamed", case=False)], axis=1
+        )
     else:
         logging.error(f"No data passed the {file_filter} filter")
         sys.exit(1)
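
The new return path strips columns whose names contain "unnamed": when a CSV written with pandas' default index=True is read back without index_col, the old index materializes as a column named "Unnamed: 0", and concatenating such frames drags those columns along. A small sketch of the failure and the fix, using an in-memory CSV:

import io
import pandas as pd

buf = io.StringIO()
pd.DataFrame({"trial_id": ["T1"]}).to_csv(buf)  # index=True writes the index
buf.seek(0)

df = pd.read_csv(buf)
print(df.columns.tolist())  # ['Unnamed: 0', 'trial_id']

# Same filter as the new load_glob return path.
cleaned = df.drop(
    df.columns[df.columns.str.contains("unnamed", case=False)], axis=1
)
print(cleaned.columns.tolist())  # ['trial_id']
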
