Skip to content

Commit

Permalink
Move results back to functions
Browse files Browse the repository at this point in the history
Shared functions go in utils, but each analysis script produces its own
results. For example, the OpenAlex data needs filtering by first or last
author before plotting, while the same map-plotting code (for the world or
by region) can be reused across analyses.
  • Loading branch information
ccunningham101 committed Oct 11, 2023
1 parent 2abc5da commit 25b3439
Show file tree
Hide file tree
Showing 8 changed files with 587 additions and 304 deletions.
55 changes: 48 additions & 7 deletions openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
from collections import defaultdict
from datetime import datetime

import matplotlib.pyplot as plt
import numpy
import pandas

from setup import get_base_parser, get_verbosity_parser, setup_logger
from utils import create_session
from utils import create_session, region_map


DEFAULT_PROTOCOL = "(clinicaltrial[Filter] NOT editorial)"
Expand Down Expand Up @@ -45,15 +46,18 @@
def read_dataset(fpath):
    """Load the "%%"-delimited PubMed extract into a DataFrame.

    The file has no header row; columns are named here in the order the
    fields appear on each line.

    Args:
        fpath: Path (or file-like object) of the delimited extract.

    Returns:
        pandas.DataFrame with columns pmid, title, accession, abstract,
        pub_types, journal_date and epub_date. ``pmid`` is kept as a string
        and the two date columns are parsed as dates where possible.
    """
    column_names = [
        "pmid",
        "title",
        "accession",
        "abstract",
        "pub_types",
        "journal_date",
        "epub_date",
    ]
    df = pandas.read_csv(
        fpath,
        delimiter="%%",
        # A separator longer than one character is treated as a regex and is
        # only supported by the python engine; request it explicitly rather
        # than relying on the fallback, which emits a ParserWarning.
        engine="python",
        names=column_names,
        parse_dates=["journal_date", "epub_date"],
        # Read pmids as text so they are never coerced to numbers.
        dtype={"pmid": str},
    )
    return df

Expand All @@ -73,11 +77,14 @@ def build_cohort(args):
logging.error("Is edirect installed?")
# The date MUST be included in the query with [dp] (rather than
# -mindate -maxdate) in order for 10k+ queries to work
# cmd = f"efetch -db pubmed -id 30553130 -format xml | xtract -pattern PubmedArticle -sep '|' -def '-' -element MedlineCitation/PMID -element AccessionNumber -element AbstractText -block PubDate -sep '-' -element Year,Month,Day > {output_file}"
cmd = f"esearch -db pubmed -query '({start_date}:{end_date}[dp]) AND ({protocol})' | efetch -format xml | xtract -pattern PubmedArticle -sep '|' -def '-' -element MedlineCitation/PMID -element AccessionNumber -element AbstractText -block PubDate -sep '-' -element Year,Month,Day > {output_file}"
# cmd = f"efetch -db pubmed -id 30553130 -format xml"
# cmd += f" | xtract -pattern PubmedArticle -def '' -sep '|' -tab '%%' -element MedlineCitation/PMID -element ArticleTitle -element AccessionNumber -element AbstractText -element PublicationType -block Journal -sep '-' -tab '%%' -element Year,Month -block ArticleDate -sep '-' -element Year,Month,Day > {output_file}"

cmd = f"esearch -db pubmed -query '({start_date}:{end_date}[dp]) AND ({protocol})' | efetch -format xml | xtract -pattern PubmedArticle -def '' -sep '|' -tab '%%' -element MedlineCitation/PMID -element ArticleTitle -element AccessionNumber -element AbstractText -element PublicationType -block Journal -sep '-' -tab '%%' -element Year,Month -block ArticleDate -sep '-' -element Year,Month,Day > {output_file}"
logging.info(cmd)
edirect.pipeline(cmd)

# could do as stringio
df = read_dataset(output_file)
df = split_bar(df, columns=["accession"])
df = get_ids_from_abstract(df)
Expand Down Expand Up @@ -198,6 +205,27 @@ def query_openalex(args):
)


def make_site_map(args):
    """Plot author affiliation counts by region and save the figure.

    Reads the CSV at ``args.input_file``, keeps only rows for the requested
    author position, counts unique author names per (pmid, country), sums
    those counts per country, and passes the result to ``region_map``.

    Args:
        args: Parsed command-line namespace providing ``input_file``,
            ``output_file`` and the boolean ``last_author`` flag (use last
            authors when set, first authors otherwise).
    """
    input_file = args.input_file
    output_file = args.output_file
    position = "last" if args.last_author else "first"
    title = (
        f"{position.capitalize()} Author Affiliation by WHO Region: "
        "Trials in Pubmed 2018-2023"
    )

    df = pandas.read_csv(input_file)
    # Boolean indexing returns a new frame; the result must be assigned or
    # the filter has no effect and every author position gets counted.
    df = df[df.author_position == position]

    counts = (
        df.groupby(["pmid", "country"]).author_name.nunique().groupby("country").sum()
    )
    region_map(counts)
    plt.suptitle(title)
    plt.savefig(output_file, bbox_inches="tight")


if __name__ == "__main__":
verbosity_parser = get_verbosity_parser()
base_parser = get_base_parser()
Expand Down Expand Up @@ -247,6 +275,19 @@ def query_openalex(args):
required=True,
help="Output file name to write openalex cohort",
)

site_map_parser = subparsers.add_parser("site_map", parents=[base_parser])
site_map_parser.add_argument(
"--output-file",
type=pathlib.Path,
required=True,
help="Output file to save map",
)
site_map_parser.add_argument(
"--last-author", action="store_true", help="Use last author rather than first"
)
site_map_parser.set_defaults(func=make_site_map)

args = openalex_parser.parse_args()
if hasattr(args, "func"):
setup_logger(args.verbosity)
Expand Down
54 changes: 47 additions & 7 deletions pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@
import logging
import multiprocessing as mp
import os
import pathlib
import re
import shutil
import sys
from functools import partial
from io import StringIO

import matplotlib.pyplot as plt
import pandas

from setup import get_env_setting, get_full_parser, setup_logger
from setup import get_env_setting, get_full_parser, get_verbosity_parser, setup_logger
from utils import (
REGISTRY_MAP,
create_session,
filter_unindexed,
load_trials,
Expand All @@ -26,7 +29,7 @@ def analyse_metadata(args):
input_file,
parse_dates=["Date_enrollment", "epub_date", "journal_date"],
index_col=[0],
dtype={"pmid": "str"},
dtype={"pmid": str},
)
df = df[~(df.title.str.contains("protocol", flags=re.IGNORECASE) is True)]
import code
Expand Down Expand Up @@ -136,7 +139,8 @@ def add_pubmed_metadata(args):
n = args.chunk_size

df = pandas.read_csv(input_file, dtype={"pmids": "str"})
df["source"] = df.trial_id.str[0:3]
df["source"] = df.trial_id.str[0:3].str.upper()
df.loc[df.source.str.startswith("NL"), "source"] = "NTR"
unique_pmids = df.pmids.dropna().unique()
try:
sys.path.insert(1, os.path.dirname(shutil.which("xtract")))
Expand Down Expand Up @@ -166,20 +170,56 @@ def add_pubmed_metadata(args):
df.to_csv(output_file)


def reported_over_time(args):
    """Bar-plot the percentage of trials with a pmid, by registry and year.

    Reads the cohort CSV at ``args.input_file``, derives the enrollment year
    from ``Date_enrollment``, maps ``source`` through ``REGISTRY_MAP``, and
    for every (enrollment year, registry) pair computes the share of trials
    that have a value in ``pmids``. The chart is written to
    ``percent_reported.png`` in the current working directory.

    Args:
        args: Parsed command-line namespace providing ``input_file``.
    """
    cohort = pandas.read_csv(args.input_file)

    fig, ax = plt.subplots(figsize=(12, 6))

    enrolled_on = pandas.to_datetime(cohort.Date_enrollment)
    cohort["enrollment_year"] = enrolled_on.dt.strftime("%Y")
    cohort["source"] = cohort.source.map(REGISTRY_MAP)

    # "count" tallies non-null entries, so the pmids column yields the number
    # of trials that have a pmid while trial_id yields the group size.
    per_group = cohort.groupby(["enrollment_year", "source"]).agg(
        {"trial_id": "count", "pmids": "count"}
    )
    per_group["pcnt"] = 100 * (per_group.pmids / per_group.trial_id)

    # One row per registry, one column per enrollment year.
    wide = per_group.reset_index().pivot(index="source", columns=["enrollment_year"])
    percent_by_registry = wide["pcnt"]
    # Order the registries by their 2014 percentage, highest first.
    percent_by_registry = percent_by_registry.sort_values("2014", ascending=False)

    percent_by_registry.plot.bar(ax=ax)
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1), title="Enrollment Year")
    plt.title(
        "Percent of trials with trial id in Pubmed Accession or Abstract by registry"
    )
    plt.xlabel("Registry")
    plt.ylabel("Percent (%)")
    plt.xticks(rotation=45)
    plt.savefig("percent_reported.png", bbox_inches="tight")


if __name__ == "__main__":
verb = get_verbosity_parser()
parent = get_full_parser()
pubmed_parser = argparse.ArgumentParser(parents=[parent])
pubmed_parser = argparse.ArgumentParser()
subparsers = pubmed_parser.add_subparsers()

query_parser = subparsers.add_parser("query")
query_parser = subparsers.add_parser("query", parents=[parent])
query_parser.set_defaults(func=trials_in_pubmed)

metadata_parser = subparsers.add_parser("metadata")
metadata_parser = subparsers.add_parser("metadata", parents=[parent])
metadata_parser.set_defaults(func=add_pubmed_metadata)

analyse_parser = subparsers.add_parser("analyse")
analyse_parser = subparsers.add_parser("analyse", parents=[parent])
analyse_parser.set_defaults(func=analyse_metadata)

reported_parser = subparsers.add_parser("percent-reported", parents=[verb])
reported_parser.add_argument(
"--input-file",
required=True,
type=pathlib.Path,
help="Cohort file with discovered pmids",
)
reported_parser.set_defaults(func=reported_over_time)

args = pubmed_parser.parse_args()
if hasattr(args, "func"):
setup_logger(args.verbosity)
Expand Down
Loading

0 comments on commit 25b3439

Please sign in to comment.