Skip to content

Commit

Permalink
update pubmed parser
Browse files Browse the repository at this point in the history
  • Loading branch information
KaimingTao committed Apr 13, 2024
1 parent dc21c3a commit 1f4bf3f
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 32 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


*.csv
85 changes: 53 additions & 32 deletions pubmed_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
import pandas as pd
import argparse


def _split_author_name(name):
    '''
    Split an author string of the form "Last FM" into (last_name, initials).

    The surname is everything before the final space, so multi-word
    surnames ("van der Berg A") stay intact. A name without a space, an
    empty string, or None yields an empty initials string instead of raising.
    NOTE(review): assumes the initials are always the last space-separated
    token, as metapub formats author names -- confirm for collective names.
    '''
    if not name:
        return '', ''
    last, sep, initials = name.strip().rpartition(' ')
    if not sep:
        # No space at all: rpartition puts the whole string in the last slot.
        return initials, ''
    return last.strip(), initials


def parse_pubmed(ids, start_ref_id=3290):
    '''
    Fetch PubMed metadata for a list of PMIDs and write two CSV tables
    into the current working directory.

    Inputs:
        ids: .csv file [NO HEADER] with a list of PMIDs of interest, 1 on
            each line. Only the first column is read.
        start_ref_id: RefID assigned to the first PMID; each following PMID
            gets the next integer. Defaults to 3290, the value that was
            previously hard-coded.
    Output:
        tblReferences.csv file with the following columns:
            RefID
            Author      (first author as "LastName, Initials")
            Title
            Journal
            RefYear
            MedlineID   (the PMID read from the input file)
            Published   (always "Yes")
        tblAuthors.csv file with one row per author and the columns:
            RefID
            LastName
            Initials    (author initials followed by the first letter of
                        the last name)
    Both files are written with the utf-8-sig encoding and no index column.
    '''
    # Keep the caller's path in `ids`; bind the parsed PMIDs to a new name.
    pmids = pd.read_csv(ids, header=None)[0].values
    print(pmids)

    # One fetcher is enough for the whole run; no need to rebuild it per PMID.
    fetch = PubMedFetcher()

    references = []
    authors = []

    for ref_id, pmid in enumerate(pmids, start=start_ref_id):
        article = fetch.article_by_pmid(pmid)

        lastname, first_initial = _split_author_name(article.author1_last_fm)
        references.append({
            'RefID': ref_id,
            'Author': lastname + ', ' + first_initial,
            'Title': article.title,
            'Journal': article.journal,
            'RefYear': article.year,
            'MedlineID': pmid,
            'Published': 'Yes',
        })

        for name in article.authors:
            last, initials = _split_author_name(name)
            authors.append({
                'RefID': ref_id,
                'LastName': last,
                # last[:1] is safe on an empty surname, unlike last[0].
                'Initials': initials + last[:1],
            })

    # Explicit column lists keep the header row stable even with no rows.
    res = pd.DataFrame(
        references,
        columns=['RefID', 'Author', 'Title', 'Journal', 'RefYear',
                 'MedlineID', 'Published'],
    )
    res.to_csv('tblReferences.csv', index=False, encoding='utf-8-sig')

    res2 = pd.DataFrame(authors, columns=['RefID', 'LastName', 'Initials'])
    res2.to_csv('tblAuthors.csv', index=False, encoding='utf-8-sig')


def main():
    '''
    Command-line entry point: read the path of the PMID .csv file from the
    single positional argument, echo it inside a banner, run parse_pubmed
    on it and report completion.
    '''
    cli = argparse.ArgumentParser("PUBMED METADATA PARSER")
    cli.add_argument("ids", type=str, help=".csv file containing Pubmed IDs")
    ids_path = cli.parse_args().ids

    banner = ('\n-----PROCESSING-----', ids_path, '--------------------\n')
    for line in banner:
        print(line)

    parse_pubmed(ids_path)
    print('done!')

# Run the CLI only when executed as a script, so importing this module
# does not trigger argument parsing or a PubMed fetch.
if __name__ == '__main__':
    main()

0 comments on commit 1f4bf3f

Please sign in to comment.