diff --git a/README.md b/README.md
index 1970915..55f85f7 100644
--- a/README.md
+++ b/README.md
@@ -136,6 +136,8 @@ The dataframe columns are:
   - Abstract (from PubMed metadata).
 - mesh
   - MeSH (Medical Subject Headings) provided by Medline.
+- keywords
+  - Largely non-MeSH subject terms that describe the content of the article; beginning in January 2013, these are author-supplied keywords.
 - authors
 - journal
 - pub_type
@@ -272,6 +274,13 @@ A: It seems that you are on a shared computer, you need to identify who is the o
 
 ## Version
 
+### Version 0.3.14
+-> Added the keywords field from the Medline file to the result.
+
+-> Fixed the data type used when reading the Medline file in add_mesh.
+
+-> Fixed code where one article was missing when updating from a list of PMIDs.
+
 ### Version 0.3.13
 -> Since Crossref retired the API key feature to let Elsevier and Wiley identified the author of the publication request. wiley_api_key and elsevier_api_key optional parameters have been added as input parameters. These are not mandatory parameters but increase greatly the retrieval rate as they give access to Wiley and Elsevier publications respectively.
diff --git a/cadmus/__init__.py b/cadmus/__init__.py
index c0a9037..fa1cdcc 100644
--- a/cadmus/__init__.py
+++ b/cadmus/__init__.py
@@ -63,3 +63,4 @@ from cadmus.post_retrieval.parsed_to_df import parsed_to_df
 from cadmus.retrieval.edirect import pipeline
 from cadmus.pre_retrieval.display_export_path import display_export_path
+from cadmus.pre_retrieval.add_keywords import add_keywords
diff --git a/cadmus/main/bioscraping.py b/cadmus/main/bioscraping.py
index bd83f96..fc6824a 100644
--- a/cadmus/main/bioscraping.py
+++ b/cadmus/main/bioscraping.py
@@ -37,6 +37,7 @@ from cadmus.post_retrieval.clean_up_dir import clean_up_dir
 from cadmus.pre_retrieval.add_mesh_remove_preprint import add_mesh_remove_preprint
 from cadmus.pre_retrieval.change_output_structure import change_output_structure
+from cadmus.pre_retrieval.add_keywords import add_keywords
 
 def bioscraping(input_function, email, api_key, wiley_api_key = None, elsevier_api_key = None, start = None, idx = None , full_search = None, keep_abstract = True, click_through_api_key = 'XXXXXXXX-XXXXXXXX-XXXXXXXX-XXXXXXXX'): # first bioscraping checks whether this is an update of a previous search or a new search.
@@ -57,6 +58,9 @@ def bioscraping(input_function, email, api_key, wiley_api_key = None, elsevier_a
         if 'mesh' not in original_df.columns:
             print('Implementing changes to your previous result due to change in the library.')
             original_df = add_mesh_remove_preprint(original_df)
+        if 'keywords' not in original_df.columns:
+            print('Implementing changes to your previous result due to change in the library.')
+            original_df = add_keywords(original_df)
         if original_df.iloc[0].content_text == 0 or original_df.iloc[0].content_text == 1:
             pass
         else:
diff --git a/cadmus/pre_retrieval/__init__.py b/cadmus/pre_retrieval/__init__.py
index 0585fbf..c5a11e0 100644
--- a/cadmus/pre_retrieval/__init__.py
+++ b/cadmus/pre_retrieval/__init__.py
@@ -7,4 +7,5 @@ from cadmus.pre_retrieval.check_for_retrieved_df import check_for_retrieved_df
 from cadmus.pre_retrieval.add_mesh_remove_preprint import add_mesh_remove_preprint
 from cadmus.pre_retrieval.change_output_structure import change_output_structure
-from cadmus.pre_retrieval.display_export_path import display_export_path
\ No newline at end of file
+from cadmus.pre_retrieval.display_export_path import display_export_path
+from cadmus.pre_retrieval.add_keywords import add_keywords
\ No newline at end of file
diff --git a/cadmus/pre_retrieval/add_keywords.py b/cadmus/pre_retrieval/add_keywords.py
new file mode 100644
index 0000000..3fbdca4
--- /dev/null
+++ b/cadmus/pre_retrieval/add_keywords.py
@@ -0,0 +1,82 @@
+import json
+import pandas as pd
+import subprocess
+import zipfile
+import glob
+import os
+
+def add_keywords(df):
+
+    # retrieving the names of the files present in the medline directory to extract previously fetched keywords
+    command = subprocess.getstatusoutput(f"ls -lR ./output/medline/txts")
+    command = list(command)
+    command = command[1]
+    command = str(command).split('\n')
+    my_medline_files = []
+    for i in range(2,len(command)):
+        my_medline_files.append(command[i].split()[-1])
+
+    total_list = []
+    for i in range(len(my_medline_files)):
+        my_file = ''
+        with zipfile.ZipFile(f"./output/medline/txts/{my_medline_files[i]}", "r") as z:
+            for filename in z.namelist():
+                with z.open(filename) as f:
+                    my_file = f.read()
+                    f.close()
+            z.close()
+        total_list.append(str(str(my_file.decode('utf-8'))))
+
+    total_list = total_list[0].split('\n')
+
+    my_pmid_filtered = []
+    my_kw_filtered = []
+    current_kw = []
+    current = False
+    for i in range(len(total_list)):
+        if total_list[i][:4] == 'PMID' and current == False:
+            my_pmid_filtered.append(total_list[i])
+            current = True
+        if total_list[i][:2] == 'OT' and total_list[i][:3] != 'OTO':
+            current_kw.append(total_list[i])
+        if total_list[i][:4] == 'PMID' and current == True:
+            my_kw_filtered.append(current_kw)
+            current_kw = []
+            my_pmid_filtered.append(total_list[i])
+    my_kw_filtered.append(current_kw)
+
+    for i in range(len(my_pmid_filtered)):
+        my_pmid_filtered[i] = my_pmid_filtered[i].replace('PMID- ', '')
+    for i in range(len(my_kw_filtered)):
+        for j in range(len(my_kw_filtered[i])):
+            my_kw_filtered[i][j] = my_kw_filtered[i][j].replace('OT  - ', '')
+
+    df_keywords = pd.DataFrame(list(zip(my_pmid_filtered, my_kw_filtered)),
+                columns =['pmid', 'keywords'])
+
+    df_keywords = df_keywords.drop_duplicates(subset=['pmid'])
+    for index, row in df_keywords.iterrows():
+        if df_keywords.keywords.loc[index] == []:
+            df_keywords.loc[index, 'keywords'] = None
+
+    df = df.reset_index().merge(df_keywords, on='pmid').set_index('index')
+    df = df[['pmid', 'pmcid', 'title', 'abstract', 'mesh', 'keywords', 'authors', 'journal',
+    'pub_type', 'pub_date', 'doi', 'issn', 'crossref', 'full_text_links', 'licenses', 'pdf', 'xml', 'html', 'plain', 'pmc_tgz', 'xml_parse_d', 'html_parse_d', 'pdf_parse_d', 'plain_parse_d', 'content_text']]
+
+    df.pub_date = df.pub_date.astype(str)
+    result = df.to_json(orient="index")
+    if len(glob.glob('./output/retrieved_df/retrieved_df2.json.zip')) == 0:
+        with zipfile.ZipFile("./output/retrieved_df/retrieved_df2.json.zip", mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
+            dumped_JSON: str = json.dumps(result, indent=4)
+            zip_file.writestr("retrieved_df2.json", data=dumped_JSON)
+            zip_file.testzip()
+            zip_file.close()
+    else:
+        os.rename('./output/retrieved_df/retrieved_df2.json.zip', './output/retrieved_df/temp_retrieved_df2.json.zip')
+        with zipfile.ZipFile("./output/retrieved_df/retrieved_df2.json.zip", mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
+            dumped_JSON: str = json.dumps(result, indent=4)
+            zip_file.writestr("retrieved_df2.json", data=dumped_JSON)
+            zip_file.testzip()
+            zip_file.close()
+        os.remove('./output/retrieved_df/temp_retrieved_df2.json.zip')
+
+    return df
\ No newline at end of file
diff --git a/cadmus/pre_retrieval/add_mesh_remove_preprint.py b/cadmus/pre_retrieval/add_mesh_remove_preprint.py
index 13fa96c..0d89258 100644
--- a/cadmus/pre_retrieval/add_mesh_remove_preprint.py
+++ b/cadmus/pre_retrieval/add_mesh_remove_preprint.py
@@ -26,14 +26,12 @@ def add_mesh_remove_preprint(df):
                     my_file = f.read()
                     f.close()
             z.close()
-        total_list.extend(my_file)
+        total_list.append(str(str(my_file.decode('utf-8'))))
 
-    for i in range(len(total_list)):
-        total_list[i] = total_list[i].replace('\n', '')
+    total_list = total_list[0].split('\n')
 
     my_pmid_filtered = []
     my_mh_filtered = []
-    current_pmid = []
     current_mh = []
     current = False
     for i in range(len(total_list)):
@@ -58,8 +56,11 @@ def add_mesh_remove_preprint(df):
                columns =['pmid', 'mesh'])
 
     df_mesh = df_mesh.drop_duplicates(subset=['pmid'])
+    for index, row in df_mesh.iterrows():
+        if df_mesh.mesh.loc[index] == []:
+            df_mesh.loc[index, 'mesh'] = None
 
-    df = df.merge(df_mesh, on='pmid')
+    df = df.reset_index().merge(df_mesh, on='pmid').set_index('index')
     df = df[['pmid', 'pmcid', 'title', 'abstract', 'mesh', 'authors', 'journal', 'pub_type', 'pub_date', 'doi', 'issn', 'crossref', 'full_text_links', 'licenses', 'pdf', 'xml', 'html', 'plain', 'pmc_tgz', 'xml_parse_d', 'html_parse_d', 'pdf_parse_d', 'plain_parse_d', 'content_text']]
     index_to_keep = []
diff --git a/cadmus/pre_retrieval/creation_retrieved_df.py b/cadmus/pre_retrieval/creation_retrieved_df.py
index cdd7d28..102900a 100644
--- a/cadmus/pre_retrieval/creation_retrieved_df.py
+++ b/cadmus/pre_retrieval/creation_retrieved_df.py
@@ -63,6 +63,7 @@ def creation_retrieved_df(medline_file_name):
         if abstract == None or abstract == '':
             abstract = record.get('OAB')
         mesh_terms = record.get('MH')
+        keywords = record.get('OT')
         authors = record.get('AU')
         journal_title = record.get('JT')
         pub_type = record.get('PT')
@@ -81,6 +82,7 @@ def creation_retrieved_df(medline_file_name):
             'title': title,
             'abstract': abstract,
             'mesh': mesh_terms,
+            'keywords': keywords,
             'authors':authors,
             'journal':journal_title,
             'pub_type':pub_type,
diff --git a/cadmus/retrieval/search_terms_to_medline.py b/cadmus/retrieval/search_terms_to_medline.py
index 674f6a4..899accf 100644
--- a/cadmus/retrieval/search_terms_to_medline.py
+++ b/cadmus/retrieval/search_terms_to_medline.py
@@ -52,13 +52,14 @@ def search_terms_to_medline(query_string, api_key):
                     d = f.read()
                     f.close()
             z.close()
-        d = str(str(d.decode('utf-8')) + str(search_results)).encode('utf-8')
+        d = str(str(d.decode('utf-8')) + '\n' + '\n' + str(search_results)).encode('utf-8')
         os.rename('./output/medline/txts/medline_output.txt.zip', './output/medline/txts/temp_medline_output.txt.zip')
         with zipfile.ZipFile("./output/medline/txts/medline_output.txt.zip", mode="a", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
             zip_file.writestr("medline_output.txt", data=d)
             zip_file.testzip()
             zip_file.close()
         os.remove('./output/medline/txts/temp_medline_output.txt.zip')
+        print('Medline Records retrieved and saved')
     else:
         #to avoid errors for large pmids list. We now chunk into smaller set of 9000. Finally we append every chunk in the medline text file.
         for i in range(len(query_string)):
@@ -76,7 +77,7 @@ def search_terms_to_medline(query_string, api_key):
                     d = f.read()
                     f.close()
                 z.close()
-            d = str(str(d.decode('utf-8')) + str(search_results)).encode('utf-8')
+            d = str(str(d.decode('utf-8')) + '\n' + '\n' + str(search_results)).encode('utf-8')
             os.rename('./output/medline/txts/medline_output.txt.zip', './output/medline/txts/temp_medline_output.txt.zip')
             with zipfile.ZipFile("./output/medline/txts/medline_output.txt.zip", mode="a", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
                 zip_file.writestr("medline_output.txt", data=d)
diff --git a/setup.py b/setup.py
index 531d0f4..3887b16 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 setuptools.setup(
     name="cadmus",
-    version="0.3.13",
+    version="0.3.14",
     author="Jamie Campbell, Ian Simpson, Antoine Lain",
     author_email="Jamie.campbell@igmm.ed.ac.uk, Ian.Simpson@ed.ac.uk, Antoine.Lain@ed.ac.uk",
     description="This projects is to build full text retrieval system setup for generation of large biomedical corpora from published literature.",
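
For context on the MEDLINE text format that add_keywords scans: each record starts with a "PMID- " line, author keywords appear one per "OT  - " line, and "OTO - " names the keyword owner rather than a keyword, which is why it is excluded. A minimal standalone sketch of the same scan, on made-up records (not taken from the library):

    # Minimal sketch of the PMID/OT scan in add_keywords (illustrative data).
    sample = (
        "PMID- 11111111\n"
        "TI  - Some title\n"
        "OT  - cystic fibrosis\n"
        "OT  - CFTR\n"
        "\n"
        "PMID- 22222222\n"
        "OTO - NOTNLM\n"
    )

    keywords_by_pmid = {}
    current_pmid = None
    for line in sample.split('\n'):
        if line[:4] == 'PMID':
            current_pmid = line.replace('PMID- ', '')
            keywords_by_pmid[current_pmid] = []
        # 'OT' lines are keywords; 'OTO' marks the keyword owner, so skip it.
        elif line[:2] == 'OT' and line[:3] != 'OTO' and current_pmid is not None:
            keywords_by_pmid[current_pmid].append(line.replace('OT  - ', ''))

    print(keywords_by_pmid)
    # {'11111111': ['cystic fibrosis', 'CFTR'], '22222222': []}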
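
Both add_keywords and the updated add_mesh_remove_preprint merge via df.reset_index().merge(...).set_index('index') rather than a plain df.merge(...): pandas merge discards the caller's index, so round-tripping it through a column preserves the original row labels. A small demonstration with hypothetical frames (not the library's data):

    import pandas as pd

    # Hypothetical frames; retrieved_df-style rows keyed by pmid.
    df = pd.DataFrame({'pmid': ['101', '102'], 'title': ['a', 'b']}, index=['r1', 'r2'])
    extra = pd.DataFrame({'pmid': ['101', '102'], 'keywords': [['kw'], None]})

    plain = df.merge(extra, on='pmid')                                  # index reset to 0, 1
    kept = df.reset_index().merge(extra, on='pmid').set_index('index')  # index preserved

    print(plain.index.tolist())  # [0, 1]
    print(kept.index.tolist())   # ['r1', 'r2']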
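
In creation_retrieved_df, the new keywords = record.get('OT') picks up the same field at first-retrieval time. The record.get(...) calls are consistent with the per-record dicts yielded by Biopython's Medline.parse (an assumption here; the diff does not show the parsing call), where 'OT' maps to a list of keywords and is absent when a record has none:

    from io import StringIO
    from Bio import Medline  # assumption: records come from Biopython's MEDLINE parser

    raw = StringIO(
        "PMID- 11111111\n"
        "TI  - Some title\n"
        "OT  - cystic fibrosis\n"
        "OT  - CFTR\n"
    )
    for record in Medline.parse(raw):
        keywords = record.get('OT')  # list of keywords, or None when no OT lines exist
        print(record.get('PMID'), keywords)
    # 11111111 ['cystic fibrosis', 'CFTR']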
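
The '\n' + '\n' inserted in search_terms_to_medline before appending a new batch of search_results appears to be the fix behind the README's "one article was missing" note: if the existing file content does not end with a newline, plain concatenation glues its last line to the "PMID-" line of the next batch, and a line-based scan then never sees that record. A toy illustration of that presumed failure mode (strings made up):

    old = "PMID- 11111111\nOT  - term"        # existing medline_output.txt content
    new = "PMID- 22222222\nOT  - other term"  # next batch fetched from PubMed

    bad = old + new  # middle line becomes 'OT  - termPMID- 22222222'
    print('PMID- 22222222' in bad.split('\n'))   # False: the record is invisible

    good = old + '\n' + '\n' + new  # blank line between batches, as in the fix
    print('PMID- 22222222' in good.split('\n'))  # True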