From 261f859042295d77e5c5637dc3b31190ea948fdb Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 1 Apr 2021 17:10:48 -0700 Subject: [PATCH 1/2] SPP client; readme; clean up process PDF to use SPP client --- doc2json/spp2json/process_pdf.py | 59 ++++++++++++----------------- doc2json/spp2json/spp/README.md | 25 ++++++++++++ doc2json/spp2json/spp/spp_client.py | 14 +++++-- 3 files changed, 59 insertions(+), 39 deletions(-) create mode 100644 doc2json/spp2json/spp/README.md diff --git a/doc2json/spp2json/process_pdf.py b/doc2json/spp2json/process_pdf.py index 8558abb..449b54f 100644 --- a/doc2json/spp2json/process_pdf.py +++ b/doc2json/spp2json/process_pdf.py @@ -9,64 +9,53 @@ -def process_pdf_file(input_file: str, temp_dir: str, output_dir: str) -> str: - """ - Process a PDF file and get JSON representation - :param input_file: - :param temp_dir: - :param output_dir: - :return: - """ - # get paper id as the name of the file - paper_id = '.'.join(input_file.split('/')[-1].split('.')[:-1]) - spp_json_file = os.path.join(temp_dir, f'{paper_id}.json') - output_file = os.path.join(output_dir, f'{paper_id}.json') - - # check if input file exists and output file doesn't - if not os.path.exists(input_file): - raise FileNotFoundError(f"{input_file} doesn't exist") - if os.path.exists(output_file): - raise Warning(f'{output_file} already exists!') +def process_one_pdf(infile: str, spp_tempfile: str, outfile: str) -> str: # process PDF through SPP -> SPP JSON client = SppClient() - # TODO: compute PDF hash - client.process(input_file, temp_dir) + client.process(infile, spp_tempfile) # process SPP JSON -> S2ORC JSON - assert os.path.exists(spp_json_file) - with open(spp_json_file, 'r') as f_in: + with open(spp_tempfile, 'r') as f_in: spp_json = json.load(f_in) paper = convert_spp_json_to_s2orc_json(spp_json=spp_json) # write to file - with open(output_file, 'w') as outf: + with open(outfile, 'w') as outf: json.dump(paper.release_json(), outf, indent=4, sort_keys=False) - return 
output_file + return outfile if __name__ == '__main__': parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON") - parser.add_argument("-i", "--input", default=None, help="path to the input PDF file") - parser.add_argument("-t", "--temp", default='temp/', help="path to the temp dir for putting tei xml files") - parser.add_argument("-o", "--output", default='output/', help="path to the output dir for putting json files") + parser.add_argument("-i", "--indir", default=None, help="path to the input PDF dir") + parser.add_argument("-t", "--tempdir", default='temp/', help="path to the temp dir for putting SPP JSON files") + parser.add_argument("-o", "--outdir", default='output/', help="path to the output dir for putting S2ORC JSON files") parser.add_argument("-k", "--keep", action='store_true') args = parser.parse_args() - input_path = args.input - temp_path = args.temp - output_path = args.output - keep_temp = args.keep + indir = args.indir + tempdir = args.tempdir + outdir = args.outdir + is_keep_temp = args.keep + + os.makedirs(tempdir, exist_ok=True) + os.makedirs(outdir, exist_ok=True) + start_time = time.time() - os.makedirs(temp_path, exist_ok=True) - os.makedirs(output_path, exist_ok=True) + for fname in os.listdir(indir): + infile = os.path.join(indir, fname) + tempfile = os.path.join(tempdir, fname.replace('.pdf', '-spp.json')) + outfile = os.path.join(outdir, fname.replace('.pdf', '-s2orc.json')) + process_one_pdf(infile=infile, spp_tempfile=tempfile, outfile=outfile) - process_pdf_file(input_path, temp_path, output_path) + if not is_keep_temp: + os.remove(tempfile) runtime = round(time.time() - start_time, 3) print("runtime: %s seconds " % (runtime)) - print('done.') \ No newline at end of file + print('done.') diff --git a/doc2json/spp2json/spp/README.md b/doc2json/spp2json/spp/README.md new file mode 100644 index 0000000..084320c --- /dev/null +++ b/doc2json/spp2json/spp/README.md @@ -0,0 +1,25 @@ +# python client for ScienceParsePlus + +### 
setup + +Install [ScienceParsePlus](https://github.com/allenai/scienceparseplus). That repository's README should document how to build and run the service via Docker. The running service should be accessible at `http://localhost:8080`. + + +### dependencies + +This client assumes Python 3.7. + +### usage + +As a script, run: +``` +python spp_client.py --input example.pdf --output example.json +``` + +As a Python library: +``` +from doc2json.spp2json.spp.spp_client import SppClient + +client = SppClient() +client.process('example.pdf', ) +``` \ No newline at end of file diff --git a/doc2json/spp2json/spp/spp_client.py b/doc2json/spp2json/spp/spp_client.py index 07c9c96..4d32fd2 100644 --- a/doc2json/spp2json/spp/spp_client.py +++ b/doc2json/spp2json/spp/spp_client.py @@ -7,16 +7,22 @@ import ntpath from typing import List +import requests class SppClient: - def process(self, input: str, output: str): - raise NotImplementedError + def process(self, infile: str, outfile: str): + with open(infile, 'rb') as f_in: + files = {"pdf_file": (f_in.name, f_in, "multipart/form-data")} + r = requests.post('http://localhost:8080/detect', files=files) + layout = r.json() + with open(outfile, 'w') as f_out: + json.dump(layout, f_out, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Client for ScienceParsePlus (SPP) services") - parser.add_argument("--input", default=None, help="path to the directory containing PDF to process") - parser.add_argument("--output", default=None, help="path to the directory where to put the results") + parser.add_argument("--input", default=None, help="path to the PDF to process") + parser.add_argument("--output", default=None, help="path to the target output file") args = parser.parse_args() input_path = args.input From 2aaacafb752f5552d7cffb8bb8d2e40b6f1c0905 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 1 Apr 2021 17:12:45 -0700 Subject: [PATCH 2/2] finish writing README for spp client --- doc2json/spp2json/spp/README.md | 6 +++++- 1
file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc2json/spp2json/spp/README.md b/doc2json/spp2json/spp/README.md index 084320c..6d40e30 100644 --- a/doc2json/spp2json/spp/README.md +++ b/doc2json/spp2json/spp/README.md @@ -18,8 +18,12 @@ python spp_client.py --input example.pdf --output example.json As a Python library: ``` +import json from doc2json.spp2json.spp.spp_client import SppClient client = SppClient() -client.process('example.pdf', ) +client.process('example.pdf', 'example.json') + +with open('example.json', 'r') as f_in: + spp_json = json.load(f_in) ``` \ No newline at end of file