Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SPP client; readme; clean up process PDF to use SPP client #12

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 24 additions & 35 deletions doc2json/spp2json/process_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,64 +9,53 @@



def process_pdf_file(input_file: str, temp_dir: str, output_dir: str) -> str:
"""
Process a PDF file and get JSON representation
:param input_file:
:param temp_dir:
:param output_dir:
:return:
"""
# get paper id as the name of the file
paper_id = '.'.join(input_file.split('/')[-1].split('.')[:-1])
spp_json_file = os.path.join(temp_dir, f'{paper_id}.json')
output_file = os.path.join(output_dir, f'{paper_id}.json')

# check if input file exists and output file doesn't
if not os.path.exists(input_file):
raise FileNotFoundError(f"{input_file} doesn't exist")
if os.path.exists(output_file):
raise Warning(f'{output_file} already exists!')
def process_one_pdf(infile: str, spp_tempfile: str, outfile: str) -> str:

# process PDF through SPP -> SPP JSON
client = SppClient()
# TODO: compute PDF hash
client.process(input_file, temp_dir)
client.process(infile, spp_tempfile)

# process SPP JSON -> S2ORC JSON
assert os.path.exists(spp_json_file)
with open(spp_json_file, 'r') as f_in:
with open(spp_tempfile, 'r') as f_in:
spp_json = json.load(f_in)
paper = convert_spp_json_to_s2orc_json(spp_json=spp_json)

# write to file
with open(output_file, 'w') as outf:
with open(outfile, 'w') as outf:
json.dump(paper.release_json(), outf, indent=4, sort_keys=False)

return output_file
return outfile


if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
parser.add_argument("-i", "--input", default=None, help="path to the input PDF file")
parser.add_argument("-t", "--temp", default='temp/', help="path to the temp dir for putting tei xml files")
parser.add_argument("-o", "--output", default='output/', help="path to the output dir for putting json files")
parser.add_argument("-i", "--indir", default=None, help="path to the input PDF dir")
parser.add_argument("-t", "--tempdir", default='temp/', help="path to the temp dir for putting SPP JSON files")
parser.add_argument("-o", "--outdir", default='output/', help="path to the output dir for putting S2ORC JSON files")
parser.add_argument("-k", "--keep", action='store_true')

args = parser.parse_args()

input_path = args.input
temp_path = args.temp
output_path = args.output
keep_temp = args.keep
indir = args.indir
tempdir = args.tempdir
outdir = args.outdir
is_keep_temp = args.keep

os.makedirs(tempdir, exist_ok=True)
os.makedirs(outdir, exist_ok=True)


start_time = time.time()

os.makedirs(temp_path, exist_ok=True)
os.makedirs(output_path, exist_ok=True)
for fname in os.listdir(indir):
infile = os.path.join(indir, fname)
tempfile = os.path.join(tempdir, fname.replace('.pdf', '-spp.json'))
outfile = os.path.join(outdir, fname.replace('.pdf', '-s2orc.json'))
process_one_pdf(infile=infile, spp_tempfile=tempfile, outfile=outfile)

process_pdf_file(input_path, temp_path, output_path)
if not is_keep_temp:
os.remove(tempfile)

runtime = round(time.time() - start_time, 3)
print("runtime: %s seconds " % (runtime))
print('done.')
print('done.')
29 changes: 29 additions & 0 deletions doc2json/spp2json/spp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Python client for ScienceParsePlus

### setup

Install [ScienceParsePlus](https://github.com/allenai/scienceparseplus). That project's README should document how to build and run the service via Docker. Once it is running, the service should be accessible at `http://localhost:8080`.


### dependencies

This assumes Python 3.7 and the `requests` package, which `spp_client.py` uses to call the running service.

### usage

As a script, run:
```
python spp_client.py --input example.pdf --output example.json
```

As a Python library:
```
import json
from doc2json.spp2json.spp.spp_client import SppClient

client = SppClient()
client.process('example.pdf', 'example.json')

with open('example.json', 'r') as f_in:
    spp_json = json.load(f_in)
```
14 changes: 10 additions & 4 deletions doc2json/spp2json/spp/spp_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@
import ntpath
from typing import List

import requests

class SppClient:
def process(self, input: str, output: str):
raise NotImplementedError
def process(self, infile: str, outfile: str):
with open(infile, 'rb') as f_in:
files = {"pdf_file": (f_in.name, f_in, "multipart/form-data")}
r = requests.post('http://localhost:8080/detect', files=files)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe pick a different port for SPP to avoid a collision. I think the Flask app and GROBID currently run on 8070 and 8080?

layout = r.json()
with open(outfile, 'w') as f_out:
json.dump(layout, f_out, indent=4)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Client for ScienceParsePlus (SPP) services")
parser.add_argument("--input", default=None, help="path to the directory containing PDF to process")
parser.add_argument("--output", default=None, help="path to the directory where to put the results")
parser.add_argument("--input", default=None, help="path to the PDF to process")
parser.add_argument("--output", default=None, help="path to the target output file")
args = parser.parse_args()

input_path = args.input
Expand Down