From 261f859042295d77e5c5637dc3b31190ea948fdb Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 1 Apr 2021 17:10:48 -0700
Subject: [PATCH 1/2] SPP client; readme; clean up process PDF to use SPP
 client

---
 doc2json/spp2json/process_pdf.py    | 59 ++++++++++++-----------------
 doc2json/spp2json/spp/README.md     | 25 ++++++++++++
 doc2json/spp2json/spp/spp_client.py | 14 +++++--
 3 files changed, 59 insertions(+), 39 deletions(-)
 create mode 100644 doc2json/spp2json/spp/README.md

diff --git a/doc2json/spp2json/process_pdf.py b/doc2json/spp2json/process_pdf.py
index 8558abb..449b54f 100644
--- a/doc2json/spp2json/process_pdf.py
+++ b/doc2json/spp2json/process_pdf.py
@@ -9,64 +9,53 @@
 
 
 
-def process_pdf_file(input_file: str, temp_dir: str, output_dir: str) -> str:
-    """
-    Process a PDF file and get JSON representation
-    :param input_file:
-    :param temp_dir:
-    :param output_dir:
-    :return:
-    """
-    # get paper id as the name of the file
-    paper_id = '.'.join(input_file.split('/')[-1].split('.')[:-1])
-    spp_json_file = os.path.join(temp_dir, f'{paper_id}.json')
-    output_file = os.path.join(output_dir, f'{paper_id}.json')
-
-    # check if input file exists and output file doesn't
-    if not os.path.exists(input_file):
-        raise FileNotFoundError(f"{input_file} doesn't exist")
-    if os.path.exists(output_file):
-        raise Warning(f'{output_file} already exists!')
+def process_one_pdf(infile: str, spp_tempfile: str, outfile: str) -> str:
 
     # process PDF through SPP -> SPP JSON
     client = SppClient()
-    # TODO: compute PDF hash
-    client.process(input_file, temp_dir)
+    client.process(infile, spp_tempfile)
 
     # process SPP JSON -> S2ORC JSON
-    assert os.path.exists(spp_json_file)
-    with open(spp_json_file, 'r') as f_in:
+    with open(spp_tempfile, 'r') as f_in:
         spp_json = json.load(f_in)
     paper = convert_spp_json_to_s2orc_json(spp_json=spp_json)
 
     # write to file
-    with open(output_file, 'w') as outf:
+    with open(outfile, 'w') as outf:
         json.dump(paper.release_json(), outf, indent=4, sort_keys=False)
 
-    return output_file
+    return outfile
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
-    parser.add_argument("-i", "--input", default=None, help="path to the input PDF file")
-    parser.add_argument("-t", "--temp", default='temp/', help="path to the temp dir for putting tei xml files")
-    parser.add_argument("-o", "--output", default='output/', help="path to the output dir for putting json files")
+    parser.add_argument("-i", "--indir", default=None, help="path to the input PDF dir")
+    parser.add_argument("-t", "--tempdir", default='temp/', help="path to the temp dir for putting SPP JSON files")
+    parser.add_argument("-o", "--outdir", default='output/', help="path to the output dir for putting S2ORC JSON files")
     parser.add_argument("-k", "--keep", action='store_true')
 
     args = parser.parse_args()
 
-    input_path = args.input
-    temp_path = args.temp
-    output_path = args.output
-    keep_temp = args.keep
+    indir = args.indir
+    tempdir = args.tempdir
+    outdir = args.outdir
+    is_keep_temp = args.keep
+
+    os.makedirs(tempdir, exist_ok=True)
+    os.makedirs(outdir, exist_ok=True)
+
 
     start_time = time.time()
 
-    os.makedirs(temp_path, exist_ok=True)
-    os.makedirs(output_path, exist_ok=True)
+    for fname in os.listdir(indir):
+        infile = os.path.join(indir, fname)
+        tempfile = os.path.join(tempdir, fname.replace('.pdf', '-spp.json'))
+        outfile = os.path.join(outdir, fname.replace('.pdf', '-s2orc.json'))
+        process_one_pdf(infile=infile, spp_tempfile=tempfile, outfile=outfile)
 
-    process_pdf_file(input_path, temp_path, output_path)
+        if not is_keep_temp:
+            os.remove(tempfile)
 
     runtime = round(time.time() - start_time, 3)
     print("runtime: %s seconds " % (runtime))
-    print('done.')
\ No newline at end of file
+    print('done.')
diff --git a/doc2json/spp2json/spp/README.md b/doc2json/spp2json/spp/README.md
new file mode 100644
index 0000000..084320c
--- /dev/null
+++ b/doc2json/spp2json/spp/README.md
@@ -0,0 +1,25 @@
+# Python client for ScienceParsePlus
+
+### setup
+
+Install [ScienceParsePlus](https://github.com/allenai/scienceparseplus). Its README documents how to build and run the service via Docker. Once running, the service should be accessible at `http://localhost:8080`.
+
+
+### dependencies
+
+This client assumes Python 3.7 and requires the `requests` package.
+
+### usage
+
+As a script, run:
+```
+python spp_client.py --input example.pdf --output example.json
+```
+
+As a Python library:
+```
+from doc2json.spp2json.spp.spp_client import SppClient
+
+client = SppClient()
+client.process('example.pdf', )
+```
\ No newline at end of file
diff --git a/doc2json/spp2json/spp/spp_client.py b/doc2json/spp2json/spp/spp_client.py
index 07c9c96..4d32fd2 100644
--- a/doc2json/spp2json/spp/spp_client.py
+++ b/doc2json/spp2json/spp/spp_client.py
@@ -7,16 +7,22 @@
 import ntpath
 from typing import List
 
+import requests
 
 class SppClient:
-    def process(self, input: str, output: str):
-        raise NotImplementedError
+    def process(self, infile: str, outfile: str):
+        with open(infile, 'rb') as f_in:
+            files = {"pdf_file": (f_in.name, f_in, "multipart/form-data")}
+            r = requests.post('http://localhost:8080/detect', files=files)
+            layout = r.json()
+            with open(outfile, 'w') as f_out:
+                json.dump(layout, f_out, indent=4)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Client for ScienceParsePlus (SPP) services")
-    parser.add_argument("--input", default=None, help="path to the directory containing PDF to process")
-    parser.add_argument("--output", default=None, help="path to the directory where to put the results")
+    parser.add_argument("--input", default=None, help="path to the PDF to process")
+    parser.add_argument("--output", default=None, help="path to the target output file")
     args = parser.parse_args()
 
     input_path = args.input

From 2aaacafb752f5552d7cffb8bb8d2e40b6f1c0905 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 1 Apr 2021 17:12:45 -0700
Subject: [PATCH 2/2] finish writing README for spp client

---
 doc2json/spp2json/spp/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc2json/spp2json/spp/README.md b/doc2json/spp2json/spp/README.md
index 084320c..6d40e30 100644
--- a/doc2json/spp2json/spp/README.md
+++ b/doc2json/spp2json/spp/README.md
@@ -18,8 +18,12 @@ python spp_client.py --input example.pdf --output example.json
 
 As a Python library:
 ```
+import json
 from doc2json.spp2json.spp.spp_client import SppClient
 
 client = SppClient()
-client.process('example.pdf', )
+client.process('example.pdf', 'example.json')
+
+with open('example.json', 'r') as f_in:
+    spp_json = json.load(f_in)
 ```
\ No newline at end of file