-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpredict_citation.py
74 lines (59 loc) · 2.07 KB
/
predict_citation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 4 11:00:05 2019
@author: 726094
"""
import pandas as pd
import spacy
from spacy.util import minibatch, compounding
import plac
from pathlib import Path
import random
import os
import re
import ntpath
def predict_single(raw_text_file, model_dir="./model_folder",
                   output_csv="outputs/result_citation.csv"):
    """Run a saved spaCy model over one text file and save its entities as CSV.

    The model stored in ``model_dir`` is loaded, the whole content of
    ``raw_text_file`` is passed through it, and one CSV row is written per
    entity found in the resulting document.

    Columns written (in this order):
        FileName      base name of the input, with a trailing ``txt``
                      swapped for ``xml.xml``
        SubType       always ``"JUDICIALCOURTDECISION"``
        Anaphoric     ``"F"`` when the entity label contains ``"LONG"``,
                      otherwise ``"T"``
        Citation      the entity text
        CitationType  ``"L"`` when the entity label contains ``"LONG"``,
                      otherwise ``"S"``
        CoReference   always ``"NA"``

    Args:
        raw_text_file: Path of the text file to analyse. It is read with the
            platform default encoding.
        model_dir: Directory passed to ``spacy.load``.
        output_csv: Path of the CSV file to write. Its parent directory is
            created when missing.

    Returns:
        None. The result is the CSV file plus progress messages on stdout.
    """
    print("Loading from", model_dir)
    nlp = spacy.load(model_dir)
    print("Model loaded")

    # Context manager so the file handle is released even if reading fails.
    with open(raw_text_file) as handle:
        content = handle.read()

    # ntpath splits on both "/" and "\\", so Windows-style paths work too.
    fname = ntpath.basename(raw_text_file)
    print('Feeding raw data from: ', fname)

    # Only the trailing "txt" is rewritten; a plain str.replace would also
    # mangle any "txt" that happens to occur earlier in the file name.
    if fname.endswith("txt"):
        xml_name = fname[:-len("txt")] + "xml.xml"
    else:
        xml_name = fname

    doc = nlp(content)
    entities = list(doc.ents)
    # NOTE(review): labels containing "LONG" are presumably long-form
    # citations and everything else short-form -- confirm against the
    # label set the model was trained with.
    long_flags = ["LONG" in ent.label_ for ent in entities]
    row_count = len(entities)

    columns = [
        "FileName",
        "SubType",
        "Anaphoric",
        "Citation",
        "CitationType",
        "CoReference",
    ]
    result = pd.DataFrame(
        {
            "FileName": [xml_name] * row_count,
            "SubType": ["JUDICIALCOURTDECISION"] * row_count,
            "Anaphoric": ["F" if flag else "T" for flag in long_flags],
            "Citation": [ent.text for ent in entities],
            "CitationType": ["L" if flag else "S" for flag in long_flags],
            "CoReference": ["NA"] * row_count,
        },
        columns=columns,
    )

    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    result.to_csv(output_csv, index=False)

    print("Model Execution finished")
    print('Number of citations found: ', row_count)
    print('Citation data saved as: ' + ntpath.basename(output_csv))