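"""Streamlit UI: summarize legal PDF documents and extract named entities.

Accepts one or more uploaded PDFs, produces a recursive abstractive summary
with a fine-tuned seq2seq model, and lists the persons, organizations, and
locations found by a BERT NER model.

Run with: streamlit run streamlit-ui.py
"""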
import streamlit as st
import os
import fitz # PyMuPDF
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Summarization pipeline setup
model_name = "shresthasingh/my_awesome_billsum_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
# NER pipeline setup
ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
# aggregation_strategy="simple" merges subword tokens into whole entity spans,
# so results carry complete names instead of wordpiece fragments.
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")
def extract_text_from_pdf(pdf_file):
    # Read the uploaded file into memory and open it as a PDF stream;
    # the context manager closes the document when done.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text
def summarize_text(text, min_length=30):
    # truncation=True keeps inputs within the model's maximum sequence length.
    return summarizer(text, min_length=min_length, do_sample=False, truncation=True)[0]['summary_text']
def chunk_text(text, chunk_size=512):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks
def recursive_summarize(text, chunk_size=300, min_length=30):
    # Base case: the text already fits in a single chunk.
    if len(text.split()) <= chunk_size:
        return summarize_text(text, min_length)
    chunks = chunk_text(text, chunk_size)
    summaries = [summarize_text(chunk, min_length) for chunk in chunks]
    combined_summary = ' '.join(summaries)
    # Stop if this pass did not shrink the text, so the recursion cannot loop forever.
    if len(combined_summary.split()) >= len(text.split()):
        return combined_summary
    return recursive_summarize(combined_summary, chunk_size, min_length)
def extract_named_entities(text, chunk_size=256):
    chunks = chunk_text(text, chunk_size)
    entities = {'PER': set(), 'ORG': set(), 'LOC': set()}
    for chunk in chunks:
        for result in ner_pipeline(chunk):
            # With aggregation, each result is a whole span plus its group label.
            entity_type = result['entity_group']
            if entity_type in entities:
                entities[entity_type].add(result['word'].strip())
    return entities
def process_legal_document(pdf_file):
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_file)
    # Generate summary
    summary = recursive_summarize(text)
    # Extract named entities
    entities = extract_named_entities(text)
    return summary, entities
# Streamlit App
st.title("Legal Document Summarizer")
st.write("Upload PDF documents to generate summaries and extract named entities.")
uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)
if uploaded_files:
    for uploaded_file in uploaded_files:
        summary, entities = process_legal_document(uploaded_file)
        st.write(f"**File:** {uploaded_file.name}")
        st.write(f"**Summary:** {summary}")
        st.write("**Named Entities:**")
        st.write(f"**Persons:** {', '.join(entities['PER'])}")
        st.write(f"**Organizations:** {', '.join(entities['ORG'])}")
        st.write(f"**Locations:** {', '.join(entities['LOC'])}")
    st.write("Note: The analysis results are displayed above.")