-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsumary.py
47 lines (34 loc) · 1.32 KB
/
sumary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import nltk
from nltk.summarization import IndicativeTransformer
def summarize_text(text):
# Make sure to download the NLTK 'punkt' package
nltk.download('punkt')
# Tokenize the text into sentences
sentences = nltk.sent_tokenize(text)
# Initialize IndicativeTransformer
summarizer = IndicativeTransformer()
# Summarize the text using NLTK
summary = summarizer.summarize(sentences)
# Join summary sentences into a string
summary_text = ' '.join(summary)
# Return the summary text
return summary_text
def process_data_chunk(chunk):
chunk['summary'] = chunk['processed_content'].apply(summarize_text)
return chunk
def main():
data = pd.read_csv('lda_content_10000.csv')
# Dividing the data into several chunks
chunk_size = 1000
chunks = [data[i:i+chunk_size] for i in range(0, len(data), chunk_size)]
# Using ThreadPoolExecutor to process chunks in parallel
with ThreadPoolExecutor() as executor:
processed_chunks = list(executor.map(process_data_chunk, chunks))
# Combining the processed chunks back together
processed_data = pd.concat(processed_chunks)
# Saving the result to a new CSV file
processed_data.to_csv('summarized_data.csv', index=False)
if __name__ == "__main__":
main()