generated from oracle-devrel/repo-template
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
83 lines (67 loc) · 2.33 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# scrapy scripts
'''
from trending_spider import main as run_trending_spider
from info_spider import main as run_info_spider
'''
import json
from nltk.tokenize import word_tokenize
import re
import yaml
from db_handler import DatabaseHandler
# python scripts
from readme_reader import main as run_readme_reader
from summarize_llm import main as run_summarizer
def count_tokens(data: str = "") -> int:
    """Count the tokens in *data* and echo the count to stdout.

    The text is split with ``word_tokenize`` (imported from
    ``nltk.tokenize`` at the top of this file). The count is printed
    between two runs of asterisks and then returned.

    Args:
        data: Text to tokenize. Defaults to the empty string.

    Returns:
        The number of tokens produced by ``word_tokenize(data)``.
    """
    # Tokenize once and reuse the result for both the log line and the
    # return value instead of tokenizing the same text twice.
    token_count = len(word_tokenize(data))
    print('*******{}*******'.format(token_count))
    return token_count
def preprocess_string(data: str) -> str:
    """Return *data* with non-ASCII characters and ``<...>`` tags removed.

    Processing steps, in order:

    1. Drop every character outside the ``\\x00``-``\\x7f`` range.
    2. Drop every ``<...>`` span (non-greedy). The pattern is used without
       ``re.DOTALL``, so a tag that spans a line break is left in place.
    3. Round-trip through an ASCII encode with ``errors="ignore"``.

    Args:
        data: Raw text, e.g. the contents of a README.

    Returns:
        The cleaned, ASCII-only string.
    """
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', data)
    # Feed the result of the first substitution into the second one; the
    # previous code restarted from ``data`` and discarded the first step.
    without_tags = re.sub(r'<.*?>', r'', ascii_only)
    # Kept as a final guard so the returned value is always encodable as
    # ASCII, exactly as before.
    return without_tags.encode("ascii", "ignore").decode()
def main():
    """Summarize every README record and persist each summary.

    Workflow:

    1. Read database settings (``db_username``, ``db_password``,
       ``db_dsn``) from ``config.yaml`` and open a connection through
       ``DatabaseHandler``.
    2. Fetch the README records with ``run_readme_reader``.
    3. For each record: clean it with ``preprocess_string``, skip it when
       fewer than 250 characters remain, cap it at 10000 characters,
       summarize it with ``run_summarizer``, write the summary to
       ``outputs/output_<position>.txt`` and insert it into the database.
    4. Close the database connection, even if a step above raised.

    ``<position>`` is the 1-based index of the record in the list returned
    by ``run_readme_reader``; skipped records still consume a position.
    """
    # Local import so this function brings its own dependency into scope.
    import os

    # Load database configuration
    with open('config.yaml', 'r') as file:
        config = yaml.safe_load(file)

    # Initialize database connection
    db = DatabaseHandler(
        username=config['db_username'],
        password=config['db_password'],
        dsn=config['db_dsn']
    )
    db.connect()

    try:
        readme_list = run_readme_reader()
        print('Obtained {} README records'.format(len(readme_list)))

        # Make sure the target directory exists; otherwise the first
        # open(..., 'w') below fails with FileNotFoundError.
        os.makedirs('outputs', exist_ok=True)

        # enumerate() replaces the hand-maintained counter that had to be
        # incremented on both the skip path and the normal path.
        for position, raw_text in enumerate(readme_list, start=1):
            new_text = preprocess_string(raw_text)
            if len(new_text) < 250:
                print('Skipping iteration as it does not have enough data to summarize')
                continue

            print('Text length: {}'.format(len(raw_text)))
            print('Text length: {}'.format(len(new_text)))
            # Slicing is a no-op for shorter strings, so no length check
            # is needed to cap the text at 10000 characters.
            new_text = new_text[:10000]
            print('Text length: {}'.format(len(new_text)))

            summary = run_summarizer(new_text)
            print(summary)

            # Save to file
            output_file = f'outputs/output_{position}.txt'
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(summary)

            # Save to database
            db.insert_summary(
                summary_text=summary,
                daily_position=position,
                file_path=output_file
            )
    finally:
        # Always release the connection, including on errors mid-loop.
        db.close()
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger main().
if __name__ == '__main__':
    main()