# transition_analysis.py — forked from tobischimanski/transition_NLP
# (GitHub page-scrape artifacts — UI text and rendered line numbers — removed.)
from llama_index.core.prompts.base import PromptTemplate
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
import numpy as np
import pandas as pd
import openai
from llama_index.llms.openai import OpenAI
from openai import AsyncOpenAI
import glob
import json
import asyncio
import re
import os
import time
import sys
# Prompt for extracting basic company facts (name, sector, HQ location) from
# retrieved report chunks; the model must answer with a JSON object.
# Fix: the key list was malformed ("COMPANY_NAME and COMPANY_SECTOR COMPANY_LOCATION"),
# which invites badly-keyed JSON; separators added.
PROMPT_TEMPLATE_GENERAL = ("""
You are tasked with the role of a climate scientist, assigned to analyze a company's sustainability report. Based on the following extracted parts from the sustainability report, answer the given QUESTIONS.
If you don't know the answer, just say that you don't know by answering "NA". Don't try to make up an answer.
Given are the following sources:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n
QUESTIONS:
1. What is the company of the report?
2. What sector does the company belong to?
3. Where is the company located?
Format your answers in JSON format with the following keys: COMPANY_NAME, COMPANY_SECTOR and COMPANY_LOCATION.
Your FINAL_ANSWER in JSON (ensure there's no format error):
""")
# Prompt for extracting the report's publication year; the model must answer
# with a JSON object whose single key is YEAR.
PROMPT_TEMPLATE_YEAR = """
You are tasked with the role of a climate scientist, assigned to analyze a company's sustainability report. Based on the following extracted parts from the sustainability report, answer the given QUESTION.
If you don't know the answer, just say that you don't know by answering "NA". Don't try to make up an answer.
Given are the following sources:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n
QUESTION:
In which year was the report published?
Format your answers in JSON format with the following key: YEAR
Your FINAL_ANSWER in JSON (ensure there's no format error):
"""
# Main QA prompt: answers one masterfile question against retrieved sources.
# Placeholders: basic_info, sources, question, explanation, answer_length.
# Fixes: typo "sustainabiliy"; mismatched doubled quotes around the
# [[YES]]/[[NO]] markers (could confuse the model's output format);
# "enforce to" -> "adhere to".
PROMPT_TEMPLATE_QA = ("""
You are a senior sustainability analyst with expertise in climate science evaluating a company's climate-related transition plan and strategy.
This is basic information to the company:
{basic_info}
You are presented with the following sources from the company's report:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n
Given the sources information and no prior knowledge, your main task is to respond to the posed question encapsulated in "||".
Question: ||{question}||
Please consider the following additional explanation to the question encapsulated in "+++++" as crucial for answering the question:
+++++ [BEGIN OF EXPLANATION]
{explanation}
+++++ [END OF EXPLANATION]
Please adhere to the following guidelines in your answer:
1. Your response must be precise, thorough, and grounded on specific extracts from the report to verify its authenticity.
2. If you are unsure, simply acknowledge the lack of knowledge, rather than fabricating an answer.
3. Keep your ANSWER within {answer_length} words.
4. Be skeptical to the information disclosed in the report as there might be greenwashing (exaggerating the firm's environmental responsibility). Always answer in a critical tone.
5. Cheap talks are statements that are costless to make and may not necessarily reflect the true intentions or future actions of the company. Be critical for all cheap talks you discovered in the report.
6. Always acknowledge that the information provided is representing the company's view based on its report.
7. Scrutinize whether the report is grounded in quantifiable, concrete data or vague, unverifiable statements, and communicate your findings.
8. Start your answer with a "[[YES]]" or "[[NO]]" depending on whether you would answer the question with a yes or no. Always complement your judgement on yes or no with a short explanation that summarizes the sources in an informative way, i.e. provide details.
Format your answer in JSON format with the two keys: ANSWER (this should contain your answer string without sources), and SOURCES (this should be a list of the SOURCE numbers that were referenced in your answer).
Your FINAL_ANSWER in JSON (ensure there's no format error):
""")
def createRetriever(REPORT, CHUNK_SIZE, CHUNK_OVERLAP, TOP_K):
    """Build a vector-store retriever over a single report file.

    The report is loaded, split into sentence-aware chunks of CHUNK_SIZE
    tokens (with CHUNK_OVERLAP overlap), embedded with OpenAI's ada-002
    model, and indexed in an in-memory vector store.  The returned
    retriever yields the TOP_K most similar chunks per query.
    """
    docs = SimpleDirectoryReader(input_files=[REPORT]).load_data()
    # SentenceSplitter tries to keep whole sentences inside one chunk.
    splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    chunk_nodes = splitter.get_nodes_from_documents(docs)
    index = VectorStoreIndex(
        chunk_nodes,
        embed_model=OpenAIEmbedding(model="text-embedding-ada-002"),
    )
    return VectorIndexRetriever(index=index, similarity_top_k=TOP_K)
def basicInformation(retriever, PROMPT_TEMPLATE_GENERAL, MODEL):
    """Ask the LLM for the company's name, sector and HQ location.

    Returns a tuple of (human-readable summary string, parsed JSON dict
    with keys COMPANY_NAME, COMPANY_SECTOR, COMPANY_LOCATION).
    """
    nodes = retriever.retrieve(
        "What is the name of the company, the sector it operates in and location of headquarters?")
    # Build the "sources" block: one paragraph per retrieved chunk,
    # raw newlines stripped so each source stays on a single line.
    chunks = []
    for node in nodes:
        text = node.get_content().replace("\n", "")
        chunks.append(f"PAGE {node.metadata['page_label']}: {text}")
    sources_block = "\n\n\n".join(chunks)
    prompt = PromptTemplate(PROMPT_TEMPLATE_GENERAL).format(sources=sources_block)
    completion = OpenAI(temperature=0, model=MODEL).complete(prompt)
    # Strip the markdown code fences the model sometimes wraps around JSON.
    payload = completion.text.replace("```json", "").replace("```", "")
    parsed = json.loads(payload)
    basic_info = (f" - Company name: {parsed['COMPANY_NAME']}\n"
                  f" - Industry: {parsed['COMPANY_SECTOR']}\n"
                  f" - Headquarter Location: {parsed['COMPANY_LOCATION']}")
    return basic_info, parsed
def yearInformation(retriever, PROMPT_TEMPLATE_YEAR, MODEL):
    """Ask the LLM for the report's publication year.

    Returns the parsed JSON dict (expected key: YEAR).
    """
    nodes = retriever.retrieve("In which year was the report published?")
    # One source paragraph per retrieved chunk, newlines removed.
    chunks = []
    for node in nodes:
        text = node.get_content().replace("\n", "")
        chunks.append(f"PAGE {node.metadata['page_label']}: {text}")
    sources_block = "\n\n\n".join(chunks)
    prompt = PromptTemplate(PROMPT_TEMPLATE_YEAR).format(sources=sources_block)
    completion = OpenAI(temperature=0, model=MODEL).complete(prompt)
    # Strip the markdown code fences the model sometimes wraps around JSON.
    payload = completion.text.replace("```json", "").replace("```", "")
    return json.loads(payload)
def createPromptTemplate(retriever, BASIC_INFO, QUERY_STR, PROMPT_TEMPLATE_QA, EXPLANTATION, ANSWER_LENGTH):
    """Retrieve sources for one question and render the full QA prompt.

    Returns the formatted prompt string (completion-API style).
    Fix: the original additionally built chat messages via
    ``format_messages()`` and discarded the result; that dead call is removed.
    """
    retrieved_nodes = retriever.retrieve(QUERY_STR)
    # Build the "sources" block: one paragraph per chunk, newlines removed
    # so each source stays on a single line.
    sources = []
    for node in retrieved_nodes:
        page_num = node.metadata['page_label']
        source = node.get_content().replace("\n", "")
        sources.append(f"PAGE {page_num}: {source}")
    sources_block = "\n\n\n".join(sources)
    qa_template = PromptTemplate(PROMPT_TEMPLATE_QA)
    prompt = qa_template.format(basic_info=BASIC_INFO, sources=sources_block, question=QUERY_STR,
                                explanation=EXPLANTATION, answer_length=ANSWER_LENGTH)
    return prompt
def createPrompts(retriever, PROMPT_TEMPLATE_QA, BASIC_INFO, ANSWER_LENGTH, MASTERFILE):
    """Build one QA prompt per question row of the masterfile.

    Returns (prompts, questions), both in masterfile row order.
    """
    prompts, questions = [], []
    for _, row in MASTERFILE.iterrows():
        question = row["question"]
        explanation = row["question definitions"]
        questions.append(question)
        prompts.append(createPromptTemplate(retriever, BASIC_INFO, question,
                                            PROMPT_TEMPLATE_QA, explanation, ANSWER_LENGTH))
    print("Prompts Created")
    return prompts, questions
# asynced creation of answers
async def answer_async(prompts, MODEL):
    """Fire one completion per prompt concurrently and await them all.

    Results come back in the same order as ``prompts`` (asyncio.gather
    preserves input order).
    """
    llm = OpenAI(temperature=0, model=MODEL)
    pending = [llm.acomplete(p) for p in prompts]
    return await asyncio.gather(*pending)
async def createAnswersAsync(prompts, MODEL):
    """Thin async wrapper: answer all prompts via answer_async."""
    return await answer_async(prompts, MODEL)
def createAnswers(prompts, MODEL):
    """Answer all prompts sequentially (synchronous alternative to
    createAnswersAsync); returns the list of completion responses."""
    llm = OpenAI(temperature=0, model=MODEL)
    answers = [llm.complete(p) for p in prompts]
    print("Answers Given")
    return answers
def outputExcel(answers, questions, prompts, REPORT, MASTERFILE, MODEL, option="", excels_path="Excels_SustReps"):
    """Parse the model answers and export them as an Excel sheet.

    Returns the path of the written .xlsx file.

    Fixes vs. original:
      * bare ``except:`` narrowed to the exceptions that can actually occur;
      * the "CAUTION: raw answer" fallback was unreachable (``a`` had been
        rebound to a ``str`` before ``json.loads`` could fail, so ``a.text``
        always raised) — the response object is no longer rebound;
      * ``SOURCES`` may be a plain string in the fallback paths; joining it
        with ``map(str, ...)`` iterated its characters.
    """
    categories, ans, ans_verdicts, source_pages, source_texts = [], [], [], [], []
    # identifier looks like "<category>_<subcategory>"; keep the middle part
    subcategories = [ident.split("_")[1] for ident in MASTERFILE.identifier.to_list()]
    category = "target"  # first question block; switched at fixed row offsets below
    for i, a in enumerate(answers):
        try:
            # strip markdown code fences around the JSON payload
            raw = a.text.replace("```json", "").replace("```", "")
            answer_dict = json.loads(raw)
        except (AttributeError, ValueError):  # missing .text / bad JSON
            print(f"{i} with formatting error")
            try:
                answer_dict = {"ANSWER": "CAUTION: Formatting error occurred, this is the raw answer:\n" + a.text,
                               "SOURCES": "See In Answer"}
            except AttributeError:
                answer_dict = {"ANSWER": "Failure in answering this question.", "SOURCES": "NA"}
        # final verdict: the leading [[YES]]/[[NO]] marker, if present
        verdict = re.search(r"\[\[([^]]+)\]\]", answer_dict["ANSWER"])
        ans_verdicts.append(verdict.group(1) if verdict else "NA")
        ans.append(answer_dict["ANSWER"])
        # SOURCES is normally a list of page numbers, but a plain string in
        # the fallback paths; joining a string would iterate its characters.
        sources = answer_dict["SOURCES"]
        if isinstance(sources, (list, tuple)):
            source_pages.append(", ".join(map(str, sources)))
        else:
            source_pages.append(str(sources))
        source_texts.append(prompts[i].split("---------------------")[1])
        # hard-coded category boundaries of the questions masterfile
        if i == 12:
            category = "governance"
        if i == 21:
            category = "strategy"
        if i == 45:
            category = "tracking"
        categories.append(category)
    # create DataFrame and export as excel
    df_out = pd.DataFrame(
        {"category": categories, "subcategory": subcategories, "question": questions, "decision": ans_verdicts,
         "answer": ans,
         "source_pages": source_pages, "source_texts": source_texts})
    excel_path_qa = f"./{excels_path}/" + REPORT.split("/")[-1].split(".")[0] + f"_{MODEL}" + f"{option}" + ".xlsx"
    df_out.to_excel(excel_path_qa)
    return excel_path_qa
async def main():
    """CLI entry point.

    Usage: ``python transition_analysis.py api_key report model [num_indicators]``

    Fixes vs. original: the usage check was ``< 3`` (three arguments require
    ``len(sys.argv) >= 4``) and did not exit, so a wrong invocation crashed
    later with an IndexError; the optional-argument ``except`` was bare; the
    progress counter hard-coded ``i+20`` instead of ``step_size``; typos in
    user-facing messages.
    """
    print(sys.argv)
    if len(sys.argv) < 4:
        print("WRONG USAGE PATTERN!\nPlease use: 'python api_key report model [num indicators]'")
        sys.exit(1)
    args = sys.argv[1:]
    os.environ["OPENAI_API_KEY"] = args[0]
    openai.api_key = args[0]
    # Global parameters
    MASTERFILE = pd.read_excel("questions_masterfile_100524.xlsx")
    CHUNK_SIZE = 350
    CHUNK_OVERLAP = 50
    TOP_K = 8
    ANSWER_LENGTH = 200
    REPORT = args[1]
    MODEL = args[2]
    # optional 4th argument: evaluate only the first <less> indicators
    try:
        less = int(args[3])
        MASTERFILE = MASTERFILE[:less].copy()
        print(f"Execution with subset of {less} indicators.")
    except (IndexError, ValueError):
        less = "all"
        print("Execution with all indicators.")
    retriever = createRetriever(REPORT, CHUNK_SIZE, CHUNK_OVERLAP, TOP_K)
    BASIC_INFO, response_text = basicInformation(retriever, PROMPT_TEMPLATE_GENERAL, MODEL)
    year_info = yearInformation(retriever, PROMPT_TEMPLATE_YEAR, MODEL)
    response_text["YEAR"] = year_info["YEAR"]
    response_text["REPORT_NAME"] = REPORT
    print(response_text)
    prompts, questions = createPrompts(retriever, PROMPT_TEMPLATE_QA, BASIC_INFO, ANSWER_LENGTH, MASTERFILE)
    # Answer in batches so concurrent requests stay below API rate limits.
    answers = []
    step_size = 20
    for i in range(0, len(prompts), step_size):
        batch = prompts[i:i + step_size]
        answers.extend(await createAnswersAsync(batch, MODEL))
        print(f"{min(i + step_size, len(prompts))} Answers Given")
    excels_path = "Excel_Output"
    option = f"_topk{TOP_K}_params{less}"
    path_excel = outputExcel(answers, questions, prompts, REPORT, MASTERFILE, MODEL, option, excels_path)
# For usage on Windows, uncomment:
# asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
# Guarded so importing this module does not kick off the whole pipeline.
if __name__ == "__main__":
    asyncio.run(main())