"""
Python Script: convert_channel_data.py
This script extracts question-answer pairs from a given JSON data file and creates a new markdown file for each pair.
It uses the OpenAI API to generate questions and answers based on the input data.
The script accepts two command-line arguments:
1. 'input_file': The path to the input JSON file.
2. 'output_dir': The path to the output directory where the question-answer pair files will be saved.
The script uses environment variables for OpenAI API configuration, which should be set in a .env file:
1. 'AZURE_OPENAI_API_KEY': The OpenAI API key.
2. 'AZURE_OPENAI_ENDPOINT': The OpenAI API endpoint.
3. 'AZURE_OPENAI_DEPLOYMENT_NAME': The name of the OpenAI deployment.
How to run:
1. Ensure that all the necessary Python libraries are installed and the .env file is set up correctly.
2. Run the script from the command line with the input file and output directory as arguments.
For example: python convert_channel_data.py input.json output_dir
Author: Mario Guerra
License: MIT
"""
import json
import os
import re
import argparse
import openai
import asyncio
from dotenv import load_dotenv
load_dotenv()
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_type = "azure"
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")
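
# The four environment variables read above are typically supplied via a .env file.
# A minimal illustrative example (placeholder values only; substitute the values
# from your own Azure OpenAI resource, and note the endpoint/version formats shown
# here are assumptions, not taken from this repository):
#
#   AZURE_OPENAI_API_KEY=<your-api-key>
#   AZURE_OPENAI_ENDPOINT=https://<your-resource-name>.openai.azure.com/
#   AZURE_OPENAI_API_VERSION=<api-version, e.g. 2023-05-15>
#   AZURE_OPENAI_DEPLOYMENT_NAME=<your-deployment-name>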
# Define the prompts used with the OpenAI API to generate questions and answers.
question_prompt = """Given the following input, extract and summarize the questions being asked. Clean up the formatting to remove extraneous characters and improve readability. Distill each question down to the essence of what is being asked, removing extraneous information and formatting. Do not include anyone's names in the output. Format the output using the following template, with each question forming a unique item as shown in the example below:
[
{
"question": "How do we specify the naming of the SDK's through TypeSpec?"
},
{
"question": "Is the naming of the package related to the names of the folders in the folder structure?"
},
...
"""
answers_prompt = """Given the following question and answers, extract and synthesize the answers to the question being asked. Create a single concise and correct answer for each question from all answers provided. If an answer includes a name, rephrase it to avoid using the name. Clean up the formatting to improve readability. Format the output using the following template, with each question forming a unique item as shown in the example below:
[
{
"answer": "Specify the name via the `name` field in the `TypeSpec`."
},
...
"""
# Check if a string is a valid JSON object
def is_valid_json(json_string):
    try:
        json.loads(json_string)
        return True
    except ValueError:
        print("Invalid JSON:", json_string)
        return False

# Generate questions and answers: generate_qna() is an async function that sends a
# request to the OpenAI API to generate Q&A based on the input.
# It retries the request if the response is invalid, up to a maximum number of attempts.
async def generate_qna(input, prompt):
    max_retries = 3
    retry_count = 0
    retry_delay = 2  # Delay between retries in seconds
    system_prompt = prompt
    while retry_count < max_retries:
        try:
            print(f"Attempt {retry_count + 1} of {max_retries}...")
            response = openai.ChatCompletion.create(
                engine=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": input},
                ],
                max_tokens=4096,
                n=1,
                stop=None,
                temperature=0.1,
                top_p=0.1,
            )
            return response.choices[0].message.content
        except openai.error.Timeout:
            if retry_count < max_retries - 1:  # If we haven't reached the maximum number of retries
                print(f"Request timed out, retrying (attempt {retry_count + 1})...")
                await asyncio.sleep(retry_delay)  # Wait before retrying
                retry_count += 1  # Count this attempt so repeated timeouts eventually give up
                continue  # Retry the request
            else:  # If we've reached the maximum number of retries
                print(f"Request timed out after {max_retries} attempts. Skipping Q&A for this chunk.")
                return "{}"  # Return an empty JSON string as a fallback
        except Exception as e:  # Catch all other exceptions
            error_message = str(e)
            delay_str = re.search(r'Please retry after (\d+)', error_message)
            if delay_str:
                delay = int(delay_str.group(1))
                print(f"Rate limit exceeded. Retrying in {delay} seconds...")
                await asyncio.sleep(delay)
                retry_count += 1
            else:
                print(f"An error occurred: {e}")
                return "{}"  # Return an empty JSON string as a fallback
        retry_count += 1
        if retry_count < max_retries - 1:
            await asyncio.sleep(retry_delay)
    print(f"Failed to get a valid response after {max_retries} attempts. Skipping Q&A for this chunk.")
    return "{}"  # Return an empty JSON string as a fallback
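
# The input JSON is expected to contain a top-level "messages" list, based on the
# fields accessed in extract_qa_pairs() below. A minimal illustrative example
# (values are made up for demonstration):
#
#   {
#     "messages": [
#       {
#         "messageId": "1234",
#         "messageContent": "How do we specify the naming of the SDKs through TypeSpec?",
#         "replies": [
#           {"replyContent": "Specify the name via the `name` field in the TypeSpec."}
#         ]
#       }
#     ]
#   }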
async def extract_qa_pairs(input_file, output_dir):
    with open(input_file, 'r') as f:
        data = json.load(f)
    counter = 0  # Number of Q&A pairs accumulated for the current output file
    file_counter = 0  # Index of the current output file
    markdown_content = ""  # Markdown accumulated for the current output file
    for item in data['messages']:
        # Skip this message if there are no replies
        if not item['replies']:
            print("No replies to message, skipping messageId ", item['messageId'])
            continue
        message_id = item['messageId']
        message_content = item['messageContent']
        replies = item['replies']
        answers = [reply['replyContent'] for reply in replies]
        answer_content = ' '.join(answers)
        questions = await generate_qna(message_content, question_prompt)
        answers = await generate_qna(questions + '\n' + answer_content, answers_prompt)
        print("Message ID:", message_id)
        print("Questions:", questions)
        print("Answers:", answers)
        # Pair up the generated questions and answers
        if is_valid_json(questions) and is_valid_json(answers):
            question_list = json.loads(questions)
            answer_list = json.loads(answers)
            for question, answer in zip(question_list, answer_list):
                # Check if 'question' and 'answer' exist before trying to access them
                if 'question' in question and 'answer' in answer:
                    # Add the question and answer to the markdown content
                    markdown_content += f"## {question['question']}\n\n{answer['answer']}\n\n"
                    counter += 1  # Increment the counter
                    # Once 50 pairs have accumulated, write them out, then reset the
                    # counter and content and move on to the next file
                    if counter == 50:
                        with open(os.path.join(output_dir, f'typespec_channel_qna_{file_counter}.md'), 'w') as f:
                            f.write(markdown_content)
                        counter = 0
                        markdown_content = ""
                        file_counter += 1  # Increment the file counter
        else:
            print("Invalid JSON returned from chat function, skipping this pair.")
    # Write any remaining pairs that did not fill a complete batch of 50
    if markdown_content:
        with open(os.path.join(output_dir, f'typespec_channel_qna_{file_counter}.md'), 'w') as f:
            f.write(markdown_content)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Extract question/answer pairs from JSON data.')
parser.add_argument('input_file', type=str, help='Path to the input JSON file.')
parser.add_argument('output_dir', type=str, help='Path to the output directory.')
args = parser.parse_args()
asyncio.run(extract_qa_pairs(args.input_file, args.output_dir))