-
Notifications
You must be signed in to change notification settings - Fork 2
/
podcast_generator.py
240 lines (205 loc) · 9.83 KB
/
podcast_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import requests
import json
from prompt import talk_generate_prompt_en, talk_generate_prompt_zh, single_article_prompt_zh, single_article_prompt_en
import re
import logging
import config
# Configure the root logger when this module is imported so that the
# INFO-level messages logged below (e.g. the generated content) are emitted.
logging.basicConfig(level=logging.INFO)
class PodcastGenerator:
    """Turn articles into a host/guest podcast script via an LLM HTTP API.

    The generator POSTs a JSON payload containing ``model`` and ``messages``
    to ``api_url`` and reads the reply text from
    ``choices[0].message.content``.  The reply is expected to contain a JSON
    object with a top-level ``podcast`` key holding ``opening``,
    ``main_content`` and ``closing`` dialogue lists; these are flattened into
    ``"<Role>: <text>"`` lines joined by newlines.
    """

    # Speaker labels written in front of every dialogue line, per language.
    _ROLE_NAMES = {
        "en": {"host": "Host", "guest": "Guest"},
        "zh": {"host": "主持人", "guest": "嘉宾"},
    }

    # Failures meaning "the API payload / model reply did not have the
    # expected JSON shape".  IndexError covers an empty ``choices`` list and
    # TypeError covers values of the wrong type (e.g. a string where a list of
    # dialogue dicts was expected).
    _FORMAT_ERRORS = (json.JSONDecodeError, KeyError, IndexError, TypeError)

    # Skeleton of the expected reply, echoed back to the model after a
    # format failure so it can correct itself on the next attempt.
    _EXPECTED_STRUCTURE = (
        '{',
        '"podcast": {',
        '"opening": [',
        '{"role": "host", "content": "..."},',
        '{"role": "guest", "content": "..."}',
        '],',
        '"main_content": [',
        '{',
        '"article_index": 1,',
        '"discussion": [',
        '{"role": "host", "content": "..."},',
        '{"role": "guest", "content": "..."}',
        ']',
        '}',
        '],',
        '"closing": [',
        '{"role": "host", "content": "..."},',
        '{"role": "guest", "content": "..."}',
        ']',
        '}',
        '}',
    )

    def __init__(self, api_url, api_token, max_retries=3, timeout=600):
        """Store the connection settings.

        Args:
            api_url: URL the generation request is POSTed to.
            api_token: Token sent as ``Authorization: Bearer <token>``.
            max_retries: Total number of generation attempts before giving up.
            timeout: Seconds passed to ``requests.post`` as its ``timeout``;
                ``None`` disables the timeout (the previous behaviour).
        """
        self.api_url = api_url
        self.api_token = api_token
        self.max_retries = max_retries
        self.timeout = timeout

    def generate(self, news_articles):
        """Generate a podcast script covering several news articles.

        Args:
            news_articles: Iterable of mappings with the keys ``title``,
                ``published_date``, ``content`` and ``source``.

        Returns:
            The script as newline-joined ``"<Role>: <text>"`` lines.

        Raises:
            Exception: If every attempt failed (``max_retries`` attempts).
        """
        prompt = self._create_prompt(news_articles)
        return self._generate_with_retries(
            prompt, self._post_process_content, hint_on_format_error=True
        )

    def generate_single_article(self, article):
        """Generate a podcast script for a single article.

        Args:
            article: Mapping with the keys ``title``, ``authors``,
                ``published_date``, ``categories``, ``doi``, ``comment``,
                ``journal_ref`` and ``content``.  ``authors`` and
                ``categories`` are joined with ``", "``; falsy ``doi``,
                ``comment`` and ``journal_ref`` values are rendered as
                ``"N/A"``.

        Returns:
            The script as newline-joined lines, with a ``=== <section> ===``
            marker before each main-content section and before the closing.

        Raises:
            Exception: If every attempt failed (``max_retries`` attempts).
        """
        # Pick the prompt template matching the configured language.
        if config.LANGUAGE == "zh":
            prompt_template = single_article_prompt_zh
        else:
            prompt_template = single_article_prompt_en

        # Fill the template with the article's metadata and body.
        prompt = prompt_template.format(
            title=article['title'],
            authors=', '.join(article['authors']),
            published_date=article['published_date'],
            categories=', '.join(article['categories']),
            doi=article['doi'] or 'N/A',
            comment=article['comment'] or 'N/A',
            journal_ref=article['journal_ref'] or 'N/A',
            content=article['content'],
        )
        return self._generate_with_retries(
            prompt, self._post_process_single_article, hint_on_format_error=False
        )

    def _request_completion(self, prompt):
        """POST ``prompt`` as a single user message and return the reply text.

        Raises:
            requests.RequestException: On transport failures, timeouts or a
                non-success HTTP status.
            json.JSONDecodeError, KeyError, IndexError, TypeError: If the
                response body does not have the expected shape.
        """
        headers = {
            "Authorization": f"Bearer {self.api_token}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": config.LLM_MODLE,
            "messages": [{"role": "user", "content": prompt}],
        }
        response = requests.post(
            self.api_url,
            headers=headers,
            data=json.dumps(payload),
            timeout=self.timeout,
        )
        # Surface HTTP errors as such instead of as a confusing KeyError on
        # the missing 'choices' key of an error payload.
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']

    def _generate_with_retries(self, prompt, post_process, hint_on_format_error):
        """Request a reply and post-process it, retrying on failure.

        Args:
            prompt: Prompt text for the first attempt.
            post_process: Callable turning the raw reply text into the script.
            hint_on_format_error: When true, a description of the last format
                error plus the expected JSON skeleton is appended to the
                prompt before the next attempt.

        Returns:
            Whatever ``post_process`` returns for the first usable reply.

        Raises:
            Exception: After ``max_retries`` failed attempts (also when
                ``max_retries`` allows no attempt at all).
        """
        last_error = None
        last_exception = None
        for attempt in range(1, self.max_retries + 1):
            try:
                content = self._request_completion(prompt)
                logging.info(f"Generated content:\n {content}")
                return post_process(content)
            except requests.RequestException as e:
                # Transport/HTTP problem: retry with the same prompt, the
                # JSON-format hint would be misleading here.
                last_exception = e
                format_failure = False
            except self._FORMAT_ERRORS as e:
                last_exception = e
                format_failure = True

            last_error = str(last_exception)
            logging.warning(f"Generation attempt {attempt} failed: {last_error}")
            if hint_on_format_error and format_failure and attempt < self.max_retries:
                # Tell the model what went wrong and show the required shape.
                prompt += self._build_retry_hint(last_error)

        logging.error(f"Reached maximum retry attempts ({self.max_retries}), generation failed")
        raise Exception(
            f"Failed to generate podcast content, last error: {last_error}"
        ) from last_exception

    @classmethod
    def _build_retry_hint(cls, error):
        """Return the text appended to the prompt after a format failure."""
        return (
            "\n"
            f"Last generation failed with error: {error}\n"
            "Please ensure the generated content is valid JSON format and strictly follows this structure:\n"
            + "\n".join(cls._EXPECTED_STRUCTURE)
            + "\nPlease regenerate the content:\n"
        )

    def _create_prompt(self, news_articles):
        """Build the multi-article prompt for the configured language.

        The language-specific base prompt is followed by one formatted entry
        per article and a closing instruction.  A ``config.LANGUAGE`` other
        than ``"en"`` selects the Chinese base prompt, but the article and
        closing templates only exist for ``"en"`` and ``"zh"``, so any other
        value raises ``KeyError``.
        """
        # Select prompt based on language setting.
        if config.LANGUAGE == "en":
            base_prompt = talk_generate_prompt_en
        else:
            base_prompt = talk_generate_prompt_zh

        # Language-specific formatting for each article entry.
        article_format = {
            "en": "Title: {}\nDate: {}\nContent: {}\nSource: {}\n\n",
            "zh": "标题:{}\n日期:{}\n内容:{}\n来源:{}\n\n",
        }
        # Language-specific closing instruction.
        closing_instruction = {
            "en": "Please present in dialogue form, including host and guest interactions, maintaining an engaging and professional tone.",
            "zh": "请以对话形式呈现,包含主持人和嘉宾的互动,保持专业性和趣味性。",
        }

        parts = [base_prompt]
        for article in news_articles:
            parts.append(
                article_format[config.LANGUAGE].format(
                    article['title'],
                    article['published_date'],
                    article['content'],
                    article['source'],
                )
            )
        parts.append(closing_instruction[config.LANGUAGE])
        return ''.join(parts)

    @staticmethod
    def _parse_reply(content):
        """Extract and parse the JSON object embedded in the reply text.

        The span from the first ``{`` to the last ``}`` is parsed, which
        tolerates text around the JSON (for example a Markdown code fence).
        Without any such span the whole text is parsed as-is.

        Raises:
            json.JSONDecodeError: If the extracted text is not valid JSON.
        """
        json_match = re.search(r'\{[\s\S]*\}', content)
        if json_match:
            content = json_match.group(0)
        return json.loads(content)

    @staticmethod
    def _render_dialogs(dialogs, role_names):
        """Render dialogue entries as ``"<Role>: <content>"`` lines.

        Any ``role`` other than ``"host"`` is rendered as the guest.
        """
        lines = []
        for turn in dialogs:
            speaker = role_names["host"] if turn['role'] == "host" else role_names["guest"]
            lines.append(f"{speaker}: {turn['content']}")
        return lines

    def _post_process_content(self, content, language=None):
        """Convert a multi-article reply into the final script text.

        Args:
            content: Raw reply text containing the podcast JSON.
            language: ``"en"`` or ``"zh"``; defaults to ``config.LANGUAGE``.

        Returns:
            Opening, per-article discussions and closing as newline-joined
            ``"<Role>: <text>"`` lines.

        Raises:
            json.JSONDecodeError: If the reply contains no valid JSON.
            KeyError: If a required key (or the language) is missing.
            TypeError: If a value has an unexpected type.
        """
        try:
            podcast_data = self._parse_reply(content)
            # Use language-specific role names.
            role_names = self._ROLE_NAMES[config.LANGUAGE if language is None else language]

            processed_lines = []
            # Process opening.
            processed_lines.extend(
                self._render_dialogs(podcast_data['podcast']['opening'], role_names)
            )
            # Process main content.
            for article in podcast_data['podcast']['main_content']:
                processed_lines.extend(
                    self._render_dialogs(article['discussion'], role_names)
                )
            # Process closing.
            processed_lines.extend(
                self._render_dialogs(podcast_data['podcast']['closing'], role_names)
            )
            return '\n'.join(processed_lines)
        except json.JSONDecodeError as e:
            logging.error(f"JSON parsing failed, invalid format: {str(e)}")
            raise
        except KeyError as e:
            logging.error(f"JSON structure incomplete, missing required keys: {str(e)}")
            raise
        except TypeError as e:
            logging.error(f"JSON structure invalid, unexpected value type: {str(e)}")
            raise

    def _post_process_single_article(self, content, language=None):
        """Convert a single-article reply into the final script text.

        Same as ``_post_process_content`` except that every main-content
        entry must carry a ``section`` title, emitted as a
        ``=== <section> ===`` marker, and a marker precedes the closing.

        Args:
            content: Raw reply text containing the podcast JSON.
            language: ``"en"`` or ``"zh"``; defaults to ``config.LANGUAGE``.

        Returns:
            The script as newline-joined lines.

        Raises:
            json.JSONDecodeError: If the reply contains no valid JSON.
            KeyError: If a required key (or the language) is missing.
            TypeError: If a value has an unexpected type.
        """
        try:
            podcast_data = self._parse_reply(content)
            # Use language-specific role names.
            role_names = self._ROLE_NAMES[config.LANGUAGE if language is None else language]

            processed_lines = []
            # Opening remarks.
            processed_lines.extend(
                self._render_dialogs(podcast_data['podcast']['opening'], role_names)
            )
            # Main content, one titled section at a time.
            for section in podcast_data['podcast']['main_content']:
                processed_lines.append(f"\n=== {section['section']} ===\n")
                processed_lines.extend(
                    self._render_dialogs(section['discussion'], role_names)
                )
            # Closing remarks.
            # NOTE(review): this marker is Chinese ("summary") even when the
            # language is "en"; kept unchanged because consumers of the script
            # may match on it - confirm before localising.
            processed_lines.append("\n=== 总结 ===\n")
            processed_lines.extend(
                self._render_dialogs(podcast_data['podcast']['closing'], role_names)
            )
            return '\n'.join(processed_lines)
        except json.JSONDecodeError as e:
            logging.error(f"JSON parsing failed, invalid format: {str(e)}")
            raise
        except KeyError as e:
            logging.error(f"JSON structure incomplete, missing required keys: {str(e)}")
            raise
        except TypeError as e:
            logging.error(f"JSON structure invalid, unexpected value type: {str(e)}")
            raise