Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelthwan committed Mar 9, 2023
1 parent 8a74cfb commit 34e22e1
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 11 deletions.
26 changes: 16 additions & 10 deletions src/FrontendService.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,42 @@
import re
from urllib.parse import urlparse

from SemanticSearchService import BatchOpenAISemanticSearchService
from Util import setup_logger

logger = setup_logger('FootnoteService')




class FrontendService:
def __init__(self, config, response_text, gpt_input_text_df):
    """Store the config, the GPT response text, and the source-text frame
    trimmed to only the columns the frontend consumes."""
    self.config = config
    self.response_text = response_text
    # url_id is already included here, so the old "TODO: add url_id" note was stale.
    used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope']
    self.gpt_input_text_df = gpt_input_text_df[used_columns]


def get_data_json(self, response_text, gpt_input_text_df):
def create_response_json_object(text, type):
    """Wrap one fragment of the response as a typed, JSON-serialisable dict.

    NOTE(review): the parameter name ``type`` shadows the builtin; kept for
    interface compatibility with existing positional callers.
    """
    # Keys mirror what the frontend expects: {"text": ..., "type": ...}.
    return dict(text=text, type=type)

def create_source_json_object(footnote, domain, url, title, text):
    """Bundle one cited source's display fields into a JSON-serialisable dict."""
    field_names = ("footnote", "domain", "url", "title", "text")
    field_values = (footnote, domain, url, title, text)
    return dict(zip(field_names, field_values))

def get_response_json(response_text):
# find reference in text & re-order
def reorder_url_id(response_text, gpt_input_text_df):
    """Renumber citation footnotes like ``[3]`` to consecutive ids 1..k, in
    order of first appearance in ``response_text``, and return the matching
    in-scope source rows with their ``url_id`` remapped the same way.

    Returns:
        (response_text, in_scope_source_df): the renumbered text and a copy
        of the rows of ``gpt_input_text_df`` that are both cited and in scope.
    """
    # Distinct cited ids, in order of first appearance (dict.fromkeys dedupes
    # while preserving order).
    url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))]
    url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))

    # Single-pass substitution. The previous chained str.replace() loop was
    # buggy: applying '[3]'->'[1]' and then '[1]'->'[2]' clobbered the freshly
    # written '[1]' tokens whenever an old id coincided with an earlier new id.
    # Every bracketed number in the text is a key of url_id_map by construction.
    response_text = re.sub(
        r'\[([0-9]+)\]',
        lambda m: f'[{url_id_map[int(m.group(1))]}]',
        response_text)

    # Keep only the rows that are actually cited AND flagged in scope, then
    # renumber their url_id to the new consecutive ids.
    in_scope_source_df = gpt_input_text_df[
        gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy()
    in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)
    return response_text, in_scope_source_df

def get_response_json(response_text):
response_json = []
split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', response_text)

Expand All @@ -37,12 +45,9 @@ def get_response_json(response_text):
response_json.append(create_response_json_object(sentence, "footnote"))
else:
response_json.append(create_response_json_object(sentence, "response"))
return response_json, url_id_map
return response_json

def get_source_json(gpt_input_text_df, url_id_map):
# include only sources used in response_text & remap url_id
in_scope_source_df = gpt_input_text_df[gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy()
in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)
def get_source_json(in_scope_source_df):
in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int)
in_scope_source_df.sort_values('docno', inplace=True)
source_text_list = []
Expand All @@ -64,8 +69,9 @@ def get_source_json(gpt_input_text_df, url_id_map):
source_json = sorted(source_json, key=lambda x: x['footnote'])
return source_json, source_text

response_json, url_id_map = get_response_json(response_text)
source_json, source_text = get_source_json(gpt_input_text_df, url_id_map)
response_text, in_scope_source_df = reorder_url_id(response_text, gpt_input_text_df)
response_json = get_response_json(response_text)
source_json, source_text = get_source_json(in_scope_source_df)

return source_text, {'response_json': response_json, 'source_json': source_json}

Expand Down
4 changes: 3 additions & 1 deletion src/Util.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ def setup_logger(tag):


def post_process_gpt_input_text_df(gpt_input_text_df, prompt_length_limit):
# clean out of prompt texts
# clean out of prompt texts for existing [1], [2], [3]... in the source_text
gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))

gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x))
gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()
max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1
gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank # In order to get also the row slightly larger than prompt_length_limit
# reorder url_id using only the urls that are in scope.
url_id_list = gpt_input_text_df['url_id'].unique()
url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map)
Expand Down

0 comments on commit 34e22e1

Please sign in to comment.