Skip to content

Commit

Permalink
Address information leakage issue; delete Chinese annotations (#22) (#23)
Browse files Browse the repository at this point in the history

Co-authored-by: YuxuanLei <[email protected]>
Co-authored-by: v-leiyuxuan <[email protected]>
  • Loading branch information
3 people authored Mar 26, 2024
1 parent 230786e commit 8126b72
Show file tree
Hide file tree
Showing 10 changed files with 36 additions and 329 deletions.
8 changes: 4 additions & 4 deletions InteRecAgent/demonstration/tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@


parser = argparse.ArgumentParser()
parser.add_argument("--demo_dir_or_file", type=str, default="/home/v-huangxu/work/gen_demos")
parser.add_argument("--demo_dir_or_file", type=str, default="./work/gen_demos")
parser.add_argument("--save", type=str, default="./tagged/")
args, _ = parser.parse_known_args()

Expand Down Expand Up @@ -62,10 +62,10 @@ def extract_tags(file_path):
tags = re.findall(pattern, content)
return tags

# 示例用法
file_path = '/home/v-huangxu/work/LLM4CRS/tagged/tag_cache.txt'
# Example usage
file_path = './work/LLM4CRS/tagged/tag_cache.txt'
tags = extract_tags(file_path)
# print(tags) # 输出:['y', 'n', 'Y', ...]
# print(tags) # Output:['y', 'n', 'Y', ...]


examples = load_examples(args.demo_dir_or_file)
Expand Down
2 changes: 1 addition & 1 deletion InteRecAgent/llm4crs/demo/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def fit_domain(self, examples: List[Dict], domain: str):


if __name__ == "__main__":
selector = DemoSelector("/home/v-huangxu/work/LLM4CRS/demonstration/gen_demos/2023-06-28-08_53_56.jsonl", k=3)
selector = DemoSelector("./LLM4CRS/demonstration/gen_demos/2023-06-28-08_53_56.jsonl", k=3)
request = "I want some farming games."

demo_prompt = selector(request)
Expand Down
65 changes: 4 additions & 61 deletions InteRecAgent/preprocess/movies.ipynb

Large diffs are not rendered by default.

98 changes: 2 additions & 96 deletions InteRecAgent/preprocess/prepare_amazon.ipynb

Large diffs are not rendered by default.

14 changes: 3 additions & 11 deletions InteRecAgent/preprocess/prepare_steam.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@
"from bs4 import BeautifulSoup \n",
" \n",
"def remove_html_tags(text): \n",
" \"\"\"删除文本中的HTML标签\"\"\" \n",
" \"\"\"Remove HTML tags from text\"\"\" \n",
" soup = BeautifulSoup(text, \"html.parser\") \n",
" return soup.get_text() \n",
" \n",
Expand Down Expand Up @@ -1374,17 +1374,9 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sucessfully saved into /home/v-huangxu/blob/raw_datasets/steam/chatbot/simulator_test_data_900_230816.jsonl.\n"
]
}
],
"outputs": [],
"source": [
"from typing import *\n",
"import json, pickle\n",
Expand Down
138 changes: 2 additions & 136 deletions InteRecAgent/preprocess/preprocess_redial.ipynb

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions RecExplainer/preprocess/data_preprocess_amazon.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def get_interaction(datas):
user_seq[user].append((item, time))

for user, item_time in user_seq.items():
item_time.sort(key=lambda x: x[1]) # 对各个数据集得单独排序
item_time.sort(key=lambda x: x[1])
items = []
for t in item_time:
items.append(t[0])
Expand All @@ -234,15 +234,15 @@ def check_Kcore(user_items, user_core, item_core):
for item, num in item_count.items():
if num < item_core:
return user_count, item_count, False
return user_count, item_count, True # 已经保证Kcore
return user_count, item_count, True # Already guaranteed Kcore

def filter_Kcore(user_items, user_core, item_core):
# 循环过滤 K-core,过滤掉不满足K-core的user和item
# Loop filter K-core, filter out users and items that do not meet K-core
user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
while not isKcore:
cur_user_items = copy.deepcopy(user_items)
for user, num in user_count.items():
if user_count[user] < user_core: # 直接把user 删除
if user_count[user] < user_core: # Delete the user
cur_user_items.pop(user)
else:
for item in user_items[user]:
Expand All @@ -265,7 +265,7 @@ def id_map(user_items): # user_items dict
item_id = 1
final_data = {}
random_user_list = list(user_items.keys())
random.shuffle(random_user_list) # user 随机打乱后重新编码
random.shuffle(random_user_list) # user is shuffled and re-encoded
for user in random_user_list:
items = user_items[user]
if user not in user2id:
Expand Down
2 changes: 1 addition & 1 deletion RecLM-gen/scripts/rl_merge.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ CUDA_VISIBLE_DEVICES=8 python main.py \
--RL_actor_lora_a 2 \
--RL_critic_lora_r 4 \
--RL_critic_lora_a 2 \
--RL_load /home/lws/projects/RecLM-gen/snap/ICR_SubMovie_Title64T_0_Llama7bChat_LCT_E40_CCR2_SCG2-0.5_IDX/RL_Total_train_LM-True_VM-False_NR-20.1_SN-2_Q-False_T6_FG-True_LR-5e-06_LDO-0.0_WD-0.0_KLC-0.3_EW-0.01_RS-False_RW-True_VFC-0.1_KLT-0.05_LRP-2.0_GAMMA-0.99_GAS-4_LB-1_RA_0.5_/4800step_RL \
--RL_load snap/ICR_SubMovie_Title64T_0_Llama7bChat_LCT_E40_CCR2_SCG2-0.5_IDX/RL_Total_train_LM-True_VM-False_NR-20.1_SN-2_Q-False_T6_FG-True_LR-5e-06_LDO-0.0_WD-0.0_KLC-0.3_EW-0.01_RS-False_RW-True_VFC-0.1_KLT-0.05_LRP-2.0_GAMMA-0.99_GAS-4_LB-1_RA_0.5_/4800step_RL \
--lm_head_full_tune \
--FA2

22 changes: 11 additions & 11 deletions RecLM-gen/unirec/asyc_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,30 +163,30 @@ async def model_runner(self):
self.queue_lock = asyncio.Lock(loop=app.loop)
self.needs_processing = asyncio.Event(loop=app.loop)
logger.info("started model runner for {}".format(self.model_name))
# while True 无限循环,程序会处于监听状态
# while True: Infinite loop, the program will be in a listening state
while True:
# 等待有任务来
# Waiting for a task to come
await self.needs_processing.wait()
self.needs_processing.clear()
# 清空计时器
# Clear timer
if self.needs_processing_timer is not None:
self.needs_processing_timer.cancel()
self.needs_processing_timer = None
# 处理队列都开启锁
# All processing queues are locked
async with self.queue_lock:
# 如果队列不为空则设置最长等待时间
# If the queue is not empty, set the maximum waiting time
if self.queue:
longest_wait = app.loop.time() - self.queue[0]["time"]
else: # oops
longest_wait = None
# 日志记录启动处理,队列大小,等待时间
# Logger start processing
logger.debug("launching processing. queue size: {}. longest wait: {}".format(len(self.queue), longest_wait))
# 获取一个批次的数据
# Get a batch of data
to_process = self.queue[:MAX_BATCH_SIZE]
# 然后把这些数据从任务队列中删除
# delete these data from the task queue
del self.queue[:len(to_process)]
self.schedule_processing_if_needed()
# 生成批数据
# Generate batch data
# print(to_process)
if len(to_process) == 0:
continue
Expand All @@ -196,11 +196,11 @@ async def model_runner(self):
'item_seq': torch.stack([t["item_id_list"] for t in to_process], dim=0),
}
# print(batch_data)
# 在一个单独的线程中运行模型,然后返回结果
# Run the model in a separate thread and return the results
scores = await app.loop.run_in_executor(
None, functools.partial(self.run_model, batch_data)
)
# 记录结果并设置一个完成事件
# Log the results and set a completion event
for t, s in zip(to_process, scores):
t["score"] = s
t["done_event"].set()
Expand Down
6 changes: 3 additions & 3 deletions RecLM-gen/utils/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,15 +223,15 @@ def __init__(self, model_name='', port=8000) -> None:
self.client = None
self.max_wrong_time = 2
self.port = port
self.model_name = 'gpt-3.5-turbo-1106' if 'gpt-3.5-turbo-1106' in model_name else model_name
self.model_name = 'gpt-3.5' if 'gpt-3.5' in model_name else model_name
self.init_client()
print(f'use model of {self.model_name}')

def init_client(self):
self.client = OpenAI(
api_key='sk-E9oyiDL777ZaNZdRrzRSPzsbvbqvhebRl2xiTheKjh6bE4Jx' if self.model_name == 'gpt-3.5-turbo-1106' else 'EMPTY',
api_key='xxx' if self.model_name == 'gpt-3.5' else 'EMPTY',
max_retries=self.max_wrong_time,
base_url='https://openkey.cloud/v1' if self.model_name == 'gpt-3.5-turbo-1106' else f'http://127.0.0.1:{self.port}/v1'
base_url='https://xxx.xxx/v1' if self.model_name == 'gpt-3.5' else f'http://127.0.0.1:{self.port}/v1'
)

def call(self, content, t=0.0):
Expand Down

0 comments on commit 8126b72

Please sign in to comment.