Skip to content

Commit

Permalink
Address information leakage issue; delete Chinese annotations (#22) (#23)
Browse files Browse the repository at this point in the history

Co-authored-by: YuxuanLei <[email protected]>
Co-authored-by: v-leiyuxuan <[email protected]>
  • Loading branch information
3 people authored Mar 26, 2024
1 parent 230786e commit 8126b72
Show file tree
Hide file tree
Showing 10 changed files with 36 additions and 329 deletions.
8 changes: 4 additions & 4 deletions InteRecAgent/demonstration/tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@


parser = argparse.ArgumentParser()
parser.add_argument("--demo_dir_or_file", type=str, default="/home/v-huangxu/work/gen_demos")
parser.add_argument("--demo_dir_or_file", type=str, default="./work/gen_demos")
parser.add_argument("--save", type=str, default="./tagged/")
args, _ = parser.parse_known_args()

Expand Down Expand Up @@ -62,10 +62,10 @@ def extract_tags(file_path):
tags = re.findall(pattern, content)
return tags

# 示例用法
file_path = '/home/v-huangxu/work/LLM4CRS/tagged/tag_cache.txt'
# Example usage
file_path = './work/LLM4CRS/tagged/tag_cache.txt'
tags = extract_tags(file_path)
# print(tags) # 输出:['y', 'n', 'Y', ...]
# print(tags) # Output:['y', 'n', 'Y', ...]


examples = load_examples(args.demo_dir_or_file)
Expand Down
2 changes: 1 addition & 1 deletion InteRecAgent/llm4crs/demo/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def fit_domain(self, examples: List[Dict], domain: str):


if __name__ == "__main__":
selector = DemoSelector("/home/v-huangxu/work/LLM4CRS/demonstration/gen_demos/2023-06-28-08_53_56.jsonl", k=3)
selector = DemoSelector("./LLM4CRS/demonstration/gen_demos/2023-06-28-08_53_56.jsonl", k=3)
request = "I want some farming games."

demo_prompt = selector(request)
Expand Down
65 changes: 4 additions & 61 deletions InteRecAgent/preprocess/movies.ipynb

Large diffs are not rendered by default.

98 changes: 2 additions & 96 deletions InteRecAgent/preprocess/prepare_amazon.ipynb

Large diffs are not rendered by default.

14 changes: 3 additions & 11 deletions InteRecAgent/preprocess/prepare_steam.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@
"from bs4 import BeautifulSoup \n",
" \n",
"def remove_html_tags(text): \n",
" \"\"\"删除文本中的HTML标签\"\"\" \n",
" \"\"\"Remove HTML tags from text\"\"\" \n",
" soup = BeautifulSoup(text, \"html.parser\") \n",
" return soup.get_text() \n",
" \n",
Expand Down Expand Up @@ -1374,17 +1374,9 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sucessfully saved into /home/v-huangxu/blob/raw_datasets/steam/chatbot/simulator_test_data_900_230816.jsonl.\n"
]
}
],
"outputs": [],
"source": [
"from typing import *\n",
"import json, pickle\n",
Expand Down
138 changes: 2 additions & 136 deletions InteRecAgent/preprocess/preprocess_redial.ipynb

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions RecExplainer/preprocess/data_preprocess_amazon.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def get_interaction(datas):
user_seq[user].append((item, time))

for user, item_time in user_seq.items():
item_time.sort(key=lambda x: x[1]) # 对各个数据集得单独排序
item_time.sort(key=lambda x: x[1])
items = []
for t in item_time:
items.append(t[0])
Expand All @@ -234,15 +234,15 @@ def check_Kcore(user_items, user_core, item_core):
for item, num in item_count.items():
if num < item_core:
return user_count, item_count, False
return user_count, item_count, True # 已经保证Kcore
return user_count, item_count, True # Already guaranteed Kcore

def filter_Kcore(user_items, user_core, item_core):
# 循环过滤 K-core,过滤掉不满足K-core的user和item
# Loop filter K-core, filter out users and items that do not meet K-core
user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
while not isKcore:
cur_user_items = copy.deepcopy(user_items)
for user, num in user_count.items():
if user_count[user] < user_core: # 直接把user 删除
if user_count[user] < user_core: # Delete the user
cur_user_items.pop(user)
else:
for item in user_items[user]:
Expand All @@ -265,7 +265,7 @@ def id_map(user_items): # user_items dict
item_id = 1
final_data = {}
random_user_list = list(user_items.keys())
random.shuffle(random_user_list) # user 随机打乱后重新编码
random.shuffle(random_user_list) # user is shuffled and re-encoded
for user in random_user_list:
items = user_items[user]
if user not in user2id:
Expand Down
2 changes: 1 addition & 1 deletion RecLM-gen/scripts/rl_merge.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ CUDA_VISIBLE_DEVICES=8 python main.py \
--RL_actor_lora_a 2 \
--RL_critic_lora_r 4 \
--RL_critic_lora_a 2 \
--RL_load /home/lws/projects/RecLM-gen/snap/ICR_SubMovie_Title64T_0_Llama7bChat_LCT_E40_CCR2_SCG2-0.5_IDX/RL_Total_train_LM-True_VM-False_NR-20.1_SN-2_Q-False_T6_FG-True_LR-5e-06_LDO-0.0_WD-0.0_KLC-0.3_EW-0.01_RS-False_RW-True_VFC-0.1_KLT-0.05_LRP-2.0_GAMMA-0.99_GAS-4_LB-1_RA_0.5_/4800step_RL \
--RL_load snap/ICR_SubMovie_Title64T_0_Llama7bChat_LCT_E40_CCR2_SCG2-0.5_IDX/RL_Total_train_LM-True_VM-False_NR-20.1_SN-2_Q-False_T6_FG-True_LR-5e-06_LDO-0.0_WD-0.0_KLC-0.3_EW-0.01_RS-False_RW-True_VFC-0.1_KLT-0.05_LRP-2.0_GAMMA-0.99_GAS-4_LB-1_RA_0.5_/4800step_RL \
--lm_head_full_tune \
--FA2

22 changes: 11 additions & 11 deletions RecLM-gen/unirec/asyc_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,30 +163,30 @@ async def model_runner(self):
self.queue_lock = asyncio.Lock(loop=app.loop)
self.needs_processing = asyncio.Event(loop=app.loop)
logger.info("started model runner for {}".format(self.model_name))
# while True 无限循环,程序会处于监听状态
# while True: Infinite loop, the program will be in a listening state
while True:
# 等待有任务来
# Waiting for a task to come
await self.needs_processing.wait()
self.needs_processing.clear()
# 清空计时器
# Clear timer
if self.needs_processing_timer is not None:
self.needs_processing_timer.cancel()
self.needs_processing_timer = None
# 处理队列都开启锁
# All processing queues are locked
async with self.queue_lock:
# 如果队列不为空则设置最长等待时间
# If the queue is not empty, set the maximum waiting time
if self.queue:
longest_wait = app.loop.time() - self.queue[0]["time"]
else: # oops
longest_wait = None
# 日志记录启动处理,队列大小,等待时间
# Logger start processing
logger.debug("launching processing. queue size: {}. longest wait: {}".format(len(self.queue), longest_wait))
# 获取一个批次的数据
# Get a batch of data
to_process = self.queue[:MAX_BATCH_SIZE]
# 然后把这些数据从任务队列中删除
# delete these data from the task queue
del self.queue[:len(to_process)]
self.schedule_processing_if_needed()
# 生成批数据
# Generate batch data
# print(to_process)
if len(to_process) == 0:
continue
Expand All @@ -196,11 +196,11 @@ async def model_runner(self):
'item_seq': torch.stack([t["item_id_list"] for t in to_process], dim=0),
}
# print(batch_data)
# 在一个单独的线程中运行模型,然后返回结果
# Run the model in a separate thread and return the results
scores = await app.loop.run_in_executor(
None, functools.partial(self.run_model, batch_data)
)
# 记录结果并设置一个完成事件
# Log the results and set a completion event
for t, s in zip(to_process, scores):
t["score"] = s
t["done_event"].set()
Expand Down
6 changes: 3 additions & 3 deletions RecLM-gen/utils/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,15 +223,15 @@ def __init__(self, model_name='', port=8000) -> None:
self.client = None
self.max_wrong_time = 2
self.port = port
self.model_name = 'gpt-3.5-turbo-1106' if 'gpt-3.5-turbo-1106' in model_name else model_name
self.model_name = 'gpt-3.5' if 'gpt-3.5' in model_name else model_name
self.init_client()
print(f'use model of {self.model_name}')

def init_client(self):
self.client = OpenAI(
api_key='sk-E9oyiDL777ZaNZdRrzRSPzsbvbqvhebRl2xiTheKjh6bE4Jx' if self.model_name == 'gpt-3.5-turbo-1106' else 'EMPTY',
api_key='xxx' if self.model_name == 'gpt-3.5' else 'EMPTY',
max_retries=self.max_wrong_time,
base_url='https://openkey.cloud/v1' if self.model_name == 'gpt-3.5-turbo-1106' else f'http://127.0.0.1:{self.port}/v1'
base_url='https://xxx.xxx/v1' if self.model_name == 'gpt-3.5' else f'http://127.0.0.1:{self.port}/v1'
)

def call(self, content, t=0.0):
Expand Down

0 comments on commit 8126b72

Please sign in to comment.