-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDeslopTool.py
100 lines (80 loc) · 4.56 KB
/
DeslopTool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import json
from pathlib import Path
def load_jsonl(file_path):
data = []
# Open file with UTF-8 encoding and error handling
with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
for line in file:
try:
data.append(json.loads(line))
except json.JSONDecodeError as e:
print(f"Skipping invalid JSON line: {line}. Error: {e}")
return data
def load_filter_criteria(filter_files):
filter_criteria = []
for filter_file in filter_files:
# Open filter files with UTF-8 encoding
with open(filter_file, 'r', encoding='utf-8', errors='replace') as f:
filter_criteria.extend(line.strip() for line in f if line.strip())
return filter_criteria
def filter_conversations(conversations, filter_criteria, threshold=None):
filtered_conversations = []
total_matched_phrases = 0 # Total number of matched phrases
removed_conversation_count = 0 # Count of removed conversation keys
# Calculate total matched phrases in each conversation
matched_counts = []
for conversation in conversations:
total_phrases_in_conversation = 0 # Total matched phrases in this conversation
for msg in conversation.get("conversations", []):
# Check for valid "from" and "value" keys, and handle None safely
if msg.get("from") == "gpt" and msg.get("value"):
# Count matched phrases
matched_phrases = [phrase for phrase in filter_criteria if phrase in msg["value"]]
matched_count = len(matched_phrases) # Count matched phrases in the current message
total_phrases_in_conversation += matched_count # Accumulate total matched phrases
matched_counts.append(total_phrases_in_conversation) # Store matched count for the current conversation
total_matched_phrases += total_phrases_in_conversation # Count matched phrases overall
# Check if total matched phrases is zero; if so, return original conversations
if total_matched_phrases == 0:
return conversations, 0, total_matched_phrases, 0
# Calculate average matched phrases per conversation
average_matched_phrases = total_matched_phrases / len(conversations) if conversations else 0
# Apply threshold filtering based on the average if a threshold is provided
for idx, conversation in enumerate(conversations):
if threshold is not None and matched_counts[idx] >= average_matched_phrases * threshold:
removed_conversation_count += 1 # Increment removed conversation count
else:
filtered_conversations.append(conversation)
filtered_count = removed_conversation_count # Total filtered conversations
return filtered_conversations, filtered_count, total_matched_phrases, removed_conversation_count
def write_filtered_jsonl(filtered_data, output_file_path):
# Write to file with UTF-8 encoding
with open(output_file_path, 'w', encoding='utf-8', errors='replace') as file:
for conversation in filtered_data:
json.dump(conversation, file, ensure_ascii=False)
file.write('\n') # Write a newline after each JSON object
def filter_dataset(dataset_file, filter_files, threshold=None):
# Load filter criteria
filter_criteria = load_filter_criteria(filter_files)
# Load dataset
data = load_jsonl(dataset_file)
# Filter conversations
filtered_data, filtered_count, total_matched_phrases, removed_conversation_count = filter_conversations(data, filter_criteria, threshold)
# Create output folder in the same directory as the main app
output_folder = Path(__file__).parent / "deslopped" # Use __file__ to get the current script's directory
output_folder.mkdir(exist_ok=True) # Create folder if it doesn't exist
# Prepare output file path
dataset_name = Path(dataset_file).stem # Get the stem (name without extension)
output_file_path = output_folder / f"{dataset_name}_deslopped.jsonl" # Create new output file name
# Write filtered conversations to the output file
write_filtered_jsonl(filtered_data, output_file_path)
# Prepare output message
output_message = (
f"Total original conversations: {len(data)}\n"
f"Total filtered conversations: {filtered_count}\n"
f"Remaining conversations after filtering: {len(filtered_data)}\n"
f"Total matched phrases: {total_matched_phrases}\n"
f"Total conversations removed: {removed_conversation_count}\n"
f"Filtered output written to: {output_file_path}\n"
)
return output_message