LinguisticProbabilisticExperiment.py
import torch
import numpy as np
from tqdm import tqdm # For progress bars
import pandas as pd
from scipy.stats import entropy
from transformers import AutoTokenizer
# Assuming LinguisticQuantumPerturber is in perturber.py
from perturber import LinguisticQuantumPerturber


class LinguisticProbabilisticExperiment:
    def __init__(self, perturber, model_name="distilbert-base-uncased"):
        """
        Initializes the experiment class.

        Args:
            perturber (LinguisticQuantumPerturber): An instance of the perturbation class.
            model_name (str): Name of the Hugging Face model whose tokenizer is loaded
                for encoding sentences when computing KL divergence.
        """
        self.perturber = perturber
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)  # Required for KL divergence
        self.model = perturber.model  # Access the model from the perturber
        self.results = []  # Store experimental results

    def load_data(self, data_source, max_sentences=None):
        """
        Loads sentences from a data source.

        Args:
            data_source (str or list): Either a path to a text file (one sentence
                per line) or a list of strings (sentences).
            max_sentences (int, optional): Maximum number of sentences to load.

        Returns:
            list: A list of sentences (strings).
        """
        sentences = []
        if isinstance(data_source, str):  # File path
            try:
                with open(data_source, 'r', encoding='utf-8') as f:
                    for line in f:
                        sentence = line.strip()
                        if sentence:  # Skip empty lines
                            sentences.append(sentence)
                        if max_sentences is not None and len(sentences) >= max_sentences:
                            break
            except FileNotFoundError:
                print(f"Error: File not found: {data_source}")
                return []  # Return empty list on error
        elif isinstance(data_source, list):  # List of sentences
            sentences = data_source[:max_sentences] if max_sentences is not None else data_source
        else:
            print("Error: Invalid data_source type. Must be a file path (str) or a list of sentences.")
            return []
        return sentences

    def _calculate_kl_divergence(self, original_logits, perturbed_logits):
        """Calculates KL divergence between two probability distributions."""
        # Convert logits to probabilities using softmax.
        original_probs = torch.nn.functional.softmax(original_logits, dim=-1)
        perturbed_probs = torch.nn.functional.softmax(perturbed_logits, dim=-1)
        # Calculate KL(original || perturbed). The small epsilon on the second
        # distribution guards against division by zero; scipy's entropy()
        # re-normalizes its inputs, so the epsilon does not bias the result.
        # Note: this assumes both logit tensors have the same shape.
        kl_div = entropy(original_probs.cpu().numpy(), perturbed_probs.cpu().numpy() + 1e-12)
        return kl_div
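
    # A minimal torch-only alternative (a sketch, not wired into the pipeline
    # above): log-softmax plus F.kl_div avoids the numpy round-trip and the
    # ad-hoc epsilon. With input=log Q, target=log P and log_target=True,
    # F.kl_div sums P * (log P - log Q), i.e. KL(P || Q).
    def _calculate_kl_divergence_torch(self, original_logits, perturbed_logits):
        original_log_probs = torch.nn.functional.log_softmax(original_logits, dim=-1)
        perturbed_log_probs = torch.nn.functional.log_softmax(perturbed_logits, dim=-1)
        kl = torch.nn.functional.kl_div(
            perturbed_log_probs, original_log_probs,
            reduction="sum", log_target=True
        )
        return kl.item()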

    def _get_model_output(self, sentence):
        """Gets model outputs (logits and attentions) for a sentence."""
        inputs = self.tokenizer(
            sentence, return_tensors="pt", add_special_tokens=True,
            truncation=True, padding=True
        ).to(self.perturber.device)
        with torch.no_grad():
            outputs = self.model(**inputs, output_attentions=True)
        return outputs.logits, outputs.attentions

    def run_experiment(self, sentences, perturbation_types, num_perturbations=1,
                       similarity_threshold=0.2, frequency_threshold_factor=2.0,
                       batch_size=32):
        """
        Runs the perturbation experiment.

        Args:
            sentences (list): A list of sentences (strings).
            perturbation_types (list): List of perturbation types (e.g., ['synonym', 'antonym']).
            num_perturbations (int): Number of words to perturb per sentence.
            similarity_threshold (float): Forwarded to perturber.perturb_sentence.
            frequency_threshold_factor (float): Forwarded to perturber.perturb_sentence.
            batch_size (int): The number of sentences to process at a time.
        """
        self.results = []  # Clear previous results
        for i in tqdm(range(0, len(sentences), batch_size), desc="Processing sentences"):
            batch_sentences = sentences[i:i + batch_size]
            # Get original model outputs for the batch. Note: each sentence is
            # still run through the model individually; batch_size only controls
            # how sentences are grouped for iteration.
            original_logits_batch = []
            original_attentions_batch = []
            for sentence in batch_sentences:
                original_logits, original_attentions = self._get_model_output(sentence)
                original_logits_batch.append(original_logits)
                original_attentions_batch.append(original_attentions)  # Kept for later attention-shift analysis
            # Now, perturb each sentence in the batch.
            for j, original_sentence in enumerate(batch_sentences):
                original_logits = original_logits_batch[j]  # Logits for the current sentence
                for perturbation_type in perturbation_types:
                    perturbed_sentences = self.perturber.perturb_sentence(
                        original_sentence,
                        perturbation_type,
                        num_perturbations=num_perturbations,
                        similarity_threshold=similarity_threshold,
                        frequency_threshold_factor=frequency_threshold_factor
                    )
                    for perturbed_sentence in perturbed_sentences:
                        # Get model output for the perturbed sentence.
                        perturbed_logits, perturbed_attentions = self._get_model_output(perturbed_sentence)
                        # Calculate KL divergence (squeeze(0) removes the batch dimension).
                        kl_divergence = self._calculate_kl_divergence(
                            original_logits.squeeze(0), perturbed_logits.squeeze(0)
                        )
                        # Calculate semantic similarity between the two sentences.
                        semantic_similarity = self.perturber.compute_semantic_similarity(
                            original_sentence, perturbed_sentence
                        )
                        # Store the results.
                        result = {
                            'original_sentence': original_sentence,
                            'perturbed_sentence': perturbed_sentence,
                            'perturbation_type': perturbation_type,
                            'kl_divergence': kl_divergence,
                            'semantic_similarity': semantic_similarity,
                            # Attention-shift metrics can be added here later.
                        }
                        self.results.append(result)
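
    # A hedged sketch of one possible attention-shift metric (nothing above
    # defines one yet; the helper name and approach are illustrative
    # assumptions). Summarizing an attention stack by its mean row entropy
    # yields a single scalar per sentence, which sidesteps aligning original
    # and perturbed sequences of different lengths.
    def _attention_entropy(self, attentions):
        # attentions: tuple of (batch, heads, seq_len, seq_len) tensors, one per layer.
        layer_entropies = []
        for layer_attn in attentions:
            probs = layer_attn.clamp_min(1e-12)
            row_entropy = -(probs * probs.log()).sum(dim=-1)  # (batch, heads, seq_len)
            layer_entropies.append(row_entropy.mean().item())
        return float(np.mean(layer_entropies))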

    def get_results_dataframe(self):
        """Returns the results as a pandas DataFrame."""
        return pd.DataFrame(self.results)

    def analyze_results(self):
        """Provides basic statistical analysis of the results."""
        if not self.results:
            print("No results to analyze. Run an experiment first.")
            return
        df = self.get_results_dataframe()
        # Group by perturbation type; mean/std for KL divergence and similarity.
        grouped_results = df.groupby('perturbation_type').agg({
            'kl_divergence': ['mean', 'std', 'count'],
            'semantic_similarity': ['mean', 'std']
        })
        print(grouped_results)
        return grouped_results
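

# A minimal visualization sketch (an assumption: matplotlib is not imported
# above and is not required by the experiment itself). Box plots of KL
# divergence per perturbation type complement the grouped statistics from
# analyze_results().
def plot_kl_by_perturbation(results_df):
    import matplotlib.pyplot as plt
    results_df.boxplot(column='kl_divergence', by='perturbation_type')
    plt.ylabel('KL divergence')
    plt.suptitle('')  # Drop the automatic pandas group-by subtitle
    plt.title('KL divergence by perturbation type')
    plt.show()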


# Example usage
if __name__ == '__main__':
    # Create a perturber instance (you can customize its parameters).
    perturber = LinguisticQuantumPerturber()
    # Create an experiment instance.
    experiment = LinguisticProbabilisticExperiment(perturber)
    # Load some sentences (replace with your own data source)...
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "This is an example sentence for testing.",
        "Artificial intelligence is a fascinating field.",
        "The cat sat on the mat.",
        "She sells seashells by the seashore."
    ]
    # ...or load from a file:
    # sentences = experiment.load_data("sentences.txt", max_sentences=100)

    # Run the experiment.
    experiment.run_experiment(
        sentences,
        perturbation_types=['synonym', 'antonym', 'related', 'random', 'mlm'],
        num_perturbations=1
    )

    # Get the results as a DataFrame.
    results_df = experiment.get_results_dataframe()
    print(results_df.head())

    # Analyze the results.
    experiment.analyze_results()

    # Save to CSV.
    results_df.to_csv("experiment_results.csv", index=False)