"""
coq_data_prep_definitions_from_coq_to_csv.py

Author: @Amer N. Tahat, Collins Aerospace, 2023. All rights reserved.
Legacy code from the early desktop version of CoqDog, also used by the web app,
to extract items from Coq source files and convert them into a CSV used for
clustering the data into chunks.
"""
import re
import csv
import os
import zipfile
import pandas as pd

# Regex patterns for lemmas, definitions, and proofs
lemma_pattern = re.compile(r"Lemma\s+(\w+)\s*:(.*?)(?:Proof|Qed|Defined|\.)(?!\s*with)", re.DOTALL)
definition_pattern = re.compile(
    r"Definition\s+(\w+)([^.]*?)(?:\.\s*(?:\n|\Z|(?=\s*Proof)|(?=\s*Lemma)|(?=\s*Definition)))", re.DOTALL)
proof_pattern = re.compile(r"Proof\s*\.(.*?)(?:Qed|Defined)\.", re.DOTALL)  # proof body between "Proof." and "Qed."/"Defined."
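
# Illustrative sanity check (not part of the original pipeline): a made-up Coq
# fragment showing what each pattern is intended to capture. The expected captures
# noted below are approximate; exact whitespace may differ.
_sample_coq = """
Definition double (n : nat) := n + n.
Lemma double_pos : forall n, 0 < n -> 0 < double n.
Proof.
  intros n H. unfold double. lia.
Qed.
"""
# lemma_pattern.findall(_sample_coq)      ~> [('double_pos', ' forall n, 0 < n -> 0 < double n')]
# definition_pattern.findall(_sample_coq) ~> [('double', ' (n : nat) := n + n')]
# proof_pattern.findall(_sample_coq)      ~> ['  intros n H. unfold double. lia.'] (with surrounding newlines)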

# Function to extract lemmas, definitions, and proofs using the patterns
def extract_items(file_content, lemma_pattern, definition_pattern, proof_pattern):
    lemmas = lemma_pattern.findall(file_content)
    definitions = definition_pattern.findall(file_content)
    proofs = proof_pattern.findall(file_content)
    return lemmas, definitions, proofs
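
# Hedged usage sketch: running the extractor over the illustrative fragment above.
# Because proof_pattern has a single capture group, findall returns plain strings
# for proofs, while the lemma and definition patterns return (name, body) tuples.
_sample_lemmas, _sample_definitions, _sample_proofs = extract_items(
    _sample_coq, lemma_pattern, definition_pattern, proof_pattern)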

# Function to integrate lemmas, definitions, and their proofs for CSV writing
def extract_all_items_to_csv(zip_path, csv_output_path, subdir, lemma_pattern, definition_pattern, proof_pattern):
    output_data = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Extract the archive into a temporary directory, then read the Coq files from the subdirectory
        temp_extract_folder = "/mnt/data/coq_files_extracted"
        zip_ref.extractall(temp_extract_folder)
        subdir_path = os.path.join(temp_extract_folder, subdir)
        for coq_file in os.listdir(subdir_path):
            if coq_file.endswith('.v'):
                file_path = os.path.join(subdir_path, coq_file)
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    file_content = f.read()
                lemmas, definitions, proofs = extract_items(file_content, lemma_pattern, definition_pattern,
                                                            proof_pattern)
                for lemma, lemma_body in lemmas:
                    output_data.append([lemma, lemma_body.strip(), 1, "Lemma"])
                    # If a proof is still queued, pair it with this lemma
                    # (findall with one capture group yields plain strings)
                    if proofs:
                        proof_body = proofs.pop(0)
                        output_data.append([lemma + "_proof", proof_body.strip(), 1, "Proof"])
                for definition_name, definition_body in definitions:
                    output_data.append([definition_name, definition_body.strip(), 2, "Definition"])
                    # If a proof is still queued, pair it with this definition
                    if proofs:
                        proof_body = proofs.pop(0)
                        output_data.append([definition_name + "_proof", proof_body.strip(), 2, "Proof"])
    with open(csv_output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["item-name", "item-body", "label_int", "label"])
        for row in output_data:
            writer.writerow(row)
    return csv_output_path

csv_output_path = '/path/to/output/coq_data.csv'  # Specify your output path here
output_path_all_items = extract_all_items_to_csv('/path/to/your/zip/file.zip', csv_output_path, 'src', lemma_pattern,
                                                 definition_pattern, proof_pattern)
# Preview the first 30 rows of the combined output
df_combined_corrected_first_30 = pd.read_csv(output_path_all_items).head(30)
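
# For orientation only (hypothetical values, not real output): rows written by
# extract_all_items_to_csv follow this schema.
#   item-name        | item-body                        | label_int | label
#   double_pos       | forall n, 0 < n -> 0 < double n  | 1         | Lemma
#   double_pos_proof | intros n H. unfold double. lia.  | 1         | Proof
#   double           | (n : nat) := n + n               | 2         | Definition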

####
"""The second version below mixes the two categories; it only works reliably for the definitions."""
lemma_pattern = re.compile(r"Lemma\s+(\w+)\s*:(.*?)(?:Proof|Qed|Defined|\.)(?!\s*with)", re.DOTALL)
proof_pattern = re.compile(r"Proof\s*\.(.*?)(?:Qed|Defined)\.", re.DOTALL)

# Extract lemmas, definitions, and proofs using the patterns
def extract_items(file_content, lemma_pattern, definition_pattern, proof_pattern):
    lemmas = lemma_pattern.findall(file_content)
    definitions = definition_pattern.findall(file_content)
    proofs = proof_pattern.findall(file_content)
    return lemmas, definitions, proofs

# Function to integrate lemmas, definitions, and their proofs for CSV writing
def extract_all_items_to_csv(zip_path, csv_output_path, subdir, lemma_pattern, definition_pattern, proof_pattern):
    output_data = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Extract all the files into a temporary directory
        temp_extract_folder = "/mnt/data/coq_files_extracted"
        zip_ref.extractall(temp_extract_folder)
        subdir_path = os.path.join(temp_extract_folder, subdir)

        # Process each Coq file within the subdirectory
        for coq_file in os.listdir(subdir_path):
            if coq_file.endswith('.v'):
                file_path = os.path.join(subdir_path, coq_file)
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    file_content = f.read()
                lemmas, definitions, proofs = extract_items(file_content, lemma_pattern, definition_pattern,
                                                            proof_pattern)

                # Integrate lemmas, definitions, and their proofs
                for lemma, lemma_body in lemmas:
                    output_data.append([lemma, lemma_body.strip(), 1, "Lemma"])
                    # If a proof exists for this lemma, add it (findall yields plain strings here)
                    if proofs:
                        proof_body = proofs.pop(0)
                        output_data.append([lemma + "_proof", proof_body.strip(), 0, "Proof"])
                for definition_name, definition_body in definitions:
                    output_data.append([definition_name, definition_body.strip(), 2, "Definition"])
                    # If a proof exists for this definition, add it
                    if proofs:
                        proof_body = proofs.pop(0)
                        output_data.append([definition_name + "_proof", proof_body.strip(), 0, "Proof"])

    # Write data to CSV
    with open(csv_output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write header
        writer.writerow(["item-name", "item-body", "label_int", "label"])
        # Write extracted data
        for row in output_data:
            writer.writerow(row)
    return csv_output_path

# Extract data from the Coq files using the updated patterns and write to CSV
# (the script defines no separate second pattern, so definition_pattern is reused here)
output_path_all_items = extract_all_items_to_csv('/mnt/data/src.zip', csv_output_path, 'src', lemma_pattern,
                                                 definition_pattern, proof_pattern)

# Display the first 10 rows of the updated CSV with all items
df_all_items = pd.read_csv(output_path_all_items)
df_all_items.head(10)
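
# Optional follow-up (a minimal sketch, not part of the original script): a quick
# look at how many items landed in each category before clustering the data.
print(df_all_items["label"].value_counts())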