-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCoqDog_ContextFinder_from_html.py
39 lines (30 loc) · 1.38 KB
/
CoqDog_ContextFinder_from_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
"""
Auther: @Amer N. Tahat, Collins Aerospace, This is a legacy coqdog code for the early desktop version of coqdog, test coq html items extraction.
"""
def extract_coq_context_from_html(file_path):
"""
Extracts coq context (lemmas, proofs, definitions, etc.) from the given HTML file.
Args:
- file_path (str): Path to the HTML file.
Returns:
- List of strings: Extracted coq contexts.
"""
with open(file_path, 'r', encoding='utf-8') as file:
soup = BeautifulSoup(file, 'html.parser')
# Extract all text from the HTML
raw_text = soup.get_text()
# Tokenize based on known delimiters
delimiters = ["Lemma", "Proof", "Qed.", "Notation", ":=", "Fixedpoint", "Inductive", "Axiom", "Ltac", ".\n\n"]
contexts = []
start = 0
for delimiter in delimiters:
while raw_text.find(delimiter, start) != -1:
idx = raw_text.find(delimiter, start)
if idx != start:
context = raw_text[start:idx + len(delimiter)]
contexts.append(context.strip())
start = idx + len(delimiter)
return contexts
# Let's test the function on a sample file path
sample_file_path = "/mnt/data/sample_coq.html" # Placeholder path for testing
# extract_coq_context_from_html(sample_file_path) # Uncommenting this would execute on the sample path