Skip to content

Commit ffe8da4

Browse files
author
Colin Leong
committed
MUCH faster citation check by simpler regex
1 parent 8662a6a commit ffe8da4

File tree

1 file changed

+28
-17
lines changed

1 file changed

+28
-17
lines changed

src/find_bare_citations.py

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,30 +20,36 @@ def extract_citation_keys(bib_file_path):
2020
citation_keys.append(match.group(1))
2121
return citation_keys
2222

def find_bare_citations(markdown_file_path: Path, citation_keys: list) -> list:
    """Yield citation keys that appear in a Markdown file without a leading ``@``.

    HTML comments and fenced code blocks are stripped from the file text
    before searching, so keys mentioned only inside them are ignored.

    NOTE: this is a generator. Despite the ``-> list`` annotation, nothing is
    searched until the result is iterated.

    Args:
        markdown_file_path: Markdown file to scan; read as UTF-8.
        citation_keys: Citation keys to look for. Each key is matched
            literally as a substring; empty keys are skipped.

    Yields:
        ``(citation_key, lines)`` tuples, one per key that has at least one
        bare occurrence. ``lines`` holds the full text of each line with a
        bare occurrence, in file order, each line listed once.
    """
    # Explicit encoding so results do not depend on the platform's locale.
    content = markdown_file_path.read_text(encoding="utf-8")

    # Remove HTML comments. regex from https://stackoverflow.com/a/28208465
    content = re.sub(r"<!--.*?-->", "", content, flags=re.DOTALL)

    # Remove Markdown code blocks, regex from https://stackoverflow.com/a/64116935
    markdown_code_block_pattern = re.compile(r'```[^`]*```', re.DOTALL)
    content = markdown_code_block_pattern.sub('', content)

    for citation_key in citation_keys:
        # An empty key would match at every position and flag every line.
        if not citation_key:
            continue

        key_pattern = re.compile(re.escape(citation_key))
        matches = []
        # finditer yields matches in increasing position, so occurrences on
        # the same line are consecutive; remembering the last reported line
        # start is enough to avoid listing a line twice.
        last_line_start = -1

        for match in key_pattern.finditer(content):
            start_index = match.start()

            # A key immediately preceded by "@" is a proper citation.
            if start_index > 0 and content[start_index - 1] == '@':
                continue

            # rfind returns -1 when there is no earlier newline, so +1
            # gives 0, the start of the first line.
            line_start = content.rfind('\n', 0, start_index) + 1
            if line_start == last_line_start:
                continue
            last_line_start = line_start

            line_end = content.find('\n', start_index)
            if line_end == -1:
                line_end = len(content)
            matches.append(content[line_start:line_end])

        if matches:
            yield citation_key, matches
46-
52+
4753

4854
if __name__ == "__main__":
4955

@@ -61,12 +67,16 @@ def find_bare_citations(markdown_file_path:Path, citation_keys:list)->list:
6167

6268
args = parser.parse_args()
6369

70+
print(f"Parsing {args.bib_file_path} for citations")
71+
extract_citations_start = timeit.default_timer()
6472
citation_keys = extract_citation_keys(args.bib_file_path)
73+
extract_citations_time = timeit.default_timer() - extract_citations_start
74+
print(f"Finding citations took {extract_citations_time} seconds")
75+
6576

66-
# for i in range(50):
67-
# citation_keys.extend([str(uuid.uuid4) for _ in range(100)])
77+
print(f"Bibliography had {len(citation_keys)} citations")
6878

69-
print(f"Bibliography had {len(citation_keys)} citations, checking for bare citations:")
79+
print(f"Beginning bare-citations check: checking {args.markdown_file_path}")
7080

7181
start_time = timeit.default_timer()
7282
issues = find_bare_citations(args.markdown_file_path, citation_keys)
@@ -81,7 +91,8 @@ def find_bare_citations(markdown_file_path:Path, citation_keys:list)->list:
8191
print(f"Citation key: {citation_key}")
8292

8393
for match in matches:
84-
print(f"* {match.group(0)}")
94+
print(match)
95+
# print(f"* {match.group(0)}")
8596

8697
# iff we've gotten here then issues exist and we should set return value to 1 at the end.
8798
issues_exist = True

0 commit comments

Comments
 (0)