-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathvalidate_markdown_metadata.py
133 lines (116 loc) · 5.32 KB
/
validate_markdown_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import sys
import yaml
import re
# Helper method that ensures the file has both a starting and ending '---'
def has_yaml_header(file_content):
    """Report whether the text contains at least two '---' delimiter lines.

    A delimiter line is a line made up of exactly three dashes and nothing
    else. The delimiters may appear anywhere in the text; only their count
    is checked.

    >>> has_yaml_header("---\\nTest: Yes\\n---")
    True
    >>> has_yaml_header("Test: No")
    False
    >>> has_yaml_header("---\\nProblem: about to be too many dashes\\n----")
    False
    """
    # One opening and one closing delimiter are needed to enclose a header.
    delimiter_count = sum(
        1 for _ in re.finditer(r"^---$", file_content, re.MULTILINE)
    )
    return delimiter_count >= 2
# This method is extracted from Portmap
# and it should stay consistent with it if it needs updating
def extract_yaml_and_body(file_content):
    """Split a markdown document into its parsed YAML header and its body.

    The header is the text between the first two lines that consist solely
    of '---'. Anything before the opening delimiter is discarded; everything
    after the closing delimiter (including later '---' lines) is the body.

    Returns a ``(parsed_yaml, body_text)`` tuple.
    Raises ValueError when the document has no delimited header.

    >>> extract_yaml_and_body("---\\nTest: Data\\nPart: Deux\\n---\\nSeparate this body part\\n")
    ({'Test': 'Data', 'Part': 'Deux'}, 'Separate this body part\\n')
    """
    # Differs slightly from the Portmap method: a missing header raises
    # ValueError so callers can report it.
    if not has_yaml_header(file_content):
        raise ValueError("The file does not have a valid YAML header.")

    # A single shared iterator lets each phase resume where the last stopped.
    remaining = iter(file_content.split("\n"))

    # Phase 1: skip everything up to and including the opening delimiter.
    for text_line in remaining:
        if text_line == "---":
            break

    # Phase 2: gather header lines until the closing delimiter.
    header_lines = []
    for text_line in remaining:
        if text_line == "---":
            break
        header_lines.append(text_line)

    # Phase 3: whatever is left belongs to the body.
    body_lines = list(remaining)

    parsed_header = yaml.safe_load('\n'.join(header_lines))
    return parsed_header, '\n'.join(body_lines)
# Checks if a field is of the expected type
def is_field_valid_type(field_name, field_value, expected_type, file_path, errors):
    """Check that a frontmatter value has one of the accepted types.

    Args:
        field_name: Name of the field, used in the error message.
        field_value: The value to check.
        expected_type: A type, or a tuple of types when several are allowed
            (e.g. ``(str, list)``).
        file_path: Path of the file being validated, used in the message.
        errors: List that receives an error message on a type mismatch.

    Returns:
        True when the value has an accepted type, False otherwise (in which
        case one message has been appended to ``errors``).
    """
    if isinstance(field_value, expected_type):
        return True

    # Build a user-friendly description of the accepted type(s).
    if isinstance(expected_type, tuple):
        type_label = " or ".join(t.__name__ for t in expected_type)
    else:
        type_label = expected_type.__name__

    errors.append(f"'{field_name}' must be a {type_label} in {file_path}")
    return False
# Validates the required fields in the frontmatter
def validate_fields(frontmatter, file_path, errors):
    """Validate presence and type of the required frontmatter fields.

    Required fields and their accepted types:
        title         str
        datatype      str (no lists allowed)
        sources       str or list
        destinations  str or list

    Args:
        frontmatter: The parsed YAML header. Expected to be a dict; anything
            else (for example ``None`` from an empty header, or a bare
            scalar) is reported as an error instead of raising.
        file_path: Path of the file being validated, used in messages.
        errors: List that receives one message per problem found.
    """
    # An empty or scalar header does not parse to a mapping; report it
    # rather than failing on the membership tests below.
    if not isinstance(frontmatter, dict):
        errors.append(f" - frontmatter must be a YAML mapping in {file_path}")
        return

    required_fields = (
        ("title", str),
        ("datatype", str),
        ("sources", (str, list)),
        ("destinations", (str, list)),
    )
    for field_name, expected_type in required_fields:
        if field_name not in frontmatter:
            errors.append(f" - '{field_name}' is missing in {file_path}")
        else:
            is_field_valid_type(
                field_name, frontmatter[field_name], expected_type, file_path, errors
            )
# Checks if a specified YAML field ends with a comma
def does_field_end_with_comma(field, yaml_body):
    """Return True if the line ``<field>: ...`` ends with a comma.

    Only the field's own line is inspected: the line must start with
    ``<field>:`` and its last non-whitespace character must be a comma.

    Args:
        field: Field name to look for; matched literally at line start.
        yaml_body: Text to search, one field per line.

    Returns:
        True when such a line exists, False otherwise.
    """
    # '.' and '[^\S\n]' never match a newline, so the match cannot spill
    # onto the following line (an empty "title:" followed by "other: x,"
    # must not be reported as a trailing comma on "title").
    trailing_comma = re.compile(
        r"^" + re.escape(field) + r":.*,[^\S\n]*$", re.MULTILINE
    )
    return trailing_comma.search(yaml_body) is not None
def validate_frontmatter(file_path):
    """Validate the YAML frontmatter of a markdown file.

    Args:
        file_path: Path of the markdown file to check.

    Returns:
        The string "True" when no problem is found. Otherwise a string
        describing the problem(s): the collected validation messages joined
        by newlines, or a single "YAML Error in ..." / "Error in ..." line
        when parsing or reading fails. This function does not raise.
    """
    errors = []
    try:
        # Pin the encoding so decoding does not depend on the platform locale.
        with open(file_path, 'r', encoding="utf-8") as f:
            content = f.read()

        # Extract frontmatter and body using the Portmap method.
        try:
            frontmatter, _ = extract_yaml_and_body(content)
        except ValueError as ve:
            errors.append(str(ve))
            return "\n".join(errors)

        # Validate the extracted frontmatter.
        validate_fields(frontmatter, file_path, errors)

        # NOTE(review): the comma scan runs over the whole file content, so a
        # body line that starts with "<field>:" and ends in a comma is also
        # reported - confirm whether only the header should be scanned.
        fields_to_check = ['title', 'datatype', 'sources', 'destinations']
        comma_errors = [
            field for field in fields_to_check
            if does_field_end_with_comma(field, content)
        ]
        if comma_errors:
            errors.append(f"Trailing comma found in fields: {', '.join(comma_errors)} in {file_path}")

        if errors:
            return "\n".join(errors)
        return "True"
    except yaml.YAMLError as e:
        # YAML syntax errors raised while parsing the header.
        return f"YAML Error in {file_path}: {e}"
    except Exception as e:
        # Top-level boundary: report anything else (unreadable file,
        # unexpected structure, ...) as text instead of crashing.
        return f"Error in {file_path}: {e}"
if __name__ == "__main__":
    # The script takes the markdown file path as its only argument; exit
    # with a usage message instead of an IndexError traceback if it is absent.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <path-to-markdown-file>")
    # Prints "True" when the file is valid, otherwise the error message(s).
    print(validate_frontmatter(sys.argv[1]))