alttexter-ghclient.py
import asyncio
import base64
import glob
import json
import logging
import os
import re

import aiohttp

from ghutils import GitHubHandler, RateLimiter

logging.basicConfig(level=logging.DEBUG,
                    format='%(levelname)s [%(asctime)s] %(message)s',
                    datefmt='%d-%m-%Y %H:%M:%S')

SUPPORTED_FILE_EXTENSIONS = ('.md', '.mdx', '.ipynb')
SUPPORTED_IMAGE_EXTENSIONS = ('.png', '.gif', '.jpg', '.jpeg', '.webp')

include_svg = os.getenv('ALTTEXTER_INCLUDE_SVG', '0').lower() in ['true', '1']
if include_svg:
    logging.info("SVG files are included as supported image extensions.")
    SUPPORTED_IMAGE_EXTENSIONS += ('.svg',)


class ImageMetadataUpdater:
    """
    Handles retrieval and updating of image metadata in markdown files.
    """
    def __init__(self):
        pass

    # Pattern to identify markdown image syntax: group 1 = alt text,
    # group 2 = path/URL, groups 3-4 = optional title in double or single quotes.
    IMAGE_PATTERN = re.compile(r'!\[(.*?)\]\((.*?)\s*(?:\"(.*?)\"|\'(.*?)\'|\))?\)')
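
    # Illustrative matches for IMAGE_PATTERN (comments only, not executed):
    #   ![diagram](images/arch.png "System overview")
    #       -> alt='diagram', path='images/arch.png', title='System overview'
    #   ![logo](https://example.com/logo.png)
    #       -> alt='logo', path='https://example.com/logo.png', no title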

    async def check_url_content_type(self, url, file_path, session):
        """
        Asynchronously checks the content type of the specified URL to determine if it points to a supported image type.

        Args:
            url (str): The URL of the image to check.
            file_path (str): Path of the file containing the URL, used for logging purposes.
            session (aiohttp.ClientSession): The session object used for making HTTP requests.

        Returns:
            bool: True if the content type of the URL is a supported image type; False otherwise.
        """
        headers = {'User-Agent': 'Mozilla/5.0'}

        # Limit the download size: only the Content-Type header is needed to
        # verify that the URL points to a supported image type.
        max_content_length = 1024

        extension_to_mime = {
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.png': 'image/png',
            '.gif': 'image/gif',
            '.webp': 'image/webp',
            '.svg': 'image/svg+xml'
        }
        supported_mime_types = [extension_to_mime[ext] for ext in SUPPORTED_IMAGE_EXTENSIONS if ext in extension_to_mime]

        try:
            # Async GET request to inspect the Content-Type header, used as a
            # last resort when the URL has no recognizable file extension.
            async with session.get(url, headers=headers, allow_redirects=True) as response:
                if response.status == 200:
                    content_type = response.headers.get('Content-Type', '')
                    await response.content.read(max_content_length)
                    return any(content_type.startswith(mime) for mime in supported_mime_types)
                return False
        except Exception as e:
            logging.error(f"[{file_path}] Error checking content type for URL {url}: {e}")
            return False
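
    # Example (illustrative): an extensionless URL such as
    # https://example.com/image that responds with 'Content-Type: image/png'
    # is accepted; one that responds with 'text/html' is rejected.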

    async def extract_image_paths(self, markdown_content, file_path, session):
        """
        Extracts and categorizes image paths from markdown content into local images and image URLs.

        Args:
            markdown_content (str): The markdown content to be scanned for images.
            file_path (str): The path of the markdown file being processed, used for logging.
            session (aiohttp.ClientSession): The session object used for making HTTP requests to check URL content types.

        Returns:
            tuple: A tuple containing two lists - the first list contains tuples of local image paths and their metadata,
                   and the second list contains tuples of image URLs and their metadata.
        """
        images = self.IMAGE_PATTERN.findall(markdown_content)
        local_images = []
        image_urls = []

        # Parse the image details: alt text, path, and optional title.
        for image_info in images:
            logging.debug(f"[{file_path}] Regex match: {image_info}")
            alt_text, image_path = image_info[:2]
            title_double, title_single = image_info[2:4]
            title = title_double or title_single

            clean_path, _, _ = image_path.partition('?')
            _, ext = os.path.splitext(clean_path)

            if not title:
                title = None

            # Local image files
            if not clean_path.startswith(('http://', 'https://')):
                if ext.lower() in SUPPORTED_IMAGE_EXTENSIONS:
                    local_images.append((alt_text, clean_path, title))
                else:
                    logging.info(f"[{file_path}] Unsupported local image type: {image_path}")
                continue

            # URLs: accept a supported extension outright; otherwise fall back
            # to checking the Content-Type of the response.
            if (ext and ext.lower() in SUPPORTED_IMAGE_EXTENSIONS) or \
                    (not ext and await self.check_url_content_type(clean_path, file_path, session)):
                image_urls.append((alt_text, image_path, title))
            elif ext and ext.lower() not in SUPPORTED_IMAGE_EXTENSIONS:
                logging.info(f"[{file_path}] Unsupported image URL extension: {clean_path}")
            else:
                logging.info(f"[{file_path}] URL does not point to a supported image type: {clean_path}")

        logging.info(f"[{file_path}] Total local images found: {len(local_images)}, Total image URLs found: {len(image_urls)}")
        return local_images, image_urls
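
    # Example return value from extract_image_paths (illustrative):
    #   local_images = [('diagram', 'images/arch.png', 'System overview')]
    #   image_urls   = [('logo', 'https://example.com/logo.png', None)]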

    def encode_images(self, images, base_dir, file_path, repo_root):
        """
        Encodes local images specified in the images list to base64 strings.

        Args:
            images (list): A list of tuples representing local images to be encoded. Each tuple contains the image's alt text, path, and optional title.
            base_dir (str): The base directory of the markdown file being processed.
            file_path (str): The path of the markdown file, used for logging.
            repo_root (str): The root directory of the repository containing the markdown file.

        Returns:
            dict: A dictionary mapping the image paths to their base64 encoded strings.
        """
        encoded_images = {}

        # Encode local images to base64 strings, handling both repo-absolute
        # and relative paths.
        for _, image_path, _ in images:
            if image_path.startswith('/'):
                full_image_path = os.path.join(repo_root, image_path.lstrip('/'))
            else:
                full_image_path = os.path.join(base_dir, image_path)

            logging.debug(f"[{file_path}] Calculated full image path: {full_image_path}")

            if not os.path.exists(full_image_path):
                logging.debug(f"[{file_path}] Direct path not found, searching in repository: {image_path}")
                search_path = os.path.join(repo_root, '**', os.path.basename(image_path))
                if search_results := glob.glob(search_path, recursive=True):
                    full_image_path = search_results[0]  # Take the first match
                    logging.debug(f"[{file_path}] Found image at: {full_image_path}")
                else:
                    logging.info(f"[{file_path}] Image not found in repository: {image_path}")
                    continue

            if image_path.lower().endswith(SUPPORTED_IMAGE_EXTENSIONS):
                encoded_images[image_path] = self.encode_image(full_image_path)
            else:
                logging.info(f"[{file_path}] Unsupported image type: {image_path} (Full path: {full_image_path})")

        return encoded_images
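
    # Path resolution examples for encode_images (illustrative):
    #   '/static/img/a.png' -> <repo_root>/static/img/a.png
    #   'img/a.png'         -> <base_dir>/img/a.png
    #   neither exists      -> recursive glob for 'a.png' anywhere under repo_root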

    def encode_image(self, image_path):
        """
        Encodes a single image file located at the specified path to a base64 string.

        Args:
            image_path (str): The filesystem path to the image file.

        Returns:
            str: The base64 encoded string representation of the image file.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    async def get_image_metadata(self, session, markdown_content, file_path, encoded_images, image_urls, alttexter_endpoint, rate_limiter):
        """
        Asynchronously retrieves image metadata for both local and remote images specified in the markdown content.

        Args:
            session (aiohttp.ClientSession): HTTP session for making requests.
            markdown_content (str): Markdown content from which to extract image metadata.
            file_path (str): Path of the markdown file, used for logging.
            encoded_images (dict): Local images encoded in base64 format.
            image_urls (list): URLs of remote images.
            alttexter_endpoint (str): Endpoint URL for the ALTTEXTER service.
            rate_limiter (RateLimiter): Rate limiter for controlling request frequency.

        Returns:
            dict: A dictionary with a success flag and either the retrieved image metadata or an error message.
        """
        ALTTEXTER_TIMEOUT = 240
        response_structure = {"success": False, "data": {}}
        all_image_metadata = []

        try:
            batch_size = int(os.getenv('ALTTEXTER_BATCH_SIZE', '0'))
            if batch_size < 1:
                batch_size = None
        except ValueError:
            batch_size = None

        # Combine encoded images and image URLs, marking each with its type
        combined_images = [{'type': 'encoded', 'data': data, 'path': path} for path, data in encoded_images.items()] + \
                          [{'type': 'url', 'url': url, 'alt': alt, 'title': title} for alt, url, title in image_urls]

        if batch_size:
            logging.info(f"[{file_path}] Processing images in batches of size {batch_size}")
            batches = [combined_images[i:i + batch_size] for i in range(0, len(combined_images), batch_size)]
        else:
            logging.info(f"[{file_path}] Processing all images in a single batch")
            batches = [combined_images]  # No batching, process all in one go

        headers = {
            "Content-Type": "application/json",
            "X-API-Token": os.getenv('ALTTEXTER_TOKEN')
        }

        for batch_index, batch in enumerate(batches, start=1):
            # Separate the current batch back into encoded images and URLs
            batch_encoded_images = {item['path']: item['data'] for item in batch if item['type'] == 'encoded'}
            batch_image_urls = [item['url'] for item in batch if item['type'] == 'url']

            payload = {
                "text": markdown_content,
                "images": batch_encoded_images,
                "image_urls": batch_image_urls
            }

            await rate_limiter.acquire(file_path, batch_index, len(batches))

            try:
                async with session.post(alttexter_endpoint, json=payload, headers=headers, timeout=ALTTEXTER_TIMEOUT) as response:
                    response.raise_for_status()
                    batch_response = await response.json()
                    if isinstance(batch_response, dict) and 'images' in batch_response:
                        all_image_metadata.extend(batch_response['images'])
                        # Preserve the service's run_url (if provided) so the
                        # caller can link to it in the review comment.
                        if batch_response.get('run_url'):
                            response_structure["data"]['run_url'] = batch_response['run_url']
                    else:
                        logging.error(f"[{file_path}] Unexpected response structure for batch {batch_index}")
            except Exception as e:
                logging.error(f"[{file_path}] An error occurred while processing batch {batch_index}: {e}")

        if all_image_metadata:
            response_structure["data"]['images'] = all_image_metadata
            response_structure["success"] = True
        else:
            logging.error(f"[{file_path}] Failed to retrieve image metadata for all batches")

        return response_structure
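
    # Assumed per-batch service response shape (inferred from the parsing
    # above; the exact contract is defined by the ALTTEXTER service):
    #   {"images": [{"name": "<path-or-url>", "alt_text": "...", "title": "..."}],
    #    "run_url": "https://..."}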

    def update_image_metadata(self, markdown_content, image_metadata, base_dir, repo_root, is_ipynb):
        """
        Updates the markdown content with new alt text and title attributes for images based on the provided metadata.

        Args:
            markdown_content (str): The original markdown content.
            image_metadata (list): A list of metadata for each image.
            base_dir (str): The base directory of the markdown file.
            repo_root (str): The root directory of the repository.
            is_ipynb (bool): Flag indicating if the file is a Jupyter Notebook.

        Returns:
            tuple: The updated markdown content and a list of images that were not updated due to missing metadata.
        """
        images_not_updated = []

        def replacement(match):
            groups = match.groups()
            _, image_path = groups[:2]
            title_double, title_single = groups[2:4] if len(groups) == 4 else ("", "")
            image_title = title_double or title_single or ""

            if image_path.startswith(('http://', 'https://')):
                full_path = image_path
            elif image_path.startswith('/'):
                full_path = os.path.join(repo_root, image_path.lstrip('/'))
            else:
                full_path = os.path.join(base_dir, image_path)

            metadata = next((img for img in image_metadata if img['name'] in [full_path, image_path]), None)
            if metadata:
                alt_text = metadata["alt_text"]
                title = metadata.get("title", image_title or "")
                # Notebooks get single-quoted titles, presumably to avoid
                # escaping double quotes inside the notebook's JSON source.
                if is_ipynb:
                    return f"![{alt_text}]({image_path} '{title}')"
                else:
                    return f'![{alt_text}]({image_path} "{title}")'
            else:
                images_not_updated.append(image_path)
                return match.group(0)

        # Replace the existing image markdown syntax with updated alt text and title attributes
        updated_markdown_content = self.IMAGE_PATTERN.sub(replacement, markdown_content)
        return updated_markdown_content, images_not_updated
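
    # Illustrative: given metadata {'name': 'images/arch.png',
    # 'alt_text': 'Service diagram', 'title': 'Architecture'},
    #   ![old alt](images/arch.png)
    # becomes
    #   ![Service diagram](images/arch.png "Architecture")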


async def process_file(session, file, alttexter_endpoint, github_handler, metadata_updater, rate_limiter):
    """
    Processes a markdown file to update image metadata.

    Args:
        session (aiohttp.ClientSession): HTTP session.
        file (GitHub ContentFile): File to process.
        alttexter_endpoint (str): Metadata service URL.
        github_handler (GitHubHandler): GitHub API manager.
        metadata_updater (ImageMetadataUpdater): Metadata updater.
        rate_limiter (RateLimiter): API request rate limiter.
    """
    file_path = file.filename
    alttexter_filter = os.getenv('ALTTEXTER_FILTER', '')

    # Skip processing if the file does not match the filter criteria.
    if alttexter_filter:
        with open(file_path, 'r', encoding='utf-8') as md_file:
            file_content = md_file.read()

        # The filter may be a JSON list of regex strings or a single regex string.
        try:
            filters = json.loads(alttexter_filter)
            if not isinstance(filters, list):  # Not a list: treat it as a single string filter
                filters = [alttexter_filter]
        except json.JSONDecodeError:
            filters = [alttexter_filter]

        all_filters_matched = True
        for filter_str in filters:
            pattern = re.compile(rf'{filter_str.strip()}')
            if not pattern.search(file_content):
                all_filters_matched = False
                break

        if not all_filters_matched:
            logging.info(f"[{file_path}] Does not match all filter criteria. Skipping.")
            return
        else:
            logging.info(f"[{file_path}] Matches all filter criteria. Proceeding with processing.")

    logging.info(f"[{file_path}] Starting to process file")

    if not file_path.lower().endswith(SUPPORTED_FILE_EXTENSIONS):
        logging.info(f"[{file_path}] Skipping non-markdown formatted file")
        return

    try:
        with open(file_path, 'r', encoding='utf-8') as md_file:
            markdown_content = md_file.read()

        local_images, image_urls = await metadata_updater.extract_image_paths(markdown_content, file_path, session)
        base_dir = os.path.dirname(file_path)
        repo_root = os.getcwd()
        encoded_images = metadata_updater.encode_images(local_images, base_dir, file_path, repo_root)
        is_ipynb = file_path.lower().endswith('.ipynb')

        # Skip the service call when every image already has both alt text and a title.
        local_complete_check = all(alt and title for alt, _, title in local_images)
        url_complete_check = all(alt and title for alt, _, title in image_urls)
        if local_complete_check and url_complete_check:
            logging.info(f"[{file_path}] No update needed")
            return

        response = await metadata_updater.get_image_metadata(session, markdown_content, file_path, encoded_images, image_urls, alttexter_endpoint, rate_limiter)
        if not response["success"]:
            logging.error(f"[{file_path}] Failed to get a response from ALTTEXTER_ENDPOINT")
            github_handler.post_comment(f"Failed to get a response from the ALTTEXTER_ENDPOINT for file `{file_path}`. Please check the logs for more details.")
            return

        image_metadata = response["data"].get('images', [])
        updated_content, images_not_updated = metadata_updater.update_image_metadata(
            markdown_content, image_metadata, base_dir, repo_root, is_ipynb
        )

        if updated_content != markdown_content:
            logging.info(f"[{file_path}] Writing updated metadata to file")
            with open(file_path, 'w', encoding='utf-8') as md_file:
                md_file.write(updated_content)

            commit_message = f"Update image alt and title attributes in {file_path}"
            if github_handler.commit_and_push([file_path], commit_message):
                review_message = "Please check the LLM generated alt-text and title attributes in this file as they may contain inaccuracies."
                if run_url := response["data"].get('run_url'):
                    review_message += f" [Explore how the LLM generated them.]({run_url})"
                github_handler.post_generic_review_comment(file_path, review_message)
            else:
                logging.error(f"[{file_path}] Failed to commit and push changes")
        elif images_not_updated:
            logging.info(f"[{file_path}] Some images were not updated: {images_not_updated}")
        else:
            logging.info(f"[{file_path}] Metadata is already up to date")
    except Exception as e:
        logging.error(f"[{file_path}] Error processing file: {e}")
        github_handler.post_comment(f"Error processing file `{file_path}`")
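
# Example ALTTEXTER_FILTER values (illustrative, as parsed above):
#   ALTTEXTER_FILTER='status: draft'       -> single regex; must match the file content
#   ALTTEXTER_FILTER='["tag-a", "tag-b"]'  -> JSON list; every regex must match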


async def main():
    """
    Main asynchronous function to run the script.
    """
    rate_limit = int(os.getenv('ALTTEXTER_RATEMINUTE'))
    rate_limiter = RateLimiter(rate=rate_limit, per=60)

    repo_name = os.getenv('GITHUB_REPOSITORY')
    pr_number = int(os.getenv('PR_NUMBER'))
    alttexter_endpoint = os.getenv('ALTTEXTER_ENDPOINT')
    silent_mode = os.getenv('ALTTEXTER_SILENTMODE', '0').lower() in ['true', '1']

    github_handler = GitHubHandler(repo_name, pr_number, silent_mode)
    metadata_updater = ImageMetadataUpdater()

    async with aiohttp.ClientSession() as session:
        files = github_handler.pr.get_files()
        logging.debug(f"Files in PR: {[file.filename for file in files]}")

        # Only added or modified markdown-formatted files are processed.
        tasks = [
            asyncio.create_task(process_file(session, file, alttexter_endpoint, github_handler, metadata_updater, rate_limiter))
            for file in files
            if file.filename.lower().endswith(SUPPORTED_FILE_EXTENSIONS) and file.status in ['added', 'modified']
        ]
        logging.debug(f"Processing tasks: {tasks}")
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())
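
# Environment variables read by this script (values below are placeholders):
#   GITHUB_REPOSITORY       owner/repo of the pull request
#   PR_NUMBER               pull request number
#   ALTTEXTER_ENDPOINT      URL of the alt-text generation service
#   ALTTEXTER_TOKEN         API token sent as X-API-Token
#   ALTTEXTER_RATEMINUTE    max service requests per minute
#   ALTTEXTER_BATCH_SIZE    optional; images per request (unset/0 = one batch)
#   ALTTEXTER_FILTER        optional; regex or JSON list of regexes a file must match
#   ALTTEXTER_INCLUDE_SVG   optional; 'true'/'1' to treat .svg as supported
#   ALTTEXTER_SILENTMODE    optional; 'true'/'1' passed to GitHubHandler
#
# Example invocation (a sketch; not from the repository's docs):
#   GITHUB_REPOSITORY=owner/repo PR_NUMBER=123 \
#   ALTTEXTER_ENDPOINT=https://alttexter.example/api ALTTEXTER_TOKEN=... \
#   ALTTEXTER_RATEMINUTE=10 python alttexter-ghclient.py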