-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use uuid5 associated with file SHA1 instead of randomly generated uuid4 (#11)
- Loading branch information
Showing 5 changed files with 171 additions and 23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import io | ||
import pathlib | ||
import hashlib | ||
from uuid import UUID, uuid5, NAMESPACE_DNS | ||
|
||
NAMESPACE_STR = 'github.com/hv0905/NekoImageGallery' | ||
|
||
|
||
def generate(file_input: pathlib.Path | io.BytesIO) -> UUID: | ||
namespace_uuid = uuid5(NAMESPACE_DNS, NAMESPACE_STR) | ||
if isinstance(file_input, pathlib.Path): | ||
with open(file_input, 'rb') as f: | ||
file_content = f.read() | ||
elif isinstance(file_input, io.BytesIO): | ||
file_input.seek(0) | ||
file_content = file_input.read() | ||
else: | ||
raise ValueError("Unsupported file type. Must be pathlib.Path or io.BytesIO.") | ||
file_hash = hashlib.sha1(file_content).hexdigest() | ||
return uuid5(namespace_uuid, file_hash) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,47 +1,82 @@ | ||
import uuid | ||
from datetime import datetime | ||
from pathlib import Path | ||
from shutil import copy2 | ||
from uuid import uuid4 | ||
|
||
import PIL | ||
from PIL import Image | ||
from loguru import logger | ||
|
||
from app.Models.img_data import ImageData | ||
from app.Services.provider import index_service | ||
from app.Services.provider import index_service, db_context | ||
from app.config import config | ||
from .local_utility import gather_valid_files | ||
from app.util import generate_uuid | ||
from .local_utility import gather_valid_files, fetch_path_uuid_list | ||
|
||
overall_count = 0 | ||
|
||
async def copy_and_index(file_path: Path): | ||
|
||
async def copy_and_index(file_path: Path, uuid_str: str | None = None):
    """Index one image file and copy it into the static file directory.

    Args:
        file_path: Path of the image to index.
        uuid_str: Pre-computed point id for this file; when None, a
            deterministic id is derived from the file content via
            generate_uuid.generate, so the same image always gets
            the same id.

    Failures (unreadable image, indexing error) are logged and swallowed
    so a batch run can continue with the remaining files.
    """
    global overall_count
    overall_count += 1
    logger.info("[{}] Indexing {}", str(overall_count), str(file_path))
    try:
        img = Image.open(file_path)
    except PIL.UnidentifiedImageError as e:
        logger.error("Error when opening image {}: {}", file_path, e)
        return
    image_id = uuid.UUID(uuid_str) if uuid_str else generate_uuid.generate(file_path)
    img_ext = file_path.suffix
    imgdata = ImageData(id=image_id,
                        url=f'/static/{image_id}{img_ext}',
                        index_date=datetime.now(),
                        local=True)
    try:
        # This has already been checked for duplicates, so there's no need to double-check.
        await index_service.index_image(img, imgdata, allow_overwrite=True)
    except Exception as e:
        logger.error("Error when processing image {}: {}", file_path, e)
        return
    # copy to static
    copy2(file_path, Path(config.static_file.path) / f'{image_id}{img_ext}')
|
||
|
||
async def copy_and_index_batch(file_path_list: list[tuple[Path, str]]):
    """Index every (path, uuid) pair in *file_path_list*, one at a time."""
    for path, point_uuid in file_path_list:
        await copy_and_index(path, uuid_str=point_uuid)
|
||
|
||
@logger.catch()
async def main(args):
    """Entry point for local indexing.

    Scans args.local_index_target_dir for image files and indexes them.
    When the database already contains points, files are processed in
    batches of 5000 and any file whose deterministic uuid is already
    present in the database is skipped.
    """
    root = Path(args.local_index_target_dir)
    static_path = Path(config.static_file.path)
    static_path.mkdir(exist_ok=True)
    # First, check if the database is empty
    item_number = await db_context.get_counts(exact=False)
    if item_number == 0:
        # database is empty, do as usual
        logger.warning("The database is empty, Will not check for duplicate points.")
        for item in gather_valid_files(root):
            await copy_and_index(item)
    else:
        # database is not empty, check for duplicate points
        logger.warning("The database is not empty, Will check for duplicate points.")
        for batch in gather_valid_files(root, max_files=5000):
            path_uuid_pairs = fetch_path_uuid_list(batch)
            batch_size = len(path_uuid_pairs)
            uuid_list = [file_uuid for _, file_uuid in path_uuid_pairs]
            duplicate_uuids = set(await db_context.validate_ids(uuid_list))
            if duplicate_uuids:
                # Drop every local file whose uuid is already in the database.
                path_uuid_pairs = [pair for pair in path_uuid_pairs
                                   if pair[1] not in duplicate_uuids]
            logger.info("Found {} duplicate points, of which {} are duplicates in the database. "
                        "The remaining {} points will be indexed.",
                        batch_size - len(path_uuid_pairs), len(duplicate_uuids),
                        len(path_uuid_pairs))
            await copy_and_index_batch(path_uuid_pairs)

    logger.success("Indexing completed! {} images indexed", overall_count)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,39 @@ | ||
import itertools | ||
from pathlib import Path | ||
|
||
from loguru import logger | ||
|
||
from app.util import generate_uuid | ||
|
||
def gather_valid_files(root: Path, pattern: str = '**/*.*', max_files=None):
    """Yield image files under *root* matching *pattern*.

    Args:
        root: Directory to scan.
        pattern: Glob pattern; recursive by default.
        max_files: When None, yield pathlib.Path items one by one.
            Otherwise yield lists of at most *max_files* paths, which lets
            callers bound per-batch work (e.g. duplicate-id queries).

    Files with an unsupported extension are skipped with a warning.
    """
    valid_extensions = {'.jpg', '.png', '.jpeg', '.jfif', '.webp', '.gif'}

    def file_generator():
        for file in root.glob(pattern):
            if file.suffix.lower() in valid_extensions:
                yield file
            else:
                # Lazy loguru {}-args formatting, consistent with the other
                # log calls in this module (renders the same message).
                logger.warning("Unsupported file type: {}. Skipping file: {}", file.suffix, file)

    def generator():
        gen = file_generator()
        if max_files is None:
            yield from gen
        else:
            while True:
                batch = list(itertools.islice(gen, max_files))
                if not batch:
                    break
                yield batch

    return generator()
|
||
|
||
def calculate_uuid(file_path: Path) -> str:
    """Return the deterministic content-derived uuid of *file_path* as a string."""
    file_uuid = generate_uuid.generate(file_path)
    return str(file_uuid)
|
||
|
||
def fetch_path_uuid_list(file_path: Path | list[Path]) -> list[tuple[Path, str]]:
    """Pair each given path with its content-derived uuid string.

    Accepts either a single Path (treated as a one-element list) or a
    list of Paths; returns (path, uuid-string) tuples in input order.
    """
    if isinstance(file_path, Path):
        file_path = [file_path]
    return [(path, calculate_uuid(path)) for path in file_path]