From 99c07c815452be0d87aea323fbc2398abc33772e Mon Sep 17 00:00:00 2001 From: zyddnys Date: Sat, 16 Nov 2024 14:47:21 -0500 Subject: [PATCH 01/15] fix #741 --- manga_translator/__main__.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/manga_translator/__main__.py b/manga_translator/__main__.py index c4f580031..69f64d40d 100644 --- a/manga_translator/__main__.py +++ b/manga_translator/__main__.py @@ -55,16 +55,19 @@ async def dispatch(args: Namespace): else: # batch dest = args.dest for path in natural_sort(args.input): - # Apply pre-translation dictionaries - await translator.translate_path(path, dest, args_dict) - for textline in translator.textlines: - textline.text = translator.apply_dictionary(textline.text, pre_dict) - logger.info(f'Pre-translation dictionary applied: {textline.text}') - - # Apply post-translation dictionaries - for textline in translator.textlines: - textline.translation = translator.apply_dictionary(textline.translation, post_dict) - logger.info(f'Post-translation dictionary applied: {textline.translation}') + try : + # Apply pre-translation dictionaries + await translator.translate_path(path, dest, args_dict) + for textline in translator.textlines: + textline.text = translator.apply_dictionary(textline.text, pre_dict) + logger.info(f'Pre-translation dictionary applied: {textline.text}') + + # Apply post-translation dictionaries + for textline in translator.textlines: + textline.translation = translator.apply_dictionary(textline.translation, post_dict) + logger.info(f'Post-translation dictionary applied: {textline.translation}') + except Exception : + pass elif args.mode == 'web': from .server.web_main import dispatch From 796d6d3d3899e0af90a271c799e6321a0f2e24b3 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 17:09:56 +0900 Subject: [PATCH 02/15] wip --- gradio-app.py | 122 ++++++++++++++++++++++++++++ manga_translator/gradio/__init__.py | 43 ++++++++++ 2 files changed, 165 insertions(+) create mode 100644 gradio-app.py create mode 100644 manga_translator/gradio/__init__.py diff --git a/gradio-app.py b/gradio-app.py new file mode 100644 index 000000000..735b67ef0 --- /dev/null +++ b/gradio-app.py @@ -0,0 +1,122 @@ +import gradio as gr +import numpy as np + + +import dotenv +import logging +import asyncio +import manga_translator.detection as detection +import manga_translator.ocr as ocr +import manga_translator.textline_merge as textline_merge +import manga_translator.utils.generic as utils_generic +import manga_translator.utils.textblock as utils_textblock +from manga_translator.gradio import DetectionState, mit_detect_text_default_params +from typing import List, Optional, TypedDict + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +if gr.NO_RELOAD: + logging.basicConfig(level=logging.INFO, force=True) + for name in ["httpx"]: + logging.getLogger(name).setLevel(logging.WARN) + +dotenv.load_dotenv() + + +mit_ocr_default_params = dict( + ocr_key="48px", # recommended by rowland + # ocr_key="48px_ctc", + # ocr_key="mocr", # XXX: mocr may have different output format + # use_mocr_merge=True, + verbose=True, +) + + +class DetectionResult(TypedDict): + textlines: List[utils_generic.Quadrilateral] + mask_raw: np.ndarray + mask: np.ndarray | None + + +async def run_detection_single_image( + image: np.ndarray, detector_key: str +) -> DetectionResult: + print("image", image.shape) + textlines, mask_raw, mask = await detection.dispatch( + image=image, **{"detector_key": detector_key, **mit_detect_text_default_params} + ) + print("textlines", textlines) + print("mask_raw", mask_raw) + print("mask", mask) + return { + "textlines": textlines, + "mask_raw": mask_raw, + "mask": mask, + } + + +input_single_img = gr.Image(label="input image") +output_json = gr.JSON(label="output json") + +with gr.Blocks() as demo: + gr.Markdown( + """ +# manga-image-translator demo + """.strip() + ) + + detector_state = gr.State(DetectionState()) + + with gr.Row(): + with gr.Column(): + gr.Markdown("## Detection") + img_file = gr.Image(label="input image", height=256, width=256) + detector_key = gr.Radio( + choices=["default", "dbconvnext", "ctd", "craft", "none"], + label="detector key", + ) + + btn_detect = gr.Button("detect") + detector_state_dump = gr.TextArea( + label="detection state" # , value=lambda: repr(detector_state.value) + ) + with gr.Column(): + gr.Markdown("## OCR") + ocr_key = gr.Radio(choices=["48px", "48px_ctc", "mocr"], label="ocr key") + btn_ocr = gr.Button("ocr") + ocr_state_dump = gr.TextArea(label="ocr state") + + @btn_detect.click( + inputs=[detector_state, img_file, detector_key], + outputs=[detector_state, detector_state_dump], + ) + async def run_detector( + prev: DetectionState | gr.State, + img, + detector_key: Optional[str], + ): + # print("prev", prev) + prev_value = prev if isinstance(prev, DetectionState) else prev.value + logger.debug("run_detector %s %s", prev_value, type(img)) + + value = prev_value.copy(img=img) + + if detector_key: + value = value.copy( + args={**mit_detect_text_default_params, "detector_key": detector_key} + ) + + if value.img is not None and value.args is not None: + logger.debug("run inference") + textlines, mask_raw, mask = await detection.dispatch( + image=img, **value.args + ) + value = value.copy(textlines=textlines, mask_raw=mask_raw, mask=mask) + + logger.debug("run_detector result %s", value) + return value, repr(value) + + +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0") diff --git a/manga_translator/gradio/__init__.py b/manga_translator/gradio/__init__.py new file mode 100644 index 000000000..081ccd0da --- /dev/null +++ b/manga_translator/gradio/__init__.py @@ -0,0 +1,43 @@ +import manga_translator.textline_merge as textline_merge +import manga_translator.utils.generic as utils_generic +import manga_translator.utils.textblock as utils_textblock +from dataclasses import dataclass +from typing import List, Optional + +import numpy as np + + +mit_detect_text_default_params = dict( + detector_key="default", + # mostly defaults from manga-image-translator/args.py + detect_size=2560, + text_threshold=0.5, + box_threshold=0.7, + unclip_ratio=2.3, + invert=False, + device="cuda", + gamma_correct=False, + rotate=False, + verbose=True, +) + + +@dataclass(frozen=True, kw_only=True) +class DetectionState: + img: Optional[np.ndarray] = None + args: Optional[dict] = None + textlines: Optional[List[utils_generic.Quadrilateral]] = None + mask: Optional[np.ndarray] = None + mask_raw: Optional[np.ndarray] = None + + def copy(self, **kwargs): + return DetectionState( + img=kwargs.get("img", self.img), + args=kwargs.get("args", self.args), + textlines=kwargs.get("textlines", self.textlines), + mask=kwargs.get("mask", self.mask), + mask_raw=kwargs.get("mask_raw", self.mask_raw), + ) + + def __repr__(self): + return f"DetectionState(img={type(self.img)}, args={self.args}, textlines={type(self.textlines)}, mask={type(self.mask)}, mask_raw={type(self.mask_raw)})" From 1ef7c1754f68633e47ae6153f3304eee239c26a6 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 17:48:51 +0900 Subject: [PATCH 03/15] run with single image --- gradio-app.py | 97 +++++++++++++---------- manga_translator/gradio/__init__.py | 36 +++++++++ manga_translator/gradio/json_encoder.py | 35 +++++++++ moeflow_worker.py | 100 +++++++++--------------- requirements-moeflow.txt | 1 + 5 files changed, 164 insertions(+), 105 deletions(-) create mode 100644 manga_translator/gradio/json_encoder.py diff --git a/gradio-app.py b/gradio-app.py index 735b67ef0..e5e9915de 100644 --- a/gradio-app.py +++ b/gradio-app.py @@ -10,7 +10,12 @@ import manga_translator.textline_merge as textline_merge import manga_translator.utils.generic as utils_generic import manga_translator.utils.textblock as utils_textblock -from manga_translator.gradio import DetectionState, mit_detect_text_default_params +from manga_translator.gradio import ( + DetectionState, + OcrState, + mit_detect_text_default_params, + to_json, +) from typing import List, Optional, TypedDict logger = logging.getLogger(__name__) @@ -24,41 +29,6 @@ dotenv.load_dotenv() -mit_ocr_default_params = dict( - ocr_key="48px", # recommended by rowland - # ocr_key="48px_ctc", - # ocr_key="mocr", # XXX: mocr may have different output format - # use_mocr_merge=True, - verbose=True, -) - - -class DetectionResult(TypedDict): - textlines: List[utils_generic.Quadrilateral] - mask_raw: np.ndarray - mask: np.ndarray | None - - -async def run_detection_single_image( - image: np.ndarray, detector_key: str -) -> DetectionResult: - print("image", image.shape) - textlines, mask_raw, mask = await detection.dispatch( - image=image, **{"detector_key": detector_key, **mit_detect_text_default_params} - ) - print("textlines", textlines) - print("mask_raw", mask_raw) - print("mask", mask) - return { - "textlines": textlines, - "mask_raw": mask_raw, - "mask": mask, - } - - -input_single_img = gr.Image(label="input image") -output_json = gr.JSON(label="output json") - with gr.Blocks() as demo: gr.Markdown( """ @@ -67,23 +37,31 @@ async def run_detection_single_image( ) detector_state = gr.State(DetectionState()) + ocr_state = gr.State(OcrState()) with gr.Row(): with gr.Column(): gr.Markdown("## Detection") img_file = gr.Image(label="input image", height=256, width=256) detector_key = gr.Radio( - choices=["default", "dbconvnext", "ctd", "craft", "none"], - label="detector key", + choices=[ + "default", + # maybe broken: manga_translator.utils.inference.InvalidModelMappingException: [DBConvNextDetector->model] Invalid _MODEL_MAPPING - Malformed url property + # "dbconvnext", + "ctd", + "craft", + "none", + ], + label="detector", ) - btn_detect = gr.Button("detect") + btn_detect = gr.Button("run detector") detector_state_dump = gr.TextArea( - label="detection state" # , value=lambda: repr(detector_state.value) + label="detector result" # , value=lambda: repr(detector_state.value) ) with gr.Column(): gr.Markdown("## OCR") - ocr_key = gr.Radio(choices=["48px", "48px_ctc", "mocr"], label="ocr key") + ocr_key = gr.Radio(choices=["48px", "48px_ctc", "mocr"], label="ocr") btn_ocr = gr.Button("ocr") ocr_state_dump = gr.TextArea(label="ocr state") @@ -97,7 +75,8 @@ async def run_detector( detector_key: Optional[str], ): # print("prev", prev) - prev_value = prev if isinstance(prev, DetectionState) else prev.value + prev_value = prev if isinstance(prev, DetectionState) else None # prev.value + assert prev_value, "prev_value is None" logger.debug("run_detector %s %s", prev_value, type(img)) value = prev_value.copy(img=img) @@ -117,6 +96,40 @@ async def run_detector( logger.debug("run_detector result %s", value) return value, repr(value) + @btn_ocr.click( + inputs=[ocr_state, detector_state, ocr_key], + outputs=[ocr_state, ocr_state_dump], + ) + async def run_ocr( + prev_value: OcrState, + detector_state: DetectionState, + ocr_key: Optional[str], + ): + logger.debug( + "run ocr %s %s %s", type(prev_value), type(detector_state), ocr_key + ) + + if not ( + ocr_key and (detector_state.img is not None) and detector_state.textlines + ): + return prev_value, repr(prev_value) + + textlines = await ocr.dispatch( + ocr_key=ocr_key, + image=detector_state.img, + regions=detector_state.textlines, + args={}, + verbose=True, + ) + + img_w, img_h = detector_state.img.shape[:2] + text_blocks = await textline_merge.dispatch( + textlines=textlines, width=img_w, height=img_h + ) + + value = prev_value.copy(text_blocks=text_blocks, ocr_key=ocr_key) + return value, repr(value) + if __name__ == "__main__": demo.launch(server_name="0.0.0.0") diff --git a/manga_translator/gradio/__init__.py b/manga_translator/gradio/__init__.py index 081ccd0da..66d17a8d6 100644 --- a/manga_translator/gradio/__init__.py +++ b/manga_translator/gradio/__init__.py @@ -3,8 +3,10 @@ import manga_translator.utils.textblock as utils_textblock from dataclasses import dataclass from typing import List, Optional +from .json_encoder import JSONEncoder import numpy as np +import json mit_detect_text_default_params = dict( @@ -41,3 +43,37 @@ def copy(self, **kwargs): def __repr__(self): return f"DetectionState(img={type(self.img)}, args={self.args}, textlines={type(self.textlines)}, mask={type(self.mask)}, mask_raw={type(self.mask_raw)})" + + def __json__(self): + return { + "img": to_json(self.img), + "args": to_json(self.args), + "textlines": to_json(self.textlines), + "mask": to_json(self.mask), + "mask_raw": to_json(self.mask_raw), + } + + +mit_ocr_default_params = dict( + ocr_key="48px", # recommended by rowland + # ocr_key="48px_ctc", + # ocr_key="mocr", # XXX: mocr may have different output format + # use_mocr_merge=True, + verbose=True, +) + + +@dataclass(frozen=True, kw_only=True) +class OcrState: + text_blocks: Optional[List[utils_textblock.TextBlock]] = None + ocr_key: Optional[str] = None + + def copy(self, **kwargs): + return OcrState( + ocr_key=kwargs.get("ocr_key", self.ocr_key), + text_blocks=kwargs.get("text_blocks", self.text_blocks), + ) + + +def to_json(obj): + return json.loads(json.dumps(obj, cls=JSONEncoder)) diff --git a/manga_translator/gradio/json_encoder.py b/manga_translator/gradio/json_encoder.py new file mode 100644 index 000000000..e8125bb7c --- /dev/null +++ b/manga_translator/gradio/json_encoder.py @@ -0,0 +1,35 @@ +import json +import numpy as np +import manga_translator.utils.generic as utils_generic +import manga_translator.utils.textblock as utils_textblock + + +class JSONEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, utils_textblock.TextBlock): + return { + "pts": o.lines, + "text": o.text, + "textlines": self.default(o.texts), + } + if isinstance(o, utils_generic.Quadrilateral): + return { + "pts": o.pts, + "text": o.text, + "prob": o.prob, + "textlines": self.default(o.textlines), + } + elif isinstance(o, filter) or isinstance(o, tuple): + return self.default(list(o)) + elif isinstance(o, list): + return o + elif isinstance(o, str): + return o + elif isinstance(o, np.ndarray): + return o.tolist() + elif isinstance(o, np.integer): + return int(o) + elif isinstance(o, np.floating): + return float(o) + else: + return super().default(o) diff --git a/moeflow_worker.py b/moeflow_worker.py index 08a8870d9..5e699d8b8 100644 --- a/moeflow_worker.py +++ b/moeflow_worker.py @@ -9,6 +9,7 @@ import manga_translator.textline_merge as textline_merge import manga_translator.utils.generic as utils_generic import manga_translator.utils.textblock as utils_textblock + # FIXME: impl better translator , maybe with Langchain # FIXME: maybe create a different translators package import manga_translator.translators as translators @@ -22,8 +23,8 @@ import numpy as np dotenv.load_dotenv() -BROKER_URL = os.environ.get('CELERY_BROKER_URL') -BACKEND_URL = os.environ.get('CELERY_BACKEND_URL') +BROKER_URL = os.environ.get("CELERY_BROKER_URL") +BACKEND_URL = os.environ.get("CELERY_BACKEND_URL") logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -31,41 +32,10 @@ "manga-image-translator-moeflow-worker", broker=BROKER_URL, backend=BACKEND_URL, - result_expires = 7 * 24 * 60 * 60, # 7d + result_expires=7 * 24 * 60 * 60, # 7d ) -class JSONEncoder(json.JSONEncoder): - def default(self, o): - if isinstance(o, utils_textblock.TextBlock): - return { - 'pts': o.lines, - 'text': o.text, - 'textlines': self.default(o.texts), - } - if isinstance(o, utils_generic.Quadrilateral): - return { - 'pts': o.pts, - 'text': o.text, - 'prob': o.prob, - 'textlines': self.default(o.textlines), - } - elif isinstance(o, filter) or isinstance(o, tuple): - return self.default(list(o)) - elif isinstance(o, list): - return o - elif isinstance(o, str): - return o - elif isinstance(o, np.ndarray): - return o.tolist() - elif isinstance(o, np.integer): - return int(o) - elif isinstance(o, np.floating): - return float(o) - else: - return super().default(o) - - def to_json(value: object) -> Any: """ :return: a json-serizable deep clone of `value` @@ -82,7 +52,7 @@ def mit_detect_text(path_or_url: str, **kwargs): # OCR + detect_textblocks + merge_textlines -@celery_app.task(name='tasks.mit.ocr') +@celery_app.task(name="tasks.mit.ocr") def mit_ocr(path_or_url: str, **kwargs): logger.debug("Running OCR %s %s", path_or_url, kwargs) # for unknown reason async_ocr returns [[Quad]] instead of [result] @@ -92,21 +62,22 @@ def mit_ocr(path_or_url: str, **kwargs): img_w, img_h, *_rest = load_rgb_image(path_or_url).shape - min_text_length = kwargs.get('min_text_length', 0) + min_text_length = kwargs.get("min_text_length", 0) text_blocks_all: list[utils_textblock.TextBlock] = async_textline_merge( - textlines=textlines, - width=img_w, - height=img_h) + textlines=textlines, width=img_w, height=img_h + ) # logger.debug("text_blocks_all = %s", text_regions_all) text_blocks = filter( - lambda r: len(r.text) > min_text_length and utils_generic.is_valuable_text(r.text), - text_blocks_all) + lambda r: len(r.text) > min_text_length + and utils_generic.is_valuable_text(r.text), + text_blocks_all, + ) return to_json(text_blocks) -@celery_app.task(name='tasks.mit.translate') +@celery_app.task(name="tasks.mit.translate") def mit_translate(**kwargs): logger.debug("Running translate %s", kwargs) result = async_translate(**kwargs) @@ -114,13 +85,13 @@ def mit_translate(**kwargs): return result -@celery_app.task(name='tasks.mit.inpaint') +@celery_app.task(name="tasks.mit.inpaint") def mit_inpaint(path_or_url: str, **kwargs): raise NotImplementedError() def load_rgb_image(path_or_url: str) -> np.ndarray: - if re.match(r'^https?://', path_or_url): + if re.match(r"^https?://", path_or_url): raise NotImplementedError("URL not supported yet") img = Image.open(path_or_url) img_rgb, img_alpha = utils_generic.load_image(img) @@ -131,14 +102,14 @@ def deserialize_quad_list(text_lines: list[dict]) -> list[utils_generic.Quadrila def create(json_value: dict) -> utils_generic.Quadrilateral: optional_args = { k: json_value[k] - for k in ['fg_r', 'fg_g', 'fg_b', 'bg_r', 'bg_g', 'bg_b'] + for k in ["fg_r", "fg_g", "fg_b", "bg_r", "bg_g", "bg_b"] if k in json_value } return utils_generic.Quadrilateral( - pts=np.array(json_value['pts']), - text=json_value['text'], - prob=json_value['prob'], - **optional_args + pts=np.array(json_value["pts"]), + text=json_value["text"], + prob=json_value["prob"], + **optional_args, ) return list(map(create, text_lines)) @@ -146,7 +117,7 @@ def create(json_value: dict) -> utils_generic.Quadrilateral: @async_to_sync async def async_detection(path_or_url: str, **kwargs: str): - await detection.prepare(kwargs['detector_key']) + await detection.prepare(kwargs["detector_key"]) img = load_rgb_image(path_or_url) textlines, mask_raw, mask = await detection.dispatch( image=img, @@ -154,43 +125,46 @@ async def async_detection(path_or_url: str, **kwargs: str): **kwargs, ) return { - 'textlines': json.loads(json.dumps(textlines, cls=JSONEncoder)), + "textlines": json.loads(json.dumps(textlines, cls=JSONEncoder)), # 'mask_raw': mask_raw, # 'mask': mask, } @async_to_sync -async def async_ocr(path_or_url: str, **kwargs) -> Awaitable[list[utils_generic.Quadrilateral]]: - await ocr.prepare(kwargs['ocr_key']) +async def async_ocr( + path_or_url: str, **kwargs +) -> Awaitable[list[utils_generic.Quadrilateral]]: + await ocr.prepare(kwargs["ocr_key"]) img = load_rgb_image(path_or_url) - quads = deserialize_quad_list(kwargs['regions']) + quads = deserialize_quad_list(kwargs["regions"]) result: list[utils_generic.Quadrilateral] = await ocr.dispatch( - ocr_key=kwargs['ocr_key'], + ocr_key=kwargs["ocr_key"], image=img, regions=quads, args=kwargs, - verbose=kwargs.get('verbose', False), + verbose=kwargs.get("verbose", False), ) return result @async_to_sync -async def async_textline_merge(*, textlines: list[utils_generic.Quadrilateral], width: int, height: int) \ - -> list[utils_textblock.TextBlock]: +async def async_textline_merge( + *, textlines: list[utils_generic.Quadrilateral], width: int, height: int +) -> list[utils_textblock.TextBlock]: return await textline_merge.dispatch(textlines, width, height) @async_to_sync async def async_translate(**kwargs) -> Awaitable[list[str]]: - query = kwargs['query'] - target_lang = kwargs['target_lang'] - translator = translators.get_translator(kwargs['translator']) + query = kwargs["query"] + target_lang = kwargs["target_lang"] + translator = translators.get_translator(kwargs["translator"]) if isinstance(translator, translators.OfflineTranslator): await translator.download() - await translator.load('auto', target_lang, device='cpu') + await translator.load("auto", target_lang, device="cpu") result = await translator.translate( - from_lang='auto', + from_lang="auto", to_lang=target_lang, queries=[query], ) diff --git a/requirements-moeflow.txt b/requirements-moeflow.txt index d6664d7ba..5175cc508 100644 --- a/requirements-moeflow.txt +++ b/requirements-moeflow.txt @@ -50,3 +50,4 @@ manga-ocr # pydensecrf@https://github.com/lucasb-eyer/pydensecrf/archive/refs/heads/master.zip # accelerate # bitsandbytes +gradio==5.6.0 From 781243f4dbe39fb293cf8c41ac9c2ba3821fa115 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 18:08:54 +0900 Subject: [PATCH 04/15] wip --- gradio-app.py | 25 ++++-- manga_translator/gradio/__init__.py | 88 ++----------------- manga_translator/gradio/detection.py | 57 ++++++++++++ .../gradio/export_moeflow_project.py | 10 +++ manga_translator/gradio/json_encoder.py | 4 + manga_translator/gradio/ocr.py | 25 ++++++ 6 files changed, 125 insertions(+), 84 deletions(-) create mode 100644 manga_translator/gradio/detection.py create mode 100644 manga_translator/gradio/export_moeflow_project.py create mode 100644 manga_translator/gradio/ocr.py diff --git a/gradio-app.py b/gradio-app.py index e5e9915de..f32bb6241 100644 --- a/gradio-app.py +++ b/gradio-app.py @@ -1,10 +1,14 @@ +from fileinput import filename import gradio as gr import numpy as np +from PIL import Image import dotenv import logging import asyncio +import os.path +from pathlib import Path import manga_translator.detection as detection import manga_translator.ocr as ocr import manga_translator.textline_merge as textline_merge @@ -14,7 +18,6 @@ DetectionState, OcrState, mit_detect_text_default_params, - to_json, ) from typing import List, Optional, TypedDict @@ -42,7 +45,9 @@ with gr.Row(): with gr.Column(): gr.Markdown("## Detection") - img_file = gr.Image(label="input image", height=256, width=256) + img_file = gr.Image( + label="input image", height=256, width=256, type="filepath" + ) detector_key = gr.Radio( choices=[ "default", @@ -71,15 +76,25 @@ ) async def run_detector( prev: DetectionState | gr.State, - img, + img_path: Optional[str], detector_key: Optional[str], ): # print("prev", prev) prev_value = prev if isinstance(prev, DetectionState) else None # prev.value assert prev_value, "prev_value is None" - logger.debug("run_detector %s %s", prev_value, type(img)) + logger.debug("run_detector %s %s", prev_value, img_path) + + value = prev_value.copy() - value = prev_value.copy(img=img) + if img_path: + raw_bytes = Path(img_path).read_bytes() + pil_img = Image.open(img_path) + img, mask = utils_generic.load_image(pil_img) + value = value.copy( + raw_filename=os.path.basename(img_path), raw_bytes=raw_bytes, img=img + ) + else: + value = prev_value.copy(raw_filename=None, raw_bytes=None, img=None) if detector_key: value = value.copy( diff --git a/manga_translator/gradio/__init__.py b/manga_translator/gradio/__init__.py index 66d17a8d6..740821b1d 100644 --- a/manga_translator/gradio/__init__.py +++ b/manga_translator/gradio/__init__.py @@ -1,79 +1,9 @@ -import manga_translator.textline_merge as textline_merge -import manga_translator.utils.generic as utils_generic -import manga_translator.utils.textblock as utils_textblock -from dataclasses import dataclass -from typing import List, Optional -from .json_encoder import JSONEncoder - -import numpy as np -import json - - -mit_detect_text_default_params = dict( - detector_key="default", - # mostly defaults from manga-image-translator/args.py - detect_size=2560, - text_threshold=0.5, - box_threshold=0.7, - unclip_ratio=2.3, - invert=False, - device="cuda", - gamma_correct=False, - rotate=False, - verbose=True, -) - - -@dataclass(frozen=True, kw_only=True) -class DetectionState: - img: Optional[np.ndarray] = None - args: Optional[dict] = None - textlines: Optional[List[utils_generic.Quadrilateral]] = None - mask: Optional[np.ndarray] = None - mask_raw: Optional[np.ndarray] = None - - def copy(self, **kwargs): - return DetectionState( - img=kwargs.get("img", self.img), - args=kwargs.get("args", self.args), - textlines=kwargs.get("textlines", self.textlines), - mask=kwargs.get("mask", self.mask), - mask_raw=kwargs.get("mask_raw", self.mask_raw), - ) - - def __repr__(self): - return f"DetectionState(img={type(self.img)}, args={self.args}, textlines={type(self.textlines)}, mask={type(self.mask)}, mask_raw={type(self.mask_raw)})" - - def __json__(self): - return { - "img": to_json(self.img), - "args": to_json(self.args), - "textlines": to_json(self.textlines), - "mask": to_json(self.mask), - "mask_raw": to_json(self.mask_raw), - } - - -mit_ocr_default_params = dict( - ocr_key="48px", # recommended by rowland - # ocr_key="48px_ctc", - # ocr_key="mocr", # XXX: mocr may have different output format - # use_mocr_merge=True, - verbose=True, -) - - -@dataclass(frozen=True, kw_only=True) -class OcrState: - text_blocks: Optional[List[utils_textblock.TextBlock]] = None - ocr_key: Optional[str] = None - - def copy(self, **kwargs): - return OcrState( - ocr_key=kwargs.get("ocr_key", self.ocr_key), - text_blocks=kwargs.get("text_blocks", self.text_blocks), - ) - - -def to_json(obj): - return json.loads(json.dumps(obj, cls=JSONEncoder)) +from .ocr import mit_ocr_default_params, OcrState +from .detection import mit_detect_text_default_params, DetectionState + +__all__ = [ + "mit_ocr_default_params", + "OcrState", + "mit_detect_text_default_params", + "DetectionState", +] diff --git a/manga_translator/gradio/detection.py b/manga_translator/gradio/detection.py new file mode 100644 index 000000000..675301ae7 --- /dev/null +++ b/manga_translator/gradio/detection.py @@ -0,0 +1,57 @@ +from dataclasses import dataclass +from typing import List, Optional + +from gradio_client import file +from .json_encoder import to_json +import manga_translator.utils.generic as utils_generic +import numpy as np + +mit_detect_text_default_params = dict( + detector_key="default", + # mostly defaults from manga-image-translator/args.py + detect_size=2560, + text_threshold=0.5, + box_threshold=0.7, + unclip_ratio=2.3, + invert=False, + device="cuda", + gamma_correct=False, + rotate=False, + verbose=True, +) + + +@dataclass(frozen=True, kw_only=True) +class DetectionState: + raw_filename: Optional[str] = None + raw_bytes: Optional[bytes] = None + img: Optional[np.ndarray] = None + args: Optional[dict] = None + textlines: Optional[List[utils_generic.Quadrilateral]] = None + mask: Optional[np.ndarray] = None + mask_raw: Optional[np.ndarray] = None + + def copy(self, **kwargs): + return DetectionState( + raw_filename=kwargs.get("raw_filename", self.raw_filename), + raw_bytes=kwargs.get("raw_bytes", self.raw_bytes), + img=kwargs.get("img", self.img), + args=kwargs.get("args", self.args), + textlines=kwargs.get("textlines", self.textlines), + mask=kwargs.get("mask", self.mask), + mask_raw=kwargs.get("mask_raw", self.mask_raw), + ) + + def __repr__(self): + return f"DetectionState(raw_filename={self.raw_filename}, raw_bytes={type(self.raw_bytes)} img={type(self.img)}, args={self.args}, textlines={type(self.textlines)}, mask={type(self.mask)}, mask_raw={type(self.mask_raw)})" + + def __json__(self): + return { + "raw_filename": self.raw_filename, + "raw_bytes": to_json(self.raw_bytes), + "img": to_json(self.img), + "args": to_json(self.args), + "textlines": to_json(self.textlines), + "mask": to_json(self.mask), + "mask_raw": to_json(self.mask_raw), + } diff --git a/manga_translator/gradio/export_moeflow_project.py b/manga_translator/gradio/export_moeflow_project.py new file mode 100644 index 000000000..49d3d4a96 --- /dev/null +++ b/manga_translator/gradio/export_moeflow_project.py @@ -0,0 +1,10 @@ +from manga_translator.gradio.detection import DetectionState +from manga_translator.gradio.ocr import OcrState + + +def create_json(detection_state: DetectionState, ocr_state: OcrState) -> dict: + return { + "img": img_bytes, + "detection_state": detection_state.__json__(), + "ocr_state": ocr_state.__json__(), + } diff --git a/manga_translator/gradio/json_encoder.py b/manga_translator/gradio/json_encoder.py index e8125bb7c..c5f12baef 100644 --- a/manga_translator/gradio/json_encoder.py +++ b/manga_translator/gradio/json_encoder.py @@ -33,3 +33,7 @@ def default(self, o): return float(o) else: return super().default(o) + + +def to_json(obj) -> object: + return json.loads(json.dumps(obj, cls=JSONEncoder)) diff --git a/manga_translator/gradio/ocr.py b/manga_translator/gradio/ocr.py new file mode 100644 index 000000000..82fa00c91 --- /dev/null +++ b/manga_translator/gradio/ocr.py @@ -0,0 +1,25 @@ +import manga_translator.textline_merge as textline_merge +import manga_translator.utils.textblock as utils_textblock +from dataclasses import dataclass +from typing import List, Optional +from .json_encoder import to_json + +mit_ocr_default_params = dict( + ocr_key="48px", # recommended by rowland + # ocr_key="48px_ctc", + # ocr_key="mocr", # XXX: mocr may have different output format + # use_mocr_merge=True, + verbose=True, +) + + +@dataclass(frozen=True, kw_only=True) +class OcrState: + text_blocks: Optional[List[utils_textblock.TextBlock]] = None + ocr_key: Optional[str] = None + + def copy(self, **kwargs): + return OcrState( + ocr_key=kwargs.get("ocr_key", self.ocr_key), + text_blocks=kwargs.get("text_blocks", self.text_blocks), + ) From fe238a9b83a8176d294547567c23b58012657623 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 18:29:26 +0900 Subject: [PATCH 05/15] wip --- conda.working.yaml | 12 ++++++++++++ requirements-moeflow.txt | 1 + 2 files changed, 13 insertions(+) create mode 100644 conda.working.yaml diff --git a/conda.working.yaml b/conda.working.yaml new file mode 100644 index 000000000..cc645e1b4 --- /dev/null +++ b/conda.working.yaml @@ -0,0 +1,12 @@ +# name: mit-py311 +channels: +- conda-forge +- pytorch +- nvidia +dependencies: +- python==3.11 +- pytorch==2.2.2 +- torchvision==0.17.2 +- torchaudio==2.2.2 +- pytorch-cuda=12.1 +- numpy<2 diff --git a/requirements-moeflow.txt b/requirements-moeflow.txt index 5175cc508..7bec1ed19 100644 --- a/requirements-moeflow.txt +++ b/requirements-moeflow.txt @@ -51,3 +51,4 @@ manga-ocr # accelerate # bitsandbytes gradio==5.6.0 +orjson==3.10.11 From e29f484b8f91f018a638526f908f23371b26521a Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 20:22:14 +0900 Subject: [PATCH 06/15] move file --- gradio-multi.py | 4 ++++ gradio-app.py => gradio-single.py | 0 2 files changed, 4 insertions(+) create mode 100644 gradio-multi.py rename gradio-app.py => gradio-single.py (100%) diff --git a/gradio-multi.py b/gradio-multi.py new file mode 100644 index 000000000..a7ebe3e51 --- /dev/null +++ b/gradio-multi.py @@ -0,0 +1,4 @@ +import gradio as gr + + +with gr. \ No newline at end of file diff --git a/gradio-app.py b/gradio-single.py similarity index 100% rename from gradio-app.py rename to gradio-single.py From c3829f1bb0c887ac5752b218752a2839d857cc4b Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 22:38:02 +0900 Subject: [PATCH 07/15] wip --- gradio-multi.py | 155 ++++++++++++++++++++++++++- gradio-single.py | 7 +- manga_translator/gradio/detection.py | 2 +- manga_translator/gradio/ocr.py | 2 +- moeflow_worker.py | 6 +- 5 files changed, 161 insertions(+), 11 deletions(-) diff --git a/gradio-multi.py b/gradio-multi.py index a7ebe3e51..d80bf6111 100644 --- a/gradio-multi.py +++ b/gradio-multi.py @@ -1,4 +1,157 @@ +import logging +from typing import List import gradio as gr +import asyncio +from pathlib import Path +import json +import uuid +from PIL import Image +import manga_translator.detection as mit_detection +import manga_translator.ocr as mit_ocr +import manga_translator.textline_merge as textline_merge +import manga_translator.utils.generic as utils_generic +from manga_translator.gradio import ( + mit_detect_text_default_params, + mit_ocr_default_params, + storage_dir, + MitJSONEncoder, +) +from manga_translator.utils.textblock import TextBlock +STORAGE_DIR_RESOLVED = storage_dir.resolve() -with gr. \ No newline at end of file +if gr.NO_RELOAD: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + force=True, + ) + for name in ["httpx"]: + logging.getLogger(name).setLevel(logging.WARN) + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +async def copy_files(gradio_temp_files: list[str]) -> list[str]: + new_root: Path = storage_dir / uuid.uuid4().hex + new_root.mkdir(parents=True, exist_ok=True) + + ret: list[str] = [] + for f in gradio_temp_files: + new_file = new_root / f.split("/")[-1] + new_file.write_bytes(Path(f).read_bytes()) + ret.append(str(new_file.relative_to(storage_dir))) + logger.debug("copied %s to %s", f, new_file) + + return ret + + +def log_file(basename: str, result: List[TextBlock]): + logger.debug("file: %s", basename) + for i, b in enumerate(result): + logger.debug(" block %d: %s", i, b.text) + + +async def process_files( + filename_list: list[str], detector_key: str, ocr_key: str, device: str +) -> str: + path_list: list[Path] = [] + for f in filename_list: + assert f + # p = (storage_dir / f).resolve() + # assert p.is_file() and STORAGE_DIR_RESOLVED in p.parents, f"illegal path: {f}" + path_list.append(Path(f)) + + await mit_detection.prepare(detector_key) + await mit_ocr.prepare(ocr_key, device) + + result = await asyncio.gather( + *[process_file(p, detector_key, ocr_key) for p in path_list] + ) + + for r in result: + log_file(r["filename"], r["text_blocks"]) + + return json.dumps(result, cls=MitJSONEncoder) + + +async def process_file( + img_path: Path, detector: str, ocr_key: str, device: str +) -> dict: + pil_img = Image.open(img_path) + img, mask = utils_generic.load_image(pil_img) + img_w, img_h = img.shape[:2] + + try: + # detector + detector_args = { + **mit_detect_text_default_params, + "detector_key": detector, + "device": device, + } + regions, mask_raw, mask = await mit_detection.dispatch( + image=img, **detector_args + ) + # ocr + ocr_args = {**mit_ocr_default_params, "ocr_key": ocr_key, "device": device} + textlines = await mit_ocr.dispatch(image=img, regions=regions, **ocr_args) + # textline merge + text_blocks = await textline_merge.dispatch( + textlines=textlines, width=img_w, height=img_h + ) + except Exception as e: + logger.error("error processing %s: %s", img_path, e) + text_blocks = [] + else: + logger.debug("processed %s", img_path) + + return { + "filename": img_path.name, + "text_blocks": text_blocks, + } + + +with gr.Blocks() as demo: + demo.enable_queue = True + file_input = gr.File(label="upload file", file_count="multiple", type="filepath") + + ocr_output = gr.JSON( + label="OCR output", + ) + + device_input = gr.Radio(choices=["cpu", "cuda"], label="device", value="cuda") + detector_key_input = gr.Radio( + choices=[ + "default", + # maybe broken: manga_translator.utils.inference.InvalidModelMappingException: [DBConvNextDetector->model] Invalid _MODEL_MAPPING - Malformed url property + # "dbconvnext", + "ctd", + "craft", + "none", + ], + value="default", + label="detector", + ) + + ocr_key_input = gr.Radio( + choices=["48px", "48px_ctc", "mocr"], label="ocr", value="48px" + ) + run_button = gr.Button("upload + text detection + OCR + textline_merge") + + @run_button.click( + inputs=[file_input, detector_key_input, ocr_key_input, device_input], + outputs=[ocr_output], + ) + async def on_run_button( + gradio_temp_files: list[str], detector_key: str, ocr_key: str, device: str + ) -> str: + res = await process_files(gradio_temp_files, detector_key, ocr_key, device) + return res + + +if __name__ == "__main__": + demo.launch( + server_name="0.0.0.0", + ) diff --git a/gradio-single.py b/gradio-single.py index f32bb6241..3139b5497 100644 --- a/gradio-single.py +++ b/gradio-single.py @@ -1,4 +1,3 @@ -from fileinput import filename import gradio as gr import numpy as np from PIL import Image @@ -6,14 +5,12 @@ import dotenv import logging -import asyncio import os.path from pathlib import Path import manga_translator.detection as detection -import manga_translator.ocr as ocr +import manga_translator.ocr as mit_ocr import manga_translator.textline_merge as textline_merge import manga_translator.utils.generic as utils_generic -import manga_translator.utils.textblock as utils_textblock from manga_translator.gradio import ( DetectionState, OcrState, @@ -129,7 +126,7 @@ async def run_ocr( ): return prev_value, repr(prev_value) - textlines = await ocr.dispatch( + textlines = await mit_ocr.dispatch( ocr_key=ocr_key, image=detector_state.img, regions=detector_state.textlines, diff --git a/manga_translator/gradio/detection.py b/manga_translator/gradio/detection.py index 675301ae7..83eac0729 100644 --- a/manga_translator/gradio/detection.py +++ b/manga_translator/gradio/detection.py @@ -14,7 +14,7 @@ box_threshold=0.7, unclip_ratio=2.3, invert=False, - device="cuda", + # device="cpu", gamma_correct=False, rotate=False, verbose=True, diff --git a/manga_translator/gradio/ocr.py b/manga_translator/gradio/ocr.py index 82fa00c91..3b2fdf402 100644 --- a/manga_translator/gradio/ocr.py +++ b/manga_translator/gradio/ocr.py @@ -2,13 +2,13 @@ import manga_translator.utils.textblock as utils_textblock from dataclasses import dataclass from typing import List, Optional -from .json_encoder import to_json mit_ocr_default_params = dict( ocr_key="48px", # recommended by rowland # ocr_key="48px_ctc", # ocr_key="mocr", # XXX: mocr may have different output format # use_mocr_merge=True, + # device="cpu", verbose=True, ) diff --git a/moeflow_worker.py b/moeflow_worker.py index 5e699d8b8..2cf78a3cf 100644 --- a/moeflow_worker.py +++ b/moeflow_worker.py @@ -5,7 +5,7 @@ from celery import Celery from asgiref.sync import async_to_sync import manga_translator.detection as detection -import manga_translator.ocr as ocr +import manga_translator.ocr as mit_ocr import manga_translator.textline_merge as textline_merge import manga_translator.utils.generic as utils_generic import manga_translator.utils.textblock as utils_textblock @@ -135,10 +135,10 @@ async def async_detection(path_or_url: str, **kwargs: str): async def async_ocr( path_or_url: str, **kwargs ) -> Awaitable[list[utils_generic.Quadrilateral]]: - await ocr.prepare(kwargs["ocr_key"]) + await mit_ocr.prepare(kwargs["ocr_key"]) img = load_rgb_image(path_or_url) quads = deserialize_quad_list(kwargs["regions"]) - result: list[utils_generic.Quadrilateral] = await ocr.dispatch( + result: list[utils_generic.Quadrilateral] = await mit_ocr.dispatch( ocr_key=kwargs["ocr_key"], image=img, regions=quads, From d9aab23f357c22141abf70d8ea75be8ffc867657 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 22:39:48 +0900 Subject: [PATCH 08/15] fix --- gradio-multi.py | 2 +- manga_translator/gradio/__init__.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/gradio-multi.py b/gradio-multi.py index d80bf6111..163e6fee7 100644 --- a/gradio-multi.py +++ b/gradio-multi.py @@ -68,7 +68,7 @@ async def process_files( await mit_ocr.prepare(ocr_key, device) result = await asyncio.gather( - *[process_file(p, detector_key, ocr_key) for p in path_list] + *[process_file(p, detector_key, ocr_key, device) for p in path_list] ) for r in result: diff --git a/manga_translator/gradio/__init__.py b/manga_translator/gradio/__init__.py index 740821b1d..cc6dbc638 100644 --- a/manga_translator/gradio/__init__.py +++ b/manga_translator/gradio/__init__.py @@ -1,9 +1,15 @@ +from pathlib import Path from .ocr import mit_ocr_default_params, OcrState from .detection import mit_detect_text_default_params, DetectionState +from .json_encoder import JSONEncoder as MitJSONEncoder + +storage_dir = Path(__file__).parent.parent / "storage" __all__ = [ "mit_ocr_default_params", "OcrState", "mit_detect_text_default_params", "DetectionState", + "storage_dir", + "MitJSONEncoder", ] From e8ad19e36b9ea1c4e5624ea908a1113bc764d2ad Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 22:40:54 +0900 Subject: [PATCH 09/15] wip --- manga_translator/gradio/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 manga_translator/gradio/.gitignore diff --git a/manga_translator/gradio/.gitignore b/manga_translator/gradio/.gitignore new file mode 100644 index 000000000..dbca33e9d --- /dev/null +++ b/manga_translator/gradio/.gitignore @@ -0,0 +1 @@ +/storage From 1b30b79aedd830fb6960d12e33a72c3a6e6b8c79 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 22:47:20 +0900 Subject: [PATCH 10/15] fix --- gradio-multi.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/gradio-multi.py b/gradio-multi.py index 163e6fee7..c24b9f93c 100644 --- a/gradio-multi.py +++ b/gradio-multi.py @@ -115,7 +115,11 @@ async def process_file( with gr.Blocks() as demo: demo.enable_queue = True - file_input = gr.File(label="upload file", file_count="multiple", type="filepath") + file_input = gr.File( + label="upload file", + file_count="multiple", + type="filepath", + ) ocr_output = gr.JSON( label="OCR output", @@ -152,6 +156,4 @@ async def on_run_button( if __name__ == "__main__": - demo.launch( - server_name="0.0.0.0", - ) + demo.launch(server_name="0.0.0.0", max_file_size=10 * gr.FileSize.MB) From e2bb9f1cacf54814719a89460fc470af979a0822 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 23:06:42 +0900 Subject: [PATCH 11/15] workaround empty textline --- manga_translator/utils/textblock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manga_translator/utils/textblock.py b/manga_translator/utils/textblock.py index de3c8a4cd..5f1004c20 100644 --- a/manga_translator/utils/textblock.py +++ b/manga_translator/utils/textblock.py @@ -82,8 +82,8 @@ def __init__(self, lines: List[Tuple[int, int, int, int]], self.text = texts[0] if self.text and len(texts) > 1: for txt in texts[1:]: - first_cjk = '\u3000' <= self.text[-1] <= '\u9fff' - second_cjk = '\u3000' <= txt[0] <= '\u9fff' + first_cjk = txt and ('\u3000' <= self.text[-1] <= '\u9fff') + second_cjk = txt and ('\u3000' <= txt[0] <= '\u9fff') if first_cjk or second_cjk : self.text += txt else : From 3ecfe04ee8d91e19de65e41f24cd5f73f3ed83b6 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 23:25:57 +0900 Subject: [PATCH 12/15] fix --- Dockerfile | 2 +- docker_prepare.py | 2 +- manga_translator/utils/generic.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4249d46ba..f94d11ace 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ RUN apt-get remove -y g++ && \ COPY . /app # Prepare models -RUN python -u docker_prepare.py +RUN python -u docker_prepare.py --continue-on-error RUN rm -rf /tmp diff --git a/docker_prepare.py b/docker_prepare.py index 005a3f4b4..5acbabf04 100644 --- a/docker_prepare.py +++ b/docker_prepare.py @@ -31,7 +31,7 @@ async def download(dict): async def main(): models: set[str] = set(filter(None, cli_args.models.split(","))) - # print("parsed.models", models) + await download( { k: v diff --git a/manga_translator/utils/generic.py b/manga_translator/utils/generic.py index a5b5b5cf1..92c55c6ac 100644 --- a/manga_translator/utils/generic.py +++ b/manga_translator/utils/generic.py @@ -1,5 +1,5 @@ import os -from typing import List, Callable, Tuple +from typing import List, Callable, Tuple, Optional import numpy as np import cv2 import functools @@ -246,7 +246,7 @@ def __call__(self, val = None): else: return 0 -def load_image(img: Image.Image) -> Tuple[np.ndarray, any]: +def load_image(img: Image.Image) -> Tuple[np.ndarray, Optional[Image.Image]]: if img.mode == 'RGBA': # from https://stackoverflow.com/questions/9166400/convert-rgba-png-to-rgb-with-pil img.load() # needed for split() From 52609bd53af2bfac73759a87c78e89960e2dfd12 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 23:27:04 +0900 Subject: [PATCH 13/15] tune logs --- gradio-multi.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gradio-multi.py b/gradio-multi.py index c24b9f93c..eab51df5c 100644 --- a/gradio-multi.py +++ b/gradio-multi.py @@ -22,7 +22,7 @@ if gr.NO_RELOAD: logging.basicConfig( - level=logging.INFO, + level=logging.WARN, format="%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S", force=True, @@ -31,7 +31,7 @@ logging.getLogger(name).setLevel(logging.WARN) logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) +logger.setLevel(logging.INFO) async def copy_files(gradio_temp_files: list[str]) -> list[str]: @@ -49,9 +49,9 @@ async def copy_files(gradio_temp_files: list[str]) -> list[str]: def log_file(basename: str, result: List[TextBlock]): - logger.debug("file: %s", basename) + logger.info("file: %s", basename) for i, b in enumerate(result): - logger.debug(" block %d: %s", i, b.text) + logger.info(" block %d: %s", i, b.text) async def process_files( @@ -103,6 +103,7 @@ async def process_file( ) except Exception as e: logger.error("error processing %s: %s", img_path, e) + print(e) text_blocks = [] else: logger.debug("processed %s", img_path) From 73c55167119917e736ab0a2a40ebeab164171aec Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Sun, 17 Nov 2024 23:44:22 +0900 Subject: [PATCH 14/15] enable gradio queue --- gradio-multi.py | 12 ++++++++---- manga_translator/gradio/__init__.py | 4 ++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/gradio-multi.py b/gradio-multi.py index eab51df5c..8b41d032e 100644 --- a/gradio-multi.py +++ b/gradio-multi.py @@ -14,6 +14,7 @@ mit_detect_text_default_params, mit_ocr_default_params, storage_dir, + load_model_mutex, MitJSONEncoder, ) from manga_translator.utils.textblock import TextBlock @@ -64,8 +65,9 @@ async def process_files( # assert p.is_file() and STORAGE_DIR_RESOLVED in p.parents, f"illegal path: {f}" path_list.append(Path(f)) - await mit_detection.prepare(detector_key) - await mit_ocr.prepare(ocr_key, device) + with load_model_mutex: + await mit_detection.prepare(detector_key) + await mit_ocr.prepare(ocr_key, device) result = await asyncio.gather( *[process_file(p, detector_key, ocr_key, device) for p in path_list] @@ -115,7 +117,6 @@ async def process_file( with gr.Blocks() as demo: - demo.enable_queue = True file_input = gr.File( label="upload file", file_count="multiple", @@ -157,4 +158,7 @@ async def on_run_button( if __name__ == "__main__": - demo.launch(server_name="0.0.0.0", max_file_size=10 * gr.FileSize.MB) + demo.queue(api_open=True, max_size=100).launch( + server_name="0.0.0.0", + max_file_size=10 * gr.FileSize.MB, + ) diff --git a/manga_translator/gradio/__init__.py b/manga_translator/gradio/__init__.py index cc6dbc638..6edc7491d 100644 --- a/manga_translator/gradio/__init__.py +++ b/manga_translator/gradio/__init__.py @@ -1,8 +1,11 @@ +from threading import RLock from pathlib import Path from .ocr import mit_ocr_default_params, OcrState from .detection import mit_detect_text_default_params, DetectionState from .json_encoder import JSONEncoder as MitJSONEncoder +load_model_mutex = RLock() + storage_dir = Path(__file__).parent.parent / "storage" __all__ = [ @@ -12,4 +15,5 @@ "DetectionState", "storage_dir", "MitJSONEncoder", + "load_model_mutex", ] From 6827c889219581a036a27f86b52270abfffe4489 Mon Sep 17 00:00:00 2001 From: Wang Guan Date: Mon, 18 Nov 2024 01:21:56 +0900 Subject: [PATCH 15/15] minor changes --- gradio-multi.py | 1 + manga_translator/utils/textblock.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/gradio-multi.py b/gradio-multi.py index 8b41d032e..71161591a 100644 --- a/gradio-multi.py +++ b/gradio-multi.py @@ -159,6 +159,7 @@ async def on_run_button( if __name__ == "__main__": demo.queue(api_open=True, max_size=100).launch( + debug=True, server_name="0.0.0.0", max_file_size=10 * gr.FileSize.MB, ) diff --git a/manga_translator/utils/textblock.py b/manga_translator/utils/textblock.py index 5f1004c20..8c2f390f5 100644 --- a/manga_translator/utils/textblock.py +++ b/manga_translator/utils/textblock.py @@ -82,7 +82,7 @@ def __init__(self, lines: List[Tuple[int, int, int, int]], self.text = texts[0] if self.text and len(texts) > 1: for txt in texts[1:]: - first_cjk = txt and ('\u3000' <= self.text[-1] <= '\u9fff') + first_cjk = '\u3000' <= self.text[-1] <= '\u9fff' second_cjk = txt and ('\u3000' <= txt[0] <= '\u9fff') if first_cjk or second_cjk : self.text += txt