feat: more params support

zmh-program · Jul 23, 2024 · f9091fa · f9091fa · zmh-program · Jul 23, 2024
1 parent 508b08d
commit f9091fa
Show file tree

Hide file tree

Showing 8 changed files with 98 additions and 55 deletions.
diff --git a/README.md b/README.md
@@ -67,11 +67,21 @@ uvicorn main:app
 `POST` `/upload` Upload a file
 ```json
 {
-    "file": "file",
-    "model": "gpt-4-turbo-preview" // optional (for ocr models detection)
+    "file": "[file]",
+    "enable_ocr": false,
+    "enable_vision": true,
+    "save_all": false
 }
 ```
 
+| Parameter       | Type    | Description                                                                          |
+|-----------------|---------|--------------------------------------------------------------------------------------|
+| `file`          | *File   | File to Upload                                                                       |
+| `enable_ocr`    | Boolean | Enable OCR (Default: `false`) <br/>*Requires `OCR_ENDPOINT` to be configured*        |
+| `enable_vision` | Boolean | Enable Vision (Default: `true`) <br/>*Ignored when `enable_ocr` is `true`*           |
+| `save_all`      | Boolean | Save All Files (Default: `false`) <br/>*Store any file as-is, without processing*   |
+
+
 Response
 
 ```json
@@ -83,8 +93,16 @@ Response
 }
 ```
 
+| Parameter       | Type     | Description    |
+|-----------------|----------|----------------|
+| `status`        | Boolean  | Request Status |
+| `type`          | String   | File Type      |
+| `content`       | String   | File Data      |
+| `error`         | String   | Error Message  |
+
 ## Environment Variables
-### 🎨 General Config (Optional)
+
+### `1` 🎨 General Config (Optional)
 
 - `PDF_MAX_IMAGES`: Max Images Extracted from a PDF File (Default: `10`)
     - **0**: Never Extract Images
@@ -95,20 +113,19 @@ Response
   - *Tips: Size limit is also depend on the server configuration (e.g. Nginx/Apache Config, Vercel Free Plan Limit **5MB** Body Size)*
 - `CORS_ALLOW_ORIGINS`: CORS Allow Origins (Default: `*`)
   - e.g.: *http://localhost:3000,https://example.com*
+
+### `2` 🔊 Audio Config (Optional)
 - `AZURE_SPEECH_KEY`: Azure Speech to Text Service Key (Required for Audio Support)
 - `AZURE_SPEECH_REGION`: Azure Speech to Text Service Region (Required for Audio Support)
 
-
-
-### 🖼 Image Storage Config (Optional)
+### `3` 🖼 Storage Config (Optional)
 > [!NOTE]
-> **When OCR is enabled, the service will firstly using OCR then store the images.**
->
-> **You can configure the OCR Advanced Config to control the OCR Models Filtering.**
+> Storage Config applies only to **Image** files and the `save_all` option.
 
 1. ✨ No Storage (Default)
    - [x] **No Storage Required & No External Dependencies**
    - [x] Base64 Encoding/Decoding
+   - [x] Do **Not** Store Anything
    - [x] Support Serverless Deployment **Without Storage** (e.g. Vercel)
    - [ ] No Direct URL Access *(Base64 not support models like `gpt-4-all`)*
 
@@ -164,36 +181,13 @@ Response
       - set env `TG_ENDPOINT` to your TG-STATE Endpoint (e.g. `TG_ENDPOINT=https://tgstate.vercel.app`)
       - *[Optional] if you are using password authentication, you can set `TG_PASSWORD` to your TG-STATE Password*
 
-
-
-### 🔍 OCR Config (Optional)
+
+### `4` 🔍 OCR Config (Optional)
 > [!NOTE]
-> OCR Support is based on [PaddleOCR API](https://github.com/cgcel/PaddleOCRFastAPI), please deploy the API to use OCR feature.
-> 
-> When OCR is enabled, the service will automatically extract text from the image and **skip the original image storage solution** below.
-
-- `OCR_ENABLED` Image OCR Enabled (`1` for **Enabled**, `0` for **Disabled**, Default is **Disabled**)
-- `OCR_ENDPOINT` Paddle OCR Endpoint ([Deploy PaddleOCR API](https://github.com/cgcel/PaddleOCRFastAPI))
-    - e.g.: *http://example.com:8000*
-
-Advanced OCR Config:
-> [!WARNING]
-> Advanced Config Chat Nio Supported Version >= **4.3.1** or **3.10.9**
-
-- `OCR_SKIP_MODELS`: Skip OCR Models List (Commonly for Vision Models)
-    - e.g.: *gpt-4-v,gpt-4-vision-preview,gpt-4-turbo*, then the service will skip these models and directly store the image.
-      - Tips: Each model has character inclusion matching, so when you set `gpt-4-v` model, it will skip all models that contain **gpt-4-v** (like azure-**gpt-4-v**ision-preview, **gpt-4-v**ision-preview will be also matched).
-- `OCR_SPEC_MODELS`: Specific OCR Models List (Commonly for Non-Vision Models)
-    - then although the image has marked as `SKIP_MODELS`, the service will still ocr process the image with this model first.
-    - for example, when you set `gpt-4-turbo` to `SKIP_MODELS` (because `gpt-4-turbo` support vision and don't need to use OCR, `gpt-4-turbo-preview` cannot vision and need OCR), commonly the **gpt-4-turbo**-preview will be marked as **gpt-4-turbo** and skipped, then you can set `gpt-4-turbo-preview` to `SPEC_MODELS` to force OCR process.
-
-EXAMPLE OCR Config:
-```env
-OCR_ENABLED=1
-OCR_ENDPOINT=http://example.com:8000
-OCR_SKIP_MODELS=vision,gpt-4-v,gpt-4-all,gpt-4-vision-preview,gpt-4-1106-vision-preview,gpt-4-turbo,gemini-pro-vision,gemini-1.5-pro,claude-3,glm-4v
-OCR_SPEC_MODELS=gpt-4-turbo-preview,claude-3-haiku
-```
+> OCR Support is based on 👉 [PaddleOCR API](https://github.com/cgcel/PaddleOCRFastAPI) (✔ Self Hosted ✔ Open Source)
+
+- `OCR_ENDPOINT` Paddle OCR Endpoint
+    - e.g.: *http://example.com:8000*
 
 ## Development
 - **~/config.py**: Env Config

diff --git a/config.py b/config.py
@@ -82,6 +82,5 @@ def to_int(value: str, default: int) -> int:
 
 # OCR Config
 OCR_ENDPOINT = to_endpoint("OCR_ENDPOINT", "")  # OCR Endpoint
-OCR_ENABLED = to_bool("OCR_ENABLED", False)  # OCR Enabled
 OCR_SKIP_MODELS = to_list("OCR_SKIP_MODELS", [])  # OCR Skip Models
 OCR_SPEC_MODELS = to_list("OCR_SPEC_MODELS", [])  # OCR Specific Models
diff --git a/handlers/image.py b/handlers/image.py
@@ -1,5 +1,6 @@
 from fastapi import UploadFile
-from handlers.ocr import ocr_image, could_enable_ocr
+
+from handlers.ocr import create_ocr_task
 from store.store import process_image
 
 COMMON_IMAGE_EXTENSIONS = {
@@ -17,9 +18,15 @@ def is_image(filename: str) -> bool:
     return filename.split(".")[-1] in COMMON_IMAGE_EXTENSIONS
 
 
-async def process(file: UploadFile, model: str) -> str:
+async def process(file: UploadFile, enable_ocr: bool, enable_vision: bool, not_raise: bool = False):
     """Process image."""
-    if could_enable_ocr(model):
-        return ocr_image(file)
+    if enable_ocr:
+        return create_ocr_task(file)
+
+    if not enable_vision:
+        if not_raise:
+            # caller opted out of the error (e.g. PDF image extraction passes not_raise=True): skip silently
+            return ""
+        raise ValueError("Trying to upload image with Vision disabled.")
 
     return await process_image(file)
diff --git a/handlers/ocr.py b/handlers/ocr.py
@@ -1,6 +1,6 @@
 from fastapi import UploadFile, File
 import requests
-from config import OCR_ENDPOINT, OCR_ENABLED, OCR_SKIP_MODELS, OCR_SPEC_MODELS
+from config import OCR_ENDPOINT, OCR_SKIP_MODELS, OCR_SPEC_MODELS
 import time
 from typing import List
 
@@ -17,7 +17,7 @@ def get_ocr_source(data: any) -> List[str]:
     return []
 
 
-def ocr_image(file: UploadFile = File(...)) -> str:
+def create_ocr_task(file: UploadFile = File(...)) -> str:
     start = time.time()
 
     response = requests.post(
@@ -39,8 +39,8 @@ def ocr_image(file: UploadFile = File(...)) -> str:
     return " ".join(get_ocr_source(result))
 
 
-def could_enable_ocr(model: str = "") -> bool:
-    if not OCR_ENABLED:
+def deprecated_could_enable_ocr(model: str = "") -> bool:
+    if len(OCR_ENDPOINT) == 0:
         # if OCR is disabled
         return False
 

diff --git a/handlers/pdf.py b/handlers/pdf.py
@@ -12,7 +12,7 @@ def is_pdf(filename: str) -> bool:
     return filename.endswith(".pdf")
 
 
-async def process(file: UploadFile, model: str) -> str:
+async def process(file: UploadFile, enable_ocr: bool, enable_vision: bool) -> str:
     filename = file.filename.replace(" ", "_").replace(".", "_")
     doc = fitz.open("pdf", file.file.read())  # read the file from memory
     stack = []
@@ -41,7 +41,7 @@ async def process(file: UploadFile, model: str) -> str:
 
             # create a file-like object for the image
             image_file = UploadFile(io, filename=image_name)
-            stack.append(await process_image(image_file, model))
+            stack.append(await process_image(image_file, enable_ocr=enable_ocr, enable_vision=enable_vision, not_raise=True))
 
             print(f"[pdf] extracted image: {image_name} (page: {page.number}, cursor: {cursor}, max: {PDF_MAX_IMAGES})")
 

diff --git a/handlers/processor.py b/handlers/processor.py
@@ -9,6 +9,7 @@
     image,
     speech,
 )
+from store.store import process_all
 
 
 async def read_file_size(file: UploadFile) -> float:
@@ -22,7 +23,12 @@ async def read_file_size(file: UploadFile) -> float:
     return file_size / 1024 / 1024
 
 
-async def process_file(file: UploadFile = File(...), model: str = "") -> (str, str):
+async def process_file(
+        file: UploadFile = File(...),
+        enable_ocr: bool = False,
+        enable_vision: bool = True,
+        save_all: bool = False,
+) -> (str, str):
     """Process file and return its contents."""
 
     if MAX_FILE_SIZE > 0:
@@ -31,16 +37,28 @@ async def process_file(file: UploadFile = File(...), model: str = "") -> (str, s
             raise ValueError(f"File size {file_size:.2f} MiB exceeds the limit of {MAX_FILE_SIZE} MiB.")
 
     filename = file.filename.lower()
+    if save_all:
+        # save all types of files to storage
+        return "file", await process_all(file)
+
     if pdf.is_pdf(filename):
-        return "pdf", await pdf.process(file, model)
+        return "pdf", await pdf.process(
+            file,
+            enable_ocr=enable_ocr,
+            enable_vision=enable_vision,
+        )
     elif word.is_docx(filename):
         return "docx", word.process(file)
     elif ppt.is_pptx(filename):
         return "pptx", ppt.process(file)
     elif xlsx.is_xlsx(filename):
         return "xlsx", xlsx.process(file)
     elif image.is_image(filename):
-        return "image", await image.process(file, model)
+        return "image", await image.process(
+            file,
+            enable_ocr=enable_ocr,
+            enable_vision=enable_vision,
+        )
     elif ENABLE_AZURE_SPEECH and speech.is_audio(filename):
         return "audio", speech.process(file)
 

diff --git a/main.py b/main.py
@@ -4,7 +4,7 @@
 from fastapi.responses import FileResponse
 from handlers.processor import process_file
 from config import *
-
+from handlers.ocr import create_ocr_task, deprecated_could_enable_ocr
 
 app = FastAPI()
 app.add_middleware(
@@ -28,11 +28,30 @@ def favicon():
 
 
 @app.post("/upload")
-async def upload(file: UploadFile = File(...), model: str = Form(default="")):
+async def upload(
+        file: UploadFile = File(...),
+        enable_ocr: bool = Form(default=False),
+        enable_vision: bool = Form(default=True),
+        save_all: bool = Form(default=False),
+        model: str = Form(default=""),  # deprecated
+):
     """Accepts file and returns its contents."""
 
+    if model and len(model) > 0:
+        # compatibility with deprecated model parameter
+        enable_ocr = deprecated_could_enable_ocr(model)
+        enable_vision = not enable_ocr
+
+    if len(OCR_ENDPOINT) == 0:
+        enable_ocr = False
+
     try:
-        filetype, contents = await process_file(file, model)
+        filetype, contents = await process_file(
+            file,
+            enable_ocr=enable_ocr,
+            enable_vision=enable_vision,
+            save_all=save_all,
+        )
         return {
             "status": True,
             "content": contents,

diff --git a/store/store.py b/store/store.py
@@ -20,3 +20,9 @@ async def process_image(file: UploadFile) -> str:
 
     handler = IMAGE_HANDLERS.get(STORAGE_TYPE, IMAGE_HANDLERS["common"])
     return await handler(file)
+
+
+async def process_all(file: UploadFile) -> str:
+    """Store an uploaded file of any type by delegating to `process_image`; returns its result."""
+
+    return await process_image(file)