Skip to content

Commit

Permalink
Merge pull request #782 from icecraft/feat/data_api
Browse files Browse the repository at this point in the history
Feat/data api
  • Loading branch information
myhloli authored Oct 24, 2024
2 parents e36627b + c200eff commit 82dd7ac
Show file tree
Hide file tree
Showing 56 changed files with 20,026 additions and 255 deletions.
95 changes: 50 additions & 45 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,45 +1,50 @@
*.tar
*.tar.gz
*.zip
venv*/
envs/
slurm_logs/

sync1.sh
data_preprocess_pj1
data-preparation1
__pycache__
*.log
*.pyc
.vscode
debug/
*.ipynb
.idea

# vscode history
.history

.DS_Store
.env

bad_words/
bak/

app/tests/*
temp/
tmp/
tmp
.vscode
.vscode/
ocr_demo
.coveragerc
/app/common/__init__.py
/magic_pdf/config/__init__.py
source.dev.env

tmp

projects/web/node_modules
projects/web/dist

projects/web_demo/web_demo/static/
*.tar
*.tar.gz
*.zip
venv*/
envs/
slurm_logs/

sync1.sh
data_preprocess_pj1
data-preparation1
__pycache__
*.log
*.pyc
.vscode
debug/
*.ipynb
.idea

# vscode history
.history

.DS_Store
.env

bad_words/
bak/

app/tests/*
temp/
tmp/
tmp
.vscode
.vscode/
ocr_demo
.coveragerc
/app/common/__init__.py
/magic_pdf/config/__init__.py
source.dev.env

tmp

projects/web/node_modules
projects/web/dist

projects/web_demo/web_demo/static/
cli_debug/
debug_utils/

# sphinx docs
_build/
5 changes: 3 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
args: ["--max-line-length=120", "--ignore=E131,E125,W503,W504,E203"]
args: ["--max-line-length=150", "--ignore=E131,E125,W503,W504,E203"]
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
Expand All @@ -12,11 +12,12 @@ repos:
rev: v0.32.0
hooks:
- id: yapf
args: ["--style={based_on_style: google, column_limit: 120, indent_width: 4}"]
args: ["--style={based_on_style: google, column_limit: 150, indent_width: 4}"]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.1
hooks:
- id: codespell
args: ['--skip', '*.json']
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
Expand Down
9 changes: 9 additions & 0 deletions docs/en/api.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Data API
------------------

.. toctree::
:maxdepth: 2

api/dataset.rst
api/data_reader_writer.rst
api/read_api.rst
44 changes: 44 additions & 0 deletions docs/en/api/data_reader_writer.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@

Data Reader Writer
--------------------

.. autoclass:: magic_pdf.data.data_reader_writer.DataReader
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.DataWriter
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter
:members:
:inherited-members:

22 changes: 22 additions & 0 deletions docs/en/api/dataset.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Dataset API
------------------

.. autoclass:: magic_pdf.data.dataset.PageableData
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.dataset.Dataset
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.dataset.ImageDataset
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.dataset.PymuDocDataset
:members:
:inherited-members:

.. autoclass:: magic_pdf.data.dataset.Doc
:members:
:inherited-members:
Empty file added docs/en/api/io.rst
Empty file.
6 changes: 6 additions & 0 deletions docs/en/api/read_api.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
read_api API
------------------

.. automodule:: magic_pdf.data.read_api
:members:
:inherited-members:
Empty file added docs/en/api/schemas.rst
Empty file.
1 change: 1 addition & 0 deletions docs/en/api/utils.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

12 changes: 12 additions & 0 deletions docs/en/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,15 @@ Welcome to the MinerU Documentation
<a class="github-button" href="https://github.com/opendatalab/MinerU/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/opendatalab/MinerU/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
</p>


API Reference
-------------

If you are looking for information on a specific function, class or
method, this part of the documentation is for you.

.. toctree::
:maxdepth: 2

api
5 changes: 5 additions & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
boto3>=1.28.43
loguru>=0.6.0
myst-parser
Pillow==8.4.0
pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9
sphinx
sphinx-argparse
sphinx-book-theme
Expand Down
7 changes: 7 additions & 0 deletions magic_pdf/config/enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

import enum


class SupportedPdfParseMethod(enum.Enum):
    """Enumerate the supported PDF parse methods.

    Every member's value is a lowercase string, so a member can be looked up
    from its string form, e.g. ``SupportedPdfParseMethod('ocr')``.
    """

    # String value: 'ocr'
    OCR = 'ocr'
    # String value: 'txt'
    TXT = 'txt'
32 changes: 32 additions & 0 deletions magic_pdf/config/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

class FileNotExisted(Exception):
    """Error describing a file path that does not exist."""

    def __init__(self, path):
        """Store the path this error refers to.

        Args:
            path: the path that ``__str__`` reports as not existing.
        """
        # Forward the argument to Exception so that ``args`` is populated.
        # With empty ``args`` the instance cannot be unpickled or copied,
        # because re-construction would call the class with no arguments.
        super().__init__(path)
        self.path = path

    def __str__(self):
        """Return the message 'File <path> does not exist.'."""
        return f'File {self.path} does not exist.'


class InvalidConfig(Exception):
    """Error carrying a detail message about an invalid configuration."""

    def __init__(self, msg):
        """Store the detail message.

        Args:
            msg: detail text that ``__str__`` appends after 'Invalid config: '.
        """
        # Forward the argument to Exception so that ``args`` is populated.
        # With empty ``args`` the instance cannot be unpickled or copied,
        # because re-construction would call the class with no arguments.
        super().__init__(msg)
        self.msg = msg

    def __str__(self):
        """Return the detail message prefixed with 'Invalid config: '."""
        return f'Invalid config: {self.msg}'


class InvalidParams(Exception):
    """Error carrying a detail message about invalid parameters."""

    def __init__(self, msg):
        """Store the detail message.

        Args:
            msg: detail text that ``__str__`` appends after 'Invalid params: '.
        """
        # Forward the argument to Exception so that ``args`` is populated.
        # With empty ``args`` the instance cannot be unpickled or copied,
        # because re-construction would call the class with no arguments.
        super().__init__(msg)
        self.msg = msg

    def __str__(self):
        """Return the detail message prefixed with 'Invalid params: '."""
        return f'Invalid params: {self.msg}'


class EmptyData(Exception):
    """Error carrying a detail message about empty data."""

    def __init__(self, msg):
        """Store the detail message.

        Args:
            msg: detail text that ``__str__`` appends after 'Empty data: '.
        """
        # Forward the argument to Exception so that ``args`` is populated.
        # With empty ``args`` the instance cannot be unpickled or copied,
        # because re-construction would call the class with no arguments.
        super().__init__(msg)
        self.msg = msg

    def __str__(self):
        """Return the detail message prefixed with 'Empty data: '."""
        return f'Empty data: {self.msg}'
Empty file added magic_pdf/data/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions magic_pdf/data/data_reader_writer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from magic_pdf.data.data_reader_writer.filebase import \
FileBasedDataReader # noqa: F401
from magic_pdf.data.data_reader_writer.filebase import \
FileBasedDataWriter # noqa: F401
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
MultiBucketS3DataReader # noqa: F401
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
MultiBucketS3DataWriter # noqa: F401
from magic_pdf.data.data_reader_writer.s3 import S3DataReader # noqa: F401
from magic_pdf.data.data_reader_writer.s3 import S3DataWriter # noqa: F401
from magic_pdf.data.data_reader_writer.base import DataReader # noqa: F401
from magic_pdf.data.data_reader_writer.base import DataWriter # noqa: F401
51 changes: 51 additions & 0 deletions magic_pdf/data/data_reader_writer/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

from abc import ABC, abstractmethod


class DataReader(ABC):
    """Abstract reader interface.

    Subclasses implement ``read_at``; ``read`` is provided here and simply
    delegates to ``read_at`` with its default offset and limit.
    """

    def read(self, path: str) -> bytes:
        """Read a file using the default offset and limit of ``read_at``.

        Args:
            path (str): the path of the file to read

        Returns:
            bytes: whatever ``read_at(path)`` returns
        """
        content = self.read_at(path)
        return content

    @abstractmethod
    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read part of a file, selected by offset and limit.

        Args:
            path (str): the path of the file
            offset (int, optional): the number of bytes to skip. Defaults to 0.
            limit (int, optional): the number of bytes to read. Defaults to -1.

        Returns:
            bytes: the content of the file
        """


class DataWriter(ABC):
    """Abstract writer interface.

    Subclasses implement ``write``; ``write_string`` is provided here and
    delegates to ``write`` after encoding the text.
    """

    @abstractmethod
    def write(self, path: str, data: bytes) -> None:
        """Write bytes to a file.

        Args:
            path (str): the target file to write to
            data (bytes): the data to write
        """

    def write_string(self, path: str, data: str) -> None:
        """Encode a string to bytes and write it through ``write``.

        Args:
            path (str): the target file to write to
            data (str): the text to write; encoded with ``str.encode()``
                defaults before being passed on
        """
        encoded = data.encode()
        self.write(path, encoded)
59 changes: 59 additions & 0 deletions magic_pdf/data/data_reader_writer/filebase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os

from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter


class FileBasedDataReader(DataReader):
    """Reader that loads bytes from files opened with the built-in ``open``."""

    def __init__(self, parent_dir: str = ''):
        """Remember the directory used to resolve relative paths.

        Args:
            parent_dir (str, optional): directory joined in front of relative
                paths passed to ``read_at``. Defaults to ''.
        """
        self._parent_dir = parent_dir

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read bytes from a file, starting at offset.

        Args:
            path (str): the path of the file; a relative path is joined with
                parent_dir when parent_dir is non-empty.
            offset (int, optional): the number of bytes to skip. Defaults to 0.
            limit (int, optional): the number of bytes to read; -1 reads to
                the end of the file. Defaults to -1.

        Returns:
            bytes: the content of the file
        """
        target = path
        if not os.path.isabs(target):
            # Only relative paths are re-rooted, and only when a parent
            # directory was configured.
            if len(self._parent_dir) > 0:
                target = os.path.join(self._parent_dir, path)

        with open(target, 'rb') as stream:
            stream.seek(offset)
            return stream.read() if limit == -1 else stream.read(limit)


class FileBasedDataWriter(DataWriter):
    """Writer that stores bytes in files opened with the built-in ``open``."""

    def __init__(self, parent_dir: str = '') -> None:
        """Remember the directory used to resolve relative paths.

        Args:
            parent_dir (str, optional): directory joined in front of relative
                paths passed to ``write``. Defaults to ''.
        """
        self._parent_dir = parent_dir

    def write(self, path: str, data: bytes) -> None:
        """Write data to a file, creating missing parent directories.

        Args:
            path (str): the path of the file; a relative path is joined with
                parent_dir when parent_dir is non-empty.
            data (bytes): the data to write; any existing file content is
                replaced because the file is opened in 'wb' mode.
        """
        fn_path = path
        if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
            fn_path = os.path.join(self._parent_dir, path)

        # Opening a file in a directory that does not exist raises
        # FileNotFoundError, so create the directory chain first. A bare
        # file name has an empty dirname and needs no directory.
        target_dir = os.path.dirname(fn_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with open(fn_path, 'wb') as f:
            f.write(data)
Loading

0 comments on commit 82dd7ac

Please sign in to comment.