Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
jstzwj committed Sep 5, 2024
2 parents ed9c0c3 + 6c07ec9 commit 7bf92bb
Show file tree
Hide file tree
Showing 21 changed files with 1,456 additions and 352 deletions.
2 changes: 2 additions & 0 deletions assets/full_configs.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ port = 8090
ssl-key = ""
ssl-cert = ""
repos-path = "./repos"
cache-size-limit = ""
cache-clean-strategy = "LRU"
hf-scheme = "https"
hf-netloc = "huggingface.co"
hf-lfs-netloc = "cdn-lfs.huggingface.co"
Expand Down
67 changes: 67 additions & 0 deletions docs/en/quickstart.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@

## Quick Start
Run the command in the console:
```bash
python -m olah.server
```

Then set the environment variable `HF_ENDPOINT` to the mirror site (here, http://localhost:8090).

Linux:
```bash
export HF_ENDPOINT=http://localhost:8090
```

Windows PowerShell:
```powershell
$env:HF_ENDPOINT = "http://localhost:8090"
```

From now on, all download operations in the Hugging Face library will be proxied through this mirror site.
```bash
pip install -U huggingface_hub
```

```python
from huggingface_hub import snapshot_download

snapshot_download(repo_id='Qwen/Qwen-7B', repo_type='model',
local_dir='./model_dir', resume_download=True,
max_workers=8)
```

Alternatively, you can download models and datasets with the Hugging Face CLI.

Download GPT2:
```bash
huggingface-cli download --resume-download openai-community/gpt2 --local-dir gpt2
```

Download WikiText:
```bash
huggingface-cli download --repo-type dataset --resume-download Salesforce/wikitext --local-dir wikitext
```

You can inspect the `./repos` directory, where Olah stores all cached datasets and models.

## Start the server
Run the command in the console:
```bash
python -m olah.server
```

Or you can specify the host address and listening port:
```bash
python -m olah.server --host localhost --port 8090
```
**Note: When changing the host or port, also set `--mirror-netloc` and `--mirror-lfs-netloc` to the actual address (`host:port`) of the mirror site.**
```bash
python -m olah.server --host 192.168.1.100 --port 8090 --mirror-netloc 192.168.1.100:8090
```

The default mirror cache path is `./repos`; you can change it with the `--repos-path` parameter:
```bash
python -m olah.server --host localhost --port 8090 --repos-path ./hf_mirrors
```

**Note that cached data cannot be migrated between versions. Please delete the cache folder before upgrading to the latest version of Olah.**
45 changes: 45 additions & 0 deletions docs/zh/quickstart.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
## 快速开始
在控制台运行以下命令:
```bash
python -m olah.server
```

然后将环境变量 `HF_ENDPOINT` 设置为镜像站点(这里是 http://localhost:8090)。

Linux:
```bash
export HF_ENDPOINT=http://localhost:8090
```

Windows PowerShell:
```powershell
$env:HF_ENDPOINT = "http://localhost:8090"
```

从现在开始,HuggingFace库中的所有下载操作都将通过此镜像站点代理进行。
```bash
pip install -U huggingface_hub
```

```python
from huggingface_hub import snapshot_download

snapshot_download(repo_id='Qwen/Qwen-7B', repo_type='model',
local_dir='./model_dir', resume_download=True,
max_workers=8)

```

或者您也可以使用 huggingface-cli 直接下载模型和数据集。

下载GPT2:
```bash
huggingface-cli download --resume-download openai-community/gpt2 --local-dir gpt2
```

下载WikiText:
```bash
huggingface-cli download --repo-type dataset --resume-download Salesforce/wikitext --local-dir wikitext
```

您可以查看路径`./repos`,其中存储了所有数据集和模型的缓存。
9 changes: 7 additions & 2 deletions olah/cache/stat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import argparse

# coding=utf-8
# Copyright 2024 XiaHan
#
# Use of this source code is governed by an MIT-style
# license that can be found in the LICENSE file or at
# https://opensource.org/licenses/MIT.

import argparse
import os
import sys
from olah.cache.olah_cache import OlahCache
Expand Down
20 changes: 16 additions & 4 deletions olah/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
# license that can be found in the LICENSE file or at
# https://opensource.org/licenses/MIT.

from typing import List, Optional
from typing import List, Literal, Optional, Union
import toml
import re
import fnmatch

from olah.utils.disk_utils import convert_to_bytes

DEFAULT_PROXY_RULES = [
{"repo": "*", "allow": True, "use_re": False},
{"repo": "*/*", "allow": True, "use_re": False},
Expand Down Expand Up @@ -78,22 +80,24 @@ class OlahConfig(object):
def __init__(self, path: Optional[str] = None) -> None:

# basic
self.host = "localhost"
self.host: Union[List[str], str] = "localhost"
self.port = 8090
self.ssl_key = None
self.ssl_cert = None
self.repos_path = "./repos"
self.cache_size_limit: Optional[int] = None
self.cache_clean_strategy: Literal["LRU", "FIFO", "LARGE_FIRST"] = "LRU"

self.hf_scheme: str = "https"
self.hf_netloc: str = "huggingface.co"
self.hf_lfs_netloc: str = "cdn-lfs.huggingface.co"

self.mirror_scheme: str = "http" if self.ssl_key is None else "https"
self.mirror_netloc: str = (
f"{self.host if self.host != '0.0.0.0' else 'localhost'}:{self.port}"
f"{self.host if self._is_specific_addr(self.host) else 'localhost'}:{self.port}"
)
self.mirror_lfs_netloc: str = (
f"{self.host if self.host != '0.0.0.0' else 'localhost'}:{self.port}"
f"{self.host if self._is_specific_addr(self.host) else 'localhost'}:{self.port}"
)

self.mirrors_path: List[str] = []
Expand All @@ -105,6 +109,12 @@ def __init__(self, path: Optional[str] = None) -> None:

if path is not None:
self.read_toml(path)

def _is_specific_addr(self, host: Union[List[str], str]) -> bool:
if isinstance(host, str):
return host not in ['0.0.0.0', '::']
else:
return False

def hf_url_base(self) -> str:
    """Build the base URL ``<hf_scheme>://<hf_netloc>`` from this config."""
    scheme, netloc = self.hf_scheme, self.hf_netloc
    return f"{scheme}://{netloc}"
Expand Down Expand Up @@ -134,6 +144,8 @@ def read_toml(self, path: str) -> None:
self.ssl_key = self.empty_str(basic.get("ssl-key", self.ssl_key))
self.ssl_cert = self.empty_str(basic.get("ssl-cert", self.ssl_cert))
self.repos_path = basic.get("repos-path", self.repos_path)
self.cache_size_limit = convert_to_bytes(basic.get("cache-size-limit", self.cache_size_limit))
self.cache_clean_strategy = basic.get("cache-clean-strategy", self.cache_clean_strategy)

self.hf_scheme = basic.get("hf-scheme", self.hf_scheme)
self.hf_netloc = basic.get("hf-netloc", self.hf_netloc)
Expand Down
33 changes: 27 additions & 6 deletions olah/database/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,37 @@

from olah.utils.olah_utils import get_olah_path



db_path = os.path.join(get_olah_path(), "database.db")
db = SqliteDatabase(db_path)


class BaseModel(Model):
    """Base model that binds its subclasses to the module-level ``db``."""

    class Meta:
        # ``db`` is the SqliteDatabase created above from ``db_path``.
        database = db


class User(BaseModel):
username = CharField(unique=True)
class Token(BaseModel):
token = CharField(unique=True)
first_dt = DateTimeField()
last_dt = DateTimeField()

class DownloadLogs(BaseModel):
    """Table of download log entries (one row per logged request, judging by the fields)."""

    # NOTE(review): `id` is declared explicitly as a unique CharField rather
    # than left to peewee's implicit primary key — confirm this is intended.
    id = CharField(unique=True)

    # Location of the downloaded file: owner, repository and path inside it.
    org = CharField()
    repo = CharField()
    path = CharField()

    # Byte range of the request — presumably offsets; TODO confirm whether
    # `range_end` is inclusive.
    range_start = BigIntegerField()
    range_end = BigIntegerField()

    # No default is set, so the writer must supply the timestamp.
    datetime = DateTimeField()
    token = CharField()

class FileLevelLRU(BaseModel):
    """Per-file timestamp table; the name suggests it backs LRU cache cleaning."""

    # Location of the cached file: owner, repository and path inside it.
    org = CharField()
    repo = CharField()
    path = CharField()

    # Defaults to the time the row is created. The right-hand side is evaluated
    # before the field name shadows the `datetime` module inside this class body.
    datetime = DateTimeField(default=datetime.datetime.now)

# Runs at import time: open the database connection and create the tables
# backing the models declared in this module.
db.connect()
db.create_tables([Token, DownloadLogs, FileLevelLRU])
44 changes: 41 additions & 3 deletions olah/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,17 @@ def error_repo_not_found() -> JSONResponse:
)


def error_page_not_found() -> JSONResponse:
    """Build the 404 response used when a requested page does not exist.

    Returns:
        A ``JSONResponse`` with status 404 whose JSON body (``error`` key) and
        ``x-error-message`` header carry the same human-readable message.
    """
    message = "Sorry, we can't find the page you are looking for."
    return JSONResponse(
        content={"error": message},
        headers={
            # NOTE(review): the code is "RepoNotFound" although this helper
            # reports a missing page — confirm this is intentional.
            "x-error-code": "RepoNotFound",
            "x-error-message": message,
        },
        status_code=404,
    )

def error_entry_not_found(branch: str, path: str) -> Response:
def error_entry_not_found_branch(branch: str, path: str) -> Response:
return Response(
headers={
"x-error-code": "EntryNotFound",
Expand All @@ -38,3 +39,40 @@ def error_entry_not_found(branch: str, path: str) -> Response:
status_code=404,
)

def error_entry_not_found() -> Response:
    """Build a body-less 404 response flagged ``EntryNotFound`` in its headers."""
    error_headers = {
        "x-error-code": "EntryNotFound",
        "x-error-message": "Entry not found",
    }
    return Response(headers=error_headers, status_code=404)

def error_revision_not_found(revision: str) -> JSONResponse:
    """Build the 404 response used when a revision id cannot be resolved.

    Args:
        revision: The revision identifier that was requested; it is echoed
            back in the error message.

    Returns:
        A ``JSONResponse`` with status 404. The JSON body (``error`` key) and
        the ``x-error-message`` header both contain
        ``"Invalid rev id: <revision>"``; ``x-error-code`` is
        ``RevisionNotFound``.
    """
    message = f"Invalid rev id: {revision}"
    return JSONResponse(
        content={"error": message},
        headers={
            "x-error-code": "RevisionNotFound",
            "x-error-message": message,
        },
        status_code=404,
    )

# Olah Custom Messages
def error_proxy_timeout() -> Response:
    """Build a body-less 504 response flagged ``ProxyTimeout`` in its headers."""
    error_headers = {
        "x-error-code": "ProxyTimeout",
        "x-error-message": "Proxy Timeout",
    }
    return Response(headers=error_headers, status_code=504)

def error_proxy_invalid_data() -> Response:
    """Build a body-less 504 response flagged ``ProxyInvalidData`` in its headers."""
    # NOTE(review): this shares status 504 with the timeout helper; 502 may be
    # the more conventional status for invalid upstream data — confirm before
    # changing, since clients may rely on the current value.
    error_headers = {
        "x-error-code": "ProxyInvalidData",
        "x-error-message": "Proxy Invalid Data",
    }
    return Response(headers=error_headers, status_code=504)
Loading

0 comments on commit 7bf92bb

Please sign in to comment.