diff --git a/src/content/Docs/_sequence.ts b/src/content/Docs/_sequence.ts index 233e1b64..3f2d5c7c 100644 --- a/src/content/Docs/_sequence.ts +++ b/src/content/Docs/_sequence.ts @@ -47,23 +47,76 @@ export const docsSequence = [ }, { label: "Praetor App", - subItems: [{ label: "What is Praetor App?" }], + subItems: [ + { label: "What is Praetor App?"}, + { label: "Akash Provider", + subItems: [ + { label: "Build a Cloud Provider"}, + { label: "Existing Kubernetes Cluster"}, + { label: "Single Server Provider"}, + { label: "Build a Cloud Provider (Windows Machine)"}, + { label: "Sudo User Requirements"}, + { label: "Decommission a Provider"}, + ], + }, + ], }, { label: "Sandbox", subItems: [{ label: "Sandbox Introduction" }] }, { label: "Stable Payment Deployments" }, + ], }, { label: "Guides", - subItems: [ - { label: "TLS Termination of Akash Deployments" }, - { label: "Kava RPC Node Deployment" }, - { label: "Chia on Akash" }, - { label: "Polygon on Akash" }, - { label: "Mine Raptoreum on Akash Network" }, - { label: "Unstoppable Web 2.0" }, - { label: "Multi-Tiered Deployment" }, - { label: "Helium Validator" }, + subItems: [ + { + label: "Machine Learning", + subItems: [ + { label: "OneClickLLM"}, + { label: "DeTrain"}, + { label: "Ray"}, + { label: "FLock.io"}, + { label: "vLLM"}, + { label: "Llama-2 70B"}, + { label: "Akash Chat API"}, + ], + }, + { + label: "Deployments", + subItems: [ + { label: "TLS Termination of Akash Deployments" }, + { label: "Multi-Tiered Deployment" }, + { label: "Ruby on Rails with Sia and Auth0"}, + { label: "AkashOS"}, + { label: "Jackal S3 Gateway"}, + { label: "Cloudflare"}, + ], + }, + { + label: "Tooling", + subItems: [ + { label: "PostgreSQL restore/backup"}, + { label: "Provider Utilization Dashboard"}, + { label: "Akash Playground"}, + ], + }, + { + label: "Blockchain", + subItems: [ + { label: "Kava RPC Node Deployment" }, + { label: "Chia on Akash" }, + { label: "Polygon on Akash" }, + { label: "Mine Raptoreum on Akash Network" }, + ], + }, + { + label: "Apps", + subItems: [ + { label: "Waku"}, + { label: "qBittorrent"}, + ], + }, + ], }, { diff --git a/src/content/Docs/assets/1_homepage.png b/src/content/Docs/assets/1_homepage.png new file mode 100644 index 00000000..0ebb44be Binary files /dev/null and b/src/content/Docs/assets/1_homepage.png differ diff --git a/src/content/Docs/assets/2_general_settings.png b/src/content/Docs/assets/2_general_settings.png new file mode 100644 index 00000000..db3b7af4 Binary files /dev/null and b/src/content/Docs/assets/2_general_settings.png differ diff --git a/src/content/Docs/assets/3_node_config.png b/src/content/Docs/assets/3_node_config.png new file mode 100644 index 00000000..14ec022f Binary files /dev/null and b/src/content/Docs/assets/3_node_config.png differ diff --git a/src/content/Docs/assets/4_tensor_nodes.png b/src/content/Docs/assets/4_tensor_nodes.png new file mode 100644 index 00000000..facfe944 Binary files /dev/null and b/src/content/Docs/assets/4_tensor_nodes.png differ diff --git a/src/content/Docs/assets/5_training_script.png b/src/content/Docs/assets/5_training_script.png new file mode 100644 index 00000000..e00d06d5 Binary files /dev/null and b/src/content/Docs/assets/5_training_script.png differ diff --git a/src/content/Docs/assets/6_review_and_train_model.png b/src/content/Docs/assets/6_review_and_train_model.png new file mode 100644 index 00000000..acd82978 Binary files /dev/null and b/src/content/Docs/assets/6_review_and_train_model.png differ diff --git 
a/src/content/Docs/assets/7_code_base.png b/src/content/Docs/assets/7_code_base.png new file mode 100644 index 00000000..4af5b264 Binary files /dev/null and b/src/content/Docs/assets/7_code_base.png differ diff --git a/src/content/Docs/assets/akashos.png b/src/content/Docs/assets/akashos.png new file mode 100644 index 00000000..9abe33e6 Binary files /dev/null and b/src/content/Docs/assets/akashos.png differ diff --git a/src/content/Docs/assets/architecture.png b/src/content/Docs/assets/architecture.png new file mode 100644 index 00000000..fc68daff Binary files /dev/null and b/src/content/Docs/assets/architecture.png differ diff --git a/src/content/Docs/assets/flock_io-stake.png b/src/content/Docs/assets/flock_io-stake.png new file mode 100644 index 00000000..9c4c2374 Binary files /dev/null and b/src/content/Docs/assets/flock_io-stake.png differ diff --git a/src/content/Docs/assets/flock_io_select.png b/src/content/Docs/assets/flock_io_select.png new file mode 100644 index 00000000..6e04cd22 Binary files /dev/null and b/src/content/Docs/assets/flock_io_select.png differ diff --git a/src/content/Docs/assets/llama_cpu_vram.png b/src/content/Docs/assets/llama_cpu_vram.png new file mode 100644 index 00000000..59e0fec8 Binary files /dev/null and b/src/content/Docs/assets/llama_cpu_vram.png differ diff --git a/src/content/Docs/assets/llama_gpu_vram.png b/src/content/Docs/assets/llama_gpu_vram.png new file mode 100644 index 00000000..c1dd17ec Binary files /dev/null and b/src/content/Docs/assets/llama_gpu_vram.png differ diff --git a/src/content/Docs/assets/llama_logs.png b/src/content/Docs/assets/llama_logs.png new file mode 100644 index 00000000..3ffd0bb5 Binary files /dev/null and b/src/content/Docs/assets/llama_logs.png differ diff --git a/src/content/Docs/assets/playground.png b/src/content/Docs/assets/playground.png new file mode 100644 index 00000000..cee87426 Binary files /dev/null and b/src/content/Docs/assets/playground.png differ diff --git a/src/content/Docs/assets/provider-stats1.png b/src/content/Docs/assets/provider-stats1.png new file mode 100644 index 00000000..1424c352 Binary files /dev/null and b/src/content/Docs/assets/provider-stats1.png differ diff --git a/src/content/Docs/assets/provider-stats2.png b/src/content/Docs/assets/provider-stats2.png new file mode 100644 index 00000000..ad5bbd68 Binary files /dev/null and b/src/content/Docs/assets/provider-stats2.png differ diff --git a/src/content/Docs/assets/qbittorent.png b/src/content/Docs/assets/qbittorent.png new file mode 100644 index 00000000..ac3acb21 Binary files /dev/null and b/src/content/Docs/assets/qbittorent.png differ diff --git a/src/content/Docs/assets/ray-akashconsole.png b/src/content/Docs/assets/ray-akashconsole.png new file mode 100644 index 00000000..364b6a99 Binary files /dev/null and b/src/content/Docs/assets/ray-akashconsole.png differ diff --git a/src/content/Docs/assets/ray-akashconsoleyml.png b/src/content/Docs/assets/ray-akashconsoleyml.png new file mode 100644 index 00000000..31a57e59 Binary files /dev/null and b/src/content/Docs/assets/ray-akashconsoleyml.png differ diff --git a/src/content/Docs/assets/ray-work.png b/src/content/Docs/assets/ray-work.png new file mode 100644 index 00000000..7d08373e Binary files /dev/null and b/src/content/Docs/assets/ray-work.png differ diff --git a/src/content/Docs/assets/ray.png b/src/content/Docs/assets/ray.png new file mode 100644 index 00000000..0b7d5afb Binary files /dev/null and b/src/content/Docs/assets/ray.png differ diff --git 
a/src/content/Docs/assets/ssh-tunnel.png b/src/content/Docs/assets/ssh-tunnel.png new file mode 100644 index 00000000..24f6c2d6 Binary files /dev/null and b/src/content/Docs/assets/ssh-tunnel.png differ diff --git a/src/content/Docs/assets/vllm.png b/src/content/Docs/assets/vllm.png new file mode 100644 index 00000000..8bf415e8 Binary files /dev/null and b/src/content/Docs/assets/vllm.png differ diff --git a/src/content/Docs/assets/vllm2.png b/src/content/Docs/assets/vllm2.png new file mode 100644 index 00000000..d3239601 Binary files /dev/null and b/src/content/Docs/assets/vllm2.png differ diff --git a/src/content/Docs/guides/apps/qbittorent/index.md b/src/content/Docs/guides/apps/qbittorent/index.md new file mode 100644 index 00000000..20690d24 --- /dev/null +++ b/src/content/Docs/guides/apps/qbittorent/index.md @@ -0,0 +1,46 @@ +--- +categories: ["Guides"] +tags: ["Apps", "p2p", "file-sharing"] +weight: 1 +title: "qBittorrent" +linkTitle: "qBittorrent" +--- + +The [qBittorrent](https://www.qbittorrent.org/) project aims to provide an open-source software alternative to µTorrent. The provided deployment configuration [(`deploy-ssh-tunnel.yaml`)](https://github.com/akash-network/awesome-akash/blob/master/qbittorrent/deploy-ssh-tunnel.yaml) enables a relatively secure means of torrenting on Akash via qBittorrent. Please use this responsibly. + +![](../../../assets/qbittorent.png) + +## Overview + +The deployment uses a custom image (`ghcr.io/spacepotahto/qbittorrent:1.0.0`, code [here](https://github.com/spacepotahto/docker-qbittorrent-server)) that bundles the qBittorrent client ([base image](https://github.com/linuxserver/docker-qbittorrent) provided by LinuxServer.io) with an HTTP file server. This allows the user to access the qBittorrent WebUI to download files to the Akash provider, and then to retrieve the downloaded files through the HTTP file server. `deploy.yaml` and `deploy-ssh-tunnel.yaml` (recommended) differ in how security is handled. + +## `deploy.yaml` + +This configuration deploys the qBittorrent client + File Server, and the user can access them using their web browser. However, by default the Akash deployment is accessible through HTTP only, so traffic between the user and the qBittorrent WebUI + file server is unencrypted. Thus, it's recommended to enable HTTPS. One way to do so is to use a custom domain with CloudFlare SSL as described in this [guide](https://teeyeeyang.medium.com/how-to-use-a-custom-domain-with-your-akash-deployment-5916585734a2) written by a community member. The downside is the traffic goes through CloudFlare. For personal use, using `deploy-ssh-tunnel.yaml` instead is recommended. + +### Usage + +Once deployed, the qBittorrent WebUI is accessible at `http://`. The default username and password are `admin` and `adminadmin`, respectively. They can be changed via the WebUI settings once authenticated. + +When downloading files using the WebUI, keep the default download location at `/downloads`. Once the files finish downloading, you can download the files to your computer by accessing the HTTP file server that serves `/downloads` at `http://:`. + +## `deploy-ssh-tunnel.yaml` (Recommended) + +Since this is likely for personal use only, it's also possible to utilize an SSH tunnel in lieu of HTTPS to establish an encrypted connection between the user and the qBittorrent WebUI + file server.
The diagram below illustrates how this works: + +![](../../../assets/ssh-tunnel.png) + +The qBittorrent WebUI + file server deployment is not exposed to the public internet, and is only accessible from the SSH-enabled deployment. The SSH-enabled deployment is only accessible by the user through an encrypted connection. Thus the user can securely access the qBittorrent WebUI + file server through SSH tunneling. + +### Usage + +The `deploy-ssh-tunnel.yaml` is configured to enable SSH tunneling (using the `ghcr.io/spacepotahto/openssh-server:1.0.0` image, code [here](https://github.com/spacepotahto/docker-openssh-server)), with the option to use password-based or key-based (generally safer) authentication through setting environment variables. Once deployed, you can establish the SSH tunnel with: + +``` + +ssh -p -N -L 8080:web:8080 -L 5000:web:5000 @ + +``` +Then in your browser, you can navigate to `http://localhost:8080` to access the qBittorrent WebUI. The default username and password are `admin` and `adminadmin`, respectively. They can be changed via the WebUI settings once authenticated. + +When adding torrent files or magnet links using the WebUI, keep the default download location at `/downloads`. Once the files finish downloading, you can download the files to your computer by accessing the HTTP file server that serves `/downloads` at `http://localhost:5000`. Due to the SSH tunneling, your file download is done through an encrypted connection. \ No newline at end of file diff --git a/src/content/Docs/guides/apps/waku/index.md b/src/content/Docs/guides/apps/waku/index.md new file mode 100644 index 00000000..1add6370 --- /dev/null +++ b/src/content/Docs/guides/apps/waku/index.md @@ -0,0 +1,118 @@ +--- +categories: ["Guides"] +tags: ["Apps", "Privacy", "Messaging"] +weight: 1 +title: "What is Waku?" +linkTitle: "Waku" +--- + +[Waku](https://waku.org/) is a family of robust, censorship-resistant communication protocols designed to enable privacy-focused messaging for web3 apps. + +In other words, Waku allows you to build decentralized applications which require any form of message transfer (e.g. chats, push notifications, event broadcasting, weak consensus/coordination, message queues). You can build your own application protocols on top of plug&play Waku protocols, which ensure your node will be well connected with the network and your messages will be broadcast reliably. + +Waku also includes protocols friendly to resource-restricted/adaptive devices like smartphones or laptops which may not have a lot of compute power or bandwidth. + +## Who is this deployment for? + +The deployment manifests in this folder target users who wish to support the Waku Network as a node operator (i.e. making the network more robust, decentralized and private), but also users who would like to build an application on top of Waku. + +We provide 2 deployment manifests - they both enable the `Relay`, `Lightpush`, `Filter` and `Store` protocols and connect to The Waku Network, but differ in the archive backend for the `Store` protocol. + +The [deploy.sqlite.yaml](https://github.com/akash-network/awesome-akash/blob/master/waku/deploy.sqlite.yaml) is the simpler of the two: it leverages SQLite for the archive and hence results in a simpler and slightly cheaper deployment. + +The [deploy.yaml](https://github.com/akash-network/awesome-akash/blob/master/waku/deploy.yaml) deploys a PostgreSQL database instance in addition to the `nwaku` node.
+ +## How to + +Waku nodes connecting to The Waku Network require access to an Ethereum (Sepolia Testnet) RPC endpoint. You can obtain one from a myriad of node providers like Infura, Chainstack, Quicknode, or run one yourself (even on Akash:)). + +The RPC node URL needs to be configured in the `ETH_CLIENT_ADDRESS` environment variable of the deployment manifest. + +Another mandatory environment variable is `IP_ADDR`. This is the IP address leased for this deployment. Sadly, it cannot be obtained before the actual deployment happens, nor can it be resolved from inside the deployment. This means that after deploying first with an empty `IP_ADDR`, you need to get the address of the IP lease and **update the deployment** by adding the value to the environment variable. Only after you do that will the node be able to properly announce itself to the network so that other peers can connect to it. + +### Deploying as Relayer + +If you just want to support the network without the intent to publish your own messages, you can simply add the RPC node URL and submit the deployment manifest. After the node syncs the RLN membership tree it will start relaying messages. You don't need to do anything else. + +### Deploying as Publisher + +In case your goal is to use this node to publish messages, you will need to obtain an RLN membership yourself. For that you can use a helper script in [`nwaku-compose`](https://github.com/waku-org/nwaku-compose/blob/master/register_rln.sh) which allows you to register the membership and produces a `keystore.json` file. You will need to encode the content of this file with `base64` encoding and pass it into the deployment file as `RLN_RELAY_CRED_BASE64`. + +### Deploying with Postgres + +This deployment not only deploys the Waku node, but also a Postgres database serving as an archive for the store protocol. The manifest including the Postgres instance is available in [`deploy.yaml`](https://github.com/akash-network/awesome-akash/blob/master/waku/deploy.yaml). + +You may want to change the `POSTGRES_PASSWORD` in `services.node.env` and `services.postgres.env` (make sure both values match!). + +### Deploying with Permanent PeerId + +PeerID is part of the node's identity. It is based on a private key which is randomly generated by default. In case you want or need to hardcode your node in your application, you will need a stable/permanent PeerID. This is achieved by providing a value in the `NODEKEY` environment variable. You can generate a node key by running + +``` + +openssl rand -hex 32 + +``` + +### Deploying with Domain Name + +There is an environment variable `DOMAIN` available in the manifests which allows you to provide a domain name which can be used to address your node. This requires a 2-step deployment as we cannot predict the IP lease before the actual instance is deployed. + +1. Deploy one of the manifests with an empty `DOMAIN`. + +2. Check the IP leased to your deployment. + +3. Create an A record for the desired domain name and the leased IP. + +4. Update your deployment with the `DOMAIN` value. + +After applying the updated deployment, you should see Let's Encrypt certificates being provisioned in the logs and then the node starting and relaying messages. + +### After You Deploy + +Two important pieces of information about your node are the node's multiaddresses and ENR. You can find both early in the log output after the deployment. 
+ +``` + +[node]: INF 2023-08-14 13:59:38.175+00:00 PeerInfo topics="waku node" tid=1 file=waku_node.nim:796 peerId=16U*9q9fq5 addrs=@[] +[node]: INF 2023-08-14 13:59:38.175+00:00 Listening on topics="waku node" tid=1 file=waku_node.nim:803 full=[/dns4/waku.myrandomdemos.online/tcp/60000/p2p/16Uiu2HAmDdZ1brt7nq717ugWSK1EcGdaxUMVmHeVFzcPGb9q9fq5][/dns4/waku.myrandomdemos.online/tcp/8000/wss/p2p/16Uiu2HAmDdZ1brt7nq717ugWSK1EcGdaxUMVmHeVFzcPGb9q9fq5] +[node]: INF 2023-08-14 13:59:38.175+00:00 DNS: discoverable ENR topics="waku node" tid=1 file=waku_node.nim:804 enr=enr:-OG4QIzHr0Xd9OJVY3cDxqmDvwprccDQcRL0km9LR-q0MnjwFXZsqri_mnFwECqzVOxi78YierreeH9DUyYpdCeWZvIBgmlkgnY0gmlwhLhporWKbXVsdGlhZGRyc7hCAB42GXdha3UubXlyYW5kb21kZW1vcy5vbmxpbmUG6mAAIDYZd2FrdS5teXJhbmRvbWRlbW9zLm9ubGluZQYfQN4DiXNlY3AyNTZrMaEDDn10Z_V6Qh_BJV0BA_Y7wuTaApavCGi0WiIoZkMlGXyDdGNwgupgg3VkcIIjLYV3YWt1Mg8 + +``` + +You can verify that your deployment was successful and your node is reachable by connecting to it using [`wakucanary`](https://github.com/waku-org/nwaku/releases/latest) tool or, if you used the full deployment, by connecting via WSS using one of the [js-waku-examples](https://examples.waku.org/light-js/). + +You can monitor your deployment with Prometheus and use a Grafana dashboard available in https://github.com/waku-org/nwaku-compose/tree/master/monitoring. + +You should see basic metrics printed in logs as well: + +``` + +[node]: INF 2023-08-15 10:39:37.807+00:00 Relay peer connections topics="waku node peer_manager" tid=1 file=peer_manager.nim:683 inRelayConns=4/160 outRelayConns=17/80 totalConnections=24/300 notConnectedPeers=137 outsideBackoffPeers=5 +[node]: INF 2023-08-15 10:39:39.460+00:00 Finished dialing multiple peers topics="waku node peer_manager" tid=1 file=peer_manager.nim:532 successfulConns=0 attempted=3 +[node]: INF 2023-08-15 10:39:40.532+00:00 Total connections initiated topics="waku node metrics" tid=1 file=waku_metrics.nim:56 count=0.0 +[node]: INF 2023-08-15 10:39:40.532+00:00 Total messages topics="waku node metrics" tid=1 file=waku_metrics.nim:57 count=231479.0 +[node]: INF 2023-08-15 10:39:40.532+00:00 Total store peers topics="waku node metrics" tid=1 file=waku_metrics.nim:58 count=0.0 +[node]: INF 2023-08-15 10:39:40.532+00:00 Total peer exchange peers topics="waku node metrics" tid=1 file=waku_metrics.nim:59 count=0.0 +[node]: INF 2023-08-15 10:39:40.532+00:00 Total lightpush peers topics="waku node metrics" tid=1 file=waku_metrics.nim:60 count=0.0 +[node]: INF 2023-08-15 10:39:40.532+00:00 Total filter peers topics="waku node metrics" tid=1 file=waku_metrics.nim:61 count=0.0 +[node]: INF 2023-08-15 10:39:40.532+00:00 Total active filter subscriptions topics="waku node metrics" tid=1 file=waku_metrics.nim:62 count=0.0 +[node]: INF 2023-08-15 10:39:40.532+00:00 Total errors topics="waku node metrics" tid=1 file=waku_metrics.nim:63 count=1.0 +[node]: INF 2023-08-15 10:40:10.532+00:00 Total connections initiated topics="waku node metrics" tid=1 file=waku_metrics.nim:56 count=0.0 + +``` + +Notice the `inRelayConns` and `outRelayConns` - those represent the number of nodes in peer-to-peer relay network you are connected to. + +Notice the `Total messages` metric - this represents how many messages went through your node. 
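
As an aside to the "Deploying as Publisher" section above: the `RLN_RELAY_CRED_BASE64` value is simply the base64-encoded keystore, so it can be produced with standard tooling. A minimal sketch, assuming the registration script left its output in a file named `keystore.json` (filename is an assumption; GNU coreutils flags shown):

```
# Encode the RLN keystore as a single base64 line suitable for the
# RLN_RELAY_CRED_BASE64 environment variable in the deployment manifest.
base64 -w0 keystore.json
```

Paste the resulting string into the manifest before deploying as a Publisher.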
+ +## Links + +- [Nwaku Akash repository](https://github.com/vpavlin/nwaku-akash) + +- [nwaku](https://github.com/waku-org/nwaku) + +- [Waku Docs](https://docs.waku.org/) + +- [js-waku](https://github.com/waku-org/js-waku) + diff --git a/src/content/Docs/guides/chia-on-akash/index.md b/src/content/Docs/guides/blockchain/chia-on-akash/index.md similarity index 99% rename from src/content/Docs/guides/chia-on-akash/index.md rename to src/content/Docs/guides/blockchain/chia-on-akash/index.md index b8ce0d1d..3cf45452 100644 --- a/src/content/Docs/guides/chia-on-akash/index.md +++ b/src/content/Docs/guides/blockchain/chia-on-akash/index.md @@ -12,7 +12,7 @@ Welcome [Chia](https://www.chia.net/) community! We are excited to announce supp ## Summer Sale -![](../../assets/SummerSaleforChiaPlots5.png) +![](../../../assets/SummerSaleforChiaPlots5.png) ### Providers @@ -283,7 +283,7 @@ deployment: To access the Chia Plot Manager, click on the \`Uri\` link on the deployment detail page.\ To download plots, click an invididual plot in the Chia Plot Manager and click on Download/Open. -![Chia Plot Manager](<../../assets/image (7).png>) +![Chia Plot Manager](<../../../assets/image (7).png>) \*Once your download has finished - Delete the plot from the container - to make room for new plots! Plots will continue to be created as long as there is enough free space available in the container (Max 32Tb) and the deployment is fully funded. diff --git a/src/content/Docs/guides/kava-rpc-node-deployment/index.md b/src/content/Docs/guides/blockchain/kava-rpc-node-deployment/index.md similarity index 89% rename from src/content/Docs/guides/kava-rpc-node-deployment/index.md rename to src/content/Docs/guides/blockchain/kava-rpc-node-deployment/index.md index a4ef17e1..95e34d70 100644 --- a/src/content/Docs/guides/kava-rpc-node-deployment/index.md +++ b/src/content/Docs/guides/blockchain/kava-rpc-node-deployment/index.md @@ -33,7 +33,7 @@ The Akash Console can be accessed [here](https://console.akash.network/). - If Keplr is not installed as a browser extension and/or a funded Akash account is not available, follow the instructions in our [Keplr Guide](/docs/getting-started/token-and-wallets/#keplr-wallet) - Select the desired Akash account in Keplr and then select the `Connect Wallet` option within the Akash Console as shown below -![](../../assets/akashConsoleWallet.png) +![](../../../assets/akashConsoleWallet.png) ## Kava RPC Node Deployment @@ -41,15 +41,15 @@ The Akash Console can be accessed [here](https://console.akash.network/). - Within the Akash Console template gallery, locate the Kava card and select the `Deploy Now` option -![](../../assets/akashConsoleDeployment.png) +![](../../../assets/akashConsoleDeployment.png) - Proceed with the deployment of the Kava Node by selecting the `Deploy Now` option -![](../../assets/akashConsoleProceedWithDeployment.png) +![](../../../assets/akashConsoleProceedWithDeployment.png) - Assign the Deployment an appropriate name and then click `Review SDL` -![](../../assets/akashConsoleEditSDL.png) +![](../../../assets/akashConsoleEditSDL.png) - The Kava RPC Node snapshot is updated every 24 hours and must be changed in the Akash SDL - Obtain the latest snapshot URL [here](https://polkachu.com/tendermint_snapshots/kava). Find the `DOWNLOAD` hyperlink > right click > and Copy Link Address. @@ -59,21 +59,21 @@ The Akash Console can be accessed [here](https://console.akash.network/). - Select `Save & Close` when this single Akash SDL update is in place. 
-![](../../assets/akashConsoleSnapshotUpdate.png) +![](../../../assets/akashConsoleSnapshotUpdate.png) - Proceed by selecting `Create Deployment` -![](../../assets/akashConsoleCreateDeployment.png) +![](../../../assets/akashConsoleCreateDeployment.png) - The Akash Console will conduct necessary pre-deployment verifications to ensure that a wallet is connected with sufficient funds and that a certificate exists to communicate with the deployment - If all pre-deployment checks pass, select the `Next` option to proceed -![](../../assets/akashConsolePreflightCheck.png) +![](../../../assets/akashConsolePreflightCheck.png) - A Keplr wallet prompt will display requesting approval of a small blockchain fee to proceed with deployment creation - Select the `Approve` option to proceed -![](../../assets/akashConsoleDeploymentFees.png) +![](../../../assets/akashConsoleDeploymentFees.png) - The Akash open marketplace displays available cloud providers to deploy your Kava RPC Node on - Select the cloud provider of your preference @@ -81,18 +81,18 @@ The Akash Console can be accessed [here](https://console.akash.network/). > _**NOTE -**_ the cloud providers available for your deployment may be different than those shown in the example below -![](../../assets/akashConsoleSelectProvider.png) +![](../../../assets/akashConsoleSelectProvider.png) - Accept the Keplr prompt to approve small blockchain fee for lease creation with the selected cloud provider -![](../../assets/akashConsoleLeaseFees.png) +![](../../../assets/akashConsoleLeaseFees.png) ## Kava RPC Node Deployment Complete - When the deployment of the Kava RPC Node is complete and live on the selected cloud provider, a verification screen will display - Proceed to the [Kava RPC Node Health Check](#kava-rpc-node-health-check) section to conduct a health/status check of the node -![](../../assets/akashConsoleLeaseStatus.png) +![](../../../assets/akashConsoleLeaseStatus.png) ## Kava RPC Node Health Check @@ -101,11 +101,11 @@ The Akash Console can be accessed [here](https://console.akash.network/). - In the Akash Console a URL for the deployment is displayed - Click on the URL hyperlink -![](../../assets/akashConsoleDeploymentURI.png) +![](../../../assets/akashConsoleDeploymentURI.png) - From the displayed web page, select the `status` link to view the current state of the Kava RPC Node -![](../../assets/akashConsoleNodeStatus.png) +![](../../../assets/akashConsoleNodeStatus.png) #### Expected Status When Node is in Sync diff --git a/src/content/Docs/guides/mine-raptoreum-on-akash/index.md b/src/content/Docs/guides/blockchain/mine-raptoreum-on-akash/index.md similarity index 98% rename from src/content/Docs/guides/mine-raptoreum-on-akash/index.md rename to src/content/Docs/guides/blockchain/mine-raptoreum-on-akash/index.md index 63ba0c66..d6d2f368 100644 --- a/src/content/Docs/guides/mine-raptoreum-on-akash/index.md +++ b/src/content/Docs/guides/blockchain/mine-raptoreum-on-akash/index.md @@ -7,7 +7,7 @@ linkTitle: "Mine Raptoreum on Akash Network" description: How to Mine Raptoreum (RTM) on Akash Network --- -![](../../assets/raptoreumAkashlytics.png) +![](../../../assets/raptoreumAkashlytics.png) ## Why use Akash? @@ -102,7 +102,7 @@ deployment: Akash is a marketplace of compute. Providers set their own prices for compute resources. We recommend you try different providers and check your logs after deployment to determine the hashrate. -![](<../../assets/chooseProvider (1).png>) +![](<../../../assets/chooseProvider (1).png>) ## How to speed up mining? 
diff --git a/src/content/Docs/guides/polygon-on-akash/index.md b/src/content/Docs/guides/blockchain/polygon-on-akash/index.md similarity index 100% rename from src/content/Docs/guides/polygon-on-akash/index.md rename to src/content/Docs/guides/blockchain/polygon-on-akash/index.md diff --git a/src/content/Docs/guides/deployments/akashos/index.md b/src/content/Docs/guides/deployments/akashos/index.md new file mode 100644 index 00000000..eaafb62c --- /dev/null +++ b/src/content/Docs/guides/deployments/akashos/index.md @@ -0,0 +1,173 @@ +--- +categories: ["Guides"] +tags: ["Deployment"] +weight: 1 +title: "AkashOS: Akash Provider OS - Ubuntu Server 24.04 Edition" +linkTitle: "AkashOS" +--- + + +![](../../../assets/akashos.png) + + +[AkashOS](https://github.com/cryptoandcoffee/akashos/releases) is an innovative solution for those aspiring to become a provider within the Akash Network. By utilizing Autoinstall and cloud-init, AkashOS offers a seamless, unattended installation of Ubuntu Server. It autonomously establishes a Kubernetes cluster and deploys Helm charts to configure the system as an Akash Network provider. + +After installation, users can configure the provider via a user-friendly Dashboard/GUI or through SSH, providing a versatile approach to provider configuration. The installation process is designed to be intuitive, requiring users to answer a few straightforward questions, ensuring a smooth setup experience. + +## 🌟 Become a Provider with Ease! + +Start your journey as a provider with a minimal investment of 25 AKT, valued at $75 at the time of writing. Explore unlimited earning possibilities within the Akash Network. + +- 🧮 **Estimate Your Earnings:** Curious about what your hardware could be earning? Check out the [Akash Calculator](https://akashcalcualtor.com)! +- 📊 **Explore Existing Provider Earnings:** Discover what existing providers are earning in real-time on the Akash Network at [Akash Dash](https://akashdash.com). + + +## 🛠 Quick & Easy Setup! +Download and attach the latest AkashOS Release ISO to your chosen hardware: Bare-Metal, VPS, or Virtual Machine. and watch it transform into a provider on the Akash Network! + +## 💡 Why AkashOS? +- **Streamlined & Automated:** Effortlessly install Ubuntu Server and configure your system with this automated setup. +- **Infinite Earnings:** Unlock unparalleled earning potential as a provider. +- **Versatile Application:** Compatible with various setups, ensuring everyone can join. + +## What is this image best used for? + +You can use this image to take over any x86 machine or virtual machine that you want to configure as a provider on the Akash Network. + +## Target Audience for this ISO + +You should be familiar with at least one of the following: + +1. Hypervisor (Proxmox/VMware) +2. Homelab +3. Unraid/TrueNas +4. DevOps/SRE/Kubernetes Administration +4. Full-stack development + +## Installation Difficulty Level + +### Medium (terminal experience required) + +Human requirements, estimated time: ~30 minutes + +- Acquire at least 50 AKT +- Add DNS records +- Forward ports + +Software requirements, estimated time: ~30 minutes + +- Install [AkashOS](https://github.com/cryptoandcoffee/akashos/releases) +- Configure Pricing + +## Dependencies + +### Human Requirements + +1. Be ready to run workloads for dWeb. Understand what you are getting into and be prepared to learn. +2. Docker and Kubernetes experience will greatly help you, learn all you can. +3. With great power comes great responsibility. 
Be aware of the risks and use [Lens](https://k8slens.dev/) to monitor your cluster. +4. Report any offending wallet if you experience any abuse, DDoS, spam, or other issues to [Akash](https://discord.akash.network/). + +### Software Requirements + +1. Domain name (example.com) that you own and can manage DNS records. +2. 50 AKT to send to the new provider's wallet. +3. Access to your firewall/router for port forwarding. +4. [Lens](https://k8slens.dev/) - Recommended for cluster daily operations; you'll need this to interact with your new cluster. +5. One of [Balena Etcher](https://www.balena.io/etcher/), [Rufus](https://rufus.ie/), or [Ventoy](https://www.ventoy.net/en/index.html) for creating bootable USB drives on Linux, Mac, PC. +6. Dynamic DNS update client and domain for residential IPs. + +### Minimum Hardware Requirements +**First Node** + +- 2 CPU / 4 Threads +- 8 GB Memory +- 64 GB Disk + +**Additional Nodes** + +- 1 CPU +- 2 GB Memory +- 8 GB Disk + +### Setup Instructions + +**Proxmox / VirtualBox / VMware** + +1. Download the [AkashOS ISO](https://github.com/cryptoandcoffee/akashos/releases). +2. Create VM - Attach a disk drive with the ISO. +3. Start the VM. +4. Reboot when the install is completed and detach the ISO. +5. Login with default username and password (both of which are `akash`) and then follow the on-screen instructions. +6. Once the system has rebooted, go to the Control Panel. +7. Update the provider attributes with the recommended values and click Save. +8. Click STOP next to Provider. +9. Click "Re-Deploy Provider" Button. +10. Send at least 5 AKT to the new wallet address to start the provider. +11. Click Download Kubeconfig and import it into Lens. Set the Namespace to "All" to see everything. + + +**Bare Metal Datacenter with IPMI/ISO Support** + +1. Download the [AkashOS ISO](https://github.com/cryptoandcoffee/akashos/releases). +2. Upload the ISO to the datacenter ISO storage location (Vultr/HostHatch/etc) or attach the ISO to your IPMI Virtual Console Session. +3. Start the machine with the ISO for the boot drive (F11 may be required). +4. Reboot when the install is completed and detach the ISO. +5. Login with default username and password (both of which are `akash`) and then follow the on-screen instructions. +6. Once the system has rebooted, go to the Control Panel. +7. Update the provider attributes with the recommended values and click Save. +8. Click STOP next to Provider. +9. Click "Re-Deploy Provider" Button. +10. Send at least 5 AKT to the new wallet address to start the provider. +11. Click "Download Kubeconfig" and import it into Lens. Set the Namespace to "All" to see everything. + +**USB Key** + +1. Download the [AkashOS ISO](https://github.com/cryptoandcoffee/akashos/releases). +2. Use one of [Balena Etcher](https://www.balena.io/etcher/), [Rufus](https://rufus.ie/), or [Ventoy](https://www.ventoy.net/en/index.html) to write the ISO to a USB key. +3. Insert the USB key into the computer you want to make an Akash provider. +4. Start the machine with the USB key for the boot drive (F11 may be required). +5. Reboot when the install is completed and unplug the USB key. +6. Login with default username and password (both of which are `akash`) and then follow the on-screen instructions. +7. Once the system has rebooted, go to the Control Panel. +8. Update the provider attributes with the recommended values and click Save. +9. Click STOP next to Provider. +10. Click "Re-Deploy Provider" Button. +11. 
Send at least 5 AKT to the new wallet address to start the provider. +12. Click "Download Kubeconfig" and import it into Lens. Set the Namespace to "All" to see everything. + +### Todos +- When changing pricing parameters, delete the configmap `akash-provider-bidscripts` from `akash-services` before re-deploying. +- Remove the static/dynamic question during the initial boot as it may confuse users. +- Show nodes in the cluster on the Dashboard with `kubectl get nodes -A -o wide`. +- Allow adding a new node to the cluster with just an IP address. +- Remove the question for adding a node to the cluster for the original IP; all add/remove operations should happen from the Dashboard only. +- Update `run-helm-k3s` to use functions so each can be called separately. +- Update the bid-engine script with the latest version. +- Add/Remove Attributes from Dashboard and default GPU, etc. + + +### Stack + +``` + +         Akash Provider +               || +        -------------- +       | Helm Charts  | +        -------------- +               || +        -------------- +       | Kubernetes   | +        -------------- +               || +        -------------- +       | cloud-init   | +        -------------- +               || +  -------------------------- + | Ubuntu 22.04 AutoInstall | +  -------------------------- + + ``` \ No newline at end of file diff --git a/src/content/Docs/guides/deployments/cloudflare/index.md b/src/content/Docs/guides/deployments/cloudflare/index.md new file mode 100644 index 00000000..41e2cffb --- /dev/null +++ b/src/content/Docs/guides/deployments/cloudflare/index.md @@ -0,0 +1,7 @@ +--- +categories: ["Guides"] +tags: ["HTTPS/SSL", "Custom Domain", "Cloudflare"] +weight: 1 +title: "Using Cloudflare to enable custom domain names and HTTPS/SSL" +linkTitle: "Cloudflare" +--- \ No newline at end of file diff --git a/src/content/Docs/guides/deployments/jackal-s3/index.md b/src/content/Docs/guides/deployments/jackal-s3/index.md new file mode 100644 index 00000000..53620e5d --- /dev/null +++ b/src/content/Docs/guides/deployments/jackal-s3/index.md @@ -0,0 +1,129 @@ +--- +categories: ["Guides"] +tags: ["Deployment"] +weight: 1 +title: "Jackal S3-Compatible Node.js File Storage Server" +linkTitle: "Jackal S3 Gateway" +--- + +[This project](https://github.com/One-Punch-Cloud/Jackal-S3-Gateway) provides an S3-compatible Node.js file storage server that interacts with the Jackal filesystem. The server supports operations like creating buckets, uploading files, downloading files, listing files, and deleting files, all secured with simple authentication. + +## Features + +- S3-compatible API endpoints. + +- Authentication using `ACCESS_KEY_ID` and `SECRET_ACCESS_KEY`. + +- File operations are performed directly on the Jackal filesystem. + +## Prerequisites + +- Node.js and npm + +- A Jackal mnemonic and network information + +## Getting Started + +### Installation + +1. Clone the repository: + +``` + +git clone https://github.com/yourusername/jackal-s3-server.git +cd jackal-s3-server + +``` + +2. Install the required packages: + +``` + +npm install + +``` + +3. 
Create a .env file in the root directory with the following content: + +``` + +SIGNER_CHAIN=lupulella-2 +MNEMONIC=your_jackal_mnemonic +QUERY_ADDR=https://testnet-grpc.jackalprotocol.com +TX_ADDR=https://testnet-rpc.jackalprotocol.com +ACCESS_KEY_ID=your_access_key_id +SECRET_ACCESS_KEY=your_secret_access_key + +``` + +### Running the Server + +Start the server with: + +``` + +node server.js + +``` + +### API Endpoints + +#### List Buckets + +GET / + +``` + +curl -H "x-access-key-id: your_access_key_id" -H "x-secret-access-key: your_secret_access_key" http://localhost:3000/ + +``` + +#### Create Bucket + +PUT /:bucket + +``` + +curl -X PUT -H "x-access-key-id: your_access_key_id" -H "x-secret-access-key: your_secret_access_key" http://localhost:3000/your-bucket-name + +``` + +#### List Objects in Bucket + +GET /:bucket + +``` + +curl -H "x-access-key-id: your_access_key_id" -H "x-secret-access-key: your_secret_access_key" http://localhost:3000/your-bucket-name + +``` + +#### Upload File + +PUT /:bucket/:key + +``` + +curl -X PUT -H "x-access-key-id: your_access_key_id" -H "x-secret-access-key: your_secret_access_key" -F "file=@/path/to/your/file" http://localhost:3000/your-bucket-name/your-file-name + +``` + +#### Download File + +GET /:bucket/:key + +``` + +curl -H "x-access-key-id: your_access_key_id" -H "x-secret-access-key: your_secret_access_key" -o /path/to/save/file http://localhost:3000/your-bucket-name/your-file-name + +``` + +#### Delete File [not implemented yet] + +DELETE /:bucket/:key + +``` + +curl -X DELETE -H "x-access-key-id: your_access_key_id" -H "x-secret-access-key: your_secret_access_key" http://localhost:3000/your-bucket-name/your-file-name + +``` \ No newline at end of file diff --git a/src/content/Docs/guides/multi-tiered-deployments/index.md b/src/content/Docs/guides/deployments/multi-tiered-deployments/index.md similarity index 99% rename from src/content/Docs/guides/multi-tiered-deployments/index.md rename to src/content/Docs/guides/deployments/multi-tiered-deployments/index.md index 1e765e95..8b957e04 100644 --- a/src/content/Docs/guides/multi-tiered-deployments/index.md +++ b/src/content/Docs/guides/deployments/multi-tiered-deployments/index.md @@ -1,5 +1,5 @@ --- -categories: ["Guides"] +categories: ["Deployments"] tags: ["Blockchain"] weight: 1 title: "Multi-Tiered Deployment" diff --git a/src/content/Docs/guides/ruby-on-rails-with-sia-and-auth0/index.md b/src/content/Docs/guides/deployments/ruby-on-rails-with-sia-and-auth0/index.md similarity index 99% rename from src/content/Docs/guides/ruby-on-rails-with-sia-and-auth0/index.md rename to src/content/Docs/guides/deployments/ruby-on-rails-with-sia-and-auth0/index.md index 9fc2afeb..f00e2c5e 100644 --- a/src/content/Docs/guides/ruby-on-rails-with-sia-and-auth0/index.md +++ b/src/content/Docs/guides/deployments/ruby-on-rails-with-sia-and-auth0/index.md @@ -1,5 +1,5 @@ --- -categories: ["Guides"] +categories: ["Deployments"] tags: ["Blockchain"] weight: 1 title: "Ruby on Rails with Sia and Auth0" diff --git a/src/content/Docs/guides/tls-termination-of-akash-deployment/index.md b/src/content/Docs/guides/deployments/tls-termination-of-akash-deployment/index.md similarity index 99% rename from src/content/Docs/guides/tls-termination-of-akash-deployment/index.md rename to src/content/Docs/guides/deployments/tls-termination-of-akash-deployment/index.md index a7bdbae2..0bfdae80 100644 --- a/src/content/Docs/guides/tls-termination-of-akash-deployment/index.md +++ 
b/src/content/Docs/guides/deployments/tls-termination-of-akash-deployment/index.md @@ -1,5 +1,5 @@ --- -categories: ["Guides"] +categories: ["Deployments"] tags: ["Blockchain"] weight: 1 title: "TLS Termination of Akash Deployments" diff --git a/src/content/Docs/guides/machine-learning/detrain/index.md b/src/content/Docs/guides/machine-learning/detrain/index.md new file mode 100644 index 00000000..a4367847 --- /dev/null +++ b/src/content/Docs/guides/machine-learning/detrain/index.md @@ -0,0 +1,199 @@ +--- +categories: ["Guides"] +tags: ["AI/ML", "Training", "Framework"] +weight: 1 +title: "DeTrain" +linkTitle: "DeTrain" +--- + +## Introduction + +[DeTrain](https://github.com/a2nfinance/detrain) is a framework for distributed training and model parallelism (MP). DeTrain includes tools and lightweight libraries using PyTorch to simplify distributed training pipelines. + +## Demo information + +- [Frontend Application](https://detrain-console.a2n.finance/) + +- [Demo video](https://www.youtube.com/watch?v=YaSvU51iQg0) + +- [DeTrain Agent Docker image](https://hub.docker.com/r/a2nfinance/detrain-nodes) + +- [DeTrain Python library](https://pypi.org/project/detrain/) + +## Repositories + +DeTrain's different components will later be maintained in separate repositories, namely: + +- [DeTrain Python Library](https://github.com/a2nfinance/detrain-python-lib) + +- [DeTrain Console](https://github.com/a2nfinance/detrain-console) + +- [DeTrain Agent](https://github.com/a2nfinance/detrain-agent) + +- [DeTrain Examples](https://github.com/a2nfinance/detrain-example) + +## App Walkthrough Screenshots + +1. **Home page (My Pipelines)** + + +![](../../../assets/1_homepage.png) + + +2. **New pipeline - General settings** + +![](../../../assets/2_general_settings.png) + +3. **New pipeline - PP training - Nodes settings** + +![](../../../assets/3_node_config.png) + +4. **New pipeline - TP training - Nodes settings** + +![](../../../assets/4_tensor_nodes.png) + +5. **New pipeline - Training script settings** + +![](../../../assets/5_training_script.png) + +6. **New pipeline - Review & start training** + +![](../../../assets/6_review_and_train_model.png) + +7. **DeTrain - Code base** + +![](../../../assets/7_code_base.png) + +## Architecture + +![](../../../assets/architecture.png) + +The DeTrain project comprises four components: + +- **DeTrain Python Lightweight Library**: This library, currently in the early development phase, is constructed based on the PyTorch Distributed package and PyTorch RPC package. It simplifies the development of distributed AI model training code by reducing complexity. DeTrain primarily focuses on MP training, with two types: Tensor parallelism (TP) and Pipeline parallelism (PP). + + - To facilitate communication between training nodes in PP training, DeTrain utilizes remote methods of the RPC package. + + - For distributed optimization, DeTrain employs distributed autograd and the loss_parallel context. + + - To integrate with Data Parallelism (DP), DeTrain utilizes DeviceMesh 2D. + +- **Agent**: This component consists of two parts: + + - A Docker image including Python with support for CUDA 11.8, FastAPI, Uvicorn, Gunicorn, and DeTrain libraries. + + - A simple software program that assists nodes in communicating with each other and with the DeTrain console. + +- **DeTrain Console**: This web application aids developers in designing custom pipelines for training AI models tailored to user-defined infrastructure.
+ +- **Sample SDL Templates & Example Code for Distributed Training Jobs**: This component provides examples to guide developers on how to define infrastructure for PP & TP training. + +Examples for training are currently included in `model_parallelism/detrain/examples` for easy reference. They will later be maintained in the [DeTrain Examples repository](https://github.com/a2nfinance/detrain-example). + +## Technology Stack + +### DeTrain console + +- **NextJS** and Ant Design for the frontend. + +- **AkashJS** for interacting with the Akash network. + +- **Cosmos Kit** for blockchain integration. + +- **Mongoose** for database management. + +### Agent software + +- **FastAPI and StreamingResponse** for the backend. + +- **Uvicorn and Gunicorn** as the ASGI server. + +- **Docker** for containerization. + +### DeTrain Python library + +- **PyTorch RPC and Distributed packages** for distributed training functionalities. + +### Infrastructure + +- **Akash Network** marketplace. + +## Installation + +### To run the DeTrain console frontend application + +- `cd frontend` + +- `npm i` + +- `npm run dev` to run it in dev mode + +- `npm run build && npm run start` for production mode. + +### To deploy nodes on Akash + +- Go to the [Akash console](https://console.akash.network) + +- Select "upload SDL" + +- Go to the "SDL template" folder and select one of the templates. + +- Change settings and start the deployment. + +### To customize the DeTrain Docker image for agents + +- Go to the `agent` folder + +- Modify the Dockerfile if you want to add new commands or change the base image + +- To add a new function, go to `main.py` and add your code block + +- Build the new image + +- Push it to your Docker repository. + +## Run examples manually + +Ensure that the DeTrain library is installed on your machines. + +### To test the DeTrain Python library + +- Go to the `model_parallelism/examples` folder + +- Select what kind of training you want to see + + - PPL: Pipeline parallelism + + - TP: Tensor parallelism + + - FSDP + TP: Fully sharded data parallelism + Tensor parallelism + +- All examples contain logging and evaluation steps; these can make the training process longer. These files are for testing and instruction only. + +### Torchrun commands: + +- For PPL training: + +`torchrun --nnodes=3 --nproc_per_node=1 --node_rank=0 --master_addr=localhost --master_port=9999 main.py --gpu="0_0_0" --epochs=2 --batch_size=40 --lr=0.001 --model_name="ppl_04"` + +Node rank is the rank of each node joining the training process. If you have one master node and two GPU worker nodes, you need to run the command on each node with node_rank values of 0, 1, and 2. The master address is the address of the master node for tensor offloading. A worked three-node sketch is shown at the end of this page. + +- For TP and FSDP + TP training: + +`torchrun --nnodes=1 --node_rank=0 --nproc_per_node=2 --rdzv_id=101 --rdzv-backend=c10d --rdzv_endpoint="localhost:9999" main.py --gpu="0_0_0" --epochs=4 --batch_size=50 --lr=0.001 --model_name="ppl_04"` + +If you have N nodes, rdzv_endpoint is the rendezvous backend endpoint: use localhost on the master node and the internal or public IP on the remaining nodes. + +## Run examples using the DeTrain console + +With this tool you don't need to set up any environment, SSH into a remote node, or run commands manually. Follow these steps: + +- Deploy new nodes on Akash. + +- Use the `DeTrain console` to create new pipelines.
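
For reference, here is the worked three-node sketch mentioned in the Torchrun section above, for one master node and two GPU workers running the PPL example. The address `10.0.0.1` is a placeholder for the master node's reachable address (not a value from the DeTrain repositories); everything else mirrors the command shown earlier.

```
# On the master node (node_rank=0); 10.0.0.1 stands in for its own address
torchrun --nnodes=3 --nproc_per_node=1 --node_rank=0 --master_addr=10.0.0.1 --master_port=9999 main.py --gpu="0_0_0" --epochs=2 --batch_size=40 --lr=0.001 --model_name="ppl_04"

# On the first GPU worker (node_rank=1)
torchrun --nnodes=3 --nproc_per_node=1 --node_rank=1 --master_addr=10.0.0.1 --master_port=9999 main.py --gpu="0_0_0" --epochs=2 --batch_size=40 --lr=0.001 --model_name="ppl_04"

# On the second GPU worker (node_rank=2)
torchrun --nnodes=3 --nproc_per_node=1 --node_rank=2 --master_addr=10.0.0.1 --master_port=9999 main.py --gpu="0_0_0" --epochs=2 --batch_size=40 --lr=0.001 --model_name="ppl_04"
```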
+ + + + + + diff --git a/src/content/Docs/guides/machine-learning/flock_io/index.md b/src/content/Docs/guides/machine-learning/flock_io/index.md new file mode 100644 index 00000000..763bcbe4 --- /dev/null +++ b/src/content/Docs/guides/machine-learning/flock_io/index.md @@ -0,0 +1,210 @@ +--- +categories: ["Guides"] +tags: ["AI & ML"] +weight: 1 +title: "Running FLock.io Nodes on Akash" +linkTitle: "FLock.io" +--- + +This is a step-by-step guide for successfully deploying either a [FLock Training Node](https://console.akash.network/templates/akash-network-awesome-akash-FLock-training-node) or [FLock Validator](https://console.akash.network/templates/akash-network-awesome-akash-FLock-validator) directly from the [Akash Console](https://console.akash.network). + +## Pre-Requisites + +Before running FLock.io Validator and/or Training Nodes on Akash, users must have the following: + +- Whitelisted Ethereum address on train.flock.io. Not whitelisted? Complete FLock.io [whitelist form](https://blog.flock.io/news/trainflock)*. + +- Ethereum-supported Web3 wallet, such as MetaMask (used to stake FML, FLock.io’s testnet token on Base Sepolia). + +- IBC-compatible Web3 wallet, such as Keplr (funded with AKT or USDC used to pay for Akash compute). + +- `HF_USERNAME` and `HF_TOKEN` from a HuggingFace account. + +*At the time of writing FLock.io’s decentralized training platform is still in private beta. Whitelist restrictions will be lifted in the coming months as the team progressively opens the platform up to the community.* + +## Set up train.flock.io + +Before you deploy on Akash, you must first do the following on train.flock.io: + +- Stake FML on the task you wish to participate in. + +- Get the Task ID. + +- Get your API key. + +Staking FML makes you eligible to participate in the training task as a Training Node or Validator. Once staked you will need the Task ID and your API key in order to run the deployments on Akash. + +### 1. Select the task you want to stake + +*NOTE: On the Stake page, be sure you are on the Training Node or Validator tab, depending on how you want to participate in the training task.* + +![](../../../assets/flock_io_select.png) + +### 2. Stake FML tokens on the task + +![](../../../assets/flock_io-stake.png) + +### 3. Get the Task ID & API Key + +You can find the Task ID on the Tasks tab, and the API key can be found by clicking the upper right button where your address is displayed. + +Now that you’ve successfully staked, and retrieved the Task ID and API key, you are ready to deploy a FLock.io node on Akash. + +## Running a FLock.io Training Node on Akash + + +On the [Templates](https://console.akash.network/templates) page in Akash Console, search and select “FLock-Training-Node”. The SDL (Stack Definition Language) is a pre-populated template of FLock’s [`testnet-training-node-quickstart`](https://github.com/FLock-io/testnet-training-node-quickstart) script (`image: public.ecr.aws/e7z6j8c3/flock:training-quickstart-akash`). + +This script automates the training process and submits up to 6 models per day. Click “Deploy” and update the SDL with the following environment variables: + +- `FLOCK_API_KEY` - API key from [train.flock.io](https://train.flock.io/). + +- `HF_USERNAME` - [HuggingFace](https://hugginface.co) username. + +- `TASK_ID` - ID for the task that you staked on through [train.flock.io](https://train.flock.io/). + +- `HF_TOKEN` - token associated with your HuggingFace account + +You will also notice an optional `GIT_URL` environment variable. 
Trainers who wish to further customize training beyond the out-of-the-box `dataset demo_data.json` or `training_args.yml` included in the training script can update `GIT_URL`. + +Here’s a full look at the `deploy.yml` SDL: + +``` + +--- +version: "2.0" + +services: + flock-train: + image: public.ecr.aws/e7z6j8c3/flock:training-quickstart-akash + env: + - FLOCK_API_KEY= + - HF_USERNAME= + - TASK_ID= + - HF_TOKEN= + # Choose whether to use your own dataset demo_data.jsonl or training_args.yml + #- GIT_URL= + expose: + - port: 3000 + as: 80 + to: + - global: true + +profiles: + compute: + flock-train: + resources: + cpu: + units: 8 + memory: + size: 24Gi + storage: + size: 100Gi + gpu: + units: 1 + attributes: + vendor: + nvidia: + - model: h100 + - model: a100 + - model: rtx3090 + - model: rtx4090 + - model: rtx8000 + - model: rtxa6000 + - model: a40 + - model: p40 + placement: + akash: + pricing: + flock-train: + denom: uakt + amount: 10000 + +deployment: + flock-train: + akash: + profile: flock-train + count: 1 + +``` + +After you’ve created your deployment, choose a GPU provider commensurate with the task. Most training tasks can be completed using GPU with 16GB vRAM, though 24GB is recommended if you would like to train larger model. + +## Running a FLock.io Validator Node on Akash + +On the [Templates](https://console.akash.network/templates) page in Akash Console, search and select `FLock Validator`. The SDL is a pre-populated template of FLock’s `llm-loss-validator` script (`image: ghcr.io/flock-io/llm-loss-validator:v0.0.6`). + +This script listens for submissions from Training Nodes, then picks up and completes validation assignments. Click “Deploy” and update the SDL with the following environment variables: + +- `FLOCK_API_KEY` - API key from train.flock.io. + +- `TASK_ID` - ID for the task that you staked on through train.flock.io. + +- `HF_TOKEN` - token associated with your HuggingFace account. + +Here’s a full look at the `deploy.yml` SDL: + +``` + +--- +version: "2.0" + +services: + flock-validater: + image: ghcr.io/flock-io/llm-loss-validator:v0.0.6 + env: + - FLOCK_API_KEY= + # support multi_task, such as 1,2,3 + - TASK_ID= + - HF_TOKEN= + expose: + - port: 3000 + as: 80 + to: + - global: true + +profiles: + compute: + flock-validater: + resources: + cpu: + units: 8 + memory: + size: 24Gi + storage: + size: 100Gi + gpu: + units: 1 + attributes: + vendor: + nvidia: + - model: h100 + - model: a100 + - model: rtx3090 + - model: rtx4090 + - model: rtx8000 + - model: rtxa6000 + - model: a40 + - model: p40 + placement: + akash: + pricing: + flock-validater: + denom: uakt + amount: 10000 + +deployment: + flock-validater: + akash: + profile: flock-validater + count: 1 + +``` + +After you’ve created your deployment, choose a compute provider commensurate with the task. Validation assignments require minimal compute so it is possible to complete the task with many of the less resource-intensive options available. 
+ + + + + + diff --git a/src/content/Docs/guides/machine-learning/llama-2/index.md b/src/content/Docs/guides/machine-learning/llama-2/index.md new file mode 100644 index 00000000..f3cf398d --- /dev/null +++ b/src/content/Docs/guides/machine-learning/llama-2/index.md @@ -0,0 +1,43 @@ +--- +categories: ["Guides"] +tags: ["AI/ML", "Training", "Framework"] +weight: 1 +title: "Llama-2 70B" +linkTitle: "Llama-2 70B" +--- + +Meta developed and publicly released the Llama 2 family of large language models (LLMs), a collection of pretrained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. + +In this deployment, the [`meta-llama/Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) pretrained model is used, which generates a continuation of the incoming text. To access this model you must be granted access by Meta. Nothing complicated, but it's a bit inconvenient, so someone created their own Hugging Face repository [`cryptoman/converted-llama-2-70b`](https://huggingface.co/cryptoman/converted-llama-2-70b) with these model weights under open access, since the license allows it. + +There is also a meta-llama/Llama-2-70b-chat-hf model that is optimized for dialogue use cases and can answer questions. The "converted" and "hf" in the model names mean that the model is converted to the Hugging Face Transformers format. + +In this deployment, the model is loaded using QLoRA. It reduces the memory usage of LLM finetuning without performance tradeoffs compared to standard 16-bit model finetuning. This method enables 33B model finetuning on a single 24GB GPU and 65B model finetuning on a single 46GB GPU. QLoRA uses 4-bit quantization to compress a pretrained language model. The model loads in 4-bit using NF4 quantization with double quantization and the compute dtype bfloat16. More details can be found [here](https://huggingface.co/blog/4bit-transformers-bitsandbytes). + +## Deploying + +This model requires >40 GB of GPU VRAM. Tested on NVIDIA A6000 and H100 GPUs. + +![](../../../assets/llama_gpu_vram.png) + +Note that 40 GB of VRAM (as on a 40 GB NVIDIA A100) is not enough: with that amount the application starts, but when the generation function is called, the error "CUDA error: out of memory" appears. + +![](../../../assets/llama_cpu_vram.png) + +When the deployment begins, 15 model weight files totaling 130 GB will be downloaded and loaded into memory, and this may take some time. You can watch the loading process in the logs. + + +Use this [SDL](https://github.com/akash-network/awesome-akash/blob/master/Llama-2-70B/deploy.yaml) to deploy the application on Akash. There are two environment variables in the SDL: + +- `MAX_INPUT_TOKEN_LENGTH` - this value specifies the maximum number of incoming text tokens that will be directly processed by the model. Text is truncated at the left end, as the model works to write a continuation of the entered text. The larger this value, the better the model will understand the context of the entered text (if it is relatively large text), but it will also require more computing resources. + +- `MAX_NEW_TOKENS` - this value specifies how many new tokens the model will generate. The larger this value, the more computing resources are required. + +These parameters should be chosen based on the tasks the model will be used for and the power of the GPUs; a sketch of how they appear in the SDL follows below.
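
The snippet below is only an illustrative sketch of the `env` block, not the full upstream `deploy.yaml`; the service name `llama` and the values `4096` and `256` are placeholders chosen for illustration, not recommendations from the template:

```
services:
  llama:
    env:
      # Maximum number of input tokens passed to the model (text is truncated from the left)
      - MAX_INPUT_TOKEN_LENGTH=4096
      # Maximum number of tokens the model will generate per request
      - MAX_NEW_TOKENS=256
```

Larger values improve context handling and output length at the cost of more GPU memory and compute.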
+ +## Logs + +The logs on the screenshot below show that the loading of the model weights files has completed and the Uvicorn web server has started and the application is ready to work. + +![](../../../assets/llama_logs.png) + diff --git a/src/content/Docs/guides/machine-learning/ray/index.md b/src/content/Docs/guides/machine-learning/ray/index.md new file mode 100644 index 00000000..809e81fe --- /dev/null +++ b/src/content/Docs/guides/machine-learning/ray/index.md @@ -0,0 +1,164 @@ +--- +categories: ["Guides"] +tags: ["AI & ML"] +weight: 1 +title: "Distributed Machine Learning on Akash Network With Ray" +linkTitle: "Ray" +--- + +## Background + +The [proliferation](https://akash.network/blog/the-fast-evolving-ai-landscape/) of open source AI and ML models in the past year has enabled developers to build applications at a fairly rapid pace. This includes not just inference but even fine-tuning a model to adapt it to custom data sets and application needs and in some cases, even training a foundation model. Open source libraries from [Pytorch](https://pytorch.org/), [Tensorflow](https://www.tensorflow.org/), [Keras](https://keras.io/), Scikit-learn and others have allowed those without extensive experience in AI and ML, to relatively easily build Python based applications that leverage advanced AI capabilities. The challenge then shifts to being able to horizontally scale workloads to take advantage of a large number of computers, so as to be able to accelerate time to market and/ or run a service in production and scale it in response to user demand. + +This concept of taking an ML workload that is built to run on a single GPU (server) and enabling it to run on a number of GPU servers (referred to as a “cluster”) is what is called “clustering” and “scaling”. The challenge of course is that, in order to do this, the application that was built to run on a single server needs to be parallelized to run on multiple machines. Doing this “natively” (by adding support for it directly in the application code) would require an advanced understanding of cloud infrastructure and parallel computing. This creates a technical barrier to being able to launch and scale such applications. + +## Ray to save the day! + +[Ray](https://github.com/ray-project/ray) is an open source framework that enables software developers not trained in distributed systems to easily leverage distributed computing. It does this by removing the burden of needing to natively parallelize a machine learning application, while enabling computations to scale out across a cluster of servers. This allows AI and ML developers to easily scale out their application or workload across a cluster of servers, without having to write additional code for that or needing to understand the details of the underlying infrastructure. + +Some of the capabilities Ray offers include: + +- **Automatic scaling**: In theory at least, a ray cluster creates the virtual abstraction of a single computer across a distributed cluster that can be very large. + +- **Fault Tolerance**: By automatically rerouting tasks to other machines, when one or more nodes fail, Ray enables resilience that is crucial for long running AI training workloads and/ or production scale inference. + +- **State Management**: With built in support for sharing and coordinating data across tasks and flexibility to use a variety of storage options - including in-memory (redis or equivalent) as well as cloud storage (S3 equivalent). 
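+
+As a quick illustration of the programming model described above, the snippet below turns an ordinary Python function into a distributed task. This is a generic, minimal Ray example (not taken from the Akash template); the same code runs unchanged on a laptop or on a multi-node cluster, with Ray scheduling the remote calls across whatever workers are available.
+
+```
+import ray
+
+# Inside a cluster you would typically connect to the head node, e.g. ray.init(address="auto").
+ray.init()
+
+@ray.remote
+def square(x):
+    # Any ordinary Python (or GPU) work can live here; Ray handles placement and results.
+    return x * x
+
+futures = [square.remote(i) for i in range(8)]  # schedules eight tasks across the cluster
+print(ray.get(futures))                         # [0, 1, 4, 9, 16, 25, 36, 49]
+```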
+ +## Ray AI Runtime (AIR) + +Ray’s ultimate goal is to provide a simple programmatic interface for developers to leverage distributed computing, so that you don’t have to be a distributed systems expert to be able to deploy, run, scale and manage your ML workload across a cluster of machines. The Ray AI Runtime (AIR) is the toolkit that implements this abstraction on top of the core Ray functionality. + +![](../../../assets/ray.png) + +The libraries made available as part of the AIR toolkit enable organizations to run the full gamut of ML workloads on distributed computing platforms like Akash Network. + +- [Ray Serve](https://docs.ray.io/en/latest/serve/index.html): Framework-agnostic model serving library that can be used to build and deploy end-to-end distributed AI/ ML inference applications. + +- [Ray Tune](https://docs.ray.io/en/latest/tune/index.html): Library for ML experiment execution and [Hyperparameter Tuning](https://en.wikipedia.org/wiki/Hyperparameter_optimization). + +- [Ray Train](https://docs.ray.io/en/latest/train/train.html): Scalable machine learning library for distributed training and fine-tuning. + +## Typical Ray Workflow + +The typical Ray based machine learning workflow may involve the following steps: + +1. Build your AI/ML application (typically Python code) + +2. Write a YAML file that will define your Ray cluster + +3. Execute commands on the CLI to turn on/ off the cluster + +4. Submit jobs and monitor them via the (web) dashboard + +![](../../../assets/ray-work.png) + +## Ray on Akash + +Ray is fairly infrastructure agnostic and works with Docker containers - so it naturally lends itself to being a great solution to provide an abstraction of a “single server” on the Akash Network decentralized cloud. + +To enable users of Akash Network to easily utilize Ray’s capabilities, the ThumperAI team working with the Overclock Labs team has built a set of docker images and an Akash SDL (Stack Definition Language) template that can be used as a reference for anyone wanting to run Ray clusters on Akash. The source code for this can be found on the “awesome-akash” repository (contains an ever growing set of reference templates for running various common applications on Akash) [here](https://github.com/akash-network/awesome-akash/tree/master/ray). + +The reference template is specifically optimized for GPU-based workloads and deploys a Ray cluster consisting of one Head Node and six Worker Nodes. + +You can customize the reference to your specific workload needs in the following ways: + +1. To update the version of Ray, Python or CUDA, update the Dockerfiles and replace the Ray Docker image used there, with one from here. The image used in the reference contains Python version 3.10, CUDA version 11.8. + +``` + +FROM rayproject/ray-ml:nightly-py310-cu118 +EXPOSE 6380 +EXPOSE 8265 +RUN sudo apt-get install git-lfs s3fs -y +RUN git lfs install --skip-repo +RUN pip install s3fs +COPY /starthead.sh . +RUN sudo chmod 777 /home/ray/starthead.sh +RUN sudo chmod a+x /home/ray/starthead.sh +RUN sudo chown ray /home/ray/starthead.sh +RUN sudo chmod 777 /home/ray +# ENTRYPOINT ["bash -c"] +CMD ["/home/ray/starthead.sh"] + +``` + +Note that you will need to rebuild the docker images for the head node and the worker node and push them to a container registry, if you update the Dockerfiles. Follow the instructions in the [README](https://github.com/akash-network/awesome-akash/blob/master/ray/README.md) for that. + +2. 
To update the number of worker nodes in your Ray cluster, modify the [deployment example file](https://github.com/akash-network/awesome-akash/blob/master/ray/deployment_example.yaml) (note that the reference deploys 6 Ray worker nodes). + +``` + +version: '2.0' +services: +ray-head: + image: thumperai/rayakash:ray-head-gpu-py310 + expose: + + - port: 8265 + as: 8265 + to: + - service: ray-worker, ray-worker1, ray-worker2, ray-worker3, ray-worker4, ray-worker5 + - global: true + - port: 6380 + as: 6380 + to: + - service: ray-worker, ray-worker1, ray-worker2, ray-worker3, ray-worker4, ray-worker5 + - global: true + + - port: 8078 + as: 8078 + to: + - service: ray-worker, ray-worker1, ray-worker2, ray-worker3, ray-worker4, ray-worker5 + - port: 8079 + as: 8079 + to: + - service: ray-worker, ray-worker1, ray-worker2, ray-worker3, ray-worker4, ray-worker5 + - port: 10002 + as: 10002 + to: + - service: ray-worker, ray-worker1, ray-worker2, ray-worker3, ray-worker4, ray-worker5 + - port: 10003 + as: 10003 + to: + - service: ray-worker, ray-worker1, ray-worker2, ray-worker3, ray-worker4, ray-worker5 + - port: 10004 + as: 10004 + to: + - service: ray-worker, ray-worker1, ray-worker2, ray-worker3, ray-worker4, ray-worker5 + - port: 10005 + as: 10005 + to: + - service: ray-worker, ray-worker1, ray-worker2, ray-worker3, ray-worker4, ray-worker5 + +``` + +3. You will need to add various environment variables for your AWS access key and secret (if you are using S3 for storage), MinIO access key and secret (if using MinIO) and other things: + +- `RAY_ADDRESS_HOST`: Specifies the address of the head node. Only edit if you are trying to use ray across multiple providers. + +- `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`: Credentials for AWS services. + +- `R2_BUCKET_URL`, `S3_ENDPOINT_URL`: URLs for S3-compatible storage services. + +- `B2_APPLICATION_KEY_ID`, `B2_APPLICATION_KEY`: Credentials for Backblaze B2 storage. + +- `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`: Credentials for MinIO storage. + +- `AWS_DEFAULT_REGION`: The default AWS region for services. + +- `WANDB_API_KEY`, `WANDB_PROJECT`: Credentials and project name for Weights & Biases logging. + +4. Update the resources needed for your specific workload, per worker, by modifying the service definition for `ray-head` and each `ray-worker` in the deployment example YAML file. + +Once you have those things set up correctly, you can head over to the [Akash Console](https://console.akash.network/) and use the template builder option to deploy your Ray cluster on Akash. + +![](../../../assets/ray-akashconsole.png) + +![](../../../assets/ray-akashconsoleyml.png) + +And here is a quick rin throigh of what the whole end-to-end deployment workflow looks like: + +[![How To Deploy a Ray Cluster on Akash Network](https://markdown-videos-api.jorgenkh.no/url?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpRsDy6rBY_A)](https://www.youtube.com/watch?v=pRsDy6rBY_A) + + + diff --git a/src/content/Docs/guides/machine-learning/vvlm/index.md b/src/content/Docs/guides/machine-learning/vvlm/index.md new file mode 100644 index 00000000..7fb96a7e --- /dev/null +++ b/src/content/Docs/guides/machine-learning/vvlm/index.md @@ -0,0 +1,155 @@ +--- +categories: ["Guides"] +tags: ["AI & ML"] +weight: 1 +title: "Running vLLM on Akash" +linkTitle: "vLLM" +--- + +## Introduction + +vLLM is an LLM server implementation first introduced in a paper in 2023. Its primary objective was to make LLM inference faster for multiuser services. 
What vLLM also achieves is overcoming some of the limitations of Ollama. vLLM enables you to serve more than one user at a time, natively, without having to proxy user requests between multiple GPUs. It also has somewhere between 2-4X the throughput of Ollama for concurrent requests.
+
+The main change vLLM makes is adding Paged Attention to an LLM by swapping out all the transformer attention modules for Paged Attention, which implements attention more efficiently. The authors of vLLM describe Paged Attention as follows: "Paged Attention's memory sharing greatly reduces the memory overhead of complex sampling algorithms, such as parallel sampling and beam search, cutting their memory usage by up to 55%. This can translate into up to 2.2x improvement in throughput". You can read more about the technical details of Paged Attention in the official vLLM blog post. The current server implementation has gone beyond just Paged Attention and will soon support speculative decoding approaches. Other open source alternatives to vLLM include [Hugging Face](https://huggingface.co/)'s TGI and the sglang engine with its Radix Attention implementation. The only drawback to using vLLM is that it doesn't support all of the very low quantization methods and file formats such as GGUF. If you haven't used GGUF before in llama.cpp-based tools like Ollama, note that most people actively avoid models with quantization lower than 4-bit (Q4) due to performance issues. The good news is that most models are available in the GPTQ or AWQ quantization formats that vLLM supports.
+
+![](../../../assets/vllm.png)
+
+At the time of the original paper (June 2023), vLLM dramatically outperformed TGI. Since then, other third parties have also found vLLM to be one of the most performant open source LLM server implementations available for concurrent requests. Again, note that no one puts Ollama or llama.cpp-based implementations onto these benchmarks, because those LLM servers are really intended to support only one user at a time and are not suitable for running as a service.
+
+![](../../../assets/vllm2.png)
+
+## Preparation
+
+1. Create an Akash account and ensure you have AKT tokens.
+
+2. Log in to console.akash.network with your wallet to launch an instance with an SDL (YAML) found in the [vLLM](https://github.com/akash-network/awesome-akash/tree/master/vllm) folder of the [Awesome-Akash repository](https://github.com/akash-network/awesome-akash/).
+
+## Containerization
+
+We are going to use the latest official vLLM container image: `vllm/vllm-openai:v0.4.0.post1`.
+
+You can also build your own image using the Dockerfile in the root of the vLLM repo.
+
+**Note:**
+*You should never use "latest" as a tag for your containers in an Akash SDL. If you are deploying a new model, check that it has official vLLM support and note the date, to make sure the container image was pushed after that support was added.*
+
+## Deployment
+
+1. **Create a Deployment Configuration**: Create a YAML file for your vLLM deployment, including Docker configurations, resource requirements, and port exposures. See the example below, which you should be able to copy and paste into Akash Console.
+
+2. **Deploy**: Use Akash Console to deploy your application, which matches you with a suitable provider based on your deployment specifications.
+
+3. **Use the LLM UI**: After deployment, use Akash Console to find the IP address of the service, and you should be good to go.
+
+4. **Use the LLM API**: After deployment, use Akash Console to find the IP address of the vLLM service, then add the URI and API key variables to whichever client you are using (e.g., "http://localhost:8000/v1").
+
+You can find an example of vLLM used with CrewAI in `vLLM_crew_notebook_deployment.yml`.
+
+Below is a code snippet that uses the LLM with LangChain in Python. Tool calling should work reasonably well with any model that is at least as capable as WizardLM2-7B.
+
+```
+
+import os
+
+os.environ["OPENAI_API_KEY"] = "MYPASSWORD"
+# If calling from inside the Akash deployment, use the internal service address:
+os.environ["OPENAI_API_BASE"] = "http://vLLM:8000/v1"
+# If calling from outside, use the provider URI instead; the port changes with every new deployment:
+# os.environ["OPENAI_API_BASE"] = "https://provider.hurricane.akash.pub:31816/v1"
+# Update for your model name:
+os.environ["OPENAI_MODEL_NAME"] = "MaziyarPanahi/WizardLM-2-7B-AWQ"
+
+from langchain_community.llms import VLLMOpenAI
+
+llm = VLLMOpenAI(
+    openai_api_key="MYPASSWORD",
+    openai_api_base="http://vLLM:8000/v1",
+    model_name="MaziyarPanahi/WizardLM-2-7B-AWQ",
+)
+
+print(llm.invoke("Rome is"))
+
+```
+
+The vLLM server is designed to be compatible with the OpenAI API, allowing you to use it as a drop-in replacement for applications that use the OpenAI API.
+
+The [vLLM folder of the Awesome-Akash repository](https://github.com/akash-network/awesome-akash/tree/master/vllm) contains the following example SDLs: one without a user interface, three with the OpenWebUI tool, and one CrewAI notebook example:
+
+- `vLLM_no_ui_deployment.yml` - a basic example without a user interface
+
+- `vLLM_with_openwebui_dolphin2-9-llama3-70b.yml`
+
+- `vLLM_with_openwebui_wizardlm2-7b.yml`
+
+- `vLLM_with_openwebui_wizardlm2-8x22.yml`
+
+- `vLLM_crew_notebook_deployment.yml`
+
+
+The vLLM server supports the following OpenAI API endpoints:
+
+- List models
+
+- Create chat completion
+
+- Create completion
+
+## Sizing vLLM for a number of different users
+
+Sizing LLM server resources for a particular application can be challenging because of the impact of model choice, quantization of that model, GPU hardware, and usage pattern (human being vs. agent). Anyscale (the company behind Ray) has released a great LLM benchmarking tool called llmperf that is worth using to benchmark your use case with your specific application requirements. Aside from using this tool, it has been reported that a single NVIDIA A100 GPU can support between 10 and 20 concurrent users for a 7B-parameter Mistral model with AWQ on vLLM, with lower throughput for other server options. Larger models will have lower throughput. There is also a significant performance improvement going from one to two GPUs in a vLLM server, but this effect diminishes rapidly.
+
+## Troubleshooting
+
+If you don't see the model in the OpenWebUI dropdown, it usually means you chose a model that was too large for the GPU, too large for the disk space, or has a bad [Hugging Face](https://huggingface.co/) repo entered into the deployment. You can usually tell which issue it is from the logs. You will usually have to redeploy to change these parameters.
+
+**Steps to Troubleshoot**
+
+1. Read the logs for the vLLM container and the UI container.
+
+2. Check that the model weights are downloading correctly and can be loaded into VRAM.
+
+3. Check that the model size is smaller than the available VRAM and container disk space.
+
+4. Check the [Hugging Face](https://huggingface.co/) model repo to make sure the repo and author are correct.
+
+5. Check the [Hugging Face](https://huggingface.co/) repo model type and make sure it is not GGUF and is compatible with vLLM (GPTQ, AWQ, GGML, Safetensors, etc.).
+
+6. Make sure that the model is [officially supported by vLLM](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+7. Check that the environment variables setting the `vLLM_API_KEY` and `OPENAI_API_KEYS` values match.
+
+8. Check that the `HUGGING_FACE_HUB_TOKEN` is set for downloading models.
+
+9. If you are using the API externally, make sure you have updated the URL to use the deployment's endpoint, which can be done by setting `OPENAI_API_BASE`.
+
+10. If you have checked all of these and still have problems, then open an [issue in the awesome-akash repo](https://github.com/akash-network/awesome-akash/issues) and tag [@rakataprime](https://github.com/rakataprime). In the issue, please provide your logs and the deployment you used, with the [Hugging Face](https://huggingface.co/) token and other secrets set to XXXXXXXXXXXX.
+
+## Choosing the Right GPU for an LLM Model
+
+- vLLM-supported file formats include GPTQ, AWQ, GGML (SqueezeLLM), and PyTorch pth/bin and safetensors files.
+
+- vLLM DOES NOT support GGUF files.
+
+- Check to make sure the base foundation model is supported [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+- Make sure that the model file size is smaller than the VRAM of the GPU, with a little buffer room. Try to keep the model files within 90% of the total VRAM.
+
+- Make sure the vLLM container has enough disk space to store the model weights.
+
+- If you are doing something unusual with really large context lengths, you can use the tools below to help estimate VRAM utilization.
+
+- Remember that really large models like Grok sometimes require multiple GPUs. The maximum number of GPUs supported by this vLLM example is 8, or 640GB of VRAM with A100s.
+
+- If you need more than 8 GPUs, you can use a larger Ray cluster instead, but this is beyond the scope of this example. You can contact logan@thumper.ai for assistance if you need help with this.
+
+### Tools for Checking Model VRAM Requirements
+
+- [“Can-it-run” - HuggingFace](https://huggingface.co/spaces/Vokturz/can-it-run-llm)
+
+- [“LLM-Model-VRAM-Calculator” - HuggingFace](https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator)
+
+## Learn more about vLLM
+
+- [vLLM Docs](https://docs.vllm.ai/)
+
+- [vLLM repo](https://github.com/vLLM-project/vLLM)
+
+- [In-depth comparison study](https://pages.run.ai/hubfs/PDFs/Serving-Large-Language-Models-Run-ai-Benchmarking-Study.pdf)
\ No newline at end of file
diff --git a/src/content/Docs/guides/tooling/playground/index.md b/src/content/Docs/guides/tooling/playground/index.md
new file mode 100644
index 00000000..0db32c57
--- /dev/null
+++ b/src/content/Docs/guides/tooling/playground/index.md
@@ -0,0 +1,90 @@
+---
+categories: ["Guides"]
+tags: ["AI/GPU", "Marketplace"]
+weight: 3
+title: "Akash Playground"
+linkTitle: "Akash Playground"
+---
+
+[Akash Playground](https://github.com/javiersuweijie/plygrnd-ui/tree/main) is a community-driven marketplace where users can upload, run, and monetize their AI models without the hassle of managing underlying infrastructure. This platform simplifies infrastructure management, making AI models more accessible and usable for various applications. It is compatible with [Hugging Face Spaces](https://huggingface.co/spaces) repositories, facilitating seamless integration and collaboration.
+
+![](../../../assets/playground.png)
+
+
+## Motivation
+
+The rapid development of AI models has spurred numerous innovative applications. However, the lack of discoverability and ease of use limits the growth of users within the AI ecosystem. While Hugging Face Spaces democratizes the sharing and demonstration of AI models, it doesn't directly benefit model builders. Akash Playground bridges this gap by creating a marketplace that rewards model builders for their contributions.
+
+## Key Features
+
+- **Model Upload and Deployment**: Developers can upload their AI models to the platform, which will be deployed on Akash Cloud's infrastructure. This is achieved by sharing a GitHub or Hugging Face repository that runs a Gradio application.
+
+- **Ease of Use**: Users can access models by launching a playground with a single click, selecting from all the public models indexed by the platform.
+
+- **Infrastructure Abstraction**: The platform abstracts the complexities of infrastructure management, allowing users to focus on model development and deployment.
+
+- **Monetization Mechanism**: Model builders receive a share of the infrastructure costs when their models are used by others, incentivizing contributions to the ecosystem.
+
+- **Compatibility with Hugging Face Spaces**: The platform is compatible with Hugging Face Spaces repositories, enabling seamless integration and collaboration.
+
+- **Community Engagement**: The platform includes a community forum for discussions, feedback, and collaboration among users.
+
+## Setup & Deployment
+
+### Clone the Repo & Install the Dependencies
+
+Clone the repo:
+
+```
+git clone https://github.com/javiersuweijie/plygrnd-ui.git
+
+```
+
+Once you've successfully cloned the repo, run the following command:
+
+```
+
+cd plygrnd-ui
+
+```
+
+Install the dependencies with `npm install` (or `pnpm install` or `yarn install`).
+
+### Setting Up the Environment Variables
+
+Create a `.env` file and set the following environment variables:
+
+```
+
+AKASH_KEY_NAME=plygrnd
+AKASH_KEYRING_BACKEND=os
+AKASH_NET="https://raw.githubusercontent.com/akash-network/net/main/mainnet"
+AKASH_VERSION=v0.6.1
+AKASH_CHAIN_ID=akashnet-2
+AKASH_NODE=https://rpc.akashnet.net:443
+AKASH_GAS_ADJUSTMENT=1.15
+AKASH_GAS_PRICES=0.025uakt
+AKASH_GAS=auto
+AKASH_SIGN_MODE=amino-json
+
+AKASH_MNEMONIC=
+AKASH_ACCOUNT_ADDRESS=
+AKASH_CERT=
+AKASH_CERT_PUB_KEY=
+AKASH_CERT_KEY=
+SUPABASE_KEY=
+
+```
+
+### Starting the Development Server
+
+Start the development server by running the following command:
+
+```
+npm run dev
+
+```
+
+By using Akash Playground, developers can effortlessly deploy and monetize their AI models, while users can easily access a wide range of AI capabilities, driving growth and innovation in the AI ecosystem.
+
+
diff --git a/src/content/Docs/guides/postgres-sql-restore-or-backup/index.md b/src/content/Docs/guides/tooling/postgres-sql-restore-or-backup/index.md
similarity index 100%
rename from src/content/Docs/guides/postgres-sql-restore-or-backup/index.md
rename to src/content/Docs/guides/tooling/postgres-sql-restore-or-backup/index.md
diff --git a/src/content/Docs/guides/tooling/provider-stats/index.md b/src/content/Docs/guides/tooling/provider-stats/index.md
new file mode 100644
index 00000000..67ed773e
--- /dev/null
+++ b/src/content/Docs/guides/tooling/provider-stats/index.md
@@ -0,0 +1,82 @@
+---
+categories: ["Guides"]
+tags: ["Dashboard"]
+weight: 3
+title: "Akash Provider Node Utilization Dashboard"
+linkTitle: "Provider Utilization Dashboard"
+---
+
+
+The [Akash Provider Utilization Dashboard](https://github.com/Zblocker64/provider-stats) has three pods with three different services:
+
+1. **Periodic Shell Script:** This script runs at a configurable frequency, making a gRPCurl request to a server with reflection enabled. It retrieves the current provider utilization data and stores the JSON response in a database.
+2. **NoSQL Database (CouchDB):** This database stores the server utilization logs (in JSON format) retrieved by the shell script.
+3. **Web Application:** This app features a dashboard that displays the daily average utilization of resources such as CPU, GPU, memory, and storage.
+
+
+## Getting Started
+
+These instructions will guide you through deploying this project using the [Akash Console](https://console.akash.network).
+
+
+
+### Deploying
+
+You will need an Akash wallet with at least 0.5 AKT. You can use any deployment tool of your choice. This guide uses the [Akash Console](https://console.akash.network), but you can also use the [Akash CLI](/docs/deployments/akash-cli/overview/).
+
+#### Environment Variables
+
+- `COUCHDB_USER` - The admin username for CouchDB.
+- `COUCHDB_PASSWORD` - The password for the CouchDB admin user.
+- `COUCHDB_URL` - The URL for CouchDB, e.g., "http://admin:password@couchdb:5984"
+- `COUCHDB_DB_NAME` - The name of the database.
+- `COUCHDB_DESIGN_DOC` - The database design document.
+- `COUCHDB_CPU_VIEW` - The database index for CPU data.
+- `COUCHDB_GPU_VIEW` - The database index for GPU data.
+- `COUCHDB_MEMORY_VIEW` - The database index for memory data.
+- `COUCHDB_STORAGE_VIEW` - The database index for storage data.
+- `PROVIDER_URL` - The URL from which the service will pull data.
+
+**Note:**
+
+When changing the `COUCHDB_USER` and `COUCHDB_PASSWORD` values, update them in the environment variables for both containers.
+
+#### Volumes
+
+- `couchdb_data`: This is where CouchDB persists its data.
+
+### Accessing the Applications
+
+**CouchDB**
+
+To access the CouchDB UI, navigate to the URL provided by the [Akash Console](https://console.akash.network) in your web browser. Log in with the `COUCHDB_USER` and `COUCHDB_PASSWORD` configured in your environment variables. Add `/_utils/` to the end of your URL.
+
+![](../../../assets/provider-stats1.png)
+
+**Flask App**
+
+Your Flask application will be available at the URL provided by the [Akash Console](https://console.akash.network). You can also set a custom domain as specified in the SDL.
+
+![](../../../assets/provider-stats2.png)
+
+**gRPC Poller**
+
+This service depends on CouchDB and performs periodic polling. Adjust the interval by setting the `REQUEST_INTERVAL` environment variable. No further access is required after deployment.
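+
+If you want to pull the stored utilization data directly (for example, to feed another dashboard), you can query CouchDB's standard view API using the same environment variables listed above. The snippet below is a hedged sketch: it assumes the design document and view names exposed via `COUCHDB_DESIGN_DOC` and `COUCHDB_CPU_VIEW`, and standard CouchDB view semantics; the exact keys and values emitted depend on how the views are defined in this project.
+
+```
+import os
+import requests
+
+base = os.environ["COUCHDB_URL"]          # e.g. http://admin:password@couchdb:5984
+db = os.environ["COUCHDB_DB_NAME"]
+design = os.environ["COUCHDB_DESIGN_DOC"]
+cpu_view = os.environ["COUCHDB_CPU_VIEW"]
+
+# Standard CouchDB view endpoint: /{db}/_design/{ddoc}/_view/{view}
+resp = requests.get(f"{base}/{db}/_design/{design}/_view/{cpu_view}")
+resp.raise_for_status()
+
+for row in resp.json().get("rows", []):
+    print(row["key"], row["value"])  # whatever key/value pairs the CPU view emits
+```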
+
+### Persistent Data
+
+The deployment uses named volumes to ensure your data persists across container restarts.
+
+### Troubleshooting
+
+Deployment should be straightforward, but if the database does not initialize correctly and no data appears on your web dashboard, follow these steps:
+
+1. Open the CouchDB UI at the provided database URL.
+2. Log in with your username and password.
+3. Navigate to the setup tab.
+4. Select "single node" and re-enter your username and password.
+
+This should resolve the issue, and data should appear within 15 minutes.
+
+