Updates from maintenance #1

Merged
merged 1 commit into from
Aug 6, 2024
34 changes: 34 additions & 0 deletions .github/workflows/extractor-build.yaml
@@ -0,0 +1,34 @@
name: extractor-build
on:
  workflow_call:

jobs:
  build:
    runs-on: ubuntu-22.04
    env:
      GH_TOKEN: ${{ secrets.RELEASE_TOKEN }}
    steps:
      - name: Install required tools
        run: |
          sudo apt-get update
          sudo apt-get install -y git unzip protobuf-compiler build-essential curl

      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - uses: arduino/setup-protoc@v1
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable

      - name: Build
        run: |
          export RUSTFLAGS="-C target-cpu=x86-64"
          cargo build --release --features "SOLANA,RABBITMQ_CLASSIC,SINGLE_PUBLISHER"

      - name: Upload Release Asset
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          gh release upload ${{ github.event.release.tag_name }} target/release/blockchain_etl_indexer --clobber
49 changes: 49 additions & 0 deletions .github/workflows/inserter-build.yaml
@@ -0,0 +1,49 @@
name: inserter-build
on:
  workflow_call:

env:
  GOMOD: ${{ github.workspace }}/storage-write/deprecated/go.mod
  GOMODULE_DIR: ${{ github.workspace }}/storage-write/deprecated
  GO_VERSION: '1.20.x'
  PROTOC_VERSION: '23.2'

jobs:
  build:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - name: Setup Go
        uses: actions/setup-go@v5
        with:
          go-version: ${{ env.GO_VERSION }}

      - name: Install dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y g++
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          cd ${{ env.GOMODULE_DIR }}
          go mod tidy
          go mod download

      - name: Download and install protoc
        run: |
          curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v${{ env.PROTOC_VERSION }}/protoc-${{ env.PROTOC_VERSION }}-linux-x86_64.zip
          unzip protoc-${{ env.PROTOC_VERSION }}-linux-x86_64.zip -d $HOME/.local

      - name: Build
        env:
          CGO_ENABLED: 0
          GOOS: linux
        run: |
          cd ${{ env.GOMODULE_DIR }}
          go generate
          go build -o ${{ github.workspace }}/target/release/blockchain_etl_inserter main.go

      - name: Upload Release Asset
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          gh release upload ${{ github.event.release.tag_name }} target/release/blockchain_etl_inserter --clobber

37 changes: 37 additions & 0 deletions .github/workflows/pipeline.yaml
@@ -0,0 +1,37 @@
name: main-build
on:
  release:
    types: [created]

jobs:
  determine-job:
    runs-on: ubuntu-latest
    outputs:
      run-extractor-job: ${{ steps.check-ref.outputs.run_extractor }}
      run-inserter-job: ${{ steps.check-ref.outputs.run_inserter }}
    steps:
      - name: Check tag pattern
        id: check-ref
        run: |
          if [[ "${GITHUB_REF##*/}" == extractor* ]]; then
            echo "::set-output name=run_extractor::true"
          else
            echo "::set-output name=run_extractor::false"
          fi
          if [[ "${GITHUB_REF##*/}" == inserter* ]]; then
            echo "::set-output name=run_inserter::true"
          else
            echo "::set-output name=run_inserter::false"
          fi

  trigger-extractor-build:
    needs: determine-job
    if: needs.determine-job.outputs.run-extractor-job == 'true'
    uses: ./.github/workflows/extractor-build.yaml


  trigger-inserter-build:
    needs: determine-job
    if: needs.determine-job.outputs.run-inserter-job == 'true'
    uses: ./.github/workflows/inserter-build.yaml

1 change: 0 additions & 1 deletion Cargo.toml
@@ -68,7 +68,6 @@ prost-build = { version = "0.12.1" }
lto = true

[features]
default = ["SOLANA", "SEPARATE_PUBLISHERS"]
# Solana features
SOLANA_BIGTABLE = ["SOLANA", "dep:solana-storage-bigtable"]
SOLANA = [
27 changes: 2 additions & 25 deletions README.md
@@ -13,28 +13,5 @@ The overall infrastructure is depicted below.

For more information check the [documentation](/docs/).

## Setup
Use the script in `scripts/setup.sh` to automatically install system dependencies, clone the repo and all submodules, and compile:
- Tested on Ubuntu LTS 22.04
```
bash scripts/setup.sh
```
NOTE: you may need to run with `sudo`.

Next, build and run the development profile (default) with appropriate features:

E.g. to output to Google Pub/Sub or to output JSON files, replace `<OUTPUT_TYPE>` with `GOOGLE_PUBSUB` or `JSON`, respectively:

`cargo build --features <OUTPUT_TYPE>`

Finally, execute with the appropriate function and parameters.

E.g. Index starting from genesis onwards:

`./target/debug/blockchain_etl_indexer index-range stream 0`

Or to index from genesis to block 10:
`./target/debug/blockchain_etl_indexer index-range stream 0 10`

And to index a list of specific blocks, provide a CSV filepath with `index-list` command:
`./target/debug/blockchain_etl_indexer index-list stream FILE_PATH.csv`
## Usage
For instructions on system setup, compilation, and running, see the documentation on [Getting Started](/docs/getting-started.md).
42 changes: 42 additions & 0 deletions docs/getting-started.md
@@ -0,0 +1,42 @@
# Getting Started
## Install System Dependencies
Tested on Ubuntu 22.04 LTS:
```
sudo apt install git cargo g++ protobuf-compiler
```
## Clone the Repo
```
git clone https://github.com/blockchain-etl/solana-etl.git
```
## Compile the Code
```
cd solana-etl
cargo build --release --features <OUTPUT>
```
NOTE: you must replace `<OUTPUT>` in the above command with one of the supported output types, depending on how you would like to run the indexer. The supported outputs are:
1. `JSON`
2. `JSONL`
3. `GOOGLE_PUBSUB`
4. `RABBITMQ_CLASSIC`
5. `RABBITMQ_STREAM`

If you would like to upload the records as files to GCS buckets, then you should use `JSONL` as output, and you can run [this script](/scripts/upload_to_gcs.sh) to continually upload them.

If you would like to write the records to BigQuery using the Storage Write API, then you should use `RABBITMQ_CLASSIC` as output, and set up a RabbitMQ instance using the scripts in the `iac` directory. That documentation is available [here](/iac/README.md) and the Storage Write API scripts are available [here](/storage-write).

## Configure the Environment Variables
See the [documentation on environment variables](/docs/environment-variables.md).

## Run the Indexer
There are two CLI options to choose from:
1. `index-range`
2. `index-list`

Option 1 requires that you pass a starting slot to index from, and you can optionally provide a second slot as the ending index. The start is inclusive, and the end is exclusive.

Option 2 requires that you pass the path to a CSV file containing a list of specified slots to index.

As an example, if you would like to index from the genesis block onwards, you can run the following command:
```
RUST_LOG=WARN ./target/release/blockchain_etl_indexer index-range stream 0
```
NOTE: `RUST_LOG` specifies the logging level. For more information, see the `log` crate and [its logging levels](https://docs.rs/log/latest/log/enum.Level.html).
161 changes: 161 additions & 0 deletions src/README.md
@@ -0,0 +1,161 @@
# ETL-Core Documentation
This document explains what ETL-Core is and how it works.

## ETL Infrastructure Architecture
### Architecture Framework
The `etl-core` repository serves as the primary engine for ETL actions, operating at both the network and service level, and accepts custom configurations. Developers can set up these configurations within `etl-core`; once the network and export service are selected, `etl-core` exports the desired blockchain data.


Currently, the Solana blockchain is supported in [etl-solana-config](https://github.com/BCWResearch/etl-solana-config).

### Macro Infrastructure
An RPC node is expected to serve requests. Blocks are continually requested using the node, and if necessary, other data such as accounts may be requested as well. Upon response, the data is converted into a Protocol Buffers data format and sent to a streaming queue, such as Google Cloud Pub/Sub or RabbitMQ. You will need a transformer and loader that listens for the messages, transforms them to match the table schema, and inserts them into BigQuery.

## Response Deserialization
To deserialize JSON responses from the blockchain node, we expect the blockchain configuration to specify the structure of the response in a Rust `struct` and annotate it with the `Deserialize` macro from the `serde` library. This macro generates deserialization code for the developer which eases development, but more importantly allows us to deserialize it with the `simd-json` library.

The `simd-json` library uses CPU vector extensions for accelerated JSON deserialization. Currently, the library supports x86 and ARM vector extensions, but falls back to standard deserialization if used on a system that doesn't support SIMD.
* Since x86's AVX2 is 256-bit, while ARM's NEON is 128-bit, *you can expect best performance on x86*.
* This library is only used when compiled in the `release` profile, because its error messages are less descriptive. For development, it is recommended that you compile in debug mode (the default profile), which will use the `serde` deserializer, thus providing more descriptive errors.
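
To make this concrete, below is a minimal sketch of a response type annotated for `serde`, with a helper that switches between `simd-json` in release builds and `serde_json` in debug builds. The struct names and fields are illustrative only, not the actual blockchain-config types.

```rust
use serde::Deserialize;

// Illustrative response shape; the real structs live in the blockchain config.
#[derive(Deserialize, Debug)]
struct BlockResponse {
    jsonrpc: String,
    id: u64,
    result: Option<BlockResult>,
}

#[derive(Deserialize, Debug)]
struct BlockResult {
    blockhash: String,
    #[serde(rename = "parentSlot")]
    parent_slot: u64,
}

fn parse_block(raw: &mut [u8]) -> Result<BlockResponse, Box<dyn std::error::Error>> {
    // Release builds use simd-json for speed; debug builds fall back to
    // serde_json for its more descriptive error messages.
    #[cfg(not(debug_assertions))]
    let parsed = simd_json::from_slice(raw)?;
    #[cfg(debug_assertions)]
    let parsed = serde_json::from_slice(raw)?;
    Ok(parsed)
}
```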

## Environment Variables
### Synopsis

You can define environment variables in a `.env` file. Examples are illustrated in `.env.example`. A sketch of how these variables might be read at startup appears after the list below.

### Variables
- `ENDPOINT`
**Required**. Specifies the address to use for JSON-RPC requests.

- `FALLBACK_ENDPOINT`
**Required**. Specifies the address to use for JSON-RPC requests when the primary endpoint is failing. This value can be the same as `ENDPOINT`.

- `NUM_EXTRACTOR_THREADS`
**Required**. Specifies the number of concurrent threads to run an extract job.

- `ENABLE_METRICS`
**Required**. This variable determines whether to launch a metrics server to collect metrics for Prometheus.

- `METRICS_ADDRESS`
Optional. Required only if `ENABLE_METRICS` is true. Specifies the address of the metrics server.

- `METRICS_PORT`
Optional. Required only if `ENABLE_METRICS` is true. Specifies the port of the metrics server.

- `RABBITMQ_ADDRESS`
Optional. Required only if _STREAM_EXPORTER_ is set to `RABBITMQ_STREAM`. Specifies the address of RabbitMQ.

- `RABBITMQ_PORT`
Optional. Required only if _STREAM_EXPORTER_ is set to `RABBITMQ_STREAM`. Specifies the port of RabbitMQ.

- `BIGTABLE_CRED`
Optional. Specifies the file path of the credential file required to access GCP Bigtable.

- `GCP_CREDENTIALS_JSON_PATH`
Optional. Required only if _STREAM_EXPORTER_ is set to `GOOGLE_PUBSUB`. Specifies the file path of the credential file required to access Google Pubsub.

- `GOOGLE_PUBSUB_TOPIC`
Optional. Required only if _STREAM_EXPORTER_ is set to `GOOGLE_PUBSUB`. Specifies the Google Pubsub topic to be used during exporting. It is assumed that the PubSub Topic is already created.
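
As an illustration of how these variables might be consumed at startup, here is a minimal sketch using `std::env`. The `Config` struct and error handling are assumptions for the example; the project's actual loading logic (e.g. via a `.env` loader) may differ.

```rust
use std::env;

// Illustrative startup check for the required variables listed above.
struct Config {
    endpoint: String,
    fallback_endpoint: String,
    num_extractor_threads: usize,
    enable_metrics: bool,
}

fn load_config() -> Result<Config, String> {
    let required = |key: &str| env::var(key).map_err(|_| format!("{key} is required"));

    let enable_metrics = required("ENABLE_METRICS")?
        .parse::<bool>()
        .map_err(|e| format!("ENABLE_METRICS must be true/false: {e}"))?;

    // METRICS_ADDRESS / METRICS_PORT are only needed when metrics are enabled.
    if enable_metrics {
        required("METRICS_ADDRESS")?;
        required("METRICS_PORT")?;
    }

    Ok(Config {
        endpoint: required("ENDPOINT")?,
        fallback_endpoint: required("FALLBACK_ENDPOINT")?,
        num_extractor_threads: required("NUM_EXTRACTOR_THREADS")?
            .parse()
            .map_err(|e| format!("NUM_EXTRACTOR_THREADS must be a number: {e}"))?,
        enable_metrics,
    })
}
```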

## Data Extraction

All RPC requests are retried with backoff upon failure, with failures logged at the `warning` level.
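
A minimal sketch of that retry pattern is shown below, assuming a placeholder `send_request` closure; it illustrates exponential backoff with `warn`-level logging and is not the indexer's actual implementation.

```rust
use std::{thread, time::Duration};

// Illustrative retry loop: exponential backoff, failures logged at `warn`.
// `send_request` stands in for whatever RPC call the indexer makes.
fn request_with_backoff<T, E: std::fmt::Display>(
    mut send_request: impl FnMut() -> Result<T, E>,
) -> T {
    let mut delay = Duration::from_millis(500);
    loop {
        match send_request() {
            Ok(response) => return response,
            Err(err) => {
                log::warn!("RPC request failed, retrying in {:?}: {}", delay, err);
                thread::sleep(delay);
                delay = (delay * 2).min(Duration::from_secs(30));
            }
        }
    }
}
```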

Blocks are requested from the node by the `call_getBlock()` function.

The `call_getBlockHeight()` function requests the current block height.

The `call_getMultipleAccounts()` function requests account data for a list of pubkeys. These pubkeys come from the created accounts and token mints in the block data.

The blockchain configuration is expected to define the HTTP requests that these functions make in a `<BLOCKCHAIN_CONFIG>/types/request_types.rs` file. These requests should be specified using `struct`s called `BlockHeightRequest` and `BlockRequest`, and should implement `serde::Serialize`. It is recommended that you annotate the struct with `#[derive(serde::Serialize)]` to simplify this process and generate the code.
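
As a hedged sketch of what such request types could look like, the example below derives `serde::Serialize` on JSON-RPC request bodies. Only the struct names (`BlockRequest`, `BlockHeightRequest`) come from the expectation above; the field layout and helper function are illustrative.

```rust
use serde::Serialize;

// Illustrative JSON-RPC request bodies; the real definitions live in
// <BLOCKCHAIN_CONFIG>/types/request_types.rs.
#[derive(Serialize)]
struct BlockHeightRequest {
    jsonrpc: String,
    id: u64,
    method: String, // e.g. "getBlockHeight"
}

#[derive(Serialize)]
struct BlockRequest {
    jsonrpc: String,
    id: u64,
    method: String, // e.g. "getBlock"
    params: (u64,), // the slot to fetch
}

fn block_request_body(slot: u64) -> serde_json::Result<String> {
    serde_json::to_string(&BlockRequest {
        jsonrpc: "2.0".into(),
        id: 1,
        method: "getBlock".into(),
        params: (slot,),
    })
}
```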

### Concurrency

The master thread continually sends slot values to a concurrent queue for worker threads to index.

Long-lived threads are created at the start of runtime by the master thread, and continually pull tasks (slot values) from the concurrent queue. Each thread makes requests to the node for the block data at that slot, then deserializes the response, and transmits the data to a stream queue.
* For communication with the stream queue (which supports concurrent producers), each thread serializes its data using the protocol buffers interface, and transmits the information.
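
A simplified sketch of this master/worker layout is shown below, assuming `crossbeam-channel` for the concurrent queue; the actual task payloads, block fetching, and stream-queue publishing steps are stubbed out.

```rust
use crossbeam_channel::{bounded, Receiver};
use std::thread;

// Long-lived workers pull slot numbers from the shared queue; in the real
// pipeline each worker would fetch the block, deserialize it, and publish
// the protobuf-serialized record to the stream queue.
fn spawn_workers(slots: Receiver<u64>, num_workers: usize) -> Vec<thread::JoinHandle<()>> {
    (0..num_workers)
        .map(|_| {
            let slots = slots.clone();
            thread::spawn(move || {
                while let Ok(slot) = slots.recv() {
                    // fetch_block(slot) / publish(block) are placeholders here.
                    println!("worker handling slot {slot}");
                }
            })
        })
        .collect()
}

fn main() {
    let (sender, receiver) = bounded::<u64>(1024);
    let workers = spawn_workers(receiver, 4);

    // Master thread: enqueue slot values for the workers to index.
    for slot in 0..10_000u64 {
        sender.send(slot).expect("workers hung up");
    }
    drop(sender); // closing the channel lets the workers exit

    for handle in workers {
        handle.join().unwrap();
    }
}
```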

## Features

### Synopsis

You can either define default features in the `Cargo.toml` file inside the `etl-core` repository or pass them with the `--features` flag as part of a command.

`cargo build --features ARGS...`
`cargo run --features ARGS...`

The `--features` option is required to build or run the ETL project.

### Arguments

Currently, the following blockchains are supported:
- `SOLANA`

A message queue is required to be specified:
- `RABBITMQ` - a classic RabbitMQ queue
- `RABBITMQ_STREAM` - a RabbitMQ with Stream Queue plugin
- `GOOGLE_PUBSUB` - Google Cloud Pub/Sub

### Examples

1. Build the local project and its dependencies for the _SOLANA_ blockchain
```
cargo build --release --features SOLANA,RABBITMQ_STREAM
```

2. Run the local project and its dependencies for the _SOLANA_ blockchain and _RABBITMQ_STREAM_ exporter
```
cargo run --features SOLANA,RABBITMQ_STREAM
```

## Limitations
- Only a limited subset of `Token-2022 Program` information is extracted.
- The `SOLANA_BIGTABLE` feature can only request 1,000 confirmed slots at a time.

## Project Progress

### Deployment Method
| Method                  | Development Status |
| ---------------------- | ------------------ |
| Dockerfile | In Development |
| Helm Chart | In Development |

### Export Method
| Method                  | Development Status |
| ---------------------- | ------------------ |
| CSV | Completed |
| Google PubSub | Completed |
| RabbitMQ | Completed |

### Extraction Source
| Source                  | Development Status |
| ---------------------- | ------------------ |
| Bigtable | Completed |
| JSON RPC | Completed |

### Metrics Collection
| Metrics | Development Status |
| ---------------------- | ------------------ |
| Block Request Count | In Development |
| Failed Block Count | Not Started |

### Tables
| Table | Development Status |
| ---------------- | ------------------ |
| Accounts | Completed |
| Blocks | Completed |
| Instructions | Completed |
| Tokens | Completed |
| Token Transfers | Completed |
| Transactions | Completed |


## Protocol Buffers

We use protocol buffers to serialize our data for transmission to a pub/sub system like RabbitMQ or Google Cloud Pub/Sub.

Some blockchains provide their own protobuf interfaces, so when possible, we will attempt to use those.

### Codegen
To generate Rust code from our protobuf interface, we use the `PROST` library. This is a popular library for Rust, and is used by the Solana blockchain with their official "storage" protobuf. We perform this codegen at compile time, using a custom Rust build script: `build_proto.rs`. This script uses the `include!` macro to import the protobuf build script from the blockchain-specific configuration. It is expected that each blockchain config will define its own protobuf build script.
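
For illustration, here is a minimal `build.rs`-style sketch using `prost-build`. The proto paths are placeholders: the actual `build_proto.rs` assembles the blockchain config's own build script via `include!`.

```rust
// build.rs (illustrative): compile the protobuf interface with prost-build.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    prost_build::compile_protos(
        &["proto/records.proto"], // protobuf definitions to compile (placeholder path)
        &["proto/"],              // include path for imports
    )?;
    // Re-run the build script when the interface changes.
    println!("cargo:rerun-if-changed=proto/records.proto");
    Ok(())
}
```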