Commit

Update big query script (#155)
* Move data collection to dedicated folder

* Add last blocks collected

* Add block number and timestamp conditions to queries

* Flake8

* Remove force_query arg

* Fix typo

* Add quotes around timestamp in queries
LadyChristina committed Jul 16, 2024
1 parent ae769b0 commit d03816a
Showing 4 changed files with 59 additions and 21 deletions.
consensus_decentralization/collect_data.py → data_collection_scripts/collect_block_data.py
@@ -1,39 +1,40 @@
"""
This script can be used to run queries on BigQuery for any number of blockchains, and save the results in the
raw_block_data directory of the project.
The relevant queries must be stored in a file named 'queries.yaml' in the root directory of the project.
The relevant queries must be stored in a file named 'queries.yaml' in the `data_collection_scripts` directory of
the project.
Attention! Before running this script, you need to generate service account credentials from Google, as described
here (https://developers.google.com/workspace/guides/create-credentials#service-account) and save your key in the
root directory of the project under the name 'google-service-account-key.json'
`data_collection_scripts` directory of the project under the name 'google-service-account-key.json'
"""
import consensus_decentralization.helper as hlp
import google.cloud.bigquery as bq
import json
import argparse
import logging
from yaml import safe_load
from datetime import datetime

from consensus_decentralization.helper import ROOT_DIR, RAW_DATA_DIR


def collect_data(ledgers, force_query):
def collect_data(ledgers, from_block, to_date):
if not RAW_DATA_DIR.is_dir():
RAW_DATA_DIR.mkdir()

with open(ROOT_DIR / "queries.yaml") as f:
data_collection_dir = ROOT_DIR / "data_collection_scripts"

with open(data_collection_dir / "queries.yaml") as f:
queries = safe_load(f)

client = bq.Client.from_service_account_json(json_credentials_path=ROOT_DIR / "google-service-account-key.json")
client = bq.Client.from_service_account_json(json_credentials_path=data_collection_dir / "google-service-account-key.json")

for ledger in ledgers:
file = RAW_DATA_DIR / f'{ledger}_raw_data.json'
if not force_query and file.is_file():
logging.info(f'{ledger} data already exists locally. '
f'For querying {ledger} anyway please run the script using the flag --force-query')
continue
logging.info(f"Querying {ledger}..")
query = (queries[ledger])

query = (queries[ledger]).replace("{{block_number}}", str(from_block[ledger]) if from_block[ledger] else "-1").replace("{{timestamp}}", to_date)
query_job = client.query(query)
try:
rows = query_job.result()
@@ -44,13 +45,30 @@ def collect_data(ledgers, force_query):
            continue

        logging.info(f"Writing {ledger} data to file..")
-        # write json lines to file
-        with open(file, 'w') as f:
+        # Append result to file
+        with open(file, 'a') as f:
            for row in rows:
                f.write(json.dumps(dict(row), default=str) + "\n")
        logging.info(f'Done writing {ledger} data to file.\n')


+def get_last_block_collected(ledger):
+    """
+    Get the last block collected for a ledger. This is useful for knowing where to start collecting data from.
+    Assumes that the data is stored in a json lines file, ordered by increasing block number.
+    :param ledger: the ledger to get the last block collected for
+    :returns: the number of the last block collected for the specified ledger
+    """
+    file = RAW_DATA_DIR / f'{ledger}_raw_data.json'
+    if not file.is_file():
+        return None
+    with open(file) as f:
+        for line in f:
+            pass
+    last_block = json.loads(line)
+    return last_block['number']


if __name__ == '__main__':
    logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)

@@ -66,9 +84,12 @@ def collect_data(ledgers, force_query):
        help='The ledgers to collect data for.'
    )
    parser.add_argument(
-        '--force-query',
-        action='store_true',
-        help='Flag to specify whether to query for project data regardless if the relevant data already exist.'
+        '--to_date',
+        type=hlp.valid_date,
+        default=datetime.today().strftime('%Y-%m-%d'),
+        help='The date until which to get data for (YYYY-MM-DD format). Defaults to today.'
    )

    args = parser.parse_args()
-    collect_data(ledgers=args.ledgers, force_query=args.force_query)
+    from_block = {ledger: get_last_block_collected(ledger) for ledger in args.ledgers}
+    collect_data(ledgers=args.ledgers, from_block=from_block, to_date=args.to_date)
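Read together, the changes make collection incremental: the script looks up the last block stored locally, substitutes it (or -1 on a first run) into the `{{block_number}}` placeholder, bounds the query with `{{timestamp}}`, and appends new rows to the existing file. A minimal self-contained sketch of that flow, using a hypothetical in-memory template and file path rather than the repository's queries.yaml:

```python
# Sketch of the incremental-collection logic added in this commit.
# QUERY_TEMPLATE and the file path are illustrative stand-ins.
import json

QUERY_TEMPLATE = ("SELECT number, timestamp "
                  "FROM `bigquery-public-data.crypto_ethereum.blocks` "
                  "WHERE number > {{block_number}} AND timestamp < '{{timestamp}}'")


def get_last_block_collected(path):
    """Return the block number on the last JSON line of the file, or None.
    Assumes rows were appended in increasing block-number order."""
    line = None
    try:
        with open(path) as f:
            for line in f:
                pass
    except FileNotFoundError:
        return None
    return json.loads(line)['number'] if line else None


def render_query(template, last_block, to_date):
    # Mirrors the script: with no prior data, -1 is substituted, so
    # `number > -1` matches the chain from genesis.
    return (template.replace("{{block_number}}", str(last_block) if last_block else "-1")
                    .replace("{{timestamp}}", to_date))


print(render_query(QUERY_TEMPLATE, get_last_block_collected("ethereum_raw_data.json"), "2024-07-16"))
```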
File renamed without changes.
17 changes: 17 additions & 0 deletions queries.yaml → data_collection_scripts/queries.yaml
@@ -3,36 +3,46 @@ bitcoin:
  FROM `bigquery-public-data.crypto_bitcoin.transactions`
  JOIN `bigquery-public-data.crypto_bitcoin.blocks` ON `bigquery-public-data.crypto_bitcoin.transactions`.block_number = `bigquery-public-data.crypto_bitcoin.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp

bitcoin_cash:
  SELECT block_number as number, block_timestamp as timestamp, coinbase_param as identifiers, `bigquery-public-data.crypto_bitcoin_cash.transactions`.outputs
  FROM `bigquery-public-data.crypto_bitcoin_cash.transactions`
  JOIN `bigquery-public-data.crypto_bitcoin_cash.blocks` ON `bigquery-public-data.crypto_bitcoin_cash.transactions`.block_number = `bigquery-public-data.crypto_bitcoin_cash.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp

cardano:
  SELECT `blockchain-analytics-392322.cardano_mainnet.block`.slot_no as number, `blockchain-analytics-392322.cardano_mainnet.pool_offline_data`.ticker_name as identifiers, `blockchain-analytics-392322.cardano_mainnet.block`.block_time as timestamp,`blockchain-analytics-392322.cardano_mainnet.block`.pool_hash as reward_addresses
  FROM `blockchain-analytics-392322.cardano_mainnet.block`
  LEFT JOIN `blockchain-analytics-392322.cardano_mainnet.pool_offline_data` ON `blockchain-analytics-392322.cardano_mainnet.block`.pool_hash = `blockchain-analytics-392322.cardano_mainnet.pool_offline_data`.pool_hash
  WHERE `blockchain-analytics-392322.cardano_mainnet.block`.block_time > '2018-01-01'
+  AND `blockchain-analytics-392322.cardano_mainnet.block`.block_time < '{{timestamp}}'
+  AND number > {{block_number}}
  ORDER BY `blockchain-analytics-392322.cardano_mainnet.block`.block_time

dogecoin:
  SELECT block_number as number, block_timestamp as timestamp, coinbase_param as identifiers, `bigquery-public-data.crypto_dogecoin.transactions`.outputs
  FROM `bigquery-public-data.crypto_dogecoin.transactions`
  JOIN `bigquery-public-data.crypto_dogecoin.blocks` ON `bigquery-public-data.crypto_dogecoin.transactions`.block_number = `bigquery-public-data.crypto_dogecoin.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp

ethereum:
  SELECT number, timestamp, miner as reward_addresses, extra_data as identifiers
  FROM `bigquery-public-data.crypto_ethereum.blocks`
  WHERE timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
+  AND number > {{block_number}}
  ORDER BY timestamp


@@ -41,20 +51,27 @@ litecoin:
  FROM `bigquery-public-data.crypto_litecoin.transactions`
  JOIN `bigquery-public-data.crypto_litecoin.blocks` ON `bigquery-public-data.crypto_litecoin.transactions`.block_number = `bigquery-public-data.crypto_litecoin.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp


tezos:
  SELECT level as number, timestamp, baker as reward_addresses
  FROM `public-data-finance.crypto_tezos.blocks`
  WHERE timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
+  AND number > {{block_number}}
  ORDER BY timestamp

zcash:
  SELECT block_number as number, block_timestamp as timestamp, coinbase_param as identifiers, `bigquery-public-data.crypto_zcash.transactions`.outputs
  FROM `bigquery-public-data.crypto_zcash.transactions`
  JOIN `bigquery-public-data.crypto_zcash.blocks` ON `bigquery-public-data.crypto_zcash.transactions`.block_number = `bigquery-public-data.crypto_zcash.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp

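A side note on the approach: the script fills `{{block_number}}` and `{{timestamp}}` by plain string replacement on these templates, which is why the commit needed to "Add quotes around timestamp in queries". BigQuery also supports named query parameters, which sidestep quoting entirely; a sketch of that alternative (not what the repository does, shown here against the public Ethereum dataset with illustrative values):

```python
# Illustrative alternative, not the repository's approach: pass values as
# BigQuery named parameters instead of string-replacing {{...}} placeholders.
from datetime import datetime
import google.cloud.bigquery as bq

# Example credentials path, mirroring the script's convention.
client = bq.Client.from_service_account_json(
    json_credentials_path="google-service-account-key.json")

query = """
    SELECT number, timestamp
    FROM `bigquery-public-data.crypto_ethereum.blocks`
    WHERE number > @from_block AND timestamp < @to_date
    ORDER BY timestamp
"""
job_config = bq.QueryJobConfig(query_parameters=[
    bq.ScalarQueryParameter("from_block", "INT64", 19000000),
    bq.ScalarQueryParameter("to_date", "TIMESTAMP", datetime(2024, 7, 16)),
])
rows = client.query(query, job_config=job_config).result()
```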
10 changes: 5 additions & 5 deletions docs/data.md
@@ -97,22 +97,22 @@

Instead of executing each of these queries separately on the BigQuery console and saving the results manually, it is
also possible to automate the process using a
-[script](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/consensus_decentralization/collect_data.py)
+[script](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/data_collection_scripts/collect_block_data.py)
and collect all relevant data in one go. Executing this script will run queries
-from [this file](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/queries.yaml).
+from [this file](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/data_collection_scripts/queries.yaml).

IMPORTANT: the script uses service account credentials for authentication, therefore before running it, you need to
generate the relevant credentials from Google, as described
[here](https://developers.google.com/workspace/guides/create-credentials#service-account) and save your key in the
-root directory of the project under the name 'google-service-account-key.json'. There is a
-[sample file](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/google-service-account-key-SAMPLE.json)
+`data_collection_scripts` directory of the project under the name 'google-service-account-key.json'. There is a
+[sample file](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/data_collection_scripts/google-service-account-key-SAMPLE.json)
that you can consult, which shows what your credentials are supposed to look like (but note that this is for
informational purposes only, this file is not used in the code).

Once you have set up the credentials, you can just run the following command from the root
directory to retrieve data for all supported blockchains:

-`python -m consensus_decentralization.collect_data`
+`python -m data_collection_scripts.collect_block_data`

There are also two command line arguments that can be used to customize the data collection process:

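Judging by the parser in the updated script, those two arguments are the ledger list and `--to_date`. A usage sketch (the `--ledgers` flag name is inferred from `args.ledgers`; its definition is truncated in this diff view):

`python -m data_collection_scripts.collect_block_data --ledgers bitcoin ethereum --to_date 2024-07-01`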