Commit

Update big query script (#155)
* Move data collection to dedicated folder

* Add last blocks collected

* Add block number and timestamp conditions to queries

* Flake8

* Remove force_query arg

* Fix typo

* Add quotes around timestamp in queries
LadyChristina committed Jul 16, 2024
1 parent ae769b0 commit d03816a
Showing 4 changed files with 59 additions and 21 deletions.
consensus_decentralization/collect_data.py → data_collection_scripts/collect_block_data.py
@@ -1,39 +1,40 @@
"""
This script can be used to run queries on BigQuery for any number of blockchains, and save the results in the
raw_block_data directory of the project.
The relevant queries must be stored in a file named 'queries.yaml' in the root directory of the project.
The relevant queries must be stored in a file named 'queries.yaml' in the `data_collection_scripts` directory of
the project.
Attention! Before running this script, you need to generate service account credentials from Google, as described
here (https://developers.google.com/workspace/guides/create-credentials#service-account) and save your key in the
root directory of the project under the name 'google-service-account-key.json'
`data_collection_scripts` directory of the project under the name 'google-service-account-key.json'
"""
import consensus_decentralization.helper as hlp
import google.cloud.bigquery as bq
import json
import argparse
import logging
from yaml import safe_load
from datetime import datetime

from consensus_decentralization.helper import ROOT_DIR, RAW_DATA_DIR


def collect_data(ledgers, force_query):
def collect_data(ledgers, from_block, to_date):
if not RAW_DATA_DIR.is_dir():
RAW_DATA_DIR.mkdir()

with open(ROOT_DIR / "queries.yaml") as f:
data_collection_dir = ROOT_DIR / "data_collection_scripts"

with open(data_collection_dir / "queries.yaml") as f:
queries = safe_load(f)

client = bq.Client.from_service_account_json(json_credentials_path=ROOT_DIR / "google-service-account-key.json")
client = bq.Client.from_service_account_json(json_credentials_path=data_collection_dir / "google-service-account-key.json")

for ledger in ledgers:
file = RAW_DATA_DIR / f'{ledger}_raw_data.json'
if not force_query and file.is_file():
logging.info(f'{ledger} data already exists locally. '
f'For querying {ledger} anyway please run the script using the flag --force-query')
continue
logging.info(f"Querying {ledger}..")
query = (queries[ledger])

query = (queries[ledger]).replace("{{block_number}}", str(from_block[ledger]) if from_block[ledger] else "-1").replace("{{timestamp}}", to_date)
query_job = client.query(query)
try:
rows = query_job.result()
@@ -44,13 +45,30 @@ def collect_data(ledgers, force_query):
            continue

        logging.info(f"Writing {ledger} data to file..")
-        # write json lines to file
-        with open(file, 'w') as f:
+        # Append result to file
+        with open(file, 'a') as f:
            for row in rows:
                f.write(json.dumps(dict(row), default=str) + "\n")
        logging.info(f'Done writing {ledger} data to file.\n')


+def get_last_block_collected(ledger):
+    """
+    Get the last block collected for a ledger. This is useful for knowing where to start collecting data from.
+    Assumes that the data is stored in a json lines file, ordered by increasing block number.
+    :param ledger: the ledger to get the last block collected for
+    :returns: the number of the last block collected for the specified ledger
+    """
+    file = RAW_DATA_DIR / f'{ledger}_raw_data.json'
+    if not file.is_file():
+        return None
+    with open(file) as f:
+        for line in f:
+            pass
+    last_block = json.loads(line)
+    return last_block['number']


if __name__ == '__main__':
    logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)

@@ -66,9 +84,12 @@ def collect_data(ledgers, force_query):
        help='The ledgers to collect data for.'
    )
    parser.add_argument(
-        '--force-query',
-        action='store_true',
-        help='Flag to specify whether to query for project data regardless if the relevant data already exist.'
+        '--to_date',
+        type=hlp.valid_date,
+        default=datetime.today().strftime('%Y-%m-%d'),
+        help='The date until which to get data for (YYYY-MM-DD format). Defaults to today.'
    )

    args = parser.parse_args()
-    collect_data(ledgers=args.ledgers, force_query=args.force_query)
+    from_block = {ledger: get_last_block_collected(ledger) for ledger in args.ledgers}
+    collect_data(ledgers=args.ledgers, from_block=from_block, to_date=args.to_date)
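Read together, the changes make collection incremental: the script looks up the last block stored locally, substitutes it (or -1 on a first run) into the `{{block_number}}` placeholder, bounds the query with `{{timestamp}}`, and appends new rows to the existing file. A minimal self-contained sketch of that flow, using a hypothetical in-memory template and file path rather than the repository's queries.yaml:

```python
# Sketch of the incremental-collection logic added in this commit.
# QUERY_TEMPLATE and the file path are illustrative stand-ins.
import json

QUERY_TEMPLATE = ("SELECT number, timestamp "
                  "FROM `bigquery-public-data.crypto_ethereum.blocks` "
                  "WHERE number > {{block_number}} AND timestamp < '{{timestamp}}'")


def get_last_block_collected(path):
    """Return the block number on the last JSON line of the file, or None.
    Assumes rows were appended in increasing block-number order."""
    line = None
    try:
        with open(path) as f:
            for line in f:
                pass
    except FileNotFoundError:
        return None
    return json.loads(line)['number'] if line else None


def render_query(template, last_block, to_date):
    # Mirrors the script: with no prior data, -1 is substituted, so
    # `number > -1` matches the chain from genesis.
    return (template.replace("{{block_number}}", str(last_block) if last_block else "-1")
                    .replace("{{timestamp}}", to_date))


print(render_query(QUERY_TEMPLATE, get_last_block_collected("ethereum_raw_data.json"), "2024-07-16"))
```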
File renamed without changes.
17 changes: 17 additions & 0 deletions queries.yaml → data_collection_scripts/queries.yaml
@@ -3,36 +3,46 @@ bitcoin:
  FROM `bigquery-public-data.crypto_bitcoin.transactions`
  JOIN `bigquery-public-data.crypto_bitcoin.blocks` ON `bigquery-public-data.crypto_bitcoin.transactions`.block_number = `bigquery-public-data.crypto_bitcoin.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp

bitcoin_cash:
  SELECT block_number as number, block_timestamp as timestamp, coinbase_param as identifiers, `bigquery-public-data.crypto_bitcoin_cash.transactions`.outputs
  FROM `bigquery-public-data.crypto_bitcoin_cash.transactions`
  JOIN `bigquery-public-data.crypto_bitcoin_cash.blocks` ON `bigquery-public-data.crypto_bitcoin_cash.transactions`.block_number = `bigquery-public-data.crypto_bitcoin_cash.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp

cardano:
  SELECT `blockchain-analytics-392322.cardano_mainnet.block`.slot_no as number, `blockchain-analytics-392322.cardano_mainnet.pool_offline_data`.ticker_name as identifiers, `blockchain-analytics-392322.cardano_mainnet.block`.block_time as timestamp,`blockchain-analytics-392322.cardano_mainnet.block`.pool_hash as reward_addresses
  FROM `blockchain-analytics-392322.cardano_mainnet.block`
  LEFT JOIN `blockchain-analytics-392322.cardano_mainnet.pool_offline_data` ON `blockchain-analytics-392322.cardano_mainnet.block`.pool_hash = `blockchain-analytics-392322.cardano_mainnet.pool_offline_data`.pool_hash
  WHERE `blockchain-analytics-392322.cardano_mainnet.block`.block_time > '2018-01-01'
+  AND `blockchain-analytics-392322.cardano_mainnet.block`.block_time < '{{timestamp}}'
+  AND number > {{block_number}}
  ORDER BY `blockchain-analytics-392322.cardano_mainnet.block`.block_time

dogecoin:
  SELECT block_number as number, block_timestamp as timestamp, coinbase_param as identifiers, `bigquery-public-data.crypto_dogecoin.transactions`.outputs
  FROM `bigquery-public-data.crypto_dogecoin.transactions`
  JOIN `bigquery-public-data.crypto_dogecoin.blocks` ON `bigquery-public-data.crypto_dogecoin.transactions`.block_number = `bigquery-public-data.crypto_dogecoin.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp

ethereum:
  SELECT number, timestamp, miner as reward_addresses, extra_data as identifiers
  FROM `bigquery-public-data.crypto_ethereum.blocks`
  WHERE timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
+  AND number > {{block_number}}
  ORDER BY timestamp


@@ -41,20 +51,27 @@ litecoin:
  FROM `bigquery-public-data.crypto_litecoin.transactions`
  JOIN `bigquery-public-data.crypto_litecoin.blocks` ON `bigquery-public-data.crypto_litecoin.transactions`.block_number = `bigquery-public-data.crypto_litecoin.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp


tezos:
  SELECT level as number, timestamp, baker as reward_addresses
  FROM `public-data-finance.crypto_tezos.blocks`
  WHERE timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
+  AND number > {{block_number}}
  ORDER BY timestamp

zcash:
  SELECT block_number as number, block_timestamp as timestamp, coinbase_param as identifiers, `bigquery-public-data.crypto_zcash.transactions`.outputs
  FROM `bigquery-public-data.crypto_zcash.transactions`
  JOIN `bigquery-public-data.crypto_zcash.blocks` ON `bigquery-public-data.crypto_zcash.transactions`.block_number = `bigquery-public-data.crypto_zcash.blocks`.number
  WHERE is_coinbase is TRUE
+  AND number > {{block_number}}
  AND timestamp > '2018-01-01'
+  AND timestamp < '{{timestamp}}'
  ORDER BY timestamp

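A side note on the approach: the script fills `{{block_number}}` and `{{timestamp}}` by plain string replacement on these templates, which is why the commit needed to "Add quotes around timestamp in queries". BigQuery also supports named query parameters, which sidestep quoting entirely; a sketch of that alternative (not what the repository does, shown here against the public Ethereum dataset with illustrative values):

```python
# Illustrative alternative, not the repository's approach: pass values as
# BigQuery named parameters instead of string-replacing {{...}} placeholders.
from datetime import datetime
import google.cloud.bigquery as bq

# Example credentials path, mirroring the script's convention.
client = bq.Client.from_service_account_json(
    json_credentials_path="google-service-account-key.json")

query = """
    SELECT number, timestamp
    FROM `bigquery-public-data.crypto_ethereum.blocks`
    WHERE number > @from_block AND timestamp < @to_date
    ORDER BY timestamp
"""
job_config = bq.QueryJobConfig(query_parameters=[
    bq.ScalarQueryParameter("from_block", "INT64", 19000000),
    bq.ScalarQueryParameter("to_date", "TIMESTAMP", datetime(2024, 7, 16)),
])
rows = client.query(query, job_config=job_config).result()
```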
10 changes: 5 additions & 5 deletions docs/data.md
@@ -97,22 +97,22 @@

Instead of executing each of these queries separately on the BigQuery console and saving the results manually, it is
also possible to automate the process using a
-[script](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/consensus_decentralization/collect_data.py)
+[script](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/data_collection_scripts/collect_block_data.py)
and collect all relevant data in one go. Executing this script will run queries
-from [this file](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/queries.yaml).
+from [this file](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/data_collection_scripts/queries.yaml).

IMPORTANT: the script uses service account credentials for authentication, therefore before running it, you need to
generate the relevant credentials from Google, as described
[here](https://developers.google.com/workspace/guides/create-credentials#service-account) and save your key in the
-root directory of the project under the name 'google-service-account-key.json'. There is a
-[sample file](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/google-service-account-key-SAMPLE.json)
+`data_collection_scripts` directory of the project under the name 'google-service-account-key.json'. There is a
+[sample file](https://github.com/Blockchain-Technology-Lab/consensus-decentralization/blob/main/data_collection_scripts/google-service-account-key-SAMPLE.json)
that you can consult, which shows what your credentials are supposed to look like (but note that this is for
informational purposes only, this file is not used in the code).

Once you have set up the credentials, you can just run the following command from the root
directory to retrieve data for all supported blockchains:

-`python -m consensus_decentralization.collect_data`
+`python -m data_collection_scripts.collect_block_data`

There are also two command line arguments that can be used to customize the data collection process:

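Judging by the parser in the updated script, those two arguments are the ledger list and `--to_date`. A usage sketch (the `--ledgers` flag name is inferred from `args.ledgers`; its definition is truncated in this diff view):

`python -m data_collection_scripts.collect_block_data --ledgers bitcoin ethereum --to_date 2024-07-01`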