Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update execution parameter handling #154

Merged
merged 4 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 36 additions & 8 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# The metrics for which an analysis should be performed.
metrics:
entropy:
- 1
Expand All @@ -12,14 +13,8 @@ metrics:
- 0.33
- 0.66

analyze_flags:
clustering: true

default_timeframe:
start_date: 2010-01-01
end_date: 2023-12-31

default_ledgers:
# The ledgers for which an analysis should be performed.
ledgers:
- bitcoin
- bitcoin_cash
- cardano
Expand All @@ -28,3 +23,36 @@ default_ledgers:
- litecoin
- tezos
- zcash

# Execution flags
execution_flags:
force_map: false

# Analyze flags
analyze_flags:
clustering: true

# The timeframe for which an analysis should be performed.
# Each date is a string of the form YYYY-MM-DD.
# If granularity is also set, then the analysis will run on the timeframe of the two farthest snapshots.
timeframe:
start_date: 2010-01-01
end_date: 2023-12-31

# The granularity for the analysis when two dates are provided in the --snapshot_dates argument (which are then interpreted as start and end dates).
# It can be one of: "day", "week", "month", "year", or empty. If empty, then only the snapshots for the given dates will be analyzed.
granularity: "month"

input_directories: # Paths to directories that contain raw input data
- ./input

# Paths to directories of snapshot db files; either absolute or relative from run.py.
# The first path will be used to write newly created dbs and the output of runs
output_directories:
- ./output


# Plot flags
plot_parameters:
plot: false
animated: true
2 changes: 1 addition & 1 deletion consensus_decentralization/collect_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def collect_data(ledgers, force_query):
if __name__ == '__main__':
logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)

default_ledgers = hlp.get_default_ledgers()
default_ledgers = hlp.get_ledgers()

parser = argparse.ArgumentParser()
parser.add_argument(
Expand Down
68 changes: 63 additions & 5 deletions consensus_decentralization/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,24 +265,27 @@ def get_metrics_config():
return metrics


def get_ledgers():
    """
    Retrieves data regarding the ledgers to use
    :returns: a list of strings that correspond to the ledgers that will be used (unless overridden by the relevant
        cmd arg)
    """
    config = get_config_data()
    # 'ledgers' is a required top-level key in config.yaml
    ledgers = config['ledgers']
    return ledgers


def get_start_end_dates():
    """
    Retrieves the start and end dates for which to analyze data
    :returns: a tuple of two strings, (<start date>, <end date>)
    :raises ValueError: if the start or end date in the config file is not a valid date
    """
    config = get_config_data()
    # Validate both dates before returning, so a malformed config fails fast with a clear message.
    for date_type in ['start', 'end']:
        if not valid_date(str(config['timeframe'][f'{date_type}_date'])):
            raise ValueError(f'Invalid {date_type} date')
    return str(config['timeframe']['start_date']), str(config['timeframe']['end_date'])


def read_mapped_project_data(project_dir):
Expand Down Expand Up @@ -361,3 +364,58 @@ def get_date_from_block(block, level='day'):
elif level == 'day':
return timestamp[:10]
raise ValueError(f'Invalid level: {level}')


def get_granularity():
    """
    Retrieves the granularity to be used in the analysis
    :returns: string in ['day', 'week', 'month', 'year'] that represents the chosen granularity,
        or 'all' if the relevant field is empty in the config file
    :raises ValueError: if the granularity field is missing from the config file or if
        the chosen value is not one of the allowed ones
    """
    # Keep the try body minimal so only the missing-key case is converted to ValueError.
    try:
        granularity = get_config_data()['granularity']
    except KeyError:
        raise ValueError('"granularity" not in config file')
    if not granularity:
        # An empty / None granularity means no aggregation: the whole timeframe is one unit.
        return 'all'
    if granularity in ('day', 'week', 'month', 'year'):
        return granularity
    raise ValueError('Malformed "granularity" in config; should be one of: "day", "week", "month", "year", or empty')


def get_plot_flag():
    """
    Gets the flag that determines whether to generate plots for the output
    :returns: boolean
    :raises ValueError: if the flag is not set in the config file
    """
    config = get_config_data()
    try:
        return config['plot_parameters']['plot']
    except KeyError as e:
        # Chain the original KeyError so the missing key is visible in the traceback.
        raise ValueError('Flag "plot" not in config file') from e


def get_plot_config_data():
    """
    Retrieves the plot-related config parameters
    :returns: dictionary
    """
    config = get_config_data()
    return config['plot_parameters']


def get_force_map_flag():
    """
    Gets the flag that determines whether to forcefully map the data, regardless of whether
    mapped data already exist
    :returns: boolean
    :raises ValueError: if the flag is not set in the config file
    """
    config = get_config_data()
    try:
        return config['execution_flags']['force_map']
    except KeyError as e:
        # Chain the original KeyError so the missing key is visible in the traceback.
        raise ValueError('Flag "force_map" not in config file') from e
8 changes: 4 additions & 4 deletions consensus_decentralization/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,10 +258,10 @@ def plot(ledgers, metrics, aggregated_data_filename, animated):


if __name__ == '__main__':
default_ledgers = hlp.get_default_ledgers()
ledgers = hlp.get_ledgers()
default_metrics = hlp.get_metrics_config().keys()

default_start_date, default_end_date = hlp.get_default_start_end_dates()
default_start_date, default_end_date = hlp.get_start_end_dates()
timeframe_start = hlp.get_timeframe_beginning(default_start_date)
timeframe_end = hlp.get_timeframe_end(default_end_date)

Expand All @@ -271,8 +271,8 @@ def plot(ledgers, metrics, aggregated_data_filename, animated):
'--ledgers',
nargs="*",
type=str.lower,
default=default_ledgers,
choices=default_ledgers,
default=ledgers,
choices=ledgers,
help='The ledgers whose data will be plotted.'
)
parser.add_argument(
Expand Down
129 changes: 38 additions & 91 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import argparse
import logging
from consensus_decentralization.aggregate import aggregate
from consensus_decentralization.map import apply_mapping
Expand All @@ -10,19 +9,19 @@
logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)


def process_data(force_map, ledger_dir, ledger, output_dir):
    """
    Parses and maps the raw data of a ledger, unless mapped data already exist and remapping is not forced.
    :param force_map: bool; if True, parse and map the data regardless of whether mapped data already exist
    :param ledger_dir: pathlib path of the ledger's output directory (checked for existing mapped data)
    :param ledger: string with the name of the ledger to process
    :param output_dir: pathlib path of the directory where the mapped data will be written
    """
    mapped_data_file = ledger_dir / 'mapped_data.json'
    if force_map or not mapped_data_file.is_file():
        parsed_data = parse(ledger, input_dir=hlp.RAW_DATA_DIR)
        apply_mapping(ledger, parsed_data=parsed_data, output_dir=output_dir)


def main(ledgers, timeframe, granularity, output_dir=hlp.OUTPUT_DIR):
    """
    Executes the entire pipeline (parsing, mapping, analyzing) for some ledgers and timeframes.
    :param ledgers: list of strings that correspond to the ledgers whose data should be analyzed
    :param timeframe: tuple of (start_date, end_date) where each date is a datetime.date object.
    :param granularity: string that corresponds to the granularity that will be used for the analysis. It can be one
        of: day, week, month, year, all.
    :param output_dir: pathlib.PosixPath object of the directory where the output data will be saved
    """
    logging.info(f"The ledgers that will be analyzed are: {','.join(ledgers)}")

    # Execution flags now come from the config file rather than cmd args.
    force_map = hlp.get_force_map_flag()

    for ledger in ledgers:
        ledger_dir = output_dir / ledger
        ledger_dir.mkdir(parents=True, exist_ok=True)  # create ledger output directory if it doesn't already exist

        process_data(force_map, ledger_dir, ledger, output_dir)

        aggregate(
            ledger,
            output_dir,
            timeframe,
            granularity,
            force_map
        )

    used_metrics = analyze(
        ledgers,
        aggregated_data_filename=hlp.get_blocks_per_entity_filename(granularity, timeframe),
        output_dir=output_dir
    )

    if hlp.get_plot_flag():
        # Warning: generating animated plots might take a long time
        plot(
            ledgers,
            metrics=used_metrics,
            aggregated_data_filename=hlp.get_blocks_per_entity_filename(granularity, timeframe),
            animated=hlp.get_plot_config_data()['animated']
        )


if __name__ == '__main__':
    # All run parameters (ledgers, granularity, timeframe) are read from the config file.
    ledgers = hlp.get_ledgers()

    granularity = hlp.get_granularity()

    start_date, end_date = hlp.get_start_end_dates()
    timeframe_start = hlp.get_timeframe_beginning(start_date)
    timeframe_end = hlp.get_timeframe_end(end_date)
    if timeframe_end < timeframe_start:
        raise ValueError('Invalid --timeframe values. Please note that if providing a second date, it must occur after '
                         'the first date.')
    timeframe = (timeframe_start, timeframe_end)

    main(ledgers, timeframe, granularity)

    logging.info('Done. Please check the output directory for results.')
Loading
Loading