-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into 1.0_release_to_main
- Loading branch information
Showing
5 changed files
with
324 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import functools | ||
import polars as pl | ||
from faker import Faker | ||
import random | ||
from datetime import datetime, timedelta | ||
import pytz | ||
import logging | ||
import time | ||
|
||
fake = Faker() | ||
|
||
def get_aws_invoice_issuer(num_records):
    """Return ``num_records`` invoice-issuer names chosen at random.

    Each entry is drawn independently from a fixed pool of AWS-flavoured
    issuer names, so repeated values are expected.
    """
    issuer_pool = (
        'AWS Inc.',
        'Amazon Web Services',
        'AWS Marketplace',
        'Amazon Data Services',
        'AWS CloudFront',
        'Amazon S3 Billing',
        'Amazon EC2 Billing',
        'AWS Lambda Billing',
    )
    issuers = []
    for _ in range(num_records):
        issuers.append(random.choice(issuer_pool))
    return issuers
|
||
# ... similar functions for other non-date attributes ... | ||
|
||
def get_random_datetimes(num_records, start_date, end_date):
    """Return ``num_records`` random UTC timestamps as formatted strings.

    Each timestamp is drawn by Faker from the window between ``start_date``
    and ``end_date`` and rendered as ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    timestamps = []
    for _ in range(num_records):
        moment = fake.date_time_between(
            start_date=start_date,
            end_date=end_date,
            tzinfo=pytz.utc,
        )
        timestamps.append(moment.strftime(timestamp_format))
    return timestamps
|
||
def log_execution_time(func):
    """Decorator to log the execution time of a function.

    The wrapped function is called with the arguments it was given and its
    return value is passed through unchanged. After a successful call, one
    INFO record of the form ``"<name> executed in <seconds> seconds"`` is
    sent to the root logger. If the wrapped function raises, the exception
    propagates and nothing is logged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval is not affected by wall-clock adjustments (unlike time.time()).
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        logging.info("%s executed in %.2f seconds", func.__name__, elapsed)
        return result
    return wrapper
|
||
def _get_ordered_period_bounds(num_records, start_date, end_date):
    """Return ``(starts, ends)`` timestamp lists where no start is after its end."""
    first = get_random_datetimes(num_records, start_date, end_date)
    second = get_random_datetimes(num_records, start_date, end_date)
    # Both lists use the same fixed-width UTC format, so comparing the
    # strings orders them chronologically.
    starts = [min(pair) for pair in zip(first, second)]
    ends = [max(pair) for pair in zip(first, second)]
    return starts, ends


@log_execution_time
def generate_and_write_fake_focuses(csv_filename, num_records):
    """Generate ``num_records`` rows of fake FOCUS billing data and write them as CSV.

    All timestamps fall within the 30 days before the call (UTC). Within a
    row, a billing or charge period start is never later than its end.

    Args:
        csv_filename: Path of the CSV file to write.
        num_records: Number of data rows to generate.
    """
    now = datetime.now(pytz.utc)
    thirty_days_ago = now - timedelta(days=30)

    charge_types = ('Adjustment', 'Purchase', 'Tax', 'Usage')
    publisher_kinds = ('Software', 'Service', 'Platform')
    publisher_suffixes = ('Inc.', 'LLC', 'Ltd.', 'Group', 'Technologies', 'Solutions')
    resource_prefixes = ('i-', 'vol-', 'snap-', 'ami-', 'bucket-', 'db-')
    service_names = (
        'Amazon EC2', 'Amazon S3', 'AWS Lambda', 'Amazon RDS',
        'Amazon DynamoDB', 'Amazon VPC', 'Amazon Route 53',
        'Amazon CloudFront', 'AWS Elastic Beanstalk', 'Amazon SNS',
        'Amazon SQS', 'Amazon Redshift', 'AWS CloudFormation',
        'AWS IAM', 'Amazon EBS', 'Amazon ECS', 'Amazon EKS',
        'Amazon ElastiCache', 'AWS Fargate', 'AWS Glue',
    )
    regions = (
        'us-east-1', 'us-west-1', 'us-west-2', 'eu-west-1', 'eu-central-1',
        'ap-southeast-1', 'ap-southeast-2', 'ap-northeast-1', 'ap-northeast-2',
        'ap-south-1', 'sa-east-1', 'ca-central-1', 'eu-north-1', 'eu-west-2',
        'eu-west-3', 'ap-east-1', 'me-south-1', 'af-south-1', 'eu-south-1',
    )
    service_categories = (
        'AI and Machine Learning', 'Analytics', 'Business Applications', 'Compute', 'Databases', 'Developer Tools', 'Multicloud',
        'Identity', 'Integration', 'Internet of Things', 'Management and Governance', 'Media', 'Migration', 'Mobile', 'Networking',
        'Security', 'Storage', 'Web', 'Other',
    )

    billing_starts, billing_ends = _get_ordered_period_bounds(num_records, thirty_days_ago, now)
    charge_starts, charge_ends = _get_ordered_period_bounds(num_records, thirty_days_ago, now)

    # Dict insertion order defines the CSV column order; keep it stable.
    df = pl.DataFrame({
        # Reuse the module helper instead of duplicating its issuer list.
        'InvoiceIssuer': get_aws_invoice_issuer(num_records),
        'ResourceID': [fake.uuid4() for _ in range(num_records)],
        'ChargeType': [random.choice(charge_types) for _ in range(num_records)],
        'Provider': [fake.company() for _ in range(num_records)],
        'BillingAccountName': [fake.company() for _ in range(num_records)],
        # An account name, not a timestamp (the original filled this column
        # with random datetimes).
        'SubAccountName': [fake.company() for _ in range(num_records)],
        'BillingAccountId': [fake.uuid4() for _ in range(num_records)],
        'Publisher': [
            f"{fake.company()} {random.choice(publisher_kinds)} {random.choice(publisher_suffixes)}"
            for _ in range(num_records)
        ],
        'ResourceName': [
            f"{random.choice(resource_prefixes)}{fake.hexify(text='^^^^^^^^', upper=False)}"
            for _ in range(num_records)
        ],
        'ServiceName': [random.choice(service_names) for _ in range(num_records)],
        'BilledCurrency': ['USD'] * num_records,
        'BillingPeriodEnd': billing_ends,
        'BillingPeriodStart': billing_starts,
        'Region': [random.choice(regions) for _ in range(num_records)],
        'ServiceCategory': [random.choice(service_categories) for _ in range(num_records)],
        'ChargePeriodStart': charge_starts,
        'ChargePeriodEnd': charge_ends,
        'BilledCost': [fake.pyfloat(left_digits=3, right_digits=2, positive=True) for _ in range(num_records)],
        'AmortizedCost': [fake.pyfloat(left_digits=3, right_digits=2, positive=True) for _ in range(num_records)],
    })

    df.write_csv(csv_filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import cProfile | ||
import csv | ||
import io | ||
import logging | ||
import os | ||
import pstats | ||
import time | ||
import unittest | ||
from ddt import ddt, data, unpack | ||
|
||
from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses | ||
from focus_validator.validator import Validator | ||
|
||
# Configure logging | ||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s') | ||
|
||
@ddt
class TestPerformanceProfiler(unittest.TestCase):
    """Profile ``Validator.validate`` on generated data and enforce a time budget."""

    def profile_to_csv(self, profiling_result, csv_file):
        """Write the per-function statistics of ``profiling_result`` to ``csv_file``.

        Args:
            profiling_result: A ``pstats.Stats`` object; its ``stats`` mapping
                is written one row per profiled function.
            csv_file: Path of the CSV file to create (overwritten if present).
        """
        with open(csv_file, 'w', newline='') as f:
            w = csv.writer(f)
            # Column names mirror the table printed by pstats, which also
            # has two "percall" columns.
            headers = ['ncalls', 'tottime', 'percall', 'cumtime', 'percall', 'filename:lineno(function)']
            w.writerow(headers)

            for func_key, (cc, nc, tt, ct, _callers) in profiling_result.stats.items():
                file_path, line_no, func_label = func_key
                w.writerow([
                    nc,
                    tt,
                    # Call counts can be zero; leave the cell empty rather
                    # than raising ZeroDivisionError.
                    tt / nc if nc else '',
                    ct,
                    ct / cc if cc else '',
                    # Match the format promised by the header instead of
                    # writing the raw tuple repr.
                    f"{file_path}:{line_no}({func_label})",
                ])

    def execute_profiler(self, file_name, performance_threshold):
        """Validate ``file_name`` under cProfile and assert it meets the time budget.

        The data file is looked up one directory above this test file. The
        profile is written to ``profiling_data_<file_name>`` in the current
        working directory, and the ten entries with the highest cumulative
        time are logged.

        Args:
            file_name: Name of the CSV data file to validate.
            performance_threshold: Maximum allowed duration in seconds.
        """
        test_dir = os.path.dirname(os.path.abspath(__file__))
        base_dir = os.path.dirname(test_dir)
        version_set_path = os.path.join(base_dir, "focus_validator", "rules", "version_sets")
        validator = Validator(
            data_filename=os.path.join(test_dir, '../' + file_name),
            override_filename=None,
            rule_set_path=version_set_path,
            rules_version="0.5",
            output_type="console",
            output_destination=None,
            column_namespace=None,
        )

        profiler = cProfile.Profile()
        profiler.enable()

        # Timed while the profiler is active, so the duration includes
        # profiling overhead. perf_counter() is monotonic.
        start_time = time.perf_counter()
        validator.validate()
        duration = time.perf_counter() - start_time
        logging.info(f"File: {file_name} Duration: {duration} seconds")

        profiler.disable()

        # Save profiling data to a file
        profiling_result = pstats.Stats(profiler)
        profile_file_name = "profiling_data_" + file_name
        self.profile_to_csv(profiling_result, profile_file_name)

        # Log a short profiling report
        s = io.StringIO()
        sortby = 'cumulative'  # Can be changed to 'time', 'calls', etc.
        ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
        ps.print_stats(10)
        logging.info(s.getvalue())

        # Execution time check
        self.assertLess(duration, performance_threshold, f"Performance test exceeded threshold. Duration: {duration} seconds")

    @data(
        # Larger scenarios are left disabled; enable as needed.
        # ("fake_focuses500000.csv", 60.0, 500000, "validate_500000_records"),
        # ("fake_focuses250000.csv", 60.0, 250000, "validate_250000_records"),
        # ("fake_focuses100000.csv", 30.0, 100000, "validate_100000_records"),
        # ("fake_focuses50000.csv", 15.0, 50000, "validate_50000_records"),
        # ("fake_focuses10000.csv", 7.0, 10000, "validate_10000_records"),
        # ("fake_focuses5000.csv", 3.0, 5000, "validate_5000_records"),
        ("fake_focuses2000.csv", 3.0, 2000, "validate_2000_records"),
        ("fake_focuses1000.csv", 3.0, 1000, "validate_1000_records"),
    )
    @unpack
    def test_param_validator_performance(self, file_name, performance_threshold, number_of_records, case_id):
        """Generate a data file, profile its validation, then remove the file."""
        with self.subTest(case_id=case_id):
            # Generate the file where execute_profiler() reads it (one level
            # above this test file) rather than relative to the current
            # working directory.
            base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            data_path = os.path.join(base_dir, file_name)

            logging.info(f"Generating file with {number_of_records} records.")
            try:
                generate_and_write_fake_focuses(data_path, number_of_records)
                self.execute_profiler(str(file_name), performance_threshold)
            finally:
                # Runs even when the timing assertion fails, so no data file
                # is left behind.
                logging.info("Cleaning up test file.")
                if os.path.exists(data_path):
                    os.remove(data_path)
|
||
if __name__ == '__main__': | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
import logging | ||
import os | ||
import subprocess | ||
import time | ||
import unittest | ||
|
||
from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses | ||
|
||
# Configure logging | ||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s') | ||
|
||
|
||
class TestProgressivePerformance(unittest.TestCase):
    """Time the validator command line on generated CSV files of increasing size."""

    @classmethod
    def setUpClass(cls):
        """Record the data file names and generate the files used by enabled tests."""
        cls.csv_filename_1000 = 'fake_focuses1000.csv'
        cls.csv_filename_10000 = 'fake_focuses10000.csv'
        cls.csv_filename_50000 = 'fake_focuses50000.csv'
        cls.csv_filename_100000 = 'fake_focuses100000.csv'
        cls.csv_filename_250000 = 'fake_focuses250000.csv'
        cls.csv_filename_500000 = 'fake_focuses500000.csv'

        logging.info("Generating file with 1,000 records")
        cls.generate_test_file(str(cls.csv_filename_1000), 1000)

        # Larger files are left disabled; enable together with the matching test.
        # logging.info("Generating file with 10,000 records")
        # cls.generate_test_file(str(cls.csv_filename_10000), 10000)

        # logging.info("Generating file with 50,000 records")
        # cls.generate_test_file(str(cls.csv_filename_50000), 50000)

        # logging.info("Generating file with 100,000 records")
        # cls.generate_test_file(str(cls.csv_filename_100000), 100000)

        # logging.info("Generating file with 250,000 records")
        # cls.generate_test_file(str(cls.csv_filename_250000), 250000)

        # logging.info("Generating file with 500,000 records")
        # cls.generate_test_file(str(cls.csv_filename_500000), 500000)

    @classmethod
    def tearDownClass(cls):
        """Remove any generated data files found one directory above this test file."""
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        candidates = (
            'fake_focuses.csv',
            cls.csv_filename_1000,
            cls.csv_filename_10000,
            cls.csv_filename_50000,
            cls.csv_filename_100000,
            cls.csv_filename_250000,
            cls.csv_filename_500000,
        )
        for name in candidates:
            path = os.path.join(base_dir, str(name))
            if os.path.exists(path):
                os.remove(path)

    @classmethod
    def generate_test_file(cls, csv_filename, number_of_records):
        """Generate ``number_of_records`` fake rows into ``csv_filename``.

        A relative ``csv_filename`` is placed one directory above this test
        file, which is where the tests read it and ``tearDownClass`` removes
        it. An absolute path is used as given.
        """
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        generate_and_write_fake_focuses(os.path.join(base_dir, csv_filename), number_of_records)

    def run_validator(self, args):
        """Run ``focus_validator/main.py`` through ``poetry run python`` with ``args``.

        Returns:
            The ``subprocess.CompletedProcess`` with captured text stdout and
            stderr. A non-zero exit status does not raise; callers check
            ``returncode`` themselves.
        """
        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))

        # Construct the path to the application directory
        app_dir = os.path.join(test_dir, '../focus_validator')

        # Set the environment variable for logging level
        env = os.environ.copy()
        env["LOG_LEVEL"] = "INFO"

        command = ['poetry', 'run', 'python', os.path.join(app_dir, 'main.py')] + args
        # check=False: with check=True a failing run raised CalledProcessError,
        # so the caller's returncode assertion could never report the failure.
        return subprocess.run(command, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False)

    def test_1000_record_csv_performance(self):
        self.execute_performance(str(self.csv_filename_1000), 25.0)

    # def test_10000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_10000), 25.0)

    # def test_50000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_50000), 150.0)

    # def test_100000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_100000), 300.0)

    # def test_250000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_250000), 300.0)

    # def test_500000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_500000), 300.0)

    def execute_performance(self, file_name, performance_threshold):
        """Validate ``file_name`` via the command line and check exit status and duration.

        Args:
            file_name: Name of the CSV data file, located one directory above
                this test file.
            performance_threshold: Maximum allowed duration in seconds.
        """
        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))

        # perf_counter() is monotonic, so the interval is not affected by
        # wall-clock adjustments.
        start_time = time.perf_counter()

        # Command to execute the focus_validator tool
        result = self.run_validator(['--data-file', os.path.join(test_dir, '../' + file_name)])
        print(result.stdout)

        duration = time.perf_counter() - start_time
        logging.info(f"File: {file_name} Duration: {duration} seconds")

        # Check the exit status first: the duration of a failed run says
        # nothing about validator performance.
        self.assertEqual(result.returncode, 0, f"Focus Validator did not exit cleanly.\n{result.stderr}")
        self.assertLess(duration, performance_threshold, f"Performance test exceeded threshold. Duration: {duration} seconds")
|
||
|
||
if __name__ == '__main__': | ||
unittest.main() |