Merge branch 'main' into 1.0_release_to_main
mike-finopsorg authored Jun 6, 2024
2 parents a262d11 + c58c2b0 commit 9193a50
Showing 5 changed files with 324 additions and 0 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -67,6 +67,14 @@ poetry run pytest

Ensure you have `pytest` defined as a development dependency in your `pyproject.toml`.

If you are running on a legacy CPU and the tests crash inside the polars library, run the following locally only:

```bash
poetry add polars-lts-cpu
```

This installs a build of polars compiled for older CPUs so execution matches your system hardware. It should NOT be committed back into the repository.
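
For example, one way to undo the local swap before committing (assuming the repository's pinned `polars = "^0.20.3"` constraint in `pyproject.toml`) is:

```bash
poetry remove polars-lts-cpu
poetry add "polars@^0.20.3"
```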

## License

This project is licensed under the MIT License - see the `LICENSE` file for details.
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -27,11 +27,14 @@ pyarrow = "*"
pydantic = "^2"
pyyaml = "*"
requests = "*"

pandera = { version = "^0.17.2" }
sqlglot = "^18.7.0"
numpy = { version = "^1.26"}
pytz = "^2023.3.post1"
pandasql = "^0.7.3"
polars = "^0.20.3"
ddt = "^1.7.1"

[tool.poetry.group.dev.dependencies]
black = { extras = ["d"], version = "^23.7.0" }
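
If you pull these dependency additions into an existing checkout, re-syncing the virtual environment is typically just the following (assuming the committed `poetry.lock` already reflects the new entries):

```bash
poetry install
```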
80 changes: 80 additions & 0 deletions tests/samples/csv_random_data_generate_at_scale.py
@@ -0,0 +1,80 @@
import functools
import polars as pl
from faker import Faker
import random
from datetime import datetime, timedelta
import pytz
import logging
import time

fake = Faker()

def get_aws_invoice_issuer(num_records):
    aws_entities = [
        'AWS Inc.', 'Amazon Web Services', 'AWS Marketplace',
        'Amazon Data Services', 'AWS CloudFront', 'Amazon S3 Billing',
        'Amazon EC2 Billing', 'AWS Lambda Billing'
    ]
    return [random.choice(aws_entities) for _ in range(num_records)]

# ... similar functions for other non-date attributes ...

def get_random_datetimes(num_records, start_date, end_date):
    return [fake.date_time_between(start_date=start_date, end_date=end_date, tzinfo=pytz.utc).strftime('%Y-%m-%dT%H:%M:%SZ') for _ in range(num_records)]

def log_execution_time(func):
    """Decorator to log the execution time of a function."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        logging.info(f"{func.__name__} executed in {end_time - start_time:.2f} seconds")
        return result
    return wrapper

@log_execution_time
def generate_and_write_fake_focuses(csv_filename, num_records):
    now = datetime.now(pytz.utc)
    thirty_days_ago = now - timedelta(days=30)

    df = pl.DataFrame({
        'InvoiceIssuer': [random.choice(['AWS Inc.', 'Amazon Web Services', 'AWS Marketplace', 'Amazon Data Services',
                                         'AWS CloudFront', 'Amazon S3 Billing', 'Amazon EC2 Billing', 'AWS Lambda Billing']) for _ in range(num_records)],
        'ResourceID': [fake.uuid4() for _ in range(num_records)],
        'ChargeType': [random.choice(['Adjustment', 'Purchase', 'Tax', 'Usage']) for _ in range(num_records)],
        'Provider': [fake.company() for _ in range(num_records)],
        'BillingAccountName': [fake.company() for _ in range(num_records)],
        'SubAccountName': get_random_datetimes(num_records, thirty_days_ago, now),
        'BillingAccountId': [fake.uuid4() for _ in range(num_records)],
        'Publisher': [f"{fake.company()} {random.choice(['Software', 'Service', 'Platform'])} {random.choice(['Inc.', 'LLC', 'Ltd.', 'Group', 'Technologies', 'Solutions'])}" for _ in range(num_records)],
        'ResourceName': [f"{random.choice(['i-', 'vol-', 'snap-', 'ami-', 'bucket-', 'db-'])}{fake.hexify(text='^^^^^^^^', upper=False)}" for _ in range(num_records)],
        'ServiceName': [random.choice([
            'Amazon EC2', 'Amazon S3', 'AWS Lambda', 'Amazon RDS',
            'Amazon DynamoDB', 'Amazon VPC', 'Amazon Route 53',
            'Amazon CloudFront', 'AWS Elastic Beanstalk', 'Amazon SNS',
            'Amazon SQS', 'Amazon Redshift', 'AWS CloudFormation',
            'AWS IAM', 'Amazon EBS', 'Amazon ECS', 'Amazon EKS',
            'Amazon ElastiCache', 'AWS Fargate', 'AWS Glue'
        ]) for _ in range(num_records)],
        'BilledCurrency': ['USD' for _ in range(num_records)],
        'BillingPeriodEnd': get_random_datetimes(num_records, thirty_days_ago, now),
        'BillingPeriodStart': get_random_datetimes(num_records, thirty_days_ago, now),
        'Region': [random.choice([
            'us-east-1', 'us-west-1', 'us-west-2', 'eu-west-1', 'eu-central-1',
            'ap-southeast-1', 'ap-southeast-2', 'ap-northeast-1', 'ap-northeast-2',
            'ap-south-1', 'sa-east-1', 'ca-central-1', 'eu-north-1', 'eu-west-2',
            'eu-west-3', 'ap-east-1', 'me-south-1', 'af-south-1', 'eu-south-1'
        ]) for _ in range(num_records)],
        'ServiceCategory': [random.choice([
            'AI and Machine Learning', 'Analytics', 'Business Applications', 'Compute', 'Databases', 'Developer Tools', 'Multicloud',
            'Identity', 'Integration', 'Internet of Things', 'Management and Governance', 'Media', 'Migration', 'Mobile', 'Networking',
            'Security', 'Storage', 'Web', 'Other'
        ]) for _ in range(num_records)],
        'ChargePeriodStart': get_random_datetimes(num_records, thirty_days_ago, now),
        'ChargePeriodEnd': get_random_datetimes(num_records, thirty_days_ago, now),
        'BilledCost': [fake.pyfloat(left_digits=3, right_digits=2, positive=True) for _ in range(num_records)],
        'AmortizedCost': [fake.pyfloat(left_digits=3, right_digits=2, positive=True) for _ in range(num_records)]
    })

    df.write_csv(csv_filename)
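
A minimal sketch of driving this generator locally (illustrative only; the file name and record count are arbitrary):

```python
# Illustrative local usage; not part of the committed test suite.
import polars as pl

from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses

generate_and_write_fake_focuses("fake_focuses_sample.csv", 100)

df = pl.read_csv("fake_focuses_sample.csv")
print(df.shape)    # expected (100, 19): one row per record, one column per attribute above
print(df.columns)
```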
107 changes: 107 additions & 0 deletions tests/test_performance_profiler.py
@@ -0,0 +1,107 @@
import cProfile
import csv
import io
import logging
import os
import pstats
import time
import unittest
from ddt import ddt, data, unpack

from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses
from focus_validator.validator import Validator

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')

@ddt
class TestPerformanceProfiler(unittest.TestCase):

    def profile_to_csv(self, profiling_result, csv_file):
        with open(csv_file, 'w', newline='') as f:
            w = csv.writer(f)
            # Write the headers
            headers = ['ncalls', 'tottime', 'percall', 'cumtime', 'percall', 'filename:lineno(function)']
            w.writerow(headers)

            # Write each row
            for row in profiling_result.stats.items():
                func_name, (cc, nc, tt, ct, callers) = row
                w.writerow([nc, tt, tt/nc, ct, ct/cc, func_name])

    def execute_profiler(self, file_name, performance_threshold):
        # Set the environment variable for logging level
        env = os.environ.copy()
        env["LOG_LEVEL"] = "INFO"

        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        version_set_path = os.path.join(base_dir, "focus_validator", "rules", "version_sets")
        validator = Validator(
            data_filename=os.path.join(test_dir, '../' + file_name),
            override_filename=None,
            rule_set_path=version_set_path,
            rules_version="0.5",
            output_type="console",
            output_destination=None,
            column_namespace=None,
        )

        # Set up the profiler
        profiler = cProfile.Profile()
        profiler.enable()

        # The original performance testing code
        start_time = time.time()
        validator.validate()
        end_time = time.time()
        duration = end_time - start_time
        logging.info(f"File: {file_name} Duration: {duration} seconds")

        # Stop the profiler
        profiler.disable()

        # Save profiling data to a file
        profiling_result = pstats.Stats(profiler)
        profile_file_name = "profiling_data_" + file_name
        self.profile_to_csv(profiling_result, profile_file_name)

        # Optionally print out profiling report to the console
        s = io.StringIO()
        sortby = 'cumulative'  # Can be changed to 'time', 'calls', etc.
        ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
        ps.print_stats(10)
        logging.info(s.getvalue())

        # Execution time check
        self.assertLess(duration, performance_threshold, f"Performance test exceeded threshold. Duration: {duration} seconds")

    @data(
        # ("fake_focuses500000.csv", 60.0, 500000, "validate_500000_records"),
        # ("fake_focuses250000.csv", 60.0, 250000, "validate_250000_records"),
        # ("fake_focuses100000.csv", 30.0, 100000, "validate_100000_records"),
        # ("fake_focuses50000.csv", 15.0, 50000, "validate_50000_records"),
        # ("fake_focuses10000.csv", 7.0, 10000, "validate_10000_records"),
        # ("fake_focuses5000.csv", 3.0, 5000, "validate_5000_records"),
        ("fake_focuses2000.csv", 3.0, 2000, "validate_2000_records"),
        ("fake_focuses2000.csv", 3.0, 1000, "validate_1000_records")
    )
    @unpack
    def test_param_validator_performance(self, file_name, performance_threshold, number_of_records, case_id):
        with self.subTest(case_id=case_id):
            # Set the environment variable for logging level
            env = os.environ.copy()
            env["LOG_LEVEL"] = "INFO"

            logging.info(f"Generating file with {number_of_records} records.")
            generate_and_write_fake_focuses(file_name, number_of_records)
            self.execute_profiler(str(file_name), performance_threshold)

            logging.info("Cleaning up test file.")
            base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            if os.path.exists(os.path.join(base_dir, file_name)):
                os.remove(os.path.join(base_dir, file_name))

if __name__ == '__main__':
    unittest.main()
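
To run just this profiling test locally, an invocation along these lines should work (`-s` and `--log-cli-level` are standard pytest flags, used here only so the timing and profile summary are visible):

```bash
poetry run pytest tests/test_performance_profiler.py -s --log-cli-level=INFO
```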
126 changes: 126 additions & 0 deletions tests/test_progressive_performance.py
@@ -0,0 +1,126 @@
import logging
import os
import subprocess
import time
import unittest

from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')


class TestProgressivePerformance(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Generate 1000 fake focuses to a CSV file
        cls.csv_filename_1000 = 'fake_focuses1000.csv'
        cls.csv_filename_10000 = 'fake_focuses10000.csv'
        cls.csv_filename_50000 = 'fake_focuses50000.csv'
        cls.csv_filename_100000 = 'fake_focuses100000.csv'
        cls.csv_filename_250000 = 'fake_focuses250000.csv'
        cls.csv_filename_500000 = 'fake_focuses500000.csv'

        logging.info("Generating file with 1,000 records")
        cls.generate_test_file(str(cls.csv_filename_1000), 1000)

        # logging.info("Generating file with 10,000 records")
        # cls.generate_test_file(str(cls.csv_filename_10000), 10000)

        # logging.info("Generating file with 50,000 records")
        # cls.generate_test_file(str(cls.csv_filename_50000), 50000)

        # logging.info("Generating file with 100,000 records")
        # cls.generate_test_file(str(cls.csv_filename_100000), 100000)

        # logging.info("Generating file with 250,000 records")
        # cls.generate_test_file(str(cls.csv_filename_250000), 250000)

        # logging.info("Generating file with 500,000 records")
        # cls.generate_test_file(str(cls.csv_filename_500000), 500000)

    @classmethod
    def tearDownClass(cls):
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

        if os.path.exists(os.path.join(base_dir, 'fake_focuses.csv')):
            os.remove(os.path.join(base_dir, 'fake_focuses.csv'))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_1000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_1000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_10000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_10000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_50000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_50000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_100000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_100000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_250000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_250000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_500000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_500000)))

    @classmethod
    def generate_test_file(cls, csv_filename, number_of_records):
        # Generate fake focuses to a CSV file
        # fake_focuses = generate_fake_focus(number_of_records)

        # write_fake_focuses_to_csv(fake_focuses, csv_filename)
        generate_and_write_fake_focuses(csv_filename, number_of_records)


    def run_validator(self, args):
        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))

        # Construct the path to the application directory
        app_dir = os.path.join(test_dir, '../focus_validator')
        # Set the environment variable for logging level
        env = os.environ.copy()
        env["LOG_LEVEL"] = "INFO"

        command = ['poetry', 'run', 'python', os.path.join(app_dir, 'main.py')] + args
        return subprocess.run(command, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)

    def test_1000_record_csv_performance(self):
        self.execute_performance(str(self.csv_filename_1000), 25.0)

    # def test_10000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_10000), 25.0)

    # def test_50000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_50000), 150.0)

    # def test_100000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_100000), 300.0)

    # def test_250000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_250000), 300.0)

    # def test_500000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_500000), 300.0)

    def execute_performance(self, file_name, performance_threshold):
        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))

        start_time = time.time()

        # Command to execute the focus_validator tool
        result = self.run_validator(['--data-file', os.path.join(test_dir, '../' + file_name)])
        print(result.stdout)

        end_time = time.time()
        duration = end_time - start_time
        logging.info(f"File: {file_name} Duration: {duration} seconds")

        self.assertLess(duration, performance_threshold, f"Performance test exceeded threshold. Duration: {duration} seconds")
        self.assertEqual(result.returncode, 0, "Focus Validator did not exit cleanly.")


if __name__ == '__main__':
    unittest.main()
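
For reference, the subprocess call in `run_validator` amounts to invoking the validator CLI directly from the repository root, roughly as follows (file name illustrative):

```bash
poetry run python focus_validator/main.py --data-file fake_focuses1000.csv
```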
