Merge branch 'main' into 1.0_release_to_main
mike-finopsorg authored Jun 6, 2024
2 parents a262d11 + c58c2b0 commit 9193a50
Showing 5 changed files with 324 additions and 0 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -67,6 +67,14 @@ poetry run pytest

Ensure you have `pytest` defined as a development dependency in your `pyproject.toml`.

If you are running on a legacy CPU and the tests crash inside the polars library, run the following locally only:

```bash
poetry add polars-lts-cpu
```

This installs a build of polars compiled for older CPUs so execution matches your system hardware. It should NOT be committed back into the repository.
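
For example, one way to undo the local swap before committing (assuming the repository's pinned `polars = "^0.20.3"` constraint in `pyproject.toml`) is:

```bash
poetry remove polars-lts-cpu
poetry add "polars@^0.20.3"
```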

## License

This project is licensed under the MIT License - see the `LICENSE` file for details.
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -27,11 +27,14 @@ pyarrow = "*"
pydantic = "^2"
pyyaml = "*"
requests = "*"

pandera = { version = "^0.17.2" }
sqlglot = "^18.7.0"
numpy = { version = "^1.26"}
pytz = "^2023.3.post1"
pandasql = "^0.7.3"
polars = "^0.20.3"
ddt = "^1.7.1"

[tool.poetry.group.dev.dependencies]
black = { extras = ["d"], version = "^23.7.0" }
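
If you pull these dependency additions into an existing checkout, re-syncing the virtual environment is typically just the following (assuming the committed `poetry.lock` already reflects the new entries):

```bash
poetry install
```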
80 changes: 80 additions & 0 deletions tests/samples/csv_random_data_generate_at_scale.py
@@ -0,0 +1,80 @@
import functools
import polars as pl
from faker import Faker
import random
from datetime import datetime, timedelta
import pytz
import logging
import time

fake = Faker()

def get_aws_invoice_issuer(num_records):
    aws_entities = [
        'AWS Inc.', 'Amazon Web Services', 'AWS Marketplace',
        'Amazon Data Services', 'AWS CloudFront', 'Amazon S3 Billing',
        'Amazon EC2 Billing', 'AWS Lambda Billing'
    ]
    return [random.choice(aws_entities) for _ in range(num_records)]

# ... similar functions for other non-date attributes ...

def get_random_datetimes(num_records, start_date, end_date):
    return [fake.date_time_between(start_date=start_date, end_date=end_date, tzinfo=pytz.utc).strftime('%Y-%m-%dT%H:%M:%SZ') for _ in range(num_records)]

def log_execution_time(func):
    """Decorator to log the execution time of a function."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        logging.info(f"{func.__name__} executed in {end_time - start_time:.2f} seconds")
        return result
    return wrapper

@log_execution_time
def generate_and_write_fake_focuses(csv_filename, num_records):
    now = datetime.now(pytz.utc)
    thirty_days_ago = now - timedelta(days=30)

    df = pl.DataFrame({
        'InvoiceIssuer': [random.choice(['AWS Inc.', 'Amazon Web Services', 'AWS Marketplace', 'Amazon Data Services',
                                         'AWS CloudFront', 'Amazon S3 Billing', 'Amazon EC2 Billing', 'AWS Lambda Billing']) for _ in range(num_records)],
        'ResourceID': [fake.uuid4() for _ in range(num_records)],
        'ChargeType': [random.choice(['Adjustment', 'Purchase', 'Tax', 'Usage']) for _ in range(num_records)],
        'Provider': [fake.company() for _ in range(num_records)],
        'BillingAccountName': [fake.company() for _ in range(num_records)],
        'SubAccountName': get_random_datetimes(num_records, thirty_days_ago, now),
        'BillingAccountId': [fake.uuid4() for _ in range(num_records)],
        'Publisher': [f"{fake.company()} {random.choice(['Software', 'Service', 'Platform'])} {random.choice(['Inc.', 'LLC', 'Ltd.', 'Group', 'Technologies', 'Solutions'])}" for _ in range(num_records)],
        'ResourceName': [f"{random.choice(['i-', 'vol-', 'snap-', 'ami-', 'bucket-', 'db-'])}{fake.hexify(text='^^^^^^^^', upper=False)}" for _ in range(num_records)],
        'ServiceName': [random.choice([
            'Amazon EC2', 'Amazon S3', 'AWS Lambda', 'Amazon RDS',
            'Amazon DynamoDB', 'Amazon VPC', 'Amazon Route 53',
            'Amazon CloudFront', 'AWS Elastic Beanstalk', 'Amazon SNS',
            'Amazon SQS', 'Amazon Redshift', 'AWS CloudFormation',
            'AWS IAM', 'Amazon EBS', 'Amazon ECS', 'Amazon EKS',
            'Amazon ElastiCache', 'AWS Fargate', 'AWS Glue'
        ]) for _ in range(num_records)],
        'BilledCurrency': ['USD' for _ in range(num_records)],
        'BillingPeriodEnd': get_random_datetimes(num_records, thirty_days_ago, now),
        'BillingPeriodStart': get_random_datetimes(num_records, thirty_days_ago, now),
        'Region': [random.choice([
            'us-east-1', 'us-west-1', 'us-west-2', 'eu-west-1', 'eu-central-1',
            'ap-southeast-1', 'ap-southeast-2', 'ap-northeast-1', 'ap-northeast-2',
            'ap-south-1', 'sa-east-1', 'ca-central-1', 'eu-north-1', 'eu-west-2',
            'eu-west-3', 'ap-east-1', 'me-south-1', 'af-south-1', 'eu-south-1'
        ]) for _ in range(num_records)],
        'ServiceCategory': [random.choice([
            'AI and Machine Learning', 'Analytics', 'Business Applications', 'Compute', 'Databases', 'Developer Tools', 'Multicloud',
            'Identity', 'Integration', 'Internet of Things', 'Management and Governance', 'Media', 'Migration', 'Mobile', 'Networking',
            'Security', 'Storage', 'Web', 'Other'
        ]) for _ in range(num_records)],
        'ChargePeriodStart': get_random_datetimes(num_records, thirty_days_ago, now),
        'ChargePeriodEnd': get_random_datetimes(num_records, thirty_days_ago, now),
        'BilledCost': [fake.pyfloat(left_digits=3, right_digits=2, positive=True) for _ in range(num_records)],
        'AmortizedCost': [fake.pyfloat(left_digits=3, right_digits=2, positive=True) for _ in range(num_records)]
    })

    df.write_csv(csv_filename)
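
A minimal sketch of driving this generator locally (illustrative only; the file name and record count are arbitrary):

```python
# Illustrative local usage; not part of the committed test suite.
import polars as pl

from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses

generate_and_write_fake_focuses("fake_focuses_sample.csv", 100)

df = pl.read_csv("fake_focuses_sample.csv")
print(df.shape)    # expected (100, 19): one row per record, one column per attribute above
print(df.columns)
```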
107 changes: 107 additions & 0 deletions tests/test_performance_profiler.py
@@ -0,0 +1,107 @@
import cProfile
import csv
import io
import logging
import os
import pstats
import time
import unittest
from ddt import ddt, data, unpack

from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses
from focus_validator.validator import Validator

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')

@ddt
class TestPerformanceProfiler(unittest.TestCase):

    def profile_to_csv(self, profiling_result, csv_file):
        with open(csv_file, 'w', newline='') as f:
            w = csv.writer(f)
            # Write the headers
            headers = ['ncalls', 'tottime', 'percall', 'cumtime', 'percall', 'filename:lineno(function)']
            w.writerow(headers)

            # Write each row
            for row in profiling_result.stats.items():
                func_name, (cc, nc, tt, ct, callers) = row
                w.writerow([nc, tt, tt/nc, ct, ct/cc, func_name])

    def execute_profiler(self, file_name, performance_threshold):
        # Set the environment variable for logging level
        env = os.environ.copy()
        env["LOG_LEVEL"] = "INFO"

        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        version_set_path = os.path.join(base_dir, "focus_validator", "rules", "version_sets")
        validator = Validator(
            data_filename=os.path.join(test_dir, '../' + file_name),
            override_filename=None,
            rule_set_path=version_set_path,
            rules_version="0.5",
            output_type="console",
            output_destination=None,
            column_namespace=None,
        )

        # Set up the profiler
        profiler = cProfile.Profile()
        profiler.enable()

        # The original performance testing code
        start_time = time.time()
        validator.validate()
        end_time = time.time()
        duration = end_time - start_time
        logging.info(f"File: {file_name} Duration: {duration} seconds")

        # Stop the profiler
        profiler.disable()

        # Save profiling data to a file
        profiling_result = pstats.Stats(profiler)
        profile_file_name = "profiling_data_" + file_name
        self.profile_to_csv(profiling_result, profile_file_name)

        # Optionally print out profiling report to the console
        s = io.StringIO()
        sortby = 'cumulative'  # Can be changed to 'time', 'calls', etc.
        ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
        ps.print_stats(10)
        logging.info(s.getvalue())

        # Execution time check
        self.assertLess(duration, performance_threshold, f"Performance test exceeded threshold. Duration: {duration} seconds")

    @data(
        # ("fake_focuses500000.csv", 60.0, 500000, "validate_500000_records"),
        # ("fake_focuses250000.csv", 60.0, 250000, "validate_250000_records"),
        # ("fake_focuses100000.csv", 30.0, 100000, "validate_100000_records"),
        # ("fake_focuses50000.csv", 15.0, 50000, "validate_50000_records"),
        # ("fake_focuses10000.csv", 7.0, 10000, "validate_10000_records"),
        # ("fake_focuses5000.csv", 3.0, 5000, "validate_5000_records"),
        ("fake_focuses2000.csv", 3.0, 2000, "validate_2000_records"),
        ("fake_focuses2000.csv", 3.0, 1000, "validate_1000_records")
    )
    @unpack
    def test_param_validator_performance(self, file_name, performance_threshold, number_of_records, case_id):
        with self.subTest(case_id=case_id):
            # Set the environment variable for logging level
            env = os.environ.copy()
            env["LOG_LEVEL"] = "INFO"

            logging.info(f"Generating file with {number_of_records} records.")
            generate_and_write_fake_focuses(file_name, number_of_records)
            self.execute_profiler(str(file_name), performance_threshold)

            logging.info("Cleaning up test file.")
            base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            if os.path.exists(os.path.join(base_dir, file_name)):
                os.remove(os.path.join(base_dir, file_name))

if __name__ == '__main__':
    unittest.main()
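
To run just this profiling test locally, an invocation along these lines should work (`-s` and `--log-cli-level` are standard pytest flags, used here only so the timing and profile summary are visible):

```bash
poetry run pytest tests/test_performance_profiler.py -s --log-cli-level=INFO
```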
126 changes: 126 additions & 0 deletions tests/test_progressive_performance.py
@@ -0,0 +1,126 @@
import logging
import os
import subprocess
import time
import unittest

from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')


class TestProgressivePerformance(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Generate 1000 fake focuses to a CSV file
        cls.csv_filename_1000 = 'fake_focuses1000.csv'
        cls.csv_filename_10000 = 'fake_focuses10000.csv'
        cls.csv_filename_50000 = 'fake_focuses50000.csv'
        cls.csv_filename_100000 = 'fake_focuses100000.csv'
        cls.csv_filename_250000 = 'fake_focuses250000.csv'
        cls.csv_filename_500000 = 'fake_focuses500000.csv'

        logging.info("Generating file with 1,000 records")
        cls.generate_test_file(str(cls.csv_filename_1000), 1000)

        # logging.info("Generating file with 10,000 records")
        # cls.generate_test_file(str(cls.csv_filename_10000), 10000)

        # logging.info("Generating file with 50,000 records")
        # cls.generate_test_file(str(cls.csv_filename_50000), 50000)

        # logging.info("Generating file with 100,000 records")
        # cls.generate_test_file(str(cls.csv_filename_100000), 100000)

        # logging.info("Generating file with 250,000 records")
        # cls.generate_test_file(str(cls.csv_filename_250000), 250000)

        # logging.info("Generating file with 500,000 records")
        # cls.generate_test_file(str(cls.csv_filename_500000), 500000)

    @classmethod
    def tearDownClass(cls):
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

        if os.path.exists(os.path.join(base_dir, 'fake_focuses.csv')):
            os.remove(os.path.join(base_dir, 'fake_focuses.csv'))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_1000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_1000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_10000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_10000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_50000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_50000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_100000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_100000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_250000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_250000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_500000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_500000)))

    @classmethod
    def generate_test_file(cls, csv_filename, number_of_records):
        # Generate fake focuses to a CSV file
        # fake_focuses = generate_fake_focus(number_of_records)

        # write_fake_focuses_to_csv(fake_focuses, csv_filename)
        generate_and_write_fake_focuses(csv_filename, number_of_records)


    def run_validator(self, args):
        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))

        # Construct the path to the application directory
        app_dir = os.path.join(test_dir, '../focus_validator')
        # Set the environment variable for logging level
        env = os.environ.copy()
        env["LOG_LEVEL"] = "INFO"

        command = ['poetry', 'run', 'python', os.path.join(app_dir, 'main.py')] + args
        return subprocess.run(command, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)

    def test_1000_record_csv_performance(self):
        self.execute_performance(str(self.csv_filename_1000), 25.0)

    # def test_10000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_10000), 25.0)

    # def test_50000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_50000), 150.0)

    # def test_100000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_100000), 300.0)

    # def test_250000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_250000), 300.0)

    # def test_500000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_500000), 300.0)

    def execute_performance(self, file_name, performance_threshold):
        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))

        start_time = time.time()

        # Command to execute the focus_validator tool
        result = self.run_validator(['--data-file', os.path.join(test_dir, '../' + file_name)])
        print(result.stdout)

        end_time = time.time()
        duration = end_time - start_time
        logging.info(f"File: {file_name} Duration: {duration} seconds")

        self.assertLess(duration, performance_threshold, f"Performance test exceeded threshold. Duration: {duration} seconds")
        self.assertEqual(result.returncode, 0, "Focus Validator did not exit cleanly.")


if __name__ == '__main__':
    unittest.main()
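
For reference, the subprocess call in `run_validator` amounts to invoking the validator CLI directly from the repository root, roughly as follows (file name illustrative):

```bash
poetry run python focus_validator/main.py --data-file fake_focuses1000.csv
```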
