cdisc-org · RamilCDISC · Jan 3, 2025 · Jan 3, 2025 · Jan 26, 2025 · Jan 27, 2025
diff --git a/.flake8 b/.flake8
@@ -2,10 +2,14 @@
 max-line-length = 120
 max_complexity = 10
 ignore = E203, W503
-
+select = E9,F63,F7,F82
+statistics = True
+count = True
+show-source = True
 exclude = .github,
     .pytest_cache,
     cdisc_rules_engine/resources,
+    tests/PerformanceTest.py,
     venv,
     build,
     dist
diff --git a/.github/workflows/automated-ci.yml b/.github/workflows/automated-ci.yml
@@ -59,12 +59,11 @@ jobs:
       - name: Install linters
         run: |
           pip install flake8==5.0.4
-          pip install black==22.6.0
+          pip install black==24.10.0
 
       - name: Run flake8
         run: |
-          flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics
-          flake8 ${{needs.get_changed_files.outputs.py}} --ignore E203,W503 --count --statistics
+          flake8 ${{needs.get_changed_files.outputs.py}} --statistics
 
       - name: Run black
         run: |

diff --git a/README.md b/README.md
@@ -73,6 +73,24 @@ From the root of the project run the following command (this will run both the u
 
 `python -m pytest tests`
 
+### **Performance Testing**
+
+This repository includes a performance testing script, `tests/PerformanceTest.py`. It measures the execution time of rules against datasets by running each rule multiple times.
+
+#### Running the Performance Test
+
+To execute the performance test, navigate to the root directory of the project and run the following command:
+
+```sh
+python tests/PerformanceTest.py -dd <DATASET_DIRECTORY> -rd <RULES_DIRECTORY> -total_calls <NUMBER_OF_CALLS> -od <OUTPUT_DIRECTORY>
+```
+#### Performance Test Command-Line Flags
+
+- **`-dd` (Dataset Directory)**: The directory containing the dataset files in `.json` or `.xpt` format.
+- **`-rd` (Rules Directory)**: The directory containing rule files.
+- **`-total_calls` (Total Calls)**: The number of times each rule should be executed for performance analysis.
+- **`-od` (Output Directory, Optional)**: The directory where the output report (`rule_execution_report.xlsx`) will be saved. By default, the report is saved in the current working directory.
+
 ### **Running a validation**
 
 #### From the command line

diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py
@@ -28,14 +28,17 @@
 from cdisc_rules_engine.services import logger
 from functools import wraps
 import traceback
+import time
 
 
 def log_operator_execution(func):
     @wraps(func)
     def wrapper(self, other_value, *args, **kwargs):
         try:
             logger.info(f"Starting check operator: {func.__name__}")
+            logger.log(rf"\n\OPRT{time.time()}-operator {func.__name__} starts")
             result = func(self, other_value)
+            logger.log(rf"\n\OPRT{time.time()}-operator {func.__name__} ends")
             logger.info(f"Completed check operator: {func.__name__}")
             return result
         except Exception as e:

diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py
@@ -45,6 +45,7 @@
     ExternalDictionariesContainer,
 )
 import traceback
+import time
 
 
 class RulesEngine:
@@ -249,20 +250,20 @@ def validate_rule(
         # SPECIAL CASES FOR RULE TYPES ###############################
         # TODO: Handle these special cases better.
         if self.library_metadata:
-            kwargs[
-                "variable_codelist_map"
-            ] = self.library_metadata.variable_codelist_map
-            kwargs[
-                "codelist_term_maps"
-            ] = self.library_metadata.get_all_ct_package_metadata()
+            kwargs["variable_codelist_map"] = (
+                self.library_metadata.variable_codelist_map
+            )
+            kwargs["codelist_term_maps"] = (
+                self.library_metadata.get_all_ct_package_metadata()
+            )
         if rule.get("rule_type") == RuleTypes.DEFINE_ITEM_METADATA_CHECK.value:
             if self.library_metadata:
-                kwargs[
-                    "variable_codelist_map"
-                ] = self.library_metadata.variable_codelist_map
-                kwargs[
-                    "codelist_term_maps"
-                ] = self.library_metadata.get_all_ct_package_metadata()
+                kwargs["variable_codelist_map"] = (
+                    self.library_metadata.variable_codelist_map
+                )
+                kwargs["codelist_term_maps"] = (
+                    self.library_metadata.get_all_ct_package_metadata()
+                )
         elif (
             rule.get("rule_type")
             == RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE.value
@@ -289,10 +290,10 @@ def validate_rule(
                 domain, {}
             )
             define_metadata: List[dict] = builder.get_define_xml_variables_metadata()
-            targets: List[
-                str
-            ] = self.data_processor.filter_dataset_columns_by_metadata_and_rule(
-                dataset.columns.tolist(), define_metadata, library_metadata, rule
+            targets: List[str] = (
+                self.data_processor.filter_dataset_columns_by_metadata_and_rule(
+                    dataset.columns.tolist(), define_metadata, library_metadata, rule
+                )
             )
             rule_copy = deepcopy(rule)
             updated_conditions = RuleProcessor.duplicate_conditions_for_all_targets(
@@ -343,10 +344,14 @@ def execute_rule(
         # Adding copy for now to avoid updating cached dataset
         dataset = deepcopy(dataset)
         # preprocess dataset
+
+        logger.log(rf"\n\ST{time.time()}-Dataset Preprocessing Starts")
         dataset_preprocessor = DatasetPreprocessor(
             dataset, domain, dataset_path, self.data_service, self.cache
         )
         dataset = dataset_preprocessor.preprocess(rule_copy, datasets)
+        logger.log(rf"\n\ST{time.time()}-Dataset Preprocessing Ends")
+        logger.log(rf"\n\OPRNT{time.time()}-Operation Starts")
         dataset = self.rule_processor.perform_rule_operations(
             rule_copy,
             dataset,
@@ -359,6 +364,7 @@ def execute_rule(
             external_dictionaries=self.external_dictionaries,
             ct_packages=ct_packages,
         )
+        logger.log(rf"\n\OPRNT{time.time()}-Operation Ends")
         relationship_data = {}
         if domain is not None and self.rule_processor.is_relationship_dataset(domain):
             relationship_data = self.data_processor.preprocess_relationship_dataset(