Skip to content
Draft
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@
max-line-length = 120
max_complexity = 10
ignore = E203, W503

select = E9,F63,F7,F82
statistics = True
count = True
show-source = True
exclude = .github,
.pytest_cache,
cdisc_rules_engine/resources,
tests/PerformanceTest.py,
venv,
build,
dist
5 changes: 2 additions & 3 deletions .github/workflows/automated-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,11 @@ jobs:
- name: Install linters
run: |
pip install flake8==5.0.4
pip install black==22.6.0
pip install black==24.10.0

- name: Run flake8
run: |
flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics
flake8 ${{needs.get_changed_files.outputs.py}} --ignore E203,W503 --count --statistics
flake8 ${{needs.get_changed_files.outputs.py}} --statistics

- name: Run black
run: |
Expand Down
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,24 @@ From the root of the project run the following command (this will run both the u

`python -m pytest tests`

### **Performance Testing**

This repository includes a performance testing script, `tests/PerformanceTest.py`. The script measures the execution time of rules against datasets by running each rule over multiple test iterations.

#### Running the Performance Test

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This and the next header should be nested under the Performance Testing header. IOW, use 4 #


To execute the performance test, navigate to the root directory of the project and run the following command:

```sh
python tests/PerformanceTest.py -dd <DATASET_DIRECTORY> -rd <RULES_DIRECTORY> -total_calls <NUMBER_OF_CALLS> -od <OUTPUT_DIRECTORY>

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is the same as the -d, --data TEXT and -lr, --local_rules TEXT args in the validate command, it would be good to use the same arg names.

```
#### Performance Test Command-Line Flags

- **`-dd` (Dataset Directory)**: The directory containing the dataset files in `.json` or `.xpt` format.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Format the documentation similar to the existing documentation. For example, use a code block for the args

- **`-rd` (Rules Directory)**: The directory containing rule files.
- **`-total_calls` (Total Calls)**: The number of times each rule should be executed for performance analysis.
- **`-od` (Output Directory, Optional)**: The directory where the output report (`rule_execution_report.xlsx`) will be saved. By default, the report is saved in the current working directory.

### **Running a validation**

#### From the command line
Expand Down
3 changes: 3 additions & 0 deletions cdisc_rules_engine/check_operators/dataframe_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,17 @@
from cdisc_rules_engine.services import logger
from functools import wraps
import traceback
import time


def log_operator_execution(func):
@wraps(func)
def wrapper(self, other_value, *args, **kwargs):
try:
logger.info(f"Starting check operator: {func.__name__}")
logger.log(rf"\n\OPRT{time.time()}-operator {func.__name__} starts")
result = func(self, other_value)
logger.log(rf"\n\OPRT{time.time()}-operator {func.__name__} ends")
logger.info(f"Completed check operator: {func.__name__}")
return result
except Exception as e:
Expand Down
38 changes: 22 additions & 16 deletions cdisc_rules_engine/rules_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
ExternalDictionariesContainer,
)
import traceback
import time


class RulesEngine:
Expand Down Expand Up @@ -249,20 +250,20 @@ def validate_rule(
# SPECIAL CASES FOR RULE TYPES ###############################
# TODO: Handle these special cases better.
if self.library_metadata:
kwargs[
"variable_codelist_map"
] = self.library_metadata.variable_codelist_map
kwargs[
"codelist_term_maps"
] = self.library_metadata.get_all_ct_package_metadata()
kwargs["variable_codelist_map"] = (
self.library_metadata.variable_codelist_map
)
kwargs["codelist_term_maps"] = (
self.library_metadata.get_all_ct_package_metadata()
)
if rule.get("rule_type") == RuleTypes.DEFINE_ITEM_METADATA_CHECK.value:
if self.library_metadata:
kwargs[
"variable_codelist_map"
] = self.library_metadata.variable_codelist_map
kwargs[
"codelist_term_maps"
] = self.library_metadata.get_all_ct_package_metadata()
kwargs["variable_codelist_map"] = (
self.library_metadata.variable_codelist_map
)
kwargs["codelist_term_maps"] = (
self.library_metadata.get_all_ct_package_metadata()
)
elif (
rule.get("rule_type")
== RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE.value
Expand All @@ -289,10 +290,10 @@ def validate_rule(
domain, {}
)
define_metadata: List[dict] = builder.get_define_xml_variables_metadata()
targets: List[
str
] = self.data_processor.filter_dataset_columns_by_metadata_and_rule(
dataset.columns.tolist(), define_metadata, library_metadata, rule
targets: List[str] = (
self.data_processor.filter_dataset_columns_by_metadata_and_rule(
dataset.columns.tolist(), define_metadata, library_metadata, rule
)
)
rule_copy = deepcopy(rule)
updated_conditions = RuleProcessor.duplicate_conditions_for_all_targets(
Expand Down Expand Up @@ -343,10 +344,14 @@ def execute_rule(
# Adding copy for now to avoid updating cached dataset
dataset = deepcopy(dataset)
# preprocess dataset

logger.log(rf"\n\ST{time.time()}-Dataset Preprocessing Starts")
dataset_preprocessor = DatasetPreprocessor(
dataset, domain, dataset_path, self.data_service, self.cache
)
dataset = dataset_preprocessor.preprocess(rule_copy, datasets)
logger.log(rf"\n\ST{time.time()}-Dataset Preprocessing Ends")
logger.log(rf"\n\OPRNT{time.time()}-Operation Starts")
dataset = self.rule_processor.perform_rule_operations(
rule_copy,
dataset,
Expand All @@ -359,6 +364,7 @@ def execute_rule(
external_dictionaries=self.external_dictionaries,
ct_packages=ct_packages,
)
logger.log(rf"\n\OPRNT{time.time()}-Operation Ends")
relationship_data = {}
if domain is not None and self.rule_processor.is_relationship_dataset(domain):
relationship_data = self.data_processor.preprocess_relationship_dataset(
Expand Down
Loading