microsoft · xieofxie · May 19, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
@@ -1,6 +1,6 @@
 {
     "configCheck": 171,
-    "copyCheck": 182,
+    "copyCheck": 181,
     "executeRuntimeCheck": 104,
     "extensionCheck": 2,
     "gitignoreCheck": 44,
@@ -13,6 +13,6 @@
     "pathCheck": 1455,
     "requirementsCheck": 37,
     "templateCheck": 3,
-    "venvRequirementsCheck": 22,
+    "venvRequirementsCheck": 23,
     "winmlCopyCheck": 38
 }
@@ -18,7 +18,7 @@
             "architecture": "Transformer",
             "status": "Ready",
             "relativePath": "microsoft-Phi-3.5-mini-instruct/aitk",
-            "version": 7,
+            "version": 8,
             "p0": true,
             "pipeline_tags": [
                 "text-generation"
@@ -69,7 +69,7 @@
             "architecture": "Transformer",
             "status": "Ready",
             "relativePath": "deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk",
-            "version": 7,
+            "version": 8,
             "p0": true,
             "pipeline_tags": [
                 "text-generation"
@@ -193,7 +193,7 @@
             "architecture": "Transformer",
             "status": "Ready",
             "relativePath": "meta-llama-Llama-3.2-1B-Instruct/aitk",
-            "version": 7,
+            "version": 8,
             "p0": true,
             "pipeline_tags": [
                 "text-generation"
@@ -269,7 +269,7 @@
             "architecture": "Transformer",
             "status": "Ready",
             "relativePath": "Qwen-Qwen2.5-1.5B-Instruct/aitk",
-            "version": 7,
+            "version": 8,
             "p0": true,
             "pipeline_tags": [
                 "text-generation"
@@ -520,7 +520,7 @@
             "architecture": "Transformer",
             "status": "Ready",
             "relativePath": "meta-llama-Llama-3.1-8B-Instruct/aitk",
-            "version": 5,
+            "version": 6,
             "p0": false,
             "pipeline_tags": [
                 "text-generation"
@@ -1047,7 +1047,8 @@
             "bfloat16": "a:bf16"
         },
         "QuarkWeightType": {
-            "w_uint4_per_group_asym": "w:int4"
+            "w_uint4_per_group_asym": "w:uint4",
+            "uint4_wo_128": "w:uint4"
         }
     }
 }
@@ -0,0 +1,107 @@
+--extra-index-url=https://download.pytorch.org/whl/cu128
+--extra-index-url=https://pypi.amd.com/olive/1.7.1-5D/simple
+--extra-index-url=https://pypi.amd.com/simple
+accelerate==1.13.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.5
+aiosignal==1.4.0
+alembic==1.18.4
+amd-quark==0.11
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.13.0
+attrs==26.1.0
+certifi==2026.4.22
+charset-normalizer==3.4.7
+click==8.4.0
+colorama==0.4.6
+colorlog==6.10.1
+datasets==4.8.5
+dill==0.4.1
+evaluate==0.4.6
+filelock==3.29.0
+flatbuffers==25.12.19
+frozenlist==1.8.0
+fsspec==2026.2.0
+greenlet==3.5.0
+h11==0.16.0
+hf-xet==1.5.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.36.2
+idna==3.15
+importlib-metadata==8.7.1
+jinja2==3.1.6
+joblib==1.5.3
+lightning-utilities==0.15.3
+mako==1.3.12
+markdown-it-py==4.2.0
+markupsafe==3.0.3
+mdurl==0.1.2
+ml-dtypes==0.5.4
+model-generate==1.7.1
+mpmath==1.3.0
+multidict==6.7.1
+multiprocess==0.70.19
+narwhals==2.21.2
+networkx==3.6.1
+ninja==1.13.0
+nltk==3.9.4
+numpy==1.26.4
+olive-ai==0.12.1
+onnx==1.18.0
+onnx-ir==0.2.1
+onnx-tool==1.0.1
+onnxruntime==1.26.0
+onnxruntime-genai==0.13.2
+onnxscript==0.7.0
+onnxsim==0.6.3
+onnxslim==0.1.93
+opentelemetry-api==1.41.1
+opentelemetry-sdk==1.41.1
+opentelemetry-semantic-conventions==0.62b1
+optimum==2.1.0
+optuna==4.8.0
+packaging==26.2
+pandas==3.0.3
+plotly==6.7.0
+prompt-toolkit==3.0.52
+propcache==0.5.2
+protobuf==7.34.1
+psutil==7.2.2
+pyarrow==24.0.0
+pydantic==2.13.4
+pydantic-core==2.46.4
+pygments==2.20.0
+python-dateutil==2.9.0.post0
+pyyaml==6.0.3
+questionary==2.1.1
+regex==2026.5.9
+requests==2.34.2
+rich==15.0.0
+ryzenai-dynamic-dispatch==1.7.1
+ryzenai-onnx-utils==1.7.1
+safetensors==0.7.0
+scipy==1.17.1
+sentencepiece==0.2.1
+setuptools==81.0.0
+shellingham==1.5.4
+six==1.17.0
+sqlalchemy==2.0.49
+sympy==1.14.0
+tabulate==0.10.0
+tokenizers==0.22.2
+torch==2.7.1+cu128
+torchmetrics==1.9.0
+tqdm==4.67.3
+transformers==4.57.6
+typer==0.25.1
+typing-extensions==4.15.0
+typing-inspection==0.4.2
+tzdata==2026.2
+urllib3==2.7.0
+wcwidth==0.7.0
+xxhash==3.7.0
+yarl==1.23.0
+zipp==4.1.0
+zstandard==0.25.0
diff --git a/.aitk/requirements/AMD/sitecustomize.py b/.aitk/requirements/AMD/sitecustomize.py
@@ -0,0 +1,7 @@
+try:
+    import pyarrow            # noqa
+    import pyarrow.dataset    # noqa
+    import pyarrow.compute    # noqa
+except Exception:
+    pass
+
@@ -39,7 +39,7 @@
             "name": "QuarkDataType"
         }
     ],
-    "optimizationDefault": "w:int4 a:bf16",
+    "optimizationDefault": "w:uint4 a:bf16",
     "sections": [
         {
             "autoGenerated": true,

@@ -12,7 +12,7 @@ recipes:
       ep: VitisAIExecutionProvider
       aitk:
         oliveFile: "VitisAI/Qwen2.5-1.5B-Instruct_quark_vitisai_llm.json"
-        requirements: AMD/Quark_py3.10.17
+        requirements: AMD/Quark_py3.12.13
         evalRuntime: AMDNPU
     - file: "qwen2_5_ov_gpu_config.json"
       devices:
@@ -41,7 +41,7 @@ recipes:
 aitk:
     modelInfo:
         id: "huggingface/Qwen/Qwen2.5-1.5B-Instruct"
-        version: 7
+        version: 8
         groupId: "huggingface/Qwen/Qwen2.5-1.5B-Instruct"
         groupItemName: "1.5B"
         p0: true
@@ -31,6 +31,6 @@
     ],
     "modelInfo": {
         "id": "huggingface/Qwen/Qwen2.5-1.5B-Instruct",
-        "version": 7
+        "version": 8
     }
 }
@@ -21,35 +21,18 @@
     "passes": {
         "qq": {
             "type": "QuarkQuantization",
-            "quant_scheme": "w_uint4_per_group_asym",
+            "quant_scheme": "uint4_wo_128",
             "quant_algo": "awq",
             "dataset": "pileval_for_awq_benchmark",
             "data_type": "bfloat16",
             "num_calib_data": 128,
-            "model_export": [ "hf_format" ],
-            "exclude_layers": [  ],
-            "quant_config": {
-                "name": "awq",
-                "scaling_layers": [
-                    {
-                        "prev_op": "input_layernorm",
-                        "layers": [ "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj" ],
-                        "inp": "self_attn.q_proj",
-                        "module2inspect": "self_attn"
-                    },
-                    { "prev_op": "self_attn.v_proj", "layers": [ "self_attn.o_proj" ], "inp": "self_attn.o_proj" },
-                    {
-                        "prev_op": "post_attention_layernorm",
-                        "layers": [ "mlp.gate_proj", "mlp.up_proj" ],
-                        "inp": "mlp.gate_proj",
-                        "module2inspect": "mlp"
-                    },
-                    { "prev_op": "mlp.up_proj", "layers": [ "mlp.down_proj" ], "inp": "mlp.down_proj" }
-                ],
-                "model_decoder_layers": "model.layers"
-            }
+            "model_export": ["hf_format"],
+            "exclude_layers": []
         },
-        "mg": { "type": "VitisGenerateModelLLM", "packed_const": false, "cpu_only": false }
+        "mg": {
+            "type": "VitisGenerateModelLLM",
+            "recipe": "full_fusion"
+        }
     },
     "target": "local_system",
     "log_severity_level": 1,

@@ -11,7 +11,7 @@
     "isGPUSuggested": true,
     "runtimeOverwrite": {
         "autoGenerated": true,
-        "executeRequirement": "AMD/Quark_py3.10.17"
+        "executeRequirement": "AMD/Quark_py3.12.13"
     },
     "epMinVersions": {
         "VitisAIExecutionProvider": "1.8.50"
@@ -39,7 +39,7 @@
             "name": "QuarkDataType"
         }
     ],
-    "optimizationDefault": "w:int4 a:bf16",
+    "optimizationDefault": "w:uint4 a:bf16",
     "sections": [
         {
             "autoGenerated": true,

@@ -39,7 +39,7 @@
             "name": "QuarkDataType"
         }
     ],
-    "optimizationDefault": "w:int4 a:bf16",
+    "optimizationDefault": "w:uint4 a:bf16",
     "sections": [
         {
             "autoGenerated": true,

@@ -39,7 +39,7 @@
             "name": "QuarkDataType"
         }
     ],
-    "optimizationDefault": "w:int4 a:bf16",
+    "optimizationDefault": "w:uint4 a:bf16",
     "sections": [
         {
             "autoGenerated": true,

@@ -39,7 +39,7 @@
             "name": "QuarkDataType"
         }
     ],
-    "optimizationDefault": "w:int4 a:bf16",
+    "optimizationDefault": "w:uint4 a:bf16",
     "sections": [
         {
             "autoGenerated": true,

@@ -39,7 +39,7 @@
             "name": "QuarkDataType"
         }
     ],
-    "optimizationDefault": "w:int4 a:bf16",
+    "optimizationDefault": "w:uint4 a:bf16",
     "sections": [
         {
             "autoGenerated": true,

@@ -20,36 +20,19 @@
     ],
     "passes": {
         "qq": {
             "type": "QuarkQuantization",
-            "quant_scheme": "w_uint4_per_group_asym",
+            "quant_scheme": "uint4_wo_128",
             "quant_algo": "awq",
             "dataset": "pileval_for_awq_benchmark",
             "data_type": "bfloat16",
             "num_calib_data": 128,
-            "model_export": [ "hf_format" ],
-            "exclude_layers": [  ],
-            "quant_config": {
-                "name": "awq",
-                "scaling_layers": [
-                    {
-                        "prev_op": "input_layernorm",
-                        "layers": [ "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj" ],
-                        "inp": "self_attn.q_proj",
-                        "module2inspect": "self_attn"
-                    },
-                    { "prev_op": "self_attn.v_proj", "layers": [ "self_attn.o_proj" ], "inp": "self_attn.o_proj" },
-                    {
-                        "prev_op": "post_attention_layernorm",
-                        "layers": [ "mlp.gate_proj", "mlp.up_proj" ],
-                        "inp": "mlp.gate_proj",
-                        "module2inspect": "mlp"
-                    },
-                    { "prev_op": "mlp.up_proj", "layers": [ "mlp.down_proj" ], "inp": "mlp.down_proj" }
-                ],
-                "model_decoder_layers": "model.layers"
-            }
+            "model_export": ["hf_format"],
+            "exclude_layers": []
         },
-        "mg": { "type": "VitisGenerateModelLLM", "packed_const": false, "cpu_only": false }
+        "mg": {
+            "type": "VitisGenerateModelLLM",
+            "recipe": "full_fusion"
+        }
     },
     "target": "local_system",
     "log_severity_level": 1,

@@ -11,7 +11,7 @@
     "isGPUSuggested": true,
     "runtimeOverwrite": {
         "autoGenerated": true,
-        "executeRequirement": "AMD/Quark_py3.10.17"
+        "executeRequirement": "AMD/Quark_py3.12.13"
     },
     "epMinVersions": {
         "VitisAIExecutionProvider": "1.8.50"
@@ -39,7 +39,7 @@
             "name": "QuarkDataType"
         }
     ],
-    "optimizationDefault": "w:int4 a:bf16",
+    "optimizationDefault": "w:uint4 a:bf16",
     "sections": [
         {
             "autoGenerated": true,

@@ -12,7 +12,7 @@ recipes:
       ep: VitisAIExecutionProvider
       aitk:
         oliveFile: "VitisAI/DeepSeek-R1-Distill-Qwen-1.5B_quark_vitisai_llm.json"
-        requirements: AMD/Quark_py3.10.17
+        requirements: AMD/Quark_py3.12.13
         evalRuntime: AMDNPU
     - file: "deepseek_ov_gpu_config.json"
       devices:
@@ -41,7 +41,7 @@ recipes:
 aitk:
     modelInfo:
         id: "huggingface/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
-        version: 7
+        version: 8
         groupId: "huggingface/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
         groupItemName: "1.5B"
         p0: true