diff --git a/Qwen-Qwen3.5-2B/baseline/Qwen-Qwen3.5-2B_baseline_mmlu.json b/Qwen-Qwen3.5-2B/baseline/Qwen-Qwen3.5-2B_baseline_mmlu.json new file mode 100644 index 000000000..c74eca597 --- /dev/null +++ b/Qwen-Qwen3.5-2B/baseline/Qwen-Qwen3.5-2B_baseline_mmlu.json @@ -0,0 +1,32 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-2B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": ["CUDAExecutionProvider"] + } + ] + } + }, + "evaluators": { + "mmlu": { + "type": "LMEvaluator", + "tasks": ["mmlu"], + "model_class": "hf", + "batch_size": 8 + } + }, + "evaluator": "mmlu", + "target": "local_system", + "log_severity_level": 0, + "evaluate_input_model": true +} diff --git a/Qwen-Qwen3.5-2B/baseline/requirements.txt b/Qwen-Qwen3.5-2B/baseline/requirements.txt new file mode 100644 index 000000000..45243561c --- /dev/null +++ b/Qwen-Qwen3.5-2B/baseline/requirements.txt @@ -0,0 +1,5 @@ +accelerate +datasets +lm-eval +torch +transformers==4.52.4 diff --git a/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4.json b/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4.json new file mode 100644 index 000000000..dabdf4156 --- /dev/null +++ b/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4.json @@ -0,0 +1,42 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-2B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": ["CPUExecutionProvider"] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "extra_options": { + "exclude_embeds": false + } + }, + "q": { + "type": "GraphSurgeries", + "surgeries": [ + {"surgeon": "QuantizeEmbeddingInt8"}, + {"surgeon": "ShareEmbeddingLmHead"} + ], + "save_as_external_data": true + } + }, + "target": "local_system", + 
"log_severity_level": 0, + "output_dir": "model", + "cache_dir": "cache", + "no_artifacts": true +} diff --git a/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4_with_eval.json b/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4_with_eval.json new file mode 100644 index 000000000..40a659938 --- /dev/null +++ b/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4_with_eval.json @@ -0,0 +1,52 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-2B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": ["CPUExecutionProvider"] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "extra_options": { + "exclude_embeds": false + } + }, + "q": { + "type": "GraphSurgeries", + "surgeries": [ + {"surgeon": "QuantizeEmbeddingInt8"}, + {"surgeon": "ShareEmbeddingLmHead"} + ], + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "model.onnx.data" + } + }, + "evaluators": { + "mmlu": { + "type": "LMEvaluator", + "tasks": ["mmlu"], + "batch_size": 8 + } + }, + "evaluator": "mmlu", + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model", + "cache_dir": "cache", + "no_artifacts": true +} diff --git a/Qwen-Qwen3.5-2B/cpu/README.md b/Qwen-Qwen3.5-2B/cpu/README.md new file mode 100644 index 000000000..88d8ab391 --- /dev/null +++ b/Qwen-Qwen3.5-2B/cpu/README.md @@ -0,0 +1,28 @@ +# Qwen-Qwen3.5-2B — CPU optimization + +This folder contains Olive recipes for optimizing Qwen-Qwen3.5-2B targeting the CPU EP. 
+ +## What this folder is for + +- Execution Provider: CPU EP +- Default precision: INT4 +- Example recipe filename: Qwen-Qwen3.5-2B_cpu_int4.json + +## Setup + +1) Install the main branch of Olive: + - pip install git+https://github.com/microsoft/olive.git +2) Install the appropriate runtime package for this backend: + - onnxruntime-genai (CPU build) +3) Run Olive to build/optimize the model: + - olive run --config Qwen-Qwen3.5-2B_cpu_int4.json + +Additional notes: +- Pipeline: `ModelBuilder` (INT4 via Neural Compressor) → `QuantizeEmbeddingInt8` (post-hoc INT8 embedding) → `ShareEmbeddingLmHead` (share INT8 weight between embedding and lm_head) +- Model size: ~1.4 GB (down from 4.3 GB FP16) +- Uses text-only mode (exclude_embeds=false) for standalone LLM inference without the multimodal pipeline. +- Runs purely on CPU; no GPU required. + +--- + +This README was auto-generated for the CPU EP of Qwen-Qwen3.5-2B. diff --git a/Qwen-Qwen3.5-2B/cpu/info.yaml b/Qwen-Qwen3.5-2B/cpu/info.yaml new file mode 100644 index 000000000..9a04c5d16 --- /dev/null +++ b/Qwen-Qwen3.5-2B/cpu/info.yaml @@ -0,0 +1,6 @@ +arch: qwen3_5_text +recipes: + - name: Qwen-Qwen3.5-2B_cpu_int4 + file: Qwen-Qwen3.5-2B_cpu_int4.json + devices: cpu + eps: CPUExecutionProvider diff --git a/Qwen-Qwen3.5-2B/cpu/requirements.txt b/Qwen-Qwen3.5-2B/cpu/requirements.txt new file mode 100644 index 000000000..9d0477698 --- /dev/null +++ b/Qwen-Qwen3.5-2B/cpu/requirements.txt @@ -0,0 +1,4 @@ +accelerate +datasets +onnxruntime-genai +transformers==4.52.4 diff --git a/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4.json b/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4.json new file mode 100644 index 000000000..08d36d54c --- /dev/null +++ b/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4.json @@ -0,0 +1,42 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-2B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", 
+ "accelerators": [ + { + "device": "gpu", + "execution_providers": ["CUDAExecutionProvider"] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "extra_options": { + "exclude_embeds": false, + "enable_cuda_graph": true + } + }, + "q": { + "type": "GraphSurgeries", + "surgeries": [ + {"surgeon": "QuantizeEmbeddingInt8"}, + {"surgeon": "ShareEmbeddingLmHead"} + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model", + "cache_dir": "cache", + "no_artifacts": true +} diff --git a/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4_with_eval.json b/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4_with_eval.json new file mode 100644 index 000000000..782780a79 --- /dev/null +++ b/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4_with_eval.json @@ -0,0 +1,51 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-2B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": ["CUDAExecutionProvider"] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "extra_options": { + "exclude_embeds": false, + "enable_cuda_graph": true + } + }, + "q": { + "type": "GraphSurgeries", + "surgeries": [ + {"surgeon": "QuantizeEmbeddingInt8"}, + {"surgeon": "ShareEmbeddingLmHead"} + ], + "save_as_external_data": true + } + }, + "evaluators": { + "mmlu": { + "type": "LMEvaluator", + "tasks": ["mmlu"], + "batch_size": 8 + } + }, + "evaluator": "mmlu", + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model", + "cache_dir": "cache", + "no_artifacts": true +} diff --git a/Qwen-Qwen3.5-2B/cuda/README.md b/Qwen-Qwen3.5-2B/cuda/README.md new file mode 100644 index 000000000..9cbc8f4de --- /dev/null +++ b/Qwen-Qwen3.5-2B/cuda/README.md @@ -0,0 +1,31 @@ +# Qwen-Qwen3.5-2B — CUDA optimization + +This folder contains Olive recipes for 
optimizing Qwen-Qwen3.5-2B targeting the CUDA EP. + +## What this folder is for + +- Execution Provider: CUDA EP +- Default precision: INT4 +- Example recipe filename: Qwen-Qwen3.5-2B_cuda_int4.json + +## Setup + +1) Install the main branch of Olive: + - pip install git+https://github.com/microsoft/olive.git +2) Install the appropriate runtime package for this backend: + - onnxruntime-genai-cuda (CUDA build) +3) Run Olive to build/optimize the model: + - olive run --config Qwen-Qwen3.5-2B_cuda_int4.json + +Additional notes: +- Pipeline: `ModelBuilder` (INT4 via Neural Compressor) → `QuantizeEmbeddingInt8` (post-hoc INT8 embedding) → `ShareEmbeddingLmHead` (share INT8 weight between embedding and lm_head) +- Model size: ~1.4 GB (down from 4.3 GB FP16) +- MMLU accuracy: 57.11% (vs 59.27% FP16 baseline) +- Uses text-only mode (exclude_embeds=false) for standalone LLM inference without the multimodal pipeline. +- CUDA graph enabled for optimized decode throughput. +- Requires NVIDIA GPU with CUDA support. +- Ensure CUDA toolkit and cuDNN are properly installed. + +--- + +This README was auto-generated for the CUDA EP of Qwen-Qwen3.5-2B. 
diff --git a/Qwen-Qwen3.5-2B/cuda/info.yaml b/Qwen-Qwen3.5-2B/cuda/info.yaml new file mode 100644 index 000000000..b17353786 --- /dev/null +++ b/Qwen-Qwen3.5-2B/cuda/info.yaml @@ -0,0 +1,6 @@ +arch: qwen3_5_text +recipes: + - name: Qwen-Qwen3.5-2B_cuda_int4 + file: Qwen-Qwen3.5-2B_cuda_int4.json + devices: gpu + eps: CUDAExecutionProvider diff --git a/Qwen-Qwen3.5-2B/cuda/requirements.txt b/Qwen-Qwen3.5-2B/cuda/requirements.txt new file mode 100644 index 000000000..9d0477698 --- /dev/null +++ b/Qwen-Qwen3.5-2B/cuda/requirements.txt @@ -0,0 +1,4 @@ +accelerate +datasets +onnxruntime-genai +transformers==4.52.4 diff --git a/Qwen-Qwen3.5-2B/webgpu/Qwen-Qwen3.5-2B_webgpu_int4.json b/Qwen-Qwen3.5-2B/webgpu/Qwen-Qwen3.5-2B_webgpu_int4.json new file mode 100644 index 000000000..ee587661b --- /dev/null +++ b/Qwen-Qwen3.5-2B/webgpu/Qwen-Qwen3.5-2B_webgpu_int4.json @@ -0,0 +1,43 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-2B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": ["WebGpuExecutionProvider"] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_block_size": 32, + "extra_options": { + "exclude_embeds": false + } + }, + "q": { + "type": "GraphSurgeries", + "surgeries": [ + {"surgeon": "QuantizeEmbeddingInt8"}, + {"surgeon": "ShareEmbeddingLmHead"} + ], + "save_as_external_data": true + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model", + "cache_dir": "cache", + "no_artifacts": true +} diff --git a/Qwen-Qwen3.5-2B/webgpu/Qwen-Qwen3.5-2B_webgpu_int4_with_eval.json b/Qwen-Qwen3.5-2B/webgpu/Qwen-Qwen3.5-2B_webgpu_int4_with_eval.json new file mode 100644 index 000000000..ee788e395 --- /dev/null +++ b/Qwen-Qwen3.5-2B/webgpu/Qwen-Qwen3.5-2B_webgpu_int4_with_eval.json @@ -0,0 +1,52 @@ +{ + "input_model": { + "type": "HfModel", + 
"model_path": "Qwen/Qwen3.5-2B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": ["WebGpuExecutionProvider"] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_block_size": 32, + "extra_options": { + "exclude_embeds": false + } + }, + "q": { + "type": "GraphSurgeries", + "surgeries": [ + {"surgeon": "QuantizeEmbeddingInt8"}, + {"surgeon": "ShareEmbeddingLmHead"} + ], + "save_as_external_data": true, + "all_tensors_to_one_file": true + } + }, + "evaluators": { + "mmlu": { + "type": "LMEvaluator", + "tasks": ["mmlu"], + "batch_size": 8 + } + }, + "evaluator": "mmlu", + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model", + "cache_dir": "cache", + "no_artifacts": true +} diff --git a/Qwen-Qwen3.5-2B/webgpu/README.md b/Qwen-Qwen3.5-2B/webgpu/README.md new file mode 100644 index 000000000..5dacabb52 --- /dev/null +++ b/Qwen-Qwen3.5-2B/webgpu/README.md @@ -0,0 +1,29 @@ +# Qwen-Qwen3.5-2B — WebGPU optimization + +This folder contains Olive recipes for optimizing Qwen-Qwen3.5-2B targeting the WebGPU EP. 
+ +## What this folder is for + +- Execution Provider: WebGPU EP +- Default precision: INT4 +- Example recipe filename: Qwen-Qwen3.5-2B_webgpu_int4.json + +## Setup + +1) Install the main branch of Olive: + - pip install git+https://github.com/microsoft/olive.git +2) Install the appropriate runtime package for this backend: + - onnxruntime-web (WebGPU build) +3) Run Olive to build/optimize the model: + - olive run --config Qwen-Qwen3.5-2B_webgpu_int4.json + +Additional notes: +- Pipeline: `ModelBuilder` (INT4 via Neural Compressor) → `QuantizeEmbeddingInt8` (post-hoc INT8 embedding) → `ShareEmbeddingLmHead` (share INT8 weight between embedding and lm_head) +- Model size: ~1.4 GB (down from 4.3 GB FP16) +- Uses text-only mode (exclude_embeds=false) for standalone LLM inference without the multimodal pipeline. +- WebGPU enables GPU-accelerated inference in web browsers. +- Ensure your browser supports WebGPU (Chrome 113+, Edge 113+). + +--- + +This README was auto-generated for the WebGPU EP of Qwen-Qwen3.5-2B. diff --git a/Qwen-Qwen3.5-2B/webgpu/info.yaml b/Qwen-Qwen3.5-2B/webgpu/info.yaml new file mode 100644 index 000000000..b1f6953df --- /dev/null +++ b/Qwen-Qwen3.5-2B/webgpu/info.yaml @@ -0,0 +1,6 @@ +arch: qwen3_5_text +recipes: + - name: Qwen-Qwen3.5-2B_webgpu_int4 + file: Qwen-Qwen3.5-2B_webgpu_int4.json + devices: gpu + eps: WebGpuExecutionProvider diff --git a/Qwen-Qwen3.5-2B/webgpu/requirements.txt b/Qwen-Qwen3.5-2B/webgpu/requirements.txt new file mode 100644 index 000000000..9d0477698 --- /dev/null +++ b/Qwen-Qwen3.5-2B/webgpu/requirements.txt @@ -0,0 +1,4 @@ +accelerate +datasets +onnxruntime-genai +transformers==4.52.4