From 75717a022866dcb28d9d100f150ff0f2c6bc6046 Mon Sep 17 00:00:00 2001 From: Sunghoon Choi Date: Sun, 31 May 2026 06:29:35 +0000 Subject: [PATCH] Add HY-MT1.5-1.8B recipes --- tencent-HY-MT1.5-1.8B/LICENSE | 2 + tencent-HY-MT1.5-1.8B/baseline/README.md | 24 ++++ tencent-HY-MT1.5-1.8B/baseline/info.yaml | 6 + .../baseline/requirements.txt | 8 ++ ...ncent-HY-MT1.5-1.8B_pytorch_with_eval.json | 75 +++++++++++ tencent-HY-MT1.5-1.8B/cpu/README.md | 50 ++++++++ tencent-HY-MT1.5-1.8B/cpu/info.yaml | 18 +++ tencent-HY-MT1.5-1.8B/cpu/requirements.txt | 9 ++ .../cpu/tencent-HY-MT1.5-1.8B_cpu_fp32.json | 44 +++++++ ...cent-HY-MT1.5-1.8B_cpu_fp32_with_eval.json | 98 ++++++++++++++ .../cpu/tencent-HY-MT1.5-1.8B_cpu_int4.json | 66 ++++++++++ ...cent-HY-MT1.5-1.8B_cpu_int4_with_eval.json | 121 ++++++++++++++++++ tencent-HY-MT1.5-1.8B/cuda/README.md | 75 +++++++++++ tencent-HY-MT1.5-1.8B/cuda/info.yaml | 18 +++ tencent-HY-MT1.5-1.8B/cuda/requirements.txt | 10 ++ .../cuda/tencent-HY-MT1.5-1.8B_cuda_fp16.json | 42 ++++++ ...ent-HY-MT1.5-1.8B_cuda_fp16_with_eval.json | 98 ++++++++++++++ .../cuda/tencent-HY-MT1.5-1.8B_cuda_int4.json | 64 +++++++++ ...ent-HY-MT1.5-1.8B_cuda_int4_with_eval.json | 120 +++++++++++++++++ tencent-HY-MT1.5-1.8B/webgpu/README.md | 63 +++++++++ tencent-HY-MT1.5-1.8B/webgpu/info.yaml | 18 +++ tencent-HY-MT1.5-1.8B/webgpu/requirements.txt | 11 ++ .../tencent-HY-MT1.5-1.8B_webgpu_fp16.json | 41 ++++++ ...t-HY-MT1.5-1.8B_webgpu_fp16_with_eval.json | 95 ++++++++++++++ .../tencent-HY-MT1.5-1.8B_webgpu_int4.json | 63 +++++++++ ...t-HY-MT1.5-1.8B_webgpu_int4_with_eval.json | 118 +++++++++++++++++ 26 files changed, 1357 insertions(+) create mode 100644 tencent-HY-MT1.5-1.8B/LICENSE create mode 100644 tencent-HY-MT1.5-1.8B/baseline/README.md create mode 100644 tencent-HY-MT1.5-1.8B/baseline/info.yaml create mode 100644 tencent-HY-MT1.5-1.8B/baseline/requirements.txt create mode 100644 tencent-HY-MT1.5-1.8B/baseline/tencent-HY-MT1.5-1.8B_pytorch_with_eval.json create 
mode 100644 tencent-HY-MT1.5-1.8B/cpu/README.md create mode 100644 tencent-HY-MT1.5-1.8B/cpu/info.yaml create mode 100644 tencent-HY-MT1.5-1.8B/cpu/requirements.txt create mode 100644 tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_fp32.json create mode 100644 tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_fp32_with_eval.json create mode 100644 tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_int4.json create mode 100644 tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_int4_with_eval.json create mode 100644 tencent-HY-MT1.5-1.8B/cuda/README.md create mode 100644 tencent-HY-MT1.5-1.8B/cuda/info.yaml create mode 100644 tencent-HY-MT1.5-1.8B/cuda/requirements.txt create mode 100644 tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_fp16.json create mode 100644 tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_fp16_with_eval.json create mode 100644 tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_int4.json create mode 100644 tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_int4_with_eval.json create mode 100644 tencent-HY-MT1.5-1.8B/webgpu/README.md create mode 100644 tencent-HY-MT1.5-1.8B/webgpu/info.yaml create mode 100644 tencent-HY-MT1.5-1.8B/webgpu/requirements.txt create mode 100644 tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_fp16.json create mode 100644 tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_fp16_with_eval.json create mode 100644 tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_int4.json create mode 100644 tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_int4_with_eval.json diff --git a/tencent-HY-MT1.5-1.8B/LICENSE b/tencent-HY-MT1.5-1.8B/LICENSE new file mode 100644 index 000000000..a49126107 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/LICENSE @@ -0,0 +1,2 @@ +The upstream model is distributed under the Tencent HY Community License Agreement. +See https://huggingface.co/tencent/HY-MT1.5-1.8B/blob/main/License.txt. 
diff --git a/tencent-HY-MT1.5-1.8B/baseline/README.md b/tencent-HY-MT1.5-1.8B/baseline/README.md new file mode 100644 index 000000000..a7ac53836 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/baseline/README.md @@ -0,0 +1,24 @@ +# tencent-HY-MT1.5-1.8B - Baseline PyTorch Evaluation + +This folder contains an Olive recipe for evaluating the Hugging Face PyTorch base model `tencent/HY-MT1.5-1.8B` on WMT18 Chinese-to-English translation with LM-eval. + +## Setup + +```bash +pip install -r requirements.txt +``` + +## Run evaluation + +```bash +olive run --config tencent-HY-MT1.5-1.8B_pytorch_with_eval.json +``` + +## Evaluation results + +PyTorch baseline WMT18 Chinese-to-English translation metrics, run on GPU: + +- BLEU: `12.634090637487896` +- chrF: `35.290981008260474` +- TER: `85.7224234039956` +- Samples: `3981` diff --git a/tencent-HY-MT1.5-1.8B/baseline/info.yaml b/tencent-HY-MT1.5-1.8B/baseline/info.yaml new file mode 100644 index 000000000..934878dcb --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/baseline/info.yaml @@ -0,0 +1,6 @@ +arch: hunyuan_v1_dense +recipes: + - name: tencent-HY-MT1.5-1.8B_pytorch_with_eval + file: tencent-HY-MT1.5-1.8B_pytorch_with_eval.json + devices: gpu + eps: PyTorch diff --git a/tencent-HY-MT1.5-1.8B/baseline/requirements.txt b/tencent-HY-MT1.5-1.8B/baseline/requirements.txt new file mode 100644 index 000000000..414fb58bf --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/baseline/requirements.txt @@ -0,0 +1,8 @@ +accelerate +datasets +lm-eval +olive-ai +sentencepiece +tiktoken +torch +transformers==4.56.1 diff --git a/tencent-HY-MT1.5-1.8B/baseline/tencent-HY-MT1.5-1.8B_pytorch_with_eval.json b/tencent-HY-MT1.5-1.8B/baseline/tencent-HY-MT1.5-1.8B_pytorch_with_eval.json new file mode 100644 index 000000000..b6709b47f --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/baseline/tencent-HY-MT1.5-1.8B_pytorch_with_eval.json @@ -0,0 +1,75 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": 
"float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu" + } + ] + } + }, + "evaluators": { + "zh_en_translation": { + "type": "LMEvaluator", + "tasks": [ + { + "task": "wmt18-zh-en", + "dataset_path": "wmt/wmt18", + "dataset_name": "zh-en", + "output_type": "generate_until", + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "validation", + "test_split": "test", + "doc_to_text": "Translate the following segment into English, without additional explanation.\n\n{{translation[\"zh\"]}}\nEnglish translation:", + "doc_to_target": "{{translation[\"en\"]}}", + "metric_list": [ + { + "metric": "bleu", + "aggregation": "bleu", + "higher_is_better": true + }, + { + "metric": "ter", + "aggregation": "ter", + "higher_is_better": false + }, + { + "metric": "chrf", + "aggregation": "chrf", + "higher_is_better": true + } + ], + "generation_kwargs": { + "until": [ + "\n" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "metadata": { + "version": 1.0 + } + } + ], + "batch_size": 32, + "bootstrap_iters": 0, + "max_length": 2048, + "device": "gpu" + } + }, + "evaluator": "zh_en_translation", + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_pytorch", + "cache_dir": "cache_pytorch" +} diff --git a/tencent-HY-MT1.5-1.8B/cpu/README.md b/tencent-HY-MT1.5-1.8B/cpu/README.md new file mode 100644 index 000000000..62aabeb4e --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cpu/README.md @@ -0,0 +1,50 @@ +# tencent-HY-MT1.5-1.8B - CPU Optimization + +This folder contains Olive recipes for optimizing `tencent/HY-MT1.5-1.8B` for `CPUExecutionProvider`. 
+ +## Recipes + +- `tencent-HY-MT1.5-1.8B_cpu_fp32.json` +- `tencent-HY-MT1.5-1.8B_cpu_fp32_with_eval.json` +- `tencent-HY-MT1.5-1.8B_cpu_int4.json` +- `tencent-HY-MT1.5-1.8B_cpu_int4_with_eval.json` + +## Setup + +```bash +pip install -r requirements.txt +``` + +## Build examples + +```bash +olive run --config tencent-HY-MT1.5-1.8B_cpu_fp32.json +olive run --config tencent-HY-MT1.5-1.8B_cpu_int4.json +``` + +## Build and evaluate with WMT18 Chinese-to-English translation + +```bash +olive run --config tencent-HY-MT1.5-1.8B_cpu_fp32_with_eval.json +olive run --config tencent-HY-MT1.5-1.8B_cpu_int4_with_eval.json +``` + +## Notes + +- HY-MT1.5-1.8B config has tie_word_embeddings=true, so TieWordEmbeddings surgery is applied after ModelBuilder. +- Full precision recipe for this backend uses `fp32`. +- The primary INT4 recipe uses full INT4 with `group_size: 32`: GPTQ -> RTN -> ModelBuilder. +- The primary INT4 recipe omits SelectiveMixedPrecision and quantizes embedding / lm_head to INT4 as well. +- CPU recipes save ONNX weights to `model.onnx.data` through the final GraphSurgeries pass. +- Full INT4 with `group_size: 32` was selected because it is smaller than the previous CPU INT4 artifact and still improves WMT18 Chinese-to-English quality over the PyTorch baseline when evaluated with CPU EP. + +## Evaluation results + +WMT18 Chinese-to-English translation, 3,981 test samples. BLEU and chrF are higher-is-better; TER is lower-is-better. CPU ONNX results were evaluated with ORT GenAI using CPU EP. 
+ +| Model | Embedding / lm_head | Size | BLEU | chrF | TER | +| --- | --- | ---: | ---: | ---: | ---: | +| PyTorch baseline | fp16 | 3.806125 GiB | 12.634090637487896 | 35.290981008260474 | 85.7224234039956 | +| Current CPU INT4 recipe (full INT4, `group_size: 32`) | int4 | 1.203313 GiB | 16.920205475631327 | 42.982874679434815 | 80.8764291817553 | + +The current INT4 artifact is 68.4% smaller than the PyTorch model and improves BLEU by 4.29, chrF by 7.69, and TER by 4.85 points. It is also smaller than the previous CPU INT4 artifact based on SelectiveMixedPrecision `ratio: 0.83` (1.288222 GiB), so the full INT4 `group_size: 32` recipe is the primary CPU INT4 path. diff --git a/tencent-HY-MT1.5-1.8B/cpu/info.yaml b/tencent-HY-MT1.5-1.8B/cpu/info.yaml new file mode 100644 index 000000000..cae47f890 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cpu/info.yaml @@ -0,0 +1,18 @@ +arch: hunyuan_v1_dense +recipes: + - name: tencent-HY-MT1.5-1.8B_cpu_fp32 + file: tencent-HY-MT1.5-1.8B_cpu_fp32.json + devices: cpu + eps: CPUExecutionProvider + - name: tencent-HY-MT1.5-1.8B_cpu_fp32_with_eval + file: tencent-HY-MT1.5-1.8B_cpu_fp32_with_eval.json + devices: cpu + eps: CPUExecutionProvider + - name: tencent-HY-MT1.5-1.8B_cpu_int4 + file: tencent-HY-MT1.5-1.8B_cpu_int4.json + devices: cpu + eps: CPUExecutionProvider + - name: tencent-HY-MT1.5-1.8B_cpu_int4_with_eval + file: tencent-HY-MT1.5-1.8B_cpu_int4_with_eval.json + devices: cpu + eps: CPUExecutionProvider diff --git a/tencent-HY-MT1.5-1.8B/cpu/requirements.txt b/tencent-HY-MT1.5-1.8B/cpu/requirements.txt new file mode 100644 index 000000000..d018f61fa --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cpu/requirements.txt @@ -0,0 +1,9 @@ +accelerate +datasets +lm-eval +olive-ai +onnxruntime-genai +sentencepiece +tiktoken +torch +transformers==4.56.1 diff --git a/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_fp32.json b/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_fp32.json new file mode 100644 index 000000000..43cd5deef 
--- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_fp32.json @@ -0,0 +1,44 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "fp32" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ], + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "model.onnx.data" + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_cpu_fp32", + "cache_dir": "cache_cpu_fp32", + "no_artifacts": true +} diff --git a/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_fp32_with_eval.json b/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_fp32_with_eval.json new file mode 100644 index 000000000..fc365dc28 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_fp32_with_eval.json @@ -0,0 +1,98 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "fp32" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ], + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "model.onnx.data" + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_cpu_fp32", + "cache_dir": "cache_cpu_fp32", + "evaluators": { + "zh_en_translation": { + "type": "LMEvaluator", + "tasks": [ + { + "task": 
"wmt18-zh-en", + "dataset_path": "wmt/wmt18", + "dataset_name": "zh-en", + "output_type": "generate_until", + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "validation", + "test_split": "test", + "doc_to_text": "Translate the following segment into English, without additional explanation.\n\n{{translation[\"zh\"]}}\nEnglish translation:", + "doc_to_target": "{{translation[\"en\"]}}", + "metric_list": [ + { + "metric": "bleu", + "aggregation": "bleu", + "higher_is_better": true + }, + { + "metric": "ter", + "aggregation": "ter", + "higher_is_better": false + }, + { + "metric": "chrf", + "aggregation": "chrf", + "higher_is_better": true + } + ], + "generation_kwargs": { + "until": [ + "\n" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "metadata": { + "version": 1.0 + } + } + ], + "batch_size": 1, + "bootstrap_iters": 0, + "max_length": 2048, + "provider_options": { + "enable_skip_layer_norm_strict_mode": "1" + } + } + }, + "evaluator": "zh_en_translation", + "evaluate_input_model": false +} diff --git a/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_int4.json b/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_int4.json new file mode 100644 index 000000000..b659b6d57 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_int4.json @@ -0,0 +1,66 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + } + }, + "passes": { + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 4, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 4 + }, + "model.embed_tokens": { + "bits": 4 + } + } + }, + "m": { + "type": 
"ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ], + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "model.onnx.data" + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_cpu_int4", + "cache_dir": "cache_cpu_int4", + "no_artifacts": true +} diff --git a/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_int4_with_eval.json b/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_int4_with_eval.json new file mode 100644 index 000000000..e67ddb343 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cpu/tencent-HY-MT1.5-1.8B_cpu_int4_with_eval.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + } + }, + "passes": { + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 4, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 4 + }, + "model.embed_tokens": { + "bits": 4 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ], + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "model.onnx.data" + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_cpu_int4", + "cache_dir": "cache_cpu_int4", + "evaluators": { + "zh_en_translation": { + "type": "LMEvaluator", + "tasks": [ + { + "task": "wmt18-zh-en", + "dataset_path": "wmt/wmt18", + "dataset_name": "zh-en", + "output_type": "generate_until", + "training_split": "train", + 
"validation_split": "validation", + "fewshot_split": "validation", + "test_split": "test", + "doc_to_text": "Translate the following segment into English, without additional explanation.\n\n{{translation[\"zh\"]}}\nEnglish translation:", + "doc_to_target": "{{translation[\"en\"]}}", + "metric_list": [ + { + "metric": "bleu", + "aggregation": "bleu", + "higher_is_better": true + }, + { + "metric": "ter", + "aggregation": "ter", + "higher_is_better": false + }, + { + "metric": "chrf", + "aggregation": "chrf", + "higher_is_better": true + } + ], + "generation_kwargs": { + "until": [ + "\n" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "metadata": { + "version": 1.0 + } + } + ], + "model_class": "ortgenai", + "batch_size": 1, + "bootstrap_iters": 0, + "max_length": 2048, + "provider_options": { + "enable_skip_layer_norm_strict_mode": "1" + } + } + }, + "evaluator": "zh_en_translation", + "evaluate_input_model": false +} diff --git a/tencent-HY-MT1.5-1.8B/cuda/README.md b/tencent-HY-MT1.5-1.8B/cuda/README.md new file mode 100644 index 000000000..755f156f9 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cuda/README.md @@ -0,0 +1,75 @@ +# tencent-HY-MT1.5-1.8B - CUDA Optimization + +This folder contains Olive recipes for optimizing `tencent/HY-MT1.5-1.8B` for `CUDAExecutionProvider`. 
+ +## Recipes + +- `tencent-HY-MT1.5-1.8B_cuda_fp16.json` +- `tencent-HY-MT1.5-1.8B_cuda_fp16_with_eval.json` +- `tencent-HY-MT1.5-1.8B_cuda_int4.json` +- `tencent-HY-MT1.5-1.8B_cuda_int4_with_eval.json` + +## Setup + +```bash +pip install -r requirements.txt +``` + +## Build examples + +```bash +olive run --config tencent-HY-MT1.5-1.8B_cuda_fp16.json +olive run --config tencent-HY-MT1.5-1.8B_cuda_int4.json +``` + +## Build and evaluate with WMT18 Chinese-to-English translation + +```bash +olive run --config tencent-HY-MT1.5-1.8B_cuda_fp16_with_eval.json +olive run --config tencent-HY-MT1.5-1.8B_cuda_int4_with_eval.json +``` + +## Notes + +- HY-MT1.5-1.8B config has tie_word_embeddings=true, so TieWordEmbeddings surgery is applied after ModelBuilder. +- Full precision recipe for this backend uses `fp16`. +- Primary INT4 recipes use full INT4 with `group_size: 32`: GPTQ -> RTN -> ModelBuilder. +- Primary INT4 recipes omit SelectiveMixedPrecision and quantize embedding / lm_head to INT4 as well. +- The previous SelectiveMixedPrecision `ratio: 0.83` primary remains in the tuning history, but it has been replaced by the smaller and higher-quality full INT4 `group_size: 32` recipe. + +## Evaluation results + +WMT18 Chinese-to-English translation, 3,981 test samples. BLEU and chrF are higher-is-better; TER is lower-is-better. 
+ +| Model | Embedding / lm_head | Size | BLEU | chrF | TER | +| --- | --- | ---: | ---: | ---: | ---: | +| PyTorch baseline | fp16 | 3.806125 GiB | 12.634090637487896 | 35.290981008260474 | 85.7224234039956 | +| Previous CUDA INT4 recipe (`ratio: 0.65`) | int8 | 1.350105 GiB | 15.265524905493395 | 39.79625126006993 | 84.66147210806851 | +| Previous CUDA INT4 recipe (`ratio: 0.83`) | int8 | 1.199468 GiB | 13.362969543291351 | 36.72419140068781 | 85.49761972575986 | +| Current CUDA INT4 recipe (full INT4, `group_size: 32`) | int4 | 1.036290 GiB | 17.170642492358855 | 43.50620755758628 | 80.59466167555031 | +| CUDA full INT4 (`group_size: 128`) | int4 | 0.938559 GiB | 3.759288935325742 | 20.2313593462662 | 93.06058509989013 | +| CUDA INT4 / INT8 mixed | int8 | 1.054681 GiB | 3.325815186575075 | 19.319673423679575 | 93.61699963380397 | + +The current CUDA INT4 recipe uses full INT4 GPTQ for the body and full INT4 RTN for embedding / lm_head. All CUDA ONNX variants apply TieWordEmbeddings after ModelBuilder. + +The previous `ratio: 0.83` primary CUDA INT4 output and cache were backed up before promoting full INT4 `group_size: 32` to `model_cuda_int4`. + +## SelectiveMixedPrecision ratio tuning + +`ratio` is the fraction of scored parameters kept at the default low precision. Higher values usually reduce model size, but after the sensitive layers are no longer promoted to 8-bit, translation quality drops quickly. 
+ +| Recipe | Configured ratio | Size | BLEU | chrF | TER | Decision | +| --- | ---: | ---: | ---: | ---: | ---: | --- | +| Previous primary | 0.65 | 1.350105 GiB | 15.265524905493395 | 39.79625126006993 | 84.66147210806851 | Replaced | +| Tuning run | 0.75 | 1.268385 GiB | 17.78380603332943 | 44.55479318041093 | 82.10928917280384 | Best quality | +| Tuning run | 0.80 | 1.224083 GiB | 16.8883865898559 | 43.2185687820676 | 82.54465557228303 | Best quality / size point versus previous INT4 | +| Tuning run | 0.81 | 1.216207 GiB | 14.359839243852116 | 38.37412933939934 | 84.75810717337347 | Smaller, but no longer beats previous primary on all metrics | +| Tuning run | 0.82 | 1.211281 GiB | 13.304635612235579 | 36.437981160175106 | 85.74683647312527 | Near baseline, TER slightly worse than PyTorch | +| Previous primary | 0.83 | 1.199468 GiB | 13.362969543291351 | 36.72419140068781 | 85.49761972575986 | Replaced by full INT4 `group_size: 32` | +| Tuning run | 0.84 | 1.193562 GiB | 14.45227322167928 | 38.44629014667923 | 86.48940065915286 | Smaller and BLEU/chrF above PyTorch, but TER worse than PyTorch | +| Tuning run | 0.85 | 1.181749 GiB | 10.513690403174172 | 31.910545705126708 | 88.3702241933515 | Quality collapse | +| Tuning run | 0.90 | 1.140391 GiB | 6.483511887686124 | 25.198896559504853 | 91.26825894128658 | Quality collapse | + +Compared with PyTorch, the current full INT4 `group_size: 32` recipe is 72.8% smaller and improves BLEU by 4.54, chrF by 8.22, and TER by 5.13 points. The SelectiveMixedPrecision sweep remains as historical tuning data, but it is no longer the recommended CUDA INT4 path. + +The CUDA INT4 / INT8 mixed result was reproduced after clearing `cache_cuda_int4_int8` and `model_cuda_int4_int8`, then re-exporting and re-evaluating from scratch. 
diff --git a/tencent-HY-MT1.5-1.8B/cuda/info.yaml b/tencent-HY-MT1.5-1.8B/cuda/info.yaml new file mode 100644 index 000000000..e784ee462 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cuda/info.yaml @@ -0,0 +1,18 @@ +arch: hunyuan_v1_dense +recipes: + - name: tencent-HY-MT1.5-1.8B_cuda_fp16 + file: tencent-HY-MT1.5-1.8B_cuda_fp16.json + devices: gpu + eps: CUDAExecutionProvider + - name: tencent-HY-MT1.5-1.8B_cuda_fp16_with_eval + file: tencent-HY-MT1.5-1.8B_cuda_fp16_with_eval.json + devices: gpu + eps: CUDAExecutionProvider + - name: tencent-HY-MT1.5-1.8B_cuda_int4 + file: tencent-HY-MT1.5-1.8B_cuda_int4.json + devices: gpu + eps: CUDAExecutionProvider + - name: tencent-HY-MT1.5-1.8B_cuda_int4_with_eval + file: tencent-HY-MT1.5-1.8B_cuda_int4_with_eval.json + devices: gpu + eps: CUDAExecutionProvider diff --git a/tencent-HY-MT1.5-1.8B/cuda/requirements.txt b/tencent-HY-MT1.5-1.8B/cuda/requirements.txt new file mode 100644 index 000000000..a76691c7c --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cuda/requirements.txt @@ -0,0 +1,10 @@ +accelerate +datasets +lm-eval +olive-ai +onnxruntime-genai-cuda +onnxruntime-gpu +sentencepiece +tiktoken +torch +transformers==4.56.1 diff --git a/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_fp16.json b/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_fp16.json new file mode 100644 index 000000000..3bc725a27 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_fp16.json @@ -0,0 +1,42 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "CUDAExecutionProvider" + ] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "fp16", + "enable_cuda_graph": true + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + 
"target": "local_system", + "log_severity_level": 0, + "output_dir": "model_cuda_fp16", + "cache_dir": "cache_cuda_fp16", + "no_artifacts": true +} diff --git a/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_fp16_with_eval.json b/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_fp16_with_eval.json new file mode 100644 index 000000000..8e3435d98 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_fp16_with_eval.json @@ -0,0 +1,98 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "CUDAExecutionProvider" + ] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "fp16", + "enable_cuda_graph": true + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_cuda_fp16", + "cache_dir": "cache_cuda_fp16", + "evaluators": { + "zh_en_translation": { + "type": "LMEvaluator", + "tasks": [ + { + "task": "wmt18-zh-en", + "dataset_path": "wmt/wmt18", + "dataset_name": "zh-en", + "output_type": "generate_until", + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "validation", + "test_split": "test", + "doc_to_text": "Translate the following segment into English, without additional explanation.\n\n{{translation[\"zh\"]}}\nEnglish translation:", + "doc_to_target": "{{translation[\"en\"]}}", + "metric_list": [ + { + "metric": "bleu", + "aggregation": "bleu", + "higher_is_better": true + }, + { + "metric": "ter", + "aggregation": "ter", + "higher_is_better": false + }, + { + "metric": "chrf", + "aggregation": "chrf", + "higher_is_better": true + } + ], + "generation_kwargs": { + "until": [ + "\n" + ], + "do_sample": false, + "temperature": 0.0 
+ }, + "repeats": 1, + "metadata": { + "version": 1.0 + } + } + ], + "model_class": "ortgenai", + "batch_size": 1, + "bootstrap_iters": 0, + "max_length": 2048, + "provider_options": { + "enable_cuda_graph": "1", + "enable_skip_layer_norm_strict_mode": "1" + } + } + }, + "evaluator": "zh_en_translation", + "evaluate_input_model": false +} diff --git a/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_int4.json b/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_int4.json new file mode 100644 index 000000000..0cd9634c6 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_int4.json @@ -0,0 +1,64 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "CUDAExecutionProvider" + ] + } + ] + } + }, + "passes": { + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 4, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 4 + }, + "model.embed_tokens": { + "bits": 4 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4", + "enable_cuda_graph": true + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_cuda_int4", + "cache_dir": "cache_cuda_int4", + "no_artifacts": true +} diff --git a/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_int4_with_eval.json b/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_int4_with_eval.json new file mode 100644 index 000000000..cc572ad52 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/cuda/tencent-HY-MT1.5-1.8B_cuda_int4_with_eval.json @@ -0,0 +1,120 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": 
"tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "CUDAExecutionProvider" + ] + } + ] + } + }, + "passes": { + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 4, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 4 + }, + "model.embed_tokens": { + "bits": 4 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4", + "enable_cuda_graph": true + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_cuda_int4", + "cache_dir": "cache_cuda_int4", + "evaluators": { + "zh_en_translation": { + "type": "LMEvaluator", + "tasks": [ + { + "task": "wmt18-zh-en", + "dataset_path": "wmt/wmt18", + "dataset_name": "zh-en", + "output_type": "generate_until", + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "validation", + "test_split": "test", + "doc_to_text": "Translate the following segment into English, without additional explanation.\n\n{{translation[\"zh\"]}}\nEnglish translation:", + "doc_to_target": "{{translation[\"en\"]}}", + "metric_list": [ + { + "metric": "bleu", + "aggregation": "bleu", + "higher_is_better": true + }, + { + "metric": "ter", + "aggregation": "ter", + "higher_is_better": false + }, + { + "metric": "chrf", + "aggregation": "chrf", + "higher_is_better": true + } + ], + "generation_kwargs": { + "until": [ + "\n" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "metadata": { + "version": 1.0 + } + } + ], + "model_class": "ortgenai", + "batch_size": 1, + "bootstrap_iters": 0, + "max_length": 2048, + "provider_options": { + "enable_cuda_graph": "1", + 
"enable_skip_layer_norm_strict_mode": "1" + } + } + }, + "evaluator": "zh_en_translation", + "evaluate_input_model": false +} diff --git a/tencent-HY-MT1.5-1.8B/webgpu/README.md b/tencent-HY-MT1.5-1.8B/webgpu/README.md new file mode 100644 index 000000000..7f776df3a --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/webgpu/README.md @@ -0,0 +1,63 @@ +# tencent-HY-MT1.5-1.8B - WebGPU Optimization + +This folder contains Olive recipes for optimizing `tencent/HY-MT1.5-1.8B` for `WebGpuExecutionProvider`. + +## Recipes + +- `tencent-HY-MT1.5-1.8B_webgpu_fp16.json` +- `tencent-HY-MT1.5-1.8B_webgpu_fp16_with_eval.json` +- `tencent-HY-MT1.5-1.8B_webgpu_int4.json` +- `tencent-HY-MT1.5-1.8B_webgpu_int4_with_eval.json` + +## Setup + +```bash +pip install -r requirements.txt +``` + +## Build examples + +```bash +olive run --config tencent-HY-MT1.5-1.8B_webgpu_fp16.json +olive run --config tencent-HY-MT1.5-1.8B_webgpu_int4.json +``` + +## Build and evaluate with WMT18 Chinese-to-English translation + +```bash +olive run --config tencent-HY-MT1.5-1.8B_webgpu_fp16_with_eval.json +olive run --config tencent-HY-MT1.5-1.8B_webgpu_int4_with_eval.json +``` + +## Notes + +- HY-MT1.5-1.8B config has tie_word_embeddings=true, so TieWordEmbeddings surgery is applied after ModelBuilder. +- Full precision recipe for this backend uses `fp16`. +- Primary INT4 recipes use the full INT4 pass chain: GPTQ -> RTN -> ModelBuilder. +- Primary INT4 recipes omit SelectiveMixedPrecision and quantize embedding / lm_head to INT4 as well, while preserving WebGPU `group_size: 32`. +- The SelectiveMixedPrecision `ratio: 0.98` tuning output is kept as a higher-quality reference point, but it is no longer the primary INT4 recipe. + +## Evaluation results + +WMT18 Chinese-to-English translation, 3,981 test samples. BLEU and chrF are higher-is-better; TER is lower-is-better. The WebGPU-exported models were evaluated with ORT GenAI using CUDA EP for measurement. 
+ +| Model | Embedding / lm_head | Size | BLEU | chrF | TER | +| --- | --- | ---: | ---: | ---: | ---: | +| PyTorch baseline | fp16 | 3.806125 GiB | 12.634090637487896 | 35.290981008260474 | 85.7224234039956 | +| WebGPU INT4 (full INT4, primary) | int4 | 1.036270 GiB | 17.170642492358855 | 43.50620755758628 | 80.59466167555031 | +| WebGPU INT4 SMP (`ratio: 0.98`) | int8 | 1.175239 GiB | 19.914400461490622 | 51.365618622203314 | 78.65382267974122 | + +## SelectiveMixedPrecision ratio tuning + +The WebGPU sweep keeps `group_size: 32` and evaluates WebGPU-exported models with ORT GenAI using CUDA EP for measurement. + +| Configured ratio | Size | BLEU | chrF | TER | Decision | +| ---: | ---: | ---: | ---: | ---: | --- | +| 0.83 | 1.303161 GiB | 19.361209647557704 | 47.93900531979513 | 81.56406396224112 | Earlier conservative point | +| 0.90 | 1.243736 GiB | 17.294982888634184 | 44.015683477618495 | 83.01664157545673 | Smaller than 0.83, but lower quality | +| 0.95 | 1.198408 GiB | 19.59367777635586 | 47.99027966955557 | 80.96187492370916 | Good size / quality point | +| 0.98 | 1.175239 GiB | 19.914400461490622 | 51.365618622203314 | 78.65382267974122 | Best tested SMP quality point | +| 0.985 | 1.170202 GiB | 15.526458652910305 | 41.07165704340729 | 83.61679619156122 | Sharp quality drop just above 0.98 | +| 0.99 | 1.164157 GiB | 18.187055216810382 | 45.06622281076286 | 81.91093298612525 | Smallest SMP point, but below 0.98 quality | + +`ratio: 0.98` is the best tested SMP point: it is smaller than `0.83`, scores better than `0.95` on BLEU, chrF, and TER, and avoids the quality drop seen at `0.985` and `0.99`. The primary INT4 recipe nevertheless uses full INT4: at 1.036270 GiB it is smaller than every SMP variant (1.164157 to 1.303161 GiB) and still beats the PyTorch baseline on BLEU, chrF, and TER, at the cost of lower scores than SMP `ratio: 0.98` on all three metrics.
diff --git a/tencent-HY-MT1.5-1.8B/webgpu/info.yaml b/tencent-HY-MT1.5-1.8B/webgpu/info.yaml new file mode 100644 index 000000000..27911c759 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/webgpu/info.yaml @@ -0,0 +1,18 @@ +arch: hunyuan_v1_dense +recipes: + - name: tencent-HY-MT1.5-1.8B_webgpu_fp16 + file: tencent-HY-MT1.5-1.8B_webgpu_fp16.json + devices: gpu + eps: WebGpuExecutionProvider + - name: tencent-HY-MT1.5-1.8B_webgpu_fp16_with_eval + file: tencent-HY-MT1.5-1.8B_webgpu_fp16_with_eval.json + devices: gpu + eps: WebGpuExecutionProvider + - name: tencent-HY-MT1.5-1.8B_webgpu_int4 + file: tencent-HY-MT1.5-1.8B_webgpu_int4.json + devices: gpu + eps: WebGpuExecutionProvider + - name: tencent-HY-MT1.5-1.8B_webgpu_int4_with_eval + file: tencent-HY-MT1.5-1.8B_webgpu_int4_with_eval.json + devices: gpu + eps: WebGpuExecutionProvider diff --git a/tencent-HY-MT1.5-1.8B/webgpu/requirements.txt b/tencent-HY-MT1.5-1.8B/webgpu/requirements.txt new file mode 100644 index 000000000..9e5654646 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/webgpu/requirements.txt @@ -0,0 +1,11 @@ +--pre +accelerate +datasets +lm-eval +olive-ai +onnxruntime-genai +onnxruntime-webgpu +sentencepiece +tiktoken +torch +transformers==4.56.1 diff --git a/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_fp16.json b/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_fp16.json new file mode 100644 index 000000000..d44d44a5f --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_fp16.json @@ -0,0 +1,41 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "fp16" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon":
"TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_webgpu_fp16", + "cache_dir": "cache_webgpu_fp16", + "no_artifacts": true +} diff --git a/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_fp16_with_eval.json b/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_fp16_with_eval.json new file mode 100644 index 000000000..ce1badac2 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_fp16_with_eval.json @@ -0,0 +1,96 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "fp16" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_webgpu_fp16", + "cache_dir": "cache_webgpu_fp16", + "evaluators": { + "zh_en_translation": { + "type": "LMEvaluator", + "tasks": [ + { + "task": "wmt18-zh-en", + "dataset_path": "wmt/wmt18", + "dataset_name": "zh-en", + "output_type": "generate_until", + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "validation", + "test_split": "test", + "doc_to_text": "Translate the following segment into English, without additional explanation.\n\n{{translation[\"zh\"]}}\nEnglish translation:", + "doc_to_target": "{{translation[\"en\"]}}", + "metric_list": [ + { + "metric": "bleu", + "aggregation": "bleu", + "higher_is_better": true + }, + { + "metric": "ter", + "aggregation": "ter", + "higher_is_better": false + }, + { + "metric": "chrf", + "aggregation": "chrf", + "higher_is_better": true + } + ], + "generation_kwargs": { + "until": [ + "\n" + ], +
"do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "metadata": { + "version": 1.0 + } + } + ], + "model_class": "ortgenai", + "batch_size": 1, + "bootstrap_iters": 0, + "max_length": 2048, + "provider_options": { + "enable_skip_layer_norm_strict_mode": "1" + } + } + }, + "evaluator": "zh_en_translation", + "evaluate_input_model": false +} diff --git a/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_int4.json b/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_int4.json new file mode 100644 index 000000000..21d4d0083 --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_int4.json @@ -0,0 +1,63 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 4, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 4 + }, + "model.embed_tokens": { + "bits": 4 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_webgpu_int4", + "cache_dir": "cache_webgpu_int4", + "no_artifacts": true +} diff --git a/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_int4_with_eval.json b/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_int4_with_eval.json new file mode 100644 index 000000000..e66fa5fcb --- /dev/null +++ b/tencent-HY-MT1.5-1.8B/webgpu/tencent-HY-MT1.5-1.8B_webgpu_int4_with_eval.json @@ -0,0 +1,118 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "tencent/HY-MT1.5-1.8B", +
"load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 4, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 4 + }, + "model.embed_tokens": { + "bits": 4 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model_webgpu_int4", + "cache_dir": "cache_webgpu_int4", + "evaluators": { + "zh_en_translation": { + "type": "LMEvaluator", + "tasks": [ + { + "task": "wmt18-zh-en", + "dataset_path": "wmt/wmt18", + "dataset_name": "zh-en", + "output_type": "generate_until", + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "validation", + "test_split": "test", + "doc_to_text": "Translate the following segment into English, without additional explanation.\n\n{{translation[\"zh\"]}}\nEnglish translation:", + "doc_to_target": "{{translation[\"en\"]}}", + "metric_list": [ + { + "metric": "bleu", + "aggregation": "bleu", + "higher_is_better": true + }, + { + "metric": "ter", + "aggregation": "ter", + "higher_is_better": false + }, + { + "metric": "chrf", + "aggregation": "chrf", + "higher_is_better": true + } + ], + "generation_kwargs": { + "until": [ + "\n" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "metadata": { + "version": 1.0 + } + } + ], + "model_class": "ortgenai", + "batch_size": 1, + "bootstrap_iters": 0, + "max_length": 2048, + "provider_options": { + "enable_skip_layer_norm_strict_mode": "1" + } + } + }, + "evaluator": "zh_en_translation", + 
"evaluate_input_model": false +}