From b0b14233567d7cd99d03799d9358483686256cc1 Mon Sep 17 00:00:00 2001 From: GovIndLok <97396655+GovIndLok@users.noreply.github.com> Date: Mon, 25 May 2026 20:46:08 +0530 Subject: [PATCH 1/4] fix: update dataset path for Rotten Tomatoes in load_and_inspect and stream_dataset functions --- .../09-data-management/code/data_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py index 25f9dd037..6b746dc6f 100644 --- a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py +++ b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py @@ -159,10 +159,10 @@ def fingerprint(ds, num_rows: int = 100): print("=" * 60) print("\n--- 1. Load and inspect a dataset ---") - ds = load_and_inspect("rotten_tomatoes", split="train") + ds = load_and_inspect("cornell-movie-review-data/rotten_tomatoes", split="train") print("\n--- 2. Stream a dataset ---") - rows = stream_dataset("rotten_tomatoes", max_rows=3) + rows = stream_dataset("cornell-movie-review-data/rotten_tomatoes", max_rows=3) for row in rows: print(f" {row['text'][:80]}...") From 26ffdb52c9eb716deedbf994f8ef99c0d43a3c83 Mon Sep 17 00:00:00 2001 From: GovIndLok <97396655+GovIndLok@users.noreply.github.com> Date: Mon, 25 May 2026 21:31:01 +0530 Subject: [PATCH 2/4] fix: remove unused os import from data_utils --- .../00-setup-and-tooling/09-data-management/code/data_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py index 6b746dc6f..5df6f045e 100644 --- a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py +++ b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py @@ -1,4 +1,3 @@ -import os import sys import json import hashlib From 01e177c0b39ef2bb139f2666d57396279c5ccf3a 
Mon Sep 17 00:00:00 2001 From: GovIndLok <97396655+GovIndLok@users.noreply.github.com> Date: Tue, 26 May 2026 03:26:14 +0530 Subject: [PATCH 3/4] fix: update Hugging Face IDs for dataset recommendations in prompt-data-helper --- .../09-data-management/outputs/prompt-data-helper.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md index 8daf3ff31..427792351 100644 --- a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md +++ b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md @@ -33,15 +33,15 @@ Common task-to-dataset mapping: | Task | Starter Dataset | HF ID | |------|----------------|-------| -| Text classification | Rotten Tomatoes | `rotten_tomatoes` | -| Sentiment analysis | IMDB | `imdb` | +| Text classification | Rotten Tomatoes | `cornell-movie-review-data/rotten_tomatoes` | +| Sentiment analysis | IMDB | `stanfordnlp/imdb` | | Natural language inference | MNLI | `glue/mnli` | -| Question answering | SQuAD | `squad` | +| Question answering | SQuAD | `rajpurkar/squad` | | Summarization | CNN/DailyMail | `cnn_dailymail` | -| Translation | WMT | `wmt16` | -| Language modeling | WikiText | `wikitext` | +| Translation | WMT | `wmt/wmt16` | +| Language modeling | WikiText | `Salesforce/wikitext` | | Token classification | CoNLL-2003 | `conll2003` | -| Image classification | MNIST / CIFAR-10 | `mnist` / `cifar10` | +| Image classification | MNIST / CIFAR-10 | `ylecun/mnist` / `uoft-cs/cifar10` | | Object detection | COCO | `detection-datasets/coco` | When recommending, prefer smaller datasets for learning and prototyping. Suggest larger datasets only when the user is ready to train at scale. 
From 961aa8b280cb300d8c2b264bd7009923119806e3 Mon Sep 17 00:00:00 2001 From: GovIndLok <97396655+GovIndLok@users.noreply.github.com> Date: Tue, 26 May 2026 20:04:14 +0530 Subject: [PATCH 4/4] fix: update Hugging Face IDs and configurations in dataset recommendations --- .../09-data-management/outputs/prompt-data-helper.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md index 427792351..832b4234e 100644 --- a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md +++ b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md @@ -35,12 +35,12 @@ Common task-to-dataset mapping: |------|----------------|-------| | Text classification | Rotten Tomatoes | `cornell-movie-review-data/rotten_tomatoes` | | Sentiment analysis | IMDB | `stanfordnlp/imdb` | -| Natural language inference | MNLI | `glue/mnli` | +| Natural language inference | MNLI | `nyu-mll/glue` (config: `mnli`) | | Question answering | SQuAD | `rajpurkar/squad` | -| Summarization | CNN/DailyMail | `cnn_dailymail` | -| Translation | WMT | `wmt/wmt16` | +| Summarization | CNN/DailyMail | `abisee/cnn_dailymail` (config: `3.0.0`) | +| Translation | WMT | `wmt/wmt16` (config: `cs-en`) | | Language modeling | WikiText | `Salesforce/wikitext` | -| Token classification | CoNLL-2003 | `conll2003` | +| Token classification | CoNLL-2003 | `lhoestq/conll2003` | | Image classification | MNIST / CIFAR-10 | `ylecun/mnist` / `uoft-cs/cifar10` | | Object detection | COCO | `detection-datasets/coco` |