From b0b14233567d7cd99d03799d9358483686256cc1 Mon Sep 17 00:00:00 2001
From: GovIndLok <97396655+GovIndLok@users.noreply.github.com>
Date: Mon, 25 May 2026 20:46:08 +0530
Subject: [PATCH 1/4] fix: update dataset path for Rotten Tomatoes in
 load_and_inspect and stream_dataset functions

---
 .../09-data-management/code/data_utils.py                     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py
index 25f9dd037..6b746dc6f 100644
--- a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py
+++ b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py
@@ -159,10 +159,10 @@ def fingerprint(ds, num_rows: int = 100):
     print("=" * 60)
 
     print("\n--- 1. Load and inspect a dataset ---")
-    ds = load_and_inspect("rotten_tomatoes", split="train")
+    ds = load_and_inspect("cornell-movie-review-data/rotten_tomatoes", split="train")
 
     print("\n--- 2. Stream a dataset ---")
-    rows = stream_dataset("rotten_tomatoes", max_rows=3)
+    rows = stream_dataset("cornell-movie-review-data/rotten_tomatoes", max_rows=3)
     for row in rows:
         print(f"  {row['text'][:80]}...")
 

From 26ffdb52c9eb716deedbf994f8ef99c0d43a3c83 Mon Sep 17 00:00:00 2001
From: GovIndLok <97396655+GovIndLok@users.noreply.github.com>
Date: Mon, 25 May 2026 21:31:01 +0530
Subject: [PATCH 2/4] chore: remove unused os import from data_utils

---
 .../00-setup-and-tooling/09-data-management/code/data_utils.py   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py
index 6b746dc6f..5df6f045e 100644
--- a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py
+++ b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py
@@ -1,4 +1,3 @@
-import os
 import sys
 import json
 import hashlib

From 01e177c0b39ef2bb139f2666d57396279c5ccf3a Mon Sep 17 00:00:00 2001
From: GovIndLok <97396655+GovIndLok@users.noreply.github.com>
Date: Tue, 26 May 2026 03:26:14 +0530
Subject: [PATCH 3/4] fix: update Hugging Face IDs for dataset recommendations
 in prompt-data-helper

---
 .../09-data-management/outputs/prompt-data-helper.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md
index 8daf3ff31..427792351 100644
--- a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md
+++ b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md
@@ -33,15 +33,15 @@ Common task-to-dataset mapping:
 
 | Task | Starter Dataset | HF ID |
 |------|----------------|-------|
-| Text classification | Rotten Tomatoes | `rotten_tomatoes` |
-| Sentiment analysis | IMDB | `imdb` |
+| Text classification | Rotten Tomatoes | `cornell-movie-review-data/rotten_tomatoes` |
+| Sentiment analysis | IMDB | `stanfordnlp/imdb` |
 | Natural language inference | MNLI | `glue/mnli` |
-| Question answering | SQuAD | `squad` |
+| Question answering | SQuAD | `rajpurkar/squad` |
 | Summarization | CNN/DailyMail | `cnn_dailymail` |
-| Translation | WMT | `wmt16` |
-| Language modeling | WikiText | `wikitext` |
+| Translation | WMT | `wmt/wmt16` |
+| Language modeling | WikiText | `Salesforce/wikitext` |
 | Token classification | CoNLL-2003 | `conll2003` |
-| Image classification | MNIST / CIFAR-10 | `mnist` / `cifar10` |
+| Image classification | MNIST / CIFAR-10 | `ylecun/mnist` / `uoft-cs/cifar10` |
 | Object detection | COCO | `detection-datasets/coco` |
 
 When recommending, prefer smaller datasets for learning and prototyping. Suggest larger datasets only when the user is ready to train at scale.

From 961aa8b280cb300d8c2b264bd7009923119806e3 Mon Sep 17 00:00:00 2001
From: GovIndLok <97396655+GovIndLok@users.noreply.github.com>
Date: Tue, 26 May 2026 20:04:14 +0530
Subject: [PATCH 4/4] fix: update Hugging Face IDs and configurations in
 dataset recommendations

---
 .../09-data-management/outputs/prompt-data-helper.md      | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md
index 427792351..832b4234e 100644
--- a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md
+++ b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md
@@ -35,12 +35,12 @@ Common task-to-dataset mapping:
 |------|----------------|-------|
 | Text classification | Rotten Tomatoes | `cornell-movie-review-data/rotten_tomatoes` |
 | Sentiment analysis | IMDB | `stanfordnlp/imdb` |
-| Natural language inference | MNLI | `glue/mnli` |
+| Natural language inference | MNLI | `nyu-mll/glue` (config: `mnli`) |
 | Question answering | SQuAD | `rajpurkar/squad` |
-| Summarization | CNN/DailyMail | `cnn_dailymail` |
-| Translation | WMT | `wmt/wmt16` |
+| Summarization | CNN/DailyMail | `abisee/cnn_dailymail` (config: `3.0.0`) |
+| Translation | WMT | `wmt/wmt16` (config: `cs-en`) |
 | Language modeling | WikiText | `Salesforce/wikitext` |
-| Token classification | CoNLL-2003 | `conll2003` |
+| Token classification | CoNLL-2003 | `lhoestq/conll2003` |
 | Image classification | MNIST / CIFAR-10 | `ylecun/mnist` / `uoft-cs/cifar10` |
 | Object detection | COCO | `detection-datasets/coco` |