diff --git a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py index 25f9dd037..5df6f045e 100644 --- a/phases/00-setup-and-tooling/09-data-management/code/data_utils.py +++ b/phases/00-setup-and-tooling/09-data-management/code/data_utils.py @@ -1,4 +1,3 @@ -import os import sys import json import hashlib @@ -159,10 +158,10 @@ def fingerprint(ds, num_rows: int = 100): print("=" * 60) print("\n--- 1. Load and inspect a dataset ---") - ds = load_and_inspect("rotten_tomatoes", split="train") + ds = load_and_inspect("cornell-movie-review-data/rotten_tomatoes", split="train") print("\n--- 2. Stream a dataset ---") - rows = stream_dataset("rotten_tomatoes", max_rows=3) + rows = stream_dataset("cornell-movie-review-data/rotten_tomatoes", max_rows=3) for row in rows: print(f" {row['text'][:80]}...") diff --git a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md index 8daf3ff31..832b4234e 100644 --- a/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md +++ b/phases/00-setup-and-tooling/09-data-management/outputs/prompt-data-helper.md @@ -33,15 +33,15 @@ Common task-to-dataset mapping: | Task | Starter Dataset | HF ID | |------|----------------|-------| -| Text classification | Rotten Tomatoes | `rotten_tomatoes` | -| Sentiment analysis | IMDB | `imdb` | -| Natural language inference | MNLI | `glue/mnli` | -| Question answering | SQuAD | `squad` | -| Summarization | CNN/DailyMail | `cnn_dailymail` | -| Translation | WMT | `wmt16` | -| Language modeling | WikiText | `wikitext` | -| Token classification | CoNLL-2003 | `conll2003` | -| Image classification | MNIST / CIFAR-10 | `mnist` / `cifar10` | +| Text classification | Rotten Tomatoes | `cornell-movie-review-data/rotten_tomatoes` | +| Sentiment analysis | IMDB | `stanfordnlp/imdb` | +| Natural language inference | 
MNLI | `nyu-mll/glue` (config: `mnli`) | +| Question answering | SQuAD | `rajpurkar/squad` | +| Summarization | CNN/DailyMail | `abisee/cnn_dailymail` (config: `3.0.0`) | +| Translation | WMT | `wmt/wmt16` (config: `cs-en`) | +| Language modeling | WikiText | `Salesforce/wikitext` | +| Token classification | CoNLL-2003 | `lhoestq/conll2003` | +| Image classification | MNIST / CIFAR-10 | `ylecun/mnist` / `uoft-cs/cifar10` | | Object detection | COCO | `detection-datasets/coco` | When recommending, prefer smaller datasets for learning and prototyping. Suggest larger datasets only when the user is ready to train at scale.