| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """ |
| | Tokenize Dataset Script: Prepare Tool Calling Dataset for Training |
| | |
| | This script tokenizes the nvidia/Nemotron-Agentic-v1 tool_calling dataset |
| | and uploads it to HuggingFace Hub for reuse. |
| | |
| | Usage: |
| | uv run tokenize_dataset.py |
| | |
| | Can run on CPU - no GPU required! |
| | """ |
| |
|
| | import os |
| | import json |
| | from datasets import load_dataset, Dataset |
| | from transformers import AutoTokenizer |
| | from huggingface_hub import hf_hub_download, HfApi, create_repo |
| |
|
| | |
| | |
| | |
| |
|
| | |
# Model repo whose tokenizer (and chat template) is loaded for tokenization.
BASE_MODEL = "Tesslate/Synthia-S1-27b"

# Source dataset repo on the Hub; the split name selects data/<split>.jsonl.
DATASET_NAME = "nvidia/Nemotron-Agentic-v1"
DATASET_SPLIT = "tool_calling"

# Destination dataset repo for the tokenized output, and whether it is private.
TOKENIZED_DATASET_REPO = "Codyfederer/synthia-tool-calling-tokenized"
TOKENIZED_DATASET_PRIVATE = True

# Truncation limit (in tokens) passed to the tokenizer as max_length.
MAX_SEQ_LENGTH = 4096
| |
|
| | |
| | |
| | |
| |
|
def tokenize_conversation(example, tokenizer, max_length):
    """
    Tokenize one conversation using the model's chat template.

    Args:
        example: Mapping with a "messages" key holding the list of
            {"role": ..., "content": ...} dicts to render.
        tokenizer: Object exposing ``apply_chat_template`` and ``__call__``
            (e.g. a HuggingFace tokenizer).
        max_length: Maximum number of tokens to keep; longer sequences are
            truncated by the tokenizer.

    Returns:
        The tokenizer's output mapping with an added "labels" entry that is
        an independent copy of "input_ids" (every token is a training target).
    """
    # Render the whole conversation to a single string; no generation prompt
    # is appended because the text is used for training, not inference.
    text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )

    # The chat template already emits the special tokens it needs (BOS etc.),
    # so the tokenizer must not add them a second time.
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
        add_special_tokens=False,
    )

    # Copy so that later in-place edits to labels never alias input_ids.
    tokenized["labels"] = list(tokenized["input_ids"])

    return tokenized
| |
|
| |
|
def _flatten_content(content):
    """Coerce a message's ``content`` field (str/list/dict/None/other) to a string."""
    if content is None:
        return ""
    if isinstance(content, list):
        parts = []
        for item in content:
            if isinstance(item, dict):
                # Prefer the "text" field of a content part; otherwise keep
                # the whole part as JSON so no information is dropped.
                parts.append(item["text"] if "text" in item else json.dumps(item))
            else:
                parts.append(str(item))
        return "\n".join(parts)
    if isinstance(content, dict):
        return json.dumps(content)
    return str(content)


def _normalize_messages(messages):
    """Reduce raw messages to role/content pairs, folding tool turns into user turns."""
    merged = []
    for msg in messages:
        role = msg.get("role", "user")
        content = _flatten_content(msg.get("content", ""))

        # Tool outputs are re-labelled as user turns with a marker prefix.
        if role == "tool":
            role = "user"
            content = f"[Tool Result]\n{content}"

        # Consecutive turns with the same role are merged into one message.
        if merged and merged[-1]["role"] == role:
            merged[-1]["content"] += f"\n\n{content}"
        else:
            merged.append({"role": role, "content": content})

    # Make sure the conversation opens with a user turn.
    if merged and merged[0]["role"] != "user":
        merged.insert(0, {"role": "user", "content": "[Start]"})
    return merged


def _load_examples(path):
    """Read a JSONL file and return (normalized examples, number of bad lines)."""
    examples = []
    skipped = 0
    with open(path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f):
            if line_num % 50000 == 0:
                print(f" Processed {line_num:,} lines...")

            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not parse errors.
                continue

            try:
                record = json.loads(line)
                messages = _normalize_messages(record.get("messages", []))
            except (ValueError, TypeError, AttributeError, KeyError) as e:
                # Malformed JSON or an unexpected record shape: skip the line,
                # reporting only the first few to keep the output readable.
                skipped += 1
                if skipped < 5:
                    print(f" Warning: Skipped line {line_num}: {e}")
                continue

            # Records without any messages are dropped.
            if messages:
                examples.append({"messages": messages})
    return examples, skipped


def main():
    """
    Download the source JSONL, normalize and tokenize it, and push it to the Hub.

    Steps: check the HF Hub login, load the tokenizer for BASE_MODEL, download
    and normalize data/<DATASET_SPLIT>.jsonl from DATASET_NAME, split it 98/2
    into train/test, tokenize both splits with tokenize_conversation, push
    them to TOKENIZED_DATASET_REPO, and finally stream one sample back as a
    sanity check. Returns None; progress and errors are reported via print.
    """
    print("=" * 60)
    print("Tokenize Dataset for Tool Calling Training")
    print("=" * 60)

    # Fail early if there is no Hub login, since the final step is an upload.
    from huggingface_hub import whoami
    try:
        username = whoami()["name"]
    except Exception as e:
        print(f"ERROR: Not logged in to HF Hub ({e})")
        print("Run 'huggingface-cli login' first")
        return
    print(f"Logged in as: {username}")

    # ------------------------------------------------------------------
    # Tokenizer
    # ------------------------------------------------------------------
    print(f"\nLoading tokenizer from {BASE_MODEL}...")

    # NOTE(review): trust_remote_code=True allows code from the model repo to
    # run locally; keep only if BASE_MODEL is trusted.
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
        padding_side="right",
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print(f"Vocab size: {len(tokenizer):,}")

    # ------------------------------------------------------------------
    # Dataset download and normalization
    # ------------------------------------------------------------------
    print(f"\nLoading dataset: {DATASET_NAME} ({DATASET_SPLIT} split)...")

    jsonl_file = f"data/{DATASET_SPLIT}.jsonl"
    print(f"Downloading {jsonl_file}...")

    local_path = hf_hub_download(
        repo_id=DATASET_NAME,
        filename=jsonl_file,
        repo_type="dataset",
    )
    print(f"Downloaded to: {local_path}")

    print("Loading and processing JSONL file...")
    processed_examples, skipped = _load_examples(local_path)
    print(f"Loaded {len(processed_examples):,} examples (skipped {skipped})")

    if not processed_examples:
        # train_test_split cannot split an empty dataset; stop with a clear message.
        print("ERROR: No usable examples were loaded; nothing to tokenize")
        return

    dataset = Dataset.from_list(processed_examples)
    print(f"Dataset size: {len(dataset):,} examples")

    # Fixed seed keeps the train/test partition reproducible across runs.
    split_dataset = dataset.train_test_split(test_size=0.02, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]

    print(f"Train samples: {len(train_dataset):,}")
    print(f"Eval samples: {len(eval_dataset):,}")

    # ------------------------------------------------------------------
    # Tokenization
    # ------------------------------------------------------------------
    print(f"\nTokenizing dataset with max_length={MAX_SEQ_LENGTH}...")
    print("This may take a while for large datasets...")

    def _tokenize(example):
        return tokenize_conversation(example, tokenizer, MAX_SEQ_LENGTH)

    train_dataset = train_dataset.map(
        _tokenize,
        remove_columns=["messages"],
        num_proc=1,
        desc="Tokenizing train",
    )

    eval_dataset = eval_dataset.map(
        _tokenize,
        remove_columns=["messages"],
        num_proc=1,
        desc="Tokenizing eval",
    )

    print("Tokenization complete!")
    print(f"Train dataset columns: {train_dataset.column_names}")
    print(f"Sample input_ids length: {len(train_dataset[0]['input_ids'])}")

    # ------------------------------------------------------------------
    # Upload
    # ------------------------------------------------------------------
    print(f"\nUploading TOKENIZED dataset to Hub: {TOKENIZED_DATASET_REPO}")

    api = HfApi()
    try:
        create_repo(
            TOKENIZED_DATASET_REPO,
            repo_type="dataset",
            private=TOKENIZED_DATASET_PRIVATE,
            exist_ok=True,
        )
        print(f" Created/verified repo (private={TOKENIZED_DATASET_PRIVATE})")
    except Exception as e:
        # Best effort: push_to_hub below will surface any real problem.
        print(f" Repo note: {e}")
    else:
        if TOKENIZED_DATASET_PRIVATE:
            # An already-existing repo keeps its old visibility, so try to
            # force it to private, and say so if that does not work.
            try:
                api.update_repo_visibility(
                    TOKENIZED_DATASET_REPO,
                    repo_type="dataset",
                    private=True,
                )
            except Exception as e:
                print(f" Warning: could not enforce private visibility: {e}")

    # Drop any output formatting so plain Python lists are uploaded.
    train_dataset.reset_format()
    eval_dataset.reset_format()

    first_ids = train_dataset[0]["input_ids"]
    print(" Verifying tokenized data...")
    print(f" Train columns: {train_dataset.column_names}")
    print(f" Sample input_ids type: {type(first_ids)}")
    print(f" Sample input_ids length: {len(first_ids)}")
    print(f" First 10 tokens: {first_ids[:10]}")

    print(f" Pushing train split ({len(train_dataset):,} examples)...")
    train_dataset.push_to_hub(
        TOKENIZED_DATASET_REPO,
        split="train",
    )

    print(f" Pushing test split ({len(eval_dataset):,} examples)...")
    eval_dataset.push_to_hub(
        TOKENIZED_DATASET_REPO,
        split="test",
    )

    print("\n" + "=" * 60)
    print("SUCCESS! Tokenized dataset saved to:")
    print(f" https://huggingface.co/datasets/{TOKENIZED_DATASET_REPO}")
    print("=" * 60)

    # ------------------------------------------------------------------
    # Post-upload sanity check (best effort; failure is only reported)
    # ------------------------------------------------------------------
    print("\nVerifying upload...")
    try:
        verify_ds = load_dataset(TOKENIZED_DATASET_REPO, split="train", streaming=True)
        sample = next(iter(verify_ds))
        if "input_ids" in sample:
            print(f" VERIFIED: Dataset contains input_ids with {len(sample['input_ids'])} tokens")
        else:
            print(f" WARNING: input_ids not found in columns: {list(sample.keys())}")
    except Exception as ve:
        print(f" Could not verify: {ve}")
| |
|
| |
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
| |
|