15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,21 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Versioning.

Nothing yet.

## [0.3.0] - 2025-01-10
### Added
- Hugging Face dataset integration utilities in `examples/nano_tool_calling_v1/`
- `dataset_to_tools()` function to load tools from Hugging Face datasets
- `validate_json_schema()` for OpenAI tool schema validation with recursive array type checking
- `push_to_hf.py` script for uploading generated datasets to Hugging Face Hub
- Complete example workflow for Nano Tool Calling v1 dataset generation
- Configuration, generation, validation, and publishing pipeline
- Analysis utilities for function inspection
- Comprehensive README with dataset card format

### Changed
- Enhanced batch sampling progress bar display for better user feedback
- Improved parallel processing record ordering and ID assignment

## [0.2.2] - 2025-01-09
### Changed
- Records are now written to JSONL file immediately as they complete in parallel mode, rather than waiting for all generation to finish
132 changes: 132 additions & 0 deletions examples/nano_tool_calling_v1/README.md
@@ -0,0 +1,132 @@
---
license: mit
task_categories:
- text-generation
language:
- en
tags:
- function-calling
- tool-calling
- synthetic
- openai
size_categories:
- n<1K
---

# Nano Tool Calling v1

A synthetic tool-calling dataset generated using [ToolsGen](https://github.com/atasoglu/toolsgen) with GPT-4.1-nano models.

## Dataset Details

- **Generated with**: ToolsGen v0.1.0
- **Source Tools**: [argilla-warehouse/python-seed-tools](https://huggingface.co/datasets/argilla-warehouse/python-seed-tools)
- **Total Samples**: 989
- **Language**: English
- **Format**: Single-turn conversations with tool calls

### Models Used

- **Problem Generator**: gpt-4.1-nano (temp=1.0)
- **Tool Caller**: gpt-4.1-nano (temp=0.0)
- **Judge**: gpt-4.1-mini (temp=0.0)

## Dataset Structure

Each record contains:

```json
{
  "id": "record_000000",
  "language": "english",
  "tools": [...],
  "messages": [
    {"role": "user", "content": "..."}
  ],
  "assistant_calls": [
    {
      "id": "call_...",
      "type": "function",
      "function": {
        "name": "function_name",
        "arguments": "{...}"
      }
    }
  ],
  "problem_metadata": {...},
  "judge": {
    "tool_relevance": 0.4,
    "argument_quality": 0.38,
    "clarity": 0.2,
    "score": 0.98,
    "verdict": "accept",
    "rationale": "...",
    "rubric_version": "0.1.0",
    "model": "gpt-4.1-mini",
    "temperature": 0.0
  },
  "quality_tags": [],
  "tools_metadata": {"num_tools": 2}
}
```

## Generation Details

### Configuration

- **Strategy**: Random tool sampling
- **Tools per sample**: 1-4 (k_min=1, k_max=4)
- **Parallel workers**: 16
- **Worker batch size**: 16
- **Max attempts**: 3
- **Seed**: 42

### Quality Control

All samples passed through an LLM-as-a-judge evaluation with a multi-dimensional rubric:

- **Tool Relevance** (40%): Are the selected tools appropriate?
- **Argument Quality** (38%): Are arguments valid and plausible?
- **Clarity** (20%): Is the response complete and clear?

Samples with `score >= 0.7` and `verdict == "accept"` are included.
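The acceptance filter can be sketched as a small Python check. Note the per-dimension values in the sample record above sum to the final `score`, which suggests they are stored as weighted contributions; the exact aggregation inside the judge is an assumption here:

```python
# Rubric weights as stated above (they sum to 0.98, matching the sample score
# when every dimension is perfect).
WEIGHTS = {"tool_relevance": 0.40, "argument_quality": 0.38, "clarity": 0.20}


def passes_quality_gate(judge: dict) -> bool:
    """Apply the acceptance filter described above."""
    return judge["score"] >= 0.7 and judge["verdict"] == "accept"


# Judge fields from the sample record in "Dataset Structure":
judge = {
    "tool_relevance": 0.4,
    "argument_quality": 0.38,
    "clarity": 0.2,
    "score": 0.98,
    "verdict": "accept",
}

# Assumption: the stored dimension values are already weighted contributions,
# so the final score is their sum.
assert abs(sum(judge[k] for k in WEIGHTS) - judge["score"]) < 1e-9
print(passes_quality_gate(judge))  # True
```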

## Usage

```python
from datasets import load_dataset

dataset = load_dataset("atasoglu/nano-tool-calling-v1")

# Access a sample
sample = dataset["train"][0]
print(sample["messages"])
print(sample["assistant_calls"])
```
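The nested columns (`tools`, `messages`, `assistant_calls`, and the metadata fields) are serialized to JSON strings by `push_to_hf.py` to avoid schema issues, so they need to be decoded before use. A minimal sketch (the `decode_record` helper and its column list are illustrative, not part of the library):

```python
import json

# Columns that push_to_hf.py serializes as JSON strings before upload.
JSON_COLUMNS = {
    "tools", "messages", "assistant_calls",
    "problem_metadata", "judge", "quality_tags", "tools_metadata",
}


def decode_record(sample: dict) -> dict:
    """Parse the JSON-string columns of a sample back into Python objects."""
    return {
        key: json.loads(value) if key in JSON_COLUMNS else value
        for key, value in sample.items()
    }


# Example with the message shape from "Dataset Structure":
sample = {"id": "record_000000", "messages": '[{"role": "user", "content": "..."}]'}
decoded = decode_record(sample)
print(decoded["messages"][0]["role"])  # user
```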

## Source Tools

The dataset uses 38,420 Python function definitions from the [python-seed-tools](https://huggingface.co/datasets/argilla-warehouse/python-seed-tools) dataset, covering diverse programming tasks and domains.

## Limitations

- Single-turn conversations only
- English language only
- Synthetic data generated by LLMs (may contain artifacts)
- No actual tool execution or validation
- Judge scores are model-based assessments

## Citation

```bibtex
@software{toolsgen2025,
  title  = {ToolsGen: Synthetic Tool-Calling Dataset Generator},
  author = {Ataşoğlu, Ahmet},
  year   = {2025},
  url    = {https://github.com/atasoglu/toolsgen}
}
```

## License

MIT License
15 changes: 15 additions & 0 deletions examples/nano_tool_calling_v1/analyze_functions.py
@@ -0,0 +1,15 @@
"""Count how often each tool function appears across the generated records."""

import json
from collections import Counter

function_counts = Counter()

with open("output/train.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        for tool in record.get("tools", []):
            func_name = tool.get("function", {}).get("name")
            if func_name:
                function_counts[func_name] += 1

# Print functions from most to least frequent
for func, count in function_counts.most_common():
    print(f"{func}: {count}")
33 changes: 33 additions & 0 deletions examples/nano_tool_calling_v1/config.py
@@ -0,0 +1,33 @@
from toolsgen import (
    GenerationConfig,
    ModelConfig,
    RoleBasedModelConfig,
)

gen_config = GenerationConfig(
    num_samples=1_000,
    strategy="random",
    seed=42,
    language="english",
    max_attempts=3,
    k_min=1,
    k_max=4,
    shuffle_tools=True,
    num_workers=16,
    worker_batch_size=16,
)

role_config = RoleBasedModelConfig(
    problem_generator=ModelConfig(
        model="gpt-4.1-nano",
        temperature=1.0,
    ),
    tool_caller=ModelConfig(
        model="gpt-4.1-nano",
        temperature=0,
    ),
    judge=ModelConfig(
        model="gpt-4.1-mini",
        temperature=0,
    ),
)
39 changes: 39 additions & 0 deletions examples/nano_tool_calling_v1/example.py
@@ -0,0 +1,39 @@
"""
Hugging Face example: generate a tool-calling dataset from tools hosted on the Hub.

Install the `datasets` library with `pip install datasets` and set the dataset id.
"""

from pathlib import Path

from dotenv import load_dotenv

from config import gen_config, role_config
from toolsgen import generate_dataset
from utils import dataset_to_tools

# Load environment variables from .env file
load_dotenv()


def main() -> None:
    # Load tool definitions from a Hugging Face dataset
    dataset_id = "argilla-warehouse/python-seed-tools"
    tools = dataset_to_tools(dataset_id, dataset_kwargs={"split": "train"})
    output_dir = Path(__file__).parent / "output"

    # Generate the dataset
    manifest = generate_dataset(output_dir, gen_config, role_config, tools=tools)

    # Print a summary
    print(
        f"\n✓ Generated {manifest['num_generated']}/{manifest['num_requested']} records"
    )
    if manifest["num_failed"] > 0:
        print(f"  Failed: {manifest['num_failed']} attempts")
    print(f"  Problem Generator: {role_config.problem_generator.model}")
    print(f"  Tool Caller: {role_config.tool_caller.model}")
    print(f"  Judge: {role_config.judge.model}")
    print(f"  Output: {output_dir}")


if __name__ == "__main__":
    main()
64 changes: 64 additions & 0 deletions examples/nano_tool_calling_v1/push_to_hf.py
@@ -0,0 +1,64 @@
import json
import os
from pathlib import Path

from datasets import Dataset, DatasetDict
from dotenv import load_dotenv
from huggingface_hub import DatasetCard

# Nested columns serialized to JSON strings to avoid schema issues on the Hub
JSON_COLUMNS = (
    "tools",
    "messages",
    "assistant_calls",
    "problem_metadata",
    "judge",
    "quality_tags",
    "tools_metadata",
)


def load_jsonl(path: Path) -> list[dict]:
    with path.open(encoding="utf-8") as f:
        return [json.loads(line) for line in f]


def stringify_columns(records: list[dict]) -> None:
    """Serialize nested fields in place so all rows share a flat schema."""
    for record in records:
        for column in JSON_COLUMNS:
            record[column] = json.dumps(record[column])


def push_to_hub(
    dataset_path: Path,
    repo_id: str,
    token: str | None = None,
    private: bool = False,
    readme_path: Path | None = None,
):
    train = load_jsonl(dataset_path / "train.jsonl")
    stringify_columns(train)
    dataset = Dataset.from_list(train)

    val_path = dataset_path / "val.jsonl"
    if val_path.exists():
        val = load_jsonl(val_path)
        stringify_columns(val)
        dataset = DatasetDict({"train": dataset, "validation": Dataset.from_list(val)})

    dataset.push_to_hub(repo_id, token=token, private=private)

    if readme_path and readme_path.exists():
        card = DatasetCard(readme_path.read_text(encoding="utf-8"))
        card.push_to_hub(repo_id, token=token)

    print(f"✓ Pushed to https://huggingface.co/datasets/{repo_id}")


if __name__ == "__main__":
    load_dotenv()
    base_path = Path(__file__).parent
    push_to_hub(
        base_path / "output",
        "atasoglu/nano-tool-calling-v1",
        os.getenv("HF_TOKEN"),
        readme_path=base_path / "README.md",
    )
45 changes: 45 additions & 0 deletions examples/nano_tool_calling_v1/utils.py
@@ -0,0 +1,45 @@
import json
from typing import List, Optional

from datasets import load_dataset

from toolsgen import (
    ToolFunction,
    ToolSpec,
)

from validation import validate_json_schema


def dataset_to_tools(
    dataset_id: str, dataset_kwargs: Optional[dict] = None
) -> List[ToolSpec]:
    """Load tools from a Hugging Face dataset.

    Args:
        dataset_id (str): The Hugging Face dataset identifier.
        dataset_kwargs (Optional[dict]): Additional arguments for loading the dataset.

    Returns:
        List[ToolSpec]: A list of ToolSpec objects.
    """
    dataset = load_dataset(dataset_id, **(dataset_kwargs or {}))
    # Each dataset row contains a JSON-encoded list of tools in OpenAI format.
    # Flatten the nested lists: [[tool1], [tool2, tool3], ...] -> [tool1, tool2, tool3, ...]
    all_tools = []
    for tools_json in dataset["tools"]:
        all_tools.extend(json.loads(tools_json))

    # Convert from OpenAI format to ToolSpec, keeping only tools whose schema validates.
    # OpenAI format: {"type": "function", "function": {"name": ..., "description": ..., "parameters": ...}}
    return [
        ToolSpec(
            function=ToolFunction(
                name=tool["function"]["name"],
                description=tool["function"]["description"],
                parameters=tool["function"]["parameters"],
            )
        )
        for tool in all_tools
        if validate_json_schema(tool)
    ]