diff --git a/CHANGELOG.md b/CHANGELOG.md index 28028aa..ecb55ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,21 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve Nothing yet. +## [0.3.0] - 2025-01-10 +### Added +- Hugging Face dataset integration utilities in `examples/nano_tool_calling_v1/` + - `dataset_to_tools()` function to load tools from Hugging Face datasets + - `validate_json_schema()` for OpenAI tool schema validation with recursive array type checking + - `push_to_hf.py` script for uploading generated datasets to Hugging Face Hub +- Complete example workflow for Nano Tool Calling v1 dataset generation + - Configuration, generation, validation, and publishing pipeline + - Analysis utilities for function inspection + - Comprehensive README with dataset card format + +### Changed +- Enhanced batch sampling progress bar display for better user feedback +- Improved parallel processing record ordering and ID assignment + ## [0.2.2] - 2025-01-09 ### Changed - Records are now written to JSONL file immediately as they complete in parallel mode, rather than waiting for all generation to finish diff --git a/examples/nano_tool_calling_v1/README.md b/examples/nano_tool_calling_v1/README.md new file mode 100644 index 0000000..f62b642 --- /dev/null +++ b/examples/nano_tool_calling_v1/README.md @@ -0,0 +1,132 @@ +--- +license: mit +task_categories: +- text-generation +language: +- en +tags: +- function-calling +- tool-calling +- synthetic +- openai +size_categories: +- n<1K +--- + +# Nano Tool Calling v1 + +A synthetic tool-calling dataset generated using [ToolsGen](https://github.com/atasoglu/toolsgen) with GPT-4.1-nano models. 
+ +## Dataset Details + +- **Generated with**: ToolsGen v0.1.0 +- **Source Tools**: [argilla-warehouse/python-seed-tools](https://huggingface.co/datasets/argilla-warehouse/python-seed-tools) +- **Total Samples**: 989 +- **Language**: English +- **Format**: Single-turn conversations with tool calls + +### Models Used + +- **Problem Generator**: gpt-4.1-nano (temp=1.0) +- **Tool Caller**: gpt-4.1-nano (temp=0.0) +- **Judge**: gpt-4.1-mini (temp=0.0) + +## Dataset Structure + +Each record contains: + +```json +{ + "id": "record_000000", + "language": "english", + "tools": [...], + "messages": [ + {"role": "user", "content": "..."} + ], + "assistant_calls": [ + { + "id": "call_...", + "type": "function", + "function": { + "name": "function_name", + "arguments": "{...}" + } + } + ], + "problem_metadata": {...}, + "judge": { + "tool_relevance": 0.4, + "argument_quality": 0.38, + "clarity": 0.2, + "score": 0.98, + "verdict": "accept", + "rationale": "...", + "rubric_version": "0.1.0", + "model": "gpt-4.1-mini", + "temperature": 0.0 + }, + "quality_tags": [], + "tools_metadata": {"num_tools": 2} +} +``` + +## Generation Details + +### Configuration + +- **Strategy**: Random tool sampling +- **Tools per sample**: 1-4 (k_min=1, k_max=4) +- **Parallel workers**: 16 +- **Worker batch size**: 16 +- **Max attempts**: 3 +- **Seed**: 42 + +### Quality Control + +All samples passed through an LLM-as-a-judge evaluation with a multi-dimensional rubric: + +- **Tool Relevance** (40%): Are the selected tools appropriate? +- **Argument Quality** (40%): Are arguments valid and plausible? +- **Clarity** (20%): Is the response complete and clear? + +Samples with `score >= 0.7` and `verdict == "accept"` are included. 
+ +## Usage + +```python +from datasets import load_dataset + +dataset = load_dataset("atasoglu/nano-tool-calling-v1") + +# Access a sample +sample = dataset["train"][0] +print(sample["messages"]) +print(sample["assistant_calls"]) +``` + +## Source Tools + +The dataset uses 38,420 Python function definitions from the [python-seed-tools](https://huggingface.co/datasets/argilla-warehouse/python-seed-tools) dataset, covering diverse programming tasks and domains. + +## Limitations + +- Single-turn conversations only +- English language only +- Synthetic data generated by LLMs (may contain artifacts) +- No actual tool execution or validation +- Judge scores are model-based assessments + +## Citation + +```bibtex +@software{toolsgen2025, + title = {ToolsGen: Synthetic Tool-Calling Dataset Generator}, + author = {Ataşoğlu, Ahmet}, + year = {2025}, + url = {https://github.com/atasoglu/toolsgen} +} +``` + +## License + +MIT License diff --git a/examples/nano_tool_calling_v1/analyze_functions.py b/examples/nano_tool_calling_v1/analyze_functions.py new file mode 100644 index 0000000..ad38ea9 --- /dev/null +++ b/examples/nano_tool_calling_v1/analyze_functions.py @@ -0,0 +1,15 @@ +import json +from collections import Counter + +with open("output/train.jsonl", "r", encoding="utf-8") as f: + function_counts = Counter() + + for line in f: + record = json.loads(line) + for tool in record.get("tools", []): + func_name = tool.get("function", {}).get("name") + if func_name: + function_counts[func_name] += 1 + +for func, count in function_counts.most_common(): + print(f"{func}: {count}") diff --git a/examples/nano_tool_calling_v1/config.py b/examples/nano_tool_calling_v1/config.py new file mode 100644 index 0000000..babd9bb --- /dev/null +++ b/examples/nano_tool_calling_v1/config.py @@ -0,0 +1,33 @@ +from toolsgen import ( + GenerationConfig, + ModelConfig, + RoleBasedModelConfig, +) + +gen_config = GenerationConfig( + num_samples=1_000, + strategy="random", + seed=42, + 
language="english", + max_attempts=3, + k_min=1, + k_max=4, + shuffle_tools=True, + num_workers=16, + worker_batch_size=16, +) + +role_config = RoleBasedModelConfig( + problem_generator=ModelConfig( + model="gpt-4.1-nano", + temperature=1.0, + ), + tool_caller=ModelConfig( + model="gpt-4.1-nano", + temperature=0, + ), + judge=ModelConfig( + model="gpt-4.1-mini", + temperature=0, + ), +) diff --git a/examples/nano_tool_calling_v1/example.py b/examples/nano_tool_calling_v1/example.py new file mode 100644 index 0000000..6fd4d22 --- /dev/null +++ b/examples/nano_tool_calling_v1/example.py @@ -0,0 +1,39 @@ +""" +Hugging Face example - Using a dataset from Hugging Face + +Install datasets library by using `pip install datasets` and set the dataset id. +""" + +from pathlib import Path +from dotenv import load_dotenv +from utils import dataset_to_tools +from config import gen_config, role_config +from toolsgen import generate_dataset + +# Load environment variables from .env file +load_dotenv() + + +def main() -> None: + # Load dataset from Hugging Face + dataset_id = "argilla-warehouse/python-seed-tools" + tools = dataset_to_tools(dataset_id, dataset_kwargs={"split": "train"}) + output_dir = Path(__file__).parent / "output" + + # Generate dataset + manifest = generate_dataset(output_dir, gen_config, role_config, tools=tools) + + # Print summary + print( + f"\n✓ Generated {manifest['num_generated']}/{manifest['num_requested']} records" + ) + if manifest["num_failed"] > 0: + print(f" Failed: {manifest['num_failed']} attempts") + print(f" Problem Generator: {role_config.problem_generator.model}") + print(f" Tool Caller: {role_config.tool_caller.model}") + print(f" Judge: {role_config.judge.model}") + print(f" Output: {output_dir}") + + +if __name__ == "__main__": + main() diff --git a/examples/nano_tool_calling_v1/push_to_hf.py b/examples/nano_tool_calling_v1/push_to_hf.py new file mode 100644 index 0000000..aa9486e --- /dev/null +++ 
b/examples/nano_tool_calling_v1/push_to_hf.py @@ -0,0 +1,64 @@ +import json +import os +from pathlib import Path +from datasets import Dataset, DatasetDict +from huggingface_hub import DatasetCard +from dotenv import load_dotenv + + +def load_jsonl(path: Path) -> list[dict]: + return [json.loads(line) for line in open(path, encoding="utf-8")] + + +def push_to_hub( + dataset_path: Path, + repo_id: str, + token: str | None = None, + private: bool = False, + readme_path: Path | None = None, +): + train = load_jsonl(dataset_path / "train.jsonl") + val_path = dataset_path / "val.jsonl" + + # Convert to JSON strings to avoid schema issues + for record in train: + record["tools"] = json.dumps(record["tools"]) + record["messages"] = json.dumps(record["messages"]) + record["assistant_calls"] = json.dumps(record["assistant_calls"]) + record["problem_metadata"] = json.dumps(record["problem_metadata"]) + record["judge"] = json.dumps(record["judge"]) + record["quality_tags"] = json.dumps(record["quality_tags"]) + record["tools_metadata"] = json.dumps(record["tools_metadata"]) + + dataset = Dataset.from_list(train) + + if val_path.exists(): + val = load_jsonl(val_path) + for record in val: + record["tools"] = json.dumps(record["tools"]) + record["messages"] = json.dumps(record["messages"]) + record["assistant_calls"] = json.dumps(record["assistant_calls"]) + record["problem_metadata"] = json.dumps(record["problem_metadata"]) + record["judge"] = json.dumps(record["judge"]) + record["quality_tags"] = json.dumps(record["quality_tags"]) + record["tools_metadata"] = json.dumps(record["tools_metadata"]) + dataset = DatasetDict({"train": dataset, "validation": Dataset.from_list(val)}) + + dataset.push_to_hub(repo_id, token=token, private=private) + + if readme_path and readme_path.exists(): + card = DatasetCard(open(readme_path, encoding="utf-8").read()) + card.push_to_hub(repo_id, token=token) + + print(f"✓ Pushed to https://huggingface.co/datasets/{repo_id}") + + +if __name__ == 
"__main__": + load_dotenv() + base_path = Path(__file__).parent + push_to_hub( + base_path / "output", + "atasoglu/nano-tool-calling-v1", + os.getenv("HF_TOKEN"), + readme_path=base_path / "README.md", + ) diff --git a/examples/nano_tool_calling_v1/utils.py b/examples/nano_tool_calling_v1/utils.py new file mode 100644 index 0000000..22b8ed7 --- /dev/null +++ b/examples/nano_tool_calling_v1/utils.py @@ -0,0 +1,45 @@ +import json +from typing import List, Optional + +from datasets import load_dataset + +from toolsgen import ( + ToolFunction, + ToolSpec, +) + +from validation import validate_json_schema + + +def dataset_to_tools( + dataset_id: str, dataset_kwargs: Optional[dict] = None +) -> List[ToolSpec]: + """Load tools from a Hugging Face dataset. + + Args: + dataset_id (str): The Hugging Face dataset identifier. + dataset_kwargs (Optional[dict]): Additional arguments for loading the dataset. + Returns: + List[ToolSpec]: A list of ToolSpec objects. + """ + dataset = load_dataset(dataset_id, **(dataset_kwargs or {})) + # Each dataset row contains a list of tools in OpenAI format + # Flatten the nested lists: [[tool1], [tool2, tool3], ...] -> [tool1, tool2, tool3, ...] 
+ all_tools = [] + for tools_json in dataset["tools"]: + tools_list = json.loads(tools_json) + all_tools.extend(tools_list) + + # Convert from OpenAI format to ToolSpec + # OpenAI format: {'type': 'function', 'function': {'name': ..., 'description': ..., 'parameters': ...}} + return [ + ToolSpec( + function=ToolFunction( + name=tool["function"]["name"], + description=tool["function"]["description"], + parameters=tool["function"]["parameters"], + ) + ) + for tool in all_tools + if validate_json_schema(tool) + ] diff --git a/examples/nano_tool_calling_v1/validation.py b/examples/nano_tool_calling_v1/validation.py new file mode 100644 index 0000000..8261d15 --- /dev/null +++ b/examples/nano_tool_calling_v1/validation.py @@ -0,0 +1,108 @@ +from typing import Any + + +def _validate_schema_recursively(schema: dict[str, Any]) -> bool: + """Recursively validate JSON Schema properties. + + OpenAI requires that array types must have 'items' field defined. + """ + if not isinstance(schema, dict): + return True + + # Check if this is an array type + schema_type = schema.get("type") + if schema_type == "array": + # Array must have items field + if "items" not in schema: + return False + # Recursively validate items + if not _validate_schema_recursively(schema.get("items", {})): + return False + elif isinstance(schema_type, list) and "array" in schema_type: + # Handle union types like ["array", "null"] + if "items" not in schema: + return False + if not _validate_schema_recursively(schema.get("items", {})): + return False + + # Check properties recursively + if "properties" in schema: + properties = schema.get("properties", {}) + if isinstance(properties, dict): + for prop_schema in properties.values(): + if not _validate_schema_recursively(prop_schema): + return False + + # Check items recursively (for nested arrays) + if "items" in schema: + items = schema.get("items") + if isinstance(items, dict): + if not _validate_schema_recursively(items): + return False + + # Check 
additionalProperties if it's a schema + if "additionalProperties" in schema: + add_props = schema.get("additionalProperties") + if isinstance(add_props, dict): + if not _validate_schema_recursively(add_props): + return False + + return True + + +def validate_json_schema(tool: dict[str, Any]) -> bool: + """Validate OpenAI tool schema format. + + Expected format: + + ```json + { + "type": "function", + "function": { + "name": "function_name", + "description": "function description", + "parameters": {"type": "object", "properties": {...}} + } + } + ``` + Also validates that all array types have 'items' field defined. + """ + try: + # Check top-level structure + if not isinstance(tool, dict): + return False + + if tool.get("type") != "function": + return False + + function = tool.get("function") + if not isinstance(function, dict): + return False + + # Check required function fields + if "name" not in function or not isinstance(function["name"], str): + return False + + if "description" not in function or not isinstance( + function["description"], str + ): + return False + + # Parameters are optional, but if present must be a dict + if "parameters" in function: + params = function["parameters"] + if not isinstance(params, dict): + return False + + # If parameters exist, should have type: object + if params.get("type") != "object": + return False + + # Recursively validate the schema for array types + if not _validate_schema_recursively(params): + return False + + return True + + except Exception: + return False diff --git a/pyproject.toml b/pyproject.toml index d0d9b93..502c02d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ toolsgen = ["prompts/*.txt"] [project] name = "toolsgen" -version = "0.2.2" +version = "0.3.0" description = "Generate tool-calling datasets from OpenAI-compatible tool specs" readme = "README.md" requires-python = ">=3.9" diff --git a/src/toolsgen/core/generator.py b/src/toolsgen/core/generator.py index 911ddce..171f0a6 100644 
--- a/src/toolsgen/core/generator.py +++ b/src/toolsgen/core/generator.py @@ -58,6 +58,10 @@ def _split_records( "val": shuffled[split_idx:], } + temp_train = output_dir / "train.jsonl" + if temp_train.exists(): + temp_train.unlink() + for split_name, split_records in splits.items(): if split_records: split_path = output_dir / f"{split_name}.jsonl" diff --git a/src/toolsgen/core/parallel.py b/src/toolsgen/core/parallel.py index 3ad6017..da8c00e 100644 --- a/src/toolsgen/core/parallel.py +++ b/src/toolsgen/core/parallel.py @@ -132,6 +132,8 @@ def generate_records_parallel( return [], 0 results_by_index: Dict[int, Record] = {} + failed_indices: set[int] = set() + written_records: List[Record] = [] failed = 0 next_id_to_write = 0 @@ -157,14 +159,8 @@ def generate_records_parallel( if sample_result.record: record = Record.model_validate(sample_result.record) results_by_index[sample_result.sample_index] = record - - while next_id_to_write in results_by_index: - rec = results_by_index[next_id_to_write] - rec.id = f"record_{next_id_to_write:06d}" - append_record_jsonl(rec, jsonl_path) - del results_by_index[next_id_to_write] - next_id_to_write += 1 else: + failed_indices.add(sample_result.sample_index) tqdm.write( "Warning: Failed to generate sample " f"{sample_result.sample_index} after {gen_config.max_attempts} attempts" @@ -175,7 +171,18 @@ def generate_records_parallel( ) ) + while ( + next_id_to_write in results_by_index + or next_id_to_write in failed_indices + ): + if next_id_to_write in results_by_index: + rec = results_by_index[next_id_to_write] + rec.id = f"record_{next_id_to_write:06d}" + append_record_jsonl(rec, jsonl_path) + written_records.append(rec) + del results_by_index[next_id_to_write] + next_id_to_write += 1 + pbar.update(1) - all_records = [results_by_index[i] for i in sorted(results_by_index.keys())] - return all_records, failed + return written_records, failed diff --git a/src/toolsgen/sampling/batch.py b/src/toolsgen/sampling/batch.py index 
f71d939..0731528 100644 --- a/src/toolsgen/sampling/batch.py +++ b/src/toolsgen/sampling/batch.py @@ -1,6 +1,8 @@ import random from typing import Callable, List, Optional, Sequence +from tqdm import tqdm + from .param_aware import sample_param_aware_subset from .random import sample_random_subset from .semantic import sample_semantic_subset @@ -84,7 +86,7 @@ def batched_subsets( using_chunks = batch_size is not None and batch_size > 0 subsets: List[List[ToolSpec]] = [] - for i in range(total): + for i in tqdm(range(total), desc="Preparing tool subsets", total=total): batch = batches[i % len(batches)] if using_chunks: k = len(batch) diff --git a/uv.lock b/uv.lock index 252874b..3ed0922 100644 --- a/uv.lock +++ b/uv.lock @@ -396,7 +396,7 @@ wheels = [ [[package]] name = "toolsgen" -version = "0.1.3" +version = "0.3.0" source = { editable = "." } dependencies = [ { name = "openai" },