From bc707850f2a19dc4ab387cddb57764e182b11abe Mon Sep 17 00:00:00 2001 From: atasoglu Date: Sun, 9 Nov 2025 01:06:28 +0300 Subject: [PATCH 1/5] feat: add Hugging Face dataset integration and example setup for tool calling generation - Introduces dataset_to_tools utility to load tools from Hugging Face datasets - Adds configuration, example script, and README for simple tool calling v1 - Enables parallel processing, schema validation, and dataset generation for tool calling tasks --- examples/simple_tool_calling_v1/README.md | 31 +++++ examples/simple_tool_calling_v1/config.py | 34 ++++++ examples/simple_tool_calling_v1/example.py | 31 +++++ examples/simple_tool_calling_v1/utils.py | 45 ++++++++ examples/simple_tool_calling_v1/validation.py | 108 ++++++++++++++++++ 5 files changed, 249 insertions(+) create mode 100644 examples/simple_tool_calling_v1/README.md create mode 100644 examples/simple_tool_calling_v1/config.py create mode 100644 examples/simple_tool_calling_v1/example.py create mode 100644 examples/simple_tool_calling_v1/utils.py create mode 100644 examples/simple_tool_calling_v1/validation.py diff --git a/examples/simple_tool_calling_v1/README.md b/examples/simple_tool_calling_v1/README.md new file mode 100644 index 0000000..68cac98 --- /dev/null +++ b/examples/simple_tool_calling_v1/README.md @@ -0,0 +1,31 @@ +# Simple Tool Calling v1 + +Generates 10K tool-calling samples from Hugging Face dataset with parallel processing and schema validation. + +## Setup + +```bash +pip install toolsgen datasets python-dotenv +echo "OPENAI_API_KEY=your-key-here" > .env +python example.py +``` + +## Configuration + +- **Dataset**: `argilla-warehouse/python-seed-tools` +- **Samples**: 10,000 (80% train / 20% val) +- **Parallel**: 8 workers × 16 batch size +- **Models**: GPT-5-mini (problem/caller), GPT-5 (judge) + +## Files + +- `example.py` - Main generation script +- `config.py` - Generation and model settings +- `utils.py` - HF dataset loader +- `validation.py` - Schema validator (ensures arrays have `items`) + +## Output + +- `output/train.jsonl` - Training set +- `output/val.jsonl` - Validation set +- `output/manifest.json` - Metadata diff --git a/examples/simple_tool_calling_v1/config.py b/examples/simple_tool_calling_v1/config.py new file mode 100644 index 0000000..1a89c04 --- /dev/null +++ b/examples/simple_tool_calling_v1/config.py @@ -0,0 +1,34 @@ +from toolsgen import ( + GenerationConfig, + ModelConfig, + RoleBasedModelConfig, +) + +gen_config = GenerationConfig( + num_samples=10_000, + strategy="random", + seed=42, + train_split=0.8, + language="english", + max_attempts=3, + k_min=2, + k_max=8, + shuffle_tools=True, + num_workers=8, + worker_batch_size=16, +) + +role_config = RoleBasedModelConfig( + problem_generator=ModelConfig( + model="gpt-5-mini", + temperature=1.0, + ), + tool_caller=ModelConfig( + model="gpt-5-mini", + temperature=0, + ), + judge=ModelConfig( + model="gpt-5", + temperature=0, + ), +) diff --git a/examples/simple_tool_calling_v1/example.py b/examples/simple_tool_calling_v1/example.py new file mode 100644 index 0000000..ee6a607 --- /dev/null +++ b/examples/simple_tool_calling_v1/example.py @@ -0,0 +1,31 @@ +""" +Hugging Face example - Using a dataset from Hugging Face + +Install datasets library by using `pip install datasets` and set the dataset id. +""" + +from pathlib import Path +from dotenv import load_dotenv +from utils import dataset_to_tools +from config import gen_config, role_config +from toolsgen import generate_dataset + +# Load environment variables from .env file +load_dotenv() + +# Load dataset from Hugging Face +dataset_id = "argilla-warehouse/python-seed-tools" +tools = dataset_to_tools(dataset_id, dataset_kwargs={"split": "train"}) +output_dir = Path(__file__).parent / "output" + +# Generate dataset +manifest = generate_dataset(output_dir, gen_config, role_config, tools=tools) + +# Print summary +print(f"\n✓ Generated {manifest['num_generated']}/{manifest['num_requested']} records") +if manifest["num_failed"] > 0: + print(f" Failed: {manifest['num_failed']} attempts") +print(f" Problem Generator: {role_config.problem_generator.model}") +print(f" Tool Caller: {role_config.tool_caller.model}") +print(f" Judge: {role_config.judge.model}") +print(f" Output: {output_dir}") diff --git a/examples/simple_tool_calling_v1/utils.py b/examples/simple_tool_calling_v1/utils.py new file mode 100644 index 0000000..22b8ed7 --- /dev/null +++ b/examples/simple_tool_calling_v1/utils.py @@ -0,0 +1,45 @@ +import json +from typing import List, Optional + +from datasets import load_dataset + +from toolsgen import ( + ToolFunction, + ToolSpec, +) + +from validation import validate_json_schema + + +def dataset_to_tools( + dataset_id: str, dataset_kwargs: Optional[dict] = None +) -> List[ToolSpec]: + """Load tools from a Hugging Face dataset. + + Args: + dataset_id (str): The Hugging Face dataset identifier. + dataset_kwargs (Optional[dict]): Additional arguments for loading the dataset. + Returns: + List[ToolSpec]: A list of ToolSpec objects. + """ + dataset = load_dataset(dataset_id, **(dataset_kwargs or {})) + # Each dataset row contains a list of tools in OpenAI format + # Flatten the nested lists: [[tool1], [tool2, tool3], ...] -> [tool1, tool2, tool3, ...] + all_tools = [] + for tools_json in dataset["tools"]: + tools_list = json.loads(tools_json) + all_tools.extend(tools_list) + + # Convert from OpenAI format to ToolSpec + # OpenAI format: {'type': 'function', 'function': {'name': ..., 'description': ..., 'parameters': ...}} + return [ + ToolSpec( + function=ToolFunction( + name=tool["function"]["name"], + description=tool["function"]["description"], + parameters=tool["function"]["parameters"], + ) + ) + for tool in all_tools + if validate_json_schema(tool) + ] diff --git a/examples/simple_tool_calling_v1/validation.py b/examples/simple_tool_calling_v1/validation.py new file mode 100644 index 0000000..8261d15 --- /dev/null +++ b/examples/simple_tool_calling_v1/validation.py @@ -0,0 +1,108 @@ +from typing import Any + + +def _validate_schema_recursively(schema: dict[str, Any]) -> bool: + """Recursively validate JSON Schema properties. + + OpenAI requires that array types must have 'items' field defined. + """ + if not isinstance(schema, dict): + return True + + # Check if this is an array type + schema_type = schema.get("type") + if schema_type == "array": + # Array must have items field + if "items" not in schema: + return False + # Recursively validate items + if not _validate_schema_recursively(schema.get("items", {})): + return False + elif isinstance(schema_type, list) and "array" in schema_type: + # Handle union types like ["array", "null"] + if "items" not in schema: + return False + if not _validate_schema_recursively(schema.get("items", {})): + return False + + # Check properties recursively + if "properties" in schema: + properties = schema.get("properties", {}) + if isinstance(properties, dict): + for prop_schema in properties.values(): + if not _validate_schema_recursively(prop_schema): + return False + + # Check items recursively (for nested arrays) + if "items" in schema: + items = schema.get("items") + if isinstance(items, dict): + if not _validate_schema_recursively(items): + return False + + # Check additionalProperties if it's a schema + if "additionalProperties" in schema: + add_props = schema.get("additionalProperties") + if isinstance(add_props, dict): + if not _validate_schema_recursively(add_props): + return False + + return True + + +def validate_json_schema(tool: dict[str, Any]) -> bool: + """Validate OpenAI tool schema format. + + Expected format: + + ```json + { + "type": "function", + "function": { + "name": "function_name", + "description": "function description", + "parameters": {"type": "object", "properties": {...}} + } + } + ``` + Also validates that all array types have 'items' field defined. + """ + try: + # Check top-level structure + if not isinstance(tool, dict): + return False + + if tool.get("type") != "function": + return False + + function = tool.get("function") + if not isinstance(function, dict): + return False + + # Check required function fields + if "name" not in function or not isinstance(function["name"], str): + return False + + if "description" not in function or not isinstance( + function["description"], str + ): + return False + + # Parameters are optional, but if present must be a dict + if "parameters" in function: + params = function["parameters"] + if not isinstance(params, dict): + return False + + # If parameters exist, should have type: object + if params.get("type") != "object": + return False + + # Recursively validate the schema for array types + if not _validate_schema_recursively(params): + return False + + return True + + except Exception: + return False From 8e0e0026dc71c9954f2d701914bc3137d65f387f Mon Sep 17 00:00:00 2001 From: atasoglu Date: Sun, 9 Nov 2025 09:49:21 +0300 Subject: [PATCH 2/5] refactor: rename example files and update configurations for Nano Tool Calling v1; add new example script and adjust batch sampling progress bar --- .../README.md | 4 +- .../config.py | 12 +++--- examples/nano_tool_calling_v1/example.py | 39 +++++++++++++++++++ .../utils.py | 0 .../validation.py | 0 examples/simple_tool_calling_v1/example.py | 31 --------------- src/toolsgen/sampling/batch.py | 4 +- 7 files changed, 50 insertions(+), 40 deletions(-) rename examples/{simple_tool_calling_v1 => nano_tool_calling_v1}/README.md (89%) rename examples/{simple_tool_calling_v1 => nano_tool_calling_v1}/config.py (77%) create mode 100644 examples/nano_tool_calling_v1/example.py rename examples/{simple_tool_calling_v1 => nano_tool_calling_v1}/utils.py (100%) rename examples/{simple_tool_calling_v1 => nano_tool_calling_v1}/validation.py (100%) delete mode 100644 examples/simple_tool_calling_v1/example.py diff --git a/examples/simple_tool_calling_v1/README.md b/examples/nano_tool_calling_v1/README.md similarity index 89% rename from examples/simple_tool_calling_v1/README.md rename to examples/nano_tool_calling_v1/README.md index 68cac98..268c303 100644 --- a/examples/simple_tool_calling_v1/README.md +++ b/examples/nano_tool_calling_v1/README.md @@ -1,4 +1,4 @@ -# Simple Tool Calling v1 +# Nano Tool Calling v1 Generates 10K tool-calling samples from Hugging Face dataset with parallel processing and schema validation. @@ -15,7 +15,7 @@ python example.py - **Dataset**: `argilla-warehouse/python-seed-tools` - **Samples**: 10,000 (80% train / 20% val) - **Parallel**: 8 workers × 16 batch size -- **Models**: GPT-5-mini (problem/caller), GPT-5 (judge) +- **Models**: GPT-4.1-nano ## Files diff --git a/examples/simple_tool_calling_v1/config.py b/examples/nano_tool_calling_v1/config.py similarity index 77% rename from examples/simple_tool_calling_v1/config.py rename to examples/nano_tool_calling_v1/config.py index 1a89c04..f70cee0 100644 --- a/examples/simple_tool_calling_v1/config.py +++ b/examples/nano_tool_calling_v1/config.py @@ -12,23 +12,23 @@ language="english", max_attempts=3, k_min=2, - k_max=8, + k_max=4, shuffle_tools=True, - num_workers=8, - worker_batch_size=16, + num_workers=4, + worker_batch_size=8, ) role_config = RoleBasedModelConfig( problem_generator=ModelConfig( - model="gpt-5-mini", + model="gpt-4.1-nano", temperature=1.0, ), tool_caller=ModelConfig( - model="gpt-5-mini", + model="gpt-4.1-nano", temperature=0, ), judge=ModelConfig( - model="gpt-5", + model="gpt-4.1-nano", temperature=0, ), ) diff --git a/examples/nano_tool_calling_v1/example.py b/examples/nano_tool_calling_v1/example.py new file mode 100644 index 0000000..6fd4d22 --- /dev/null +++ b/examples/nano_tool_calling_v1/example.py @@ -0,0 +1,39 @@ +""" +Hugging Face example - Using a dataset from Hugging Face + +Install datasets library by using `pip install datasets` and set the dataset id. +""" + +from pathlib import Path +from dotenv import load_dotenv +from utils import dataset_to_tools +from config import gen_config, role_config +from toolsgen import generate_dataset + +# Load environment variables from .env file +load_dotenv() + + +def main() -> None: + # Load dataset from Hugging Face + dataset_id = "argilla-warehouse/python-seed-tools" + tools = dataset_to_tools(dataset_id, dataset_kwargs={"split": "train"}) + output_dir = Path(__file__).parent / "output" + + # Generate dataset + manifest = generate_dataset(output_dir, gen_config, role_config, tools=tools) + + # Print summary + print( + f"\n✓ Generated {manifest['num_generated']}/{manifest['num_requested']} records" + ) + if manifest["num_failed"] > 0: + print(f" Failed: {manifest['num_failed']} attempts") + print(f" Problem Generator: {role_config.problem_generator.model}") + print(f" Tool Caller: {role_config.tool_caller.model}") + print(f" Judge: {role_config.judge.model}") + print(f" Output: {output_dir}") + + +if __name__ == "__main__": + main() diff --git a/examples/simple_tool_calling_v1/utils.py b/examples/nano_tool_calling_v1/utils.py similarity index 100% rename from examples/simple_tool_calling_v1/utils.py rename to examples/nano_tool_calling_v1/utils.py diff --git a/examples/simple_tool_calling_v1/validation.py b/examples/nano_tool_calling_v1/validation.py similarity index 100% rename from examples/simple_tool_calling_v1/validation.py rename to examples/nano_tool_calling_v1/validation.py diff --git a/examples/simple_tool_calling_v1/example.py b/examples/simple_tool_calling_v1/example.py deleted file mode 100644 index ee6a607..0000000 --- a/examples/simple_tool_calling_v1/example.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Hugging Face example - Using a dataset from Hugging Face - -Install datasets library by using `pip install datasets` and set the dataset id. -""" - -from pathlib import Path -from dotenv import load_dotenv -from utils import dataset_to_tools -from config import gen_config, role_config -from toolsgen import generate_dataset - -# Load environment variables from .env file -load_dotenv() - -# Load dataset from Hugging Face -dataset_id = "argilla-warehouse/python-seed-tools" -tools = dataset_to_tools(dataset_id, dataset_kwargs={"split": "train"}) -output_dir = Path(__file__).parent / "output" - -# Generate dataset -manifest = generate_dataset(output_dir, gen_config, role_config, tools=tools) - -# Print summary -print(f"\n✓ Generated {manifest['num_generated']}/{manifest['num_requested']} records") -if manifest["num_failed"] > 0: - print(f" Failed: {manifest['num_failed']} attempts") -print(f" Problem Generator: {role_config.problem_generator.model}") -print(f" Tool Caller: {role_config.tool_caller.model}") -print(f" Judge: {role_config.judge.model}") -print(f" Output: {output_dir}") diff --git a/src/toolsgen/sampling/batch.py b/src/toolsgen/sampling/batch.py index f71d939..0731528 100644 --- a/src/toolsgen/sampling/batch.py +++ b/src/toolsgen/sampling/batch.py @@ -1,6 +1,8 @@ import random from typing import Callable, List, Optional, Sequence +from tqdm import tqdm + from .param_aware import sample_param_aware_subset from .random import sample_random_subset from .semantic import sample_semantic_subset @@ -84,7 +86,7 @@ def batched_subsets( using_chunks = batch_size is not None and batch_size > 0 subsets: List[List[ToolSpec]] = [] - for i in range(total): + for i in tqdm(range(total), desc="Preparing tool subsets", total=total): batch = batches[i % len(batches)] if using_chunks: k = len(batch) From e6fce67df4bed6d8dfdb848e4a536c41f29654fd Mon Sep 17 00:00:00 2001 From: atasoglu Date: Sun, 9 Nov 2025 23:16:20 +0300 Subject: [PATCH 3/5] chore: update README with dataset details, usage, and licensing information --- examples/nano_tool_calling_v1/README.md | 141 +++++++++++++++--- .../nano_tool_calling_v1/analyze_functions.py | 15 ++ examples/nano_tool_calling_v1/config.py | 11 +- examples/nano_tool_calling_v1/push_to_hf.py | 64 ++++++++ src/toolsgen/core/generator.py | 4 + src/toolsgen/core/parallel.py | 25 ++-- 6 files changed, 225 insertions(+), 35 deletions(-) create mode 100644 examples/nano_tool_calling_v1/analyze_functions.py create mode 100644 examples/nano_tool_calling_v1/push_to_hf.py diff --git a/examples/nano_tool_calling_v1/README.md b/examples/nano_tool_calling_v1/README.md index 268c303..f62b642 100644 --- a/examples/nano_tool_calling_v1/README.md +++ b/examples/nano_tool_calling_v1/README.md @@ -1,31 +1,132 @@ +--- +license: mit +task_categories: +- text-generation +language: +- en +tags: +- function-calling +- tool-calling +- synthetic +- openai +size_categories: +- n<1K +--- + # Nano Tool Calling v1 -Generates 10K tool-calling samples from Hugging Face dataset with parallel processing and schema validation. +A synthetic tool-calling dataset generated using [ToolsGen](https://github.com/atasoglu/toolsgen) with GPT-4.1-nano models. + +## Dataset Details + +- **Generated with**: ToolsGen v0.1.0 +- **Source Tools**: [argilla-warehouse/python-seed-tools](https://huggingface.co/datasets/argilla-warehouse/python-seed-tools) +- **Total Samples**: 989 +- **Language**: English +- **Format**: Single-turn conversations with tool calls + +### Models Used + +- **Problem Generator**: gpt-4.1-nano (temp=1.0) +- **Tool Caller**: gpt-4.1-nano (temp=0.0) +- **Judge**: gpt-4.1-mini (temp=0.0) + +## Dataset Structure -## Setup +Each record contains: -```bash -pip install toolsgen datasets python-dotenv -echo "OPENAI_API_KEY=your-key-here" > .env -python example.py +```json +{ + "id": "record_000000", + "language": "english", + "tools": [...], + "messages": [ + {"role": "user", "content": "..."} + ], + "assistant_calls": [ + { + "id": "call_...", + "type": "function", + "function": { + "name": "function_name", + "arguments": "{...}" + } + } + ], + "problem_metadata": {...}, + "judge": { + "tool_relevance": 0.4, + "argument_quality": 0.38, + "clarity": 0.2, + "score": 0.98, + "verdict": "accept", + "rationale": "...", + "rubric_version": "0.1.0", + "model": "gpt-4.1-mini", + "temperature": 0.0 + }, + "quality_tags": [], + "tools_metadata": {"num_tools": 2} +} ``` -## Configuration +## Generation Details + +### Configuration + +- **Strategy**: Random tool sampling +- **Tools per sample**: 1-4 (k_min=1, k_max=4) +- **Parallel workers**: 16 +- **Worker batch size**: 16 +- **Max attempts**: 3 +- **Seed**: 42 + +### Quality Control + +All samples passed through an LLM-as-a-judge evaluation with a multi-dimensional rubric: + +- **Tool Relevance** (40%): Are the selected tools appropriate? +- **Argument Quality** (38%): Are arguments valid and plausible? +- **Clarity** (20%): Is the response complete and clear? -- **Dataset**: `argilla-warehouse/python-seed-tools` -- **Samples**: 10,000 (80% train / 20% val) -- **Parallel**: 8 workers × 16 batch size -- **Models**: GPT-4.1-nano +Samples with `score >= 0.7` and `verdict == "accept"` are included. -## Files +## Usage -- `example.py` - Main generation script -- `config.py` - Generation and model settings -- `utils.py` - HF dataset loader -- `validation.py` - Schema validator (ensures arrays have `items`) +```python +from datasets import load_dataset + +dataset = load_dataset("atasoglu/nano-tool-calling-v1") + +# Access a sample +sample = dataset["train"][0] +print(sample["messages"]) +print(sample["assistant_calls"]) +``` + +## Source Tools + +The dataset uses 38,420 Python function definitions from the [python-seed-tools](https://huggingface.co/datasets/argilla-warehouse/python-seed-tools) dataset, covering diverse programming tasks and domains. + +## Limitations + +- Single-turn conversations only +- English language only +- Synthetic data generated by LLMs (may contain artifacts) +- No actual tool execution or validation +- Judge scores are model-based assessments + +## Citation + +```bibtex +@software{toolsgen2025, + title = {ToolsGen: Synthetic Tool-Calling Dataset Generator}, + author = {Ataşoğlu, Ahmet}, + year = {2025}, + url = {https://github.com/atasoglu/toolsgen} +} +``` -## Output +## License -- `output/train.jsonl` - Training set -- `output/val.jsonl` - Validation set -- `output/manifest.json` - Metadata +MIT License diff --git a/examples/nano_tool_calling_v1/analyze_functions.py b/examples/nano_tool_calling_v1/analyze_functions.py new file mode 100644 index 0000000..ad38ea9 --- /dev/null +++ b/examples/nano_tool_calling_v1/analyze_functions.py @@ -0,0 +1,15 @@ +import json +from collections import Counter + +with open("output/train.jsonl", "r", encoding="utf-8") as f: + function_counts = Counter() + + for line in f: + record = json.loads(line) + for tool in record.get("tools", []): + func_name = tool.get("function", {}).get("name") + if func_name: + function_counts[func_name] += 1 + +for func, count in function_counts.most_common(): + print(f"{func}: {count}") diff --git a/examples/nano_tool_calling_v1/config.py b/examples/nano_tool_calling_v1/config.py index f70cee0..babd9bb 100644 --- a/examples/nano_tool_calling_v1/config.py +++ b/examples/nano_tool_calling_v1/config.py @@ -5,17 +5,16 @@ ) gen_config = GenerationConfig( - num_samples=10_000, + num_samples=1_000, strategy="random", seed=42, - train_split=0.8, language="english", max_attempts=3, - k_min=2, + k_min=1, k_max=4, shuffle_tools=True, - num_workers=4, - worker_batch_size=8, + num_workers=16, + worker_batch_size=16, ) role_config = RoleBasedModelConfig( @@ -28,7 +27,7 @@ temperature=0, ), judge=ModelConfig( - model="gpt-4.1-nano", + model="gpt-4.1-mini", temperature=0, ), ) diff --git a/examples/nano_tool_calling_v1/push_to_hf.py b/examples/nano_tool_calling_v1/push_to_hf.py new file mode 100644 index 0000000..aa9486e --- /dev/null +++ b/examples/nano_tool_calling_v1/push_to_hf.py @@ -0,0 +1,64 @@ +import json +import os +from pathlib import Path +from datasets import Dataset, DatasetDict +from huggingface_hub import DatasetCard +from dotenv import load_dotenv + + +def load_jsonl(path: Path) -> list[dict]: + return [json.loads(line) for line in open(path, encoding="utf-8")] + + +def push_to_hub( + dataset_path: Path, + repo_id: str, + token: str | None = None, + private: bool = False, + readme_path: Path | None = None, +): + train = load_jsonl(dataset_path / "train.jsonl") + val_path = dataset_path / "val.jsonl" + + # Convert to JSON strings to avoid schema issues + for record in train: + record["tools"] = json.dumps(record["tools"]) + record["messages"] = json.dumps(record["messages"]) + record["assistant_calls"] = json.dumps(record["assistant_calls"]) + record["problem_metadata"] = json.dumps(record["problem_metadata"]) + record["judge"] = json.dumps(record["judge"]) + record["quality_tags"] = json.dumps(record["quality_tags"]) + record["tools_metadata"] = json.dumps(record["tools_metadata"]) + + dataset = Dataset.from_list(train) + + if val_path.exists(): + val = load_jsonl(val_path) + for record in val: + record["tools"] = json.dumps(record["tools"]) + record["messages"] = json.dumps(record["messages"]) + record["assistant_calls"] = json.dumps(record["assistant_calls"]) + record["problem_metadata"] = json.dumps(record["problem_metadata"]) + record["judge"] = json.dumps(record["judge"]) + record["quality_tags"] = json.dumps(record["quality_tags"]) + record["tools_metadata"] = json.dumps(record["tools_metadata"]) + dataset = DatasetDict({"train": dataset, "validation": Dataset.from_list(val)}) + + dataset.push_to_hub(repo_id, token=token, private=private) + + if readme_path and readme_path.exists(): + card = DatasetCard(open(readme_path, encoding="utf-8").read()) + card.push_to_hub(repo_id, token=token) + + print(f"✓ Pushed to https://huggingface.co/datasets/{repo_id}") + + +if __name__ == "__main__": + load_dotenv() + base_path = Path(__file__).parent + push_to_hub( + base_path / "output", + "atasoglu/nano-tool-calling-v1", + os.getenv("HF_TOKEN"), + readme_path=base_path / "README.md", + ) diff --git a/src/toolsgen/core/generator.py b/src/toolsgen/core/generator.py index 911ddce..171f0a6 100644 --- a/src/toolsgen/core/generator.py +++ b/src/toolsgen/core/generator.py @@ -58,6 +58,10 @@ def _split_records( "val": shuffled[split_idx:], } + temp_train = output_dir / "train.jsonl" + if temp_train.exists(): + temp_train.unlink() + for split_name, split_records in splits.items(): if split_records: split_path = output_dir / f"{split_name}.jsonl" diff --git a/src/toolsgen/core/parallel.py b/src/toolsgen/core/parallel.py index 3ad6017..da8c00e 100644 --- a/src/toolsgen/core/parallel.py +++ b/src/toolsgen/core/parallel.py @@ -132,6 +132,8 @@ def generate_records_parallel( return [], 0 results_by_index: Dict[int, Record] = {} + failed_indices: set[int] = set() + written_records: List[Record] = [] failed = 0 next_id_to_write = 0 @@ -157,14 +159,8 @@ def generate_records_parallel( if sample_result.record: record = Record.model_validate(sample_result.record) results_by_index[sample_result.sample_index] = record - - while next_id_to_write in results_by_index: - rec = results_by_index[next_id_to_write] - rec.id = f"record_{next_id_to_write:06d}" - append_record_jsonl(rec, jsonl_path) - del results_by_index[next_id_to_write] - next_id_to_write += 1 else: + failed_indices.add(sample_result.sample_index) tqdm.write( "Warning: Failed to generate sample " f"{sample_result.sample_index} after {gen_config.max_attempts} attempts" @@ -175,7 +171,18 @@ def generate_records_parallel( ) ) + while ( + next_id_to_write in results_by_index + or next_id_to_write in failed_indices + ): + if next_id_to_write in results_by_index: + rec = results_by_index[next_id_to_write] + rec.id = f"record_{next_id_to_write:06d}" + append_record_jsonl(rec, jsonl_path) + written_records.append(rec) + del results_by_index[next_id_to_write] + next_id_to_write += 1 + pbar.update(1) - all_records = [results_by_index[i] for i in sorted(results_by_index.keys())] - return all_records, failed + return written_records, failed From 98a485ac04caeeb9802105b4e0609ef3801f2acf Mon Sep 17 00:00:00 2001 From: atasoglu Date: Sun, 9 Nov 2025 23:18:57 +0300 Subject: [PATCH 4/5] chore(release): update to version 0.3.0 with new Hugging Face integration and workflow enhancements - Added Hugging Face dataset utilities and dataset upload script - Included complete Nano Tool Calling v1 dataset generation example - Improved progress bar display and parallel processing record handling - Updated project version to 0.3.0 --- CHANGELOG.md | 15 +++++++++++++++ pyproject.toml | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28028aa..ecb55ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,21 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve Nothing yet. +## [0.3.0] - 2025-01-10 +### Added +- Hugging Face dataset integration utilities in `examples/nano_tool_calling_v1/` + - `dataset_to_tools()` function to load tools from Hugging Face datasets + - `validate_json_schema()` for OpenAI tool schema validation with recursive array type checking + - `push_to_hf.py` script for uploading generated datasets to Hugging Face Hub +- Complete example workflow for Nano Tool Calling v1 dataset generation + - Configuration, generation, validation, and publishing pipeline + - Analysis utilities for function inspection + - Comprehensive README with dataset card format + +### Changed +- Enhanced batch sampling progress bar display for better user feedback +- Improved parallel processing record ordering and ID assignment + ## [0.2.2] - 2025-01-09 ### Changed - Records are now written to JSONL file immediately as they complete in parallel mode, rather than waiting for all generation to finish diff --git a/pyproject.toml b/pyproject.toml index d0d9b93..502c02d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ toolsgen = ["prompts/*.txt"] [project] name = "toolsgen" -version = "0.2.2" +version = "0.3.0" description = "Generate tool-calling datasets from OpenAI-compatible tool specs" readme = "README.md" requires-python = ">=3.9" From e0437ecbd933e76ee13f15ab877b816a3529ad6a Mon Sep 17 00:00:00 2001 From: atasoglu Date: Sun, 9 Nov 2025 23:19:19 +0300 Subject: [PATCH 5/5] chore: update toolsgen to version 0.3.0 for latest features and fixes --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 252874b..3ed0922 100644 --- a/uv.lock +++ b/uv.lock @@ -396,7 +396,7 @@ wheels = [ [[package]] name = "toolsgen" -version = "0.1.3" +version = "0.3.0" source = { editable = "." } dependencies = [ { name = "openai" },