From fd13c04fb2c2604a225bc290d583614e425c45bd Mon Sep 17 00:00:00 2001
From: Daniel Arndt <daniel-arndt@outlook.com>
Date: Mon, 23 Mar 2026 16:47:09 -0300
Subject: [PATCH] Initial library, CLI, documentation, and CI/CD setup

---
 .github/workflows/ci.yml                      |  35 ++++
 .github/workflows/publish.yml                 |  39 +++++
 .github/workflows/release.yml                 |  35 ++++
 .gitignore                                    | 103 +++++++++++
 LICENSE                                       |  21 +++
 README.md                                     | 137 +++++++++++++++
 docs/API.en.md                                |  99 +++++++++++
 docs/API.pt-BR.md                             |  99 +++++++++++
 docs/ARCHITECTURE.md                          |  70 ++++++++
 docs/ARCHITECTURE.pt-BR.md                    |  70 ++++++++
 docs/CI-CD.en.md                              |  42 +++++
 docs/CI-CD.pt-BR.md                           |  90 ++++++++++
 docs/CLI.en.md                                | 101 +++++++++++
 docs/CLI.pt-BR.md                             | 101 +++++++++++
 docs/ENVIRONMENT.en.md                        |  28 +++
 docs/ENVIRONMENT.pt-BR.md                     |  88 ++++++++++
 docs/README.en.md                             |  19 +++
 docs/README.pt-BR.md                          |  53 ++++++
 examples/README.md                            |  23 +++
 examples/basic_usage.py                       |  21 +++
 examples/custom_regex_usage.py                |  17 ++
 examples/mysql_usage.py                       |  25 +++
 .../pdf_invoices}/Invoice1.pdf                | Bin
 .../pdf_invoices}/Invoice2.pdf                | Bin
 .../pdf_invoices}/Invoice3.pdf                | Bin
 .../pdf_invoices}/Invoice4.pdf                | Bin
 examples/recursive_usage.py                   |  12 ++
 invoices.py                                   |  99 -----------
 pyproject.toml                                |  49 ++++++
 src/pydf/__init__.py                          |  21 +++
 src/pydf/cli.py                               | 160 ++++++++++++++++++
 src/pydf/config.py                            |  50 ++++++
 src/pydf/database.py                          |  41 +++++
 src/pydf/excel.py                             |  38 +++++
 src/pydf/legacy.py                            |  22 +++
 src/pydf/models.py                            |  35 ++++
 src/pydf/parser.py                            |  59 +++++++
 src/pydf/processor.py                         |  93 ++++++++++
 tests/test_smoke.py                           |  14 ++
 39 files changed, 1910 insertions(+), 99 deletions(-)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 .github/workflows/publish.yml
 create mode 100644 .github/workflows/release.yml
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 docs/API.en.md
 create mode 100644 docs/API.pt-BR.md
 create mode 100644 docs/ARCHITECTURE.md
 create mode 100644 docs/ARCHITECTURE.pt-BR.md
 create mode 100644 docs/CI-CD.en.md
 create mode 100644 docs/CI-CD.pt-BR.md
 create mode 100644 docs/CLI.en.md
 create mode 100644 docs/CLI.pt-BR.md
 create mode 100644 docs/ENVIRONMENT.en.md
 create mode 100644 docs/ENVIRONMENT.pt-BR.md
 create mode 100644 docs/README.en.md
 create mode 100644 docs/README.pt-BR.md
 create mode 100644 examples/README.md
 create mode 100644 examples/basic_usage.py
 create mode 100644 examples/custom_regex_usage.py
 create mode 100644 examples/mysql_usage.py
 rename {pdf_invoices => examples/pdf_invoices}/Invoice1.pdf (100%)
 rename {pdf_invoices => examples/pdf_invoices}/Invoice2.pdf (100%)
 rename {pdf_invoices => examples/pdf_invoices}/Invoice3.pdf (100%)
 rename {pdf_invoices => examples/pdf_invoices}/Invoice4.pdf (100%)
 create mode 100644 examples/recursive_usage.py
 delete mode 100644 invoices.py
 create mode 100644 pyproject.toml
 create mode 100644 src/pydf/__init__.py
 create mode 100644 src/pydf/cli.py
 create mode 100644 src/pydf/config.py
 create mode 100644 src/pydf/database.py
 create mode 100644 src/pydf/excel.py
 create mode 100644 src/pydf/legacy.py
 create mode 100644 src/pydf/models.py
 create mode 100644 src/pydf/parser.py
 create mode 100644 src/pydf/processor.py
 create mode 100644 tests/test_smoke.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..0682911
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,35 @@
+name: CI
+
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.11"]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -e .[dev]
+
+      - name: Run tests
+        run: python -m pytest -v
+
+      - name: Build package
+        run: python -m build
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..6e02fb9
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,39 @@
+name: Publish Python Package
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  publish:
+    if: ${{ !github.event.release.prerelease }}
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write
+      id-token: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install build tools
+        run: |
+          python -m pip install --upgrade pip
+          pip install build
+
+      - name: Build distributions
+        run: python -m build
+
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+
+      - name: Upload dist files to GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: dist/*
\ No newline at end of file
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..155a533
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,35 @@
+name: Release Build
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  build-release-artifacts:
+    if: ${{ !github.event.release.prerelease }}
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Install build dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install build
+
+      - name: Build distributions
+        run: python -m build
+
+      - name: Upload artifacts to GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: dist/*
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4a7d244
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,103 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+*.egg
+MANIFEST
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage / pytest
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache/
+.pytest_cache/
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+
+# Type check / lint caches
+.mypy_cache/
+.ruff_cache/
+.pyre/
+.dmypy.json
+dmypy.json
+
+# Virtual environments
+.venv/
+venv/
+env/
+ENV/
+
+# Jupyter Notebook
+.ipynb_checkpoints/
+
+# IDEs / editors
+.vscode/
+.idea/
+
+# OS files
+.DS_Store
+Thumbs.db
+
+# Local environment files
+.env
+.env.*
+*.local
+
+# Logs
+*.log
+
+# Temporary files
+tmp/
+temp/
+*.tmp
+
+# Project generated files
+output/
+temp_uploads/
+generated/
+reports/
+
+# Excel / export artifacts
+*.xlsx
+
+# Database / local data
+*.db
+*.sqlite3
+
+# Python build metadata
+.pybuild/
+
+# Packaging tools
+pip-wheel-metadata/
+
+# PyInstaller
+*.manifest
+*.spec
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..3769d0a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Daniel Arndt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..131b720
--- /dev/null
+++ b/README.md
@@ -0,0 +1,137 @@
+# Pydf
+
+[Documentação PT-BR](docs/README.pt-BR.md) | [English docs](docs/README.en.md)
+
+A `pydf` é uma biblioteca Python leve para leitura de PDFs de faturas, extração de metadados com regex, exportação para Excel e persistência opcional em MySQL.
+
+Esta versão reorganiza o projeto original como biblioteca e CLI, sem fugir da ideia central do script: **PDF -> extração -> Excel -> MySQL opcional**.
+
+## Visão rápida
+
+- Biblioteca Python reutilizável
+- CLI simples para uso no terminal
+- Regex configurável para número e data da fatura
+- Exportação para `.xlsx`
+- Persistência opcional em MySQL
+- Documentação em PT-BR e inglês
+- Workflows de CI e release para GitHub Actions
+
+## Requisitos
+
+- Python **3.10 ou superior**
+- `pip`
+- Recomendado: ambiente virtual (`venv`)
+
+## Instalação local
+
+Na raiz do projeto:
+
+```bash
+pip install -e .
+```
+
+Instalação com dependências de desenvolvimento:
+
+```bash
+pip install -e .[dev]
+```
+
+## Instalação da CLI via GitHub
+
+Como o GitHub não oferece um registry Python suportado para `pip` no GitHub Packages, a forma recomendada para instalar a CLI a partir do GitHub é usar o próprio repositório Git.
+
+### Instalar da branch padrão
+
+```bash
+pip install "git+https://github.com/DanielArndt0/pydf.git"
+```
+
+### Instalar de uma tag ou release específica
+
+```bash
+pip install "git+https://github.com/DanielArndt0/pydf.git@v1.0.0"
+```
+
+Depois disso, a CLI fica disponível como:
+
+```bash
+pydf --help
+```
+
+## Primeiros passos com venv no Windows
+
+Se você tiver mais de uma versão do Python instalada, confira as versões disponíveis:
+
+```powershell
+py -0p
+```
+
+Crie e ative um ambiente virtual com Python 3.10:
+
+```powershell
+py -3.10 -m venv .venv
+.venv\Scripts\Activate.ps1
+python -m pip install --upgrade pip
+python -m pip install -e .[dev]
+```
+
+## Como executar a CLI
+
+```bash
+pydf --help
+pydf examples/pdf_invoices --output output/invoices.xlsx
+```
+
+## Como usar como biblioteca
+
+```python
+from pydf import InvoiceProcessor, ProcessorConfig
+
+config = ProcessorConfig(
+    input_dir="examples/pdf_invoices",
+    output_excel="output/invoices.xlsx",
+)
+
+result = InvoiceProcessor(config).process()
+
+print(result.output_excel)
+for record in result.records:
+    print(record.file_name, record.invoice_number, record.invoice_date, record.status)
+```
+
+## Rodando testes
+
+```bash
+pytest -v
+```
+
+Se o ambiente ainda não estiver preparado:
+
+```bash
+pip install -e .[dev]
+pytest -v
+```
+
+## Build local
+
+```bash
+python -m build
+```
+
+## CI e releases no GitHub
+
+Este repositório inclui workflows do GitHub Actions em `.github/workflows/`, entre eles:
+
+- `ci.yml`: roda testes e build em pushes para `main`/`master` e em todo pull request
+- `release.yml`: gera os artefatos e anexa `dist/*` a uma release publicada manualmente
+
+Documentação detalhada:
+
+- [Guia principal da documentação](docs/README.pt-BR.md)
+- [Guia da CLI](docs/CLI.pt-BR.md)
+- [Guia da API](docs/API.pt-BR.md)
+- [Arquitetura](docs/ARCHITECTURE.pt-BR.md)
+- [CI/CD e Releases](docs/CI-CD.pt-BR.md)
+- [Ambiente Python, venv e troubleshooting](docs/ENVIRONMENT.pt-BR.md)
+- [Exemplos](examples/README.md)
+
diff --git a/docs/API.en.md b/docs/API.en.md
new file mode 100644
index 0000000..e10d6ec
--- /dev/null
+++ b/docs/API.en.md
@@ -0,0 +1,99 @@
+# Public API
+
+[Versão em Português do Brasil](API.pt-BR.md)
+
+## Overview
+
+The library was organized so most integrations go through a few simple entry points.
+
+## Main objects
+
+### `ProcessorConfig`
+
+Main configuration object.
+
+Most useful fields:
+
+- `input_dir`: directory containing PDFs.
+- `output_excel`: final Excel path.
+- `invoice_number_pattern`: invoice number regex.
+- `invoice_date_pattern`: invoice date regex.
+- `worksheet_name`: Excel sheet name.
+- `status_completed`: success status text.
+- `persist_to_database`: enables MySQL persistence.
+- `database`: `DatabaseConfig` instance.
+- `recursive`: recursive search.
+
+### `DatabaseConfig`
+
+Used only when `persist_to_database=True`.
+
+Fields:
+
+- `host`
+- `user`
+- `password`
+- `database`
+- `table`
+
+### `InvoiceProcessor`
+
+Main library class.
+
+Most important method:
+
+- `process() -> ProcessingResult`
+
+### `ProcessingResult`
+
+Consolidated processing result.
+
+Properties:
+
+- `records`
+- `output_excel`
+- `success_count`
+- `error_count`
+
+### `InvoiceRecord`
+
+Represents one processed PDF.
+
+Fields:
+
+- `invoice_number`
+- `invoice_date`
+- `file_name`
+- `status`
+
+## Useful functions
+
+### `extract_text_from_pdf(file_path)`
+
+Extracts text from the first page of the PDF.
+
+### `parse_invoice(file_path, number_pattern, date_pattern, completed_status)`
+
+Processes a single PDF and returns an `InvoiceRecord`.
+
+## Direct API example
+
+```python
+from pydf import InvoiceProcessor, ProcessorConfig
+
+config = ProcessorConfig(
+    input_dir="examples/pdf_invoices",
+    output_excel="output/api_usage.xlsx",
+)
+
+result = InvoiceProcessor(config).process()
+print(result.success_count)
+```
+
+## When to use the API
+
+Use the API when:
+
+- you want to integrate processing into another Python system;
+- you need to inspect the `ProcessingResult` in memory;
+- you want more control over customization.
diff --git a/docs/API.pt-BR.md b/docs/API.pt-BR.md
new file mode 100644
index 0000000..7f9af8b
--- /dev/null
+++ b/docs/API.pt-BR.md
@@ -0,0 +1,99 @@
+# API pública (Português do Brasil)
+
+[English version](API.en.md)
+
+## Visão geral
+
+A biblioteca foi organizada para que a maior parte do uso passe por poucos pontos de entrada.
+
+## Principais objetos
+
+### `ProcessorConfig`
+
+Objeto principal de configuração.
+
+Campos mais úteis:
+
+- `input_dir`: diretório com PDFs.
+- `output_excel`: caminho do Excel final.
+- `invoice_number_pattern`: regex para número da fatura.
+- `invoice_date_pattern`: regex para data da fatura.
+- `worksheet_name`: nome da aba do Excel.
+- `status_completed`: texto de sucesso.
+- `persist_to_database`: habilita gravação no MySQL.
+- `database`: instância de `DatabaseConfig`.
+- `recursive`: busca recursiva.
+
+### `DatabaseConfig`
+
+Usado apenas se `persist_to_database=True`.
+
+Campos:
+
+- `host`
+- `user`
+- `password`
+- `database`
+- `table`
+
+### `InvoiceProcessor`
+
+Classe principal da biblioteca.
+
+Método mais importante:
+
+- `process() -> ProcessingResult`
+
+### `ProcessingResult`
+
+Resultado consolidado do processamento.
+
+Propriedades:
+
+- `records`
+- `output_excel`
+- `success_count`
+- `error_count`
+
+### `InvoiceRecord`
+
+Representa um PDF processado.
+
+Campos:
+
+- `invoice_number`
+- `invoice_date`
+- `file_name`
+- `status`
+
+## Funções úteis
+
+### `extract_text_from_pdf(file_path)`
+
+Extrai o texto da primeira página do PDF.
+
+### `parse_invoice(file_path, number_pattern, date_pattern, completed_status)`
+
+Processa um único PDF e devolve um `InvoiceRecord`.
+
+## Exemplo direto da API
+
+```python
+from pydf import InvoiceProcessor, ProcessorConfig
+
+config = ProcessorConfig(
+    input_dir="examples/pdf_invoices",
+    output_excel="output/api_usage.xlsx",
+)
+
+result = InvoiceProcessor(config).process()
+print(result.success_count)
+```
+
+## Quando usar a API
+
+Use a API quando:
+
+- você vai integrar o processamento em outro sistema Python;
+- precisa manipular o `ProcessingResult` em memória;
+- quer customizar o fluxo com mais controle.
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
new file mode 100644
index 0000000..b32983e
--- /dev/null
+++ b/docs/ARCHITECTURE.md
@@ -0,0 +1,70 @@
+# Architecture overview
+
+[Português do Brasil](ARCHITECTURE.pt-BR.md)
+
+## Goal
+
+The project was turned into a library without abandoning the original flow. The architecture keeps the implementation small and easy to understand.
+
+## Layers
+
+### 1. Configuration layer
+
+- `config.py`
+- Holds `ProcessorConfig` and `DatabaseConfig`.
+- Used by both the API and the CLI.
+
+### 2. Parsing layer
+
+- `parser.py`
+- Reads PDF text from the first page.
+- Applies regex extraction for invoice number and date.
+
+### 3. Orchestration layer
+
+- `processor.py`
+- Discovers files.
+- Coordinates parsing.
+- Handles per-file failures without aborting the whole batch.
+- Delegates Excel export and optional database persistence.
+
+### 4. Output layer
+
+- `excel.py`
+- Converts the records into an `.xlsx` file.
+
+### 5. Persistence layer
+
+- `database.py`
+- Manages the MySQL connection and inserts records.
+
+### 6. Entry points
+
+- `cli.py`: terminal entry point.
+- `__init__.py`: public package API.
+- `legacy.py`: compatibility helper for the original project style.
+
+## Processing flow
+
+```text
+CLI or Python API
+        |
+        v
+ProcessorConfig / DatabaseConfig
+        |
+        v
+InvoiceProcessor.process()
+        |
+        +--> discover PDF files
+        +--> parse each PDF
+        +--> optionally insert into MySQL
+        +--> export records to Excel
+        v
+ProcessingResult
+```
+
+## Design choices
+
+- Keep batch processing resilient: one bad file should not stop the others.
+- Keep parsing simple: first page + regex, matching the original project spirit.
+- Keep extension points obvious: regex, recursion, database persistence, and public helper functions.
diff --git a/docs/ARCHITECTURE.pt-BR.md b/docs/ARCHITECTURE.pt-BR.md
new file mode 100644
index 0000000..3172a21
--- /dev/null
+++ b/docs/ARCHITECTURE.pt-BR.md
@@ -0,0 +1,70 @@
+# Visão geral da arquitetura
+
+[English version](ARCHITECTURE.md)
+
+## Objetivo
+
+O projeto foi transformado em biblioteca sem abandonar o fluxo original. A arquitetura procura manter a implementação pequena e fácil de entender.
+
+## Camadas
+
+### 1. Camada de configuração
+
+- `config.py`
+- Contém `ProcessorConfig` e `DatabaseConfig`.
+- É usada tanto pela API quanto pela CLI.
+
+### 2. Camada de parsing
+
+- `parser.py`
+- Lê o texto da primeira página do PDF.
+- Aplica regex para extrair número e data.
+
+### 3. Camada de orquestração
+
+- `processor.py`
+- Descobre os arquivos.
+- Coordena o parsing.
+- Trata falhas por arquivo sem abortar o lote inteiro.
+- Delega exportação para Excel e persistência opcional em banco.
+
+### 4. Camada de saída
+
+- `excel.py`
+- Converte os registros em um arquivo `.xlsx`.
+
+### 5. Camada de persistência
+
+- `database.py`
+- Gerencia conexão MySQL e inserção dos registros.
+
+### 6. Pontos de entrada
+
+- `cli.py`: entrada via terminal.
+- `__init__.py`: API pública do pacote.
+- `legacy.py`: helper de compatibilidade com o estilo do projeto original.
+
+## Fluxo de processamento
+
+```text
+CLI ou API Python
+        |
+        v
+ProcessorConfig / DatabaseConfig
+        |
+        v
+InvoiceProcessor.process()
+        |
+        +--> descobrir PDFs
+        +--> processar cada PDF
+        +--> opcionalmente inserir no MySQL
+        +--> exportar registros para Excel
+        v
+ProcessingResult
+```
+
+## Escolhas de design
+
+- O processamento em lote continua resiliente: um arquivo ruim não derruba os demais.
+- O parsing continua simples: primeira página + regex, preservando o espírito do projeto original.
+- Os pontos de extensão ficam claros: regex, recursão, persistência em banco e funções públicas auxiliares.
diff --git a/docs/CI-CD.en.md b/docs/CI-CD.en.md
new file mode 100644
index 0000000..ca1d453
--- /dev/null
+++ b/docs/CI-CD.en.md
@@ -0,0 +1,42 @@
+# CI/CD and Releases
+
+[Versão em Português (Brasil)](CI-CD.pt-BR.md)
+
+## Goal
+
+This repository is configured for the following flow:
+
+- on **every push to `main` or `master`** and **every pull request**: run tests and validate the build
+- on a **manually published release**: generate distribution artifacts and attach them to the GitHub release
+
+## Included workflows
+
+### 1. CI
+
+File: `.github/workflows/ci.yml`
+
+Runs:
+
+- checkout
+- Python 3.10 and 3.11 setup
+- editable install with development dependencies
+- `pytest`
+- `python -m build`
+
+### 2. Release
+
+File: `.github/workflows/release.yml`
+
+Runs:
+
+- checkout
+- Python 3.10 setup
+- build dependencies install
+- `sdist` and `wheel` generation in `dist/`
+- upload of `dist/*` to the GitHub release
+
+It runs on:
+
+- `release.published`
+
+It also skips pre-releases.
diff --git a/docs/CI-CD.pt-BR.md b/docs/CI-CD.pt-BR.md
new file mode 100644
index 0000000..b4e22bf
--- /dev/null
+++ b/docs/CI-CD.pt-BR.md
@@ -0,0 +1,90 @@
+# CI/CD e Releases
+
+[English version](CI-CD.en.md)
+
+## Objetivo
+
+Este repositório foi configurado para o seguinte fluxo:
+
+- em **todo push para `main` ou `master`** e **todo pull request**: rodar testes e validar o build
+- em **release publicada manualmente**: gerar os artefatos de distribuição e anexá-los à release do GitHub
+
+Esse modelo evita publicação automática a cada commit e combina melhor com um projeto que será distribuído principalmente por GitHub e releases.
+
+## Workflows incluídos
+
+### 1. CI
+
+Arquivo: `.github/workflows/ci.yml`
+
+Executa:
+
+- checkout do código
+- setup do Python 3.10 e 3.11
+- instalação do projeto com dependências de desenvolvimento
+- execução do `pytest`
+- execução de `python -m build`
+
+Esse workflow roda em:
+
+- `push` para `main` e `master`
+- `pull_request`
+
+### 2. Release
+
+Arquivo: `.github/workflows/release.yml`
+
+Executa:
+
+- checkout do código
+- setup do Python 3.10
+- instalação de dependências de build
+- geração de `sdist` e `wheel` em `dist/`
+- upload dos artefatos da pasta `dist/` para a release do GitHub
+
+Esse workflow roda em:
+
+- `release.published`
+
+Além disso, ele ignora `pre-release`.
+
+## Como criar uma release estável
+
+1. Atualize a versão em `pyproject.toml`
+2. Faça commit das alterações
+3. Envie para o GitHub
+4. Crie uma tag, por exemplo `v1.0.0`
+5. Publique uma release manual estável no GitHub usando essa tag
+
+Quando a release for publicada, o workflow `release.yml` será executado e anexará os arquivos gerados em `dist/`.
+
+## O que este repositório não faz
+
+Este projeto **não publica em GitHub Packages como índice Python para `pip`**, porque esse tipo de registry não é suportado para pacotes Python.
+
+Para este caso, as opções documentadas são:
+
+- instalar direto do GitHub com `pip install "git+https://..."`
+- baixar o `.whl` da release e instalar com `pip install arquivo.whl`
+
+## Instalação da CLI a partir do GitHub
+
+### Pela branch padrão
+
+```bash
+pip install "git+https://github.com/DanielArndt0/pydf.git"
+```
+
+### Por tag ou release
+
+```bash
+pip install "git+https://github.com/DanielArndt0/pydf.git@v1.0.0"
+```
+
+## Validação local antes de subir
+
+```bash
+pip install -e .[dev]
+pytest -v
+python -m build
+```
diff --git a/docs/CLI.en.md b/docs/CLI.en.md
new file mode 100644
index 0000000..7f1a85e
--- /dev/null
+++ b/docs/CLI.en.md
@@ -0,0 +1,101 @@
+# CLI
+
+[Versão em Português do Brasil](CLI.pt-BR.md)
+
+## What it is
+
+The CLI exposes the library through the `pydf` terminal command.
+
+Use it when you want to process PDFs without writing Python code.
+
+## Installation for CLI usage
+
+### Local development mode
+
+```bash
+pip install -e .
+```
+
+This registers the `pydf` command in the current Python environment.
+
+### Package build
+
+```bash
+python -m build
+```
+
+## How to run it
+
+### General help
+
+```bash
+pydf --help
+```
+
+### Minimum example
+
+```bash
+pydf examples/pdf_invoices
+```
+
+### Explicit Excel output path
+
+```bash
+pydf examples/pdf_invoices --output output/invoices.xlsx
+```
+
+### Recursive search
+
+```bash
+pydf examples --recursive
+```
+
+### Custom regex
+
+```bash
+pydf invoices --invoice-number-pattern "INVOICE #(\d+)" --invoice-date-pattern "(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})"
+```
+
+### MySQL persistence
+
+```bash
+pydf examples/pdf_invoices --persist-to-database --db-host localhost --db-user root --db-password "" --db-name process_invoices --db-table invoice_records
+```
+
+## Flags and arguments
+
+### Positional argument
+
+- `input_dir`: directory containing PDFs. Defaults to `pdf_invoices`.
+
+### Main flags
+
+- `--output`: output `.xlsx` path.
+- `--invoice-number-pattern`: invoice number regex.
+- `--invoice-date-pattern`: invoice date regex.
+- `--recursive`: search subdirectories.
+- `--persist-to-database`: enable MySQL persistence.
+- `--db-host`: MySQL host.
+- `--db-user`: MySQL user.
+- `--db-password`: MySQL password.
+- `--db-name`: database name.
+- `--db-table`: table name.
+- `--version`: show CLI version.
+- `--help`: show help.
+
+## Expected output
+
+At the end of the execution, the CLI prints:
+
+- number of processed files;
+- number of successes;
+- number of errors;
+- final generated Excel path.
+
+## When to use the CLI instead of the API
+
+Use the CLI when:
+
+- you only need to run a quick batch job;
+- you want to automate execution in shell scripts or CI;
+- you do not need to integrate the result into another Python application.
diff --git a/docs/CLI.pt-BR.md b/docs/CLI.pt-BR.md
new file mode 100644
index 0000000..1232f8a
--- /dev/null
+++ b/docs/CLI.pt-BR.md
@@ -0,0 +1,101 @@
+# CLI (Português do Brasil)
+
+[English version](CLI.en.md)
+
+## O que é
+
+A CLI expõe a biblioteca no terminal por meio do comando `pydf`.
+
+Ela serve para quem quer processar PDFs sem escrever código Python.
+
+## Como instalar para usar a CLI
+
+### Modo local de desenvolvimento
+
+```bash
+pip install -e .
+```
+
+Isso registra o comando `pydf` no ambiente Python atual.
+
+### Build de pacote
+
+```bash
+python -m build
+```
+
+## Como executar
+
+### Ajuda geral
+
+```bash
+pydf --help
+```
+
+### Exemplo mínimo
+
+```bash
+pydf examples/pdf_invoices
+```
+
+### Definindo o Excel de saída
+
+```bash
+pydf examples/pdf_invoices --output output/invoices.xlsx
+```
+
+### Busca recursiva
+
+```bash
+pydf examples --recursive
+```
+
+### Com regex customizada
+
+```bash
+pydf invoices --invoice-number-pattern "INVOICE #(\d+)" --invoice-date-pattern "(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})"
+```
+
+### Com MySQL
+
+```bash
+pydf examples/pdf_invoices --persist-to-database --db-host localhost --db-user root --db-password "" --db-name process_invoices --db-table invoice_records
+```
+
+## Flags e argumentos
+
+### Argumento posicional
+
+- `input_dir`: pasta com PDFs. Se omitido, usa `pdf_invoices`.
+
+### Flags principais
+
+- `--output`: caminho do arquivo `.xlsx`.
+- `--invoice-number-pattern`: regex do número da fatura.
+- `--invoice-date-pattern`: regex da data da fatura.
+- `--recursive`: busca em subpastas.
+- `--persist-to-database`: ativa persistência em MySQL.
+- `--db-host`: host do MySQL.
+- `--db-user`: usuário do MySQL.
+- `--db-password`: senha do MySQL.
+- `--db-name`: nome do banco.
+- `--db-table`: nome da tabela.
+- `--version`: mostra a versão da CLI.
+- `--help`: mostra a ajuda.
+
+## Saída esperada
+
+Ao final da execução, a CLI mostra:
+
+- quantidade de arquivos processados;
+- quantidade de sucessos;
+- quantidade de erros;
+- caminho final do Excel gerado.
+
+## Quando usar CLI em vez da API
+
+Use a CLI quando:
+
+- você só quer rodar um lote rapidamente;
+- vai automatizar isso em scripts `.bat`, shell script ou CI;
+- não precisa integrar o resultado em outra aplicação Python.
diff --git a/docs/ENVIRONMENT.en.md b/docs/ENVIRONMENT.en.md
new file mode 100644
index 0000000..88322c8
--- /dev/null
+++ b/docs/ENVIRONMENT.en.md
@@ -0,0 +1,28 @@
+# Python environment, venv, and troubleshooting
+
+[Versão em Português (Brasil)](ENVIRONMENT.pt-BR.md)
+
+## Minimum Python version
+
+This project requires **Python 3.10 or higher**.
+
+## Windows: list installed Python versions
+
+```powershell
+py -0p
+```
+
+## Create a Python 3.10 virtual environment
+
+```powershell
+py -3.10 -m venv .venv
+.venv\Scripts\Activate.ps1
+python -m pip install --upgrade pip
+python -m pip install -e .[dev]
+```
+
+## Run tests
+
+```powershell
+python -m pytest -v
+```
diff --git a/docs/ENVIRONMENT.pt-BR.md b/docs/ENVIRONMENT.pt-BR.md
new file mode 100644
index 0000000..9b81956
--- /dev/null
+++ b/docs/ENVIRONMENT.pt-BR.md
@@ -0,0 +1,88 @@
+# Ambiente Python, venv e troubleshooting
+
+[English version](ENVIRONMENT.en.md)
+
+## Versão mínima do Python
+
+Este projeto requer **Python 3.10 ou superior**.
+
+Se você tentar instalar com uma versão mais antiga, poderá ver algo como:
+
+```text
+ERROR: Package 'pydf' requires a different Python: 3.8.5 not in '>=3.10'
+```
+
+## Como verificar as versões instaladas no Windows
+
+```powershell
+py -0p
+```
+
+## Como criar um ambiente virtual com Python 3.10
+
+Na raiz do projeto:
+
+```powershell
+py -3.10 -m venv .venv
+.venv\Scripts\Activate.ps1
+python -m pip install --upgrade pip
+python -m pip install -e .[dev]
+```
+
+## Como confirmar a versão ativa
+
+```powershell
+python --version
+```
+
+## Como rodar os testes
+
+```powershell
+python -m pytest -v
+```
+
+## Como testar se o pacote foi instalado
+
+```powershell
+python -c "import pydf; print('ok')"
+```
+
+## Se o comando da CLI não for reconhecido
+
+Use o modo por módulo:
+
+```powershell
+python -m pydf.cli --help
+```
+
+## Problemas comuns
+
+### 1. `No module named 'pydf'`
+
+O pacote ainda não foi instalado no ambiente atual.
+
+Resolva com:
+
+```powershell
+python -m pip install -e .
+```
+
+### 2. `requires a different Python`
+
+Você está usando uma versão abaixo do mínimo exigido.
+
+Crie o venv com Python 3.10 ou superior.
+
+### 3. `pytest` não reconhecido
+
+Use:
+
+```powershell
+python -m pytest -v
+```
+
+ou instale as dependências de desenvolvimento:
+
+```powershell
+python -m pip install -e .[dev]
+```
diff --git a/docs/README.en.md b/docs/README.en.md
new file mode 100644
index 0000000..2ff1dff
--- /dev/null
+++ b/docs/README.en.md
@@ -0,0 +1,19 @@
+# English Documentation
+
+[Versão em Português (Brasil)](README.pt-BR.md)
+
+## Overview
+
+`pydf` is a Python library created from the original project while preserving its core idea: read invoice PDFs, extract invoice number and date, generate an Excel spreadsheet, and optionally persist the data to MySQL.
+
+This version also includes a CLI, commented examples, and recommended GitHub Actions workflows.
+
+## Start here
+
+- [Main README](../README.md)
+- [CLI guide](CLI.en.md)
+- [Public API guide](API.en.md)
+- [Architecture](ARCHITECTURE.md)
+- [CI/CD and Releases](CI-CD.en.md)
+- [Python environment, venv, and troubleshooting](ENVIRONMENT.en.md)
+- [Examples](../examples/README.md)
diff --git a/docs/README.pt-BR.md b/docs/README.pt-BR.md
new file mode 100644
index 0000000..7127c82
--- /dev/null
+++ b/docs/README.pt-BR.md
@@ -0,0 +1,53 @@
+# Documentação em Português (Brasil)
+
+[English version](README.en.md)
+
+## Visão geral
+
+A `pydf` é uma biblioteca Python criada a partir do projeto original, mantendo a mesma ideia principal: ler PDFs de faturas, extrair número e data da fatura, gerar uma planilha Excel e, opcionalmente, persistir os dados em MySQL.
+
+Esta versão também inclui uma CLI, exemplos comentados e workflows recomendados para GitHub Actions.
+
+## Comece por aqui
+
+- [README principal](../README.md)
+- [Guia da CLI](CLI.pt-BR.md)
+- [Guia da API pública](API.pt-BR.md)
+- [Arquitetura](ARCHITECTURE.pt-BR.md)
+- [CI/CD e Releases](CI-CD.pt-BR.md)
+- [Ambiente Python, venv e troubleshooting](ENVIRONMENT.pt-BR.md)
+- [Exemplos](../examples/README.md)
+
+## Instalação
+
+```bash
+pip install -e .
+```
+
+Com dependências de desenvolvimento:
+
+```bash
+pip install -e ".[dev]"
+```
+
+## Formas de uso
+
+### 1. Via CLI
+
+```bash
+pydf examples/pdf_invoices --output output/invoices.xlsx
+```
+
+### 2. Via Python
+
+```python
+from pydf import InvoiceProcessor, ProcessorConfig
+
+config = ProcessorConfig(
+    input_dir="examples/pdf_invoices",
+    output_excel="output/invoices.xlsx",
+)
+
+result = InvoiceProcessor(config).process()
+print(result.output_excel)
+```
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..c75341f
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,23 @@
+# Examples
+
+This folder contains small, commented examples showing the most common ways to use the library.
+
+## Files
+
+- `basic_usage.py`: minimal API usage
+- `custom_regex_usage.py`: custom regex patterns for different PDF layouts
+- `mysql_usage.py`: optional MySQL persistence
+- `recursive_usage.py`: recursive PDF discovery in nested folders
+
+## Running the examples
+
+From the project root:
+
+```bash
+pip install -e ".[dev]"
+python examples/basic_usage.py
+python examples/custom_regex_usage.py
+python examples/recursive_usage.py
+```
+
+The MySQL example requires a running MySQL server and a pre-created table.
diff --git a/examples/basic_usage.py b/examples/basic_usage.py
new file mode 100644
index 0000000..4237322
--- /dev/null
+++ b/examples/basic_usage.py
@@ -0,0 +1,21 @@
+"""Basic example for the library API.
+
+Run with:
+    python examples/basic_usage.py
+"""
+
+from pydf import InvoiceProcessor, ProcessorConfig
+
+# The default regex patterns already match the sample PDFs.
+config = ProcessorConfig(
+    input_dir="examples/pdf_invoices",
+    output_excel="examples/output/basic_usage.xlsx",
+)
+
+result = InvoiceProcessor(config).process()
+
+print(f"Processed files: {len(result.records)}")
+print(f"Output spreadsheet: {result.output_excel}")
+
+for record in result.records:
+    print(record)
diff --git a/examples/custom_regex_usage.py b/examples/custom_regex_usage.py
new file mode 100644
index 0000000..1e2501d
--- /dev/null
+++ b/examples/custom_regex_usage.py
@@ -0,0 +1,17 @@
+"""Example showing how to customize regex patterns.
+
+Use this when your PDFs have a different wording than the sample invoices.
+"""
+
+from pydf import InvoiceProcessor, ProcessorConfig
+
+config = ProcessorConfig(
+    input_dir="examples/pdf_invoices",
+    output_excel="examples/output/custom_regex_usage.xlsx",
+    # These are examples only. Adjust them to your own PDF layout.
+    invoice_number_pattern=r"INVOICE #(\d+)",
+    invoice_date_pattern=r"(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})",
+)
+
+result = InvoiceProcessor(config).process()
+print(result.output_excel)
diff --git a/examples/mysql_usage.py b/examples/mysql_usage.py
new file mode 100644
index 0000000..ece29c2
--- /dev/null
+++ b/examples/mysql_usage.py
@@ -0,0 +1,25 @@
+"""Example showing how to enable MySQL persistence.
+
+Before running this example, create the database/table and review your credentials.
+"""
+
+from pydf import DatabaseConfig, InvoiceProcessor, ProcessorConfig
+
+# Update these credentials before using against a real database.
+db_config = DatabaseConfig(
+    host="localhost",
+    user="root",
+    password="",
+    database="process_invoices",
+    table="invoice_records",
+)
+
+config = ProcessorConfig(
+    input_dir="examples/pdf_invoices",
+    output_excel="examples/output/mysql_usage.xlsx",
+    persist_to_database=True,
+    database=db_config,
+)
+
+result = InvoiceProcessor(config).process()
+print(result.output_excel)
diff --git a/pdf_invoices/Invoice1.pdf b/examples/pdf_invoices/Invoice1.pdf
similarity index 100%
rename from pdf_invoices/Invoice1.pdf
rename to examples/pdf_invoices/Invoice1.pdf
diff --git a/pdf_invoices/Invoice2.pdf b/examples/pdf_invoices/Invoice2.pdf
similarity index 100%
rename from pdf_invoices/Invoice2.pdf
rename to examples/pdf_invoices/Invoice2.pdf
diff --git a/pdf_invoices/Invoice3.pdf b/examples/pdf_invoices/Invoice3.pdf
similarity index 100%
rename from pdf_invoices/Invoice3.pdf
rename to examples/pdf_invoices/Invoice3.pdf
diff --git a/pdf_invoices/Invoice4.pdf b/examples/pdf_invoices/Invoice4.pdf
similarity index 100%
rename from pdf_invoices/Invoice4.pdf
rename to examples/pdf_invoices/Invoice4.pdf
diff --git a/examples/recursive_usage.py b/examples/recursive_usage.py
new file mode 100644
index 0000000..0aefef8
--- /dev/null
+++ b/examples/recursive_usage.py
@@ -0,0 +1,12 @@
+"""Example showing recursive file discovery."""
+
+from pydf import InvoiceProcessor, ProcessorConfig
+
+config = ProcessorConfig(
+    input_dir="examples",
+    output_excel="examples/output/recursive_usage.xlsx",
+    recursive=True,
+)
+
+result = InvoiceProcessor(config).process()
+print(f"Files found: {len(result.records)}")
diff --git a/invoices.py b/invoices.py
deleted file mode 100644
index a5ab79a..0000000
--- a/invoices.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import os
-from openpyxl import Workbook
-import pdfplumber
-import re
-from datetime import datetime
-import mysql.connector
-
-def execute_insert(cursor, invoice_number, invoice_date, file_name, status):
-    sql = "INSERT INTO invoice_records (invoice_number, invoice_date, file_name,status) VALUES (%s, %s, %s, %s)"
-    val = (invoice_number, invoice_date, file_name,status)
-    cursor.execute(sql, val)
-
-def main():
-    # STARTUP
-
-    # Database Connection
-    db = mysql.connector.connect(
-        host="localhost",
-        user="root",
-        password="",
-        database="process_invoices"
-    )
-    cursor = db.cursor()
-    print("--- Successfully connected to database... ---")
-
-    # Get files from directory
-    directory = 'pdf_invoices'
-    files = os.listdir(directory)
-    files_quantity = len(files)
-
-    if files_quantity == 0:
-        raise Exception("No files found in the directory")
-
-    # Create Excel file
-    wb = Workbook()
-    ws = wb.active
-    ws.title = 'Invoice Imports'
-
-    ws['A1'] = 'Invoice #'
-    ws['B1'] = 'Date'
-    ws['C1'] = 'File Name'
-    ws['D1'] = 'Status'
-
-    last_empty_line = 1
-    while ws["D" + str(last_empty_line)].value is not None:
-        last_empty_line += 1
-
-    # WORK
-    for file in files:
-        try:
-            with pdfplumber.open(directory + "/" + file) as pdf:
-                first_page = pdf.pages[0]
-                pdf_text = first_page.extract_text()
-
-            inv_number_re_pattern = r'INVOICE #(\d+)'
-            inv_date_re_pattern = r'DATE (\d{2}/\d{2}/\d{4})'
-
-            match_number = re.search(inv_number_re_pattern, pdf_text)
-            match_date = re.search(inv_date_re_pattern, pdf_text)
-
-            if match_number:
-                ws['A{}'.format(last_empty_line)] = match_number.group(1)
-            else:
-                raise Exception("Couldn't find invoice number")
-
-            if match_date:
-                ws['B{}'.format(last_empty_line)] = match_date.group(1)
-            else:
-                raise Exception("Couldn't find invoice date")
-
-            ws['C{}'.format(last_empty_line)] = file
-            ws['D{}'.format(last_empty_line)] = "Completed"
-
-            execute_insert(cursor, match_number.group(1), match_date.group(1), file, "Completed")
-            db.commit()
-
-            last_empty_line += 1
-
-        except Exception as e:
-            print(f"Error processing file: {e}")
-
-            ws['C{}'.format(last_empty_line)] = file
-            ws['D{}'.format(last_empty_line)] = "Exception: {}".format(e)
-
-            execute_insert(cursor, "N/A", "N/A", file, "Exception: {}".format(e))
-            db.commit()
-
-            last_empty_line += 1
-
-    cursor.close()
-    db.close()
-
-    full_now = str(datetime.now()).replace(":", "-")
-    dot_index = full_now.index(".")
-    now = full_now[:dot_index]
-    wb.save("Invoices - {}.xlsx".format(now))
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..503861a
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,49 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pydf"
+version = "1.0.0"
+description = "Biblioteca Python leve para extrair metadados de faturas em PDF, exportar para Excel e opcionalmente persistir em MySQL."
+readme = "README.md"
+requires-python = ">=3.10"
+license = {text = "MIT"}
+authors = [{name = "Daniel Arndt"}]
+dependencies = [
+  "openpyxl>=3.1.0",
+  "pdfplumber>=0.11.0",
+  "mysql-connector-python>=9.0.0",
+]
+keywords = ["pdf", "invoice", "excel", "mysql", "automation"]
+classifiers = [
+  "License :: OSI Approved :: MIT License",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+]
+
+[project.urls]
+Documentation = "https://github.com/DanielArndt0/pydf#readme"
+Homepage = "https://github.com/DanielArndt0/pydf"
+Repository = "https://github.com/DanielArndt0/pydf"
+Issues = "https://github.com/DanielArndt0/pydf/issues"
+
+[project.scripts]
+pydf = "pydf.cli:main"
+
+[project.optional-dependencies]
+dev = [
+  "pytest>=8.0.0",
+  "build>=1.2.0",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
diff --git a/src/pydf/__init__.py b/src/pydf/__init__.py
new file mode 100644
index 0000000..0eb75cd
--- /dev/null
+++ b/src/pydf/__init__.py
@@ -0,0 +1,21 @@
+"""Public package interface for pydf.
+
+The package keeps the original project idea intact:
+read invoice PDFs, extract a few key fields, export the result to Excel,
+and optionally persist records to MySQL.
+"""
+
+from .config import DatabaseConfig, ProcessorConfig
+from .models import InvoiceRecord, ProcessingResult
+from .parser import extract_text_from_pdf, parse_invoice
+from .processor import InvoiceProcessor
+
+__all__ = [
+    "DatabaseConfig",
+    "ProcessorConfig",
+    "InvoiceRecord",
+    "ProcessingResult",
+    "InvoiceProcessor",
+    "extract_text_from_pdf",
+    "parse_invoice",
+]
diff --git a/src/pydf/cli.py b/src/pydf/cli.py
new file mode 100644
index 0000000..3d657a3
--- /dev/null
+++ b/src/pydf/cli.py
@@ -0,0 +1,160 @@
+"""Command-line interface for pydf."""
+
+from __future__ import annotations
+
+import argparse
+
+from .config import DatabaseConfig, ProcessorConfig
+from .processor import InvoiceProcessor
+
+VERSION = "1.0.0"
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create and return the CLI argument parser."""
+    parser = argparse.ArgumentParser(
+        prog="pydf",
+        description=(
+            "Process invoice PDFs, extract invoice number and date, "
+            "export results to Excel, and optionally persist records to MySQL."
+        ),
+        epilog="""
+Examples:
+  pydf
+  pydf examples/pdf_invoices
+  pydf examples/pdf_invoices --output out/invoices.xlsx
+  pydf invoices --recursive
+  pydf invoices --invoice-number-pattern "Invoice No\\. (\\d+)"
+  pydf invoices --persist-to-database --db-host localhost --db-user root --db-name process_invoices
+""".strip(),
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+
+    parser.add_argument(
+        "input_dir",
+        nargs="?",
+        default="pdf_invoices",
+        help="Directory containing PDF files (default: %(default)s).",
+    )
+
+    parser.add_argument(
+        "--output",
+        dest="output_excel",
+        metavar="PATH",
+        help=(
+            "Path to the output .xlsx file. "
+            "If omitted, a timestamped file name will be generated."
+        ),
+    )
+
+    parser.add_argument(
+        "--invoice-number-pattern",
+        default=r"INVOICE #(\d+)",
+        metavar="REGEX",
+        help="Regex with one capture group for the invoice number.",
+    )
+
+    parser.add_argument(
+        "--invoice-date-pattern",
+        default=r"(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})",
+        metavar="REGEX",
+        help="Regex with one capture group for the invoice date.",
+    )
+
+    parser.add_argument(
+        "--recursive",
+        action="store_true",
+        help="Search for PDF files recursively in subdirectories.",
+    )
+
+    parser.add_argument(
+        "--persist-to-database",
+        action="store_true",
+        help="Persist processed records to MySQL in addition to generating the Excel file.",
+    )
+
+    parser.add_argument(
+        "--db-host",
+        default="localhost",
+        metavar="HOST",
+        help="MySQL host (default: %(default)s).",
+    )
+
+    parser.add_argument(
+        "--db-user",
+        default="root",
+        metavar="USER",
+        help="MySQL user (default: %(default)s).",
+    )
+
+    parser.add_argument(
+        "--db-password",
+        default="",
+        metavar="PASSWORD",
+        help="MySQL password (default: empty).",
+    )
+
+    parser.add_argument(
+        "--db-name",
+        default="process_invoices",
+        metavar="NAME",
+        help="MySQL database name (default: %(default)s).",
+    )
+
+    parser.add_argument(
+        "--db-table",
+        default="invoice_records",
+        metavar="TABLE",
+        help="MySQL target table name (default: %(default)s).",
+    )
+
+    parser.add_argument(
+        "--version",
+        action="version",
+        version=f"%(prog)s {VERSION}",
+    )
+
+    return parser
+
+
+def build_database_config(args: argparse.Namespace) -> DatabaseConfig | None:
+    """Translate the parsed ``--db-*`` options into a ``DatabaseConfig``.
+
+    Returns ``None`` when ``--persist-to-database`` was not requested.
+    """
+    if args.persist_to_database:
+        host, user, password = args.db_host, args.db_user, args.db_password
+        return DatabaseConfig(
+            host=host, user=user, password=password,
+            database=args.db_name, table=args.db_table,
+        )
+    return None
+
+
+def main() -> None:
+    """Run the CLI."""
+    parser = build_parser()
+    args = parser.parse_args()
+
+    config = ProcessorConfig(
+        input_dir=args.input_dir,
+        output_excel=args.output_excel,
+        invoice_number_pattern=args.invoice_number_pattern,
+        invoice_date_pattern=args.invoice_date_pattern,
+        persist_to_database=args.persist_to_database,
+        database=build_database_config(args),
+        recursive=args.recursive,
+    )
+
+    result = InvoiceProcessor(config).process()
+
+    print("\nPyDF processing completed")
+    print("-" * 28)
+    print(f"Processed files : {len(result.records)}")
+    print(f"Success         : {result.success_count}")
+    print(f"Errors          : {result.error_count}")
+    print(f"Excel output    : {result.output_excel}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/pydf/config.py b/src/pydf/config.py
new file mode 100644
index 0000000..836db41
--- /dev/null
+++ b/src/pydf/config.py
@@ -0,0 +1,50 @@
+"""Configuration models used by the public API and the CLI."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+
+@dataclass(slots=True)
+class DatabaseConfig:
+    """Connection settings for the optional MySQL persistence layer.
+
+    Attributes:
+        host: MySQL server hostname.
+        user: MySQL username.
+        password: MySQL password.
+        database: Target database name.
+        table: Target table name.
+    """
+
+    host: str = "localhost"
+    user: str = "root"
+    password: str = ""
+    database: str = "process_invoices"
+    table: str = "invoice_records"
+
+
+@dataclass(slots=True)
+class ProcessorConfig:
+    """Main configuration object used by :class:`pydf.InvoiceProcessor`.
+
+    The defaults intentionally stay close to the original project so the
+    migration from script to library remains straightforward.
+    """
+
+    input_dir: Path | str = "pdf_invoices"
+    output_excel: Optional[Path | str] = None
+    invoice_number_pattern: str = r"INVOICE #(\d+)"
+    invoice_date_pattern: str = r"(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})"
+    worksheet_name: str = "Invoice Imports"
+    status_completed: str = "Completed"
+    persist_to_database: bool = False
+    database: Optional[DatabaseConfig] = None
+    recursive: bool = False
+    supported_extensions: tuple[str, ...] = field(default_factory=lambda: (".pdf",))
+
+    def resolved_input_dir(self) -> Path:
+        """Return the absolute input directory path."""
+        return Path(self.input_dir).expanduser().resolve()
diff --git a/src/pydf/database.py b/src/pydf/database.py
new file mode 100644
index 0000000..d6881ac
--- /dev/null
+++ b/src/pydf/database.py
@@ -0,0 +1,41 @@
+"""MySQL helpers used when database persistence is enabled."""
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import Iterator
+
+import mysql.connector
+
+from .config import DatabaseConfig
+from .models import InvoiceRecord
+
+
+@contextmanager
+def mysql_connection(config: DatabaseConfig) -> Iterator[mysql.connector.MySQLConnection]:
+    """Create and automatically close a MySQL connection."""
+    connection = mysql.connector.connect(
+        host=config.host,
+        user=config.user,
+        password=config.password,
+        database=config.database,
+    )
+    try:
+        yield connection
+    finally:
+        connection.close()
+
+
+def insert_record(connection: mysql.connector.MySQLConnection, config: DatabaseConfig, record: InvoiceRecord) -> None:
+    """Write a single record to ``config.table`` and commit it immediately."""
+    # The table name is interpolated into the SQL text; the values are bound as parameters.
+    columns = "(invoice_number, invoice_date, file_name, status)"
+    statement = f"INSERT INTO {config.table} {columns} VALUES (%s, %s, %s, %s)"
+    row = (record.invoice_number, record.invoice_date, record.file_name, record.status)
+    db_cursor = connection.cursor()
+    # Always release the cursor, even when execute/commit raises.
+    try:
+        db_cursor.execute(statement, row)
+        connection.commit()
+    finally:
+        db_cursor.close()
diff --git a/src/pydf/excel.py b/src/pydf/excel.py
new file mode 100644
index 0000000..d1f8b1c
--- /dev/null
+++ b/src/pydf/excel.py
@@ -0,0 +1,38 @@
+"""Excel export utilities."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from openpyxl import Workbook
+
+from .models import InvoiceRecord
+
+
+HEADERS = ["Invoice #", "Date", "File Name", "Status"]
+
+
+def export_records_to_excel(records: list[InvoiceRecord], output_path: Path, sheet_name: str) -> Path:
+    """Write the processed records to a new Excel workbook and save it.
+
+    Args:
+        records: Records to write, one worksheet row each.
+        output_path: Where the XLSX file is saved; also the return value.
+        sheet_name: Title given to the workbook's active worksheet.
+    """
+    workbook = Workbook()
+    sheet = workbook.active
+    sheet.title = sheet_name
+
+    # Row 1 holds the column titles; data rows start at row 2.
+    for column, title in enumerate(HEADERS, start=1):
+        sheet.cell(row=1, column=column, value=title)
+    for row, record in enumerate(records, start=2):
+        values = (record.invoice_number, record.invoice_date, record.file_name, record.status)
+        for column, value in enumerate(values, start=1):
+            sheet.cell(row=row, column=column, value=value)
+
+    # Create missing parent folders so a nested output path can be saved.
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    workbook.save(output_path)
+    return output_path
diff --git a/src/pydf/legacy.py b/src/pydf/legacy.py
new file mode 100644
index 0000000..fc5dad2
--- /dev/null
+++ b/src/pydf/legacy.py
@@ -0,0 +1,22 @@
+"""Compatibility helpers for users migrating from the original script."""
+
+from __future__ import annotations
+
+from .config import DatabaseConfig, ProcessorConfig
+from .processor import InvoiceProcessor
+
+
+def run_legacy_flow() -> None:
+    """Run a configuration close to the original project behavior."""
+    config = ProcessorConfig(
+        input_dir="pdf_invoices",
+        persist_to_database=True,
+        database=DatabaseConfig(
+            host="localhost",
+            user="root",
+            password="",
+            database="process_invoices",
+            table="invoice_records",
+        ),
+    )
+    InvoiceProcessor(config).process()
diff --git a/src/pydf/models.py b/src/pydf/models.py
new file mode 100644
index 0000000..6155674
--- /dev/null
+++ b/src/pydf/models.py
@@ -0,0 +1,35 @@
+"""Lightweight data structures returned by the library."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+
+@dataclass(slots=True)
+class InvoiceRecord:
+    """Represents the outcome of parsing a single PDF file."""
+
+    invoice_number: str
+    invoice_date: str
+    file_name: str
+    status: str
+
+
+@dataclass(slots=True)
+class ProcessingResult:
+    """Aggregated result produced by :meth:`pydf.InvoiceProcessor.process`."""
+
+    records: list[InvoiceRecord] = field(default_factory=list)
+    output_excel: Optional[Path] = None
+
+    @property
+    def success_count(self) -> int:
+        """Return how many records do not carry an ``Exception: ...`` failure status."""
+        return sum(1 for record in self.records if not record.status.startswith("Exception: "))
+
+    @property
+    def error_count(self) -> int:
+        """Return how many records carry an ``Exception: ...`` failure status."""
+        return len(self.records) - self.success_count
diff --git a/src/pydf/parser.py b/src/pydf/parser.py
new file mode 100644
index 0000000..db39b13
--- /dev/null
+++ b/src/pydf/parser.py
@@ -0,0 +1,59 @@
+"""PDF parsing helpers.
+
+This module is intentionally small and focused. It reads the first page of the
+PDF, extracts text, and applies the configured regular expressions.
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+import pdfplumber
+
+from .models import InvoiceRecord
+
+
+def extract_text_from_pdf(file_path: Path) -> str:
+    """Read the text of a PDF's first page.
+
+    Args:
+        file_path: Location of the PDF to open.
+
+    Returns:
+        The first page's text, or ``""`` when the page yields no text.
+
+    Raises:
+        ValueError: If the document has no pages at all.
+    """
+    with pdfplumber.open(file_path) as document:
+        if document.pages:
+            return document.pages[0].extract_text() or ""
+        raise ValueError("PDF has no pages")
+
+
+def parse_invoice(file_path: Path, number_pattern: str, date_pattern: str, completed_status: str) -> InvoiceRecord:
+    """Build an :class:`InvoiceRecord` from the first page of one PDF.
+
+    Args:
+        file_path: PDF file to read.
+        number_pattern: Regex whose group 1 captures the invoice number.
+        date_pattern: Regex whose group 1 captures the invoice date.
+        completed_status: Value stored in ``status`` on success.
+
+    Raises:
+        ValueError: If either pattern finds no match in the page text.
+    """
+    page_text = extract_text_from_pdf(file_path)
+    # Both searches run before either result is checked.
+    number_hit, date_hit = re.search(number_pattern, page_text), re.search(date_pattern, page_text)
+
+    if number_hit is None:
+        raise ValueError("Couldn't find invoice number")
+    if date_hit is None:
+        raise ValueError("Couldn't find invoice date")
+
+    number, date = number_hit.group(1), date_hit.group(1)
+    return InvoiceRecord(
+        invoice_number=number, invoice_date=date, file_name=file_path.name, status=completed_status,
+    )
diff --git a/src/pydf/processor.py b/src/pydf/processor.py
new file mode 100644
index 0000000..4d5970a
--- /dev/null
+++ b/src/pydf/processor.py
@@ -0,0 +1,93 @@
+"""Processing pipeline orchestration."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from pathlib import Path
+from typing import Iterator
+
+from .config import ProcessorConfig
+from .database import insert_record, mysql_connection
+from .excel import export_records_to_excel
+from .models import InvoiceRecord, ProcessingResult
+from .parser import parse_invoice
+
+
+class InvoiceProcessor:
+    """High-level processing service.
+
+    Typical flow:
+        1. Discover PDF files in the configured input directory.
+        2. Parse each file with regex-based extraction.
+        3. Optionally persist the results to MySQL.
+        4. Export all records to an Excel workbook.
+    """
+
+    def __init__(self, config: ProcessorConfig | None = None):
+        self.config = config or ProcessorConfig()
+
+    def process(self) -> ProcessingResult:
+        """Run the full pipeline and return the aggregated result."""
+        files = list(self._iter_files())
+        if not files:
+            raise FileNotFoundError("No PDF files found in the input directory")
+
+        records: list[InvoiceRecord] = []
+        db_connection = None
+
+        if self.config.persist_to_database:
+            if not self.config.database:
+                raise ValueError("database config is required when persist_to_database=True")
+            db_context = mysql_connection(self.config.database)
+            db_connection = db_context.__enter__()
+        else:
+            db_context = None
+
+        try:
+            for file_path in files:
+                try:
+                    record = parse_invoice(
+                        file_path=file_path,
+                        number_pattern=self.config.invoice_number_pattern,
+                        date_pattern=self.config.invoice_date_pattern,
+                        completed_status=self.config.status_completed,
+                    )
+                except Exception as exc:
+                    # Keep the batch running even if one file fails,
+                    # which mirrors the practical behavior of the original script.
+                    record = InvoiceRecord(
+                        invoice_number="N/A",
+                        invoice_date="N/A",
+                        file_name=file_path.name,
+                        status=f"Exception: {exc}",
+                    )
+
+                records.append(record)
+
+                if db_connection is not None and self.config.database is not None:
+                    insert_record(db_connection, self.config.database, record)
+        finally:
+            if db_context is not None:
+                db_context.__exit__(None, None, None)
+
+        output_excel = Path(self.config.output_excel) if self.config.output_excel else self._default_output_name()
+        output_excel = export_records_to_excel(records, output_excel, self.config.worksheet_name)
+        return ProcessingResult(records=records, output_excel=output_excel)
+
+    def _iter_files(self) -> Iterator[Path]:
+        """Yield supported regular files (never directories) in sorted, repeatable order."""
+        input_dir = self.config.resolved_input_dir()
+        if self.config.recursive:
+            for file_path in sorted(input_dir.rglob('*')):
+                if file_path.is_file() and file_path.suffix.lower() in self.config.supported_extensions:
+                    yield file_path
+        else:
+            for file_path in sorted(input_dir.iterdir()):
+                if file_path.is_file() and file_path.suffix.lower() in self.config.supported_extensions:
+                    yield file_path
+
+    @staticmethod
+    def _default_output_name() -> Path:
+        """Build the default timestamped Excel file name."""
+        timestamp = datetime.now().strftime('%Y-%m-%d %H-%M-%S')
+        return Path(f'Invoices - {timestamp}.xlsx')
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
new file mode 100644
index 0000000..e445299
--- /dev/null
+++ b/tests/test_smoke.py
@@ -0,0 +1,14 @@
+from pathlib import Path
+
+from pydf import InvoiceProcessor, ProcessorConfig
+
+
+def test_process_sample_pdfs(tmp_path: Path):
+    """Process the bundled sample invoices and check the record count and workbook."""
+    samples = Path(__file__).resolve().parents[1] / "examples" / "pdf_invoices"
+    # Anchor the samples to the repo layout so the test passes from any working directory.
+    config = ProcessorConfig(input_dir=samples, output_excel=tmp_path / "output.xlsx")
+    result = InvoiceProcessor(config).process()
+
+    assert len(result.records) == 4
+    assert result.output_excel.exists()