From fd13c04fb2c2604a225bc290d583614e425c45bd Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 23 Mar 2026 16:47:09 -0300 Subject: [PATCH] Initial library, CLI, documentation, and CI/CD setup --- .github/workflows/ci.yml | 35 ++++ .github/workflows/publish.yml | 39 +++++ .github/workflows/release.yml | 35 ++++ .gitignore | 103 +++++++++++ LICENSE | 21 +++ README.md | 137 +++++++++++++++ docs/API.en.md | 99 +++++++++++ docs/API.pt-BR.md | 99 +++++++++++ docs/ARCHITECTURE.md | 70 ++++++++ docs/ARCHITECTURE.pt-BR.md | 70 ++++++++ docs/CI-CD.en.md | 42 +++++ docs/CI-CD.pt-BR.md | 90 ++++++++++ docs/CLI.en.md | 101 +++++++++++ docs/CLI.pt-BR.md | 101 +++++++++++ docs/ENVIRONMENT.en.md | 28 +++ docs/ENVIRONMENT.pt-BR.md | 88 ++++++++++ docs/README.en.md | 19 +++ docs/README.pt-BR.md | 53 ++++++ examples/README.md | 23 +++ examples/basic_usage.py | 21 +++ examples/custom_regex_usage.py | 17 ++ examples/mysql_usage.py | 25 +++ .../pdf_invoices}/Invoice1.pdf | Bin .../pdf_invoices}/Invoice2.pdf | Bin .../pdf_invoices}/Invoice3.pdf | Bin .../pdf_invoices}/Invoice4.pdf | Bin examples/recursive_usage.py | 12 ++ invoices.py | 99 ----------- pyproject.toml | 49 ++++++ src/pydf/__init__.py | 21 +++ src/pydf/cli.py | 160 ++++++++++++++++++ src/pydf/config.py | 50 ++++++ src/pydf/database.py | 41 +++++ src/pydf/excel.py | 38 +++++ src/pydf/legacy.py | 22 +++ src/pydf/models.py | 35 ++++ src/pydf/parser.py | 59 +++++++ src/pydf/processor.py | 93 ++++++++++ tests/test_smoke.py | 14 ++ 39 files changed, 1910 insertions(+), 99 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/publish.yml create mode 100644 .github/workflows/release.yml create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 docs/API.en.md create mode 100644 docs/API.pt-BR.md create mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/ARCHITECTURE.pt-BR.md create mode 100644 docs/CI-CD.en.md create mode 100644 
docs/CI-CD.pt-BR.md create mode 100644 docs/CLI.en.md create mode 100644 docs/CLI.pt-BR.md create mode 100644 docs/ENVIRONMENT.en.md create mode 100644 docs/ENVIRONMENT.pt-BR.md create mode 100644 docs/README.en.md create mode 100644 docs/README.pt-BR.md create mode 100644 examples/README.md create mode 100644 examples/basic_usage.py create mode 100644 examples/custom_regex_usage.py create mode 100644 examples/mysql_usage.py rename {pdf_invoices => examples/pdf_invoices}/Invoice1.pdf (100%) rename {pdf_invoices => examples/pdf_invoices}/Invoice2.pdf (100%) rename {pdf_invoices => examples/pdf_invoices}/Invoice3.pdf (100%) rename {pdf_invoices => examples/pdf_invoices}/Invoice4.pdf (100%) create mode 100644 examples/recursive_usage.py delete mode 100644 invoices.py create mode 100644 pyproject.toml create mode 100644 src/pydf/__init__.py create mode 100644 src/pydf/cli.py create mode 100644 src/pydf/config.py create mode 100644 src/pydf/database.py create mode 100644 src/pydf/excel.py create mode 100644 src/pydf/legacy.py create mode 100644 src/pydf/models.py create mode 100644 src/pydf/parser.py create mode 100644 src/pydf/processor.py create mode 100644 tests/test_smoke.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0682911 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,35 @@ +name: CI + +on: + push: + branches: [main, master] + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11"] + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e .[dev] + + - name: Run tests + run: python -m pytest -v + + - name: Build package + run: python -m build diff --git a/.github/workflows/publish.yml 
b/.github/workflows/publish.yml new file mode 100644 index 0000000..6e02fb9 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,39 @@ +name: Publish Python Package + +on: + release: + types: [published] + +jobs: + publish: + if: ${{ !github.event.release.prerelease }} + runs-on: ubuntu-latest + + permissions: + contents: write + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install build tools + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build distributions + run: python -m build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + - name: Upload dist files to GitHub Release + uses: softprops/action-gh-release@v2 + with: + files: dist/* \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..155a533 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,35 @@ +name: Release Build + +on: + release: + types: [published] + +jobs: + build-release-artifacts: + if: ${{ !github.event.release.prerelease }} + runs-on: ubuntu-latest + + permissions: + contents: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install build dependencies + run: | + python -m pip install --upgrade pip + python -m pip install build + + - name: Build distributions + run: python -m build + + - name: Upload artifacts to GitHub Release + uses: softprops/action-gh-release@v2 + with: + files: dist/* diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4a7d244 --- /dev/null +++ b/.gitignore @@ -0,0 +1,103 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +dist/ +downloads/ +eggs/ 
+.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +*.egg +MANIFEST + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage / pytest +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache/ +.pytest_cache/ +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ + +# Type check / lint caches +.mypy_cache/ +.ruff_cache/ +.pyre/ +.dmypy.json +dmypy.json + +# Virtual environments +.venv/ +venv/ +env/ +ENV/ + +# Jupyter Notebook +.ipynb_checkpoints/ + +# IDEs / editors +.vscode/ +.idea/ + +# OS files +.DS_Store +Thumbs.db + +# Local environment files +.env +.env.* +*.local + +# Logs +*.log + +# Temporary files +tmp/ +temp/ +*.tmp + +# Project generated files +output/ +temp_uploads/ +generated/ +reports/ + +# Excel / export artifacts +*.xlsx + +# Database / local data +*.db +*.sqlite3 + +# Python build metadata +.pybuild/ + +# Packaging tools +pip-wheel-metadata/ + +# PyInstaller +*.manifest +*.spec \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3769d0a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Daniel Arndt + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..131b720 --- /dev/null +++ b/README.md @@ -0,0 +1,137 @@ +# Pydf + +[Documentação PT-BR](docs/README.pt-BR.md) | [English docs](docs/README.en.md) + +A `pydf` é uma biblioteca Python leve para leitura de PDFs de faturas, extração de metadados com regex, exportação para Excel e persistência opcional em MySQL. + +Esta versão reorganiza o projeto original como biblioteca e CLI, sem fugir da ideia central do script: **PDF -> extração -> Excel -> MySQL opcional**. + +## Visão rápida + +- Biblioteca Python reutilizável +- CLI simples para uso no terminal +- Regex configurável para número e data da fatura +- Exportação para `.xlsx` +- Persistência opcional em MySQL +- Documentação em PT-BR e inglês +- Workflows de CI e release para GitHub Actions + +## Requisitos + +- Python **3.10 ou superior** +- `pip` +- Recomendado: ambiente virtual (`venv`) + +## Instalação local + +Na raiz do projeto: + +```bash +pip install -e . +``` + +Instalação com dependências de desenvolvimento: + +```bash +pip install -e .[dev] +``` + +## Instalação da CLI via GitHub + +Como o GitHub não oferece um registry Python suportado para `pip` no GitHub Packages, a forma recomendada para instalar a CLI a partir do GitHub é usar o próprio repositório Git. 
+ +### Instalar da branch padrão + +```bash +pip install "git+https://github.com/DanielArndt0/pydf.git" +``` + +### Instalar de uma tag ou release específica + +```bash +pip install "git+https://github.com/DanielArndt0/pydf.git@v1.0.0" +``` + +Depois disso, a CLI fica disponível como: + +```bash +pydf --help +``` + +## Primeiros passos com venv no Windows + +Se você tiver mais de uma versão do Python instalada, confira as versões disponíveis: + +```powershell +py -0p +``` + +Crie e ative um ambiente virtual com Python 3.10: + +```powershell +py -3.10 -m venv .venv +.venv\Scripts\Activate.ps1 +python -m pip install --upgrade pip +python -m pip install -e .[dev] +``` + +## Como executar a CLI + +```bash +pydf --help +pydf examples/pdf_invoices --output output/invoices.xlsx +``` + +## Como usar como biblioteca + +```python +from pydf import InvoiceProcessor, ProcessorConfig + +config = ProcessorConfig( + input_dir="examples/pdf_invoices", + output_excel="output/invoices.xlsx", +) + +result = InvoiceProcessor(config).process() + +print(result.output_excel) +for record in result.records: + print(record.file_name, record.invoice_number, record.invoice_date, record.status) +``` + +## Rodando testes + +```bash +pytest -v +``` + +Se o ambiente ainda não estiver preparado: + +```bash +pip install -e .[dev] +pytest -v +``` + +## Build local + +```bash +python -m build +``` + +## CI e releases no GitHub + +Este repositório inclui três workflows (`ci.yml`, `release.yml` e `publish.yml`): + +- `ci.yml`: roda testes e build em todo push para `main`/`master` e em todo pull request +- `release.yml` e `publish.yml`: geram os artefatos e anexam `dist/*` a uma release estável publicada manualmente; o `publish.yml` também publica o pacote no PyPI + +Documentação detalhada: + +- [Guia principal da documentação](docs/README.pt-BR.md) +- [Guia da CLI](docs/CLI.pt-BR.md) +- [Guia da API](docs/API.pt-BR.md) +- [Arquitetura](docs/ARCHITECTURE.pt-BR.md) +- [CI/CD e Releases](docs/CI-CD.pt-BR.md) +- [Ambiente Python, venv e troubleshooting](docs/ENVIRONMENT.pt-BR.md) +- [Exemplos](examples/README.md) + diff --git a/docs/API.en.md 
b/docs/API.en.md new file mode 100644 index 0000000..e10d6ec --- /dev/null +++ b/docs/API.en.md @@ -0,0 +1,99 @@ +# Public API + +[Versão em Português do Brasil](API.pt-BR.md) + +## Overview + +The library was organized so most integrations go through a few simple entry points. + +## Main objects + +### `ProcessorConfig` + +Main configuration object. + +Most useful fields: + +- `input_dir`: directory containing PDFs. +- `output_excel`: final Excel path. +- `invoice_number_pattern`: invoice number regex. +- `invoice_date_pattern`: invoice date regex. +- `worksheet_name`: Excel sheet name. +- `status_completed`: success status text. +- `persist_to_database`: enables MySQL persistence. +- `database`: `DatabaseConfig` instance. +- `recursive`: recursive search. + +### `DatabaseConfig` + +Used only when `persist_to_database=True`. + +Fields: + +- `host` +- `user` +- `password` +- `database` +- `table` + +### `InvoiceProcessor` + +Main library class. + +Most important method: + +- `process() -> ProcessingResult` + +### `ProcessingResult` + +Consolidated processing result. + +Properties: + +- `records` +- `output_excel` +- `success_count` +- `error_count` + +### `InvoiceRecord` + +Represents one processed PDF. + +Fields: + +- `invoice_number` +- `invoice_date` +- `file_name` +- `status` + +## Useful functions + +### `extract_text_from_pdf(file_path)` + +Extracts text from the first page of the PDF. + +### `parse_invoice(file_path, number_pattern, date_pattern, completed_status)` + +Processes a single PDF and returns an `InvoiceRecord`. 
+ +## Direct API example + +```python +from pydf import InvoiceProcessor, ProcessorConfig + +config = ProcessorConfig( + input_dir="examples/pdf_invoices", + output_excel="output/api_usage.xlsx", +) + +result = InvoiceProcessor(config).process() +print(result.success_count) +``` + +## When to use the API + +Use the API when: + +- you want to integrate processing into another Python system; +- you need to inspect the `ProcessingResult` in memory; +- you want more control over customization. diff --git a/docs/API.pt-BR.md b/docs/API.pt-BR.md new file mode 100644 index 0000000..7f9af8b --- /dev/null +++ b/docs/API.pt-BR.md @@ -0,0 +1,99 @@ +# API pública (Português do Brasil) + +[English version](API.en.md) + +## Visão geral + +A biblioteca foi organizada para que a maior parte do uso passe por poucos pontos de entrada. + +## Principais objetos + +### `ProcessorConfig` + +Objeto principal de configuração. + +Campos mais úteis: + +- `input_dir`: diretório com PDFs. +- `output_excel`: caminho do Excel final. +- `invoice_number_pattern`: regex para número da fatura. +- `invoice_date_pattern`: regex para data da fatura. +- `worksheet_name`: nome da aba do Excel. +- `status_completed`: texto de sucesso. +- `persist_to_database`: habilita gravação no MySQL. +- `database`: instância de `DatabaseConfig`. +- `recursive`: busca recursiva. + +### `DatabaseConfig` + +Usado apenas se `persist_to_database=True`. + +Campos: + +- `host` +- `user` +- `password` +- `database` +- `table` + +### `InvoiceProcessor` + +Classe principal da biblioteca. + +Método mais importante: + +- `process() -> ProcessingResult` + +### `ProcessingResult` + +Resultado consolidado do processamento. + +Propriedades: + +- `records` +- `output_excel` +- `success_count` +- `error_count` + +### `InvoiceRecord` + +Representa um PDF processado. 
+ +Campos: + +- `invoice_number` +- `invoice_date` +- `file_name` +- `status` + +## Funções úteis + +### `extract_text_from_pdf(file_path)` + +Extrai o texto da primeira página do PDF. + +### `parse_invoice(file_path, number_pattern, date_pattern, completed_status)` + +Processa um único PDF e devolve um `InvoiceRecord`. + +## Exemplo direto da API + +```python +from pydf import InvoiceProcessor, ProcessorConfig + +config = ProcessorConfig( + input_dir="examples/pdf_invoices", + output_excel="output/api_usage.xlsx", +) + +result = InvoiceProcessor(config).process() +print(result.success_count) +``` + +## Quando usar a API + +Use a API quando: + +- você vai integrar o processamento em outro sistema Python; +- precisa manipular o `ProcessingResult` em memória; +- quer customizar o fluxo com mais controle. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..b32983e --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,70 @@ +# Architecture overview + +[Português do Brasil](ARCHITECTURE.pt-BR.md) + +## Goal + +The project was turned into a library without abandoning the original flow. The architecture keeps the implementation small and easy to understand. + +## Layers + +### 1. Configuration layer + +- `config.py` +- Holds `ProcessorConfig` and `DatabaseConfig`. +- Used by both the API and the CLI. + +### 2. Parsing layer + +- `parser.py` +- Reads PDF text from the first page. +- Applies regex extraction for invoice number and date. + +### 3. Orchestration layer + +- `processor.py` +- Discovers files. +- Coordinates parsing. +- Handles per-file failures without aborting the whole batch. +- Delegates Excel export and optional database persistence. + +### 4. Output layer + +- `excel.py` +- Converts the records into an `.xlsx` file. + +### 5. Persistence layer + +- `database.py` +- Manages the MySQL connection and inserts records. + +### 6. Entry points + +- `cli.py`: terminal entry point. +- `__init__.py`: public package API. 
+- `legacy.py`: compatibility helper for the original project style. + +## Processing flow + +```text +CLI or Python API + | + v +ProcessorConfig / DatabaseConfig + | + v +InvoiceProcessor.process() + | + +--> discover PDF files + +--> parse each PDF + +--> optionally insert into MySQL + +--> export records to Excel + v +ProcessingResult +``` + +## Design choices + +- Keep batch processing resilient: one bad file should not stop the others. +- Keep parsing simple: first page + regex, matching the original project spirit. +- Keep extension points obvious: regex, recursion, database persistence, and public helper functions. diff --git a/docs/ARCHITECTURE.pt-BR.md b/docs/ARCHITECTURE.pt-BR.md new file mode 100644 index 0000000..3172a21 --- /dev/null +++ b/docs/ARCHITECTURE.pt-BR.md @@ -0,0 +1,70 @@ +# Visão geral da arquitetura + +[English version](ARCHITECTURE.md) + +## Objetivo + +O projeto foi transformado em biblioteca sem abandonar o fluxo original. A arquitetura procura manter a implementação pequena e fácil de entender. + +## Camadas + +### 1. Camada de configuração + +- `config.py` +- Contém `ProcessorConfig` e `DatabaseConfig`. +- É usada tanto pela API quanto pela CLI. + +### 2. Camada de parsing + +- `parser.py` +- Lê o texto da primeira página do PDF. +- Aplica regex para extrair número e data. + +### 3. Camada de orquestração + +- `processor.py` +- Descobre os arquivos. +- Coordena o parsing. +- Trata falhas por arquivo sem abortar o lote inteiro. +- Delega exportação para Excel e persistência opcional em banco. + +### 4. Camada de saída + +- `excel.py` +- Converte os registros em um arquivo `.xlsx`. + +### 5. Camada de persistência + +- `database.py` +- Gerencia conexão MySQL e inserção dos registros. + +### 6. Pontos de entrada + +- `cli.py`: entrada via terminal. +- `__init__.py`: API pública do pacote. +- `legacy.py`: helper de compatibilidade com o estilo do projeto original. 
+ +## Fluxo de processamento + +```text +CLI ou API Python + | + v +ProcessorConfig / DatabaseConfig + | + v +InvoiceProcessor.process() + | + +--> descobrir PDFs + +--> processar cada PDF + +--> opcionalmente inserir no MySQL + +--> exportar registros para Excel + v +ProcessingResult +``` + +## Escolhas de design + +- O processamento em lote continua resiliente: um arquivo ruim não derruba os demais. +- O parsing continua simples: primeira página + regex, preservando o espírito do projeto original. +- Os pontos de extensão ficam claros: regex, recursão, persistência em banco e funções públicas auxiliares. diff --git a/docs/CI-CD.en.md b/docs/CI-CD.en.md new file mode 100644 index 0000000..ca1d453 --- /dev/null +++ b/docs/CI-CD.en.md @@ -0,0 +1,42 @@ +# CI/CD and Releases + +[Versão em Português (Brasil)](CI-CD.pt-BR.md) + +## Goal + +This repository is configured for the following flow: + +- on **every push to `main`/`master`** and **every pull request**: run tests and validate the build +- on a **manually published stable release**: generate distribution artifacts and attach them to the GitHub release (`publish.yml` additionally publishes the package to PyPI) + +## Included workflows + +### 1. CI + +File: `.github/workflows/ci.yml` + +Runs: + +- checkout +- Python 3.10 and 3.11 setup +- editable install with development dependencies +- `pytest` +- `python -m build` + +### 2. Release + +File: `.github/workflows/release.yml` + +Runs: + +- checkout +- Python 3.10 setup +- build dependencies install +- `sdist` and `wheel` generation in `dist/` +- upload of `dist/*` to the GitHub release + +It runs on: + +- `release.published` + +It also skips pre-releases. 
diff --git a/docs/CI-CD.pt-BR.md b/docs/CI-CD.pt-BR.md new file mode 100644 index 0000000..b4e22bf --- /dev/null +++ b/docs/CI-CD.pt-BR.md @@ -0,0 +1,90 @@ +# CI/CD e Releases + +[English version](CI-CD.en.md) + +## Objetivo + +Este repositório foi configurado para o seguinte fluxo: + +- em **todo push** e **todo pull request**: rodar testes e validar o build +- em **release publicada manualmente**: gerar os artefatos de distribuição e anexá-los à release do GitHub + +Esse modelo evita publicação automática a cada commit e combina melhor com um projeto que será distribuído principalmente por GitHub e releases. + +## Workflows incluídos + +### 1. CI + +Arquivo: `.github/workflows/ci.yml` + +Executa: + +- checkout do código +- setup do Python 3.10 e 3.11 +- instalação do projeto com dependências de desenvolvimento +- execução do `pytest` +- execução de `python -m build` + +Esse workflow roda em: + +- `push` para `main` e `master` +- `pull_request` + +### 2. Release + +Arquivo: `.github/workflows/release.yml` + +Executa: + +- checkout do código +- setup do Python 3.10 +- instalação de dependências de build +- geração de `sdist` e `wheel` em `dist/` +- upload dos artefatos da pasta `dist/` para a release do GitHub + +Esse workflow roda em: + +- `release.published` + +Além disso, ele ignora `pre-release`. + +## Como criar uma release estável + +1. Atualize a versão em `pyproject.toml` +2. Faça commit das alterações +3. Envie para o GitHub +4. Crie uma tag, por exemplo `v1.0.0` +5. Publique uma release manual estável no GitHub usando essa tag + +Quando a release for publicada, o workflow `release.yml` será executado e anexará os arquivos gerados em `dist/`. + +## O que este repositório não faz + +Este projeto **não publica em GitHub Packages como índice Python para `pip`**, porque esse tipo de registry não é suportado para pacotes Python. 
+ +Para este caso, as opções documentadas são: + +- instalar direto do GitHub com `pip install "git+https://..."` +- baixar o `.whl` da release e instalar com `pip install arquivo.whl` + +## Instalação da CLI a partir do GitHub + +### Pela branch padrão + +```bash +pip install "git+https://github.com/DanielArndt0/pydf.git" +``` + +### Por tag ou release + +```bash +pip install "git+https://github.com/DanielArndt0/pydf.git@v1.0.0" +``` + +## Validação local antes de subir + +```bash +pip install -e .[dev] +pytest -v +python -m build +``` diff --git a/docs/CLI.en.md b/docs/CLI.en.md new file mode 100644 index 0000000..7f1a85e --- /dev/null +++ b/docs/CLI.en.md @@ -0,0 +1,101 @@ +# CLI + +[Versão em Português do Brasil](CLI.pt-BR.md) + +## What it is + +The CLI exposes the library through the `pydf` terminal command. + +Use it when you want to process PDFs without writing Python code. + +## Installation for CLI usage + +### Local development mode + +```bash +pip install -e . +``` + +This registers the `pydf` command in the current Python environment. + +### Package build + +```bash +python -m build +``` + +## How to run it + +### General help + +```bash +pydf --help +``` + +### Minimum example + +```bash +pydf examples/pdf_invoices +``` + +### Explicit Excel output path + +```bash +pydf examples/pdf_invoices --output output/invoices.xlsx +``` + +### Recursive search + +```bash +pydf examples --recursive +``` + +### Custom regex + +```bash +pydf invoices --invoice-number-pattern "INVOICE #(\d+)" --invoice-date-pattern "(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})" +``` + +### MySQL persistence + +```bash +pydf examples/pdf_invoices --persist-to-database --db-host localhost --db-user root --db-password "" --db-name process_invoices --db-table invoice_records +``` + +## Flags and arguments + +### Positional argument + +- `input_dir`: directory containing PDFs. Defaults to `pdf_invoices`. + +### Main flags + +- `--output`: output `.xlsx` path. 
+- `--invoice-number-pattern`: invoice number regex. +- `--invoice-date-pattern`: invoice date regex. +- `--recursive`: search subdirectories. +- `--persist-to-database`: enable MySQL persistence. +- `--db-host`: MySQL host. +- `--db-user`: MySQL user. +- `--db-password`: MySQL password. +- `--db-name`: database name. +- `--db-table`: table name. +- `--version`: show CLI version. +- `--help`: show help. + +## Expected output + +At the end of the execution, the CLI prints: + +- number of processed files; +- number of successes; +- number of errors; +- final generated Excel path. + +## When to use the CLI instead of the API + +Use the CLI when: + +- you only need to run a quick batch job; +- you want to automate execution in shell scripts or CI; +- you do not need to integrate the result into another Python application. diff --git a/docs/CLI.pt-BR.md b/docs/CLI.pt-BR.md new file mode 100644 index 0000000..1232f8a --- /dev/null +++ b/docs/CLI.pt-BR.md @@ -0,0 +1,101 @@ +# CLI (Português do Brasil) + +[English version](CLI.en.md) + +## O que é + +A CLI expõe a biblioteca no terminal por meio do comando `pydf`. + +Ela serve para quem quer processar PDFs sem escrever código Python. + +## Como instalar para usar a CLI + +### Modo local de desenvolvimento + +```bash +pip install -e . +``` + +Isso registra o comando `pydf` no ambiente Python atual. 
+ +### Build de pacote + +```bash +python -m build +``` + +## Como executar + +### Ajuda geral + +```bash +pydf --help +``` + +### Exemplo mínimo + +```bash +pydf examples/pdf_invoices +``` + +### Definindo o Excel de saída + +```bash +pydf examples/pdf_invoices --output output/invoices.xlsx +``` + +### Busca recursiva + +```bash +pydf examples --recursive +``` + +### Com regex customizada + +```bash +pydf invoices --invoice-number-pattern "INVOICE #(\d+)" --invoice-date-pattern "(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})" +``` + +### Com MySQL + +```bash +pydf examples/pdf_invoices --persist-to-database --db-host localhost --db-user root --db-password "" --db-name process_invoices --db-table invoice_records +``` + +## Flags e argumentos + +### Argumento posicional + +- `input_dir`: pasta com PDFs. Se omitido, usa `pdf_invoices`. + +### Flags principais + +- `--output`: caminho do arquivo `.xlsx`. +- `--invoice-number-pattern`: regex do número da fatura. +- `--invoice-date-pattern`: regex da data da fatura. +- `--recursive`: busca em subpastas. +- `--persist-to-database`: ativa persistência em MySQL. +- `--db-host`: host do MySQL. +- `--db-user`: usuário do MySQL. +- `--db-password`: senha do MySQL. +- `--db-name`: nome do banco. +- `--db-table`: nome da tabela. +- `--version`: mostra a versão da CLI. +- `--help`: mostra a ajuda. + +## Saída esperada + +Ao final da execução, a CLI mostra: + +- quantidade de arquivos processados; +- quantidade de sucessos; +- quantidade de erros; +- caminho final do Excel gerado. + +## Quando usar CLI em vez da API + +Use a CLI quando: + +- você só quer rodar um lote rapidamente; +- vai automatizar isso em scripts `.bat`, shell script ou CI; +- não precisa integrar o resultado em outra aplicação Python. 
diff --git a/docs/ENVIRONMENT.en.md b/docs/ENVIRONMENT.en.md new file mode 100644 index 0000000..88322c8 --- /dev/null +++ b/docs/ENVIRONMENT.en.md @@ -0,0 +1,28 @@ +# Python environment, venv, and troubleshooting + +[Versão em Português (Brasil)](ENVIRONMENT.pt-BR.md) + +## Minimum Python version + +This project requires **Python 3.10 or higher**. + +## Windows: list installed Python versions + +```powershell +py -0p +``` + +## Create a Python 3.10 virtual environment + +```powershell +py -3.10 -m venv .venv +.venv\Scripts\Activate.ps1 +python -m pip install --upgrade pip +python -m pip install -e .[dev] +``` + +## Run tests + +```powershell +python -m pytest -v +``` diff --git a/docs/ENVIRONMENT.pt-BR.md b/docs/ENVIRONMENT.pt-BR.md new file mode 100644 index 0000000..9b81956 --- /dev/null +++ b/docs/ENVIRONMENT.pt-BR.md @@ -0,0 +1,88 @@ +# Ambiente Python, venv e troubleshooting + +[English version](ENVIRONMENT.en.md) + +## Versão mínima do Python + +Este projeto requer **Python 3.10 ou superior**. + +Se você tentar instalar com uma versão mais antiga, poderá ver algo como: + +```text +ERROR: Package 'pydf' requires a different Python: 3.8.5 not in '>=3.10' +``` + +## Como verificar as versões instaladas no Windows + +```powershell +py -0p +``` + +## Como criar um ambiente virtual com Python 3.10 + +Na raiz do projeto: + +```powershell +py -3.10 -m venv .venv +.venv\Scripts\Activate.ps1 +python -m pip install --upgrade pip +python -m pip install -e .[dev] +``` + +## Como confirmar a versão ativa + +```powershell +python --version +``` + +## Como rodar os testes + +```powershell +python -m pytest -v +``` + +## Como testar se o pacote foi instalado + +```powershell +python -c "import pydf; print('ok')" +``` + +## Se o comando da CLI não for reconhecido + +Use o modo por módulo: + +```powershell +python -m pydf.cli --help +``` + +## Problemas comuns + +### 1. `No module named 'pydf'` + +O pacote ainda não foi instalado no ambiente atual. 
+ +Resolva com: + +```powershell +python -m pip install -e . +``` + +### 2. `requires a different Python` + +Você está usando uma versão abaixo do mínimo exigido. + +Crie o venv com Python 3.10 ou superior. + +### 3. `pytest` não reconhecido + +Use: + +```powershell +python -m pytest -v +``` + +ou instale as dependências de desenvolvimento: + +```powershell +python -m pip install -e .[dev] +``` diff --git a/docs/README.en.md b/docs/README.en.md new file mode 100644 index 0000000..2ff1dff --- /dev/null +++ b/docs/README.en.md @@ -0,0 +1,19 @@ +# English Documentation + +[Versão em Português (Brasil)](README.pt-BR.md) + +## Overview + +`pydf` is a Python library created from the original project while preserving its core idea: read invoice PDFs, extract invoice number and date, generate an Excel spreadsheet, and optionally persist the data to MySQL. + +This version also includes a CLI, commented examples, and recommended GitHub Actions workflows. + +## Start here + +- [Main README](../README.md) +- [CLI guide](CLI.en.md) +- [Public API guide](API.en.md) +- [Architecture](ARCHITECTURE.md) +- [CI/CD and Releases](CI-CD.en.md) +- [Python environment, venv, and troubleshooting](ENVIRONMENT.en.md) +- [Examples](../examples/README.md) diff --git a/docs/README.pt-BR.md b/docs/README.pt-BR.md new file mode 100644 index 0000000..7127c82 --- /dev/null +++ b/docs/README.pt-BR.md @@ -0,0 +1,53 @@ +# Documentação em Português (Brasil) + +[English version](README.en.md) + +## Visão geral + +A `pydf` é uma biblioteca Python criada a partir do projeto original, mantendo a mesma ideia principal: ler PDFs de faturas, extrair número e data da fatura, gerar uma planilha Excel e, opcionalmente, persistir os dados em MySQL. + +Esta versão também inclui uma CLI, exemplos comentados e workflows recomendados para GitHub Actions. 
+ +## Comece por aqui + +- [README principal](../README.md) +- [Guia da CLI](CLI.pt-BR.md) +- [Guia da API pública](API.pt-BR.md) +- [Arquitetura](ARCHITECTURE.pt-BR.md) +- [CI/CD e Releases](CI-CD.pt-BR.md) +- [Ambiente Python, venv e troubleshooting](ENVIRONMENT.pt-BR.md) +- [Exemplos](../examples/README.md) + +## Instalação + +```bash +pip install -e . +``` + +Com dependências de desenvolvimento: + +```bash +pip install -e .[dev] +``` + +## Formas de uso + +### 1. Via CLI + +```bash +pydf examples/pdf_invoices --output output/invoices.xlsx +``` + +### 2. Via Python + +```python +from pydf import InvoiceProcessor, ProcessorConfig + +config = ProcessorConfig( + input_dir="examples/pdf_invoices", + output_excel="output/invoices.xlsx", +) + +result = InvoiceProcessor(config).process() +print(result.output_excel) +``` diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..c75341f --- /dev/null +++ b/examples/README.md @@ -0,0 +1,23 @@ +# Examples + +This folder contains small, commented examples showing the most common ways to use the library. + +## Files + +- `basic_usage.py`: minimal API usage +- `custom_regex_usage.py`: custom regex patterns for different PDF layouts +- `mysql_usage.py`: optional MySQL persistence +- `recursive_usage.py`: recursive PDF discovery in nested folders + +## Running the examples + +From the project root: + +```bash +pip install -e .[dev] +python examples/basic_usage.py +python examples/custom_regex_usage.py +python examples/recursive_usage.py +``` + +The MySQL example requires a running MySQL server and a pre-created table. diff --git a/examples/basic_usage.py b/examples/basic_usage.py new file mode 100644 index 0000000..4237322 --- /dev/null +++ b/examples/basic_usage.py @@ -0,0 +1,21 @@ +"""Basic example for the library API. + +Run with: + python examples/basic_usage.py +""" + +from pydf import InvoiceProcessor, ProcessorConfig + +# The default regex patterns already match the sample PDFs. 
+config = ProcessorConfig( + input_dir="examples/pdf_invoices", + output_excel="examples/output/basic_usage.xlsx", +) + +result = InvoiceProcessor(config).process() + +print(f"Processed files: {len(result.records)}") +print(f"Output spreadsheet: {result.output_excel}") + +for record in result.records: + print(record) diff --git a/examples/custom_regex_usage.py b/examples/custom_regex_usage.py new file mode 100644 index 0000000..1e2501d --- /dev/null +++ b/examples/custom_regex_usage.py @@ -0,0 +1,17 @@ +"""Example showing how to customize regex patterns. + +Use this when your PDFs have a different wording than the sample invoices. +""" + +from pydf import InvoiceProcessor, ProcessorConfig + +config = ProcessorConfig( + input_dir="examples/pdf_invoices", + output_excel="examples/output/custom_regex_usage.xlsx", + # These are examples only. Adjust them to your own PDF layout. + invoice_number_pattern=r"INVOICE #(\d+)", + invoice_date_pattern=r"(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})", +) + +result = InvoiceProcessor(config).process() +print(result.output_excel) diff --git a/examples/mysql_usage.py b/examples/mysql_usage.py new file mode 100644 index 0000000..ece29c2 --- /dev/null +++ b/examples/mysql_usage.py @@ -0,0 +1,25 @@ +"""Example showing how to enable MySQL persistence. + +Before running this example, create the database/table and review your credentials. +""" + +from pydf import DatabaseConfig, InvoiceProcessor, ProcessorConfig + +# Update these credentials before using against a real database. 
+db_config = DatabaseConfig( + host="localhost", + user="root", + password="", + database="process_invoices", + table="invoice_records", +) + +config = ProcessorConfig( + input_dir="examples/pdf_invoices", + output_excel="examples/output/mysql_usage.xlsx", + persist_to_database=True, + database=db_config, +) + +result = InvoiceProcessor(config).process() +print(result.output_excel) diff --git a/pdf_invoices/Invoice1.pdf b/examples/pdf_invoices/Invoice1.pdf similarity index 100% rename from pdf_invoices/Invoice1.pdf rename to examples/pdf_invoices/Invoice1.pdf diff --git a/pdf_invoices/Invoice2.pdf b/examples/pdf_invoices/Invoice2.pdf similarity index 100% rename from pdf_invoices/Invoice2.pdf rename to examples/pdf_invoices/Invoice2.pdf diff --git a/pdf_invoices/Invoice3.pdf b/examples/pdf_invoices/Invoice3.pdf similarity index 100% rename from pdf_invoices/Invoice3.pdf rename to examples/pdf_invoices/Invoice3.pdf diff --git a/pdf_invoices/Invoice4.pdf b/examples/pdf_invoices/Invoice4.pdf similarity index 100% rename from pdf_invoices/Invoice4.pdf rename to examples/pdf_invoices/Invoice4.pdf diff --git a/examples/recursive_usage.py b/examples/recursive_usage.py new file mode 100644 index 0000000..0aefef8 --- /dev/null +++ b/examples/recursive_usage.py @@ -0,0 +1,12 @@ +"""Example showing recursive file discovery.""" + +from pydf import InvoiceProcessor, ProcessorConfig + +config = ProcessorConfig( + input_dir="examples", + output_excel="examples/output/recursive_usage.xlsx", + recursive=True, +) + +result = InvoiceProcessor(config).process() +print(f"Files found: {len(result.records)}") diff --git a/invoices.py b/invoices.py deleted file mode 100644 index a5ab79a..0000000 --- a/invoices.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -from openpyxl import Workbook -import pdfplumber -import re -from datetime import datetime -import mysql.connector - -def execute_insert(cursor, invoice_number, invoice_date, file_name, status): - sql = "INSERT INTO invoice_records 
(invoice_number, invoice_date, file_name,status) VALUES (%s, %s, %s, %s)" - val = (invoice_number, invoice_date, file_name,status) - cursor.execute(sql, val) - -def main(): - # STARTUP - - # Database Connection - db = mysql.connector.connect( - host="localhost", - user="root", - password="", - database="process_invoices" - ) - cursor = db.cursor() - print("--- Successfully connected to database... ---") - - # Get files from directory - directory = 'pdf_invoices' - files = os.listdir(directory) - files_quantity = len(files) - - if files_quantity == 0: - raise Exception("No files found in the directory") - - # Create Excel file - wb = Workbook() - ws = wb.active - ws.title = 'Invoice Imports' - - ws['A1'] = 'Invoice #' - ws['B1'] = 'Date' - ws['C1'] = 'File Name' - ws['D1'] = 'Status' - - last_empty_line = 1 - while ws["D" + str(last_empty_line)].value is not None: - last_empty_line += 1 - - # WORK - for file in files: - try: - with pdfplumber.open(directory + "/" + file) as pdf: - first_page = pdf.pages[0] - pdf_text = first_page.extract_text() - - inv_number_re_pattern = r'INVOICE #(\d+)' - inv_date_re_pattern = r'DATE (\d{2}/\d{2}/\d{4})' - - match_number = re.search(inv_number_re_pattern, pdf_text) - match_date = re.search(inv_date_re_pattern, pdf_text) - - if match_number: - ws['A{}'.format(last_empty_line)] = match_number.group(1) - else: - raise Exception("Couldn't find invoice number") - - if match_date: - ws['B{}'.format(last_empty_line)] = match_date.group(1) - else: - raise Exception("Couldn't find invoice date") - - ws['C{}'.format(last_empty_line)] = file - ws['D{}'.format(last_empty_line)] = "Completed" - - execute_insert(cursor, match_number.group(1), match_date.group(1), file, "Completed") - db.commit() - - last_empty_line += 1 - - except Exception as e: - print(f"Error processing file: {e}") - - ws['C{}'.format(last_empty_line)] = file - ws['D{}'.format(last_empty_line)] = "Exception: {}".format(e) - - execute_insert(cursor, "N/A", "N/A", file,
"Exception: {}".format(e)) - db.commit() - - last_empty_line += 1 - - cursor.close() - db.close() - - full_now = str(datetime.now()).replace(":", "-") - dot_index = full_now.index(".") - now = full_now[:dot_index] - wb.save("Invoices - {}.xlsx".format(now)) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..503861a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,49 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "pydf" +version = "1.0.0" +description = "Biblioteca Python leve para extrair metadados de faturas em PDF, exportar para Excel e opcionalmente persistir em MySQL." +readme = "README.md" +requires-python = ">=3.10" +license = {text = "MIT"} +authors = [{name = "Daniel Arndt"}] +dependencies = [ + "openpyxl>=3.1.0", + "pdfplumber>=0.11.0", + "mysql-connector-python>=9.0.0", +] +keywords = ["pdf", "invoice", "excel", "mysql", "automation"] +classifiers = [ + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] + +[project.urls] +Documentation = "https://github.com/DanielArndt0/pydf#readme" +Homepage = "https://github.com/DanielArndt0/pydf" +Repository = "https://github.com/DanielArndt0/pydf" +Issues = "https://github.com/DanielArndt0/pydf/issues" + +[project.scripts] +pydf = "pydf.cli:main" + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", + "build>=1.2.0", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] + + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] diff --git a/src/pydf/__init__.py b/src/pydf/__init__.py new file mode 100644 index 0000000..0eb75cd --- /dev/null +++ b/src/pydf/__init__.py @@ -0,0 +1,21 @@ +"""Public package interface for pydf.
+ +The package keeps the original project idea intact: +read invoice PDFs, extract a few key fields, export the result to Excel, +and optionally persist records to MySQL. +""" + +from .config import DatabaseConfig, ProcessorConfig +from .models import InvoiceRecord, ProcessingResult +from .parser import extract_text_from_pdf, parse_invoice +from .processor import InvoiceProcessor + +__all__ = [ + "DatabaseConfig", + "ProcessorConfig", + "InvoiceRecord", + "ProcessingResult", + "InvoiceProcessor", + "extract_text_from_pdf", + "parse_invoice", +] diff --git a/src/pydf/cli.py b/src/pydf/cli.py new file mode 100644 index 0000000..3d657a3 --- /dev/null +++ b/src/pydf/cli.py @@ -0,0 +1,160 @@ +from __future__ import annotations + +"""Command-line interface for pydf.""" + +import argparse + +from .config import DatabaseConfig, ProcessorConfig +from .processor import InvoiceProcessor + +VERSION = "1.0.0" + + +def build_parser() -> argparse.ArgumentParser: + """Create and return the CLI argument parser.""" + parser = argparse.ArgumentParser( + prog="pydf", + description=( + "Process invoice PDFs, extract invoice number and date, " + "export results to Excel, and optionally persist records to MySQL." + ), + epilog=""" +Examples: + pydf + pydf examples/pdf_invoices + pydf examples/pdf_invoices --output out/invoices.xlsx + pydf invoices --recursive + pydf invoices --invoice-number-pattern "Invoice No\\. (\\d+)" + pydf invoices --persist-to-database --db-host localhost --db-user root --db-name process_invoices +""".strip(), + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument( + "input_dir", + nargs="?", + default="pdf_invoices", + help="Directory containing PDF files (default: %(default)s).", + ) + + parser.add_argument( + "--output", + dest="output_excel", + metavar="PATH", + help=( + "Path to the output .xlsx file. " + "If omitted, a timestamped file name will be generated." 
+ ), + ) + + parser.add_argument( + "--invoice-number-pattern", + default=r"INVOICE #(\d+)", + metavar="REGEX", + help="Regex with one capture group for the invoice number.", + ) + + parser.add_argument( + "--invoice-date-pattern", + default=r"(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})", + metavar="REGEX", + help="Regex with one capture group for the invoice date.", + ) + + parser.add_argument( + "--recursive", + action="store_true", + help="Search for PDF files recursively in subdirectories.", + ) + + parser.add_argument( + "--persist-to-database", + action="store_true", + help="Persist processed records to MySQL in addition to generating the Excel file.", + ) + + parser.add_argument( + "--db-host", + default="localhost", + metavar="HOST", + help="MySQL host (default: %(default)s).", + ) + + parser.add_argument( + "--db-user", + default="root", + metavar="USER", + help="MySQL user (default: %(default)s).", + ) + + parser.add_argument( + "--db-password", + default="", + metavar="PASSWORD", + help="MySQL password (default: empty).", + ) + + parser.add_argument( + "--db-name", + default="process_invoices", + metavar="NAME", + help="MySQL database name (default: %(default)s).", + ) + + parser.add_argument( + "--db-table", + default="invoice_records", + metavar="TABLE", + help="MySQL target table name (default: %(default)s).", + ) + + parser.add_argument( + "--version", + action="version", + version=f"%(prog)s {VERSION}", + ) + + return parser + + +def build_database_config(args: argparse.Namespace) -> DatabaseConfig | None: + """Build database configuration if persistence is enabled.""" + if not args.persist_to_database: + return None + + return DatabaseConfig( + host=args.db_host, + user=args.db_user, + password=args.db_password, + database=args.db_name, + table=args.db_table, + ) + + +def main() -> None: + """Run the CLI.""" + parser = build_parser() + args = parser.parse_args() + + config = ProcessorConfig( + input_dir=args.input_dir, + 
output_excel=args.output_excel, + invoice_number_pattern=args.invoice_number_pattern, + invoice_date_pattern=args.invoice_date_pattern, + persist_to_database=args.persist_to_database, + database=build_database_config(args), + recursive=args.recursive, + ) + + result = InvoiceProcessor(config).process() + + print("\nPyDF processing completed") + print("-" * 28) + print(f"Processed files : {len(result.records)}") + print(f"Success : {result.success_count}") + print(f"Errors : {result.error_count}") + print(f"Excel output : {result.output_excel}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/pydf/config.py b/src/pydf/config.py new file mode 100644 index 0000000..836db41 --- /dev/null +++ b/src/pydf/config.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +"""Configuration models used by the public API and the CLI.""" + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + + +@dataclass(slots=True) +class DatabaseConfig: + """Connection settings for the optional MySQL persistence layer. + + Attributes: + host: MySQL server hostname. + user: MySQL username. + password: MySQL password. + database: Target database name. + table: Target table name. + """ + + host: str = "localhost" + user: str = "root" + password: str = "" + database: str = "process_invoices" + table: str = "invoice_records" + + +@dataclass(slots=True) +class ProcessorConfig: + """Main configuration object used by :class:`pydf.InvoiceProcessor`. + + The defaults intentionally stay close to the original project so the + migration from script to library remains straightforward. 
+ """ + + input_dir: Path | str = "pdf_invoices" + output_excel: Optional[Path | str] = None + invoice_number_pattern: str = r"INVOICE #(\d+)" + invoice_date_pattern: str = r"(?:DATE|DATE OF ISSUE):?\s*(\d{2}/\d{2}/\d{4})" + worksheet_name: str = "Invoice Imports" + status_completed: str = "Completed" + persist_to_database: bool = False + database: Optional[DatabaseConfig] = None + recursive: bool = False + supported_extensions: tuple[str, ...] = field(default_factory=lambda: (".pdf",)) + + def resolved_input_dir(self) -> Path: + """Return the absolute input directory path.""" + return Path(self.input_dir).expanduser().resolve() diff --git a/src/pydf/database.py b/src/pydf/database.py new file mode 100644 index 0000000..d6881ac --- /dev/null +++ b/src/pydf/database.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +"""MySQL helpers used when database persistence is enabled.""" + +from contextlib import contextmanager +from typing import Iterator + +import mysql.connector + +from .config import DatabaseConfig +from .models import InvoiceRecord + + +@contextmanager +def mysql_connection(config: DatabaseConfig) -> Iterator[mysql.connector.MySQLConnection]: + """Create and automatically close a MySQL connection.""" + connection = mysql.connector.connect( + host=config.host, + user=config.user, + password=config.password, + database=config.database, + ) + try: + yield connection + finally: + connection.close() + + +def insert_record(connection: mysql.connector.MySQLConnection, config: DatabaseConfig, record: InvoiceRecord) -> None: + """Insert one processed record into the configured MySQL table.""" + sql = ( + f"INSERT INTO {config.table} " + "(invoice_number, invoice_date, file_name, status) VALUES (%s, %s, %s, %s)" + ) + values = (record.invoice_number, record.invoice_date, record.file_name, record.status) + cursor = connection.cursor() + try: + cursor.execute(sql, values) + connection.commit() + finally: + cursor.close() diff --git a/src/pydf/excel.py 
b/src/pydf/excel.py new file mode 100644 index 0000000..d1f8b1c --- /dev/null +++ b/src/pydf/excel.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +"""Excel export utilities.""" + +from pathlib import Path + +from openpyxl import Workbook + +from .models import InvoiceRecord + + +HEADERS = ["Invoice #", "Date", "File Name", "Status"] + + +def export_records_to_excel(records: list[InvoiceRecord], output_path: Path, sheet_name: str) -> Path: + """Export processed invoice records to an Excel workbook. + + Args: + records: Records produced by the processing pipeline. + output_path: Destination XLSX path. + sheet_name: Worksheet title to use inside the workbook. + """ + wb = Workbook() + ws = wb.active + ws.title = sheet_name + + for index, header in enumerate(HEADERS, start=1): + ws.cell(row=1, column=index, value=header) + + for row_index, record in enumerate(records, start=2): + ws.cell(row=row_index, column=1, value=record.invoice_number) + ws.cell(row=row_index, column=2, value=record.invoice_date) + ws.cell(row=row_index, column=3, value=record.file_name) + ws.cell(row=row_index, column=4, value=record.status) + + output_path.parent.mkdir(parents=True, exist_ok=True) + wb.save(output_path) + return output_path diff --git a/src/pydf/legacy.py b/src/pydf/legacy.py new file mode 100644 index 0000000..fc5dad2 --- /dev/null +++ b/src/pydf/legacy.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +"""Compatibility helpers for users migrating from the original script.""" + +from .config import DatabaseConfig, ProcessorConfig +from .processor import InvoiceProcessor + + +def run_legacy_flow() -> None: + """Run a configuration close to the original project behavior.""" + config = ProcessorConfig( + input_dir="pdf_invoices", + persist_to_database=True, + database=DatabaseConfig( + host="localhost", + user="root", + password="", + database="process_invoices", + table="invoice_records", + ), + ) + InvoiceProcessor(config).process() diff --git 
a/src/pydf/models.py b/src/pydf/models.py new file mode 100644 index 0000000..6155674 --- /dev/null +++ b/src/pydf/models.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +"""Lightweight data structures returned by the library.""" + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + + +@dataclass(slots=True) +class InvoiceRecord: + """Represents the outcome of parsing a single PDF file.""" + + invoice_number: str + invoice_date: str + file_name: str + status: str + + +@dataclass(slots=True) +class ProcessingResult: + """Aggregated result produced by :meth:`pydf.InvoiceProcessor.process`.""" + + records: list[InvoiceRecord] = field(default_factory=list) + output_excel: Optional[Path] = None + + @property + def success_count(self) -> int: + """Return how many records finished with the configured success status.""" + return sum(1 for record in self.records if record.status == "Completed") + + @property + def error_count(self) -> int: + """Return how many records finished with an error-like status.""" + return len(self.records) - self.success_count diff --git a/src/pydf/parser.py b/src/pydf/parser.py new file mode 100644 index 0000000..db39b13 --- /dev/null +++ b/src/pydf/parser.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +"""PDF parsing helpers. + +This module is intentionally small and focused. It reads the first page of the +PDF, extracts text, and applies the configured regular expressions. +""" + +import re +from pathlib import Path + +import pdfplumber + +from .models import InvoiceRecord + + +def extract_text_from_pdf(file_path: Path) -> str: + """Extract text from the first page of a PDF file. + + Args: + file_path: Absolute or relative path to a PDF file. + + Returns: + The extracted text from the first page. + + Raises: + ValueError: If the PDF contains no pages. 
+ """ + with pdfplumber.open(file_path) as pdf: + if not pdf.pages: + raise ValueError("PDF has no pages") + return pdf.pages[0].extract_text() or "" + + +def parse_invoice(file_path: Path, number_pattern: str, date_pattern: str, completed_status: str) -> InvoiceRecord: + """Parse a single invoice-like PDF into an :class:`InvoiceRecord`. + + Args: + file_path: PDF file to inspect. + number_pattern: Regex with one capturing group for the invoice number. + date_pattern: Regex with one capturing group for the invoice date. + completed_status: Status text to use when extraction succeeds. + """ + pdf_text = extract_text_from_pdf(file_path) + + match_number = re.search(number_pattern, pdf_text) + match_date = re.search(date_pattern, pdf_text) + + if not match_number: + raise ValueError("Couldn't find invoice number") + if not match_date: + raise ValueError("Couldn't find invoice date") + + return InvoiceRecord( + invoice_number=match_number.group(1), + invoice_date=match_date.group(1), + file_name=file_path.name, + status=completed_status, + ) diff --git a/src/pydf/processor.py b/src/pydf/processor.py new file mode 100644 index 0000000..4d5970a --- /dev/null +++ b/src/pydf/processor.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +"""Processing pipeline orchestration.""" + +from datetime import datetime +from pathlib import Path +from typing import Iterator + +from .config import ProcessorConfig +from .database import insert_record, mysql_connection +from .excel import export_records_to_excel +from .models import InvoiceRecord, ProcessingResult +from .parser import parse_invoice + + +class InvoiceProcessor: + """High-level processing service. + + Typical flow: + 1. Discover PDF files in the configured input directory. + 2. Parse each file with regex-based extraction. + 3. Optionally persist the results to MySQL. + 4. Export all records to an Excel workbook. 
+ """ + + def __init__(self, config: ProcessorConfig | None = None): + self.config = config or ProcessorConfig() + + def process(self) -> ProcessingResult: + """Run the full pipeline and return the aggregated result.""" + files = list(self._iter_files()) + if not files: + raise FileNotFoundError("No PDF files found in the input directory") + + records: list[InvoiceRecord] = [] + db_connection = None + + if self.config.persist_to_database: + if not self.config.database: + raise ValueError("database config is required when persist_to_database=True") + db_context = mysql_connection(self.config.database) + db_connection = db_context.__enter__() + else: + db_context = None + + try: + for file_path in files: + try: + record = parse_invoice( + file_path=file_path, + number_pattern=self.config.invoice_number_pattern, + date_pattern=self.config.invoice_date_pattern, + completed_status=self.config.status_completed, + ) + except Exception as exc: + # Keep the batch running even if one file fails, + # which mirrors the practical behavior of the original script. 
+ record = InvoiceRecord( + invoice_number="N/A", + invoice_date="N/A", + file_name=file_path.name, + status=f"Exception: {exc}", + ) + + records.append(record) + + if db_connection is not None and self.config.database is not None: + insert_record(db_connection, self.config.database, record) + finally: + if db_context is not None: + db_context.__exit__(None, None, None) + + output_excel = Path(self.config.output_excel) if self.config.output_excel else self._default_output_name() + output_excel = export_records_to_excel(records, output_excel, self.config.worksheet_name) + return ProcessingResult(records=records, output_excel=output_excel) + + def _iter_files(self) -> Iterator[Path]: + """Yield supported files from the configured input directory.""" + input_dir = self.config.resolved_input_dir() + if self.config.recursive: + for file_path in input_dir.rglob('*'): + if file_path.suffix.lower() in self.config.supported_extensions: + yield file_path + else: + for file_path in input_dir.iterdir(): + if file_path.is_file() and file_path.suffix.lower() in self.config.supported_extensions: + yield file_path + + @staticmethod + def _default_output_name() -> Path: + """Build the default timestamped Excel file name.""" + timestamp = datetime.now().strftime('%Y-%m-%d %H-%M-%S') + return Path(f'Invoices - {timestamp}.xlsx') diff --git a/tests/test_smoke.py b/tests/test_smoke.py new file mode 100644 index 0000000..e445299 --- /dev/null +++ b/tests/test_smoke.py @@ -0,0 +1,14 @@ +from pathlib import Path + +from pydf import InvoiceProcessor, ProcessorConfig + + +def test_process_sample_pdfs(tmp_path: Path): + config = ProcessorConfig( + input_dir="examples/pdf_invoices", + output_excel=tmp_path / "output.xlsx", + ) + result = InvoiceProcessor(config).process() + + assert len(result.records) == 4 + assert result.output_excel.exists()