Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 97 additions & 44 deletions mssql_python/pybind/ddbc_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2914,6 +2914,10 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
// Note: wcharEncoding parameter is reserved for future use
// Currently WCHAR data always uses UTF-16LE for Windows compatibility
(void)wcharEncoding; // Suppress unused parameter warning
#if !defined(__APPLE__) && !defined(__linux__)
// On Windows, VARCHAR is fetched as SQL_C_WCHAR, so charEncoding is unused.
(void)charEncoding;
#endif

LOG("SQLGetData: Getting data from %d columns for statement_handle=%p", colCount,
(void*)StatementHandle->get());
Expand Down Expand Up @@ -2949,6 +2953,8 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
case SQL_CHAR:
case SQL_VARCHAR:
case SQL_LONGVARCHAR: {
#if defined(__APPLE__) || defined(__linux__)
// On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR.
if (columnSize == SQL_NO_TOTAL || columnSize == 0 ||
columnSize > SQL_MAX_LOB_SIZE) {
LOG("SQLGetData: Streaming LOB for column %d (SQL_C_CHAR) "
Expand All @@ -2957,34 +2963,16 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
row.append(
FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding));
} else {
// Allocate columnSize * 4 + 1 on ALL platforms (no #if guard).
//
// Why this differs from SQLBindColums / FetchBatchData:
// Those two functions use #if to apply *4 only on Linux/macOS,
// because on Windows with a non-UTF-8 collation (e.g. CP1252)
// each character occupies exactly 1 byte, so *1 suffices and
// saves memory across the entire batch (fetchSize × numCols
// buffers).
//
// SQLGetData_wrap allocates a single temporary buffer per
// column per row, so the over-allocation cost is negligible.
// Using *4 unconditionally here keeps the code simple and
// correct on every platform—including Windows with a UTF-8
// collation where multi-byte chars could otherwise cause
// truncation at the exact column boundary (e.g. CP1252 é in
// VARCHAR(10)).
// Allocate columnSize * 4 + 1 to accommodate UTF-8 expansion.
uint64_t fetchBufferSize = columnSize * 4 + 1 /* null-termination */;
std::vector<SQLCHAR> dataBuffer(fetchBufferSize);
SQLLEN dataLen;
ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size(),
&dataLen);
if (SQL_SUCCEEDED(ret)) {
// columnSize is in chars, dataLen is in bytes
if (dataLen > 0) {
uint64_t numCharsInData = dataLen / sizeof(SQLCHAR);
if (numCharsInData < dataBuffer.size()) {
// SQLGetData will null-terminate the data
// Use Python's codec system to decode bytes.
const std::string decodeEncoding =
GetEffectiveCharDecoding(charEncoding);
py::bytes raw_bytes(reinterpret_cast<char*>(dataBuffer.data()),
Expand All @@ -3001,11 +2989,9 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
LOG_ERROR(
"SQLGetData: Failed to decode CHAR column %d with '%s': %s",
i, decodeEncoding.c_str(), e.what());
// Return raw bytes as fallback
row.append(raw_bytes);
}
} else {
// Buffer too small, fallback to streaming
LOG("SQLGetData: CHAR column %d data truncated "
"(buffer_size=%zu), using streaming LOB",
i, dataBuffer.size());
Expand Down Expand Up @@ -3037,6 +3023,66 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
row.append(py::none());
}
}
#else
// On Windows, request SQL_C_WCHAR so the ODBC driver converts
// from the server's native encoding (e.g. CP1252) to UTF-16.
// This avoids the need to guess the server's code page and
// eliminates the bytes-vs-str inconsistency.
if (columnSize == SQL_NO_TOTAL || columnSize == 0 ||
columnSize > SQL_MAX_LOB_SIZE) {
LOG("SQLGetData: Streaming LOB for column %d (VARCHAR as SQL_C_WCHAR) "
"- columnSize=%lu",
i, (unsigned long)columnSize);
row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false, "utf-16le"));
} else {
uint64_t fetchBufferSize =
(columnSize + 1) * sizeof(SQLWCHAR); // +1 for null terminator
std::vector<SQLWCHAR> dataBuffer(columnSize + 1);
SQLLEN dataLen;
ret = SQLGetData_ptr(hStmt, i, SQL_C_WCHAR, dataBuffer.data(), fetchBufferSize,
&dataLen);
if (SQL_SUCCEEDED(ret)) {
if (dataLen > 0) {
uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR);
if (numCharsInData < dataBuffer.size()) {
std::wstring wstr(reinterpret_cast<wchar_t*>(dataBuffer.data()));
row.append(py::cast(wstr));
LOG("SQLGetData: VARCHAR column %d decoded via SQL_C_WCHAR, "
"length=%lu",
i, (unsigned long)numCharsInData);
} else {
LOG("SQLGetData: VARCHAR column %d data truncated "
"(as WCHAR), using streaming LOB",
i);
row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false,
"utf-16le"));
}
} else if (dataLen == SQL_NULL_DATA) {
LOG("SQLGetData: Column %d is NULL (VARCHAR via WCHAR)", i);
row.append(py::none());
} else if (dataLen == 0) {
row.append(py::str(""));
} else if (dataLen == SQL_NO_TOTAL) {
LOG("SQLGetData: Cannot determine data length "
"(SQL_NO_TOTAL) for column %d (VARCHAR via WCHAR), "
"returning NULL",
i);
row.append(py::none());
Comment on lines +3068 to +3070
} else if (dataLen < 0) {
LOG("SQLGetData: Unexpected negative data length "
"for column %d (VARCHAR via WCHAR) - dataLen=%ld",
i, (long)dataLen);
ThrowStdException("SQLGetData returned an unexpected negative "
"data length");
}
} else {
LOG("SQLGetData: Error retrieving data for column %d "
"(VARCHAR via WCHAR) - SQLRETURN=%d, returning NULL",
i, ret);
row.append(py::none());
}
}
#endif
break;
}
case SQL_SS_XML: {
Expand Down Expand Up @@ -3487,29 +3533,26 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
// TODO: handle variable length data correctly. This logic wont
// suffice
HandleZeroColumnSizeAtFetch(columnSize);
// Use columnSize * 4 + 1 on Linux/macOS to accommodate UTF-8
// expansion. The ODBC driver returns UTF-8 for SQL_C_CHAR where
// each character can be up to 4 bytes.
#if defined(__APPLE__) || defined(__linux__)
// On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR
// where each character can be up to 4 bytes.
uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/;
#else
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
#endif
// TODO: For LONGVARCHAR/BINARY types, columnSize is returned as
// 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB.
// fetchSize=1 if columnSize>1GB. So we'll allocate a vector of
// size 2GB. If a query fetches multiple (say N) LONG...
// columns, we will have allocated multiple (N) 2GB sized
// vectors. This will make driver very slow. And if the N is
// high enough, we could hit the OS limit for heap memory that
// we can allocate, & hence get a std::bad_alloc. The process
// could also be killed by OS for consuming too much memory.
// Hence this will be revisited in beta to not allocate 2GB+
// memory, & use streaming instead
buffers.charBuffers[col - 1].resize(fetchSize * fetchBufferSize);
ret = SQLBindCol_ptr(hStmt, col, SQL_C_CHAR, buffers.charBuffers[col - 1].data(),
fetchBufferSize * sizeof(SQLCHAR),
buffers.indicators[col - 1].data());
#else
// On Windows, the ODBC driver returns bytes in the server's
// native encoding (e.g., CP1252). Rather than guessing the
// code page, we request SQL_C_WCHAR so the driver performs
// the conversion to UTF-16 — exactly matching how NVARCHAR
// columns are already handled.
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
buffers.wcharBuffers[col - 1].resize(fetchSize * fetchBufferSize);
ret = SQLBindCol_ptr(hStmt, col, SQL_C_WCHAR, buffers.wcharBuffers[col - 1].data(),
fetchBufferSize * sizeof(SQLWCHAR),
buffers.indicators[col - 1].data());
#endif
break;
}
case SQL_WCHAR:
Expand Down Expand Up @@ -3675,9 +3718,9 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
HandleZeroColumnSizeAtFetch(columnInfos[col].processedColumnSize);
// On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR where
// each character can be up to 4 bytes. Must match SQLBindColums buffer.
#if defined(__APPLE__) || defined(__linux__)
SQLSMALLINT dt = columnInfos[col].dataType;
bool isCharType = (dt == SQL_CHAR || dt == SQL_VARCHAR || dt == SQL_LONGVARCHAR);
#if defined(__APPLE__) || defined(__linux__)
if (isCharType) {
columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize * 4 +
1; // *4 for UTF-8, +1 for null terminator
Expand All @@ -3686,6 +3729,10 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
}
#else
// On Windows, VARCHAR columns are fetched as SQL_C_WCHAR (see
// SQLBindColums). The fetchBufferSize is in SQLWCHAR elements,
// matching the wcharBuffers layout.
(void)isCharType; // same formula for all types on Windows
columnInfos[col].fetchBufferSize =
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
#endif
Expand Down Expand Up @@ -3740,7 +3787,14 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
case SQL_CHAR:
case SQL_VARCHAR:
case SQL_LONGVARCHAR:
#if defined(__APPLE__) || defined(__linux__)
columnProcessors[col] = ColumnProcessors::ProcessChar;
#else
// On Windows, VARCHAR columns are fetched as SQL_C_WCHAR
// (the driver converts from the server's native encoding to
// UTF-16), so we reuse the NVARCHAR processor.
columnProcessors[col] = ColumnProcessors::ProcessWChar;
#endif
break;
case SQL_WCHAR:
case SQL_WVARCHAR:
Expand Down Expand Up @@ -4048,7 +4102,8 @@ size_t calculateRowSize(py::list& columnNames, SQLUSMALLINT numCols) {
break;
case SQL_SS_UDT:
rowSize += (static_cast<SQLLEN>(columnSize) == SQL_NO_TOTAL || columnSize == 0)
? SQL_MAX_LOB_SIZE : columnSize;
? SQL_MAX_LOB_SIZE
: columnSize;
Comment on lines 4102 to +4106
break;
case SQL_BINARY:
case SQL_VARBINARY:
Expand Down Expand Up @@ -4112,8 +4167,7 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch

if ((dataType == SQL_WVARCHAR || dataType == SQL_WLONGVARCHAR || dataType == SQL_VARCHAR ||
dataType == SQL_LONGVARCHAR || dataType == SQL_VARBINARY ||
dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML ||
dataType == SQL_SS_UDT) &&
dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML || dataType == SQL_SS_UDT) &&
(columnSize == 0 || columnSize == SQL_NO_TOTAL || columnSize > SQL_MAX_LOB_SIZE)) {
lobColumns.push_back(i + 1); // 1-based
}
Expand Down Expand Up @@ -4252,8 +4306,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows,

if ((dataType == SQL_WVARCHAR || dataType == SQL_WLONGVARCHAR || dataType == SQL_VARCHAR ||
dataType == SQL_LONGVARCHAR || dataType == SQL_VARBINARY ||
dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML ||
dataType == SQL_SS_UDT) &&
dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML || dataType == SQL_SS_UDT) &&
(columnSize == 0 || columnSize == SQL_NO_TOTAL || columnSize > SQL_MAX_LOB_SIZE)) {
lobColumns.push_back(i + 1); // 1-based
}
Expand Down
Loading
Loading