Skip to content

Commit 4885aff

Browse files
authored
Revert "FIX: varchar columnsize does not account for utf8 conversion (#392)"
This reverts commit f71dcfe.
1 parent f71dcfe commit 4885aff

File tree

3 files changed

+59
-206
lines changed

3 files changed

+59
-206
lines changed

mssql_python/pybind/ddbc_bindings.cpp

Lines changed: 29 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2924,9 +2924,7 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
29242924
row.append(
29252925
FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding));
29262926
} else {
2927-
// Multiply by 4 because utf8 conversion by the driver might
2928-
// turn varchar(x) into up to 3*x (maybe 4*x?) bytes.
2929-
uint64_t fetchBufferSize = 4 * columnSize + 1 /* null-termination */;
2927+
uint64_t fetchBufferSize = columnSize + 1 /* null-termination */;
29302928
std::vector<SQLCHAR> dataBuffer(fetchBufferSize);
29312929
SQLLEN dataLen;
29322930
ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size(),
@@ -2955,15 +2953,12 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
29552953
row.append(raw_bytes);
29562954
}
29572955
} else {
2958-
// Reaching this case indicates an error in mssql_python.
2959-
// Theoretically, we could still compensate by calling SQLGetData or
2960-
// FetchLobColumnData more often, but then we would still have to process
2961-
// the data we already got from the above call to SQLGetData.
2962-
// Better to throw an exception and fix the code than to risk returning corrupted data.
2963-
ThrowStdException(
2964-
"Internal error: SQLGetData returned data "
2965-
"larger than expected for CHAR column"
2966-
);
2956+
// Buffer too small, fallback to streaming
2957+
LOG("SQLGetData: CHAR column %d data truncated "
2958+
"(buffer_size=%zu), using streaming LOB",
2959+
i, dataBuffer.size());
2960+
row.append(FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false,
2961+
charEncoding));
29672962
}
29682963
} else if (dataLen == SQL_NULL_DATA) {
29692964
LOG("SQLGetData: Column %d is NULL (CHAR)", i);
@@ -3000,7 +2995,7 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
30002995
case SQL_WCHAR:
30012996
case SQL_WVARCHAR:
30022997
case SQL_WLONGVARCHAR: {
3003-
if (columnSize == SQL_NO_TOTAL || columnSize == 0 || columnSize > 4000) {
2998+
if (columnSize == SQL_NO_TOTAL || columnSize > 4000) {
30042999
LOG("SQLGetData: Streaming LOB for column %d (SQL_C_WCHAR) "
30053000
"- columnSize=%lu",
30063001
i, (unsigned long)columnSize);
@@ -3029,15 +3024,12 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
30293024
"length=%lu for column %d",
30303025
(unsigned long)numCharsInData, i);
30313026
} else {
3032-
// Reaching this case indicates an error in mssql_python.
3033-
// Theoretically, we could still compensate by calling SQLGetData or
3034-
// FetchLobColumnData more often, but then we would still have to process
3035-
// the data we already got from the above call to SQLGetData.
3036-
// Better to throw an exception and fix the code than to risk returning corrupted data.
3037-
ThrowStdException(
3038-
"Internal error: SQLGetData returned data "
3039-
"larger than expected for WCHAR column"
3040-
);
3027+
// Buffer too small, fallback to streaming
3028+
LOG("SQLGetData: NVARCHAR column %d data "
3029+
"truncated, using streaming LOB",
3030+
i);
3031+
row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false,
3032+
"utf-16le"));
30413033
}
30423034
} else if (dataLen == SQL_NULL_DATA) {
30433035
LOG("SQLGetData: Column %d is NULL (NVARCHAR)", i);
@@ -3299,15 +3291,8 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
32993291
row.append(py::bytes(
33003292
reinterpret_cast<const char*>(dataBuffer.data()), dataLen));
33013293
} else {
3302-
// Reaching this case indicates an error in mssql_python.
3303-
// Theoretically, we could still compensate by calling SQLGetData or
3304-
// FetchLobColumnData more often, but then we would still have to process
3305-
// the data we already got from the above call to SQLGetData.
3306-
// Better to throw an exception and fix the code than to risk returning corrupted data.
3307-
ThrowStdException(
3308-
"Internal error: SQLGetData returned data "
3309-
"larger than expected for BINARY column"
3310-
);
3294+
row.append(
3295+
FetchLobColumnData(hStmt, i, SQL_C_BINARY, false, true, ""));
33113296
}
33123297
} else if (dataLen == SQL_NULL_DATA) {
33133298
row.append(py::none());
@@ -3449,9 +3434,7 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
34493434
// TODO: handle variable length data correctly. This logic wont
34503435
// suffice
34513436
HandleZeroColumnSizeAtFetch(columnSize);
3452-
// Multiply by 4 because utf8 conversion by the driver might
3453-
// turn varchar(x) into up to 3*x (maybe 4*x?) bytes.
3454-
uint64_t fetchBufferSize = 4 * columnSize + 1 /*null-terminator*/;
3437+
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
34553438
// TODO: For LONGVARCHAR/BINARY types, columnSize is returned as
34563439
// 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB.
34573440
// fetchSize=1 if columnSize>1GB. So we'll allocate a vector of
@@ -3597,7 +3580,8 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
35973580
// Fetch rows in batches
35983581
// TODO: Move to anonymous namespace, since it is not used outside this file
35993582
SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& columnNames,
3600-
py::list& rows, SQLUSMALLINT numCols, SQLULEN& numRowsFetched) {
3583+
py::list& rows, SQLUSMALLINT numCols, SQLULEN& numRowsFetched,
3584+
const std::vector<SQLUSMALLINT>& lobColumns) {
36013585
LOG("FetchBatchData: Fetching data in batches");
36023586
SQLRETURN ret = SQLFetchScroll_ptr(hStmt, SQL_FETCH_NEXT, 0);
36033587
if (ret == SQL_NO_DATA) {
@@ -3616,28 +3600,19 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
36163600
SQLULEN columnSize;
36173601
SQLULEN processedColumnSize;
36183602
uint64_t fetchBufferSize;
3603+
bool isLob;
36193604
};
36203605
std::vector<ColumnInfo> columnInfos(numCols);
36213606
for (SQLUSMALLINT col = 0; col < numCols; col++) {
36223607
const auto& columnMeta = columnNames[col].cast<py::dict>();
36233608
columnInfos[col].dataType = columnMeta["DataType"].cast<SQLSMALLINT>();
36243609
columnInfos[col].columnSize = columnMeta["ColumnSize"].cast<SQLULEN>();
3610+
columnInfos[col].isLob =
3611+
std::find(lobColumns.begin(), lobColumns.end(), col + 1) != lobColumns.end();
36253612
columnInfos[col].processedColumnSize = columnInfos[col].columnSize;
36263613
HandleZeroColumnSizeAtFetch(columnInfos[col].processedColumnSize);
3627-
switch (columnInfos[col].dataType) {
3628-
case SQL_CHAR:
3629-
case SQL_VARCHAR:
3630-
case SQL_LONGVARCHAR:
3631-
// Multiply by 4 because utf8 conversion by the driver might
3632-
// turn varchar(x) into up to 3*x (maybe 4*x?) bytes.
3633-
columnInfos[col].fetchBufferSize =
3634-
4 * columnInfos[col].processedColumnSize + 1; // +1 for null terminator
3635-
break;
3636-
default:
3637-
columnInfos[col].fetchBufferSize =
3638-
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
3639-
break;
3640-
}
3614+
columnInfos[col].fetchBufferSize =
3615+
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
36413616
}
36423617

36433618
std::string decimalSeparator = GetDecimalSeparator(); // Cache decimal separator
@@ -3655,6 +3630,7 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
36553630
columnInfosExt[col].columnSize = columnInfos[col].columnSize;
36563631
columnInfosExt[col].processedColumnSize = columnInfos[col].processedColumnSize;
36573632
columnInfosExt[col].fetchBufferSize = columnInfos[col].fetchBufferSize;
3633+
columnInfosExt[col].isLob = columnInfos[col].isLob;
36583634

36593635
// Map data type to processor function (switch executed once per column,
36603636
// not per cell)
@@ -3763,7 +3739,7 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
37633739
// types) to just 10 (setup only) Note: Processor functions no
37643740
// longer need to check for NULL since we do it above
37653741
if (columnProcessors[col - 1] != nullptr) {
3766-
columnProcessors[col - 1](row, buffers, &columnInfosExt[col - 1], col, i);
3742+
columnProcessors[col - 1](row, buffers, &columnInfosExt[col - 1], col, i, hStmt);
37673743
continue;
37683744
}
37693745

@@ -3940,9 +3916,7 @@ size_t calculateRowSize(py::list& columnNames, SQLUSMALLINT numCols) {
39403916
case SQL_CHAR:
39413917
case SQL_VARCHAR:
39423918
case SQL_LONGVARCHAR:
3943-
// Multiply by 4 because utf8 conversion by the driver might
3944-
// turn varchar(x) into up to 3*x (maybe 4*x?) bytes.
3945-
rowSize += 4 * columnSize;
3919+
rowSize += columnSize;
39463920
break;
39473921
case SQL_SS_XML:
39483922
case SQL_WCHAR:
@@ -4096,7 +4070,7 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
40964070
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)(intptr_t)fetchSize, 0);
40974071
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0);
40984072

4099-
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched);
4073+
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
41004074
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
41014075
LOG("FetchMany_wrap: Error when fetching data - SQLRETURN=%d", ret);
41024076
return ret;
@@ -4229,7 +4203,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows,
42294203

42304204
while (ret != SQL_NO_DATA) {
42314205
ret =
4232-
FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched);
4206+
FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
42334207
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
42344208
LOG("FetchAll_wrap: Error when fetching data - SQLRETURN=%d", ret);
42354209
return ret;

0 commit comments

Comments (0)