Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 55 additions & 181 deletions .github/workflows/release.yml

Large diffs are not rendered by default.

196 changes: 46 additions & 150 deletions iris_vector_rag/common/db_vector_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
"""
Standardized DB Vector Utilities for IRIS.
Uses proper parameter binding for security and performance.
"""

import os
import logging
from typing import Any, Dict, List, Optional

Expand All @@ -14,168 +20,58 @@ def insert_vector(
additional_data: Optional[Dict[str, Any]] = None,
) -> bool:
"""
Inserts a record with a vector embedding into a specified table.
The vector is truncated to the target_dimension and inserted using TO_VECTOR(?).

Args:
cursor: Database cursor.
table_name: Name of the table to insert into (e.g., "RAG.DocumentTokenEmbeddings").
vector_column_name: Name of the column that stores the vector.
vector_data: The raw embedding vector (list of floats).
target_dimension: The dimension to truncate/pad the vector to.
key_columns: Dictionary of primary key or identifying column names and their values.
(e.g., {"doc_id": "id1", "token_index": 0})
additional_data: Optional dictionary of other column names and their values.
(e.g., {"token_text": "example"})

Returns:
True if insertion was successful, False otherwise.
Inserts a record with a vector embedding using parameter binding.
"""
# Validate cursor handle
if cursor is None:
logger.error(
f"DB Vector Util: Cannot insert vector into table '{table_name}': cursor is NULL"
)
return False
if cursor is None: return False

if not isinstance(vector_data, list) or not all(
isinstance(x, (float, int)) for x in vector_data
):
logger.error(
f"DB Vector Util: Invalid vector_data format for table '{table_name}'. "
f"Expected list of floats/ints. Got type: {type(vector_data)}. Skipping insertion."
)
return False

# Truncate or pad the vector to the target dimension
# Process vector
processed_vector = vector_data[:target_dimension]
if len(processed_vector) < target_dimension:
logger.warning(
f"DB Vector Util: Original vector length ({len(vector_data)}) for table '{table_name}', column '{vector_column_name}' "
f"is less than target dimension ({target_dimension}). Padding with zeros."
)
processed_vector.extend([0.0] * (target_dimension - len(processed_vector)))

# Format as bracketed comma-separated string for IRIS TO_VECTOR() function
embedding_str = "[" + ",".join(map(str, processed_vector)) + "]"

all_columns_dict = {}
all_columns_dict.update(key_columns)
if additional_data:
all_columns_dict.update(additional_data)

# Separate vector column from other data for SQL construction
other_column_names = [col for col in all_columns_dict.keys()]
other_column_values = [all_columns_dict[col] for col in other_column_names]

column_names_sql = ", ".join(other_column_names + [vector_column_name])

# IMPORTANT: TO_VECTOR() does NOT accept parameter markers (?, :param, etc.)
# Must embed the vector string directly in SQL
# Use FLOAT to match how test data was created (see tests/fixtures/embedding_generator.py:247)
placeholders_list = ["?" for _ in other_column_names] + [
f"TO_VECTOR('{embedding_str}', FLOAT, {target_dimension})"
]
placeholders_sql = ", ".join(placeholders_list)

# Use MERGE for upsert functionality to handle duplicates
# Build MERGE statement for IRIS
key_conditions = " AND ".join(
[f"target.{col} = source.{col}" for col in key_columns.keys()]
)
update_assignments = ", ".join(
[
f"{col} = source.{col}"
for col in other_column_names
if col not in key_columns
]
)

# Separate approach: try INSERT first, if it fails due to constraint, try UPDATE
sql_query = (
f"INSERT INTO {table_name} ({column_names_sql}) VALUES ({placeholders_sql})"
)
# Don't pass embedding_str as a parameter - it's embedded in SQL above
params = other_column_values
# Prepare data
all_data = {**key_columns, **(additional_data or {})}
columns = list(all_data.keys())
values = [all_data[c] for c in columns]

# Get vector type
vector_data_type = os.environ.get("IRIS_VECTOR_DATA_TYPE") or "FLOAT"

# Build SQL
column_names = columns + [vector_column_name]
column_sql = ", ".join(column_names)

placeholders = ["?" for _ in columns]
placeholders.append(f"TO_VECTOR(?, {vector_data_type}, {target_dimension})")
placeholders_sql = ", ".join(placeholders)

sql = f"INSERT INTO {table_name} ({column_sql}) VALUES ({placeholders_sql})"
params = values + [embedding_str]

try:
logger.debug(f"DB Vector Util: Executing INSERT: {sql_query}")
logger.debug(f"DB Vector Util: Parameters: {params}")
logger.debug(
f"DB Vector Util: Embedding string length: {len(embedding_str)} chars"
)
logger.debug(f"DB Vector Util: Vector dimension: {target_dimension}")
cursor.execute(sql_query, params)
cursor.execute(sql, tuple(params))
return True
except Exception as e:
# Check for connection handle issues
error_str = str(e).lower()
if "_handle is null" in error_str or "handle is null" in error_str:
logger.error(
f"DB Vector Util: Database connection handle is NULL during vector insertion: {e}"
)
return False

# Check if it's a unique constraint violation
if "UNIQUE" in str(e) or "constraint failed" in str(e):
logger.debug(
f"DB Vector Util: INSERT failed due to duplicate key, attempting UPDATE..."
)

# Build UPDATE statement
set_clauses = []
update_params = []

# Add non-key columns to SET clause
for col in other_column_names:
if col not in key_columns:
set_clauses.append(f"{col} = ?")
update_params.append(all_columns_dict[col])

# Add vector column to SET clause - TO_VECTOR doesn't accept parameters
# Use FLOAT to match how test data was created (see tests/fixtures/embedding_generator.py:247)
set_clauses.append(
f"{vector_column_name} = TO_VECTOR('{embedding_str}', FLOAT, {target_dimension})"
)
# Don't append embedding_str to params - it's embedded in SQL above

# Add key columns to WHERE clause
where_clauses = []
for col, val in key_columns.items():
where_clauses.append(f"{col} = ?")
update_params.append(val)

if set_clauses and where_clauses:
update_sql = f"UPDATE {table_name} SET {', '.join(set_clauses)} WHERE {' AND '.join(where_clauses)}"

try:
logger.debug(f"DB Vector Util: Executing UPDATE: {update_sql}")
cursor.execute(update_sql, update_params)
return True
except Exception as update_error:
# Check for connection handle issues in UPDATE
update_error_str = str(update_error).lower()
if (
"_handle is null" in update_error_str
or "handle is null" in update_error_str
):
logger.error(
f"DB Vector Util: Database connection handle is NULL during UPDATE: {update_error}"
)
else:
logger.error(
f"DB Vector Util: UPDATE also failed: {update_error}"
)
return False
else:
logger.error(f"DB Vector Util: Could not build UPDATE statement")
# Attempt update
set_clauses = [f"{c} = ?" for c in columns if c not in key_columns]
update_params = [all_data[c] for c in columns if c not in key_columns]

set_clauses.append(f"{vector_column_name} = TO_VECTOR(?, {vector_data_type}, {target_dimension})")
update_params.append(embedding_str)

where_clauses = [f"{c} = ?" for c in key_columns]
for c in key_columns:
update_params.append(all_data[c])

update_sql = f"UPDATE {table_name} SET {', '.join(set_clauses)} WHERE {' AND '.join(where_clauses)}"
try:
cursor.execute(update_sql, tuple(update_params))
return True
except Exception as ue:
logger.error(f"Update failed: {ue}")
return False
else:
logger.error(
f"DB Vector Util: Error inserting vector into table '{table_name}', column '{vector_column_name}': {e}"
)
logger.error(f"DB Vector Util: Key columns: {key_columns}")
logger.error(
f"DB Vector Util: Failing embedding string (first 100 chars): {embedding_str[:100] if 'embedding_str' in locals() else 'NOT_SET'}"
)
logger.error(f"Insert failed: {e}")
return False
Loading