Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions chatbot-core/data/chunking/extract_chunk_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def extract_chunks(docs):
Processes all Jenkins documentation pages by chunking their content.

Args:
docs (dict): A dictionary mapping URLs to raw HTML strings.
docs (dict): A dictionary mapping URLs to either a raw HTML string or a nested dict whose first value is the HTML string.

Returns:
list[dict]: A list of all processed chunks across all docs.
Expand All @@ -91,7 +91,18 @@ def extract_chunks(docs):
text_splitter = get_text_splitter(CHUNK_SIZE, CHUNK_OVERLAP)

for url, html in docs.items():
page_chunks = process_page(url, html, text_splitter)
# Handle both nested dict format and raw string format (for unit tests)
if isinstance(html, dict):
html_values = list(html.values())
if not html_values:
logger.warning("No content found for URL: %s", url)
continue
actual_html_string = html_values[0]
else:
actual_html_string = html

# Process the page using the extracted string
page_chunks = process_page(url, actual_html_string, text_splitter)
all_chunks.extend(page_chunks)

return all_chunks
Expand Down
27 changes: 20 additions & 7 deletions chatbot-core/rag/vectorstore/store_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@


def build_faiss_ivf_index(vectors, nlist, nprobe, logger):

"""
Build and return a FAISS IndexIVFFlat index from the given vectors.

Expand All @@ -31,6 +32,7 @@ def build_faiss_ivf_index(vectors, nlist, nprobe, logger):
Returns:
faiss.IndexIVFFlat: A trained FAISS IVF index with added vectors.
"""

if not isinstance(vectors, np.ndarray):
raise TypeError("Vectors must be an instance of numpy.ndarray.")
if vectors.ndim != 2:
Expand All @@ -39,15 +41,26 @@ def build_faiss_ivf_index(vectors, nlist, nprobe, logger):
raise TypeError(f"Vectors must be float32, got dtype {vectors.dtype}.")

d = vectors.shape[1]
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
n_samples = vectors.shape[0]

# --- ARCHITECTURAL FIX START ---
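# Only build and train the IVF index when there are at least `nlist` vectors;
# FAISS typically needs at least `nlist` training points to train an IVF index.
# NOTE(review): the fallback branch below only logs a warning and never assigns
# `index` (no faiss.IndexFlatL2(d) is created), so any later use of `index`
# (index.add / return index) looks like it will raise UnboundLocalError when
# n_samples < nlist -- confirm against the full file.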
if n_samples < nlist:
logger.warning(
"Dataset size (%d) is smaller than nlist (%d). Falling back to IndexFlatL2.",
n_samples, nlist
)
else:
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
logger.info("FAISS index training started...")
index.train(vectors)
logger.info("FAISS index training completed.")
index.nprobe = nprobe
# --- ARCHITECTURAL FIX END ---

logger.info("FAISS index training started...")
index.train(vectors) # pylint: disable=no-value-for-parameter
logger.info("FAISS index training completed.")
index.nprobe = nprobe
index.add(vectors) # pylint: disable=no-value-for-parameter

return index


Expand Down