# === AI & EMBEDDINGS (remote service) ===
async def get_embeddings(text: str) -> dict:
    """Fetch hybrid embeddings for *text* from the remote BGE service.

    Returns a dict with keys 'dense' (list of floats) and 'sparse'
    ({'indices': [...], 'values': [...]}); returns {} on any failure
    so callers can test truthiness instead of catching exceptions.
    """
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(
                f"{BGE_API_URL}/embed",
                json={"texts": [text]},
            )
            if resp.status_code == 200:
                data = resp.json()
                # The current API returns {"data": [{"dense": ..., "sparse": ...}]}
                return data["data"][0]
    except Exception as e:
        print(f"⚠️ Errore Embedding: {e}")
    return {}


async def ensure_collection(name: str):
    """Create the Qdrant collection with dense + sparse vector indexes if missing."""
    client = AsyncQdrantClient(url=QDRANT_URL)
    if not await client.collection_exists(name):
        await client.create_collection(
            collection_name=name,
            vectors_config={
                "bge_dense": VectorParams(size=1024, distance=Distance.COSINE)
            },
            # Enable the sparse vector index so hybrid (dense + sparse) search works.
            sparse_vectors_config={
                "bge_sparse": SparseVectorParams(
                    index=SparseIndexParams(
                        # Keep the sparse index in RAM for speed; set
                        # on_disk=True on low-memory hosts.
                        on_disk=False,
                    )
                )
            },
        )


def chunk_text_by_lines(text: str, max_chars: int = 2500) -> List[str]:
    """Split *text* into chunks of at most *max_chars*, keeping whole lines together.

    A single line longer than *max_chars* is sliced into consecutive
    max_chars-sized pieces (the previous version kept only the first slice,
    silently discarding the rest of the line).  Empty or whitespace-only
    input yields an empty list.
    """
    chunks: List[str] = []
    current = ""
    for line in text.split('\n'):
        if len(line) > max_chars:
            # Flush the pending chunk, then slice the oversized line
            # into full pieces so no data is lost.
            if current:
                chunks.append(current)
                current = ""
            for start in range(0, len(line), max_chars):
                chunks.append(line[start:start + max_chars])
            continue
        # If adding this line would overflow the limit, close the current chunk.
        if len(current) + len(line) > max_chars:
            chunks.append(current)
            current = line + "\n"
        else:
            current += line + "\n"
    if current.strip():
        chunks.append(current)
    return chunks


async def index_document(filename: str, content: str, collection: str) -> bool:
    """Chunk *content*, embed each chunk (dense + sparse) and upsert into Qdrant.

    Returns True on success, False on any error.
    """
    try:
        await ensure_collection(collection)
        # Smart line-based chunking instead of blind fixed-size slicing.
        chunks = chunk_text_by_lines(content, max_chars=2000)
        qdrant = AsyncQdrantClient(url=QDRANT_URL)
        points = []
        for i, chunk in enumerate(chunks):
            vectors = await get_embeddings(chunk)
            # Skip chunks whose embedding request failed ({} is falsy).
            if vectors:
                points.append(PointStruct(
                    id=str(uuid.uuid4()),
                    vector={
                        "bge_dense": vectors["dense"],
                        "bge_sparse": models.SparseVector(
                            indices=vectors["sparse"]["indices"],
                            values=vectors["sparse"]["values"],
                        ),
                    },
                    payload={"file_name": filename, "content": chunk, "chunk_id": i},
                ))
        if points:
            await qdrant.upsert(collection_name=collection, points=points)
        return True
    except Exception as e:
        # NOTE(review): the original error path lies outside the visible patch
        # hunk — restored as log-and-return-False to match the bool contract.
        print(f"⚠️ Errore Indicizzazione: {e}")
        return False


async def search_qdrant(query: str, collection: str) -> str:
    """Hybrid search (dense + sparse, RRF fusion) over *collection*.

    Returns the matching chunks formatted for prompt injection, or ""
    when the collection is missing, embedding fails, or any error occurs.
    """
    try:
        client = AsyncQdrantClient(url=QDRANT_URL)
        if not await client.collection_exists(collection):
            return ""

        vectors = await get_embeddings(query)
        if not vectors:
            return ""

        # Hybrid query: prefetch dense and sparse candidates, then fuse with RRF.
        res = await client.query_points(
            collection_name=collection,
            prefetch=[
                models.Prefetch(
                    query=vectors["dense"],
                    using="bge_dense",
                    limit=10,
                ),
                models.Prefetch(
                    query=models.SparseVector(
                        indices=vectors["sparse"]["indices"],
                        values=vectors["sparse"]["values"],
                    ),
                    using="bge_sparse",
                    limit=20,
                ),
            ],
            # FusionQuery takes 'fusion=' (not 'method=').
            query=models.FusionQuery(fusion=models.Fusion.RRF),
            limit=12,
        )

        # (The patch accidentally duplicated this return/except block after the
        # function's except clause — that dead, syntactically invalid copy is removed.)
        return "\n\n".join(
            f"📄 {hit.payload['file_name']}:\n{hit.payload['content']}"
            for hit in res.points
            if hit.payload
        )
    except Exception as e:
        print(f"Search Error: {e}")
        return ""