# === AI & EMBEDDINGS (remote service) ===
async def get_embeddings(text: str) -> dict:
    """Fetch hybrid embeddings for *text* from the remote BGE service.

    Returns a dict with keys 'dense' (list of floats) and 'sparse'
    ({'indices': [...], 'values': [...]}); returns {} on any failure
    so callers can test truthiness instead of catching exceptions.
    """
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(
                f"{BGE_API_URL}/embed",
                json={"texts": [text]},
            )
            if resp.status_code == 200:
                data = resp.json()
                # The current API returns {"data": [{"dense": ..., "sparse": ...}]}
                return data["data"][0]
    except Exception as e:
        print(f"⚠️ Errore Embedding: {e}")
    return {}


async def ensure_collection(name: str):
    """Create the Qdrant collection with dense + sparse vector indexes if missing."""
    client = AsyncQdrantClient(url=QDRANT_URL)
    if not await client.collection_exists(name):
        await client.create_collection(
            collection_name=name,
            vectors_config={
                "bge_dense": VectorParams(size=1024, distance=Distance.COSINE)
            },
            # Enable the sparse vector index so hybrid (dense + sparse) search works.
            sparse_vectors_config={
                "bge_sparse": SparseVectorParams(
                    index=SparseIndexParams(
                        # Keep the sparse index in RAM for speed; set
                        # on_disk=True on low-memory hosts.
                        on_disk=False,
                    )
                )
            },
        )


def chunk_text_by_lines(text: str, max_chars: int = 2500) -> List[str]:
    """Split *text* into chunks of at most *max_chars*, keeping whole lines together.

    A single line longer than *max_chars* is sliced into consecutive
    max_chars-sized pieces (the previous version kept only the first slice,
    silently discarding the rest of the line).  Empty or whitespace-only
    input yields an empty list.
    """
    chunks: List[str] = []
    current = ""
    for line in text.split('\n'):
        if len(line) > max_chars:
            # Flush the pending chunk, then slice the oversized line
            # into full pieces so no data is lost.
            if current:
                chunks.append(current)
                current = ""
            for start in range(0, len(line), max_chars):
                chunks.append(line[start:start + max_chars])
            continue
        # If adding this line would overflow the limit, close the current chunk.
        if len(current) + len(line) > max_chars:
            chunks.append(current)
            current = line + "\n"
        else:
            current += line + "\n"
    if current.strip():
        chunks.append(current)
    return chunks


async def index_document(filename: str, content: str, collection: str) -> bool:
    """Chunk *content*, embed each chunk (dense + sparse) and upsert into Qdrant.

    Returns True on success, False on any error.
    """
    try:
        await ensure_collection(collection)
        # Smart line-based chunking instead of blind fixed-size slicing.
        chunks = chunk_text_by_lines(content, max_chars=2000)
        qdrant = AsyncQdrantClient(url=QDRANT_URL)
        points = []
        for i, chunk in enumerate(chunks):
            vectors = await get_embeddings(chunk)
            # Skip chunks whose embedding request failed ({} is falsy).
            if vectors:
                points.append(PointStruct(
                    id=str(uuid.uuid4()),
                    vector={
                        "bge_dense": vectors["dense"],
                        "bge_sparse": models.SparseVector(
                            indices=vectors["sparse"]["indices"],
                            values=vectors["sparse"]["values"],
                        ),
                    },
                    payload={"file_name": filename, "content": chunk, "chunk_id": i},
                ))
        if points:
            await qdrant.upsert(collection_name=collection, points=points)
        return True
    except Exception as e:
        # NOTE(review): the original error path lies outside the visible patch
        # hunk — restored as log-and-return-False to match the bool contract.
        print(f"⚠️ Errore Indicizzazione: {e}")
        return False


async def search_qdrant(query: str, collection: str) -> str:
    """Hybrid search (dense + sparse, RRF fusion) over *collection*.

    Returns the matching chunks formatted for prompt injection, or ""
    when the collection is missing, embedding fails, or any error occurs.
    """
    try:
        client = AsyncQdrantClient(url=QDRANT_URL)
        if not await client.collection_exists(collection):
            return ""

        vectors = await get_embeddings(query)
        if not vectors:
            return ""

        # Hybrid query: prefetch dense and sparse candidates, then fuse with RRF.
        res = await client.query_points(
            collection_name=collection,
            prefetch=[
                models.Prefetch(
                    query=vectors["dense"],
                    using="bge_dense",
                    limit=10,
                ),
                models.Prefetch(
                    query=models.SparseVector(
                        indices=vectors["sparse"]["indices"],
                        values=vectors["sparse"]["values"],
                    ),
                    using="bge_sparse",
                    limit=20,
                ),
            ],
            # FusionQuery takes 'fusion=' (not 'method=').
            query=models.FusionQuery(fusion=models.Fusion.RRF),
            limit=12,
        )

        # (The patch accidentally duplicated this return/except block after the
        # function's except clause — that dead, syntactically invalid copy is removed.)
        return "\n\n".join(
            f"📄 {hit.payload['file_name']}:\n{hit.payload['content']}"
            for hit in res.points
            if hit.payload
        )
    except Exception as e:
        print(f"Search Error: {e}")
        return ""