Fix: Smart chunking per Excel e Hybrid Search funzionante

This commit is contained in:
AI Station Server 2025-12-30 17:06:13 +01:00
parent 9cef64f9ea
commit bffd9aa249
1 changed files with 88 additions and 32 deletions

112
app.py
View File

@@ -155,57 +155,90 @@ def extract_text_from_excel(path: str) -> str:
return "" return ""
# === AI & EMBEDDINGS (Remoto) ===
async def get_embeddings(text: str) -> dict:
    """Fetch hybrid embeddings for *text* from the remote BGE service.

    Returns a dict with keys ``"dense"`` (vector of floats) and ``"sparse"``
    (dict with ``"indices"``/``"values"``), or an empty dict on any failure so
    callers can treat a falsy result as "no embedding available".
    """
    try:
        async with httpx.AsyncClient(timeout=30.0) as http:
            resp = await http.post(
                f"{BGE_API_URL}/embed",
                json={"texts": [text]},
            )
        if resp.status_code == 200:
            payload = resp.json()
            # API response shape: {"data": [{"dense": ..., "sparse": ...}]}
            return payload["data"][0]
    except Exception as e:
        print(f"⚠️ Errore Embedding: {e}")
    return {}
async def ensure_collection(name: str):
    """Create the Qdrant collection *name* if it does not exist yet.

    The collection is configured for hybrid search: a named dense vector
    ("bge_dense", 1024-dim, cosine distance) plus a named sparse vector
    ("bge_sparse") for lexical-style matching.
    """
    qdrant = AsyncQdrantClient(url=QDRANT_URL)
    if await qdrant.collection_exists(name):
        return
    dense_cfg = {
        "bge_dense": VectorParams(size=1024, distance=Distance.COSINE),
    }
    sparse_cfg = {
        # Keep the sparse index in RAM; set on_disk=True only if memory is tight.
        "bge_sparse": SparseVectorParams(index=SparseIndexParams(on_disk=False)),
    }
    await qdrant.create_collection(
        collection_name=name,
        vectors_config=dense_cfg,
        sparse_vectors_config=sparse_cfg,
    )
def chunk_text_by_lines(text: str, max_chars: int = 2500) -> List[str]:
    """Split *text* into chunks of at most ``max_chars`` characters,
    grouping whole lines without breaking them.

    Lines are accumulated into the current chunk until adding the next line
    would exceed the limit. A single line longer than ``max_chars`` is split
    into consecutive ``max_chars``-sized slices — the previous implementation
    kept only the first slice and silently discarded the rest of the line,
    losing content at indexing time.

    :param text: the text to chunk (lines separated by ``"\\n"``)
    :param max_chars: maximum size of each returned chunk
    :return: list of chunks; regular chunks keep their trailing newlines
    """
    chunks: List[str] = []
    current = ""
    for line in text.split('\n'):
        if len(line) > max_chars:
            # Flush what we have, then emit the oversized line in slices
            # so that no content is lost.
            if current:
                chunks.append(current)
                current = ""
            for start in range(0, len(line), max_chars):
                chunks.append(line[start:start + max_chars])
            continue
        if len(current) + len(line) > max_chars:
            # Adding this line would overflow: close the current chunk.
            chunks.append(current)
            current = line + "\n"
        else:
            current += line + "\n"
    if current:
        chunks.append(current)
    return chunks
async def index_document(filename: str, content: str, collection: str) -> bool: async def index_document(filename: str, content: str, collection: str) -> bool:
try: try:
await ensure_collection(collection) await ensure_collection(collection)
chunks = [content[i:i+3000] for i in range(0, len(content), 3000)]
# --- MODIFICA QUI: Usiamo il chunking intelligente invece di quello brutale ---
# Vecchio: chunks = [content[i:i+3000] for i in range(0, len(content), 3000)]
chunks = chunk_text_by_lines(content, max_chars=2000)
# ---------------------------------------------------------------------------
qdrant = AsyncQdrantClient(url=QDRANT_URL) qdrant = AsyncQdrantClient(url=QDRANT_URL)
points = [] points = []
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
# Ottieni embedding (assume che get_embeddings ritorni la lista float) vectors = await get_embeddings(chunk)
# Nota: Se hai aggiornato l'API .243 per ritornare un dict {"dense": ...},
# devi aggiornare get_embeddings per estrarre ["dense"]!
# Vedere funzione get_embeddings aggiornata sotto if vectors:
emb = await get_embeddings(chunk)
if emb:
points.append(PointStruct( points.append(PointStruct(
id=str(uuid.uuid4()), id=str(uuid.uuid4()),
# Vettori nominati vector={
vector={"bge_dense": emb}, "bge_dense": vectors["dense"],
"bge_sparse": models.SparseVector(
indices=vectors["sparse"]["indices"],
values=vectors["sparse"]["values"]
)
},
payload={"file_name": filename, "content": chunk, "chunk_id": i} payload={"file_name": filename, "content": chunk, "chunk_id": i}
)) ))
async def search_qdrant(query: str, collection: str) -> str:
    """Run a hybrid (dense + sparse) search over *collection* for *query*.

    Dense and sparse candidates are prefetched separately and fused with
    Reciprocal Rank Fusion (RRF). Returns the matching chunks joined into a
    single string, or "" when the collection is missing, the query could not
    be embedded, or any error occurs (logged to stdout).
    """
    try:
        qdrant = AsyncQdrantClient(url=QDRANT_URL)
        if not await qdrant.collection_exists(collection):
            return ""
        vectors = await get_embeddings(query)
        if not vectors:
            return ""
        # Prefetch candidates from each named index, then fuse with RRF.
        dense_branch = models.Prefetch(
            query=vectors["dense"],
            using="bge_dense",
            limit=10,
        )
        sparse_branch = models.Prefetch(
            query=models.SparseVector(
                indices=vectors["sparse"]["indices"],
                values=vectors["sparse"]["values"]
            ),
            using="bge_sparse",
            limit=20,
        )
        res = await qdrant.query_points(
            collection_name=collection,
            prefetch=[dense_branch, sparse_branch],
            query=models.FusionQuery(fusion=models.Fusion.RRF),
            limit=12
        )
        hits = [
            f"📄 {hit.payload['file_name']}:\n{hit.payload['content']}"
            for hit in res.points
            if hit.payload
        ]
        return "\n\n".join(hits)
    except Exception as e:
        print(f"Search Error: {e}")
        return ""
# === CHAT LOGIC === # === CHAT LOGIC ===
@cl.on_chat_start @cl.on_chat_start