"""Super tutor: a small FastHTML front end over LlamaIndex + Chroma.

Each sub-directory of ``static/`` is treated as a "user".  A user's files can
be indexed into a Chroma collection stored under ``static/<user>/chroma_db``
and then queried or chatted with through a Groq-hosted LLM.

NOTE(review): the selected user, the loaded index and the background indexing
thread are kept in module-level globals, so they are shared by every client
of the server.  That matches the original design but is only suitable for
single-user use.
"""
# The star import stays first so that the explicit imports below take
# precedence over any same-named symbol exported by fasthtml.common.
from fasthtml.common import *

import os
import threading
import time
from pathlib import Path
from typing import Optional

import chromadb
from chromadb import PersistentClient
from llama_index.core import Document, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import Settings
from llama_index.core import SummaryIndex
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.huggingface_api import (
    HuggingFaceInferenceAPIEmbedding,
)
from llama_index.llms.groq import Groq
from llama_index.vector_stores.chroma import ChromaVectorStore

# SECURITY: the Groq API key used to be hard-coded here.  Secrets must not
# live in source control; the key is now read from the environment only, and
# the previously committed key should be revoked.
if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it before starting the application"
    )

STATIC_DIR = "static"

# What the original ``set("adssss")`` evaluated to: a set of *characters*.
# NOTE(review): this looks like a placeholder rather than a real stop-word
# list; it is preserved so newly built indexes match existing ones.
_STOP_WORDS = frozenset({"a", "d", "s"})

llm_70b = Groq(model="llama-3.1-70b-versatile")
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)
Settings.llm = llm_70b

app = FastHTML()

# Shared application state, written by the request handlers below.
userdata = None  # name of the currently selected user directory
index = None     # VectorStoreIndex set by /loadCollection
t = None         # background thread started by /createCollection
t_time = 0.0     # time.time() at which that thread was started


def _is_known_user(user) -> bool:
    """Return True when *user* names an existing sub-directory of static/."""
    return bool(user) and user in listUsers()


def _static_file_owner(data) -> Optional[str]:
    """Return the user directory owning the file *data*, or None if invalid.

    *data* is accepted only when it resolves to an existing file located at
    ``static/<user>/...``.  Anything else (including ``..`` traversal or an
    absolute path outside ``static/``) yields None, so request-supplied paths
    cannot be used to read arbitrary files.
    """
    try:
        candidate = Path(data).resolve()
        relative = candidate.relative_to(Path(STATIC_DIR).resolve())
    except (TypeError, ValueError, OSError):
        return None
    if len(relative.parts) < 2 or not candidate.is_file():
        return None
    return relative.parts[0]


def _file_options(user):
    """Build one ``Option`` per regular file directly inside static/<user>."""
    with os.scandir(STATIC_DIR + "/" + user) as entries:
        return [
            Option(entry.name, value=STATIC_DIR + "/" + user + "/" + entry.name)
            for entry in entries
            if entry.is_file()
        ]


def listUsers():
    """Return the names of the sub-directories of static/ (one per user)."""
    with os.scandir(STATIC_DIR) as files:
        return [file.name for file in files if file.is_dir()]


def menuusers(users):
    """Return the form that lets the visitor pick one of *users*."""
    options = [Option(user, value=str(user)) for user in users]
    return Form(
        Select(*options, name="user"),
        Button("Submit", type="submit", id="buttonMenuuser"),
        hx_post="/checkInfoSources",
        hx_swap="innerHTML",
        hx_target="#files",
        id="menuuser",
    )


@app.post("/checkInfoSources")
def checkInfoSources(user: str):
    """Select *user* and return a form listing that user's source files."""
    global userdata
    # *user* comes from the request: only accept existing user directories.
    if not _is_known_user(user):
        return P("Usuario no valido")
    subdir = _file_options(user)
    userdata = user
    print("Cambio", userdata)
    return Form(
        Select(*subdir, name="data"),
        Input(id="name-db", name="collection", placeholder="Enter a collection name"),
        Button("Submit", type="submit"),
        hx_post="/createCollection",
        hx_swap="innerHTML",
        hx_target="#NewCollection",
    )


def create_or_load_db(
    path="./chroma_db",
    collection="init",
    Nodes=None,
    model="sentence-transformers/all-mpnet-base-v2",
):
    """Open (or create) a Chroma collection and return an index over it.

    Args:
        path: directory of the persistent Chroma database.
        collection: name of the collection to get or create.
        Nodes: nodes to embed and insert; when empty or None the index is
            built from whatever the collection already contains.
        model: accepted for backward compatibility but currently ignored.
            NOTE(review): embeddings always use ``BAAI/bge-base-en-v1.5``.
            Honouring this argument would change the embedding dimension for
            collections that are already persisted, so it was left as is.

    Returns:
        A ``VectorStoreIndex`` backed by the Chroma collection.
    """
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
    db = chromadb.PersistentClient(path=path)
    chroma_collection = db.get_or_create_collection(collection)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    if Nodes:
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        return VectorStoreIndex(
            Nodes, storage_context=storage_context, embed_model=embed_model
        )
    return VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)


def post_process_documents(documents):
    """Normalise *documents* and split them into nodes.

    Each document is lower-cased, split on single spaces, stripped of the
    tokens in ``_STOP_WORDS`` and given a ``word_count`` metadata entry (the
    token count *before* filtering).  The cleaned documents are then chunked
    with ``SimpleNodeParser(chunk_size=200, chunk_overlap=30)``.

    Returns:
        The list of nodes produced by the node parser.
    """
    processed_documents = []
    print(len(documents))
    for position, doc in enumerate(documents, start=1):
        print(position)  # progress trace, one line per document
        tokens = doc.text.lower().split(" ")
        filtered_text = " ".join(
            word for word in tokens if word not in _STOP_WORDS
        )
        metadata = doc.metadata.copy()
        metadata["word_count"] = len(tokens)
        processed_documents.append(Document(text=filtered_text, metadata=metadata))

    node_parser = SimpleNodeParser(chunk_size=200, chunk_overlap=30)
    return node_parser.get_nodes_from_documents(processed_documents)


@app.get("/listmodelactives")
def listmodelactives():
    """Return a form listing the Chroma collections of the selected user.

    While no user is selected the placeholder ``#options`` div is returned
    again so the next click on the user menu retries the request.
    """
    if userdata is None:
        print("cambio")
        return Div(
            id="options",
            hx_target="this",
            hx_swap="outerHTML",
            hx_get="/listmodelactives",
            hx_trigger="click from:#buttonMenuuser",
        )
    print(userdata)
    db = chromadb.PersistentClient(path="static/" + userdata + "/chroma_db")
    # Depending on the Chroma release, list_collections() yields either
    # collection objects or plain collection names; accept both.
    names = [
        item if isinstance(item, str) else item.name
        for item in db.list_collections()
    ]
    collecs = [Option(name, value=name) for name in names]
    return Form(
        Select(*collecs, name="data"),
        Button("Submit", type="submit"),
        hx_post="/loadCollection",
        hx_swap="innerHTML",
        hx_target="#Infomodel",
    )


@app.post("/loadCollection")
def loadCollection(data: str):
    """Load collection *data* of the selected user into the global index."""
    global index
    if userdata is None:
        return P("Primero seleccione un usuario")
    index = create_or_load_db(
        path="static/" + userdata + "/chroma_db",
        collection=data,
        model="BAAI/bge-m3",
    )
    return P("El usuario %s colleccion %s" % (userdata, data))


@app.post("/queryprompt")
def queryPrompt(question: str):
    """Answer *question* with a one-shot query against the loaded index."""
    if index is None:
        return P("Primero cargue una coleccion")
    query_engine = index.as_query_engine()
    response = query_engine.query(question)
    return P(response)


@app.post("/chatData")
def questionChat(message: str):
    """Send *message* to the chat engine and return the exchange as HTML.

    The chat engine is rebuilt on every request but shares the module-level
    ``memory`` buffer, so the conversation history persists between calls.
    """
    if index is None:
        return P("Primero cargue una coleccion")
    chat_engine = index.as_chat_engine(
        chat_mode="condense_plus_context",
        memory=memory,
        llm=llm_70b,
        context_prompt=(
            "You are a chatbot, able to have normal interactions, as well as talk"
            " about an essay discussing IA and uses in leardeship."
            "Here are the relevant documents for the context:\n"
            "{context_str}"
            "\nInstruction: Use the previous chat history, or the context above, to interact and help the user but only about tematic of the essay"
        ),
        verbose=False,
    )
    response = chat_engine.chat(message)
    return P(message), P(response)


@app.get("/SummarySources")
def SummarySources():
    """Return the form used to request a summary of one of the user's files."""
    if userdata is None:
        return P("Primero seleccione un usuario")
    subdir = _file_options(userdata)
    return Form(
        "Este es muy caro para documentos grandes y tarda mucho",
        Select(*subdir, name="data"),
        Input(name="query", placeholder="Enter a query"),
        Button("Submit", type="submit"),
        hx_post="/SummaryMake",
        hx_swap="innerHTML",
        hx_target="#summaryR",
    )


@app.post("/SummaryMake")
def SummaryMake(data: str, query: str):
    """Run *query* against a ``SummaryIndex`` built from the file *data*."""
    print(data, query)
    # *data* comes from the request: refuse anything outside static/<user>/.
    if _static_file_owner(data) is None:
        return P("Archivo no valido")
    docs = SimpleDirectoryReader(input_files=[data]).load_data()
    print("p1")
    summary_index = SummaryIndex.from_documents(docs)
    print("p2")
    summary_engine = summary_index.as_query_engine()
    print("p3")
    response = summary_engine.query(query)
    print("p4")
    return P(response)


def _build_collection(db_path, collection, nodes):
    """Thread target: embed *nodes* into *collection* of the DB at *db_path*."""
    print("Hilo")
    create_or_load_db(
        path=db_path, collection=collection, Nodes=nodes, model="BAAI/bge-m3"
    )


@app.post("/createCollection")
def createCollection(data: str, collection: str):
    """Index the file *data* into *collection* on a background thread.

    Reading and chunking happen in the request; the slow embedding step runs
    in a thread whose progress can be polled through ``/is_busy``.
    """
    global t, t_time
    # *data* comes from the request: refuse anything outside static/<user>/.
    owner = _static_file_owner(data)
    if owner is None:
        return P("Archivo no valido")
    if not collection or not collection.strip():
        return P("Ingrese un nombre de coleccion")

    print("Reading")
    docs = SimpleDirectoryReader(input_files=[data]).load_data()
    print("Process Documents")
    Nodes = post_process_documents(docs)
    print("create DB")

    worker = threading.Thread(
        target=_build_collection,
        args=("static/" + owner + "/chroma_db", collection, Nodes),
    )
    # Record the start time before publishing the thread so /is_busy never
    # sees a running thread without a matching timestamp.
    t_time = time.time()
    t = worker
    worker.start()
    return Div("Iniciando carga de datos")


@app.get("/is_busy")
def is_busy():
    """Report whether a background indexing thread is still running.

    Returns ``False`` when idle, otherwise a message with the elapsed seconds.
    """
    worker = t
    if worker is None or not worker.is_alive():
        return False
    return "Esta ocupados desde hace %s , este es un proceso largo" % (
        str(time.time() - t_time)
    )


@app.get("/")
def home():
    """Render the single page of the application."""
    status = Div(
        "Estado",
        id="status",
        hx_target="this",
        hx_swap="innerHTML",
        hx_get="/is_busy",
        hx_trigger="every 60000ms",
    )
    collection_picker = Div(
        id="options",
        hx_target="this",
        hx_swap="outerHTML",
        hx_get="/listmodelactives",
        hx_trigger="click from:#buttonMenuuser delay:3s",
    )
    chat_panel = Div(
        Form(
            Input(id="question", name="message", placeholder="Enter a message"),
            Button("Submit", type="submit"),
            hx_post="/chatData",
            hx_swap="afterend",
            hx_target="#questionR",
        ),
        Div(id="questionR"),
        id="questions",
    )
    # NOTE(review): the input and its wrapping div both use id="query";
    # kept as in the original, but duplicate ids are invalid HTML.
    query_panel = Div(
        Form(
            Input(id="query", name="question", placeholder="Enter a query"),
            Button("Submit", type="submit"),
            hx_post="/queryprompt",
            hx_swap="innerHTML",
            hx_target="#queryR",
        ),
        Div(id="queryR"),
        id="query",
    )
    # The summary panel (/SummarySources, /SummaryMake) is intentionally not
    # wired into the page; the endpoints remain available.
    page = Title('Super tutor'), Main(
        Div(
            'Este es el sistema de super tutor, ',
            menuusers(listUsers()),
            Div("Archivos", id="files"),
            Div(id="NewCollection"),
            status,
            Div(
                collection_picker,
                Div(id="Infomodel"),
                chat_panel,
                query_panel,
                id="chatbot",
            ),
        )
    )
    return page


app.mount("/static", StaticFiles(directory="static"), name="static")
serve()