From 664f2a35a01c9cd26fd2d44a4f40ebfe8be55fda Mon Sep 17 00:00:00 2001 From: marioggil Date: Thu, 26 Sep 2024 12:58:38 -0500 Subject: [PATCH] Funtionality Ok --- README.md | 12 +++ main.py | 261 ++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 238 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index e69de29..6c42fe7 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,12 @@ +pip install llama-index +pip install llama-index-llms-groq +pip install llama-index-embeddings-huggingface +pip install llama-parse +pip install chromadb +pip install llama-index-vector-stores-chroma +pip install llama-index-embeddings-huggingface +pip install python-fasthtml +pip install grok +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + + diff --git a/main.py b/main.py index 1c200c7..1d98308 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,30 @@ from fasthtml.common import * +from llama_index.core import SimpleDirectoryReader, Document,VectorStoreIndex +from llama_index.core.node_parser import SimpleNodeParser +from llama_index.core.text_splitter import TokenTextSplitter +from llama_index.vector_stores.chroma import ChromaVectorStore +from llama_index.core.storage.storage_context import StorageContext +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from llama_index.core import SummaryIndex +from llama_index.llms.groq import Groq +from chromadb import PersistentClient +from llama_index.core import Settings +from llama_index.embeddings.huggingface_api import ( + HuggingFaceInferenceAPIEmbedding, +) +import chromadb import os +import threading +import time +from llama_index.core.memory import ChatMemoryBuffer +os.environ["GROQ_API_KEY"] = "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" + + +llm_70b = Groq(model="llama-3.1-70b-versatile") +memory = ChatMemoryBuffer.from_defaults(token_limit=3900) + +Settings.llm = llm_70b + app= FastHTML() @@ -14,54 +39,220 @@ def menuusers(users): for user in users: T.append(Option(user, value=str(user)) ) return Form( - Select(*T, - cls="selector", - _id="counter", - name="data", - **{'@click':"alert('Clicked');"},),Button("Submit"),action="/checkInfoSources", method="post") + Select(*T,name="user"), + Button("Submit",type="submit",id="buttonMenuuser"), + hx_post="/checkInfoSources",hx_swap="innerHTML",hx_target="#files" ,id="menuuser") @app.post("/checkInfoSources") -def checkInfoSources(data:str): - print(data) - with os.scandir("static/"+data) as files: - subdir = [CheckboxX(label=file.name,value="static/"+data+"/"+file.name) for file in files if file.is_file()] +def checkInfoSources(user:str): + global userdata + with os.scandir("static/"+user) as files: + subdir = [Option(file.name,value="static/"+user+"/"+file.name) for file in files if file.is_file()] + userdata=user + print("Cambio",userdata) return Form( - Label(*subdir, - cls="selector", - _id="counter", - hx_target="files", - name="data", - **{'@click':"alert('Clicked');"},),Button("Submit"),action="/process", method="post") + Select( + *subdir,name="data"), + Input(id="name-db", name="collection", placeholder="Enter a collection name"), + Button("Submit",type="submit"), hx_post="/createCollection",hx_swap="innerHTML",hx_target="#NewCollection" ) -@app.post("/process") -def processData(): - print() - pass - - +def create_or_load_db(path="./chroma_db",collection="init",Nodes=None,model="sentence-transformers/all-mpnet-base-v2"): + embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5") + #embed_model = HuggingFaceInferenceAPIEmbedding( + #model_name="BAAI/bge-small-en-v1.5", + #token="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", # Optional + #) + db = chromadb.PersistentClient(path=path) + chroma_collection = db.get_or_create_collection(collection) + + vector_store = ChromaVectorStore(chroma_collection=chroma_collection) + + storage_context = StorageContext.from_defaults(vector_store=vector_store) + if Nodes: + + index = VectorStoreIndex( + Nodes, storage_context=storage_context, embed_model=embed_model + ) + else: + index = VectorStoreIndex.from_vector_store( + vector_store, + embed_model=embed_model, + ) + return index + +def post_process_documents(documents): + processed_documents = [] + n=0 + print(len(documents)) + for doc in documents: + # 1. Text cleaning + n+=1 + print(n) + text = doc.text.lower() # Convert to lowercase + # 2. Remove stopwords + stop_words = set("adssss") + tokens = text.split(" ") + filtered_text = ' '.join([word for word in tokens if word.lower() not in stop_words]) + # 3. Custom metadata extraction (example) + metadata = doc.metadata.copy() + metadata['word_count'] = len(tokens) + # 4. Create a new document with processed text and updated metadata + processed_doc = Document(text=filtered_text, metadata=metadata) + processed_documents.append(processed_doc) + node_parser = SimpleNodeParser(chunk_size=200, chunk_overlap=30) + nodes = node_parser.get_nodes_from_documents(processed_documents) + return nodes + +@app.get("/listmodelactives") +def listmodelactives(): + try: + print(userdata) + except: + print("cambio") + return Div(id="options",hx_target="this",hx_swap="outerHTML",hx_get="/listmodelactives",hx_trigger="click from:#buttonMenuuser") + db = chromadb.PersistentClient(path="static/"+userdata+"/chroma_db") + files= db.list_collections() + collecs = [Option(file.name, value=file.name)for file in files] + + return Form( + Select( + *collecs,name="data"), + Button("Submit",type="submit"), + hx_post="/loadCollection",hx_swap="innerHTML",hx_target="#Infomodel") + +@app.post("/loadCollection") +def loadCollection(data:str): + global index + index=create_or_load_db(path="static/"+userdata+"/chroma_db",collection=data,model="BAAI/bge-m3") + return P("El usuario %s colleccion %s"%(userdata,data)) + +@app.post("/queryprompt") +def queryPrompt(question:str): + #index=load_create_db(collection="my_collection") + query_engine = index.as_query_engine() + response = query_engine.query(question) + return P(response) + + + +@app.post("/chatData") +def questionChat(message:str): + chat_engine = index.as_chat_engine( + chat_mode="condense_plus_context", + memory=memory, + llm=llm_70b, + context_prompt=( + "You are a chatbot, able to have normal interactions, as well as talk" + " about an essay discussing IA and uses in leardeship." + "Here are the relevant documents for the context:\n" + "{context_str}" + "\nInstruction: Use the previous chat history, or the context above, to interact and help the user but only about tematic of the essay" + ), + verbose=False, + ) + response = chat_engine.chat(message) + return P(message),P(response) + +@app.get("/SummarySources") +def SummarySources(): + with os.scandir("static/"+userdata) as files: + subdir = [Option(file.name,value="static/"+userdata+"/"+file.name) for file in files if file.is_file()] + return Form("Este es muy caro para documentos grandes y tarda mucho", + Select( + *subdir,name="data"), + Input( name="query", placeholder="Enter a query"), + Button("Submit",type="submit"), hx_post="/SummaryMake",hx_swap="innerHTML",hx_target="#summaryR" ) + +@app.post("/SummaryMake") +def SummaryMake(data:str,query:str): + print(data,query) + docs = SimpleDirectoryReader( + input_files=[data] + ).load_data() + print("p1") + summary_index = SummaryIndex.from_documents(docs) + print("p2") + summary_engine = summary_index.as_query_engine() + print("p3") + response = summary_engine.query( + query + ) + print("p4") + return P(response) + + + +@app.post("/createCollection") +def createCollection(data:str,collection:str): + print("Reading") + docs = SimpleDirectoryReader( + input_files=[data] + ).load_data() + print("Process Documents") + Nodes=post_process_documents(docs) + print("create DB") + class MyThread(threading.Thread): + def run(self): + print("Hilo") + create_or_load_db(path="static/"+data.split("/")[1]+"/chroma_db",collection=collection,Nodes=Nodes,model="BAAI/bge-m3") + + # create and start the thread + global t + t = MyThread() + t.start() + global t_time + t_time=time.time() + return Div("Iniciando carga de datos") + +@app.get("/is_busy") +def is_busy(): + try: + Busy= t.is_alive() + except: + Busy=False + if not Busy: + return Busy + else: + return "Esta ocupados desde hace %s , este es un proceso largo"%(str(time.time()-t_time)) + + @app.get("/") def home(): - page = Html( - Head(Title('Super tutor')), - Body(Div('Este es el sistema de super tutor, ', + + page = Title('Super tutor'),Main( + Div('Este es el sistema de super tutor, ', menuusers(listUsers()), - A('A link', href='https://example.com'), - Img(src="https://placehold.co/200"), - Form( - Select( - Option("user", value=str("user"))), - Button("Submit"), - action="/", method="post"), cls='myclass')), - Div(id="files")) - - - + #A('A link', href='https://example.com'), + #Img(src="https://placehold.co/200"), + Div("Archivos",id="files"), + Div(id="NewCollection"), + Div("Estado",id="status",hx_target="this",hx_swap="innerHTML",hx_get="/is_busy",hx_trigger="every 60000ms"), + Div( + Div(id="options",hx_target="this",hx_swap="outerHTML",hx_get="/listmodelactives",hx_trigger="click from:#buttonMenuuser delay:3s"), + Div(id="Infomodel"), + #Div("Resumen",Div(id="summary",hx_target="this",hx_swap="outerHTML",hx_get="/SummarySources",hx_trigger="click from:#buttonMenuuser"),Div(id="summaryR")), + Div( + Form( + Input(id="question", name="message", placeholder="Enter a message"), + Button("Submit",type="submit"), hx_post="/chatData",hx_swap="afterend",hx_target="#questionR" ), + Div(id="questionR") + ,id="questions"), + Div( + Form( + Input(id="query", name="question", placeholder="Enter a query"), + Button("Submit",type="submit"), hx_post="/queryprompt",hx_swap="innerHTML",hx_target="#queryR" ), + Div(id="queryR"), + id="query"), + id="chatbot") + )) return page + + app.mount("/static", StaticFiles(directory="static"), name="static")