# DDS Enterprise HR chatbot: a Gradio front end over a LlamaIndex
# retrieval-augmented query engine backed by a Pinecone vector index.
#
# If needed in Colab, install first:
# !pip install -U gradio pinecone llama-index llama-index-vector-stores-pinecone llama-index-readers-file pypdf

# --- Imports ---
import logging
import os
import sys

import gradio as gr
from pinecone import Pinecone, ServerlessSpec
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.readers.file import PDFReader
from llama_index.vector_stores.pinecone import PineconeVectorStore

# --- Logging ---
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Load API keys from the (Hugging Face) environment ---
# Validate both keys before any client is constructed so a missing secret
# produces one clear error instead of an opaque failure deep inside a client.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

_missing_keys = [
    name
    for name, value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("PINECONE_API_KEY", PINECONE_API_KEY),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# Define a system prompt
system_prompt = '''
You are AYesha, the Decoding Data Science (DDS) Enterprise HR Chatbot. Answer questions exclusively using the attached DDS HR Handbook. Base all responses on the most up-to-date information available in the handbook. Only respond to queries directly related to DDS HR policies as outlined in the handbook.

- If a question pertains to topics outside DDS HR policies, respond politely, clarifying that you are a human resources bot and only answer DDS HR questions.
- For questions you cannot answer (e.g., requests for old policies, salary details, or confidential information), politely decline and direct the user to email connect@decodingdatascience.com.
- Never answer questions about anything outside of your scope.
- Persist in following these constraints for any follow-up questions.
- Before answering, carefully check that the information and query are within the allowed scope.

Follow chain-of-thought reasoning:
1. First, reason step-by-step whether the question is covered in the current handbook and is within HR.
2. Only after confirming, produce a final answer.

Format answers as concise, professional responses. Do not wrap answers in code blocks or any special formatting.

Output requirements:
- For allowed HR questions, answer concisely based only on the latest DDS HR handbook information.
- For forbidden topics, output: “I’m sorry, I can only answer questions about the latest DDS HR policies. For confidential or other queries, please email connect@decodingdatascience.com.”

**Example 1**
User: What is the leave encashment policy at DDS?
Reasoning: This is an HR policy question found in the latest handbook.
Final Answer: [Provide answer summarized from the latest handbook’s section on leave encashment]

**Example 2**
User: Can you tell me the salary range for Data Scientists?
Reasoning: Salary details are confidential and not shared by this bot.
Final Answer: I’m sorry, I can only answer questions about the latest DDS HR policies. For confidential or other queries, please email connect@decodingdatascience.com.

**Example 3**
User: Can you explain what DDS does as a company overall?
Reasoning: This is not an HR question, so it cannot be answered.
Final Answer: I’m sorry, I only answer DDS HR policy questions as outlined in the handbook.

(Real-world examples should be longer and use precise wording from the handbook where appropriate.)

**Important instructions:**
- Only answer questions directly supported by the latest DDS HR handbook.
- Decline politely and redirect to the provided email address for any questions outside scope or for confidential information.
- Always reason before concluding. Only present the answer after checking scope and source.

Remember: As AYesha, the DDS HR Enterprise Chatbot, you must never provide information outside authorized HR handbook content and always respond respectfully according to these constraints.
'''

# --- Global LlamaIndex settings ---
# The system prompt is attached to the LLM itself. Passing `system_prompt`
# to `index.as_query_engine(...)` appears to be silently dropped by the
# retriever query engine, which would leave the HR guardrails unapplied.
# NOTE(review): confirm against the installed llama-index-core version.
Settings.llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.2,
    system_prompt=system_prompt,
)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
Settings.chunk_size = 600
Settings.chunk_overlap = 200

# --- Initialize Pinecone ---
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "quickstart"
dimension = 1536  # must match the embedding model's vector size

# --- Delete index if it already exists (optional) ---
# Every start rebuilds the index from scratch, so stale vectors from an
# older handbook never survive a restart; the cost is re-embedding the PDFs.
existing_indexes = [idx["name"] for idx in pc.list_indexes()]
if index_name in existing_indexes:
    pc.delete_index(index_name)

# --- Create Pinecone index ---
pc.create_index(
    name=index_name,
    dimension=dimension,
    metric="euclidean",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
pinecone_index = pc.Index(index_name)

# --- Load PDF documents from folder ---
documents = SimpleDirectoryReader(
    input_dir="data",
    required_exts=[".pdf"],
    file_extractor={".pdf": PDFReader()},
).load_data()

if not documents:
    raise ValueError("No PDF documents were loaded from the 'data' folder.")

# --- Create Vector Index ---
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

# --- Query Engine ---
# Uses Settings.llm, which already carries the system prompt.
query_engine = index.as_query_engine()


# --- Gradio App ---
def query_doc(prompt):
    """Answer one user question through the handbook query engine.

    Args:
        prompt: The question text typed into the Gradio textbox.

    Returns:
        The query engine's response as a string. Empty or whitespace-only
        input returns a short hint without calling the LLM. If the query
        raises, the exception is logged and ``"Error: <message>"`` is
        returned so the UI shows the failure instead of crashing.
    """
    if not prompt or not prompt.strip():
        # Avoid spending an embedding + completion call on empty input.
        return "Please enter a question."
    try:
        response = query_engine.query(prompt)
    except Exception as e:  # UI boundary: report the failure, never crash.
        logger.exception("Query failed")
        return f"Error: {str(e)}"
    return str(response)


demo = gr.Interface(
    fn=query_doc,
    inputs=gr.Textbox(label="Ask a question about the document"),
    outputs=gr.Textbox(label="Answer"),
    title="DDS Enterprise Chatbot",
    description="Ask questions related to HR for latest Information.",
)
demo.launch(share=True)