Spaces:

decodingdatascience
/

ddshrchatbotmaybootcamp

Running

App Files Files Community

ddshrchatbotmaybootcamp / app.py

decodingdatascience

Update app.py

8ff3856 verified 4 days ago

raw

history blame contribute delete

5.66 kB

	# If needed in Colab, install first:
	# !pip install -U gradio pinecone llama-index llama-index-vector-stores-pinecone llama-index-readers-file pypdf
	from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
	# --- Imports ---
	import logging
	import sys
	import gradio as gr
	import os

	from pinecone import Pinecone, ServerlessSpec
	from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext , Settings
	from llama_index.vector_stores.pinecone import PineconeVectorStore
	from llama_index.readers.file import PDFReader
	from llama_index.llms.openai import OpenAI
	from llama_index.embeddings.openai import OpenAIEmbedding
	# --- Logging: INFO-and-above records go to stdout ---
	logging.basicConfig(level=logging.INFO, stream=sys.stdout)

	# --- LlamaIndex global defaults (read through the shared Settings object) ---
	# Chunking parameters.
	Settings.chunk_size = 600
	Settings.chunk_overlap = 200
	# Models: one for generating answers, one for embeddings.
	Settings.llm = OpenAI(temperature=0.2, model="gpt-4o-mini")
	Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

	# System prompt for the HR assistant persona ("AYesha").
	# The text restricts answers to the DDS HR handbook, gives a fixed refusal
	# sentence for out-of-scope questions, and includes three worked examples.
	# It is handed to index.as_query_engine(system_prompt=...) further down.
	# NOTE(review): the "(Real-world examples should be longer ...)" line reads like
	# a leftover from a prompt template — confirm it is meant to be sent to the model.
	# NOTE(review): the examples show "Reasoning:" / "Final Answer:" labels, so the
	# model may echo its reasoning to the user — confirm that is intended.
	system_prompt = '''
	You are AYesha, the Decoding Data Science (DDS) Enterprise HR Chatbot. Answer questions exclusively using the attached DDS HR Handbook. Base all responses on the most up-to-date information available in the handbook. Only respond to queries directly related to DDS HR policies as outlined in the handbook.

	- If a question pertains to topics outside DDS HR policies, respond politely, clarifying that you are a human resources bot and only answer DDS HR questions.
	- For questions you cannot answer (e.g., requests for old policies, salary details, or confidential information), politely decline and direct the user to email connect@decodingdatascience.com.
	- Never answer questions about anything outside of your scope.
	- Persist in following these constraints for any follow-up questions.
	- Before answering, carefully check that the information and query are within the allowed scope. Follow chain-of-thought reasoning:
	1. First, reason step-by-step whether the question is covered in the current handbook and is within HR.
	2. Only after confirming, produce a final answer.

	Format answers as concise, professional responses. Do not wrap answers in code blocks or any special formatting.

	Output requirements:
	- For allowed HR questions, answer concisely based only on the latest DDS HR handbook information.
	- For forbidden topics, output: “I’m sorry, I can only answer questions about the latest DDS HR policies. For confidential or other queries, please email connect@decodingdatascience.com.”


	Example 1
	User: What is the leave encashment policy at DDS?
	Reasoning: This is an HR policy question found in the latest handbook.
	Final Answer: [Provide answer summarized from the latest handbook’s section on leave encashment]

	Example 2
	User: Can you tell me the salary range for Data Scientists?
	Reasoning: Salary details are confidential and not shared by this bot.
	Final Answer: I’m sorry, I can only answer questions about the latest DDS HR policies. For confidential or other queries, please email connect@decodingdatascience.com.

	Example 3
	User: Can you explain what DDS does as a company overall?
	Reasoning: This is not an HR question, so it cannot be answered.
	Final Answer: I’m sorry, I only answer DDS HR policy questions as outlined in the handbook.

	(Real-world examples should be longer and use precise wording from the handbook where appropriate.)

	Important instructions:
	- Only answer questions directly supported by the latest DDS HR handbook.
	- Decline politely and redirect to the provided email address for any questions outside scope or for confidential information.
	- Always reason before concluding. Only present the answer after checking scope and source.

	Remember: As AYesha, the DDS HR Enterprise Chatbot, you must never provide information outside authorized HR handbook content and always respond respectfully according to these constraints.

	'''


	# --- Read API credentials from the process environment ---
	# Each lookup yields None when the variable is not set.

	PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
	OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


	# --- Pinecone: client plus a freshly created index ---
	index_name = "quickstart"
	dimension = 1536
	pc = Pinecone(api_key=PINECONE_API_KEY)

	# Collect the names of the indexes the client currently reports.
	existing_indexes = []
	for described_index in pc.list_indexes():
	    existing_indexes.append(described_index["name"])

	# Remove an earlier index with the same name so the create call below starts clean.
	if existing_indexes.count(index_name) > 0:
	    pc.delete_index(index_name)

	# --- Create the index (serverless, AWS us-east-1, euclidean metric) ---
	serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
	pc.create_index(
	    name=index_name,
	    dimension=dimension,
	    metric="euclidean",
	    spec=serverless_spec,
	)

	# Handle to the index that was just created.
	pinecone_index = pc.Index(index_name)

	# --- Read the PDFs in the 'data' folder ---
	# Only .pdf files are accepted, and each one is parsed with PDFReader.
	pdf_directory_reader = SimpleDirectoryReader(
	    input_dir="data",
	    required_exts=[".pdf"],
	    file_extractor={".pdf": PDFReader()},
	)
	documents = pdf_directory_reader.load_data()

	# Stop here instead of indexing nothing.
	if not documents:
	    raise ValueError("No PDF documents were loaded from the 'data' folder.")

	# --- Build the vector index on top of the Pinecone index ---
	# The Pinecone handle is wrapped in a vector store, which is then placed in the
	# storage context that the index is built with.
	vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
	storage_context = StorageContext.from_defaults(vector_store=vector_store)

	index = VectorStoreIndex.from_documents(documents=documents, storage_context=storage_context)

	# --- Query Engine ---
	# as_query_engine() does not define a `system_prompt` parameter; a keyword of that
	# name is absorbed by **kwargs and never reaches the model, so the persona and
	# scope rules above would not be applied. The LLM object itself carries a
	# `system_prompt` field that is prepended to its calls, so the prompt is set there.
	# NOTE(review): confirm against the installed llama-index version.
	Settings.llm.system_prompt = system_prompt
	query_engine = index.as_query_engine()

	# --- Gradio App ---
	def query_doc(prompt):
	    """Answer one user question through the module-level query engine.

	    Args:
	        prompt: Question text submitted from the UI.

	    Returns:
	        The engine's response converted to a string; a short hint when the
	        question is missing or blank; or an "Error: ..." string when the
	        query raises.
	    """
	    # Nothing to look up for a missing or whitespace-only question, so do not
	    # send it to the query engine at all.
	    if prompt is None or not str(prompt).strip():
	        return "Please enter a question."
	    try:
	        return str(query_engine.query(prompt))
	    except Exception as e:
	        # Keep the full traceback in the logs; the user only sees the short message.
	        logging.exception("Query failed")
	        return f"Error: {str(e)}"

	# Build the single-textbox UI around query_doc and start serving it.
	# launch() is called without share=True: Gradio does not support share links on
	# Hugging Face Spaces (it warns and ignores the flag), Colab turns sharing on by
	# default, and anywhere else the flag would open a public tunnel to this HR bot.
	# Sharing can still be requested explicitly with the GRADIO_SHARE environment variable.
	gr.Interface(
	    fn=query_doc,
	    inputs=gr.Textbox(label="Ask a question about the document"),
	    outputs=gr.Textbox(label="Answer"),
	    title="DDS Enterprise Chatbot",
	    description="Ask questions related to HR for latest Information.",
	).launch()