# References:
# https://medium.com/@shravankoninti/mastering-rag-applying-rag-to-webscraped-information-44748ee33ea2
# https://www.youtube.com/watch?v=D8A11vqshv8
# https://python.langchain.com/docs/introduction/
import os
from dotenv import load_dotenv
# LangChain core + community
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
def main():
    """Build a small RAG pipeline end to end.

    Steps: scrape web pages, split them into chunks, embed the chunks into a
    locally persisted Chroma vector store, then answer one question with a
    retrieval -> prompt -> LLM -> string chain.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set in the environment/.env.
    """
    # 1) env
    load_dotenv()
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("Missing OPENAI_API_KEY in your .env file")
    # 2) pick pages to scrape (you can add more URLs)
    # The URL must be bare: markdown-style angle brackets ("<https://...>")
    # would be sent as part of the URL and break the HTTP fetch.
    urls = [
        "https://medium.com/@shravankoninti/mastering-rag-applying-rag-to-webscraped-information-44748ee33ea2",
    ]
    # 3) load & clean
    print("Loading webpages…")
    loader = WebBaseLoader(urls)
    docs = loader.load()  # returns a list of Documents with .page_content and .metadata
    # 4) split into chunks; the 150-char overlap keeps context across chunk boundaries
    print("Splitting into chunks…")
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
    splits = splitter.split_documents(docs)
    # 5) embed + store locally in Chroma (persist so you can reuse later)
    persist_dir = "chroma_db"
    print("Embedding and building vector store…")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=persist_dir,
    )
    # (optional) ensure it's written to disk
    # NOTE(review): newer Chroma/langchain-chroma releases persist automatically
    # and drop .persist() — confirm against the installed version.
    vectorstore.persist()
    # 6) make a retriever (top-4 chunks come back for each query)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
    # 7) LLM for answering; temperature=0 for reproducible answers
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    # 8) prompt with retrieved context
    # Use real "\n" escapes: the doubled "\\n" form would inject literal
    # backslash-n text into the prompt instead of line breaks.
    prompt = ChatPromptTemplate.from_template(
        "You are a helpful assistant. Use the provided CONTEXT to answer the QUESTION.\n"
        "If you don't know, say you don't know. Cite sources by title or URL when helpful.\n\n"
        "CONTEXT:\n{context}\n\nQUESTION:\n{question}"
    )
    # 9) build the chain (Retrieval → Prompt → LLM → String)
    rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    # 10) try a question
    user_question = "Summarize the core steps this tutorial uses to build a RAG system."
    print(f"\nAsking: {user_question}\n")
    answer = rag_chain.invoke(user_question)
    print("Answer:\n")
    print(answer)
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module. (The call must be indented under the guard.)
if __name__ == "__main__":
    main()