Prompt engineering, token optimization, reducing hallucination, RAG, embeddings, API usage & evaluation
AI / Practical Guide
# System prompt template (structured)
# Placeholders {role}, {format}, {output_schema} are meant to be filled
# via str.format() before the prompt is sent to the model.
system_prompt = """
You are an expert {role}.
## Rules
- Always respond in {format}
- Cite sources when making claims
- If unsure, say "I don't know"
## Output Format
{output_schema}
"""
# Few-shot example: the system message sets the task, then user/assistant
# pairs demonstrate the expected label format before the real query.
messages = [
{"role": "system", "content": "Classify sentiment as positive, negative, or neutral."},
{"role": "user", "content": "The food was amazing!"},
{"role": "assistant", "content": "positive"},
{"role": "user", "content": "Terrible service, never coming back."},
{"role": "assistant", "content": "negative"},
{"role": "user", "content": "The meeting is at 3pm."}, # actual query (expected label: neutral)
]
# Chain-of-Thought prompt: ask the model to show its reasoning before
# committing to a final answer.
prompt = """Solve the following problem step by step.
Show your reasoning before giving the final answer.
Q: If a train travels 120km in 2 hours,
then slows to 40km/h for 3 hours,
what is the average speed for the whole journey?
Think step by step:"""

import tiktoken
# Count tokens (OpenAI models) — tiktoken selects the tokenizer matching
# the model family (cl100k_base for gpt-4).
enc = tiktoken.encoding_for_model("gpt-4")
tokens = enc.encode("Hello, world!")
print(len(tokens)) # 4
# Estimate cost for the prompt defined above.
# NOTE(review): 2.50 is the quoted GPT-4o price per 1M input tokens —
# verify against current pricing before relying on this figure.
input_tokens = len(enc.encode(prompt))
cost = input_tokens / 1_000_000 * 2.50 # GPT-4o: $2.50/1M input
# Token-efficient formatting
# Bad (verbose):  "Please classify the following text into one of these categories"
# Good (concise): "Classify -> [positive|negative|neutral]"

# Compress context with summarization (two-pass: summarize, then answer).
# NOTE(review): `long_document`, `call_llm`, and `question` must be
# defined by the surrounding code — they are undefined in this snippet.
summary_prompt = f"Summarize in <200 words:\n{long_document}"
summary = call_llm(summary_prompt)
final_prompt = f"Based on this context:\n{summary}\n\nAnswer: {question}"

# Anti-hallucination system prompt
# Restricts answers to the retrieved context and forces explicit
# uncertainty levels. `{retrieved_documents}` is a placeholder to be
# filled (e.g. via str.format) before the prompt is sent.
system = """You are a precise research assistant.
RULES:
1. Only answer using the CONTEXT provided below
2. If the context doesn't contain the answer, say "Not found in context"
3. Quote relevant passages with [Source: section_name]
4. Never invent facts, dates, or statistics
5. Express uncertainty with confidence levels: [HIGH/MEDIUM/LOW]
CONTEXT:
{retrieved_documents}
"""
# Structured output with schema enforcement: the model is constrained to
# emit JSON matching the schema (answer text, confidence enum, sources).
response = client.chat.completions.create(
    model="gpt-4o",
    temperature=0.1,  # low temperature for factuality
    messages=messages,
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "answer",
            "schema": {
                "type": "object",
                "properties": {
                    "answer": {"type": "string"},
                    "confidence": {"type": "string", "enum": ["HIGH","MEDIUM","LOW"]},
                    "sources": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["answer", "confidence", "sources"]
            }
        }
    }
)

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Minimal RAG pipeline: chunk -> embed/store -> retrieve -> prompt.
# 1. Chunk documents: 512-char chunks with 50-char overlap so sentences
#    spanning a boundary appear in both chunks; splits on paragraph, line,
#    sentence, then word boundaries.
# NOTE(review): `docs` here is the raw document list loaded upstream; it
# is rebound to the retrieved chunks in step 3 below.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " "]
)
chunks = splitter.split_documents(docs)

# 2. Embed & store
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma.from_documents(chunks, embeddings)

# 3. Retrieve the top-5 chunks for the query
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
docs = retriever.invoke("What is the return policy?")

# 4. Generate with context; `question` must be supplied by the caller.
context = "\n\n".join(d.page_content for d in docs)
prompt = f"""Answer using ONLY the context below.
Context:
{context}
Question: {question}
Answer:"""

from openai import OpenAI
import numpy as np

client = OpenAI()

# Request embeddings for a batch of inputs; one vector comes back per
# input string, in the same order as the inputs.
response = client.embeddings.create(
    model="text-embedding-3-small",  # 1536 dims, cheap
    input=["Hello world", "Goodbye moon"],
)
vec = response.data[0].embedding  # list of floats
# Cosine similarity: 1.0 for parallel vectors, 0.0 for orthogonal ones.
def cosine_sim(a, b):
    """Return the cosine similarity of two equal-length numeric vectors.

    NOTE(review): a zero-magnitude input yields a 0/0 division (nan with
    a numpy RuntimeWarning) — guard upstream if that can occur.
    """
    a, b = np.array(a), np.array(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

from openai import OpenAI
client = OpenAI()  # reads OPENAI_API_KEY from env

# Chat completion: temperature/top_p control sampling randomness,
# max_tokens caps the length of the generated reply.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain async/await in Python"},
    ],
    temperature=0.7,
    max_tokens=500,
    top_p=0.9,
)
answer = response.choices[0].message.content
# Streaming: tokens arrive incrementally instead of one final payload.
stream = client.chat.completions.create(
    model="gpt-4o",
    messages=messages,
    stream=True
)
for chunk in stream:
    # delta.content is None on role/tool-call chunks — skip those.
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
# Tool / Function calling: declare a JSON-schema tool; the model may
# answer with a tool_call (name + arguments) instead of plain text.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "City name"}
            },
            "required": ["city"]
        }
    }
}]
response = client.chat.completions.create(
    model="gpt-4o", messages=messages, tools=tools
)

from anthropic import Anthropic
client = Anthropic()  # reads ANTHROPIC_API_KEY

# Chat — note: Anthropic takes the system prompt as a top-level `system`
# parameter, not as a message with role "system".
message = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    system="You are a precise coding assistant.",
    messages=[
        {"role": "user", "content": "Write a Python fibonacci function"},
    ],
)
print(message.content[0].text)
# Streaming: the context manager owns the connection lifecycle and
# exposes the generated text incrementally via text_stream.
with client.messages.stream(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=messages
) as stream:
    for text in stream.text_stream:
        print(text, end="")
# Tool use — Anthropic puts the JSON schema under `input_schema`
# (vs OpenAI's nested "function"/"parameters" wrapper).
response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    tools=[{
        "name": "get_weather",
        "description": "Get weather for a location",
        "input_schema": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"]
        }
    }],
    messages=[{"role": "user", "content": "Weather in Tokyo?"}]
)

# OpenAI fine-tuning (JSONL format)
# training_data.jsonl — one JSON object per line, each a complete chat:
# {"messages": [{"role":"system","content":"..."},{"role":"user","content":"..."},{"role":"assistant","content":"..."}]}
# Upload & create fine-tuning job. `with` closes the file handle
# deterministically instead of leaking it until GC.
with open("data.jsonl", "rb") as f:
    file = client.files.create(file=f, purpose="fine-tune")
job = client.fine_tuning.jobs.create(
    training_file=file.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={"n_epochs": 3}
)
# Use fine-tuned model (the ft:... name is reported on the finished job)
response = client.chat.completions.create(
    model="ft:gpt-4o-mini:my-org::abc123",
    messages=messages
)

# LangGraph agent example
from langgraph.prebuilt import create_react_agent
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool


@tool
def search_docs(query: str) -> str:
    """Search internal documentation."""
    # your retrieval logic
    # NOTE(review): placeholder — `results` is undefined in this snippet.
    return results


@tool
def run_sql(query: str) -> str:
    """Execute a read-only SQL query."""
    # NOTE(review): `db` must itself enforce read-only access; a raw
    # execute() call does not prevent writes.
    return db.execute(query)


llm = ChatOpenAI(model="gpt-4o")
agent = create_react_agent(llm, tools=[search_docs, run_sql])
result = agent.invoke({
    "messages": [{"role": "user", "content": "How many users signed up last week?"}]
})

# LLM-as-Judge evaluation
# NOTE(review): if this template is filled via str.format(), the literal
# JSON braces on the "Output JSON" line will collide with placeholder
# substitution — double them ({{ }}) in that case.
eval_prompt = """Rate the following answer on a scale of 1-5.
Criteria:
- Accuracy: Is the information correct?
- Completeness: Does it fully answer the question?
- Conciseness: Is it appropriately brief?
Question: {question}
Answer: {answer}
Reference: {reference}
Output JSON: {"accuracy": int, "completeness": int, "conciseness": int, "explanation": str}
"""
# RAGAS evaluation framework
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
result = evaluate(
dataset=eval_dataset,
metrics=[faithfulness, answer_relevancy, context_precision]
)
print(result) # {'faithfulness': 0.92, 'answer_relevancy': 0.87, ...}| Model | Context | Strengths |
|---|---|---|
| GPT-4o | 128K | Best all-round, multimodal, tools |
| Claude Opus 4 | 200K | Long context, coding, analysis |
| Claude Sonnet 4 | 200K | Great balance of speed/quality |
| Gemini 2.5 Pro | 1M+ | Massive context, multimodal |
| Llama 3.1 405B | 128K | Open-source, self-hostable |
| Mistral Large | 128K | European, multilingual, fast |
| DeepSeek V3 | 128K | Cost-effective, strong reasoning |