/**
 * Finds the rows of `documents` whose stored embedding is closest to the
 * embedding of `question`, using the `<=>` distance operator.
 *
 * @param question - Natural-language query; embedded through `embedBatch`.
 * @param topK - Maximum number of rows to return (default 5). Must be a
 *   non-negative integer.
 * @returns Rows with `source`, `content` and the computed `distance`,
 *   ordered by ascending distance (closest first).
 * @throws RangeError if `topK` is not a non-negative integer.
 * @throws Error if `embedBatch` yields no embedding for the question.
 */
export async function queryDocuments(
  question: string,
  topK = 5,
): Promise<Array<{ source: string; content: string; distance: number }>> {
  // Reject an unusable LIMIT before making the embedding call, rather than
  // letting the database reject it afterwards.
  if (!Number.isInteger(topK) || topK < 0) {
    throw new RangeError(`topK must be a non-negative integer, got ${topK}`);
  }

  // embed the question with the same model used at ingest time
  const [embedding] = await embedBatch([question]);
  if (embedding == null) {
    // JSON.stringify(undefined) is undefined, which must not be bound as a vector.
    throw new Error("embedBatch returned no embedding for the question");
  }

  // The vector is passed as its JSON text form and cast to `vector` in SQL.
  const embeddingStr = JSON.stringify(embedding);

  const rows = await sql<{ source: string; content: string; distance: number }[]>`
SELECT
source,
content,
(embedding <=> ${embeddingStr}::vector) AS distance
FROM documents
ORDER BY embedding <=> ${embeddingStr}::vector
LIMIT ${topK}
`;
  return rows;
}
The <=> operator returns cosine distance (0 = identical direction, 2 = opposite). Lower numbers win. If you add metadata filters, put them in the WHERE clause, before ORDER BY, so the planner can use the HNSW iterative scan introduced in pgvector 0.8.0 (it is off by default; enable it with SET hnsw.iterative_scan = strict_order).
// filtered query example: the same nearest-neighbour search, restricted to rows
// whose `source` column equals `filterSource`.
// `embeddingStr`, `filterSource` and `topK` are expected to come from the surrounding code (not shown here).
// NOTE(review): the original note said the same model must have returned results for this source —
// presumably the stored embeddings for this source must come from the same embedding model as the query; confirm.
const rows = await sql<{ source: string; content: string; distance: number }[]>`
SELECT source, content, (embedding <=> ${embeddingStr}::vector) AS distance
FROM documents
WHERE source = ${filterSource}
ORDER BY embedding <=> ${embeddingStr}::vector
LIMIT ${topK}
`;
5. Wiring retrieved docs into an LLM call
Concatenate the retrieved chunks into a context block, then call your model of choice. Claude 3.5 Sonnet and GPT-4o both handle long contexts well. Keep the context block under 80,000 tokens for cost reasons.
import Anthropic from "@anthropic-ai/sdk";
// Module-level Anthropic client, given the API key read from the ANTHROPIC_API_KEY environment variable.
// The trailing `!` is a compile-time non-null assertion only; this line does not check at runtime that the variable is set.
const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY! });
/**
 * Answers `question` from retrieved documents: fetches up to 5 chunks with
 * `queryDocuments`, packs them into a single prompt, and asks the model to
 * answer using only that context.
 *
 * @param question - The user's question; used for retrieval and in the prompt.
 * @returns The concatenated text of the model's reply, the fixed string
 *   "No relevant documents found." when retrieval returns nothing, or an
 *   empty string when the reply contains no text blocks.
 */
export async function answerWithRAG(question: string): Promise<string> {
  const docs = await queryDocuments(question, 5);
  if (docs.length === 0) {
    // Nothing retrieved: skip the model call entirely.
    return "No relevant documents found.";
  }

  // Label each chunk with a 1-based index and its source, separated by a rule.
  const context = docs
    .map((d, i) => `[${i + 1}] (${d.source})\n${d.content}`)
    .join("\n\n---\n\n");

  // The template body is flush-left on purpose: any indentation here would
  // become part of the prompt text sent to the model.
  const prompt = `You are a helpful assistant. Answer the question using only the provided context.
If the context does not contain the answer, say so.
Context:
${context}
Question: ${question}`;

  const response = await anthropic.messages.create({
    // NOTE(review): confirm this model ID against Anthropic's published model list — TODO verify.
    model: "claude-sonnet-4-6-20250929",
    max_tokens: 1024,
    messages: [{ role: "user", content: prompt }],
  });

  // Gather every text block instead of reading only content[0]: indexing the
  // first element throws on an empty content array and drops the answer when
  // a non-text block comes first.
  const textParts: string[] = [];
  for (const block of response.content) {
    if (block.type === "text") {
      textParts.push(block.text);
    }
  }
  return textParts.join("");
}
The "answer using only the provided context" instruction is load-bearing. Without it, the model mixes retrieval with parametric memory and you cannot tell which is which. If the answer comes from the context, citations work. If it comes from training data, they do not. Force the distinction at the prompt level.
One more thing worth noting: rerank before you send to the LLM. A fast cosine search returns the 5 closest chunks by vector distance, but distance does not always equal usefulness. A cross-encoder reranker (Cohere Rerank costs about $1 per 1,000 queries) takes your top-20 candidates and scores them for actual relevance before you trim to 5. The quality jump is noticeable. Skip the reranker while prototyping, but add it before you hit production.
6. Two gotchas that bite everyone
Chunk size drives recall more than index parameters
Most teams spend hours tuning HNSW m and ef_construction and see marginal gains. The actual lever is chunk size and overlap. A chunk that is too short loses context (the model cannot answer a cross-sentence question). A chunk that is too long pulls in noise, dilutes the embedding, and wastes context window in the LLM call. Run a quick eval: take 20 representative questions, retrieve top-5, then manually score whether the answer appeared in the returned chunks. Adjust chunk size in 100-word steps until recall tops 85%. Then tune the index.
Build the index after bulk loading, not before
HNSW indexing at insert time is slow. If you load 500,000 documents and the HNSW index exists, every INSERT pays the graph update cost. The fast path: load all rows with the index dropped, then build it once with CREATE INDEX. On a table of 500,000 rows with 1,536-dim embeddings, a cold HNSW build takes roughly 8 to 12 minutes on 4 vCPUs. That is far cheaper than the cumulative insert overhead.
-- drop the index before bulk load, so the inserts that follow do not have to maintain it
-- (IF EXISTS keeps this from raising an error when the index is not there)
DROP INDEX IF EXISTS documents_embedding_idx;
-- ... run your ingest pipeline ...
-- rebuild once after load: an HNSW index on documents.embedding using the
-- vector_cosine_ops operator class, with explicit m and ef_construction build parameters
CREATE INDEX documents_embedding_idx
ON documents USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
The bottom line
The full pipeline is about 120 lines of TypeScript and three SQL statements. pgvector 0.8.x is stable enough for production, HNSW is the right default index for most teams, and the two things that matter most for answer quality are chunk size and staying consistent between embed-at-ingest and embed-at-query time (same model, same preprocessing). Dedicated vector DBs are not wrong, they are just a layer you do not need until your row count passes 50M or your recall requirements get strict enough to warrant a tuning team.
What chunk size worked best for your use case? Drop it in the comments.
GDS K S · thegdsks.com · follow on X @thegdsks
Good retrieval beats a better model every time.