Download Notebook (.ipynb)
R_002 - Minimal Retrieval (TF-IDF)
Goal: build a tiny retrieval pipeline (chunk → index → query → top-k evidence).
No LLM is involved; the focus is on reproducibility and on displaying the retrieved evidence.
import sys
from pathlib import Path

# Make the repo root importable regardless of where this is launched from.
repo_root = Path.cwd()
# If running inside LABS/notebooks, cwd is `notebooks` — go up two levels.
if not (repo_root / "LABS").exists():
    repo_root = repo_root.parent.parent
sys.path.insert(0, str(repo_root))

from LABS.src.text_chunker import chunk_text
from LABS.src.retrieval_tfidf import TfidfRetriever

# Tiny in-memory corpus: doc_id -> raw text.
# Every document is .strip()-ed so all four are normalized identically before
# chunking (originally only doc_eval was stripped — an inconsistency).
docs = {
    "doc_york_ai": """
University of York offers an MSc in Artificial Intelligence covering machine learning,
deep learning, and autonomous systems. Students often build projects using Python and PyTorch.
""".strip(),
    "doc_rag": """
Retrieval-Augmented Generation (RAG) improves factual grounding by retrieving relevant documents
and injecting them into a model's context. Typical components include chunking, retrieval, reranking,
and evaluation for faithfulness.
""".strip(),
    "doc_transformer": """
Transformers use self-attention to model token interactions. They scale well and are the backbone
of modern large language models. Common variants include encoder-only, decoder-only, and encoder-decoder.
""".strip(),
    "doc_eval": """
Evaluating retrieval systems often uses metrics like precision@k, recall@k, MRR, and nDCG.
For RAG, you also care about citation faithfulness and answer correctness on held-out queries.
""".strip(),
}
print(len(docs))  # was a bare notebook display expression; print for script use

# Chunk every document; each chunk gets a stable id "<doc_id>::chunkNN" so
# retrieval hits can be traced back to their source document.
chunks = []
for doc_id, text in docs.items():
    cs = chunk_text(text, chunk_size=220, overlap=40)
    for i, c in enumerate(cs):
        chunks.append((f"{doc_id}::chunk{i:02d}", c.text))

print(len(chunks), chunks[0][0], chunks[0][1][:80])

# Build the TF-IDF index over the (chunk_id, chunk_text) pairs.
retriever = TfidfRetriever(chunks)


def show_results(query: str, top_k: int = 5) -> None:
    """Run `query` against the module-level retriever and print the top_k hits.

    Each hit is printed as its chunk id, TF-IDF cosine score, and the chunk
    text flattened onto one line as displayable evidence.
    """
    print("Query:", query)
    results = retriever.search(query, top_k=top_k)
    for r in results:
        print(f"- {r.doc_id} score={r.score:.3f}")
        print(" ", r.text.strip().replace("\n", " "))
    print()
# Demo queries: show the retrieved evidence for two natural-language questions.
show_results("What is RAG and why is chunking important?", top_k=5)
show_results("Which metrics are used to evaluate retrieval systems?", top_k=5)

# Tiny labelled evaluation set: (query, expected doc-id prefix). A retrieved
# chunk counts as relevant when its id starts with the expected prefix.
queries = [
    ("rag components chunking retrieval", "doc_rag"),
    ("self attention transformer backbone", "doc_transformer"),
    ("precision recall mrr ndcg evaluation", "doc_eval"),
]
def recall_at_k(k: int = 3) -> float:
    """Fraction of eval `queries` whose relevant document appears in the top-k.

    A query is a hit when any of the top-k chunk ids starts with the expected
    doc-id prefix. Returns a value in [0.0, 1.0]; 0.0 for an empty eval set
    (originally this raised ZeroDivisionError).
    """
    if not queries:
        return 0.0
    hits = 0
    for query, target_prefix in queries:
        results = retriever.search(query, top_k=k)
        hits += int(any(r.doc_id.startswith(target_prefix) for r in results))
    return hits / len(queries)


for k in [1, 3, 5]:
    print("recall@", k, "=", recall_at_k(k))

# Notes
# - This is a minimal baseline (TF-IDF cosine).
# - Next step: add a tiny evaluation set + compute recall@k / MRR.
# - Then: swap retriever to BM25/hybrid or add simple reranking.