Adding AI-Powered Features
Semantic Search with Embeddings
Project Goal: Build a search system that finds relevant results based on meaning, enabling natural language queries.
Understanding Embeddings
Embeddings convert text into numerical vectors that capture semantic meaning:
"How do I fix a bug?" → [0.12, -0.34, 0.56, ...]
"Debug an error" → [0.11, -0.33, 0.55, ...] // Similar!
"Buy groceries" → [-0.45, 0.21, -0.12, ...] // Different!
Similar meanings = similar vectors = easy to find related content.
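To make "similar vectors = easy to find" concrete, here is a minimal sketch of cosine similarity, the metric this chapter's searches rely on. The three-element vectors are toy stand-ins for real 1536-dimensional embeddings:
// A minimal sketch of cosine similarity; real embeddings have 1536 dimensions
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
// Toy vectors echoing the example above
cosineSimilarity([0.12, -0.34, 0.56], [0.11, -0.33, 0.55]); // ≈ 1.0 → similar
cosineSimilarity([0.12, -0.34, 0.56], [-0.45, 0.21, -0.12]); // < 0 → unrelated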
Project Setup Prompt
Add semantic search to my application:
## Tech Stack
- Next.js 15 with App Router
- OpenAI text-embedding-3-small model
- Vercel Postgres with pgvector extension
- Drizzle ORM
## Features
1. Document ingestion with chunking
2. Embedding generation and storage
3. Similarity search API
4. Hybrid search (semantic + keyword)
5. Search result ranking
6. Query expansion
## Project Structure
/lib
  /embeddings
    client.ts    # Embedding generation
    chunker.ts   # Text chunking
  /search
    semantic.ts  # Vector similarity
    hybrid.ts    # Combined search
/app
  /api/search/route.ts
  /api/ingest/route.ts
Database Schema with pgvector
// lib/db/schema.ts
import {
pgTable,
text,
timestamp,
uuid,
integer,
index,
vector,
} from 'drizzle-orm/pg-core';
export const documents = pgTable('documents', {
id: uuid('id').primaryKey().defaultRandom(),
title: text('title').notNull(),
content: text('content').notNull(),
source: text('source'),
metadata: text('metadata'), // JSON string
createdAt: timestamp('created_at').defaultNow(),
updatedAt: timestamp('updated_at').defaultNow(),
});
export const documentChunks = pgTable(
'document_chunks',
{
id: uuid('id').primaryKey().defaultRandom(),
documentId: uuid('document_id')
.notNull()
.references(() => documents.id, { onDelete: 'cascade' }),
content: text('content').notNull(),
chunkIndex: integer('chunk_index').notNull(),
embedding: vector('embedding', { dimensions: 1536 }),
tokenCount: integer('token_count'),
createdAt: timestamp('created_at').defaultNow(),
},
(table) => ({
// Create HNSW index for fast similarity search
embeddingIdx: index('embedding_idx').using(
'hnsw',
table.embedding.op('vector_cosine_ops')
),
documentIdx: index('document_idx').on(table.documentId),
})
);
export type Document = typeof documents.$inferSelect;
export type DocumentChunk = typeof documentChunks.$inferSelect;
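One prerequisite the schema itself does not handle: the pgvector extension must be enabled once per database before the vector column or HNSW index can be created. A minimal sketch, assuming you run it from a one-off migration script (the helper name and location are illustrative):
// Run once before pushing the schema; vector columns and HNSW indexes
// require the pgvector extension to be installed in the database
import { sql } from 'drizzle-orm';
import { db } from '@/lib/db';

export async function enablePgvector() {
  await db.execute(sql`CREATE EXTENSION IF NOT EXISTS vector`);
}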
Text Chunking
// lib/embeddings/chunker.ts
interface ChunkOptions {
maxChunkSize: number; // Max characters per chunk
chunkOverlap: number; // Overlap between chunks
minChunkSize: number; // Minimum chunk size
}
const defaultOptions: ChunkOptions = {
maxChunkSize: 1000,
chunkOverlap: 200,
minChunkSize: 100,
};
export function chunkText(
text: string,
options: Partial<ChunkOptions> = {}
): string[] {
const opts = { ...defaultOptions, ...options };
const chunks: string[] = [];
// Split by paragraphs first
const paragraphs = text.split(/\n\n+/);
let currentChunk = '';
for (const paragraph of paragraphs) {
const trimmedParagraph = paragraph.trim();
if (!trimmedParagraph) continue;
// If paragraph alone exceeds max, split by sentences
if (trimmedParagraph.length > opts.maxChunkSize) {
if (currentChunk) {
chunks.push(currentChunk.trim());
currentChunk = '';
}
chunks.push(...chunkBySentences(trimmedParagraph, opts));
continue;
}
// Check if adding paragraph exceeds limit
if (currentChunk.length + trimmedParagraph.length > opts.maxChunkSize) {
if (currentChunk.length >= opts.minChunkSize) {
chunks.push(currentChunk.trim());
// Start new chunk with overlap from previous
const overlap = getOverlap(currentChunk, opts.chunkOverlap);
currentChunk = overlap + trimmedParagraph;
} else {
currentChunk += '\n\n' + trimmedParagraph;
}
} else {
currentChunk += (currentChunk ? '\n\n' : '') + trimmedParagraph;
}
}
  // Flush the remainder; fold a too-short tail into the previous chunk
  // instead of silently dropping content
  if (currentChunk.trim()) {
    if (currentChunk.length >= opts.minChunkSize || chunks.length === 0) {
      chunks.push(currentChunk.trim());
    } else {
      chunks[chunks.length - 1] += '\n\n' + currentChunk.trim();
    }
  }
return chunks;
}
function chunkBySentences(text: string, opts: ChunkOptions): string[] {
const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
const chunks: string[] = [];
let currentChunk = '';
for (const sentence of sentences) {
if (currentChunk.length + sentence.length > opts.maxChunkSize) {
if (currentChunk) {
chunks.push(currentChunk.trim());
const overlap = getOverlap(currentChunk, opts.chunkOverlap);
currentChunk = overlap + sentence;
} else {
// Single sentence too long, force split
chunks.push(sentence.slice(0, opts.maxChunkSize));
currentChunk = sentence.slice(opts.maxChunkSize - opts.chunkOverlap);
}
} else {
currentChunk += sentence;
}
}
if (currentChunk.trim()) {
chunks.push(currentChunk.trim());
}
return chunks;
}
function getOverlap(text: string, overlapSize: number): string {
if (text.length <= overlapSize) return text;
return text.slice(-overlapSize);
}
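A quick usage sketch (the sample text and options are illustrative; a short input collapses to a single chunk):
import { chunkText } from '@/lib/embeddings/chunker';

const articleBody = ['First paragraph of the article.', 'Second paragraph.'].join('\n\n');
const chunks = chunkText(articleBody, {
  maxChunkSize: 800,
  chunkOverlap: 150,
  minChunkSize: 10, // lowered so this tiny sample is not merged away
});
console.log(chunks.length); // 1: both paragraphs fit in one chunk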
Embedding Generation
// lib/embeddings/client.ts
import OpenAI from 'openai';
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
export async function generateEmbedding(text: string): Promise<number[]> {
const response = await openai.embeddings.create({
model: 'text-embedding-3-small',
input: text,
dimensions: 1536,
});
return response.data[0].embedding;
}
export async function generateEmbeddings(
texts: string[]
): Promise<number[][]> {
// OpenAI supports batching up to 2048 inputs
const batchSize = 100;
const embeddings: number[][] = [];
for (let i = 0; i < texts.length; i += batchSize) {
const batch = texts.slice(i, i + batchSize);
const response = await openai.embeddings.create({
model: 'text-embedding-3-small',
input: batch,
dimensions: 1536,
});
embeddings.push(...response.data.map((d) => d.embedding));
}
return embeddings;
}
// Estimate token count (rough approximation)
export function estimateTokens(text: string): number {
// GPT tokenizer roughly: 1 token ≈ 4 characters
return Math.ceil(text.length / 4);
}
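Embedding endpoints are rate limited, so production ingestion benefits from retrying transient failures. A minimal sketch with exponential backoff; the withRetry helper and its policy are assumptions, not part of the OpenAI SDK:
// Hypothetical helper: retry a call with exponential backoff (1s, 2s, 4s, ...)
async function withRetry<T>(fn: () => Promise<T>, maxAttempts = 3): Promise<T> {
  let lastError: unknown;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      await new Promise((resolve) => setTimeout(resolve, 1000 * 2 ** attempt));
    }
  }
  throw lastError;
}

// Usage inside generateEmbeddings:
// const response = await withRetry(() => openai.embeddings.create({ ... }));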
Document Ingestion API
// app/api/ingest/route.ts
import { db } from '@/lib/db';
import { documents, documentChunks } from '@/lib/db/schema';
import { chunkText } from '@/lib/embeddings/chunker';
import {
generateEmbeddings,
estimateTokens,
} from '@/lib/embeddings/client';
export async function POST(request: Request) {
try {
const { title, content, source, metadata } = await request.json();
if (!content) {
return Response.json({ error: 'Content is required' }, { status: 400 });
}
// Create document
const [document] = await db
.insert(documents)
.values({
title: title || 'Untitled',
content,
source,
metadata: metadata ? JSON.stringify(metadata) : null,
})
.returning();
// Chunk the content
const chunks = chunkText(content);
// Generate embeddings for all chunks
const embeddings = await generateEmbeddings(chunks);
// Insert chunks with embeddings
const chunkRecords = chunks.map((chunk, index) => ({
documentId: document.id,
content: chunk,
chunkIndex: index,
embedding: embeddings[index],
tokenCount: estimateTokens(chunk),
}));
await db.insert(documentChunks).values(chunkRecords);
return Response.json({
success: true,
documentId: document.id,
chunksCreated: chunks.length,
});
} catch (error) {
console.error('Ingestion error:', error);
return Response.json(
{ error: 'Failed to ingest document' },
{ status: 500 }
);
}
}
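Calling the endpoint from a client or script looks like this (the sample document is illustrative):
// Illustrative call to the ingest route
await fetch('/api/ingest', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    title: 'Deployment guide',
    content: 'Step one: build the app. Step two: push to production.',
    source: 'docs/deploy.md',
    metadata: { category: 'guides' },
  }),
});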
Semantic Search Implementation
// lib/search/semantic.ts
import { db } from '@/lib/db';
import { documentChunks, documents } from '@/lib/db/schema';
import { generateEmbedding } from '@/lib/embeddings/client';
import { sql, desc, eq, gt, and, inArray, cosineDistance } from 'drizzle-orm';
interface SearchResult {
documentId: string;
documentTitle: string;
chunkContent: string;
similarity: number;
chunkIndex: number;
}
export async function semanticSearch(
query: string,
options: {
limit?: number;
minSimilarity?: number;
documentIds?: string[];
} = {}
): Promise<SearchResult[]> {
const { limit = 10, minSimilarity = 0.7, documentIds } = options;
// Generate embedding for query
const queryEmbedding = await generateEmbedding(query);
// Build similarity search query
const similarity = sql<number>`1 - (${cosineDistance(
documentChunks.embedding,
queryEmbedding
)})`;
  // Build the WHERE clause up front; chaining .where() twice would
  // overwrite the first condition in Drizzle's query builder
  const conditions = [gt(similarity, minSimilarity)];
  if (documentIds && documentIds.length > 0) {
    conditions.push(inArray(documentChunks.documentId, documentIds));
  }
  const results = await db
    .select({
      documentId: documentChunks.documentId,
      documentTitle: documents.title,
      chunkContent: documentChunks.content,
      similarity,
      chunkIndex: documentChunks.chunkIndex,
    })
    .from(documentChunks)
    .innerJoin(documents, eq(documents.id, documentChunks.documentId))
    .where(and(...conditions))
    .orderBy(desc(similarity))
    .limit(limit);
  return results;
}
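One detail worth keeping straight: Drizzle's cosineDistance compiles to pgvector's cosine-distance operator, so similarity = 1 - distance, and the default minSimilarity of 0.7 is equivalent to requiring a cosine distance below 0.3. A usage sketch (the query text is illustrative):
import { semanticSearch } from '@/lib/search/semantic';

const hits = await semanticSearch('how do I reset my password', {
  limit: 5,
  minSimilarity: 0.75, // stricter than the 0.7 default
});
for (const hit of hits) {
  console.log(hit.similarity.toFixed(3), hit.documentTitle);
}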
Hybrid Search (Semantic + Keyword)
// lib/search/hybrid.ts
import { db } from '@/lib/db';
import { documentChunks, documents } from '@/lib/db/schema';
import { generateEmbedding } from '@/lib/embeddings/client';
import { sql, desc, eq } from 'drizzle-orm';
interface HybridSearchOptions {
limit?: number;
semanticWeight?: number; // 0-1, weight for semantic score
keywordWeight?: number; // 0-1, weight for keyword score
minScore?: number;
}
export async function hybridSearch(
query: string,
options: HybridSearchOptions = {}
): Promise<any[]> {
const {
limit = 10,
semanticWeight = 0.7,
keywordWeight = 0.3,
minScore = 0.5,
} = options;
const queryEmbedding = await generateEmbedding(query);
// Extract keywords for text search
const keywords = query
.toLowerCase()
.split(/\s+/)
.filter((w) => w.length > 2);
// Semantic similarity score
const semanticScore = sql<number>`1 - (${documentChunks.embedding} <=> ${JSON.stringify(queryEmbedding)}::vector)`;
// Keyword match score (simple: count matching keywords)
const keywordConditions = keywords.map(
(kw) => sql`${documentChunks.content} ILIKE ${'%' + kw + '%'}`
);
const keywordScore =
keywords.length > 0
? sql<number>`(
${sql.join(
keywordConditions.map(
(cond) => sql`CASE WHEN ${cond} THEN 1 ELSE 0 END`
),
sql` + `
)}
)::float / ${keywords.length}`
: sql<number>`0`;
// Combined score
const combinedScore = sql<number>`
(${semanticScore} * ${semanticWeight}) +
(${keywordScore} * ${keywordWeight})
`;
const results = await db
.select({
documentId: documentChunks.documentId,
documentTitle: documents.title,
chunkContent: documentChunks.content,
semanticScore,
keywordScore,
combinedScore,
chunkIndex: documentChunks.chunkIndex,
})
.from(documentChunks)
.innerJoin(documents, eq(documents.id, documentChunks.documentId))
.where(sql`${combinedScore} > ${minScore}`)
.orderBy(desc(combinedScore))
.limit(limit);
return results;
}
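Both component scores land roughly in the 0-1 range (cosine similarity on one side, the fraction of matched keywords on the other), so the weighted sum is directly comparable against minScore. A usage sketch; the weights here are a tuning choice, not a recommendation:
import { hybridSearch } from '@/lib/search/hybrid';

// Lean harder on exact keywords for short, jargon-heavy queries
const results = await hybridSearch('pgvector hnsw index', {
  limit: 5,
  semanticWeight: 0.5,
  keywordWeight: 0.5,
});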
Search API Route
// app/api/search/route.ts
import { semanticSearch } from '@/lib/search/semantic';
import { hybridSearch } from '@/lib/search/hybrid';
export async function GET(request: Request) {
const { searchParams } = new URL(request.url);
const query = searchParams.get('q');
const mode = searchParams.get('mode') || 'hybrid';
const limit = parseInt(searchParams.get('limit') || '10', 10);
if (!query) {
return Response.json({ error: 'Query is required' }, { status: 400 });
}
try {
let results;
if (mode === 'semantic') {
results = await semanticSearch(query, { limit });
} else {
results = await hybridSearch(query, { limit });
}
return Response.json({
query,
mode,
results,
count: results.length,
});
} catch (error) {
console.error('Search error:', error);
return Response.json({ error: 'Search failed' }, { status: 500 });
}
}
Search UI Component
// components/search/search-interface.tsx
'use client';
import { useState } from 'react';
import { Search, Loader2 } from 'lucide-react';
interface SearchResult {
documentId: string;
documentTitle: string;
chunkContent: string;
similarity?: number;
combinedScore?: number;
}
export function SearchInterface() {
const [query, setQuery] = useState('');
const [results, setResults] = useState<SearchResult[]>([]);
const [isLoading, setIsLoading] = useState(false);
const [mode, setMode] = useState<'hybrid' | 'semantic'>('hybrid');
const handleSearch = async () => {
if (!query.trim()) return;
setIsLoading(true);
try {
const response = await fetch(
`/api/search?q=${encodeURIComponent(query)}&mode=${mode}&limit=10`
);
      const data = await response.json();
      if (!response.ok) throw new Error(data.error ?? 'Search failed');
      setResults(data.results ?? []);
} catch (error) {
console.error('Search failed:', error);
} finally {
setIsLoading(false);
}
};
return (
<div className="mx-auto max-w-3xl p-6">
<div className="mb-6">
<div className="flex gap-2">
<div className="relative flex-1">
<Search className="absolute left-3 top-1/2 h-4 w-4 -translate-y-1/2 text-muted-foreground" />
<input
type="text"
value={query}
onChange={(e) => setQuery(e.target.value)}
onKeyDown={(e) => e.key === 'Enter' && handleSearch()}
placeholder="Search documents..."
className="w-full rounded-lg border bg-background py-2 pl-10 pr-4"
/>
</div>
<select
value={mode}
onChange={(e) => setMode(e.target.value as 'hybrid' | 'semantic')}
className="rounded-lg border bg-background px-3"
>
<option value="hybrid">Hybrid</option>
<option value="semantic">Semantic Only</option>
</select>
<button
onClick={handleSearch}
disabled={isLoading}
className="rounded-lg bg-primary px-4 py-2 text-primary-foreground"
>
{isLoading ? (
<Loader2 className="h-4 w-4 animate-spin" />
) : (
'Search'
)}
</button>
</div>
</div>
<div className="space-y-4">
{results.map((result, i) => (
<div
key={`${result.documentId}-${i}`}
className="rounded-lg border bg-card p-4"
>
<h3 className="font-semibold">{result.documentTitle}</h3>
<p className="mt-2 text-sm text-muted-foreground line-clamp-3">
{result.chunkContent}
</p>
<div className="mt-2 text-xs text-muted-foreground">
Score: {((result.combinedScore || result.similarity || 0) * 100).toFixed(1)}%
</div>
</div>
))}
{results.length === 0 && query && !isLoading && (
<p className="text-center text-muted-foreground">
No results found for "{query}"
</p>
)}
</div>
</div>
);
}
Key Takeaways
- Embeddings capture semantic meaning in vectors
- Chunking strategy affects search quality significantly
- pgvector enables efficient similarity search in Postgres
- Hybrid search combines semantic understanding with keyword precision
- Batch processing is essential for efficient embedding generation