import ContextualAI, { toFile } from "contextual-client";
import { ChromaClient, OpenAIEmbeddingFunction } from "chromadb";
import fs from "node:fs";
// API credentials come from the environment; the non-null assertions mean
// both variables are expected to be set before this script runs.
const contextualApiKey = process.env.CONTEXTUAL_AI_API_KEY!;
const openAiApiKey = process.env.OPENAI_API_KEY!;

// Client used for the Contextual AI parse calls.
const contextual = new ContextualAI({ apiKey: contextualApiKey });

// Chroma client constructed with its default options.
const chroma = new ChromaClient();

// Embedding function configured for the "text-embedding-3-small" model.
// NOTE(review): confirm the `apiKey` / `model` option names against the
// installed chromadb version.
const embedder = new OpenAIEmbeddingFunction({
  apiKey: openAiApiKey,
  model: "text-embedding-3-small",
});
// Wrap the local PDF as an uploadable file object.
const pdfPath = "document.pdf";
const pdfUpload = await toFile(fs.createReadStream(pdfPath), pdfPath, {
  type: "application/pdf",
});

// Submit the PDF for parsing in "standard" mode with document hierarchy
// enabled; the response's `job_id` is used below to track the job.
const parseRes = await contextual.parse.create({
  raw_file: pdfUpload,
  parse_mode: "standard",
  enable_document_hierarchy: true,
});
/**
 * Monitors a parse job until it finishes (the Parse API is asynchronous).
 *
 * The job status is requested up to `maxAttempts` times, pausing `interval`
 * milliseconds between consecutive requests.
 *
 * @param jobId - Identifier of the parse job to monitor.
 * @param maxAttempts - Maximum number of status requests before giving up.
 * @param interval - Delay between status requests, in milliseconds.
 * @returns A promise that resolves once the job reports `"completed"`.
 * @throws Error if the job reports `"failed"`, or if it has still not
 *   completed after `maxAttempts` status requests.
 */
async function waitForJob(
  jobId: string,
  maxAttempts = 20,
  interval = 30000
): Promise<void> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const s = await contextual.parse.jobStatus(jobId);
    if (s.status === "completed") return;
    if (s.status === "failed") throw new Error("Parse job failed");
    // Only pause when another status request will follow; sleeping after the
    // final attempt would just delay the timeout error below.
    if (attempt < maxAttempts - 1) {
      await new Promise<void>((r) => setTimeout(r, interval));
    }
  }
  // Running out of attempts must not look like success: callers go on to
  // fetch results as soon as this function resolves.
  throw new Error(
    `Parse job ${jobId} did not complete after ${maxAttempts} status checks`
  );
}
// Wait on the submitted parse job, then fetch its output in the
// "blocks-per-page" format.
const { job_id: parseJobId } = parseRes;
await waitForJob(parseJobId);
const results = await contextual.parse.jobResults(parseJobId, {
  output_types: ["blocks-per-page"],
});

// Open the "documents" collection, creating it if it does not exist yet,
// and attach the embedding function to it.
const collection = await chroma.getOrCreateCollection({
  name: "documents",
  embeddingFunction: embedder,
});
// Add parsed content to Chroma.
// Only these block types are indexed; every other block type is skipped.
const INDEXABLE_BLOCK_TYPES: ReadonlySet<string> = new Set([
  "text",
  "heading",
  "table",
]);

// Parallel arrays: entry i of each array describes the same parsed block.
const texts: string[] = [];
const metadatas: Array<Record<string, string | number | boolean | null>> = [];
const ids: string[] = [];

for (const page of results.pages ?? []) {
  for (const block of page.blocks ?? []) {
    if (!INDEXABLE_BLOCK_TYPES.has(block.type)) continue;
    texts.push(block.markdown);
    // Stored page number is the page index plus one (index defaults to 0).
    metadatas.push({ page: (page.index ?? 0) + 1, block_type: block.type });
    ids.push(`block_${block.id}`);
  }
}

// An empty batch has nothing to embed or store, so skip the request instead
// of sending empty arrays to Chroma.
if (ids.length > 0) {
  await collection.add({ documents: texts, metadatas, ids });
} else {
  console.warn(
    "No text, heading, or table blocks found; nothing was added to Chroma."
  );
}