Skip to content

PDF Extraction for RAG Pipelines

Extract PDFs into structured Markdown for your RAG pipeline:

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("paper.pdf")
md = doc.to_markdown_all(detect_headings=True, include_images=False)
# Split into chunks, embed, and store in your vector database

WASM

import { WasmPdfDocument } from "pdf-oxide-wasm";

const doc = new WasmPdfDocument(bytes);
const md = doc.toMarkdownAll();
// Split into chunks, embed, and store in your vector database
doc.free();

Rust

use pdf_oxide::PdfDocument;

let mut doc = PdfDocument::open("paper.pdf")?;
let md = doc.to_markdown_all(true)?;
// Split into chunks, embed, and store in your vector database

Go

package main

import (
    "log"
    pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

// main opens paper.pdf, converts the whole document to Markdown, and aborts
// with a logged error if either step fails.
func main() {
    doc, err := pdfoxide.Open("paper.pdf")
    if err != nil { log.Fatal(err) }
    defer doc.Close()

    // Do not discard the extraction error: an unchecked failure would look
    // like an empty document and silently leave a hole in the index.
    md, err := doc.ToMarkdownAll()
    if err != nil { log.Fatal(err) }
    _ = md // Split into chunks, embed, and store in your vector database
}

C#

using PdfOxide;

using var doc = PdfDocument.Open("paper.pdf");
var md = doc.ToMarkdownAll();
// Split into chunks, embed, and store in your vector database

PDF Oxide processes 3,830 PDFs in 3.1 seconds at 0.8ms per page with a 100% pass rate. Zero missing documents in your index.

Why Extraction Quality Matters for RAG

Your retrieval system is only as good as your extraction:

  • Missing text = missing answers. A library with 98.4% pass rate (pypdf) silently drops 61 documents from a 3,830-file corpus. PDF Oxide passes 100%.
  • Lost structure = poor chunking. Plain text loses headings, tables, and formatting that enable semantic chunking. Markdown preserves them.
  • Slow extraction = pipeline bottleneck. At 12.1ms per page (pypdf) or 23.2ms (pdfplumber), processing 100K pages takes about 20 minutes or 39 minutes, respectively. At 0.8ms, it takes 80 seconds.

Installation

pip install pdf_oxide

Quick Start: PDF to Vector Database

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

def extract_documents(pdf_dir: str) -> list[dict]:
    """Turn each page of every ``*.pdf`` directly inside ``pdf_dir`` into a chunk.

    Args:
        pdf_dir: Directory to scan. Only files matching ``*.pdf`` at the top
            level are read; subdirectories are not searched.

    Returns:
        A list of dicts, one per non-blank page, with the keys ``"content"``
        (the page's Markdown), ``"source"`` (the PDF file name) and ``"page"``
        (the zero-based page index passed to ``to_markdown``).

    A ``PdfError`` raised while handling a file is reported on stdout and the
    rest of that file is skipped; pages collected before the error are kept.
    """
    chunks = []
    for path in Path(pdf_dir).glob("*.pdf"):
        try:
            pdf = PdfDocument(str(path))
            total_pages = pdf.page_count()
            for page_index in range(total_pages):
                markdown = pdf.to_markdown(
                    page_index, detect_headings=True, include_images=False
                )
                # Blank pages add nothing to retrieval, so leave them out.
                if not markdown.strip():
                    continue
                record = {
                    "content": markdown,
                    "source": path.name,
                    "page": page_index,
                }
                chunks.append(record)
        except PdfError as err:
            print(f"Skipped {path.name}: {err}")
    return chunks

docs = extract_documents("research-papers/")
print(f"Extracted {len(docs)} chunks from PDFs")
# Feed docs to your embedding model and vector store

Chunking Strategies

By Heading (Semantic Chunking)

Split Markdown output on headings for semantically meaningful chunks:

Python

import re
from pdf_oxide import PdfDocument

doc = PdfDocument("paper.pdf")
md = doc.to_markdown_all(detect_headings=True, include_images=False)

# Split on ## headings
chunks = re.split(r'\n(?=## )', md)
chunks = [c.strip() for c in chunks if c.strip()]

WASM

const doc = new WasmPdfDocument(bytes);
const md = doc.toMarkdownAll();

// Split on ## headings
const chunks = md.split(/\n(?=## )/).filter(c => c.trim());
doc.free();

Rust

let mut doc = PdfDocument::open("paper.pdf")?;
let md = doc.to_markdown_all(true)?;

// Cut the document in front of every "\n## " so each chunk begins with its
// own level-2 heading. `str::split` would consume the "## " marker, leaving
// chunks without the heading text that makes them semantically meaningful.
let mut chunks: Vec<&str> = Vec::new();
let mut start = 0;
for (newline_at, _) in md.match_indices("\n## ") {
    let chunk = md[start..newline_at].trim();
    if !chunk.is_empty() {
        chunks.push(chunk);
    }
    // Skip only the newline; the "## " stays with the chunk it introduces.
    start = newline_at + 1;
}
let tail = md[start..].trim();
if !tail.is_empty() {
    chunks.push(tail);
}

Go

doc, _ := pdfoxide.Open("paper.pdf")
defer doc.Close()

md, _ := doc.ToMarkdownAll()

// Split in front of every "\n## " so each chunk starts at a level-2 heading.
var chunks []string
for i, c := range strings.Split(md, "\n## ") {
    // strings.Split consumes the separator; put the "## " marker back on
    // every piece after the first so the chunk keeps its heading.
    if i > 0 { c = "## " + c }
    c = strings.TrimSpace(c)
    if c != "" { chunks = append(chunks, c) }
}

C#

using var doc = PdfDocument.Open("paper.pdf");
var md = doc.ToMarkdownAll();

// Split in front of every "\n## " so each chunk starts at a level-2 heading.
// string.Split consumes the separator, so the "## " marker is restored on
// every piece after the first to keep the heading with its chunk.
var chunks = md.Split("\n## ")
    .Select((c, i) => i == 0 ? c : "## " + c)
    .Select(c => c.Trim())
    .Where(c => c.Length > 0)
    .ToList();

By Page

One chunk per page — simple and preserves page-level context:

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("manual.pdf")
chunks = []
for i in range(doc.page_count()):
    md = doc.to_markdown(i, detect_headings=True, include_images=False)
    if md.strip():
        chunks.append({"content": md, "page": i})

WASM

const doc = new WasmPdfDocument(bytes);
const chunks = [];
for (let i = 0; i < doc.pageCount(); i++) {
    const md = doc.toMarkdown(i);
    if (md.trim()) {
        chunks.push({ content: md, page: i });
    }
}
doc.free();

Rust

let mut doc = PdfDocument::open("manual.pdf")?;
let mut chunks = Vec::new();
for i in 0..doc.page_count()? {
    let md = doc.to_markdown(i, true)?;
    if !md.trim().is_empty() {
        chunks.push((i, md));
    }
}

Go

doc, _ := pdfoxide.Open("manual.pdf")
defer doc.Close()

type Chunk struct{ Page int; Content string }
var chunks []Chunk

n, _ := doc.PageCount()
for i := 0; i < n; i++ {
    md, _ := doc.ToMarkdown(i)
    if strings.TrimSpace(md) != "" {
        chunks = append(chunks, Chunk{Page: i, Content: md})
    }
}

C#

using var doc = PdfDocument.Open("manual.pdf");
var chunks = Enumerable.Range(0, doc.PageCount)
    .Select(i => new { Page = i, Content = doc.ToMarkdown(i) })
    .Where(c => !string.IsNullOrWhiteSpace(c.Content))
    .ToList();

Fixed-Size with Overlap

Split long text into fixed-size chunks with overlap:

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("book.pdf")
full_text = doc.to_markdown_all(detect_headings=True, include_images=False)

chunk_size = 1000  # characters
overlap = 200
chunks = []

for start in range(0, len(full_text), chunk_size - overlap):
    chunk = full_text[start:start + chunk_size]
    if chunk.strip():
        chunks.append(chunk)

WASM

const doc = new WasmPdfDocument(bytes);
const fullText = doc.toMarkdownAll();

const chunkSize = 1000;
const overlap = 200;
const chunks = [];

for (let start = 0; start < fullText.length; start += chunkSize - overlap) {
    const chunk = fullText.slice(start, start + chunkSize);
    if (chunk.trim()) chunks.push(chunk);
}
doc.free();

Rust

let mut doc = PdfDocument::open("book.pdf")?;
let full_text = doc.to_markdown_all(true)?;

let chunk_size = 1000; // characters
let overlap = 200;
let mut chunks = Vec::new();

// Byte offset of every character plus the end of the string. Slicing a `str`
// at an arbitrary byte offset panics when the offset falls inside a
// multi-byte UTF-8 character, so chunk edges are chosen in characters and
// mapped back to byte offsets through this table.
let offsets: Vec<usize> = full_text
    .char_indices()
    .map(|(byte_idx, _)| byte_idx)
    .chain(std::iter::once(full_text.len()))
    .collect();
let char_count = offsets.len() - 1;

let mut start = 0;
while start < char_count {
    let end = (start + chunk_size).min(char_count);
    let chunk = &full_text[offsets[start]..offsets[end]];
    if !chunk.trim().is_empty() {
        chunks.push(chunk.to_string());
    }
    // Advance by less than a full chunk so consecutive chunks overlap.
    start += chunk_size - overlap;
}

Go

doc, _ := pdfoxide.Open("book.pdf")
defer doc.Close()

full, _ := doc.ToMarkdownAll()

// Chunk by runes rather than bytes: slicing a string at a byte offset can cut
// a multi-byte UTF-8 character in half and produce invalid text.
runes := []rune(full)

const chunkSize, overlap = 1000, 200 // measured in characters (runes)
var chunks []string
for start := 0; start < len(runes); start += chunkSize - overlap {
    end := start + chunkSize
    if end > len(runes) { end = len(runes) }
    chunk := string(runes[start:end])
    if strings.TrimSpace(chunk) != "" {
        chunks = append(chunks, chunk)
    }
}

C#

using var doc = PdfDocument.Open("book.pdf");
var full = doc.ToMarkdownAll();

const int chunkSize = 1000, overlap = 200;
var chunks = new List<string>();
for (int start = 0; start < full.Length; start += chunkSize - overlap)
{
    var end = Math.Min(start + chunkSize, full.Length);
    var chunk = full[start..end];
    if (!string.IsNullOrWhiteSpace(chunk))
        chunks.Add(chunk);
}

Batch Processing Thousands of PDFs

At 0.8ms per page, PDF Oxide can process large corpora quickly:

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

pdf_files = list(Path("corpus/").glob("**/*.pdf"))
print(f"Processing {len(pdf_files)} PDFs...")

all_chunks = []
errors = 0

for pdf_path in pdf_files:
    try:
        doc = PdfDocument(str(pdf_path))
        md = doc.to_markdown_all(
            detect_headings=True,
            include_images=False
        )
        if md.strip():
            all_chunks.append({
                "content": md,
                "source": str(pdf_path),
                "pages": doc.page_count(),
            })
    except PdfError:
        errors += 1

print(f"Extracted {len(all_chunks)} documents, {errors} errors")

Handling Scanned PDFs in Your Pipeline

Some PDFs in a corpus will be scanned images. Use OCR as a fallback:

from pdf_oxide import PdfDocument

doc = PdfDocument("mixed-corpus-file.pdf")
text = doc.extract_text(0)

if len(text.strip()) < 50:
    # Likely a scanned page — use OCR
    text = doc.extract_text_ocr(0)

See OCR guide for setup details.

Why Markdown Over Plain Text

| Feature | Plain text | Markdown |
|---|---|---|
| Heading hierarchy | Lost | Preserved (#, ##, ###) |
| Tables | Flattened | GFM table syntax |
| Bold/italic | Lost | **bold**, *italic* |
| Semantic chunking | Difficult | Split on headings |
| LLM comprehension | Lower | Higher (structured input) |

Markdown gives your LLM more context about document structure, leading to better retrieval and generation quality.

Performance at Scale

| Corpus Size | PDF Oxide | pypdf | pdfplumber |
|---|---|---|---|
| 1,000 pages | 0.8s | 12.1s | 23.2s |
| 10,000 pages | 8s | 121s | 232s |
| 100,000 pages | 80s | 1,210s | 2,320s |
| Pass rate | 100% | 98.4% | 98.8% |

With 100% pass rate, you never have to manually investigate why documents are missing from your index.