Batch Process PDFs in Python
Process a directory of PDFs with error handling:
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Report the length of the first page's text for every PDF in documents/.
source_dir = Path("documents/")
for pdf_path in source_dir.glob("*.pdf"):
    try:
        first_page = PdfDocument(str(pdf_path)).extract_text(0)
    except PdfError as e:
        # A file that fails is reported and the loop moves on to the next one.
        print(f"Failed: {pdf_path.name}: {e}")
    else:
        print(f"{pdf_path.name}: {len(first_page)} chars")
WASM
import { WasmPdfDocument } from "pdf-oxide-wasm";

// Process multiple PDF buffers
for (const { name, bytes } of pdfFiles) {
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const text = doc.extractText(0);
    console.log(`${name}: ${text.length} chars`);
  } catch (e) {
    console.error(`Failed: ${name}: ${e.message}`);
  } finally {
    // free() runs on both the success and the error path, so a throwing
    // extractText() cannot leave the document unreleased.
    if (doc) doc.free();
  }
}
Rust
use pdf_oxide::PdfDocument;
use std::path::Path;

// Print the first-page text length of every `.pdf` in documents/.
// Directory I/O errors propagate with `?`; per-file failures are printed and
// the loop moves on to the next file.
for entry in std::fs::read_dir("documents/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |e| e == "pdf") {
        continue;
    }
    // `to_str()` is `None` for non-UTF-8 names; report those instead of panicking.
    let path_str = match path.to_str() {
        Some(s) => s,
        None => {
            println!("Failed: {}: path is not valid UTF-8", path.display());
            continue;
        }
    };
    match PdfDocument::open(path_str) {
        Ok(mut doc) => match doc.extract_text(0) {
            Ok(text) => println!("{}: {} chars", path.display(), text.len()),
            // Handled here rather than with `?` so one failing file does not end the batch.
            Err(e) => println!("Failed: {}: {}", path.display(), e),
        },
        Err(e) => println!("Failed: {}: {}", path.display(), e),
    }
}
Go
package main
import (
"fmt"
"log"
"path/filepath"
pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)
// main prints the first-page text length of every PDF in documents/.
// Files that fail to open or to extract are logged and skipped.
func main() {
	matches, err := filepath.Glob("documents/*.pdf")
	if err != nil {
		log.Fatal(err)
	}
	for _, p := range matches {
		doc, err := pdfoxide.Open(p)
		if err != nil {
			log.Printf("Failed: %s: %v", p, err)
			continue
		}
		text, err := doc.ExtractText(0)
		// Close before inspecting err so the document is released on both paths.
		doc.Close()
		if err != nil {
			log.Printf("Failed: %s: %v", p, err)
			continue
		}
		fmt.Printf("%s: %d chars\n", filepath.Base(p), len(text))
	}
}
C#
using PdfOxide;

// Report the first-page text length for each PDF under documents/.
var pdfFiles = Directory.GetFiles("documents/", "*.pdf");
foreach (var pdfFile in pdfFiles)
{
    try
    {
        // The using block disposes the document as soon as the line is printed.
        using (var doc = PdfDocument.Open(pdfFile))
        {
            var firstPage = doc.ExtractText(0);
            Console.WriteLine($"{Path.GetFileName(pdfFile)}: {firstPage.Length} chars");
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine($"Failed: {pdfFile}: {e.Message}");
    }
}
At 0.8ms per page, PDF Oxide processes 3,830 PDFs in 3.1 seconds.
Installation
pip install pdf_oxide
Sequential Processing
Extract Text from All PDFs
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

pdf_dir = Path("invoices/")
results = {}

# Files are visited in sorted order. Each successful file maps its name to the
# text of all pages joined with newlines; failed files are reported and left out.
for pdf_path in sorted(pdf_dir.glob("*.pdf")):
    try:
        doc = PdfDocument(str(pdf_path))
        page_texts = [doc.extract_text(n) for n in range(doc.page_count())]
    except PdfError as e:
        print(f"Error: {pdf_path.name}: {e}")
        continue
    results[pdf_path.name] = "\n".join(page_texts)

print(f"Processed {len(results)} PDFs")
WASM
// Maps each file name to the full text of its document.
const results = new Map();
for (const { name, bytes } of pdfFiles) {
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const text = doc.extractAllText();
    results.set(name, text);
  } catch (e) {
    console.error(`Error: ${name}: ${e.message}`);
  } finally {
    // Release the document whether or not extraction succeeded.
    if (doc) doc.free();
  }
}
console.log(`Processed ${results.size} PDFs`);
Rust
use std::collections::HashMap;

// Maps each PDF path under invoices/ to the text of all its pages joined with
// newlines. A file is recorded only when every page extracts; any failure is
// reported on stderr and the file is left out of `results`.
let mut results: HashMap<String, String> = HashMap::new();
for entry in std::fs::read_dir("invoices/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |e| e == "pdf") {
        continue;
    }
    // `to_str()` is `None` for non-UTF-8 names; report those instead of panicking.
    let path_str = match path.to_str() {
        Some(s) => s,
        None => {
            eprintln!("Error: {}: path is not valid UTF-8", path.display());
            continue;
        }
    };
    let mut doc = match PdfDocument::open(path_str) {
        Ok(doc) => doc,
        Err(e) => {
            eprintln!("Error: {}: {}", path.display(), e);
            continue;
        }
    };
    let page_count = match doc.page_count() {
        Ok(n) => n,
        Err(e) => {
            eprintln!("Error: {}: {}", path.display(), e);
            continue;
        }
    };
    // Collecting into `Result` stops at the first page that fails.
    let pages: Result<Vec<_>, _> = (0..page_count).map(|i| doc.extract_text(i)).collect();
    match pages {
        Ok(pages) => {
            results.insert(path.display().to_string(), pages.join("\n"));
        }
        Err(e) => eprintln!("Error: {}: {}", path.display(), e),
    }
}
println!("Processed {} PDFs", results.len());
Go
// results maps each file's base name to its full text. Files that fail to
// open or extract are logged and left out.
results := make(map[string]string)
matches, _ := filepath.Glob("invoices/*.pdf")
sort.Strings(matches)
for _, p := range matches {
	doc, err := pdfoxide.Open(p)
	if err != nil {
		log.Printf("Error: %s: %v", p, err)
		continue
	}
	full, err := doc.ExtractAllText()
	// Close before inspecting err so the document is released on both paths.
	doc.Close()
	if err != nil {
		log.Printf("Error: %s: %v", p, err)
		continue
	}
	results[filepath.Base(p)] = full
}
fmt.Printf("Processed %d PDFs\n", len(results))
C#
// Maps each file name to its text, with every page followed by a line break.
var results = new Dictionary<string, string>();
var sortedFiles = Directory.GetFiles("invoices/", "*.pdf").OrderBy(f => f);
foreach (var pdfFile in sortedFiles)
{
    try
    {
        using var doc = PdfDocument.Open(pdfFile);
        var fullText = new StringBuilder();
        var page = 0;
        while (page < doc.PageCount)
        {
            fullText.AppendLine(doc.ExtractText(page));
            page++;
        }
        results[Path.GetFileName(pdfFile)] = fullText.ToString();
    }
    catch (Exception e)
    {
        Console.Error.WriteLine($"Error: {pdfFile}: {e.Message}");
    }
}
Console.WriteLine($"Processed {results.Count} PDFs");
Convert All PDFs to Markdown
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

input_dir = Path("papers/")
output_dir = Path("markdown/")
output_dir.mkdir(exist_ok=True)

# Each papers/<name>.pdf is written to markdown/<name>.md.
# Files that raise PdfError are reported and skipped.
for pdf_path in input_dir.glob("*.pdf"):
    try:
        doc = PdfDocument(str(pdf_path))
        md = doc.to_markdown_all(detect_headings=True, include_images=False)
    except PdfError as e:
        print(f"Skipped {pdf_path.name}: {e}")
    else:
        target = output_dir / pdf_path.with_suffix(".md").name
        target.write_text(md, encoding="utf-8")
WASM
// Convert each PDF buffer to Markdown and log the size of the result.
for (const { name, bytes } of pdfFiles) {
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const md = doc.toMarkdownAll();
    console.log(`Converted ${name}: ${md.length} chars`);
  } catch (e) {
    console.error(`Skipped ${name}: ${e.message}`);
  } finally {
    // Release the document whether or not conversion succeeded.
    if (doc) doc.free();
  }
}
Rust
for entry in std::fs::read_dir("papers/")? {
let path = entry?.path();
if path.extension().map_or(false, |e| e == "pdf") {
if let Ok(mut doc) = PdfDocument::open(path.to_str().unwrap()) {
if let Ok(md) = doc.to_markdown_all(true) {
let out = path.with_extension("md");
std::fs::write(&out, &md)?;
}
}
}
}
Go
_ = os.MkdirAll("markdown/", 0755)
matches, _ := filepath.Glob("papers/*.pdf")
for _, p := range matches {
doc, err := pdfoxide.Open(p)
if err != nil { log.Printf("Skipped %s: %v", p, err); continue }
md, _ := doc.ToMarkdownAll()
doc.Close()
out := filepath.Join("markdown", strings.TrimSuffix(filepath.Base(p), ".pdf") + ".md")
_ = os.WriteFile(out, []byte(md), 0644)
}
C#
// Convert each papers/*.pdf into markdown/<name>.md.
Directory.CreateDirectory("markdown/");
var pdfFiles = Directory.GetFiles("papers/", "*.pdf");
foreach (var pdfFile in pdfFiles)
{
    try
    {
        using (var doc = PdfDocument.Open(pdfFile))
        {
            var markdown = doc.ToMarkdownAll();
            var baseName = Path.GetFileNameWithoutExtension(pdfFile);
            File.WriteAllText(Path.Combine("markdown", baseName + ".md"), markdown);
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine($"Skipped {pdfFile}: {e.Message}");
    }
}
Parallel Processing
Goroutines and Tasks. In Go, each
PdfDocument is goroutine-safe for independent reads — just spawn one goroutine per file. In C#, prefer Task.WhenAll with the *Async methods (see the Async guide). Examples below focus on Python; the same shape works for any language with a worker-pool primitive.
Using multiprocessing
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from multiprocessing import Pool


def process_pdf(pdf_path: str) -> dict:
    """Extract all text from one PDF inside a worker process.

    Returns a dict with "file", "text" and "pages" on success, or with "file"
    and "error" when PdfError is raised. The error is returned rather than
    raised so that one bad file does not make pool.map() fail.
    """
    try:
        doc = PdfDocument(pdf_path)
        page_count = doc.page_count()
        # Every page is followed by a newline.
        text = "".join(doc.extract_text(i) + "\n" for i in range(page_count))
        return {"file": pdf_path, "text": text, "pages": page_count}
    except PdfError as e:
        return {"file": pdf_path, "error": str(e)}


# Worker processes started with the "spawn" method re-import this module, so
# the pool must only be created when the file runs as the main script.
if __name__ == "__main__":
    pdf_files = [str(p) for p in Path("documents/").glob("*.pdf")]
    with Pool() as pool:
        results = pool.map(process_pdf, pdf_files)

    success = [r for r in results if "text" in r]
    errors = [r for r in results if "error" in r]
    print(f"Processed {len(success)}, failed {len(errors)}")
Using concurrent.futures
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed


def extract_text(pdf_path: str) -> tuple[str, str]:
    """Return (pdf_path, text), where text is every page followed by a newline.

    Nothing is caught here; an exception raised in the worker is re-raised by
    future.result() in the parent process.
    """
    doc = PdfDocument(pdf_path)
    text = "".join(doc.extract_text(i) + "\n" for i in range(doc.page_count()))
    return pdf_path, text


# Worker processes must be able to import this module without side effects,
# so the executor is only created when the file runs as the main script.
if __name__ == "__main__":
    pdf_files = list(Path("documents/").glob("*.pdf"))
    with ProcessPoolExecutor(max_workers=4) as executor:
        # Map each future back to its path so results can be labelled as they finish.
        futures = {executor.submit(extract_text, str(p)): p for p in pdf_files}
        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                _, text = future.result()
                print(f"{pdf_path.name}: {len(text)} chars")
            except Exception as e:
                print(f"Error: {pdf_path.name}: {e}")
Progress Tracking
Simple Counter
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

pdf_files = list(Path("documents/").glob("*.pdf"))
total = len(pdf_files)

# Print one "[n/total] name: status" line per file, counting from 1.
for idx, pdf_path in enumerate(pdf_files, start=1):
    label = f"[{idx}/{total}] {pdf_path.name}"
    try:
        text = PdfDocument(str(pdf_path)).extract_text(0)
    except PdfError as e:
        print(f"{label}: FAILED - {e}")
    else:
        print(f"{label}: OK")
With tqdm
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from tqdm import tqdm

pdf_files = list(Path("documents/").glob("*.pdf"))

# Extract every page of every file while tqdm draws a progress bar.
for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        doc = PdfDocument(str(pdf_path))
        for i in range(doc.page_count()):
            doc.extract_text(i)
    except PdfError as e:
        # tqdm.write() prints the message without breaking the progress bar.
        tqdm.write(f"Skipped {pdf_path.name}: {e}")
Error Handling for Corrupt Files
Build a robust pipeline that logs errors and continues:
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import json

pdf_dir = Path("mixed-quality/")
results = []
errors = []

# Walk the tree recursively. Every file ends up in exactly one of the lists:
# `results` with its page and character counts, or `errors` with a message.
for pdf_path in pdf_dir.glob("**/*.pdf"):
    file_name = str(pdf_path)
    try:
        doc = PdfDocument(file_name)
        page_count = doc.page_count()
        text_length = 0
        for i in range(page_count):
            text_length += len(doc.extract_text(i))
        results.append({"file": file_name, "pages": page_count, "chars": text_length})
    except PdfError as e:
        errors.append({"file": file_name, "error": str(e)})
    except Exception as e:
        # Anything other than PdfError is labelled as unexpected.
        errors.append({"file": file_name, "error": f"Unexpected: {e}"})

print(f"Success: {len(results)}, Errors: {len(errors)}")

# Save error report
if errors:
    with open("errors.json", "w") as report:
        report.write(json.dumps(errors, indent=2))
WASM
// Every file ends up in exactly one list: results (page and character
// counts) or errors (the failure message).
const results = [];
const errors = [];
for (const { name, bytes } of pdfFiles) {
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const pageCount = doc.pageCount();
    let textLength = 0;
    for (let i = 0; i < pageCount; i++) {
      textLength += doc.extractText(i).length;
    }
    results.push({ file: name, pages: pageCount, chars: textLength });
  } catch (e) {
    errors.push({ file: name, error: e.message });
  } finally {
    // Release the document whether or not extraction succeeded.
    if (doc) doc.free();
  }
}
console.log(`Success: ${results.length}, Errors: ${errors.length}`);
Rust
// Walk mixed-quality/ recursively. Every PDF ends up in exactly one list:
// `results` as (path, page count, total text length) or `errors` as
// (path, message). Directory I/O errors propagate with `?`.
let mut results = Vec::new();
let mut errors = Vec::new();
// Directories still to visit; subdirectories are pushed as they are found.
let mut pending = vec![std::path::PathBuf::from("mixed-quality/")];
while let Some(dir) = pending.pop() {
    for entry in std::fs::read_dir(&dir)? {
        let entry = entry?;
        let path = entry.path();
        // `file_type()` does not follow symlinks, so a symlinked directory is not descended into.
        if entry.file_type()?.is_dir() {
            pending.push(path);
            continue;
        }
        if !path.extension().map_or(false, |e| e == "pdf") {
            continue;
        }
        let name = path.display().to_string();
        // `to_str()` is `None` for non-UTF-8 names; record those instead of panicking.
        let path_str = match path.to_str() {
            Some(s) => s,
            None => {
                errors.push((name, "path is not valid UTF-8".to_string()));
                continue;
            }
        };
        let mut doc = match PdfDocument::open(path_str) {
            Ok(doc) => doc,
            Err(e) => {
                errors.push((name, e.to_string()));
                continue;
            }
        };
        let page_count = match doc.page_count() {
            Ok(n) => n,
            Err(e) => {
                errors.push((name, e.to_string()));
                continue;
            }
        };
        let mut text_length: usize = 0;
        let mut failure = None;
        for i in 0..page_count {
            match doc.extract_text(i) {
                Ok(text) => text_length += text.len(),
                Err(e) => {
                    // The first failing page marks the whole file as failed.
                    failure = Some(e.to_string());
                    break;
                }
            }
        }
        match failure {
            Some(message) => errors.push((name, message)),
            None => results.push((name, page_count, text_length)),
        }
    }
}
println!("Success: {}, Errors: {}", results.len(), errors.len());
Go
// Success records a file that was read completely.
type Success struct{ File string; Pages int; Chars int }

// Failure records a file that could not be opened or fully extracted.
type Failure struct{ File string; Error string }

var results []Success
var errors []Failure

// Walk the tree recursively. Every PDF ends up in exactly one of the slices.
_ = filepath.Walk("mixed-quality/", func(path string, info os.FileInfo, err error) error {
	// Entries that cannot be read, directories and non-PDF files are skipped.
	if err != nil || info.IsDir() || !strings.HasSuffix(path, ".pdf") {
		return nil
	}
	doc, err := pdfoxide.Open(path)
	if err != nil {
		errors = append(errors, Failure{path, err.Error()})
		return nil
	}
	// The deferred Close runs when this callback returns, once per file.
	defer doc.Close()
	n, err := doc.PageCount()
	if err != nil {
		errors = append(errors, Failure{path, err.Error()})
		return nil
	}
	chars := 0
	for i := 0; i < n; i++ {
		t, err := doc.ExtractText(i)
		if err != nil {
			// The first failing page marks the whole file as failed.
			errors = append(errors, Failure{path, err.Error()})
			return nil
		}
		chars += len(t)
	}
	results = append(results, Success{path, n, chars})
	return nil
})
fmt.Printf("Success: %d, Errors: %d\n", len(results), len(errors))
C#
// Every file ends up in exactly one list: results (page and character
// counts) or errors (the exception message).
var results = new List<(string File, int Pages, int Chars)>();
var errors = new List<(string File, string Error)>();
var allPdfs = Directory.EnumerateFiles("mixed-quality/", "*.pdf", SearchOption.AllDirectories);
foreach (var pdfFile in allPdfs)
{
    try
    {
        using var doc = PdfDocument.Open(pdfFile);
        var pageCount = doc.PageCount;
        var totalChars = Enumerable.Range(0, pageCount)
            .Select(page => doc.ExtractText(page).Length)
            .Sum();
        results.Add((pdfFile, pageCount, totalChars));
    }
    catch (Exception e)
    {
        errors.Add((pdfFile, e.Message));
    }
}
Console.WriteLine($"Success: {results.Count}, Errors: {errors.Count}");
Memory-Efficient Processing
For very large corpora, process one file at a time without accumulating results in memory:
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import csv

pdf_dir = Path("large-corpus/")

# Rows are written page by page instead of being collected in a list first.
# encoding="utf-8" is explicit because the platform default encoding may not
# be able to represent every character in the extracted text.
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["file", "page", "text"])
    for pdf_path in pdf_dir.glob("*.pdf"):
        try:
            doc = PdfDocument(str(pdf_path))
            for i in range(doc.page_count()):
                text = doc.extract_text(i)
                writer.writerow([pdf_path.name, i, text])
        except PdfError as e:
            # Rows already written for earlier pages of this file stay in the CSV.
            print(f"Skipped {pdf_path.name}: {e}")
Batch Merge
Merge all PDFs in a directory into one:
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

pdf_files = sorted(Path("reports/").glob("*.pdf"))

# The first file in sorted order is the base document and the rest are merged
# into it. Nothing is written when the directory has no PDFs.
if pdf_files:
    first, *rest = pdf_files
    doc = PdfDocument(str(first))
    for pdf_path in rest:
        try:
            doc.merge_from(str(pdf_path))
        except PdfError as e:
            print(f"Skipped {pdf_path.name}: {e}")
    doc.save("all-reports.pdf")
Rust
use pdf_oxide::editor::DocumentEditor;

// Collect the `.pdf` paths under reports/ (unreadable entries are dropped)
// and put them in sorted order.
let mut files: Vec<_> = std::fs::read_dir("reports/")?
    .filter_map(|e| e.ok())
    .map(|e| e.path())
    .filter(|p| p.extension().map_or(false, |ext| ext == "pdf"))
    .collect();
files.sort();

// The first path becomes the base document and the rest are merged into it.
// Note: `to_str().unwrap()` panics when a path is not valid UTF-8.
if let Some((first, rest)) = files.split_first() {
    let mut editor = DocumentEditor::open(first.to_str().unwrap())?;
    for path in rest {
        if let Err(e) = editor.merge_from(path.to_str().unwrap()) {
            println!("Skipped {}: {}", path.display(), e);
        }
    }
    editor.save("all-reports.pdf")?;
}
Go
files, _ := filepath.Glob("reports/*.pdf")
sort.Strings(files)
// Top-level Merge concatenates every file in one call
merged, err := pdfoxide.Merge(files)
if err != nil {
	log.Fatal(err)
}
// A failed write is fatal as well, so a missing output file is never silent.
if err := os.WriteFile("all-reports.pdf", merged, 0644); err != nil {
	log.Fatal(err)
}
Related Pages
- Extract Text from PDF — text extraction basics
- PDF for RAG Pipelines — RAG integration patterns
- PDF to Markdown — Markdown conversion
- Performance Benchmarks — benchmark results