Skip to content

PDFs im Stapel verarbeiten — Python, Rust, Go, C#

Ein Verzeichnis voller PDFs mit Fehlerbehandlung verarbeiten — sequenziell für den Einstieg, parallel wenn Sie Durchsatz brauchen:

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Report how much text the first page of every PDF in documents/ contains.
for pdf_path in Path("documents/").glob("*.pdf"):
    try:
        doc = PdfDocument(str(pdf_path))
        text = doc.extract_text(0)
    except PdfError as e:
        # A PDF that cannot be opened or read is reported; the loop goes on.
        print(f"Failed: {pdf_path.name}: {e}")
    else:
        print(f"{pdf_path.name}: {len(text)} chars")

WASM

import { WasmPdfDocument } from "pdf-oxide-wasm";

// Process multiple PDF buffers.
// `pdfFiles` is iterated as a sequence of { name, bytes } objects.
for (const { name, bytes } of pdfFiles) {
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const text = doc.extractText(0);
        console.log(`${name}: ${text.length} chars`);
    } catch (e) {
        console.error(`Failed: ${name}: ${e.message}`);
    } finally {
        // Release the document on the error path too: if extractText()
        // throws, a free() placed at the end of the try block never runs.
        if (doc) {
            doc.free();
        }
    }
}

Rust

use pdf_oxide::PdfDocument;
use std::path::Path;

// Print the first-page text length of every `.pdf` directly inside `documents/`.
// Failures of individual files are reported and the loop moves on; only errors
// from reading the directory itself are propagated with `?`.
for entry in std::fs::read_dir("documents/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        // `to_str()` returns `None` for non-UTF-8 file names; skip such a file
        // instead of panicking on `unwrap()`.
        let path_str = match path.to_str() {
            Some(s) => s,
            None => {
                println!("Failed: {}: path is not valid UTF-8", path.display());
                continue;
            }
        };
        match PdfDocument::open(path_str) {
            // Handle the extraction error per file rather than using `?`,
            // which would abort the whole batch on the first unreadable page.
            Ok(mut doc) => match doc.extract_text(0) {
                Ok(text) => println!("{}: {} chars", path.display(), text.len()),
                Err(e) => println!("Failed: {}: {}", path.display(), e),
            },
            Err(e) => println!("Failed: {}: {}", path.display(), e),
        }
    }
}

Mit 0,8 ms pro Seite verarbeitet PDF Oxide 3.830 PDFs in 3,1 Sekunden.

Installation

pip install pdf_oxide

Sequenzielle Verarbeitung

Text aus allen PDFs extrahieren

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

pdf_dir = Path("invoices/")
results = {}

# Visit the PDFs in sorted order and map each file name to the text of all
# its pages, joined with newlines.
for pdf_path in sorted(pdf_dir.glob("*.pdf")):
    try:
        doc = PdfDocument(str(pdf_path))
        pages = [doc.extract_text(i) for i in range(doc.page_count())]
        results[pdf_path.name] = "\n".join(pages)
    except PdfError as e:
        # A failing document gets no entry in `results`.
        print(f"Error: {pdf_path.name}: {e}")

print(f"Processed {len(results)} PDFs")

WASM

// Map each file name to the full text of its document.
const results = new Map();
for (const { name, bytes } of pdfFiles) {
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const text = doc.extractAllText();
        results.set(name, text);
    } catch (e) {
        console.error(`Error: ${name}: ${e.message}`);
    } finally {
        // Release the document even when extraction throws; a free() at the
        // end of the try block is skipped on the error path.
        if (doc) {
            doc.free();
        }
    }
}
console.log(`Processed ${results.size} PDFs`);

Rust

use std::collections::HashMap;

// Map each PDF path in `invoices/` to the text of all its pages, joined with
// newlines. A document only gets an entry when every page was extracted;
// failures are printed so that "Processed N" counts complete documents.
let mut results: HashMap<String, String> = HashMap::new();
for entry in std::fs::read_dir("invoices/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        // Non-UTF-8 file names make `to_str()` return `None`; report and skip
        // instead of panicking on `unwrap()`.
        let path_str = match path.to_str() {
            Some(s) => s,
            None => {
                println!("Error: {}: path is not valid UTF-8", path.display());
                continue;
            }
        };
        let mut doc = match PdfDocument::open(path_str) {
            Ok(doc) => doc,
            Err(e) => {
                println!("Error: {}: {}", path.display(), e);
                continue;
            }
        };
        let page_count = match doc.page_count() {
            Ok(n) => n,
            Err(e) => {
                println!("Error: {}: {}", path.display(), e);
                continue;
            }
        };
        // Collecting into `Result` stops at the first page that fails.
        let pages: Result<Vec<_>, _> = (0..page_count).map(|i| doc.extract_text(i)).collect();
        match pages {
            Ok(pages) => {
                results.insert(path.display().to_string(), pages.join("\n"));
            }
            Err(e) => println!("Error: {}: {}", path.display(), e),
        }
    }
}
println!("Processed {} PDFs", results.len());

Alle PDFs in Markdown konvertieren

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

input_dir = Path("papers/")
output_dir = Path("markdown/")
output_dir.mkdir(exist_ok=True)

# Write one Markdown file per PDF into output_dir, named after the PDF with
# its suffix replaced by ".md".
for pdf_path in input_dir.glob("*.pdf"):
    out_path = output_dir / pdf_path.with_suffix(".md").name
    try:
        doc = PdfDocument(str(pdf_path))
        md = doc.to_markdown_all(
            detect_headings=True,
            include_images=False,
        )
    except PdfError as e:
        # Conversion failed: report it and leave no output file for this PDF.
        print(f"Skipped {pdf_path.name}: {e}")
    else:
        out_path.write_text(md, encoding="utf-8")

WASM

// Convert every PDF buffer to Markdown and log the size of the result.
for (const { name, bytes } of pdfFiles) {
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const md = doc.toMarkdownAll();
        console.log(`Converted ${name}: ${md.length} chars`);
    } catch (e) {
        console.error(`Skipped ${name}: ${e.message}`);
    } finally {
        // Release the document even when conversion throws; a free() at the
        // end of the try block is skipped on the error path.
        if (doc) {
            doc.free();
        }
    }
}

Rust

// Convert every `.pdf` directly inside `papers/` to Markdown, written next to
// the PDF with the extension replaced by `.md`. Documents that cannot be
// opened or converted are reported and skipped; I/O errors from reading the
// directory or writing the output are propagated with `?`.
for entry in std::fs::read_dir("papers/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        // Non-UTF-8 file names make `to_str()` return `None`; report and skip
        // instead of panicking on `unwrap()`.
        let path_str = match path.to_str() {
            Some(s) => s,
            None => {
                println!("Skipped {}: path is not valid UTF-8", path.display());
                continue;
            }
        };
        let mut doc = match PdfDocument::open(path_str) {
            Ok(doc) => doc,
            Err(e) => {
                println!("Skipped {}: {}", path.display(), e);
                continue;
            }
        };
        match doc.to_markdown_all(true) {
            Ok(md) => std::fs::write(path.with_extension("md"), &md)?,
            Err(e) => println!("Skipped {}: {}", path.display(), e),
        }
    }
}

Parallele Verarbeitung

Mit multiprocessing

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from multiprocessing import Pool

def process_pdf(pdf_path: str) -> dict:
    """Extract the text of every page of one PDF.

    Runs inside a worker process, so it takes a plain string path and
    returns a plain dict.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        ``{"file", "text", "pages"}`` on success, where ``text`` holds each
        page's text followed by a newline, or ``{"file", "error"}`` when a
        ``PdfError`` was raised.
    """
    try:
        doc = PdfDocument(pdf_path)
        page_count = doc.page_count()
        # Build the text in one pass instead of repeated string concatenation.
        text = "".join(doc.extract_text(i) + "\n" for i in range(page_count))
        return {"file": pdf_path, "text": text, "pages": page_count}
    except PdfError as e:
        return {"file": pdf_path, "error": str(e)}


# The guard is required for multiprocessing: with the "spawn" start method
# each worker re-imports this module, and unguarded top-level code would try
# to create a new Pool inside every worker.
if __name__ == "__main__":
    pdf_files = [str(p) for p in Path("documents/").glob("*.pdf")]

    with Pool() as pool:
        results = pool.map(process_pdf, pdf_files)

    success = [r for r in results if "text" in r]
    errors = [r for r in results if "error" in r]
    print(f"Processed {len(success)}, failed {len(errors)}")

Mit concurrent.futures

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

def extract_text(pdf_path: str) -> tuple[str, str]:
    """Extract the text of every page of one PDF in a worker process.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        ``(pdf_path, text)``, where ``text`` holds each page's text followed
        by a newline.

    Exceptions are not caught here; they surface in the parent process when
    ``future.result()`` is called.
    """
    doc = PdfDocument(pdf_path)
    # Build the text in one pass instead of repeated string concatenation.
    text = "".join(doc.extract_text(i) + "\n" for i in range(doc.page_count()))
    return pdf_path, text


# The guard is required for process pools: with the "spawn" start method each
# worker re-imports this module, and unguarded top-level code would try to
# start a new executor inside every worker.
if __name__ == "__main__":
    pdf_files = list(Path("documents/").glob("*.pdf"))

    with ProcessPoolExecutor(max_workers=4) as executor:
        # Map each future back to its Path so results can be labelled.
        futures = {executor.submit(extract_text, str(p)): p for p in pdf_files}

        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                _, text = future.result()
                print(f"{pdf_path.name}: {len(text)} chars")
            except Exception as e:
                print(f"Error: {pdf_path.name}: {e}")

Fortschrittsanzeige

Einfacher Zähler

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

pdf_files = list(Path("documents/").glob("*.pdf"))
total = len(pdf_files)

# Print an "[index/total]" progress line for every file, numbered from 1.
for idx, pdf_path in enumerate(pdf_files, start=1):
    try:
        doc = PdfDocument(str(pdf_path))
        text = doc.extract_text(0)
    except PdfError as e:
        print(f"[{idx}/{total}] {pdf_path.name}: FAILED - {e}")
    else:
        print(f"[{idx}/{total}] {pdf_path.name}: OK")

Mit tqdm

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from tqdm import tqdm

pdf_files = list(Path("documents/").glob("*.pdf"))

# Extract every page of every PDF while tqdm renders a progress bar.
for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        doc = PdfDocument(str(pdf_path))
        for i in range(doc.page_count()):
            doc.extract_text(i)
    except PdfError as e:
        # Report the failure instead of swallowing it. tqdm.write() prints
        # the message without breaking the progress bar display.
        tqdm.write(f"Failed: {pdf_path.name}: {e}")

Fehlerbehandlung für beschädigte Dateien

Bauen Sie eine robuste Pipeline, die Fehler protokolliert und fortfährt:

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import json

pdf_dir = Path("mixed-quality/")
results = []
errors = []

# Walk the directory tree recursively; every PDF ends up in exactly one of
# `results` (page and character counts) or `errors` (failure message).
for pdf_path in pdf_dir.glob("**/*.pdf"):
    file_name = str(pdf_path)
    try:
        doc = PdfDocument(file_name)
        page_count = doc.page_count()
        text_length = 0
        for i in range(page_count):
            text_length += len(doc.extract_text(i))
        results.append(
            {"file": file_name, "pages": page_count, "chars": text_length}
        )
    except PdfError as e:
        errors.append({"file": file_name, "error": str(e)})
    except Exception as e:
        # Anything that is not a PdfError is recorded with an "Unexpected" tag.
        errors.append({"file": file_name, "error": f"Unexpected: {e}"})

print(f"Success: {len(results)}, Errors: {len(errors)}")

# Save error report (only written when at least one file failed)
if errors:
    with open("errors.json", "w") as f:
        json.dump(errors, f, indent=2)

WASM

// Collect per-file statistics in `results` and failure messages in `errors`.
const results = [];
const errors = [];

for (const { name, bytes } of pdfFiles) {
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const pageCount = doc.pageCount();
        let textLength = 0;
        for (let i = 0; i < pageCount; i++) {
            textLength += doc.extractText(i).length;
        }
        results.push({ file: name, pages: pageCount, chars: textLength });
    } catch (e) {
        errors.push({ file: name, error: e.message });
    } finally {
        // Release the document even when a page fails; a free() at the end
        // of the try block is skipped on the error path.
        if (doc) {
            doc.free();
        }
    }
}

console.log(`Success: ${results.length}, Errors: ${errors.length}`);

Rust

// `results` holds (path, page count, total text length) for documents that
// were read completely; `errors` holds (path, message) for everything else.
let mut results = Vec::new();
let mut errors = Vec::new();

// Note: `read_dir` lists only the direct children of `mixed-quality/`; it
// does not descend into subdirectories.
for entry in std::fs::read_dir("mixed-quality/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        let name = path.display().to_string();
        // Non-UTF-8 file names make `to_str()` return `None`; record an error
        // instead of panicking on `unwrap()`.
        let path_str = match path.to_str() {
            Some(s) => s,
            None => {
                errors.push((name, "path is not valid UTF-8".to_string()));
                continue;
            }
        };
        let mut doc = match PdfDocument::open(path_str) {
            Ok(doc) => doc,
            Err(e) => {
                errors.push((name, e.to_string()));
                continue;
            }
        };
        // A failing page count is an error, not a successful 0-page document.
        let page_count = match doc.page_count() {
            Ok(n) => n,
            Err(e) => {
                errors.push((name, e.to_string()));
                continue;
            }
        };
        // Summing into `Result` stops at the first page that fails, so a
        // partially readable document is reported rather than undercounted.
        let text_length: Result<usize, _> = (0..page_count)
            .map(|i| doc.extract_text(i).map(|t| t.len()))
            .sum();
        match text_length {
            Ok(len) => results.push((name, page_count, len)),
            Err(e) => errors.push((name, e.to_string())),
        }
    }
}

println!("Success: {}, Errors: {}", results.len(), errors.len());

Speichereffiziente Verarbeitung

Für sehr große Korpora verarbeiten Sie eine Datei nach der anderen, ohne Ergebnisse im Speicher anzusammeln:

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import csv

pdf_dir = Path("large-corpus/")

# Stream one CSV row per page so that no document text is kept in memory.
# encoding="utf-8" is set explicitly: without it open() uses the locale's
# encoding, which can fail on text that encoding cannot represent.
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["file", "page", "text"])

    for pdf_path in pdf_dir.glob("*.pdf"):
        try:
            doc = PdfDocument(str(pdf_path))
            for i in range(doc.page_count()):
                text = doc.extract_text(i)
                writer.writerow([pdf_path.name, i, text])
        except PdfError as e:
            # Report the failure instead of silently dropping the file. Rows
            # already written for earlier pages of this file stay in the CSV.
            print(f"Skipped {pdf_path.name}: {e}")

Stapel-Zusammenführung

Alle PDFs in einem Verzeichnis zu einer Datei zusammenführen:

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

pdf_files = sorted(Path("reports/").glob("*.pdf"))

# Open the first PDF in sorted order, merge each remaining file into it, and
# save the result. Nothing is written when the directory has no PDFs.
if pdf_files:
    first_pdf, *remaining = pdf_files
    doc = PdfDocument(str(first_pdf))
    for pdf_path in remaining:
        try:
            doc.merge_from(str(pdf_path))
        except PdfError as e:
            # A file that cannot be merged is reported and left out.
            print(f"Skipped {pdf_path.name}: {e}")
    doc.save("all-reports.pdf")

Rust

use pdf_oxide::editor::DocumentEditor;

// Collect the `.pdf` paths directly inside `reports/` as UTF-8 strings.
// Directory errors are propagated with `?`; a non-UTF-8 file name is reported
// and skipped instead of panicking on `to_str().unwrap()` later.
let mut files: Vec<String> = Vec::new();
for entry in std::fs::read_dir("reports/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |ext| ext == "pdf") {
        match path.to_str() {
            Some(s) => files.push(s.to_owned()),
            None => println!("Skipped {}: path is not valid UTF-8", path.display()),
        }
    }
}
// Sort the paths once, rather than rebuilding a `PathBuf` per comparison.
files.sort();

// Open the first file in sorted order and merge the remaining ones into it.
// Nothing is written when the directory has no PDFs.
if let Some((first, rest)) = files.split_first() {
    let mut editor = DocumentEditor::open(first.as_str())?;
    for file in rest {
        if let Err(e) = editor.merge_from(file.as_str()) {
            println!("Skipped {}: {}", file, e);
        }
    }
    editor.save("all-reports.pdf")?;
}

Verwandte Seiten