PDFs im Stapel verarbeiten — Python, Rust, Go, C#
Ein Verzeichnis voller PDFs mit Fehlerbehandlung verarbeiten — sequenziell für den Einstieg, parallel wenn Sie Durchsatz brauchen:
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
# Report the character count of the first page of every PDF in documents/.
for pdf_file in Path("documents/").glob("*.pdf"):
    try:
        first_page = PdfDocument(str(pdf_file)).extract_text(0)
    except PdfError as exc:
        # A file that cannot be read is reported; the loop then continues.
        print(f"Failed: {pdf_file.name}: {exc}")
    else:
        print(f"{pdf_file.name}: {len(first_page)} chars")
WASM
import { WasmPdfDocument } from "pdf-oxide-wasm";
// Process multiple PDF buffers
for (const { name, bytes } of pdfFiles) {
  // Declared outside `try` so the `finally` clause can reach it.
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const text = doc.extractText(0);
    console.log(`${name}: ${text.length} chars`);
  } catch (e) {
    console.error(`Failed: ${name}: ${e.message}`);
  } finally {
    // Free the document on both paths; previously free() was skipped
    // whenever extractText() threw. `doc` is undefined if the
    // constructor itself failed.
    if (doc) {
      doc.free();
    }
  }
}
Rust
use pdf_oxide::PdfDocument;
use std::path::Path;
// Print the character count of page 0 for every `.pdf` entry in documents/.
for entry in std::fs::read_dir("documents/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        match PdfDocument::open(path.to_str().unwrap()) {
            Ok(mut doc) => match doc.extract_text(0) {
                Ok(text) => println!("{}: {} chars", path.display(), text.len()),
                // Report and continue: propagating with `?` here would
                // abort the whole batch because of a single bad file.
                Err(e) => println!("Failed: {}: {}", path.display(), e),
            },
            Err(e) => println!("Failed: {}: {}", path.display(), e),
        }
    }
}
Mit 0,8 ms pro Seite verarbeitet PDF Oxide 3.830 PDFs in 3,1 Sekunden.
Installation
pip install pdf_oxide
Sequenzielle Verarbeitung
Text aus allen PDFs extrahieren
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
# Collect the full text of every PDF in invoices/, keyed by file name.
pdf_dir = Path("invoices/")
results = {}
for pdf_file in sorted(pdf_dir.glob("*.pdf")):
    try:
        document = PdfDocument(str(pdf_file))
        page_texts = [
            document.extract_text(page_index)
            for page_index in range(document.page_count())
        ]
        results[pdf_file.name] = "\n".join(page_texts)
    except PdfError as exc:
        # Failed files are reported and left out of `results`.
        print(f"Error: {pdf_file.name}: {exc}")
print(f"Processed {len(results)} PDFs")
WASM
// Map of file name -> full document text.
const results = new Map();
for (const { name, bytes } of pdfFiles) {
  // Declared outside `try` so the `finally` clause can reach it.
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const text = doc.extractAllText();
    results.set(name, text);
  } catch (e) {
    console.error(`Error: ${name}: ${e.message}`);
  } finally {
    // Free the document even when extraction throws; previously free()
    // was only reached on the success path.
    if (doc) {
      doc.free();
    }
  }
}
console.log(`Processed ${results.size} PDFs`);
Rust
use std::collections::HashMap;
// Map of displayed path -> text of all pages joined by newlines.
let mut results: HashMap<String, String> = HashMap::new();
for entry in std::fs::read_dir("invoices/")? {
    let path = entry?.path();
    // Skip anything that does not carry a `.pdf` extension.
    if !path.extension().map_or(false, |ext| ext == "pdf") {
        continue;
    }
    // Documents that fail to open are skipped without a message.
    if let Ok(mut doc) = PdfDocument::open(path.to_str().unwrap()) {
        let page_total = doc.page_count().unwrap_or(0);
        // Pages whose extraction fails are dropped from the output.
        let pages: Vec<_> = (0..page_total)
            .filter_map(|i| doc.extract_text(i).ok())
            .collect();
        results.insert(path.display().to_string(), pages.join("\n"));
    }
}
println!("Processed {} PDFs", results.len());
Alle PDFs in Markdown konvertieren
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
# Convert every PDF in papers/ into a Markdown file of the same stem in markdown/.
input_dir = Path("papers/")
output_dir = Path("markdown/")
output_dir.mkdir(exist_ok=True)
for source in input_dir.glob("*.pdf"):
    target = output_dir / f"{source.stem}.md"
    try:
        markdown = PdfDocument(str(source)).to_markdown_all(
            detect_headings=True,
            include_images=False,
        )
        target.write_text(markdown, encoding="utf-8")
    except PdfError as exc:
        print(f"Skipped {source.name}: {exc}")
WASM
for (const { name, bytes } of pdfFiles) {
  // Declared outside `try` so the `finally` clause can reach it.
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const md = doc.toMarkdownAll();
    console.log(`Converted ${name}: ${md.length} chars`);
  } catch (e) {
    console.error(`Skipped ${name}: ${e.message}`);
  } finally {
    // Free the document even when conversion throws; previously free()
    // was only reached on the success path.
    if (doc) {
      doc.free();
    }
  }
}
Rust
// Write a `.md` file next to every `.pdf` entry in papers/.
for entry in std::fs::read_dir("papers/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |ext| ext == "pdf") {
        continue;
    }
    // Files that fail to open or convert are skipped without a message;
    // only a failed write propagates an error.
    if let Ok(mut doc) = PdfDocument::open(path.to_str().unwrap()) {
        match doc.to_markdown_all(true) {
            Ok(md) => std::fs::write(path.with_extension("md"), &md)?,
            Err(_) => {}
        }
    }
}
Parallele Verarbeitung
Mit multiprocessing
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from multiprocessing import Pool
def process_pdf(pdf_path: str) -> dict:
    """Extract the text of every page of one PDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        On success a dict with the keys "file", "text" (every page's text,
        each followed by a newline) and "pages". If a PdfError is raised,
        a dict with the keys "file" and "error" instead.
    """
    try:
        doc = PdfDocument(pdf_path)
        # Query the page count once and reuse it for the loop and the result.
        page_count = doc.page_count()
        # join() avoids rebuilding the string on every page.
        text = "".join(doc.extract_text(i) + "\n" for i in range(page_count))
        return {"file": pdf_path, "text": text, "pages": page_count}
    except PdfError as e:
        return {"file": pdf_path, "error": str(e)}
# The guard is required for multiprocessing: with the "spawn" start method
# every worker re-imports this module, and unguarded top-level code would
# try to create a new pool in each worker.
if __name__ == "__main__":
    pdf_files = [str(p) for p in Path("documents/").glob("*.pdf")]
    with Pool() as pool:
        results = pool.map(process_pdf, pdf_files)
    # process_pdf returns a "text" key on success and an "error" key on failure.
    success = [r for r in results if "text" in r]
    errors = [r for r in results if "error" in r]
    print(f"Processed {len(success)}, failed {len(errors)}")
Mit concurrent.futures
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
def extract_text(pdf_path: str) -> tuple[str, str]:
    """Extract the text of every page of one PDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(pdf_path, text)`` tuple; ``text`` holds every page's text,
        each followed by a newline.

    Errors are not caught here, so any exception raised while opening or
    extracting propagates to the caller.
    """
    doc = PdfDocument(pdf_path)
    # join() avoids rebuilding the string on every page.
    text = "".join(doc.extract_text(i) + "\n" for i in range(doc.page_count()))
    return pdf_path, text
# The guard is required for process pools: with the "spawn" start method
# every worker re-imports this module, and unguarded top-level code would
# try to create a new executor in each worker.
if __name__ == "__main__":
    pdf_files = list(Path("documents/").glob("*.pdf"))
    with ProcessPoolExecutor(max_workers=4) as executor:
        # Map each future back to its path so failures can be attributed.
        futures = {executor.submit(extract_text, str(p)): p for p in pdf_files}
        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                # result() re-raises any exception raised in the worker.
                _, text = future.result()
                print(f"{pdf_path.name}: {len(text)} chars")
            except Exception as e:
                print(f"Error: {pdf_path.name}: {e}")
Fortschrittsanzeige
Einfacher Zähler
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
# Print an "[index/total]" status line for each PDF in documents/.
pdf_files = list(Path("documents/").glob("*.pdf"))
total = len(pdf_files)
for position, pdf_file in enumerate(pdf_files, start=1):
    try:
        # Only success or failure is reported; the text itself is not used.
        PdfDocument(str(pdf_file)).extract_text(0)
    except PdfError as exc:
        print(f"[{position}/{total}] {pdf_file.name}: FAILED - {exc}")
    else:
        print(f"[{position}/{total}] {pdf_file.name}: OK")
Mit tqdm
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from tqdm import tqdm
# Run extraction over every page of every PDF behind a tqdm progress bar.
pdf_files = list(Path("documents/").glob("*.pdf"))
for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        doc = PdfDocument(str(pdf_path))
        for i in range(doc.page_count()):
            doc.extract_text(i)
    except PdfError as e:
        # Report the failure instead of silently dropping it; tqdm.write
        # prints the message without breaking the progress bar output.
        tqdm.write(f"Failed: {pdf_path.name}: {e}")
Fehlerbehandlung für beschädigte Dateien
Bauen Sie eine robuste Pipeline, die Fehler protokolliert und fortfährt:
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import json
# Walk mixed-quality/ recursively, collecting per-file statistics and errors.
pdf_dir = Path("mixed-quality/")
results = []
errors = []
for pdf_file in pdf_dir.rglob("*.pdf"):
    file_name = str(pdf_file)
    try:
        document = PdfDocument(file_name)
        page_count = document.page_count()
        char_total = 0
        for page_index in range(page_count):
            char_total += len(document.extract_text(page_index))
    except PdfError as exc:
        errors.append({"file": file_name, "error": str(exc)})
    except Exception as exc:
        # Anything other than PdfError is recorded too, marked as unexpected.
        errors.append({"file": file_name, "error": f"Unexpected: {exc}"})
    else:
        results.append(
            {"file": file_name, "pages": page_count, "chars": char_total}
        )
print(f"Success: {len(results)}, Errors: {len(errors)}")
# Save error report
if errors:
    with open("errors.json", "w") as report:
        json.dump(errors, report, indent=2)
WASM
// Per-file statistics and per-file error messages.
const results = [];
const errors = [];
for (const { name, bytes } of pdfFiles) {
  // Declared outside `try` so the `finally` clause can reach it.
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const pageCount = doc.pageCount();
    let textLength = 0;
    for (let i = 0; i < pageCount; i++) {
      textLength += doc.extractText(i).length;
    }
    results.push({ file: name, pages: pageCount, chars: textLength });
  } catch (e) {
    errors.push({ file: name, error: e.message });
  } finally {
    // Free the document even when a page fails; previously free() was
    // only reached on the success path.
    if (doc) {
      doc.free();
    }
  }
}
console.log(`Success: ${results.length}, Errors: ${errors.length}`);
Rust
// `results` holds (path, page count, character count) tuples;
// `errors` holds (path, message) tuples.
let mut results = Vec::new();
let mut errors = Vec::new();
for entry in std::fs::read_dir("mixed-quality/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |ext| ext == "pdf") {
        continue;
    }
    let label = path.display().to_string();
    match PdfDocument::open(path.to_str().unwrap()) {
        Err(e) => errors.push((label, e.to_string())),
        Ok(mut doc) => {
            let page_count = doc.page_count().unwrap_or(0);
            // Pages whose extraction fails contribute nothing to the total.
            let mut text_length: usize = 0;
            for i in 0..page_count {
                if let Ok(text) = doc.extract_text(i) {
                    text_length += text.len();
                }
            }
            results.push((label, page_count, text_length));
        }
    }
}
println!("Success: {}, Errors: {}", results.len(), errors.len());
Speichereffiziente Verarbeitung
Für sehr große Korpora verarbeiten Sie eine Datei nach der anderen, ohne Ergebnisse im Speicher anzusammeln:
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import csv
# Stream one CSV row per page so no extracted text is kept in memory.
pdf_dir = Path("large-corpus/")
# An explicit UTF-8 encoding keeps the output independent of the platform
# default, which may be unable to encode characters found in PDF text.
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["file", "page", "text"])
    for pdf_path in pdf_dir.glob("*.pdf"):
        try:
            doc = PdfDocument(str(pdf_path))
            for i in range(doc.page_count()):
                text = doc.extract_text(i)
                writer.writerow([pdf_path.name, i, text])
        except PdfError as e:
            # Report the failure instead of silently dropping the file;
            # rows already written for earlier pages stay in the CSV.
            print(f"Skipped {pdf_path.name}: {e}")
Stapel-Zusammenführung
Alle PDFs in einem Verzeichnis zu einer Datei zusammenführen:
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
# Merge every PDF in reports/ (in sorted order) into all-reports.pdf.
pdf_files = sorted(Path("reports/").glob("*.pdf"))
if pdf_files:
    first, *remaining = pdf_files
    # The first file is the base document; the others are merged into it.
    doc = PdfDocument(str(first))
    for extra in remaining:
        try:
            doc.merge_from(str(extra))
        except PdfError as exc:
            print(f"Skipped {extra.name}: {exc}")
    doc.save("all-reports.pdf")
Rust
use pdf_oxide::editor::DocumentEditor;
// Collect the `.pdf` paths in reports/ and order them by path.
let mut files: Vec<std::path::PathBuf> = std::fs::read_dir("reports/")?
    .filter_map(|e| e.ok())
    .map(|e| e.path())
    .filter(|p| p.extension().map_or(false, |ext| ext == "pdf"))
    .collect();
files.sort();
// The first file is the base document; the others are merged into it.
if let Some((first, rest)) = files.split_first() {
    let mut editor = DocumentEditor::open(first.to_str().unwrap())?;
    for extra in rest {
        if let Err(e) = editor.merge_from(extra.to_str().unwrap()) {
            println!("Skipped {}: {}", extra.display(), e);
        }
    }
    editor.save("all-reports.pdf")?;
}
Verwandte Seiten
- Text aus PDF extrahieren — Grundlagen der Textextraktion
- PDF für RAG-Pipelines — RAG-Integrationsmuster
- PDF in Markdown — Markdown-Konvertierung
- Leistungsbenchmarks — Benchmark-Ergebnisse