PDF-Extraktion für RAG-Pipelines
PDFs als strukturiertes Markdown für Ihre RAG-Pipeline extrahieren:
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("paper.pdf")
md = doc.to_markdown_all(detect_headings=True, include_images=False)
# In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
WASM
import { WasmPdfDocument } from "pdf-oxide-wasm";
const doc = new WasmPdfDocument(bytes);
const md = doc.toMarkdownAll();
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
doc.free();
Rust
use pdf_oxide::PdfDocument;
let mut doc = PdfDocument::open("paper.pdf")?;
let md = doc.to_markdown_all(true)?;
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
Go
package main
import (
"log"
pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)
func main() {
doc, err := pdfoxide.Open("paper.pdf")
if err != nil { log.Fatal(err) }
defer doc.Close()
md, _ := doc.ToMarkdownAll()
_ = md // In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
}
C#
using PdfOxide;
using var doc = PdfDocument.Open("paper.pdf");
var md = doc.ToMarkdownAll();
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
Java
try (PdfDocument doc = PdfDocument.open(Path.of("paper.pdf"))) {
String md = doc.toMarkdown();
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
}
PHP
$doc = PdfDocument::open('paper.pdf');
$md = $doc->toMarkdownAll();
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
$doc->close();
Ruby
PdfOxide::PdfDocument.open('paper.pdf') do |doc|
md = doc.to_markdown
# In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
end
C++
auto doc = pdf_oxide::Document::open("paper.pdf");
auto md = doc.to_markdown_all();
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
Swift
let doc = try Document.open("paper.pdf")
let md = try doc.toMarkdownAll()
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
Kotlin
PdfDocument.open(java.nio.file.Path.of("paper.pdf")).use { doc ->
val md = doc.toMarkdown()
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
}
Dart
final doc = PdfDocument.open('paper.pdf');
final md = doc.toMarkdownAll();
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
R
doc <- pdf_open("paper.pdf")
md <- pdf_to_markdown_all(doc)
# In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
Julia
doc = open_document("paper.pdf")
md = to_markdown_all(doc)
# In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
Zig
var doc = try pdf_oxide.Document.open("paper.pdf");
const md = try doc.toMarkdownAll(a);
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
Scala
Using.resource(PdfDocument.open("paper.pdf")) { doc =>
val md = doc.toMarkdown()
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
}
Clojure
(with-open [doc (pdf/open "paper.pdf")]
(let [md (pdf/to-markdown doc)]
;; In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
))
Objective-C
NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"paper.pdf" error:&err];
NSString *md = [doc toMarkdownAllWithError:&err];
// In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
Elixir
{:ok, doc} = PdfOxide.open("paper.pdf")
{:ok, md} = PdfOxide.to_markdown_all(doc)
# In Chunks aufteilen, Embeddings erstellen und in der Vektordatenbank speichern
PDF Oxide verarbeitet 3.830 PDFs in 3,1 Sekunden bei 0,8 ms pro Seite mit einer Trefferquote von 100 %. Kein einziges Dokument fehlt in Ihrem Index.
Warum Extraktionsqualität für RAG entscheidend ist
Ihr Retrieval-System ist nur so gut wie die Extraktion davor:
- Fehlender Text = fehlende Antworten. Eine Bibliothek mit 98,4 % Trefferquote (pypdf) verliert stillschweigend 61 Dokumente aus einem Korpus mit 3.823 Dateien. PDF Oxide erreicht 100 %.
- Verlorene Struktur = schlechtes Chunking. Einfacher Text verliert Überschriften, Tabellen und Formatierungen, die für semantisches Chunking unerlässlich sind. Markdown bewahrt sie.
- Langsame Extraktion = Pipeline-Engpass. Bei 12,1 ms pro Seite (pypdf) oder 23,2 ms (pdfplumber) dauert die Verarbeitung von 100.000 Seiten rund 20 bzw. knapp 39 Minuten. Bei 0,8 ms sind es 80 Sekunden.
Installation
pip install pdf_oxide
Schnellstart: Von PDF zur Vektordatenbank
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
def extract_documents(pdf_dir: str) -> list[dict]:
    """Extract every PDF in a directory into per-page Markdown chunks.

    Args:
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.

    Returns:
        One dict per non-blank page with the keys ``content`` (Markdown
        text), ``source`` (file name) and ``page`` (zero-based page index).
        A file that raises ``PdfError`` is reported on stdout and skipped;
        pages collected from it before the error are kept.
    """
    documents = []
    for pdf_path in Path(pdf_dir).glob("*.pdf"):
        try:
            doc = PdfDocument(str(pdf_path))
            for i in range(doc.page_count()):
                md = doc.to_markdown(
                    i,
                    detect_headings=True,
                    include_images=False,
                )
                # Blank pages would only add empty chunks to the index.
                if md.strip():
                    documents.append({
                        "content": md,
                        "source": pdf_path.name,
                        "page": i,
                    })
        except PdfError as e:
            print(f"{pdf_path.name} übersprungen: {e}")
    return documents
docs = extract_documents("research-papers/")
print(f"{len(docs)} Chunks aus PDFs extrahiert")
# docs an das Embedding-Modell und den Vector Store übergeben
Chunking-Strategien
Nach Überschrift (Semantisches Chunking)
Markdown-Ausgabe an Überschriften aufteilen für semantisch bedeutsame Chunks:
Python
import re
from pdf_oxide import PdfDocument
doc = PdfDocument("paper.pdf")
md = doc.to_markdown_all(detect_headings=True, include_images=False)
# An ## Überschriften aufteilen
chunks = re.split(r'\n(?=## )', md)
chunks = [c.strip() for c in chunks if c.strip()]
WASM
const doc = new WasmPdfDocument(bytes);
const md = doc.toMarkdownAll();
// Split on ## headings
const chunks = md.split(/\n(?=## )/).filter(c => c.trim());
doc.free();
Rust
let mut doc = PdfDocument::open("paper.pdf")?;
let md = doc.to_markdown_all(true)?;
let chunks: Vec<&str> = md.split("\n## ")
.map(|c| c.trim())
.filter(|c| !c.is_empty())
.collect();
Go
doc, _ := pdfoxide.Open("paper.pdf")
defer doc.Close()
md, _ := doc.ToMarkdownAll()
var chunks []string
for _, c := range strings.Split(md, "\n## ") {
c = strings.TrimSpace(c)
if c != "" { chunks = append(chunks, c) }
}
C#
using var doc = PdfDocument.Open("paper.pdf");
var md = doc.ToMarkdownAll();
var chunks = md.Split("\n## ")
.Select(c => c.Trim())
.Where(c => c.Length > 0)
.ToList();
Java
try (PdfDocument doc = PdfDocument.open(Path.of("paper.pdf"))) {
String md = doc.toMarkdown();
List<String> chunks = Arrays.stream(md.split("\n## "))
.map(String::trim)
.filter(c -> !c.isEmpty())
.collect(Collectors.toList());
}
PHP
$doc = PdfDocument::open('paper.pdf');
$md = $doc->toMarkdownAll();
$chunks = array_values(array_filter(
array_map('trim', explode("\n## ", $md)),
fn($c) => $c !== ''
));
$doc->close();
Ruby
PdfOxide::PdfDocument.open('paper.pdf') do |doc|
md = doc.to_markdown
chunks = md.split("\n## ").map(&:strip).reject(&:empty?)
end
C++
auto doc = pdf_oxide::Document::open("paper.pdf");
auto md = doc.to_markdown_all();
std::vector<std::string> chunks;
size_t start = 0, pos;
std::string sep = "\n## ";
while ((pos = md.find(sep, start)) != std::string::npos) {
chunks.push_back(md.substr(start, pos - start));
start = pos + sep.size();
}
chunks.push_back(md.substr(start));
Swift
let doc = try Document.open("paper.pdf")
let md = try doc.toMarkdownAll()
let chunks = md.components(separatedBy: "\n## ")
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
.filter { !$0.isEmpty }
Kotlin
PdfDocument.open(java.nio.file.Path.of("paper.pdf")).use { doc ->
val md = doc.toMarkdown()
val chunks = md.split("\n## ").map { it.trim() }.filter { it.isNotEmpty() }
}
Dart
final doc = PdfDocument.open('paper.pdf');
final md = doc.toMarkdownAll();
final chunks = md.split('\n## ')
.map((c) => c.trim())
.where((c) => c.isNotEmpty)
.toList();
R
doc <- pdf_open("paper.pdf")
md <- pdf_to_markdown_all(doc)
chunks <- strsplit(md, "\n## ", fixed = TRUE)[[1]]
chunks <- trimws(chunks)
chunks <- chunks[nzchar(chunks)]
Julia
doc = open_document("paper.pdf")
md = to_markdown_all(doc)
chunks = filter(!isempty, strip.(split(md, "\n## ")))
Zig
var doc = try pdf_oxide.Document.open("paper.pdf");
const md = try doc.toMarkdownAll(a);
var chunks = std.ArrayList([]const u8).init(a);
var it = std.mem.splitSequence(u8, md, "\n## ");
while (it.next()) |part| {
const trimmed = std.mem.trim(u8, part, " \t\r\n");
if (trimmed.len > 0) try chunks.append(trimmed);
}
Scala
Using.resource(PdfDocument.open("paper.pdf")) { doc =>
val md = doc.toMarkdown()
val chunks = md.split("\n## ").map(_.trim).filter(_.nonEmpty)
}
Clojure
(with-open [doc (pdf/open "paper.pdf")]
(let [md (pdf/to-markdown doc)
chunks (->> (clojure.string/split md #"\n## ")
(map clojure.string/trim)
(remove clojure.string/blank?))]))
Objective-C
NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"paper.pdf" error:&err];
NSString *md = [doc toMarkdownAllWithError:&err];
NSMutableArray<NSString*> *chunks = [NSMutableArray array];
for (NSString *part in [md componentsSeparatedByString:@"\n## "]) {
NSString *t = [part stringByTrimmingCharactersInSet:
[NSCharacterSet whitespaceAndNewlineCharacterSet]];
if (t.length > 0) [chunks addObject:t];
}
Elixir
{:ok, doc} = PdfOxide.open("paper.pdf")
{:ok, md} = PdfOxide.to_markdown_all(doc)
chunks =
md
|> String.split("\n## ")
|> Enum.map(&String.trim/1)
|> Enum.reject(&(&1 == ""))
Nach Seite
Ein Chunk pro Seite — einfach und bewahrt den seitenweisen Kontext:
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("manual.pdf")
chunks = []
for i in range(doc.page_count()):
md = doc.to_markdown(i, detect_headings=True, include_images=False)
if md.strip():
chunks.append({"content": md, "page": i})
WASM
const doc = new WasmPdfDocument(bytes);
const chunks = [];
for (let i = 0; i < doc.pageCount(); i++) {
const md = doc.toMarkdown(i);
if (md.trim()) {
chunks.push({ content: md, page: i });
}
}
doc.free();
Rust
let mut doc = PdfDocument::open("manual.pdf")?;
let mut chunks = Vec::new();
for i in 0..doc.page_count()? {
let md = doc.to_markdown(i, true)?;
if !md.trim().is_empty() {
chunks.push((i, md));
}
}
Go
doc, _ := pdfoxide.Open("manual.pdf")
defer doc.Close()
type Chunk struct{ Page int; Content string }
var chunks []Chunk
n, _ := doc.PageCount()
for i := 0; i < n; i++ {
md, _ := doc.ToMarkdown(i)
if strings.TrimSpace(md) != "" {
chunks = append(chunks, Chunk{Page: i, Content: md})
}
}
C#
using var doc = PdfDocument.Open("manual.pdf");
var chunks = Enumerable.Range(0, doc.PageCount)
.Select(i => new { Page = i, Content = doc.ToMarkdown(i) })
.Where(c => !string.IsNullOrWhiteSpace(c.Content))
.ToList();
Java
try (PdfDocument doc = PdfDocument.open(Path.of("manual.pdf"))) {
List<String> chunks = new ArrayList<>();
for (int i = 0; i < doc.pageCount(); i++) {
String md = doc.toMarkdown(i);
if (!md.isBlank()) chunks.add(md);
}
}
PHP
$doc = PdfDocument::open('manual.pdf');
$chunks = [];
for ($i = 0; $i < $doc->pageCount(); $i++) {
$md = $doc->toMarkdown($i);
if (trim($md) !== '') { $chunks[] = ['content' => $md, 'page' => $i]; }
}
$doc->close();
Ruby
PdfOxide::PdfDocument.open('manual.pdf') do |doc|
chunks = []
(0...doc.page_count).each do |i|
md = doc.to_markdown(i)
chunks << { content: md, page: i } unless md.strip.empty?
end
end
C++
auto doc = pdf_oxide::Document::open("manual.pdf");
std::vector<std::string> chunks;
for (int i = 0; i < doc.page_count(); i++) {
auto md = doc.to_markdown(i);
if (!md.empty()) chunks.push_back(md);
}
Swift
let doc = try Document.open("manual.pdf")
var chunks: [(page: Int, content: String)] = []
for i in 0..<(try doc.pageCount()) {
let md = try doc.toMarkdown(i)
if !md.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
chunks.append((page: i, content: md))
}
}
Kotlin
PdfDocument.open(java.nio.file.Path.of("manual.pdf")).use { doc ->
val chunks = (0 until doc.pageCount())
.map { i -> i to doc.toMarkdown(i) }
.filter { it.second.isNotBlank() }
}
Dart
final doc = PdfDocument.open('manual.pdf');
final chunks = <Map<String, Object>>[];
for (var i = 0; i < doc.pageCount; i++) {
final md = doc.toMarkdown(i);
if (md.trim().isNotEmpty) chunks.add({'content': md, 'page': i});
}
R
doc <- pdf_open("manual.pdf")
chunks <- list()
# seq_len() is empty for a 0-page document, whereas 0:(n - 1) would
# iterate over 0 and -1 and request pages that do not exist.
for (i in seq_len(pdf_page_count(doc)) - 1) {
  md <- pdf_to_markdown(doc, i)
  if (nzchar(trimws(md))) {
    chunks[[length(chunks) + 1]] <- list(content = md, page = i)
  }
}
Julia
doc = open_document("manual.pdf")
chunks = []
for i in 0:(page_count(doc) - 1)
md = to_markdown(doc, i)
if !isempty(strip(md))
push!(chunks, (page = i, content = md))
end
end
Zig
var doc = try pdf_oxide.Document.open("manual.pdf");
var chunks = std.ArrayList([]const u8).init(a);
var i: usize = 0;
const n = try doc.pageCount();
while (i < n) : (i += 1) {
const md = try doc.toMarkdown(a, i);
if (std.mem.trim(u8, md, " \t\r\n").len > 0) try chunks.append(md);
}
Scala
Using.resource(PdfDocument.open("manual.pdf")) { doc =>
val chunks = (0 until doc.pageCount())
.map(i => i -> doc.toMarkdown(i))
.filter(_._2.trim.nonEmpty)
}
Clojure
(with-open [doc (pdf/open "manual.pdf")]
(let [chunks (for [i (range (pdf/page-count doc))
:let [md (pdf/to-markdown doc i)]
:when (not (clojure.string/blank? md))]
{:content md :page i})]))
Objective-C
NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"manual.pdf" error:&err];
NSMutableArray *chunks = [NSMutableArray array];
for (NSInteger i = 0; i < [doc pageCountError:&err]; i++) {
NSString *md = [doc toMarkdown:i error:&err];
if ([md stringByTrimmingCharactersInSet:
[NSCharacterSet whitespaceAndNewlineCharacterSet]].length > 0) {
[chunks addObject:@{@"content": md, @"page": @(i)}];
}
}
Elixir
{:ok, doc} = PdfOxide.open("manual.pdf")
{:ok, n} = PdfOxide.page_count(doc)
# The explicit step (//1) keeps the range empty when n == 0; a bare
# 0..-1 counts down through 0 and -1 and would request invalid pages.
chunks =
  0..(n - 1)//1
  |> Enum.map(fn i -> {:ok, md} = PdfOxide.to_markdown(doc, i); {i, md} end)
  |> Enum.reject(fn {_i, md} -> String.trim(md) == "" end)
  |> Enum.map(fn {i, md} -> %{content: md, page: i} end)
Feste Größe mit Überlappung
Langen Text in Chunks fester Größe mit Überlappung aufteilen:
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("book.pdf")
full_text = doc.to_markdown_all(detect_headings=True, include_images=False)
chunk_size = 1000 # Zeichen
overlap = 200
chunks = []
for start in range(0, len(full_text), chunk_size - overlap):
chunk = full_text[start:start + chunk_size]
if chunk.strip():
chunks.append(chunk)
WASM
const doc = new WasmPdfDocument(bytes);
const fullText = doc.toMarkdownAll();
const chunkSize = 1000;
const overlap = 200;
const chunks = [];
for (let start = 0; start < fullText.length; start += chunkSize - overlap) {
const chunk = fullText.slice(start, start + chunkSize);
if (chunk.trim()) chunks.push(chunk);
}
doc.free();
Rust
let mut doc = PdfDocument::open("book.pdf")?;
let full_text = doc.to_markdown_all(true)?;
let chunk_size = 1000; // characters
let overlap = 200;
// Byte offset of every character boundary plus the end of the text.
// Slicing a &str at an arbitrary byte offset panics when the offset
// falls inside a multi-byte UTF-8 character, so chunk on these instead.
let bounds: Vec<usize> = full_text
    .char_indices()
    .map(|(i, _)| i)
    .chain(std::iter::once(full_text.len()))
    .collect();
let char_count = bounds.len() - 1;
let mut chunks = Vec::new();
let mut start = 0;
while start < char_count {
    let end = (start + chunk_size).min(char_count);
    let chunk = &full_text[bounds[start]..bounds[end]];
    if !chunk.trim().is_empty() {
        chunks.push(chunk.to_string());
    }
    start += chunk_size - overlap;
}
Go
doc, _ := pdfoxide.Open("book.pdf")
defer doc.Close()
full, _ := doc.ToMarkdownAll()
const chunkSize, overlap = 1000, 200
// Chunk over runes rather than bytes: slicing the string directly can cut
// a multi-byte character in half and produce chunks with invalid UTF-8.
runes := []rune(full)
var chunks []string
for start := 0; start < len(runes); start += chunkSize - overlap {
	end := start + chunkSize
	if end > len(runes) {
		end = len(runes)
	}
	chunk := string(runes[start:end])
	if strings.TrimSpace(chunk) != "" {
		chunks = append(chunks, chunk)
	}
}
C#
using var doc = PdfDocument.Open("book.pdf");
var full = doc.ToMarkdownAll();
const int chunkSize = 1000, overlap = 200;
var chunks = new List<string>();
for (int start = 0; start < full.Length; start += chunkSize - overlap)
{
var end = Math.Min(start + chunkSize, full.Length);
var chunk = full[start..end];
if (!string.IsNullOrWhiteSpace(chunk))
chunks.Add(chunk);
}
Java
try (PdfDocument doc = PdfDocument.open(Path.of("book.pdf"))) {
String full = doc.toMarkdown();
int chunkSize = 1000, overlap = 200;
List<String> chunks = new ArrayList<>();
for (int start = 0; start < full.length(); start += chunkSize - overlap) {
int end = Math.min(start + chunkSize, full.length());
String chunk = full.substring(start, end);
if (!chunk.isBlank()) chunks.add(chunk);
}
}
PHP
$doc = PdfDocument::open('book.pdf');
$full = $doc->toMarkdownAll();
$chunkSize = 1000; $overlap = 200;
$chunks = [];
for ($start = 0; $start < strlen($full); $start += $chunkSize - $overlap) {
$chunk = substr($full, $start, $chunkSize);
if (trim($chunk) !== '') { $chunks[] = $chunk; }
}
$doc->close();
Ruby
PdfOxide::PdfDocument.open('book.pdf') do |doc|
full = doc.to_markdown
chunk_size = 1000; overlap = 200
chunks = []
start = 0
while start < full.length
chunk = full[start, chunk_size]
chunks << chunk unless chunk.strip.empty?
start += chunk_size - overlap
end
end
C++
auto doc = pdf_oxide::Document::open("book.pdf");
auto full = doc.to_markdown_all();
const size_t chunkSize = 1000, overlap = 200;
std::vector<std::string> chunks;
for (size_t start = 0; start < full.size(); start += chunkSize - overlap) {
auto chunk = full.substr(start, chunkSize);
if (!chunk.empty()) chunks.push_back(chunk);
}
Swift
let doc = try Document.open("book.pdf")
let full = Array(try doc.toMarkdownAll())
let chunkSize = 1000, overlap = 200
var chunks: [String] = []
var start = 0
while start < full.count {
let end = min(start + chunkSize, full.count)
let chunk = String(full[start..<end])
if !chunk.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
chunks.append(chunk)
}
start += chunkSize - overlap
}
Kotlin
PdfDocument.open(java.nio.file.Path.of("book.pdf")).use { doc ->
val full = doc.toMarkdown()
val chunkSize = 1000; val overlap = 200
val chunks = mutableListOf<String>()
var start = 0
while (start < full.length) {
val chunk = full.substring(start, minOf(start + chunkSize, full.length))
if (chunk.isNotBlank()) chunks.add(chunk)
start += chunkSize - overlap
}
}
Dart
final doc = PdfDocument.open('book.pdf');
final full = doc.toMarkdownAll();
const chunkSize = 1000, overlap = 200;
final chunks = <String>[];
for (var start = 0; start < full.length; start += chunkSize - overlap) {
final end = (start + chunkSize).clamp(0, full.length);
final chunk = full.substring(start, end);
if (chunk.trim().isNotEmpty) chunks.add(chunk);
}
R
doc <- pdf_open("book.pdf")
full <- pdf_to_markdown_all(doc)
chunk_size <- 1000; overlap <- 200
chunks <- character(0)
start <- 1
while (start <= nchar(full)) {
chunk <- substr(full, start, start + chunk_size - 1)
if (nzchar(trimws(chunk))) chunks <- c(chunks, chunk)
start <- start + chunk_size - overlap
}
Julia
doc = open_document("book.pdf")
full = to_markdown_all(doc)
chunk_size = 1000; overlap = 200
# Index by character: String indices are byte offsets, and full[a:b]
# throws StringIndexError when an offset lands inside a multi-byte character.
chars = collect(full)
chunks = String[]
start = 1
while start <= length(chars)
    stop = min(start + chunk_size - 1, length(chars))
    chunk = String(chars[start:stop])
    !isempty(strip(chunk)) && push!(chunks, chunk)
    global start += chunk_size - overlap
end
Zig
var doc = try pdf_oxide.Document.open("book.pdf");
const full = try doc.toMarkdownAll(a);
const chunk_size: usize = 1000;
const overlap: usize = 200;
var chunks = std.ArrayList([]const u8).init(a);
var start: usize = 0;
while (start < full.len) : (start += chunk_size - overlap) {
const end = @min(start + chunk_size, full.len);
const chunk = full[start..end];
if (std.mem.trim(u8, chunk, " \t\r\n").len > 0) try chunks.append(chunk);
}
Scala
Using.resource(PdfDocument.open("book.pdf")) { doc =>
val full = doc.toMarkdown()
val chunkSize = 1000; val overlap = 200
val chunks = (0 until full.length by (chunkSize - overlap)).flatMap { start =>
val chunk = full.substring(start, math.min(start + chunkSize, full.length))
Option.when(chunk.trim.nonEmpty)(chunk)
}
}
Clojure
(with-open [doc (pdf/open "book.pdf")]
(let [full (pdf/to-markdown doc)
chunk-size 1000 overlap 200
chunks (->> (range 0 (count full) (- chunk-size overlap))
(map #(subs full % (min (+ % chunk-size) (count full))))
(remove clojure.string/blank?))]))
Objective-C
NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"book.pdf" error:&err];
NSString *full = [doc toMarkdownAllWithError:&err];
NSUInteger chunkSize = 1000, overlap = 200;
NSMutableArray<NSString*> *chunks = [NSMutableArray array];
for (NSUInteger start = 0; start < full.length; start += chunkSize - overlap) {
NSUInteger len = MIN(chunkSize, full.length - start);
NSString *chunk = [full substringWithRange:NSMakeRange(start, len)];
if ([chunk stringByTrimmingCharactersInSet:
[NSCharacterSet whitespaceAndNewlineCharacterSet]].length > 0) {
[chunks addObject:chunk];
}
}
Elixir
{:ok, doc} = PdfOxide.open("book.pdf")
{:ok, full} = PdfOxide.to_markdown_all(doc)
chunk_size = 1000
overlap = 200
chunks =
Stream.iterate(0, &(&1 + chunk_size - overlap))
|> Stream.take_while(&(&1 < String.length(full)))
|> Enum.map(&String.slice(full, &1, chunk_size))
|> Enum.reject(&(String.trim(&1) == ""))
Tausende von PDFs stapelweise verarbeiten
Bei 0,8 ms pro Seite verarbeitet PDF Oxide große Korpora schnell:
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
pdf_files = list(Path("corpus/").glob("**/*.pdf"))
print(f"{len(pdf_files)} PDFs werden verarbeitet...")
all_chunks = []
errors = 0
for pdf_path in pdf_files:
try:
doc = PdfDocument(str(pdf_path))
md = doc.to_markdown_all(
detect_headings=True,
include_images=False
)
if md.strip():
all_chunks.append({
"content": md,
"source": str(pdf_path),
"pages": doc.page_count(),
})
except PdfError:
errors += 1
print(f"{len(all_chunks)} Dokumente extrahiert, {errors} Fehler")
Gescannte PDFs in der Pipeline verarbeiten
Einige PDFs in einem Korpus sind gescannte Bilder. OCR als Fallback verwenden:
from pdf_oxide import PdfDocument
doc = PdfDocument("mixed-corpus-file.pdf")
text = doc.extract_text(0)
if len(text.strip()) < 50:
# Wahrscheinlich eine gescannte Seite — OCR verwenden
text = doc.extract_text_ocr(0)
Details zur Einrichtung finden Sie im OCR-Leitfaden.
Warum Markdown statt Klartext
| Merkmal | Klartext | Markdown |
|---|---|---|
| Überschriftenhierarchie | Verloren | Erhalten (#, ##, ###) |
| Tabellen | Abgeflacht | GFM-Tabellensyntax |
| Fett/Kursiv | Verloren | **fett**, *kursiv* |
| Semantisches Chunking | Schwierig | Aufteilung nach Überschriften |
| LLM-Verständnis | Geringer | Höher (strukturierte Eingabe) |
Markdown liefert dem LLM mehr Kontext zur Dokumentstruktur und führt so zu besserer Retrieval- und Generierungsqualität.
Leistung bei großen Mengen
| Korpusgröße | PDF Oxide | pypdf | pdfplumber |
|---|---|---|---|
| 1.000 Seiten | 0,8 s | 12,1 s | 23,2 s |
| 10.000 Seiten | 8 s | 121 s | 232 s |
| 100.000 Seiten | 80 s | 1.210 s | 2.320 s |
| Trefferquote | 100 % | 98,4 % | 98,8 % |
Mit 100 % Trefferquote müssen Sie nie manuell nachforschen, warum Dokumente in Ihrem Index fehlen.
Verwandte Seiten
- PDF zu Markdown — Details zur Markdown-Konvertierung
- Stapelverarbeitung — Muster für parallele Verarbeitung
- OCR für gescannte PDFs — OCR-Einrichtung und -Nutzung
- Text aus PDF extrahieren — Klartextextraktion
- Leistungs-Benchmarks — vollständige Benchmark-Ergebnisse