What is the fastest Python PDF library?

PDF Oxide is the fastest Python PDF library, with 0.8ms mean text extraction time — 5.8× faster than PyMuPDF (4.6ms) and 15× faster than pypdf (12.1ms). Benchmarked on 3,830 real-world PDFs with 100% pass rate.

Is PDF Oxide free for commercial use?

Yes. PDF Oxide is MIT licensed — free for all uses including commercial products, SaaS, and proprietary software. No license fees, no sales calls, no AGPL restrictions.

Can PDF Oxide handle scanned PDFs with OCR?

Yes. PDF Oxide includes built-in OCR via PaddleOCR and ONNX Runtime. No Tesseract installation needed — just pip install pdf_oxide and use extract_text_ocr(). Supports PP-OCRv3, v4, and v5 models.

Does PDF Oxide support XFA forms?

Yes. PDF Oxide is the only Python PDF library that can detect, analyze, and extract data from XFA forms (XML Forms Architecture). PyMuPDF, pypdf, pdfplumber, and pdfminer cannot read XFA form data.

How does PDF Oxide compare to PyMuPDF?

PDF Oxide is 5.8× faster than PyMuPDF (0.8ms vs 4.6ms mean), has a 100% pass rate vs 99.3%, and is MIT licensed vs PyMuPDF's AGPL-3.0. PDF Oxide also has built-in Markdown/HTML output and XFA form support that PyMuPDF lacks.

Can PDF Oxide convert PDF to Markdown?

Yes. PDF Oxide has built-in PDF to Markdown conversion with heading detection, table preservation, and list formatting — ideal for LLM and RAG pipelines. No separate package needed, unlike PyMuPDF, which requires pymupdf4llm (69× slower).

Aus HTML erstellen

Es stehen zwei Einstiegspunkte zur Verfügung:

Pdf::from_html(content) — grundlegendes strukturelles HTML (Überschriften, Absätze, Listen, Code, fett/kursiv), ohne Styling. In jedem Binding verfügbar.
Pdf::from_html_css(html, css, font_bytes) — die vollständige HTML+CSS-Pipeline in reinem Rust, eingeführt in v0.3.37. Selbst entwickelte CSS-Engine (Teilmenge der L3- + L4-Selektoren, Kaskade, calc() / var(), @page / @media print), Taffy-gestütztes block-/flex-/grid-Layout, Zeilenumbruch nach UAX #14, RTL-Shaping via rustybuzz, ::before / ::after, page-break-*, <a href> → Link-Annotation, <img> data-URI → /XObject, Mehrschrift-Kaskade. Keine MPL-Abhängigkeiten. In jedem Binding verfügbar.

Schnelles Beispiel

Python

from pdf_oxide import Pdf

pdf = Pdf.from_html("<h1>Hello</h1><p>World</p>")
pdf.save("out.pdf")

WASM

import { WasmPdf } from "pdf-oxide-wasm";
import { writeFileSync } from "fs";

const pdf = WasmPdf.fromHtml("<h1>Hello</h1><p>World</p>");
writeFileSync("out.pdf", pdf.toBytes());

Rust

use pdf_oxide::api::Pdf;

let pdf = Pdf::from_html("<h1>Hello</h1><p>World</p>")?;
pdf.save("out.pdf")?;

Go

package main

import (
    "log"
    pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

func main() {
    pdf, err := pdfoxide.FromHtml("<h1>Hello</h1><p>World</p>")
    if err != nil { log.Fatal(err) }
    defer pdf.Close()

    if err := pdf.Save("out.pdf"); err != nil { log.Fatal(err) }
}

C#

using PdfOxide;

using var pdf = Pdf.FromHtml("<h1>Hello</h1><p>World</p>");
pdf.Save("out.pdf");

Java

import fyi.oxide.pdf.Pdf;
import java.nio.file.Path;

try (Pdf pdf = Pdf.fromHtml("<h1>Hello</h1><p>World</p>")) {
    pdf.saveTo(Path.of("out.pdf"));
}

PHP

use PdfOxide\Pdf;

$pdf = Pdf::fromHtml('<h1>Hello</h1><p>World</p>');
file_put_contents('out.pdf', $pdf->save());

Ruby

require 'pdf_oxide'

PdfOxide::Pdf.from_html('<h1>Hello</h1><p>World</p>') { |pdf| pdf.save('out.pdf') }

C++

#include <pdf_oxide/pdf_oxide.hpp>

auto pdf = pdf_oxide::Pdf::from_html("<h1>Hello</h1><p>World</p>");
pdf.save("out.pdf");

Swift

import PdfOxide

let pdf = try Pdf.fromHtml("<h1>Hello</h1><p>World</p>")
try pdf.save("out.pdf")

Kotlin

import fyi.oxide.pdf.Pdf

Pdf.fromHtml("<h1>Hello</h1><p>World</p>").use { it.saveTo(java.nio.file.Path.of("out.pdf")) }

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

final pdf = Pdf.fromHtml('<h1>Hello</h1><p>World</p>');
pdf.save('out.pdf');

R

library(pdfoxide)

pdf <- pdf_from_html("<h1>Hello</h1><p>World</p>")
pdf_save(pdf, "out.pdf")

Julia

using PdfOxide

pdf = from_html("<h1>Hello</h1><p>World</p>")
save(pdf, "out.pdf")

Zig

const pdf_oxide = @import("pdf_oxide");

var pdf = try pdf_oxide.Pdf.fromHtml("<h1>Hello</h1><p>World</p>");
try pdf.save("out.pdf");

Scala

import fyi.oxide.pdf.Pdf
import scala.util.Using

Using.resource(Pdf.fromHtml("<h1>Hello</h1><p>World</p>"))(_.saveTo(java.nio.file.Path.of("out.pdf")))

Clojure

(require '[pdf-oxide.core :as pdf])

(let [p (pdf/from-html "<h1>Hello</h1><p>World</p>")]
  (.saveTo p (java.nio.file.Path/of "out.pdf" (into-array String []))))

Objective-C

#import "POXPdfOxide.h"
NSError *err = nil;

POXPdf *pdf = [POXPdf fromHtml:@"<h1>Hello</h1><p>World</p>" error:&err];
[pdf saveToPath:@"out.pdf" error:&err];

Elixir

{:ok, pdf} = PdfOxide.from_html("<h1>Hello</h1><p>World</p>")
PdfOxide.save(pdf, "out.pdf")

HTML + CSS-Pipeline (v0.3.37)

Pdf::from_html_css(html, css, font_bytes) nimmt HTML, ein CSS-Stylesheet und die Bytes einer TTF/OTF-Schriftart entgegen. Zurückgegeben wird ein paginiertes PDF. Im Round-Trip liefert extract_text byteidentische Ergebnisse, sodass sich die erzeugten PDFs in die bestehende Testinfrastruktur einbinden lassen.

Rust:

use pdf_oxide::api::Pdf;

let font = std::fs::read("DejaVuSans.ttf")?;
let pdf = Pdf::from_html_css(
    "<h1>Hello</h1><p>World</p>",
    "h1 { color: blue; font-size: 24pt } p { line-height: 1.5 }",
    font,
)?;
pdf.save("out.pdf")?;

Python:

from pdf_oxide import Pdf

with open("DejaVuSans.ttf", "rb") as f:
    font = f.read()

pdf = Pdf.from_html_css(
    "<h1>Hello</h1><p>World</p>",
    "h1 { color: blue; font-size: 24pt }",
    font,
)
pdf.save("out.pdf")

Node / TypeScript:

import { Pdf } from "pdf-oxide";
import { readFileSync } from "fs";

const font = readFileSync("DejaVuSans.ttf");
const pdf = Pdf.fromHtmlCss(
  "<h1>Hello</h1><p>World</p>",
  "h1 { color: blue; font-size: 24pt }",
  font,
);
pdf.save("out.pdf");

Go:

font, _ := os.ReadFile("DejaVuSans.ttf")
pdf, err := pdfoxide.FromHtmlCss(
    "<h1>Hello</h1><p>World</p>",
    "h1 { color: blue; font-size: 24pt }",
    font,
)
if err != nil { log.Fatal(err) }
defer pdf.Close()
_ = pdf.Save("out.pdf")

C#:

var font = File.ReadAllBytes("DejaVuSans.ttf");
using var pdf = Pdf.FromHtmlCss(
    "<h1>Hello</h1><p>World</p>",
    "h1 { color: blue; font-size: 24pt }",
    font);
pdf.Save("out.pdf");

C++:

#include <pdf_oxide/pdf_oxide.hpp>
#include <fstream>

std::ifstream in("DejaVuSans.ttf", std::ios::binary);
std::string font((std::istreambuf_iterator<char>(in)), {});
auto pdf = pdf_oxide::Pdf::from_html_css(
    "<h1>Hello</h1><p>World</p>",
    "h1 { color: blue; font-size: 24pt }",
    std::vector<uint8_t>(font.begin(), font.end()));
pdf.save("out.pdf");

Swift:

import PdfOxide
import Foundation

let font = [UInt8](try Data(contentsOf: URL(fileURLWithPath: "DejaVuSans.ttf")))
let pdf = try Pdf.fromHtmlCss(
    html: "<h1>Hello</h1><p>World</p>",
    css: "h1 { color: blue; font-size: 24pt }",
    fontBytes: font)
try pdf.save("out.pdf")

Dart:

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

final font = File('DejaVuSans.ttf').readAsBytesSync();
final pdf = Pdf.fromHtmlCss(
    '<h1>Hello</h1><p>World</p>',
    'h1 { color: blue; font-size: 24pt }',
    font);
pdf.save('out.pdf');

R:

library(pdfoxide)

font <- readBin("DejaVuSans.ttf", "raw", file.info("DejaVuSans.ttf")$size)
pdf <- pdf_from_html_css(
    "<h1>Hello</h1><p>World</p>",
    "h1 { color: blue; font-size: 24pt }",
    font)
pdf_save(pdf, "out.pdf")

Julia:

using PdfOxide

font = read("DejaVuSans.ttf")
pdf = from_html_css(
    "<h1>Hello</h1><p>World</p>",
    "h1 { color: blue; font-size: 24pt }",
    font)
save(pdf, "out.pdf")

Zig:

const pdf_oxide = @import("pdf_oxide");
const std = @import("std");

const font = try std.fs.cwd().readFileAlloc(std.heap.page_allocator, "DejaVuSans.ttf", 1 << 24);
var pdf = try pdf_oxide.Pdf.fromHtmlCss(
    "<h1>Hello</h1><p>World</p>",
    "h1 { color: blue; font-size: 24pt }",
    font);
try pdf.save("out.pdf");

Objective-C:

#import "POXPdfOxide.h"
NSError *err = nil;

NSData *font = [NSData dataWithContentsOfFile:@"DejaVuSans.ttf"];
POXPdf *pdf = [POXPdf fromHtml:@"<h1>Hello</h1><p>World</p>"
                          css:@"h1 { color: blue; font-size: 24pt }"
                    fontBytes:font
                        error:&err];
[pdf saveToPath:@"out.pdf" error:&err];

Elixir:

font = File.read!("DejaVuSans.ttf")
{:ok, pdf} = PdfOxide.from_html_css(
    "<h1>Hello</h1><p>World</p>",
    "h1 { color: blue; font-size: 24pt }",
    font)
PdfOxide.save(pdf, "out.pdf")

Mehrschrift-Kaskade

Verwenden Sie Pdf::from_html_css_with_fonts(html, css, fonts), wenn Ihr Dokument mehrere Schriftfamilien kombiniert. Die CSS-Eigenschaft font-family auf einem beliebigen Element wird gegen die registrierten Familien aufgelöst (ohne Beachtung der Groß-/Kleinschreibung, mit und ohne Anführungszeichen, auch mehrteilige Namen ohne Anführungszeichen). Unbekannte Familien fallen auf die zuerst registrierte Schriftart zurück.

Python:

from pdf_oxide import Pdf

fonts = [
    ("DejaVu Sans", open("DejaVuSans.ttf", "rb").read()),
    ("Noto Sans CJK", open("NotoSansCJKtc-Regular.otf", "rb").read()),
]

pdf = Pdf.from_html_css_with_fonts(
    '<h1 style="font-family: DejaVu Sans">English</h1>'
    '<p style="font-family: \'Noto Sans CJK\'">中文段落</p>',
    "h1 { font-size: 24pt }",
    fonts,
)
pdf.save("multilang.pdf")

CJK-Inhalte werden bei der Ausgabe automatisch als Subset eingebettet (v0.3.38 #385) — ein PDF mit 5 Zeichen aus einer rund 17 MB großen CJK-Schriftart bleibt typischerweise unter 100 KB.

Unterstützter CSS-Umfang

Selektoren — Teilmenge L3 + L4: :is / :where / :not / :has, strukturelle Pseudoklassen, Attribut-Matcher mit i-/s-Flags.
Kaskade — Sortierung nach Ursprung / Spezifität / Quellreihenfolge, Vererbung, Zusammenführung von Inline-Styles, benutzerdefinierte Eigenschaften (var() mit Zyklenerkennung).
Funktionen — calc(), min(), max(), clamp().
At-Regeln — @media print (stets wahr), (min/max-width), @page :first / :left / :right / :blank mit Margin-Boxen, @font-face, @import, @supports.
Typisierte Werte — Farbe (~150 benannte, Hex, rgb/rgba, hsl), Länge (alle Einheiten aus CSS Values L4), display, font-size / weight / style / family, margin-/padding-Kurzschreibweise, line-height.
Zähler — counter / counters, counter-reset / -increment / -set, römische / griechische / alphabetische Nummerierung.
Pseudoelemente — ::before / ::after mit Stringliteralen, attr(name), open-quote / close-quote.
Layout — block, flex, grid (alles über Taffy), Margin-Kollabieren, Mehrspaltigkeit (column-count / column-width / column-gap), Tabellen (auto- und fixed-Spaltenalgorithmen).
Inline — Zeilenumbruch nach UAX #14, text-align, white-space-Modi, harte Umbrüche, atomare Inline-Boxen.
Effekte — opacity, transform: translate*(), page-break-before: always, page-break-after: always.
HTML — HTML5-Tokenizer, Extraktion von <style> / <link rel="stylesheet"> / inline style="", Dekodierung von <img> data-URI (/XObject), <a href> → /Link-Annotation mit /URI, <ul> / <ol> Listenmarkierungen.

Nicht im Funktionsumfang

CSS-Filter, 3D-Transformationen, Animationen, SVG in HTML (jedes brauchbare Rust-SVG-Crate ist MPL), MathML, hyphens: auto, shape-outside, JavaScript-Ausführung, vollständige Matrix-transform (Skalierung / Rotation), Verläufe, box-shadow.

Lizenz

cargo deny check licenses läuft mit null transitiven MPL-Abhängigkeiten durch. Der CSS-Stack von Mozilla (cssparser, selectors, html5ever, lightningcss, stylo) steht durchweg unter MPL-2.0; v0.3.37 setzt die Äquivalente von Hand um, damit pdf_oxide vollständig unter MIT/Apache bleibt.

Unterstützte HTML-Elemente

Element	Beschreibung
`<h1>` bis `<h6>`	Überschriften (auf PDF-Überschriftengrößen abgebildet)
`<p>`	Absätze mit automatischem Abstand
`<b>`, `<strong>`	Fetter Text
`<i>`, `<em>`	Kursiver Text
`<ul>`, `<ol>`, `<li>`	Ungeordnete und geordnete Listen
`<pre>`, `<code>`	Vorformatierter Text und Inline-Code
`<blockquote>`	Blockzitate
`<br>`	Zeilenumbrüche
`<hr>`	Horizontale Linien

Vollständige API-Referenz

`Pdf::from_html(content)` (statische Methode)

Erstellt ein PDF aus HTML-Inhalt mit den Standardeinstellungen (Letter-Seite, 72pt-Ränder, 12pt Helvetica).

Rust:

use pdf_oxide::api::Pdf;

let html = r#"
<h1>Product Specification</h1>
<p>This document describes the <strong>technical requirements</strong>
for the new product line.</p>
<h2>Requirements</h2>
<ul>
    <li>Operating temperature: -20C to 60C</li>
    <li>Power consumption: &lt;5W</li>
    <li>Weight: &lt;200g</li>
</ul>
"#;

let pdf = Pdf::from_html(html)?;
pdf.save("spec.pdf")?;

JavaScript:

import { WasmPdf } from "pdf-oxide-wasm";
import { writeFileSync } from "fs";

const html = `
<h1>Product Specification</h1>
<p>This document describes the <strong>technical requirements</strong>
for the new product line.</p>
`;

const pdf = WasmPdf.fromHtml(html);
writeFileSync("spec.pdf", pdf.toBytes());

Python:

from pdf_oxide import Pdf

html = """
<h1>Product Specification</h1>
<p>This document describes the <strong>technical requirements</strong>
for the new product line.</p>
"""

pdf = Pdf.from_html(html)
pdf.save("spec.pdf")

Java:

import fyi.oxide.pdf.Pdf;
import java.nio.file.Path;

String html = "<h1>Product Specification</h1>"
            + "<p>This document describes the <strong>technical requirements</strong>.</p>";

try (Pdf pdf = Pdf.fromHtml(html)) {
    pdf.saveTo(Path.of("spec.pdf"));
}

PHP:

use PdfOxide\Pdf;

$html = '<h1>Product Specification</h1>'
      . '<p>This document describes the <strong>technical requirements</strong>.</p>';

$pdf = Pdf::fromHtml($html);
file_put_contents('spec.pdf', $pdf->save());

Ruby:

require 'pdf_oxide'

html = '<h1>Product Specification</h1>' \
       '<p>This document describes the <strong>technical requirements</strong>.</p>'

PdfOxide::Pdf.from_html(html) { |pdf| pdf.save('spec.pdf') }

C++:

#include <pdf_oxide/pdf_oxide.hpp>

std::string html =
    "<h1>Product Specification</h1>"
    "<p>This document describes the <strong>technical requirements</strong>.</p>";

auto pdf = pdf_oxide::Pdf::from_html(html);
pdf.save("spec.pdf");

Swift:

import PdfOxide

let html = """
<h1>Product Specification</h1>
<p>This document describes the <strong>technical requirements</strong>.</p>
"""

let pdf = try Pdf.fromHtml(html)
try pdf.save("spec.pdf")

Kotlin:

import fyi.oxide.pdf.Pdf

val html = """
    <h1>Product Specification</h1>
    <p>This document describes the <strong>technical requirements</strong>.</p>
""".trimIndent()

Pdf.fromHtml(html).use { it.saveTo(java.nio.file.Path.of("spec.pdf")) }

Dart:

import 'package:pdf_oxide/pdf_oxide.dart';

final html = '<h1>Product Specification</h1>'
    '<p>This document describes the <strong>technical requirements</strong>.</p>';

final pdf = Pdf.fromHtml(html);
pdf.save('spec.pdf');

R:

library(pdfoxide)

html <- paste0(
    "<h1>Product Specification</h1>",
    "<p>This document describes the <strong>technical requirements</strong>.</p>")

pdf <- pdf_from_html(html)
pdf_save(pdf, "spec.pdf")

Julia:

using PdfOxide

html = """
<h1>Product Specification</h1>
<p>This document describes the <strong>technical requirements</strong>.</p>
"""

pdf = from_html(html)
save(pdf, "spec.pdf")

Zig:

const pdf_oxide = @import("pdf_oxide");

const html =
    "<h1>Product Specification</h1>" ++
    "<p>This document describes the <strong>technical requirements</strong>.</p>";

var pdf = try pdf_oxide.Pdf.fromHtml(html);
try pdf.save("spec.pdf");

Scala:

import fyi.oxide.pdf.Pdf
import scala.util.Using

val html =
  "<h1>Product Specification</h1>" +
  "<p>This document describes the <strong>technical requirements</strong>.</p>"

Using.resource(Pdf.fromHtml(html))(_.saveTo(java.nio.file.Path.of("spec.pdf")))

Clojure:

(require '[pdf-oxide.core :as pdf])

(let [html (str "<h1>Product Specification</h1>"
                "<p>This document describes the <strong>technical requirements</strong>.</p>")
      p    (pdf/from-html html)]
  (.saveTo p (java.nio.file.Path/of "spec.pdf" (into-array String []))))

Objective-C:

#import "POXPdfOxide.h"
NSError *err = nil;

NSString *html = @"<h1>Product Specification</h1>"
                  "<p>This document describes the <strong>technical requirements</strong>.</p>";

POXPdf *pdf = [POXPdf fromHtml:html error:&err];
[pdf saveToPath:@"spec.pdf" error:&err];

Elixir:

html =
  "<h1>Product Specification</h1>" <>
  "<p>This document describes the <strong>technical requirements</strong>.</p>"

{:ok, pdf} = PdfOxide.from_html(html)
PdfOxide.save(pdf, "spec.pdf")

Python-Signatur:

Pdf.from_html(
    content: str,
    title: str | None = None,
    author: str | None = None
) -> Pdf

`PdfBuilder::new().from_html(content)` (Builder-Muster)

Verwenden Sie PdfBuilder, um Seitengröße, Ränder, Schriftgröße und Dokumentmetadaten zu steuern.

Rust:

use pdf_oxide::api::PdfBuilder;
use pdf_oxide::writer::PageSize;

let pdf = PdfBuilder::new()
    .title("Technical Specification")
    .author("Engineering")
    .page_size(PageSize::A4)
    .margin(54.0)
    .font_size(11.0)
    .from_html("<h1>Spec</h1><p>Version 2.0</p>")?;

pdf.save("spec_a4.pdf")?;

Fortgeschrittene Beispiele

Strukturierter Bericht

Rust:

use pdf_oxide::api::Pdf;

let html = r#"
<h1>Incident Report</h1>
<h2>Summary</h2>
<p>On <em>2025-11-15</em>, a service disruption was detected in the
<strong>payment processing</strong> pipeline.</p>

<h2>Timeline</h2>
<ol>
    <li>14:32 UTC - Alert triggered for elevated error rates</li>
    <li>14:35 UTC - On-call engineer acknowledged</li>
    <li>14:48 UTC - Root cause identified: database connection pool exhaustion</li>
    <li>15:02 UTC - Fix deployed, services recovering</li>
    <li>15:15 UTC - Full recovery confirmed</li>
</ol>

<h2>Root Cause</h2>
<p>A configuration change deployed at 14:00 UTC reduced the maximum
connection pool size from 100 to 10.</p>

<h2>Code Reference</h2>
<pre><code>max_connections: 10  # Should be 100
timeout_seconds: 30
</code></pre>

<h2>Action Items</h2>
<ul>
    <li>Add validation for connection pool configuration</li>
    <li>Implement canary deployment for config changes</li>
    <li>Add alerting for connection pool utilization</li>
</ul>
"#;

let pdf = Pdf::from_html(html)?;
pdf.save("incident_report.pdf")?;

Python mit dynamischem HTML

from pdf_oxide import Pdf

rows = [
    ("Widget A", "$12.99", 150),
    ("Widget B", "$24.50", 89),
    ("Widget C", "$7.25", 312),
]

html = "<h1>Inventory Report</h1>"
html += "<p>Generated on 2025-11-20</p>"
html += "<h2>Current Stock</h2><ul>"
for name, price, qty in rows:
    html += f"<li><strong>{name}</strong> - {price} ({qty} units)</li>"
html += "</ul>"

pdf = Pdf.from_html(html, title="Inventory Report")
pdf.save("inventory.pdf")

HTML aus einer Datei lesen

Python:

from pdf_oxide import Pdf

with open("report.html") as f:
    html = f.read()

pdf = Pdf.from_html(html, title="Report")
pdf.save("report.pdf")

JavaScript:

import { WasmPdf } from "pdf-oxide-wasm";
import { readFileSync, writeFileSync } from "fs";

const html = readFileSync("report.html", "utf-8");
const pdf = WasmPdf.fromHtml(html);
writeFileSync("report.pdf", pdf.toBytes());

Rust:

use pdf_oxide::api::Pdf;

let html = std::fs::read_to_string("report.html")?;
let pdf = Pdf::from_html(&html)?;
pdf.save("report.pdf")?;

Aus HTML erstellen

Schnelles Beispiel

HTML + CSS-Pipeline (v0.3.37)

Mehrschrift-Kaskade

Unterstützter CSS-Umfang

Nicht im Funktionsumfang

Lizenz

Unterstützte HTML-Elemente

Vollständige API-Referenz

Pdf::from_html(content) (statische Methode)

PdfBuilder::new().from_html(content) (Builder-Muster)

Fortgeschrittene Beispiele

Strukturierter Bericht

Python mit dynamischem HTML

HTML aus einer Datei lesen

Verwandte Seiten

`Pdf::from_html(content)` (statische Methode)

`PdfBuilder::new().from_html(content)` (Builder-Muster)