HTML Conversion
PDF Oxide converts PDF pages to structured HTML with heading detection, font styling, and optional CSS-based layout preservation. Use to_html() for a single page or to_html_all() to convert the entire document. When preserve_layout is enabled, elements are positioned with CSS absolute coordinates matching the original PDF layout. When disabled, the output is semantic HTML with natural flow.
Quick Example
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("report.pdf")
html = doc.to_html(0, detect_headings=True)
print(html)
Node.js
const { PdfDocument } = require("pdf-oxide");
const doc = new PdfDocument("report.pdf");
const html = doc.toHtml(0);
console.log(html);
doc.close();
Go
import pdfoxide "github.com/yfedoseev/pdf_oxide/go"
doc, _ := pdfoxide.Open("report.pdf")
defer doc.Close()
html, _ := doc.ToHtml(0)
fmt.Println(html)
C#
using PdfOxide.Core;
using var doc = PdfDocument.Open("report.pdf");
var html = doc.ToHtml(0);
Console.WriteLine(html);
WASM
const doc = new WasmPdfDocument(bytes);
const html = doc.toHtml(0);
console.log(html);
Rust
use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;
let mut doc = PdfDocument::open("report.pdf")?;
let options = ConversionOptions { detect_headings: true, ..Default::default() };
let html = doc.to_html(0, &options)?;
println!("{}", html);
API Reference
to_html(page_index, ...) -> str
Convert a single page to HTML.
Python Signature
doc.to_html(
page: int,
preserve_layout: bool = False,
detect_headings: bool = True,
include_images: bool = True,
image_output_dir: str | None = None,
embed_images: bool = True,
) -> str
JavaScript Signature
doc.toHtml(pageIndex, preserveLayout?, detectHeadings?, includeFormFields?) -> string
Rust Signature
pub fn to_html(
&mut self,
page_index: usize,
options: &ConversionOptions,
) -> Result<String>
| Parameter | Type | Default | Description |
|---|---|---|---|
| page_index | int / usize / number | – | Zero-based page index |
| preserve_layout | bool | false | Use CSS absolute positioning to match PDF layout |
| detect_headings | bool | true | Auto-detect heading levels from font sizes |
| include_images | bool | true | Include images in the HTML output |
| image_output_dir | str / None | None | Directory to save extracted images (Python/Rust only) |
| embed_images | bool | true | Embed images as base64 data URIs (Python/Rust only) |
| include_form_fields | bool | true | Include form field values (Python/JS) |
Returns: HTML string for the page.
When preserve_layout is true, the output uses <div> elements with absolute CSS positioning:
<div style="position: absolute; left: 72.0px; top: 100.0px; font-size: 24px; font-weight: bold;">
Introduction
</div>
When preserve_layout is false, the output uses semantic elements:
<h1>Introduction</h1>
<p>This report examines the quarterly results...</p>
to_html_all(...) -> str
Convert all pages to HTML. Each page is wrapped in a <div class="page"> element.
Python Signature
doc.to_html_all(
preserve_layout: bool = False,
detect_headings: bool = True,
include_images: bool = True,
image_output_dir: str | None = None,
embed_images: bool = True,
) -> str
JavaScript Signature
doc.toHtmlAll(preserveLayout?, detectHeadings?, includeFormFields?) -> string
Rust Signature
pub fn to_html_all(
&mut self,
options: &ConversionOptions,
) -> Result<String>
| Parameter | Type | Default | Description |
|---|---|---|---|
| preserve_layout | bool | false | Use CSS absolute positioning |
| detect_headings | bool | true | Detect headings |
| include_images | bool | true | Include images |
| image_output_dir | str / None | None | Image output directory |
| embed_images | bool | true | Embed images as base64 |
Returns: HTML string for all pages.
ConversionOptions
See the Markdown Conversion page for the full ConversionOptions reference. The same options struct is shared between Markdown and HTML conversion.
Advanced Examples
Convert all pages to HTML
WASM
const doc = new WasmPdfDocument(bytes);
const html = doc.toHtmlAll(false, true, true);
writeFileSync("report.html", html);
doc.free();
Create a complete HTML file
from pdf_oxide import PdfDocument
doc = PdfDocument("report.pdf")
body = doc.to_html_all(detect_headings=True)
html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Report</title>
<style>
body {{ font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 2rem; }}
.page {{ margin-bottom: 2rem; border-bottom: 1px solid #ccc; padding-bottom: 2rem; }}
</style>
</head>
<body>
{body}
</body>
</html>"""
with open("report.html", "w", encoding="utf-8") as f:
f.write(html)
Layout-preserved HTML for visual fidelity
use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;
let mut doc = PdfDocument::open("brochure.pdf")?;
let options = ConversionOptions {
preserve_layout: true,
detect_headings: false, // layout mode uses exact positioning
include_images: true,
embed_images: true,
..Default::default()
};
let html = doc.to_html(0, &options)?;
std::fs::write("brochure.html", &html)?;
Convert with external image files
from pdf_oxide import PdfDocument
doc = PdfDocument("report.pdf")
html = doc.to_html_all(
detect_headings=True,
include_images=True,
embed_images=False,
image_output_dir="output/images",
)
with open("output/report.html", "w") as f:
f.write(html)
# Images saved as output/images/img_001.png, img_002.jpg, etc.
Page-by-page conversion with custom wrappers
use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;
let mut doc = PdfDocument::open("book.pdf")?;
let options = ConversionOptions::default();
let page_count = doc.page_count()?;
let mut pages_html = Vec::new();
for i in 0..page_count {
let html = doc.to_html(i, &options)?;
pages_html.push(format!(
"<section id=\"page-{}\" class=\"page\">\n{}\n</section>",
i + 1, html
));
}
let full = pages_html.join("\n");
std::fs::write("output.html", &full)?;
Related Pages
- Markdown Conversion – Convert to Markdown instead of HTML
- Text Extraction – Extract raw text without formatting
- Image Extraction – Extract images separately