Skip to content

HTML Conversion

PDF Oxide converts PDF pages to structured HTML with heading detection, font styling, and optional CSS-based layout preservation. Use to_html() to convert a single page or to_html_all() to convert the entire document. When preserve_layout is enabled, each element is absolutely positioned with CSS coordinates that match the original PDF layout. When it is disabled (the default), the output is semantic HTML that flows naturally.

Quick Example

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("report.pdf")
html = doc.to_html(0, detect_headings=True)
print(html)

Node.js

const { PdfDocument } = require("pdf-oxide");

const doc = new PdfDocument("report.pdf");
const html = doc.toHtml(0);
console.log(html);
doc.close();

Go

import pdfoxide "github.com/yfedoseev/pdf_oxide/go"

doc, _ := pdfoxide.Open("report.pdf")
defer doc.Close()
html, _ := doc.ToHtml(0)
fmt.Println(html)

C#

using PdfOxide.Core;

using var doc = PdfDocument.Open("report.pdf");
var html = doc.ToHtml(0);
Console.WriteLine(html);

WASM

const doc = new WasmPdfDocument(bytes);
const html = doc.toHtml(0);
console.log(html);

Rust

use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;

let mut doc = PdfDocument::open("report.pdf")?;
let options = ConversionOptions { detect_headings: true, ..Default::default() };
let html = doc.to_html(0, &options)?;
println!("{}", html);

API Reference

to_html(page_index, ...) -> str

Convert a single page to HTML.

Python Signature

doc.to_html(
    page: int,
    preserve_layout: bool = False,
    detect_headings: bool = True,
    include_images: bool = True,
    image_output_dir: str | None = None,
    embed_images: bool = True,
) -> str

JavaScript Signature

doc.toHtml(pageIndex, preserveLayout?, detectHeadings?, includeFormFields?) -> string

Rust Signature

pub fn to_html(
    &mut self,
    page_index: usize,
    options: &ConversionOptions,
) -> Result<String>
| Parameter | Type | Default | Description |
|---|---|---|---|
| page_index | int / usize / number | — (required) | Zero-based page index (named `page` in the Python signature) |
| preserve_layout | bool | false | Use CSS absolute positioning to match PDF layout |
| detect_headings | bool | true | Auto-detect heading levels from font sizes |
| include_images | bool | true | Include images in the HTML output |
| image_output_dir | str / None | None | Directory to save extracted images (Python/Rust only) |
| embed_images | bool | true | Embed images as base64 data URIs (Python/Rust only) |
| include_form_fields | bool | true | Include form field values (Python/JS) |

Returns: HTML string for the page.

When preserve_layout is true, the output uses <div> elements with absolute CSS positioning:

<div style="position: absolute; left: 72.0px; top: 100.0px; font-size: 24px; font-weight: bold;">
  Introduction
</div>

When preserve_layout is false, the output uses semantic elements:

<h1>Introduction</h1>
<p>This report examines the quarterly results...</p>

to_html_all(...) -> str

Convert all pages to HTML. Each page is wrapped in a <div class="page"> element.

Python Signature

doc.to_html_all(
    preserve_layout: bool = False,
    detect_headings: bool = True,
    include_images: bool = True,
    image_output_dir: str | None = None,
    embed_images: bool = True,
) -> str

JavaScript Signature

doc.toHtmlAll(preserveLayout?, detectHeadings?, includeFormFields?) -> string

Rust Signature

pub fn to_html_all(
    &mut self,
    options: &ConversionOptions,
) -> Result<String>
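| Parameter | Type | Default | Description |
|---|---|---|---|
| preserve_layout | bool | false | Use CSS absolute positioning to match PDF layout |
| detect_headings | bool | true | Auto-detect heading levels |
| include_images | bool | true | Include images in the HTML output |
| image_output_dir | str / None | None | Directory to save extracted images |
| embed_images | bool | true | Embed images as base64 data URIs |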

Returns: HTML string for all pages.


ConversionOptions

See the Markdown Conversion page for the full ConversionOptions reference. The same options struct is shared between Markdown and HTML conversion.


Advanced Examples

Convert all pages to HTML

WASM

const doc = new WasmPdfDocument(bytes);
const html = doc.toHtmlAll(false, true, true);
writeFileSync("report.html", html);
doc.free();

Create a complete HTML file

from pdf_oxide import PdfDocument

doc = PdfDocument("report.pdf")
body = doc.to_html_all(detect_headings=True)

html = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Report</title>
    <style>
        body {{ font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 2rem; }}
        .page {{ margin-bottom: 2rem; border-bottom: 1px solid #ccc; padding-bottom: 2rem; }}
    </style>
</head>
<body>
{body}
</body>
</html>"""

with open("report.html", "w", encoding="utf-8") as f:
    f.write(html)

Layout-preserved HTML for visual fidelity

use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;

let mut doc = PdfDocument::open("brochure.pdf")?;
let options = ConversionOptions {
    preserve_layout: true,
    detect_headings: false, // layout mode uses exact positioning
    include_images: true,
    embed_images: true,
    ..Default::default()
};

let html = doc.to_html(0, &options)?;
std::fs::write("brochure.html", &html)?;

Convert with external image files

from pdf_oxide import PdfDocument

doc = PdfDocument("report.pdf")
html = doc.to_html_all(
    detect_headings=True,
    include_images=True,
    embed_images=False,
    image_output_dir="output/images",
)

with open("output/report.html", "w") as f:
    f.write(html)
# Images saved as output/images/img_001.png, img_002.jpg, etc.

Page-by-page conversion with custom wrappers

use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;

let mut doc = PdfDocument::open("book.pdf")?;
let options = ConversionOptions::default();
let page_count = doc.page_count()?;

let mut pages_html = Vec::new();
for i in 0..page_count {
    let html = doc.to_html(i, &options)?;
    pages_html.push(format!(
        "<section id=\"page-{}\" class=\"page\">\n{}\n</section>",
        i + 1, html
    ));
}

let full = pages_html.join("\n");
std::fs::write("output.html", &full)?;