HTML Conversion
PDF Oxide converts PDF pages to structured HTML with heading detection, font styling, and optional CSS-based layout preservation. Use to_html() for a single page or to_html_all() to convert the entire document. When preserve_layout is enabled, elements are positioned with CSS absolute coordinates matching the original PDF layout. When disabled, the output is semantic HTML with natural flow.
Quick Example
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("report.pdf")
html = doc.to_html(0, detect_headings=True)
print(html)
Node.js
const { PdfDocument } = require("pdf-oxide");
const doc = new PdfDocument("report.pdf");
const html = doc.toHtml(0);
console.log(html);
doc.close();
Go
import pdfoxide "github.com/yfedoseev/pdf_oxide/go"
doc, _ := pdfoxide.Open("report.pdf")
defer doc.Close()
html, _ := doc.ToHtml(0)
fmt.Println(html)
C#
using PdfOxide.Core;
using var doc = PdfDocument.Open("report.pdf");
var html = doc.ToHtml(0);
Console.WriteLine(html);
WASM
const doc = new WasmPdfDocument(bytes);
const html = doc.toHtml(0);
console.log(html);
Rust
use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;
let mut doc = PdfDocument::open("report.pdf")?;
let options = ConversionOptions { detect_headings: true, ..Default::default() };
let html = doc.to_html(0, &options)?;
println!("{}", html);
Java
import fyi.oxide.pdf.PdfDocument;
try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
String html = doc.toHtml(0);
System.out.println(html);
}
Kotlin
import fyi.oxide.pdf.PdfDocument
PdfDocument.open(java.nio.file.Path.of("report.pdf")).use { doc ->
val html = doc.toHtml(0)
println(html)
}
Scala
import fyi.oxide.pdf.PdfDocument
import scala.util.Using
Using.resource(PdfDocument.open("report.pdf")) { doc =>
val html = doc.toHtml(0)
println(html)
}
Clojure
(require '[pdf-oxide.core :as pdf])
(with-open [doc (pdf/open "report.pdf")]
(println (pdf/to-html doc 0)))
PHP
use PdfOxide\PdfDocument;
$doc = PdfDocument::open('report.pdf');
$html = $doc->toHtml(0);
echo $html;
$doc->close();
Ruby
require 'pdf_oxide'
PdfOxide::PdfDocument.open('report.pdf') do |doc|
html = doc.to_html(0)
puts html
end
C++
#include <pdf_oxide/pdf_oxide.hpp>
auto doc = pdf_oxide::Document::open("report.pdf");
auto html = doc.to_html(0);
std::cout << html << std::endl;
Swift
import PdfOxide
let doc = try Document.open("report.pdf")
let html = try doc.toHtml(0)
print(html)
Dart
import 'package:pdf_oxide/pdf_oxide.dart';
final doc = PdfDocument.open('report.pdf');
final html = doc.toHtml(0);
print(html);
R
library(pdfoxide)
doc <- pdf_open("report.pdf")
html <- pdf_to_html(doc, 0)
cat(html)
Julia
using PdfOxide
doc = open_document("report.pdf")
html = to_html(doc, 0)
println(html)
Zig
const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;
var doc = try pdf_oxide.Document.open("report.pdf");
const html = try doc.toHtml(a, 0);
std.debug.print("{s}\n", .{html});
Objective-C
#import "POXPdfOxide.h"
NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];
NSString *html = [doc toHtml:0 error:&err];
NSLog(@"%@", html);
Elixir
{:ok, doc} = PdfOxide.open("report.pdf")
{:ok, html} = PdfOxide.to_html(doc, 0)
IO.puts(html)
API Reference
to_html(page_index, ...) -> str
Convert a single page to HTML.
Python Signature
doc.to_html(
page: int,
preserve_layout: bool = False,
detect_headings: bool = True,
include_images: bool = True,
image_output_dir: str | None = None,
embed_images: bool = True,
) -> str
JavaScript Signature
doc.toHtml(pageIndex, preserveLayout?, detectHeadings?, includeFormFields?) -> string
Rust Signature
pub fn to_html(
&mut self,
page_index: usize,
options: &ConversionOptions,
) -> Result<String>
| Parameter | Type | Default | Description |
|---|---|---|---|
| `page_index` | `int` / `usize` / `number` | – | Zero-based page index |
| `preserve_layout` | `bool` | `false` | Use CSS absolute positioning to match PDF layout |
| `detect_headings` | `bool` | `true` | Auto-detect heading levels from font sizes |
| `include_images` | `bool` | `true` | Include images in the HTML output |
| `image_output_dir` | `str` / `None` | `None` | Directory to save extracted images (Python/Rust only) |
| `embed_images` | `bool` | `true` | Embed images as base64 data URIs (Python/Rust only) |
| `include_form_fields` | `bool` | `true` | Include form field values (Python/JS) |
Returns: HTML string for the page.
When `preserve_layout` is true, the output uses `<div>` elements with absolute CSS positioning:
<div style="position: absolute; left: 72.0px; top: 100.0px; font-size: 24px; font-weight: bold;">
Introduction
</div>
When preserve_layout is false, the output uses semantic elements:
<h1>Introduction</h1>
<p>This report examines the quarterly results...</p>
to_html_all(...) -> str
Convert all pages to HTML. Each page is wrapped in a `<div class="page">` element.
Python Signature
doc.to_html_all(
preserve_layout: bool = False,
detect_headings: bool = True,
include_images: bool = True,
image_output_dir: str | None = None,
embed_images: bool = True,
) -> str
JavaScript Signature
doc.toHtmlAll(preserveLayout?, detectHeadings?, includeFormFields?) -> string
Rust Signature
pub fn to_html_all(
&mut self,
options: &ConversionOptions,
) -> Result<String>
| Parameter | Type | Default | Description |
|---|---|---|---|
| `preserve_layout` | `bool` | `false` | Use CSS absolute positioning |
| `detect_headings` | `bool` | `true` | Detect headings |
| `include_images` | `bool` | `true` | Include images |
| `image_output_dir` | `str` / `None` | `None` | Image output directory |
| `embed_images` | `bool` | `true` | Embed images as base64 |
Returns: HTML string for all pages.
ConversionOptions
See the Markdown Conversion page for the full ConversionOptions reference. The same options struct is shared between Markdown and HTML conversion.
Advanced Examples
Convert all pages to HTML
WASM
const doc = new WasmPdfDocument(bytes);
const html = doc.toHtmlAll(false, true, true);
writeFileSync("report.html", html);
doc.free();
Java
import fyi.oxide.pdf.PdfDocument;
import java.nio.file.*;
try (PdfDocument doc = PdfDocument.open(Path.of("report.pdf"))) {
String html = doc.toHtml(); // no-arg overload converts the whole document
Files.writeString(Path.of("report.html"), html);
}
Kotlin
import fyi.oxide.pdf.PdfDocument
import java.nio.file.*
PdfDocument.open(Path.of("report.pdf")).use { doc ->
val html = doc.toHtml() // whole document
Files.writeString(Path.of("report.html"), html)
}
Scala
import fyi.oxide.pdf.PdfDocument
import java.nio.file.*
import scala.util.Using
Using.resource(PdfDocument.open("report.pdf")) { doc =>
val html = doc.toHtml() // whole document
Files.writeString(Path.of("report.html"), html)
}
Clojure
(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])
(with-open [doc (pdf/open "report.pdf")]
(spit "report.html" (pdf/to-html doc))) ; whole document
Ruby
require 'pdf_oxide'
PdfOxide::PdfDocument.open('report.pdf') do |doc|
html = doc.to_html # whole document
File.write('report.html', html)
end
C++
#include <pdf_oxide/pdf_oxide.hpp>
#include <fstream>
auto doc = pdf_oxide::Document::open("report.pdf");
auto html = doc.to_html_all();
std::ofstream("report.html") << html;
Swift
import PdfOxide
let doc = try Document.open("report.pdf")
let html = try doc.toHtmlAll()
try html.write(toFile: "report.html", atomically: true, encoding: .utf8)
Dart
import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';
final doc = PdfDocument.open('report.pdf');
final html = doc.toHtmlAll();
File('report.html').writeAsStringSync(html);
R
library(pdfoxide)
doc <- pdf_open("report.pdf")
html <- pdf_to_html_all(doc)
writeLines(html, "report.html")
Julia
using PdfOxide
doc = open_document("report.pdf")
html = to_html_all(doc)
write("report.html", html)
Zig
const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;
var doc = try pdf_oxide.Document.open("report.pdf");
const html = try doc.toHtmlAll(a);
try std.fs.cwd().writeFile(.{ .sub_path = "report.html", .data = html });
Objective-C
#import "POXPdfOxide.h"
NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];
NSString *html = [doc toHtmlAllWithError:&err];
[html writeToFile:@"report.html" atomically:YES encoding:NSUTF8StringEncoding error:&err];
Elixir
{:ok, doc} = PdfOxide.open("report.pdf")
{:ok, html} = PdfOxide.to_html_all(doc)
File.write!("report.html", html)
Create a complete HTML file
from pdf_oxide import PdfDocument
doc = PdfDocument("report.pdf")
body = doc.to_html_all(detect_headings=True)
html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Report</title>
<style>
body {{ font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 2rem; }}
.page {{ margin-bottom: 2rem; border-bottom: 1px solid #ccc; padding-bottom: 2rem; }}
</style>
</head>
<body>
{body}
</body>
</html>"""
with open("report.html", "w", encoding="utf-8") as f:
f.write(html)
Layout-preserved HTML for visual fidelity
use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;
let mut doc = PdfDocument::open("brochure.pdf")?;
let options = ConversionOptions {
preserve_layout: true,
detect_headings: false, // layout mode uses exact positioning
include_images: true,
embed_images: true,
..Default::default()
};
let html = doc.to_html(0, &options)?;
std::fs::write("brochure.html", &html)?;
Convert with external image files
from pdf_oxide import PdfDocument
doc = PdfDocument("report.pdf")
html = doc.to_html_all(
detect_headings=True,
include_images=True,
embed_images=False,
image_output_dir="output/images",
)
with open("output/report.html", "w") as f:
f.write(html)
# Images saved as output/images/img_001.png, img_002.jpg, etc.
Page-by-page conversion with custom wrappers
use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;
let mut doc = PdfDocument::open("book.pdf")?;
let options = ConversionOptions::default();
let page_count = doc.page_count()?;
let mut pages_html = Vec::new();
for i in 0..page_count {
let html = doc.to_html(i, &options)?;
pages_html.push(format!(
"<section id=\"page-{}\" class=\"page\">\n{}\n</section>",
i + 1, html
));
}
let full = pages_html.join("\n");
std::fs::write("output.html", &full)?;
Related Pages
- Markdown Conversion – Convert to Markdown instead of HTML
- Text Extraction – Extract raw text without formatting
- Image Extraction – Extract images separately