Skip to content

HTML 변환

PDF Oxide는 PDF 페이지를 구조화된 HTML로 변환합니다. 제목 감지, 폰트 스타일링, CSS 기반 레이아웃 보존 옵션을 지원합니다. 단일 페이지 변환에는 to_html()을, 전체 문서 변환에는 to_html_all()을 사용하세요. preserve_layout을 활성화하면 원본 PDF 레이아웃에 맞게 CSS 절대 좌표로 요소가 배치됩니다. 비활성화 시에는 자연스러운 흐름의 시맨틱 HTML이 출력됩니다.

빠른 예제

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("report.pdf")
html = doc.to_html(0, detect_headings=True)
print(html)

Node.js

const { PdfDocument } = require("pdf-oxide");

const doc = new PdfDocument("report.pdf");
const html = doc.toHtml(0);
console.log(html);
doc.close();

Go

import pdfoxide "github.com/yfedoseev/pdf_oxide/go"

doc, _ := pdfoxide.Open("report.pdf")
defer doc.Close()
html, _ := doc.ToHtml(0)
fmt.Println(html)

C#

using PdfOxide.Core;

using var doc = PdfDocument.Open("report.pdf");
var html = doc.ToHtml(0);
Console.WriteLine(html);

WASM

const doc = new WasmPdfDocument(bytes);
const html = doc.toHtml(0);
console.log(html);

Rust

use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;

let mut doc = PdfDocument::open("report.pdf")?;
let options = ConversionOptions { detect_headings: true, ..Default::default() };
let html = doc.to_html(0, &options)?;
println!("{}", html);

Java

import fyi.oxide.pdf.PdfDocument;

try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    String html = doc.toHtml(0);
    System.out.println(html);
}

Kotlin

import fyi.oxide.pdf.PdfDocument

PdfDocument.open(java.nio.file.Path.of("report.pdf")).use { doc ->
    val html = doc.toHtml(0)
    println(html)
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.Using

Using.resource(PdfDocument.open("report.pdf")) { doc =>
  val html = doc.toHtml(0)
  println(html)
}

Clojure

(require '[pdf-oxide.core :as pdf])

(with-open [doc (pdf/open "report.pdf")]
  (println (pdf/to-html doc 0)))

PHP

use PdfOxide\PdfDocument;

$doc = PdfDocument::open('report.pdf');
$html = $doc->toHtml(0);
echo $html;
$doc->close();

Ruby

require 'pdf_oxide'

PdfOxide::PdfDocument.open('report.pdf') do |doc|
  html = doc.to_html(0)
  puts html
end

C++

#include <pdf_oxide/pdf_oxide.hpp>

auto doc = pdf_oxide::Document::open("report.pdf");
auto html = doc.to_html(0);
std::cout << html << std::endl;

Swift

import PdfOxide

let doc = try Document.open("report.pdf")
let html = try doc.toHtml(0)
print(html)

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

final doc = PdfDocument.open('report.pdf');
final html = doc.toHtml(0);
print(html);

R

library(pdfoxide)

doc <- pdf_open("report.pdf")
html <- pdf_to_html(doc, 0)
cat(html)

Julia

using PdfOxide

doc = open_document("report.pdf")
html = to_html(doc, 0)
println(html)

Zig

const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;

var doc = try pdf_oxide.Document.open("report.pdf");
const html = try doc.toHtml(a, 0);
std.debug.print("{s}\n", .{html});

Objective-C

#import "POXPdfOxide.h"
NSError *err = nil;

POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];
NSString *html = [doc toHtml:0 error:&err];
NSLog(@"%@", html);

Elixir

{:ok, doc} = PdfOxide.open("report.pdf")
{:ok, html} = PdfOxide.to_html(doc, 0)
IO.puts(html)

API 레퍼런스

to_html(page_index, ...) -> str

단일 페이지를 HTML로 변환합니다.

Python Signature

doc.to_html(
    page: int,
    preserve_layout: bool = False,
    detect_headings: bool = True,
    include_images: bool = True,
    image_output_dir: str | None = None,
    embed_images: bool = True,
) -> str

JavaScript Signature

doc.toHtml(pageIndex, preserveLayout?, detectHeadings?, includeFormFields?) -> string

Rust Signature

pub fn to_html(
    &mut self,
    page_index: usize,
    options: &ConversionOptions,
) -> Result<String>
| 파라미터 | 타입 | 기본값 | 설명 |
| --- | --- | --- | --- |
| page_index | int / usize / number | — | 0부터 시작하는 페이지 인덱스 |
| preserve_layout | bool | false | PDF 레이아웃에 맞춰 CSS 절대 위치 사용 |
| detect_headings | bool | true | 폰트 크기로부터 제목 수준 자동 감지 |
| include_images | bool | true | HTML 출력에 이미지 포함 |
| image_output_dir | str / None | None | 추출된 이미지를 저장할 디렉터리 (Python/Rust만 해당) |
| embed_images | bool | true | 이미지를 base64 데이터 URI로 삽입 (Python/Rust만 해당) |
| include_form_fields | bool | true | 폼 필드 값 포함 (Python/JS) |

반환값: 해당 페이지의 HTML 문자열.

preserve_layout이 true이면 CSS 절대 위치를 사용하는 <div> 요소로 출력됩니다:

<div style="position: absolute; left: 72.0px; top: 100.0px; font-size: 24px; font-weight: bold;">
  Introduction
</div>

preserve_layout이 false이면 시맨틱 요소로 출력됩니다:

<h1>Introduction</h1>
<p>This report examines the quarterly results...</p>

to_html_all(...) -> str

모든 페이지를 HTML로 변환합니다. 각 페이지는 <div class="page"> 요소로 감싸집니다.

Python Signature

doc.to_html_all(
    preserve_layout: bool = False,
    detect_headings: bool = True,
    include_images: bool = True,
    image_output_dir: str | None = None,
    embed_images: bool = True,
) -> str

JavaScript Signature

doc.toHtmlAll(preserveLayout?, detectHeadings?, includeFormFields?) -> string

Rust Signature

pub fn to_html_all(
    &mut self,
    options: &ConversionOptions,
) -> Result<String>
| 파라미터 | 타입 | 기본값 | 설명 |
| --- | --- | --- | --- |
| preserve_layout | bool | false | CSS 절대 위치 사용 |
| detect_headings | bool | true | 제목 감지 |
| include_images | bool | true | 이미지 포함 |
| image_output_dir | str / None | None | 이미지 출력 디렉터리 |
| embed_images | bool | true | 이미지를 base64로 삽입 |

반환값: 전체 페이지의 HTML 문자열.


ConversionOptions

ConversionOptions 전체 레퍼런스는 Markdown 변환 페이지를 참고하세요. Markdown과 HTML 변환에 동일한 옵션 구조체가 사용됩니다.


고급 예제

모든 페이지를 HTML로 변환

WASM

const doc = new WasmPdfDocument(bytes);
const html = doc.toHtmlAll(false, true, true);
writeFileSync("report.html", html);
doc.free();

Java

import fyi.oxide.pdf.PdfDocument;
import java.nio.file.*;

try (PdfDocument doc = PdfDocument.open(Path.of("report.pdf"))) {
    String html = doc.toHtml(); // no-arg overload converts the whole document
    Files.writeString(Path.of("report.html"), html);
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.nio.file.*

PdfDocument.open(Path.of("report.pdf")).use { doc ->
    val html = doc.toHtml() // whole document
    Files.writeString(Path.of("report.html"), html)
}

Scala

import fyi.oxide.pdf.PdfDocument
import java.nio.file.*
import scala.util.Using

Using.resource(PdfDocument.open("report.pdf")) { doc =>
  val html = doc.toHtml() // whole document
  Files.writeString(Path.of("report.html"), html)
}

Clojure

(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])

(with-open [doc (pdf/open "report.pdf")]
  (spit "report.html" (pdf/to-html doc))) ; whole document

Ruby

require 'pdf_oxide'

PdfOxide::PdfDocument.open('report.pdf') do |doc|
  html = doc.to_html # whole document
  File.write('report.html', html)
end

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <fstream>

auto doc = pdf_oxide::Document::open("report.pdf");
auto html = doc.to_html_all();
std::ofstream("report.html") << html;

Swift

import PdfOxide

let doc = try Document.open("report.pdf")
let html = try doc.toHtmlAll()
try html.write(toFile: "report.html", atomically: true, encoding: .utf8)

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

final doc = PdfDocument.open('report.pdf');
final html = doc.toHtmlAll();
File('report.html').writeAsStringSync(html);

R

library(pdfoxide)

doc <- pdf_open("report.pdf")
html <- pdf_to_html_all(doc)
writeLines(html, "report.html")

Julia

using PdfOxide

doc = open_document("report.pdf")
html = to_html_all(doc)
write("report.html", html)

Zig

const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;

var doc = try pdf_oxide.Document.open("report.pdf");
const html = try doc.toHtmlAll(a);
try std.fs.cwd().writeFile(.{ .sub_path = "report.html", .data = html });

Objective-C

#import "POXPdfOxide.h"
NSError *err = nil;

POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];
NSString *html = [doc toHtmlAllWithError:&err];
[html writeToFile:@"report.html" atomically:YES encoding:NSUTF8StringEncoding error:&err];

Elixir

{:ok, doc} = PdfOxide.open("report.pdf")
{:ok, html} = PdfOxide.to_html_all(doc)
File.write!("report.html", html)

완전한 HTML 파일 생성

from pdf_oxide import PdfDocument

doc = PdfDocument("report.pdf")
body = doc.to_html_all(detect_headings=True)

html = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Report</title>
    <style>
        body {{ font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 2rem; }}
        .page {{ margin-bottom: 2rem; border-bottom: 1px solid #ccc; padding-bottom: 2rem; }}
    </style>
</head>
<body>
{body}
</body>
</html>"""

with open("report.html", "w", encoding="utf-8") as f:
    f.write(html)

시각적 충실도를 위한 레이아웃 보존 HTML

use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;

let mut doc = PdfDocument::open("brochure.pdf")?;
let options = ConversionOptions {
    preserve_layout: true,
    detect_headings: false, // layout mode uses exact positioning
    include_images: true,
    embed_images: true,
    ..Default::default()
};

let html = doc.to_html(0, &options)?;
std::fs::write("brochure.html", &html)?;

외부 이미지 파일로 변환

from pdf_oxide import PdfDocument

doc = PdfDocument("report.pdf")
html = doc.to_html_all(
    detect_headings=True,
    include_images=True,
    embed_images=False,
    image_output_dir="output/images",
)

with open("output/report.html", "w") as f:
    f.write(html)
# Images saved as output/images/img_001.png, img_002.jpg, etc.

사용자 정의 래퍼를 사용한 페이지별 변환

use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;

let mut doc = PdfDocument::open("book.pdf")?;
let options = ConversionOptions::default();
let page_count = doc.page_count()?;

let mut pages_html = Vec::new();
for i in 0..page_count {
    let html = doc.to_html(i, &options)?;
    pages_html.push(format!(
        "<section id=\"page-{}\" class=\"page\">\n{}\n</section>",
        i + 1, html
    ));
}

let full = pages_html.join("\n");
std::fs::write("output.html", &full)?;

관련 페이지