Skip to content

Markdown Conversion

PDF Oxide converts PDF pages to clean, readable Markdown. The conversion pipeline extracts text spans, clusters them into lines, consults the /StructTreeRoot for headings and list roles on Tagged PDFs, detects multi-column gutters and backward-x reading-order wraps, groups paragraphs, and emits Markdown syntax.

Since v0.3.36, for Tagged PDFs the converter reads StructRole(Heading(1..6) | ListItem | ListItemLabel | ListItemBody) directly from /StructTreeRoot instead of re-deriving heading levels from font size. Role information is propagated through nested marked-content references (MCRs), e.g. H1 → Span → MCR and LI → LBody → Span → MCR. For untagged documents, the geometric fallback still applies: bold text with a 5 % size bump is promoted to H4, and is_ordered_list_marker recognizes ordered-list markers such as 1. / 12. / a) / iv. / A. while rejecting figure captions and years.

Multi-column handling: spans on the same baseline that are separated by more than max(3 × font_size, 30 pt) are treated as belonging to different columns. Backward-x reading-order wraps (column-major last→first-span) break paragraphs instead of joining them into nonsense tokens.

RTL: bidi reorder is off by default — the earlier unconditional visual→logical reorder broke logical-order PDFs (Hebrew בנימין was being reversed). Spurious **bold** markers around Arabic contextual glyphs are stripped. Callers whose input is in visual order can invoke text::bidi::reorder_visual_to_logical manually (Rust).

Inline images are capped at 200 KB base64 payload (added v0.3.36). Over-the-cap images emit an HTML comment noting the original size; use image_output_dir to write them to disk instead.

Quick Example

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("paper.pdf")
md = doc.to_markdown(0, detect_headings=True)
print(md)

Node.js

const { PdfDocument } = require("pdf-oxide");

const doc = new PdfDocument("paper.pdf");
const md = doc.toMarkdown(0, { detectHeadings: true });
console.log(md);
doc.close();

Go

import pdfoxide "github.com/yfedoseev/pdf_oxide/go"

doc, _ := pdfoxide.Open("paper.pdf")
defer doc.Close()
md, _ := doc.ToMarkdown(0)
fmt.Println(md)

C#

using PdfOxide.Core;

using var doc = PdfDocument.Open("paper.pdf");
var md = doc.ToMarkdown(0);
Console.WriteLine(md);

WASM

const doc = new WasmPdfDocument(bytes);
const md = doc.toMarkdown(0, true);
console.log(md);

Rust

use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;

let mut doc = PdfDocument::open("paper.pdf")?;
let options = ConversionOptions { detect_headings: true, ..Default::default() };
let md = doc.to_markdown(0, &options)?;
println!("{}", md);

Java

import fyi.oxide.pdf.PdfDocument;

try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("paper.pdf"))) {
    String md = doc.toMarkdown(0);
    System.out.println(md);
}

Kotlin

import fyi.oxide.pdf.PdfDocument

PdfDocument.open(java.nio.file.Path.of("paper.pdf")).use { doc ->
    val md = doc.toMarkdown(0)
    println(md)
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.Using

Using.resource(PdfDocument.open("paper.pdf")) { doc =>
  val md = doc.toMarkdown(0)
  println(md)
}

Clojure

(require '[pdf-oxide.core :as pdf])

(with-open [doc (pdf/open "paper.pdf")]
  (println (pdf/to-markdown doc 0)))

PHP

use PdfOxide\PdfDocument;

$doc = PdfDocument::open('paper.pdf');
echo $doc->toMarkdown(0);
$doc->close();

Ruby

require 'pdf_oxide'

PdfOxide::PdfDocument.open('paper.pdf') do |doc|
  puts doc.to_markdown(0)
end

C++

#include <pdf_oxide/pdf_oxide.hpp>

auto doc = pdf_oxide::Document::open("paper.pdf");
auto md = doc.to_markdown(0);
std::cout << md << std::endl;

Swift

import PdfOxide

let doc = try Document.open("paper.pdf")
let md = try doc.toMarkdown(0)
print(md)

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

final doc = PdfDocument.open('paper.pdf');
final md = doc.toMarkdown(0);
print(md);

R

library(pdfoxide)

doc <- pdf_open("paper.pdf")
md <- pdf_to_markdown(doc, 0)
cat(md)

Julia

using PdfOxide

doc = open_document("paper.pdf")
md = to_markdown(doc, 0)
println(md)

Zig

const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;

var doc = try pdf_oxide.Document.open("paper.pdf");
const md = try doc.toMarkdown(a, 0);
std.debug.print("{s}\n", .{md});

Objective-C

#import "POXPdfOxide.h"
NSError *err = nil;

POXDocument *doc = [POXDocument openPath:@"paper.pdf" error:&err];
NSString *md = [doc toMarkdown:0 error:&err];
NSLog(@"%@", md);

Elixir

{:ok, doc} = PdfOxide.open("paper.pdf")
{:ok, md} = PdfOxide.to_markdown(doc, 0)
IO.puts(md)

API Reference

to_markdown(page_index, ...) -> str

Convert a single page to Markdown.

Python Signature

doc.to_markdown(
    page: int,
    preserve_layout: bool = False,
    detect_headings: bool = True,
    include_images: bool = True,
    image_output_dir: str | None = None,
    embed_images: bool = True,
) -> str

JavaScript Signature

doc.toMarkdown(pageIndex, detectHeadings?, includeImages?, includeFormFields?) -> string

Rust Signature

pub fn to_markdown(
    &mut self,
    page_index: usize,
    options: &ConversionOptions,
) -> Result<String>

Java Signature

String toMarkdown(int pageIndex)

Kotlin Signature

fun toMarkdown(pageIndex: Int): String

Scala Signature

def toMarkdown(pageIndex: Int): String

Clojure Signature

(pdf/to-markdown doc page-index) ; => String

PHP Signature

public function toMarkdown(int $pageIndex): string

Ruby Signature

doc.to_markdown(page_index) # => String

C++ Signature

std::string to_markdown(int page_index) const;

Swift Signature

func toMarkdown(_ pageIndex: Int) throws -> String

Dart Signature

String toMarkdown(int pageIndex)

R Signature

pdf_to_markdown(doc, page_index)  # character

Julia Signature

to_markdown(doc, page_index)::String

Zig Signature

pub fn toMarkdown(self: *Document, allocator: std.mem.Allocator, page_index: usize) ![]u8

Objective-C Signature

- (NSString *)toMarkdown:(NSInteger)pageIndex error:(NSError **)error;

Elixir Signature

PdfOxide.to_markdown(doc, page_index) :: {:ok, String.t()} | {:error, term()}

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| page_index | int / usize / number | — | Zero-based page index |
| preserve_layout | bool | false | Preserve visual layout positioning |
| detect_headings | bool | true | Detect headings based on font size and weight |
| include_images | bool | true | Include images in output |
| image_output_dir | str / None | None | Directory to save extracted images (Python/Rust only). Unaffected by the 200 KB inline cap. |
| embed_images | bool | true | Embed images as base64 data URIs (Python/Rust only). Payloads over 200 KB emit a placeholder HTML comment noting the original size (v0.3.36). |
| include_form_fields | bool | true | Include form field values (Python/JS) |

Returns: Markdown string for the page.


to_markdown_all(...) -> str

Convert all pages to Markdown, separated by horizontal rules (---).

Python Signature

doc.to_markdown_all(
    preserve_layout: bool = False,
    detect_headings: bool = True,
    include_images: bool = True,
    image_output_dir: str | None = None,
    embed_images: bool = True,
) -> str

JavaScript Signature

doc.toMarkdownAll(detectHeadings?, includeImages?, includeFormFields?) -> string

Rust Signature

pub fn to_markdown_all(
    &mut self,
    options: &ConversionOptions,
) -> Result<String>

Java Signature

String toMarkdown()  // no-arg overload = whole document

Kotlin Signature

fun toMarkdown(): String  // no-arg = whole document

Scala Signature

def toMarkdown(): String  // no-arg = whole document

Clojure Signature

(pdf/to-markdown doc) ; no page index = whole document => String

PHP Signature

public function toMarkdownAll(): string

Ruby Signature

doc.to_markdown # nil page index = whole document => String

C++ Signature

std::string to_markdown_all() const;

Swift Signature

func toMarkdownAll() throws -> String

Dart Signature

String toMarkdownAll()

R Signature

pdf_to_markdown_all(doc)  # character

Julia Signature

to_markdown_all(doc)::String

Zig Signature

pub fn toMarkdownAll(self: *Document, allocator: std.mem.Allocator) ![]u8

Objective-C Signature

- (NSString *)toMarkdownAllWithError:(NSError **)error;

Elixir Signature

PdfOxide.to_markdown_all(doc) :: {:ok, String.t()} | {:error, term()}

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| preserve_layout | bool | false | Preserve visual layout |
| detect_headings | bool | true | Detect headings |
| include_images | bool | true | Include images |
| image_output_dir | str / None | None | Image output directory |
| embed_images | bool | true | Embed images as base64 |

Returns: Markdown string for all pages joined with --- separators.


to_markdown_with_ocr(page_index, model_path, options) -> str

Convert a page to Markdown with OCR fallback for scanned pages. When the page has little or no extractable text, OCR is used to recognize text from the rendered page image. Requires the ocr feature.

| Parameter | Type | Description |
| --- | --- | --- |
| page_index | usize | Zero-based page index |
| model_path | &str | Path to the OCR model files |
| options | &ConversionOptions | Conversion options |

Rust

let mut doc = PdfDocument::open("scanned.pdf")?;
let options = ConversionOptions { detect_headings: true, ..Default::default() };
let md = doc.to_markdown_with_ocr(0, "/path/to/models", &options)?;
println!("{}", md);

ConversionOptions

The ConversionOptions struct controls all conversion behavior.

| Field | Type | Default | Description |
| --- | --- | --- | --- |
| preserve_layout | bool | false | Preserve visual layout with positioning |
| detect_headings | bool | true | Auto-detect headings from font size clusters |
| extract_tables | bool | false | Extract tables (experimental) |
| include_images | bool | true | Include images in output |
| image_output_dir | Option<String> | None | Save images to this directory |
| embed_images | bool | true | Embed images as base64 data URIs |
| reading_order_mode | ReadingOrderMode | Auto | How to determine reading order |
| bold_marker_behavior | BoldMarkerBehavior | Conservative | Bold marker application strategy |

How It Works

The Markdown conversion pipeline operates in several stages:

  1. Text Extraction – Extracts TextSpan objects from the page content stream, capturing text, position, font, size, weight, and color.

  2. Character Clustering – Groups characters into words based on inter-character gaps, then words into lines based on vertical proximity.

  3. Reading Order – Determines reading order using either the Tagged PDF structure tree (preferred) or a graph-based spatial analysis of text block positions.

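  4. Heading Detection – When detect_headings is enabled, identifies heading levels. Since v0.3.36, Tagged PDFs supply heading roles (Heading 1–6) directly from /StructTreeRoot instead of having levels re-derived from font size. For untagged documents, the geometric fallback clusters font sizes across the page: larger and bolder text is mapped to #, ##, ### headings, and bold text with a 5 % size bump is promoted to ####.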

  5. Formatting – Applies bold (**text**) and italic (*text*) markers based on font weight and style metadata.

  6. Table Detection – Identifies tabular layouts using spatial analysis of aligned text blocks and emits GFM-style Markdown tables.

  7. Whitespace Cleanup – Normalizes spacing, removes redundant blank lines, and ensures consistent paragraph breaks.


Advanced Examples

Convert entire PDF to a Markdown file

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("book.pdf")
md = doc.to_markdown_all(detect_headings=True)

with open("book.md", "w", encoding="utf-8") as f:
    f.write(md)

Node.js

const fs = require("node:fs");

const doc = new PdfDocument("book.pdf");
const md = doc.toMarkdownAll();
fs.writeFileSync("book.md", md);
doc.close();

Go

doc, _ := pdfoxide.Open("book.pdf")
defer doc.Close()
md, _ := doc.ToMarkdownAll()
os.WriteFile("book.md", []byte(md), 0644)

C#

using var doc = PdfDocument.Open("book.pdf");
var md = doc.ToMarkdownAll();
File.WriteAllText("book.md", md);

WASM

const doc = new WasmPdfDocument(bytes);
const md = doc.toMarkdownAll(true);
writeFileSync("book.md", md);
doc.free();

Java

import fyi.oxide.pdf.PdfDocument;
import java.nio.file.*;

try (PdfDocument doc = PdfDocument.open(Path.of("book.pdf"))) {
    String md = doc.toMarkdown();
    Files.writeString(Path.of("book.md"), md);
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.nio.file.*

PdfDocument.open(Path.of("book.pdf")).use { doc ->
    Files.writeString(Path.of("book.md"), doc.toMarkdown())
}

Scala

import fyi.oxide.pdf.PdfDocument
import java.nio.file.{Files, Path}
import scala.util.Using

Using.resource(PdfDocument.open("book.pdf")) { doc =>
  Files.writeString(Path.of("book.md"), doc.toMarkdown())
}

Clojure

(require '[pdf-oxide.core :as pdf]
         '[clojure.java.io :as io])

(with-open [doc (pdf/open "book.pdf")]
  (spit "book.md" (pdf/to-markdown doc)))

PHP

use PdfOxide\PdfDocument;

$doc = PdfDocument::open('book.pdf');
file_put_contents('book.md', $doc->toMarkdownAll());
$doc->close();

Ruby

require 'pdf_oxide'

PdfOxide::PdfDocument.open('book.pdf') do |doc|
  File.write('book.md', doc.to_markdown)
end

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <fstream>

auto doc = pdf_oxide::Document::open("book.pdf");
auto md = doc.to_markdown_all();
std::ofstream("book.md") << md;

Swift

import PdfOxide

let doc = try Document.open("book.pdf")
let md = try doc.toMarkdownAll()
try md.write(toFile: "book.md", atomically: true, encoding: .utf8)

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

final doc = PdfDocument.open('book.pdf');
final md = doc.toMarkdownAll();
File('book.md').writeAsStringSync(md);

R

library(pdfoxide)

doc <- pdf_open("book.pdf")
md <- pdf_to_markdown_all(doc)
writeLines(md, "book.md")

Julia

using PdfOxide

doc = open_document("book.pdf")
md = to_markdown_all(doc)
write("book.md", md)

Zig

const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;

var doc = try pdf_oxide.Document.open("book.pdf");
const md = try doc.toMarkdownAll(a);
try std.fs.cwd().writeFile(.{ .sub_path = "book.md", .data = md });

Objective-C

#import "POXPdfOxide.h"
NSError *err = nil;

POXDocument *doc = [POXDocument openPath:@"book.pdf" error:&err];
NSString *md = [doc toMarkdownAllWithError:&err];
[md writeToFile:@"book.md" atomically:YES encoding:NSUTF8StringEncoding error:&err];

Elixir

{:ok, doc} = PdfOxide.open("book.pdf")
{:ok, md} = PdfOxide.to_markdown_all(doc)
File.write!("book.md", md)

Convert with images saved to a directory

use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;

let mut doc = PdfDocument::open("report.pdf")?;
let options = ConversionOptions {
    detect_headings: true,
    include_images: true,
    embed_images: false,
    image_output_dir: Some("output/images".to_string()),
    ..Default::default()
};

let md = doc.to_markdown_all(&options)?;
std::fs::write("output/report.md", &md)?;

Page-by-page conversion with progress

from pdf_oxide import PdfDocument

doc = PdfDocument("report.pdf")
pages = doc.page_count()

parts = []
for i in range(pages):
    md = doc.to_markdown(i, detect_headings=True)
    parts.append(md)
    print(f"Converted page {i + 1}/{pages}")

full_md = "\n\n---\n\n".join(parts)
with open("report.md", "w") as f:
    f.write(full_md)

Disable heading detection for flat text

doc = PdfDocument("form.pdf")
md = doc.to_markdown(0, detect_headings=False)
# All text rendered as paragraphs, no # headings