What is the fastest Python PDF library?

PDF Oxide is the fastest Python PDF library, with 0.8ms mean text extraction time — 5.8× faster than PyMuPDF (4.6ms) and 15× faster than pypdf (12.1ms). Benchmarked on 3,830 real-world PDFs with 100% pass rate.

Is PDF Oxide free for commercial use?

Yes. PDF Oxide is MIT licensed — free for all uses including commercial products, SaaS, and proprietary software. No license fees, no sales calls, no AGPL restrictions.

Can PDF Oxide handle scanned PDFs with OCR?

Yes. PDF Oxide includes built-in OCR via PaddleOCR and ONNX Runtime. No Tesseract installation needed — just pip install pdf_oxide and use extract_text_ocr(). Supports PP-OCRv3, v4, and v5 models.

Does PDF Oxide support XFA forms?

Yes. PDF Oxide is the only Python PDF library that can detect, analyze, and extract data from XFA forms (XML Forms Architecture). PyMuPDF, pypdf, pdfplumber, and pdfminer cannot read XFA form data.

How does PDF Oxide compare to PyMuPDF?

PDF Oxide is 5.8× faster than PyMuPDF (0.8ms vs 4.6ms mean), has a 100% pass rate vs 99.3%, and is MIT licensed vs PyMuPDF's AGPL-3.0. PDF Oxide also has built-in Markdown/HTML output and XFA form support that PyMuPDF lacks.

Can PDF Oxide convert PDF to Markdown?

Yes. PDF Oxide has built-in PDF to Markdown conversion with heading detection, table preservation, and list formatting — ideal for LLM and RAG pipelines. No separate package needed, unlike PyMuPDF which requires pymupdf4llm (69× slower).

PDFページの分類 — テキスト vs スキャン

テキスト抽出の前に、まずどのようなページを扱っているかを把握しておくことが重要です。使えるネイティブテキストレイヤーがあるのか、それともOCRが必要なスキャン画像なのか。PDF Oxideはこれを低コストのプリフライトで解決します — classify_page と classify_document はPDF内部（グリフ数、画像面積、コーデック、不可視テキスト比率、文字化けグリフ比率）をOCRなし・ページのラスタライズなしで検査します。

分類結果は説明可能です。すべての判定には信頼度スコア、型付きのreasonコード、および判断根拠となった生のsignalsが含まれます。これにより、ページを適切な抽出器（ネイティブテキスト vs OCR）にルーティングし、なぜそうなったかをログに残すことができます。

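バインディングのカバレッジ。 分類機能は Rust、Go、C#、Swift、WASM/JavaScript に加え、このページにサンプルを掲載している Java、PHP、Ruby、C++、Dart、R、Julia、Zig、Objective-C、Elixir の各バインディングから利用できます（Java・PHP・Ruby は AutoExtractor 経由）。Python と Node N-API バインディングは v0.3.69 時点で classify_page / classify_document を公開していません — それらのランタイムからは自動抽出パスを使うか、Rustコア / CLIを経由してください。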

単一のPDFページを分類するには？

classify_page は0始まりのページインデックスを受け取り、PageClassification を返します。C-ABIバインディング（Go、C#、Swift、WASM）ではデシリアライズが必要なJSON文字列として返されます。

Rust

use pdf_oxide::PdfDocument;

/// Opens `mixed.pdf`, classifies its first page (index 0) and prints the
/// verdict, the reason code, and three of the raw signals behind it.
fn main() -> pdf_oxide::Result<()> {
    let pdf = PdfDocument::open("mixed.pdf")?;

    // Signature, as documented for the binding:
    //   PdfDocument::classify_page(&self, page: usize)
    //     -> Result<pdf_oxide::extractors::auto::PageClassification>
    let verdict = pdf.classify_page(0)?;
    let signals = &verdict.signals;

    println!(
        "page {} is {:?} (confidence {:.2})",
        verdict.page, verdict.kind, verdict.confidence
    );
    println!("reason: {:?}", verdict.reason);
    println!(
        "glyphs={} image_area={:.2} garbled={:.2}",
        signals.text_glyph_count, signals.image_area_ratio, signals.garbled_ratio
    );
    Ok(())
}

Go

package main

import (
	"encoding/json"
	"fmt"
	"log"

	pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

// main opens mixed.pdf, classifies page 0 and prints the decoded verdict.
func main() {
	pdf, err := pdfoxide.Open("mixed.pdf")
	if err != nil {
		log.Fatal(err)
	}
	defer pdf.Close()

	// Binding signature:
	//   func (doc *PdfDocument) ClassifyPage(pageIndex int) (string, error)
	// The returned string is JSON and is decoded below.
	payload, err := pdf.ClassifyPage(0)
	if err != nil {
		log.Fatal(err)
	}

	// Only the fields that are printed are declared here; encoding/json
	// ignores any other keys present in the payload.
	type pageClassification struct {
		Page       int     `json:"page"`
		Kind       string  `json:"kind"`
		Confidence float64 `json:"confidence"`
		Reason     string  `json:"reason"`
	}

	var verdict pageClassification
	if err = json.Unmarshal([]byte(payload), &verdict); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("page %d is %s (%.2f): %s\n", verdict.Page, verdict.Kind, verdict.Confidence, verdict.Reason)
}

C#

using System;
using System.Text.Json;
using PdfOxide.Core;

// Open the PDF; the `using` declaration disposes it at the end of the scope.
using var pdf = PdfDocument.Open("mixed.pdf");

// string PdfDocument.ClassifyPage(int pageIndex)
string payload = pdf.ClassifyPage(0);

// Parse the payload and read the four fields that are reported.
using var parsed = JsonDocument.Parse(payload);
JsonElement verdict = parsed.RootElement;

var page = verdict.GetProperty("page").GetInt32();
var kind = verdict.GetProperty("kind").GetString();
var confidence = verdict.GetProperty("confidence").GetDouble();
var reason = verdict.GetProperty("reason").GetString();

Console.WriteLine($"page {page} is {kind} ({confidence:F2}): {reason}");

Swift

import PdfOxide

// Open the document; the initializer is called with `try`, so a failure propagates.
let pdf = try PdfDocument(path: "mixed.pdf")

// func classifyPage(_ pageIndex: Int) throws -> String   (JSON)
// Index 0 selects the first page.
let classificationJSON = try pdf.classifyPage(0)
print(classificationJSON)

JavaScript (WASM)

import init, { WasmPdfDocument } from "pdf-oxide-wasm";

// Initialise the WASM module before touching any of its exports.
await init();

// fetch() resolves even for 4xx/5xx responses, so check `ok` explicitly;
// otherwise the bytes of an error page would be handed to the PDF parser.
const response = await fetch("mixed.pdf");
if (!response.ok) {
  throw new Error(`failed to fetch mixed.pdf: HTTP ${response.status}`);
}
const bytes = new Uint8Array(await response.arrayBuffer());
const doc = WasmPdfDocument.fromBytes(bytes);

// WasmPdfDocument.classifyPage(pageIndex) -> JSON string
const pc = JSON.parse(doc.classifyPage(0));
console.log(`page ${pc.page} is ${pc.kind} (${pc.confidence}): ${pc.reason}`);

Java

import fyi.oxide.pdf.PdfDocument;
import fyi.oxide.pdf.AutoExtractor;
import fyi.oxide.pdf.auto.PageClass;

// try-with-resources closes the document when the block exits.
try (PdfDocument pdf = PdfDocument.open(java.nio.file.Path.of("mixed.pdf"))) {
    // PageClass AutoExtractor.classifyPageKind(int pageIndex)
    PageClass pageKind = AutoExtractor.of(pdf).classifyPageKind(0);

    // Prints one of: TEXT_LAYER / SCANNED / IMAGE_TEXT / MIXED / EMPTY
    System.out.println("page 0 is " + pageKind);
}

PHP

<?php
use PdfOxide\PdfDocument;
use PdfOxide\AutoExtractor;

$pdf = PdfDocument::open('mixed.pdf');
$extractor = AutoExtractor::of($pdf);

// string AutoExtractor::classifyPageKind(int $pageIndex)
// The result is one of: text_layer / scanned / image_text / mixed / empty
$kind = $extractor->classifyPageKind(0);
echo "page 0 is {$kind}\n";

Ruby

require 'pdf_oxide'

pdf = PdfOxide::PdfDocument.open('mixed.pdf')

# AutoExtractor#classify_page(page_index) returns a Hash with the keys
#   reason:, kind:, confidence:, classification:
pc = PdfOxide::AutoExtractor.new(pdf).classify_page(0)
puts "page 0 is #{pc[:kind]} (#{pc[:confidence]}): #{pc[:reason]}"

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <iostream>

// Opens mixed.pdf, classifies page 0 and writes the JSON verdict to stdout.
int main() {
    auto pdf = pdf_oxide::Document::open("mixed.pdf");

    // std::string Document::classify_page(int page_index) — JSON
    const std::string verdict = pdf.classify_page(0);
    std::cout << verdict << '\n';
    return 0;
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

/// Classifies page 0 of `mixed.pdf` and prints the JSON verdict.
void main() {
  final pdf = PdfDocument.open('mixed.pdf');

  // String PdfDocument.classifyPage(int page) — JSON
  final String verdict = pdf.classifyPage(0);
  print(verdict);
}

R

library(pdfoxide)

document <- pdf_open("mixed.pdf")

# pdf_classify_page(doc, page) — JSON PageClassification
# The result is printed as-is, followed by a newline.
verdict <- pdf_classify_page(document, 0)
cat(verdict, "\n")

Julia

using PdfOxide

pdf = open_document("mixed.pdf")

# classify_page(doc, page) -> JSON string
# The JSON is printed without further decoding.
verdict = classify_page(pdf, 0)
println(verdict)

Zig

const std = @import("std");
const pdf = @import("pdf_oxide");

/// Opens `mixed.pdf`, classifies page 0 and prints the JSON verdict.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Tear the allocator down on exit; in safe builds deinit() also reports
    // allocations that were never freed. Registered first so it runs after
    // the other defers below.
    defer _ = gpa.deinit();
    const alloc = gpa.allocator();

    var doc = try pdf.Document.open("mixed.pdf");
    defer doc.deinit();

    // classifyPage(alloc, page_index) -> caller-owned JSON bytes
    const json = try doc.classifyPage(alloc, 0);
    defer alloc.free(json);
    std.debug.print("{s}\n", .{json});
}

Objective-C

#import <POXPdfOxide.h>

NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"mixed.pdf" error:&err];

// -classifyPage:error: -> JSON NSString
// Messaging a nil `doc` is a no-op that returns nil, so the single check on
// `json` below covers a failed open as well as a failed classification.
// NOTE(review): assumes both methods follow the Cocoa convention of
// returning nil and filling `err` on failure — confirm against the binding.
NSString *json = [doc classifyPage:0 error:&err];
if (json == nil) {
    NSLog(@"classification failed: %@", err);
} else {
    NSLog(@"%@", json);
}

Elixir

# Pattern-match on {:ok, _} so a failed open raises a MatchError right here.
{:ok, pdf} = PdfOxide.open("mixed.pdf")

# PdfOxide.classify_page(doc, page) -> JSON string
pdf
|> PdfOxide.classify_page(0)
|> IO.puts()

PDFドキュメント全体を一度に分類するには？

classify_document は同じ低コストのプリフライトをすべてのページで実行し、結果を集約します。ページごとのkindリスト、0始まりのpages_needing_ocrインデックス、そして全体のsummaryが得られます。判定はページ単位で行われます — PDF Oxideは混在PDFに対して一つのドキュメントモードを強制しません。

Rust

use pdf_oxide::PdfDocument;

/// Classifies every page of `report.pdf` and prints the document summary,
/// the pages that need OCR, and the kind assigned to each page.
fn main() -> pdf_oxide::Result<()> {
    let pdf = PdfDocument::open("report.pdf")?;

    // Signature, as documented for the binding:
    //   PdfDocument::classify_document(&self)
    //     -> Result<pdf_oxide::extractors::auto::DocumentClassification>
    let classification = pdf.classify_document()?;

    println!("summary: {:?}", classification.summary);
    println!("pages needing OCR: {:?}", classification.pages_needing_ocr);

    // One line per page, in page order.
    classification
        .pages
        .iter()
        .enumerate()
        .for_each(|(i, kind)| println!("  page {i}: {kind:?}"));
    Ok(())
}

Go

package main

import (
	"encoding/json"
	"fmt"
	"log"

	pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

// main opens report.pdf, classifies the whole document and prints the
// summary together with the pages that need OCR.
func main() {
	pdf, err := pdfoxide.Open("report.pdf")
	if err != nil {
		log.Fatal(err)
	}
	defer pdf.Close()

	// Binding signature:
	//   func (doc *PdfDocument) ClassifyDocument() (string, error)
	// The returned string is JSON and is decoded below.
	payload, err := pdf.ClassifyDocument()
	if err != nil {
		log.Fatal(err)
	}

	// Mirrors the three top-level keys of the payload.
	type documentClassification struct {
		Pages           []string `json:"pages"`
		PagesNeedingOCR []int    `json:"pages_needing_ocr"`
		Summary         string   `json:"summary"`
	}

	var result documentClassification
	if err = json.Unmarshal([]byte(payload), &result); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("summary=%s ocr_pages=%v\n", result.Summary, result.PagesNeedingOCR)
}

C#

using System;
using PdfOxide.Core;

// Open the PDF; the `using` declaration disposes it at the end of the scope.
using var pdf = PdfDocument.Open("report.pdf");

// string PdfDocument.ClassifyDocument()
// The returned string is written to the console unchanged.
Console.WriteLine(pdf.ClassifyDocument());

Swift

import PdfOxide

// Open the document; the initializer is called with `try`, so a failure propagates.
let pdf = try PdfDocument(path: "report.pdf")

// func classifyDocument() throws -> String   (JSON)
let classificationJSON = try pdf.classifyDocument()
print(classificationJSON)

JavaScript (WASM)

import init, { WasmPdfDocument } from "pdf-oxide-wasm";

// Initialise the WASM module before touching any of its exports.
await init();

// fetch() resolves even for 4xx/5xx responses, so check `ok` explicitly;
// otherwise the bytes of an error page would be handed to the PDF parser.
const response = await fetch("report.pdf");
if (!response.ok) {
  throw new Error(`failed to fetch report.pdf: HTTP ${response.status}`);
}
const bytes = new Uint8Array(await response.arrayBuffer());
const doc = WasmPdfDocument.fromBytes(bytes);

// WasmPdfDocument.classifyDocument() -> JSON string
const dc = JSON.parse(doc.classifyDocument());
console.log("pages needing OCR:", dc.pages_needing_ocr);

Java

import fyi.oxide.pdf.PdfDocument;
import fyi.oxide.pdf.AutoExtractor;
import fyi.oxide.pdf.auto.PageClass;
import java.util.List;

// try-with-resources closes the document when the block exits.
try (PdfDocument pdf = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    // List<PageClass> AutoExtractor.classifyDocumentKinds()
    List<PageClass> kinds = AutoExtractor.of(pdf).classifyDocumentKinds();

    // Walk the list in order, counting pages from 0.
    int page = 0;
    for (PageClass kind : kinds) {
        System.out.println("page " + page + ": " + kind);
        page++;
    }
}

PHP

<?php
use PdfOxide\PdfDocument;
use PdfOxide\AutoExtractor;

$pdf = PdfDocument::open('report.pdf');

// array<int,string> AutoExtractor::classifyDocumentKinds()
// Each array key is printed as the page number next to its kind.
foreach (AutoExtractor::of($pdf)->classifyDocumentKinds() as $i => $kind) {
    echo "page {$i}: {$kind}\n";
}

Ruby

require 'pdf_oxide'

pdf = PdfOxide::PdfDocument.open('report.pdf')

# AutoExtractor#classify_document => decoded JSON envelope
# (indexed with string keys below)
dc = PdfOxide::AutoExtractor.new(pdf).classify_document
puts "pages needing OCR: #{dc['pages_needing_ocr']}"

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <iostream>

// Opens report.pdf, classifies every page and writes the JSON result to stdout.
int main() {
    auto pdf = pdf_oxide::Document::open("report.pdf");

    // std::string Document::classify_document() — JSON
    const std::string classification = pdf.classify_document();
    std::cout << classification << '\n';
    return 0;
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

/// Classifies every page of `report.pdf` and prints the JSON result.
void main() {
  final pdf = PdfDocument.open('report.pdf');

  // String PdfDocument.classifyDocument() — JSON
  final String classification = pdf.classifyDocument();
  print(classification);
}

R

library(pdfoxide)

document <- pdf_open("report.pdf")

# pdf_classify_document(doc) — JSON DocumentClassification
# The result is printed as-is, followed by a newline.
classification <- pdf_classify_document(document)
cat(classification, "\n")

Julia

using PdfOxide

pdf = open_document("report.pdf")

# classify_document(doc) -> JSON string
# The JSON is printed without further decoding.
classification = classify_document(pdf)
println(classification)

Zig

const std = @import("std");
const pdf = @import("pdf_oxide");

/// Opens `report.pdf`, classifies every page and prints the JSON result.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Tear the allocator down on exit; in safe builds deinit() also reports
    // allocations that were never freed. Registered first so it runs after
    // the other defers below.
    defer _ = gpa.deinit();
    const alloc = gpa.allocator();

    var doc = try pdf.Document.open("report.pdf");
    defer doc.deinit();

    // classifyDocument(alloc) -> caller-owned JSON bytes
    const json = try doc.classifyDocument(alloc);
    defer alloc.free(json);
    std.debug.print("{s}\n", .{json});
}

Objective-C

#import <POXPdfOxide.h>

NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];

// -classifyDocumentWithError: -> JSON NSString
// Messaging a nil `doc` is a no-op that returns nil, so the single check on
// `json` below covers a failed open as well as a failed classification.
// NOTE(review): assumes both methods follow the Cocoa convention of
// returning nil and filling `err` on failure — confirm against the binding.
NSString *json = [doc classifyDocumentWithError:&err];
if (json == nil) {
    NSLog(@"classification failed: %@", err);
} else {
    NSLog(@"%@", json);
}

Elixir

# Pattern-match on {:ok, _} so a failed open raises a MatchError right here.
{:ok, pdf} = PdfOxide.open("report.pdf")

# PdfOxide.classify_document(doc) -> JSON string
pdf
|> PdfOxide.classify_document()
|> IO.puts()

分類結果のJSONはどのような形式か？

classify_page は PageClassification を返します：

{
  "page": 0,
  "kind": "text_layer",
  "confidence": 0.97,
  "reason": "native_text_high_confidence",
  "signals": {
    "text_glyph_count": 1840,
    "text_area_ratio": 0.62,
    "image_area_ratio": 0.0,
    "codec": "none",
    "invisible_text_ratio": 0.0,
    "garbled_ratio": 0.0,
    "fragmented_word_ratio": 0.01,
    "consecutive_repeat_ratio": 0.0,
    "vector_path_density": 0.04,
    "has_reliable_structure": true,
    "producer_prior": "authoring",
    "page_is_empty": false
  }
}

classify_document は DocumentClassification を返します：

{
  "pages": ["text_layer", "scanned", "image_text"],
  "pages_needing_ocr": [1],
  "summary": "mixed"
}

ページ種別（`kind`）

種別	意味	推奨ルート
`text_layer`	使用可能なネイティブテキストが支配的	テキストレイヤーを抽出
`scanned`	画像が支配的でテキストなし/文字化け	ページをOCR処理
`image_text`	ネイティブテキストに加えて、テキストを含む画像領域もある	ハイブリッド：ネイティブ + 領域OCR
`mixed`	ページ内が異質（テキスト + 画像テーブル/図）	領域ごとに自動ルーティング
`empty`	空白 / ほぼ空 — エラーではない	スキップ

ドキュメントサマリー（`summary`）

mostly_text、mostly_scanned、mixed、または empty。

理由コード（`reason`）

理由は凍結済みの追記専用 snake_case トークンです。よく見られる値：ok、native_text_high_confidence、no_text_layer_present、text_layer_below_threshold、glyph_mapping_missing、encrypted_no_extract_permission、image_table_reconstructed、image_table_no_structure。

分類結果に基づいて抽出をルーティングするには？

低コストプリフライトの目的は、OCRが不要なページにOCRコストをかけないことです。まず分類し、次にOCRが必要なページだけを重いパスで処理します：

Rust

use pdf_oxide::PdfDocument;
use pdf_oxide::extractors::auto::PageKind;

/// Classifies every page of `report.pdf`, extracts native text from the
/// text-layer pages, and flags the remaining non-empty pages for OCR.
fn main() -> pdf_oxide::Result<()> {
    let doc = PdfDocument::open("report.pdf")?;
    let classification = doc.classify_document()?;

    for (page, kind) in classification.pages.iter().enumerate() {
        match kind {
            // Nothing to extract from an empty page.
            PageKind::Empty => {}
            PageKind::Scanned | PageKind::ImageText | PageKind::Mixed => {
                println!("=== page {page} needs OCR ===");
                // hand the page to an OCR / auto-extract pipeline here
            }
            PageKind::TextLayer => {
                // Native text layer: extract directly, no OCR involved.
                let text = doc.extract_text(page)?;
                println!("=== page {page} (native) ===\n{text}");
            }
        }
    }
    Ok(())
}

Java

import fyi.oxide.pdf.PdfDocument;
import fyi.oxide.pdf.AutoExtractor;
import fyi.oxide.pdf.auto.PageClass;
import java.util.List;

// try-with-resources closes the document when the block exits.
try (PdfDocument pdf = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    List<PageClass> kinds = AutoExtractor.of(pdf).classifyDocumentKinds();

    // Walk the kinds in order, counting pages from 0.
    int page = 0;
    for (PageClass kind : kinds) {
        switch (kind) {
            case EMPTY -> { /* nothing to extract */ }
            case SCANNED, IMAGE_TEXT, MIXED ->
                System.out.println("=== page " + page + " needs OCR ===");
            case TEXT_LAYER -> {
                // Native text layer: extract directly, no OCR involved.
                String text = pdf.extractText(page);
                System.out.println("=== page " + page + " (native) ===\n" + text);
            }
        }
        page++;
    }
}

Ruby

require 'pdf_oxide'

pdf = PdfOxide::PdfDocument.open('report.pdf')
dc  = PdfOxide::AutoExtractor.new(pdf).classify_document

# Kinds that are sent down the OCR path.
ocr_kinds = %w[scanned image_text mixed]

dc['pages'].each.with_index do |kind, page|
  if kind == 'text_layer'
    # Native text layer: extract directly, no OCR involved.
    text = pdf.extract_text(page)
    puts "=== page #{page} (native) ===\n#{text}"
  elsif ocr_kinds.include?(kind)
    puts "=== page #{page} needs OCR ==="
  end
  # 'empty' (and anything else) is skipped.
end

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <nlohmann/json.hpp>   // any JSON lib
#include <iostream>

// Classifies every page of report.pdf, prints native text for text-layer
// pages and flags scanned / image_text / mixed pages for OCR.
int main() {
    auto pdf = pdf_oxide::Document::open("report.pdf");
    auto classification = nlohmann::json::parse(pdf.classify_document());

    int page = 0;
    for (const auto& kind : classification["pages"]) {
        const bool has_native_text = (kind == "text_layer");
        const bool needs_ocr =
            kind == "scanned" || kind == "image_text" || kind == "mixed";

        if (has_native_text) {
            // Native text layer: extract directly, no OCR involved.
            std::cout << "=== page " << page << " (native) ===\n"
                      << pdf.extract_text(page) << '\n';
        } else if (needs_ocr) {
            std::cout << "=== page " << page << " needs OCR ===\n";
        }
        // Any other kind (e.g. "empty") is skipped.
        ++page;
    }
    return 0;
}

PHP

<?php
use PdfOxide\PdfDocument;
use PdfOxide\AutoExtractor;

$pdf   = PdfDocument::open('report.pdf');
$kinds = AutoExtractor::of($pdf)->classifyDocumentKinds();

// Kinds that are sent down the OCR path.
$ocrKinds = ['scanned', 'image_text', 'mixed'];

foreach ($kinds as $page => $kind) {
    if ($kind === 'text_layer') {
        // Native text layer: extract directly, no OCR involved.
        echo "=== page {$page} (native) ===\n" . $pdf->extractText($page) . "\n";
    } elseif (in_array($kind, $ocrKinds, true)) {
        echo "=== page {$page} needs OCR ===\n";
    }
    // Any other kind (e.g. 'empty') is skipped.
}

Dart

import 'dart:convert';
import 'package:pdf_oxide/pdf_oxide.dart';

/// Classifies every page of `report.pdf`, prints native text for text-layer
/// pages and flags scanned / image_text / mixed pages for OCR.
void main() {
  final doc = PdfDocument.open('report.pdf');
  final decoded = jsonDecode(doc.classifyDocument()) as Map<String, dynamic>;
  final List<String> pages = (decoded['pages'] as List).cast<String>();

  for (var page = 0; page < pages.length; page++) {
    switch (pages[page]) {
      case 'text_layer':
        // Native text layer: extract directly, no OCR involved.
        print('=== page $page (native) ===\n${doc.extractText(page)}');
        break;
      case 'scanned':
      case 'image_text':
      case 'mixed':
        print('=== page $page needs OCR ===');
        break;
      default:
        // Any other kind (e.g. 'empty') is skipped.
        break;
    }
  }
}

R

library(pdfoxide)
library(jsonlite)

document       <- pdf_open("report.pdf")
classification <- fromJSON(pdf_classify_document(document))

# Kinds that are sent down the OCR path.
ocr_kinds <- c("scanned", "image_text", "mixed")

for (page in seq_along(classification$pages)) {
  # R vectors are 1-based; the library is called with 0-based page indices.
  idx  <- page - 1L
  kind <- classification$pages[[page]]

  if (kind == "text_layer") {
    # Native text layer: extract directly, no OCR involved.
    text <- pdf_extract_text(document, idx)
    cat(sprintf("=== page %d (native) ===\n%s\n", idx, text))
  } else if (kind %in% ocr_kinds) {
    cat(sprintf("=== page %d needs OCR ===\n", idx))
  }
}

Julia

using PdfOxide
using JSON

pdf            = open_document("report.pdf")
classification = JSON.parse(classify_document(pdf))

# Kinds that are sent down the OCR path.
ocr_kinds = ("scanned", "image_text", "mixed")

# Pair each kind with a 0-based page index (Julia arrays are 1-based).
for (idx, kind) in zip(Iterators.countfrom(0), classification["pages"])
    if kind == "text_layer"
        # Native text layer: extract directly, no OCR involved.
        println("=== page $idx (native) ===\n", extract_text(pdf, idx))
    elseif kind in ocr_kinds
        println("=== page $idx needs OCR ===")
    end
end

Zig

const std = @import("std");
const pdf = @import("pdf_oxide");

/// Classifies every page of `report.pdf`, prints native text for text-layer
/// pages and flags scanned / image_text / mixed pages for OCR.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Tear the allocator down on exit; in safe builds deinit() also reports
    // allocations that were never freed. Registered first so it runs after
    // the other defers below.
    defer _ = gpa.deinit();
    const alloc = gpa.allocator();

    var doc = try pdf.Document.open("report.pdf");
    defer doc.deinit();

    // Caller-owned JSON bytes describing the whole document.
    const dc = try doc.classifyDocument(alloc);
    defer alloc.free(dc);

    const parsed = try std.json.parseFromSlice(std.json.Value, alloc, dc, .{});
    defer parsed.deinit();

    // `.?` asserts that the "pages" key is present in the payload.
    const pages = parsed.value.object.get("pages").?.array;
    for (pages.items, 0..) |kind_val, page| {
        const kind = kind_val.string;
        const idx: i32 = @intCast(page);
        if (std.mem.eql(u8, kind, "text_layer")) {
            // Fast, free native path — no OCR cost.
            const text = try doc.extractText(alloc, idx);
            defer alloc.free(text);
            std.debug.print("=== page {d} (native) ===\n{s}\n", .{ idx, text });
        } else if (std.mem.eql(u8, kind, "scanned") or
            std.mem.eql(u8, kind, "image_text") or
            std.mem.eql(u8, kind, "mixed"))
        {
            std.debug.print("=== page {d} needs OCR ===\n", .{idx});
        }
    }
}

Objective-C

#import <POXPdfOxide.h>

NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];

// Messaging a nil `doc` returns nil, so a failed open surfaces as a nil `json`.
// NOTE(review): assumes the binding follows the Cocoa convention of
// returning nil and filling `err` on failure — confirm against the binding.
NSString *json = [doc classifyDocumentWithError:&err];
NSData *jsonData = [json dataUsingEncoding:NSUTF8StringEncoding];

// +JSONObjectWithData:options:error: raises NSInvalidArgumentException when
// handed nil data, so only parse when there is a payload.
NSDictionary *dc = jsonData
    ? [NSJSONSerialization JSONObjectWithData:jsonData options:0 error:&err]
    : nil;
if (dc == nil) {
    NSLog(@"classification failed: %@", err);
}

// A nil `dc` yields a nil `pages`, which makes the enumeration below a no-op.
NSArray<NSString *> *pages = dc[@"pages"];
[pages enumerateObjectsUsingBlock:^(NSString *kind, NSUInteger page, BOOL *stop) {
    if ([kind isEqualToString:@"text_layer"]) {
        // Fast, free native path — no OCR cost.
        NSError *extractErr = nil;
        NSString *text = [doc extractText:(NSInteger)page error:&extractErr];
        if (text == nil) {
            // Report the failure instead of logging "(null)" as page text.
            NSLog(@"=== page %lu extraction failed: %@ ===", (unsigned long)page, extractErr);
            return;
        }
        NSLog(@"=== page %lu (native) ===\n%@", (unsigned long)page, text);
    } else if ([kind isEqualToString:@"scanned"] ||
               [kind isEqualToString:@"image_text"] ||
               [kind isEqualToString:@"mixed"]) {
        NSLog(@"=== page %lu needs OCR ===", (unsigned long)page);
    }
}];

Elixir

# Pattern-match on {:ok, _} so a failed open raises a MatchError right here.
{:ok, doc} = PdfOxide.open("report.pdf")

classification = Jason.decode!(PdfOxide.classify_document(doc))

# Kinds that are sent down the OCR path.
ocr_kinds = ["scanned", "image_text", "mixed"]

Enum.each(Enum.with_index(classification["pages"]), fn {kind, page} ->
  cond do
    kind == "text_layer" ->
      # Native text layer: extract directly, no OCR involved.
      IO.puts("=== page #{page} (native) ===\n#{PdfOxide.extract_text(doc, page)}")

    kind in ocr_kinds ->
      IO.puts("=== page #{page} needs OCR ===")

    true ->
      # Any other kind (e.g. "empty") is skipped.
      :ok
  end
end)

ネイティブ分類プリフライトは抽出処理と比べてほぼコストゼロです — ラスタライズもOCR実行も行わないため、コーパス全体に対して実行してどのページがOCR予算に値するかを判断できます。PDF Oxideのネイティブテキスト抽出自体は公開ベンチマークで**平均0.8ms / 合格率100%**で動作するため、分類してから抽出するパスは高速な一般ケースを高速なまま維持します。

暗号化されたPDFについての注意

classify_page と classify_document は、認証されていない暗号化ドキュメントに対してフェイルクローズします — empty を無言で返すのではなく EncryptedPdf エラーを返します。分類前に認証してください（PDFの暗号化と復号を参照）。一方、セキュリティに関係しないページ単位の失敗は、empty へとグレースフルにデグレードされます。

よくある質問

分類処理でOCRは実行されますか？ いいえ。classify_page / classify_document はPDF内部の純粋な検査です — OCRもラスタライズも行いません。これによりコーパス全体のプリフライトとして実行できるほど低コストになっています。

PythonやNodeで分類機能は使えますか？ v0.3.69では対応していません。メソッドはRust、Go、C#、Swift、WASM/JavaScriptをはじめ、このページにサンプルを掲載している各バインディングで公開されています。Python/Nodeからは自動抽出を使うか、Rustコア / CLIを経由してください。

text_layer vs scanned の判定精度はどのくらいですか？ 分類器は複数のシグナル（グリフ数、画像面積、ラスターコーデック、不可視テキスト比率、文字化け/断片化比率）を組み合わせ、強化されたテキスト品質ゲートを適用します。そのため、使えないborn-digitalテキストレイヤー（カラム順序の崩れ、(cid:NN)ゴミ文字、グリフ単位の断片化）は scanned に降格され、型付きの理由コードが付与されます。

Go / C# / SwiftでなぜJSONで結果が返るのですか？ これらのバインディングはC ABIをまたぐため、分類結果はmallocされたJSON文字列として返されます。標準JSONライブラリでデシリアライズしてください — フィールド名と列挙型トークンはリリースをまたいで凍結・安定しています。

PDFページの分類 — テキスト vs スキャン

単一のPDFページを分類するには？

PDFドキュメント全体を一度に分類するには？

分類結果のJSONはどのような形式か？

ページ種別（kind）

ドキュメントサマリー（summary）

理由コード（reason）

分類結果に基づいて抽出をルーティングするには？

暗号化されたPDFについての注意

よくある質問

関連ページ

ページ種別（`kind`）

ドキュメントサマリー（`summary`）

理由コード（`reason`）