What is the fastest Python PDF library?

PDF Oxide is the fastest Python PDF library, with 0.8ms mean text extraction time — 5.8× faster than PyMuPDF (4.6ms) and 15× faster than pypdf (12.1ms). Benchmarked on 3,830 real-world PDFs with 100% pass rate.

Is PDF Oxide free for commercial use?

Yes. PDF Oxide is MIT licensed — free for all uses including commercial products, SaaS, and proprietary software. No license fees, no sales calls, no AGPL restrictions.

Can PDF Oxide handle scanned PDFs with OCR?

Yes. PDF Oxide includes built-in OCR via PaddleOCR and ONNX Runtime. No Tesseract installation needed — just pip install pdf_oxide and use extract_text_ocr(). Supports PP-OCRv3, v4, and v5 models.

Does PDF Oxide support XFA forms?

Yes. PDF Oxide is the only Python PDF library that can detect, analyze, and extract data from XFA forms (XML Forms Architecture). PyMuPDF, pypdf, pdfplumber, and pdfminer cannot read XFA form data.

How does PDF Oxide compare to PyMuPDF?

PDF Oxide is 5.8× faster than PyMuPDF (0.8ms vs 4.6ms mean), has a 100% pass rate vs 99.3%, and is MIT licensed vs PyMuPDF's AGPL-3.0. PDF Oxide also has built-in Markdown/HTML output and XFA form support that PyMuPDF lacks.

Can PDF Oxide convert PDF to Markdown?

Yes. PDF Oxide has built-in PDF to Markdown conversion with heading detection, table preservation, and list formatting — ideal for LLM and RAG pipelines. No separate package needed, unlike PyMuPDF which requires pymupdf4llm (69× slower).

PDF 페이지 분류 — 텍스트 vs 스캔

텍스트를 추출하기 전에 보통 어떤 종류의 페이지인지를 먼저 파악해야 합니다. 사용 가능한 네이티브 텍스트 레이어가 있는지, 아니면 OCR이 필요한 스캔 이미지인지 확인해야 하죠. PDF Oxide는 이를 저비용 프리플라이트로 해결합니다 — classify_page와 classify_document는 PDF 내부 구조(글리프 수, 이미지 면적, 코덱, 보이지 않는 텍스트 비율, 깨진 글리프 비율)를 OCR 없이, 페이지 래스터화 없이 검사합니다.

분류 결과는 설명 가능합니다: 모든 판정에는 신뢰도 점수, 타입이 지정된 reason 코드, 그리고 결정을 이끈 원시 signals가 포함됩니다. 이를 통해 페이지를 적절한 추출기(네이티브 텍스트 vs OCR)로 라우팅하고 이유를 로그에 남길 수 있습니다.

바인딩 지원 범위. 분류 기능은 Rust, Go, C#, Swift, WASM/JavaScript에서 사용할 수 있으며, 이 페이지에는 Java, PHP, Ruby, C++, Dart, R, Julia, Zig, Objective-C, Elixir 예제도 함께 실려 있습니다. Python과 Node N-API 바인딩은 v0.3.69에서 classify_page / classify_document를 제공하지 않습니다 — 해당 런타임에서는 자동 추출 경로를 사용하거나 Rust 코어 / CLI를 통해 브리지하세요.

단일 PDF 페이지를 분류하려면?

classify_page는 0 기반의 페이지 인덱스를 받아 PageClassification을 반환합니다. C-ABI 바인딩(Go, C#, Swift, WASM)에서는 역직렬화가 필요한 JSON 문자열로 반환됩니다.

Rust

use pdf_oxide::PdfDocument;

/// Classifies the first page of `mixed.pdf` and prints the verdict, the reason
/// code, and three of the raw signals carried by the classification.
fn main() -> pdf_oxide::Result<()> {
    let document = PdfDocument::open("mixed.pdf")?;

    // Signature, for reference:
    //   PdfDocument::classify_page(&self, page: usize)
    //     -> Result<pdf_oxide::extractors::auto::PageClassification>
    let verdict = document.classify_page(0)?;

    println!(
        "page {} is {:?} (confidence {:.2})",
        verdict.page, verdict.kind, verdict.confidence
    );
    println!("reason: {:?}", verdict.reason);

    // Borrow the signal bundle once instead of spelling out the full path
    // for every field.
    let signals = &verdict.signals;
    println!(
        "glyphs={} image_area={:.2} garbled={:.2}",
        signals.text_glyph_count, signals.image_area_ratio, signals.garbled_ratio
    );
    Ok(())
}

Go

package main

import (
	"encoding/json"
	"fmt"
	"log"

	pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

// main classifies page 0 of mixed.pdf, decodes the JSON payload returned by
// the binding, and prints the page, kind, confidence and reason.
func main() {
	document, err := pdfoxide.Open("mixed.pdf")
	if err != nil {
		log.Fatal(err)
	}
	defer document.Close()

	// func (doc *PdfDocument) ClassifyPage(pageIndex int) (string, error)
	payload, err := document.ClassifyPage(0)
	if err != nil {
		log.Fatal(err)
	}

	// Only the fields printed below are decoded; any other keys in the
	// payload are ignored by encoding/json.
	type pageClassification struct {
		Page       int     `json:"page"`
		Kind       string  `json:"kind"`
		Confidence float64 `json:"confidence"`
		Reason     string  `json:"reason"`
	}
	var verdict pageClassification
	if decodeErr := json.Unmarshal([]byte(payload), &verdict); decodeErr != nil {
		log.Fatal(decodeErr)
	}
	fmt.Printf("page %d is %s (%.2f): %s\n", verdict.Page, verdict.Kind, verdict.Confidence, verdict.Reason)
}

C#

using System;
using System.Text.Json;
using PdfOxide.Core;

using var doc = PdfDocument.Open("mixed.pdf");

// string PdfDocument.ClassifyPage(int pageIndex)
string raw = doc.ClassifyPage(0);

using var json = JsonDocument.Parse(raw);
var root = json.RootElement;
Console.WriteLine(
    $"page {root.GetProperty("page").GetInt32()} is " +
    $"{root.GetProperty("kind").GetString()} " +
    $"({root.GetProperty("confidence").GetDouble():F2}): " +
    $"{root.GetProperty("reason").GetString()}");

Swift

import PdfOxide

let doc = try PdfDocument(path: "mixed.pdf")

// func classifyPage(_ pageIndex: Int) throws -> String   (JSON)
let json = try doc.classifyPage(0)
print(json)

JavaScript (WASM)

import init, { WasmPdfDocument } from "pdf-oxide-wasm";

await init();
const bytes = new Uint8Array(await (await fetch("mixed.pdf")).arrayBuffer());
const doc = WasmPdfDocument.fromBytes(bytes);

// WasmPdfDocument.classifyPage(pageIndex) -> JSON string
const pc = JSON.parse(doc.classifyPage(0));
console.log(`page ${pc.page} is ${pc.kind} (${pc.confidence}): ${pc.reason}`);

Java

import fyi.oxide.pdf.PdfDocument;
import fyi.oxide.pdf.AutoExtractor;
import fyi.oxide.pdf.auto.PageClass;

try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("mixed.pdf"))) {
    AutoExtractor auto = AutoExtractor.of(doc);

    // PageClass AutoExtractor.classifyPageKind(int pageIndex)
    PageClass kind = auto.classifyPageKind(0);
    System.out.println("page 0 is " + kind);   // TEXT_LAYER / SCANNED / IMAGE_TEXT / MIXED / EMPTY
}

PHP

<?php
use PdfOxide\PdfDocument;
use PdfOxide\AutoExtractor;

$doc = PdfDocument::open('mixed.pdf');
$auto = AutoExtractor::of($doc);

// string AutoExtractor::classifyPageKind(int $pageIndex)
$kind = $auto->classifyPageKind(0);
echo "page 0 is {$kind}\n";   // text_layer / scanned / image_text / mixed / empty

Ruby

require 'pdf_oxide'

doc  = PdfOxide::PdfDocument.open('mixed.pdf')
auto = PdfOxide::AutoExtractor.new(doc)

# AutoExtractor#classify_page(page_index)
#   => { reason:, kind:, confidence:, classification: }
pc = auto.classify_page(0)
puts "page 0 is #{pc[:kind]} (#{pc[:confidence]}): #{pc[:reason]}"

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <iostream>

// Classifies page 0 of mixed.pdf and writes the raw JSON result to stdout.
int main() {
    auto document = pdf_oxide::Document::open("mixed.pdf");

    // std::string Document::classify_page(int page_index) — JSON
    const std::string classification_json = document.classify_page(0);
    std::cout << classification_json << '\n';
    return 0;
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

/// Classifies page 0 of `mixed.pdf` and prints the JSON result as returned
/// by the binding.
void main() {
  final document = PdfDocument.open('mixed.pdf');

  // String PdfDocument.classifyPage(int page) — JSON
  final classificationJson = document.classifyPage(0);
  print(classificationJson);
}

R

library(pdfoxide)

doc <- pdf_open("mixed.pdf")

# pdf_classify_page(doc, page) — JSON PageClassification
json <- pdf_classify_page(doc, 0)
cat(json, "\n")

Julia

using PdfOxide

doc = open_document("mixed.pdf")

# classify_page(doc, page) -> JSON string
json = classify_page(doc, 0)
println(json)

Zig

const std = @import("std");
const pdf = @import("pdf_oxide");

/// Classifies page 0 of `mixed.pdf` and prints the JSON result.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Deinitialise the allocator on exit. Defers run in reverse order, so this
    // happens after `json` is freed and `doc` is closed, which lets the
    // allocator's leak check see the final state.
    defer _ = gpa.deinit();
    const alloc = gpa.allocator();

    var doc = try pdf.Document.open("mixed.pdf");
    defer doc.deinit();

    // classifyPage(alloc, page_index) -> caller-owned JSON bytes
    const json = try doc.classifyPage(alloc, 0);
    // The returned slice is owned by the caller, so release it with the same allocator.
    defer alloc.free(json);
    std.debug.print("{s}\n", .{json});
}

Objective-C

#import <POXPdfOxide.h>

NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"mixed.pdf" error:&err];

// -classifyPage:error: -> JSON NSString
NSString *json = [doc classifyPage:0 error:&err];
NSLog(@"%@", json);

Elixir

{:ok, doc} = PdfOxide.open("mixed.pdf")

# PdfOxide.classify_page(doc, page) -> JSON string
json = PdfOxide.classify_page(doc, 0)
IO.puts(json)

PDF 문서 전체를 한 번에 분류하려면?

classify_document는 모든 페이지에 동일한 저비용 프리플라이트를 실행하여 결과를 집계합니다. 페이지별 kind 목록, 0 기반의 pages_needing_ocr 인덱스, 그리고 전체 summary를 반환합니다. 판정은 페이지 단위로 이루어집니다 — PDF Oxide는 혼합 PDF에 단일 문서 모드를 강제하지 않습니다.

Rust

use pdf_oxide::PdfDocument;

/// Classifies every page of `report.pdf` and prints the document summary, the
/// indices of pages needing OCR, and the per-page kinds.
fn main() -> pdf_oxide::Result<()> {
    let document = PdfDocument::open("report.pdf")?;

    // Signature, for reference:
    //   PdfDocument::classify_document(&self)
    //     -> Result<pdf_oxide::extractors::auto::DocumentClassification>
    let report = document.classify_document()?;

    println!("summary: {:?}", report.summary);
    println!("pages needing OCR: {:?}", report.pages_needing_ocr);

    // One line per page, labelled with its position in `pages`.
    report
        .pages
        .iter()
        .enumerate()
        .for_each(|(i, kind)| println!("  page {i}: {kind:?}"));
    Ok(())
}

Go

package main

import (
	"encoding/json"
	"fmt"
	"log"

	pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

// main classifies every page of report.pdf, decodes the JSON payload, and
// prints the document summary together with the pages needing OCR.
func main() {
	document, err := pdfoxide.Open("report.pdf")
	if err != nil {
		log.Fatal(err)
	}
	defer document.Close()

	// func (doc *PdfDocument) ClassifyDocument() (string, error)
	payload, err := document.ClassifyDocument()
	if err != nil {
		log.Fatal(err)
	}

	// Mirrors the three top-level keys of the document classification JSON.
	type documentClassification struct {
		Pages           []string `json:"pages"`
		PagesNeedingOCR []int    `json:"pages_needing_ocr"`
		Summary         string   `json:"summary"`
	}
	var report documentClassification
	if decodeErr := json.Unmarshal([]byte(payload), &report); decodeErr != nil {
		log.Fatal(decodeErr)
	}
	fmt.Printf("summary=%s ocr_pages=%v\n", report.Summary, report.PagesNeedingOCR)
}

C#

using System;
using PdfOxide.Core;

using var doc = PdfDocument.Open("report.pdf");

// string PdfDocument.ClassifyDocument()
string raw = doc.ClassifyDocument();
Console.WriteLine(raw);

Swift

import PdfOxide

let doc = try PdfDocument(path: "report.pdf")

// func classifyDocument() throws -> String   (JSON)
let json = try doc.classifyDocument()
print(json)

JavaScript (WASM)

import init, { WasmPdfDocument } from "pdf-oxide-wasm";

await init();
const bytes = new Uint8Array(await (await fetch("report.pdf")).arrayBuffer());
const doc = WasmPdfDocument.fromBytes(bytes);

// WasmPdfDocument.classifyDocument() -> JSON string
const dc = JSON.parse(doc.classifyDocument());
console.log("pages needing OCR:", dc.pages_needing_ocr);

Java

import fyi.oxide.pdf.PdfDocument;
import fyi.oxide.pdf.AutoExtractor;
import fyi.oxide.pdf.auto.PageClass;
import java.util.List;

try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    AutoExtractor auto = AutoExtractor.of(doc);

    // List<PageClass> AutoExtractor.classifyDocumentKinds()
    List<PageClass> kinds = auto.classifyDocumentKinds();
    for (int i = 0; i < kinds.size(); i++) {
        System.out.println("page " + i + ": " + kinds.get(i));
    }
}

PHP

<?php
use PdfOxide\PdfDocument;
use PdfOxide\AutoExtractor;

$doc = PdfDocument::open('report.pdf');
$auto = AutoExtractor::of($doc);

// array<int,string> AutoExtractor::classifyDocumentKinds()
$kinds = $auto->classifyDocumentKinds();
foreach ($kinds as $i => $kind) {
    echo "page {$i}: {$kind}\n";
}

Ruby

require 'pdf_oxide'

doc  = PdfOxide::PdfDocument.open('report.pdf')
auto = PdfOxide::AutoExtractor.new(doc)

# AutoExtractor#classify_document => decoded JSON envelope
dc = auto.classify_document
puts "pages needing OCR: #{dc['pages_needing_ocr']}"

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <iostream>

// Classifies every page of report.pdf and writes the raw JSON result to stdout.
int main() {
    auto document = pdf_oxide::Document::open("report.pdf");

    // std::string Document::classify_document() — JSON
    const std::string classification_json = document.classify_document();
    std::cout << classification_json << '\n';
    return 0;
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

/// Classifies every page of `report.pdf` and prints the JSON result as
/// returned by the binding.
void main() {
  final document = PdfDocument.open('report.pdf');

  // String PdfDocument.classifyDocument() — JSON
  final classificationJson = document.classifyDocument();
  print(classificationJson);
}

R

library(pdfoxide)

doc <- pdf_open("report.pdf")

# pdf_classify_document(doc) — JSON DocumentClassification
json <- pdf_classify_document(doc)
cat(json, "\n")

Julia

using PdfOxide

doc = open_document("report.pdf")

# classify_document(doc) -> JSON string
json = classify_document(doc)
println(json)

Zig

const std = @import("std");
const pdf = @import("pdf_oxide");

/// Classifies every page of `report.pdf` and prints the JSON result.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Deinitialise the allocator on exit. Defers run in reverse order, so this
    // happens after `json` is freed and `doc` is closed, which lets the
    // allocator's leak check see the final state.
    defer _ = gpa.deinit();
    const alloc = gpa.allocator();

    var doc = try pdf.Document.open("report.pdf");
    defer doc.deinit();

    // classifyDocument(alloc) -> caller-owned JSON bytes
    const json = try doc.classifyDocument(alloc);
    // The returned slice is owned by the caller, so release it with the same allocator.
    defer alloc.free(json);
    std.debug.print("{s}\n", .{json});
}

Objective-C

#import <POXPdfOxide.h>

NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];

// -classifyDocumentWithError: -> JSON NSString
NSString *json = [doc classifyDocumentWithError:&err];
NSLog(@"%@", json);

Elixir

{:ok, doc} = PdfOxide.open("report.pdf")

# PdfOxide.classify_document(doc) -> JSON string
json = PdfOxide.classify_document(doc)
IO.puts(json)

분류 JSON은 어떻게 생겼나요?

classify_page는 PageClassification을 반환합니다:

{
  "page": 0,
  "kind": "text_layer",
  "confidence": 0.97,
  "reason": "native_text_high_confidence",
  "signals": {
    "text_glyph_count": 1840,
    "text_area_ratio": 0.62,
    "image_area_ratio": 0.0,
    "codec": "none",
    "invisible_text_ratio": 0.0,
    "garbled_ratio": 0.0,
    "fragmented_word_ratio": 0.01,
    "consecutive_repeat_ratio": 0.0,
    "vector_path_density": 0.04,
    "has_reliable_structure": true,
    "producer_prior": "authoring",
    "page_is_empty": false
  }
}

classify_document는 DocumentClassification을 반환합니다:

{
  "pages": ["text_layer", "scanned", "image_text"],
  "pages_needing_ocr": [1],
  "summary": "mixed"
}

페이지 종류 (`kind`)

종류	의미	권장 경로
`text_layer`	사용 가능한 네이티브 텍스트가 지배적	텍스트 레이어 추출
`scanned`	이미지 위주, 텍스트 없음 또는 깨진 텍스트	페이지 OCR 처리
`image_text`	네이티브 텍스트 및 텍스트를 포함한 이미지 영역	하이브리드: 네이티브 + 영역 OCR
`mixed`	페이지 내 이질적 구성 (텍스트 + 이미지 표/그림)	영역별 자동 라우팅
`empty`	빈 페이지 / 거의 빈 페이지 — 오류 아님	건너뜀

문서 요약 (`summary`)

mostly_text, mostly_scanned, mixed 또는 empty.

이유 코드 (`reason`)

이유는 고정된, 추가 전용 snake_case 토큰입니다. 자주 나오는 값: ok, native_text_high_confidence, no_text_layer_present, text_layer_below_threshold, glyph_mapping_missing, encrypted_no_extract_permission, image_table_reconstructed, image_table_no_structure.

분류 결과에 따라 추출을 라우팅하려면?

저비용 프리플라이트의 핵심은 OCR이 필요 없는 페이지에 OCR 비용을 치르지 않는 것입니다. 먼저 분류하고, OCR이 필요한 페이지만 무거운 경로로 처리하세요:

Rust

use pdf_oxide::PdfDocument;
use pdf_oxide::extractors::auto::PageKind;

/// Classifies `report.pdf` once, then routes each page by its kind: native
/// text extraction for text-layer pages, an OCR marker for pages that need
/// the heavier path, and nothing for empty pages.
fn main() -> pdf_oxide::Result<()> {
    let document = PdfDocument::open("report.pdf")?;
    let report = document.classify_document()?;

    for (page, kind) in report.pages.iter().enumerate() {
        match kind {
            // Nothing to extract from an empty page.
            PageKind::Empty => continue,
            PageKind::Scanned | PageKind::ImageText | PageKind::Mixed => {
                println!("=== page {page} needs OCR ===");
                // Hand the page to your OCR / auto-extract pipeline here.
            }
            PageKind::TextLayer => {
                // Native text path: no OCR is run for this page.
                let text = document.extract_text(page)?;
                println!("=== page {page} (native) ===\n{text}");
            }
        }
    }
    Ok(())
}

Java

import fyi.oxide.pdf.PdfDocument;
import fyi.oxide.pdf.AutoExtractor;
import fyi.oxide.pdf.auto.PageClass;
import java.util.List;

try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    AutoExtractor auto = AutoExtractor.of(doc);
    List<PageClass> kinds = auto.classifyDocumentKinds();

    for (int page = 0; page < kinds.size(); page++) {
        switch (kinds.get(page)) {
            case TEXT_LAYER -> {
                // Fast, free native path — no OCR cost.
                String text = doc.extractText(page);
                System.out.println("=== page " + page + " (native) ===\n" + text);
            }
            case SCANNED, IMAGE_TEXT, MIXED ->
                System.out.println("=== page " + page + " needs OCR ===");
            case EMPTY -> { /* skip */ }
        }
    }
}

Ruby

require 'pdf_oxide'

doc  = PdfOxide::PdfDocument.open('report.pdf')
auto = PdfOxide::AutoExtractor.new(doc)
dc   = auto.classify_document

dc['pages'].each_with_index do |kind, page|
  case kind
  when 'text_layer'
    # Fast, free native path — no OCR cost.
    text = doc.extract_text(page)
    puts "=== page #{page} (native) ===\n#{text}"
  when 'scanned', 'image_text', 'mixed'
    puts "=== page #{page} needs OCR ==="
  when 'empty'
    # skip
  end
end

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <nlohmann/json.hpp>   // any JSON lib
#include <iostream>

// Classifies report.pdf once, then routes each page by its kind: text-layer
// pages are extracted natively, OCR-bound pages are only reported, and any
// other kind (e.g. "empty") is skipped.
int main() {
    auto doc = pdf_oxide::Document::open("report.pdf");
    auto classification = nlohmann::json::parse(doc.classify_document());

    int page = 0;
    for (const auto& kind : classification["pages"]) {
        const bool is_native = (kind == "text_layer");
        const bool needs_ocr =
            !is_native && (kind == "scanned" || kind == "image_text" || kind == "mixed");

        if (is_native) {
            // Native text path: no OCR is run for this page.
            std::cout << "=== page " << page << " (native) ===\n"
                      << doc.extract_text(page) << '\n';
        } else if (needs_ocr) {
            std::cout << "=== page " << page << " needs OCR ===\n";
        }
        ++page;
    }
    return 0;
}

PHP

<?php
use PdfOxide\PdfDocument;
use PdfOxide\AutoExtractor;

$doc  = PdfDocument::open('report.pdf');
$auto = AutoExtractor::of($doc);

foreach ($auto->classifyDocumentKinds() as $page => $kind) {
    if ($kind === 'text_layer') {
        // Fast, free native path — no OCR cost.
        echo "=== page {$page} (native) ===\n" . $doc->extractText($page) . "\n";
    } elseif (in_array($kind, ['scanned', 'image_text', 'mixed'], true)) {
        echo "=== page {$page} needs OCR ===\n";
    }
}

Dart

import 'dart:convert';
import 'package:pdf_oxide/pdf_oxide.dart';

/// Classifies `report.pdf` once, then routes each page by its kind:
/// text-layer pages are extracted natively, OCR-bound pages are only
/// reported, and any other kind (e.g. `empty`) is skipped.
void main() {
  final doc = PdfDocument.open('report.pdf');
  final decoded = jsonDecode(doc.classifyDocument()) as Map<String, dynamic>;
  final kinds = (decoded['pages'] as List).cast<String>();

  for (var page = 0; page < kinds.length; page++) {
    switch (kinds[page]) {
      case 'text_layer':
        // Native text path: no OCR is run for this page.
        print('=== page $page (native) ===\n${doc.extractText(page)}');
        break;
      case 'scanned':
      case 'image_text':
      case 'mixed':
        print('=== page $page needs OCR ===');
        break;
      default:
        // Nothing to do for the remaining kinds.
        break;
    }
  }
}

R

library(pdfoxide)
library(jsonlite)

doc <- pdf_open("report.pdf")
dc  <- fromJSON(pdf_classify_document(doc))

for (page in seq_along(dc$pages)) {
  kind <- dc$pages[[page]]
  idx  <- page - 1L   # 0-based page index
  if (kind == "text_layer") {
    # Fast, free native path — no OCR cost.
    cat(sprintf("=== page %d (native) ===\n%s\n", idx, pdf_extract_text(doc, idx)))
  } else if (kind %in% c("scanned", "image_text", "mixed")) {
    cat(sprintf("=== page %d needs OCR ===\n", idx))
  }
}

Julia

using PdfOxide
using JSON

doc = open_document("report.pdf")
dc  = JSON.parse(classify_document(doc))

for (page, kind) in enumerate(dc["pages"])
    idx = page - 1   # 0-based page index
    if kind == "text_layer"
        # Fast, free native path — no OCR cost.
        println("=== page $idx (native) ===\n", extract_text(doc, idx))
    elseif kind in ("scanned", "image_text", "mixed")
        println("=== page $idx needs OCR ===")
    end
end

Zig

const std = @import("std");
const pdf = @import("pdf_oxide");

/// Classifies `report.pdf` once, then routes each page by its kind:
/// text-layer pages are extracted natively, OCR-bound pages are only
/// reported, and any other kind (e.g. "empty") is skipped.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Deinitialise the allocator on exit. Defers run in reverse order, so this
    // happens after every buffer below is freed and `doc` is closed, which
    // lets the allocator's leak check see the final state.
    defer _ = gpa.deinit();
    const alloc = gpa.allocator();

    var doc = try pdf.Document.open("report.pdf");
    defer doc.deinit();

    // Caller-owned JSON bytes; released with the same allocator.
    const dc = try doc.classifyDocument(alloc);
    defer alloc.free(dc);

    const parsed = try std.json.parseFromSlice(std.json.Value, alloc, dc, .{});
    defer parsed.deinit();

    const pages = parsed.value.object.get("pages").?.array;
    for (pages.items, 0..) |kind_val, page| {
        const kind = kind_val.string;
        // The loop counter is a usize; extractText is called with an i32 index.
        const idx: i32 = @intCast(page);
        if (std.mem.eql(u8, kind, "text_layer")) {
            // Native text path: no OCR is run for this page.
            const text = try doc.extractText(alloc, idx);
            defer alloc.free(text);
            std.debug.print("=== page {d} (native) ===\n{s}\n", .{ idx, text });
        } else if (std.mem.eql(u8, kind, "scanned") or
            std.mem.eql(u8, kind, "image_text") or
            std.mem.eql(u8, kind, "mixed"))
        {
            std.debug.print("=== page {d} needs OCR ===\n", .{idx});
        }
    }
}

Objective-C

#import <POXPdfOxide.h>

NSError *err = nil;
POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];

NSString *json = [doc classifyDocumentWithError:&err];
NSDictionary *dc = [NSJSONSerialization JSONObjectWithData:[json dataUsingEncoding:NSUTF8StringEncoding]
                                                   options:0 error:&err];

NSArray<NSString *> *pages = dc[@"pages"];
[pages enumerateObjectsUsingBlock:^(NSString *kind, NSUInteger page, BOOL *stop) {
    if ([kind isEqualToString:@"text_layer"]) {
        // Fast, free native path — no OCR cost.
        NSString *text = [doc extractText:(NSInteger)page error:nil];
        NSLog(@"=== page %lu (native) ===\n%@", (unsigned long)page, text);
    } else if ([kind isEqualToString:@"scanned"] ||
               [kind isEqualToString:@"image_text"] ||
               [kind isEqualToString:@"mixed"]) {
        NSLog(@"=== page %lu needs OCR ===", (unsigned long)page);
    }
}];

Elixir

{:ok, doc} = PdfOxide.open("report.pdf")

dc = doc |> PdfOxide.classify_document() |> Jason.decode!()

dc["pages"]
|> Enum.with_index()
|> Enum.each(fn {kind, page} ->
  case kind do
    "text_layer" ->
      # Fast, free native path — no OCR cost.
      IO.puts("=== page #{page} (native) ===\n#{PdfOxide.extract_text(doc, page)}")

    k when k in ["scanned", "image_text", "mixed"] ->
      IO.puts("=== page #{page} needs OCR ===")

    _ ->
      :ok
  end
end)

네이티브 분류 프리플라이트는 추출 작업에 비해 사실상 무료입니다 — 래스터화도, OCR 실행도 없기 때문에 전체 코퍼스에 걸쳐 실행하여 어떤 페이지가 OCR 비용을 쓸 만한지 판단할 수 있습니다. PDF Oxide의 네이티브 텍스트 추출 자체는 공개 벤치마크에서 **평균 0.8ms / 합격률 100%**로 동작하므로, 분류 후 추출하는 경로는 빠른 일반적인 경우를 계속 빠르게 유지합니다.

암호화된 PDF에 대한 주의사항

classify_page와 classify_document는 인증하지 않은 암호화 문서에 대해 안전하게 실패합니다 — empty를 조용히 반환하는 대신 EncryptedPdf 오류를 반환합니다. 분류 전에 먼저 인증하세요(PDF 암호화 및 복호화 참조). 보안과 무관한 페이지 단위 오류는 empty로 우아하게 강등됩니다.

자주 묻는 질문

분류가 OCR을 실행하나요? 아니요. classify_page / classify_document는 PDF 내부의 순수한 검사입니다 — OCR도, 래스터화도 없습니다. 바로 이 점 때문에 전체 코퍼스에 대한 프리플라이트로 실행할 만큼 저렴합니다.

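Python이나 Node에서 분류를 사용할 수 있나요? v0.3.69에서는 지원되지 않습니다. 메서드는 Rust, Go, C#, Swift, WASM/JavaScript에서 제공되며, 이 페이지의 예제에 나온 다른 바인딩(Java, PHP, Ruby, C++, Dart, R, Julia, Zig, Objective-C, Elixir)에서도 분류를 호출할 수 있습니다. Python/Node에서는 자동 추출을 사용하거나 Rust 코어 / CLI를 통해 브리지하세요.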

text_layer vs scanned 판정은 얼마나 정확한가요? 분류기는 여러 신호(글리프 수, 이미지 면적, 래스터 코덱, 보이지 않는 텍스트 비율, 깨진/단편화된 글리프 비율)를 결합하고 강화된 텍스트 품질 게이트를 적용합니다. 덕분에 사용 불가능한 born-digital 텍스트 레이어(컬럼 순서 왜곡, (cid:NN) 쓰레기 문자, 글리프 단위 단편화)는 신뢰하는 대신 타입이 지정된 이유 코드와 함께 scanned으로 강등됩니다.

Go / C# / Swift에서 왜 JSON으로 결과가 반환되나요? 이 바인딩들은 C ABI를 통하므로, 분류 결과가 malloc된 JSON 문자열로 반환됩니다. 표준 JSON 라이브러리로 역직렬화하세요 — 필드 이름과 열거형 토큰은 릴리스 간에 고정되어 있으며 안정적입니다.

PDF 페이지 분류 — 텍스트 vs 스캔

단일 PDF 페이지를 분류하려면?

PDF 문서 전체를 한 번에 분류하려면?

분류 JSON은 어떻게 생겼나요?

페이지 종류 (kind)

문서 요약 (summary)

이유 코드 (reason)

분류 결과에 따라 추출을 라우팅하려면?

암호화된 PDF에 대한 주의사항

자주 묻는 질문

관련 페이지

페이지 종류 (`kind`)

문서 요약 (`summary`)

이유 코드 (`reason`)