What is the fastest Python PDF library?

PDF Oxide is the fastest Python PDF library, with 0.8ms mean text extraction time — 5.8× faster than PyMuPDF (4.6ms) and 15× faster than pypdf (12.1ms). Benchmarked on 3,830 real-world PDFs with 100% pass rate.

Is PDF Oxide free for commercial use?

Yes. PDF Oxide is MIT licensed — free for all uses including commercial products, SaaS, and proprietary software. No license fees, no sales calls, no AGPL restrictions.

Can PDF Oxide handle scanned PDFs with OCR?

Yes. PDF Oxide includes built-in OCR via PaddleOCR and ONNX Runtime. No Tesseract installation needed — just pip install pdf_oxide and use extract_text_ocr(). Supports PP-OCRv3, v4, and v5 models.

Does PDF Oxide support XFA forms?

Yes. PDF Oxide is the only Python PDF library that can detect, analyze, and extract data from XFA forms (XML Forms Architecture). PyMuPDF, pypdf, pdfplumber, and pdfminer cannot read XFA form data.

How does PDF Oxide compare to PyMuPDF?

PDF Oxide is 5.8× faster than PyMuPDF (0.8ms vs 4.6ms mean), has a 100% pass rate vs 99.3%, and is MIT licensed vs PyMuPDF's AGPL-3.0. PDF Oxide also has built-in Markdown/HTML output and XFA form support that PyMuPDF lacks.

Can PDF Oxide convert PDF to Markdown?

Yes. PDF Oxide has built-in PDF to Markdown conversion with heading detection, table preservation, and list formatting — ideal for LLM and RAG pipelines. No separate package needed, unlike PyMuPDF which requires pymupdf4llm (69× slower).

Python으로 PDF 일괄 처리하기

오류 처리를 포함한 디렉터리 내 PDF 일괄 처리:

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Report the first-page text length of every PDF in documents/.
# A PdfError on one file is printed and the loop moves on to the next file.
source_dir = Path("documents/")
for candidate in source_dir.glob("*.pdf"):
    try:
        first_page = PdfDocument(str(candidate)).extract_text(0)
    except PdfError as exc:
        print(f"Failed: {candidate.name}: {exc}")
    else:
        print(f"{candidate.name}: {len(first_page)} chars")

WASM

import { WasmPdfDocument } from "pdf-oxide-wasm";

// Process multiple PDF buffers and log the first-page text length of each.
// free() runs in `finally`, so the document is released even when
// extraction throws.
for (const { name, bytes } of pdfFiles) {
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const text = doc.extractText(0);
        console.log(`${name}: ${text.length} chars`);
    } catch (e) {
        console.error(`Failed: ${name}: ${e.message}`);
    } finally {
        // `doc` is still undefined when the constructor itself threw.
        if (doc) doc.free();
    }
}

Rust

use pdf_oxide::PdfDocument;
use std::path::Path;

// Print the first-page text length of every .pdf in documents/.
// Open and extraction failures are reported per file; only directory
// I/O errors are propagated with `?`.
for entry in std::fs::read_dir("documents/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |e| e == "pdf") {
        continue;
    }
    // A non-UTF-8 file name cannot become &str; report it instead of panicking.
    let path_str = match path.to_str() {
        Some(s) => s,
        None => {
            println!("Failed: {}: path is not valid UTF-8", path.display());
            continue;
        }
    };
    match PdfDocument::open(path_str) {
        Ok(mut doc) => match doc.extract_text(0) {
            Ok(text) => println!("{}: {} chars", path.display(), text.len()),
            Err(e) => println!("Failed: {}: {}", path.display(), e),
        },
        Err(e) => println!("Failed: {}: {}", path.display(), e),
    }
}

package main

import (
    "fmt"
    "log"
    "path/filepath"
    pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

// main prints the first-page text length of every PDF in documents/.
// A file that fails to open or extract is logged and skipped.
func main() {
    matches, err := filepath.Glob("documents/*.pdf")
    if err != nil {
        // Glob only fails on a malformed pattern.
        log.Fatalf("bad glob pattern: %v", err)
    }
    for _, p := range matches {
        doc, err := pdfoxide.Open(p)
        if err != nil {
            log.Printf("Failed: %s: %v", p, err)
            continue
        }
        text, err := doc.ExtractText(0)
        // Close before branching so the document is released on both paths.
        doc.Close()
        if err != nil {
            log.Printf("Failed: %s: %v", p, err)
            continue
        }
        fmt.Printf("%s: %d chars\n", filepath.Base(p), len(text))
    }
}

using PdfOxide;

// Report the first-page text length of each PDF in documents/.
// Any exception for one file goes to stderr and the loop continues.
var pdfPaths = Directory.GetFiles("documents/", "*.pdf");
foreach (var pdfPath in pdfPaths)
{
    try
    {
        using (var doc = PdfDocument.Open(pdfPath))
        {
            var firstPage = doc.ExtractText(0);
            Console.WriteLine($"{Path.GetFileName(pdfPath)}: {firstPage.Length} chars");
        }
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine($"Failed: {pdfPath}: {ex.Message}");
    }
}

Java

import fyi.oxide.pdf.PdfDocument;
import java.io.File;

for (File f : new File("documents/").listFiles((d, n) -> n.endsWith(".pdf"))) {
    try (PdfDocument doc = PdfDocument.open(f.toPath())) {
        String text = doc.extractText(0);
        System.out.printf("%s: %d chars%n", f.getName(), text.length());
    } catch (Exception e) {
        System.out.printf("Failed: %s: %s%n", f.getName(), e.getMessage());
    }
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.io.File

// Print the first-page text length of each PDF in documents/.
// listFiles may return null (e.g. missing directory); orEmpty() turns that into a no-op loop.
val pdfFiles = File("documents/").listFiles { _, fileName -> fileName.endsWith(".pdf") }.orEmpty()
for (pdf in pdfFiles) {
    try {
        PdfDocument.open(pdf.toPath()).use { doc ->
            val firstPage = doc.extractText(0)
            println("${pdf.name}: ${firstPage.length} chars")
        }
    } catch (e: Exception) {
        println("Failed: ${pdf.name}: ${e.message}")
    }
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.Using
import java.io.File

// Print the first-page text length of every PDF in documents/.
// Each file is wrapped in try/catch so one bad PDF does not abort the batch.
for (f <- new File("documents/").listFiles((_, n) => n.endsWith(".pdf"))) {
  try {
    Using.resource(PdfDocument.open(f.getPath)) { doc =>
      val text = doc.extractText(0)
      println(s"${f.getName}: ${text.length} chars")
    }
  } catch {
    case e: Exception => println(s"Failed: ${f.getName}: ${e.getMessage}")
  }
}

Clojure

(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])

;; Print the first-page text length of every PDF in documents/.
;; Exceptions are caught per file so the remaining files are still processed.
(doseq [pdf-file (.listFiles (io/file "documents/"))
        :let [file-name (.getName pdf-file)]
        :when (.endsWith file-name ".pdf")]
  (try
    (with-open [doc (pdf/open (.getPath pdf-file))]
      (println (str file-name ": " (count (pdf/extract-text doc 0)) " chars")))
    (catch Exception ex
      (println (str "Failed: " file-name ": " (.getMessage ex))))))

Ruby

require 'pdf_oxide'

# Print the first-page text length of every PDF in documents/.
# PdfOxide::Error is rescued per file so the loop keeps going.
Dir.glob('documents/*.pdf').each do |pdf_path|
  label = File.basename(pdf_path)
  begin
    PdfOxide::PdfDocument.open(pdf_path) do |doc|
      first_page = doc.extract_text(0)
      puts "#{label}: #{first_page.length} chars"
    end
  rescue PdfOxide::Error => err
    puts "Failed: #{label}: #{err.message}"
  end
end

PHP

use PdfOxide\PdfDocument;

// Print the first-page text length of every PDF in documents/.
// glob() may return false on error; fall back to an empty list.
foreach (glob('documents/*.pdf') ?: [] as $path) {
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $text = $doc->extractText(0);
        printf("%s: %d chars\n", basename($path), strlen($text));
    } catch (\Throwable $e) {
        printf("Failed: %s: %s\n", basename($path), $e->getMessage());
    } finally {
        // Close on both success and failure; $doc is null when open() threw.
        if ($doc !== null) {
            $doc->close();
        }
    }
}

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <filesystem>
#include <iostream>

namespace fs = std::filesystem;
// Print the first-page text length of every .pdf in documents/.
for (const auto& entry : fs::directory_iterator("documents/")) {
    if (entry.path().extension() != ".pdf") continue;
    // Streaming an fs::path writes it quoted ("a.pdf"); convert to a string
    // first so the file name is printed verbatim.
    const auto name = entry.path().filename().string();
    try {
        auto doc = pdf_oxide::Document::open(entry.path().string());
        auto text = doc.extract_text(0);
        std::cout << name << ": " << text.size() << " chars\n";
    } catch (const pdf_oxide::Error& e) {
        std::cerr << "Failed: " << name << ": " << e.what() << "\n";
    }
}

Swift

import PdfOxide
import Foundation

// Print the first-page text length of each PDF in documents/.
// Errors thrown while opening or extracting are reported per file.
let folder = "documents/"
let pdfNames = try FileManager.default.contentsOfDirectory(atPath: folder)
    .filter { $0.hasSuffix(".pdf") }
for pdfName in pdfNames {
    do {
        let doc = try Document.open(folder + pdfName)
        let firstPage = try doc.extractText(0)
        print("\(pdfName): \(firstPage.count) chars")
    } catch {
        print("Failed: \(pdfName): \(error)")
    }
}

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Print the first-page text length of every .pdf in documents/.
for (final entry in Directory('documents/').listSync()) {
  if (!entry.path.endsWith('.pdf')) continue;
  PdfDocument? doc;
  try {
    doc = PdfDocument.open(entry.path);
    final text = doc.extractText(0);
    print('${entry.uri.pathSegments.last}: ${text.length} chars');
  } catch (e) {
    print('Failed: ${entry.path}: $e');
  } finally {
    // Close on both paths; doc is null when open() itself threw.
    doc?.close();
  }
}

library(pdfoxide)

# Print the first-page text length of each PDF in documents/.
# Errors are caught per file so the loop continues.
pdf_paths <- Sys.glob("documents/*.pdf")
for (pdf_path in pdf_paths) {
  file_label <- basename(pdf_path)
  tryCatch(
    {
      first_page <- pdf_extract_text(pdf_open(pdf_path), 0)
      cat(sprintf("%s: %d chars\n", file_label, nchar(first_page)))
    },
    error = function(err) {
      cat(sprintf("Failed: %s: %s\n", file_label, conditionMessage(err)))
    }
  )
}

Julia

using PdfOxide

# Print the first-page text length of each PDF in documents/.
# Any exception for one file is printed and the loop continues.
pdf_paths = [p for p in readdir("documents/"; join = true) if endswith(p, ".pdf")]
for pdf_path in pdf_paths
    label = basename(pdf_path)
    try
        first_page = extract_text(open_document(pdf_path), 0)
        println("$(label): $(length(first_page)) chars")
    catch err
        println("Failed: $(label): $err")
    end
end

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

// Print the first-page text length of every .pdf in documents/.
const a = std.heap.page_allocator;
var dir = try std.fs.cwd().openDir("documents/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "documents/", entry.name });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch |e| {
        std.debug.print("Failed: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer doc.deinit();
    // Report an extraction failure and move on; `try` here would end the
    // whole batch on the first bad file.
    const text = doc.extractText(a, 0) catch |e| {
        std.debug.print("Failed: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer a.free(text);
    std.debug.print("{s}: {d} chars\n", .{ entry.name, text.len });
}

Objective-C

#import "POXPdfOxide.h"

// Log the first-page text length of every .pdf in documents/.
NSFileManager *fm = [NSFileManager defaultManager];
for (NSString *name in [fm contentsOfDirectoryAtPath:@"documents/" error:nil]) {
    if (![name hasSuffix:@".pdf"]) continue;
    NSError *err = nil;
    NSString *path = [@"documents/" stringByAppendingString:name];
    POXDocument *doc = [POXDocument openPath:path error:&err];
    if (!doc) { NSLog(@"Failed: %@: %@", name, err.localizedDescription); continue; }
    NSString *text = [doc extractText:0 error:&err];
    // Check the return value, not `err`: a nil result means extraction failed
    // and would otherwise be logged as a successful "0 chars".
    if (!text) { NSLog(@"Failed: %@: %@", name, err.localizedDescription); continue; }
    NSLog(@"%@: %lu chars", name, (unsigned long)text.length);
}

Elixir

# Print the first-page text length of every PDF in documents/.
# `with` routes both open and extraction errors to the same handler, so a
# failed extraction is reported instead of raising MatchError.
Path.wildcard("documents/*.pdf")
|> Enum.each(fn path ->
  with {:ok, doc} <- PdfOxide.open(path),
       {:ok, text} <- PdfOxide.extract_text(doc, 0) do
    IO.puts("#{Path.basename(path)}: #{String.length(text)} chars")
  else
    {:error, reason} ->
      IO.puts("Failed: #{Path.basename(path)}: #{inspect(reason)}")
  end
end)

문서당 평균 0.8ms의 텍스트 추출 속도로, PDF Oxide는 3,830개의 PDF를 약 3.1초 만에 처리합니다.

설치

pip install pdf_oxide

순차 처리

모든 PDF에서 텍스트 추출

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Collect the full text of every PDF in invoices/, keyed by file name.
# Pages are joined with "\n"; a PdfError skips that file and is printed.
pdf_dir = Path("invoices/")
results = {}

for pdf_path in sorted(pdf_dir.glob("*.pdf")):
    try:
        doc = PdfDocument(str(pdf_path))
        page_texts = [doc.extract_text(page) for page in range(doc.page_count())]
        results[pdf_path.name] = "\n".join(page_texts)
    except PdfError as e:
        print(f"Error: {pdf_path.name}: {e}")

print(f"Processed {len(results)} PDFs")

WASM

// Extract the full text of every PDF buffer, keyed by name.
// free() runs in `finally`, so the document is released even on failure.
const results = new Map();
for (const { name, bytes } of pdfFiles) {
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        results.set(name, doc.extractAllText());
    } catch (e) {
        console.error(`Error: ${name}: ${e.message}`);
    } finally {
        // `doc` is still undefined when the constructor itself threw.
        if (doc) doc.free();
    }
}
console.log(`Processed ${results.size} PDFs`);

Rust

use std::collections::HashMap;

// Collect the full text of every .pdf in invoices/, keyed by path.
// A file that fails to open, or fails on any page, is reported on stderr and
// left out of `results` rather than being stored with missing pages.
let mut results: HashMap<String, String> = HashMap::new();
for entry in std::fs::read_dir("invoices/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |e| e == "pdf") {
        continue;
    }
    // A non-UTF-8 file name cannot become &str; report it instead of panicking.
    let path_str = match path.to_str() {
        Some(s) => s,
        None => {
            eprintln!("Error: {}: path is not valid UTF-8", path.display());
            continue;
        }
    };
    let mut doc = match PdfDocument::open(path_str) {
        Ok(doc) => doc,
        Err(e) => {
            eprintln!("Error: {}: {}", path.display(), e);
            continue;
        }
    };
    let page_count = match doc.page_count() {
        Ok(n) => n,
        Err(e) => {
            eprintln!("Error: {}: {}", path.display(), e);
            continue;
        }
    };
    let mut pages = Vec::new();
    let mut complete = true;
    for i in 0..page_count {
        match doc.extract_text(i) {
            Ok(text) => pages.push(text),
            Err(e) => {
                eprintln!("Error: {} (page {}): {}", path.display(), i, e);
                complete = false;
                break;
            }
        }
    }
    if complete {
        results.insert(path.display().to_string(), pages.join("\n"));
    }
}
println!("Processed {} PDFs", results.len());

// Collect the full text of every PDF in invoices/, keyed by base name.
// Files that fail to open or extract are logged and left out of results.
results := make(map[string]string)
matches, _ := filepath.Glob("invoices/*.pdf")
sort.Strings(matches)

for _, p := range matches {
    doc, err := pdfoxide.Open(p)
    if err != nil { log.Printf("Error: %s: %v", p, err); continue }

    full, err := doc.ExtractAllText()
    // Close before branching so the document is released on both paths.
    doc.Close()
    if err != nil { log.Printf("Error: %s: %v", p, err); continue }

    results[filepath.Base(p)] = full
}
fmt.Printf("Processed %d PDFs\n", len(results))

// Collect the full text of every PDF in invoices/ (sorted by path), keyed by file name.
// Each page is appended with a trailing line break.
var results = new Dictionary<string, string>();
var orderedPaths = Directory.GetFiles("invoices/", "*.pdf").OrderBy(path => path);
foreach (var pdfPath in orderedPaths)
{
    try
    {
        using (var doc = PdfDocument.Open(pdfPath))
        {
            var buffer = new StringBuilder();
            for (int page = 0; page < doc.PageCount; page++)
            {
                buffer.AppendLine(doc.ExtractText(page));
            }
            results[Path.GetFileName(pdfPath)] = buffer.ToString();
        }
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine($"Error: {pdfPath}: {ex.Message}");
    }
}
Console.WriteLine($"Processed {results.Count} PDFs");

Java

import fyi.oxide.pdf.PdfDocument;
import java.io.File;
import java.util.*;

// Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
Map<String, String> results = new HashMap<>();
File[] files = new File("invoices/").listFiles((d, n) -> n.endsWith(".pdf"));
// listFiles returns null for a missing/unreadable directory; Arrays.sort(null) would throw.
if (files == null) files = new File[0];
Arrays.sort(files);
for (File f : files) {
    try (PdfDocument doc = PdfDocument.open(f.toPath())) {
        // Separate pages with a newline so the last word of one page is not
        // fused with the first word of the next.
        StringJoiner all = new StringJoiner("\n");
        for (int i = 0; i < doc.pageCount(); i++) all.add(doc.extractText(i));
        results.put(f.getName(), all.toString());
    } catch (Exception e) {
        System.err.printf("Error: %s: %s%n", f.getName(), e.getMessage());
    }
}
System.out.printf("Processed %d PDFs%n", results.size());

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.io.File

// Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
val results = mutableMapOf<String, String>()
File("invoices/").listFiles { _, n -> n.endsWith(".pdf") }?.sorted()?.forEach { f ->
    try {
        PdfDocument.open(f.toPath()).use { doc ->
            // Newline separator keeps text from adjacent pages from running together.
            val all = (0 until doc.pageCount()).joinToString("\n") { doc.extractText(it) }
            results[f.name] = all
        }
    } catch (e: Exception) {
        System.err.println("Error: ${f.name}: ${e.message}")
    }
}
println("Processed ${results.size} PDFs")

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.Using
import java.io.File

// Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
// Each file is wrapped in try/catch so one bad PDF does not abort the batch.
val results = scala.collection.mutable.Map.empty[String, String]
for (f <- new File("invoices/").listFiles((_, n) => n.endsWith(".pdf")).sorted) {
  try {
    Using.resource(PdfDocument.open(f.getPath)) { doc =>
      // Newline separator keeps text from adjacent pages from running together.
      val all = (0 until doc.pageCount()).map(i => doc.extractText(i)).mkString("\n")
      results(f.getName) = all
    }
  } catch {
    case e: Exception => System.err.println(s"Error: ${f.getName}: ${e.getMessage}")
  }
}
println(s"Processed ${results.size} PDFs")

Clojure

(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])

;; Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
;; Each file is wrapped in try/catch so one bad PDF does not abort the batch.
(let [results (atom {})]
  (doseq [f (sort (filter #(.endsWith (.getName %) ".pdf")
                          (.listFiles (io/file "invoices/"))))]
    (try
      (with-open [doc (pdf/open (.getPath f))]
        ;; The lazy seq is realized by `apply str` while doc is still open.
        (let [pages (map #(pdf/extract-text doc %)
                         (range (pdf/page-count doc)))
              ;; Newline between pages so adjacent page text is not fused.
              all   (apply str (interpose "\n" pages))]
          (swap! results assoc (.getName f) all)))
      (catch Exception e
        (binding [*out* *err*]
          (println (str "Error: " (.getName f) ": " (.getMessage e)))))))
  (println (str "Processed " (count @results) " PDFs")))

Ruby

require 'pdf_oxide'

# Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
results = {}
Dir.glob('invoices/*.pdf').sort.each do |path|
  begin
    PdfOxide::PdfDocument.open(path) do |doc|
      # Newline separator keeps text from adjacent pages from running together.
      all = (0...doc.page_count).map { |i| doc.extract_text(i) }.join("\n")
      results[File.basename(path)] = all
    end
  rescue PdfOxide::Error => e
    warn "Error: #{File.basename(path)}: #{e.message}"
  end
end
puts "Processed #{results.size} PDFs"

PHP

use PdfOxide\PdfDocument;

// Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
$results = [];
// glob() may return false on error; sort() needs an array.
$files = glob('invoices/*.pdf') ?: [];
sort($files);
foreach ($files as $path) {
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $pages = [];
        for ($i = 0, $n = $doc->pageCount(); $i < $n; $i++) {
            $pages[] = $doc->extractText($i);
        }
        // Newline separator keeps text from adjacent pages from running together.
        $results[basename($path)] = implode("\n", $pages);
    } catch (\Throwable $e) {
        fwrite(STDERR, "Error: " . basename($path) . ": " . $e->getMessage() . "\n");
    } finally {
        // Close on both success and failure; $doc is null when open() threw.
        if ($doc !== null) {
            $doc->close();
        }
    }
}
printf("Processed %d PDFs\n", count($results));

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <filesystem>
#include <iostream>  // std::cout / std::cerr are used below
#include <map>
#include <string>

namespace fs = std::filesystem;
// Collect the full text of every .pdf in invoices/, keyed by file name.
std::map<std::string, std::string> results;
for (const auto& entry : fs::directory_iterator("invoices/")) {
    if (entry.path().extension() != ".pdf") continue;
    // Streaming an fs::path writes it quoted; use the plain string form.
    const std::string name = entry.path().filename().string();
    try {
        auto doc = pdf_oxide::Document::open(entry.path().string());
        results[name] = doc.extract_all_text();
    } catch (const pdf_oxide::Error& e) {
        std::cerr << "Error: " << name << ": " << e.what() << "\n";
    }
}
std::cout << "Processed " << results.size() << " PDFs\n";

Swift

import PdfOxide
import Foundation

// Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
// Per-file errors are written to standard error.
var results: [String: String] = [:]
let dir = "invoices/"
let pdfNames = try FileManager.default.contentsOfDirectory(atPath: dir)
    .sorted()
    .filter { $0.hasSuffix(".pdf") }
for pdfName in pdfNames {
    do {
        let doc = try Document.open(dir + pdfName)
        results[pdfName] = try doc.extractAllText()
    } catch {
        let line = "Error: \(pdfName): \(error)\n"
        FileHandle.standardError.write(Data(line.utf8))
    }
}
print("Processed \(results.count) PDFs")

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Collect the full text of every .pdf in invoices/ (sorted by path), keyed by file name.
final results = <String, String>{};
final files = Directory('invoices/').listSync().where((e) => e.path.endsWith('.pdf')).toList()
  ..sort((a, b) => a.path.compareTo(b.path));
for (final entry in files) {
  PdfDocument? doc;
  try {
    doc = PdfDocument.open(entry.path);
    results[entry.uri.pathSegments.last] = doc.extractAllText();
  } catch (e) {
    stderr.writeln('Error: ${entry.path}: $e');
  } finally {
    // Close on both paths; doc is null when open() itself threw.
    doc?.close();
  }
}
print('Processed ${results.length} PDFs');

library(pdfoxide)

# Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
# Errors are reported with message() and the loop continues.
results <- list()
pdf_paths <- sort(Sys.glob("invoices/*.pdf"))
for (pdf_path in pdf_paths) {
  file_label <- basename(pdf_path)
  tryCatch(
    {
      results[[file_label]] <- pdf_extract_all_text(pdf_open(pdf_path))
    },
    error = function(err) {
      message(sprintf("Error: %s: %s", file_label, conditionMessage(err)))
    }
  )
}
cat(sprintf("Processed %d PDFs\n", length(results)))

Julia

using PdfOxide

# Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
# Per-file failures are logged with @warn.
results = Dict{String,String}()
pdf_paths = sort([p for p in readdir("invoices/"; join = true) if endswith(p, ".pdf")])
for pdf_path in pdf_paths
    label = basename(pdf_path)
    try
        results[label] = extract_all_text(open_document(pdf_path))
    catch err
        @warn "Error: $(label): $err"
    end
end
println("Processed $(length(results)) PDFs")

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

// Collect the full text of every .pdf in invoices/, keyed by file name.
const a = std.heap.page_allocator;
var results = std.StringHashMap([]const u8).init(a);
var dir = try std.fs.cwd().openDir("invoices/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "invoices/", entry.name });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch |e| {
        std.debug.print("Error: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer doc.deinit();
    // Report an extraction failure and move on; `try` here would end the
    // whole batch on the first bad file.
    const all = doc.extractAllText(a) catch |e| {
        std.debug.print("Error: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    // entry.name is copied with dupe before being stored as the map key.
    try results.put(try a.dupe(u8, entry.name), all);
}
std.debug.print("Processed {d} PDFs\n", .{results.count()});

Objective-C

#import "POXPdfOxide.h"

// Collect the full text of every .pdf in invoices/ (sorted), keyed by file name.
NSFileManager *fm = [NSFileManager defaultManager];
NSMutableDictionary<NSString*, NSString*> *results = [NSMutableDictionary dictionary];
NSArray *names = [[fm contentsOfDirectoryAtPath:@"invoices/" error:nil]
                  sortedArrayUsingSelector:@selector(compare:)];
for (NSString *name in names) {
    if (![name hasSuffix:@".pdf"]) continue;
    NSError *err = nil;
    POXDocument *doc = [POXDocument openPath:[@"invoices/" stringByAppendingString:name] error:&err];
    if (!doc) { NSLog(@"Error: %@: %@", name, err.localizedDescription); continue; }
    NSString *text = [doc extractAllTextWithError:&err];
    // A nil result means extraction failed; log it instead of silently
    // leaving the file out of `results`.
    if (!text) { NSLog(@"Error: %@: %@", name, err.localizedDescription); continue; }
    results[name] = text;
}
NSLog(@"Processed %lu PDFs", (unsigned long)results.count);

Elixir

# Extracts pages 0..n-1 in order, stopping at the first {:error, reason}.
# Returns {:ok, pages_in_reverse_order} or {:error, reason}.
extract_pages = fn doc, n ->
  # Build the index list explicitly: with n == 0, 0..(n - 1) is the descending
  # range 0..-1 and would request pages 0 and -1.
  indices = if n > 0, do: Enum.to_list(0..(n - 1)), else: []

  Enum.reduce_while(indices, {:ok, []}, fn i, {:ok, pages} ->
    case PdfOxide.extract_text(doc, i) do
      {:ok, text} -> {:cont, {:ok, [text | pages]}}
      {:error, reason} -> {:halt, {:error, reason}}
    end
  end)
end

# Collect the full text of every PDF in invoices/ (sorted), keyed by file name.
# Any {:error, reason} from open, page_count or extract_text skips that file.
results =
  "invoices/*.pdf"
  |> Path.wildcard()
  |> Enum.sort()
  |> Enum.reduce(%{}, fn path, acc ->
    with {:ok, doc} <- PdfOxide.open(path),
         {:ok, n} <- PdfOxide.page_count(doc),
         {:ok, pages} <- extract_pages.(doc, n) do
      # Newline separator keeps text from adjacent pages from running together.
      all = pages |> Enum.reverse() |> Enum.join("\n")
      Map.put(acc, Path.basename(path), all)
    else
      {:error, reason} ->
        IO.puts(:stderr, "Error: #{Path.basename(path)}: #{inspect(reason)}")
        acc
    end
  end)

IO.puts("Processed #{map_size(results)} PDFs")

모든 PDF를 Markdown으로 변환

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Convert every PDF in papers/ to a Markdown file of the same base name in markdown/.
# A PdfError skips that file and is printed.
input_dir = Path("papers/")
output_dir = Path("markdown/")
output_dir.mkdir(exist_ok=True)

for pdf_path in input_dir.glob("*.pdf"):
    try:
        markdown = PdfDocument(str(pdf_path)).to_markdown_all(
            detect_headings=True,
            include_images=False,
        )
    except PdfError as e:
        print(f"Skipped {pdf_path.name}: {e}")
    else:
        target = output_dir / pdf_path.with_suffix(".md").name
        target.write_text(markdown, encoding="utf-8")

WASM

// Convert every PDF buffer to Markdown and log the resulting length.
// free() runs in `finally`, so the document is released even on failure.
for (const { name, bytes } of pdfFiles) {
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const md = doc.toMarkdownAll();
        console.log(`Converted ${name}: ${md.length} chars`);
    } catch (e) {
        console.error(`Skipped ${name}: ${e.message}`);
    } finally {
        // `doc` is still undefined when the constructor itself threw.
        if (doc) doc.free();
    }
}

Rust

for entry in std::fs::read_dir("papers/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        if let Ok(mut doc) = PdfDocument::open(path.to_str().unwrap()) {
            if let Ok(md) = doc.to_markdown_all(true) {
                let out = path.with_extension("md");
                std::fs::write(&out, &md)?;
            }
        }
    }
}

_ = os.MkdirAll("markdown/", 0755)
matches, _ := filepath.Glob("papers/*.pdf")
for _, p := range matches {
    doc, err := pdfoxide.Open(p)
    if err != nil { log.Printf("Skipped %s: %v", p, err); continue }

    md, _ := doc.ToMarkdownAll()
    doc.Close()

    out := filepath.Join("markdown", strings.TrimSuffix(filepath.Base(p), ".pdf") + ".md")
    _ = os.WriteFile(out, []byte(md), 0644)
}

// Convert every PDF in papers/ to markdown/<name>.md.
// Any exception for one file goes to stderr and the loop continues.
Directory.CreateDirectory("markdown/");
foreach (var pdfPath in Directory.GetFiles("papers/", "*.pdf"))
{
    try
    {
        using (var doc = PdfDocument.Open(pdfPath))
        {
            var markdown = doc.ToMarkdownAll();
            var stem = Path.GetFileNameWithoutExtension(pdfPath);
            File.WriteAllText(Path.Combine("markdown", stem + ".md"), markdown);
        }
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine($"Skipped {pdfPath}: {ex.Message}");
    }
}

Java

import fyi.oxide.pdf.PdfDocument;
import java.io.File;
import java.nio.file.*;

// Convert every PDF in papers/ to markdown/<name>.md.
Files.createDirectories(Path.of("markdown"));
File[] pdfs = new File("papers/").listFiles((d, n) -> n.endsWith(".pdf"));
// listFiles returns null when the directory is missing or unreadable;
// treat that as "nothing to process" instead of throwing an NPE.
if (pdfs == null) pdfs = new File[0];
for (File f : pdfs) {
    try (PdfDocument doc = PdfDocument.open(f.toPath())) {
        String md = doc.toMarkdown();   // no-arg = whole document
        String base = f.getName().replaceFirst("\\.pdf$", ".md");
        Files.writeString(Path.of("markdown", base), md);
    } catch (Exception e) {
        System.err.printf("Skipped %s: %s%n", f.getName(), e.getMessage());
    }
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.io.File

// Convert every PDF in papers/ to markdown/<name>.md.
// listFiles may return null (e.g. missing directory); orEmpty() turns that into a no-op loop.
File("markdown").mkdirs()
val pdfFiles = File("papers/").listFiles { _, fileName -> fileName.endsWith(".pdf") }.orEmpty()
for (pdf in pdfFiles) {
    try {
        PdfDocument.open(pdf.toPath()).use { doc ->
            val markdown = doc.toMarkdown()   // no-arg = whole document
            val target = File("markdown", pdf.name.removeSuffix(".pdf") + ".md")
            target.writeText(markdown)
        }
    } catch (e: Exception) {
        System.err.println("Skipped ${pdf.name}: ${e.message}")
    }
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.Using
import java.io.File
import java.nio.file.{Files, Path}

// Convert every PDF in papers/ to markdown/<name>.md.
// Each file is wrapped in try/catch so one bad PDF does not abort the batch.
Files.createDirectories(Path.of("markdown"))
for (f <- new File("papers/").listFiles((_, n) => n.endsWith(".pdf"))) {
  try {
    Using.resource(PdfDocument.open(f.getPath)) { doc =>
      val md = doc.toMarkdown()   // no-arg = whole document
      Files.writeString(Path.of("markdown", f.getName.stripSuffix(".pdf") + ".md"), md)
    }
  } catch {
    case e: Exception => System.err.println(s"Skipped ${f.getName}: ${e.getMessage}")
  }
}

Clojure

(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])
;; clojure.string must be required before its vars are used below.
(require '[clojure.string :as string])

;; Convert every PDF in papers/ to markdown/<name>.md.
;; Exceptions are caught per file so the remaining files are still processed.
(.mkdirs (io/file "markdown"))
(doseq [f (filter #(.endsWith (.getName %) ".pdf")
                  (.listFiles (io/file "papers/")))]
  (try
    (with-open [doc (pdf/open (.getPath f))]
      (let [md   (pdf/to-markdown doc)   ; no page arg = whole document
            base (string/replace (.getName f) #"\.pdf$" ".md")]
        (spit (io/file "markdown" base) md)))
    (catch Exception e
      (println (str "Skipped " (.getName f) ": " (.getMessage e))))))

Ruby

require 'pdf_oxide'
require 'fileutils'

# Convert every PDF in papers/ to markdown/<name>.md.
# PdfOxide::Error is rescued per file so the loop keeps going.
FileUtils.mkdir_p('markdown')
Dir.glob('papers/*.pdf').each do |pdf_path|
  begin
    PdfOxide::PdfDocument.open(pdf_path) do |doc|
      markdown = doc.to_markdown    # nil arg = whole document
      stem = File.basename(pdf_path, '.pdf')
      File.write(File.join('markdown', stem + '.md'), markdown)
    end
  rescue PdfOxide::Error => err
    warn "Skipped #{File.basename(pdf_path)}: #{err.message}"
  end
end

PHP

use PdfOxide\PdfDocument;

// Convert every PDF in papers/ to markdown/<name>.md.
// Create the output directory only when it is missing, so a real mkdir
// failure is no longer hidden by the @ operator.
if (!is_dir('markdown')) {
    mkdir('markdown', 0777, true);
}
// glob() may return false on error; fall back to an empty list.
foreach (glob('papers/*.pdf') ?: [] as $path) {
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $md  = $doc->toMarkdownAll();
        $out = 'markdown/' . basename($path, '.pdf') . '.md';
        file_put_contents($out, $md);
    } catch (\Throwable $e) {
        fwrite(STDERR, "Skipped " . basename($path) . ": " . $e->getMessage() . "\n");
    } finally {
        // Close on both success and failure; $doc is null when open() threw.
        if ($doc !== null) {
            $doc->close();
        }
    }
}

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <filesystem>
#include <fstream>
#include <iostream>  // std::cerr is used below

namespace fs = std::filesystem;
// Convert every .pdf in papers/ to markdown/<name>.md.
fs::create_directories("markdown");
for (const auto& entry : fs::directory_iterator("papers/")) {
    if (entry.path().extension() != ".pdf") continue;
    const fs::path name = entry.path().filename();
    try {
        auto doc = pdf_oxide::Document::open(entry.path().string());
        auto md  = doc.to_markdown_all();
        auto out = fs::path("markdown") / fs::path(name).replace_extension(".md");
        std::ofstream(out) << md;
    } catch (const pdf_oxide::Error& e) {
        // Streaming an fs::path writes it quoted; use the plain string form.
        std::cerr << "Skipped " << name.string() << ": " << e.what() << "\n";
    }
}

Swift

import PdfOxide
import Foundation

// Convert every PDF in papers/ to markdown/<name>.md.
let dir = "papers/"
try FileManager.default.createDirectory(atPath: "markdown", withIntermediateDirectories: true)
for name in try FileManager.default.contentsOfDirectory(atPath: dir) where name.hasSuffix(".pdf") {
    do {
        let doc = try Document.open(dir + name)
        let md = try doc.toMarkdownAll()
        // Strip only the trailing ".pdf" (guaranteed by the `where` clause);
        // replacingOccurrences would also rewrite ".pdf" inside the name.
        let out = "markdown/" + String(name.dropLast(".pdf".count)) + ".md"
        try md.write(toFile: out, atomically: true, encoding: .utf8)
    } catch {
        print("Skipped \(name): \(error)")
    }
}

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Convert every .pdf in papers/ to markdown/<name>.md.
Directory('markdown').createSync();
for (final entry in Directory('papers/').listSync()) {
  if (!entry.path.endsWith('.pdf')) continue;
  PdfDocument? doc;
  try {
    doc = PdfDocument.open(entry.path);
    final md = doc.toMarkdownAll();
    final name = entry.uri.pathSegments.last;
    // Strip only the trailing ".pdf"; replaceAll would also rewrite ".pdf"
    // occurring earlier in the file name.
    final base = '${name.substring(0, name.length - '.pdf'.length)}.md';
    File('markdown/$base').writeAsStringSync(md);
  } catch (e) {
    print('Skipped ${entry.path}: $e');
  } finally {
    // Close on both paths; doc is null when open() itself threw.
    doc?.close();
  }
}

library(pdfoxide)

# Convert every PDF in papers/ to markdown/<name>.md.
# Errors are reported with message() and the loop continues.
dir.create("markdown", showWarnings = FALSE)
for (pdf_path in Sys.glob("papers/*.pdf")) {
  file_label <- basename(pdf_path)
  tryCatch(
    {
      markdown <- pdf_to_markdown_all(pdf_open(pdf_path))
      target <- file.path("markdown", sub("\\.pdf$", ".md", file_label))
      writeLines(markdown, target)
    },
    error = function(err) {
      message(sprintf("Skipped %s: %s", file_label, conditionMessage(err)))
    }
  )
}

Julia

using PdfOxide

# Convert every PDF in papers/ to markdown/<name>.md.
mkpath("markdown")
for path in filter(p -> endswith(p, ".pdf"), readdir("papers/"; join = true))
    try
        doc = open_document(path)
        md  = to_markdown_all(doc)
        # splitext drops only the final extension; replace(".pdf" => ".md")
        # would also rewrite ".pdf" occurring earlier in the file name.
        out = joinpath("markdown", first(splitext(basename(path))) * ".md")
        write(out, md)
    catch e
        println("Skipped $(basename(path)): $e")
    end
end

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

// Convert every .pdf in papers/ to markdown/<name>.md.
const a = std.heap.page_allocator;
try std.fs.cwd().makePath("markdown");
var dir = try std.fs.cwd().openDir("papers/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "papers/", entry.name });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch |e| {
        std.debug.print("Skipped {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer doc.deinit();
    // Report a conversion failure and move on; `try` here would end the
    // whole batch on the first bad file.
    const md = doc.toMarkdownAll(a) catch |e| {
        std.debug.print("Skipped {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer a.free(md);
    // The slice drops the 4-byte ".pdf" suffix checked above.
    const out = try std.fmt.allocPrint(a, "markdown/{s}.md", .{entry.name[0 .. entry.name.len - 4]});
    defer a.free(out);
    std.fs.cwd().writeFile(.{ .sub_path = out, .data = md }) catch |e| {
        std.debug.print("Skipped {s}: {}\n", .{ entry.name, e });
    };
}

Objective-C

#import "POXPdfOxide.h"

// Convert every .pdf in papers/ to markdown/<name>.md.
NSFileManager *fm = [NSFileManager defaultManager];
[fm createDirectoryAtPath:@"markdown" withIntermediateDirectories:YES attributes:nil error:nil];
for (NSString *name in [fm contentsOfDirectoryAtPath:@"papers/" error:nil]) {
    if (![name hasSuffix:@".pdf"]) continue;
    NSError *err = nil;
    POXDocument *doc = [POXDocument openPath:[@"papers/" stringByAppendingString:name] error:&err];
    if (!doc) { NSLog(@"Skipped %@: %@", name, err.localizedDescription); continue; }
    NSString *md = [doc toMarkdownAllWithError:&err];
    // A nil result means conversion failed; messaging nil below would
    // silently write nothing.
    if (!md) { NSLog(@"Skipped %@: %@", name, err.localizedDescription); continue; }
    NSString *base = [[name stringByDeletingPathExtension] stringByAppendingPathExtension:@"md"];
    NSString *outPath = [@"markdown/" stringByAppendingString:base];
    // Check the BOOL return value rather than `err` to detect a failed write.
    if (![md writeToFile:outPath atomically:YES encoding:NSUTF8StringEncoding error:&err]) {
        NSLog(@"Skipped %@: %@", name, err.localizedDescription);
    }
}

Elixir

# Convert every PDF in papers/ to markdown/<name>.md.
File.mkdir_p!("markdown")

"papers/*.pdf"
|> Path.wildcard()
|> Enum.each(fn path ->
  # `with` routes both open and conversion errors to the same handler, so a
  # failed conversion is reported instead of raising MatchError.
  with {:ok, doc} <- PdfOxide.open(path),
       {:ok, md} <- PdfOxide.to_markdown_all(doc) do
    out = Path.join("markdown", Path.basename(path, ".pdf") <> ".md")
    File.write!(out, md)
  else
    {:error, reason} ->
      IO.puts(:stderr, "Skipped #{Path.basename(path)}: #{inspect(reason)}")
  end
end)

병렬 처리

고루틴과 태스크. Go에서는 각 PdfDocument가 독립적인 읽기에 대해 고루틴 안전합니다. 파일마다 고루틴 하나를 실행하면 됩니다. C#에서는 *Async 메서드와 Task.WhenAll을 조합하는 것이 권장됩니다(비동기 가이드 참조). 아래 예제는 Python 중심으로 설명하지만, 워커 풀 기능이 있는 언어라면 동일한 패턴을 적용할 수 있습니다.

multiprocessing 사용

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from multiprocessing import Pool

def process_pdf(pdf_path: str) -> dict:
    """Extract the text of every page of one PDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        {"file", "text", "pages"} on success, or {"file", "error"} when a
        PdfError is raised, so the failure is returned rather than raised.
    """
    try:
        doc = PdfDocument(pdf_path)
        page_count = doc.page_count()
        # Build the text in one join; each page is followed by "\n".
        text = "".join(doc.extract_text(i) + "\n" for i in range(page_count))
        return {"file": pdf_path, "text": text, "pages": page_count}
    except PdfError as e:
        return {"file": pdf_path, "error": str(e)}


# The guard is required for multiprocessing: with the "spawn" start method
# (the default on Windows and macOS) every worker re-imports this module, and
# unguarded pool creation would run again inside each worker.
if __name__ == "__main__":
    pdf_files = [str(p) for p in Path("documents/").glob("*.pdf")]

    with Pool() as pool:
        results = pool.map(process_pdf, pdf_files)

    success = [r for r in results if "text" in r]
    errors = [r for r in results if "error" in r]
    print(f"Processed {len(success)}, failed {len(errors)}")

concurrent.futures 사용

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

def extract_text(pdf_path: str) -> tuple[str, str]:
    """Return (pdf_path, text) where text is every page followed by "\\n".

    Exceptions from PdfDocument are not caught here; they surface in the
    parent process through future.result().
    """
    doc = PdfDocument(pdf_path)
    text = "".join(doc.extract_text(i) + "\n" for i in range(doc.page_count()))
    return pdf_path, text


# The guard is required for process pools: with the "spawn" start method
# (the default on Windows and macOS) every worker re-imports this module, and
# unguarded executor creation would run again inside each worker.
if __name__ == "__main__":
    pdf_files = list(Path("documents/").glob("*.pdf"))

    with ProcessPoolExecutor(max_workers=4) as executor:
        # Map each future back to its source path for reporting.
        futures = {executor.submit(extract_text, str(p)): p for p in pdf_files}

        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                path, text = future.result()
                print(f"{pdf_path.name}: {len(text)} chars")
            except Exception as e:
                print(f"Error: {pdf_path.name}: {e}")

진행 상황 추적

간단한 카운터

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Process each PDF in documents/ and print a "[current/total]" progress line.
pdf_files = list(Path("documents/").glob("*.pdf"))
total = len(pdf_files)

for idx, pdf_path in enumerate(pdf_files, start=1):
    try:
        PdfDocument(str(pdf_path)).extract_text(0)
    except PdfError as e:
        print(f"[{idx}/{total}] {pdf_path.name}: FAILED - {e}")
    else:
        print(f"[{idx}/{total}] {pdf_path.name}: OK")

tqdm 사용

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from tqdm import tqdm

# Extract every page of every PDF in documents/ behind a tqdm progress bar.
pdf_files = list(Path("documents/").glob("*.pdf"))

for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        doc = PdfDocument(str(pdf_path))
        for i in range(doc.page_count()):
            doc.extract_text(i)
    except PdfError as e:
        # Report the failure instead of swallowing it; tqdm.write prints
        # without corrupting the progress bar.
        tqdm.write(f"Skipped {pdf_path.name}: {e}")

손상된 파일의 오류 처리

오류를 기록하면서도 처리를 계속하는 견고한 파이프라인 구축:

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import json

pdf_dir = Path("mixed-quality/")
results, errors = [], []

# Walk the tree recursively; each file ends up in exactly one of the two lists.
for pdf_path in pdf_dir.glob("**/*.pdf"):
    file_name = str(pdf_path)
    try:
        doc = PdfDocument(file_name)
        page_count = doc.page_count()
        text_length = 0
        for page_index in range(page_count):
            text_length += len(doc.extract_text(page_index))
    except PdfError as exc:
        # Library-reported failure: keep the message as-is.
        errors.append({"file": file_name, "error": str(exc)})
    except Exception as exc:
        # Anything else is flagged as unexpected but does not stop the batch.
        errors.append({"file": file_name, "error": f"Unexpected: {exc}"})
    else:
        results.append({
            "file": file_name,
            "pages": page_count,
            "chars": text_length,
        })

print(f"Success: {len(results)}, Errors: {len(errors)}")

# Save the error report
if errors:
    with open("errors.json", "w") as f:
        f.write(json.dumps(errors, indent=2))

WASM

// Per-file outcomes: one entry in `results` or `errors` for every input buffer.
const results = [];
const errors = [];

for (const { name, bytes } of pdfFiles) {
    let doc = null;
    try {
        doc = new WasmPdfDocument(bytes);
        const pageCount = doc.pageCount();
        let textLength = 0;
        for (let i = 0; i < pageCount; i++) {
            textLength += doc.extractText(i).length;
        }
        results.push({ file: name, pages: pageCount, chars: textLength });
    } catch (e) {
        // Thrown values are not guaranteed to be Error objects, so fall back
        // to their string form when there is no `message`.
        const message = e instanceof Error ? e.message : String(e);
        errors.push({ file: name, error: message });
    } finally {
        // Release the document on every path, including when a page fails to
        // extract; previously free() was skipped whenever an exception occurred.
        if (doc !== null) {
            doc.free();
        }
    }
}

console.log(`Success: ${results.length}, Errors: ${errors.length}`);

Rust

// Per-file outcomes: (path, page count, total text length) on success,
// (path, error message) on failure.
let mut results = Vec::new();
let mut errors = Vec::new();

for entry in std::fs::read_dir("mixed-quality/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |e| e == "pdf") {
        continue;
    }
    let shown = path.display().to_string();

    // A file name that is not valid UTF-8 is recorded as a failure instead of
    // panicking on unwrap().
    let path_str = match path.to_str() {
        Some(s) => s,
        None => {
            errors.push((shown, "path is not valid UTF-8".to_string()));
            continue;
        }
    };

    // Any failure (open, page count, or a single page) marks the whole file as
    // failed, rather than being silently counted as zero pages / zero chars.
    let outcome = (|| {
        let mut doc = PdfDocument::open(path_str).map_err(|e| e.to_string())?;
        let page_count = doc.page_count().map_err(|e| e.to_string())?;
        let mut text_length = 0usize;
        for i in 0..page_count {
            text_length += doc.extract_text(i).map_err(|e| e.to_string())?.len();
        }
        Ok::<_, String>((page_count, text_length))
    })();

    match outcome {
        Ok((page_count, text_length)) => results.push((shown, page_count, text_length)),
        Err(message) => errors.push((shown, message)),
    }
}

println!("Success: {}, Errors: {}", results.len(), errors.len());

// Go

// Success describes a PDF whose pages were all read.
type Success struct{ File string; Pages int; Chars int }
// Failure describes a PDF that could not be opened or fully read.
type Failure struct{ File string; Error string }

var results []Success
var errors []Failure

// The callback always returns nil so one bad file never stops the walk;
// per-file problems are collected in errors instead.
_ = filepath.Walk("mixed-quality/", func(path string, info os.FileInfo, err error) error {
    if err != nil || info.IsDir() || !strings.HasSuffix(path, ".pdf") { return nil }

    doc, err := pdfoxide.Open(path)
    if err != nil {
        errors = append(errors, Failure{path, err.Error()})
        return nil
    }
    // Runs when this callback returns, i.e. once per file.
    defer doc.Close()

    // NOTE(review): assumes the second return value of PageCount/ExtractText
    // is an error, as with Open — confirm against the binding.
    n, err := doc.PageCount()
    if err != nil {
        errors = append(errors, Failure{path, err.Error()})
        return nil
    }
    chars := 0
    for i := 0; i < n; i++ {
        t, err := doc.ExtractText(i)
        if err != nil {
            // A page that cannot be read fails the whole file rather than
            // being counted as zero characters.
            errors = append(errors, Failure{path, err.Error()})
            return nil
        }
        chars += len(t)
    }
    results = append(results, Success{path, n, chars})
    return nil
})

fmt.Printf("Success: %d, Errors: %d\n", len(results), len(errors))

// C#: tally every PDF under mixed-quality/ (recursively) as a success or a failure.
var results = new List<(string File, int Pages, int Chars)>();
var errors = new List<(string File, string Error)>();

var pdfPaths = Directory.EnumerateFiles("mixed-quality/", "*.pdf", SearchOption.AllDirectories);
foreach (var pdfPath in pdfPaths)
{
    try
    {
        // The using block disposes the document whether or not extraction throws.
        using (var doc = PdfDocument.Open(pdfPath))
        {
            var pageCount = doc.PageCount;
            var chars = Enumerable.Range(0, pageCount)
                .Select(pageIndex => doc.ExtractText(pageIndex).Length)
                .Sum();
            results.Add((pdfPath, pageCount, chars));
        }
    }
    catch (Exception ex)
    {
        // Any exception marks the file as failed; the loop continues.
        errors.Add((pdfPath, ex.Message));
    }
}

Console.WriteLine($"Success: {results.Count}, Errors: {errors.Count}");

Java

import fyi.oxide.pdf.PdfDocument;
import java.io.IOException;
import java.nio.file.*;
import java.util.*;

/** One PDF that was read completely. */
record Success(String file, int pages, long chars) {}
/** One PDF that raised an exception while being opened or read. */
record Failure(String file, String error) {}

List<Success> results = new ArrayList<>();
List<Failure> errors = new ArrayList<>();

// Files.walk holds directory handles, so the stream is closed via try-with-resources.
try (var stream = Files.walk(Path.of("mixed-quality/"))) {
    Iterator<Path> pdfPaths = stream.filter(p -> p.toString().endsWith(".pdf")).iterator();
    while (pdfPaths.hasNext()) {
        Path path = pdfPaths.next();
        // The inner try-with-resources closes the document on both paths.
        try (PdfDocument doc = PdfDocument.open(path)) {
            int pageCount = doc.pageCount();
            long chars = 0;
            for (int page = 0; page < pageCount; page++) {
                chars += doc.extractText(page).length();
            }
            results.add(new Success(path.toString(), pageCount, chars));
        } catch (Exception e) {
            errors.add(new Failure(path.toString(), e.getMessage()));
        }
    }
}
System.out.printf("Success: %d, Errors: %d%n", results.size(), errors.size());

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.io.File

/** One PDF that was read completely. */
data class Success(val file: String, val pages: Int, val chars: Long)
/** One PDF that threw while being opened or read; the message may be null. */
data class Failure(val file: String, val error: String?)

val results = mutableListOf<Success>()
val errors = mutableListOf<Failure>()

for (f in File("mixed-quality/").walkTopDown()) {
    if (f.extension != "pdf") continue
    try {
        // use {} closes the document even when a page throws.
        PdfDocument.open(f.toPath()).use { doc ->
            val pageCount = doc.pageCount()
            var chars = 0L
            for (page in 0 until pageCount) {
                chars += doc.extractText(page).length.toLong()
            }
            results.add(Success(f.path, pageCount, chars))
        }
    } catch (e: Exception) {
        errors.add(Failure(f.path, e.message))
    }
}
println("Success: ${results.size}, Errors: ${errors.size}")

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.{Using, Try}
import java.io.File

/** One PDF that was read completely. */
final case class Success(file: String, pages: Int, chars: Long)
/** One PDF that threw while being opened or read. */
final case class Failure(file: String, error: String)

/** Recursively lists the files under `dir` whose name ends in ".pdf".
  *
  * `File.listFiles` returns null when `dir` is not a directory or cannot be
  * read; that case is treated as an empty listing instead of throwing a
  * NullPointerException.
  */
def pdfs(dir: File): Seq[File] =
  Option(dir.listFiles).fold(Seq.empty[File])(_.toSeq).flatMap { f =>
    if (f.isDirectory) pdfs(f) else if (f.getName.endsWith(".pdf")) Seq(f) else Nil
  }

val results = scala.collection.mutable.ArrayBuffer.empty[Success]
val errors = scala.collection.mutable.ArrayBuffer.empty[Failure]
for (f <- pdfs(new File("mixed-quality/"))) {
  // Using.resource closes the document; Try turns any thrown exception into a
  // value so the loop can record it and move on.
  Try(Using.resource(PdfDocument.open(f.getPath)) { doc =>
    val n = doc.pageCount()
    val chars = (0 until n).map(i => doc.extractText(i).length.toLong).sum
    Success(f.getPath, n, chars)
  }).fold(e => errors += Failure(f.getPath, e.getMessage), results += _)
}
println(s"Success: ${results.size}, Errors: ${errors.size}")

Clojure

(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])

;; Tally every *.pdf under mixed-quality/ as a success or a failure.
(let [results (atom [])
      errors  (atom [])
      pdf?    (fn [f] (.endsWith (.getName f) ".pdf"))]
  (doseq [f (file-seq (io/file "mixed-quality/"))
          :when (pdf? f)
          :let [path (.getPath f)]]
    (try
      ;; with-open closes the document whether or not extraction throws.
      (with-open [doc (pdf/open path)]
        (let [n     (pdf/page-count doc)
              chars (transduce (map (fn [i] (count (pdf/extract-text doc i))))
                               + 0 (range n))]
          (swap! results conj {:file path :pages n :chars chars})))
      (catch Exception e
        (swap! errors conj {:file path :error (.getMessage e)}))))
  (println (str "Success: " (count @results) ", Errors: " (count @errors))))

Ruby

require 'pdf_oxide'

results = []
errors = []

# Each file lands in exactly one list; a PdfOxide::Error never stops the batch.
Dir.glob('mixed-quality/**/*.pdf').each do |path|
  PdfOxide::PdfDocument.open(path) do |doc|
    page_total = doc.page_count
    char_total = 0
    (0...page_total).each { |i| char_total += doc.extract_text(i).length }
    results.push({ file: path, pages: page_total, chars: char_total })
  end
rescue PdfOxide::Error => e
  errors.push({ file: path, error: e.message })
end

puts "Success: #{results.size}, Errors: #{errors.size}"

PHP

use PdfOxide\PdfDocument;

// Per-file outcomes: one entry in $results or $errors for every PDF found.
$results = [];
$errors = [];

$it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator('mixed-quality/'));
foreach ($it as $file) {
    if ($file->getExtension() !== 'pdf') continue;
    $path = $file->getPathname();
    try {
        $doc = PdfDocument::open($path);
        try {
            $n = $doc->pageCount();
            $chars = 0;
            for ($i = 0; $i < $n; $i++) { $chars += strlen($doc->extractText($i)); }
        } finally {
            // Close on every path; previously close() was skipped whenever a
            // page threw, leaving the document open.
            $doc->close();
        }
        // Recorded only after the document closed cleanly, so a file is never
        // listed as both a success and an error.
        $results[] = ['file' => $path, 'pages' => $n, 'chars' => $chars];
    } catch (\Throwable $e) {
        $errors[] = ['file' => $path, 'error' => $e->getMessage()];
    }
}

printf("Success: %d, Errors: %d\n", count($results), count($errors));

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <cstddef>
#include <filesystem>
#include <iostream>
#include <string>
#include <vector>

namespace fs = std::filesystem;

// One PDF that was read completely.
struct Success { std::string file; int pages; std::size_t chars; };
// One PDF that raised pdf_oxide::Error while being opened or read.
struct Failure { std::string file; std::string error; };
std::vector<Success> results;
std::vector<Failure> errors;

// Walk the tree recursively; a library error on one file is recorded and the
// loop moves on to the next entry.
for (const auto& entry : fs::recursive_directory_iterator("mixed-quality/")) {
    if (entry.path().extension() != ".pdf") continue;
    try {
        auto doc = pdf_oxide::Document::open(entry.path().string());
        int n = doc.page_count();
        std::size_t chars = 0;
        for (int i = 0; i < n; i++) chars += doc.extract_text(i).size();
        results.push_back({entry.path().string(), n, chars});
    } catch (const pdf_oxide::Error& e) {
        errors.push_back({entry.path().string(), e.what()});
    }
}
std::cout << "Success: " << results.size() << ", Errors: " << errors.size() << "\n";

Swift

import PdfOxide
import Foundation

/// One PDF that was read completely.
struct Success { let file: String; let pages: Int; let chars: Int }
/// One PDF (or the root directory) that could not be processed.
struct Failure { let file: String; let error: String }
var results: [Success] = []
var errors: [Failure] = []

let root = URL(fileURLWithPath: "mixed-quality/")
// enumerator(at:includingPropertiesForKeys:) returns an optional; handle nil
// explicitly instead of crashing on a force unwrap.
if let walker = FileManager.default.enumerator(at: root, includingPropertiesForKeys: nil) {
    for case let url as URL in walker where url.pathExtension == "pdf" {
        do {
            let doc = try Document.open(url.path)
            let n = try doc.pageCount()
            var chars = 0
            for i in 0..<n { chars += try doc.extractText(i).count }
            results.append(Success(file: url.path, pages: n, chars: chars))
        } catch {
            // Any thrown error marks this file as failed; the walk continues.
            errors.append(Failure(file: url.path, error: "\(error)"))
        }
    }
} else {
    errors.append(Failure(file: root.path, error: "cannot enumerate directory"))
}
print("Success: \(results.count), Errors: \(errors.count)")

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Per-file outcomes: one entry in results or errors for every PDF found.
final results = <Map<String, Object>>[];
final errors = <Map<String, Object>>[];

for (final entry in Directory('mixed-quality/').listSync(recursive: true)) {
  if (entry is! File || !entry.path.endsWith('.pdf')) continue;
  try {
    final doc = PdfDocument.open(entry.path);
    try {
      final n = doc.pageCount;
      var chars = 0;
      for (var i = 0; i < n; i++) chars += doc.extractText(i).length;
      results.add({'file': entry.path, 'pages': n, 'chars': chars});
    } finally {
      // Close on every path; previously close() was skipped whenever a page
      // threw, leaving the document open.
      doc.close();
    }
  } catch (e) {
    errors.add({'file': entry.path, 'error': '$e'});
  }
}
print('Success: ${results.length}, Errors: ${errors.length}');

# R
library(pdfoxide)

# Per-file outcomes: one list entry in results or errors for every PDF found.
results <- list()
errors <- list()

for (path in list.files("mixed-quality/", pattern = "\\.pdf$", recursive = TRUE, full.names = TRUE)) {
  tryCatch({
    doc <- pdf_open(path)
    n <- pdf_page_count(doc)
    # seq_len(n) - 1L is empty when n == 0, whereas 0:(n - 1) would yield
    # c(0, -1) and request two pages that do not exist.
    pages <- seq_len(n) - 1L
    chars <- sum(vapply(pages, function(i) nchar(pdf_extract_text(doc, i)), integer(1)))
    results[[length(results) + 1]] <- list(file = path, pages = n, chars = chars)
  }, error = function(e) {
    # The handler is a function, so <<- is needed to reach the outer errors list.
    errors[[length(errors) + 1]] <<- list(file = path, error = conditionMessage(e))
  })
}

cat(sprintf("Success: %d, Errors: %d\n", length(results), length(errors)))

Julia

using PdfOxide

# Per-file outcomes: one named tuple in results or errors for every PDF found.
results = NamedTuple[]
errors = NamedTuple[]

for (root, _, files) in walkdir("mixed-quality/")
    for name in files
        if !endswith(name, ".pdf")
            continue
        end
        path = joinpath(root, name)
        try
            doc = open_document(path)
            n = page_count(doc)
            # init = 0 keeps the sum defined when the page range is empty.
            chars = sum(0:(n - 1); init = 0) do i
                length(extract_text(doc, i))
            end
            push!(results, (file = path, pages = n, chars = chars))
        catch e
            push!(errors, (file = path, error = sprint(showerror, e)))
        end
    end
end

println("Success: $(length(results)), Errors: $(length(errors))")

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

const a = std.heap.page_allocator;
// Counters for files read completely vs. files that failed at any step.
var ok: usize = 0;
var failed: usize = 0;

var root = try std.fs.cwd().openDir("mixed-quality/", .{ .iterate = true });
defer root.close();
var walker = try root.walk(a);
defer walker.deinit();
while (try walker.next()) |entry| {
    if (entry.kind != .file or !std.mem.endsWith(u8, entry.basename, ".pdf")) continue;
    // entry.path is relative to the walked directory, so re-prefix it.
    const path = try std.fs.path.joinZ(a, &.{ "mixed-quality/", entry.path });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch {
        failed += 1;
        continue;
    };
    defer doc.deinit();

    // A page-count or extraction error fails this file only; using `try` here
    // would abort the whole batch on the first damaged PDF.
    const n = doc.pageCount() catch {
        failed += 1;
        continue;
    };
    var chars: usize = 0;
    var page_failed = false;
    var i: i32 = 0;
    while (i < n) : (i += 1) {
        const t = doc.extractText(a, i) catch {
            page_failed = true;
            break;
        };
        defer a.free(t);
        chars += t.len;
    }
    if (page_failed) {
        failed += 1;
        continue;
    }
    ok += 1;
}
std.debug.print("Success: {d}, Errors: {d}\n", .{ ok, failed });

Objective-C

#import "POXPdfOxide.h"

NSFileManager *fm = [NSFileManager defaultManager];
NSString *root = @"mixed-quality/";
// Counters for files read completely vs. files that failed at any step.
NSUInteger ok = 0, failed = 0;
NSDirectoryEnumerator *en = [fm enumeratorAtPath:root];
for (NSString *rel in en) {
    if (![rel hasSuffix:@".pdf"]) continue;
    // Drain per-file temporaries (paths, page strings) on every iteration.
    @autoreleasepool {
        NSString *path = [root stringByAppendingPathComponent:rel];
        NSError *err = nil;
        POXDocument *doc = [POXDocument openPath:path error:&err];
        BOOL succeeded = (doc != nil);
        if (succeeded) {
            NSInteger n = [doc pageCountError:&err];
            // NOTE(review): assumes a failed page count is reported as a
            // negative value — confirm against the POXDocument header.
            succeeded = (n >= 0);
            NSUInteger chars = 0;
            for (NSInteger i = 0; succeeded && i < n; i++) {
                // NOTE(review): assumes extractText:error: returns nil on
                // failure (the usual Cocoa convention) — confirm.
                NSString *text = [doc extractText:i error:&err];
                if (text == nil) {
                    succeeded = NO;
                } else {
                    chars += text.length;
                }
            }
        }
        // A file counts as a success only if every step above succeeded.
        if (succeeded) {
            ok++;
        } else {
            failed++;
        }
    }
}
NSLog(@"Success: %lu, Errors: %lu", (unsigned long)ok, (unsigned long)failed);

Elixir

# Sums the text length of pages 0..n-1 and stops at the first page that fails.
# The explicit n > 0 check matters: 0..(n - 1) with n == 0 is the descending
# range 0..-1, which would request pages 0 and -1.
# NOTE(review): assumes extract_text/2 returns {:ok, text} | {:error, reason},
# like open/1 — confirm against the binding.
count_chars = fn doc, n ->
  pages = if n > 0, do: Enum.to_list(0..(n - 1)), else: []

  Enum.reduce_while(pages, {:ok, 0}, fn i, {:ok, acc} ->
    case PdfOxide.extract_text(doc, i) do
      {:ok, text} -> {:cont, {:ok, acc + String.length(text)}}
      {:error, reason} -> {:halt, {:error, reason}}
    end
  end)
end

# Entries are prepended, so both lists end up in reverse processing order.
{results, errors} =
  "mixed-quality/**/*.pdf"
  |> Path.wildcard()
  |> Enum.reduce({[], []}, fn path, {ok, bad} ->
    # Any failing step (open, page count, a single page) records the file as an
    # error instead of raising a MatchError that would abort the batch.
    with {:ok, doc} <- PdfOxide.open(path),
         {:ok, n} <- PdfOxide.page_count(doc),
         {:ok, chars} <- count_chars.(doc, n) do
      {[%{file: path, pages: n, chars: chars} | ok], bad}
    else
      {:error, reason} ->
        {ok, [%{file: path, error: inspect(reason)} | bad]}

      other ->
        {ok, [%{file: path, error: inspect(other)} | bad]}
    end
  end)

IO.puts("Success: #{length(results)}, Errors: #{length(errors)}")

메모리 효율적인 처리

매우 큰 코퍼스의 경우, 결과를 메모리에 쌓지 않고 파일을 하나씩 처리합니다:

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import csv

pdf_dir = Path("large-corpus/")

# Rows are written as soon as each page is extracted, so nothing accumulates in
# memory. UTF-8 is set explicitly: extracted text is arbitrary Unicode, and a
# platform default encoding that cannot represent it would raise
# UnicodeEncodeError part-way through the run.
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["file", "page", "text"])

    for pdf_path in pdf_dir.glob("*.pdf"):
        try:
            doc = PdfDocument(str(pdf_path))
            for i in range(doc.page_count()):
                text = doc.extract_text(i)
                writer.writerow([pdf_path.name, i, text])
        except PdfError as e:
            # Keep going, but say which file was skipped instead of hiding it.
            print(f"Failed: {pdf_path.name}: {e}")

일괄 병합

디렉터리의 모든 PDF를 하나로 합치기:

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Sorted so the merge order is deterministic.
pdf_files = sorted(Path("reports/").glob("*.pdf"))

# The first file that opens successfully becomes the base document; every later
# file is merged into it. An unreadable file is skipped wherever it appears,
# including in first position, instead of aborting the whole merge.
doc = None
for pdf_path in pdf_files:
    try:
        if doc is None:
            doc = PdfDocument(str(pdf_path))
        else:
            doc.merge_from(str(pdf_path))
    except PdfError as e:
        print(f"Skipped {pdf_path.name}: {e}")

# Nothing is written when no file could be opened.
if doc is not None:
    doc.save("all-reports.pdf")

Rust

use pdf_oxide::editor::DocumentEditor;

// Collect the *.pdf paths in reports/, ignoring directory entries that fail to read.
let mut files: Vec<std::path::PathBuf> = Vec::new();
for entry in std::fs::read_dir("reports/")? {
    if let Ok(entry) = entry {
        let path = entry.path();
        if path.extension().map_or(false, |ext| ext == "pdf") {
            files.push(path);
        }
    }
}
// Sort by path so the merge order is deterministic.
files.sort();

// The first file is the base document; the rest are merged into it.
if let Some((first, rest)) = files.split_first() {
    let mut editor = DocumentEditor::open(first.to_str().unwrap())?;
    for path in rest {
        // A file that fails to merge is reported and skipped.
        if let Err(e) = editor.merge_from(path.to_str().unwrap()) {
            println!("Skipped {}: {}", path.display(), e);
        }
    }
    editor.save("all-reports.pdf")?;
}

// Go
files, err := filepath.Glob("reports/*.pdf")
if err != nil { log.Fatal(err) }
sort.Strings(files)

// Top-level Merge concatenates every file in one call
merged, err := pdfoxide.Merge(files)
if err != nil { log.Fatal(err) }
// A failed write is fatal too; ignoring it would leave no output file and no
// indication why.
if err := os.WriteFile("all-reports.pdf", merged, 0644); err != nil { log.Fatal(err) }

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <algorithm>
#include <filesystem>
#include <fstream>
#include <ios>
#include <string>
#include <vector>

namespace fs = std::filesystem;

// Collect the *.pdf paths in reports/ and sort them so the merge order is
// deterministic (directory iteration order is unspecified).
std::vector<std::string> files;
for (const auto& entry : fs::directory_iterator("reports/"))
    if (entry.path().extension() == ".pdf") files.push_back(entry.path().string());
std::sort(files.begin(), files.end());

// Top-level merge concatenates every file in one call
auto bytes = pdf_oxide::merge(files);
// write() takes a std::streamsize count, so convert the size explicitly.
std::ofstream("all-reports.pdf", std::ios::binary).write(
    reinterpret_cast<const char*>(bytes.data()),
    static_cast<std::streamsize>(bytes.size()));

Swift

import PdfOxide
import Foundation

let dir = "reports/"
// Sorted listing of reports/, keeping only names ending in ".pdf" and
// prefixing each with the directory.
var files: [String] = []
for name in try FileManager.default.contentsOfDirectory(atPath: dir).sorted()
where name.hasSuffix(".pdf") {
    files.append(dir + name)
}

// Top-level merge concatenates every file in one call
let bytes = try merge(files)
let outputURL = URL(fileURLWithPath: "all-reports.pdf")
try Data(bytes).write(to: outputURL)

Dart

import 'dart:io';
import 'dart:typed_data';
import 'package:pdf_oxide/pdf_oxide.dart';

// Paths of the entries in reports/ whose path ends in ".pdf", sorted so the
// merge order is deterministic.
final files = <String>[
  for (final entity in Directory('reports/').listSync())
    if (entity.path.endsWith('.pdf')) entity.path,
]..sort();

// Top-level merge concatenates every file in one call
final Uint8List bytes = pdfMerge(files);
final output = File('all-reports.pdf');
output.writeAsBytesSync(bytes);

# R
library(pdfoxide)

# Match the PDFs in reports/ and sort the paths so the merge order is fixed.
files <- Sys.glob("reports/*.pdf")
files <- sort(files)

# Top-level merge concatenates every file in one call
merged <- pdf_merge(files)
writeBin(merged, "all-reports.pdf")

Julia

using PdfOxide

# Full paths of the *.pdf entries in reports/, sorted so the merge order is fixed.
pdf_paths = [path for path in readdir("reports/"; join = true) if endswith(path, ".pdf")]
files = sort(pdf_paths)

# Top-level merge concatenates every file in one call
bytes = merge_pdfs(files)
write("all-reports.pdf", bytes)

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

const a = std.heap.page_allocator;
var paths = std.ArrayList([*:0]const u8).init(a);
defer paths.deinit();
// Declared after the deinit defer so it runs first (defers unwind in reverse
// order): release every path allocated by joinZ below.
defer for (paths.items) |p| a.free(std.mem.span(p));

var dir = try std.fs.cwd().openDir("reports/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    // Only regular files; a directory whose name ends in ".pdf" is skipped.
    if (entry.kind != .file or !std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    try paths.append(try std.fs.path.joinZ(a, &.{ "reports/", entry.name }));
}

// Directory iteration order is unspecified, so sort the paths to make the
// merge order deterministic.
const byPath = struct {
    fn lessThan(_: void, lhs: [*:0]const u8, rhs: [*:0]const u8) bool {
        return std.mem.orderZ(u8, lhs, rhs) == .lt;
    }
}.lessThan;
std.mem.sort([*:0]const u8, paths.items, {}, byPath);

// Top-level merge concatenates every file in one call
const bytes = try pdf_oxide.merge(a, paths.items);
defer a.free(bytes);
try std.fs.cwd().writeFile(.{ .sub_path = "all-reports.pdf", .data = bytes });

Objective-C

#import "POXPdfOxide.h"

NSFileManager *fm = [NSFileManager defaultManager];
NSString *dir = @"reports/";

// List the directory and report a failure instead of discarding the error.
NSError *listError = nil;
NSArray<NSString *> *names = [fm contentsOfDirectoryAtPath:dir error:&listError];
if (!names) {
    NSLog(@"Cannot list %@: %@", dir, listError);
}

// Enumerating a nil array runs zero iterations, so no extra guard is needed.
NSMutableArray<NSString *> *files = [NSMutableArray array];
for (NSString *name in names) {
    if ([name hasSuffix:@".pdf"]) {
        [files addObject:[dir stringByAppendingPathComponent:name]];
    }
}
// Sort so the merge order is deterministic.
[files sortUsingSelector:@selector(compare:)];

// Top-level merge concatenates every file in one call
NSError *err = nil;
NSData *bytes = [POXPdf merge:files error:&err];
if (!bytes) {
    // Check the return value, not the error pointer, to detect failure.
    NSLog(@"Merge failed: %@", err);
} else if (![bytes writeToFile:@"all-reports.pdf" atomically:YES]) {
    NSLog(@"Could not write all-reports.pdf");
}

Elixir

# Sorted list of the PDFs in reports/, so the merge order is deterministic.
files = Enum.sort(Path.wildcard("reports/*.pdf"))

# Top-level merge concatenates every file in one call
{:ok, bytes} = PdfOxide.merge(files)
"all-reports.pdf" |> File.write!(bytes)

Python으로 PDF 일괄 처리하기

설치

순차 처리

모든 PDF에서 텍스트 추출

모든 PDF를 Markdown으로 변환

병렬 처리

multiprocessing 사용

concurrent.futures 사용

진행 상황 추적

간단한 카운터

tqdm 사용

손상된 파일의 오류 처리

메모리 효율적인 처리

일괄 병합

관련 페이지