Skip to content

Python으로 PDF 일괄 처리하기

오류 처리를 포함한 디렉터리 내 PDF 일괄 처리:

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Report the first-page text length of every PDF directly inside documents/.
pdf_paths = Path("documents/").glob("*.pdf")
for pdf_path in pdf_paths:
    try:
        text = PdfDocument(str(pdf_path)).extract_text(0)
    except PdfError as e:
        # A file that cannot be read is reported; the loop then moves on.
        print(f"Failed: {pdf_path.name}: {e}")
    else:
        print(f"{pdf_path.name}: {len(text)} chars")

WASM

import { WasmPdfDocument } from "pdf-oxide-wasm";

// Process multiple PDF buffers and log the first-page text length of each.
for (const { name, bytes } of pdfFiles) {
    // Declared outside `try` so `finally` can release it on every path.
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const text = doc.extractText(0);
        console.log(`${name}: ${text.length} chars`);
    } catch (e) {
        console.error(`Failed: ${name}: ${e.message}`);
    } finally {
        // Previously free() was only reached on success; release the
        // document when extraction throws as well.
        if (doc) doc.free();
    }
}

Rust

use pdf_oxide::PdfDocument;
use std::path::Path;

// Scan documents/ and report the first-page text length of every *.pdf file.
// I/O errors while listing the directory still propagate with `?`; per-file
// PDF failures are reported and the loop continues.
for entry in std::fs::read_dir("documents/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        // A non-UTF-8 file name cannot be passed as &str; report it instead of panicking.
        let path_str = match path.to_str() {
            Some(s) => s,
            None => {
                println!("Failed: {}: path is not valid UTF-8", path.display());
                continue;
            }
        };
        match PdfDocument::open(path_str) {
            Ok(mut doc) => match doc.extract_text(0) {
                Ok(text) => println!("{}: {} chars", path.display(), text.len()),
                // Handled per file: using `?` here would end the whole batch.
                Err(e) => println!("Failed: {}: {}", path.display(), e),
            },
            Err(e) => println!("Failed: {}: {}", path.display(), e),
        }
    }
}

Go

package main

import (
    "fmt"
    "log"
    "path/filepath"
    pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

// main prints the first-page character count of every PDF in documents/.
// Failures are logged per file and do not stop the batch.
func main() {
    matches, err := filepath.Glob("documents/*.pdf")
    if err != nil {
        // Surface a Glob failure instead of silently processing nothing.
        log.Fatalf("Failed: glob: %v", err)
    }
    for _, p := range matches {
        doc, err := pdfoxide.Open(p)
        if err != nil {
            log.Printf("Failed: %s: %v", p, err)
            continue
        }
        text, err := doc.ExtractText(0)
        // Close right after the last use; a defer inside the loop would
        // keep every document open until main returns.
        doc.Close()
        if err != nil {
            log.Printf("Failed: %s: %v", p, err)
            continue
        }
        fmt.Printf("%s: %d chars\n", filepath.Base(p), len(text))
    }
}

C#

using PdfOxide;

// Report the first-page character count for each *.pdf in documents/.
string[] pdfPaths = Directory.GetFiles("documents/", "*.pdf");
foreach (var p in pdfPaths)
{
    try
    {
        // The using block disposes the document before the next iteration.
        using (var doc = PdfDocument.Open(p))
        {
            var text = doc.ExtractText(0);
            Console.WriteLine($"{Path.GetFileName(p)}: {text.Length} chars");
        }
    }
    catch (Exception e)
    {
        // Any failure is written to stderr and the loop continues.
        Console.Error.WriteLine($"Failed: {p}: {e.Message}");
    }
}

Java

import fyi.oxide.pdf.PdfDocument;
import java.io.File;

for (File f : new File("documents/").listFiles((d, n) -> n.endsWith(".pdf"))) {
    try (PdfDocument doc = PdfDocument.open(f.toPath())) {
        String text = doc.extractText(0);
        System.out.printf("%s: %d chars%n", f.getName(), text.length());
    } catch (Exception e) {
        System.out.printf("Failed: %s: %s%n", f.getName(), e.getMessage());
    }
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.io.File

// Print the first-page text length of every .pdf in documents/.
// orEmpty() keeps the "null listing means nothing to do" behaviour.
val pdfs = File("documents/").listFiles { _, n -> n.endsWith(".pdf") }.orEmpty()
for (f in pdfs) {
    try {
        // use { } closes the document when the lambda finishes or throws.
        PdfDocument.open(f.toPath()).use { doc ->
            val text = doc.extractText(0)
            println("${f.name}: ${text.length} chars")
        }
    } catch (e: Exception) {
        println("Failed: ${f.name}: ${e.message}")
    }
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.Using
import java.io.File

// Print the first-page text length of every .pdf in documents/.
// Option(...) turns the null that listFiles returns for a missing or
// unreadable directory into an empty batch.
val pdfs = Option(new File("documents/").listFiles((_, n) => n.endsWith(".pdf")))
  .getOrElse(Array.empty[File])
for (f <- pdfs) {
  // Without this try/catch a single unreadable PDF would end the whole loop.
  try {
    Using.resource(PdfDocument.open(f.getPath)) { doc =>
      val text = doc.extractText(0)
      println(s"${f.getName}: ${text.length} chars")
    }
  } catch {
    case e: Exception => println(s"Failed: ${f.getName}: ${e.getMessage}")
  }
}

Clojure

(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])

;; Print the first-page text length of every .pdf in documents/.
(doseq [f (filter #(.endsWith (.getName %) ".pdf")
                  (.listFiles (io/file "documents/")))]
  (let [fname (.getName f)]
    (try
      ;; with-open closes the document when the body exits, normally or not.
      (with-open [doc (pdf/open (.getPath f))]
        (println (str fname ": " (count (pdf/extract-text doc 0)) " chars")))
      (catch Exception e
        (println (str "Failed: " fname ": " (.getMessage e)))))))

Ruby

require 'pdf_oxide'

# Print the first-page text length of every PDF matched by documents/*.pdf.
Dir.glob('documents/*.pdf').each do |path|
  PdfOxide::PdfDocument.open(path) do |doc|
    text = doc.extract_text(0)
    puts "#{File.basename(path)}: #{text.length} chars"
  end
rescue PdfOxide::Error => e
  # Block-level rescue (Ruby 2.5+): report the file and continue with the next.
  puts "Failed: #{File.basename(path)}: #{e.message}"
end

PHP

use PdfOxide\PdfDocument;

// Print the first-page text length of every PDF matched by documents/*.pdf.
// glob() returns false on error; `?: []` keeps foreach from iterating a boolean.
foreach (glob('documents/*.pdf') ?: [] as $path) {
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $text = $doc->extractText(0);
        printf("%s: %d chars\n", basename($path), strlen($text));
    } catch (\Throwable $e) {
        printf("Failed: %s: %s\n", basename($path), $e->getMessage());
    } finally {
        // close() used to be skipped whenever extractText() threw.
        if ($doc !== null) {
            $doc->close();
        }
    }
}

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <filesystem>
#include <iostream>

namespace fs = std::filesystem;
// Print the first-page text size of every .pdf entry in documents/.
for (const fs::directory_entry& entry : fs::directory_iterator("documents/")) {
    if (entry.path().extension() == ".pdf") {
        try {
            auto doc = pdf_oxide::Document::open(entry.path().string());
            const auto text = doc.extract_text(0);
            std::cout << entry.path().filename() << ": " << text.size() << " chars\n";
        } catch (const pdf_oxide::Error& e) {
            // Report the file on stderr and continue with the next entry.
            std::cerr << "Failed: " << entry.path().filename() << ": " << e.what() << "\n";
        }
    }
}

Swift

import PdfOxide
import Foundation

// Print the first-page character count of every .pdf in documents/.
let dir = "documents/"
let pdfNames = try FileManager.default.contentsOfDirectory(atPath: dir)
    .filter { $0.hasSuffix(".pdf") }
for name in pdfNames {
    do {
        let text = try Document.open(dir + name).extractText(0)
        print("\(name): \(text.count) chars")
    } catch {
        // Either the open or the extraction threw; report and continue.
        print("Failed: \(name): \(error)")
    }
}

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Print the first-page text length of every .pdf entry in documents/.
for (final entry in Directory('documents/').listSync()) {
  if (!entry.path.endsWith('.pdf')) continue;
  // Nullable so `finally` can tell whether the open succeeded.
  PdfDocument? doc;
  try {
    doc = PdfDocument.open(entry.path);
    final text = doc.extractText(0);
    print('${entry.uri.pathSegments.last}: ${text.length} chars');
  } catch (e) {
    print('Failed: ${entry.path}: $e');
  } finally {
    // close() used to be skipped whenever extractText() threw.
    doc?.close();
  }
}

R

library(pdfoxide)

# Print the first-page character count for each PDF matched by documents/*.pdf.
pdf_paths <- Sys.glob("documents/*.pdf")
for (path in pdf_paths) {
  tryCatch(
    expr = {
      doc  <- pdf_open(path)
      text <- pdf_extract_text(doc, 0)
      cat(sprintf("%s: %d chars\n", basename(path), nchar(text)))
    },
    # Any R error raised above is reported and the loop continues.
    error = function(e) {
      cat(sprintf("Failed: %s: %s\n", basename(path), conditionMessage(e)))
    }
  )
}

Julia

using PdfOxide

# Print the first-page text length of every .pdf directly inside documents/.
for path in readdir("documents/"; join = true)
    # Same selection as filtering the listing up front.
    endswith(path, ".pdf") || continue
    try
        text = extract_text(open_document(path), 0)
        println("$(basename(path)): $(length(text)) chars")
    catch e
        println("Failed: $(basename(path)): $e")
    end
end

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

// Print the first-page text length of every .pdf entry in documents/.
const a = std.heap.page_allocator;
var dir = try std.fs.cwd().openDir("documents/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "documents/", entry.name });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch |e| {
        std.debug.print("Failed: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer doc.deinit();
    // Handled like an open failure: `try` here would end the whole batch
    // on the first page that cannot be extracted.
    const text = doc.extractText(a, 0) catch |e| {
        std.debug.print("Failed: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer a.free(text);
    std.debug.print("{s}: {d} chars\n", .{ entry.name, text.len });
}

Objective-C

#import "POXPdfOxide.h"

// Log the first-page text length of every .pdf in documents/.
NSFileManager *fm = [NSFileManager defaultManager];
NSError *listErr = nil;
NSArray<NSString *> *names = [fm contentsOfDirectoryAtPath:@"documents/" error:&listErr];
if (!names) {
    // Check the return value, not the error pointer; a nil listing means the
    // directory could not be read, and the loop below is then a no-op.
    NSLog(@"Failed: %@: %@", @"documents/", listErr.localizedDescription);
}
for (NSString *name in names) {
    if (![name hasSuffix:@".pdf"]) continue;
    NSError *err = nil;
    NSString *path = [@"documents/" stringByAppendingString:name];
    POXDocument *doc = [POXDocument openPath:path error:&err];
    if (!doc) { NSLog(@"Failed: %@: %@", name, err.localizedDescription); continue; }
    NSString *text = [doc extractText:0 error:&err];
    // A nil result previously fell through and was logged as "0 chars".
    if (!text) { NSLog(@"Failed: %@: %@", name, err.localizedDescription); continue; }
    NSLog(@"%@: %lu chars", name, (unsigned long)text.length);
}

Elixir

# Print the first-page text length of every PDF matched by documents/*.pdf.
Path.wildcard("documents/*.pdf")
|> Enum.each(fn path ->
  # `with` sends a failure from either step to the same branch, so a page
  # that cannot be extracted no longer raises MatchError and ends the batch.
  with {:ok, doc} <- PdfOxide.open(path),
       {:ok, text} <- PdfOxide.extract_text(doc, 0) do
    IO.puts("#{Path.basename(path)}: #{String.length(text)} chars")
  else
    {:error, reason} ->
      IO.puts("Failed: #{Path.basename(path)}: #{inspect(reason)}")
  end
end)

페이지당 0.8ms 속도로 PDF Oxide는 3,830개의 PDF를 3.1초 안에 처리합니다.

설치

pip install pdf_oxide

순차 처리

모든 PDF에서 텍스트 추출

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Collect the full text of every invoice PDF, keyed by file name.
pdf_dir = Path("invoices/")
results = {}

for pdf_path in sorted(pdf_dir.glob("*.pdf")):
    try:
        doc = PdfDocument(str(pdf_path))
        # The entry is stored only if every page extracts without error.
        results[pdf_path.name] = "\n".join(
            doc.extract_text(i) for i in range(doc.page_count())
        )
    except PdfError as e:
        print(f"Error: {pdf_path.name}: {e}")

print(f"Processed {len(results)} PDFs")

WASM

// Collect the full text of every PDF buffer, keyed by name.
const results = new Map();
for (const { name, bytes } of pdfFiles) {
    // Declared outside `try` so `finally` can release it on every path.
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const text = doc.extractAllText();
        results.set(name, text);
    } catch (e) {
        console.error(`Error: ${name}: ${e.message}`);
    } finally {
        // Previously free() was skipped whenever extractAllText() threw.
        if (doc) doc.free();
    }
}
console.log(`Processed ${results.size} PDFs`);

Rust

use std::collections::HashMap;

// Collect the text of every PDF in invoices/, keyed by its displayed path.
// Failures are reported on stderr instead of being silently skipped.
let mut results: HashMap<String, String> = HashMap::new();
for entry in std::fs::read_dir("invoices/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        let name = path.display().to_string();
        // A non-UTF-8 file name cannot be passed as &str; report it instead of panicking.
        let path_str = match path.to_str() {
            Some(s) => s,
            None => {
                eprintln!("Error: {}: path is not valid UTF-8", name);
                continue;
            }
        };
        match PdfDocument::open(path_str) {
            Ok(mut doc) => {
                let mut pages = Vec::new();
                for i in 0..doc.page_count().unwrap_or(0) {
                    match doc.extract_text(i) {
                        Ok(text) => pages.push(text),
                        // The page is still omitted, but no longer silently.
                        Err(e) => eprintln!("Error: {} page {}: {}", name, i, e),
                    }
                }
                results.insert(name, pages.join("\n"));
            }
            Err(e) => eprintln!("Error: {}: {}", name, e),
        }
    }
}
println!("Processed {} PDFs", results.len());

Go

// Collect the full text of every PDF in invoices/, keyed by base name.
results := make(map[string]string)
matches, err := filepath.Glob("invoices/*.pdf")
if err != nil {
    // Surface a Glob failure instead of silently processing nothing.
    log.Fatalf("Error: glob: %v", err)
}
sort.Strings(matches)

for _, p := range matches {
    doc, err := pdfoxide.Open(p)
    if err != nil { log.Printf("Error: %s: %v", p, err); continue }

    full, err := doc.ExtractAllText()
    doc.Close()
    // A failed extraction used to be stored as an empty string and counted
    // as processed; now it is logged and left out of results.
    if err != nil { log.Printf("Error: %s: %v", p, err); continue }
    results[filepath.Base(p)] = full
}
fmt.Printf("Processed %d PDFs\n", len(results))

C#

// Collect the text of every PDF in invoices/, keyed by file name.
var results = new Dictionary<string, string>();
var ordered = Directory.GetFiles("invoices/", "*.pdf").OrderBy(x => x);
foreach (var p in ordered)
{
    try
    {
        using (var doc = PdfDocument.Open(p))
        {
            var sb = new StringBuilder();
            var i = 0;
            while (i < doc.PageCount)
            {
                // AppendLine terminates each page's text with a newline.
                sb.AppendLine(doc.ExtractText(i));
                i++;
            }
            results[Path.GetFileName(p)] = sb.ToString();
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine($"Error: {p}: {e.Message}");
    }
}
Console.WriteLine($"Processed {results.Count} PDFs");

Java

import fyi.oxide.pdf.PdfDocument;
import java.io.File;
import java.util.*;

// Collect the text of every PDF in invoices/, keyed by file name.
Map<String, String> results = new HashMap<>();
File[] files = new File("invoices/").listFiles((d, n) -> n.endsWith(".pdf"));
// listFiles returns null for a missing or unreadable directory, and
// Arrays.sort(null) would throw NullPointerException.
if (files == null) files = new File[0];
Arrays.sort(files);
for (File f : files) {
    try (PdfDocument doc = PdfDocument.open(f.toPath())) {
        StringBuilder all = new StringBuilder();
        for (int i = 0; i < doc.pageCount(); i++) {
            // Separate pages with a newline, as the Python example does, so
            // the last word of one page does not fuse with the next page.
            if (i > 0) all.append('\n');
            all.append(doc.extractText(i));
        }
        results.put(f.getName(), all.toString());
    } catch (Exception e) {
        System.err.printf("Error: %s: %s%n", f.getName(), e.getMessage());
    }
}
System.out.printf("Processed %d PDFs%n", results.size());

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.io.File

// Collect the text of every PDF in invoices/, keyed by file name.
val results = mutableMapOf<String, String>()
File("invoices/").listFiles { _, n -> n.endsWith(".pdf") }?.sorted()?.forEach { f ->
    try {
        PdfDocument.open(f.toPath()).use { doc ->
            // Join pages with a newline, as the Python example does, so text
            // from adjacent pages does not run together.
            val all = (0 until doc.pageCount()).joinToString("\n") { doc.extractText(it) }
            results[f.name] = all
        }
    } catch (e: Exception) {
        System.err.println("Error: ${f.name}: ${e.message}")
    }
}
println("Processed ${results.size} PDFs")

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.Using
import java.io.File

// Collect the text of every PDF in invoices/, keyed by file name.
val results = scala.collection.mutable.Map.empty[String, String]
// Option(...) turns the null that listFiles returns for a missing or
// unreadable directory into an empty batch.
val pdfs = Option(new File("invoices/").listFiles((_, n) => n.endsWith(".pdf")))
  .getOrElse(Array.empty[File])
  .sorted
for (f <- pdfs) {
  // Without this try/catch a single unreadable PDF would end the whole loop.
  try {
    Using.resource(PdfDocument.open(f.getPath)) { doc =>
      // Newline between pages, matching the Python example.
      val all = (0 until doc.pageCount()).map(i => doc.extractText(i)).mkString("\n")
      results(f.getName) = all
    }
  } catch {
    case e: Exception => System.err.println(s"Error: ${f.getName}: ${e.getMessage}")
  }
}
println(s"Processed ${results.size} PDFs")

Clojure

(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])

;; Collect the text of every PDF in invoices/, keyed by file name.
(let [results (atom {})]
  (doseq [f (sort (filter #(.endsWith (.getName %) ".pdf")
                          (.listFiles (io/file "invoices/"))))]
    ;; Without try/catch one unreadable PDF would abort the whole doseq.
    (try
      (with-open [doc (pdf/open (.getPath f))]
        (let [pages (map #(pdf/extract-text doc %)
                         (range (pdf/page-count doc)))
              ;; Newline between pages, matching the Python example.
              ;; apply realises the lazy seq while the document is still open.
              all   (apply str (interpose "\n" pages))]
          (swap! results assoc (.getName f) all)))
      (catch Exception e
        (binding [*out* *err*]
          (println (str "Error: " (.getName f) ": " (.getMessage e)))))))
  (println (str "Processed " (count @results) " PDFs")))

Ruby

require 'pdf_oxide'

# Collect the text of every PDF in invoices/, keyed by file name.
results = {}
Dir.glob('invoices/*.pdf').sort.each do |path|
  begin
    PdfOxide::PdfDocument.open(path) do |doc|
      # Join pages with a newline, as the Python example does, so text from
      # adjacent pages does not run together.
      all = (0...doc.page_count).map { |i| doc.extract_text(i) }.join("\n")
      results[File.basename(path)] = all
    end
  rescue PdfOxide::Error => e
    warn "Error: #{File.basename(path)}: #{e.message}"
  end
end
puts "Processed #{results.size} PDFs"

PHP

use PdfOxide\PdfDocument;

// Collect the text of every PDF in invoices/, keyed by base name.
$results = [];
// glob() returns false on error; sort() and foreach need an array.
$files = glob('invoices/*.pdf') ?: [];
sort($files);
foreach ($files as $path) {
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $pages = [];
        for ($i = 0; $i < $doc->pageCount(); $i++) { $pages[] = $doc->extractText($i); }
        // Newline between pages, matching the Python example.
        $results[basename($path)] = implode("\n", $pages);
    } catch (\Throwable $e) {
        fwrite(STDERR, "Error: " . basename($path) . ": " . $e->getMessage() . "\n");
    } finally {
        // close() used to be skipped whenever a page extraction threw.
        if ($doc !== null) {
            $doc->close();
        }
    }
}
printf("Processed %d PDFs\n", count($results));

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <filesystem>
#include <map>

namespace fs = std::filesystem;
// Collect the full text of every .pdf entry in invoices/, keyed by file name.
std::map<std::string, std::string> results;
for (const fs::directory_entry& entry : fs::directory_iterator("invoices/")) {
    if (entry.path().extension() == ".pdf") {
        try {
            auto doc = pdf_oxide::Document::open(entry.path().string());
            results[entry.path().filename().string()] = doc.extract_all_text();
        } catch (const pdf_oxide::Error& e) {
            // Report the file on stderr and continue with the next entry.
            std::cerr << "Error: " << entry.path().filename() << ": " << e.what() << "\n";
        }
    }
}
std::cout << "Processed " << results.size() << " PDFs\n";

Swift

import PdfOxide
import Foundation

// Collect the full text of every .pdf in invoices/, keyed by file name.
var results: [String: String] = [:]
let dir = "invoices/"
let pdfNames = try FileManager.default.contentsOfDirectory(atPath: dir)
    .sorted()
    .filter { $0.hasSuffix(".pdf") }
for name in pdfNames {
    do {
        let doc = try Document.open(dir + name)
        results[name] = try doc.extractAllText()
    } catch {
        // Failures go to standard error; the loop continues.
        FileHandle.standardError.write("Error: \(name): \(error)\n".data(using: .utf8)!)
    }
}
print("Processed \(results.count) PDFs")

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Collect the full text of every .pdf entry in invoices/, keyed by file name.
final results = <String, String>{};
final files = Directory('invoices/').listSync().where((e) => e.path.endsWith('.pdf')).toList()
  ..sort((a, b) => a.path.compareTo(b.path));
for (final entry in files) {
  // Nullable so `finally` can tell whether the open succeeded.
  PdfDocument? doc;
  try {
    doc = PdfDocument.open(entry.path);
    results[entry.uri.pathSegments.last] = doc.extractAllText();
  } catch (e) {
    stderr.writeln('Error: ${entry.path}: $e');
  } finally {
    // close() used to be skipped whenever extractAllText() threw.
    doc?.close();
  }
}
print('Processed ${results.length} PDFs');

R

library(pdfoxide)

# Collect the full text of every PDF matched by invoices/*.pdf, keyed by base name.
results <- list()
pdf_paths <- sort(Sys.glob("invoices/*.pdf"))
for (path in pdf_paths) {
  tryCatch(
    expr = {
      doc <- pdf_open(path)
      results[[basename(path)]] <- pdf_extract_all_text(doc)
    },
    # Any R error raised above is reported via message() and the loop continues.
    error = function(e) {
      message(sprintf("Error: %s: %s", basename(path), conditionMessage(e)))
    }
  )
}
cat(sprintf("Processed %d PDFs\n", length(results)))

Julia

using PdfOxide

# Collect the full text of every .pdf in invoices/, keyed by base name.
results = Dict{String,String}()
pdf_paths = sort([p for p in readdir("invoices/"; join = true) if endswith(p, ".pdf")])
for path in pdf_paths
    try
        results[basename(path)] = extract_all_text(open_document(path))
    catch e
        @warn "Error: $(basename(path)): $e"
    end
end
println("Processed $(length(results)) PDFs")

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

// Collect the full text of every .pdf entry in invoices/, keyed by file name.
const a = std.heap.page_allocator;
var results = std.StringHashMap([]const u8).init(a);
var dir = try std.fs.cwd().openDir("invoices/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "invoices/", entry.name });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch |e| {
        std.debug.print("Error: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer doc.deinit();
    // Handled like an open failure: `try` here would end the whole batch
    // on the first document whose text cannot be extracted.
    const all = doc.extractAllText(a) catch |e| {
        std.debug.print("Error: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    // The key is duplicated before being stored in the map.
    try results.put(try a.dupe(u8, entry.name), all);
}
std.debug.print("Processed {d} PDFs\n", .{results.count()});

Objective-C

#import "POXPdfOxide.h"

// Collect the full text of every .pdf in invoices/, keyed by file name.
NSFileManager *fm = [NSFileManager defaultManager];
NSMutableDictionary<NSString *, NSString *> *results = [NSMutableDictionary dictionary];
NSError *listErr = nil;
NSArray<NSString *> *listing = [fm contentsOfDirectoryAtPath:@"invoices/" error:&listErr];
if (!listing) {
    // Check the return value, not the error pointer; messaging nil below
    // yields nil, so the loop is then a no-op.
    NSLog(@"Error: %@: %@", @"invoices/", listErr.localizedDescription);
}
NSArray<NSString *> *names = [listing sortedArrayUsingSelector:@selector(compare:)];
for (NSString *name in names) {
    if (![name hasSuffix:@".pdf"]) continue;
    NSError *err = nil;
    POXDocument *doc = [POXDocument openPath:[@"invoices/" stringByAppendingString:name] error:&err];
    if (!doc) { NSLog(@"Error: %@: %@", name, err.localizedDescription); continue; }
    NSString *all = [doc extractAllTextWithError:&err];
    // A nil result used to be assigned straight into the dictionary and the
    // error was never reported.
    if (!all) { NSLog(@"Error: %@: %@", name, err.localizedDescription); continue; }
    results[name] = all;
}
NSLog(@"Processed %lu PDFs", (unsigned long)results.count);

Elixir

# Collect the text of every PDF matched by invoices/*.pdf, keyed by base name.
results =
  "invoices/*.pdf"
  |> Path.wildcard()
  |> Enum.sort()
  |> Enum.reduce(%{}, fn path, acc ->
    with {:ok, doc} <- PdfOxide.open(path),
         {:ok, n} <- PdfOxide.page_count(doc) do
      # `0..(n - 1)` is the descending range 0, -1 when n == 0, which would
      # request pages 0 and -1 of an empty document. Guard that case.
      indices = if n > 0, do: 0..(n - 1), else: []

      all =
        Enum.map_join(indices, "\n", fn i ->
          # elem(result, 1) used to turn an {:error, reason} into page text.
          case PdfOxide.extract_text(doc, i) do
            {:ok, text} ->
              text

            {:error, reason} ->
              IO.puts(:stderr, "Error: #{Path.basename(path)} page #{i}: #{inspect(reason)}")
              ""
          end
        end)

      Map.put(acc, Path.basename(path), all)
    else
      {:error, reason} ->
        IO.puts(:stderr, "Error: #{Path.basename(path)}: #{inspect(reason)}")
        acc
    end
  end)

IO.puts("Processed #{map_size(results)} PDFs")

모든 PDF를 Markdown으로 변환

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Convert every PDF in papers/ to a Markdown file of the same stem in markdown/.
input_dir = Path("papers/")
output_dir = Path("markdown/")
output_dir.mkdir(exist_ok=True)

for pdf_path in input_dir.glob("*.pdf"):
    try:
        md = PdfDocument(str(pdf_path)).to_markdown_all(
            detect_headings=True, include_images=False
        )
        (output_dir / f"{pdf_path.stem}.md").write_text(md, encoding="utf-8")
    except PdfError as e:
        # Unconvertible files are reported and skipped.
        print(f"Skipped {pdf_path.name}: {e}")

WASM

// Convert every PDF buffer to Markdown and log the converted length.
for (const { name, bytes } of pdfFiles) {
    // Declared outside `try` so `finally` can release it on every path.
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const md = doc.toMarkdownAll();
        console.log(`Converted ${name}: ${md.length} chars`);
    } catch (e) {
        console.error(`Skipped ${name}: ${e.message}`);
    } finally {
        // Previously free() was skipped whenever toMarkdownAll() threw.
        if (doc) doc.free();
    }
}

Rust

for entry in std::fs::read_dir("papers/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        if let Ok(mut doc) = PdfDocument::open(path.to_str().unwrap()) {
            if let Ok(md) = doc.to_markdown_all(true) {
                let out = path.with_extension("md");
                std::fs::write(&out, &md)?;
            }
        }
    }
}

Go

_ = os.MkdirAll("markdown/", 0755)
matches, _ := filepath.Glob("papers/*.pdf")
for _, p := range matches {
    doc, err := pdfoxide.Open(p)
    if err != nil { log.Printf("Skipped %s: %v", p, err); continue }

    md, _ := doc.ToMarkdownAll()
    doc.Close()

    out := filepath.Join("markdown", strings.TrimSuffix(filepath.Base(p), ".pdf") + ".md")
    _ = os.WriteFile(out, []byte(md), 0644)
}

C#

// Convert every *.pdf in papers/ to a Markdown file in markdown/.
Directory.CreateDirectory("markdown/");
string[] pdfPaths = Directory.GetFiles("papers/", "*.pdf");
foreach (var p in pdfPaths)
{
    try
    {
        // The using block disposes the document before the next iteration.
        using (var doc = PdfDocument.Open(p))
        {
            var md = doc.ToMarkdownAll();
            var mdName = Path.GetFileNameWithoutExtension(p) + ".md";
            File.WriteAllText(Path.Combine("markdown", mdName), md);
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine($"Skipped {p}: {e.Message}");
    }
}

Java

import fyi.oxide.pdf.PdfDocument;
import java.io.File;
import java.nio.file.*;

// Convert every .pdf in papers/ to a Markdown file in markdown/.
Files.createDirectories(Path.of("markdown"));
File[] pdfs = new File("papers/").listFiles((d, n) -> n.endsWith(".pdf"));
// listFiles returns null when the directory is missing or unreadable;
// treat that as an empty batch instead of hitting a NullPointerException.
if (pdfs == null) pdfs = new File[0];
for (File f : pdfs) {
    try (PdfDocument doc = PdfDocument.open(f.toPath())) {
        String md = doc.toMarkdown();   // no-arg = whole document
        String base = f.getName().replaceFirst("\\.pdf$", ".md");
        Files.writeString(Path.of("markdown", base), md);
    } catch (Exception e) {
        System.err.printf("Skipped %s: %s%n", f.getName(), e.getMessage());
    }
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.io.File

// Convert every .pdf in papers/ to a Markdown file in markdown/.
File("markdown").mkdirs()
// orEmpty() keeps the "null listing means nothing to do" behaviour.
val pdfs = File("papers/").listFiles { _, n -> n.endsWith(".pdf") }.orEmpty()
for (f in pdfs) {
    try {
        PdfDocument.open(f.toPath()).use { doc ->
            val md = doc.toMarkdown()   // no-arg = whole document
            val target = File("markdown", f.name.removeSuffix(".pdf") + ".md")
            target.writeText(md)
        }
    } catch (e: Exception) {
        System.err.println("Skipped ${f.name}: ${e.message}")
    }
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.Using
import java.io.File
import java.nio.file.{Files, Path}

// Convert every .pdf in papers/ to a Markdown file in markdown/.
Files.createDirectories(Path.of("markdown"))
// Option(...) turns the null that listFiles returns for a missing or
// unreadable directory into an empty batch.
val pdfs = Option(new File("papers/").listFiles((_, n) => n.endsWith(".pdf")))
  .getOrElse(Array.empty[File])
for (f <- pdfs) {
  // Without this try/catch a single unreadable PDF would end the whole loop.
  try {
    Using.resource(PdfDocument.open(f.getPath)) { doc =>
      val md = doc.toMarkdown()   // no-arg = whole document
      Files.writeString(Path.of("markdown", f.getName.stripSuffix(".pdf") + ".md"), md)
    }
  } catch {
    case e: Exception => System.err.println(s"Skipped ${f.getName}: ${e.getMessage}")
  }
}

Clojure

(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])

;; Convert every .pdf in papers/ to a Markdown file in markdown/.
(.mkdirs (io/file "markdown"))
(doseq [f (filter #(.endsWith (.getName %) ".pdf")
                  (.listFiles (io/file "papers/")))]
  (try
    (with-open [doc (pdf/open (.getPath f))]
      (let [md   (pdf/to-markdown doc)   ; no page arg = whole document
            ;; String.replaceFirst avoids clojure.string, which this snippet
            ;; never requires.
            base (.replaceFirst ^String (.getName f) "\\.pdf$" ".md")]
        (spit (io/file "markdown" base) md)))
    (catch Exception e
      (println (str "Skipped " (.getName f) ": " (.getMessage e))))))

Ruby

require 'pdf_oxide'
require 'fileutils'

# Convert every PDF matched by papers/*.pdf to a Markdown file in markdown/.
FileUtils.mkdir_p('markdown')
Dir.glob('papers/*.pdf').each do |path|
  PdfOxide::PdfDocument.open(path) do |doc|
    md = doc.to_markdown    # nil arg = whole document
    out = File.join('markdown', File.basename(path, '.pdf') + '.md')
    File.write(out, md)
  end
rescue PdfOxide::Error => e
  # Block-level rescue (Ruby 2.5+): report the file and continue with the next.
  warn "Skipped #{File.basename(path)}: #{e.message}"
end

PHP

use PdfOxide\PdfDocument;

// Convert every PDF matched by papers/*.pdf to a Markdown file in markdown/.
// An explicit check replaces `@mkdir`, which also hid real failures such as
// missing permissions.
if (!is_dir('markdown') && !mkdir('markdown', 0777, true)) {
    fwrite(STDERR, "Cannot create markdown/\n");
}
// glob() returns false on error; `?: []` keeps foreach from iterating a boolean.
foreach (glob('papers/*.pdf') ?: [] as $path) {
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $md  = $doc->toMarkdownAll();
        $out = 'markdown/' . basename($path, '.pdf') . '.md';
        file_put_contents($out, $md);
    } catch (\Throwable $e) {
        fwrite(STDERR, "Skipped " . basename($path) . ": " . $e->getMessage() . "\n");
    } finally {
        // close() used to be skipped whenever the conversion threw.
        if ($doc !== null) {
            $doc->close();
        }
    }
}

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <filesystem>
#include <fstream>

namespace fs = std::filesystem;
// Convert every .pdf entry in papers/ to a Markdown file in markdown/.
fs::create_directories("markdown");
for (const fs::directory_entry& entry : fs::directory_iterator("papers/")) {
    if (entry.path().extension() == ".pdf") {
        try {
            auto doc = pdf_oxide::Document::open(entry.path().string());
            auto md  = doc.to_markdown_all();
            // Same file name with the extension swapped to .md.
            fs::path md_name = entry.path().filename();
            md_name.replace_extension(".md");
            std::ofstream(fs::path("markdown") / md_name) << md;
        } catch (const pdf_oxide::Error& e) {
            std::cerr << "Skipped " << entry.path().filename() << ": " << e.what() << "\n";
        }
    }
}

Swift

import PdfOxide
import Foundation

// Convert every .pdf in papers/ to a Markdown file in markdown/.
let dir = "papers/"
try FileManager.default.createDirectory(atPath: "markdown", withIntermediateDirectories: true)
for name in try FileManager.default.contentsOfDirectory(atPath: dir) where name.hasSuffix(".pdf") {
    do {
        let doc = try Document.open(dir + name)
        let md = try doc.toMarkdownAll()
        // Strip only the trailing ".pdf" (guaranteed by the `where` clause).
        // replacingOccurrences rewrote every ".pdf" inside the name, e.g.
        // "a.pdf.v2.pdf" became "a.md.v2.md".
        let out = "markdown/" + String(name.dropLast(".pdf".count)) + ".md"
        try md.write(toFile: out, atomically: true, encoding: .utf8)
    } catch {
        print("Skipped \(name): \(error)")
    }
}

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Convert every .pdf entry in papers/ to a Markdown file in markdown/.
Directory('markdown').createSync();
for (final entry in Directory('papers/').listSync()) {
  if (!entry.path.endsWith('.pdf')) continue;
  // Nullable so `finally` can tell whether the open succeeded.
  PdfDocument? doc;
  try {
    doc = PdfDocument.open(entry.path);
    final md = doc.toMarkdownAll();
    final fileName = entry.uri.pathSegments.last;
    // Strip only the trailing ".pdf"; replaceAll rewrote every ".pdf"
    // occurring anywhere in the name.
    final base = '${fileName.substring(0, fileName.length - '.pdf'.length)}.md';
    File('markdown/$base').writeAsStringSync(md);
  } catch (e) {
    print('Skipped ${entry.path}: $e');
  } finally {
    // close() used to be skipped whenever conversion or writing threw.
    doc?.close();
  }
}

R

library(pdfoxide)

# Convert every PDF matched by papers/*.pdf to a Markdown file in markdown/.
dir.create("markdown", showWarnings = FALSE)
pdf_paths <- Sys.glob("papers/*.pdf")
for (path in pdf_paths) {
  tryCatch(
    expr = {
      doc <- pdf_open(path)
      md  <- pdf_to_markdown_all(doc)
      # Swap only the trailing ".pdf" for ".md".
      out <- file.path("markdown", sub("\\.pdf$", ".md", basename(path)))
      writeLines(md, out)
    },
    error = function(e) {
      message(sprintf("Skipped %s: %s", basename(path), conditionMessage(e)))
    }
  )
}

Julia

using PdfOxide

# Convert every .pdf in papers/ to a Markdown file in markdown/.
mkpath("markdown")
for path in filter(p -> endswith(p, ".pdf"), readdir("papers/"; join = true))
    try
        doc = open_document(path)
        md  = to_markdown_all(doc)
        # splitext drops only the final extension; replace(...) rewrote every
        # ".pdf" occurring anywhere in the file name.
        out = joinpath("markdown", splitext(basename(path))[1] * ".md")
        write(out, md)
    catch e
        println("Skipped $(basename(path)): $e")
    end
end

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

// Convert every .pdf entry in papers/ to a Markdown file in markdown/.
const a = std.heap.page_allocator;
try std.fs.cwd().makePath("markdown");
var dir = try std.fs.cwd().openDir("papers/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "papers/", entry.name });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch |e| {
        std.debug.print("Skipped {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer doc.deinit();
    // Handled like an open failure: `try` here would end the whole batch
    // on the first document that cannot be converted.
    const md = doc.toMarkdownAll(a) catch |e| {
        std.debug.print("Skipped {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer a.free(md);
    // The name is known to end in ".pdf", so dropping 4 bytes removes the extension.
    const out = try std.fmt.allocPrint(a, "markdown/{s}.md", .{entry.name[0 .. entry.name.len - 4]});
    defer a.free(out);
    try std.fs.cwd().writeFile(.{ .sub_path = out, .data = md });
}

Objective-C

#import "POXPdfOxide.h"

// Convert every .pdf in papers/ to a Markdown file in markdown/.
NSFileManager *fm = [NSFileManager defaultManager];
NSError *dirErr = nil;
if (![fm createDirectoryAtPath:@"markdown" withIntermediateDirectories:YES attributes:nil error:&dirErr]) {
    // Each write below would fail too; report the root cause once here.
    NSLog(@"Cannot create markdown/: %@", dirErr.localizedDescription);
}
for (NSString *name in [fm contentsOfDirectoryAtPath:@"papers/" error:nil]) {
    if (![name hasSuffix:@".pdf"]) continue;
    NSError *err = nil;
    POXDocument *doc = [POXDocument openPath:[@"papers/" stringByAppendingString:name] error:&err];
    if (!doc) { NSLog(@"Skipped %@: %@", name, err.localizedDescription); continue; }
    NSString *md = [doc toMarkdownAllWithError:&err];
    // A nil result used to be messaged with writeToFile:, a silent no-op.
    if (!md) { NSLog(@"Skipped %@: %@", name, err.localizedDescription); continue; }
    NSString *base = [[name stringByDeletingPathExtension] stringByAppendingPathExtension:@"md"];
    NSString *outPath = [@"markdown" stringByAppendingPathComponent:base];
    // Check the BOOL return value rather than the error pointer.
    if (![md writeToFile:outPath atomically:YES encoding:NSUTF8StringEncoding error:&err]) {
        NSLog(@"Skipped %@: %@", name, err.localizedDescription);
    }
}

Elixir

# Convert every PDF matched by papers/*.pdf to a Markdown file in markdown/.
File.mkdir_p!("markdown")

"papers/*.pdf"
|> Path.wildcard()
|> Enum.each(fn path ->
  # `with` sends a failure from either step to the same branch, so a failed
  # conversion no longer raises MatchError and ends the batch.
  with {:ok, doc} <- PdfOxide.open(path),
       {:ok, md} <- PdfOxide.to_markdown_all(doc) do
    out = Path.join("markdown", Path.basename(path, ".pdf") <> ".md")
    File.write!(out, md)
  else
    {:error, reason} ->
      IO.puts(:stderr, "Skipped #{Path.basename(path)}: #{inspect(reason)}")
  end
end)

병렬 처리

고루틴과 태스크: Go에서는 각 PdfDocument가 독립적인 읽기 작업에 대해 고루틴 안전(goroutine-safe)하므로, 파일마다 고루틴을 하나씩 실행하면 됩니다. C#에서는 *Async 메서드와 Task.WhenAll을 조합하는 방식을 권장합니다(비동기 가이드 참조). 아래 예제는 Python을 중심으로 설명하지만, 워커 풀을 지원하는 언어라면 동일한 패턴을 적용할 수 있습니다.

multiprocessing 사용

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from multiprocessing import Pool

def process_pdf(pdf_path: str) -> dict:
    """Extract the text of every page of one PDF.

    Args:
        pdf_path: Path of the PDF to open.

    Returns:
        On success, a dict with "file", "text" (each page followed by a
        newline) and "pages". If a PdfError is raised, a dict with "file"
        and "error" (the error message).
    """
    try:
        doc = PdfDocument(pdf_path)
        chunks = [doc.extract_text(i) + "\n" for i in range(doc.page_count())]
        return {"file": pdf_path, "text": "".join(chunks), "pages": doc.page_count()}
    except PdfError as e:
        return {"file": pdf_path, "error": str(e)}

# Worker processes started with the "spawn" method (the default on Windows
# and macOS) re-import this module. Without the guard each worker would try
# to create its own Pool.
if __name__ == "__main__":
    pdf_files = [str(p) for p in Path("documents/").glob("*.pdf")]

    # One process_pdf call per file, distributed over the pool's workers.
    with Pool() as pool:
        results = pool.map(process_pdf, pdf_files)

    # process_pdf returns a "text" key on success and an "error" key on failure.
    success = [r for r in results if "text" in r]
    errors = [r for r in results if "error" in r]
    print(f"Processed {len(success)}, failed {len(errors)}")

concurrent.futures 사용

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

def extract_text(pdf_path: str) -> tuple[str, str]:
    """Return (pdf_path, text), where text is every page followed by a newline.

    Errors raised while opening or extracting are not caught here; they
    propagate to the caller.
    """
    doc = PdfDocument(pdf_path)
    pieces = [doc.extract_text(i) + "\n" for i in range(doc.page_count())]
    return pdf_path, "".join(pieces)

# Worker processes started with the "spawn" method (the default on Windows
# and macOS) re-import this module. Without the guard each worker would
# re-run the submission code below.
if __name__ == "__main__":
    pdf_files = list(Path("documents/").glob("*.pdf"))

    with ProcessPoolExecutor(max_workers=4) as executor:
        # Map each future back to its source path for reporting.
        futures = {executor.submit(extract_text, str(p)): p for p in pdf_files}

        # Results are reported in completion order, not submission order.
        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                _, text = future.result()
                print(f"{pdf_path.name}: {len(text)} chars")
            except Exception as e:
                # result() re-raises whatever the worker raised.
                print(f"Error: {pdf_path.name}: {e}")

진행 상황 추적

간단한 카운터

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Process each PDF in documents/ and print an "[index/total]" status line.
pdf_files = list(Path("documents/").glob("*.pdf"))
total = len(pdf_files)

idx = 0
for pdf_path in pdf_files:
    idx += 1  # 1-based position shown in the progress prefix
    try:
        text = PdfDocument(str(pdf_path)).extract_text(0)
    except PdfError as e:
        print(f"[{idx}/{total}] {pdf_path.name}: FAILED - {e}")
    else:
        print(f"[{idx}/{total}] {pdf_path.name}: OK")

tqdm 사용

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
from tqdm import tqdm

# Extract every page of every PDF in documents/ behind a tqdm progress bar.
pdf_files = list(Path("documents/").glob("*.pdf"))
failed = []

for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        doc = PdfDocument(str(pdf_path))
        for i in range(doc.page_count()):
            doc.extract_text(i)
    except PdfError as e:
        # Remember the failure instead of discarding it. Printing is deferred
        # so the progress bar is not interrupted mid-run.
        failed.append((pdf_path.name, e))

for name, err in failed:
    print(f"Failed: {name}: {err}")

손상된 파일의 오류 처리

오류를 기록하면서도 처리를 계속하는 견고한 파이프라인 구축:

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import json

# Walk mixed-quality/ recursively, recording per-file statistics and failures.
pdf_dir = Path("mixed-quality/")
results = []
errors = []

for pdf_path in pdf_dir.glob("**/*.pdf"):
    file_label = str(pdf_path)
    try:
        doc = PdfDocument(file_label)
        page_count = doc.page_count()
        text_length = 0
        for i in range(page_count):
            text_length += len(doc.extract_text(i))
    except PdfError as e:
        errors.append({"file": file_label, "error": str(e)})
    except Exception as e:
        # Anything that is not a PdfError is recorded with an "Unexpected" prefix.
        errors.append({"file": file_label, "error": f"Unexpected: {e}"})
    else:
        results.append({"file": file_label, "pages": page_count, "chars": text_length})

print(f"Success: {len(results)}, Errors: {len(errors)}")

# Save the error report
if errors:
    with open("errors.json", "w") as report:
        json.dump(errors, report, indent=2)

WASM

// Record page and character counts per PDF buffer, collecting failures separately.
const results = [];
const errors = [];

for (const { name, bytes } of pdfFiles) {
    // Declared outside `try` so `finally` can release it on every path.
    let doc;
    try {
        doc = new WasmPdfDocument(bytes);
        const pageCount = doc.pageCount();
        let textLength = 0;
        for (let i = 0; i < pageCount; i++) {
            textLength += doc.extractText(i).length;
        }
        results.push({ file: name, pages: pageCount, chars: textLength });
    } catch (e) {
        errors.push({ file: name, error: e.message });
    } finally {
        // Previously free() was skipped whenever a page extraction threw.
        if (doc) doc.free();
    }
}

console.log(`Success: ${results.length}, Errors: ${errors.length}`);

Rust

// Record (path, pages, bytes of text) per PDF and (path, message) per failure.
// NOTE(review): read_dir only lists the top level of mixed-quality/, while the
// Python, Go and C# versions of this example recurse into subdirectories.
let mut results = Vec::new();
let mut errors = Vec::new();

for entry in std::fs::read_dir("mixed-quality/")? {
    let path = entry?.path();
    if path.extension().map_or(false, |e| e == "pdf") {
        let name = path.display().to_string();
        // A non-UTF-8 file name cannot be passed as &str; record it instead of panicking.
        let path_str = match path.to_str() {
            Some(s) => s,
            None => {
                errors.push((name, String::from("path is not valid UTF-8")));
                continue;
            }
        };
        match PdfDocument::open(path_str) {
            Ok(mut doc) => {
                let page_count = doc.page_count().unwrap_or(0);
                let mut text_length: usize = 0;
                let mut failure = None;
                for i in 0..page_count {
                    match doc.extract_text(i) {
                        Ok(t) => text_length += t.len(),
                        Err(e) => {
                            // filter_map(.ok()) used to drop failed pages
                            // silently and still count the file as a success.
                            failure = Some(format!("page {}: {}", i, e));
                            break;
                        }
                    }
                }
                match failure {
                    None => results.push((name, page_count, text_length)),
                    Some(msg) => errors.push((name, msg)),
                }
            }
            Err(e) => errors.push((name, e.to_string())),
        }
    }
}

println!("Success: {}, Errors: {}", results.len(), errors.len());

Go

// Success holds the statistics of a fully processed PDF.
type Success struct{ File string; Pages int; Chars int }

// Failure holds the path of a PDF that could not be processed, and why.
type Failure struct{ File string; Error string }

var results []Success
var errors []Failure

// The callback always returns nil so that one bad entry never aborts the walk.
_ = filepath.Walk("mixed-quality/", func(path string, info os.FileInfo, err error) error {
    if err != nil {
        // The entry itself could not be read; record it and keep walking.
        errors = append(errors, Failure{path, err.Error()})
        return nil
    }
    if info.IsDir() || !strings.HasSuffix(path, ".pdf") { return nil }

    doc, err := pdfoxide.Open(path)
    if err != nil {
        errors = append(errors, Failure{path, err.Error()})
        return nil
    }
    // Runs when this callback returns, i.e. once per file.
    defer doc.Close()

    n, err := doc.PageCount()
    if err != nil {
        errors = append(errors, Failure{path, err.Error()})
        return nil
    }
    chars := 0
    for i := 0; i < n; i++ {
        t, err := doc.ExtractText(i)
        if err != nil {
            // A page that fails to extract makes the whole file a failure
            // instead of silently under-counting its characters.
            errors = append(errors, Failure{path, fmt.Sprintf("page %d: %v", i, err)})
            return nil
        }
        chars += len(t)
    }
    results = append(results, Success{path, n, chars})
    return nil
})

fmt.Printf("Success: %d, Errors: %d\n", len(results), len(errors))

C#

// Statistics for PDFs that parsed, and messages for those that did not.
var results = new List<(string File, int Pages, int Chars)>();
var errors = new List<(string File, string Error)>();

var pdfPaths = Directory.EnumerateFiles("mixed-quality/", "*.pdf", SearchOption.AllDirectories);
foreach (var pdfPath in pdfPaths)
{
    try
    {
        // The using block disposes the document on both success and failure.
        using (var doc = PdfDocument.Open(pdfPath))
        {
            var pageCount = doc.PageCount;
            var totalChars = Enumerable.Range(0, pageCount)
                .Select(pageIndex => doc.ExtractText(pageIndex).Length)
                .Sum();
            results.Add((pdfPath, pageCount, totalChars));
        }
    }
    catch (Exception ex)
    {
        errors.Add((pdfPath, ex.Message));
    }
}

Console.WriteLine($"Success: {results.Count}, Errors: {errors.Count}");

Java

import fyi.oxide.pdf.PdfDocument;
import java.io.IOException;
import java.nio.file.*;
import java.util.*;

/** Statistics of a PDF that was processed completely. */
record Success(String file, int pages, long chars) {}
/** Path of a PDF that failed, with the exception message. */
record Failure(String file, String error) {}

List<Success> results = new ArrayList<>();
List<Failure> errors = new ArrayList<>();

// Files.walk must be closed, hence the try-with-resources around the stream.
try (var tree = Files.walk(Path.of("mixed-quality/"))) {
    Iterator<Path> pdfs = tree.filter(p -> p.toString().endsWith(".pdf")).iterator();
    while (pdfs.hasNext()) {
        Path pdf = pdfs.next();
        String name = pdf.toString();
        try (PdfDocument doc = PdfDocument.open(pdf)) {
            int pageCount = doc.pageCount();
            long total = 0;
            for (int page = 0; page < pageCount; page++) {
                total += doc.extractText(page).length();
            }
            results.add(new Success(name, pageCount, total));
        } catch (Exception e) {
            // Any failure for this file is recorded; the walk continues.
            errors.add(new Failure(name, e.getMessage()));
        }
    }
}
System.out.printf("Success: %d, Errors: %d%n", results.size(), errors.size());

Kotlin

import fyi.oxide.pdf.PdfDocument
import java.io.File

/** Statistics of a PDF that was processed completely. */
data class Success(val file: String, val pages: Int, val chars: Long)

/** Path of a PDF that failed, with the (possibly null) exception message. */
data class Failure(val file: String, val error: String?)

val results = mutableListOf<Success>()
val errors = mutableListOf<Failure>()

for (pdf in File("mixed-quality/").walkTopDown()) {
    if (pdf.extension != "pdf") continue
    try {
        // `use` closes the document whether or not extraction succeeds.
        PdfDocument.open(pdf.toPath()).use { doc ->
            val pageCount = doc.pageCount()
            var total = 0L
            for (page in 0 until pageCount) {
                total += doc.extractText(page).length
            }
            results.add(Success(pdf.path, pageCount, total))
        }
    } catch (e: Exception) {
        errors.add(Failure(pdf.path, e.message))
    }
}
println("Success: ${results.size}, Errors: ${errors.size}")

Scala

import fyi.oxide.pdf.PdfDocument
import scala.util.{Using, Try}
import java.io.File

/** Statistics of a PDF that was processed completely. */
final case class Success(file: String, pages: Int, chars: Long)
/** Path of a PDF that failed, with the exception message. */
final case class Failure(file: String, error: String)

/** Recursively lists every file under `dir` whose name ends with ".pdf". */
def pdfs(dir: File): Seq[File] =
  dir.listFiles.toSeq.flatMap { entry =>
    if (entry.isDirectory) pdfs(entry)
    else if (entry.getName.endsWith(".pdf")) Seq(entry)
    else Seq.empty
  }

val results = scala.collection.mutable.ArrayBuffer.empty[Success]
val errors = scala.collection.mutable.ArrayBuffer.empty[Failure]
for (f <- pdfs(new File("mixed-quality/"))) {
  val path = f.getPath
  // Using.resource closes the document; Try captures any exception it rethrows.
  val attempt = Try {
    Using.resource(PdfDocument.open(path)) { doc =>
      val n = doc.pageCount()
      val chars = (0 until n).foldLeft(0L)((acc, i) => acc + doc.extractText(i).length)
      Success(path, n, chars)
    }
  }
  // The local Success/Failure case classes shadow scala.util's, so the
  // Try variants are matched by their fully qualified names.
  attempt match {
    case scala.util.Success(summary) => results += summary
    case scala.util.Failure(e)       => errors += Failure(path, e.getMessage)
  }
}
println(s"Success: ${results.size}, Errors: ${errors.size}")

Clojure

(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])

;; Walk the tree, recording statistics for readable PDFs and the exception
;; message for every PDF that fails.
(let [results (atom [])
      errors  (atom [])
      pdf?    (fn [f] (.endsWith (.getName f) ".pdf"))]
  (doseq [f     (file-seq (io/file "mixed-quality/"))
          :when (pdf? f)
          :let  [path (.getPath f)]]
    (try
      ;; with-open closes the document on both success and failure.
      (with-open [doc (pdf/open path)]
        (let [n     (pdf/page-count doc)
              chars (transduce (map #(count (pdf/extract-text doc %))) + 0 (range n))]
          (swap! results conj {:file path :pages n :chars chars})))
      (catch Exception e
        (swap! errors conj {:file path :error (.getMessage e)}))))
  (println (str "Success: " (count @results) ", Errors: " (count @errors))))

Ruby

require 'pdf_oxide'

# Statistics for PDFs that parsed, and messages for those that did not.
results = []
errors = []

Dir.glob('mixed-quality/**/*.pdf').each do |path|
  begin
    # The block form of open scopes the document to this file.
    PdfOxide::PdfDocument.open(path) do |doc|
      page_total = doc.page_count
      char_total = 0
      (0...page_total).each do |page|
        char_total += doc.extract_text(page).length
      end
      results.push({ file: path, pages: page_total, chars: char_total })
    end
  rescue PdfOxide::Error => failure
    errors.push({ file: path, error: failure.message })
  end
end

puts "Success: #{results.size}, Errors: #{errors.size}"

PHP

use PdfOxide\PdfDocument;

// Statistics for PDFs that parsed, and messages for those that did not.
$results = [];
$errors = [];

$it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator('mixed-quality/'));
foreach ($it as $file) {
    if ($file->getExtension() !== 'pdf') continue;
    $path = $file->getPathname();
    try {
        $doc = PdfDocument::open($path);
        try {
            $n = $doc->pageCount();
            $chars = 0;
            for ($i = 0; $i < $n; $i++) { $chars += strlen($doc->extractText($i)); }
            $results[] = ['file' => $path, 'pages' => $n, 'chars' => $chars];
        } finally {
            // Close the document even when a page fails to extract;
            // previously a failing document was never closed.
            $doc->close();
        }
    } catch (\Throwable $e) {
        $errors[] = ['file' => $path, 'error' => $e->getMessage()];
    }
}

printf("Success: %d, Errors: %d\n", count($results), count($errors));

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <cstddef>
#include <filesystem>
#include <iostream>
#include <string>
#include <vector>

// <iostream>, <string>, <vector> and <cstddef> are included explicitly because
// the snippet uses std::cout, std::string, std::vector and std::size_t.
namespace fs = std::filesystem;

// Statistics of a PDF that was processed completely.
struct Success { std::string file; int pages; std::size_t chars; };
// Path of a PDF that failed, with the library's error message.
struct Failure { std::string file; std::string error; };
std::vector<Success> results;
std::vector<Failure> errors;

for (const auto& entry : fs::recursive_directory_iterator("mixed-quality/")) {
    if (entry.path().extension() != ".pdf") continue;
    try {
        auto doc = pdf_oxide::Document::open(entry.path().string());
        int n = doc.page_count();
        std::size_t chars = 0;
        for (int i = 0; i < n; i++) chars += doc.extract_text(i).size();
        results.push_back({entry.path().string(), n, chars});
    } catch (const pdf_oxide::Error& e) {
        // Record the failure and move on to the next file.
        errors.push_back({entry.path().string(), e.what()});
    }
}
std::cout << "Success: " << results.size() << ", Errors: " << errors.size() << "\n";

Swift

import PdfOxide
import Foundation

/// Statistics of a PDF that was processed completely.
struct Success {
    let file: String
    let pages: Int
    let chars: Int
}

/// Path of a PDF that failed, with a description of the thrown error.
struct Failure {
    let file: String
    let error: String
}

var results: [Success] = []
var errors: [Failure] = []

let root = URL(fileURLWithPath: "mixed-quality/")
let walker = FileManager.default.enumerator(at: root, includingPropertiesForKeys: nil)!
for case let url as URL in walker {
    guard url.pathExtension == "pdf" else { continue }
    let file = url.path
    do {
        let doc = try Document.open(file)
        let pageCount = try doc.pageCount()
        // Sum the character count of every page; any throw fails the file.
        let total = try (0..<pageCount).reduce(0) { sum, page in
            try sum + doc.extractText(page).count
        }
        results.append(Success(file: file, pages: pageCount, chars: total))
    } catch {
        errors.append(Failure(file: file, error: "\(error)"))
    }
}
print("Success: \(results.count), Errors: \(errors.count)")

Dart

import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Statistics for PDFs that parsed, and messages for those that did not.
final results = <Map<String, Object>>[];
final errors = <Map<String, Object>>[];

for (final entry in Directory('mixed-quality/').listSync(recursive: true)) {
  if (entry is! File || !entry.path.endsWith('.pdf')) continue;
  try {
    final doc = PdfDocument.open(entry.path);
    try {
      final n = doc.pageCount;
      var chars = 0;
      for (var i = 0; i < n; i++) {
        chars += doc.extractText(i).length;
      }
      results.add({'file': entry.path, 'pages': n, 'chars': chars});
    } finally {
      // Close the document even when a page fails to extract;
      // previously a failing document was never closed.
      doc.close();
    }
  } catch (e) {
    errors.add({'file': entry.path, 'error': '$e'});
  }
}
print('Success: ${results.length}, Errors: ${errors.length}');

R

library(pdfoxide)

# Statistics for PDFs that parsed, and messages for those that did not.
results <- list()
errors <- list()

for (path in list.files("mixed-quality/", pattern = "\\.pdf$", recursive = TRUE, full.names = TRUE)) {
  tryCatch({
    doc <- pdf_open(path)
    n <- pdf_page_count(doc)
    # seq_len(n) - 1L gives the zero-based indices 0..n-1 and is empty when
    # n == 0. The previous 0:(n - 1) evaluated to c(0, -1) for a zero-page
    # document and requested two pages that do not exist.
    pages <- seq_len(n) - 1L
    chars <- sum(vapply(pages, function(i) nchar(pdf_extract_text(doc, i)), integer(1)))
    results[[length(results) + 1]] <- list(file = path, pages = n, chars = chars)
  }, error = function(e) {
    # The handler is a function, so `<<-` is needed to update the outer list.
    errors[[length(errors) + 1]] <<- list(file = path, error = conditionMessage(e))
  })
}

cat(sprintf("Success: %d, Errors: %d\n", length(results), length(errors)))

Julia

using PdfOxide

# Statistics for PDFs that parsed, and messages for those that did not.
results = NamedTuple[]
errors = NamedTuple[]

for (root, _, files) in walkdir("mixed-quality/")
    for name in files
        if !endswith(name, ".pdf")
            continue
        end
        path = joinpath(root, name)
        try
            doc = open_document(path)
            n = page_count(doc)
            # Accumulate the character count over the zero-based page range.
            chars = 0
            for i in 0:(n - 1)
                chars += length(extract_text(doc, i))
            end
            push!(results, (file = path, pages = n, chars = chars))
        catch e
            push!(errors, (file = path, error = sprint(showerror, e)))
        end
    end
end

println("Success: $(length(results)), Errors: $(length(errors))")

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

const a = std.heap.page_allocator;
// Only counters are kept here: files fully processed vs. files that failed.
var ok: usize = 0;
var failed: usize = 0;

var root = try std.fs.cwd().openDir("mixed-quality/", .{ .iterate = true });
defer root.close();
var walker = try root.walk(a);
defer walker.deinit();
files: while (try walker.next()) |entry| {
    if (entry.kind != .file or !std.mem.endsWith(u8, entry.basename, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "mixed-quality/", entry.path });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch {
        failed += 1;
        continue;
    };
    defer doc.deinit();
    // A document-level error is counted as a failure for this file. The
    // previous `try` propagated it and aborted the whole batch.
    const n = doc.pageCount() catch {
        failed += 1;
        continue;
    };
    var chars: usize = 0;
    var i: i32 = 0;
    while (i < n) : (i += 1) {
        // Same policy per page: skip to the next file on the first bad page.
        const t = doc.extractText(a, i) catch {
            failed += 1;
            continue :files;
        };
        defer a.free(t);
        chars += t.len;
    }
    ok += 1;
}
std.debug.print("Success: {d}, Errors: {d}\n", .{ ok, failed });

Objective-C

#import "POXPdfOxide.h"

NSFileManager *fm = [NSFileManager defaultManager];
// Only counters are kept here: files fully processed vs. files that failed.
NSUInteger ok = 0, failed = 0;
NSDirectoryEnumerator *en = [fm enumeratorAtPath:@"mixed-quality/"];
for (NSString *rel in en) {
    if (![rel hasSuffix:@".pdf"]) continue;
    // Drain per-file temporaries (paths, extracted page strings) each
    // iteration so a large corpus does not accumulate autoreleased objects.
    @autoreleasepool {
        NSString *path = [@"mixed-quality/" stringByAppendingPathComponent:rel];
        NSError *err = nil;
        POXDocument *doc = [POXDocument openPath:path error:&err];
        if (!doc) { failed++; continue; }

        NSInteger n = [doc pageCountError:&err];
        // NOTE(review): assumes a negative count signals failure; confirm the
        // failure sentinel against the POXDocument header.
        if (n < 0) { failed++; continue; }

        NSUInteger chars = 0;
        BOOL extracted = YES;
        for (NSInteger i = 0; i < n; i++) {
            // Check the return value, not `err`: a nil result means this
            // page failed, so the file must not be counted as a success.
            // NOTE(review): assumes extractText:error: returns an NSString.
            NSString *text = [doc extractText:i error:&err];
            if (!text) { extracted = NO; break; }
            chars += text.length;
        }
        if (extracted) { ok++; } else { failed++; }
    }
}
NSLog(@"Success: %lu, Errors: %lu", (unsigned long)ok, (unsigned long)failed);

Elixir

# Fold over every PDF, building {successes, failures}. Both lists are built
# by prepending, so they end up in reverse traversal order.
{results, errors} =
  "mixed-quality/**/*.pdf"
  |> Path.wildcard()
  |> Enum.reduce({[], []}, fn path, {ok, bad} ->
    outcome =
      with {:ok, doc} <- PdfOxide.open(path),
           {:ok, n} <- PdfOxide.page_count(doc) do
        # The explicit //1 step keeps the range empty when n == 0; a plain
        # 0..(n - 1) would be 0..-1, which iterates 0 and -1.
        # Assumes extract_text/2 returns {:ok, text} | {:error, reason}.
        Enum.reduce_while(0..(n - 1)//1, {:ok, n, 0}, fn i, {:ok, pages, acc} ->
          case PdfOxide.extract_text(doc, i) do
            {:ok, text} -> {:cont, {:ok, pages, acc + String.length(text)}}
            {:error, _reason} = error -> {:halt, error}
          end
        end)
      end

    case outcome do
      {:ok, pages, chars} ->
        {[%{file: path, pages: pages, chars: chars} | ok], bad}

      {:error, reason} ->
        {ok, [%{file: path, error: inspect(reason)} | bad]}

      other ->
        # Any unexpected return shape is recorded instead of crashing the run.
        {ok, [%{file: path, error: inspect(other)} | bad]}
    end
  end)

IO.puts("Success: #{length(results)}, Errors: #{length(errors)}")

메모리 효율적인 처리

매우 큰 코퍼스의 경우, 결과를 메모리에 쌓지 않고 파일을 하나씩 처리합니다:

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
import csv
import sys

pdf_dir = Path("large-corpus/")

# Rows are written as soon as each page is extracted, so nothing accumulates
# in memory. The encoding is pinned to UTF-8 because extracted text can
# contain characters that the platform's default encoding cannot represent.
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["file", "page", "text"])

    for pdf_path in pdf_dir.glob("*.pdf"):
        try:
            doc = PdfDocument(str(pdf_path))
            for i in range(doc.page_count()):
                text = doc.extract_text(i)
                writer.writerow([pdf_path.name, i, text])
        except PdfError as e:
            # Keep going, but say which file was skipped and why. Rows
            # already written for earlier pages of this file remain.
            print(f"Skipped {pdf_path.name}: {e}", file=sys.stderr)

일괄 병합

디렉터리의 모든 PDF를 하나로 합치기:

Python

from pdf_oxide import PdfDocument, PdfError
from pathlib import Path

# Sort so the merge order is deterministic.
pdf_files = sorted(Path("reports/").glob("*.pdf"))

if len(pdf_files) > 0:
    # The first file becomes the base document; the rest are appended to it.
    first, *rest = pdf_files
    doc = PdfDocument(str(first))
    for pdf_path in rest:
        try:
            doc.merge_from(str(pdf_path))
        except PdfError as e:
            # An unreadable input is skipped; the merge continues.
            print(f"Skipped {pdf_path.name}: {e}")
    doc.save("all-reports.pdf")

Rust

use pdf_oxide::editor::DocumentEditor;

// Collect the readable directory entries with a .pdf extension, then sort
// them by path so the merge order is deterministic.
let mut files: Vec<_> = std::fs::read_dir("reports/")?
    .flatten()
    .filter(|e| e.path().extension() == Some(std::ffi::OsStr::new("pdf")))
    .collect();
files.sort_by_cached_key(|e| e.path());

// The first file becomes the base document; the rest are appended to it.
if let Some((first, rest)) = files.split_first() {
    let mut editor = DocumentEditor::open(first.path().to_str().unwrap())?;
    for entry in rest {
        let path = entry.path();
        match editor.merge_from(path.to_str().unwrap()) {
            Ok(_) => {}
            // An unreadable input is skipped; the merge continues.
            Err(e) => println!("Skipped {}: {}", path.display(), e),
        }
    }
    editor.save("all-reports.pdf")?;
}

Go

// Glob only fails on a malformed pattern, but check it anyway so a typo in
// the pattern is reported instead of producing an empty merge.
files, err := filepath.Glob("reports/*.pdf")
if err != nil { log.Fatal(err) }
sort.Strings(files)

// Top-level Merge concatenates every file in one call
bytes, err := pdfoxide.Merge(files)
if err != nil { log.Fatal(err) }
// A failed write must not pass silently: the output is the whole point.
if err := os.WriteFile("all-reports.pdf", bytes, 0644); err != nil { log.Fatal(err) }

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <algorithm>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

// <string> and <vector> are included explicitly because the snippet uses
// std::string and std::vector; <iostream> is needed for the error report.
namespace fs = std::filesystem;
std::vector<std::string> files;
for (const auto& entry : fs::directory_iterator("reports/"))
    if (entry.path().extension() == ".pdf") files.push_back(entry.path().string());
// directory_iterator order is unspecified; sort for a deterministic merge.
std::sort(files.begin(), files.end());

// Top-level merge concatenates every file in one call
auto bytes = pdf_oxide::merge(files);
std::ofstream out("all-reports.pdf", std::ios::binary);
out.write(reinterpret_cast<const char*>(bytes.data()),
          static_cast<std::streamsize>(bytes.size()));
// Report a failed open or write instead of leaving a missing or truncated file unnoticed.
if (!out) std::cerr << "Failed to write all-reports.pdf\n";

Swift

import PdfOxide
import Foundation

let dir = "reports/"
// Sorted names give a deterministic merge order; non-PDF entries are
// dropped and the directory prefix is restored in the same pass.
let files = try FileManager.default.contentsOfDirectory(atPath: dir)
    .sorted()
    .compactMap { name in name.hasSuffix(".pdf") ? dir + name : nil }

// Top-level merge concatenates every file in one call
let outputURL = URL(fileURLWithPath: "all-reports.pdf")
let bytes = try merge(files)
try Data(bytes).write(to: outputURL)

Dart

import 'dart:io';
import 'dart:typed_data';
import 'package:pdf_oxide/pdf_oxide.dart';

// Paths of every directory entry ending in .pdf, sorted for a
// deterministic merge order.
final files = [
  for (final entry in Directory('reports/').listSync())
    if (entry.path.endsWith('.pdf')) entry.path,
];
files.sort();

// Top-level merge concatenates every file in one call
final Uint8List bytes = pdfMerge(files);
final output = File('all-reports.pdf');
output.writeAsBytesSync(bytes);

R

library(pdfoxide)

# Sort the matches explicitly so the merge order is deterministic.
matches <- Sys.glob("reports/*.pdf")
files <- sort(matches)

# Top-level merge concatenates every file in one call
bytes <- pdf_merge(files)
writeBin(bytes, con = "all-reports.pdf")

Julia

using PdfOxide

# Full paths of every .pdf entry in the directory, sorted for a
# deterministic merge order.
files = [p for p in readdir("reports/"; join = true) if endswith(p, ".pdf")]
sort!(files)

# Top-level merge concatenates every file in one call
bytes = merge_pdfs(files)
write("all-reports.pdf", bytes)

Zig

const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

const a = std.heap.page_allocator;
var paths = std.ArrayList([*:0]const u8).init(a);
defer {
    // Each entry was allocated by joinZ below; free the strings before the
    // list itself. Previously only the list was released.
    for (paths.items) |p| a.free(std.mem.span(p));
    paths.deinit();
}
var dir = try std.fs.cwd().openDir("reports/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const joined = try std.fs.path.joinZ(a, &.{ "reports/", entry.name });
    // If append fails, the string is not yet owned by the list.
    errdefer a.free(joined);
    try paths.append(joined);
}

// Directory iteration order is unspecified; sort by path so the merge order
// is deterministic.
std.mem.sort([*:0]const u8, paths.items, {}, struct {
    fn lessThan(_: void, lhs: [*:0]const u8, rhs: [*:0]const u8) bool {
        return std.mem.orderZ(u8, lhs, rhs) == .lt;
    }
}.lessThan);

// Top-level merge concatenates every file in one call
const bytes = try pdf_oxide.merge(a, paths.items);
defer a.free(bytes);
try std.fs.cwd().writeFile(.{ .sub_path = "all-reports.pdf", .data = bytes });

Objective-C

#import "POXPdfOxide.h"

NSFileManager *fm = [NSFileManager defaultManager];
NSError *err = nil;

// A nil listing means the directory could not be read; it is reported below.
NSArray<NSString *> *names = [fm contentsOfDirectoryAtPath:@"reports/" error:&err];
NSMutableArray<NSString *> *files = [NSMutableArray array];
for (NSString *name in names) {
    if ([name hasSuffix:@".pdf"]) {
        [files addObject:[@"reports/" stringByAppendingPathComponent:name]];
    }
}
// Sort so the merge order is deterministic.
[files sortUsingSelector:@selector(compare:)];

// Top-level merge concatenates every file in one call
NSData *bytes = names ? [POXPdf merge:files error:&err] : nil;

// Each step is judged by its return value; `err` is read only after a failure.
if (!names) {
    NSLog(@"Cannot list reports/: %@", err);
} else if (!bytes) {
    NSLog(@"Merge failed: %@", err);
} else if (![bytes writeToFile:@"all-reports.pdf" options:NSDataWritingAtomic error:&err]) {
    NSLog(@"Cannot write all-reports.pdf: %@", err);
}

Elixir

# Sort the matches so the merge order is deterministic.
files = Enum.sort(Path.wildcard("reports/*.pdf"))

# Top-level merge concatenates every file in one call
{:ok, bytes} = PdfOxide.merge(files)
File.write!("all-reports.pdf", bytes)

관련 페이지