Skip to content

并发 — 线程安全的 PDF 读取

自 v0.3.22 起,PdfDocument 在 Rust 层实现了 Send + Sync。同一个文档可以在操作系统线程、goroutine、worker 线程或 asyncio 任务之间共享,以实现并行页面提取。写操作仍需要串行化 — 这正是 DocumentEditor 的用途所在。

v0.3.22 的变更内容

PdfDocument 内部的 16 个 RefCell<T> 包装器全部替换为 Mutex<T>,Cell<usize> 变更为 AtomicUsize。语言绑定层移除了 Python 类(PdfDocument、PdfPage、FormField)上的 unsendable 标记——此前只要这些对象跨越线程边界就会抛出 RuntimeError。

最终效果:线程池、异步运行时以及自由线程 Python 现在均可直接使用,无需额外配置。

Rust

Rust

use pdf_oxide::PdfDocument;
use std::sync::Arc;
use std::thread;

let doc = Arc::new(PdfDocument::open("report.pdf")?);
let page_count = doc.page_count();

let handles: Vec<_> = (0..page_count)
    .map(|i| {
        let doc = Arc::clone(&doc);
        thread::spawn(move || doc.extract_text(i))
    })
    .collect();

for h in handles {
    let text = h.join().unwrap()?;
    println!("{}", text);
}

Java

import fyi.oxide.pdf.PdfDocument;
import java.util.List;
import java.util.concurrent.*;
import java.util.stream.*;

// Open the document once; try-with-resources closes it when the block exits.
try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    int pageCount = doc.pageCount();
    ExecutorService pool = Executors.newFixedThreadPool(8);
    try {
        // Submit one extraction task per page; every task shares the same doc.
        List<Future<String>> futures = IntStream.range(0, pageCount)
            .mapToObj(i -> pool.submit(() -> doc.extractText(i)))
            .collect(Collectors.toList());
        // Futures are kept in submission order, so pages print in page order.
        for (Future<String> f : futures) System.out.println(f.get());
    } finally {
        // Release the worker threads even when a task fails and f.get() throws.
        pool.shutdown();
    }
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import kotlinx.coroutines.*

// Share one open document across coroutines; `use` closes it when the block exits.
PdfDocument.open(java.nio.file.Path.of("report.pdf")).use { doc ->
    val pages = runBlocking(Dispatchers.IO) {
        // Start one child coroutine per page index, then wait for all of them.
        val pageIndices = 0 until doc.pageCount()
        val deferredTexts = pageIndices.map { pageIndex ->
            async { doc.extractText(pageIndex) }
        }
        deferredTexts.awaitAll()
    }
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.concurrent.*
import scala.concurrent.duration.*
import scala.util.Using
import java.util.concurrent.Executors
import ExecutionContext.Implicits.global

Using.resource(PdfDocument.open("report.pdf")) { doc =>
  val pages = (0 until doc.pageCount()).map(i => Future(doc.extractText(i)))
  Await.result(Future.sequence(pages), 60.seconds)
}

Clojure

(require '[pdf-oxide.core :as pdf])

;; Extract every page in parallel and return the texts as a vector in page order.
(with-open [doc (pdf/open "report.pdf")]
  (->> (range (pdf/page-count doc))
       ;; mapv is eager: every future is started before any is dereferenced.
       (mapv (fn [i] (future (pdf/extract-text doc i))))
       ;; Eager again: a lazy (map deref) would be realized only after
       ;; with-open has already closed doc.
       (mapv deref)))

Ruby

require 'pdf_oxide'

PdfOxide::PdfDocument.open('report.pdf') do |doc|
  pages = (0...doc.page_count).map do |i|
    Thread.new { doc.extract_text(i) }
  end.map(&:value)
end

PHP

use PdfOxide\PdfDocument;

// PHP 没有共享内存线程;顺序循环处理即可(读取操作内部已加锁,
// 在 pthreads/parallel 扩展下同样安全)。
$doc = PdfDocument::open('report.pdf');
$pages = [];
for ($i = 0; $i < $doc->pageCount(); $i++) {
    $pages[$i] = $doc->extractText($i);
}
$doc->close();

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <future>
#include <iostream>  // std::cout
#include <string>    // std::string
#include <vector>

auto doc = pdf_oxide::Document::open("report.pdf");
int page_count = doc.page_count();

// Launch one asynchronous extraction per page; each task captures doc by
// reference, so doc must outlive every future collected below.
std::vector<std::future<std::string>> futures;
for (int i = 0; i < page_count; ++i)
    futures.push_back(std::async(std::launch::async,
        [&doc, i] { return doc.extract_text(i); }));

// get() blocks until the corresponding page is ready; output is in page order.
for (auto& f : futures) std::cout << f.get();

Swift

import PdfOxide

let doc = try Document.open("report.pdf")
let pageCount = try doc.pageCount()

try await withThrowingTaskGroup(of: String.self) { group in
    for i in 0..<pageCount {
        group.addTask { try doc.extractText(i) }
    }
    for try await text in group { print(text) }
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

final doc = PdfDocument.open('report.pdf');
final pages = [
  for (var i = 0; i < doc.pageCount; i++) doc.extractText(i),
];
doc.close();

R

library(pdfoxide)
library(parallel)

doc <- pdf_open("report.pdf")
n   <- pdf_page_count(doc)
# seq_len(n) - 1L gives the 0-based page indices and is empty when n == 0;
# 0:(n - 1) would instead produce c(0, -1) and request invalid pages.
pages <- mclapply(seq_len(n) - 1L, function(i) pdf_extract_text(doc, i))

Julia

using PdfOxide

doc = open_document("report.pdf")
n   = page_count(doc)
pages = Vector{String}(undef, n)
Threads.@threads for i in 0:(n - 1)
    pages[i + 1] = extract_text(doc, i)
end

Zig

const std = @import("std"); // required for std.heap.page_allocator below
const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;

var doc = try pdf_oxide.Document.open("report.pdf");
const n = try doc.pageCount();

// Visit every page in order; each extracted buffer is allocated from `a`
// and released at the end of its loop iteration.
var i: usize = 0;
while (i < n) : (i += 1) {
    const text = try doc.extractText(a, i); // reads are internally locked
    defer a.free(text);
}

Objective-C

#import "POXPdfOxide.h"
NSError *err = nil;

POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];
NSInteger n = [doc pageCountError:&err];

dispatch_apply(n, dispatch_get_global_queue(0, 0), ^(size_t i) {
    NSError *e = nil;
    NSString *text = [doc extractText:i error:&e];
});

Elixir

{:ok, doc} = PdfOxide.open("report.pdf")
{:ok, n}   = PdfOxide.page_count(doc)

pages =
  0..(n - 1)
  |> Task.async_stream(fn i -> PdfOxide.extract_text(doc, i) end)
  |> Enum.map(fn {:ok, {:ok, text}} -> text end)

使用 tokio 时:

Rust

use std::sync::Arc;
use tokio::task;

let doc = Arc::new(pdf_oxide::PdfDocument::open("report.pdf")?);

let tasks: Vec<_> = (0..doc.page_count())
    .map(|i| {
        let doc = Arc::clone(&doc);
        task::spawn_blocking(move || doc.extract_text(i))
    })
    .collect();

for t in tasks {
    let text = t.await??;
}

Java

import fyi.oxide.pdf.PdfDocument;
import java.util.concurrent.*;
import java.util.stream.*;

ExecutorService pool = Executors.newWorkStealingPool();
try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    CompletableFuture<?>[] tasks = IntStream.range(0, doc.pageCount())
        .mapToObj(i -> CompletableFuture.supplyAsync(() -> doc.extractText(i), pool))
        .toArray(CompletableFuture[]::new);
    CompletableFuture.allOf(tasks).join();
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import kotlinx.coroutines.*

/**
 * Extracts the text of every page of [doc] concurrently.
 *
 * One child coroutine per page index is started on [Dispatchers.IO]; the
 * function suspends until all of them finish.
 *
 * @return the page texts, in page order.
 */
suspend fun extractAll(doc: PdfDocument): List<String> = coroutineScope {
    val pageIndices = 0 until doc.pageCount()
    val deferredTexts = pageIndices.map { pageIndex ->
        async(Dispatchers.IO) { doc.extractText(pageIndex) }
    }
    deferredTexts.awaitAll()
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.concurrent.*
import ExecutionContext.Implicits.global

def extractAll(doc: PdfDocument): Future[Seq[String]] =
  Future.traverse(0 until doc.pageCount())(i => Future(doc.extractText(i)))

Swift

import PdfOxide

func extractAll(_ doc: Document) async throws -> [String] {
    let n = try doc.pageCount()
    return try await withThrowingTaskGroup(of: (Int, String).self) { group in
        for i in 0..<n { group.addTask { (i, try doc.extractText(i)) } }
        var out = [String](repeating: "", count: n)
        for try await (i, text) in group { out[i] = text }
        return out
    }
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

Future<List<String>> extractAll(PdfDocument doc) async {
  return Future.wait([
    for (var i = 0; i < doc.pageCount; i++) Future(() => doc.extractText(i)),
  ]);
}

Elixir

def extract_all(doc) do
  {:ok, n} = PdfOxide.page_count(doc)
  0..(n - 1)
  |> Task.async_stream(fn i -> PdfOxide.extract_text(doc, i) end, ordered: true)
  |> Enum.map(fn {:ok, {:ok, text}} -> text end)
end

Python

Python

from concurrent.futures import ThreadPoolExecutor
from pdf_oxide import PdfDocument

doc = PdfDocument("report.pdf")

with ThreadPoolExecutor(max_workers=8) as pool:
    pages = list(pool.map(doc.extract_text, range(doc.page_count())))

Java

import fyi.oxide.pdf.PdfDocument;
import java.util.List;
import java.util.concurrent.*;
import java.util.stream.*;

ExecutorService pool = Executors.newFixedThreadPool(8);
try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    List<String> pages = IntStream.range(0, doc.pageCount())
        // Submit every page first so the extractions run concurrently...
        .mapToObj(i -> pool.submit(() -> doc.extractText(i)))
        .collect(Collectors.toList())
        // ...then block on each future in page order; checked exceptions from
        // get() are rethrown unchecked because lambdas cannot declare them.
        .stream().map(f -> { try { return f.get(); } catch (Exception e) { throw new RuntimeException(e); } })
        .collect(Collectors.toList());
} finally {
    // Runs after doc is closed, and also when an extraction fails.
    pool.shutdown();
}

Ruby

require 'pdf_oxide'

PdfOxide::PdfDocument.open('report.pdf') do |doc|
  pages = (0...doc.page_count)
    .map { |i| Thread.new { doc.extract_text(i) } }
    .map(&:value)
end

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <future>
#include <vector>

auto doc = pdf_oxide::Document::open("report.pdf");
std::vector<std::future<std::string>> futures;
for (int i = 0; i < doc.page_count(); ++i)
    futures.push_back(std::async(std::launch::async,
        [&doc, i] { return doc.extract_text(i); }));

std::vector<std::string> pages;
for (auto& f : futures) pages.push_back(f.get());

Swift

import PdfOxide

let doc = try Document.open("report.pdf")
let pages = try await withThrowingTaskGroup(of: (Int, String).self) { group -> [String] in
    let n = try doc.pageCount()
    for i in 0..<n { group.addTask { (i, try doc.extractText(i)) } }
    var out = [String](repeating: "", count: n)
    for try await (i, t) in group { out[i] = t }
    return out
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

final doc = PdfDocument.open('report.pdf');
final pages = await Future.wait([
  for (var i = 0; i < doc.pageCount; i++) Future(() => doc.extractText(i)),
]);
doc.close();

R

library(pdfoxide)
library(parallel)

doc   <- pdf_open("report.pdf")
n     <- pdf_page_count(doc)
# seq_len(n) - 1L gives the 0-based page indices and is empty when n == 0;
# 0:(n - 1) would instead produce c(0, -1) and request invalid pages.
pages <- mclapply(seq_len(n) - 1L, function(i) pdf_extract_text(doc, i), mc.cores = 8)

Julia

using PdfOxide

doc   = open_document("report.pdf")
n     = page_count(doc)
pages = Vector{String}(undef, n)
Threads.@threads for i in 0:(n - 1)
    pages[i + 1] = extract_text(doc, i)
end

Elixir

{:ok, doc} = PdfOxide.open("report.pdf")
{:ok, n}   = PdfOxide.page_count(doc)

pages =
  0..(n - 1)
  |> Task.async_stream(fn i -> PdfOxide.extract_text(doc, i) end,
       max_concurrency: 8, ordered: true)
  |> Enum.map(fn {:ok, {:ok, text}} -> text end)

在标准 CPython 下,GIL 仍然会串行化 Python 层面的工作,但提取操作本身在执行 Rust 代码期间会释放 GIL,因此在 Rust 侧实现了真正的并行。在 cp314t(自由线程 Python 3.14+)下,GIL 为可选项,绑定层声明 gil_used = false,从而完全消除了隐式串行化。

使用 asyncio 时:

Python

import asyncio
from pdf_oxide import PdfDocument

doc = PdfDocument("report.pdf")

async def main():
    pages = await asyncio.gather(
        *[asyncio.to_thread(doc.extract_text, i) for i in range(doc.page_count())]
    )

Java

import fyi.oxide.pdf.PdfDocument;
import java.util.concurrent.*;
import java.util.stream.*;

ExecutorService pool = Executors.newWorkStealingPool();
try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    var futures = IntStream.range(0, doc.pageCount())
        .mapToObj(i -> CompletableFuture.supplyAsync(() -> doc.extractText(i), pool))
        .toList();
    CompletableFuture.allOf(futures.toArray(CompletableFuture[]::new)).join();
    var pages = futures.stream().map(CompletableFuture::join).toList();
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import kotlinx.coroutines.*

/**
 * Extracts the text of every page of [doc] concurrently.
 *
 * One child coroutine per page index is started on [Dispatchers.IO]; the
 * function suspends until all of them finish.
 *
 * @return the page texts, in page order.
 */
suspend fun extractAll(doc: PdfDocument): List<String> = coroutineScope {
    val pageIndices = 0 until doc.pageCount()
    val deferredTexts = pageIndices.map { pageIndex ->
        async(Dispatchers.IO) { doc.extractText(pageIndex) }
    }
    deferredTexts.awaitAll()
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.concurrent.*
import ExecutionContext.Implicits.global

def extractAll(doc: PdfDocument): Future[Seq[String]] =
  Future.traverse(0 until doc.pageCount())(i => Future(doc.extractText(i)))

Swift

import PdfOxide

func extractAll(_ doc: Document) async throws -> [String] {
    let n = try doc.pageCount()
    return try await withThrowingTaskGroup(of: (Int, String).self) { group in
        for i in 0..<n { group.addTask { (i, try doc.extractText(i)) } }
        var out = [String](repeating: "", count: n)
        for try await (i, text) in group { out[i] = text }
        return out
    }
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

Future<List<String>> extractAll(PdfDocument doc) async => Future.wait([
  for (var i = 0; i < doc.pageCount; i++) Future(() => doc.extractText(i)),
]);

Elixir

def extract_all(doc) do
  {:ok, n} = PdfOxide.page_count(doc)
  0..(n - 1)
  |> Task.async_stream(fn i -> PdfOxide.extract_text(doc, i) end, ordered: true)
  |> Enum.map(fn {:ok, {:ok, text}} -> text end)
end

或者直接使用异步指南中现成的 AsyncPdfDocument。

Go

*PdfDocument 的读取操作由内部 sync.RWMutex 保护,从设计上天然支持 goroutine 安全。

Go

package main

import (
    "log"
    "sync"

    pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

// main extracts every page of report.pdf concurrently, one goroutine per page.
func main() {
    doc, err := pdfoxide.Open("report.pdf")
    if err != nil {
        // Return before deferring Close: doc is not usable when Open fails.
        log.Printf("open report.pdf: %v", err)
        return
    }
    defer doc.Close()

    count, err := doc.PageCount()
    if err != nil {
        log.Printf("page count: %v", err)
        return
    }
    results := make([]string, count)
    // One error slot per page, so goroutines never write to the same element.
    errs := make([]error, count)

    var wg sync.WaitGroup
    for i := 0; i < count; i++ {
        wg.Add(1)
        go func(page int) {
            defer wg.Done()
            results[page], errs[page] = doc.ExtractText(page)
        }(i)
    }
    wg.Wait()

    // Report failed pages instead of silently leaving empty strings behind.
    for page, err := range errs {
        if err != nil {
            log.Printf("extract page %d: %v", page, err)
        }
    }
}

Java

import fyi.oxide.pdf.PdfDocument;
import java.util.concurrent.*;
import java.util.stream.*;

try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    int count = doc.pageCount();
    // Each task writes only its own slot, so no two tasks touch the same element.
    String[] results = new String[count];
    ExecutorService pool = Executors.newFixedThreadPool(8);
    try {
        var latch = new CountDownLatch(count);
        for (int i = 0; i < count; i++) {
            final int page = i;
            pool.submit(() -> {
                try {
                    results[page] = doc.extractText(page);
                } finally {
                    // Count down even if extraction throws; otherwise
                    // latch.await() below would block forever. A failed page
                    // leaves results[page] as null.
                    latch.countDown();
                }
            });
        }
        // Wait for every task before doc is closed by try-with-resources.
        latch.await();
    } finally {
        // Release the worker threads even if await() is interrupted.
        pool.shutdown();
    }
}

Ruby

require 'pdf_oxide'

PdfOxide::PdfDocument.open('report.pdf') do |doc|
  count = doc.page_count
  results = Array.new(count)
  (0...count).map { |i| Thread.new { results[i] = doc.extract_text(i) } }.each(&:join)
end

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <thread>
#include <vector>

auto doc = pdf_oxide::Document::open("report.pdf");
int count = doc.page_count();
std::vector<std::string> results(count);
std::vector<std::thread> threads;
for (int i = 0; i < count; ++i)
    threads.emplace_back([&doc, &results, i] { results[i] = doc.extract_text(i); });
for (auto& t : threads) t.join();

Swift

import PdfOxide

let doc = try Document.open("report.pdf")
let count = try doc.pageCount()
var results = [String](repeating: "", count: count)
try await withThrowingTaskGroup(of: (Int, String).self) { group in
    for i in 0..<count { group.addTask { (i, try doc.extractText(i)) } }
    for try await (i, t) in group { results[i] = t }
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

final doc = PdfDocument.open('report.pdf');
final count = doc.pageCount;
final results = await Future.wait([
  for (var i = 0; i < count; i++) Future(() => doc.extractText(i)),
]);
doc.close();

R

library(pdfoxide)
library(parallel)

doc     <- pdf_open("report.pdf")
count   <- pdf_page_count(doc)
# seq_len(count) - 1L gives the 0-based page indices and is empty when
# count == 0; 0:(count - 1) would instead produce c(0, -1).
results <- mclapply(seq_len(count) - 1L, function(i) pdf_extract_text(doc, i))

Julia

using PdfOxide

doc     = open_document("report.pdf")
count   = page_count(doc)
results = Vector{String}(undef, count)
Threads.@threads for i in 0:(count - 1)
    results[i + 1] = extract_text(doc, i)
end

Zig

const std = @import("std"); // required for std.heap.page_allocator below
const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;

var doc = try pdf_oxide.Document.open("report.pdf");
const count = try doc.pageCount();

// Visit every page in order; each extracted buffer is allocated from `a`
// and released at the end of its loop iteration.
var i: usize = 0;
while (i < count) : (i += 1) {
    const text = try doc.extractText(a, i); // internally locked reads
    defer a.free(text);
}

Objective-C

#import "POXPdfOxide.h"
NSError *err = nil;

POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];
NSInteger count = [doc pageCountError:&err];
NSMutableArray *results = [NSMutableArray arrayWithCapacity:count];
for (NSInteger i = 0; i < count; i++) [results addObject:[NSNull null]];

// Extract all pages concurrently on a global queue; dispatch_apply returns
// only after every iteration has finished.
dispatch_apply(count, dispatch_get_global_queue(0, 0), ^(size_t i) {
    NSError *e = nil;
    NSString *text = [doc extractText:i error:&e];
    // Storing nil in an NSMutableArray raises, so a failed page keeps its
    // NSNull placeholder.
    if (text != nil) {
        // NSMutableArray is not safe for concurrent mutation; serialize writes.
        @synchronized (results) {
            results[i] = text;
        }
    }
});

Elixir

{:ok, doc} = PdfOxide.open("report.pdf")
{:ok, count} = PdfOxide.page_count(doc)

results =
  0..(count - 1)
  |> Task.async_stream(fn i -> PdfOxide.extract_text(doc, i) end, ordered: true)
  |> Enum.map(fn {:ok, {:ok, text}} -> text end)

*DocumentEditor 在内部串行化写操作,但请勿从多个 goroutine 并发提交独立的编辑操作——应在单个 goroutine 中统一收集变更。

C#

C#

using PdfOxide.Core;

using var doc = PdfDocument.Open("report.pdf");
var tasks = Enumerable.Range(0, doc.PageCount)
    .Select(i => Task.Run(() => doc.ExtractText(i)));
string[] pages = await Task.WhenAll(tasks);

Java

import fyi.oxide.pdf.PdfDocument;
import java.util.concurrent.*;
import java.util.stream.*;

ExecutorService pool = Executors.newWorkStealingPool();
try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    var tasks = IntStream.range(0, doc.pageCount())
        .mapToObj(i -> CompletableFuture.supplyAsync(() -> doc.extractText(i), pool))
        .toList();
    var pages = tasks.stream().map(CompletableFuture::join).toList();
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import kotlinx.coroutines.*

// Share one open document across coroutines; `use` closes it when the block exits.
PdfDocument.open(java.nio.file.Path.of("report.pdf")).use { doc ->
    val pages = runBlocking {
        // Each page is extracted in its own coroutine on Dispatchers.IO;
        // runBlocking returns once all of them have completed.
        val pageIndices = 0 until doc.pageCount()
        val deferredTexts = pageIndices.map { pageIndex ->
            async(Dispatchers.IO) { doc.extractText(pageIndex) }
        }
        deferredTexts.awaitAll()
    }
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.concurrent.*, duration.*
import scala.util.Using
import ExecutionContext.Implicits.global

Using.resource(PdfDocument.open("report.pdf")) { doc =>
  val pages = Future.traverse(0 until doc.pageCount())(i => Future(doc.extractText(i)))
  Await.result(pages, 60.seconds)
}

Ruby

require 'pdf_oxide'

PdfOxide::PdfDocument.open('report.pdf') do |doc|
  pages = (0...doc.page_count).map { |i| Thread.new { doc.extract_text(i) } }.map(&:value)
end

C++

#include <pdf_oxide/pdf_oxide.hpp>
#include <future>
#include <vector>

auto doc = pdf_oxide::Document::open("report.pdf");
std::vector<std::future<std::string>> tasks;
for (int i = 0; i < doc.page_count(); ++i)
    tasks.push_back(std::async(std::launch::async, [&doc, i] { return doc.extract_text(i); }));

std::vector<std::string> pages;
for (auto& t : tasks) pages.push_back(t.get());

Swift

import PdfOxide

let doc = try Document.open("report.pdf")
let pages = try await withThrowingTaskGroup(of: (Int, String).self) { group -> [String] in
    let n = try doc.pageCount()
    for i in 0..<n { group.addTask { (i, try doc.extractText(i)) } }
    var out = [String](repeating: "", count: n)
    for try await (i, t) in group { out[i] = t }
    return out
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

final doc = PdfDocument.open('report.pdf');
final pages = await Future.wait([
  for (var i = 0; i < doc.pageCount; i++) Future(() => doc.extractText(i)),
]);
doc.close();

R

library(pdfoxide)
library(parallel)

doc   <- pdf_open("report.pdf")
# seq_len(...) - 1L gives the 0-based page indices and is empty for a page
# count of 0; 0:(count - 1) would instead produce c(0, -1).
pages <- mclapply(seq_len(pdf_page_count(doc)) - 1L, function(i) pdf_extract_text(doc, i))

Julia

using PdfOxide

doc   = open_document("report.pdf")
n     = page_count(doc)
pages = Vector{String}(undef, n)
Threads.@threads for i in 0:(n - 1)
    pages[i + 1] = extract_text(doc, i)
end

Objective-C

#import "POXPdfOxide.h"
NSError *err = nil;

POXDocument *doc = [POXDocument openPath:@"report.pdf" error:&err];
NSInteger n = [doc pageCountError:&err];
NSMutableArray *pages = [NSMutableArray array];
for (NSInteger i = 0; i < n; i++) [pages addObject:[NSNull null]];

// Extract all pages concurrently on a global queue; dispatch_apply returns
// only after every iteration has finished.
dispatch_apply(n, dispatch_get_global_queue(0, 0), ^(size_t i) {
    NSError *e = nil;
    NSString *text = [doc extractText:i error:&e];
    // Storing nil in an NSMutableArray raises, so a failed page keeps its
    // NSNull placeholder.
    if (text != nil) {
        // NSMutableArray is not safe for concurrent mutation; serialize writes.
        @synchronized (pages) {
            pages[i] = text;
        }
    }
});

Elixir

{:ok, doc} = PdfOxide.open("report.pdf")
{:ok, n}   = PdfOxide.page_count(doc)

pages =
  0..(n - 1)
  |> Task.async_stream(fn i -> PdfOxide.extract_text(doc, i) end, ordered: true)
  |> Enum.map(fn {:ok, {:ok, text}} -> text end)

如果需要围绕 DocumentEditor 实现更细粒度的读写语义:

C#

var locker = new ReaderWriterLockSlim();

locker.EnterReadLock();
try
{
    string text = doc.ExtractText(0);
}
finally
{
    locker.ExitReadLock();
}

Java

import java.util.concurrent.locks.ReentrantReadWriteLock;

var lock = new ReentrantReadWriteLock();
lock.readLock().lock();
try {
    String text = doc.extractText(0);
} finally {
    lock.readLock().unlock();
}

Kotlin

import java.util.concurrent.locks.ReentrantReadWriteLock
import kotlin.concurrent.read

val lock = ReentrantReadWriteLock()
val text = lock.read { doc.extractText(0) }

Scala

import java.util.concurrent.locks.ReentrantReadWriteLock

val lock = ReentrantReadWriteLock()
lock.readLock().lock()
val text = try doc.extractText(0) finally lock.readLock().unlock()

Ruby

require 'pdf_oxide'

mutex = Mutex.new
text = mutex.synchronize { doc.extract_text(0) }

C++

#include <shared_mutex>

std::shared_mutex lock;
std::string text;
{
    std::shared_lock<std::shared_mutex> guard(lock); // shared (reader) lock
    text = doc.extract_text(0);
}

Julia

lock = ReentrantLock()
text = Base.@lock lock extract_text(doc, 0)

Objective-C

pthread_rwlock_t lock;
pthread_rwlock_init(&lock, NULL);

pthread_rwlock_rdlock(&lock);
NSError *e = nil;
NSString *text = [doc extractText:0 error:&e];
pthread_rwlock_unlock(&lock);

Node.js

PdfDocument 可通过转移底层句柄传递给 worker 线程。更简便的方式是让 *Async 方法负责调度:

Node.js

const { PdfDocument } = require("pdf-oxide");

// `await` is not allowed at the top level of a CommonJS module (this file
// uses require), so the work lives in an async function.
async function main() {
  const doc = new PdfDocument("report.pdf");
  try {
    const pageCount = doc.getPageCount();
    // Start every extraction at once and wait for all of them together.
    const pages = await Promise.all(
      Array.from({ length: pageCount }, (_, i) => doc.extractTextAsync(i))
    );
  } finally {
    // Close the document whether or not an extraction rejected.
    doc.close();
  }
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

Java

import fyi.oxide.pdf.PdfDocument;
import java.util.concurrent.*;
import java.util.stream.*;

ExecutorService pool = Executors.newWorkStealingPool();
try (PdfDocument doc = PdfDocument.open(java.nio.file.Path.of("report.pdf"))) {
    int pageCount = doc.pageCount();
    var futures = IntStream.range(0, pageCount)
        .mapToObj(i -> CompletableFuture.supplyAsync(() -> doc.extractText(i), pool))
        .toList();
    var pages = futures.stream().map(CompletableFuture::join).toList();
}

Kotlin

import fyi.oxide.pdf.PdfDocument
import kotlinx.coroutines.*

/**
 * Extracts the text of every page of [doc] concurrently.
 *
 * One child coroutine per page index is started on [Dispatchers.IO]; the
 * function suspends until all of them finish.
 *
 * @return the page texts, in page order.
 */
suspend fun pages(doc: PdfDocument): List<String> = coroutineScope {
    val pageIndices = 0 until doc.pageCount()
    val deferredTexts = pageIndices.map { pageIndex ->
        async(Dispatchers.IO) { doc.extractText(pageIndex) }
    }
    deferredTexts.awaitAll()
}

Scala

import fyi.oxide.pdf.PdfDocument
import scala.concurrent.*
import ExecutionContext.Implicits.global

def pages(doc: PdfDocument): Future[Seq[String]] =
  Future.traverse(0 until doc.pageCount())(i => Future(doc.extractText(i)))

Swift

import PdfOxide

func pages(_ doc: Document) async throws -> [String] {
    let n = try doc.pageCount()
    return try await withThrowingTaskGroup(of: (Int, String).self) { group in
        for i in 0..<n { group.addTask { (i, try doc.extractText(i)) } }
        var out = [String](repeating: "", count: n)
        for try await (i, t) in group { out[i] = t }
        return out
    }
}

Dart

import 'package:pdf_oxide/pdf_oxide.dart';

Future<List<String>> pages(PdfDocument doc) async => Future.wait([
  for (var i = 0; i < doc.pageCount; i++) Future(() => doc.extractText(i)),
]);

Elixir

def pages(doc) do
  {:ok, n} = PdfOxide.page_count(doc)
  0..(n - 1)
  |> Task.async_stream(fn i -> PdfOxide.extract_text(doc, i) end, ordered: true)
  |> Enum.map(fn {:ok, {:ok, text}} -> text end)
end

每个 *Async 调用均在 libuv 线程池上执行。

写操作串行化

写操作(DocumentEditor、Pdf、PdfCreator)不是无锁的。如果多个线程需要修改同一文档,应将所有变更汇聚到一个 writer goroutine/任务中处理,再将读操作分发出去。

常用模式:

  • 1 个读取用的 PdfDocument 在 N 个读线程之间共享。
  • 1 个 DocumentEditor 由单个协调任务独占,负责从 channel 或队列中收集编辑操作。

相关文档