离线 OCR 模型管理
PDF Oxide 的 OCR 功能依赖存放在本地缓存目录中的 ONNX 检测模型和识别模型。在 Docker 构建、CI 及离网/离线部署场景下,需要在首次 OCR 调用之前提前准备好这些模型——绝不能在请求时再去下载。PDF Oxide 为此提供了三个基础接口:
- prefetch_models — 将共享检测器以及各语言的识别模型和词典下载到模型缓存目录(构建时预置)。
- model_manifest — 无需网络即可获取的 JSON 清单,列出所有模型文件及其来源 URL,可用于在离网主机上镜像和校验文件。
- prefetch_available — 报告当前构建是否真正具备下载能力(即是否以 ocr feature 编译)。
缓存目录优先使用 $PDF_OXIDE_MODEL_DIR,若未设置则使用平台默认缓存(Linux 下为 ~/.cache/pdf_oxide/models)。文件就位后,OCR 可完全离线运行。
绑定覆盖范围。 模型预置功能在 Rust、Go、C#、Swift、PHP、Ruby、C++、Dart、R、Julia、Zig、Objective-C 和 Elixir 中均已暴露。
model_manifest 和 prefetch_available 在 WASM/JavaScript 中也有对应接口(在 WASM 环境下 prefetchAvailable() 始终返回 false——WASM 没有网络抓取器,请在宿主侧通过清单完成预置)。Python 和 Node N-API 绑定在 v0.3.69 中尚未暴露这些接口。
如何为离线使用预取 OCR 模型?
prefetch_models 接收逗号分隔的语言代码(为空时默认使用英语),将共享检测器及每种语言的识别模型和词典下载到缓存目录,并返回该目录路径。此操作幂等——已存在的文件会被跳过。
Rust
use pdf_oxide::extractors::auto::{AutoExtractor, OcrLanguage};
fn main() -> pdf_oxide::Result<()> {
// AutoExtractor::prefetch_models(langs: &[OcrLanguage])
// -> Result<std::path::PathBuf>
let dir = AutoExtractor::prefetch_models(&[
OcrLanguage::English,
OcrLanguage::Chinese,
OcrLanguage::Arabic,
])?;
println!("models cached in {}", dir.display());
// One-shot English (the common case):
let _ = AutoExtractor::prefetch_models_default()?;
Ok(())
}
Go
package main
import (
"fmt"
"log"
pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)
func main() {
if !pdfoxide.PrefetchAvailable() {
log.Fatal("this build cannot download models (built without the ocr feature)")
}
// func PrefetchModels(langs ...string) (string, error)
dir, err := pdfoxide.PrefetchModels("english", "chinese", "arabic")
if err != nil {
log.Fatal(err)
}
fmt.Println("models cached in", dir)
}
C#
using System;
using PdfOxide.Core;
if (!PdfDocument.PrefetchAvailable())
throw new InvalidOperationException("built without the ocr feature; cannot download models");
// static string PdfDocument.PrefetchModels(params string[] languages)
string dir = PdfDocument.PrefetchModels("english", "chinese", "arabic");
Console.WriteLine($"models cached in {dir}");
Swift
import PdfOxide
guard PdfOxide.prefetchAvailable() == 1 else {
fatalError("built without the ocr feature; cannot download models")
}
// static func prefetchModels(languagesCsv: String) throws -> String
let dir = try PdfOxide.prefetchModels(languagesCsv: "english,chinese,arabic")
print("models cached in \(dir)")
PHP
use PdfOxide\Pdf;
if (!Pdf::prefetchAvailable()) {
throw new RuntimeException('built without the ocr feature; cannot download models');
}
// static Pdf::prefetchModels(array $languages): string
$dir = Pdf::prefetchModels(['english', 'chinese', 'arabic']);
echo "models cached in {$dir}\n";
Ruby
require 'pdf_oxide'
unless PdfOxide::Pdf.prefetch_available?
raise 'built without the ocr feature; cannot download models'
end
# PdfOxide::Pdf.prefetch_models(languages) -> String (cache dir)
dir = PdfOxide::Pdf.prefetch_models(%w[english chinese arabic])
puts "models cached in #{dir}"
C++
#include <pdf_oxide/pdf_oxide.hpp>
#include <iostream>
if (pdf_oxide::prefetch_available() == 0)
throw std::runtime_error("built without the ocr feature; cannot download models");
// std::string pdf_oxide::prefetch_models(const std::string& languages_csv)
auto dir = pdf_oxide::prefetch_models("english,chinese,arabic");
std::cout << "models cached in " << dir << "\n";
Dart
import 'package:pdf_oxide/pdf_oxide.dart' as pdf_oxide;
if (pdf_oxide.prefetchAvailable() == 0) {
throw StateError('built without the ocr feature; cannot download models');
}
// String prefetchModels(String languagesCsv)
final dir = pdf_oxide.prefetchModels('english,chinese,arabic');
print('models cached in $dir');
R
library(pdfoxide)
if (pdf_prefetch_available() == 0)
stop("built without the ocr feature; cannot download models")
# pdf_prefetch_models(languages_csv = NULL) -> cache directory path
dir <- pdf_prefetch_models("english,chinese,arabic")
cat("models cached in", dir, "\n")
Julia
using PdfOxide
prefetch_available() != 0 || error("built without the ocr feature; cannot download models")
# prefetch_models(languages_csv::AbstractString) -> cache directory path
dir = prefetch_models("english,chinese,arabic")
println("models cached in ", dir)
Zig
const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;
if (pdf_oxide.prefetchAvailable() == 0) return error.OcrFeatureMissing;
// prefetchModels(alloc, languages_csv) -> []u8 (cache dir; caller frees)
const dir = try pdf_oxide.prefetchModels(a, "english,chinese,arabic");
defer a.free(dir);
std.debug.print("models cached in {s}\n", .{dir});
Objective-C
#import "POXPdfOxide.h"
NSError *err = nil;
if ([POXModels prefetchAvailable] <= 0) {
@throw [NSException exceptionWithName:@"PdfOxide" reason:@"no ocr feature" userInfo:nil];
}
// + prefetchModels:error: returns a status JSON (nil on error)
NSString *status = [POXModels prefetchModels:@"english,chinese,arabic" error:&err];
NSLog(@"prefetch status: %@", status);
Elixir
unless PdfOxide.prefetch_available() != 0 do
raise "built without the ocr feature; cannot download models"
end
# prefetch_models(languages_csv \\ "") -> JSON status string
status = PdfOxide.prefetch_models("english,chinese,arabic")
IO.puts("prefetch status: #{status}")
语言代码
prefetch_models 支持以下代码(未知代码会被跳过;输入为空时默认使用英语):
english, chinese, chinese_cht, japan, korean, arabic, cyrillic, latin, devanagari, ta(泰米尔语), te(泰卢固语), ka(卡纳达语)
如何在离网主机上预置模型?
在没有互联网连接的机器(或没有网络抓取器的 WASM 目标)上,无法调用 prefetch_models。此时应读取 model_manifest——这是一份静态的、无需网络的 JSON,列出所有模型文件及其上游 URL。通过制品仓库镜像这些 URL,再将文件放入 $PDF_OXIDE_MODEL_DIR。
Rust
use pdf_oxide::extractors::auto::AutoExtractor;
fn main() {
// AutoExtractor::model_manifest() -> String (JSON, never errors)
let manifest = AutoExtractor::model_manifest();
println!("{manifest}");
}
Go
// func ModelManifest() string (JSON, never errors)
fmt.Println(pdfoxide.ModelManifest())
C#
// static string PdfDocument.ModelManifest()
Console.WriteLine(PdfDocument.ModelManifest());
Swift
// static func modelManifest() -> String (JSON)
print(PdfOxide.modelManifest())
JavaScript (WASM)
import init, { modelManifest, prefetchAvailable } from "pdf-oxide-wasm";
await init();
// prefetchAvailable() is always false in WASM — provision host-side.
console.log("can download here?", prefetchAvailable()); // false
console.log(modelManifest()); // JSON manifest
PHP
use PdfOxide\FFI\FunctionBindings;
// (new FunctionBindings())->pdfOxideModelManifest(): string (JSON, never errors)
$manifest = (new FunctionBindings())->pdfOxideModelManifest();
echo $manifest, "\n";
C++
#include <pdf_oxide/pdf_oxide.hpp>
#include <iostream>
// std::string pdf_oxide::model_manifest() (JSON, never errors)
std::cout << pdf_oxide::model_manifest() << "\n";
Dart
import 'package:pdf_oxide/pdf_oxide.dart' as pdf_oxide;
// String modelManifest() (JSON)
print(pdf_oxide.modelManifest());
R
library(pdfoxide)
# pdf_model_manifest() -> JSON string
cat(pdf_model_manifest(), "\n")
Julia
using PdfOxide
# model_manifest() -> JSON String
println(model_manifest())
Zig
const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;
// modelManifest(alloc) -> []u8 (JSON; caller frees)
const manifest = try pdf_oxide.modelManifest(a);
defer a.free(manifest);
std.debug.print("{s}\n", .{manifest});
Objective-C
#import "POXPdfOxide.h"
NSError *err = nil;
// + manifestWithError: -> JSON string (nil on error)
NSString *manifest = [POXModels manifestWithError:&err];
NSLog(@"%@", manifest);
Elixir
# model_manifest() -> JSON string
IO.puts(PdfOxide.model_manifest())
清单结构示例
{
"detector": {
"file": "det.onnx",
"url": "https://.../det.onnx"
},
"languages": [
{
"language": "english",
"rec_file": "rec.onnx",
"dict_file": "en_dict.txt",
"rec_url": "https://.../rec.onnx",
"dict_url": "https://.../en_dict.txt"
}
],
"note": "Hebrew has no upstream PaddleOCR recognition model; the loader is ready if one is provided."
}
镜像 detector.url 以及各语言的 rec_url / dict_url,然后将 det.onnx 和每个 rec_file / dict_file 放入 $PDF_OXIDE_MODEL_DIR。之后 OCR 即可零网络访问运行。
当前构建是否支持下载模型?
prefetch_available 报告原生库是否以 ocr feature(含 HTTP 抓取器)编译。若返回 false,prefetch_models 仍会创建缓存目录但不会执行任何下载——请在依赖下载之前先行检查。
Rust
use pdf_oxide::extractors::auto::AutoExtractor;
// AutoExtractor::prefetch_available() -> bool
if AutoExtractor::prefetch_available() {
let _ = AutoExtractor::prefetch_models_default();
} else {
eprintln!("OCR feature not compiled in — provision via model_manifest()");
}
Go — pdfoxide.PrefetchAvailable() bool
C# — PdfDocument.PrefetchAvailable() -> bool
Swift — PdfOxide.prefetchAvailable() -> Int32(1 表示可用)
Dockerfile 示例
在构建阶段将模型内置到镜像中,使运行时容器无需任何网络访问:
FROM rust:1 AS models
WORKDIR /app
COPY . .
# Build the CLI / your binary with the `ocr` feature, then prefetch.
ENV PDF_OXIDE_MODEL_DIR=/models
RUN cargo run --features ocr --bin prefetch -- english chinese
FROM debian:stable-slim
ENV PDF_OXIDE_MODEL_DIR=/models
COPY --from=models /models /models
# OCR now runs fully offline against /models
全局引擎配置
有两个进程级别的全局设置项用于调优提取引擎。两者均通过 C-ABI 绑定暴露,均返回上一个值,且没有错误通道(不会失败)。由于是进程全局设置,在某个线程上修改会影响所有并发提取操作。
如何提高内容流操作符上限?
PDF Oxide 对每个流的内容流操作符数量设有上限(默认 1,000,000),以限制恶意输入的处理成本。大型合法技术 PDF(教科书、ISO 标准等)可能超出此上限。set_max_ops_per_stream 可调高(或调低)该上限,并返回之前的值。
Rust
// pdf_oxide::content::parser::set_max_ops_per_stream(limit: Option<usize>)
// -> Option<usize> (None restores the 1,000,000 default)
use pdf_oxide::content::parser::set_max_ops_per_stream;
let prev = set_max_ops_per_stream(Some(5_000_000));
// ... extract a huge trusted PDF ...
set_max_ops_per_stream(prev); // restore
Go
// func SetMaxOpsPerStream(limit int64) int64 (returns previous cap)
prev := pdfoxide.SetMaxOpsPerStream(5_000_000)
defer pdfoxide.SetMaxOpsPerStream(prev)
C#
// static long CAbi.SetMaxOpsPerStream(long limit) (returns previous cap)
long prev = PdfOxide.Core.CAbi.SetMaxOpsPerStream(5_000_000);
try { /* extract huge trusted PDF */ }
finally { PdfOxide.Core.CAbi.SetMaxOpsPerStream(prev); }
Swift
// static func setMaxOpsPerStream(_ limit: Int64) -> Int64
let prev = PdfOxide.setMaxOpsPerStream(5_000_000)
defer { _ = PdfOxide.setMaxOpsPerStream(prev) }
PHP
use PdfOxide\FFI\FunctionBindings;
$bindings = new FunctionBindings();
// pdfOxideSetMaxOpsPerStream(int $limit): int (returns previous cap; -1 = default was active)
$prev = $bindings->pdfOxideSetMaxOpsPerStream(5_000_000);
try { /* extract huge trusted PDF */ }
finally { $bindings->pdfOxideSetMaxOpsPerStream($prev); }
Ruby
require 'pdf_oxide'
# PdfOxide.set_max_ops_per_stream(limit) -> previous cap (-1 = default was active)
prev = PdfOxide.set_max_ops_per_stream(5_000_000)
begin
# ... extract a huge trusted PDF ...
ensure
PdfOxide.set_max_ops_per_stream(prev)
end
C++
#include <pdf_oxide/pdf_oxide.hpp>
// std::int64_t pdf_oxide::set_max_ops_per_stream(std::int64_t limit) -> previous cap
auto prev = pdf_oxide::set_max_ops_per_stream(5'000'000);
// ... extract a huge trusted PDF ...
pdf_oxide::set_max_ops_per_stream(prev); // restore
Dart
import 'package:pdf_oxide/pdf_oxide.dart' as pdf_oxide;
// int setMaxOpsPerStream(int limit) -> previous cap
final prev = pdf_oxide.setMaxOpsPerStream(5000000);
// ... extract a huge trusted PDF ...
pdf_oxide.setMaxOpsPerStream(prev); // restore
R
library(pdfoxide)
# pdf_set_max_ops_per_stream(limit) -> previous cap (negative limit restores default)
prev <- pdf_set_max_ops_per_stream(5000000)
# ... extract a huge trusted PDF ...
pdf_set_max_ops_per_stream(prev) # restore
Julia
using PdfOxide
# set_max_ops_per_stream(limit::Integer) -> previous cap
prev = set_max_ops_per_stream(5_000_000)
# ... extract a huge trusted PDF ...
set_max_ops_per_stream(prev) # restore
Zig
const pdf_oxide = @import("pdf_oxide");
// setMaxOpsPerStream(limit: i64) i64 (returns previous cap)
const prev = pdf_oxide.setMaxOpsPerStream(5_000_000);
// ... extract a huge trusted PDF ...
_ = pdf_oxide.setMaxOpsPerStream(prev); // restore
Objective-C
#import "POXPdfOxide.h"
// + setMaxOpsPerStream: -> previous cap
int64_t prev = [POXConfig setMaxOpsPerStream:5000000];
// ... extract a huge trusted PDF ...
[POXConfig setMaxOpsPerStream:prev]; // restore
Elixir
# set_max_ops_per_stream(limit) -> previous cap (-1 = default was active)
prev = PdfOxide.set_max_ops_per_stream(5_000_000)
# ... extract a huge trusted PDF ...
PdfOxide.set_max_ops_per_stream(prev)
在 C ABI 层,pdf_oxide_set_max_ops_per_stream(limit) 将负数 limit 视为"恢复默认值",并在之前处于默认值时返回 -1。
如何保留未映射的(U+FFFD)字形?
默认情况下,高级访问器(extract_text / extract_words / extract_spans)会过滤掉没有 Unicode 映射的字形(它们会以 U+FFFD � 的形式出现)。若某一页面上所有可见字形都映射到 U+FFFD——例如 MSAM10 这类数学符号字体——则可能产生空输出。调用 set_preserve_unmapped_glyphs(true) 后,这些访问器会保留替换字符,以便进一步后处理;该函数返回之前的设置值。
Rust
// pdf_oxide::extractors::text::set_preserve_unmapped_glyphs(preserve: bool)
// -> bool (returns previous value)
use pdf_oxide::extractors::text::set_preserve_unmapped_glyphs;
let prev = set_preserve_unmapped_glyphs(true);
// ... extract a math-heavy PDF; U+FFFD glyphs are now kept ...
set_preserve_unmapped_glyphs(prev);
Go
// func SetPreserveUnmappedGlyphs(preserve int) int (1 = preserve; returns previous)
prev := pdfoxide.SetPreserveUnmappedGlyphs(1)
defer pdfoxide.SetPreserveUnmappedGlyphs(prev)
C#
// static int CAbi.SetPreserveUnmappedGlyphs(bool preserve) (returns previous, 0/1)
int prev = PdfOxide.Core.CAbi.SetPreserveUnmappedGlyphs(true);
try { /* extract math-heavy PDF */ }
finally { PdfOxide.Core.CAbi.SetPreserveUnmappedGlyphs(prev != 0); }
Swift
// static func setPreserveUnmappedGlyphs(_ preserve: Int32) -> Int32
let prev = PdfOxide.setPreserveUnmappedGlyphs(1)
defer { _ = PdfOxide.setPreserveUnmappedGlyphs(prev) }
PHP
use PdfOxide\FFI\FunctionBindings;
$bindings = new FunctionBindings();
// pdfOxideSetPreserveUnmappedGlyphs(int $preserve): int (1 = preserve; returns previous, 0/1)
$prev = $bindings->pdfOxideSetPreserveUnmappedGlyphs(1);
try { /* extract math-heavy PDF */ }
finally { $bindings->pdfOxideSetPreserveUnmappedGlyphs($prev); }
Ruby
require 'pdf_oxide'
# PdfOxide.set_preserve_unmapped_glyphs(preserve) -> previous value (0 or 1)
prev = PdfOxide.set_preserve_unmapped_glyphs(true)
begin
# ... extract a math-heavy PDF; U+FFFD glyphs are now kept ...
ensure
PdfOxide.set_preserve_unmapped_glyphs(prev)
end
C++
#include <pdf_oxide/pdf_oxide.hpp>
// int pdf_oxide::set_preserve_unmapped_glyphs(int preserve) -> previous value
int prev = pdf_oxide::set_preserve_unmapped_glyphs(1);
// ... extract a math-heavy PDF; U+FFFD glyphs are now kept ...
pdf_oxide::set_preserve_unmapped_glyphs(prev); // restore
Dart
import 'package:pdf_oxide/pdf_oxide.dart' as pdf_oxide;
// int setPreserveUnmappedGlyphs(int preserve) -> previous value
final prev = pdf_oxide.setPreserveUnmappedGlyphs(1);
// ... extract a math-heavy PDF; U+FFFD glyphs are now kept ...
pdf_oxide.setPreserveUnmappedGlyphs(prev); // restore
R
library(pdfoxide)
# pdf_set_preserve_unmapped_glyphs(preserve) -> previous value (0 or 1)
prev <- pdf_set_preserve_unmapped_glyphs(1L)
# ... extract a math-heavy PDF; U+FFFD glyphs are now kept ...
pdf_set_preserve_unmapped_glyphs(prev) # restore
Julia
using PdfOxide
# set_preserve_unmapped_glyphs(preserve::Integer) -> previous value (0 or 1)
prev = set_preserve_unmapped_glyphs(1)
# ... extract a math-heavy PDF; U+FFFD glyphs are now kept ...
set_preserve_unmapped_glyphs(prev) # restore
Zig
const pdf_oxide = @import("pdf_oxide");
// setPreserveUnmappedGlyphs(preserve: bool) i32 (returns previous value)
const prev = pdf_oxide.setPreserveUnmappedGlyphs(true);
// ... extract a math-heavy PDF; U+FFFD glyphs are now kept ...
_ = pdf_oxide.setPreserveUnmappedGlyphs(prev != 0); // restore
Objective-C
#import "POXPdfOxide.h"
// + setPreserveUnmappedGlyphs: -> previous value (0 or 1)
int32_t prev = [POXConfig setPreserveUnmappedGlyphs:1];
// ... extract a math-heavy PDF; U+FFFD glyphs are now kept ...
[POXConfig setPreserveUnmappedGlyphs:prev]; // restore
Elixir
# set_preserve_unmapped_glyphs(preserve) -> previous value (0 or 1)
prev = PdfOxide.set_preserve_unmapped_glyphs(1)
# ... extract a math-heavy PDF; U+FFFD glyphs are now kept ...
PdfOxide.set_preserve_unmapped_glyphs(prev)
在 C ABI 层,pdf_oxide_set_preserve_unmapped_glyphs(preserve) 接受 1(保留)或 0(过滤),并以 0 或 1 返回之前的值。
常见问题
OCR 模型存储在哪里?
若设置了 $PDF_OXIDE_MODEL_DIR 则存于该路径,否则存于平台缓存(Linux 下为 ~/.cache/pdf_oxide/models)。该路径也是 prefetch_models 的返回值。
可以重复调用 prefetch_models 吗?
可以——它是幂等的。已存在的文件会被跳过,因此每次启动时作为保障措施调用也无妨。
为什么即使调用了预取,prefetch_available 仍返回 false?
该构建在编译时未启用 ocr feature,因此不含 HTTP 抓取器。prefetch_models 会创建缓存目录但不下载任何内容——请通过 model_manifest 手动提供文件。
全局设置项需要重置吗?
它们是进程全局设置,在被修改之前一直有效。如果只想对特定文档启用覆盖,请用设置项返回的上一个值来恢复。两个设置项均不会失败,也没有错误通道。
相关页面
- 扫描 PDF 的 OCR 处理 — 模型就位后如何驱动 OCR
- 页面分类 — 判断哪些页面真正需要 OCR
- 日志与调试输出 — 其他进程全局库设置
- 从 PDF 提取文本 — set_preserve_unmapped_glyphs 影响的高级访问器