Пакетная обработка PDF на Python
Обработка каталога PDF с обработкой ошибок:
Python
from pathlib import Path

from pdf_oxide import PdfDocument, PdfError

# Walk every PDF in documents/ and report the size of its first page's text.
source_dir = Path("documents/")
for candidate in source_dir.glob("*.pdf"):
    try:
        first_page = PdfDocument(str(candidate)).extract_text(0)
    except PdfError as exc:
        # A broken file is reported and skipped; the batch keeps going.
        print(f"Failed: {candidate.name}: {exc}")
    else:
        print(f"{candidate.name}: {len(first_page)} chars")
WASM
import { WasmPdfDocument } from "pdf-oxide-wasm";

// Process multiple PDF buffers. `pdfFiles` is not defined in this snippet;
// it is iterated as a sequence of { name, bytes } records.
for (const { name, bytes } of pdfFiles) {
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const text = doc.extractText(0);
    console.log(`${name}: ${text.length} chars`);
  } catch (e) {
    console.error(`Failed: ${name}: ${e.message}`);
  } finally {
    // Free the document even when extraction throws; previously free() was
    // skipped on the error path.
    if (doc) doc.free();
  }
}
Rust
use pdf_oxide::PdfDocument;
use std::path::Path;

// Print the first-page text length of every *.pdf in documents/.
// The `?` operators on directory access assume an enclosing function that returns a Result.
for entry in std::fs::read_dir("documents/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |e| e == "pdf") {
        continue;
    }
    // Report (rather than panic on) a file name that is not valid UTF-8.
    let path_str = match path.to_str() {
        Some(s) => s,
        None => {
            println!("Failed: {}: path is not valid UTF-8", path.display());
            continue;
        }
    };
    match PdfDocument::open(path_str) {
        // Handle an extraction failure per file; using `?` here would abort
        // the whole batch on the first unreadable page.
        Ok(mut doc) => match doc.extract_text(0) {
            Ok(text) => println!("{}: {} chars", path.display(), text.len()),
            Err(e) => println!("Failed: {}: {}", path.display(), e),
        },
        Err(e) => println!("Failed: {}: {}", path.display(), e),
    }
}
Go
package main

import (
	"fmt"
	"log"
	"path/filepath"

	pdfoxide "github.com/yfedoseev/pdf_oxide/go"
)

// main prints the first-page text length of every PDF in documents/.
func main() {
	// Glob can only fail on a malformed pattern, and this pattern is constant.
	matches, _ := filepath.Glob("documents/*.pdf")
	for _, p := range matches {
		doc, err := pdfoxide.Open(p)
		if err != nil {
			log.Printf("Failed: %s: %v", p, err)
			continue
		}
		text, err := doc.ExtractText(0)
		// Close before inspecting err so the document is released on both paths.
		doc.Close()
		if err != nil {
			// Previously this error was discarded and "0 chars" was printed.
			log.Printf("Failed: %s: %v", p, err)
			continue
		}
		fmt.Printf("%s: %d chars\n", filepath.Base(p), len(text))
	}
}
C#
using PdfOxide;

// Print the first-page text length for each PDF directly inside documents/.
string[] pdfPaths = Directory.GetFiles("documents/", "*.pdf");
foreach (string pdfPath in pdfPaths)
{
    try
    {
        // The using block disposes the document once the page has been read.
        using (var document = PdfDocument.Open(pdfPath))
        {
            var firstPage = document.ExtractText(0);
            Console.WriteLine($"{Path.GetFileName(pdfPath)}: {firstPage.Length} chars");
        }
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine($"Failed: {pdfPath}: {ex.Message}");
    }
}
Java
import fyi.oxide.pdf.PdfDocument;
import java.io.File;
for (File f : new File("documents/").listFiles((d, n) -> n.endsWith(".pdf"))) {
try (PdfDocument doc = PdfDocument.open(f.toPath())) {
String text = doc.extractText(0);
System.out.printf("%s: %d chars%n", f.getName(), text.length());
} catch (Exception e) {
System.out.printf("Failed: %s: %s%n", f.getName(), e.getMessage());
}
}
Kotlin
import fyi.oxide.pdf.PdfDocument
import java.io.File

// Report the first-page text length for every PDF in documents/.
// listFiles may return null; orEmpty() turns that into "nothing to do".
val pdfFiles = File("documents/").listFiles { _, fileName -> fileName.endsWith(".pdf") }
for (pdf in pdfFiles.orEmpty()) {
    try {
        PdfDocument.open(pdf.toPath()).use { document ->
            val firstPage = document.extractText(0)
            println("${pdf.name}: ${firstPage.length} chars")
        }
    } catch (ex: Exception) {
        println("Failed: ${pdf.name}: ${ex.message}")
    }
}
Scala
import fyi.oxide.pdf.PdfDocument
import scala.util.{Failure, Success, Using}
import java.io.File

// Print the first-page text length of every PDF in documents/.
// listFiles returns null when the directory is missing; Option(...) maps that to "no files".
val pdfs = Option(new File("documents/").listFiles((_, n) => n.endsWith(".pdf")))
  .getOrElse(Array.empty[File])
for (f <- pdfs) {
  // Using(...) closes the document and captures any exception as a Failure,
  // so one unreadable PDF is reported instead of aborting the loop.
  Using(PdfDocument.open(f.getPath)) { doc =>
    doc.extractText(0)
  } match {
    case Success(text) => println(s"${f.getName}: ${text.length} chars")
    case Failure(e)    => println(s"Failed: ${f.getName}: ${e.getMessage}")
  }
}
Clojure
(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])

;; Print the first-page text length of each PDF in documents/.
(doseq [f (.listFiles (io/file "documents/"))
        :let [fname (.getName f)]
        :when (.endsWith fname ".pdf")]
  (try
    (with-open [doc (pdf/open (.getPath f))]
      (let [page-text (pdf/extract-text doc 0)]
        (println (str fname ": " (count page-text) " chars"))))
    (catch Exception e
      ;; Report the failure and move on to the next file.
      (println (str "Failed: " fname ": " (.getMessage e))))))
Ruby
require 'pdf_oxide'

# Print the first-page text length of every PDF in documents/.
Dir['documents/*.pdf'].each do |pdf_path|
  label = File.basename(pdf_path)
  begin
    PdfOxide::PdfDocument.open(pdf_path) do |document|
      first_page = document.extract_text(0)
      puts "#{label}: #{first_page.length} chars"
    end
  rescue PdfOxide::Error => err
    # Report the failure and continue with the next file.
    puts "Failed: #{label}: #{err.message}"
  end
end
PHP
use PdfOxide\PdfDocument;

// Print the first-page text length of every PDF in documents/.
// glob() returns false on error; `?: []` keeps foreach from iterating a boolean.
foreach (glob('documents/*.pdf') ?: [] as $path) {
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $text = $doc->extractText(0);
        printf("%s: %d chars\n", basename($path), strlen($text));
    } catch (\Throwable $e) {
        printf("Failed: %s: %s\n", basename($path), $e->getMessage());
    } finally {
        // Close the document even when extraction throws; previously close()
        // was skipped on the error path.
        if ($doc !== null) {
            $doc->close();
        }
    }
}
C++
#include <pdf_oxide/pdf_oxide.hpp>
#include <filesystem>
#include <iostream>
#include <string>
namespace fs = std::filesystem;

// Print the first-page text length of every PDF in documents/.
for (const auto& entry : fs::directory_iterator("documents/")) {
    if (entry.path().extension() != ".pdf") continue;
    // Streaming an fs::path writes it wrapped in double quotes; convert to
    // std::string so the name is printed as-is.
    const std::string name = entry.path().filename().string();
    try {
        auto doc = pdf_oxide::Document::open(entry.path().string());
        auto text = doc.extract_text(0);
        std::cout << name << ": " << text.size() << " chars\n";
    } catch (const pdf_oxide::Error& e) {
        std::cerr << "Failed: " << name << ": " << e.what() << "\n";
    }
}
Swift
import PdfOxide
import Foundation

// Print the first-page text length of every PDF in documents/.
let dir = "documents/"
let pdfNames = try FileManager.default.contentsOfDirectory(atPath: dir)
    .filter { $0.hasSuffix(".pdf") }
for name in pdfNames {
    do {
        let document = try Document.open(dir + name)
        let firstPage = try document.extractText(0)
        print("\(name): \(firstPage.count) chars")
    } catch {
        // Report the failure and continue with the next file.
        print("Failed: \(name): \(error)")
    }
}
Dart
import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Print the first-page text length of every PDF in documents/.
for (final entry in Directory('documents/').listSync()) {
  if (!entry.path.endsWith('.pdf')) continue;
  try {
    final doc = PdfDocument.open(entry.path);
    try {
      final text = doc.extractText(0);
      print('${entry.uri.pathSegments.last}: ${text.length} chars');
    } finally {
      // Close even when extraction throws; previously close() was skipped
      // on the error path.
      doc.close();
    }
  } catch (e) {
    print('Failed: ${entry.path}: $e');
  }
}
R
library(pdfoxide)

# Print the first-page text length of every PDF in documents/.
pdf_paths <- Sys.glob("documents/*.pdf")
for (pdf_path in pdf_paths) {
  label <- basename(pdf_path)
  tryCatch(
    {
      doc <- pdf_open(pdf_path)
      first_page <- pdf_extract_text(doc, 0)
      cat(sprintf("%s: %d chars\n", label, nchar(first_page)))
    },
    error = function(err) {
      # Report the failure and continue with the next file.
      cat(sprintf("Failed: %s: %s\n", label, conditionMessage(err)))
    }
  )
}
Julia
using PdfOxide

# Print the first-page text length of every PDF in documents/.
pdf_paths = [p for p in readdir("documents/"; join = true) if endswith(p, ".pdf")]
for pdf_path in pdf_paths
    label = basename(pdf_path)
    try
        doc = open_document(pdf_path)
        first_page = extract_text(doc, 0)
        println("$(label): $(length(first_page)) chars")
    catch err
        # Report the failure and continue with the next file.
        println("Failed: $(label): $err")
    end
end
Zig
const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

// Print the first-page text length of every PDF in documents/.
const a = std.heap.page_allocator;
var dir = try std.fs.cwd().openDir("documents/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "documents/", entry.name });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch |e| {
        std.debug.print("Failed: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer doc.deinit();
    // Report an extraction failure per file; `try` here would abort the
    // whole loop on the first page that cannot be read.
    const text = doc.extractText(a, 0) catch |e| {
        std.debug.print("Failed: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer a.free(text);
    std.debug.print("{s}: {d} chars\n", .{ entry.name, text.len });
}
Objective-C
#import "POXPdfOxide.h"

// Print the first-page text length of every PDF in documents/.
NSFileManager *fm = [NSFileManager defaultManager];
for (NSString *name in [fm contentsOfDirectoryAtPath:@"documents/" error:nil]) {
    if (![name hasSuffix:@".pdf"]) continue;
    NSError *err = nil;
    NSString *path = [@"documents/" stringByAppendingString:name];
    POXDocument *doc = [POXDocument openPath:path error:&err];
    if (!doc) { NSLog(@"Failed: %@: %@", name, err.localizedDescription); continue; }
    NSString *text = [doc extractText:0 error:&err];
    // Check the return value: without this, a nil result was logged as a
    // successful "0 chars" because messaging nil yields zero.
    if (!text) { NSLog(@"Failed: %@: %@", name, err.localizedDescription); continue; }
    NSLog(@"%@: %lu chars", name, (unsigned long)text.length);
}
Elixir
# Print the first-page text length of every PDF in documents/.
Path.wildcard("documents/*.pdf")
|> Enum.each(fn path ->
  # `with` sends a failure from either step to the else branch; the previous
  # `{:ok, text} = ...` match raised MatchError when extraction failed.
  with {:ok, doc} <- PdfOxide.open(path),
       {:ok, text} <- PdfOxide.extract_text(doc, 0) do
    IO.puts("#{Path.basename(path)}: #{String.length(text)} chars")
  else
    {:error, reason} ->
      IO.puts("Failed: #{Path.basename(path)}: #{inspect(reason)}")
  end
end)
При скорости 0,8 мс на страницу PDF Oxide обрабатывает 3830 документов за 3,1 секунды.
Установка
pip install pdf_oxide
Последовательная обработка
Извлечение текста из всех PDF
Python
from pathlib import Path

from pdf_oxide import PdfDocument, PdfError

# Collect the full text of every invoice PDF, keyed by file name.
pdf_dir = Path("invoices/")
results = {}
for pdf_path in sorted(pdf_dir.glob("*.pdf")):
    try:
        doc = PdfDocument(str(pdf_path))
        # Pages are joined with a newline between them.
        results[pdf_path.name] = "\n".join(
            doc.extract_text(page_no) for page_no in range(doc.page_count())
        )
    except PdfError as exc:
        print(f"Error: {pdf_path.name}: {exc}")

print(f"Processed {len(results)} PDFs")
WASM
// Collect the full text of each PDF buffer, keyed by name.
// `pdfFiles` and `WasmPdfDocument` are expected to be in scope already.
const results = new Map();
for (const { name, bytes } of pdfFiles) {
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    results.set(name, doc.extractAllText());
  } catch (e) {
    console.error(`Error: ${name}: ${e.message}`);
  } finally {
    // Free the document even when extraction throws.
    if (doc) doc.free();
  }
}
console.log(`Processed ${results.size} PDFs`);
Rust
use std::collections::HashMap;

// Collect the full text of every invoice PDF, keyed by its displayed path.
// The `?` operators assume an enclosing function that returns a Result.
let mut results: HashMap<String, String> = HashMap::new();
for entry in std::fs::read_dir("invoices/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |e| e == "pdf") {
        continue;
    }
    let name = path.display().to_string();
    // Report (rather than panic on) a file name that is not valid UTF-8.
    let path_str = match path.to_str() {
        Some(s) => s,
        None => {
            eprintln!("Error: {}: path is not valid UTF-8", name);
            continue;
        }
    };
    let mut doc = match PdfDocument::open(path_str) {
        Ok(doc) => doc,
        Err(e) => {
            // Previously a file that failed to open was skipped without any message.
            eprintln!("Error: {}: {}", name, e);
            continue;
        }
    };
    let mut pages = Vec::new();
    for i in 0..doc.page_count().unwrap_or(0) {
        match doc.extract_text(i) {
            Ok(text) => pages.push(text),
            // Keep the remaining pages, but say which page was dropped.
            Err(e) => eprintln!("Error: {}: page {}: {}", name, i, e),
        }
    }
    results.insert(name, pages.join("\n"));
}
println!("Processed {} PDFs", results.len());
Go
// Collect the full text of every invoice PDF, keyed by file name.
results := make(map[string]string)
// Glob can only fail on a malformed pattern, and this pattern is constant.
matches, _ := filepath.Glob("invoices/*.pdf")
sort.Strings(matches)
for _, p := range matches {
	doc, err := pdfoxide.Open(p)
	if err != nil {
		log.Printf("Error: %s: %v", p, err)
		continue
	}
	full, err := doc.ExtractAllText()
	doc.Close()
	if err != nil {
		// Do not record an empty string as if the file had been processed.
		log.Printf("Error: %s: %v", p, err)
		continue
	}
	results[filepath.Base(p)] = full
}
fmt.Printf("Processed %d PDFs\n", len(results))
C#
// Collect the full text of every invoice PDF, keyed by file name.
var results = new Dictionary<string, string>();
var orderedPaths = Directory.GetFiles("invoices/", "*.pdf").OrderBy(path => path);
foreach (var pdfPath in orderedPaths)
{
    try
    {
        using var document = PdfDocument.Open(pdfPath);
        var buffer = new StringBuilder();
        var pageIndex = 0;
        while (pageIndex < document.PageCount)
        {
            // AppendLine terminates every page with a line break.
            buffer.AppendLine(document.ExtractText(pageIndex));
            pageIndex++;
        }
        results[Path.GetFileName(pdfPath)] = buffer.ToString();
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine($"Error: {pdfPath}: {ex.Message}");
    }
}
Console.WriteLine($"Processed {results.Count} PDFs");
Java
import fyi.oxide.pdf.PdfDocument;
import java.io.File;
import java.util.*;

// Collect the full text of every invoice PDF, keyed by file name.
Map<String, String> results = new HashMap<>();
File[] files = new File("invoices/").listFiles((d, n) -> n.endsWith(".pdf"));
if (files == null) {
    // listFiles returns null when the directory is missing or unreadable;
    // Arrays.sort(null) would throw NullPointerException.
    files = new File[0];
}
Arrays.sort(files);
for (File f : files) {
    try (PdfDocument doc = PdfDocument.open(f.toPath())) {
        StringBuilder all = new StringBuilder();
        for (int i = 0; i < doc.pageCount(); i++) {
            // Separate pages with a newline so the end of one page does not
            // run straight into the start of the next.
            if (i > 0) all.append('\n');
            all.append(doc.extractText(i));
        }
        results.put(f.getName(), all.toString());
    } catch (Exception e) {
        System.err.printf("Error: %s: %s%n", f.getName(), e.getMessage());
    }
}
System.out.printf("Processed %d PDFs%n", results.size());
Kotlin
import fyi.oxide.pdf.PdfDocument
import java.io.File

// Collect the full text of every invoice PDF, keyed by file name.
val results = mutableMapOf<String, String>()
File("invoices/").listFiles { _, n -> n.endsWith(".pdf") }?.sorted()?.forEach { f ->
    try {
        PdfDocument.open(f.toPath()).use { doc ->
            // Join pages with a newline so the end of one page does not run
            // straight into the start of the next.
            val all = (0 until doc.pageCount()).joinToString("\n") { doc.extractText(it) }
            results[f.name] = all
        }
    } catch (e: Exception) {
        System.err.println("Error: ${f.name}: ${e.message}")
    }
}
println("Processed ${results.size} PDFs")
Scala
import fyi.oxide.pdf.PdfDocument
import scala.util.{Failure, Success, Using}
import java.io.File

// Collect the full text of every invoice PDF, keyed by file name.
val results = scala.collection.mutable.Map.empty[String, String]
// listFiles returns null when the directory is missing; treat that as "no files".
val pdfs = Option(new File("invoices/").listFiles((_, n) => n.endsWith(".pdf")))
  .getOrElse(Array.empty[File])
for (f <- pdfs.sorted) {
  // Using(...) closes the document and turns any exception into a Failure,
  // so one bad file is reported instead of aborting the batch.
  Using(PdfDocument.open(f.getPath)) { doc =>
    // Pages are joined with a newline so they do not run together.
    (0 until doc.pageCount()).map(i => doc.extractText(i)).mkString("\n")
  } match {
    case Success(all) => results(f.getName) = all
    case Failure(e)   => Console.err.println(s"Error: ${f.getName}: ${e.getMessage}")
  }
}
println(s"Processed ${results.size} PDFs")
Clojure
(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])
(require '[clojure.string :as str])

;; Collect the full text of every invoice PDF, keyed by file name.
(let [results (atom {})]
  (doseq [f (sort (filter #(.endsWith (.getName %) ".pdf")
                          (.listFiles (io/file "invoices/"))))]
    ;; Catch per file so one unreadable PDF does not abort the whole batch.
    (try
      (with-open [doc (pdf/open (.getPath f))]
        ;; str/join realizes every page while the document is still open and
        ;; puts a newline between pages.
        (let [all (str/join "\n" (map #(pdf/extract-text doc %)
                                      (range (pdf/page-count doc))))]
          (swap! results assoc (.getName f) all)))
      (catch Exception e
        (binding [*out* *err*]
          (println (str "Error: " (.getName f) ": " (.getMessage e)))))))
  (println (str "Processed " (count @results) " PDFs")))
Ruby
require 'pdf_oxide'

# Collect the full text of every invoice PDF, keyed by file name.
results = {}
Dir.glob('invoices/*.pdf').sort.each do |path|
  begin
    PdfOxide::PdfDocument.open(path) do |doc|
      # Join pages with a newline so the end of one page does not run
      # straight into the start of the next.
      all = (0...doc.page_count).map { |i| doc.extract_text(i) }.join("\n")
      results[File.basename(path)] = all
    end
  rescue PdfOxide::Error => e
    warn "Error: #{File.basename(path)}: #{e.message}"
  end
end
puts "Processed #{results.size} PDFs"
PHP
use PdfOxide\PdfDocument;

// Collect the full text of every invoice PDF, keyed by file name.
$results = [];
// glob() returns false on error; sort() and foreach both need an array.
$files = glob('invoices/*.pdf') ?: [];
sort($files);
foreach ($files as $path) {
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $pages = [];
        for ($i = 0; $i < $doc->pageCount(); $i++) {
            $pages[] = $doc->extractText($i);
        }
        // Join pages with a newline so they do not run together.
        $results[basename($path)] = implode("\n", $pages);
    } catch (\Throwable $e) {
        fwrite(STDERR, "Error: " . basename($path) . ": " . $e->getMessage() . "\n");
    } finally {
        // Close the document even when extraction throws.
        if ($doc !== null) {
            $doc->close();
        }
    }
}
printf("Processed %d PDFs\n", count($results));
C++
#include <pdf_oxide/pdf_oxide.hpp>
#include <filesystem>
#include <iostream>
#include <map>
#include <string>
namespace fs = std::filesystem;

// Collect the full text of every invoice PDF, keyed by file name.
// <iostream> is required for std::cout/std::cerr and was missing from the include list.
std::map<std::string, std::string> results;
for (const auto& entry : fs::directory_iterator("invoices/")) {
    if (entry.path().extension() != ".pdf") continue;
    // Streaming an fs::path writes it in double quotes; use the plain string.
    const std::string name = entry.path().filename().string();
    try {
        auto doc = pdf_oxide::Document::open(entry.path().string());
        results[name] = doc.extract_all_text();
    } catch (const pdf_oxide::Error& e) {
        std::cerr << "Error: " << name << ": " << e.what() << "\n";
    }
}
std::cout << "Processed " << results.size() << " PDFs\n";
Swift
import PdfOxide
import Foundation

// Collect the full text of every invoice PDF, keyed by file name.
var results: [String: String] = [:]
let dir = "invoices/"
let sortedNames = try FileManager.default.contentsOfDirectory(atPath: dir).sorted()
for name in sortedNames {
    guard name.hasSuffix(".pdf") else { continue }
    do {
        let document = try Document.open(dir + name)
        results[name] = try document.extractAllText()
    } catch {
        // Errors go to standard error so they stay out of the normal output.
        let line = "Error: \(name): \(error)\n"
        FileHandle.standardError.write(Data(line.utf8))
    }
}
print("Processed \(results.count) PDFs")
Dart
import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Collect the full text of every invoice PDF, keyed by file name.
final results = <String, String>{};
final files = Directory('invoices/').listSync().where((e) => e.path.endsWith('.pdf')).toList()
  ..sort((a, b) => a.path.compareTo(b.path));
for (final entry in files) {
  try {
    final doc = PdfDocument.open(entry.path);
    try {
      results[entry.uri.pathSegments.last] = doc.extractAllText();
    } finally {
      // Close even when extraction throws.
      doc.close();
    }
  } catch (e) {
    stderr.writeln('Error: ${entry.path}: $e');
  }
}
print('Processed ${results.length} PDFs');
R
library(pdfoxide)

# Collect the full text of every invoice PDF, keyed by file name.
results <- list()
sorted_paths <- sort(Sys.glob("invoices/*.pdf"))
for (pdf_path in sorted_paths) {
  label <- basename(pdf_path)
  tryCatch(
    {
      doc <- pdf_open(pdf_path)
      results[[label]] <- pdf_extract_all_text(doc)
    },
    error = function(err) {
      message(sprintf("Error: %s: %s", label, conditionMessage(err)))
    }
  )
}
cat(sprintf("Processed %d PDFs\n", length(results)))
Julia
using PdfOxide

# Collect the full text of every invoice PDF, keyed by file name.
results = Dict{String,String}()
candidates = filter(p -> endswith(p, ".pdf"), readdir("invoices/"; join = true))
for pdf_path in sort(candidates)
    try
        results[basename(pdf_path)] = extract_all_text(open_document(pdf_path))
    catch err
        @warn "Error: $(basename(pdf_path)): $err"
    end
end
println("Processed $(length(results)) PDFs")
Zig
const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

// Collect the full text of every invoice PDF, keyed by file name.
const a = std.heap.page_allocator;
var results = std.StringHashMap([]const u8).init(a);
var dir = try std.fs.cwd().openDir("invoices/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "invoices/", entry.name });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch |e| {
        std.debug.print("Error: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer doc.deinit();
    // Report an extraction failure per file; `try` here would abort the loop.
    const all = doc.extractAllText(a) catch |e| {
        std.debug.print("Error: {s}: {}\n", .{ entry.name, e });
        continue;
    };
    // entry.name is duplicated before it is used as the map key.
    try results.put(try a.dupe(u8, entry.name), all);
}
std.debug.print("Processed {d} PDFs\n", .{results.count()});
Objective-C
#import "POXPdfOxide.h"

// Collect the full text of every invoice PDF, keyed by file name.
NSFileManager *fm = [NSFileManager defaultManager];
NSMutableDictionary<NSString *, NSString *> *results = [NSMutableDictionary dictionary];
NSArray<NSString *> *names = [[fm contentsOfDirectoryAtPath:@"invoices/" error:nil]
    sortedArrayUsingSelector:@selector(compare:)];
for (NSString *name in names) {
    if (![name hasSuffix:@".pdf"]) continue;
    NSError *err = nil;
    POXDocument *doc = [POXDocument openPath:[@"invoices/" stringByAppendingString:name] error:&err];
    if (!doc) { NSLog(@"Error: %@: %@", name, err.localizedDescription); continue; }
    NSString *all = [doc extractAllTextWithError:&err];
    // Check the return value before storing it: a nil result means failure
    // and must not be assigned into the dictionary.
    if (!all) { NSLog(@"Error: %@: %@", name, err.localizedDescription); continue; }
    results[name] = all;
}
NSLog(@"Processed %lu PDFs", (unsigned long)results.count);
Elixir
# Extract pages 0..n-1 in order, stopping at the first page that fails.
# Returns {:ok, pages_in_reverse_order} or the {:error, reason} of the failed page.
extract_pages = fn doc, n ->
  # The explicit //1 step makes the range empty when n == 0; a bare 0..(n - 1)
  # counts down through 0 and -1 for a zero-page document.
  Enum.reduce_while(0..(n - 1)//1, {:ok, []}, fn i, {:ok, pages} ->
    case PdfOxide.extract_text(doc, i) do
      {:ok, text} -> {:cont, {:ok, [text | pages]}}
      {:error, _reason} = error -> {:halt, error}
    end
  end)
end

# Collect the full text of every invoice PDF, keyed by file name.
results =
  "invoices/*.pdf"
  |> Path.wildcard()
  |> Enum.sort()
  |> Enum.reduce(%{}, fn path, acc ->
    # Any failing step falls through to the else branch; previously elem(_, 1)
    # treated an error reason as if it were the page count or page text.
    with {:ok, doc} <- PdfOxide.open(path),
         {:ok, n} <- PdfOxide.page_count(doc),
         {:ok, pages} <- extract_pages.(doc, n) do
      all = pages |> Enum.reverse() |> Enum.join("\n")
      Map.put(acc, Path.basename(path), all)
    else
      {:error, reason} ->
        IO.puts(:stderr, "Error: #{Path.basename(path)}: #{inspect(reason)}")
        acc
    end
  end)

IO.puts("Processed #{map_size(results)} PDFs")
Конвертация всех PDF в Markdown
Python
from pathlib import Path

from pdf_oxide import PdfDocument, PdfError

# Convert every PDF in papers/ to a Markdown file with the same stem in markdown/.
input_dir = Path("papers/")
output_dir = Path("markdown/")
output_dir.mkdir(exist_ok=True)

for pdf_path in input_dir.glob("*.pdf"):
    target = output_dir / pdf_path.with_suffix(".md").name
    try:
        markdown = PdfDocument(str(pdf_path)).to_markdown_all(
            detect_headings=True, include_images=False
        )
        target.write_text(markdown, encoding="utf-8")
    except PdfError as exc:
        print(f"Skipped {pdf_path.name}: {exc}")
WASM
// Convert each PDF buffer to Markdown and log the size of the result.
// `pdfFiles` and `WasmPdfDocument` are expected to be in scope already.
for (const { name, bytes } of pdfFiles) {
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const md = doc.toMarkdownAll();
    console.log(`Converted ${name}: ${md.length} chars`);
  } catch (e) {
    console.error(`Skipped ${name}: ${e.message}`);
  } finally {
    // Free the document even when conversion throws.
    if (doc) doc.free();
  }
}
Rust
for entry in std::fs::read_dir("papers/")? {
let path = entry?.path();
if path.extension().map_or(false, |e| e == "pdf") {
if let Ok(mut doc) = PdfDocument::open(path.to_str().unwrap()) {
if let Ok(md) = doc.to_markdown_all(true) {
let out = path.with_extension("md");
std::fs::write(&out, &md)?;
}
}
}
}
Go
_ = os.MkdirAll("markdown/", 0755)
matches, _ := filepath.Glob("papers/*.pdf")
for _, p := range matches {
doc, err := pdfoxide.Open(p)
if err != nil { log.Printf("Skipped %s: %v", p, err); continue }
md, _ := doc.ToMarkdownAll()
doc.Close()
out := filepath.Join("markdown", strings.TrimSuffix(filepath.Base(p), ".pdf") + ".md")
_ = os.WriteFile(out, []byte(md), 0644)
}
C#
// Convert every PDF in papers/ into markdown/<name>.md.
Directory.CreateDirectory("markdown/");
string[] sourcePaths = Directory.GetFiles("papers/", "*.pdf");
foreach (var pdfPath in sourcePaths)
{
    try
    {
        using var document = PdfDocument.Open(pdfPath);
        var markdown = document.ToMarkdownAll();
        var markdownName = Path.GetFileNameWithoutExtension(pdfPath) + ".md";
        File.WriteAllText(Path.Combine("markdown", markdownName), markdown);
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine($"Skipped {pdfPath}: {ex.Message}");
    }
}
Java
import fyi.oxide.pdf.PdfDocument;
import java.io.File;
import java.nio.file.*;

// Convert every PDF in papers/ into markdown/<name>.md.
Files.createDirectories(Path.of("markdown"));
File[] pdfs = new File("papers/").listFiles((d, n) -> n.endsWith(".pdf"));
if (pdfs == null) {
    // listFiles returns null when the directory is missing or unreadable;
    // iterating null would throw NullPointerException.
    pdfs = new File[0];
}
for (File f : pdfs) {
    try (PdfDocument doc = PdfDocument.open(f.toPath())) {
        String md = doc.toMarkdown(); // no-arg = whole document
        String base = f.getName().replaceFirst("\\.pdf$", ".md");
        Files.writeString(Path.of("markdown", base), md);
    } catch (Exception e) {
        System.err.printf("Skipped %s: %s%n", f.getName(), e.getMessage());
    }
}
Kotlin
import fyi.oxide.pdf.PdfDocument
import java.io.File

// Convert every PDF in papers/ into markdown/<name>.md.
File("markdown").mkdirs()
val sources = File("papers/").listFiles { _, fileName -> fileName.endsWith(".pdf") }
for (pdf in sources.orEmpty()) {
    try {
        PdfDocument.open(pdf.toPath()).use { document ->
            val markdown = document.toMarkdown() // no-arg = whole document
            val target = File("markdown", pdf.name.removeSuffix(".pdf") + ".md")
            target.writeText(markdown)
        }
    } catch (ex: Exception) {
        System.err.println("Skipped ${pdf.name}: ${ex.message}")
    }
}
Scala
import fyi.oxide.pdf.PdfDocument
import scala.util.{Failure, Success, Using}
import java.io.File
import java.nio.file.{Files, Path}

// Convert every PDF in papers/ into markdown/<name>.md.
Files.createDirectories(Path.of("markdown"))
// listFiles returns null when the directory is missing; treat that as "no files".
val pdfs = Option(new File("papers/").listFiles((_, n) => n.endsWith(".pdf")))
  .getOrElse(Array.empty[File])
for (f <- pdfs) {
  // Using(...) closes the document and captures any exception, so one bad
  // file is reported as skipped instead of aborting the batch.
  Using(PdfDocument.open(f.getPath)) { doc =>
    val md = doc.toMarkdown() // no-arg = whole document
    Files.writeString(Path.of("markdown", f.getName.stripSuffix(".pdf") + ".md"), md)
  } match {
    case Success(_) => ()
    case Failure(e) => Console.err.println(s"Skipped ${f.getName}: ${e.getMessage}")
  }
}
Clojure
(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])
;; clojure.string is used below, so it has to be required explicitly.
(require '[clojure.string :as str])

;; Convert every PDF in papers/ into markdown/<name>.md.
(.mkdirs (io/file "markdown"))
(doseq [f (filter #(.endsWith (.getName %) ".pdf")
                  (.listFiles (io/file "papers/")))]
  (try
    (with-open [doc (pdf/open (.getPath f))]
      (let [md   (pdf/to-markdown doc) ; no page arg = whole document
            base (str/replace (.getName f) #"\.pdf$" ".md")]
        (spit (io/file "markdown" base) md)))
    (catch Exception e
      (println (str "Skipped " (.getName f) ": " (.getMessage e))))))
Ruby
require 'pdf_oxide'
require 'fileutils'

# Convert every PDF in papers/ into markdown/<name>.md.
FileUtils.mkdir_p('markdown')
Dir['papers/*.pdf'].each do |pdf_path|
  begin
    PdfOxide::PdfDocument.open(pdf_path) do |document|
      markdown = document.to_markdown # nil arg = whole document
      target = File.join('markdown', "#{File.basename(pdf_path, '.pdf')}.md")
      File.write(target, markdown)
    end
  rescue PdfOxide::Error => err
    warn "Skipped #{File.basename(pdf_path)}: #{err.message}"
  end
end
PHP
use PdfOxide\PdfDocument;

// Convert every PDF in papers/ into markdown/<name>.md.
// @ suppresses the warning mkdir() raises when the directory already exists.
@mkdir('markdown');
// glob() returns false on error; `?: []` keeps foreach from iterating a boolean.
foreach (glob('papers/*.pdf') ?: [] as $path) {
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $md = $doc->toMarkdownAll();
        $out = 'markdown/' . basename($path, '.pdf') . '.md';
        file_put_contents($out, $md);
    } catch (\Throwable $e) {
        fwrite(STDERR, "Skipped " . basename($path) . ": " . $e->getMessage() . "\n");
    } finally {
        // Close the document even when conversion throws.
        if ($doc !== null) {
            $doc->close();
        }
    }
}
C++
#include <pdf_oxide/pdf_oxide.hpp>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
namespace fs = std::filesystem;

// Convert every PDF in papers/ into markdown/<name>.md.
// <iostream> is required for std::cerr and was missing from the include list.
fs::create_directories("markdown");
for (const auto& entry : fs::directory_iterator("papers/")) {
    if (entry.path().extension() != ".pdf") continue;
    try {
        auto doc = pdf_oxide::Document::open(entry.path().string());
        auto md = doc.to_markdown_all();
        auto out = fs::path("markdown") / entry.path().filename().replace_extension(".md");
        std::ofstream(out) << md;
    } catch (const pdf_oxide::Error& e) {
        // .string() avoids the double quotes that streaming an fs::path adds.
        std::cerr << "Skipped " << entry.path().filename().string() << ": " << e.what() << "\n";
    }
}
Swift
import PdfOxide
import Foundation

// Convert every PDF in papers/ into markdown/<name>.md.
let dir = "papers/"
try FileManager.default.createDirectory(atPath: "markdown", withIntermediateDirectories: true)
for name in try FileManager.default.contentsOfDirectory(atPath: dir) where name.hasSuffix(".pdf") {
    do {
        let doc = try Document.open(dir + name)
        let md = try doc.toMarkdownAll()
        // Strip only the trailing extension. replacingOccurrences(of: ".pdf", ...)
        // would also rewrite a ".pdf" that appears earlier in the file name.
        let stem = (name as NSString).deletingPathExtension
        let out = "markdown/" + stem + ".md"
        try md.write(toFile: out, atomically: true, encoding: .utf8)
    } catch {
        print("Skipped \(name): \(error)")
    }
}
Dart
import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';

// Convert every PDF in papers/ into markdown/<name>.md.
Directory('markdown').createSync();
for (final entry in Directory('papers/').listSync()) {
  if (!entry.path.endsWith('.pdf')) continue;
  try {
    final doc = PdfDocument.open(entry.path);
    try {
      final md = doc.toMarkdownAll();
      // Replace only the trailing extension; replaceAll('.pdf', ...) would
      // also rewrite a ".pdf" that appears earlier in the file name.
      final base = entry.uri.pathSegments.last.replaceFirst(RegExp(r'\.pdf$'), '.md');
      File('markdown/$base').writeAsStringSync(md);
    } finally {
      // Close even when conversion or writing throws.
      doc.close();
    }
  } catch (e) {
    print('Skipped ${entry.path}: $e');
  }
}
R
library(pdfoxide)

# Convert every PDF in papers/ into markdown/<name>.md.
dir.create("markdown", showWarnings = FALSE)
for (pdf_path in Sys.glob("papers/*.pdf")) {
  source_name <- basename(pdf_path)
  tryCatch(
    {
      md <- pdf_to_markdown_all(pdf_open(pdf_path))
      target <- file.path("markdown", sub("\\.pdf$", ".md", source_name))
      writeLines(md, target)
    },
    error = function(err) {
      message(sprintf("Skipped %s: %s", source_name, conditionMessage(err)))
    }
  )
}
Julia
using PdfOxide

# Convert every PDF in papers/ into markdown/<name>.md.
mkpath("markdown")
for path in filter(p -> endswith(p, ".pdf"), readdir("papers/"; join = true))
    try
        doc = open_document(path)
        md = to_markdown_all(doc)
        # splitext strips only the trailing extension; replace(name, ".pdf" => ".md")
        # would also rewrite a ".pdf" that appears earlier in the file name.
        stem = first(splitext(basename(path)))
        out = joinpath("markdown", stem * ".md")
        write(out, md)
    catch e
        println("Skipped $(basename(path)): $e")
    end
end
Zig
const std = @import("std");
const pdf_oxide = @import("pdf_oxide");

// Convert every PDF in papers/ into markdown/<name>.md.
const a = std.heap.page_allocator;
try std.fs.cwd().makePath("markdown");
var dir = try std.fs.cwd().openDir("papers/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const path = try std.fs.path.joinZ(a, &.{ "papers/", entry.name });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch |e| {
        std.debug.print("Skipped {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer doc.deinit();
    // Report a conversion failure per file; `try` here would abort the loop.
    const md = doc.toMarkdownAll(a) catch |e| {
        std.debug.print("Skipped {s}: {}\n", .{ entry.name, e });
        continue;
    };
    defer a.free(md);
    // The endsWith check above guarantees the name ends in the 4-byte ".pdf".
    const stem = entry.name[0 .. entry.name.len - 4];
    const out = try std.fmt.allocPrint(a, "markdown/{s}.md", .{stem});
    defer a.free(out);
    try std.fs.cwd().writeFile(.{ .sub_path = out, .data = md });
}
Objective-C
#import "POXPdfOxide.h"

// Convert every PDF in papers/ into markdown/<name>.md.
NSFileManager *fm = [NSFileManager defaultManager];
[fm createDirectoryAtPath:@"markdown" withIntermediateDirectories:YES attributes:nil error:nil];
for (NSString *name in [fm contentsOfDirectoryAtPath:@"papers/" error:nil]) {
    if (![name hasSuffix:@".pdf"]) continue;
    NSError *err = nil;
    POXDocument *doc = [POXDocument openPath:[@"papers/" stringByAppendingString:name] error:&err];
    if (!doc) { NSLog(@"Skipped %@: %@", name, err.localizedDescription); continue; }
    NSString *md = [doc toMarkdownAllWithError:&err];
    // Check the return value: a nil result means conversion failed, and
    // messaging nil below would silently write nothing.
    if (!md) { NSLog(@"Skipped %@: %@", name, err.localizedDescription); continue; }
    NSString *base = [[name stringByDeletingPathExtension] stringByAppendingPathExtension:@"md"];
    BOOL written = [md writeToFile:[@"markdown/" stringByAppendingString:base]
                        atomically:YES
                          encoding:NSUTF8StringEncoding
                             error:&err];
    if (!written) { NSLog(@"Skipped %@: %@", name, err.localizedDescription); }
}
Elixir
# Convert every PDF in papers/ into markdown/<name>.md.
File.mkdir_p!("markdown")

"papers/*.pdf"
|> Path.wildcard()
|> Enum.each(fn path ->
  # `with` sends a failure from either step to the else branch; the previous
  # `{:ok, md} = ...` match raised MatchError when conversion failed.
  with {:ok, doc} <- PdfOxide.open(path),
       {:ok, md} <- PdfOxide.to_markdown_all(doc) do
    out = Path.join("markdown", Path.basename(path, ".pdf") <> ".md")
    File.write!(out, md)
  else
    {:error, reason} ->
      IO.puts(:stderr, "Skipped #{Path.basename(path)}: #{inspect(reason)}")
  end
end)
Параллельная обработка
Горутины и задачи. В Go каждый
PdfDocument безопасен для горутин при независимом чтении — достаточно запустить по одной горутине на файл. В C# предпочтительно использовать Task.WhenAll с методами *Async (см. руководство по async). Примеры ниже ориентированы на Python; тот же подход работает в любом языке с примитивом пула воркеров.
С использованием multiprocessing
from multiprocessing import Pool
from pathlib import Path

from pdf_oxide import PdfDocument, PdfError


def process_pdf(pdf_path: str) -> dict:
    """Extract all text from one PDF inside a worker process.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        On success, a dict with "file", "text" (each page followed by a
        newline) and "pages". On PdfError, a dict with "file" and "error".
    """
    try:
        doc = PdfDocument(pdf_path)
        page_count = doc.page_count()
        text = "".join(doc.extract_text(i) + "\n" for i in range(page_count))
        return {"file": pdf_path, "text": text, "pages": page_count}
    except PdfError as e:
        return {"file": pdf_path, "error": str(e)}


# The guard is required for multiprocessing: with the "spawn" start method
# (the default on Windows and macOS) each worker re-imports this module, and
# an unguarded Pool() would be created again inside every worker.
if __name__ == "__main__":
    pdf_files = [str(p) for p in Path("documents/").glob("*.pdf")]
    with Pool() as pool:
        results = pool.map(process_pdf, pdf_files)

    success = [r for r in results if "text" in r]
    errors = [r for r in results if "error" in r]
    print(f"Processed {len(success)}, failed {len(errors)}")
С использованием concurrent.futures
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

from pdf_oxide import PdfDocument, PdfError


def extract_text(pdf_path: str) -> tuple[str, str]:
    """Extract all text from one PDF inside a worker process.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A (pdf_path, text) pair; every page's text is followed by a newline.

    Exceptions are not caught here; they surface from future.result() in the
    parent process.
    """
    doc = PdfDocument(pdf_path)
    text = ""
    for i in range(doc.page_count()):
        text += doc.extract_text(i) + "\n"
    return pdf_path, text


# The guard is required for process pools: with the "spawn" start method
# (the default on Windows and macOS) each worker re-imports this module, and
# an unguarded executor would be created again inside every worker.
if __name__ == "__main__":
    pdf_files = list(Path("documents/").glob("*.pdf"))
    with ProcessPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(extract_text, str(p)): p for p in pdf_files}
        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                path, text = future.result()
                print(f"{pdf_path.name}: {len(text)} chars")
            except Exception as e:
                print(f"Error: {pdf_path.name}: {e}")
Отслеживание прогресса
Простой счётчик
from pathlib import Path

from pdf_oxide import PdfDocument, PdfError

# Number each file so the log shows how far through the batch we are.
pdf_files = list(Path("documents/").glob("*.pdf"))
total = len(pdf_files)
for idx, pdf_path in enumerate(pdf_files, start=1):
    progress = f"[{idx}/{total}] {pdf_path.name}"
    try:
        PdfDocument(str(pdf_path)).extract_text(0)
    except PdfError as exc:
        print(f"{progress}: FAILED - {exc}")
    else:
        print(f"{progress}: OK")
С использованием tqdm
from pathlib import Path

from pdf_oxide import PdfDocument, PdfError
from tqdm import tqdm

# Extract every page of every PDF while showing a progress bar.
pdf_files = list(Path("documents/").glob("*.pdf"))
for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        doc = PdfDocument(str(pdf_path))
        for i in range(doc.page_count()):
            doc.extract_text(i)
    except PdfError as e:
        # Report the failure instead of hiding it; tqdm.write prints the
        # message without breaking the progress bar.
        tqdm.write(f"Skipped {pdf_path.name}: {e}")
Обработка ошибок для повреждённых файлов
Создайте надёжный конвейер, который логирует ошибки и продолжает работу:
Python
import json
from pathlib import Path

from pdf_oxide import PdfDocument, PdfError

# Scan mixed-quality/ recursively, recording per-file statistics and failures.
pdf_dir = Path("mixed-quality/")
results = []
errors = []

for pdf_path in pdf_dir.glob("**/*.pdf"):
    file_label = str(pdf_path)
    try:
        doc = PdfDocument(file_label)
        page_count = doc.page_count()
        text_length = 0
        for page_no in range(page_count):
            text_length += len(doc.extract_text(page_no))
        results.append({"file": file_label, "pages": page_count, "chars": text_length})
    except PdfError as exc:
        errors.append({"file": file_label, "error": str(exc)})
    except Exception as exc:
        # Anything that is not a PdfError is recorded with an "Unexpected" prefix.
        errors.append({"file": file_label, "error": f"Unexpected: {exc}"})

print(f"Success: {len(results)}, Errors: {len(errors)}")

# Save the error report
if errors:
    with open("errors.json", "w") as report:
        json.dump(errors, report, indent=2)
WASM
// Record per-file statistics and failures for a set of PDF buffers.
// `pdfFiles` and `WasmPdfDocument` are expected to be in scope already.
const results = [];
const errors = [];
for (const { name, bytes } of pdfFiles) {
  let doc;
  try {
    doc = new WasmPdfDocument(bytes);
    const pageCount = doc.pageCount();
    let textLength = 0;
    for (let i = 0; i < pageCount; i++) {
      textLength += doc.extractText(i).length;
    }
    results.push({ file: name, pages: pageCount, chars: textLength });
  } catch (e) {
    errors.push({ file: name, error: e.message });
  } finally {
    // Free the document even when a page fails; previously free() was
    // skipped on the error path.
    if (doc) doc.free();
  }
}
console.log(`Success: ${results.length}, Errors: ${errors.length}`);
Rust
// Record per-file statistics and failures for the PDFs in mixed-quality/.
// The `?` operators assume an enclosing function that returns a Result.
let mut results = Vec::new();
let mut errors = Vec::new();
for entry in std::fs::read_dir("mixed-quality/")? {
    let path = entry?.path();
    if !path.extension().map_or(false, |e| e == "pdf") {
        continue;
    }
    let name = path.display().to_string();
    // Record a non-UTF-8 file name as an error instead of panicking in unwrap().
    let path_str = match path.to_str() {
        Some(s) => s,
        None => {
            errors.push((name, "path is not valid UTF-8".to_string()));
            continue;
        }
    };
    match PdfDocument::open(path_str) {
        Ok(mut doc) => {
            let page_count = doc.page_count().unwrap_or(0);
            // Pages whose extraction fails contribute nothing to the total.
            let text_length: usize = (0..page_count)
                .filter_map(|i| doc.extract_text(i).ok())
                .map(|t| t.len())
                .sum();
            results.push((name, page_count, text_length));
        }
        Err(e) => errors.push((name, e.to_string())),
    }
}
println!("Success: {}, Errors: {}", results.len(), errors.len());
Go
// Success holds the statistics of one PDF that was read completely.
type Success struct {
	File  string
	Pages int
	Chars int
}

// Failure holds the path and error text of one entry that could not be processed.
type Failure struct {
	File  string
	Error string
}

var results []Success
var errors []Failure
_ = filepath.Walk("mixed-quality/", func(path string, info os.FileInfo, err error) error {
	if err != nil {
		// Record entries the walk itself could not read instead of skipping them silently.
		errors = append(errors, Failure{path, err.Error()})
		return nil
	}
	if info.IsDir() || !strings.HasSuffix(path, ".pdf") {
		return nil
	}
	doc, err := pdfoxide.Open(path)
	if err != nil {
		errors = append(errors, Failure{path, err.Error()})
		return nil
	}
	// defer is scoped to this callback, so each document closes before the next file.
	defer doc.Close()
	n, err := doc.PageCount()
	if err != nil {
		errors = append(errors, Failure{path, err.Error()})
		return nil
	}
	chars := 0
	for i := 0; i < n; i++ {
		t, err := doc.ExtractText(i)
		if err != nil {
			// A page that cannot be read makes the file a failure; previously
			// the error was discarded and the page counted as empty.
			errors = append(errors, Failure{path, err.Error()})
			return nil
		}
		chars += len(t)
	}
	results = append(results, Success{path, n, chars})
	return nil
})
fmt.Printf("Success: %d, Errors: %d\n", len(results), len(errors))
C#
// Record per-file statistics and failures for every PDF under mixed-quality/.
var results = new List<(string File, int Pages, int Chars)>();
var errors = new List<(string File, string Error)>();
var allPdfs = Directory.EnumerateFiles("mixed-quality/", "*.pdf", SearchOption.AllDirectories);
foreach (var pdfPath in allPdfs)
{
    try
    {
        using var document = PdfDocument.Open(pdfPath);
        var pageCount = document.PageCount;
        var charCount = Enumerable.Range(0, pageCount)
            .Select(page => document.ExtractText(page).Length)
            .Sum();
        results.Add((pdfPath, pageCount, charCount));
    }
    catch (Exception ex)
    {
        errors.Add((pdfPath, ex.Message));
    }
}
Console.WriteLine($"Success: {results.Count}, Errors: {errors.Count}");
Java
import fyi.oxide.pdf.PdfDocument;
import java.io.IOException;
import java.nio.file.*;
import java.util.*;

// Record per-file statistics and failures for every PDF under mixed-quality/.
record Success(String file, int pages, long chars) {}
record Failure(String file, String error) {}

List<Success> results = new ArrayList<>();
List<Failure> errors = new ArrayList<>();
try (var walk = Files.walk(Path.of("mixed-quality/"))) {
    // A method reference to iterator() lets the stream drive a for-each loop.
    Iterable<Path> pdfPaths = walk.filter(p -> p.toString().endsWith(".pdf"))::iterator;
    for (Path pdfPath : pdfPaths) {
        try (PdfDocument document = PdfDocument.open(pdfPath)) {
            int pageCount = document.pageCount();
            long charCount = 0;
            for (int page = 0; page < pageCount; page++) {
                charCount += document.extractText(page).length();
            }
            results.add(new Success(pdfPath.toString(), pageCount, charCount));
        } catch (Exception ex) {
            errors.add(new Failure(pdfPath.toString(), ex.getMessage()));
        }
    }
}
System.out.printf("Success: %d, Errors: %d%n", results.size(), errors.size());
Kotlin
import fyi.oxide.pdf.PdfDocument
import java.io.File

// Record per-file statistics and failures for every PDF under mixed-quality/.
data class Success(val file: String, val pages: Int, val chars: Long)
data class Failure(val file: String, val error: String?)

val results = mutableListOf<Success>()
val errors = mutableListOf<Failure>()
for (pdf in File("mixed-quality/").walkTopDown()) {
    if (pdf.extension != "pdf") continue
    try {
        PdfDocument.open(pdf.toPath()).use { document ->
            val pageCount = document.pageCount()
            var charCount = 0L
            for (page in 0 until pageCount) {
                charCount += document.extractText(page).length.toLong()
            }
            results.add(Success(pdf.path, pageCount, charCount))
        }
    } catch (ex: Exception) {
        errors.add(Failure(pdf.path, ex.message))
    }
}
println("Success: ${results.size}, Errors: ${errors.size}")
Scala
import fyi.oxide.pdf.PdfDocument
import scala.util.{Using, Try}
import java.io.File
final case class Success(file: String, pages: Int, chars: Long)
final case class Failure(file: String, error: String)

/** Recursively collects every `.pdf` file under `dir`.
  *
  * Returns an empty sequence when `dir` cannot be listed.
  */
def pdfs(dir: File): Seq[File] =
  // File.listFiles returns null (not an empty array) when `dir` is missing,
  // unreadable or not a directory; wrap it so the walk does not NPE.
  Option(dir.listFiles).fold(Seq.empty[File])(_.toSeq).flatMap { f =>
    if (f.isDirectory) pdfs(f)
    else if (f.getName.endsWith(".pdf")) Seq(f)
    else Nil
  }

val results = scala.collection.mutable.ArrayBuffer.empty[Success]
val errors = scala.collection.mutable.ArrayBuffer.empty[Failure]
for (f <- pdfs(new File("mixed-quality/"))) {
  // Using.resource closes the document; Try turns any exception into a recorded Failure.
  Try(Using.resource(PdfDocument.open(f.getPath)) { doc =>
    val n = doc.pageCount()
    val chars = (0 until n).map(i => doc.extractText(i).length.toLong).sum
    Success(f.getPath, n, chars)
  }).fold(e => errors += Failure(f.getPath, e.getMessage), results += _)
}
println(s"Success: ${results.size}, Errors: ${errors.size}")
Clojure
(require '[pdf-oxide.core :as pdf])
(require '[clojure.java.io :as io])
;; Walk mixed-quality/ recursively and tally per-file outcomes.
(let [results (atom [])
      errors  (atom [])
      pdf?    (fn [f] (.endsWith (.getName f) ".pdf"))]
  (doseq [f (file-seq (io/file "mixed-quality/"))
          :when (pdf? f)
          :let [path (.getPath f)]]
    (try
      ;; with-open closes the document even when a page fails.
      (with-open [doc (pdf/open path)]
        (let [n     (pdf/page-count doc)
              chars (transduce (map #(count (pdf/extract-text doc %))) + 0 (range n))]
          (swap! results conj {:file path :pages n :chars chars})))
      (catch Exception e
        (swap! errors conj {:file path :error (.getMessage e)}))))
  (println (str "Success: " (count @results) ", Errors: " (count @errors))))
Ruby
require 'pdf_oxide'
results = []
errors = []

# Recursively visit every PDF; a failure on one file is recorded, not fatal.
Dir.glob('mixed-quality/**/*.pdf') do |path|
  PdfOxide::PdfDocument.open(path) do |doc|
    page_count = doc.page_count
    total = 0
    page_count.times { |page| total += doc.extract_text(page).length }
    results.push({ file: path, pages: page_count, chars: total })
  end
rescue PdfOxide::Error => e
  errors.push({ file: path, error: e.message })
end

puts "Success: #{results.size}, Errors: #{errors.size}"
PHP
use PdfOxide\PdfDocument;
// Per-file outcomes: successes carry page/char totals, errors carry the message.
$results = [];
$errors = [];
$it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator('mixed-quality/'));
foreach ($it as $file) {
    if ($file->getExtension() !== 'pdf') continue;
    $path = $file->getPathname();
    $doc = null;
    try {
        $doc = PdfDocument::open($path);
        $n = $doc->pageCount();
        $chars = 0;
        // strlen counts bytes, so multi-byte text reports more than its character count.
        for ($i = 0; $i < $n; $i++) { $chars += strlen($doc->extractText($i)); }
        $results[] = ['file' => $path, 'pages' => $n, 'chars' => $chars];
    } catch (\Throwable $e) {
        $errors[] = ['file' => $path, 'error' => $e->getMessage()];
    } finally {
        // Close the document on the failure path too; previously a page that
        // threw during extraction skipped close() entirely.
        if ($doc !== null) {
            $doc->close();
        }
    }
}
printf("Success: %d, Errors: %d\n", count($results), count($errors));
C++
#include <pdf_oxide/pdf_oxide.hpp>
#include <cstddef>
#include <filesystem>
#include <iostream>
#include <string>
#include <vector>
namespace fs = std::filesystem;

// Per-file outcomes: a Success carries totals, a Failure carries the error text.
struct Success { std::string file; int pages; std::size_t chars; };
struct Failure { std::string file; std::string error; };

std::vector<Success> results;
std::vector<Failure> errors;
for (const auto& entry : fs::recursive_directory_iterator("mixed-quality/")) {
    if (entry.path().extension() != ".pdf") continue;
    try {
        auto doc = pdf_oxide::Document::open(entry.path().string());
        int n = doc.page_count();
        std::size_t chars = 0;
        for (int i = 0; i < n; i++) chars += doc.extract_text(i).size();
        results.push_back({entry.path().string(), n, chars});
    } catch (const pdf_oxide::Error& e) {
        // Record the failure and continue with the next file.
        errors.push_back({entry.path().string(), e.what()});
    }
}
std::cout << "Success: " << results.size() << ", Errors: " << errors.size() << "\n";
Swift
import PdfOxide
import Foundation
// Per-file outcomes: a Success carries totals, a Failure carries the error description.
struct Success { let file: String; let pages: Int; let chars: Int }
struct Failure { let file: String; let error: String }

var results = [Success]()
var errors = [Failure]()

let root = URL(fileURLWithPath: "mixed-quality/")
let walker = FileManager.default.enumerator(at: root, includingPropertiesForKeys: nil)!
while let next = walker.nextObject() {
    guard let url = next as? URL, url.pathExtension == "pdf" else { continue }
    do {
        let doc = try Document.open(url.path)
        let pageCount = try doc.pageCount()
        let total = try (0..<pageCount).reduce(0) { sum, page in
            try sum + doc.extractText(page).count
        }
        results.append(Success(file: url.path, pages: pageCount, chars: total))
    } catch {
        // Any thrown error marks the whole file as failed.
        errors.append(Failure(file: url.path, error: "\(error)"))
    }
}
print("Success: \(results.count), Errors: \(errors.count)")
Dart
import 'dart:io';
import 'package:pdf_oxide/pdf_oxide.dart';
// Per-file outcomes: successes carry page/char totals, errors carry the message.
final results = <Map<String, Object>>[];
final errors = <Map<String, Object>>[];
for (final entry in Directory('mixed-quality/').listSync(recursive: true)) {
  if (entry is! File || !entry.path.endsWith('.pdf')) continue;
  try {
    final doc = PdfDocument.open(entry.path);
    try {
      final n = doc.pageCount;
      var chars = 0;
      for (var i = 0; i < n; i++) {
        chars += doc.extractText(i).length;
      }
      results.add({'file': entry.path, 'pages': n, 'chars': chars});
    } finally {
      // Close on the failure path too; previously an extraction error
      // jumped straight to the catch and skipped close().
      doc.close();
    }
  } catch (e) {
    errors.add({'file': entry.path, 'error': '$e'});
  }
}
print('Success: ${results.length}, Errors: ${errors.length}');
R
library(pdfoxide)
# Per-file outcomes: successes carry page/char totals, errors carry the message.
results <- list()
errors <- list()
pdf_paths <- list.files("mixed-quality/", pattern = "\\.pdf$", recursive = TRUE, full.names = TRUE)
for (path in pdf_paths) {
  tryCatch({
    doc <- pdf_open(path)
    n <- pdf_page_count(doc)
    # seq_len(n) - 1L is empty for a zero-page document, whereas 0:(n - 1)
    # would count down to c(0, -1) and request pages that do not exist.
    pages <- seq_len(n) - 1L
    chars <- sum(vapply(pages, function(i) nchar(pdf_extract_text(doc, i)), integer(1)))
    results[[length(results) + 1]] <- list(file = path, pages = n, chars = chars)
  }, error = function(e) {
    # The handler is a function, so it needs <<- to reach the outer list.
    errors[[length(errors) + 1]] <<- list(file = path, error = conditionMessage(e))
  })
}
cat(sprintf("Success: %d, Errors: %d\n", length(results), length(errors)))
Julia
using PdfOxide
# Per-file outcomes: successes carry page/char totals, errors carry the rendered exception.
results = NamedTuple[]
errors = NamedTuple[]
for (root, _, files) in walkdir("mixed-quality/")
    for name in filter(endswith(".pdf"), files)
        path = joinpath(root, name)
        try
            doc = open_document(path)
            n = page_count(doc)
            chars = 0
            for i in 0:(n - 1)
                chars += length(extract_text(doc, i))
            end
            push!(results, (file = path, pages = n, chars = chars))
        catch err
            push!(errors, (file = path, error = sprint(showerror, err)))
        end
    end
end
println("Success: $(length(results)), Errors: $(length(errors))")
Zig
const std = @import("std");
const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;

// Per-file tallies; a file counts as failed if any step of reading it fails.
var ok: usize = 0;
var failed: usize = 0;

var root = try std.fs.cwd().openDir("mixed-quality/", .{ .iterate = true });
defer root.close();
var walker = try root.walk(a);
defer walker.deinit();

while (try walker.next()) |entry| {
    if (entry.kind != .file or !std.mem.endsWith(u8, entry.basename, ".pdf")) continue;
    // entry.path is relative to the walked directory, so re-prefix it.
    const path = try std.fs.path.joinZ(a, &.{ "mixed-quality/", entry.path });
    defer a.free(path);
    var doc = pdf_oxide.Document.open(path) catch {
        failed += 1;
        continue;
    };
    defer doc.deinit();
    // A damaged document must be tallied, not abort the whole batch via `try`.
    const n = doc.pageCount() catch {
        failed += 1;
        continue;
    };
    var chars: usize = 0;
    var page_failed = false;
    var i: i32 = 0;
    while (i < n) : (i += 1) {
        const t = doc.extractText(a, i) catch {
            page_failed = true;
            break;
        };
        defer a.free(t);
        chars += t.len;
    }
    if (page_failed) {
        failed += 1;
        continue;
    }
    ok += 1;
}
std.debug.print("Success: {d}, Errors: {d}\n", .{ ok, failed });
Objective-C
#import "POXPdfOxide.h"
NSString *root = @"mixed-quality/";
NSFileManager *fm = [NSFileManager defaultManager];
// Per-file tallies; a file counts as failed if it cannot be opened or a page cannot be read.
NSUInteger ok = 0, failed = 0;
NSDirectoryEnumerator *en = [fm enumeratorAtPath:root];
for (NSString *rel in en) {
    if (![rel hasSuffix:@".pdf"]) continue;
    // Drain the temporaries created for each file before moving to the next one.
    @autoreleasepool {
        NSString *path = [root stringByAppendingPathComponent:rel];
        NSError *err = nil;
        POXDocument *doc = [POXDocument openPath:path error:&err];
        if (!doc) { failed++; continue; }
        // NOTE(review): the value pageCountError: returns on failure is not visible
        // here; if it is negative or zero the loop below simply does not run.
        NSInteger n = [doc pageCountError:&err];
        NSUInteger chars = 0;
        BOOL pageFailed = NO;
        for (NSInteger i = 0; i < n; i++) {
            // NOTE(review): assumes extractText:error: follows the Cocoa convention
            // of returning nil on failure - confirm against POXPdfOxide.h.
            NSString *text = [doc extractText:i error:&err];
            if (!text) { pageFailed = YES; break; }
            chars += text.length;
        }
        if (pageFailed) { failed++; continue; }
        ok++;
    }
}
NSLog(@"Success: %lu, Errors: %lu", (unsigned long)ok, (unsigned long)failed);
Elixir
# Sums the character count of pages 0..n-1, halting on the first page error.
# NOTE(review): assumes extract_text/2 and page_count/1 return {:ok, value} or
# {:error, reason} like open/1 does - confirm against the PdfOxide docs.
count_chars = fn doc, n ->
  # The explicit //1 step keeps the range empty when n == 0; a bare 0..-1
  # counts down and would request pages 0 and -1.
  Enum.reduce_while(0..(n - 1)//1, {:ok, 0}, fn i, {:ok, acc} ->
    case PdfOxide.extract_text(doc, i) do
      {:ok, text} -> {:cont, {:ok, acc + String.length(text)}}
      {:error, _reason} = failure -> {:halt, failure}
    end
  end)
end

{results, errors} =
  "mixed-quality/**/*.pdf"
  |> Path.wildcard()
  |> Enum.reduce({[], []}, fn path, {ok, bad} ->
    # Any step that returns {:error, reason} files the document under errors
    # instead of raising a MatchError or measuring the error term as text.
    with {:ok, doc} <- PdfOxide.open(path),
         {:ok, n} <- PdfOxide.page_count(doc),
         {:ok, chars} <- count_chars.(doc, n) do
      {[%{file: path, pages: n, chars: chars} | ok], bad}
    else
      {:error, reason} ->
        {ok, [%{file: path, error: inspect(reason)} | bad]}
    end
  end)

IO.puts("Success: #{length(results)}, Errors: #{length(errors)}")
Экономия памяти при обработке больших корпусов
Для очень больших коллекций обрабатывайте файлы по одному, не накапливая результаты в памяти:
import csv
import sys
from pathlib import Path

from pdf_oxide import PdfDocument, PdfError

pdf_dir = Path("large-corpus/")

# Stream each page straight to the CSV so memory use does not grow with the
# size of the corpus. UTF-8 is set explicitly because the default text
# encoding is platform-dependent and may reject characters found in PDFs.
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["file", "page", "text"])
    for pdf_path in pdf_dir.glob("*.pdf"):
        try:
            doc = PdfDocument(str(pdf_path))
            for i in range(doc.page_count()):
                text = doc.extract_text(i)
                writer.writerow([pdf_path.name, i, text])
        except PdfError as e:
            # Keep going, but report the file instead of dropping it silently.
            # Rows already written for earlier pages of this file stay in the CSV.
            print(f"Skipped {pdf_path.name}: {e}", file=sys.stderr)
Пакетное слияние
Объедините все PDF из каталога в один документ:
Python
from pdf_oxide import PdfDocument, PdfError
from pathlib import Path
# Sort the inputs so the merge order is deterministic.
pdf_files = sorted(Path("reports/").glob("*.pdf"))
if pdf_files:
    # The first file is the base document; the rest are appended to it.
    base_path, *remaining = pdf_files
    doc = PdfDocument(str(base_path))
    for extra in remaining:
        try:
            doc.merge_from(str(extra))
        except PdfError as exc:
            print(f"Skipped {extra.name}: {exc}")
    doc.save("all-reports.pdf")
Rust
use pdf_oxide::editor::DocumentEditor;
// Collect the PDFs in reports/ (unreadable entries are skipped) and order
// them by path so the merge is deterministic.
let mut files = Vec::new();
for entry in std::fs::read_dir("reports/")?.flatten() {
    if entry.path().extension().map_or(false, |ext| ext == "pdf") {
        files.push(entry);
    }
}
files.sort_by_key(|e| e.path());

// The first file is the base document; the rest are merged into it.
if let Some((first, rest)) = files.split_first() {
    let mut editor = DocumentEditor::open(first.path().to_str().unwrap())?;
    for entry in rest {
        if let Err(e) = editor.merge_from(entry.path().to_str().unwrap()) {
            println!("Skipped {}: {}", entry.path().display(), e);
        }
    }
    editor.save("all-reports.pdf")?;
}
Go
files, err := filepath.Glob("reports/*.pdf")
if err != nil {
	log.Fatal(err)
}
sort.Strings(files)
// Top-level Merge concatenates every file in one call
merged, err := pdfoxide.Merge(files)
if err != nil {
	log.Fatal(err)
}
// A failed write must not pass silently: the merged output would be lost.
if err := os.WriteFile("all-reports.pdf", merged, 0644); err != nil {
	log.Fatal(err)
}
C++
#include <pdf_oxide/pdf_oxide.hpp>
#include <algorithm>
#include <filesystem>
#include <fstream>
#include <ios>
#include <stdexcept>
#include <string>
#include <vector>
namespace fs = std::filesystem;

// Collect the PDFs in reports/ and sort them so the merge order is deterministic.
std::vector<std::string> files;
for (const auto& entry : fs::directory_iterator("reports/"))
    if (entry.path().extension() == ".pdf") files.push_back(entry.path().string());
std::sort(files.begin(), files.end());

// Top-level merge concatenates every file in one call
auto bytes = pdf_oxide::merge(files);

std::ofstream out("all-reports.pdf", std::ios::binary);
out.write(reinterpret_cast<const char*>(bytes.data()),
          static_cast<std::streamsize>(bytes.size()));
out.close();
// Stream errors are sticky, so one check after close() covers open, write and flush.
if (!out) throw std::runtime_error("failed to write all-reports.pdf");
Swift
import PdfOxide
import Foundation
let dir = "reports/"
// List the directory, then keep the PDF names in sorted order with the directory prefix.
let names = try FileManager.default.contentsOfDirectory(atPath: dir)
var files: [String] = []
for name in names.sorted() where name.hasSuffix(".pdf") {
    files.append(dir + name)
}
// Top-level merge concatenates every file in one call
let bytes = try merge(files)
let output = URL(fileURLWithPath: "all-reports.pdf")
try Data(bytes).write(to: output)
Dart
import 'dart:io';
import 'dart:typed_data';
import 'package:pdf_oxide/pdf_oxide.dart';
// Paths of the PDFs directly inside reports/, sorted for a deterministic merge order.
final files = <String>[
  for (final entry in Directory('reports/').listSync())
    if (entry.path.endsWith('.pdf')) entry.path,
]..sort();
// Top-level merge concatenates every file in one call
final Uint8List bytes = pdfMerge(files);
final output = File('all-reports.pdf');
output.writeAsBytesSync(bytes);
R
library(pdfoxide)
# Gather the PDFs in reports/ and sort them for a deterministic merge order.
files <- Sys.glob("reports/*.pdf")
files <- sort(files)
# Top-level merge concatenates every file in one call
bytes <- pdf_merge(files)
writeBin(bytes, "all-reports.pdf")
Julia
using PdfOxide
# Full paths of the PDFs in reports/, sorted for a deterministic merge order.
pdf_paths = filter(endswith(".pdf"), readdir("reports/"; join = true))
files = sort(pdf_paths)
# Top-level merge concatenates every file in one call
bytes = merge_pdfs(files)
open("all-reports.pdf", "w") do io
    write(io, bytes)
end
Zig
const std = @import("std");
const pdf_oxide = @import("pdf_oxide");
const a = std.heap.page_allocator;

var paths = std.ArrayList([*:0]const u8).init(a);
defer {
    // Each entry was allocated by joinZ below, so free the strings before the list.
    for (paths.items) |p| a.free(std.mem.span(p));
    paths.deinit();
}

var dir = try std.fs.cwd().openDir("reports/", .{ .iterate = true });
defer dir.close();
var it = dir.iterate();
while (try it.next()) |entry| {
    if (!std.mem.endsWith(u8, entry.name, ".pdf")) continue;
    const joined = try std.fs.path.joinZ(a, &.{ "reports/", entry.name });
    errdefer a.free(joined);
    try paths.append(joined);
}

// Directory iteration order is unspecified; sort so the merged page order is
// deterministic, matching the other language examples.
std.mem.sort([*:0]const u8, paths.items, {}, struct {
    fn lessThan(_: void, lhs: [*:0]const u8, rhs: [*:0]const u8) bool {
        return std.mem.orderZ(u8, lhs, rhs) == .lt;
    }
}.lessThan);

// Top-level merge concatenates every file in one call
const bytes = try pdf_oxide.merge(a, paths.items);
defer a.free(bytes);
try std.fs.cwd().writeFile(.{ .sub_path = "all-reports.pdf", .data = bytes });
Objective-C
#import "POXPdfOxide.h"
NSFileManager *fm = [NSFileManager defaultManager];
NSError *err = nil;

// Check the return value (not the error pointer) to detect a failed listing.
NSArray<NSString *> *names = [fm contentsOfDirectoryAtPath:@"reports/" error:&err];
if (!names) NSLog(@"Cannot list reports/: %@", err);

// Fast enumeration over nil is a no-op, so a failed listing leaves `files` empty.
NSMutableArray<NSString *> *files = [NSMutableArray array];
for (NSString *name in names) {
    if ([name hasSuffix:@".pdf"]) {
        [files addObject:[@"reports/" stringByAppendingPathComponent:name]];
    }
}
// Sort so the merge order is deterministic.
[files sortUsingSelector:@selector(compare:)];

// Top-level merge concatenates every file in one call
NSData *bytes = [POXPdf merge:files error:&err];
if (!bytes) {
    NSLog(@"Merge failed: %@", err);
} else if (![bytes writeToFile:@"all-reports.pdf" options:NSDataWritingAtomic error:&err]) {
    NSLog(@"Cannot write all-reports.pdf: %@", err);
}
Elixir
# PDFs directly inside reports/, sorted for a deterministic merge order.
files = Enum.sort(Path.wildcard("reports/*.pdf"))

# Top-level merge concatenates every file in one call
{:ok, bytes} = PdfOxide.merge(files)
File.write!("all-reports.pdf", bytes)
Связанные страницы
- Извлечение текста из PDF — основы извлечения текста
- PDF для RAG-пайплайнов — паттерны интеграции с RAG
- PDF в Markdown — конвертация в Markdown
- Бенчмарки производительности — результаты тестирования