Form Data Extraction
PDF Oxide extracts interactive form fields (AcroForms) from PDF documents, including text fields, checkboxes, radio buttons, choice fields, and signatures. Extracted form data can be exported to FDF or XFDF format for interchange. XFA (XML Forms Architecture) forms can be analyzed and converted as well.
Quick Example
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("form.pdf")
fields = doc.get_form_fields()
for field in fields:
print(f"{field.name} ({field.field_type}): {field.value}")
Node.js
const { PdfDocument } = require("pdf-oxide");
const doc = new PdfDocument("form.pdf");
const fields = doc.getFormFields();
for (const field of fields) {
console.log(`${field.name} (${field.fieldType}): ${field.value}`);
}
doc.close();
Go
import pdfoxide "github.com/yfedoseev/pdf_oxide/go"
doc, _ := pdfoxide.Open("form.pdf")
defer doc.Close()
fields, _ := doc.FormFields()
for _, field := range fields {
fmt.Printf("%s (%s): %s\n", field.Name, field.FieldType, field.Value)
}
C#
using PdfOxide.Core;
using var doc = PdfDocument.Open("form.pdf");
var fields = doc.GetFormFields();
foreach (var field in fields)
{
Console.WriteLine($"{field.Name} ({field.FieldType}): {field.Value}");
}
WASM
const doc = new WasmPdfDocument(bytes);
const fields = doc.getFormFields();
for (const field of fields) {
console.log(`${field.name} (${field.fieldType}): ${field.value}`);
}
Rust
use pdf_oxide::extractors::FormExtractor;
use pdf_oxide::PdfDocument;
let mut doc = PdfDocument::open("form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;
for field in &fields {
println!("{} ({:?}): {:?}", field.full_name, field.field_type, field.value);
}
Migrating from PyMuPDF get_form_fields()
If you’re switching from PyMuPDF, the API is similar but PDF Oxide returns richer data and handles XFA forms:
PyMuPDF:
import fitz
doc = fitz.open("form.pdf")
# Returns dict of {field_name: field_value} — loses type info
fields = doc.get_form_fields()
# Or iterate widgets for more detail
for page in doc:
for widget in page.widgets():
print(widget.field_name, widget.field_value)
PDF Oxide:
from pdf_oxide import PdfDocument
doc = PdfDocument("form.pdf")
# Returns structured objects with name, value, type, options, rect
fields = doc.get_form_fields()
for field in fields:
print(f"{field.name} ({field.field_type}): {field.value}")
# Also handles XFA forms that PyMuPDF cannot read
xfa = doc.has_xfa()
Key differences:
- PDF Oxide returns structured field objects (not just a dict)
- Includes field type, bounding rect, and options for choice fields
- Supports XFA forms — PyMuPDF’s `get_form_fields()` returns empty for XFA-only PDFs
- Export to FDF/XFDF format for form data interchange
For a complete migration guide covering PyMuPDF, pypdf, pdfplumber, and pdfminer, see Migrate to PDF Oxide.
Reading Form Fields
Get All Fields
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("tax-form.pdf")
fields = doc.get_form_fields()
for field in fields:
print(f"Name: {field.name}")
print(f" Type: {field.field_type}")
print(f" Value: {field.value}")
print(f" Required: {field.is_required}")
print(f" Read-only: {field.is_readonly}")
if field.max_length:
print(f" Max length: {field.max_length}")
Node.js
const doc = new PdfDocument("tax-form.pdf");
const fields = doc.getFormFields();
for (const field of fields) {
console.log(`Name: ${field.name}`);
console.log(` Type: ${field.fieldType}`);
console.log(` Value: ${field.value}`);
}
doc.close();
Go
doc, _ := pdfoxide.Open("tax-form.pdf")
defer doc.Close()
fields, _ := doc.FormFields()
for _, field := range fields {
fmt.Printf("Name: %s\n", field.Name)
fmt.Printf(" Type: %s\n", field.FieldType)
fmt.Printf(" Value: %s\n", field.Value)
}
C#
using var doc = PdfDocument.Open("tax-form.pdf");
var fields = doc.GetFormFields();
foreach (var field in fields)
{
Console.WriteLine($"Name: {field.Name}");
Console.WriteLine($" Type: {field.FieldType}");
Console.WriteLine($" Value: {field.Value}");
}
WASM
const doc = new WasmPdfDocument(bytes);
const fields = doc.getFormFields();
for (const field of fields) {
console.log(`Name: ${field.name}`);
console.log(` Type: ${field.fieldType}`);
console.log(` Value: ${field.value}`);
console.log(` Flags: ${field.flags}`);
}
Rust
use pdf_oxide::extractors::{FormExtractor, FieldType};
use pdf_oxide::PdfDocument;
let mut doc = PdfDocument::open("tax-form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;
for field in &fields {
let type_str = match &field.field_type {
FieldType::Button => "Button",
FieldType::Text => "Text",
FieldType::Choice => "Choice",
FieldType::Signature => "Signature",
FieldType::Unknown(s) => s.as_str(),
};
println!("[{}] {} = {:?}", type_str, field.full_name, field.value);
if let Some(tooltip) = &field.tooltip {
println!(" Tooltip: {}", tooltip);
}
if let Some(bounds) = &field.bounds {
println!(" Bounds: [{:.1}, {:.1}, {:.1}, {:.1}]",
bounds[0], bounds[1], bounds[2], bounds[3]);
}
}
Get a Specific Field Value
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("form.pdf")
name = doc.get_form_field_value("employee_name")
ssn = doc.get_form_field_value("ssn")
agreed = doc.get_form_field_value("agree_to_terms")
print(f"Name: {name}") # "John Doe"
print(f"SSN: {ssn}") # "123-45-6789"
print(f"Agreed: {agreed}") # True
WASM
const doc = new WasmPdfDocument(bytes);
const name = doc.getFormFieldValue("employee_name");
const ssn = doc.getFormFieldValue("ssn");
const agreed = doc.getFormFieldValue("agree_to_terms");
console.log(`Name: ${name}`); // "John Doe"
console.log(`SSN: ${ssn}`); // "123-45-6789"
console.log(`Agreed: ${agreed}`); // true
Rust
use pdf_oxide::editor::{DocumentEditor, EditableDocument};
let mut editor = DocumentEditor::open("form.pdf")?;
if let Some(value) = editor.get_form_field_value("employee_name")? {
println!("Name: {:?}", value);
}
Filling Forms
Set Field Values
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("form.pdf")
# Set text fields
doc.set_form_field_value("full_name", "Jane Doe")
doc.set_form_field_value("email", "jane@example.com")
# Set checkboxes
doc.set_form_field_value("agree_to_terms", True)
# Save the filled form
doc.save("filled_form.pdf")
WASM
const doc = new WasmPdfDocument(bytes);
// Set text fields
doc.setFormFieldValue("full_name", "Jane Doe");
doc.setFormFieldValue("email", "jane@example.com");
// Set checkboxes
doc.setFormFieldValue("agree_to_terms", true);
// Save the filled form
const filledBytes = doc.save();
Rust
use pdf_oxide::editor::{DocumentEditor, EditableDocument, FormFieldValue};
let mut editor = DocumentEditor::open("form.pdf")?;
// Set text fields
editor.set_form_field_value("full_name", FormFieldValue::Text("Jane Doe".into()))?;
editor.set_form_field_value("email", FormFieldValue::Text("jane@example.com".into()))?;
// Set checkboxes
editor.set_form_field_value("agree_to_terms", FormFieldValue::Boolean(true))?;
// Set choice fields
editor.set_form_field_value("state", FormFieldValue::Choice("California".into()))?;
editor.save("filled_form.pdf")?;
Exporting Form Data
Export form field data as FDF or XFDF for interchange with other applications.
FDF Export
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("form.pdf")
doc.export_form_data("form_data.fdf")
WASM
const doc = new WasmPdfDocument(bytes);
const fdfBytes = doc.exportFormData("fdf");
// fdfBytes is a Uint8Array
Rust
use pdf_oxide::extractors::FormExtractor;
use pdf_oxide::PdfDocument;
let mut doc = PdfDocument::open("form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;
let fdf_bytes = FormExtractor::export_fdf(&mut doc, fields)?;
std::fs::write("form_data.fdf", &fdf_bytes)?;
XFDF Export
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("form.pdf")
doc.export_form_data("form_data.xfdf", format="xfdf")
WASM
const doc = new WasmPdfDocument(bytes);
const xfdfBytes = doc.exportFormData("xfdf");
Rust
use pdf_oxide::extractors::FormExtractor;
use pdf_oxide::PdfDocument;
let mut doc = PdfDocument::open("form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;
let xfdf = FormExtractor::export_xfdf(&mut doc, fields)?;
std::fs::write("form_data.xfdf", &xfdf)?;
Form Fields in Markdown/HTML
Form field values are included in Markdown and HTML conversion by default. Set the `include_form_fields` option to `False`/`false` to leave them out of the converted output.
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("form.pdf")
# Include form field values (default)
md = doc.to_markdown(0, include_form_fields=True)
# Exclude form fields
md = doc.to_markdown(0, include_form_fields=False)
WASM
const doc = new WasmPdfDocument(bytes);
// Include form fields (default: true)
const md = doc.toMarkdown(0, true, true, true);
// Exclude form fields (4th parameter)
const md2 = doc.toMarkdown(0, true, true, false);
Rust
use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;
let doc = PdfDocument::open("form.pdf")?;
let options = ConversionOptions {
include_form_fields: true,
..Default::default()
};
let md = doc.to_markdown(0, &options)?;
Flattening Forms
Flatten form fields into page content so they become non-editable. Useful for creating finalized PDFs.
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("form.pdf")
# Flatten all form fields
doc.flatten_forms()
doc.save("flattened.pdf")
# Or flatten a single page
doc2 = PdfDocument("form.pdf")
doc2.flatten_forms_on_page(0)
doc2.save("flattened_page0.pdf")
WASM
const doc = new WasmPdfDocument(bytes);
// Flatten all form fields
doc.flattenForms();
const flattened = doc.save();
// Or flatten a single page
const doc2 = new WasmPdfDocument(bytes);
doc2.flattenFormsOnPage(0);
const flattened2 = doc2.save();
Rust
use pdf_oxide::Pdf;
let mut pdf = Pdf::open("form.pdf")?;
// Mark a specific page for flattening
pdf.flatten_page_annotations(0);
pdf.save("flattened.pdf")?;
// Or flatten all pages
let mut pdf2 = Pdf::open("form.pdf")?;
pdf2.flatten_all_annotations();
pdf2.save("flattened_all.pdf")?;
XFA Forms
Analyze XFA (XML Forms Architecture) form content. XFA forms use XML-based templates rather than AcroForm fields and are common in government and enterprise forms.
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("xfa-form.pdf")
if doc.has_xfa():
print("This document contains an XFA form")
fields = doc.get_form_fields() # Extracts AcroForm fallback fields
for field in fields:
print(f" {field.name}: {field.value}")
Node.js
const doc = new PdfDocument("xfa-form.pdf");
if (doc.hasXFA()) {
console.log("This document contains an XFA form");
const fields = doc.getFormFields();
for (const field of fields) {
console.log(` ${field.name}: ${field.value}`);
}
}
doc.close();
Go
doc, _ := pdfoxide.Open("xfa-form.pdf")
defer doc.Close()
if doc.HasXfa() {
fmt.Println("This document contains an XFA form")
fields, _ := doc.FormFields()
for _, field := range fields {
fmt.Printf(" %s: %s\n", field.Name, field.Value)
}
}
C#
using var doc = PdfDocument.Open("xfa-form.pdf");
if (doc.HasXfa)
{
Console.WriteLine("This document contains an XFA form");
var fields = doc.GetFormFields();
foreach (var field in fields)
{
Console.WriteLine($" {field.Name}: {field.Value}");
}
}
WASM
const doc = new WasmPdfDocument(bytes);
if (doc.hasXfa()) {
console.log("This document contains an XFA form");
const fields = doc.getFormFields(); // AcroForm fallback fields
for (const field of fields) {
console.log(` ${field.name}: ${field.value}`);
}
}
Rust
use pdf_oxide::xfa::analyze_xfa_document;
use pdf_oxide::PdfDocument;
let mut doc = PdfDocument::open("xfa-form.pdf")?;
let analysis = analyze_xfa_document(&mut doc)?;
println!("XFA form detected: {} fields", analysis.fields.len());
for field in &analysis.fields {
println!(" {} ({:?})", field.name, field.field_type);
}
API Reference
Python API
| Method | Description |
|---|---|
| `doc.get_form_fields()` | Get all form fields as `FormField` objects |
| `doc.get_form_field_value(name)` | Get value of a specific field by name |
| `doc.set_form_field_value(name, value)` | Set value of a form field |
| `doc.export_form_data(path, format="fdf")` | Export form data to FDF or XFDF file |
| `doc.has_xfa()` | Check if the document contains an XFA form |
| `doc.flatten_forms()` | Flatten all form fields into page content |
| `doc.flatten_forms_on_page(page)` | Flatten form fields on a specific page |
Python FormField Properties
| Property | Type | Description |
|---|---|---|
| `name` | `str` | Field name |
| `field_type` | `str` | Field type (text, checkbox, radio, choice, signature) |
| `value` | `str \| bool \| None` | Current field value |
| `is_required` | `bool` | Whether the field is required |
| `is_readonly` | `bool` | Whether the field is read-only |
| `max_length` | `int \| None` | Maximum length for text fields |
JavaScript API
| Method | Description |
|---|---|
| `doc.getFormFields()` | Get all form fields |
| `doc.getFormFieldValue(name)` | Get value of a specific field by name |
| `doc.setFormFieldValue(name, value)` | Set value of a form field |
| `doc.exportFormData(format?)` | Export as FDF (default) or XFDF, returns `Uint8Array` |
| `doc.hasXfa()` | Check if the document contains an XFA form |
| `doc.flattenForms()` | Flatten all form fields into page content |
| `doc.flattenFormsOnPage(pageIndex)` | Flatten form fields on a specific page |
JavaScript FormField Properties
| Property | Type | Description |
|---|---|---|
| `name` | `string` | Field name |
| `fieldType` | `string` | Field type |
| `value` | `string \| boolean \| null` | Current value |
| `flags` | `number` | Field flags |
Rust API
| Function | Description |
|---|---|
| `FormExtractor::extract_fields(doc)` | Extract all form fields from AcroForm dictionary |
| `FormExtractor::export_fdf(doc, fields)` | Export as FDF bytes |
| `FormExtractor::export_xfdf(doc, fields)` | Export as XFDF string |
| `analyze_xfa_document(doc)` | Analyze XFA form structure |
| `editor.get_form_fields()` | Get fields via `DocumentEditor` |
| `editor.get_form_field_value(name)` | Get field value by name |
| `editor.set_form_field_value(name, value)` | Set field value |
FormField Fields (Rust)
| Field | Type | Description |
|---|---|---|
| `name` | `String` | Field name from `/T` key |
| `full_name` | `String` | Fully qualified name (dot-separated) |
| `field_type` | `FieldType` | Button, Text, Choice, Signature, Unknown |
| `value` | `FieldValue` | Current field value |
| `tooltip` | `Option<String>` | Tooltip from `/TU` key |
| `bounds` | `Option<[f64; 4]>` | Bounding box `[x1, y1, x2, y2]` |
| `flags` | `Option<u32>` | Field flags (ReadOnly, Required, NoExport) |
| `default_value` | `Option<FieldValue>` | Default value from `/DV` key |
| `max_length` | `Option<u32>` | Max length for text fields |
FieldType Variants
| Variant | Description |
|---|---|
| `Button` | Checkbox, radio button, or push button (`/Btn`) |
| `Text` | Single or multi-line text field (`/Tx`) |
| `Choice` | List box or combo box (`/Ch`) |
| `Signature` | Digital signature field (`/Sig`) |
| `Unknown(String)` | Unrecognized field type |
FieldValue Variants
| Variant | Description |
|---|---|
| `Text(String)` | Text string value |
| `Boolean(bool)` | Boolean value (checkboxes) |
| `Name(String)` | Name value (radio buttons, choice fields) |
| `Array(Vec<String>)` | Multiple values (multi-select list boxes) |
| `None` | No value present |
Advanced: Check Required Fields
Python
from pdf_oxide import PdfDocument
doc = PdfDocument("form.pdf")
fields = doc.get_form_fields()
missing = [f for f in fields if f.is_required and not f.value]
if missing:
print("Missing required fields:")
for f in missing:
print(f" - {f.name}")
Rust
use pdf_oxide::extractors::{FormExtractor, FieldValue};
use pdf_oxide::PdfDocument;
let mut doc = PdfDocument::open("form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;
let required_empty: Vec<_> = fields.iter()
.filter(|f| {
f.flags.map_or(false, |flags| flags & 0x02 != 0)
&& (matches!(f.value, FieldValue::None) || matches!(f.value, FieldValue::Text(ref s) if s.is_empty()))
})
.collect();
if !required_empty.is_empty() {
println!("Missing required fields:");
for f in &required_empty {
println!(" - {}", f.full_name);
}
}
Related Pages
- Fill PDF Forms – Step-by-step form filling guide
- Annotation Extraction – Access annotations alongside form fields
- Text Extraction – Extract text content from pages
- Metadata & XMP – Read document-level properties