Skip to content

Form Data Extraction

PDF Oxide extracts interactive form fields (AcroForms) from PDF documents, including text fields, checkboxes, radio buttons, choice fields, and signatures. Extracted form data can be exported to FDF or XFDF format for interchange. XFA forms (XML Forms Architecture) can be analyzed and converted as well.

Quick Example

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("form.pdf")
fields = doc.get_form_fields()
for field in fields:
    print(f"{field.name} ({field.field_type}): {field.value}")

Node.js

const { PdfDocument } = require("pdf-oxide");

const doc = new PdfDocument("form.pdf");
const fields = doc.getFormFields();
for (const field of fields) {
  console.log(`${field.name} (${field.fieldType}): ${field.value}`);
}
doc.close();

Go

import pdfoxide "github.com/yfedoseev/pdf_oxide/go"

doc, _ := pdfoxide.Open("form.pdf")
defer doc.Close()
fields, _ := doc.FormFields()
for _, field := range fields {
    fmt.Printf("%s (%s): %s\n", field.Name, field.FieldType, field.Value)
}

C#

using PdfOxide.Core;

using var doc = PdfDocument.Open("form.pdf");
var fields = doc.GetFormFields();
foreach (var field in fields)
{
    Console.WriteLine($"{field.Name} ({field.FieldType}): {field.Value}");
}

WASM

const doc = new WasmPdfDocument(bytes);
const fields = doc.getFormFields();
for (const field of fields) {
    console.log(`${field.name} (${field.fieldType}): ${field.value}`);
}

Rust

use pdf_oxide::extractors::FormExtractor;
use pdf_oxide::PdfDocument;

let mut doc = PdfDocument::open("form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;
for field in &fields {
    println!("{} ({:?}): {:?}", field.full_name, field.field_type, field.value);
}

Migrating from PyMuPDF get_form_fields()

If you’re switching from PyMuPDF, the API is similar but PDF Oxide returns richer data and handles XFA forms:

PyMuPDF:

import fitz

doc = fitz.open("form.pdf")
# Returns dict of {field_name: field_value} — loses type info
fields = doc.get_form_fields()

# Or iterate widgets for more detail
for page in doc:
    for widget in page.widgets():
        print(widget.field_name, widget.field_value)

PDF Oxide:

from pdf_oxide import PdfDocument

doc = PdfDocument("form.pdf")
# Returns structured objects with name, value, type, options, rect
fields = doc.get_form_fields()
for field in fields:
    print(f"{field.name} ({field.field_type}): {field.value}")

# Also handles XFA forms that PyMuPDF cannot read
xfa = doc.has_xfa()

Key differences:

  • PDF Oxide returns structured field objects (not just a dict)
  • Includes field type, bounding rect, and options for choice fields
  • Supports XFA forms — PyMuPDF’s get_form_fields() returns empty for XFA-only PDFs
  • Export to FDF/XFDF format for form data interchange

For a complete migration guide covering PyMuPDF, pypdf, pdfplumber, and pdfminer, see Migrate to PDF Oxide.


Reading Form Fields

Get All Fields

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("tax-form.pdf")
fields = doc.get_form_fields()

for field in fields:
    print(f"Name: {field.name}")
    print(f"  Type: {field.field_type}")
    print(f"  Value: {field.value}")
    print(f"  Required: {field.is_required}")
    print(f"  Read-only: {field.is_readonly}")
    if field.max_length:
        print(f"  Max length: {field.max_length}")

Node.js

const doc = new PdfDocument("tax-form.pdf");
const fields = doc.getFormFields();

for (const field of fields) {
  console.log(`Name: ${field.name}`);
  console.log(`  Type: ${field.fieldType}`);
  console.log(`  Value: ${field.value}`);
}
doc.close();

Go

doc, _ := pdfoxide.Open("tax-form.pdf")
defer doc.Close()
fields, _ := doc.FormFields()

for _, field := range fields {
    fmt.Printf("Name: %s\n", field.Name)
    fmt.Printf("  Type: %s\n", field.FieldType)
    fmt.Printf("  Value: %s\n", field.Value)
}

C#

using var doc = PdfDocument.Open("tax-form.pdf");
var fields = doc.GetFormFields();

foreach (var field in fields)
{
    Console.WriteLine($"Name: {field.Name}");
    Console.WriteLine($"  Type: {field.FieldType}");
    Console.WriteLine($"  Value: {field.Value}");
}

WASM

const doc = new WasmPdfDocument(bytes);
const fields = doc.getFormFields();

for (const field of fields) {
    console.log(`Name: ${field.name}`);
    console.log(`  Type: ${field.fieldType}`);
    console.log(`  Value: ${field.value}`);
    console.log(`  Flags: ${field.flags}`);
}

Rust

use pdf_oxide::extractors::{FormExtractor, FieldType};
use pdf_oxide::PdfDocument;

let mut doc = PdfDocument::open("tax-form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;

for field in &fields {
    let type_str = match &field.field_type {
        FieldType::Button => "Button",
        FieldType::Text => "Text",
        FieldType::Choice => "Choice",
        FieldType::Signature => "Signature",
        FieldType::Unknown(s) => s.as_str(),
    };

    println!("[{}] {} = {:?}", type_str, field.full_name, field.value);

    if let Some(tooltip) = &field.tooltip {
        println!("  Tooltip: {}", tooltip);
    }
    if let Some(bounds) = &field.bounds {
        println!("  Bounds: [{:.1}, {:.1}, {:.1}, {:.1}]",
            bounds[0], bounds[1], bounds[2], bounds[3]);
    }
}

Get a Specific Field Value

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("form.pdf")

name = doc.get_form_field_value("employee_name")
ssn = doc.get_form_field_value("ssn")
agreed = doc.get_form_field_value("agree_to_terms")

print(f"Name: {name}")       # "John Doe"
print(f"SSN: {ssn}")         # "123-45-6789"
print(f"Agreed: {agreed}")   # True

WASM

const doc = new WasmPdfDocument(bytes);

const name = doc.getFormFieldValue("employee_name");
const ssn = doc.getFormFieldValue("ssn");
const agreed = doc.getFormFieldValue("agree_to_terms");

console.log(`Name: ${name}`);     // "John Doe"
console.log(`SSN: ${ssn}`);       // "123-45-6789"
console.log(`Agreed: ${agreed}`); // true

Rust

use pdf_oxide::editor::{DocumentEditor, EditableDocument};

let mut editor = DocumentEditor::open("form.pdf")?;

if let Some(value) = editor.get_form_field_value("employee_name")? {
    println!("Name: {:?}", value);
}

Filling Forms

Set Field Values

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("form.pdf")

# Set text fields
doc.set_form_field_value("full_name", "Jane Doe")
doc.set_form_field_value("email", "jane@example.com")

# Set checkboxes
doc.set_form_field_value("agree_to_terms", True)

# Save the filled form
doc.save("filled_form.pdf")

WASM

const doc = new WasmPdfDocument(bytes);

// Set text fields
doc.setFormFieldValue("full_name", "Jane Doe");
doc.setFormFieldValue("email", "jane@example.com");

// Set checkboxes
doc.setFormFieldValue("agree_to_terms", true);

// Save the filled form
const filledBytes = doc.save();

Rust

use pdf_oxide::editor::{DocumentEditor, EditableDocument, FormFieldValue};

let mut editor = DocumentEditor::open("form.pdf")?;

// Set text fields
editor.set_form_field_value("full_name", FormFieldValue::Text("Jane Doe".into()))?;
editor.set_form_field_value("email", FormFieldValue::Text("jane@example.com".into()))?;

// Set checkboxes
editor.set_form_field_value("agree_to_terms", FormFieldValue::Boolean(true))?;

// Set choice fields
editor.set_form_field_value("state", FormFieldValue::Choice("California".into()))?;

editor.save("filled_form.pdf")?;

Exporting Form Data

Export form field data as FDF or XFDF for interchange with other applications.

FDF Export

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("form.pdf")
doc.export_form_data("form_data.fdf")

WASM

const doc = new WasmPdfDocument(bytes);
const fdfBytes = doc.exportFormData("fdf");
// fdfBytes is a Uint8Array

Rust

use pdf_oxide::extractors::FormExtractor;
use pdf_oxide::PdfDocument;

let mut doc = PdfDocument::open("form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;
let fdf_bytes = FormExtractor::export_fdf(&mut doc, fields)?;
std::fs::write("form_data.fdf", &fdf_bytes)?;

XFDF Export

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("form.pdf")
doc.export_form_data("form_data.xfdf", format="xfdf")

WASM

const doc = new WasmPdfDocument(bytes);
const xfdfBytes = doc.exportFormData("xfdf");

Rust

use pdf_oxide::extractors::FormExtractor;
use pdf_oxide::PdfDocument;

let mut doc = PdfDocument::open("form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;
let xfdf = FormExtractor::export_xfdf(&mut doc, fields)?;
std::fs::write("form_data.xfdf", &xfdf)?;

Form Fields in Markdown/HTML

Form field values are included in Markdown and HTML conversion by default. Use include_form_fields to control this.

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("form.pdf")

# Include form field values (default)
md = doc.to_markdown(0, include_form_fields=True)

# Exclude form fields
md = doc.to_markdown(0, include_form_fields=False)

WASM

const doc = new WasmPdfDocument(bytes);

// Include form fields (default: true)
const md = doc.toMarkdown(0, true, true, true);

// Exclude form fields (4th parameter)
const md2 = doc.toMarkdown(0, true, true, false);

Rust

use pdf_oxide::PdfDocument;
use pdf_oxide::converters::ConversionOptions;

let doc = PdfDocument::open("form.pdf")?;
let options = ConversionOptions {
    include_form_fields: true,
    ..Default::default()
};
let md = doc.to_markdown(0, &options)?;

Flattening Forms

Flatten form fields into page content so they become non-editable. Useful for creating finalized PDFs.

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("form.pdf")

# Flatten all form fields
doc.flatten_forms()
doc.save("flattened.pdf")

# Or flatten a single page
doc2 = PdfDocument("form.pdf")
doc2.flatten_forms_on_page(0)
doc2.save("flattened_page0.pdf")

WASM

const doc = new WasmPdfDocument(bytes);

// Flatten all form fields
doc.flattenForms();
const flattened = doc.save();

// Or flatten a single page
const doc2 = new WasmPdfDocument(bytes);
doc2.flattenFormsOnPage(0);
const flattened2 = doc2.save();

Rust

use pdf_oxide::Pdf;

let mut pdf = Pdf::open("form.pdf")?;

// Mark a specific page for flattening
pdf.flatten_page_annotations(0);
pdf.save("flattened.pdf")?;

// Or flatten all pages
let mut pdf2 = Pdf::open("form.pdf")?;
pdf2.flatten_all_annotations();
pdf2.save("flattened_all.pdf")?;

XFA Forms

Analyze XFA (XML Forms Architecture) form content. XFA forms use XML-based templates rather than AcroForm fields and are common in government and enterprise forms.

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("xfa-form.pdf")
if doc.has_xfa():
    print("This document contains an XFA form")
    fields = doc.get_form_fields()  # Extracts AcroForm fallback fields
    for field in fields:
        print(f"  {field.name}: {field.value}")

Node.js

const doc = new PdfDocument("xfa-form.pdf");
if (doc.hasXFA()) {
  console.log("This document contains an XFA form");
  const fields = doc.getFormFields();
  for (const field of fields) {
    console.log(`  ${field.name}: ${field.value}`);
  }
}
doc.close();

Go

doc, _ := pdfoxide.Open("xfa-form.pdf")
defer doc.Close()
if doc.HasXfa() {
    fmt.Println("This document contains an XFA form")
    fields, _ := doc.FormFields()
    for _, field := range fields {
        fmt.Printf("  %s: %s\n", field.Name, field.Value)
    }
}

C#

using var doc = PdfDocument.Open("xfa-form.pdf");
if (doc.HasXfa)
{
    Console.WriteLine("This document contains an XFA form");
    var fields = doc.GetFormFields();
    foreach (var field in fields)
    {
        Console.WriteLine($"  {field.Name}: {field.Value}");
    }
}

WASM

const doc = new WasmPdfDocument(bytes);
if (doc.hasXfa()) {
    console.log("This document contains an XFA form");
    const fields = doc.getFormFields(); // AcroForm fallback fields
    for (const field of fields) {
        console.log(`  ${field.name}: ${field.value}`);
    }
}

Rust

use pdf_oxide::xfa::analyze_xfa_document;
use pdf_oxide::PdfDocument;

let mut doc = PdfDocument::open("xfa-form.pdf")?;
let analysis = analyze_xfa_document(&mut doc)?;
println!("XFA form detected: {} fields", analysis.fields.len());
for field in &analysis.fields {
    println!("  {} ({:?})", field.name, field.field_type);
}

API Reference

Python API

Method Description
doc.get_form_fields() Get all form fields as FormField objects
doc.get_form_field_value(name) Get value of a specific field by name
doc.set_form_field_value(name, value) Set value of a form field
doc.export_form_data(path, format="fdf") Export form data to FDF or XFDF file
doc.has_xfa() Check if the document contains an XFA form
doc.flatten_forms() Flatten all form fields into page content
doc.flatten_forms_on_page(page) Flatten form fields on a specific page

Python FormField Properties

Property Type Description
name str Field name
field_type str Field type (text, checkbox, radio, choice, signature)
value str | bool | None Current field value
is_required bool Whether the field is required
is_readonly bool Whether the field is read-only
max_length int | None Maximum length for text fields

JavaScript API

Method Description
doc.getFormFields() Get all form fields
doc.getFormFieldValue(name) Get value of a specific field by name
doc.setFormFieldValue(name, value) Set value of a form field
doc.exportFormData(format?) Export as FDF (default) or XFDF, returns Uint8Array
doc.hasXfa() Check if the document contains an XFA form
doc.flattenForms() Flatten all form fields into page content
doc.flattenFormsOnPage(pageIndex) Flatten form fields on a specific page

JavaScript FormField Properties

Property Type Description
name string Field name
fieldType string Field type
value string | boolean | null Current value
flags number Field flags

Rust API

Function Description
FormExtractor::extract_fields(doc) Extract all form fields from AcroForm dictionary
FormExtractor::export_fdf(doc, fields) Export as FDF bytes
FormExtractor::export_xfdf(doc, fields) Export as XFDF string
analyze_xfa_document(doc) Analyze XFA form structure
editor.get_form_fields() Get fields via DocumentEditor
editor.get_form_field_value(name) Get field value by name
editor.set_form_field_value(name, value) Set field value

FormField Fields (Rust)

Field Type Description
name String Field name from /T key
full_name String Fully qualified name (dot-separated)
field_type FieldType Button, Text, Choice, Signature, Unknown
value FieldValue Current field value
tooltip Option<String> Tooltip from /TU key
bounds Option<[f64; 4]> Bounding box [x1, y1, x2, y2]
flags Option<u32> Field flags (ReadOnly, Required, NoExport)
default_value Option<FieldValue> Default value from /DV key
max_length Option<u32> Max length for text fields

FieldType Variants

Variant Description
Button Checkbox, radio button, or push button (/Btn)
Text Single or multi-line text field (/Tx)
Choice List box or combo box (/Ch)
Signature Digital signature field (/Sig)
Unknown(String) Unrecognized field type

FieldValue Variants

Variant Description
Text(String) Text string value
Boolean(bool) Boolean value (checkboxes)
Name(String) Name value (radio buttons, choice fields)
Array(Vec<String>) Multiple values (multi-select list boxes)
None No value present

Advanced: Check Required Fields

Python

from pdf_oxide import PdfDocument

doc = PdfDocument("form.pdf")
fields = doc.get_form_fields()

missing = [f for f in fields if f.is_required and not f.value]
if missing:
    print("Missing required fields:")
    for f in missing:
        print(f"  - {f.name}")

Rust

use pdf_oxide::extractors::{FormExtractor, FieldValue};
use pdf_oxide::PdfDocument;

let mut doc = PdfDocument::open("form.pdf")?;
let fields = FormExtractor::extract_fields(&mut doc)?;

let required_empty: Vec<_> = fields.iter()
    .filter(|f| {
        f.flags.map_or(false, |flags| flags & 0x02 != 0)
            && matches!(f.value, FieldValue::None | FieldValue::Text(ref s) if s.is_empty())
    })
    .collect();

if !required_empty.is_empty() {
    println!("Missing required fields:");
    for f in &required_empty {
        println!("  - {}", f.full_name);
    }
}