pdf

package
v0.1.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 25, 2025 License: MIT Imports: 16 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ArrayObject

type ArrayObject []Object

ArrayObject represents PDF arrays (e.g., [1 2 R]).

func (ArrayObject) String

func (a ArrayObject) String() string

type BooleanObject

type BooleanObject bool

BooleanObject represents PDF 'true' or 'false'.

func (BooleanObject) String

func (b BooleanObject) String() string

type CMap

type CMap struct {
	SpaceWidth float64 // Fallback width
	Map        map[string]string
}

CMap represents the mapping from Character Codes (CIDs) to Unicode strings.

func NewCMap

func NewCMap() *CMap

func ParseCMap

func ParseCMap(data []byte) (*CMap, error)

ParseCMap parses a ToUnicode stream.

type ContentStreamParser

type ContentStreamParser struct {
	// contains filtered or unexported fields
}

ContentStreamParser parses the stream of instructions for a page.

func NewContentStreamParser

func NewContentStreamParser(data []byte) *ContentStreamParser

func (*ContentStreamParser) Next

func (p *ContentStreamParser) Next() (*Operation, error)

Next returns the next operation, or (nil, io.EOF) when there are no more operations.

type DictionaryObject

type DictionaryObject map[string]Object

DictionaryObject represents PDF dictionaries (e.g., << /Type /Page >>).

func (DictionaryObject) String

func (d DictionaryObject) String() string

type EncryptDict

type EncryptDict struct {
	Filter          string // Should be "/Standard"
	V               int    // Version: 1, 2, 4
	R               int    // Revision: 2, 3, 4
	O               []byte // Owner password hash (32 bytes for revisions 2-4)
	U               []byte // User password hash (32 bytes for revisions 2-4)
	P               int32  // Permission flags
	Length          int    // Key length in bits (40, 128)
	EncryptMetadata bool   // Usually true
}

EncryptDict represents the PDF encryption dictionary.

func ParseEncryptDict

func ParseEncryptDict(obj Object, reader *Reader) (*EncryptDict, error)

ParseEncryptDict extracts the encryption dictionary from a PDF object.

type EncryptionHandler

type EncryptionHandler struct {
	Dict       *EncryptDict
	FileID     []byte // From trailer /ID
	EncryptKey []byte // Computed encryption key
	V          int    // Algorithm version
	R          int    // Standard security handler revision
}

EncryptionHandler handles PDF encryption/decryption.

func NewEncryptionHandler

func NewEncryptionHandler(encDict *EncryptDict, fileID []byte) (*EncryptionHandler, error)

NewEncryptionHandler creates a new encryption handler with an empty password.

func (*EncryptionHandler) Decrypt

func (h *EncryptionHandler) Decrypt(data []byte, objNum, genNum int) ([]byte, error)

Decrypt decrypts data for a specific object using the appropriate algorithm.

type Extractor

type Extractor struct {
	// contains filtered or unexported fields
}

Extractor handles the logic of pulling text from a page.

func NewExtractor

func NewExtractor(r *Reader, page DictionaryObject, extractImages bool) (*Extractor, error)

func (*Extractor) ExtractText

func (e *Extractor) ExtractText() (string, error)

ExtractText is the main entry point.

func (*Extractor) GetImages

func (e *Extractor) GetImages() *[]model.Image

GetImages returns the images found on this page.

type Font

type Font struct {
	BaseFont   string
	CMap       *CMap
	Encoding   map[int]string  // Map char code -> glyph name (from /Encoding/Differences)
	Widths     map[int]float64 // Map char code -> width (1/1000 units)
	MissingW   float64         // Default width
	SpaceWidth float64         // Width of a space character
	IsCID      bool
}

Font represents a PDF font with metrics and mapping.

type GraphicsState

type GraphicsState struct {
	CTM Matrix // Current Transformation Matrix
}

GraphicsState tracks global graphics parameters (CTM).

type HexStringObject

type HexStringObject []byte

HexStringObject represents hex strings (e.g., <AABB>).

func (HexStringObject) String

func (h HexStringObject) String() string

type IndirectObject

type IndirectObject struct {
	ObjectNumber int
	Generation   int
}

IndirectObject represents a reference (e.g., 12 0 R).

func (IndirectObject) String

func (i IndirectObject) String() string

type KeywordObject

type KeywordObject string

KeywordObject represents raw keywords (e.g., obj, stream, Tj).

func (KeywordObject) String

func (k KeywordObject) String() string

type Lexer

type Lexer struct {
	// contains filtered or unexported fields
}

Lexer handles the low-level parsing of PDF objects.

func NewLexer

func NewLexer(r io.ReadSeeker) *Lexer

func (*Lexer) ReadObject

func (l *Lexer) ReadObject() (Object, error)

ReadObject parses the next object from the stream.

type Matrix

type Matrix [6]float64

Matrix is a 3x3 transform matrix (last row implicitly 0,0,1).

func IdentityMatrix

func IdentityMatrix() Matrix

func (Matrix) Mult

func (a Matrix) Mult(b Matrix) Matrix

Mult multiplies matrix a by matrix b.

type NameObject

type NameObject string

NameObject represents PDF names (e.g., /Type).

func (NameObject) String

func (n NameObject) String() string

type NullObject

type NullObject struct{}

NullObject represents the PDF 'null' value.

func (NullObject) String

func (n NullObject) String() string

type NumberObject

type NumberObject float64

NumberObject represents integer or float values.

func (NumberObject) String

func (n NumberObject) String() string

type Object

type Object interface {
	String() string
}

Object is the generic interface for all PDF objects.

type Operation

type Operation struct {
	Operator string
	Operands []Object
}

Operation represents a single PDF command (Operator + Arguments).

type Reader

type Reader struct {
	// contains filtered or unexported fields
}

Reader is the high-level entry point for reading a PDF.

func NewReader

func NewReader(rs io.ReadSeeker) (*Reader, error)

func (*Reader) CacheFont

func (r *Reader) CacheFont(objNum int, f *Font)

func (*Reader) GetCachedFont

func (r *Reader) GetCachedFont(objNum int) *Font

func (*Reader) GetInfo

func (r *Reader) GetInfo() (DictionaryObject, error)

func (*Reader) GetObject

func (r *Reader) GetObject(ref IndirectObject) (Object, error)

GetObject resolves an indirect reference to the actual object.

func (*Reader) GetPage

func (r *Reader) GetPage(pageIndex int) (DictionaryObject, error)

GetPage returns the dictionary for the Nth page (0-indexed).

func (*Reader) IsEncrypted

func (r *Reader) IsEncrypted() bool

IsEncrypted reports whether the PDF has an encryption dictionary in its trailer.

func (*Reader) NumPages

func (r *Reader) NumPages() int

NumPages returns the total page count.

func (*Reader) Resolve

func (r *Reader) Resolve(obj Object) Object

type StreamObject

type StreamObject struct {
	Dictionary DictionaryObject
	Data       []byte
}

StreamObject represents a dictionary followed by binary stream data.

func (StreamObject) String

func (s StreamObject) String() string

type StringObject

type StringObject string

StringObject represents literal strings (e.g., (Hello World)).

func (StringObject) String

func (s StringObject) String() string

type TextState

type TextState struct {
	Font        *Font
	FontSize    float64
	CharSpacing float64
	WordSpacing float64
	Scale       float64
	Leading     float64
	Rise        float64

	TM  Matrix // Text Matrix
	TLM Matrix // Text Line Matrix
}

TextState tracks text-specific parameters.

func NewTextState

func NewTextState() TextState

type XRefEntry

type XRefEntry struct {
	Offset     int64
	Generation int
	Free       bool
	Compressed bool
	StreamObj  int
	StreamIdx  int
}

type XRefTable

type XRefTable struct {
	Entries map[int]XRefEntry
	Trailer DictionaryObject
}

func NewXRefTable

func NewXRefTable() *XRefTable

func ParseXRef

func ParseXRef(rs io.ReadSeeker) (*XRefTable, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL