Documentation
¶
Index ¶
- type ArrayObject
- type BooleanObject
- type CMap
- type ContentStreamParser
- type DictionaryObject
- type EncryptDict
- type EncryptionHandler
- type Extractor
- type Font
- type GraphicsState
- type HexStringObject
- type IndirectObject
- type KeywordObject
- type Lexer
- type Matrix
- type NameObject
- type NullObject
- type NumberObject
- type Object
- type Operation
- type Reader
- func (r *Reader) CacheFont(objNum int, f *Font)
- func (r *Reader) GetCachedFont(objNum int) *Font
- func (r *Reader) GetInfo() (DictionaryObject, error)
- func (r *Reader) GetObject(ref IndirectObject) (Object, error)
- func (r *Reader) GetPage(pageIndex int) (DictionaryObject, error)
- func (r *Reader) IsEncrypted() bool
- func (r *Reader) NumPages() int
- func (r *Reader) Resolve(obj Object) Object
- type StreamObject
- type StringObject
- type TextState
- type XRefEntry
- type XRefTable
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ArrayObject ¶
type ArrayObject []Object
ArrayObject represents PDF arrays (e.g., [1 2 R]).
func (ArrayObject) String ¶
func (a ArrayObject) String() string
type BooleanObject ¶
type BooleanObject bool
BooleanObject represents PDF 'true' or 'false'.
func (BooleanObject) String ¶
func (b BooleanObject) String() string
type ContentStreamParser ¶
type ContentStreamParser struct {
// contains filtered or unexported fields
}
ContentStreamParser parses the stream of instructions for a page.
func NewContentStreamParser ¶
func NewContentStreamParser(data []byte) *ContentStreamParser
func (*ContentStreamParser) Next ¶
func (p *ContentStreamParser) Next() (*Operation, error)
Next returns the next operation, or (nil, io.EOF) when done.
type DictionaryObject ¶
DictionaryObject represents PDF dictionaries (e.g., << /Type /Page >>).
func (DictionaryObject) String ¶
func (d DictionaryObject) String() string
type EncryptDict ¶
type EncryptDict struct {
Filter string // Should be "/Standard"
V int // Version: 1, 2, 4
R int // Revision: 2, 3, 4
O []byte // Owner password hash (32 bytes for R 2-4)
U []byte // User password hash (32 bytes for R 2-4)
P int32 // Permission flags
Length int // Key length in bits (40, 128)
EncryptMetadata bool // Usually true
}
EncryptDict represents the PDF encryption dictionary.
func ParseEncryptDict ¶
func ParseEncryptDict(obj Object, reader *Reader) (*EncryptDict, error)
ParseEncryptDict extracts the encryption dictionary from a PDF object.
type EncryptionHandler ¶
type EncryptionHandler struct {
Dict *EncryptDict
FileID []byte // From trailer /ID
EncryptKey []byte // Computed encryption key
V int // Algorithm version
R int // Standard security handler revision
}
EncryptionHandler handles PDF encryption/decryption.
func NewEncryptionHandler ¶
func NewEncryptionHandler(encDict *EncryptDict, fileID []byte) (*EncryptionHandler, error)
NewEncryptionHandler creates a new encryption handler with an empty password.
type Extractor ¶
type Extractor struct {
// contains filtered or unexported fields
}
Extractor handles the logic of pulling text from a page.
func NewExtractor ¶
func NewExtractor(r *Reader, page DictionaryObject, extractImages bool) (*Extractor, error)
func (*Extractor) ExtractText ¶
ExtractText is the main entry point.
type Font ¶
type Font struct {
BaseFont string
CMap *CMap
Encoding map[int]string // Map char code -> glyph name (from /Encoding/Differences)
Widths map[int]float64 // Map char code -> width (1/1000 units)
MissingW float64 // Default width
SpaceWidth float64 // Width of a space character
IsCID bool
}
Font represents a PDF font with metrics and mapping.
type GraphicsState ¶
type GraphicsState struct {
CTM Matrix // Current Transformation Matrix
}
GraphicsState tracks global graphics parameters (CTM).
type HexStringObject ¶
type HexStringObject []byte
HexStringObject represents hex strings (e.g., <AABB>).
func (HexStringObject) String ¶
func (h HexStringObject) String() string
type IndirectObject ¶
IndirectObject represents a reference (e.g., 12 0 R).
func (IndirectObject) String ¶
func (i IndirectObject) String() string
type KeywordObject ¶
type KeywordObject string
KeywordObject represents raw keywords (e.g., obj, stream, Tj).
func (KeywordObject) String ¶
func (k KeywordObject) String() string
type Lexer ¶
type Lexer struct {
// contains filtered or unexported fields
}
Lexer handles the low-level parsing of PDF objects.
func NewLexer ¶
func NewLexer(r io.ReadSeeker) *Lexer
func (*Lexer) ReadObject ¶
ReadObject parses the next object from the stream.
type Matrix ¶
type Matrix [6]float64
Matrix holds the six variable entries of a 3x3 affine transform matrix; the remaining three entries are implicitly 0, 0, 1.
func IdentityMatrix ¶
func IdentityMatrix() Matrix
type NameObject ¶
type NameObject string
NameObject represents PDF names (e.g., /Type).
func (NameObject) String ¶
func (n NameObject) String() string
type NullObject ¶
type NullObject struct{}
NullObject represents the PDF 'null' value.
func (NullObject) String ¶
func (n NullObject) String() string
type NumberObject ¶
type NumberObject float64
NumberObject represents integer or float values.
func (NumberObject) String ¶
func (n NumberObject) String() string
type Object ¶
type Object interface {
String() string
}
Object is the generic interface for all PDF objects.
type Reader ¶
type Reader struct {
// contains filtered or unexported fields
}
Reader is the high-level entry point for reading a PDF.
func (*Reader) GetCachedFont ¶
func (*Reader) GetInfo ¶
func (r *Reader) GetInfo() (DictionaryObject, error)
func (*Reader) GetObject ¶
func (r *Reader) GetObject(ref IndirectObject) (Object, error)
GetObject resolves an indirect reference to the actual object.
func (*Reader) GetPage ¶
func (r *Reader) GetPage(pageIndex int) (DictionaryObject, error)
GetPage returns the dictionary for the Nth page (0-indexed).
func (*Reader) IsEncrypted ¶
IsEncrypted reports whether the PDF has an encryption dictionary in its trailer.
type StreamObject ¶
type StreamObject struct {
Dictionary DictionaryObject
Data []byte
}
StreamObject represents a dictionary followed by binary stream data.
func (StreamObject) String ¶
func (s StreamObject) String() string
type StringObject ¶
type StringObject string
StringObject represents literal strings (e.g., (Hello World)).
func (StringObject) String ¶
func (s StringObject) String() string
type TextState ¶
type TextState struct {
Font *Font
FontSize float64
CharSpacing float64
WordSpacing float64
Scale float64
Leading float64
Rise float64
TM Matrix // Text Matrix
TLM Matrix // Text Line Matrix
}
TextState tracks text-specific parameters.
func NewTextState ¶
func NewTextState() TextState
type XRefTable ¶
type XRefTable struct {
Entries map[int]XRefEntry
Trailer DictionaryObject
}
func NewXRefTable ¶
func NewXRefTable() *XRefTable