reader

package

Version: v0.0.2 (latest) · Published: Feb 23, 2026 · License: MIT · Imports: 17 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/aqua777/go-llamaindex

Links

Open Source Insights

Documentation ¶

Overview ¶

Package reader provides document loading functionality for go-llamaindex.

Index ¶

func ExtractTextFromPDF(filePath string) (string, error)
func ExtractTextFromPDFByPage(filePath string) ([]string, error)
func GetPDFMetadata(filePath string) (map[string]string, error)
func GetPDFPageCount(filePath string) (int, error)
type CSVReader
- func NewCSVReader(inputFiles ...string) *CSVReader
- func NewCSVReaderFromDir(inputDir string, recursive bool) *CSVReader
- func (r *CSVReader) LoadData() ([]schema.Node, error)
- func (r *CSVReader) LoadFromFile(filePath string) ([]schema.Node, error)
- func (r *CSVReader) Metadata() ReaderMetadata
- func (r *CSVReader) WithConcatRows(concat bool) *CSVReader
- func (r *CSVReader) WithDelimiter(delimiter rune) *CSVReader
- func (r *CSVReader) WithHeader(hasHeader bool) *CSVReader
- func (r *CSVReader) WithMetadataColumns(columns ...string) *CSVReader
- func (r *CSVReader) WithRowSeparator(sep string) *CSVReader
- func (r *CSVReader) WithTextColumns(columns ...string) *CSVReader
type CSVStreamReader
- func NewCSVStreamReader(filePath string) (*CSVStreamReader, error)
- func (r *CSVStreamReader) Close() error
- func (r *CSVStreamReader) LazyLoadData() (<-chan schema.Node, <-chan error)
- func (r *CSVStreamReader) ReadHeaders() ([]string, error)
- func (r *CSVStreamReader) ReadNext() (*schema.Node, error)
type DocxReader
- func NewDocxReader(inputFiles ...string) *DocxReader
- func NewDocxReaderFromDir(inputDir string, recursive bool) *DocxReader
- func (r *DocxReader) LoadData() ([]schema.Node, error)
- func (r *DocxReader) LoadFromBytes(content []byte, sourceName string) ([]schema.Node, error)
- func (r *DocxReader) LoadFromFile(filePath string) ([]schema.Node, error)
- func (r *DocxReader) Metadata() ReaderMetadata
- func (r *DocxReader) WithExtractImages(extract bool) *DocxReader
- func (r *DocxReader) WithExtractMetadata(extract bool) *DocxReader
- func (r *DocxReader) WithExtractTables(extract bool) *DocxReader
- func (r *DocxReader) WithPreserveParagraphs(preserve bool) *DocxReader
type ExcelReader
- func NewExcelReader(inputFiles ...string) *ExcelReader
- func NewExcelReaderFromDir(inputDir string, recursive bool) *ExcelReader
- func (r *ExcelReader) LoadData() ([]schema.Node, error)
- func (r *ExcelReader) LoadFromFile(filePath string) ([]schema.Node, error)
- func (r *ExcelReader) Metadata() ReaderMetadata
- func (r *ExcelReader) WithConcatRows(concat bool) *ExcelReader
- func (r *ExcelReader) WithConcatSheets(concat bool) *ExcelReader
- func (r *ExcelReader) WithHeader(hasHeader bool) *ExcelReader
- func (r *ExcelReader) WithMetadataColumns(columns ...string) *ExcelReader
- func (r *ExcelReader) WithRowSeparator(sep string) *ExcelReader
- func (r *ExcelReader) WithSheetSeparator(sep string) *ExcelReader
- func (r *ExcelReader) WithSheets(sheets ...string) *ExcelReader
- func (r *ExcelReader) WithTextColumns(columns ...string) *ExcelReader
type FileReader
type HTMLReader
- func NewHTMLReader(inputFiles ...string) *HTMLReader
- func NewHTMLReaderFromDir(inputDir string, recursive bool) *HTMLReader
- func (r *HTMLReader) LoadData() ([]schema.Node, error)
- func (r *HTMLReader) LoadFromFile(filePath string) ([]schema.Node, error)
- func (r *HTMLReader) Metadata() ReaderMetadata
- func (r *HTMLReader) WithPreserveWhitespace(preserve bool) *HTMLReader
- func (r *HTMLReader) WithTagsToExtract(tags ...string) *HTMLReader
- func (r *HTMLReader) WithTagsToRemove(tags ...string) *HTMLReader
type JSONReader
- func NewJSONReader(inputFiles ...string) *JSONReader
- func NewJSONReaderFromDir(inputDir string, recursive bool) *JSONReader
- func (r *JSONReader) LoadData() ([]schema.Node, error)
- func (r *JSONReader) LoadFromFile(filePath string) ([]schema.Node, error)
- func (r *JSONReader) Metadata() ReaderMetadata
- func (r *JSONReader) WithJSONL(isJSONL bool) *JSONReader
- func (r *JSONReader) WithMetadataKeys(keys ...string) *JSONReader
- func (r *JSONReader) WithTextContentKey(key string) *JSONReader
type LazyReader
type MarkdownReader
- func NewMarkdownReader(inputFiles ...string) *MarkdownReader
- func NewMarkdownReaderFromDir(inputDir string, recursive bool) *MarkdownReader
- func (r *MarkdownReader) LoadData() ([]schema.Node, error)
- func (r *MarkdownReader) LoadFromFile(filePath string) ([]schema.Node, error)
- func (r *MarkdownReader) Metadata() ReaderMetadata
- func (r *MarkdownReader) WithRemoveHyperlinks(remove bool) *MarkdownReader
- func (r *MarkdownReader) WithRemoveImages(remove bool) *MarkdownReader
- func (r *MarkdownReader) WithSplitByHeaders(split bool, levels ...int) *MarkdownReader
type PDFReader
- func NewPDFReader(inputFiles ...string) *PDFReader
- func NewPDFReaderFromDir(inputDir string, recursive bool) *PDFReader
- func NewPDFReaderWithOptions(opts ...PDFReaderOption) *PDFReader
- func (r *PDFReader) LazyLoadData() (<-chan schema.Node, <-chan error)
- func (r *PDFReader) LoadData() ([]schema.Node, error)
- func (r *PDFReader) LoadDataWithContext(ctx context.Context) ([]schema.Node, error)
- func (r *PDFReader) LoadFromFile(filePath string) ([]schema.Node, error)
- func (r *PDFReader) Metadata() ReaderMetadata
- func (r *PDFReader) WithExtraMetadata(metadata map[string]interface{}) *PDFReader
- func (r *PDFReader) WithSplitByPage(split bool) *PDFReader
type PDFReaderOption
- func WithPDFExtraMetadata(metadata map[string]interface{}) PDFReaderOption
- func WithPDFInputDir(dir string) PDFReaderOption
- func WithPDFInputFiles(files ...string) PDFReaderOption
- func WithPDFPasswordFunc(fn func(filePath string) string) PDFReaderOption
- func WithPDFRecursive(recursive bool) PDFReaderOption
- func WithPDFSplitByPage(split bool) PDFReaderOption
type Reader
type ReaderError
- func NewReaderError(source, message string, err error) *ReaderError
- func (e *ReaderError) Error() string
- func (e *ReaderError) Unwrap() error
type ReaderMetadata
type ReaderOptions
- func DefaultReaderOptions() ReaderOptions
type ReaderWithContext
type ReaderWithMetadata
type SimpleDirectoryReader
- func NewSimpleDirectoryReader(inputDir string, extensions ...string) *SimpleDirectoryReader
- func (r *SimpleDirectoryReader) LoadData() ([]schema.Node, error)

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func ExtractTextFromPDF ¶

func ExtractTextFromPDF(filePath string) (string, error)

ExtractTextFromPDF is a utility function to extract text from a PDF file.

func ExtractTextFromPDFByPage ¶

func ExtractTextFromPDFByPage(filePath string) ([]string, error)

ExtractTextFromPDFByPage extracts text from a PDF file, returning text per page.

func GetPDFMetadata ¶

func GetPDFMetadata(filePath string) (map[string]string, error)

GetPDFMetadata extracts metadata from a PDF file.

func GetPDFPageCount ¶

func GetPDFPageCount(filePath string) (int, error)

GetPDFPageCount returns the number of pages in a PDF file.

Types ¶

type CSVReader ¶

// CSVReader reads CSV files and converts them to documents.
// Create one with NewCSVReader (explicit files) or NewCSVReaderFromDir
// (directory scan), then adjust it with the chainable With* methods.
type CSVReader struct {
	// InputFiles is a list of CSV file paths to read
	InputFiles []string
	// InputDir is a directory containing CSV files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// Delimiter is the field delimiter (default: comma).
	// See WithDelimiter.
	Delimiter rune
	// HasHeader indicates if the first row is a header row.
	// See WithHeader.
	HasHeader bool
	// TextColumns are column names or indices to use as document text.
	// If empty, all columns are concatenated as text.
	// See WithTextColumns.
	// NOTE(review): indices are carried in a []string; presumably written as
	// decimal strings — confirm the accepted format.
	TextColumns []string
	// MetadataColumns are column names or indices to extract as metadata.
	// If empty, all non-text columns are used as metadata.
	// See WithMetadataColumns.
	MetadataColumns []string
	// ConcatRows determines if all rows should be concatenated into a single document.
	// If false (default), each row becomes a separate document.
	// See WithConcatRows.
	ConcatRows bool
	// RowSeparator is used when ConcatRows is true (default: newline).
	// See WithRowSeparator.
	RowSeparator string
}

CSVReader reads CSV files and converts them to documents.

func NewCSVReader ¶

func NewCSVReader(inputFiles ...string) *CSVReader

NewCSVReader creates a new CSVReader for specific files.

func NewCSVReaderFromDir ¶

func NewCSVReaderFromDir(inputDir string, recursive bool) *CSVReader

NewCSVReaderFromDir creates a new CSVReader for a directory.

func (*CSVReader) LoadData ¶

func (r *CSVReader) LoadData() ([]schema.Node, error)

LoadData loads CSV files and returns documents.

func (*CSVReader) LoadFromFile ¶

func (r *CSVReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single CSV file.

func (*CSVReader) Metadata ¶

func (r *CSVReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*CSVReader) WithConcatRows ¶

func (r *CSVReader) WithConcatRows(concat bool) *CSVReader

WithConcatRows sets whether to concatenate all rows into a single document.

func (*CSVReader) WithDelimiter ¶

func (r *CSVReader) WithDelimiter(delimiter rune) *CSVReader

WithDelimiter sets the field delimiter.

func (*CSVReader) WithHeader ¶

func (r *CSVReader) WithHeader(hasHeader bool) *CSVReader

WithHeader sets whether the first row is a header.

func (*CSVReader) WithMetadataColumns ¶

func (r *CSVReader) WithMetadataColumns(columns ...string) *CSVReader

WithMetadataColumns sets which columns to extract as metadata.

func (*CSVReader) WithRowSeparator ¶

func (r *CSVReader) WithRowSeparator(sep string) *CSVReader

WithRowSeparator sets the separator used when concatenating rows.

func (*CSVReader) WithTextColumns ¶

func (r *CSVReader) WithTextColumns(columns ...string) *CSVReader

WithTextColumns sets which columns to use as document text.

type CSVStreamReader ¶

// CSVStreamReader provides streaming CSV reading for large files.
// Create one with NewCSVStreamReader and release it with Close.
type CSVStreamReader struct {
	// Embedded *CSVReader: its exported fields and methods are promoted,
	// so they are reachable directly on a CSVStreamReader value.
	*CSVReader
	// contains filtered or unexported fields
}

CSVStreamReader provides streaming CSV reading for large files.

func NewCSVStreamReader ¶

func NewCSVStreamReader(filePath string) (*CSVStreamReader, error)

NewCSVStreamReader creates a streaming CSV reader.

func (*CSVStreamReader) Close ¶

func (r *CSVStreamReader) Close() error

Close closes the underlying file.

func (*CSVStreamReader) LazyLoadData ¶

func (r *CSVStreamReader) LazyLoadData() (<-chan schema.Node, <-chan error)

LazyLoadData returns a channel that yields documents one at a time.

func (*CSVStreamReader) ReadHeaders ¶

func (r *CSVStreamReader) ReadHeaders() ([]string, error)

ReadHeaders reads and returns the header row.

func (*CSVStreamReader) ReadNext ¶

func (r *CSVStreamReader) ReadNext() (*schema.Node, error)

ReadNext reads the next row and returns it as a document.

type DocxReader ¶

// DocxReader reads Microsoft Word (.docx) files and converts them to documents.
// Create one with NewDocxReader (explicit files) or NewDocxReaderFromDir
// (directory scan), then adjust it with the chainable With* methods.
type DocxReader struct {
	// InputFiles is a list of DOCX file paths to read
	InputFiles []string
	// InputDir is a directory containing DOCX files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// ExtractImages determines if images should be extracted as separate nodes.
	// See WithExtractImages.
	ExtractImages bool
	// PreserveParagraphs keeps paragraph breaks in the output.
	// See WithPreserveParagraphs.
	PreserveParagraphs bool
	// ExtractMetadata extracts document properties (author, title, etc.).
	// See WithExtractMetadata.
	ExtractMetadata bool
	// ExtractTables extracts table content.
	// See WithExtractTables.
	ExtractTables bool
}

DocxReader reads Microsoft Word (.docx) files and converts them to documents.

func NewDocxReader ¶

func NewDocxReader(inputFiles ...string) *DocxReader

NewDocxReader creates a new DocxReader for specific files.

func NewDocxReaderFromDir ¶

func NewDocxReaderFromDir(inputDir string, recursive bool) *DocxReader

NewDocxReaderFromDir creates a new DocxReader for a directory.

func (*DocxReader) LoadData ¶

func (r *DocxReader) LoadData() ([]schema.Node, error)

LoadData loads DOCX files and returns documents.

func (*DocxReader) LoadFromBytes ¶

func (r *DocxReader) LoadFromBytes(content []byte, sourceName string) ([]schema.Node, error)

LoadFromBytes loads a DOCX from byte content.

func (*DocxReader) LoadFromFile ¶

func (r *DocxReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single DOCX file.

func (*DocxReader) Metadata ¶

func (r *DocxReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*DocxReader) WithExtractImages ¶

func (r *DocxReader) WithExtractImages(extract bool) *DocxReader

WithExtractImages sets whether images are extracted.

func (*DocxReader) WithExtractMetadata ¶

func (r *DocxReader) WithExtractMetadata(extract bool) *DocxReader

WithExtractMetadata sets whether to extract document properties.

func (*DocxReader) WithExtractTables ¶

func (r *DocxReader) WithExtractTables(extract bool) *DocxReader

WithExtractTables sets whether to extract table content.

func (*DocxReader) WithPreserveParagraphs ¶

func (r *DocxReader) WithPreserveParagraphs(preserve bool) *DocxReader

WithPreserveParagraphs sets whether to preserve paragraph breaks.

type ExcelReader ¶

// ExcelReader reads Excel files (.xlsx, .xlsm) and converts them to documents.
// Create one with NewExcelReader (explicit files) or NewExcelReaderFromDir
// (directory scan), then adjust it with the chainable With* methods.
type ExcelReader struct {
	// InputFiles is a list of Excel file paths to read
	InputFiles []string
	// InputDir is a directory containing Excel files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// SheetNames specifies which sheets to read. If empty, all sheets are read.
	// See WithSheets.
	SheetNames []string
	// HasHeader indicates if the first row is a header row.
	// See WithHeader.
	HasHeader bool
	// TextColumns are column names or indices to use as document text.
	// If empty, all columns are concatenated as text.
	// See WithTextColumns.
	TextColumns []string
	// MetadataColumns are column names or indices to extract as metadata.
	// If empty, all non-text columns are used as metadata.
	// See WithMetadataColumns.
	MetadataColumns []string
	// ConcatRows determines if all rows should be concatenated into a single document.
	// If false (default), each row becomes a separate document.
	// See WithConcatRows.
	ConcatRows bool
	// ConcatSheets determines if all sheets should be concatenated into a single document.
	// If false (default), each sheet is processed separately.
	// See WithConcatSheets.
	ConcatSheets bool
	// RowSeparator is used when ConcatRows is true (default: newline).
	// See WithRowSeparator.
	RowSeparator string
	// SheetSeparator is used when ConcatSheets is true (default: double newline).
	// See WithSheetSeparator.
	SheetSeparator string
}

ExcelReader reads Excel files (.xlsx, .xlsm) and converts them to documents.

func NewExcelReader ¶

func NewExcelReader(inputFiles ...string) *ExcelReader

NewExcelReader creates a new ExcelReader for specific files.

func NewExcelReaderFromDir ¶

func NewExcelReaderFromDir(inputDir string, recursive bool) *ExcelReader

NewExcelReaderFromDir creates a new ExcelReader for a directory.

func (*ExcelReader) LoadData ¶

func (r *ExcelReader) LoadData() ([]schema.Node, error)

LoadData loads Excel files and returns documents.

func (*ExcelReader) LoadFromFile ¶

func (r *ExcelReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single Excel file.

func (*ExcelReader) Metadata ¶

func (r *ExcelReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*ExcelReader) WithConcatRows ¶

func (r *ExcelReader) WithConcatRows(concat bool) *ExcelReader

WithConcatRows sets whether to concatenate all rows into a single document.

func (*ExcelReader) WithConcatSheets ¶

func (r *ExcelReader) WithConcatSheets(concat bool) *ExcelReader

WithConcatSheets sets whether to concatenate all sheets into a single document.

func (*ExcelReader) WithHeader ¶

func (r *ExcelReader) WithHeader(hasHeader bool) *ExcelReader

WithHeader sets whether the first row is a header.

func (*ExcelReader) WithMetadataColumns ¶

func (r *ExcelReader) WithMetadataColumns(columns ...string) *ExcelReader

WithMetadataColumns sets which columns to extract as metadata.

func (*ExcelReader) WithRowSeparator ¶

func (r *ExcelReader) WithRowSeparator(sep string) *ExcelReader

WithRowSeparator sets the separator used when concatenating rows.

func (*ExcelReader) WithSheetSeparator ¶

func (r *ExcelReader) WithSheetSeparator(sep string) *ExcelReader

WithSheetSeparator sets the separator used when concatenating sheets.

func (*ExcelReader) WithSheets ¶

func (r *ExcelReader) WithSheets(sheets ...string) *ExcelReader

WithSheets sets which sheets to read.

func (*ExcelReader) WithTextColumns ¶

func (r *ExcelReader) WithTextColumns(columns ...string) *ExcelReader

WithTextColumns sets which columns to use as document text.

type FileReader ¶

// FileReader is a Reader that loads from file paths.
// It adds a single-file entry point on top of the embedded Reader's LoadData.
type FileReader interface {
	Reader
	// LoadFromFile loads a document from a specific file path.
	// The result is a slice, so one file may yield more than one node.
	LoadFromFile(filePath string) ([]schema.Node, error)
}

FileReader is a Reader that loads from file paths.

type HTMLReader ¶

// HTMLReader reads HTML files and extracts text content.
// Create one with NewHTMLReader (explicit files) or NewHTMLReaderFromDir
// (directory scan), then adjust it with the chainable With* methods.
type HTMLReader struct {
	// InputFiles is a list of HTML file paths to read
	InputFiles []string
	// InputDir is a directory containing HTML files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// TagsToExtract specifies which HTML tags to extract text from.
	// If empty, extracts from body. Common values: "p", "div", "article", "main"
	// See WithTagsToExtract.
	TagsToExtract []string
	// TagsToRemove specifies which HTML tags to remove entirely (e.g., "script", "style").
	// See WithTagsToRemove.
	TagsToRemove []string
	// PreserveWhitespace keeps original whitespace formatting.
	// See WithPreserveWhitespace.
	PreserveWhitespace bool
}

HTMLReader reads HTML files and extracts text content.

func NewHTMLReader ¶

func NewHTMLReader(inputFiles ...string) *HTMLReader

NewHTMLReader creates a new HTMLReader for specific files.

func NewHTMLReaderFromDir ¶

func NewHTMLReaderFromDir(inputDir string, recursive bool) *HTMLReader

NewHTMLReaderFromDir creates a new HTMLReader for a directory.

func (*HTMLReader) LoadData ¶

func (r *HTMLReader) LoadData() ([]schema.Node, error)

LoadData loads HTML files and returns documents.

func (*HTMLReader) LoadFromFile ¶

func (r *HTMLReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single HTML file.

func (*HTMLReader) Metadata ¶

func (r *HTMLReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*HTMLReader) WithPreserveWhitespace ¶

func (r *HTMLReader) WithPreserveWhitespace(preserve bool) *HTMLReader

WithPreserveWhitespace sets whether original whitespace is preserved.

func (*HTMLReader) WithTagsToExtract ¶

func (r *HTMLReader) WithTagsToExtract(tags ...string) *HTMLReader

WithTagsToExtract sets which tags to extract text from.

func (*HTMLReader) WithTagsToRemove ¶

func (r *HTMLReader) WithTagsToRemove(tags ...string) *HTMLReader

WithTagsToRemove sets which tags to remove entirely.

type JSONReader ¶

// JSONReader reads JSON files and converts them to documents.
// Create one with NewJSONReader (explicit files) or NewJSONReaderFromDir
// (directory scan), then adjust it with the chainable With* methods.
type JSONReader struct {
	// InputFiles is a list of JSON file paths to read
	InputFiles []string
	// InputDir is a directory containing JSON files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// TextContentKey is the JSON key to use as document text content.
	// If empty, the entire JSON is serialized as text.
	// See WithTextContentKey.
	TextContentKey string
	// MetadataKeys are JSON keys to extract as document metadata.
	// If empty, all non-text keys are used as metadata.
	// See WithMetadataKeys.
	MetadataKeys []string
	// IsJSONL indicates if files are JSON Lines format (one JSON object per line).
	// See WithJSONL.
	IsJSONL bool
}

JSONReader reads JSON files and converts them to documents.

func NewJSONReader ¶

func NewJSONReader(inputFiles ...string) *JSONReader

NewJSONReader creates a new JSONReader for specific files.

func NewJSONReaderFromDir ¶

func NewJSONReaderFromDir(inputDir string, recursive bool) *JSONReader

NewJSONReaderFromDir creates a new JSONReader for a directory.

func (*JSONReader) LoadData ¶

func (r *JSONReader) LoadData() ([]schema.Node, error)

LoadData loads JSON files and returns documents.

func (*JSONReader) LoadFromFile ¶

func (r *JSONReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single JSON file.

func (*JSONReader) Metadata ¶

func (r *JSONReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*JSONReader) WithJSONL ¶

func (r *JSONReader) WithJSONL(isJSONL bool) *JSONReader

WithJSONL sets whether files are parsed as JSON Lines.

func (*JSONReader) WithMetadataKeys ¶

func (r *JSONReader) WithMetadataKeys(keys ...string) *JSONReader

WithMetadataKeys sets the keys to extract as metadata.

func (*JSONReader) WithTextContentKey ¶

func (r *JSONReader) WithTextContentKey(key string) *JSONReader

WithTextContentKey sets the key to use for document text content.

type LazyReader ¶

// LazyReader is a Reader that can load documents lazily via a channel.
type LazyReader interface {
	Reader
	// LazyLoadData returns a channel that yields documents one at a time.
	// The channel is closed when all documents have been loaded or an error occurs.
	// NOTE(review): the signature returns two channels; presumably the first
	// carries nodes and the second carries errors — confirm which of them is
	// closed, and whether callers must drain both.
	LazyLoadData() (<-chan schema.Node, <-chan error)
}

LazyReader is a Reader that can load documents lazily via a channel.

type MarkdownReader ¶

// MarkdownReader reads Markdown files and converts them to documents.
// Create one with NewMarkdownReader (explicit files) or NewMarkdownReaderFromDir
// (directory scan), then adjust it with the chainable With* methods.
type MarkdownReader struct {
	// InputFiles is a list of Markdown file paths to read
	InputFiles []string
	// InputDir is a directory containing Markdown files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// RemoveHyperlinks removes hyperlinks from the text.
	// See WithRemoveHyperlinks.
	RemoveHyperlinks bool
	// RemoveImages removes image references from the text.
	// See WithRemoveImages.
	RemoveImages bool
	// SplitByHeaders splits document into multiple nodes by headers.
	// See WithSplitByHeaders.
	SplitByHeaders bool
	// HeadersToSplitOn specifies which header levels to split on (e.g., []int{1, 2}).
	// NOTE(review): presumably populated from the levels argument of
	// WithSplitByHeaders — confirm against the implementation.
	HeadersToSplitOn []int
}

MarkdownReader reads Markdown files and converts them to documents.

func NewMarkdownReader ¶

func NewMarkdownReader(inputFiles ...string) *MarkdownReader

NewMarkdownReader creates a new MarkdownReader for specific files.

func NewMarkdownReaderFromDir ¶

func NewMarkdownReaderFromDir(inputDir string, recursive bool) *MarkdownReader

NewMarkdownReaderFromDir creates a new MarkdownReader for a directory.

func (*MarkdownReader) LoadData ¶

func (r *MarkdownReader) LoadData() ([]schema.Node, error)

LoadData loads Markdown files and returns documents.

func (*MarkdownReader) LoadFromFile ¶

func (r *MarkdownReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single Markdown file.

func (*MarkdownReader) Metadata ¶

func (r *MarkdownReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*MarkdownReader) WithRemoveHyperlinks ¶

func (r *MarkdownReader) WithRemoveHyperlinks(remove bool) *MarkdownReader

WithRemoveHyperlinks sets whether hyperlinks are removed from the text.

func (*MarkdownReader) WithRemoveImages ¶

func (r *MarkdownReader) WithRemoveImages(remove bool) *MarkdownReader

WithRemoveImages sets whether image references are removed from the text.

func (*MarkdownReader) WithSplitByHeaders ¶

func (r *MarkdownReader) WithSplitByHeaders(split bool, levels ...int) *MarkdownReader

WithSplitByHeaders sets whether to split by headers, optionally taking the header levels to split on.

type PDFReader ¶

// PDFReader reads PDF files and converts them to documents.
// It uses the ledongthuc/pdf library for text extraction.
// Create one with NewPDFReader, NewPDFReaderFromDir, or
// NewPDFReaderWithOptions (functional options of type PDFReaderOption).
type PDFReader struct {
	// InputFiles is a list of PDF file paths to read.
	// See WithPDFInputFiles.
	InputFiles []string
	// InputDir is a directory containing PDF files.
	// See WithPDFInputDir.
	InputDir string
	// Recursive determines if subdirectories should be searched.
	// See WithPDFRecursive.
	Recursive bool
	// SplitByPage creates separate nodes for each page.
	// See WithSplitByPage and WithPDFSplitByPage.
	SplitByPage bool
	// ExtraMetadata is additional metadata to add to all documents.
	// See WithExtraMetadata and WithPDFExtraMetadata.
	ExtraMetadata map[string]interface{}
	// PasswordFunc is a function that returns the password for a PDF file.
	// The function receives the file path and should return the password.
	// See WithPDFPasswordFunc.
	// NOTE(review): behavior when PasswordFunc is nil or returns "" is not
	// shown on this page — confirm.
	PasswordFunc func(filePath string) string
}

PDFReader reads PDF files and converts them to documents. It uses the ledongthuc/pdf library for text extraction.

func NewPDFReader ¶

func NewPDFReader(inputFiles ...string) *PDFReader

NewPDFReader creates a new PDFReader for specific files.

func NewPDFReaderFromDir ¶

func NewPDFReaderFromDir(inputDir string, recursive bool) *PDFReader

NewPDFReaderFromDir creates a new PDFReader for a directory.

func NewPDFReaderWithOptions ¶

func NewPDFReaderWithOptions(opts ...PDFReaderOption) *PDFReader

NewPDFReaderWithOptions creates a new PDFReader with options.

func (*PDFReader) LazyLoadData ¶

func (r *PDFReader) LazyLoadData() (<-chan schema.Node, <-chan error)

LazyLoadData returns a channel that yields documents one at a time.

func (*PDFReader) LoadData ¶

func (r *PDFReader) LoadData() ([]schema.Node, error)

LoadData loads PDF files and returns documents.

func (*PDFReader) LoadDataWithContext ¶

func (r *PDFReader) LoadDataWithContext(ctx context.Context) ([]schema.Node, error)

LoadDataWithContext loads PDF files with context support.

func (*PDFReader) LoadFromFile ¶

func (r *PDFReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single PDF file.

func (*PDFReader) Metadata ¶

func (r *PDFReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*PDFReader) WithExtraMetadata ¶

func (r *PDFReader) WithExtraMetadata(metadata map[string]interface{}) *PDFReader

WithExtraMetadata sets extra metadata (fluent API).

func (*PDFReader) WithSplitByPage ¶

func (r *PDFReader) WithSplitByPage(split bool) *PDFReader

WithSplitByPage sets whether to split by page (fluent API).

type PDFReaderOption ¶

// PDFReaderOption configures PDFReader.
// Values are produced by the WithPDF* functions and consumed by
// NewPDFReaderWithOptions; each option receives the reader being configured.
type PDFReaderOption func(*PDFReader)

PDFReaderOption configures PDFReader.

func WithPDFExtraMetadata ¶

func WithPDFExtraMetadata(metadata map[string]interface{}) PDFReaderOption

WithPDFExtraMetadata sets extra metadata.

func WithPDFInputDir ¶

func WithPDFInputDir(dir string) PDFReaderOption

WithPDFInputDir sets the input directory.

func WithPDFInputFiles ¶

func WithPDFInputFiles(files ...string) PDFReaderOption

WithPDFInputFiles sets the input files.

func WithPDFPasswordFunc ¶

func WithPDFPasswordFunc(fn func(filePath string) string) PDFReaderOption

WithPDFPasswordFunc sets the password function.

func WithPDFRecursive ¶

func WithPDFRecursive(recursive bool) PDFReaderOption

WithPDFRecursive sets whether directories are scanned recursively.

func WithPDFSplitByPage ¶

func WithPDFSplitByPage(split bool) PDFReaderOption

WithPDFSplitByPage sets whether to split by page.

type Reader ¶

// Reader is the interface for document loaders.
// Implementations should load documents from various sources (files, URLs, etc.).
// It is embedded by FileReader, LazyReader, ReaderWithContext, and ReaderWithMetadata.
type Reader interface {
	// LoadData loads documents and returns them as a slice.
	LoadData() ([]schema.Node, error)
}

Reader is the interface for document loaders. Implementations should load documents from various sources (files, URLs, etc.).

type ReaderError ¶

// ReaderError represents an error during document loading.
// Create one with NewReaderError; *ReaderError has Error and Unwrap methods.
type ReaderError struct {
	Source  string // File path or URL that caused the error
	// Message corresponds to the message argument of NewReaderError.
	Message string
	// Err corresponds to the err argument of NewReaderError.
	// NOTE(review): presumably the value returned by Unwrap — confirm.
	Err     error
}

ReaderError represents an error during document loading.

func NewReaderError ¶

func NewReaderError(source, message string, err error) *ReaderError

NewReaderError creates a new ReaderError.

func (*ReaderError) Error ¶

func (e *ReaderError) Error() string

func (*ReaderError) Unwrap ¶

func (e *ReaderError) Unwrap() error

type ReaderMetadata ¶

// ReaderMetadata contains metadata about a reader.
// It is the return type of the Metadata method on the concrete readers and
// of the ReaderWithMetadata interface.
type ReaderMetadata struct {
	// Name is the reader name (e.g., "JSONReader", "PDFReader")
	Name string
	// SupportedExtensions lists file extensions this reader supports.
	// NOTE(review): whether entries include the leading dot is not shown on
	// this page — confirm.
	SupportedExtensions []string
	// Description describes what this reader does
	Description string
}

ReaderMetadata contains metadata about a reader.

type ReaderOptions ¶

// ReaderOptions contains common options for readers.
// DefaultReaderOptions returns the default values; those values are not
// shown on this page.
type ReaderOptions struct {
	// Recursive determines if directory readers should recurse into subdirectories
	Recursive bool
	// FileExtensions filters which file extensions to process
	FileExtensions []string
	// ExcludePatterns are glob patterns for files/dirs to exclude
	ExcludePatterns []string
	// IncludeHidden determines if hidden files should be included
	IncludeHidden bool
	// NumWorkers is the number of concurrent workers for parallel loading.
	// NOTE(review): handling of zero or negative values is not shown on this
	// page — confirm.
	NumWorkers int
	// ExtraMetadata is additional metadata to add to all loaded documents
	ExtraMetadata map[string]interface{}
}

ReaderOptions contains common options for readers.

func DefaultReaderOptions ¶

func DefaultReaderOptions() ReaderOptions

DefaultReaderOptions returns default reader options.

type ReaderWithContext ¶

// ReaderWithContext is a Reader that supports context for cancellation.
type ReaderWithContext interface {
	Reader
	// LoadDataWithContext loads documents with context support.
	// It returns the same result types as Reader.LoadData.
	LoadDataWithContext(ctx context.Context) ([]schema.Node, error)
}

ReaderWithContext is a Reader that supports context for cancellation.

type ReaderWithMetadata ¶

// ReaderWithMetadata is a Reader that provides metadata about itself.
type ReaderWithMetadata interface {
	Reader
	// Metadata returns information about this reader.
	// See ReaderMetadata for the fields reported.
	Metadata() ReaderMetadata
}

ReaderWithMetadata is a Reader that provides metadata about itself.

type SimpleDirectoryReader ¶

// SimpleDirectoryReader reads files from a directory.
// It exposes no exported fields; create it with NewSimpleDirectoryReader,
// which takes the input directory and an optional list of extensions.
type SimpleDirectoryReader struct {
	// contains filtered or unexported fields
}

SimpleDirectoryReader reads files from a directory.

func NewSimpleDirectoryReader ¶

func NewSimpleDirectoryReader(inputDir string, extensions ...string) *SimpleDirectoryReader

NewSimpleDirectoryReader creates a new SimpleDirectoryReader.

func (*SimpleDirectoryReader) LoadData ¶

func (r *SimpleDirectoryReader) LoadData() ([]schema.Node, error)

LoadData reads files and returns a slice of Documents (Nodes with type Document).

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL