reader

package
v0.0.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 23, 2026 | License: MIT | Imports: 17 | Imported by: 0

Documentation

Overview

Package reader provides document loading functionality for go-llamaindex.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ExtractTextFromPDF

func ExtractTextFromPDF(filePath string) (string, error)

ExtractTextFromPDF is a utility function to extract text from a PDF file.

func ExtractTextFromPDFByPage

func ExtractTextFromPDFByPage(filePath string) ([]string, error)

ExtractTextFromPDFByPage extracts text from a PDF file, returning text per page.

func GetPDFMetadata

func GetPDFMetadata(filePath string) (map[string]string, error)

GetPDFMetadata extracts metadata from a PDF file.

func GetPDFPageCount

func GetPDFPageCount(filePath string) (int, error)

GetPDFPageCount returns the number of pages in a PDF file.

Types

type CSVReader

type CSVReader struct {
	// InputFiles is a list of CSV file paths to read
	InputFiles []string
	// InputDir is a directory containing CSV files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// Delimiter is the field delimiter (default: comma)
	Delimiter rune
	// HasHeader indicates if the first row is a header row
	HasHeader bool
	// TextColumns are column names or indices to use as document text.
	// If empty, all columns are concatenated as text.
	TextColumns []string
	// MetadataColumns are column names or indices to extract as metadata.
	// If empty, all non-text columns are used as metadata.
	MetadataColumns []string
	// ConcatRows determines if all rows should be concatenated into a single document.
	// If false (default), each row becomes a separate document.
	ConcatRows bool
	// RowSeparator is used when ConcatRows is true (default: newline)
	RowSeparator string
}

CSVReader reads CSV files and converts them to documents.

func NewCSVReader

func NewCSVReader(inputFiles ...string) *CSVReader

NewCSVReader creates a new CSVReader for specific files.

func NewCSVReaderFromDir

func NewCSVReaderFromDir(inputDir string, recursive bool) *CSVReader

NewCSVReaderFromDir creates a new CSVReader for a directory.

func (*CSVReader) LoadData

func (r *CSVReader) LoadData() ([]schema.Node, error)

LoadData loads CSV files and returns documents.

func (*CSVReader) LoadFromFile

func (r *CSVReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single CSV file.

func (*CSVReader) Metadata

func (r *CSVReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*CSVReader) WithConcatRows

func (r *CSVReader) WithConcatRows(concat bool) *CSVReader

WithConcatRows sets whether to concatenate all rows into a single document.

func (*CSVReader) WithDelimiter

func (r *CSVReader) WithDelimiter(delimiter rune) *CSVReader

WithDelimiter sets the field delimiter.

func (*CSVReader) WithHeader

func (r *CSVReader) WithHeader(hasHeader bool) *CSVReader

WithHeader sets whether the first row is a header.

func (*CSVReader) WithMetadataColumns

func (r *CSVReader) WithMetadataColumns(columns ...string) *CSVReader

WithMetadataColumns sets which columns to extract as metadata.

func (*CSVReader) WithRowSeparator

func (r *CSVReader) WithRowSeparator(sep string) *CSVReader

WithRowSeparator sets the separator used when concatenating rows.

func (*CSVReader) WithTextColumns

func (r *CSVReader) WithTextColumns(columns ...string) *CSVReader

WithTextColumns sets which columns to use as document text.

type CSVStreamReader

type CSVStreamReader struct {
	*CSVReader
	// contains filtered or unexported fields
}

CSVStreamReader provides streaming CSV reading for large files.

func NewCSVStreamReader

func NewCSVStreamReader(filePath string) (*CSVStreamReader, error)

NewCSVStreamReader creates a streaming CSV reader.

func (*CSVStreamReader) Close

func (r *CSVStreamReader) Close() error

Close closes the underlying file.

func (*CSVStreamReader) LazyLoadData

func (r *CSVStreamReader) LazyLoadData() (<-chan schema.Node, <-chan error)

LazyLoadData returns a channel that yields documents one at a time.

func (*CSVStreamReader) ReadHeaders

func (r *CSVStreamReader) ReadHeaders() ([]string, error)

ReadHeaders reads and returns the header row.

func (*CSVStreamReader) ReadNext

func (r *CSVStreamReader) ReadNext() (*schema.Node, error)

ReadNext reads the next row and returns it as a document.

type DocxReader

type DocxReader struct {
	// InputFiles is a list of DOCX file paths to read
	InputFiles []string
	// InputDir is a directory containing DOCX files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// ExtractImages determines if images should be extracted as separate nodes
	ExtractImages bool
	// PreserveParagraphs keeps paragraph breaks in the output
	PreserveParagraphs bool
	// ExtractMetadata extracts document properties (author, title, etc.)
	ExtractMetadata bool
	// ExtractTables extracts table content
	ExtractTables bool
}

DocxReader reads Microsoft Word (.docx) files and converts them to documents.

func NewDocxReader

func NewDocxReader(inputFiles ...string) *DocxReader

NewDocxReader creates a new DocxReader for specific files.

func NewDocxReaderFromDir

func NewDocxReaderFromDir(inputDir string, recursive bool) *DocxReader

NewDocxReaderFromDir creates a new DocxReader for a directory.

func (*DocxReader) LoadData

func (r *DocxReader) LoadData() ([]schema.Node, error)

LoadData loads DOCX files and returns documents.

func (*DocxReader) LoadFromBytes

func (r *DocxReader) LoadFromBytes(content []byte, sourceName string) ([]schema.Node, error)

LoadFromBytes loads a DOCX from byte content.

func (*DocxReader) LoadFromFile

func (r *DocxReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single DOCX file.

func (*DocxReader) Metadata

func (r *DocxReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*DocxReader) WithExtractImages

func (r *DocxReader) WithExtractImages(extract bool) *DocxReader

WithExtractImages sets whether images are extracted as separate nodes.

func (*DocxReader) WithExtractMetadata

func (r *DocxReader) WithExtractMetadata(extract bool) *DocxReader

WithExtractMetadata sets whether to extract document properties.

func (*DocxReader) WithExtractTables

func (r *DocxReader) WithExtractTables(extract bool) *DocxReader

WithExtractTables sets whether to extract table content.

func (*DocxReader) WithPreserveParagraphs

func (r *DocxReader) WithPreserveParagraphs(preserve bool) *DocxReader

WithPreserveParagraphs sets whether to preserve paragraph breaks.

type ExcelReader

type ExcelReader struct {
	// InputFiles is a list of Excel file paths to read
	InputFiles []string
	// InputDir is a directory containing Excel files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// SheetNames specifies which sheets to read. If empty, all sheets are read.
	SheetNames []string
	// HasHeader indicates if the first row is a header row
	HasHeader bool
	// TextColumns are column names or indices to use as document text.
	// If empty, all columns are concatenated as text.
	TextColumns []string
	// MetadataColumns are column names or indices to extract as metadata.
	// If empty, all non-text columns are used as metadata.
	MetadataColumns []string
	// ConcatRows determines if all rows should be concatenated into a single document.
	// If false (default), each row becomes a separate document.
	ConcatRows bool
	// ConcatSheets determines if all sheets should be concatenated into a single document.
	// If false (default), each sheet is processed separately.
	ConcatSheets bool
	// RowSeparator is used when ConcatRows is true (default: newline)
	RowSeparator string
	// SheetSeparator is used when ConcatSheets is true (default: double newline)
	SheetSeparator string
}

ExcelReader reads Excel files (.xlsx, .xlsm) and converts them to documents.

func NewExcelReader

func NewExcelReader(inputFiles ...string) *ExcelReader

NewExcelReader creates a new ExcelReader for specific files.

func NewExcelReaderFromDir

func NewExcelReaderFromDir(inputDir string, recursive bool) *ExcelReader

NewExcelReaderFromDir creates a new ExcelReader for a directory.

func (*ExcelReader) LoadData

func (r *ExcelReader) LoadData() ([]schema.Node, error)

LoadData loads Excel files and returns documents.

func (*ExcelReader) LoadFromFile

func (r *ExcelReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single Excel file.

func (*ExcelReader) Metadata

func (r *ExcelReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*ExcelReader) WithConcatRows

func (r *ExcelReader) WithConcatRows(concat bool) *ExcelReader

WithConcatRows sets whether to concatenate all rows into a single document.

func (*ExcelReader) WithConcatSheets

func (r *ExcelReader) WithConcatSheets(concat bool) *ExcelReader

WithConcatSheets sets whether to concatenate all sheets into a single document.

func (*ExcelReader) WithHeader

func (r *ExcelReader) WithHeader(hasHeader bool) *ExcelReader

WithHeader sets whether the first row is a header.

func (*ExcelReader) WithMetadataColumns

func (r *ExcelReader) WithMetadataColumns(columns ...string) *ExcelReader

WithMetadataColumns sets which columns to extract as metadata.

func (*ExcelReader) WithRowSeparator

func (r *ExcelReader) WithRowSeparator(sep string) *ExcelReader

WithRowSeparator sets the separator used when concatenating rows.

func (*ExcelReader) WithSheetSeparator

func (r *ExcelReader) WithSheetSeparator(sep string) *ExcelReader

WithSheetSeparator sets the separator used when concatenating sheets.

func (*ExcelReader) WithSheets

func (r *ExcelReader) WithSheets(sheets ...string) *ExcelReader

WithSheets sets which sheets to read.

func (*ExcelReader) WithTextColumns

func (r *ExcelReader) WithTextColumns(columns ...string) *ExcelReader

WithTextColumns sets which columns to use as document text.

type FileReader

type FileReader interface {
	Reader
	// LoadFromFile loads a document from a specific file path.
	LoadFromFile(filePath string) ([]schema.Node, error)
}

FileReader is a Reader that loads from file paths.

type HTMLReader

type HTMLReader struct {
	// InputFiles is a list of HTML file paths to read
	InputFiles []string
	// InputDir is a directory containing HTML files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// TagsToExtract specifies which HTML tags to extract text from.
	// If empty, extracts from body. Common values: "p", "div", "article", "main"
	TagsToExtract []string
	// TagsToRemove specifies which HTML tags to remove entirely (e.g., "script", "style")
	TagsToRemove []string
	// PreserveWhitespace keeps original whitespace formatting
	PreserveWhitespace bool
}

HTMLReader reads HTML files and extracts text content.

func NewHTMLReader

func NewHTMLReader(inputFiles ...string) *HTMLReader

NewHTMLReader creates a new HTMLReader for specific files.

func NewHTMLReaderFromDir

func NewHTMLReaderFromDir(inputDir string, recursive bool) *HTMLReader

NewHTMLReaderFromDir creates a new HTMLReader for a directory.

func (*HTMLReader) LoadData

func (r *HTMLReader) LoadData() ([]schema.Node, error)

LoadData loads HTML files and returns documents.

func (*HTMLReader) LoadFromFile

func (r *HTMLReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single HTML file.

func (*HTMLReader) Metadata

func (r *HTMLReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*HTMLReader) WithPreserveWhitespace

func (r *HTMLReader) WithPreserveWhitespace(preserve bool) *HTMLReader

WithPreserveWhitespace sets whether the original whitespace formatting is preserved.

func (*HTMLReader) WithTagsToExtract

func (r *HTMLReader) WithTagsToExtract(tags ...string) *HTMLReader

WithTagsToExtract sets which tags to extract text from.

func (*HTMLReader) WithTagsToRemove

func (r *HTMLReader) WithTagsToRemove(tags ...string) *HTMLReader

WithTagsToRemove sets which tags to remove entirely.

type JSONReader

type JSONReader struct {
	// InputFiles is a list of JSON file paths to read
	InputFiles []string
	// InputDir is a directory containing JSON files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// TextContentKey is the JSON key to use as document text content.
	// If empty, the entire JSON is serialized as text.
	TextContentKey string
	// MetadataKeys are JSON keys to extract as document metadata.
	// If empty, all non-text keys are used as metadata.
	MetadataKeys []string
	// IsJSONL indicates if files are JSON Lines format (one JSON object per line)
	IsJSONL bool
}

JSONReader reads JSON files and converts them to documents.

func NewJSONReader

func NewJSONReader(inputFiles ...string) *JSONReader

NewJSONReader creates a new JSONReader for specific files.

func NewJSONReaderFromDir

func NewJSONReaderFromDir(inputDir string, recursive bool) *JSONReader

NewJSONReaderFromDir creates a new JSONReader for a directory.

func (*JSONReader) LoadData

func (r *JSONReader) LoadData() ([]schema.Node, error)

LoadData loads JSON files and returns documents.

func (*JSONReader) LoadFromFile

func (r *JSONReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single JSON file.

func (*JSONReader) Metadata

func (r *JSONReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*JSONReader) WithJSONL

func (r *JSONReader) WithJSONL(isJSONL bool) *JSONReader

WithJSONL sets whether files are parsed as JSON Lines (one JSON object per line).

func (*JSONReader) WithMetadataKeys

func (r *JSONReader) WithMetadataKeys(keys ...string) *JSONReader

WithMetadataKeys sets the keys to extract as metadata.

func (*JSONReader) WithTextContentKey

func (r *JSONReader) WithTextContentKey(key string) *JSONReader

WithTextContentKey sets the key to use for document text content.

type LazyReader

type LazyReader interface {
	Reader
	// LazyLoadData returns a channel that yields documents one at a time.
	// The channel is closed when all documents have been loaded or an error occurs.
	LazyLoadData() (<-chan schema.Node, <-chan error)
}

LazyReader is a Reader that can load documents lazily via a channel.

type MarkdownReader

type MarkdownReader struct {
	// InputFiles is a list of Markdown file paths to read
	InputFiles []string
	// InputDir is a directory containing Markdown files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// RemoveHyperlinks removes hyperlinks from the text
	RemoveHyperlinks bool
	// RemoveImages removes image references from the text
	RemoveImages bool
	// SplitByHeaders splits document into multiple nodes by headers
	SplitByHeaders bool
	// HeadersToSplitOn specifies which header levels to split on (e.g., []int{1, 2})
	HeadersToSplitOn []int
}

MarkdownReader reads Markdown files and converts them to documents.

func NewMarkdownReader

func NewMarkdownReader(inputFiles ...string) *MarkdownReader

NewMarkdownReader creates a new MarkdownReader for specific files.

func NewMarkdownReaderFromDir

func NewMarkdownReaderFromDir(inputDir string, recursive bool) *MarkdownReader

NewMarkdownReaderFromDir creates a new MarkdownReader for a directory.

func (*MarkdownReader) LoadData

func (r *MarkdownReader) LoadData() ([]schema.Node, error)

LoadData loads Markdown files and returns documents.

func (*MarkdownReader) LoadFromFile

func (r *MarkdownReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single Markdown file.

func (*MarkdownReader) Metadata

func (r *MarkdownReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*MarkdownReader) WithRemoveHyperlinks

func (r *MarkdownReader) WithRemoveHyperlinks(remove bool) *MarkdownReader

WithRemoveHyperlinks sets whether hyperlinks are removed from the text.

func (*MarkdownReader) WithRemoveImages

func (r *MarkdownReader) WithRemoveImages(remove bool) *MarkdownReader

WithRemoveImages sets whether image references are removed from the text.

func (*MarkdownReader) WithSplitByHeaders

func (r *MarkdownReader) WithSplitByHeaders(split bool, levels ...int) *MarkdownReader

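WithSplitByHeaders sets whether the document is split into multiple nodes by headers; levels specifies which header levels to split on (e.g., 1, 2).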

type PDFReader

type PDFReader struct {
	// InputFiles is a list of PDF file paths to read
	InputFiles []string
	// InputDir is a directory containing PDF files
	InputDir string
	// Recursive determines if subdirectories should be searched
	Recursive bool
	// SplitByPage creates separate nodes for each page
	SplitByPage bool
	// ExtraMetadata is additional metadata to add to all documents
	ExtraMetadata map[string]interface{}
	// PasswordFunc is a function that returns the password for a PDF file
	// The function receives the file path and should return the password
	PasswordFunc func(filePath string) string
}

PDFReader reads PDF files and converts them to documents. It uses the ledongthuc/pdf library for text extraction.

func NewPDFReader

func NewPDFReader(inputFiles ...string) *PDFReader

NewPDFReader creates a new PDFReader for specific files.

func NewPDFReaderFromDir

func NewPDFReaderFromDir(inputDir string, recursive bool) *PDFReader

NewPDFReaderFromDir creates a new PDFReader for a directory.

func NewPDFReaderWithOptions

func NewPDFReaderWithOptions(opts ...PDFReaderOption) *PDFReader

NewPDFReaderWithOptions creates a new PDFReader with options.

func (*PDFReader) LazyLoadData

func (r *PDFReader) LazyLoadData() (<-chan schema.Node, <-chan error)

LazyLoadData returns a channel that yields documents one at a time.

func (*PDFReader) LoadData

func (r *PDFReader) LoadData() ([]schema.Node, error)

LoadData loads PDF files and returns documents.

func (*PDFReader) LoadDataWithContext

func (r *PDFReader) LoadDataWithContext(ctx context.Context) ([]schema.Node, error)

LoadDataWithContext loads PDF files with context support.

func (*PDFReader) LoadFromFile

func (r *PDFReader) LoadFromFile(filePath string) ([]schema.Node, error)

LoadFromFile loads a single PDF file.

func (*PDFReader) Metadata

func (r *PDFReader) Metadata() ReaderMetadata

Metadata returns reader metadata.

func (*PDFReader) WithExtraMetadata

func (r *PDFReader) WithExtraMetadata(metadata map[string]interface{}) *PDFReader

WithExtraMetadata sets extra metadata (fluent API).

func (*PDFReader) WithSplitByPage

func (r *PDFReader) WithSplitByPage(split bool) *PDFReader

WithSplitByPage sets whether a separate node is created for each page (fluent API).

type PDFReaderOption

type PDFReaderOption func(*PDFReader)

PDFReaderOption configures PDFReader.

func WithPDFExtraMetadata

func WithPDFExtraMetadata(metadata map[string]interface{}) PDFReaderOption

WithPDFExtraMetadata sets extra metadata.

func WithPDFInputDir

func WithPDFInputDir(dir string) PDFReaderOption

WithPDFInputDir sets the input directory.

func WithPDFInputFiles

func WithPDFInputFiles(files ...string) PDFReaderOption

WithPDFInputFiles sets the input files.

func WithPDFPasswordFunc

func WithPDFPasswordFunc(fn func(filePath string) string) PDFReaderOption

WithPDFPasswordFunc sets the password function.

func WithPDFRecursive

func WithPDFRecursive(recursive bool) PDFReaderOption

WithPDFRecursive sets whether subdirectories are searched.

func WithPDFSplitByPage

func WithPDFSplitByPage(split bool) PDFReaderOption

WithPDFSplitByPage sets whether a separate node is created for each page.

type Reader

type Reader interface {
	// LoadData loads documents and returns them as a slice.
	LoadData() ([]schema.Node, error)
}

Reader is the interface for document loaders. Implementations should load documents from various sources (files, URLs, etc.).

type ReaderError

type ReaderError struct {
	Source  string // File path or URL that caused the error
	Message string
	Err     error
}

ReaderError represents an error during document loading.

func NewReaderError

func NewReaderError(source, message string, err error) *ReaderError

NewReaderError creates a new ReaderError.

func (*ReaderError) Error

func (e *ReaderError) Error() string

Error implements the error interface.

func (*ReaderError) Unwrap

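func (e *ReaderError) Unwrap() error

Unwrap returns the underlying error, so that a ReaderError can be inspected with errors.Is and errors.As.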

type ReaderMetadata

type ReaderMetadata struct {
	// Name is the reader name (e.g., "JSONReader", "PDFReader")
	Name string
	// SupportedExtensions lists file extensions this reader supports
	SupportedExtensions []string
	// Description describes what this reader does
	Description string
}

ReaderMetadata contains metadata about a reader.

type ReaderOptions

type ReaderOptions struct {
	// Recursive determines if directory readers should recurse into subdirectories
	Recursive bool
	// FileExtensions filters which file extensions to process
	FileExtensions []string
	// ExcludePatterns are glob patterns for files/dirs to exclude
	ExcludePatterns []string
	// IncludeHidden determines if hidden files should be included
	IncludeHidden bool
	// NumWorkers is the number of concurrent workers for parallel loading
	NumWorkers int
	// ExtraMetadata is additional metadata to add to all loaded documents
	ExtraMetadata map[string]interface{}
}

ReaderOptions contains common options for readers.

func DefaultReaderOptions

func DefaultReaderOptions() ReaderOptions

DefaultReaderOptions returns default reader options.

type ReaderWithContext

type ReaderWithContext interface {
	Reader
	// LoadDataWithContext loads documents with context support.
	LoadDataWithContext(ctx context.Context) ([]schema.Node, error)
}

ReaderWithContext is a Reader that supports context for cancellation.

type ReaderWithMetadata

type ReaderWithMetadata interface {
	Reader
	// Metadata returns information about this reader.
	Metadata() ReaderMetadata
}

ReaderWithMetadata is a Reader that provides metadata about itself.

type SimpleDirectoryReader

type SimpleDirectoryReader struct {
	// contains filtered or unexported fields
}

SimpleDirectoryReader reads files from a directory.

func NewSimpleDirectoryReader

func NewSimpleDirectoryReader(inputDir string, extensions ...string) *SimpleDirectoryReader

NewSimpleDirectoryReader creates a new SimpleDirectoryReader.

func (*SimpleDirectoryReader) LoadData

func (r *SimpleDirectoryReader) LoadData() ([]schema.Node, error)

LoadData reads files and returns a slice of Documents (Nodes with type Document).

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL