Documentation ¶
Index ¶
- Constants
- type AdaptiveTokenizer
- type AnalysisResult
- type CandidateBlock
- type CandidateTable
- type Cell
- type ColumnAlignment
- type ConfidenceScorer
- type DefaultMergeStrategy
- type DetectionConfig
- type DetectionStrategy
- type Detector
- type DetectorOption
- func WithAlignmentThresholdOption(threshold float64) DetectorOption
- func WithConfidenceThresholdOption(threshold float64) DetectorOption
- func WithMaxColumnVarianceOption(variance int) DetectorOption
- func WithMinColumnsOption(minColumns int) DetectorOption
- func WithMinLinesOption(minLines int) DetectorOption
- func WithTokenizationModeOption(mode TokenizationMode) DetectorOption
- type DualRoundDetector
- type DualRoundStrategy
- type GridDetector
- type GridOption
- type GridSegment
- type GridWord
- type LayoutAnalyzer
- type LayoutVector
- type LineData
- type LineLayout
- type MergeStrategy
- type NewTokenizationStrategy
- type QualityMetrics
- type SegmentMetadata
- type SingleRoundStrategy
- type Table
- func (t Table) GetCell(row, col int) (*Cell, error)
- func (t Table) GetColumn(col int) ([]Cell, error)
- func (t Table) GetColumnPositions() []int
- func (t Table) GetColumnTexts(col int) ([]string, error)
- func (t Table) GetHeaderRow() ([]Cell, error)
- func (t Table) GetRow(row int) ([]Cell, error)
- func (t Table) GetRowTexts(row int) ([]string, error)
- func (t Table) IsValid() bool
- func (t Table) LineCount() int
- func (t Table) String() string
- type TableAnalyzer
- type TableMetadata
- type Token
- type TokenizationMode
- type TokenizationStrategy
- type WordExtractor
Constants ¶
const (
	// DefaultMinLines is the minimum number of lines required to form a table
	DefaultMinLines = 2
	// DefaultMinColumns is the minimum number of columns required to form a table
	DefaultMinColumns = 2
	// DefaultAlignmentThreshold is the threshold for column alignment consistency (0.0-1.0)
	DefaultAlignmentThreshold = 0.7
	// DefaultConfidenceThreshold is the minimum confidence to consider as table (0.0-1.0)
	DefaultConfidenceThreshold = 0.6
	// DefaultMaxColumnVariance is the maximum allowed variance in column positions
	DefaultMaxColumnVariance = 2
)
Core Detection Parameters
const (
	// FirstRoundConfidenceThreshold is the confidence threshold for multi-space tokenization
	// (more lenient to capture compound tokens like "File Name")
	FirstRoundConfidenceThreshold = 0.4
	// FirstRoundMaxColumnVariance is the variance tolerance for multi-space tokenization
	// (more tolerant to handle uneven spacing in compound tokens)
	FirstRoundMaxColumnVariance = 3
	// SecondRoundConfidenceThreshold is the confidence threshold for single-space tokenization
	// (stricter to ensure quality of granular detection)
	SecondRoundConfidenceThreshold = 0.6
	// SecondRoundMaxColumnVariance is the variance tolerance for single-space tokenization
	// (stricter alignment requirements for fine-grained tokens)
	SecondRoundMaxColumnVariance = 2
)
Dual-Round Detection Configuration
const (
	// MinTokenWidth is the minimum width required for token analysis.
	// Used in projection analysis and alignment detection.
	MinTokenWidth = 2
	// MinBoundariesForAnalysis is the minimum number of boundaries needed for alignment analysis.
	// Used in tokenizer projection and boundary detection.
	MinBoundariesForAnalysis = 3
	// MinSpacesForSingleSpaceMode defines how many consecutive spaces trigger separation in SingleSpaceMode
	MinSpacesForSingleSpaceMode = 1
	// MinSpacesForMultiSpaceMode defines how many consecutive spaces trigger separation in MultiSpaceMode
	MinSpacesForMultiSpaceMode = 2
	// CompoundTokenMinWidth is the minimum width for compound tokens in MultiSpaceMode
	CompoundTokenMinWidth = 3
	// MaxBoundaryRatio is the maximum ratio of boundary counts for compatibility analysis
	MaxBoundaryRatio = 1.5
)
Tokenization Configuration
const (
// MinWordLength is the minimum length for extracted words
MinWordLength = 3
)
Word Extraction Configuration
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AdaptiveTokenizer ¶
type AdaptiveTokenizer struct {
// contains filtered or unexported fields
}
AdaptiveTokenizer implements smart tokenization with multiple strategies
func NewAdaptiveTokenizer ¶
func NewAdaptiveTokenizer(config DetectionConfig) *AdaptiveTokenizer
NewAdaptiveTokenizer creates a new adaptive tokenizer with the given configuration
type AnalysisResult ¶
type AnalysisResult struct {
	Confidence        float64           `json:"confidence"`
	Columns           []int             `json:"columns"`
	QualityMetrics    *QualityMetrics   `json:"quality_metrics"`
	AlignmentData     []ColumnAlignment `json:"alignment_data"`
	TokenDistribution map[int]int       `json:"token_distribution"`
}
AnalysisResult contains comprehensive analysis information for a table candidate
type CandidateBlock ¶
CandidateBlock represents a potential grid block with similar layout
type CandidateTable ¶
type CandidateTable struct {
	StartLine  int          `json:"start_line"` // Starting line index
	EndLine    int          `json:"end_line"`   // Ending line index
	Lines      []string     `json:"lines"`      // Text lines that form this table
	Layouts    []LineLayout `json:"layouts"`    // Layout information for each line
	Confidence float64      `json:"confidence"` // Initial confidence score
}
CandidateTable represents a potential table during detection
type Cell ¶
type Cell struct {
	Text      string `json:"text"`       // The text content of the cell
	Row       int    `json:"row"`        // Row index within the table (0-based)
	Column    int    `json:"column"`     // Column index within the table (0-based)
	LineIndex int    `json:"line_index"` // Original line index in the input
	StartPos  int    `json:"start_pos"`  // Start position of the cell in the line
	EndPos    int    `json:"end_pos"`    // End position of the cell in the line
}
Cell represents a detected table cell with its content and position information
type ColumnAlignment ¶
type ColumnAlignment struct {
	Position    int     // Column start position
	Width       int     // Average column width
	Alignment   string  // "left", "right", "center"
	Consistency float64 // How consistent this column's alignment is (0.0-1.0)
}
ColumnAlignment contains alignment information for a single column
type ConfidenceScorer ¶
type ConfidenceScorer interface {
	// CalculateConfidence computes a confidence score for a detected table
	CalculateConfidence(table Table, originalLines []string) (float64, error)
	// CalculateQualityMetrics computes detailed quality metrics
	CalculateQualityMetrics(table Table, originalLines []string) (*QualityMetrics, error)
}
ConfidenceScorer defines the interface for calculating detection confidence
type DefaultMergeStrategy ¶
type DefaultMergeStrategy struct{}
DefaultMergeStrategy implements a balanced approach to merging detection results
func (*DefaultMergeStrategy) MergeResults ¶
func (dms *DefaultMergeStrategy) MergeResults(firstRound, secondRound []GridSegment, originalLines []string) []GridSegment
MergeResults implements the default strategy for combining detection results
type DetectionConfig ¶
type DetectionConfig struct {
	MinLines            int              `json:"min_lines"`            // Minimum lines required to form a grid
	MinColumns          int              `json:"min_columns"`          // Minimum columns required to form a grid
	AlignmentThreshold  float64          `json:"alignment_threshold"`  // Threshold for column alignment consistency
	ConfidenceThreshold float64          `json:"confidence_threshold"` // Minimum confidence to consider as grid
	MaxColumnVariance   int              `json:"max_column_variance"`  // Maximum allowed variance in column positions
	TokenizationMode    TokenizationMode `json:"tokenization_mode"`    // Tokenization strategy to use
}
DetectionConfig holds configuration parameters for grid detection
func DefaultConfig ¶
func DefaultConfig() DetectionConfig
DefaultConfig returns a configuration with default values
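A minimal sketch of adjusting the defaults before passing the config to a constructor that accepts one; the values chosen here are illustrative, not recommendations:

cfg := DefaultConfig()
cfg.ConfidenceThreshold = 0.8 // stricter than DefaultConfidenceThreshold (0.6)
cfg.TokenizationMode = MultiSpaceMode
analyzer := NewTableAnalyzer(cfg)
_ = analyzer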
type DetectionStrategy ¶
type DetectionStrategy interface {
	// DetectTables analyzes text lines and returns detected tables
	DetectTables(lines []string) ([]Table, error)
	// GetName returns the name of this detection strategy
	GetName() string
	// GetConfiguration returns the current configuration
	GetConfiguration() DetectionConfig
}
DetectionStrategy defines the interface for different grid detection strategies
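Both strategy types documented below satisfy this interface, so callers can swap them freely. A sketch, assuming fmt is imported and using invented sample lines:

lines := []string{
	"Name    Size  Modified",
	"a.txt   1024  2024-01-02",
	"b.txt   2048  2024-01-03",
}
strategies := []DetectionStrategy{
	NewSingleRoundStrategy(DefaultConfig(), MultiSpaceMode),
	NewDualRoundStrategy(DefaultConfig()),
}
for _, s := range strategies {
	tables, err := s.DetectTables(lines)
	if err != nil {
		continue
	}
	fmt.Printf("%s found %d table(s)\n", s.GetName(), len(tables))
}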
type Detector ¶
type Detector struct {
// contains filtered or unexported fields
}
Detector provides the main interface for table detection with an improved API
func NewDetector ¶
func NewDetector(opts ...DetectorOption) *Detector
NewDetector creates a new detector with the specified configuration
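A construction sketch using the functional options documented below; the Detector's detection entry points are not part of this listing, so only configuration is shown:

d := NewDetector(
	WithMinLinesOption(3),
	WithMinColumnsOption(2),
	WithConfidenceThresholdOption(0.75),
)
_ = d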
type DetectorOption ¶
type DetectorOption func(*DetectionConfig)
DetectorOption defines options for configuring the detector
func WithAlignmentThresholdOption ¶
func WithAlignmentThresholdOption(threshold float64) DetectorOption
WithAlignmentThresholdOption sets the alignment threshold
func WithConfidenceThresholdOption ¶
func WithConfidenceThresholdOption(threshold float64) DetectorOption
WithConfidenceThresholdOption sets the confidence threshold
func WithMaxColumnVarianceOption ¶
func WithMaxColumnVarianceOption(variance int) DetectorOption
WithMaxColumnVarianceOption sets the maximum column variance
func WithMinColumnsOption ¶
func WithMinColumnsOption(minColumns int) DetectorOption
WithMinColumnsOption sets the minimum columns required for detection
func WithMinLinesOption ¶
func WithMinLinesOption(minLines int) DetectorOption
WithMinLinesOption sets the minimum lines required for detection
func WithTokenizationModeOption ¶
func WithTokenizationModeOption(mode TokenizationMode) DetectorOption
WithTokenizationModeOption sets the tokenization mode
type DualRoundDetector ¶
type DualRoundDetector struct {
// contains filtered or unexported fields
}
DualRoundDetector performs two-round grid detection with different tokenization strategies
func NewDualRoundDetector ¶
func NewDualRoundDetector(opts ...GridOption) *DualRoundDetector
NewDualRoundDetector creates a new dual-round detector with optimized settings for each round
func (*DualRoundDetector) DetectGrids ¶
func (drd *DualRoundDetector) DetectGrids(lines []string) []GridSegment
DetectGrids performs dual-round grid detection and returns the optimal results
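A sketch of the dual-round flow end to end, with invented input and fmt assumed to be imported:

drd := NewDualRoundDetector(WithMinLines(2), WithMinColumns(2))
segments := drd.DetectGrids([]string{
	"File Name    Size   Type",
	"report.txt   10KB   text",
	"image.png    2MB    binary",
})
for _, seg := range segments {
	src := ""
	if seg.Metadata != nil {
		src = seg.Metadata.DetectionSource // "first_round", "second_round", or "merged"
	}
	fmt.Printf("lines %d-%d conf=%.2f source=%s\n", seg.StartLine, seg.EndLine, seg.Confidence, src)
}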
type DualRoundStrategy ¶
type DualRoundStrategy struct {
// contains filtered or unexported fields
}
DualRoundStrategy implements the dual-round detection approach
func NewDualRoundStrategy ¶
func NewDualRoundStrategy(baseConfig DetectionConfig) *DualRoundStrategy
NewDualRoundStrategy creates a new dual-round detection strategy
func (*DualRoundStrategy) DetectTables ¶
func (drs *DualRoundStrategy) DetectTables(lines []string) ([]Table, error)
DetectTables implements DetectionStrategy interface
func (*DualRoundStrategy) GetConfiguration ¶
func (drs *DualRoundStrategy) GetConfiguration() DetectionConfig
GetConfiguration returns the strategy configuration
func (*DualRoundStrategy) GetName ¶
func (drs *DualRoundStrategy) GetName() string
GetName returns the strategy name
type GridDetector ¶
type GridDetector struct {
// contains filtered or unexported fields
}
GridDetector detects grid-like segments in text
func NewGridDetector ¶
func NewGridDetector(opts ...GridOption) *GridDetector
NewGridDetector creates a new grid detector with default parameters
func (*GridDetector) DetectGrids ¶
func (gd *GridDetector) DetectGrids(lines []string) []GridSegment
DetectGrids analyzes text lines and returns segments that appear to have grid-like alignment
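Single-round usage, for comparison with the dual-round detector; a sketch with invented input, fmt assumed imported:

lines := []string{
	"ID   Name   Qty",
	"1    bolt   40",
	"2    nut    75",
}
gd := NewGridDetector(
	WithMinLines(2),
	WithConfidenceThreshold(0.7),
	WithTokenizationMode(SingleSpaceMode),
)
for _, seg := range gd.DetectGrids(lines) {
	fmt.Println(seg.StartLine, seg.EndLine, seg.Columns)
}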
type GridOption ¶
type GridOption func(*GridDetector)
func WithAlignmentThreshold ¶
func WithAlignmentThreshold(threshold float64) GridOption
func WithConfidenceThreshold ¶
func WithConfidenceThreshold(threshold float64) GridOption
func WithMaxColumnVariance ¶
func WithMaxColumnVariance(v int) GridOption
func WithMinColumns ¶
func WithMinColumns(n int) GridOption
func WithMinLines ¶
func WithMinLines(n int) GridOption
func WithTokenizationMode ¶
func WithTokenizationMode(mode TokenizationMode) GridOption
type GridSegment ¶
type GridSegment struct {
	Lines      []string         // The lines that form this grid segment
	StartLine  int              // Starting line number in the original text
	EndLine    int              // Ending line number in the original text
	Columns    []int            // Column positions where alignment occurs
	Confidence float64          // Confidence score of this being a grid (0.0 to 1.0)
	Mode       TokenizationMode // Which tokenization mode was used
	Metadata   *SegmentMetadata // Additional information about this segment
}
GridSegment represents a segment of text that has grid-like alignment
func ConvertTableToGridSegment ¶
func ConvertTableToGridSegment(table Table) GridSegment
ConvertTableToGridSegment converts a Table back to the legacy GridSegment format
func DetectGridsLegacy ¶
func DetectGridsLegacy(lines []string, opts ...GridOption) []GridSegment
DetectGridsLegacy provides backward compatibility with the original DetectGrids function
type GridWord ¶
GridWord represents a word extracted from a grid segment for backward compatibility
func ExtractValidWords ¶
func ExtractValidWords(segment GridSegment) []GridWord
ExtractValidWords extracts valid words from a GridSegment (backward compatibility)
type LayoutAnalyzer ¶
type LayoutAnalyzer interface {
	// AnalyzeLayout determines the layout structure of a set of lines
	AnalyzeLayout(lines []string) ([]LineLayout, error)
	// CompareSimilarity checks if two layouts are similar enough to be part of the same table
	CompareSimilarity(layout1, layout2 LineLayout) bool
}
LayoutAnalyzer defines the interface for analyzing line layouts
type LayoutVector ¶
type LayoutVector []int
LayoutVector represents the column layout of a line (column start positions)
type LineData ¶
type LineData struct {
// contains filtered or unexported fields
}
LineData contains analysis results for a single line
type LineLayout ¶
type LineLayout struct {
	Tokens          []Token `json:"tokens"`           // Tokens found in this line
	ColumnPositions []int   `json:"column_positions"` // Column start positions
	LineIndex       int     `json:"line_index"`       // Index of this line in the original text
}
LineLayout represents the layout structure of a single line
type MergeStrategy ¶
type MergeStrategy interface {
MergeResults(firstRound, secondRound []GridSegment, originalLines []string) []GridSegment
}
MergeStrategy defines how to combine results from two detection rounds
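Any type with this method can stand in for DefaultMergeStrategy. A hypothetical strategy that simply prefers the second round whenever it found anything (how a custom strategy is wired into the detector is not shown in this listing):

type preferSecondRound struct{}

func (preferSecondRound) MergeResults(firstRound, secondRound []GridSegment, originalLines []string) []GridSegment {
	if len(secondRound) > 0 {
		return secondRound
	}
	return firstRound
}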
type NewTokenizationStrategy ¶
type NewTokenizationStrategy interface {
	// Tokenize splits a line into tokens with position information
	Tokenize(line string, lineIndex int, context []string) ([]Token, error)
	// GetMode returns the tokenization mode this strategy implements
	GetMode() TokenizationMode
	// ShouldApply determines if this strategy should be applied to the given context
	ShouldApply(line string, lineIndex int, context []string) bool
}
NewTokenizationStrategy defines the interface for different tokenization approaches. (It is named NewTokenizationStrategy rather than TokenizationStrategy to avoid a conflict with the existing interface of that name.)
type QualityMetrics ¶
type QualityMetrics struct {
	AlignmentScore   float64 `json:"alignment_score"`    // How well columns are aligned (0.0-1.0)
	ConsistencyScore float64 `json:"consistency_score"`  // How consistent row structures are (0.0-1.0)
	CompactnessScore float64 `json:"compactness_score"`  // How compact the table structure is (0.0-1.0)
	TokenCountStdDev float64 `json:"token_count_stddev"` // Standard deviation of token counts per row
	AvgColumnSpacing float64 `json:"avg_column_spacing"` // Average spacing between columns
}
QualityMetrics provides detailed quality assessment of the detected table
type SegmentMetadata ¶
type SegmentMetadata struct {
	TokenizationMode TokenizationMode
	OriginalTokens   [][]Token // Tokens for each line
	AlignmentData    []ColumnAlignment
	DetectionSource  string // "first_round", "second_round", "merged"
}
SegmentMetadata contains detailed information about how a segment was detected
type SingleRoundStrategy ¶
type SingleRoundStrategy struct {
// contains filtered or unexported fields
}
SingleRoundStrategy implements single-round detection with specified tokenization mode
func NewSingleRoundStrategy ¶
func NewSingleRoundStrategy(config DetectionConfig, mode TokenizationMode) *SingleRoundStrategy
NewSingleRoundStrategy creates a new single-round detection strategy
func (*SingleRoundStrategy) DetectTables ¶
func (srs *SingleRoundStrategy) DetectTables(lines []string) ([]Table, error)
DetectTables implements DetectionStrategy interface
func (*SingleRoundStrategy) GetConfiguration ¶
func (srs *SingleRoundStrategy) GetConfiguration() DetectionConfig
GetConfiguration returns the strategy configuration
func (*SingleRoundStrategy) GetName ¶
func (srs *SingleRoundStrategy) GetName() string
GetName returns the strategy name
type Table ¶
type Table struct {
	StartLine  int              `json:"start_line"`  // Starting line number in original text
	EndLine    int              `json:"end_line"`    // Ending line number in original text
	NumRows    int              `json:"num_rows"`    // Number of rows in the table
	NumColumns int              `json:"num_columns"` // Number of columns in the table
	Confidence float64          `json:"confidence"`  // Detection confidence score (0.0-1.0)
	Mode       TokenizationMode `json:"mode"`        // Tokenization mode used for detection
	Cells      [][]Cell         `json:"cells"`       // 2D array of cells [row][column]
	Metadata   *TableMetadata   `json:"metadata"`    // Additional metadata about the table
}
Table represents a detected table with enhanced metadata and cell information
func ConvertGridSegmentToTable ¶
func ConvertGridSegmentToTable(segment GridSegment) Table
ConvertGridSegmentToTable converts a legacy GridSegment to the new Table format
func (Table) GetColumnPositions ¶
func (t Table) GetColumnPositions() []int
GetColumnPositions returns the column start positions from metadata, if available
func (Table) GetColumnTexts ¶
func (t Table) GetColumnTexts(col int) ([]string, error)
GetColumnTexts returns the text content of all cells in a column as a slice of strings
func (Table) GetHeaderRow ¶
func (t Table) GetHeaderRow() ([]Cell, error)
GetHeaderRow returns the first row as header cells, if the table has rows
func (Table) GetRowTexts ¶
func (t Table) GetRowTexts(row int) ([]string, error)
GetRowTexts returns the text content of all cells in a row as a slice of strings
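Putting the accessors together: convert a legacy segment and read it defensively, since each getter returns an error for out-of-range indices. A sketch, where seg is assumed to come from one of the DetectGrids calls above and fmt is imported:

table := ConvertGridSegmentToTable(seg)
if table.IsValid() {
	if header, err := table.GetHeaderRow(); err == nil {
		for _, cell := range header {
			fmt.Printf("%q at column %d\n", cell.Text, cell.Column)
		}
	}
	if texts, err := table.GetRowTexts(1); err == nil {
		fmt.Println(texts) // second row as plain strings
	}
}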
type TableAnalyzer ¶
type TableAnalyzer struct {
// contains filtered or unexported fields
}
TableAnalyzer provides enhanced analysis capabilities for table detection
func NewTableAnalyzer ¶
func NewTableAnalyzer(config DetectionConfig) *TableAnalyzer
NewTableAnalyzer creates a new enhanced table analyzer
func (*TableAnalyzer) AnalyzeCandidate ¶
func (ta *TableAnalyzer) AnalyzeCandidate(lines []string, startLine, endLine int) (*AnalysisResult, error)
AnalyzeCandidate performs comprehensive analysis on a candidate table
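A sketch of scoring a candidate region directly; whether endLine is inclusive is not stated in this listing, so the index arithmetic is illustrative:

ta := NewTableAnalyzer(DefaultConfig())
result, err := ta.AnalyzeCandidate(lines, 0, len(lines)-1)
if err == nil && result.Confidence >= DefaultConfidenceThreshold {
	fmt.Println("columns at:", result.Columns)
	if result.QualityMetrics != nil {
		fmt.Println("alignment score:", result.QualityMetrics.AlignmentScore)
	}
}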
type TableMetadata ¶
type TableMetadata struct {
	DetectionStrategy string            `json:"detection_strategy"` // Strategy used ("dual_round", "single_round", etc.)
	TokenizationMode  TokenizationMode  `json:"tokenization_mode"`  // Mode used for tokenization
	ColumnPositions   []int             `json:"column_positions"`   // Character positions where columns start
	AlignmentData     []ColumnAlignment `json:"alignment_data"`     // Alignment information for each column
	QualityMetrics    *QualityMetrics   `json:"quality_metrics"`    // Quality assessment metrics
}
TableMetadata contains detailed information about how a table was detected
type TokenizationMode ¶
type TokenizationMode int
TokenizationMode defines the strategy for splitting text into tokens
const (
	// SingleSpaceMode splits on any whitespace (current behavior)
	SingleSpaceMode TokenizationMode = iota
	// MultiSpaceMode splits only on 2+ consecutive spaces
	MultiSpaceMode
)
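The practical difference is easiest to see with plain regular expressions. This standalone sketch mimics the two splitting rules; it is not the package's actual tokenizer:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	line := "File Name    Size"
	single := regexp.MustCompile(`\s+`).Split(line, -1)   // SingleSpaceMode-like: any whitespace
	multi := regexp.MustCompile(`\s{2,}`).Split(line, -1) // MultiSpaceMode-like: 2+ spaces
	fmt.Printf("%q\n", single) // ["File" "Name" "Size"]
	fmt.Printf("%q\n", multi)  // ["File Name" "Size"]
}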
type TokenizationStrategy ¶
type TokenizationStrategy interface {
// contains filtered or unexported methods
}
type WordExtractor ¶
type WordExtractor struct {
// contains filtered or unexported fields
}
WordExtractor provides enhanced word extraction with quality filtering
func NewWordExtractor ¶
func NewWordExtractor() *WordExtractor
NewWordExtractor creates a new word extractor with its default configuration
func (*WordExtractor) ExtractCells ¶
func (we *WordExtractor) ExtractCells(segment GridSegment) [][]Cell
ExtractCells extracts cells from a GridSegment and returns them in the new Cell format
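A sketch chaining detection and cell extraction; lines is any []string of monospaced text and fmt is assumed imported:

we := NewWordExtractor()
for _, seg := range NewGridDetector().DetectGrids(lines) {
	for _, row := range we.ExtractCells(seg) {
		for _, c := range row {
			fmt.Printf("[%d,%d] %q (chars %d-%d) ", c.Row, c.Column, c.Text, c.StartPos, c.EndPos)
		}
		fmt.Println()
	}
}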