tabledetection

package
v0.0.0-...-0e33c82 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 12, 2025 License: MIT Imports: 6 Imported by: 0

Documentation

Index

Constants

View Source
// Core detection parameters: default values for the basic detection thresholds.
const (
	// DefaultMinLines is the minimum number of lines required to form a table.
	DefaultMinLines = 2

	// DefaultMinColumns is the minimum number of columns required to form a table.
	DefaultMinColumns = 2

	// DefaultAlignmentThreshold is the default threshold for column alignment
	// consistency (0.0-1.0).
	DefaultAlignmentThreshold = 0.7

	// DefaultConfidenceThreshold is the minimum confidence (0.0-1.0) a candidate
	// must reach to be considered a table.
	DefaultConfidenceThreshold = 0.6

	// DefaultMaxColumnVariance is the maximum allowed variance in column positions.
	DefaultMaxColumnVariance = 2
)

Core Detection Parameters

View Source
// Dual-round detection configuration: per-round thresholds. The first round
// uses multi-space tokenization and the second round uses single-space
// tokenization.
const (
	// FirstRoundConfidenceThreshold is the confidence threshold for multi-space
	// tokenization. It is more lenient so that compound tokens such as
	// "File Name" are captured.
	FirstRoundConfidenceThreshold = 0.4

	// FirstRoundMaxColumnVariance is the variance tolerance for multi-space
	// tokenization. It is more tolerant in order to handle uneven spacing in
	// compound tokens.
	FirstRoundMaxColumnVariance = 3

	// SecondRoundConfidenceThreshold is the confidence threshold for single-space
	// tokenization. It is stricter to ensure the quality of granular detection.
	SecondRoundConfidenceThreshold = 0.6

	// SecondRoundMaxColumnVariance is the variance tolerance for single-space
	// tokenization. It imposes stricter alignment requirements on fine-grained
	// tokens.
	SecondRoundMaxColumnVariance = 2
)

Dual-Round Detection Configuration

View Source
// Tokenization configuration.
const (
	// MinTokenWidth is the minimum width required for token analysis.
	// It is used in projection analysis and alignment detection.
	MinTokenWidth = 2

	// MinBoundariesForAnalysis is the minimum number of boundaries needed for
	// alignment analysis. It is used in tokenizer projection and boundary detection.
	MinBoundariesForAnalysis = 3

	// MinSpacesForSingleSpaceMode is the number of consecutive spaces that
	// triggers a token separation in SingleSpaceMode.
	MinSpacesForSingleSpaceMode = 1

	// MinSpacesForMultiSpaceMode is the number of consecutive spaces that
	// triggers a token separation in MultiSpaceMode.
	MinSpacesForMultiSpaceMode = 2

	// CompoundTokenMinWidth is the minimum width for compound tokens in MultiSpaceMode.
	CompoundTokenMinWidth = 3

	// MaxBoundaryRatio is the maximum ratio of boundary counts for compatibility analysis.
	// NOTE(review): which two boundary counts are compared is not visible here — confirm.
	MaxBoundaryRatio = 1.5
)

Tokenization Configuration

View Source
// Word extraction configuration.
const (
	// MinWordLength is the minimum length for extracted words.
	// NOTE(review): whether length is counted in bytes or runes is not visible here — confirm.
	MinWordLength = 3
)

Word Extraction Configuration

Variables

This section is empty.

Functions

This section is empty.

Types

type AdaptiveTokenizer

type AdaptiveTokenizer struct {
	// contains filtered or unexported fields
}

AdaptiveTokenizer implements smart tokenization with multiple strategies

func NewAdaptiveTokenizer

func NewAdaptiveTokenizer(config DetectionConfig) *AdaptiveTokenizer

NewAdaptiveTokenizer creates a new adaptive tokenizer with the given configuration

type AnalysisResult

// AnalysisResult contains comprehensive analysis information for a table candidate.
type AnalysisResult struct {
	// Confidence is the confidence score computed for the candidate.
	// NOTE(review): presumably in the 0.0-1.0 range like the other confidence
	// values in this package — confirm.
	Confidence        float64           `json:"confidence"`
	// Columns lists the candidate's columns.
	// NOTE(review): presumably column start positions — confirm.
	Columns           []int             `json:"columns"`
	// QualityMetrics holds the detailed quality assessment. It is a pointer
	// without omitempty, so a nil value is encoded as JSON null.
	QualityMetrics    *QualityMetrics   `json:"quality_metrics"`
	// AlignmentData holds one ColumnAlignment entry per analyzed column.
	// NOTE(review): one-entry-per-column is assumed from the type's
	// documentation — confirm.
	AlignmentData     []ColumnAlignment `json:"alignment_data"`
	// TokenDistribution is an int-to-int histogram.
	// TODO(review): document what the keys and values represent; it is not
	// visible here.
	TokenDistribution map[int]int       `json:"token_distribution"`
}

AnalysisResult contains comprehensive analysis information for a table candidate

type CandidateBlock

// CandidateBlock represents a potential grid block with similar layout.
type CandidateBlock struct {
	// StartLine and EndLine delimit the block.
	// NOTE(review): presumably indices into the input lines; whether EndLine is
	// inclusive is not visible here — confirm.
	StartLine int
	EndLine   int
	// Lines holds the block's text lines.
	// NOTE(review): presumably the input lines from StartLine to EndLine — confirm.
	Lines     []string
}

CandidateBlock represents a potential grid block with similar layout

type CandidateTable

type CandidateTable struct {
	StartLine  int          `json:"start_line"` // Starting line index
	EndLine    int          `json:"end_line"`   // Ending line index
	Lines      []string     `json:"lines"`      // Text lines that form this table
	Layouts    []LineLayout `json:"layouts"`    // Layout information for each line
	Confidence float64      `json:"confidence"` // Initial confidence score
}

CandidateTable represents a potential table during detection

type Cell

type Cell struct {
	Text      string `json:"text"`       // The text content of the cell
	Row       int    `json:"row"`        // Row index within the table (0-based)
	Column    int    `json:"column"`     // Column index within the table (0-based)
	LineIndex int    `json:"line_index"` // Original line index in the input
	StartPos  int    `json:"start_pos"`  // Start position of the cell in the line
	EndPos    int    `json:"end_pos"`    // End position of the cell in the line
}

Cell represents a detected table cell with its content and position information

func (Cell) IsEmpty

func (c Cell) IsEmpty() bool

IsEmpty returns true if the cell has no text content

func (Cell) Length

func (c Cell) Length() int

Length returns the length of the cell text

func (Cell) String

func (c Cell) String() string

String returns a string representation of the cell

type ColumnAlignment

type ColumnAlignment struct {
	Position    int     // Column start position
	Width       int     // Average column width
	Alignment   string  // "left", "right", "center"
	Consistency float64 // How consistent this column's alignment is (0.0-1.0)
}

ColumnAlignment contains alignment information for a single column

type ConfidenceScorer

type ConfidenceScorer interface {
	// CalculateConfidence computes a confidence score for a detected table
	CalculateConfidence(table Table, originalLines []string) (float64, error)

	// CalculateQualityMetrics computes detailed quality metrics
	CalculateQualityMetrics(table Table, originalLines []string) (*QualityMetrics, error)
}

ConfidenceScorer defines the interface for calculating detection confidence

type DefaultMergeStrategy

type DefaultMergeStrategy struct{}

DefaultMergeStrategy implements a balanced approach to merging detection results

func (*DefaultMergeStrategy) MergeResults

func (dms *DefaultMergeStrategy) MergeResults(firstRound, secondRound []GridSegment, originalLines []string) []GridSegment

MergeResults implements the default strategy for combining detection results

type DetectionConfig

type DetectionConfig struct {
	MinLines            int              `json:"min_lines"`            // Minimum lines required to form a grid
	MinColumns          int              `json:"min_columns"`          // Minimum columns required to form a grid
	AlignmentThreshold  float64          `json:"alignment_threshold"`  // Threshold for column alignment consistency
	ConfidenceThreshold float64          `json:"confidence_threshold"` // Minimum confidence to consider as grid
	MaxColumnVariance   int              `json:"max_column_variance"`  // Maximum allowed variance in column positions
	TokenizationMode    TokenizationMode `json:"tokenization_mode"`    // Tokenization strategy to use
}

DetectionConfig holds configuration parameters for grid detection

func DefaultConfig

func DefaultConfig() DetectionConfig

DefaultConfig returns a configuration with default values

type DetectionStrategy

type DetectionStrategy interface {
	// DetectTables analyzes text lines and returns detected tables
	DetectTables(lines []string) ([]Table, error)

	// GetName returns the name of this detection strategy
	GetName() string

	// GetConfiguration returns the current configuration
	GetConfiguration() DetectionConfig
}

DetectionStrategy defines the interface for different grid detection strategies

type Detector

type Detector struct {
	// contains filtered or unexported fields
}

Detector provides the main interface for table detection with improved API

func NewDetector

func NewDetector(opts ...DetectorOption) *Detector

NewDetector creates a new detector with the specified configuration

func (*Detector) DetectTables

func (d *Detector) DetectTables(lines []string) ([]Table, error)

DetectTables implements the main detection interface

type DetectorOption

type DetectorOption func(*DetectionConfig)

DetectorOption defines options for configuring the detector

func WithAlignmentThresholdOption

func WithAlignmentThresholdOption(threshold float64) DetectorOption

WithAlignmentThresholdOption sets the alignment threshold

func WithConfidenceThresholdOption

func WithConfidenceThresholdOption(threshold float64) DetectorOption

WithConfidenceThresholdOption sets the confidence threshold

func WithMaxColumnVarianceOption

func WithMaxColumnVarianceOption(variance int) DetectorOption

WithMaxColumnVarianceOption sets the maximum column variance

func WithMinColumnsOption

func WithMinColumnsOption(minColumns int) DetectorOption

WithMinColumnsOption sets the minimum columns required for detection

func WithMinLinesOption

func WithMinLinesOption(minLines int) DetectorOption

WithMinLinesOption sets the minimum lines required for detection

func WithTokenizationModeOption

func WithTokenizationModeOption(mode TokenizationMode) DetectorOption

WithTokenizationModeOption sets the tokenization mode

type DualRoundDetector

type DualRoundDetector struct {
	// contains filtered or unexported fields
}

DualRoundDetector performs two-round grid detection with different tokenization strategies

func NewDualRoundDetector

func NewDualRoundDetector(opts ...GridOption) *DualRoundDetector

NewDualRoundDetector creates a new dual-round detector with optimized settings for each round

func (*DualRoundDetector) DetectGrids

func (drd *DualRoundDetector) DetectGrids(lines []string) []GridSegment

DetectGrids performs dual-round grid detection and returns the optimal results

type DualRoundStrategy

type DualRoundStrategy struct {
	// contains filtered or unexported fields
}

DualRoundStrategy implements the dual-round detection approach

func NewDualRoundStrategy

func NewDualRoundStrategy(baseConfig DetectionConfig) *DualRoundStrategy

NewDualRoundStrategy creates a new dual-round detection strategy

func (*DualRoundStrategy) DetectTables

func (drs *DualRoundStrategy) DetectTables(lines []string) ([]Table, error)

DetectTables implements DetectionStrategy interface

func (*DualRoundStrategy) GetConfiguration

func (drs *DualRoundStrategy) GetConfiguration() DetectionConfig

GetConfiguration returns the strategy configuration

func (*DualRoundStrategy) GetName

func (drs *DualRoundStrategy) GetName() string

GetName returns the strategy name

type GridDetector

type GridDetector struct {
	// contains filtered or unexported fields
}

GridDetector detects grid-like segments in text

func NewGridDetector

func NewGridDetector(opts ...GridOption) *GridDetector

NewGridDetector creates a new grid detector with default parameters

func (*GridDetector) DetectGrids

func (gd *GridDetector) DetectGrids(lines []string) []GridSegment

DetectGrids analyzes text lines and returns segments that appear to have grid-like alignment

type GridOption

// GridOption is a function that receives a *GridDetector. The With*
// constructors that return a GridOption produce values of this type, and
// NewGridDetector, NewDualRoundDetector and DetectGridsLegacy accept them as
// variadic arguments.
type GridOption func(*GridDetector)

func WithAlignmentThreshold

func WithAlignmentThreshold(threshold float64) GridOption

func WithConfidenceThreshold

func WithConfidenceThreshold(threshold float64) GridOption

func WithMaxColumnVariance

func WithMaxColumnVariance(v int) GridOption

func WithMinColumns

func WithMinColumns(n int) GridOption

func WithMinLines

func WithMinLines(n int) GridOption

func WithTokenizationMode

func WithTokenizationMode(mode TokenizationMode) GridOption

type GridSegment

type GridSegment struct {
	Lines      []string         // The lines that form this grid segment
	StartLine  int              // Starting line number in the original text
	EndLine    int              // Ending line number in the original text
	Columns    []int            // Column positions where alignment occurs
	Confidence float64          // Confidence score of this being a grid (0.0 to 1.0)
	Mode       TokenizationMode // Which tokenization mode was used
	Metadata   *SegmentMetadata // Additional information about this segment
}

GridSegment represents a segment of text that has grid-like alignment

func ConvertTableToGridSegment

func ConvertTableToGridSegment(table Table) GridSegment

ConvertTableToGridSegment converts a Table back to the legacy GridSegment format

func DetectGridsLegacy

func DetectGridsLegacy(lines []string, opts ...GridOption) []GridSegment

DetectGridsLegacy provides backward compatibility with the original DetectGrids function

type GridWord

// GridWord represents a word extracted from a grid segment for backward compatibility.
type GridWord struct {
	// Text is the word's text.
	Text    string
	// X and Y locate the word.
	// NOTE(review): presumably X is the horizontal (column) position within the
	// line and Y the vertical (row) position — confirm, including how Y differs
	// from LineIdx.
	X       int
	Y       int
	// LineIdx identifies the line the word came from.
	// NOTE(review): presumably an index into the original input lines — confirm.
	LineIdx int
}

GridWord represents a word extracted from a grid segment for backward compatibility

func ExtractValidWords

func ExtractValidWords(segment GridSegment) []GridWord

ExtractValidWords extracts valid words from a GridSegment (backward compatibility)

type LayoutAnalyzer

type LayoutAnalyzer interface {
	// AnalyzeLayout determines the layout structure of a set of lines
	AnalyzeLayout(lines []string) ([]LineLayout, error)

	// CompareSimilarity checks if two layouts are similar enough to be part of the same table
	CompareSimilarity(layout1, layout2 LineLayout) bool
}

LayoutAnalyzer defines the interface for analyzing line layouts

type LayoutVector

type LayoutVector []int

LayoutVector represents the column layout of a line (column start positions)

type LineData

type LineData struct {
	// contains filtered or unexported fields
}

LineData contains analysis results for a single line

type LineLayout

type LineLayout struct {
	Tokens          []Token `json:"tokens"`           // Tokens found in this line
	ColumnPositions []int   `json:"column_positions"` // Column start positions
	LineIndex       int     `json:"line_index"`       // Index of this line in the original text
}

LineLayout represents the layout structure of a single line

type MergeStrategy

// MergeStrategy defines how to combine results from two detection rounds.
type MergeStrategy interface {
	// MergeResults receives the segments produced by the first and the second
	// detection round, together with the original input lines, and returns the
	// combined list of segments.
	MergeResults(firstRound, secondRound []GridSegment, originalLines []string) []GridSegment
}

MergeStrategy defines how to combine results from two detection rounds

type NewTokenizationStrategy

type NewTokenizationStrategy interface {
	// Tokenize splits a line into tokens with position information
	Tokenize(line string, lineIndex int, context []string) ([]Token, error)

	// GetMode returns the tokenization mode this strategy implements
	GetMode() TokenizationMode

	// ShouldApply determines if this strategy should be applied to the given context
	ShouldApply(line string, lineIndex int, context []string) bool
}

NewTokenizationStrategy defines the interface for different tokenization approaches. It is named NewTokenizationStrategy to avoid a conflict with the existing TokenizationStrategy interface.

type QualityMetrics

type QualityMetrics struct {
	AlignmentScore   float64 `json:"alignment_score"`    // How well columns are aligned (0.0-1.0)
	ConsistencyScore float64 `json:"consistency_score"`  // How consistent row structures are (0.0-1.0)
	CompactnessScore float64 `json:"compactness_score"`  // How compact the table structure is (0.0-1.0)
	TokenCountStdDev float64 `json:"token_count_stddev"` // Standard deviation of token counts per row
	AvgColumnSpacing float64 `json:"avg_column_spacing"` // Average spacing between columns
}

QualityMetrics provides detailed quality assessment of the detected table

type SegmentMetadata

// SegmentMetadata contains detailed information about how a segment was detected.
type SegmentMetadata struct {
	// TokenizationMode is the tokenization mode associated with the segment.
	TokenizationMode TokenizationMode
	OriginalTokens   [][]Token // Tokens for each line
	// AlignmentData holds ColumnAlignment entries for the segment.
	// NOTE(review): presumably one entry per column — confirm.
	AlignmentData    []ColumnAlignment
	DetectionSource  string // "first_round", "second_round", "merged"
}

SegmentMetadata contains detailed information about how a segment was detected

type SingleRoundStrategy

type SingleRoundStrategy struct {
	// contains filtered or unexported fields
}

SingleRoundStrategy implements single-round detection with specified tokenization mode

func NewSingleRoundStrategy

func NewSingleRoundStrategy(config DetectionConfig, mode TokenizationMode) *SingleRoundStrategy

NewSingleRoundStrategy creates a new single-round detection strategy

func (*SingleRoundStrategy) DetectTables

func (srs *SingleRoundStrategy) DetectTables(lines []string) ([]Table, error)

DetectTables implements DetectionStrategy interface

func (*SingleRoundStrategy) GetConfiguration

func (srs *SingleRoundStrategy) GetConfiguration() DetectionConfig

GetConfiguration returns the strategy configuration

func (*SingleRoundStrategy) GetName

func (srs *SingleRoundStrategy) GetName() string

GetName returns the strategy name

type Table

type Table struct {
	StartLine  int              `json:"start_line"`  // Starting line number in original text
	EndLine    int              `json:"end_line"`    // Ending line number in original text
	NumRows    int              `json:"num_rows"`    // Number of rows in the table
	NumColumns int              `json:"num_columns"` // Number of columns in the table
	Confidence float64          `json:"confidence"`  // Detection confidence score (0.0-1.0)
	Mode       TokenizationMode `json:"mode"`        // Tokenization mode used for detection
	Cells      [][]Cell         `json:"cells"`       // 2D array of cells [row][column]
	Metadata   *TableMetadata   `json:"metadata"`    // Additional metadata about the table
}

Table represents a detected table with enhanced metadata and cell information

func ConvertGridSegmentToTable

func ConvertGridSegmentToTable(segment GridSegment) Table

ConvertGridSegmentToTable converts a legacy GridSegment to the new Table format

func (Table) GetCell

func (t Table) GetCell(row, col int) (*Cell, error)

GetCell safely returns a cell at the given row and column indices

func (Table) GetColumn

func (t Table) GetColumn(col int) ([]Cell, error)

GetColumn returns all cells in the specified column

func (Table) GetColumnPositions

func (t Table) GetColumnPositions() []int

GetColumnPositions returns the column start positions from metadata, if available

func (Table) GetColumnTexts

func (t Table) GetColumnTexts(col int) ([]string, error)

GetColumnTexts returns the text content of all cells in a column as a slice of strings

func (Table) GetHeaderRow

func (t Table) GetHeaderRow() ([]Cell, error)

GetHeaderRow returns the first row as header cells, if the table has rows

func (Table) GetRow

func (t Table) GetRow(row int) ([]Cell, error)

GetRow returns all cells in the specified row

func (Table) GetRowTexts

func (t Table) GetRowTexts(row int) ([]string, error)

GetRowTexts returns the text content of all cells in a row as a slice of strings

func (Table) IsValid

func (t Table) IsValid() bool

IsValid returns true if the table has valid structure

func (Table) LineCount

func (t Table) LineCount() int

LineCount returns the number of lines this table spans

func (Table) String

func (t Table) String() string

String returns a string representation of the table

type TableAnalyzer

type TableAnalyzer struct {
	// contains filtered or unexported fields
}

TableAnalyzer provides enhanced analysis capabilities for table detection

func NewTableAnalyzer

func NewTableAnalyzer(config DetectionConfig) *TableAnalyzer

NewTableAnalyzer creates a new enhanced table analyzer

func (*TableAnalyzer) AnalyzeCandidate

func (ta *TableAnalyzer) AnalyzeCandidate(lines []string, startLine, endLine int) (*AnalysisResult, error)

AnalyzeCandidate performs comprehensive analysis on a candidate table

type TableMetadata

type TableMetadata struct {
	DetectionStrategy string            `json:"detection_strategy"` // Strategy used ("dual_round", "single_round", etc.)
	TokenizationMode  TokenizationMode  `json:"tokenization_mode"`  // Mode used for tokenization
	ColumnPositions   []int             `json:"column_positions"`   // Character positions where columns start
	AlignmentData     []ColumnAlignment `json:"alignment_data"`     // Alignment information for each column
	QualityMetrics    *QualityMetrics   `json:"quality_metrics"`    // Quality assessment metrics
}

TableMetadata contains detailed information about how a table was detected

type Token

// Token represents a single token with its position information.
type Token struct {
	// Text is the token's text.
	Text  string
	// Start and End give the token's position.
	// NOTE(review): presumably offsets within the line; whether they are byte or
	// rune offsets and whether End is exclusive is not visible here — confirm.
	Start int
	End   int
}

Token represents a single token with its position information

type TokenizationMode

type TokenizationMode int

TokenizationMode defines the strategy for splitting text into tokens

// Tokenization modes. The values are assigned with iota, so SingleSpaceMode is 0
// (the zero value of TokenizationMode) and MultiSpaceMode is 1.
const (
	// SingleSpaceMode splits on any whitespace (current behavior)
	SingleSpaceMode TokenizationMode = iota
	// MultiSpaceMode splits only on 2+ consecutive spaces
	MultiSpaceMode
)

type TokenizationStrategy

type TokenizationStrategy interface {
	// contains filtered or unexported methods
}

type WordExtractor

type WordExtractor struct {
	// contains filtered or unexported fields
}

WordExtractor provides enhanced word extraction with quality filtering

func NewWordExtractor

func NewWordExtractor() *WordExtractor

NewWordExtractor creates a new word extractor with configuration

func (*WordExtractor) ExtractCells

func (we *WordExtractor) ExtractCells(segment GridSegment) [][]Cell

ExtractCells extracts cells from a GridSegment and returns them in the new Cell format

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL