tabledetection

package

Version: v0.0.0-...-0e33c82 (latest) · Published: Aug 12, 2025 · License: MIT · Imports: 6 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/Hanaasagi/magonote

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
type AdaptiveTokenizer
- func NewAdaptiveTokenizer(config DetectionConfig) *AdaptiveTokenizer
type AnalysisResult
type CandidateBlock
type CandidateTable
type Cell
- func (c Cell) IsEmpty() bool
- func (c Cell) Length() int
- func (c Cell) String() string
type ColumnAlignment
type ConfidenceScorer
type DefaultMergeStrategy
- func (dms *DefaultMergeStrategy) MergeResults(firstRound, secondRound []GridSegment, originalLines []string) []GridSegment
type DetectionConfig
- func DefaultConfig() DetectionConfig
type DetectionStrategy
type Detector
- func NewDetector(opts ...DetectorOption) *Detector
- func (d *Detector) DetectTables(lines []string) ([]Table, error)
type DetectorOption
- func WithAlignmentThresholdOption(threshold float64) DetectorOption
- func WithConfidenceThresholdOption(threshold float64) DetectorOption
- func WithMaxColumnVarianceOption(variance int) DetectorOption
- func WithMinColumnsOption(minColumns int) DetectorOption
- func WithMinLinesOption(minLines int) DetectorOption
- func WithTokenizationModeOption(mode TokenizationMode) DetectorOption
type DualRoundDetector
- func NewDualRoundDetector(opts ...GridOption) *DualRoundDetector
- func (drd *DualRoundDetector) DetectGrids(lines []string) []GridSegment
type DualRoundStrategy
- func NewDualRoundStrategy(baseConfig DetectionConfig) *DualRoundStrategy
- func (drs *DualRoundStrategy) DetectTables(lines []string) ([]Table, error)
- func (drs *DualRoundStrategy) GetConfiguration() DetectionConfig
- func (drs *DualRoundStrategy) GetName() string
type GridDetector
- func NewGridDetector(opts ...GridOption) *GridDetector
- func (gd *GridDetector) DetectGrids(lines []string) []GridSegment
type GridOption
- func WithAlignmentThreshold(threshold float64) GridOption
- func WithConfidenceThreshold(threshold float64) GridOption
- func WithMaxColumnVariance(v int) GridOption
- func WithMinColumns(n int) GridOption
- func WithMinLines(n int) GridOption
- func WithTokenizationMode(mode TokenizationMode) GridOption
type GridSegment
- func ConvertTableToGridSegment(table Table) GridSegment
- func DetectGridsLegacy(lines []string, opts ...GridOption) []GridSegment
type GridWord
- func ExtractValidWords(segment GridSegment) []GridWord
type LayoutAnalyzer
type LayoutVector
type LineData
type LineLayout
type MergeStrategy
type NewTokenizationStrategy
type QualityMetrics
type SegmentMetadata
type SingleRoundStrategy
- func NewSingleRoundStrategy(config DetectionConfig, mode TokenizationMode) *SingleRoundStrategy
- func (srs *SingleRoundStrategy) DetectTables(lines []string) ([]Table, error)
- func (srs *SingleRoundStrategy) GetConfiguration() DetectionConfig
- func (srs *SingleRoundStrategy) GetName() string
type Table
- func ConvertGridSegmentToTable(segment GridSegment) Table
- func (t Table) GetCell(row, col int) (*Cell, error)
- func (t Table) GetColumn(col int) ([]Cell, error)
- func (t Table) GetColumnPositions() []int
- func (t Table) GetColumnTexts(col int) ([]string, error)
- func (t Table) GetHeaderRow() ([]Cell, error)
- func (t Table) GetRow(row int) ([]Cell, error)
- func (t Table) GetRowTexts(row int) ([]string, error)
- func (t Table) IsValid() bool
- func (t Table) LineCount() int
- func (t Table) String() string
type TableAnalyzer
- func NewTableAnalyzer(config DetectionConfig) *TableAnalyzer
- func (ta *TableAnalyzer) AnalyzeCandidate(lines []string, startLine, endLine int) (*AnalysisResult, error)
type TableMetadata
type Token
type TokenizationMode
type TokenizationStrategy
type WordExtractor
- func NewWordExtractor() *WordExtractor
- func (we *WordExtractor) ExtractCells(segment GridSegment) [][]Cell

Constants ¶

View Source

const (
	// DefaultMinLines is the minimum number of lines required to form a table
	DefaultMinLines = 2

	// DefaultMinColumns is the minimum number of columns required to form a table
	DefaultMinColumns = 2

	// DefaultAlignmentThreshold is the threshold for column alignment consistency (0.0-1.0)
	DefaultAlignmentThreshold = 0.7

	// DefaultConfidenceThreshold is the minimum confidence to consider as table (0.0-1.0)
	DefaultConfidenceThreshold = 0.6

	// DefaultMaxColumnVariance is the maximum allowed variance in column positions
	DefaultMaxColumnVariance = 2
)

Core Detection Parameters

View Source

const (
	// FirstRoundConfidenceThreshold is the confidence threshold for multi-space tokenization
	// (more lenient to capture compound tokens like "File Name")
	FirstRoundConfidenceThreshold = 0.4

	// FirstRoundMaxColumnVariance is the variance tolerance for multi-space tokenization
	// (more tolerant to handle uneven spacing in compound tokens)
	FirstRoundMaxColumnVariance = 3

	// SecondRoundConfidenceThreshold is the confidence threshold for single-space tokenization
	// (stricter to ensure quality of granular detection)
	SecondRoundConfidenceThreshold = 0.6

	// SecondRoundMaxColumnVariance is the variance tolerance for single-space tokenization
	// (stricter alignment requirements for fine-grained tokens)
	SecondRoundMaxColumnVariance = 2
)

Dual-Round Detection Configuration

View Source

const (
	// MinTokenWidth is the minimum width required for token analysis
	// Used in projection analysis and alignment detection
	MinTokenWidth = 2

	// MinBoundariesForAnalysis is the minimum number of boundaries needed for alignment analysis
	// Used in tokenizer projection and boundary detection
	MinBoundariesForAnalysis = 3

	// MinSpacesForSingleSpaceMode defines how many consecutive spaces trigger separation in SingleSpaceMode
	MinSpacesForSingleSpaceMode = 1

	// MinSpacesForMultiSpaceMode defines how many consecutive spaces trigger separation in MultiSpaceMode
	MinSpacesForMultiSpaceMode = 2

	// CompoundTokenMinWidth is the minimum width for compound tokens in MultiSpaceMode
	CompoundTokenMinWidth = 3

	// MaxBoundaryRatio is the maximum ratio of boundary counts for compatibility analysis
	MaxBoundaryRatio = 1.5
)

Tokenization Configuration

View Source

const (
	// MinWordLength is the minimum length for extracted words
	MinWordLength = 3
)

Word Extraction Configuration

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type AdaptiveTokenizer ¶

type AdaptiveTokenizer struct {
	// contains filtered or unexported fields
}

AdaptiveTokenizer implements smart tokenization with multiple strategies

func NewAdaptiveTokenizer ¶

func NewAdaptiveTokenizer(config DetectionConfig) *AdaptiveTokenizer

NewAdaptiveTokenizer creates a new adaptive tokenizer with the given configuration

type AnalysisResult ¶

type AnalysisResult struct {
	Confidence        float64           `json:"confidence"`
	Columns           []int             `json:"columns"`
	QualityMetrics    *QualityMetrics   `json:"quality_metrics"`
	AlignmentData     []ColumnAlignment `json:"alignment_data"`
	TokenDistribution map[int]int       `json:"token_distribution"`
}

AnalysisResult contains comprehensive analysis information for a table candidate

type CandidateBlock ¶

type CandidateBlock struct {
	StartLine int
	EndLine   int
	Lines     []string
}

CandidateBlock represents a potential grid block with similar layout

type CandidateTable ¶

type CandidateTable struct {
	StartLine  int          `json:"start_line"` // Starting line index
	EndLine    int          `json:"end_line"`   // Ending line index
	Lines      []string     `json:"lines"`      // Text lines that form this table
	Layouts    []LineLayout `json:"layouts"`    // Layout information for each line
	Confidence float64      `json:"confidence"` // Initial confidence score
}

CandidateTable represents a potential table during detection

type Cell ¶

type Cell struct {
	Text      string `json:"text"`       // The text content of the cell
	Row       int    `json:"row"`        // Row index within the table (0-based)
	Column    int    `json:"column"`     // Column index within the table (0-based)
	LineIndex int    `json:"line_index"` // Original line index in the input
	StartPos  int    `json:"start_pos"`  // Start position of the cell in the line
	EndPos    int    `json:"end_pos"`    // End position of the cell in the line
}

Cell represents a detected table cell with its content and position information

func (Cell) IsEmpty ¶

func (c Cell) IsEmpty() bool

IsEmpty returns true if the cell has no text content

func (Cell) Length ¶

func (c Cell) Length() int

Length returns the length of the cell text

func (Cell) String ¶

func (c Cell) String() string

String returns a string representation of the cell

type ColumnAlignment ¶

type ColumnAlignment struct {
	Position    int     // Column start position
	Width       int     // Average column width
	Alignment   string  // "left", "right", "center"
	Consistency float64 // How consistent this column's alignment is (0.0-1.0)
}

ColumnAlignment contains alignment information for a single column

type ConfidenceScorer ¶

type ConfidenceScorer interface {
	// CalculateConfidence computes a confidence score for a detected table
	CalculateConfidence(table Table, originalLines []string) (float64, error)

	// CalculateQualityMetrics computes detailed quality metrics
	CalculateQualityMetrics(table Table, originalLines []string) (*QualityMetrics, error)
}

ConfidenceScorer defines the interface for calculating detection confidence

type DefaultMergeStrategy ¶

type DefaultMergeStrategy struct{}

DefaultMergeStrategy implements a balanced approach to merging detection results

func (*DefaultMergeStrategy) MergeResults ¶

func (dms *DefaultMergeStrategy) MergeResults(firstRound, secondRound []GridSegment, originalLines []string) []GridSegment

MergeResults implements the default strategy for combining detection results

type DetectionConfig ¶

type DetectionConfig struct {
	MinLines            int              `json:"min_lines"`            // Minimum lines required to form a grid
	MinColumns          int              `json:"min_columns"`          // Minimum columns required to form a grid
	AlignmentThreshold  float64          `json:"alignment_threshold"`  // Threshold for column alignment consistency
	ConfidenceThreshold float64          `json:"confidence_threshold"` // Minimum confidence to consider as grid
	MaxColumnVariance   int              `json:"max_column_variance"`  // Maximum allowed variance in column positions
	TokenizationMode    TokenizationMode `json:"tokenization_mode"`    // Tokenization strategy to use
}

DetectionConfig holds configuration parameters for grid detection

func DefaultConfig ¶

func DefaultConfig() DetectionConfig

DefaultConfig returns a configuration with default values

type DetectionStrategy ¶

type DetectionStrategy interface {
	// DetectTables analyzes text lines and returns detected tables
	DetectTables(lines []string) ([]Table, error)

	// GetName returns the name of this detection strategy
	GetName() string

	// GetConfiguration returns the current configuration
	GetConfiguration() DetectionConfig
}

DetectionStrategy defines the interface for different grid detection strategies

type Detector ¶

type Detector struct {
	// contains filtered or unexported fields
}

Detector provides the main interface for table detection, with an improved API

func NewDetector ¶

func NewDetector(opts ...DetectorOption) *Detector

NewDetector creates a new detector with the specified configuration

func (*Detector) DetectTables ¶

func (d *Detector) DetectTables(lines []string) ([]Table, error)

DetectTables implements the main detection interface

type DetectorOption ¶

type DetectorOption func(*DetectionConfig)

DetectorOption defines options for configuring the detector

func WithAlignmentThresholdOption ¶

func WithAlignmentThresholdOption(threshold float64) DetectorOption

WithAlignmentThresholdOption sets the alignment threshold

func WithConfidenceThresholdOption ¶

func WithConfidenceThresholdOption(threshold float64) DetectorOption

WithConfidenceThresholdOption sets the confidence threshold

func WithMaxColumnVarianceOption ¶

func WithMaxColumnVarianceOption(variance int) DetectorOption

WithMaxColumnVarianceOption sets the maximum column variance

func WithMinColumnsOption ¶

func WithMinColumnsOption(minColumns int) DetectorOption

WithMinColumnsOption sets the minimum columns required for detection

func WithMinLinesOption ¶

func WithMinLinesOption(minLines int) DetectorOption

WithMinLinesOption sets the minimum lines required for detection

func WithTokenizationModeOption ¶

func WithTokenizationModeOption(mode TokenizationMode) DetectorOption

WithTokenizationModeOption sets the tokenization mode

type DualRoundDetector ¶

type DualRoundDetector struct {
	// contains filtered or unexported fields
}

DualRoundDetector performs two-round grid detection with different tokenization strategies

func NewDualRoundDetector ¶

func NewDualRoundDetector(opts ...GridOption) *DualRoundDetector

NewDualRoundDetector creates a new dual-round detector with optimized settings for each round

func (*DualRoundDetector) DetectGrids ¶

func (drd *DualRoundDetector) DetectGrids(lines []string) []GridSegment

DetectGrids performs dual-round grid detection and returns the optimal results

type DualRoundStrategy ¶

type DualRoundStrategy struct {
	// contains filtered or unexported fields
}

DualRoundStrategy implements the dual-round detection approach

func NewDualRoundStrategy ¶

func NewDualRoundStrategy(baseConfig DetectionConfig) *DualRoundStrategy

NewDualRoundStrategy creates a new dual-round detection strategy

func (*DualRoundStrategy) DetectTables ¶

func (drs *DualRoundStrategy) DetectTables(lines []string) ([]Table, error)

DetectTables implements DetectionStrategy interface

func (*DualRoundStrategy) GetConfiguration ¶

func (drs *DualRoundStrategy) GetConfiguration() DetectionConfig

GetConfiguration returns the strategy configuration

func (*DualRoundStrategy) GetName ¶

func (drs *DualRoundStrategy) GetName() string

GetName returns the strategy name

type GridDetector ¶

type GridDetector struct {
	// contains filtered or unexported fields
}

GridDetector detects grid-like segments in text

func NewGridDetector ¶

func NewGridDetector(opts ...GridOption) *GridDetector

NewGridDetector creates a new grid detector with default parameters; any GridOption values passed in are functions that operate on the detector and can adjust it

func (*GridDetector) DetectGrids ¶

func (gd *GridDetector) DetectGrids(lines []string) []GridSegment

DetectGrids analyzes text lines and returns segments that appear to have grid-like alignment

type GridOption ¶

type GridOption func(*GridDetector)

func WithAlignmentThreshold ¶

func WithAlignmentThreshold(threshold float64) GridOption

func WithConfidenceThreshold ¶

func WithConfidenceThreshold(threshold float64) GridOption

func WithMaxColumnVariance ¶

func WithMaxColumnVariance(v int) GridOption

func WithMinColumns ¶

func WithMinColumns(n int) GridOption

func WithMinLines ¶

func WithMinLines(n int) GridOption

func WithTokenizationMode ¶

func WithTokenizationMode(mode TokenizationMode) GridOption

type GridSegment ¶

type GridSegment struct {
	Lines      []string         // The lines that form this grid segment
	StartLine  int              // Starting line number in the original text
	EndLine    int              // Ending line number in the original text
	Columns    []int            // Column positions where alignment occurs
	Confidence float64          // Confidence score of this being a grid (0.0 to 1.0)
	Mode       TokenizationMode // Which tokenization mode was used
	Metadata   *SegmentMetadata // Additional information about this segment
}

GridSegment represents a segment of text that has grid-like alignment

func ConvertTableToGridSegment ¶

func ConvertTableToGridSegment(table Table) GridSegment

ConvertTableToGridSegment converts a new-format Table back to the legacy GridSegment format

func DetectGridsLegacy ¶

func DetectGridsLegacy(lines []string, opts ...GridOption) []GridSegment

DetectGridsLegacy provides backward compatibility with the original DetectGrids function

type GridWord ¶

type GridWord struct {
	Text    string
	X       int
	Y       int
	LineIdx int
}

GridWord represents a word extracted from a grid segment for backward compatibility

func ExtractValidWords ¶

func ExtractValidWords(segment GridSegment) []GridWord

ExtractValidWords extracts valid words from a GridSegment (backward compatibility)

type LayoutAnalyzer ¶

type LayoutAnalyzer interface {
	// AnalyzeLayout determines the layout structure of a set of lines
	AnalyzeLayout(lines []string) ([]LineLayout, error)

	// CompareSimilarity checks if two layouts are similar enough to be part of the same table
	CompareSimilarity(layout1, layout2 LineLayout) bool
}

LayoutAnalyzer defines the interface for analyzing line layouts

type LayoutVector ¶

type LayoutVector []int

LayoutVector represents the column layout of a line (column start positions)

type LineData ¶

type LineData struct {
	// contains filtered or unexported fields
}

LineData contains analysis results for a single line

type LineLayout ¶

type LineLayout struct {
	Tokens          []Token `json:"tokens"`           // Tokens found in this line
	ColumnPositions []int   `json:"column_positions"` // Column start positions
	LineIndex       int     `json:"line_index"`       // Index of this line in the original text
}

LineLayout represents the layout structure of a single line

type MergeStrategy ¶

type MergeStrategy interface {
	MergeResults(firstRound, secondRound []GridSegment, originalLines []string) []GridSegment
}

MergeStrategy defines how to combine results from two detection rounds

type NewTokenizationStrategy ¶

type NewTokenizationStrategy interface {
	// Tokenize splits a line into tokens with position information
	Tokenize(line string, lineIndex int, context []string) ([]Token, error)

	// GetMode returns the tokenization mode this strategy implements
	GetMode() TokenizationMode

	// ShouldApply determines if this strategy should be applied to the given context
	ShouldApply(line string, lineIndex int, context []string) bool
}

NewTokenizationStrategy defines the interface for different tokenization approaches. It is named differently to avoid a conflict with the existing TokenizationStrategy interface.

type QualityMetrics ¶

type QualityMetrics struct {
	AlignmentScore   float64 `json:"alignment_score"`    // How well columns are aligned (0.0-1.0)
	ConsistencyScore float64 `json:"consistency_score"`  // How consistent row structures are (0.0-1.0)
	CompactnessScore float64 `json:"compactness_score"`  // How compact the table structure is (0.0-1.0)
	TokenCountStdDev float64 `json:"token_count_stddev"` // Standard deviation of token counts per row
	AvgColumnSpacing float64 `json:"avg_column_spacing"` // Average spacing between columns
}

QualityMetrics provides detailed quality assessment of the detected table

type SegmentMetadata ¶

type SegmentMetadata struct {
	TokenizationMode TokenizationMode
	OriginalTokens   [][]Token // Tokens for each line
	AlignmentData    []ColumnAlignment
	DetectionSource  string // "first_round", "second_round", "merged"
}

SegmentMetadata contains detailed information about how a segment was detected

type SingleRoundStrategy ¶

type SingleRoundStrategy struct {
	// contains filtered or unexported fields
}

SingleRoundStrategy implements single-round detection with specified tokenization mode

func NewSingleRoundStrategy ¶

func NewSingleRoundStrategy(config DetectionConfig, mode TokenizationMode) *SingleRoundStrategy

NewSingleRoundStrategy creates a new single-round detection strategy

func (*SingleRoundStrategy) DetectTables ¶

func (srs *SingleRoundStrategy) DetectTables(lines []string) ([]Table, error)

DetectTables implements DetectionStrategy interface

func (*SingleRoundStrategy) GetConfiguration ¶

func (srs *SingleRoundStrategy) GetConfiguration() DetectionConfig

GetConfiguration returns the strategy configuration

func (*SingleRoundStrategy) GetName ¶

func (srs *SingleRoundStrategy) GetName() string

GetName returns the strategy name

type Table ¶

type Table struct {
	StartLine  int              `json:"start_line"`  // Starting line number in original text
	EndLine    int              `json:"end_line"`    // Ending line number in original text
	NumRows    int              `json:"num_rows"`    // Number of rows in the table
	NumColumns int              `json:"num_columns"` // Number of columns in the table
	Confidence float64          `json:"confidence"`  // Detection confidence score (0.0-1.0)
	Mode       TokenizationMode `json:"mode"`        // Tokenization mode used for detection
	Cells      [][]Cell         `json:"cells"`       // 2D array of cells [row][column]
	Metadata   *TableMetadata   `json:"metadata"`    // Additional metadata about the table
}

Table represents a detected table with enhanced metadata and cell information

func ConvertGridSegmentToTable ¶

func ConvertGridSegmentToTable(segment GridSegment) Table

ConvertGridSegmentToTable converts a legacy GridSegment to the new Table format

func (Table) GetCell ¶

func (t Table) GetCell(row, col int) (*Cell, error)

GetCell safely returns a cell at the given row and column indices

func (Table) GetColumn ¶

func (t Table) GetColumn(col int) ([]Cell, error)

GetColumn returns all cells in the specified column

func (Table) GetColumnPositions ¶

func (t Table) GetColumnPositions() []int

GetColumnPositions returns the column start positions from metadata, if available

func (Table) GetColumnTexts ¶

func (t Table) GetColumnTexts(col int) ([]string, error)

GetColumnTexts returns the text content of all cells in a column as a slice of strings

func (Table) GetHeaderRow ¶

func (t Table) GetHeaderRow() ([]Cell, error)

GetHeaderRow returns the first row as header cells, if the table has rows

func (Table) GetRow ¶

func (t Table) GetRow(row int) ([]Cell, error)

GetRow returns all cells in the specified row

func (Table) GetRowTexts ¶

func (t Table) GetRowTexts(row int) ([]string, error)

GetRowTexts returns the text content of all cells in a row as a slice of strings

func (Table) IsValid ¶

func (t Table) IsValid() bool

IsValid returns true if the table has valid structure

func (Table) LineCount ¶

func (t Table) LineCount() int

LineCount returns the number of lines this table spans

func (Table) String ¶

func (t Table) String() string

String returns a string representation of the table

type TableAnalyzer ¶

type TableAnalyzer struct {
	// contains filtered or unexported fields
}

TableAnalyzer provides enhanced analysis capabilities for table detection

func NewTableAnalyzer ¶

func NewTableAnalyzer(config DetectionConfig) *TableAnalyzer

NewTableAnalyzer creates a new enhanced table analyzer

func (*TableAnalyzer) AnalyzeCandidate ¶

func (ta *TableAnalyzer) AnalyzeCandidate(lines []string, startLine, endLine int) (*AnalysisResult, error)

AnalyzeCandidate performs comprehensive analysis on a candidate table

type TableMetadata ¶

type TableMetadata struct {
	DetectionStrategy string            `json:"detection_strategy"` // Strategy used ("dual_round", "single_round", etc.)
	TokenizationMode  TokenizationMode  `json:"tokenization_mode"`  // Mode used for tokenization
	ColumnPositions   []int             `json:"column_positions"`   // Character positions where columns start
	AlignmentData     []ColumnAlignment `json:"alignment_data"`     // Alignment information for each column
	QualityMetrics    *QualityMetrics   `json:"quality_metrics"`    // Quality assessment metrics
}

TableMetadata contains detailed information about how a table was detected

type Token ¶

type Token struct {
	Text  string
	Start int
	End   int
}

Token represents a single token with its position information

type TokenizationMode ¶

type TokenizationMode int

TokenizationMode defines the strategy for splitting text into tokens

const (
	// SingleSpaceMode splits on any whitespace (current behavior)
	SingleSpaceMode TokenizationMode = iota
	// MultiSpaceMode splits only on 2+ consecutive spaces
	MultiSpaceMode
)

type TokenizationStrategy ¶

type TokenizationStrategy interface {
	// contains filtered or unexported methods
}

type WordExtractor ¶

type WordExtractor struct {
	// contains filtered or unexported fields
}

WordExtractor provides enhanced word extraction with quality filtering

func NewWordExtractor ¶

func NewWordExtractor() *WordExtractor

NewWordExtractor creates a new word extractor. It takes no arguments, so its configuration is not supplied by the caller.

func (*WordExtractor) ExtractCells ¶

func (we *WordExtractor) ExtractCells(segment GridSegment) [][]Cell

ExtractCells extracts cells from a GridSegment and returns them in the new Cell format

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL