scoring

package
v1.2.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 11, 2025 License: MIT Imports: 7 Imported by: 0

Documentation

Overview

Package scoring provides content scoring functionality for the defuddle content extraction system. It implements algorithms to score DOM elements based on content quality and relevance.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func FindBestElement

func FindBestElement(elements []*goquery.Selection, minScore float64) *goquery.Selection

FindBestElement finds the best-scoring element from a list. JavaScript original code:

/**
 * Picks the highest-scoring element out of `elements`.
 *
 * Every candidate is scored with `scoreElement`. The running best starts at 0,
 * so only candidates with a strictly positive score can ever be selected.
 *
 * @param elements - Candidate elements to compare.
 * @param minScore - Threshold the winning score must strictly exceed (default 50).
 * @returns The top-scoring element, or `null` when the best score does not exceed `minScore`.
 */
static findBestElement(elements: Element[], minScore: number = 50): Element | null {
	let winner: Element | null = null;
	let topScore = 0;

	for (const candidate of elements) {
		const candidateScore = this.scoreElement(candidate);
		// Strictly greater: on a tie the earlier candidate keeps the lead
		if (candidateScore > topScore) {
			winner = candidate;
			topScore = candidateScore;
		}
	}

	if (topScore > minScore) {
		return winner;
	}
	return null;
}

func ScoreAndRemove

func ScoreAndRemove(doc *goquery.Document, debug bool)

ScoreAndRemove scores blocks and removes those that are likely not content. JavaScript original code:

/**
 * Scores the block-level elements of `doc` and detaches those judged to be non-content.
 *
 * Elements matching `BLOCK_ELEMENTS` are collected first. Any element that
 * `ContentScorer.isLikelyContent` accepts is left alone; the rest are scored with
 * `ContentScorer.scoreNonContentBlock`, and those with a negative score are removed
 * once scoring has finished.
 *
 * @param doc - Document whose block elements are evaluated and pruned in place.
 * @param debug - When true, logs the number of removed blocks and the elapsed time.
 */
public static scoreAndRemove(doc: Document, debug: boolean = false) {
	const startTime = Date.now();
	let removedCount = 0;

	// Removal is deferred: elements are only collected here and detached after scoring
	const pendingRemoval = new Set<Element>();

	// Snapshot of every block-level candidate in the document
	const selector = BLOCK_ELEMENTS.join(',');
	const candidates = Array.from(doc.querySelectorAll(selector));

	for (const block of candidates) {
		// Already scheduled for removal: nothing more to decide
		if (pendingRemoval.has(block)) {
			continue;
		}

		// Probable content is never scored for removal
		if (ContentScorer.isLikelyContent(block)) {
			continue;
		}

		// A negative non-content score schedules the block for removal
		const blockScore = ContentScorer.scoreNonContentBlock(block);
		if (blockScore < 0) {
			pendingRemoval.add(block);
			removedCount++;
		}
	}

	// Detach everything that was collected
	for (const block of pendingRemoval) {
		block.remove();
	}

	const endTime = Date.now();
	if (!debug) {
		return;
	}

	const elapsed = endTime - startTime;
	console.log('Defuddle', 'Removed non-content blocks:', {
		count: removedCount,
		processingTime: `${elapsed.toFixed(2)}ms`
	});
}

func ScoreElement

func ScoreElement(element *goquery.Selection) float64

ScoreElement scores an element based on various content indicators. JavaScript original code:

/**
 * Computes a heuristic content score for `element`; higher means more content-like.
 *
 * The score starts at 0 and is adjusted in order by: word count, number of `<p>`
 * descendants, link and image density penalties, a right-alignment bonus, date and
 * byline patterns in the text, content-like class names, footnote markers, a
 * penalty per descendant `<table>`, and a bonus for interior cells of layout tables.
 *
 * @param element - The element to score.
 * @returns The accumulated score (may be fractional or negative).
 */
static scoreElement(element: Element): number {
	let score = 0;

	// Text density: one point per whitespace-separated token
	const text = element.textContent || '';
	// NOTE(review): ''.split(/\s+/) yields [''], so `words` is never 0 and
	// leading/trailing whitespace adds empty tokens; the `(words || 1)` guards
	// below therefore never take their fallback.
	const words = text.split(/\s+/).length;
	score += words;

	// Paragraph ratio: 10 points per descendant <p>
	const paragraphs = element.getElementsByTagName('p').length;
	score += paragraphs * 10;

	// Link density (penalize high link density): <a> count relative to word count
	const links = element.getElementsByTagName('a').length;
	const linkDensity = links / (words || 1);
	score -= linkDensity * 5;

	// Image ratio (penalize high image density): <img> count relative to word count
	const images = element.getElementsByTagName('img').length;
	const imageDensity = images / (words || 1);
	score -= imageDensity * 3;

	// Position bonus: only right-floated or right-aligned elements are detected here.
	// These are exact substring checks on the inline style, so 'float:right'
	// (without the space) does not match.
	try {
		const style = element.getAttribute('style') || '';
		const align = element.getAttribute('align') || '';
		const isRightSide = style.includes('float: right') ||
						   style.includes('text-align: right') ||
						   align === 'right';
		if (isRightSide) score += 5;
	} catch (e) {
		// Ignore position if we can't get style
	}

	// Content indicators
	// Date such as "Sep 11, 2025" or "September 11 2025" (English month names only)
	const hasDate = /\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b/i.test(text);
	if (hasDate) score += 10;

	// Byline such as "by Jane Doe", "written by ..." or "author: ..." (case-insensitive)
	const hasAuthor = /\b(?:by|written by|author:)\s+[A-Za-z\s]+\b/i.test(text);
	if (hasAuthor) score += 10;

	// Check for common content classes/attributes.
	// Substring match: any class name containing one of these words qualifies.
	const className = element.className.toLowerCase();
	if (className.includes('content') || className.includes('article') || className.includes('post')) {
		score += 15;
	}

	// Check for footnotes/references.
	// FOOTNOTE_INLINE_REFERENCES and FOOTNOTE_LIST_SELECTORS are defined outside
	// this snippet; presumably CSS selector strings (confirm at their definition).
	const hasFootnotes = element.querySelector(FOOTNOTE_INLINE_REFERENCES);
	if (hasFootnotes) score += 10;

	const hasFootnotesList = element.querySelector(FOOTNOTE_LIST_SELECTORS);
	if (hasFootnotesList) score += 10;

	// Check for nested tables (penalize): 5 points per descendant <table>
	const nestedTables = element.getElementsByTagName('table').length;
	score -= nestedTables * 5;

	// Additional scoring for table cells
	if (element.tagName.toLowerCase() === 'td') {
		// Table cells get a bonus for being in the main content area
		const parentTable = element.closest('table');
		if (parentTable) {
			// Only favor cells in tables that look like old-style content layouts
			// parseInt reads the leading digits, so a width of "100%" parses as 100
			const tableWidth = parseInt(parentTable.getAttribute('width') || '0');
			const tableAlign = parentTable.getAttribute('align') || '';
			const tableClass = parentTable.className.toLowerCase();
			const isTableLayout =
				tableWidth > 400 || // Common width for main content tables
				tableAlign === 'center' ||
				tableClass.includes('content') ||
				tableClass.includes('article');

			if (isTableLayout) {
				// Additional checks to ensure this is likely the main content cell.
				// allCells holds every <td> descendant of the table in document order,
				// including cells of any tables nested inside it.
				const allCells = Array.from(parentTable.getElementsByTagName('td'));
				const cellIndex = allCells.indexOf(element as HTMLTableCellElement);
				// "Center" means neither the first nor the last cell in that list
				const isCenterCell = cellIndex > 0 && cellIndex < allCells.length - 1;

				if (isCenterCell) {
					score += 10;
				}
			}
		}
	}

	return score;
}

Types

type ContentScore

type ContentScore struct {
	// Score is the numeric score associated with Element.
	Score   float64
	// Element is the goquery selection that the score belongs to.
	Element *goquery.Selection
}

ContentScore represents a scored element. JavaScript original code:

/** Pairs an element with the score that was computed for it. */
export interface ContentScore {
  /** Numeric score associated with `element`. */
  score: number;
  /** The element the score belongs to. */
  element: Element;
}

type ContentScorer

type ContentScorer struct {
	// contains filtered or unexported fields
}

ContentScorer provides content scoring functionality. JavaScript original code:

/**
 * Holds the document being scored together with the debug flag.
 */
export class ContentScorer {
	/**
	 * @param doc - Document this scorer is bound to.
	 * @param debug - Debug flag stored on the instance; defaults to `false`.
	 */
	constructor(
		private doc: Document,
		private debug: boolean = false,
	) {}
}

func NewContentScorer

func NewContentScorer(doc *goquery.Document, debug bool) *ContentScorer

NewContentScorer creates a new ContentScorer instance.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL